Add discovery + multi-page extract modes for setup wizard

- Discovery mode: quick Playwright crawl returns detected menu sub-pages
- Extract_page mode: processes a single menu page through Claude individually
- More aggressive HTML stripping: removes SVG, nav, footer, form, attributes
- Increased truncation limit from 100KB to 200KB for generic fallback path
- Enables interactive wizard flow: discover → confirm → extract each page

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-14 17:03:54 -07:00 · 2026-03-14 17:03:54 -07:00 · 4ac13de09d
commit 4ac13de09d
parent 552c404cf6
1 changed files with 237 additions and 3 deletions
--- a/api/setup/analyzeMenuUrl.php
+++ b/api/setup/analyzeMenuUrl.php
@ -36,6 +36,222 @@ try {
    $data = readJsonBody();
    if (empty($data)) throw new Exception('No request body provided');

+    // ============================================================
+    // DISCOVERY MODE: Crawl site, detect menu sub-pages, return without Claude
+    //
+    // Request:  { mode: "discover", url: "<site url>" }
+    // Response: siteName (derived from <title>), mainUrl, menuPages
+    //           [{url, name, slug}], platformPages, hasPlatform, totalPagesFound.
+    // Throws Exception when the url is not a string, or when Playwright yields
+    // no output, non-JSON output, or reports an error.
+    // ============================================================
+    if (!empty($data['mode']) && $data['mode'] === 'discover' && !empty($data['url'])) {
+        if (!is_string($data['url'])) {
+            throw new Exception('url must be a string');
+        }
+        $discoverUrl = trim($data['url']);
+
+        // The URL is shell-escaped; stderr is merged into stdout, so the output
+        // is not guaranteed to be JSON and must be validated below.
+        $pwOutput = shell_exec("/opt/playwright/run.sh " . escapeshellarg($discoverUrl) . " 8000 2>&1");
+        if (trim((string) $pwOutput) === '') {
+            throw new Exception("Playwright returned empty response");
+        }
+        $pwResult = json_decode($pwOutput, true);
+        if (!is_array($pwResult)) {
+            // Without this check a crash trace on stderr would be reported as
+            // a successful discovery with zero pages.
+            throw new Exception("Playwright returned invalid JSON: " . substr(trim($pwOutput), 0, 200));
+        }
+        if (isset($pwResult['error'])) {
+            $pwError = is_string($pwResult['error']) ? $pwResult['error'] : json_encode($pwResult['error']);
+            throw new Exception("Playwright error: " . $pwError);
+        }
+
+        $subPages = is_array($pwResult['subPagesVisited'] ?? null) ? $pwResult['subPagesVisited'] : [];
+        $platformPages = is_array($pwResult['platformPagesVisited'] ?? null) ? $pwResult['platformPagesVisited'] : [];
+
+        // Extract business name from title, dropping suffixes like " - Menu" / " | Order Online"
+        $html = is_string($pwResult['html'] ?? null) ? $pwResult['html'] : '';
+        $siteName = '';
+        if (preg_match('#<title[^>]*>([^<]+)</title>#i', $html, $tm)) {
+            $siteName = trim($tm[1]);
+            $siteName = preg_replace('#\s*[-|]+\s*(Menu|Order|Online|Home|Welcome|Restaurant).*$#i', '', $siteName) ?? $siteName;
+            $siteName = trim($siteName);
+        }
+
+        // Extract menu names from sub-page URLs: last path segment, with
+        // dashes/underscores turned into spaces and each word capitalised.
+        $menuPages = [];
+        foreach ($subPages as $spUrl) {
+            if (!is_string($spUrl) || $spUrl === '') {
+                continue;
+            }
+            // parse_url() yields null (no path) or false (malformed URL)
+            $path = parse_url($spUrl, PHP_URL_PATH);
+            $slug = trim(is_string($path) ? $path : '', '/');
+            if (strpos($slug, '/') !== false) $slug = basename($slug);
+            // Decode percent-escapes for the display name only; the slug stays as found in the URL
+            $menuName = ucwords(str_replace(['-', '_'], ' ', rawurldecode($slug)));
+            $menuPages[] = ['url' => $spUrl, 'name' => $menuName, 'slug' => $slug];
+        }
+
+        // Check for ordering platform links
+        $hasPlatform = count($platformPages) > 0;
+
+        jsonResponse([
+            'OK' => true,
+            'mode' => 'discover',
+            'siteName' => $siteName,
+            'mainUrl' => $discoverUrl,
+            'menuPages' => $menuPages,
+            'platformPages' => $platformPages,
+            'hasPlatform' => $hasPlatform,
+            'totalPagesFound' => count($menuPages),
+        ]);
+    }
+
+    // ============================================================
+    // MULTI-PAGE MODE: Process specific URLs individually through Claude
+    //
+    // Request:  { mode: "extract_page", url: "<menu page url>", menuName?: "<label>" }
+    // Response: DATA { business, categories, items, modifiers } plus itemCount
+    //           and categoryCount. When menuName is given, every item gets a
+    //           'menu' key holding it.
+    // Throws Exception on invalid input, Playwright failure, Claude API
+    // failure, or unparseable Claude output.
+    // ============================================================
+    if (!empty($data['mode']) && $data['mode'] === 'extract_page' && !empty($data['url'])) {
+        // Process a single menu page through Playwright + Claude
+        // The frontend calls this once per confirmed menu page
+        if (!is_string($data['url'])) {
+            throw new Exception('url must be a string');
+        }
+        $singleUrl = trim($data['url']);
+        // menuName is optional; anything that is not a string is treated as absent
+        $menuName = is_string($data['menuName'] ?? null) ? $data['menuName'] : '';
+
+        // stderr is merged into stdout, so the output must be validated as JSON
+        $pwOutput = shell_exec("/opt/playwright/run.sh " . escapeshellarg($singleUrl) . " 8000 2>&1");
+        if (trim((string) $pwOutput) === '') {
+            throw new Exception("Playwright returned empty response for $singleUrl");
+        }
+        $pwResult = json_decode($pwOutput, true);
+        if (!is_array($pwResult)) {
+            throw new Exception("Playwright returned invalid JSON for $singleUrl: " . substr(trim($pwOutput), 0, 200));
+        }
+        if (isset($pwResult['error'])) {
+            $pwError = is_string($pwResult['error']) ? $pwResult['error'] : json_encode($pwResult['error']);
+            throw new Exception("Playwright error: " . $pwError);
+        }
+
+        $singleHtml = is_string($pwResult['html'] ?? null) ? $pwResult['html'] : '';
+        // Strip to just the main page (no sub-page following for single-page extract)
+        $marker = strpos($singleHtml, '<!-- === SUB-PAGE:');
+        if ($marker !== false) {
+            $singleHtml = substr($singleHtml, 0, $marker);
+        }
+
+        // Aggressive cleanup, applied in order. preg_replace() returns null when
+        // PCRE hits its backtrack/JIT limits on large pages; in that case the
+        // step is skipped instead of wiping the whole document.
+        $stripSteps = [
+            ['#<script[^>]*>.*?</script>#is', ''],
+            ['#<style[^>]*>.*?</style>#is', ''],
+            ['#<!--.*?-->#s', ''],
+            ['#<svg[^>]*>.*?</svg>#is', ''],
+            ['#<nav[^>]*>.*?</nav>#is', ''],
+            ['#<footer[^>]*>.*?</footer>#is', ''],
+            ['#<form[^>]*>.*?</form>#is', ''],
+            // Presentation/behaviour attributes, double- and single-quoted
+            ['#\s+(class|style|data-[a-z-]+|id|aria-[a-z-]+|role|tabindex)="[^"]*"#i', ''],
+            ["#\s+(class|style|data-[a-z-]+|id|aria-[a-z-]+|role|tabindex)='[^']*'#i", ''],
+            // Collapse whitespace
+            ['#\s{2,}#', ' '],
+            ['#>\s+<#', '><'],
+        ];
+        foreach ($stripSteps as [$pattern, $replacement]) {
+            $stripped = preg_replace($pattern, $replacement, $singleHtml);
+            if ($stripped !== null) {
+                $singleHtml = $stripped;
+            }
+        }
+        // Byte-based cap; a split multibyte character is tolerated because the
+        // request is encoded with JSON_INVALID_UTF8_SUBSTITUTE below.
+        if (strlen($singleHtml) > 100000) {
+            $singleHtml = substr($singleHtml, 0, 100000);
+        }
+
+        // Extract images from this page
+        $skipImagePattern = '#(icon|favicon|logo|sprite|pixel|tracking|badge|button|\.svg)#i';
+        $origin = preg_replace('#^(https?://[^/]+).*#', '$1', $singleUrl) ?? $singleUrl;
+        $pageScheme = (stripos($singleUrl, 'http://') === 0) ? 'http:' : 'https:';
+        $singleImages = [];
+        if (preg_match_all('#<img[^>]+src=["\']([^"\']+)["\'][^>]*>#i', $singleHtml, $imgMatches, PREG_SET_ORDER)) {
+            foreach ($imgMatches as $imgMatch) {
+                // Attribute values are entity-encoded in HTML (e.g. "&amp;" in query strings)
+                $imgSrc = html_entity_decode(trim($imgMatch[1]), ENT_QUOTES | ENT_HTML5, 'UTF-8');
+                if ($imgSrc === '' || preg_match($skipImagePattern, $imgSrc)) continue;
+                if (preg_match('#^https?://#i', $imgSrc)) {
+                    // already absolute
+                } elseif (str_starts_with($imgSrc, '//')) {
+                    // protocol-relative: inherit the page's scheme
+                    $imgSrc = $pageScheme . $imgSrc;
+                } elseif (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $imgSrc)) {
+                    // data:, blob:, etc. cannot be downloaded
+                    continue;
+                } else {
+                    // NOTE: path-relative sources are resolved against the site
+                    // origin, not the page directory (unchanged from before).
+                    $imgSrc = str_starts_with($imgSrc, '/') ? $origin . $imgSrc : $origin . '/' . $imgSrc;
+                }
+                $singleImages[] = $imgSrc;
+            }
+        }
+        // Also add Playwright-captured images
+        $pwImages = is_array($pwResult['images'] ?? null) ? $pwResult['images'] : [];
+        foreach ($pwImages as $pwImg) {
+            if (is_string($pwImg) && $pwImg !== '' && !preg_match($skipImagePattern, $pwImg)) {
+                $singleImages[] = $pwImg;
+            }
+        }
+        $singleImages = array_values(array_unique($singleImages));
+
+        // Identify the image format from its leading bytes. The Content-Type
+        // header is unreliable (error pages, AVIF, missing header), and a
+        // payload that is not really an image, or is mislabelled, risks the
+        // Claude request being rejected. Returns null for anything that is not
+        // JPEG, PNG, GIF or WebP.
+        $sniffImageType = static function (string $bytes): ?string {
+            if (strncmp($bytes, "\xFF\xD8\xFF", 3) === 0) return 'image/jpeg';
+            if (strncmp($bytes, "\x89PNG\r\n\x1A\n", 8) === 0) return 'image/png';
+            if (strncmp($bytes, 'GIF87a', 6) === 0 || strncmp($bytes, 'GIF89a', 6) === 0) return 'image/gif';
+            if (strncmp($bytes, 'RIFF', 4) === 0 && substr($bytes, 8, 4) === 'WEBP') return 'image/webp';
+            return null;
+        };
+
+        // Download up to 5 images for this page (best-effort: failures are skipped)
+        $imgContent = [];
+        foreach (array_slice($singleImages, 0, 5) as $imgUrl) {
+            try {
+                $ch = curl_init($imgUrl);
+                if ($ch === false) continue;
+                curl_setopt_array($ch, [
+                    CURLOPT_RETURNTRANSFER => true,
+                    CURLOPT_TIMEOUT => 10,
+                    CURLOPT_FOLLOWLOCATION => true,
+                    CURLOPT_MAXREDIRS => 5,
+                    // URLs come from scraped HTML: never let them reach file://, gopher://, ...
+                    CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
+                    CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
+                ]);
+                $imgData = curl_exec($ch);
+                curl_close($ch);
+                if (!is_string($imgData)) continue;
+                $imgBytes = strlen($imgData);
+                // Skip tiny files (tracking pixels, spacers) and oversized ones
+                if ($imgBytes <= 1000 || $imgBytes >= 5000000) continue;
+                $mt = $sniffImageType($imgData);
+                if ($mt === null) continue;
+                $imgContent[] = ['source' => ['type' => 'base64', 'media_type' => $mt, 'data' => base64_encode($imgData)], 'url' => $imgUrl];
+            } catch (Exception $e) {
+                // Images are optional context for Claude; a failed download must not abort extraction
+            }
+        }
+
+        // Send to Claude for this single menu page
+        $singlePrompt = 'You are an expert at extracting structured menu data from restaurant website HTML. Extract ALL menu items visible in the HTML. Return valid JSON with these keys: business (object with name, address, phone, hours, brandColor), categories (array of category name strings), items (array of objects with name, description, price, category, imageUrl). CATEGORIES vs ITEMS (CRITICAL): A CATEGORY is a broad section heading that groups items. An ITEM is an individual food/drink with name, description, and price. Each item must have a \'category\' field. For prices: numbers (e.g., 12.99). For imageUrl: the full src URL of the food image from the HTML. For brandColor: suggest a vibrant hex (6 digits, no hash). Return ONLY valid JSON.';
+
+        $msgContent = [];
+        foreach ($imgContent as $ic) {
+            $msgContent[] = ['type' => 'image', 'source' => $ic['source']];
+        }
+        $menuNameHint = ($menuName !== '') ? "\n\nThis is the \"$menuName\" menu page." : '';
+        $msgContent[] = ['type' => 'text', 'text' => "Extract all menu items from this page.$menuNameHint\n\nHTML:\n\n$singleHtml"];
+
+        $claudeReq = [
+            'model' => 'claude-sonnet-4-20250514',
+            'max_tokens' => 16384,
+            'temperature' => 0,
+            'system' => $singlePrompt,
+            'messages' => [['role' => 'user', 'content' => $msgContent]],
+        ];
+        // Scraped pages may contain invalid UTF-8 (and truncation can split a
+        // character); substitute U+FFFD instead of letting json_encode() fail.
+        $claudeBody = json_encode($claudeReq, JSON_INVALID_UTF8_SUBSTITUTE);
+        if ($claudeBody === false) {
+            throw new Exception("Failed to encode Claude request: " . json_last_error_msg());
+        }
+
+        // Minimal JSON POST helper. Returns ['code' => HTTP status (0 on
+        // transport failure), 'body' => response text, 'error' => cURL error text].
+        $httpPost = static function (string $url, string $body, array $headers = [], int $timeout = 120): array {
+            $ch = curl_init($url);
+            if ($ch === false) {
+                return ['code' => 0, 'body' => '', 'error' => 'curl_init failed'];
+            }
+            curl_setopt_array($ch, [
+                CURLOPT_RETURNTRANSFER => true,
+                CURLOPT_POST => true,
+                CURLOPT_POSTFIELDS => $body,
+                CURLOPT_HTTPHEADER => $headers,
+                CURLOPT_TIMEOUT => $timeout,
+            ]);
+            $resp = curl_exec($ch);
+            $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
+            $transportError = ($resp === false) ? curl_error($ch) : '';
+            curl_close($ch);
+            return ['code' => $code, 'body' => is_string($resp) ? $resp : '', 'error' => $transportError];
+        };
+
+        $claudeResult = $httpPost(
+            'https://api.anthropic.com/v1/messages',
+            $claudeBody,
+            ['Content-Type: application/json', "x-api-key: $CLAUDE_API_KEY", 'anthropic-version: 2023-06-01'],
+            120
+        );
+
+        if ($claudeResult['code'] !== 200) {
+            $errData = json_decode($claudeResult['body'], true);
+            $errorDetail = $errData['error']['message'] ?? substr($claudeResult['body'], 0, 500);
+            if ($errorDetail === '' && $claudeResult['error'] !== '') {
+                // Timeouts / DNS failures produce no body; surface the cURL error instead
+                $errorDetail = $claudeResult['error'];
+            }
+            throw new Exception("Claude API error: {$claudeResult['code']} - $errorDetail");
+        }
+
+        // Use the first text block of the reply
+        $claudeResponse = json_decode($claudeResult['body'], true);
+        $responseText = '';
+        foreach (($claudeResponse['content'] ?? []) as $block) {
+            if (($block['type'] ?? '') === 'text') { $responseText = (string) ($block['text'] ?? ''); break; }
+        }
+
+        // Clean Claude JSON: drop a surrounding Markdown code fence if present
+        $responseText = (string) preg_replace('/^```(?:json)?\s*/i', '', trim($responseText));
+        $responseText = (string) preg_replace('/\s*```\s*$/', '', $responseText);
+
+        $menuData = json_decode($responseText, true);
+        if (!is_array($menuData)) {
+            // Fallback: the model sometimes wraps the JSON in prose; try the outermost {...}
+            $firstBrace = strpos($responseText, '{');
+            $lastBrace = strrpos($responseText, '}');
+            if ($firstBrace !== false && $lastBrace !== false && $lastBrace > $firstBrace) {
+                $menuData = json_decode(substr($responseText, $firstBrace, $lastBrace - $firstBrace + 1), true);
+            }
+        }
+        if (!is_array($menuData)) {
+            throw new Exception("Failed to parse Claude response as JSON");
+        }
+
+        // The model's output is untrusted: coerce list-valued keys to arrays so
+        // count() and the tagging loop below cannot fail on a stray scalar.
+        $asArray = static fn (mixed $value): array => is_array($value) ? $value : [];
+        $items = $asArray($menuData['items'] ?? null);
+        $categories = $asArray($menuData['categories'] ?? null);
+
+        // Tag items with menu name
+        if ($menuName !== '') {
+            foreach ($items as $idx => $item) {
+                if (is_array($item)) {
+                    $items[$idx]['menu'] = $menuName;
+                }
+            }
+        }
+
+        jsonResponse([
+            'OK' => true,
+            'mode' => 'extract_page',
+            'menuName' => $menuName,
+            'url' => $singleUrl,
+            'DATA' => [
+                'business' => $menuData['business'] ?? [],
+                'categories' => $categories,
+                'items' => $items,
+                'modifiers' => $menuData['modifiers'] ?? [],
+            ],
+            'itemCount' => count($items),
+            'categoryCount' => count($categories),
+        ]);
+    }
+
    $response['steps'] = [];
    $response['debug'] = [
        'hasHtmlKey' => isset($data['html']),
@ -1620,13 +1836,28 @@ try {
        $response['DEBUG_EMBEDDED_JSON_FOUND'] = false;
    }

-    // Combine HTML, strip scripts/styles
+    // Combine HTML, strip aggressively to keep menu content
    $combinedHtml = '';
    foreach ($menuPages as $menuPage) {
        $cleanHtml = $menuPage['html'];
+        // Remove non-content elements
        $cleanHtml = preg_replace('#<script[^>]*>.*?</script>#is', '', $cleanHtml);
        $cleanHtml = preg_replace('#<style[^>]*>.*?</style>#is', '', $cleanHtml);
        $cleanHtml = preg_replace('#<!--.*?-->#s', '', $cleanHtml);
+        $cleanHtml = preg_replace('#<svg[^>]*>.*?</svg>#is', '', $cleanHtml);
+        $cleanHtml = preg_replace('#<noscript[^>]*>.*?</noscript>#is', '', $cleanHtml);
+        $cleanHtml = preg_replace('#<iframe[^>]*>.*?</iframe>#is', '', $cleanHtml);
+        $cleanHtml = preg_replace('#<nav[^>]*>.*?</nav>#is', '', $cleanHtml);
+        $cleanHtml = preg_replace('#<footer[^>]*>.*?</footer>#is', '', $cleanHtml);
+        $cleanHtml = preg_replace('#<form[^>]*>.*?</form>#is', '', $cleanHtml);
+        // Drop leftover form-control and metadata tags (opening, closing and
+        // self-closing) while keeping any text between them. The lookahead
+        // requires the tag name to end there, so custom elements such as
+        // <button-group> are not mistaken for <button>.
+        $cleanHtml = preg_replace('#</?(input|button|select|textarea|option|label|fieldset|legend|datalist|output)(?=[\s/>])[^>]*>#is', '', $cleanHtml);
+        $cleanHtml = preg_replace('#</?(meta|link|base|source|track|wbr)(?=[\s/>])[^>]*>#is', '', $cleanHtml);
+        // Strip class/style/data/id/aria/event attributes to reduce size
+        // (same attribute list for double- and single-quoted values)
+        $cleanHtml = preg_replace('#\s+(class|style|data-[a-z-]+|id|aria-[a-z-]+|role|tabindex|onclick|onload|loading)="[^"]*"#i', '', $cleanHtml);
+        $cleanHtml = preg_replace("#\s+(class|style|data-[a-z-]+|id|aria-[a-z-]+|role|tabindex|onclick|onload|loading)='[^']*'#i", '', $cleanHtml);
+        // Collapse whitespace
+        $cleanHtml = preg_replace('#\s{2,}#', ' ', $cleanHtml);
+        $cleanHtml = preg_replace('#>\s+<#', '><', $cleanHtml);
        $combinedHtml .= "\n--- PAGE: " . $menuPage['url'] . " ---\n" . $cleanHtml;
    }

@ -1634,8 +1865,11 @@ try {
        $combinedHtml .= "\n\n=== EMBEDDED JSON DATA (may contain full menu) ===\n" . $embeddedJsonData;
    }

-    if (strlen($combinedHtml) > 100000) {
-        $combinedHtml = substr($combinedHtml, 0, 100000);
+    $response['steps'][] = "Combined HTML size after stripping: " . strlen($combinedHtml) . " bytes";
+
+    // Cap the payload at 200KB (byte length, not characters).
+    if (strlen($combinedHtml) > 200000) {
+        $combinedHtml = substr($combinedHtml, 0, 200000);
+        // substr() cuts at a byte offset and can split a multibyte UTF-8
+        // character, leaving an invalid sequence at the end of the string.
+        // Drop the trailing lead byte plus any continuation bytes; at worst
+        // this removes one complete character at the cut point.
+        $combinedHtml = preg_replace('/[\xC0-\xFF][\x80-\xBF]*\z/', '', $combinedHtml) ?? $combinedHtml;
+        $response['steps'][] = "Truncated to 200KB";
    }

    // Server-side heading hierarchy detection