From 4ac13de09db00c53ff4532f75831d50ddaa805fb Mon Sep 17 00:00:00 2001 From: John Mizerek Date: Sat, 14 Mar 2026 17:03:54 -0700 Subject: [PATCH] Add discovery + multi-page extract modes for setup wizard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Discovery mode: quick Playwright crawl returns detected menu sub-pages - Extract_page mode: processes single menu page through Claude individually - More aggressive HTML stripping: removes SVG, nav, footer, form, attributes - Increased truncation limit from 100KB to 200KB for generic fallback path - Enables interactive wizard flow: discover → confirm → extract each page Co-Authored-By: Claude Opus 4.6 --- api/setup/analyzeMenuUrl.php | 240 ++++++++++++++++++++++++++++++++++- 1 file changed, 237 insertions(+), 3 deletions(-) diff --git a/api/setup/analyzeMenuUrl.php b/api/setup/analyzeMenuUrl.php index ede81f4..4196d56 100644 --- a/api/setup/analyzeMenuUrl.php +++ b/api/setup/analyzeMenuUrl.php @@ -36,6 +36,222 @@ try { $data = readJsonBody(); if (empty($data)) throw new Exception('No request body provided'); + // ============================================================ + // DISCOVERY MODE: Crawl site, detect menu sub-pages, return without Claude + // ============================================================ + if (!empty($data['mode']) && $data['mode'] === 'discover' && !empty($data['url'])) { + $discoverUrl = trim($data['url']); + $pwOutput = shell_exec("/opt/playwright/run.sh " . escapeshellarg($discoverUrl) . " 8000 2>&1"); + if (empty(trim($pwOutput ?? ''))) { + throw new Exception("Playwright returned empty response"); + } + $pwResult = json_decode($pwOutput, true); + if (isset($pwResult['error'])) { + throw new Exception("Playwright error: " . $pwResult['error']); + } + + $subPages = $pwResult['subPagesVisited'] ?? []; + $platformPages = $pwResult['platformPagesVisited'] ?? []; + + // Extract business name from title + $html = $pwResult['html'] ?? ''; + $siteName = ''; + if (preg_match('#]*>([^<]+)#i', $html, $tm)) { + $siteName = trim($tm[1]); + $siteName = preg_replace('#\s*[-|]+\s*(Menu|Order|Online|Home|Welcome|Restaurant).*$#i', '', $siteName); + $siteName = trim($siteName); + } + + // Extract menu names from sub-page URLs + $menuPages = []; + foreach ($subPages as $spUrl) { + $path = parse_url($spUrl, PHP_URL_PATH); + $slug = trim($path, '/'); + if (strpos($slug, '/') !== false) $slug = basename($slug); + $menuName = ucwords(str_replace(['-', '_'], ' ', $slug)); + $menuPages[] = ['url' => $spUrl, 'name' => $menuName, 'slug' => $slug]; + } + + // Check for ordering platform links + $hasPlatform = count($platformPages) > 0; + + jsonResponse([ + 'OK' => true, + 'mode' => 'discover', + 'siteName' => $siteName, + 'mainUrl' => $discoverUrl, + 'menuPages' => $menuPages, + 'platformPages' => $platformPages, + 'hasPlatform' => $hasPlatform, + 'totalPagesFound' => count($menuPages), + ]); + } + + // ============================================================ + // MULTI-PAGE MODE: Process specific URLs individually through Claude + // ============================================================ + if (!empty($data['mode']) && $data['mode'] === 'extract_page' && !empty($data['url'])) { + // Process a single menu page through Playwright + Claude + // The frontend calls this once per confirmed menu page + $singleUrl = trim($data['url']); + $menuName = $data['menuName'] ?? ''; + + $pwOutput = shell_exec("/opt/playwright/run.sh " . escapeshellarg($singleUrl) . " 8000 2>&1"); + if (empty(trim($pwOutput ?? ''))) { + throw new Exception("Playwright returned empty response for $singleUrl"); + } + $pwResult = json_decode($pwOutput, true); + if (isset($pwResult['error'])) { + throw new Exception("Playwright error: " . $pwResult['error']); + } + + $singleHtml = $pwResult['html'] ?? ''; + // Strip to just the main page (no sub-page following for single-page extract) + $marker = strpos($singleHtml, '#s', '', $singleHtml); + $singleHtml = preg_replace('#]*>.*?#is', '', $singleHtml); + $singleHtml = preg_replace('#]*>.*?#is', '', $singleHtml); + $singleHtml = preg_replace('#]*>.*?#is', '', $singleHtml); + $singleHtml = preg_replace('#]*>.*?#is', '', $singleHtml); + $singleHtml = preg_replace('#\s+(class|style|data-[a-z-]+|id|aria-[a-z-]+|role|tabindex)="[^"]*"#i', '', $singleHtml); + $singleHtml = preg_replace('#\s{2,}#', ' ', $singleHtml); + $singleHtml = preg_replace('#>\s+<#', '><', $singleHtml); + if (strlen($singleHtml) > 100000) { + $singleHtml = substr($singleHtml, 0, 100000); + } + + // Extract images from this page + $singleImages = []; + $singleImageData = []; + if (preg_match_all('#]+src=["\']([^"\']+)["\'][^>]*>#i', $singleHtml, $imgMatches, PREG_SET_ORDER)) { + foreach ($imgMatches as $imgMatch) { + $imgSrc = $imgMatch[1]; + if (preg_match('#(icon|favicon|logo|sprite|pixel|tracking|badge|button|\.svg)#i', $imgSrc)) continue; + if (!preg_match('#^https?://#i', $imgSrc)) { + $origin = preg_replace('#^(https?://[^/]+).*#', '$1', $singleUrl); + $imgSrc = (str_starts_with($imgSrc, '/')) ? $origin . $imgSrc : $origin . '/' . $imgSrc; + } + $singleImages[] = $imgSrc; + } + } + // Also add Playwright-captured images + foreach (($pwResult['images'] ?? []) as $pwImg) { + if (!preg_match('#(icon|favicon|logo|sprite|pixel|tracking|badge|button|\.svg)#i', $pwImg)) { + $singleImages[] = $pwImg; + } + } + $singleImages = array_values(array_unique($singleImages)); + + // Download up to 5 images for this page + $imgContent = []; + foreach (array_slice($singleImages, 0, 5) as $imgUrl) { + try { + $ch = curl_init($imgUrl); + curl_setopt_array($ch, [CURLOPT_RETURNTRANSFER => true, CURLOPT_TIMEOUT => 10, CURLOPT_FOLLOWLOCATION => true]); + $imgData = curl_exec($ch); + $ct = curl_getinfo($ch, CURLINFO_CONTENT_TYPE) ?: 'image/jpeg'; + curl_close($ch); + if ($imgData && strlen($imgData) > 1000 && strlen($imgData) < 5000000) { + $mt = 'image/jpeg'; + if (stripos($ct, 'png') !== false) $mt = 'image/png'; + elseif (stripos($ct, 'webp') !== false) $mt = 'image/webp'; + $imgContent[] = ['source' => ['type' => 'base64', 'media_type' => $mt, 'data' => base64_encode($imgData)], 'url' => $imgUrl]; + } + } catch (Exception $e) {} + } + + // Send to Claude for this single menu page + $singlePrompt = 'You are an expert at extracting structured menu data from restaurant website HTML. Extract ALL menu items visible in the HTML. Return valid JSON with these keys: business (object with name, address, phone, hours, brandColor), categories (array of category name strings), items (array of objects with name, description, price, category, imageUrl). CATEGORIES vs ITEMS (CRITICAL): A CATEGORY is a broad section heading that groups items. An ITEM is an individual food/drink with name, description, and price. Each item must have a \'category\' field. For prices: numbers (e.g., 12.99). For imageUrl: the full src URL of the food image from the HTML. For brandColor: suggest a vibrant hex (6 digits, no hash). Return ONLY valid JSON.'; + + $msgContent = []; + foreach (array_slice($imgContent, 0, 5) as $ic) { + $msgContent[] = ['type' => 'image', 'source' => $ic['source']]; + } + $menuNameHint = strlen($menuName) ? "\n\nThis is the \"$menuName\" menu page." : ''; + $msgContent[] = ['type' => 'text', 'text' => "Extract all menu items from this page.$menuNameHint\n\nHTML:\n\n$singleHtml"]; + + $claudeReq = [ + 'model' => 'claude-sonnet-4-20250514', + 'max_tokens' => 16384, + 'temperature' => 0, + 'system' => $singlePrompt, + 'messages' => [['role' => 'user', 'content' => $msgContent]], + ]; + + $httpPost = function(string $url, string $body, array $headers = [], int $timeout = 120): array { + $ch = curl_init($url); + curl_setopt_array($ch, [ + CURLOPT_RETURNTRANSFER => true, + CURLOPT_POST => true, + CURLOPT_POSTFIELDS => $body, + CURLOPT_HTTPHEADER => $headers, + CURLOPT_TIMEOUT => $timeout, + ]); + $resp = curl_exec($ch); + $code = curl_getinfo($ch, CURLINFO_HTTP_CODE); + curl_close($ch); + return ['code' => $code, 'body' => $resp ?: '']; + }; + + $claudeResult = $httpPost( + 'https://api.anthropic.com/v1/messages', + json_encode($claudeReq), + ['Content-Type: application/json', "x-api-key: $CLAUDE_API_KEY", 'anthropic-version: 2023-06-01'], + 120 + ); + + if ($claudeResult['code'] !== 200) { + $errData = json_decode($claudeResult['body'], true); + $errorDetail = $errData['error']['message'] ?? substr($claudeResult['body'], 0, 500); + throw new Exception("Claude API error: {$claudeResult['code']} - $errorDetail"); + } + + $claudeResponse = json_decode($claudeResult['body'], true); + $responseText = ''; + foreach (($claudeResponse['content'] ?? []) as $block) { + if (($block['type'] ?? '') === 'text') { $responseText = $block['text']; break; } + } + + // Clean Claude JSON + $responseText = preg_replace('/^```(?:json)?\s*/i', '', trim($responseText)); + $responseText = preg_replace('/\s*```\s*$/', '', $responseText); + + $menuData = json_decode($responseText, true); + if (!is_array($menuData)) { + throw new Exception("Failed to parse Claude response as JSON"); + } + + // Tag items with menu name + if (strlen($menuName) && !empty($menuData['items'])) { + foreach ($menuData['items'] as &$item) { + $item['menu'] = $menuName; + } + unset($item); + } + + jsonResponse([ + 'OK' => true, + 'mode' => 'extract_page', + 'menuName' => $menuName, + 'url' => $singleUrl, + 'DATA' => [ + 'business' => $menuData['business'] ?? [], + 'categories' => $menuData['categories'] ?? [], + 'items' => $menuData['items'] ?? [], + 'modifiers' => $menuData['modifiers'] ?? [], + ], + 'itemCount' => count($menuData['items'] ?? []), + 'categoryCount' => count($menuData['categories'] ?? []), + ]); + } + $response['steps'] = []; $response['debug'] = [ 'hasHtmlKey' => isset($data['html']), @@ -1620,13 +1836,28 @@ try { $response['DEBUG_EMBEDDED_JSON_FOUND'] = false; } - // Combine HTML, strip scripts/styles + // Combine HTML, strip aggressively to keep menu content $combinedHtml = ''; foreach ($menuPages as $menuPage) { $cleanHtml = $menuPage['html']; + // Remove non-content elements $cleanHtml = preg_replace('#]*>.*?#is', '', $cleanHtml); $cleanHtml = preg_replace('#]*>.*?#is', '', $cleanHtml); $cleanHtml = preg_replace('##s', '', $cleanHtml); + $cleanHtml = preg_replace('#]*>.*?#is', '', $cleanHtml); + $cleanHtml = preg_replace('#]*>.*?#is', '', $cleanHtml); + $cleanHtml = preg_replace('#]*>.*?#is', '', $cleanHtml); + $cleanHtml = preg_replace('#]*>.*?#is', '', $cleanHtml); + $cleanHtml = preg_replace('#]*>.*?#is', '', $cleanHtml); + $cleanHtml = preg_replace('#]*>.*?#is', '', $cleanHtml); + $cleanHtml = preg_replace('#<(input|button|select|textarea|option|label|fieldset|legend|datalist|output)[^>]*/?>#is', '', $cleanHtml); + $cleanHtml = preg_replace('#<(meta|link|base|source|track|wbr)[^>]*/?>#is', '', $cleanHtml); + // Strip class/style/data/id/aria attributes to reduce size + $cleanHtml = preg_replace('#\s+(class|style|data-[a-z-]+|id|aria-[a-z-]+|role|tabindex|onclick|onload|loading)="[^"]*"#i', '', $cleanHtml); + $cleanHtml = preg_replace("#\s+(class|style|data-[a-z-]+|id|aria-[a-z-]+|role|tabindex)='[^']*'#i", '', $cleanHtml); + // Collapse whitespace + $cleanHtml = preg_replace('#\s{2,}#', ' ', $cleanHtml); + $cleanHtml = preg_replace('#>\s+<#', '><', $cleanHtml); $combinedHtml .= "\n--- PAGE: " . $menuPage['url'] . " ---\n" . $cleanHtml; } @@ -1634,8 +1865,11 @@ try { $combinedHtml .= "\n\n=== EMBEDDED JSON DATA (may contain full menu) ===\n" . $embeddedJsonData; } - if (strlen($combinedHtml) > 100000) { - $combinedHtml = substr($combinedHtml, 0, 100000); + $response['steps'][] = "Combined HTML size after stripping: " . strlen($combinedHtml) . " bytes"; + + if (strlen($combinedHtml) > 200000) { + $combinedHtml = substr($combinedHtml, 0, 200000); + $response['steps'][] = "Truncated to 200KB"; } // Server-side heading hierarchy detection