diff --git a/api/setup/analyzeMenuUrl.php b/api/setup/analyzeMenuUrl.php
index ede81f4..4196d56 100644
--- a/api/setup/analyzeMenuUrl.php
+++ b/api/setup/analyzeMenuUrl.php
@@ -36,6 +36,222 @@ try {
$data = readJsonBody();
if (empty($data)) throw new Exception('No request body provided');
+ // ============================================================
+ // DISCOVERY MODE: Crawl site, detect menu sub-pages, return without Claude
+ // ============================================================
+ if (!empty($data['mode']) && $data['mode'] === 'discover' && !empty($data['url'])) {
+     // Discovery mode: render the site with Playwright and report which menu
+     // sub-pages and ordering-platform pages the crawler visited, plus a site
+     // name guessed from <title>. Claude is not called on this path.
+     if (!is_string($data['url'])) {
+         throw new Exception('url must be a string');
+     }
+     $discoverUrl = trim($data['url']);
+
+     // stderr is merged into stdout (2>&1), so any extra output from the script
+     // makes the JSON undecodable; that case is rejected explicitly below.
+     $pwOutput = shell_exec("/opt/playwright/run.sh " . escapeshellarg($discoverUrl) . " 8000 2>&1");
+     if (empty(trim($pwOutput ?? ''))) {
+         throw new Exception("Playwright returned empty response");
+     }
+     $pwResult = json_decode($pwOutput, true);
+     if (!is_array($pwResult)) {
+         // Without this check a non-JSON response decoded to null and was
+         // reported as a successful discovery with zero pages.
+         throw new Exception("Playwright returned a non-JSON response");
+     }
+     if (isset($pwResult['error'])) {
+         throw new Exception("Playwright error: " . $pwResult['error']);
+     }
+
+     $subPages = $pwResult['subPagesVisited'] ?? [];
+     $platformPages = $pwResult['platformPagesVisited'] ?? [];
+     if (!is_array($subPages)) $subPages = [];
+     if (!is_array($platformPages)) $platformPages = [];
+
+     // Extract business name from title, dropping a trailing
+     // " - Menu", " | Home", ... style suffix
+     $html = $pwResult['html'] ?? '';
+     $siteName = '';
+     if (is_string($html) && preg_match('#<title[^>]*>([^<]+)#i', $html, $tm)) {
+         $siteName = trim($tm[1]);
+         $siteName = preg_replace('#\s*[-|]+\s*(Menu|Order|Online|Home|Welcome|Restaurant).*$#i', '', $siteName) ?? $siteName;
+         $siteName = trim($siteName);
+     }
+
+     // Extract menu names from sub-page URLs: last path segment, with
+     // "-" and "_" turned into spaces and each word capitalised
+     $menuPages = [];
+     foreach ($subPages as $spUrl) {
+         if (!is_string($spUrl)) continue;
+         // parse_url() yields null (no path) or false (malformed URL); both become ''
+         $path = parse_url($spUrl, PHP_URL_PATH);
+         $slug = trim(is_string($path) ? $path : '', '/');
+         if (str_contains($slug, '/')) $slug = basename($slug);
+         $menuName = ucwords(str_replace(['-', '_'], ' ', $slug));
+         $menuPages[] = ['url' => $spUrl, 'name' => $menuName, 'slug' => $slug];
+     }
+
+     // Check for ordering platform links
+     $hasPlatform = count($platformPages) > 0;
+
+     jsonResponse([
+         'OK' => true,
+         'mode' => 'discover',
+         'siteName' => $siteName,
+         'mainUrl' => $discoverUrl,
+         'menuPages' => $menuPages,
+         'platformPages' => $platformPages,
+         'hasPlatform' => $hasPlatform,
+         'totalPagesFound' => count($menuPages),
+     ]);
+ }
+
+ // ============================================================
+ // MULTI-PAGE MODE: Process specific URLs individually through Claude
+ // ============================================================
+ if (!empty($data['mode']) && $data['mode'] === 'extract_page' && !empty($data['url'])) {
+ // Process a single menu page through Playwright + Claude
+ // The frontend calls this once per confirmed menu page
+ if (!is_string($data['url'])) {
+     throw new Exception('url must be a string');
+ }
+ $singleUrl = trim($data['url']);
+ // Optional display name of the menu; used as a prompt hint and to tag items.
+ // Anything other than a string is ignored so the later strlen() and string
+ // interpolation cannot fail on an array or object.
+ $menuName = is_string($data['menuName'] ?? null) ? $data['menuName'] : '';
+
+ // stderr is merged into stdout (2>&1), so extra script output makes the JSON
+ // undecodable; that case is rejected explicitly below.
+ $pwOutput = shell_exec("/opt/playwright/run.sh " . escapeshellarg($singleUrl) . " 8000 2>&1");
+ if (empty(trim($pwOutput ?? ''))) {
+     throw new Exception("Playwright returned empty response for $singleUrl");
+ }
+ $pwResult = json_decode($pwOutput, true);
+ if (!is_array($pwResult)) {
+     // Without this check a non-JSON response decoded to null and the request
+     // continued with empty HTML.
+     throw new Exception("Playwright returned a non-JSON response for $singleUrl");
+ }
+ if (isset($pwResult['error'])) {
+     throw new Exception("Playwright error: " . $pwResult['error']);
+ }
+
+ $singleHtml = $pwResult['html'] ?? '';
+ // Strip to just the main page (no sub-page following for single-page extract)
+ // NOTE(review): the next line looks damaged in this view: strpos() is shown
+ // with four arguments, and $marker is not read anywhere in the visible code.
+ // The marker string and the statements that followed it appear to be missing;
+ // restore them from the real file before relying on this hunk.
+ $marker = strpos($singleHtml, '#s', '', $singleHtml);
+ // NOTE(review): the four patterns below show up as empty ('##is'). An empty
+ // pattern only matches the empty string, so as written these calls remove
+ // nothing. Presumably each one dropped a non-content element - confirm.
+ $singleHtml = preg_replace('##is', '', $singleHtml);
+ $singleHtml = preg_replace('##is', '', $singleHtml);
+ $singleHtml = preg_replace('##is', '', $singleHtml);
+ $singleHtml = preg_replace('##is', '', $singleHtml);
+ // Drop double-quoted class, style, data-*, id, aria-*, role and tabindex attributes.
+ $singleHtml = preg_replace('#\s+(class|style|data-[a-z-]+|id|aria-[a-z-]+|role|tabindex)="[^"]*"#i', '', $singleHtml);
+ // Collapse whitespace runs to a single space, then remove whitespace between tags.
+ $singleHtml = preg_replace('#\s{2,}#', ' ', $singleHtml);
+ $singleHtml = preg_replace('#>\s+<#', '><', $singleHtml);
+ if (strlen($singleHtml) > 100000) {
+     // substr() cuts at a byte offset and can split a multi-byte UTF-8 character.
+     // The dangling lead/continuation bytes at the end are removed so the text
+     // stays valid UTF-8 (json_encode() rejects malformed UTF-8 by default).
+     $singleHtml = substr($singleHtml, 0, 100000);
+     $singleHtml = preg_replace('/(?:[\xC2-\xDF]|[\xE0-\xEF][\x80-\xBF]?|[\xF0-\xF4][\x80-\xBF]{0,2})\z/', '', $singleHtml) ?? $singleHtml;
+ }
+
+ // Extract images from this page
+ // Candidate URLs come from <img src> in the cleaned HTML and from Playwright's
+ // own image list; the first five are downloaded and base64-encoded for Claude.
+ $singleImages = [];
+ $singleImageData = []; // not read in the visible code; kept in case later code expects it
+ // URLs that look like UI chrome rather than food photos are skipped
+ $skipPattern = '#(icon|favicon|logo|sprite|pixel|tracking|badge|button|\.svg)#i';
+ // Scheme + host of the page, used to absolutize relative paths (loop-invariant).
+ // Relative paths are resolved against the site root, not the page directory.
+ $origin = preg_replace('#^(https?://[^/]+).*#', '$1', $singleUrl);
+ $pageScheme = preg_match('#^(https?):#i', $singleUrl, $schemeMatch) ? strtolower($schemeMatch[1]) : 'https';
+ if (preg_match_all('#<img[^>]+src=["\']([^"\']+)["\'][^>]*>#i', $singleHtml, $imgMatches, PREG_SET_ORDER)) {
+     foreach ($imgMatches as $imgMatch) {
+         $imgSrc = $imgMatch[1];
+         if (preg_match($skipPattern, $imgSrc)) continue;
+         if (!preg_match('#^https?://#i', $imgSrc)) {
+             if (str_starts_with($imgSrc, '//')) {
+                 // Protocol-relative URL: reuse the page's scheme instead of
+                 // gluing it onto the origin ("https://site//cdn.example/...").
+                 $imgSrc = $pageScheme . ':' . $imgSrc;
+             } elseif (preg_match('#^[a-z][a-z0-9+.-]*:#i', $imgSrc)) {
+                 // data:, blob:, javascript: ... cannot be fetched over HTTP
+                 continue;
+             } else {
+                 $imgSrc = str_starts_with($imgSrc, '/') ? $origin . $imgSrc : $origin . '/' . $imgSrc;
+             }
+         }
+         $singleImages[] = $imgSrc;
+     }
+ }
+ // Also add Playwright-captured images (http/https strings only)
+ $pwImages = $pwResult['images'] ?? [];
+ if (!is_array($pwImages)) $pwImages = [];
+ foreach ($pwImages as $pwImg) {
+     if (!is_string($pwImg) || !preg_match('#^https?://#i', $pwImg)) continue;
+     if (!preg_match($skipPattern, $pwImg)) {
+         $singleImages[] = $pwImg;
+     }
+ }
+ $singleImages = array_values(array_unique($singleImages));
+
+ // Download up to 5 images for this page (best effort: failures are skipped)
+ $imgContent = [];
+ foreach (array_slice($singleImages, 0, 5) as $imgUrl) {
+     $ch = curl_init($imgUrl);
+     if ($ch === false) continue;
+     curl_setopt_array($ch, [
+         CURLOPT_RETURNTRANSFER => true,
+         CURLOPT_TIMEOUT => 10,
+         CURLOPT_FOLLOWLOCATION => true,
+         CURLOPT_MAXREDIRS => 5,
+         // The URLs come from a remote page: never let cURL (or a redirect)
+         // reach file://, gopher:// and similar schemes.
+         CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
+         CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
+     ]);
+     $imgData = curl_exec($ch);
+     curl_close($ch);
+     if (!is_string($imgData)) continue;
+     $imgSize = strlen($imgData);
+     if ($imgSize <= 1000 || $imgSize >= 5000000) continue;
+     // Decide the media type from the file signature rather than the
+     // Content-Type header, so HTML error pages are dropped and GIFs are no
+     // longer mislabelled as image/jpeg.
+     $mt = match (true) {
+         str_starts_with($imgData, "\xFF\xD8\xFF") => 'image/jpeg',
+         str_starts_with($imgData, "\x89PNG\r\n\x1A\n") => 'image/png',
+         str_starts_with($imgData, 'GIF87a'), str_starts_with($imgData, 'GIF89a') => 'image/gif',
+         str_starts_with($imgData, 'RIFF') && substr($imgData, 8, 4) === 'WEBP' => 'image/webp',
+         default => null,
+     };
+     if ($mt === null) continue;
+     $imgContent[] = ['source' => ['type' => 'base64', 'media_type' => $mt, 'data' => base64_encode($imgData)], 'url' => $imgUrl];
+ }
+
+ // System prompt used for the single-page extraction request
+ $singlePrompt = 'You are an expert at extracting structured menu data from restaurant website HTML. Extract ALL menu items visible in the HTML. Return valid JSON with these keys: business (object with name, address, phone, hours, brandColor), categories (array of category name strings), items (array of objects with name, description, price, category, imageUrl). CATEGORIES vs ITEMS (CRITICAL): A CATEGORY is a broad section heading that groups items. An ITEM is an individual food/drink with name, description, and price. Each item must have a \'category\' field. For prices: numbers (e.g., 12.99). For imageUrl: the full src URL of the food image from the HTML. For brandColor: suggest a vibrant hex (6 digits, no hash). Return ONLY valid JSON.';
+
+ // Optional sentence telling the model which menu this page belongs to
+ $menuNameHint = '';
+ if (strlen($menuName)) {
+     $menuNameHint = "\n\nThis is the \"$menuName\" menu page.";
+ }
+
+ // User message: image blocks first (at most five), then the text block with the HTML
+ $msgContent = array_map(
+     static fn ($ic) => ['type' => 'image', 'source' => $ic['source']],
+     array_slice($imgContent, 0, 5)
+ );
+ $msgContent[] = ['type' => 'text', 'text' => "Extract all menu items from this page.$menuNameHint\n\nHTML:\n\n$singleHtml"];
+
+ // Request payload for the Messages endpoint
+ $claudeReq = [
+     'model' => 'claude-sonnet-4-20250514',
+     'max_tokens' => 16384,
+     'temperature' => 0,
+     'system' => $singlePrompt,
+     'messages' => [
+         ['role' => 'user', 'content' => $msgContent],
+     ],
+ ];
+
+ // POST $body to $url with the given header lines and timeout (seconds).
+ // Returns ['code' => HTTP status, 'body' => response text, 'error' => string].
+ // When the transfer itself fails, 'code' is 0, 'body' is '' and 'error' holds
+ // the cURL message; on success 'error' is ''.
+ $httpPost = static function (string $url, string $body, array $headers = [], int $timeout = 120): array {
+     $ch = curl_init($url);
+     if ($ch === false) {
+         return ['code' => 0, 'body' => '', 'error' => 'curl_init failed'];
+     }
+     curl_setopt_array($ch, [
+         CURLOPT_RETURNTRANSFER => true,
+         CURLOPT_POST => true,
+         CURLOPT_POSTFIELDS => $body,
+         CURLOPT_HTTPHEADER => $headers,
+         CURLOPT_TIMEOUT => $timeout,
+     ]);
+     $resp = curl_exec($ch);
+     $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
+     // Read the error before closing the handle
+     $error = ($resp === false) ? curl_error($ch) : '';
+     curl_close($ch);
+     return ['code' => $code, 'body' => is_string($resp) ? $resp : '', 'error' => $error];
+ };
+
+ // Send the request to Claude, parse the JSON it returns and answer the client.
+ // Scraped HTML may contain malformed UTF-8; without the substitute flag
+ // json_encode() returns false and an empty body would be posted.
+ $claudeBody = json_encode($claudeReq, JSON_INVALID_UTF8_SUBSTITUTE);
+ if ($claudeBody === false) {
+     throw new Exception("Failed to encode Claude request: " . json_last_error_msg());
+ }
+
+ $claudeResult = $httpPost(
+     'https://api.anthropic.com/v1/messages',
+     $claudeBody,
+     ['Content-Type: application/json', "x-api-key: $CLAUDE_API_KEY", 'anthropic-version: 2023-06-01'],
+     120
+ );
+
+ if ($claudeResult['code'] !== 200) {
+     $errData = json_decode($claudeResult['body'], true);
+     $errorDetail = $errData['error']['message'] ?? substr($claudeResult['body'], 0, 500);
+     // Transport failures have no body; fall back to the cURL message if one was reported
+     if ($errorDetail === '' && !empty($claudeResult['error'])) {
+         $errorDetail = $claudeResult['error'];
+     }
+     throw new Exception("Claude API error: {$claudeResult['code']} - $errorDetail");
+ }
+
+ // Use the first text block of the response
+ $claudeResponse = json_decode($claudeResult['body'], true);
+ $responseText = '';
+ foreach (($claudeResponse['content'] ?? []) as $block) {
+     if (($block['type'] ?? '') === 'text') { $responseText = (string)($block['text'] ?? ''); break; }
+ }
+
+ // Clean Claude JSON: strip a leading/trailing Markdown code fence
+ $responseText = preg_replace('/^```(?:json)?\s*/i', '', trim($responseText));
+ $responseText = preg_replace('/\s*```\s*$/', '', $responseText);
+
+ $menuData = json_decode($responseText, true);
+ if (!is_array($menuData)) {
+     // Fallback in case the JSON is surrounded by extra text: try the outermost {...}
+     $jsonStart = strpos($responseText, '{');
+     $jsonEnd = strrpos($responseText, '}');
+     if ($jsonStart !== false && $jsonEnd !== false && $jsonEnd > $jsonStart) {
+         $menuData = json_decode(substr($responseText, $jsonStart, $jsonEnd - $jsonStart + 1), true);
+     }
+ }
+ if (!is_array($menuData)) {
+     $truncated = (($claudeResponse['stop_reason'] ?? '') === 'max_tokens') ? ' (response was cut off at max_tokens)' : '';
+     throw new Exception("Failed to parse Claude response as JSON" . $truncated);
+ }
+
+ // count() and foreach below need arrays; anything else is treated as empty
+ $pageItems = is_array($menuData['items'] ?? null) ? $menuData['items'] : [];
+ $pageCategories = is_array($menuData['categories'] ?? null) ? $menuData['categories'] : [];
+
+ // Tag items with menu name
+ if (strlen($menuName)) {
+     foreach ($pageItems as &$item) {
+         if (is_array($item)) $item['menu'] = $menuName;
+     }
+     unset($item); // drop the dangling reference
+ }
+
+ jsonResponse([
+     'OK' => true,
+     'mode' => 'extract_page',
+     'menuName' => $menuName,
+     'url' => $singleUrl,
+     'DATA' => [
+         'business' => $menuData['business'] ?? [],
+         'categories' => $pageCategories,
+         'items' => $pageItems,
+         'modifiers' => $menuData['modifiers'] ?? [],
+     ],
+     'itemCount' => count($pageItems),
+     'categoryCount' => count($pageCategories),
+ ]);
+ }
+
$response['steps'] = [];
$response['debug'] = [
'hasHtmlKey' => isset($data['html']),
@@ -1620,13 +1836,28 @@ try {
$response['DEBUG_EMBEDDED_JSON_FOUND'] = false;
}
- // Combine HTML, strip scripts/styles
+ // Combine HTML, strip aggressively to keep menu content
+ // Builds $combinedHtml: each entry of $menuPages contributes its cleaned HTML,
+ // preceded by a "--- PAGE: <url> ---" separator line.
$combinedHtml = '';
foreach ($menuPages as $menuPage) {
$cleanHtml = $menuPage['html'];
+ // Remove non-content elements
+ // NOTE(review): the patterns written as '##is' / '##s' below appear empty in
+ // this view. An empty pattern only matches the empty string, so as shown these
+ // calls remove nothing; presumably each one targeted a specific element -
+ // confirm against the real file.
$cleanHtml = preg_replace('##is', '', $cleanHtml);
$cleanHtml = preg_replace('##is', '', $cleanHtml);
$cleanHtml = preg_replace('##s', '', $cleanHtml);
+ $cleanHtml = preg_replace('##is', '', $cleanHtml);
+ $cleanHtml = preg_replace('##is', '', $cleanHtml);
+ $cleanHtml = preg_replace('##is', '', $cleanHtml);
+ $cleanHtml = preg_replace('##is', '', $cleanHtml);
+ $cleanHtml = preg_replace('##is', '', $cleanHtml);
+ $cleanHtml = preg_replace('##is', '', $cleanHtml);
+ // The next two patterns match only an opening (or self-closing) tag: closing
+ // tags such as </button> and any text between the tags are left in place.
+ $cleanHtml = preg_replace('#<(input|button|select|textarea|option|label|fieldset|legend|datalist|output)[^>]*/?>#is', '', $cleanHtml);
+ $cleanHtml = preg_replace('#<(meta|link|base|source|track|wbr)[^>]*/?>#is', '', $cleanHtml);
+ // Strip class/style/data/id/aria attributes to reduce size
+ // The double-quoted variant also drops onclick/onload/loading; the
+ // single-quoted variant does not list those three.
+ $cleanHtml = preg_replace('#\s+(class|style|data-[a-z-]+|id|aria-[a-z-]+|role|tabindex|onclick|onload|loading)="[^"]*"#i', '', $cleanHtml);
+ $cleanHtml = preg_replace("#\s+(class|style|data-[a-z-]+|id|aria-[a-z-]+|role|tabindex)='[^']*'#i", '', $cleanHtml);
+ // Collapse whitespace
+ $cleanHtml = preg_replace('#\s{2,}#', ' ', $cleanHtml);
+ $cleanHtml = preg_replace('#>\s+<#', '><', $cleanHtml);
$combinedHtml .= "\n--- PAGE: " . $menuPage['url'] . " ---\n" . $cleanHtml;
}
@@ -1634,8 +1865,11 @@ try {
$combinedHtml .= "\n\n=== EMBEDDED JSON DATA (may contain full menu) ===\n" . $embeddedJsonData;
}
- if (strlen($combinedHtml) > 100000) {
- $combinedHtml = substr($combinedHtml, 0, 100000);
+ // Record the post-stripping size, then cap the payload at 200000 bytes
+ $response['steps'][] = "Combined HTML size after stripping: " . strlen($combinedHtml) . " bytes";
+
+ if (strlen($combinedHtml) > 200000) {
+     // substr() cuts at a byte offset and can split a multi-byte UTF-8 character.
+     // The dangling lead/continuation bytes at the end are removed so the text
+     // stays valid UTF-8 (json_encode() rejects malformed UTF-8 by default).
+     $combinedHtml = substr($combinedHtml, 0, 200000);
+     $combinedHtml = preg_replace('/(?:[\xC2-\xDF]|[\xE0-\xEF][\x80-\xBF]?|[\xF0-\xF4][\x80-\xBF]{0,2})\z/', '', $combinedHtml) ?? $combinedHtml;
+     $response['steps'][] = "Truncated to 200KB";
}
// Server-side heading hierarchy detection