Add discovery + multi-page extract modes for setup wizard

- Discovery mode: quick Playwright crawl returns detected menu sub-pages
- extract_page mode: processes a single menu page through Claude individually
- More aggressive HTML stripping: removes SVG, nav, footer, form, attributes
- Increased truncation limit from 100KB to 200KB for generic fallback path
- Enables interactive wizard flow: discover → confirm → extract each page

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
John Mizerek 2026-03-14 17:03:54 -07:00
parent 552c404cf6
commit 4ac13de09d

View file

@ -36,6 +36,222 @@ try {
$data = readJsonBody();
if (empty($data)) throw new Exception('No request body provided');
// ============================================================
// DISCOVERY MODE: Crawl site, detect menu sub-pages, return without Claude
// ============================================================
// Request body:  mode = 'discover', url = site to crawl.
// Response keys: OK, mode, siteName, mainUrl, menuPages (url/name/slug),
//                platformPages, hasPlatform, totalPagesFound.
if (!empty($data['mode']) && $data['mode'] === 'discover' && !empty($data['url'])) {
    // The body is client-supplied JSON, so 'url' may be an array/number.
    if (!is_string($data['url'])) {
        throw new Exception('url must be a string');
    }
    $discoverUrl = trim($data['url']);

    // The URL is the only user-controlled part of the command and is shell-escaped.
    $pwOutput = shell_exec("/opt/playwright/run.sh " . escapeshellarg($discoverUrl) . " 8000 2>&1");
    if (empty(trim($pwOutput ?? ''))) {
        throw new Exception("Playwright returned empty response");
    }

    // stderr is merged into stdout (2>&1), so the output is not guaranteed to be JSON.
    // Without this check a decode failure would be reported as a successful, empty discovery.
    $pwResult = json_decode($pwOutput, true);
    if (!is_array($pwResult)) {
        throw new Exception("Playwright returned invalid JSON");
    }
    if (isset($pwResult['error'])) {
        throw new Exception("Playwright error: " . $pwResult['error']);
    }

    $subPages = $pwResult['subPagesVisited'] ?? [];
    if (!is_array($subPages)) {
        $subPages = [];
    }
    $platformPages = $pwResult['platformPagesVisited'] ?? [];
    if (!is_array($platformPages)) {
        $platformPages = [];
    }

    // Extract business name from <title>, dropping generic suffixes such as " - Menu" or " | Home ...".
    $html = is_string($pwResult['html'] ?? null) ? $pwResult['html'] : '';
    $siteName = '';
    if (preg_match('#<title[^>]*>([^<]+)</title>#i', $html, $tm)) {
        $siteName = trim($tm[1]);
        $siteName = preg_replace('#\s*[-|]+\s*(Menu|Order|Online|Home|Welcome|Restaurant).*$#i', '', $siteName) ?? $siteName;
        $siteName = trim($siteName);
    }

    // Derive a display name for each sub-page from the last segment of its URL path,
    // e.g. ".../menus/happy-hour/" => slug "happy-hour", name "Happy Hour".
    $menuPages = [];
    foreach ($subPages as $spUrl) {
        if (!is_string($spUrl)) {
            continue;
        }
        // parse_url() yields null when there is no path and false for malformed URLs.
        $path = parse_url($spUrl, PHP_URL_PATH);
        $slug = trim(is_string($path) ? $path : '', '/');
        if (str_contains($slug, '/')) {
            $slug = basename($slug);
        }
        $menuName = ucwords(str_replace(['-', '_'], ' ', $slug));
        $menuPages[] = ['url' => $spUrl, 'name' => $menuName, 'slug' => $slug];
    }

    // Check for ordering platform links
    $hasPlatform = count($platformPages) > 0;

    jsonResponse([
        'OK' => true,
        'mode' => 'discover',
        'siteName' => $siteName,
        'mainUrl' => $discoverUrl,
        'menuPages' => $menuPages,
        'platformPages' => $platformPages,
        'hasPlatform' => $hasPlatform,
        'totalPagesFound' => count($menuPages),
    ]);
}
// ============================================================
// MULTI-PAGE MODE: Process specific URLs individually through Claude
// ============================================================
// Request body:  mode = 'extract_page', url = menu page, menuName = optional label.
// Response keys: OK, mode, menuName, url, DATA (business/categories/items/modifiers),
//                itemCount, categoryCount.
if (!empty($data['mode']) && $data['mode'] === 'extract_page' && !empty($data['url'])) {
    // Process a single menu page through Playwright + Claude.
    // The frontend calls this once per confirmed menu page.
    if (!is_string($data['url'])) {
        throw new Exception('url must be a string');
    }
    $singleUrl = trim($data['url']);
    $rawMenuName = $data['menuName'] ?? '';
    $menuName = is_scalar($rawMenuName) ? (string) $rawMenuName : '';

    $pwOutput = shell_exec("/opt/playwright/run.sh " . escapeshellarg($singleUrl) . " 8000 2>&1");
    if (empty(trim($pwOutput ?? ''))) {
        throw new Exception("Playwright returned empty response for $singleUrl");
    }
    // stderr is merged into stdout (2>&1), so the output is not guaranteed to be JSON.
    $pwResult = json_decode($pwOutput, true);
    if (!is_array($pwResult)) {
        throw new Exception("Playwright returned invalid JSON for $singleUrl");
    }
    if (isset($pwResult['error'])) {
        throw new Exception("Playwright error: " . $pwResult['error']);
    }
    $singleHtml = is_string($pwResult['html'] ?? null) ? $pwResult['html'] : '';

    // Strip to just the main page (no sub-page following for single-page extract).
    // Must run before comments are removed below, because the marker is an HTML comment.
    $marker = strpos($singleHtml, '<!-- === SUB-PAGE:');
    if ($marker !== false) {
        $singleHtml = substr($singleHtml, 0, $marker);
    }

    // Aggressive cleanup. Order matters: scripts/styles go before comments.
    // preg_replace() returns null on a PCRE error (e.g. backtrack limit on a huge page);
    // in that case keep the previous text instead of wiping the whole page.
    $stripPatterns = [
        '#<script[^>]*>.*?</script>#is',
        '#<style[^>]*>.*?</style>#is',
        '#<!--.*?-->#s',
        '#<svg[^>]*>.*?</svg>#is',
        '#<nav[^>]*>.*?</nav>#is',
        '#<footer[^>]*>.*?</footer>#is',
        '#<form[^>]*>.*?</form>#is',
        // Presentation/behaviour attributes, double- or single-quoted (same set as the generic path).
        '#\s+(class|style|data-[a-z-]+|id|aria-[a-z-]+|role|tabindex|onclick|onload|loading)=("[^"]*"|\'[^\']*\')#i',
    ];
    foreach ($stripPatterns as $pattern) {
        $stripped = preg_replace($pattern, '', $singleHtml);
        if ($stripped !== null) {
            $singleHtml = $stripped;
        }
    }
    $singleHtml = preg_replace('#\s{2,}#', ' ', $singleHtml) ?? $singleHtml;
    $singleHtml = preg_replace('#>\s+<#', '><', $singleHtml) ?? $singleHtml;
    if (strlen($singleHtml) > 100000) {
        // Byte-wise cut; a split UTF-8 sequence is tolerated by the JSON encode flags below.
        $singleHtml = substr($singleHtml, 0, 100000);
    }

    // Extract images from this page
    $isJunkImage = static fn (string $src): bool
        => preg_match('#(icon|favicon|logo|sprite|pixel|tracking|badge|button|\.svg)#i', $src) === 1;
    $origin = preg_replace('#^(https?://[^/]+).*#', '$1', $singleUrl) ?? $singleUrl;
    $pageScheme = parse_url($singleUrl, PHP_URL_SCHEME) ?: 'https';

    $singleImages = [];
    if (preg_match_all('#<img[^>]+src=["\']([^"\']+)["\'][^>]*>#i', $singleHtml, $imgMatches, PREG_SET_ORDER)) {
        foreach ($imgMatches as $imgMatch) {
            // Attribute values are entity-encoded in serialized HTML (e.g. "&amp;" in query strings).
            $imgSrc = trim(html_entity_decode($imgMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8'));
            // Inline data: URIs cannot be fetched and must not be treated as relative paths.
            if ($imgSrc === '' || stripos($imgSrc, 'data:') === 0) continue;
            if ($isJunkImage($imgSrc)) continue;
            if (str_starts_with($imgSrc, '//')) {
                // Protocol-relative URL: inherit the page's scheme rather than prefixing the origin.
                $imgSrc = $pageScheme . ':' . $imgSrc;
            } elseif (!preg_match('#^https?://#i', $imgSrc)) {
                // Relative paths are resolved against the site root (approximation).
                $imgSrc = str_starts_with($imgSrc, '/') ? $origin . $imgSrc : $origin . '/' . $imgSrc;
            }
            $singleImages[] = $imgSrc;
        }
    }
    // Also add Playwright-captured images
    $pwImages = $pwResult['images'] ?? [];
    foreach ((is_array($pwImages) ? $pwImages : []) as $pwImg) {
        if (is_string($pwImg) && preg_match('#^https?://#i', $pwImg) && !$isJunkImage($pwImg)) {
            $singleImages[] = $pwImg;
        }
    }
    $singleImages = array_values(array_unique($singleImages));

    // Identify the real image format from magic bytes. The Content-Type header is unreliable,
    // and sending non-image bytes (e.g. an HTML error page) or a mislabelled format makes the
    // whole Claude request fail. Returns null for anything that is not JPEG/PNG/GIF/WebP.
    $detectImageType = static function (string $bytes): ?string {
        if (strncmp($bytes, "\xFF\xD8\xFF", 3) === 0) return 'image/jpeg';
        if (strncmp($bytes, "\x89PNG\r\n\x1A\n", 8) === 0) return 'image/png';
        if (strncmp($bytes, 'GIF87a', 6) === 0 || strncmp($bytes, 'GIF89a', 6) === 0) return 'image/gif';
        if (strncmp($bytes, 'RIFF', 4) === 0 && substr($bytes, 8, 4) === 'WEBP') return 'image/webp';
        return null;
    };

    // Download up to 5 images for this page. Images are optional context: any failure just skips the image.
    $imgContent = [];
    foreach (array_slice($singleImages, 0, 5) as $imgUrl) {
        try {
            $ch = curl_init($imgUrl);
            if ($ch === false) continue;
            curl_setopt_array($ch, [
                CURLOPT_RETURNTRANSFER => true,
                CURLOPT_TIMEOUT => 10,
                CURLOPT_FOLLOWLOCATION => true,
                CURLOPT_MAXREDIRS => 5,
                // URLs come from scraped pages: never let them (or a redirect) reach file://, gopher:// etc.
                CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
                CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
            ]);
            $imgData = curl_exec($ch);
            curl_close($ch);
            // Skip tiny files (tracking pixels, placeholders) and anything of 5 MB or more.
            if (!is_string($imgData) || strlen($imgData) <= 1000 || strlen($imgData) >= 5000000) continue;
            $mt = $detectImageType($imgData);
            if ($mt === null) continue;
            $imgContent[] = ['source' => ['type' => 'base64', 'media_type' => $mt, 'data' => base64_encode($imgData)], 'url' => $imgUrl];
        } catch (\Throwable $e) {
            // Best-effort: extraction proceeds without this image.
            continue;
        }
    }

    // Send to Claude for this single menu page
    $singlePrompt = 'You are an expert at extracting structured menu data from restaurant website HTML. Extract ALL menu items visible in the HTML. Return valid JSON with these keys: business (object with name, address, phone, hours, brandColor), categories (array of category name strings), items (array of objects with name, description, price, category, imageUrl). CATEGORIES vs ITEMS (CRITICAL): A CATEGORY is a broad section heading that groups items. An ITEM is an individual food/drink with name, description, and price. Each item must have a \'category\' field. For prices: numbers (e.g., 12.99). For imageUrl: the full src URL of the food image from the HTML. For brandColor: suggest a vibrant hex (6 digits, no hash). Return ONLY valid JSON.';
    $msgContent = [];
    foreach (array_slice($imgContent, 0, 5) as $ic) {
        $msgContent[] = ['type' => 'image', 'source' => $ic['source']];
    }
    $menuNameHint = strlen($menuName) ? "\n\nThis is the \"$menuName\" menu page." : '';
    $msgContent[] = ['type' => 'text', 'text' => "Extract all menu items from this page.$menuNameHint\n\nHTML:\n\n$singleHtml"];
    $claudeReq = [
        'model' => 'claude-sonnet-4-20250514',
        'max_tokens' => 16384,
        'temperature' => 0,
        'system' => $singlePrompt,
        'messages' => [['role' => 'user', 'content' => $msgContent]],
    ];
    // Scraped HTML may be non-UTF-8 or cut mid-character by the truncation above; plain
    // json_encode() would return false for it. Substitute invalid sequences instead.
    $claudeBody = json_encode($claudeReq, JSON_INVALID_UTF8_SUBSTITUTE);
    if ($claudeBody === false) {
        throw new Exception("Failed to encode Claude request: " . json_last_error_msg());
    }

    // Minimal JSON POST helper; 'code' is 0 and 'error' is set when the transfer itself failed.
    $httpPost = function(string $url, string $body, array $headers = [], int $timeout = 120): array {
        $ch = curl_init($url);
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_POST => true,
            CURLOPT_POSTFIELDS => $body,
            CURLOPT_HTTPHEADER => $headers,
            CURLOPT_TIMEOUT => $timeout,
        ]);
        $resp = curl_exec($ch);
        $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $err = ($resp === false) ? curl_error($ch) : '';
        curl_close($ch);
        return ['code' => $code, 'body' => $resp ?: '', 'error' => $err];
    };
    $claudeResult = $httpPost(
        'https://api.anthropic.com/v1/messages',
        $claudeBody,
        ['Content-Type: application/json', "x-api-key: $CLAUDE_API_KEY", 'anthropic-version: 2023-06-01'],
        120
    );
    if ($claudeResult['code'] !== 200) {
        $errData = json_decode($claudeResult['body'], true);
        $errorDetail = $errData['error']['message']
            ?? ($claudeResult['error'] !== '' ? $claudeResult['error'] : substr($claudeResult['body'], 0, 500));
        throw new Exception("Claude API error: {$claudeResult['code']} - $errorDetail");
    }
    $claudeResponse = json_decode($claudeResult['body'], true);
    $responseText = '';
    foreach (($claudeResponse['content'] ?? []) as $block) {
        if (($block['type'] ?? '') === 'text') { $responseText = $block['text']; break; }
    }

    // Clean Claude JSON: drop a surrounding ```json ... ``` fence if present.
    $responseText = preg_replace('/^```(?:json)?\s*/i', '', trim($responseText));
    $responseText = preg_replace('/\s*```\s*$/', '', $responseText);
    $menuData = json_decode($responseText, true);
    if (!is_array($menuData)) {
        // A reply cut off at the token limit is the usual cause of unparseable JSON; say so.
        $why = (($claudeResponse['stop_reason'] ?? '') === 'max_tokens') ? ' (response was cut off at max_tokens)' : '';
        throw new Exception("Failed to parse Claude response as JSON" . $why);
    }

    // The model's output is untrusted: make sure the list fields really are arrays before counting/iterating.
    $items = is_array($menuData['items'] ?? null) ? $menuData['items'] : [];
    $categories = is_array($menuData['categories'] ?? null) ? $menuData['categories'] : [];

    // Tag items with menu name
    if (strlen($menuName)) {
        foreach ($items as &$item) {
            if (is_array($item)) {
                $item['menu'] = $menuName;
            }
        }
        unset($item); // break the reference left by foreach-by-reference
    }

    jsonResponse([
        'OK' => true,
        'mode' => 'extract_page',
        'menuName' => $menuName,
        'url' => $singleUrl,
        'DATA' => [
            'business' => $menuData['business'] ?? [],
            'categories' => $categories,
            'items' => $items,
            'modifiers' => $menuData['modifiers'] ?? [],
        ],
        'itemCount' => count($items),
        'categoryCount' => count($categories),
    ]);
}
$response['steps'] = [];
$response['debug'] = [
'hasHtmlKey' => isset($data['html']),
@ -1620,13 +1836,28 @@ try {
$response['DEBUG_EMBEDDED_JSON_FOUND'] = false;
}
// Combine HTML, strip scripts/styles
// Combine HTML, strip aggressively to keep menu content
// Build one cleaned HTML document out of all menu pages, each prefixed with a
// "--- PAGE: <url> ---" separator line.
$combinedHtml = '';
// Applied in order; scripts/styles are removed before comments.
$stripPatterns = [
    // Non-content elements, removed together with everything inside them.
    '#<script[^>]*>.*?</script>#is',
    '#<style[^>]*>.*?</style>#is',
    '#<!--.*?-->#s',
    '#<svg[^>]*>.*?</svg>#is',
    '#<noscript[^>]*>.*?</noscript>#is',
    '#<iframe[^>]*>.*?</iframe>#is',
    '#<nav[^>]*>.*?</nav>#is',
    '#<footer[^>]*>.*?</footer>#is',
    '#<form[^>]*>.*?</form>#is',
    // Form controls: drop opening AND closing tags but keep inner text (e.g. button/label captions).
    // The lookahead stops the name list from matching longer or custom element names.
    '#</?(input|button|select|textarea|option|label|fieldset|legend|datalist|output)(?=[\s/>])[^>]*>#is',
    // Void head/media tags.
    '#<(meta|link|base|source|track|wbr)(?=[\s/>])[^>]*>#is',
    // Strip class/style/data/id/aria/handler attributes to reduce size; same list for both quote styles.
    '#\s+(class|style|data-[a-z-]+|id|aria-[a-z-]+|role|tabindex|onclick|onload|loading)=("[^"]*"|\'[^\']*\')#i',
];
foreach ($menuPages as $menuPage) {
    $cleanHtml = (string) ($menuPage['html'] ?? '');
    foreach ($stripPatterns as $pattern) {
        // preg_replace() returns null on a PCRE error (e.g. backtrack limit on a very large page);
        // keep the text from the previous step rather than losing the whole page.
        $stripped = preg_replace($pattern, '', $cleanHtml);
        if ($stripped !== null) {
            $cleanHtml = $stripped;
        }
    }
    // Collapse whitespace
    $cleanHtml = preg_replace('#\s{2,}#', ' ', $cleanHtml) ?? $cleanHtml;
    $cleanHtml = preg_replace('#>\s+<#', '><', $cleanHtml) ?? $cleanHtml;
    $combinedHtml .= "\n--- PAGE: " . $menuPage['url'] . " ---\n" . $cleanHtml;
}
@ -1634,8 +1865,11 @@ try {
$combinedHtml .= "\n\n=== EMBEDDED JSON DATA (may contain full menu) ===\n" . $embeddedJsonData;
}
if (strlen($combinedHtml) > 100000) {
$combinedHtml = substr($combinedHtml, 0, 100000);
// Record the post-stripping size, then cap the payload at 200,000 bytes.
$response['steps'][] = "Combined HTML size after stripping: " . strlen($combinedHtml) . " bytes";
if (strlen($combinedHtml) > 200000) {
    $combinedHtml = substr($combinedHtml, 0, 200000);
    // substr() cuts on a byte boundary and can leave the first bytes of a multi-byte UTF-8
    // character at the end. Drop such an incomplete trailing sequence so the text stays valid
    // UTF-8 (presumably it is JSON-encoded further down — confirm against the caller).
    $trimmed = preg_replace('/(?:[\xC0-\xDF]|[\xE0-\xEF][\x80-\xBF]?|[\xF0-\xF7][\x80-\xBF]{0,2})\z/', '', $combinedHtml);
    if ($trimmed !== null) {
        $combinedHtml = $trimmed;
    }
    $response['steps'][] = "Truncated to 200KB";
}
// Server-side heading hierarchy detection