Add discovery + multi-page extract modes for setup wizard
- Discovery mode: quick Playwright crawl returns detected menu sub-pages
- Extract_page mode: processes single menu page through Claude individually
- More aggressive HTML stripping: removes SVG, nav, footer, form, attributes
- Increased truncation limit from 100KB to 200KB for generic fallback path
- Enables interactive wizard flow: discover → confirm → extract each page

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
552c404cf6
commit
4ac13de09d
1 changed files with 237 additions and 3 deletions
|
|
@ -36,6 +36,222 @@ try {
|
||||||
$data = readJsonBody();
|
$data = readJsonBody();
|
||||||
if (empty($data)) throw new Exception('No request body provided');
|
if (empty($data)) throw new Exception('No request body provided');
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// DISCOVERY MODE: Crawl site, detect menu sub-pages, return without Claude
|
||||||
|
// ============================================================
|
||||||
|
if (!empty($data['mode']) && $data['mode'] === 'discover' && !empty($data['url'])) {
    // Discovery mode: run a single Playwright crawl of the requested URL and
    // report which menu sub-pages and ordering-platform pages it visited.
    // Claude is not called here; the result goes straight to jsonResponse().
    if (!is_string($data['url'])) {
        throw new Exception('url must be a string');
    }
    $discoverUrl = trim($data['url']);

    // The URL comes from the request body. Refuse explicit non-web schemes
    // (file:, javascript:, ...) before handing it to the browser. Input with
    // no scheme at all is passed through unchanged, as before.
    $discoverScheme = parse_url($discoverUrl, PHP_URL_SCHEME);
    if (is_string($discoverScheme) && !in_array(strtolower($discoverScheme), ['http', 'https'], true)) {
        throw new Exception('Only http and https URLs are supported');
    }

    $pwOutput = shell_exec("/opt/playwright/run.sh " . escapeshellarg($discoverUrl) . " 8000 2>&1");
    $pwOutput = is_string($pwOutput) ? trim($pwOutput) : '';
    if ($pwOutput === '') {
        throw new Exception("Playwright returned empty response");
    }

    // stderr is merged into stdout (2>&1), so the output may not be JSON at
    // all. Fail loudly instead of reporting an empty-but-OK discovery.
    $pwResult = json_decode($pwOutput, true);
    if (!is_array($pwResult)) {
        throw new Exception("Playwright returned invalid JSON: " . substr($pwOutput, 0, 200));
    }
    if (isset($pwResult['error'])) {
        $pwError = is_string($pwResult['error']) ? $pwResult['error'] : json_encode($pwResult['error']);
        throw new Exception("Playwright error: " . $pwError);
    }

    $subPages = is_array($pwResult['subPagesVisited'] ?? null) ? $pwResult['subPagesVisited'] : [];
    $platformPages = is_array($pwResult['platformPagesVisited'] ?? null) ? $pwResult['platformPagesVisited'] : [];

    // Extract business name from title: decode entities (e.g. "&amp;"), then
    // drop a trailing " - Menu ..." / " | Home ..." style suffix.
    $html = is_string($pwResult['html'] ?? null) ? $pwResult['html'] : '';
    $siteName = '';
    if (preg_match('#<title[^>]*>([^<]+)</title>#i', $html, $tm)) {
        $siteName = trim(html_entity_decode($tm[1], ENT_QUOTES | ENT_HTML5, 'UTF-8'));
        $siteName = preg_replace('#\s*[-|]+\s*(Menu|Order|Online|Home|Welcome|Restaurant).*$#i', '', $siteName) ?? $siteName;
        $siteName = trim($siteName);
    }

    // Extract menu names from sub-page URLs: the last path segment is the
    // slug, and a human-readable label is derived from it.
    $menuPages = [];
    foreach ($subPages as $spUrl) {
        if (!is_string($spUrl) || $spUrl === '') {
            continue;
        }
        // parse_url() yields null (no path) or false (malformed URL); treat
        // both as an empty path rather than passing them to trim().
        $path = parse_url($spUrl, PHP_URL_PATH);
        $slug = trim(is_string($path) ? $path : '', '/');
        if (str_contains($slug, '/')) {
            $slug = basename($slug);
        }

        // The label is for display only: undo percent-encoding and drop a
        // page extension so "lunch%20menu.html" reads "Lunch Menu".
        $pageLabel = rawurldecode($slug);
        $pageLabel = preg_replace('#\.(html?|php|aspx?)$#i', '', $pageLabel) ?? $pageLabel;
        $pageLabel = ucwords(str_replace(['-', '_'], ' ', $pageLabel));

        $menuPages[] = ['url' => $spUrl, 'name' => $pageLabel, 'slug' => $slug];
    }

    // Check for ordering platform links
    $hasPlatform = count($platformPages) > 0;

    jsonResponse([
        'OK' => true,
        'mode' => 'discover',
        'siteName' => $siteName,
        'mainUrl' => $discoverUrl,
        'menuPages' => $menuPages,
        'platformPages' => $platformPages,
        'hasPlatform' => $hasPlatform,
        'totalPagesFound' => count($menuPages),
    ]);
}
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// MULTI-PAGE MODE: Process specific URLs individually through Claude
|
||||||
|
// ============================================================
|
||||||
|
if (!empty($data['mode']) && $data['mode'] === 'extract_page' && !empty($data['url'])) {
    // Extract-page mode: render ONE menu page with Playwright, strip the HTML
    // down, attach up to five images from the page, and ask Claude for
    // structured menu JSON. The frontend calls this once per confirmed page.
    if (!is_string($data['url'])) {
        throw new Exception('url must be a string');
    }
    $singleUrl = trim($data['url']);

    // The URL comes from the request body. Refuse explicit non-web schemes
    // (file:, javascript:, ...) before handing it to the browser.
    $singleScheme = parse_url($singleUrl, PHP_URL_SCHEME);
    if (is_string($singleScheme) && !in_array(strtolower($singleScheme), ['http', 'https'], true)) {
        throw new Exception('Only http and https URLs are supported');
    }

    // A non-scalar menuName (array/object) is treated as "not provided".
    $menuName = is_scalar($data['menuName'] ?? null) ? (string) $data['menuName'] : '';

    $pwOutput = shell_exec("/opt/playwright/run.sh " . escapeshellarg($singleUrl) . " 8000 2>&1");
    $pwOutput = is_string($pwOutput) ? trim($pwOutput) : '';
    if ($pwOutput === '') {
        throw new Exception("Playwright returned empty response for $singleUrl");
    }

    // stderr is merged into stdout (2>&1), so the output may not be JSON.
    // Stop here rather than sending an empty page to Claude.
    $pwResult = json_decode($pwOutput, true);
    if (!is_array($pwResult)) {
        throw new Exception("Playwright returned invalid JSON for $singleUrl: " . substr($pwOutput, 0, 200));
    }
    if (isset($pwResult['error'])) {
        $pwError = is_string($pwResult['error']) ? $pwResult['error'] : json_encode($pwResult['error']);
        throw new Exception("Playwright error: " . $pwError);
    }

    $singleHtml = is_string($pwResult['html'] ?? null) ? $pwResult['html'] : '';

    // Strip to just the main page (no sub-page following for single-page extract)
    $marker = strpos($singleHtml, '<!-- === SUB-PAGE:');
    if ($marker !== false) {
        $singleHtml = substr($singleHtml, 0, $marker);
    }

    // Aggressive cleanup. Rules run in order: non-content elements first,
    // then presentational attributes, then whitespace.
    $cleanupRules = [
        ['#<script[^>]*>.*?</script>#is', ''],
        ['#<style[^>]*>.*?</style>#is', ''],
        ['#<!--.*?-->#s', ''],
        ['#<svg[^>]*>.*?</svg>#is', ''],
        ['#<nav[^>]*>.*?</nav>#is', ''],
        ['#<footer[^>]*>.*?</footer>#is', ''],
        ['#<form[^>]*>.*?</form>#is', ''],
        ['#\s+(class|style|data-[a-z-]+|id|aria-[a-z-]+|role|tabindex)="[^"]*"#i', ''],
        ['#\s{2,}#', ' '],
        ['#>\s+<#', '><'],
    ];
    foreach ($cleanupRules as [$pattern, $replacement]) {
        // preg_replace() returns null when PCRE hits a limit on large input;
        // keep the text from the previous step instead of losing the page.
        $cleaned = preg_replace($pattern, $replacement, $singleHtml);
        if ($cleaned !== null) {
            $singleHtml = $cleaned;
        }
    }
    if (strlen($singleHtml) > 100000) {
        // Byte-wise cut; a split UTF-8 sequence at the end is handled when
        // the request is JSON-encoded below.
        $singleHtml = substr($singleHtml, 0, 100000);
    }

    // Extract images from this page
    $skipImagePattern = '#(icon|favicon|logo|sprite|pixel|tracking|badge|button|\.svg)#i';
    $origin = preg_replace('#^(https?://[^/]+).*#', '$1', $singleUrl);
    $pageScheme = (stripos($singleUrl, 'http://') === 0) ? 'http:' : 'https:';
    $singleImages = [];
    if (preg_match_all('#<img[^>]+src=["\']([^"\']+)["\'][^>]*>#i', $singleHtml, $imgMatches, PREG_SET_ORDER)) {
        foreach ($imgMatches as $imgMatch) {
            // Attribute values are HTML-encoded ("&amp;" inside query strings).
            $imgSrc = html_entity_decode($imgMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
            if (preg_match($skipImagePattern, $imgSrc)) {
                continue;
            }
            if (stripos($imgSrc, 'data:') === 0) {
                continue; // inline data URIs cannot be downloaded
            }
            if (str_starts_with($imgSrc, '//')) {
                // Protocol-relative URL: it already names a host, only the scheme is missing.
                $imgSrc = $pageScheme . $imgSrc;
            } elseif (!preg_match('#^https?://#i', $imgSrc)) {
                $imgSrc = str_starts_with($imgSrc, '/') ? $origin . $imgSrc : $origin . '/' . $imgSrc;
            }
            $singleImages[] = $imgSrc;
        }
    }
    // Also add Playwright-captured images
    $pwImages = is_array($pwResult['images'] ?? null) ? $pwResult['images'] : [];
    foreach ($pwImages as $pwImg) {
        if (is_string($pwImg) && !preg_match($skipImagePattern, $pwImg)) {
            $singleImages[] = $pwImg;
        }
    }
    $singleImages = array_values(array_unique($singleImages));

    // Identify an image by its leading magic bytes. The declared media type
    // sent to Claude must match the real content, and servers often send a
    // wrong or missing Content-Type (or an HTML error page).
    $sniffImageType = static function (string $bytes): ?string {
        if (strncmp($bytes, "\xFF\xD8\xFF", 3) === 0) {
            return 'image/jpeg';
        }
        if (strncmp($bytes, "\x89PNG\r\n\x1a\n", 8) === 0) {
            return 'image/png';
        }
        if (strncmp($bytes, 'GIF87a', 6) === 0 || strncmp($bytes, 'GIF89a', 6) === 0) {
            return 'image/gif';
        }
        if (strncmp($bytes, 'RIFF', 4) === 0 && substr($bytes, 8, 4) === 'WEBP') {
            return 'image/webp';
        }
        return null;
    };

    // Download up to 5 images for this page. Downloads are best-effort:
    // any image that fails or is not a recognised format is skipped.
    $imgContent = [];
    foreach (array_slice($singleImages, 0, 5) as $imgUrl) {
        $ch = curl_init($imgUrl);
        if ($ch === false) {
            continue;
        }
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT => 10,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => 5,
            // Image URLs originate from the crawled page; restrict both the
            // initial request and redirects to web protocols.
            CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
            CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
        ]);
        $imgData = curl_exec($ch);
        curl_close($ch);

        if (!is_string($imgData) || strlen($imgData) <= 1000 || strlen($imgData) >= 5000000) {
            continue;
        }
        $mt = $sniffImageType($imgData);
        if ($mt === null) {
            continue;
        }
        $imgContent[] = ['source' => ['type' => 'base64', 'media_type' => $mt, 'data' => base64_encode($imgData)], 'url' => $imgUrl];
    }

    // Send to Claude for this single menu page
    $singlePrompt = 'You are an expert at extracting structured menu data from restaurant website HTML. Extract ALL menu items visible in the HTML. Return valid JSON with these keys: business (object with name, address, phone, hours, brandColor), categories (array of category name strings), items (array of objects with name, description, price, category, imageUrl). CATEGORIES vs ITEMS (CRITICAL): A CATEGORY is a broad section heading that groups items. An ITEM is an individual food/drink with name, description, and price. Each item must have a \'category\' field. For prices: numbers (e.g., 12.99). For imageUrl: the full src URL of the food image from the HTML. For brandColor: suggest a vibrant hex (6 digits, no hash). Return ONLY valid JSON.';

    // Images go first, followed by the text part carrying the HTML.
    $msgContent = [];
    foreach ($imgContent as $ic) {
        $msgContent[] = ['type' => 'image', 'source' => $ic['source']];
    }
    $menuNameHint = strlen($menuName) ? "\n\nThis is the \"$menuName\" menu page." : '';
    $msgContent[] = ['type' => 'text', 'text' => "Extract all menu items from this page.$menuNameHint\n\nHTML:\n\n$singleHtml"];

    $claudeReq = [
        'model' => 'claude-sonnet-4-20250514',
        'max_tokens' => 16384,
        'temperature' => 0,
        'system' => $singlePrompt,
        'messages' => [['role' => 'user', 'content' => $msgContent]],
    ];

    // The HTML may contain invalid UTF-8 (from the page itself or from the
    // byte-wise truncation above). Substitute bad sequences instead of
    // letting json_encode() return false.
    $claudeBody = json_encode($claudeReq, JSON_INVALID_UTF8_SUBSTITUTE);
    if ($claudeBody === false) {
        throw new Exception("Failed to encode Claude request: " . json_last_error_msg());
    }

    // Minimal JSON POST helper: returns the HTTP status code and the raw
    // body ('' when the transfer itself failed).
    $httpPost = static function (string $url, string $body, array $headers = [], int $timeout = 120): array {
        $ch = curl_init($url);
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_POST => true,
            CURLOPT_POSTFIELDS => $body,
            CURLOPT_HTTPHEADER => $headers,
            CURLOPT_TIMEOUT => $timeout,
        ]);
        $resp = curl_exec($ch);
        $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);
        return ['code' => $code, 'body' => $resp ?: ''];
    };

    $claudeResult = $httpPost(
        'https://api.anthropic.com/v1/messages',
        $claudeBody,
        ['Content-Type: application/json', "x-api-key: $CLAUDE_API_KEY", 'anthropic-version: 2023-06-01'],
        120
    );

    if ($claudeResult['code'] !== 200) {
        $errData = json_decode($claudeResult['body'], true);
        $errorDetail = $errData['error']['message'] ?? substr($claudeResult['body'], 0, 500);
        throw new Exception("Claude API error: {$claudeResult['code']} - $errorDetail");
    }

    // Use the first text block of the reply.
    $claudeResponse = json_decode($claudeResult['body'], true);
    $responseText = '';
    foreach (($claudeResponse['content'] ?? []) as $block) {
        if (($block['type'] ?? '') === 'text') {
            $responseText = (string) ($block['text'] ?? '');
            break;
        }
    }

    // Clean Claude JSON: remove a surrounding ``` / ```json fence if present.
    $responseText = preg_replace('/^```(?:json)?\s*/i', '', trim($responseText)) ?? $responseText;
    $responseText = preg_replace('/\s*```\s*$/', '', $responseText) ?? $responseText;

    $menuData = json_decode($responseText, true);
    if (!is_array($menuData)) {
        // Fallback: the model may have wrapped the JSON in prose; retry on
        // the outermost {...} span.
        $jsonStart = strpos($responseText, '{');
        $jsonEnd = strrpos($responseText, '}');
        if ($jsonStart !== false && $jsonEnd !== false && $jsonEnd > $jsonStart) {
            $menuData = json_decode(substr($responseText, $jsonStart, $jsonEnd - $jsonStart + 1), true);
        }
    }
    if (!is_array($menuData)) {
        $truncatedNote = (($claudeResponse['stop_reason'] ?? '') === 'max_tokens')
            ? ' (response was cut off at max_tokens)'
            : '';
        throw new Exception("Failed to parse Claude response as JSON" . $truncatedNote);
    }

    // Normalise the list fields so count() and the tagging below never see
    // a non-array value from the model.
    $items = is_array($menuData['items'] ?? null) ? $menuData['items'] : [];
    $categories = is_array($menuData['categories'] ?? null) ? $menuData['categories'] : [];

    // Tag items with menu name
    if (strlen($menuName)) {
        foreach ($items as &$item) {
            if (is_array($item)) {
                $item['menu'] = $menuName;
            }
        }
        unset($item); // drop the dangling reference left by foreach-by-reference
    }

    jsonResponse([
        'OK' => true,
        'mode' => 'extract_page',
        'menuName' => $menuName,
        'url' => $singleUrl,
        'DATA' => [
            'business' => $menuData['business'] ?? [],
            'categories' => $categories,
            'items' => $items,
            'modifiers' => $menuData['modifiers'] ?? [],
        ],
        'itemCount' => count($items),
        'categoryCount' => count($categories),
    ]);
}
|
||||||
|
|
||||||
$response['steps'] = [];
|
$response['steps'] = [];
|
||||||
$response['debug'] = [
|
$response['debug'] = [
|
||||||
'hasHtmlKey' => isset($data['html']),
|
'hasHtmlKey' => isset($data['html']),
|
||||||
|
|
@ -1620,13 +1836,28 @@ try {
|
||||||
$response['DEBUG_EMBEDDED_JSON_FOUND'] = false;
|
$response['DEBUG_EMBEDDED_JSON_FOUND'] = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Combine HTML, strip scripts/styles
|
// Combine HTML, strip aggressively to keep menu content
|
||||||
$combinedHtml = '';
|
$combinedHtml = '';
|
||||||
foreach ($menuPages as $menuPage) {
|
foreach ($menuPages as $menuPage) {
|
||||||
$cleanHtml = $menuPage['html'];
|
$cleanHtml = $menuPage['html'];
|
||||||
|
// Remove non-content elements
|
||||||
$cleanHtml = preg_replace('#<script[^>]*>.*?</script>#is', '', $cleanHtml);
|
$cleanHtml = preg_replace('#<script[^>]*>.*?</script>#is', '', $cleanHtml);
|
||||||
$cleanHtml = preg_replace('#<style[^>]*>.*?</style>#is', '', $cleanHtml);
|
$cleanHtml = preg_replace('#<style[^>]*>.*?</style>#is', '', $cleanHtml);
|
||||||
$cleanHtml = preg_replace('#<!--.*?-->#s', '', $cleanHtml);
|
$cleanHtml = preg_replace('#<!--.*?-->#s', '', $cleanHtml);
|
||||||
|
$cleanHtml = preg_replace('#<svg[^>]*>.*?</svg>#is', '', $cleanHtml);
|
||||||
|
$cleanHtml = preg_replace('#<noscript[^>]*>.*?</noscript>#is', '', $cleanHtml);
|
||||||
|
$cleanHtml = preg_replace('#<iframe[^>]*>.*?</iframe>#is', '', $cleanHtml);
|
||||||
|
$cleanHtml = preg_replace('#<nav[^>]*>.*?</nav>#is', '', $cleanHtml);
|
||||||
|
$cleanHtml = preg_replace('#<footer[^>]*>.*?</footer>#is', '', $cleanHtml);
|
||||||
|
$cleanHtml = preg_replace('#<form[^>]*>.*?</form>#is', '', $cleanHtml);
|
||||||
|
$cleanHtml = preg_replace('#<(input|button|select|textarea|option|label|fieldset|legend|datalist|output)[^>]*/?>#is', '', $cleanHtml);
|
||||||
|
$cleanHtml = preg_replace('#<(meta|link|base|source|track|wbr)[^>]*/?>#is', '', $cleanHtml);
|
||||||
|
// Strip class/style/data/id/aria attributes to reduce size
|
||||||
|
$cleanHtml = preg_replace('#\s+(class|style|data-[a-z-]+|id|aria-[a-z-]+|role|tabindex|onclick|onload|loading)="[^"]*"#i', '', $cleanHtml);
|
||||||
|
$cleanHtml = preg_replace("#\s+(class|style|data-[a-z-]+|id|aria-[a-z-]+|role|tabindex)='[^']*'#i", '', $cleanHtml);
|
||||||
|
// Collapse whitespace
|
||||||
|
$cleanHtml = preg_replace('#\s{2,}#', ' ', $cleanHtml);
|
||||||
|
$cleanHtml = preg_replace('#>\s+<#', '><', $cleanHtml);
|
||||||
$combinedHtml .= "\n--- PAGE: " . $menuPage['url'] . " ---\n" . $cleanHtml;
|
$combinedHtml .= "\n--- PAGE: " . $menuPage['url'] . " ---\n" . $cleanHtml;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1634,8 +1865,11 @@ try {
|
||||||
$combinedHtml .= "\n\n=== EMBEDDED JSON DATA (may contain full menu) ===\n" . $embeddedJsonData;
|
$combinedHtml .= "\n\n=== EMBEDDED JSON DATA (may contain full menu) ===\n" . $embeddedJsonData;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (strlen($combinedHtml) > 100000) {
|
$response['steps'][] = "Combined HTML size after stripping: " . strlen($combinedHtml) . " bytes";
|
||||||
$combinedHtml = substr($combinedHtml, 0, 100000);
|
|
||||||
|
if (strlen($combinedHtml) > 200000) {
|
||||||
|
$combinedHtml = substr($combinedHtml, 0, 200000);
|
||||||
|
$response['steps'][] = "Truncated to 200KB";
|
||||||
}
|
}
|
||||||
|
|
||||||
// Server-side heading hierarchy detection
|
// Server-side heading hierarchy detection
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue