Add discovery + multi-page extract modes for setup wizard
- Discovery mode: quick Playwright crawl returns detected menu sub-pages
- Extract_page mode: processes a single menu page through Claude individually
- More aggressive HTML stripping: removes SVG, nav, footer, form, attributes
- Increased truncation limit from 100KB to 200KB for generic fallback path
- Enables interactive wizard flow: discover → confirm → extract each page

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
552c404cf6
commit
4ac13de09d
1 changed files with 237 additions and 3 deletions
|
|
@ -36,6 +36,222 @@ try {
|
|||
$data = readJsonBody();
|
||||
if (empty($data)) throw new Exception('No request body provided');
|
||||
|
||||
// ============================================================
|
||||
// DISCOVERY MODE: Crawl site, detect menu sub-pages, return without Claude
|
||||
// ============================================================
|
||||
if (!empty($data['mode']) && $data['mode'] === 'discover' && !empty($data['url'])) {
    // Discovery mode: run the Playwright crawler once against the given URL and
    // report the menu sub-pages / ordering-platform pages it visited. Claude is
    // not called in this branch.
    //
    // Request keys read:  mode ('discover'), url
    // Response keys:      OK, mode, siteName, mainUrl, menuPages, platformPages,
    //                     hasPlatform, totalPagesFound
    // Throws Exception when the URL is not a string or when Playwright returns
    // nothing, returns non-JSON output, or reports an error.
    if (!is_string($data['url'])) {
        throw new Exception('url must be a string');
    }
    $discoverUrl = trim($data['url']);

    // stderr is merged into stdout, so anything the runner prints there ends up
    // in $pwOutput and is surfaced by the invalid-JSON error below.
    $pwOutput = shell_exec("/opt/playwright/run.sh " . escapeshellarg($discoverUrl) . " 8000 2>&1");
    $pwOutput = is_string($pwOutput) ? trim($pwOutput) : '';
    if ($pwOutput === '') {
        throw new Exception("Playwright returned empty response");
    }

    // json_decode() yields null for non-JSON output; without this check the
    // request would "succeed" with an empty page list.
    $pwResult = json_decode($pwOutput, true);
    if (!is_array($pwResult)) {
        throw new Exception("Playwright returned invalid JSON: " . substr($pwOutput, 0, 200));
    }
    if (isset($pwResult['error'])) {
        $pwError = is_string($pwResult['error']) ? $pwResult['error'] : json_encode($pwResult['error']);
        throw new Exception("Playwright error: " . $pwError);
    }

    $subPages = is_array($pwResult['subPagesVisited'] ?? null) ? $pwResult['subPagesVisited'] : [];
    $platformPages = is_array($pwResult['platformPagesVisited'] ?? null) ? $pwResult['platformPagesVisited'] : [];

    // Extract business name from title, dropping a trailing "- Menu", "| Home", etc.
    $html = is_string($pwResult['html'] ?? null) ? $pwResult['html'] : '';
    $siteName = '';
    if (preg_match('#<title[^>]*>([^<]+)</title>#i', $html, $tm)) {
        $siteName = trim($tm[1]);
        $siteName = preg_replace('#\s*[-|]+\s*(Menu|Order|Online|Home|Welcome|Restaurant).*$#i', '', $siteName) ?? $siteName;
        $siteName = trim($siteName);
    }

    // Extract menu names from sub-page URLs: the last path segment becomes the
    // slug, and the slug with dashes/underscores turned into spaces is the name.
    $menuPages = [];
    foreach ($subPages as $spUrl) {
        if (!is_string($spUrl) || $spUrl === '') {
            continue;
        }
        // parse_url() returns null (no path) or false (malformed URL); treat both as "no slug".
        $path = parse_url($spUrl, PHP_URL_PATH);
        $slug = is_string($path) ? trim($path, '/') : '';
        if (str_contains($slug, '/')) {
            $slug = basename($slug);
        }
        $menuName = ucwords(str_replace(['-', '_'], ' ', $slug));
        $menuPages[] = ['url' => $spUrl, 'name' => $menuName, 'slug' => $slug];
    }

    // Check for ordering platform links
    $hasPlatform = count($platformPages) > 0;

    jsonResponse([
        'OK' => true,
        'mode' => 'discover',
        'siteName' => $siteName,
        'mainUrl' => $discoverUrl,
        'menuPages' => $menuPages,
        'platformPages' => $platformPages,
        'hasPlatform' => $hasPlatform,
        'totalPagesFound' => count($menuPages),
    ]);
}
|
||||
|
||||
// ============================================================
|
||||
// MULTI-PAGE MODE: Process specific URLs individually through Claude
|
||||
// ============================================================
|
||||
if (!empty($data['mode']) && $data['mode'] === 'extract_page' && !empty($data['url'])) {
    // Process a single menu page through Playwright + Claude.
    // The frontend calls this once per confirmed menu page.
    //
    // Request keys read:  mode ('extract_page'), url, menuName (optional)
    // Response keys:      OK, mode, menuName, url, DATA{business, categories,
    //                     items, modifiers}, itemCount, categoryCount
    // Throws Exception on Playwright failure, Claude API failure, or when the
    // model's reply cannot be parsed as JSON.
    if (!is_string($data['url'])) {
        throw new Exception('url must be a string');
    }
    $singleUrl = trim($data['url']);
    // menuName is optional; anything non-scalar is ignored instead of crashing strlen() later.
    $menuName = is_scalar($data['menuName'] ?? null) ? (string) $data['menuName'] : '';

    // stderr is merged into stdout, so runner diagnostics show up in the
    // invalid-JSON error below.
    $pwOutput = shell_exec("/opt/playwright/run.sh " . escapeshellarg($singleUrl) . " 8000 2>&1");
    $pwOutput = is_string($pwOutput) ? trim($pwOutput) : '';
    if ($pwOutput === '') {
        throw new Exception("Playwright returned empty response for $singleUrl");
    }
    $pwResult = json_decode($pwOutput, true);
    if (!is_array($pwResult)) {
        throw new Exception("Playwright returned invalid JSON for $singleUrl: " . substr($pwOutput, 0, 200));
    }
    if (isset($pwResult['error'])) {
        $pwError = is_string($pwResult['error']) ? $pwResult['error'] : json_encode($pwResult['error']);
        throw new Exception("Playwright error: " . $pwError);
    }

    $singleHtml = is_string($pwResult['html'] ?? null) ? $pwResult['html'] : '';
    // Strip to just the main page (no sub-page following for single-page extract)
    $marker = strpos($singleHtml, '<!-- === SUB-PAGE:');
    if ($marker !== false) {
        $singleHtml = substr($singleHtml, 0, $marker);
    }

    // Aggressive cleanup. preg_replace() returns null when PCRE gives up (for
    // example on hitting the backtrack limit); in that case the rule is skipped
    // so the page text is kept instead of being replaced by null.
    $cleanupRules = [
        ['#<script[^>]*>.*?</script>#is', ''],
        ['#<style[^>]*>.*?</style>#is', ''],
        ['#<!--.*?-->#s', ''],
        ['#<svg[^>]*>.*?</svg>#is', ''],
        ['#<nav[^>]*>.*?</nav>#is', ''],
        ['#<footer[^>]*>.*?</footer>#is', ''],
        ['#<form[^>]*>.*?</form>#is', ''],
        ['#\s+(class|style|data-[a-z-]+|id|aria-[a-z-]+|role|tabindex)="[^"]*"#i', ''],
        ['#\s{2,}#', ' '],
        ['#>\s+<#', '><'],
    ];
    foreach ($cleanupRules as [$pattern, $replacement]) {
        $cleaned = preg_replace($pattern, $replacement, $singleHtml);
        if (is_string($cleaned)) {
            $singleHtml = $cleaned;
        }
    }
    // Byte-wise cut: may split a multi-byte character. That is tolerated because
    // the request is encoded with JSON_INVALID_UTF8_SUBSTITUTE further down.
    if (strlen($singleHtml) > 100000) {
        $singleHtml = substr($singleHtml, 0, 100000);
    }

    // Extract images from this page
    $skipImagePattern = '#(icon|favicon|logo|sprite|pixel|tracking|badge|button|\.svg)#i';
    $origin = preg_replace('#^(https?://[^/]+).*#', '$1', $singleUrl) ?? $singleUrl;
    $pageScheme = parse_url($singleUrl, PHP_URL_SCHEME);
    $pageScheme = is_string($pageScheme) && $pageScheme !== '' ? $pageScheme : 'https';

    $singleImages = [];
    if (preg_match_all('#<img[^>]+src=["\']([^"\']+)["\'][^>]*>#i', $singleHtml, $imgMatches, PREG_SET_ORDER)) {
        foreach ($imgMatches as $imgMatch) {
            // Attribute values are HTML-escaped (e.g. "&amp;" in query strings).
            $imgSrc = trim(html_entity_decode($imgMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8'));
            if ($imgSrc === '' || preg_match($skipImagePattern, $imgSrc)) {
                continue;
            }
            if (stripos($imgSrc, 'data:') === 0) {
                continue; // inline data URIs are not downloadable URLs
            }
            if (str_starts_with($imgSrc, '//')) {
                // Protocol-relative URL: inherit the page's scheme rather than
                // gluing it onto the origin.
                $imgSrc = $pageScheme . ':' . $imgSrc;
            } elseif (!preg_match('#^https?://#i', $imgSrc)) {
                $imgSrc = str_starts_with($imgSrc, '/') ? $origin . $imgSrc : $origin . '/' . $imgSrc;
            }
            $singleImages[] = $imgSrc;
        }
    }
    // Also add Playwright-captured images
    $pwImages = is_array($pwResult['images'] ?? null) ? $pwResult['images'] : [];
    foreach ($pwImages as $pwImg) {
        if (is_string($pwImg) && $pwImg !== '' && !preg_match($skipImagePattern, $pwImg)) {
            $singleImages[] = $pwImg;
        }
    }
    $singleImages = array_values(array_unique($singleImages));

    // Identify the image format from its leading bytes. The media type sent to
    // Claude must match the actual bytes, and the Content-Type header cannot be
    // trusted for that (servers omit it, or return an HTML error page).
    $sniffImageType = static function (string $bytes): ?string {
        if (strncmp($bytes, "\xFF\xD8\xFF", 3) === 0) {
            return 'image/jpeg';
        }
        if (strncmp($bytes, "\x89PNG\r\n\x1A\n", 8) === 0) {
            return 'image/png';
        }
        if (strncmp($bytes, 'GIF87a', 6) === 0 || strncmp($bytes, 'GIF89a', 6) === 0) {
            return 'image/gif';
        }
        if (strncmp($bytes, 'RIFF', 4) === 0 && substr($bytes, 8, 4) === 'WEBP') {
            return 'image/webp';
        }
        return null;
    };

    // Download up to 5 images for this page (best effort: failures are skipped)
    $imgContent = [];
    foreach (array_slice($singleImages, 0, 5) as $imgUrl) {
        $ch = curl_init($imgUrl);
        if ($ch === false) {
            continue;
        }
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT => 10,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => 5,
            // Image URLs come from untrusted page content: only allow HTTP(S),
            // including after redirects.
            CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
            CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
        ]);
        $imgData = curl_exec($ch);
        curl_close($ch);

        if (!is_string($imgData) || strlen($imgData) <= 1000 || strlen($imgData) >= 5000000) {
            continue;
        }
        $mt = $sniffImageType($imgData);
        if ($mt === null) {
            continue; // not a JPEG/PNG/GIF/WebP payload
        }
        $imgContent[] = [
            'source' => ['type' => 'base64', 'media_type' => $mt, 'data' => base64_encode($imgData)],
            'url' => $imgUrl,
        ];
    }

    // Send to Claude for this single menu page
    $singlePrompt = 'You are an expert at extracting structured menu data from restaurant website HTML. Extract ALL menu items visible in the HTML. Return valid JSON with these keys: business (object with name, address, phone, hours, brandColor), categories (array of category name strings), items (array of objects with name, description, price, category, imageUrl). CATEGORIES vs ITEMS (CRITICAL): A CATEGORY is a broad section heading that groups items. An ITEM is an individual food/drink with name, description, and price. Each item must have a \'category\' field. For prices: numbers (e.g., 12.99). For imageUrl: the full src URL of the food image from the HTML. For brandColor: suggest a vibrant hex (6 digits, no hash). Return ONLY valid JSON.';

    $msgContent = [];
    foreach ($imgContent as $ic) {
        $msgContent[] = ['type' => 'image', 'source' => $ic['source']];
    }
    $menuNameHint = strlen($menuName) ? "\n\nThis is the \"$menuName\" menu page." : '';
    $msgContent[] = ['type' => 'text', 'text' => "Extract all menu items from this page.$menuNameHint\n\nHTML:\n\n$singleHtml"];

    $claudeReq = [
        'model' => 'claude-sonnet-4-20250514',
        'max_tokens' => 16384,
        'temperature' => 0,
        'system' => $singlePrompt,
        'messages' => [['role' => 'user', 'content' => $msgContent]],
    ];

    // json_encode() returns false on malformed UTF-8 (scraped pages, or the byte
    // truncation above); substitute bad sequences instead of sending no body.
    $claudeBody = json_encode($claudeReq, JSON_INVALID_UTF8_SUBSTITUTE);
    if ($claudeBody === false) {
        throw new Exception('Failed to encode Claude request: ' . json_last_error_msg());
    }

    // Minimal POST helper. Returns ['code' => HTTP status (0 on transport
    // failure), 'body' => response text, 'error' => curl error text or ''].
    $httpPost = static function (string $url, string $body, array $headers = [], int $timeout = 120): array {
        $ch = curl_init($url);
        if ($ch === false) {
            return ['code' => 0, 'body' => '', 'error' => 'curl_init failed'];
        }
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_POST => true,
            CURLOPT_POSTFIELDS => $body,
            CURLOPT_HTTPHEADER => $headers,
            CURLOPT_TIMEOUT => $timeout,
        ]);
        $resp = curl_exec($ch);
        $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $error = $resp === false ? curl_error($ch) : '';
        curl_close($ch);
        return ['code' => $code, 'body' => is_string($resp) ? $resp : '', 'error' => $error];
    };

    $claudeResult = $httpPost(
        'https://api.anthropic.com/v1/messages',
        $claudeBody,
        ['Content-Type: application/json', "x-api-key: $CLAUDE_API_KEY", 'anthropic-version: 2023-06-01'],
        120
    );

    if ($claudeResult['code'] !== 200) {
        $errData = json_decode($claudeResult['body'], true);
        $errorDetail = $errData['error']['message'] ?? substr($claudeResult['body'], 0, 500);
        if ($errorDetail === '' && $claudeResult['error'] !== '') {
            // Transport-level failure (timeout, DNS, TLS): there is no HTTP body to show.
            $errorDetail = $claudeResult['error'];
        }
        throw new Exception("Claude API error: {$claudeResult['code']} - $errorDetail");
    }

    // Use the first text block of the reply.
    $claudeResponse = json_decode($claudeResult['body'], true);
    $responseText = '';
    $contentBlocks = is_array($claudeResponse['content'] ?? null) ? $claudeResponse['content'] : [];
    foreach ($contentBlocks as $block) {
        if (is_array($block) && ($block['type'] ?? '') === 'text') {
            $responseText = (string) ($block['text'] ?? '');
            break;
        }
    }

    // Clean Claude JSON: drop a surrounding ```json ... ``` fence if present
    $responseText = trim($responseText);
    $responseText = preg_replace('/^```(?:json)?\s*/i', '', $responseText) ?? $responseText;
    $responseText = preg_replace('/\s*```\s*$/', '', $responseText) ?? $responseText;

    $menuData = json_decode($responseText, true);
    if (!is_array($menuData)) {
        // Fallback in case the JSON object is wrapped in extra text: retry on
        // the span from the first "{" to the last "}".
        $jsonStart = strpos($responseText, '{');
        $jsonEnd = strrpos($responseText, '}');
        if ($jsonStart !== false && $jsonEnd !== false && $jsonEnd > $jsonStart) {
            $menuData = json_decode(substr($responseText, $jsonStart, $jsonEnd - $jsonStart + 1), true);
        }
    }
    if (!is_array($menuData)) {
        $truncatedHint = ($claudeResponse['stop_reason'] ?? '') === 'max_tokens'
            ? ' (response was cut off at max_tokens)'
            : '';
        throw new Exception("Failed to parse Claude response as JSON" . $truncatedHint);
    }

    // Normalise list fields so count() and the tagging loop never see a non-array.
    $menuItems = is_array($menuData['items'] ?? null) ? $menuData['items'] : [];
    $menuCategories = is_array($menuData['categories'] ?? null) ? $menuData['categories'] : [];

    // Tag items with menu name
    if (strlen($menuName)) {
        foreach ($menuItems as &$item) {
            if (is_array($item)) {
                $item['menu'] = $menuName;
            }
        }
        unset($item);
    }

    jsonResponse([
        'OK' => true,
        'mode' => 'extract_page',
        'menuName' => $menuName,
        'url' => $singleUrl,
        'DATA' => [
            'business' => $menuData['business'] ?? [],
            'categories' => $menuCategories,
            'items' => $menuItems,
            'modifiers' => $menuData['modifiers'] ?? [],
        ],
        'itemCount' => count($menuItems),
        'categoryCount' => count($menuCategories),
    ]);
}
|
||||
|
||||
$response['steps'] = [];
|
||||
$response['debug'] = [
|
||||
'hasHtmlKey' => isset($data['html']),
|
||||
|
|
@ -1620,13 +1836,28 @@ try {
|
|||
$response['DEBUG_EMBEDDED_JSON_FOUND'] = false;
|
||||
}
|
||||
|
||||
// Combine HTML, strip aggressively to keep menu content.
// Each entry of $menuPages is read for its 'html' and 'url' keys; the cleaned
// pages are appended to $combinedHtml, each behind a "--- PAGE: <url> ---" marker.
$combinedHtml = '';
$htmlStripRules = [
    // Remove non-content elements (with their contents)
    ['#<script[^>]*>.*?</script>#is', ''],
    ['#<style[^>]*>.*?</style>#is', ''],
    ['#<!--.*?-->#s', ''],
    ['#<svg[^>]*>.*?</svg>#is', ''],
    ['#<noscript[^>]*>.*?</noscript>#is', ''],
    ['#<iframe[^>]*>.*?</iframe>#is', ''],
    ['#<nav[^>]*>.*?</nav>#is', ''],
    ['#<footer[^>]*>.*?</footer>#is', ''],
    ['#<form[^>]*>.*?</form>#is', ''],
    // Form-control tags: strip opening AND closing tags (inner text is kept) so
    // no orphaned </button>, </label>, ... remain. \b stops prefix matches.
    ['#</?(input|button|select|textarea|option|label|fieldset|legend|datalist|output)\b[^>]*>#is', ''],
    ['#<(meta|link|base|source|track|wbr)\b[^>]*>#is', ''],
    // Strip class/style/data/id/aria attributes to reduce size; the same
    // attribute list is applied to double- and single-quoted values.
    ['#\s+(class|style|data-[a-z-]+|id|aria-[a-z-]+|role|tabindex|onclick|onload|loading)="[^"]*"#i', ''],
    ['#\s+(class|style|data-[a-z-]+|id|aria-[a-z-]+|role|tabindex|onclick|onload|loading)=\'[^\']*\'#i', ''],
    // Collapse whitespace
    ['#\s{2,}#', ' '],
    ['#>\s+<#', '><'],
];
foreach ($menuPages as $menuPage) {
    $cleanHtml = (string) ($menuPage['html'] ?? '');
    foreach ($htmlStripRules as [$stripPattern, $stripReplacement]) {
        // preg_replace() returns null when PCRE fails (e.g. backtrack limit on a
        // very large page); skip that rule rather than losing the whole page.
        $strippedHtml = preg_replace($stripPattern, $stripReplacement, $cleanHtml);
        if (is_string($strippedHtml)) {
            $cleanHtml = $strippedHtml;
        }
    }
    $combinedHtml .= "\n--- PAGE: " . $menuPage['url'] . " ---\n" . $cleanHtml;
}
|
||||
|
||||
|
|
@ -1634,8 +1865,11 @@ try {
|
|||
$combinedHtml .= "\n\n=== EMBEDDED JSON DATA (may contain full menu) ===\n" . $embeddedJsonData;
|
||||
}
|
||||
|
||||
// Record the stripped size, then cap what is passed on at 200KB.
// The cut is byte-wise (strlen/substr), so it may end inside a multi-byte character.
$response['steps'][] = "Combined HTML size after stripping: " . strlen($combinedHtml) . " bytes";

if (strlen($combinedHtml) > 200000) {
    $combinedHtml = substr($combinedHtml, 0, 200000);
    $response['steps'][] = "Truncated to 200KB";
}
|
||||
|
||||
// Server-side heading hierarchy detection
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue