- Updated Claude prompt to detect separate menus vs categories - Added platformImageMap and subPagesVisited parsing from Playwright - Bumped Playwright wait from 5s to 10s for sub-page crawling - saveWizard.php creates separate Menus rows and assigns categories/items to the correct menu based on each item's "menu" field Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1940 lines
108 KiB
PHP
1940 lines
108 KiB
PHP
<?php
|
|
require_once __DIR__ . '/../helpers.php';
|
|
runAuth();
|
|
|
|
/**
|
|
* Analyze Menu URL
|
|
*
|
|
* Extracts menu data from restaurant websites via multiple strategies:
|
|
* - Grubhub API fast path
|
|
* - Toast __OO_STATE__ parsing (uploaded HTML or live URL)
|
|
* - WooCommerce Playwright extraction
|
|
* - DoorDash embedded JSON extraction
|
|
* - Claude API fallback for generic pages
|
|
*
|
|
* POST JSON: { url: "https://...", html: "..." }
|
|
*/
|
|
|
|
set_time_limit(300);
|
|
|
|
$response = ['OK' => false];
|
|
|
|
try {
|
|
// Load the Claude API key from config/claude.json (two directories above this script).
// The key is mandatory: the request is aborted when the file is missing, unreadable
// as JSON, or has no non-empty "apiKey" entry.
$CLAUDE_API_KEY = '';
$configPath = realpath(__DIR__ . '/../../config/claude.json');

if ($configPath && file_exists($configPath)) {
    $configData = json_decode(file_get_contents($configPath), true);
    $CLAUDE_API_KEY = !empty($configData['apiKey']) ? $configData['apiKey'] : $CLAUDE_API_KEY;
}

if (empty($CLAUDE_API_KEY)) {
    throw new Exception('Claude API key not configured');
}
|
|
|
|
// Read the JSON request body; an empty body is a hard error.
$data = readJsonBody();
if (empty($data)) {
    throw new Exception('No request body provided');
}

// Progress log plus a small snapshot of what the caller actually sent.
$hasHtmlKey = isset($data['html']);
$response['steps'] = [];
$response['debug'] = [
    'hasHtmlKey' => $hasHtmlKey,
    'hasUrlKey' => isset($data['url']),
    'htmlLength' => $hasHtmlKey ? strlen($data['html']) : 0,
    'urlValue' => $data['url'] ?? '',
];

// Working state filled in by whichever extraction strategy runs below.
$pageHtml = $baseUrl = $basePath = $targetUrl = '';
$playwrightImages = [];
|
|
|
|
// Helper: filesystem directory that backs the site root (differs between dev and production).
if (isDev()) {
    $webroot = '/opt/lucee/tomcat/webapps/ROOT';
} else {
    $webroot = '/var/www/biz.payfrit.com';
}

// Helper: turn a site-relative URL path into a local file path.
// The path is appended to the webroot verbatim; no normalisation or
// containment check happens here, so callers must validate the result.
$expandPath = fn(string $urlPath): string => $webroot . $urlPath;
|
|
|
|
// Helper: convert a 24h clock time to a compact 12h string.
//
// @param int $h Hour. Normalised into 0-23 first, so 24 (end-of-day midnight)
//               becomes "12am" rather than "12pm".
// @param int $m Minute. Omitted from the output when it is 0 (or negative).
// @return string e.g. "9am", "1:05pm", "12am".
$formatTime12h = function (int $h, int $m): string {
    // Wrap out-of-range hours (24, negatives) back onto the 24h dial.
    $h = (($h % 24) + 24) % 24;

    $suffix = $h >= 12 ? 'pm' : 'am';
    $hour12 = $h % 12 === 0 ? 12 : $h % 12;
    $minutes = $m > 0 ? ':' . str_pad((string)$m, 2, '0', STR_PAD_LEFT) : '';

    return $hour12 . $minutes . $suffix;
};
|
|
|
|
// Helper: pull a string value out of JSON that is itself embedded in a string,
// i.e. where every quote appears as the two characters backslash + double quote.
$BQ = '\\"'; // backslash-quote as it appears in HTML

// Finds the first occurrence (at or after $startPos) of the escaped-JSON marker
// for "<key>":"  and returns the text up to the next backslash-quote.
// The key lookup is case-insensitive. Returns null when the key is absent,
// the value is unterminated, or the value is empty.
$extractBqValue = function (string $text, string $key, int $startPos = 0) use ($BQ): ?string {
    $needle = $BQ . $key . $BQ . ':' . $BQ;

    $hit = stripos($text, $needle, $startPos);
    if ($hit === false) {
        return null;
    }

    $from = $hit + strlen($needle);
    $to = strpos($text, $BQ, $from);

    return ($to !== false && $to > $from)
        ? substr($text, $from, $to - $from)
        : null;
};
|
|
|
|
// Helper: extract the window.__OO_STATE__ object literal using brace-counting.
//
// Scans from the first "{" after the marker and tracks nesting depth, ignoring
// braces that sit inside double-quoted strings (with backslash escapes).
//
// If the extracted slice is not valid JSON as-is, it is assumed to come from an
// entity-encoded copy of the page (e.g. a saved "View Source") and the four basic
// HTML entities (amp, lt, gt, quot) are decoded in a single pass. A slice that
// already parses is returned untouched so legitimate entities inside menu text
// cannot corrupt the JSON.
//
// @param string $html Page HTML.
// @return string|null The JSON text, or null when the marker or a balanced object is not found.
$extractOoState = function (string $html): ?string {
    $ooStart = stripos($html, 'window.__OO_STATE__');
    if ($ooStart === false) {
        return null;
    }

    $braceStart = strpos($html, '{', $ooStart);
    if ($braceStart === false) {
        return null;
    }

    $depth = 0;
    $inStr = false;
    $esc = false;
    $braceEnd = null;
    $totalLen = strlen($html);

    for ($i = $braceStart; $i < $totalLen; $i++) {
        $ch = $html[$i];

        if ($esc) {
            // Character following a backslash inside a string: never structural.
            $esc = false;
            continue;
        }
        if ($ch === '\\' && $inStr) {
            $esc = true;
            continue;
        }
        if ($ch === '"') {
            $inStr = !$inStr;
            continue;
        }
        if ($inStr) {
            continue;
        }

        if ($ch === '{') {
            $depth++;
        } elseif ($ch === '}') {
            $depth--;
            if ($depth === 0) {
                $braceEnd = $i;
                break;
            }
        }
    }

    if ($braceEnd === null) {
        return null;
    }

    $json = substr($html, $braceStart, $braceEnd - $braceStart + 1);

    // Decode HTML entities only when the raw slice does not parse on its own.
    json_decode($json);
    if (json_last_error() !== JSON_ERROR_NONE) {
        $json = htmlspecialchars_decode($json, ENT_COMPAT);
    }

    return $json;
};
|
|
|
|
// Helper: read a Toast item's price, trying each known field in priority order:
//   1. prices[0]                      (first entry of a non-empty array)
//   2. price, unitPrice, basePrice    (first one that is numeric)
//   3. displayPrice                   (formatted text; everything but digits and "." is stripped)
// Returns 0.0 when none of them yields a numeric value.
$extractToastPrice = function (array $item): float {
    $priceList = $item['prices'] ?? null;
    if (is_array($priceList) && !empty($priceList) && is_numeric($priceList[0] ?? null)) {
        return (float)$priceList[0];
    }

    foreach (['price', 'unitPrice', 'basePrice'] as $field) {
        $candidate = $item[$field] ?? null;
        if (is_numeric($candidate)) {
            return (float)$candidate;
        }
    }

    if (isset($item['displayPrice'])) {
        $display = (string)$item['displayPrice'];
        if (trim($display) !== '') {
            $digits = preg_replace('/[^0-9.]/', '', $display);
            if (is_numeric($digits)) {
                return (float)$digits;
            }
        }
    }

    return 0.0;
};
|
|
|
|
// Helper: pick a Toast item's image URL from its imageUrls map.
// Preference order is medium, then large, then small; returns '' when the
// map is missing, is not an array, or has none of those sizes set.
$extractToastImage = function (array $item): string {
    $variants = $item['imageUrls'] ?? null;
    if (!is_array($variants)) {
        return '';
    }

    foreach (['medium', 'large', 'small'] as $size) {
        if (isset($variants[$size])) {
            return $variants[$size];
        }
    }

    return '';
};
|
|
|
|
// Helper: clean a Claude text reply down to a JSON object string that json_decode can parse.
//
// Steps, in order:
//   - strip a leading/trailing markdown code fence
//   - skip any prose before the first "{"
//   - drop anything after the last "}" (closing fence, trailing commentary)
//   - remove trailing commas before "]" or "}"
//   - remove ASCII control characters other than tab, LF and CR
//   - replace typographic single quotes, en/em dashes and the ellipsis with ASCII
//
// The result is NOT guaranteed to be valid JSON; callers must still check json_decode.
// Text that contains no "{" at all is returned with only the character-level cleanup.
//
// @param string $text Raw model output.
// @return string Cleaned text.
$cleanClaudeJson = function (string $text): string {
    $text = trim($text);

    // Strip markdown code fences
    if (str_starts_with($text, '```json')) {
        $text = substr($text, 7);
    }
    if (str_starts_with($text, '```')) {
        $text = substr($text, 3);
    }
    if (str_ends_with($text, '```')) {
        $text = substr($text, 0, -3);
    }
    $text = trim($text);

    // Extract JSON object if text doesn't start with {
    if (!str_starts_with($text, '{')) {
        $jsonStart = strpos($text, '{');
        if ($jsonStart !== false) {
            $text = trim(substr($text, $jsonStart));
            if (str_ends_with($text, '```')) {
                $text = trim(substr($text, 0, -3));
            }
        }
    }

    // Cut whatever follows the final closing brace. A reply that is already a
    // well-formed object ends on that brace, so it is left unchanged.
    if (str_starts_with($text, '{')) {
        $jsonEnd = strrpos($text, '}');
        if ($jsonEnd !== false) {
            $text = substr($text, 0, $jsonEnd + 1);
        }
    }

    // Remove trailing commas before ] or } (preg_replace yields null on a PCRE error;
    // keep the previous text in that case instead of violating the return type)
    $text = preg_replace('/,(\s*[\]\}])/', '$1', $text) ?? $text;

    // Remove control characters
    $text = preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F]/', '', $text) ?? $text;

    // Clean smart quotes/dashes
    $text = str_replace(["\xe2\x80\x98", "\xe2\x80\x99"], "'", $text); // smart single quotes
    $text = str_replace(["\xe2\x80\x93", "\xe2\x80\x94"], "-", $text); // en/em dash
    $text = str_replace("\xe2\x80\xa6", "...", $text); // ellipsis

    return $text;
};
|
|
|
|
// Helper: guess an image's MIME type from the first characters of its base64 form.
// Recognises the PNG, GIF and WebP prefixes below; anything else is reported as JPEG.
$detectMediaType = function (string $base64): string {
    $signatures = [
        'iVBO' => 'image/png',
        'R0lGOD' => 'image/gif',
        'UklGR' => 'image/webp',
    ];

    foreach ($signatures as $prefix => $mime) {
        if (str_starts_with($base64, $prefix)) {
            return $mime;
        }
    }

    return 'image/jpeg';
};
|
|
|
|
// Helper: HTTP GET with curl (redirects are followed).
//
// @param string $url     Request URL.
// @param array  $headers Raw header lines, e.g. ["Authorization: Bearer ..."].
// @param int    $timeout Total timeout in seconds.
// @return array ['body' => response body as returned by curl_exec (false on transport failure),
//                'code' => HTTP status code,
//                'contentType' => Content-Type reported by curl, '' when it is null]
$httpGet = function (string $url, array $headers = [], int $timeout = 30): array {
    $options = [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT => $timeout,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_HTTPHEADER => $headers,
    ];

    $handle = curl_init($url);
    curl_setopt_array($handle, $options);

    $payload = curl_exec($handle);
    $status = curl_getinfo($handle, CURLINFO_HTTP_CODE);
    $mime = curl_getinfo($handle, CURLINFO_CONTENT_TYPE);
    curl_close($handle);

    return [
        'body' => $payload,
        'code' => $status,
        'contentType' => $mime ?? '',
    ];
};
|
|
|
|
// Helper: HTTP POST with curl (redirects are not followed).
//
// @param string $url     Request URL.
// @param string $body    Raw request body, sent as-is.
// @param array  $headers Raw header lines, e.g. ["Content-Type: application/json"].
// @param int    $timeout Total timeout in seconds.
// @return array ['body' => response body as returned by curl_exec (false on transport failure),
//                'code' => HTTP status code]
$httpPost = function (string $url, string $body, array $headers = [], int $timeout = 30): array {
    $options = [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST => true,
        CURLOPT_POSTFIELDS => $body,
        CURLOPT_TIMEOUT => $timeout,
        CURLOPT_HTTPHEADER => $headers,
    ];

    $handle = curl_init($url);
    curl_setopt_array($handle, $options);

    $payload = curl_exec($handle);
    $status = curl_getinfo($handle, CURLINFO_HTTP_CODE);
    curl_close($handle);

    return ['body' => $payload, 'code' => $status];
};
|
|
|
|
// ============================================================
|
|
// Parse request: HTML content or URL
|
|
// ============================================================
|
|
|
|
if (!empty($data['html'])) {
|
|
$pageHtml = trim($data['html']);
|
|
$response['steps'][] = "Using provided HTML content: " . strlen($pageHtml) . " bytes";
|
|
} elseif (!empty($data['url'])) {
|
|
$targetUrl = trim($data['url']);
|
|
if (!preg_match('#^https?://#i', $targetUrl)) {
|
|
$targetUrl = 'https://' . $targetUrl;
|
|
}
|
|
|
|
// ========== GRUBHUB FAST PATH ==========
|
|
if (preg_match('#grubhub\.com/restaurant/#i', $targetUrl)) {
|
|
$response['steps'][] = "Grubhub URL detected - using API";
|
|
|
|
// Extract restaurant ID: the last all-digit path segment of the URL.
// An optional trailing slash, query string or #fragment after the ID is tolerated,
// so links copied from the browser address bar still resolve.
if (!preg_match('~/(\d+)/?(?:[?#]|$)~', $targetUrl, $ghIdMatch)) {
    throw new Exception('Could not extract Grubhub restaurant ID from URL');
}
$ghRestaurantId = $ghIdMatch[1];
$response['steps'][] = "Grubhub restaurant ID: $ghRestaurantId";
|
|
|
|
// Get anonymous access token
$ghAuth = $httpPost(
    'https://api-gtm.grubhub.com/auth',
    '{"brand":"GRUBHUB","client_id":"beta_UmWlpstzQSFmocLy3h1UieYcVST","scope":"anonymous"}',
    ['Content-Type: application/json'],
    15
);
if ($ghAuth['code'] !== 200) {
    throw new Exception("Grubhub auth failed: {$ghAuth['code']}");
}

// A 200 response does not guarantee the expected payload: verify the token is
// present before it is interpolated into the Authorization header below.
$ghAuthData = json_decode((string)$ghAuth['body'], true);
$ghToken = is_array($ghAuthData) ? ($ghAuthData['session_handle']['access_token'] ?? null) : null;
if (!is_string($ghToken) || $ghToken === '') {
    throw new Exception('Grubhub auth response did not contain an access token');
}
$response['steps'][] = "Got Grubhub anonymous token";
|
|
|
|
// Fetch restaurant with full menu data
|
|
$ghMenu = $httpGet(
|
|
"https://api-gtm.grubhub.com/restaurants/$ghRestaurantId?hideChoiceCategories=false&version=4&orderType=standard&hideUnavailableMenuItems=false&hideMenuItems=false",
|
|
["Authorization: Bearer $ghToken"],
|
|
30
|
|
);
|
|
if ($ghMenu['code'] !== 200) {
    throw new Exception("Grubhub restaurant fetch failed: {$ghMenu['code']}");
}

// Everything below indexes into the "restaurant" object, so fail with a clear
// message when the body is not JSON or that object is missing.
$ghData = json_decode((string)$ghMenu['body'], true);
if (!is_array($ghData) || !isset($ghData['restaurant']) || !is_array($ghData['restaurant'])) {
    throw new Exception('Grubhub restaurant response was not in the expected format');
}
$ghRestaurant = $ghData['restaurant'];
$response['steps'][] = "Fetched Grubhub restaurant data (" . strlen($ghMenu['body']) . " bytes)";

// Parse business info (name falls back to '' when the API omits it)
$ghBusiness = ['name' => $ghRestaurant['name'] ?? ''];
|
|
if (!empty($ghRestaurant['address']) && is_array($ghRestaurant['address'])) {
|
|
$ghAddr = $ghRestaurant['address'];
|
|
if (isset($ghAddr['street_address'])) $ghBusiness['addressLine1'] = $ghAddr['street_address'];
|
|
if (isset($ghAddr['locality'])) $ghBusiness['city'] = $ghAddr['locality'];
|
|
if (isset($ghAddr['region'])) $ghBusiness['state'] = $ghAddr['region'];
|
|
if (isset($ghAddr['zip'])) $ghBusiness['zip'] = $ghAddr['zip'];
|
|
$ghBusiness['address'] = ($ghBusiness['addressLine1'] ?? '') . ', ' . ($ghBusiness['city'] ?? '') . ', ' . ($ghBusiness['state'] ?? '') . ' ' . ($ghBusiness['zip'] ?? '');
|
|
}
|
|
if (isset($ghRestaurant['latitude']) && is_numeric($ghRestaurant['latitude'])) $ghBusiness['latitude'] = $ghRestaurant['latitude'];
|
|
if (isset($ghRestaurant['longitude']) && is_numeric($ghRestaurant['longitude'])) $ghBusiness['longitude'] = $ghRestaurant['longitude'];
|
|
if (!empty($ghRestaurant['phone_number'])) $ghBusiness['phone'] = preg_replace('/[^0-9]/', '', $ghRestaurant['phone_number']);
|
|
if (!empty(trim($ghRestaurant['description'] ?? ''))) $ghBusiness['description'] = trim($ghRestaurant['description']);
|
|
|
|
// Hours
|
|
$ghHoursParts = [];
|
|
$ghDayOrder = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'];
|
|
$ghDayAbbrev = ['Mon','Tue','Wed','Thu','Fri','Sat','Sun'];
|
|
if (!empty($ghRestaurant['restaurant_managed_hours_list_v2']) && is_array($ghRestaurant['restaurant_managed_hours_list_v2'])) {
|
|
foreach ($ghRestaurant['restaurant_managed_hours_list_v2'] as $ghDayHours) {
|
|
if (isset($ghDayHours['day'], $ghDayHours['start_time'], $ghDayHours['end_time'])) {
|
|
$ghDayIdx = array_search($ghDayHours['day'], $ghDayOrder);
|
|
if ($ghDayIdx !== false) {
|
|
$parts = explode(':', $ghDayHours['start_time']);
|
|
$openStr = $formatTime12h((int)$parts[0], (int)($parts[1] ?? 0));
|
|
$parts = explode(':', $ghDayHours['end_time']);
|
|
$closeStr = $formatTime12h((int)$parts[0], (int)($parts[1] ?? 0));
|
|
$ghHoursParts[] = $ghDayAbbrev[$ghDayIdx] . " $openStr-$closeStr";
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if (!empty($ghHoursParts)) $ghBusiness['hours'] = implode(', ', $ghHoursParts);
|
|
if (isset($ghData['restaurant_availability']['sales_tax'])) $ghBusiness['taxRate'] = $ghData['restaurant_availability']['sales_tax'];
|
|
|
|
// Parse categories and items
|
|
$ghCategories = [];
|
|
$ghItems = [];
|
|
$ghItemId = 1;
|
|
$ghModifierGroups = [];
|
|
$ghImageMappings = [];
|
|
|
|
if (!empty($ghRestaurant['menu_category_list']) && is_array($ghRestaurant['menu_category_list'])) {
|
|
foreach ($ghRestaurant['menu_category_list'] as $ghCat) {
|
|
$ghCatName = trim($ghCat['name'] ?? 'Menu');
|
|
$ghCatItemCount = 0;
|
|
|
|
if (!empty($ghCat['menu_item_list']) && is_array($ghCat['menu_item_list'])) {
|
|
foreach ($ghCat['menu_item_list'] as $ghItem) {
|
|
$ghItemName = trim($ghItem['name'] ?? '');
|
|
if (empty($ghItemName)) continue;
|
|
|
|
$ghPrice = 0;
|
|
if (!empty($ghItem['price']['amount'])) $ghPrice = (float)$ghItem['price']['amount'] / 100;
|
|
$ghDesc = trim($ghItem['description'] ?? '');
|
|
|
|
// Image URL
|
|
$ghImageUrl = '';
|
|
if (!empty($ghItem['media_image']) && is_array($ghItem['media_image'])) {
|
|
$gi = $ghItem['media_image'];
|
|
if (!empty($gi['base_url']) && !empty($gi['public_id']) && !empty($gi['format'])) {
|
|
$ghImageUrl = $gi['base_url'] . 'w_400,h_400,c_fill/' . $gi['public_id'] . '.' . $gi['format'];
|
|
}
|
|
}
|
|
|
|
// Modifiers
|
|
$ghItemModifiers = [];
|
|
if (!empty($ghItem['choice_category_list']) && is_array($ghItem['choice_category_list'])) {
|
|
foreach ($ghItem['choice_category_list'] as $ghChoiceCat) {
|
|
$ghModName = trim($ghChoiceCat['name'] ?? '');
|
|
if (empty($ghModName)) continue;
|
|
$ghItemModifiers[] = $ghModName;
|
|
|
|
if (!isset($ghModifierGroups[$ghModName])) {
|
|
$ghModOptions = [];
|
|
if (!empty($ghChoiceCat['choice_option_list'])) {
|
|
foreach ($ghChoiceCat['choice_option_list'] as $ghOpt) {
|
|
$optName = trim($ghOpt['description'] ?? '');
|
|
$optPrice = !empty($ghOpt['price']['amount']) ? (float)$ghOpt['price']['amount'] / 100 : 0;
|
|
if (strlen($optName)) $ghModOptions[] = ['name' => $optName, 'price' => $optPrice];
|
|
}
|
|
}
|
|
$ghMinSel = (int)($ghChoiceCat['min_choice_options'] ?? 0);
|
|
$ghMaxSel = (int)($ghChoiceCat['max_choice_options'] ?? 0);
|
|
$ghModifierGroups[$ghModName] = [
|
|
'name' => $ghModName,
|
|
'required' => $ghMinSel > 0,
|
|
'minSelections' => $ghMinSel,
|
|
'maxSelections' => $ghMaxSel,
|
|
'options' => $ghModOptions,
|
|
];
|
|
}
|
|
}
|
|
}
|
|
|
|
$ghItems[] = [
|
|
'id' => 'item_' . $ghItemId,
|
|
'name' => $ghItemName,
|
|
'price' => $ghPrice,
|
|
'description' => $ghDesc,
|
|
'category' => $ghCatName,
|
|
'imageUrl' => $ghImageUrl,
|
|
'hasModifiers' => count($ghItemModifiers) > 0,
|
|
'modifiers' => $ghItemModifiers,
|
|
];
|
|
if (strlen($ghImageUrl)) $ghImageMappings[] = ['itemId' => 'item_' . $ghItemId, 'url' => $ghImageUrl];
|
|
$ghCatItemCount++;
|
|
$ghItemId++;
|
|
}
|
|
}
|
|
$ghCategories[] = ['name' => $ghCatName, 'itemCount' => $ghCatItemCount];
|
|
}
|
|
}
|
|
|
|
$ghModifiers = array_values($ghModifierGroups);
|
|
$response['steps'][] = "Parsed " . count($ghItems) . " items in " . count($ghCategories) . " categories with " . count($ghModifiers) . " modifier groups";
|
|
|
|
$response['OK'] = true;
|
|
$response['DATA'] = [
|
|
'business' => $ghBusiness,
|
|
'categories' => $ghCategories,
|
|
'items' => $ghItems,
|
|
'modifiers' => $ghModifiers,
|
|
'imageUrls' => [],
|
|
'imageMappings' => $ghImageMappings,
|
|
'headerCandidateIndices' => [],
|
|
];
|
|
$response['sourceUrl'] = $targetUrl;
|
|
$response['pagesProcessed'] = 1;
|
|
$response['imagesFound'] = count($ghImageMappings);
|
|
$response['parsedVia'] = 'grubhub_api';
|
|
jsonResponse($response);
|
|
}
|
|
// ========== END GRUBHUB FAST PATH ==========
|
|
|
|
// Check if this is a local temp file (ZIP upload) - read directly
|
|
if (stripos($targetUrl, '/temp/menu-import/') !== false) {
|
|
// Map the upload URL onto the local filesystem: keep everything from
// "/temp/menu-import/" onwards and append it to the webroot.
$localUrlPath = preg_replace('#https?://[^/]+(/temp/menu-import/.*)#i', '$1', $targetUrl);
$localFilePath = $expandPath($localUrlPath);
$response['steps'][] = "Local temp file detected: $localFilePath";

if (!file_exists($localFilePath)) {
    throw new Exception("Local file not found: $localFilePath");
}

// The URL is caller-supplied, so "../" segments could point the read at any file
// on the server. Resolve both paths and refuse anything that does not end up
// inside the import directory. $localFilePath itself is left as-is for the
// code that follows.
$importRoot = realpath($expandPath('/temp/menu-import'));
$resolvedLocalPath = realpath($localFilePath);
if (
    $importRoot === false
    || $resolvedLocalPath === false
    || !is_file($resolvedLocalPath)
    || !str_starts_with($resolvedLocalPath, $importRoot . DIRECTORY_SEPARATOR)
) {
    throw new Exception('Local file is outside the menu import directory');
}

$pageHtml = file_get_contents($resolvedLocalPath);
|
|
$playwrightImages = [];
|
|
$response['steps'][] = "Read " . strlen($pageHtml) . " bytes from local file";
|
|
|
|
$localDir = dirname($localFilePath);
|
|
$basePath = preg_replace('#/[^/]*$#', '/', $targetUrl);
|
|
|
|
// Check for Toast menu page - extract from visible HTML
|
|
if (stripos($pageHtml, 'class="headerText"') !== false && stripos($pageHtml, 'toasttab') !== false) {
|
|
$response['steps'][] = "Toast menu detected - parsing visible HTML items";
|
|
|
|
try {
|
|
$toastBusiness = [];
|
|
$toastCategories = [];
|
|
$toastItems = [];
|
|
$categorySet = [];
|
|
$itemNameSet = [];
|
|
$itemId = 1;
|
|
|
|
// Find category headers
|
|
if (preg_match_all('#<h2[^>]*class="[^"]*groupHeader[^"]*"[^>]*>([^<]+)</h2>#i', $pageHtml, $catMatches)) {
|
|
foreach ($catMatches[1] as $catName) {
|
|
$catName = trim($catName);
|
|
if (strlen($catName) && !isset($categorySet[$catName])) {
|
|
$categorySet[$catName] = true;
|
|
$toastCategories[] = ['name' => $catName, 'itemCount' => 0];
|
|
}
|
|
}
|
|
}
|
|
|
|
// Extract item blocks
|
|
if (preg_match_all('#<li[^>]*class="[^"]*item[^"]*"[^>]*>.*?</li>#is', $pageHtml, $blockMatches)) {
|
|
$response['steps'][] = "Found " . count($blockMatches[0]) . " item blocks in HTML";
|
|
foreach ($blockMatches[0] as $block) {
|
|
if (preg_match('#<span class="headerText">([^<]+)</span>#i', $block, $nm)) {
|
|
$itemName = trim($nm[1]);
|
|
if (strlen($itemName) && !isset($itemNameSet[$itemName])) {
|
|
$itemNameSet[$itemName] = true;
|
|
$itemStruct = ['id' => 'item_' . $itemId, 'name' => $itemName, 'modifiers' => [], 'price' => 0, 'description' => ''];
|
|
|
|
// Price
|
|
if (preg_match('#\$([0-9]+\.?[0-9]*)#', $block, $pm)) {
|
|
$p = (float)$pm[1];
|
|
if ($p > 0) $itemStruct['price'] = $p;
|
|
}
|
|
|
|
// Description
|
|
if (preg_match('#<div[^>]*class="[^"]*description[^"]*"[^>]*>([^<]+)</div>#i', $block, $dm)) {
|
|
$itemStruct['description'] = trim($dm[1]);
|
|
}
|
|
|
|
// Image
|
|
if (preg_match('#src="(Menu_files/[^"]+)"#i', $block, $im)) {
|
|
$itemStruct['imageUrl'] = $basePath . $im[1];
|
|
$itemStruct['imageSrc'] = $basePath . $im[1];
|
|
$itemStruct['imageFilename'] = basename($im[1]);
|
|
}
|
|
|
|
$itemStruct['category'] = !empty($toastCategories) ? $toastCategories[0]['name'] : 'Menu';
|
|
$toastItems[] = $itemStruct;
|
|
$itemId++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Fallback: simpler headerText extraction
|
|
if (empty($toastItems)) {
|
|
if (preg_match_all('#<span class="headerText">([^<]+)</span>#i', $pageHtml, $nameMatches)) {
|
|
foreach ($nameMatches[1] as $nm) {
|
|
$nm = trim($nm);
|
|
if (strlen($nm) && !isset($itemNameSet[$nm])) {
|
|
$itemNameSet[$nm] = true;
|
|
$toastItems[] = ['id' => 'item_' . $itemId, 'name' => $nm, 'price' => 0, 'description' => '', 'category' => 'Menu', 'modifiers' => []];
|
|
$itemId++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Try business name from title
|
|
if (preg_match('#<title[^>]*>([^<]+)</title>#i', $pageHtml, $tm)) {
|
|
$titleText = trim($tm[1]);
|
|
if (strpos($titleText, '|') !== false) $titleText = trim(explode('|', $titleText)[0]);
|
|
$titleText = preg_replace('#\s*-\s*(Menu|Order|Online).*$#i', '', $titleText);
|
|
if (strlen($titleText) && !isset($toastBusiness['name'])) {
|
|
$toastBusiness['name'] = $titleText;
|
|
}
|
|
}
|
|
|
|
// Try og:title/og:site_name
|
|
if (empty($toastBusiness['name'])) {
|
|
if (preg_match('#<meta[^>]*property=["\']og:(site_name|title)["\'][^>]*content=["\']([^"\']+)["\']#i', $pageHtml, $ogm)) {
|
|
$ogText = trim($ogm[2]);
|
|
if (strpos($ogText, '|') !== false) $ogText = trim(explode('|', $ogText)[0]);
|
|
if (strlen($ogText)) $toastBusiness['name'] = $ogText;
|
|
} elseif (preg_match('#<meta[^>]*content=["\']([^"\']+)["\'][^>]*property=["\']og:(site_name|title)["\']#i', $pageHtml, $ogm)) {
|
|
$ogText = trim($ogm[1]);
|
|
if (strpos($ogText, '|') !== false) $ogText = trim(explode('|', $ogText)[0]);
|
|
if (strlen($ogText)) $toastBusiness['name'] = $ogText;
|
|
}
|
|
}
|
|
|
|
// Try header element
|
|
if (empty($toastBusiness['name'])) {
|
|
if (preg_match('#<(?:h1|div)[^>]*class="[^"]*(?:restaurant|location|brand)[^"]*"[^>]*>([^<]+)<#i', $pageHtml, $hm)) {
|
|
$ht = trim($hm[1]);
|
|
if (strlen($ht) && strlen($ht) < 100) $toastBusiness['name'] = $ht;
|
|
}
|
|
}
|
|
|
|
// Try first h1
|
|
if (empty($toastBusiness['name'])) {
|
|
if (preg_match('#<h1[^>]*>([^<]+)</h1>#i', $pageHtml, $h1m)) {
|
|
$h1t = trim($h1m[1]);
|
|
if (strlen($h1t) && strlen($h1t) < 100) $toastBusiness['name'] = $h1t;
|
|
}
|
|
}
|
|
|
|
// Try address from HTML
|
|
if (empty($toastBusiness['addressLine1'])) {
|
|
if (preg_match('#<[^>]*class="[^"]*address[^"]*"[^>]*>([^<]+)</[^>]+>#i', $pageHtml, $am)) {
|
|
$at = trim($am[1]);
|
|
if (strlen($at) && strlen($at) < 200) $toastBusiness['addressLine1'] = $at;
|
|
}
|
|
}
|
|
|
|
// Try phone from HTML
|
|
if (empty($toastBusiness['phone'])) {
|
|
if (preg_match('#(?:tel:|phone[^"]*">)\s*\(?(\d{3})\)?[-.\s]?(\d{3})[-.\s]?(\d{4})#i', $pageHtml, $phm)) {
|
|
$toastBusiness['phone'] = $phm[1] . '-' . $phm[2] . '-' . $phm[3];
|
|
}
|
|
}
|
|
|
|
// Check __OO_STATE__ for images, categories, prices, business info
|
|
if (stripos($pageHtml, 'window.__OO_STATE__') !== false) {
|
|
$ooJson = $extractOoState($pageHtml);
|
|
if ($ooJson !== null) {
|
|
try {
|
|
$ooState = json_decode($ooJson, true);
|
|
if (is_array($ooState)) {
|
|
$imageMap = [];
|
|
$itemCategoryMap = [];
|
|
$itemPriceMap = [];
|
|
|
|
foreach ($ooState as $key => $val) {
|
|
// Restaurant info
|
|
if (str_starts_with($key, 'Restaurant:') && is_array($val)) {
|
|
if (!empty($val['name'])) $toastBusiness['name'] = $val['name'];
|
|
if (!empty($val['location']) && is_array($val['location'])) {
|
|
$loc = $val['location'];
|
|
if (!empty($loc['address1'])) $toastBusiness['addressLine1'] = $loc['address1'];
|
|
if (!empty($loc['city'])) $toastBusiness['city'] = $loc['city'];
|
|
if (!empty($loc['state'])) $toastBusiness['state'] = $loc['state'];
|
|
if (!empty($loc['zipCode'])) $toastBusiness['zip'] = $loc['zipCode'];
|
|
if (!empty($loc['phone'])) $toastBusiness['phone'] = $loc['phone'];
|
|
}
|
|
if (!empty($val['brandColor'])) $toastBusiness['brandColor'] = str_replace('#', '', $val['brandColor']);
|
|
}
|
|
|
|
// Menu items
|
|
if (str_starts_with($key, 'Menu:') && is_array($val) && !empty($val['groups']) && is_array($val['groups'])) {
|
|
foreach ($val['groups'] as $group) {
|
|
$groupName = trim($group['name'] ?? '');
|
|
if (strlen($groupName) && !isset($categorySet[$groupName])) {
|
|
$categorySet[$groupName] = true;
|
|
$toastCategories[] = ['name' => $groupName, 'itemCount' => 0];
|
|
}
|
|
|
|
// Check for subgroups
|
|
$subgroups = $group['subgroups'] ?? $group['children'] ?? $group['childGroups'] ?? [];
|
|
if (!empty($subgroups) && is_array($subgroups)) {
|
|
foreach ($subgroups as $sg) {
|
|
$sgName = trim($sg['name'] ?? '');
|
|
if (strlen($sgName) && !isset($categorySet[$sgName])) {
|
|
$categorySet[$sgName] = true;
|
|
$toastCategories[] = ['name' => $sgName, 'parentCategoryName' => $groupName, 'itemCount' => 0];
|
|
}
|
|
if (!empty($sg['items']) && is_array($sg['items'])) {
|
|
$effectiveName = strlen($sgName) ? $sgName : $groupName;
|
|
foreach ($sg['items'] as $item) {
|
|
if (!empty($item['name'])) {
|
|
$itemCategoryMap[$item['name']] = $effectiveName;
|
|
$p = $extractToastPrice($item);
|
|
if ($p > 0) $itemPriceMap[$item['name']] = $p;
|
|
$img = $extractToastImage($item);
|
|
if (strlen($img)) $imageMap[$item['name']] = $img;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Direct items
|
|
if (!empty($group['items']) && is_array($group['items'])) {
|
|
foreach ($group['items'] as $item) {
|
|
if (!empty($item['name'])) {
|
|
if (strlen($groupName)) $itemCategoryMap[$item['name']] = $groupName;
|
|
$p = $extractToastPrice($item);
|
|
if ($p > 0) $itemPriceMap[$item['name']] = $p;
|
|
$img = $extractToastImage($item);
|
|
if (strlen($img)) $imageMap[$item['name']] = $img;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Apply to items
|
|
$imagesMatched = $categoriesMatched = $pricesMatched = 0;
|
|
for ($i = 0; $i < count($toastItems); $i++) {
|
|
$name = $toastItems[$i]['name'];
|
|
if (isset($imageMap[$name])) {
|
|
$toastItems[$i]['imageUrl'] = $imageMap[$name];
|
|
$toastItems[$i]['imageSrc'] = $imageMap[$name];
|
|
$toastItems[$i]['imageFilename'] = basename($imageMap[$name]);
|
|
$imagesMatched++;
|
|
}
|
|
if (isset($itemCategoryMap[$name])) {
|
|
$toastItems[$i]['category'] = $itemCategoryMap[$name];
|
|
$categoriesMatched++;
|
|
}
|
|
if (isset($itemPriceMap[$name]) && ($toastItems[$i]['price'] ?? 0) == 0) {
|
|
$toastItems[$i]['price'] = $itemPriceMap[$name];
|
|
$pricesMatched++;
|
|
}
|
|
}
|
|
$response['steps'][] = "Matched $imagesMatched images, $categoriesMatched categories, $pricesMatched prices from __OO_STATE__";
|
|
}
|
|
} catch (Exception $e) {
|
|
// OO_STATE parse failed, continue
|
|
}
|
|
}
|
|
}
|
|
|
|
// Default category if none
|
|
if (!empty($toastItems) && empty($toastCategories)) {
|
|
$toastCategories[] = ['name' => 'Menu', 'itemCount' => count($toastItems)];
|
|
}
|
|
|
|
// Scan ALL HTML files in the ZIP for business info
|
|
$extractUrlPath = preg_replace('#https?://[^/]+(/temp/menu-import/[a-f0-9]+/).*#i', '$1', $targetUrl);
|
|
$extractDir = $expandPath($extractUrlPath);
|
|
try {
|
|
$allHtmlFiles = [];
|
|
$it = new RecursiveDirectoryIterator($extractDir, RecursiveDirectoryIterator::SKIP_DOTS);
|
|
$files = new RecursiveIteratorIterator($it);
|
|
foreach ($files as $file) {
|
|
if (preg_match('/\.html?$/i', $file->getFilename())) {
|
|
$allHtmlFiles[] = $file->getRealPath();
|
|
}
|
|
}
|
|
$response['steps'][] = "Found " . count($allHtmlFiles) . " HTML files in ZIP";
|
|
|
|
foreach ($allHtmlFiles as $otherFile) {
|
|
if ($otherFile === $localFilePath) continue;
|
|
try {
|
|
$otherHtml = file_get_contents($otherFile);
|
|
|
|
// Business name from title
|
|
if (empty($toastBusiness['name'])) {
|
|
if (preg_match('#<title[^>]*>([^<]+)</title>#i', $otherHtml, $otm)) {
|
|
$ot = trim($otm[1]);
|
|
if (strlen($ot) && !preg_match('#^(Menu|Home|About|Contact|Order|Online)$#i', $ot)) {
|
|
if (strpos($ot, '|') !== false) $ot = trim(explode('|', $ot)[0]);
|
|
$ot = preg_replace('#\s*-\s*(Menu|Order|Online).*$#i', '', $ot);
|
|
if (strlen($ot) && strlen($ot) < 100) $toastBusiness['name'] = $ot;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Address from other files
|
|
if (empty($toastBusiness['addressLine1'])) {
|
|
if (preg_match('#(\d+\s+[A-Za-z0-9\s]+(?:St(?:reet)?|Ave(?:nue)?|Rd|Road|Blvd|Boulevard|Dr(?:ive)?|Ln|Lane|Way|Ct|Court|Pl(?:ace)?|Pkwy|Parkway)[.,]?\s*(?:Suite|Ste|#|Unit|Apt)?\s*[A-Za-z0-9\-]*)#i', $otherHtml, $adm)) {
|
|
$at = trim($adm[1]);
|
|
if (strlen($at) > 5 && strlen($at) < 100) $toastBusiness['addressLine1'] = $at;
|
|
}
|
|
}
|
|
|
|
// Phone from other files
|
|
if (empty($toastBusiness['phone'])) {
|
|
if (preg_match('#\(?(\d{3})\)?[-.\s]?(\d{3})[-.\s]?(\d{4})#', $otherHtml, $phm)) {
|
|
$toastBusiness['phone'] = $phm[1] . '-' . $phm[2] . '-' . $phm[3];
|
|
}
|
|
}
|
|
|
|
// Check __OO_STATE__ in other files
|
|
if (stripos($otherHtml, 'window.__OO_STATE__') !== false) {
|
|
$otherOoJson = $extractOoState($otherHtml);
|
|
if ($otherOoJson !== null) {
|
|
try {
|
|
$otherOo = json_decode($otherOoJson, true);
|
|
if (is_array($otherOo)) {
|
|
foreach ($otherOo as $oKey => $oVal) {
|
|
if (str_starts_with($oKey, 'Restaurant:') && is_array($oVal)) {
|
|
if (!empty($oVal['name']) && empty($toastBusiness['name'])) $toastBusiness['name'] = $oVal['name'];
|
|
if (!empty($oVal['location']) && is_array($oVal['location'])) {
|
|
$ol = $oVal['location'];
|
|
if (!empty($ol['address1']) && empty($toastBusiness['addressLine1'])) $toastBusiness['addressLine1'] = $ol['address1'];
|
|
if (!empty($ol['city']) && empty($toastBusiness['city'])) $toastBusiness['city'] = $ol['city'];
|
|
if (!empty($ol['state']) && empty($toastBusiness['state'])) $toastBusiness['state'] = $ol['state'];
|
|
if (!empty($ol['zipCode']) && empty($toastBusiness['zip'])) $toastBusiness['zip'] = $ol['zipCode'];
|
|
if (!empty($ol['phone']) && empty($toastBusiness['phone'])) $toastBusiness['phone'] = $ol['phone'];
|
|
}
|
|
if (!empty($oVal['brandColor']) && empty($toastBusiness['brandColor'])) $toastBusiness['brandColor'] = str_replace('#', '', $oVal['brandColor']);
|
|
}
|
|
}
|
|
}
|
|
} catch (Exception $e) { /* skip */ }
|
|
}
|
|
}
|
|
} catch (Exception $e) { /* skip unreadable files */ }
|
|
}
|
|
} catch (Exception $e) {
|
|
$response['steps'][] = "Could not scan other HTML files: " . $e->getMessage();
|
|
}
|
|
|
|
$response['steps'][] = "Extracted " . count($toastItems) . " unique items from " . count($toastCategories) . " categories";
|
|
|
|
// Scan ZIP images and analyze for business info via Claude
|
|
try {
|
|
$zipImageFiles = [];
|
|
$it = new RecursiveDirectoryIterator($extractDir, RecursiveDirectoryIterator::SKIP_DOTS);
|
|
$files = new RecursiveIteratorIterator($it);
|
|
$imageExtensions = ['jpg','jpeg','png','gif','webp'];
|
|
foreach ($files as $file) {
|
|
if (!$file->isFile()) continue;
|
|
$ext = strtolower(pathinfo($file->getFilename(), PATHINFO_EXTENSION));
|
|
if (in_array($ext, $imageExtensions) && $file->getSize() > 10000 && stripos($file->getPath(), '_files') === false) {
|
|
$zipImageFiles[] = $file->getRealPath();
|
|
}
|
|
}
|
|
|
|
if (!empty($zipImageFiles)) {
|
|
$response['steps'][] = "Found " . count($zipImageFiles) . " images in ZIP to analyze for business info";
|
|
$imgLimit = min(count($zipImageFiles), 3);
|
|
for ($imgIdx = 0; $imgIdx < $imgLimit; $imgIdx++) {
|
|
try {
|
|
$imgContent = file_get_contents($zipImageFiles[$imgIdx]);
|
|
$base64Img = base64_encode($imgContent);
|
|
$mediaType = $detectMediaType($base64Img);
|
|
|
|
$imgRequest = [
|
|
'model' => 'claude-sonnet-4-20250514',
|
|
'max_tokens' => 1024,
|
|
'temperature' => 0,
|
|
'messages' => [[
|
|
'role' => 'user',
|
|
'content' => [
|
|
['type' => 'image', 'source' => ['type' => 'base64', 'media_type' => $mediaType, 'data' => $base64Img]],
|
|
['type' => 'text', 'text' => 'Extract ALL business information visible in this image. Look carefully for: 1) Business NAME (the restaurant/store name), 2) PHONE number (format: xxx-xxx-xxxx), 3) Full ADDRESS (street, city, state, zip), 4) HOURS of operation (all days shown). Return JSON: {"name":"","addressLine1":"","city":"","state":"","zip":"","phone":"","hours":"","brandColor":""}. For hours, format as single string like \'Mon-Thu 7am-10pm, Fri-Sat 7am-11pm\'. Return ONLY valid JSON.'],
|
|
],
|
|
]],
|
|
];
|
|
|
|
$imgResp = $httpPost(
|
|
'https://api.anthropic.com/v1/messages',
|
|
json_encode($imgRequest),
|
|
['Content-Type: application/json', "x-api-key: $CLAUDE_API_KEY", 'anthropic-version: 2023-06-01'],
|
|
60
|
|
);
|
|
|
|
if ($imgResp['code'] === 200) {
|
|
$imgData = json_decode($imgResp['body'], true);
|
|
if (!empty($imgData['content'][0]['text'])) {
|
|
$imgText = $cleanClaudeJson($imgData['content'][0]['text']);
|
|
$imgBiz = json_decode($imgText, true);
|
|
if (is_array($imgBiz)) {
|
|
foreach (['name','addressLine1','city','state','zip','phone','hours','brandColor'] as $field) {
|
|
if (!empty($imgBiz[$field]) && is_scalar($imgBiz[$field])) {
|
|
$toastBusiness[$field] = trim($imgBiz[$field]);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
} catch (Exception $e) {
|
|
$response['steps'][] = "Error analyzing image: " . $e->getMessage();
|
|
}
|
|
}
|
|
}
|
|
} catch (Exception $e) {
|
|
$response['steps'][] = "Could not scan ZIP for images: " . $e->getMessage();
|
|
}
|
|
|
|
// Return directly: publish the Toast data gathered above and send the response.
// No images are reported on this path, so all image-related fields are empty/zero.
$toastDirectPayload = [
    'OK' => true,
    'DATA' => [
        'business' => $toastBusiness,
        'categories' => $toastCategories,
        'modifiers' => [],
        'items' => $toastItems,
        'imageUrls' => [],
        'headerCandidateIndices' => [],
        'imageMappings' => [],
    ],
    'sourceUrl' => $targetUrl,
    'pagesProcessed' => 1,
    'imagesFound' => 0,
    'playwrightImagesCount' => 0,
    'toastDirect' => true,
];
foreach ($toastDirectPayload as $respKey => $respVal) {
    $response[$respKey] = $respVal;
}
jsonResponse($response);
|
|
|
|
} catch (Exception $e) {
|
|
$response['steps'][] = "Toast HTML parse failed: " . $e->getMessage() . " - falling back to Claude";
|
|
}
|
|
}
|
|
|
|
// Extract base URL for relative links (local temp file case):
// $baseUrl is scheme + host, $basePath is the URL without its query string
// and with everything after the last "/" removed.
if (preg_match('#^(https?://[^/]+)#', $targetUrl, $bm) === 1) {
    $baseUrl = $bm[1];
}
$targetWithoutQuery = preg_replace('#\?.*$#', '', $targetUrl);
$basePath = preg_replace('#/[^/]*$#', '/', $targetWithoutQuery);
|
|
|
|
} else {
|
|
// Remote URL - use Playwright for JS-rendered content.
// The runner script is expected to print one JSON object; the keys read here are
// html, images, platformImageMap, subPagesVisited, platformPagesVisited and error.
$response['steps'][] = "Fetching URL with Playwright: $targetUrl";

$pwOutput = shell_exec("/opt/playwright/run.sh " . escapeshellarg($targetUrl) . " 10000 2>&1");
if (empty(trim($pwOutput ?? ''))) {
    throw new Exception("Playwright returned empty response");
}

// stderr is merged into stdout (2>&1), so the output may not be valid JSON.
// Fail loudly instead of silently continuing with an empty page.
$pwResult = json_decode($pwOutput, true);
if (!is_array($pwResult)) {
    throw new Exception("Playwright returned invalid JSON: " . json_last_error_msg());
}
if (isset($pwResult['error'])) {
    $pwError = is_scalar($pwResult['error']) ? (string)$pwResult['error'] : json_encode($pwResult['error']);
    throw new Exception("Playwright error: " . $pwError);
}

$pageHtml = is_string($pwResult['html'] ?? null) ? $pwResult['html'] : '';
// Keep only string URLs: the list is later count()ed and fed to preg_match(),
// both of which throw a TypeError (not caught by catch (Exception)) on bad types.
$playwrightImages = is_array($pwResult['images'] ?? null)
    ? array_values(array_filter($pwResult['images'], 'is_string'))
    : [];
$response['steps'][] = "Fetched " . strlen($pageHtml) . " bytes via Playwright, " . count($playwrightImages) . " images captured";

// Capture platform image map (ordering site food photos matched to item names)
$platformImageMap = [];
if (!empty($pwResult['platformImageMap']) && is_array($pwResult['platformImageMap'])) {
    $platformImageMap = $pwResult['platformImageMap'];
    $response['steps'][] = "Found " . count($platformImageMap) . " item images from ordering platform";
}
if (!empty($pwResult['subPagesVisited']) && is_array($pwResult['subPagesVisited'])) {
    $response['steps'][] = "Visited " . count($pwResult['subPagesVisited']) . " menu sub-pages: " . implode(', ', $pwResult['subPagesVisited']);
}
if (!empty($pwResult['platformPagesVisited']) && is_array($pwResult['platformPagesVisited'])) {
    $response['steps'][] = "Visited " . count($pwResult['platformPagesVisited']) . " ordering platforms for photos: " . implode(', ', $pwResult['platformPagesVisited']);
}
|
|
|
|
// ========== WOOCOMMERCE FAST PATH ==========
// Detection is a substring test on the rendered HTML. When the dedicated
// Playwright script returns items, the response is sent from here; otherwise
// execution falls through to the later strategies.
if (stripos($pageHtml, 'woocommerce') !== false || stripos($pageHtml, 'wc-add-to-cart') !== false || stripos($pageHtml, 'tm-extra-product-options') !== false) {
    $response['steps'][] = "WooCommerce site detected - running modifier extraction";
    // The script is given the site origin only (scheme + host).
    $wooUrl = preg_replace('#(https?://[^/]+).*#', '$1', $targetUrl);

    try {
        $wooOutput = shell_exec("/opt/playwright/run-woo-modifiers.sh " . escapeshellarg($wooUrl) . " 2>&1");
        if (!empty(trim($wooOutput ?? ''))) {
            $wooResult = json_decode($wooOutput, true);
            if (!empty($wooResult['items']) && is_array($wooResult['items'])) {
                $response['steps'][] = "WooCommerce extraction: " . count($wooResult['items']) . " items, " . count($wooResult['modifiers'] ?? []) . " modifier groups";

                $wooCats = [];   // category name => number of items
                $wooItems = [];
                foreach ($wooResult['items'] as $wi => $wItem) {
                    // Skip malformed or nameless entries (the Toast path does the same);
                    // the name is also the key into itemModifierMap.
                    if (!is_array($wItem) || !isset($wItem['name']) || !is_scalar($wItem['name']) || trim((string)$wItem['name']) === '') {
                        continue;
                    }

                    $catName = (!empty($wItem['category']) && is_scalar($wItem['category']))
                        ? trim((string)$wItem['category'])
                        : 'Menu';
                    if (!isset($wooCats[$catName])) $wooCats[$catName] = 0;
                    $wooCats[$catName]++;

                    $itemMods = $wooResult['itemModifierMap'][$wItem['name']] ?? [];
                    if (!is_array($itemMods)) $itemMods = [];

                    $wooItems[] = [
                        // Ids keep the item's position in the script output.
                        'id' => 'item_' . ($wi + 1),
                        'name' => $wItem['name'],
                        'price' => (float)($wItem['price'] ?? 0),
                        'description' => $wItem['description'] ?? '',
                        'category' => $catName,
                        'modifiers' => $itemMods,
                        'hasModifiers' => count($itemMods) > 0,
                        'imageUrl' => trim($wItem['imageUrl'] ?? ''),
                    ];
                }

                // Only answer from this path when at least one usable item survived.
                if (!empty($wooItems)) {
                    $wooCategories = [];
                    foreach ($wooCats as $wcName => $wcCount) {
                        $wooCategories[] = ['name' => $wcName, 'itemCount' => $wcCount];
                    }

                    $wooBiz = $wooResult['business'] ?? [];
                    $response['OK'] = true;
                    $response['DATA'] = [
                        'business' => [
                            'name' => $wooBiz['name'] ?? '',
                            'address' => $wooBiz['address'] ?? '',
                            'phone' => $wooBiz['phone'] ?? '',
                            'hours' => $wooBiz['hours'] ?? '',
                        ],
                        'categories' => $wooCategories,
                        'items' => $wooItems,
                        'modifiers' => $wooResult['modifiers'] ?? [],
                        'imageUrls' => [],
                        'imageMappings' => [],
                        'headerCandidateIndices' => [],
                    ];
                    $response['sourceUrl'] = $targetUrl;
                    $response['parsedVia'] = 'woocommerce_playwright';
                    // NOTE(review): the fall-through message below assumes jsonResponse() ends the request - confirm.
                    jsonResponse($response);
                }
            }
        }
        $response['steps'][] = "WooCommerce extraction returned no items - falling through to Claude";
    } catch (Exception $e) {
        $response['steps'][] = "WooCommerce extraction failed: " . $e->getMessage() . " - falling through to Claude";
    }
}
// ========== END WOOCOMMERCE FAST PATH ==========
|
|
|
|
// ========== DOORDASH / ORDER.ONLINE FAST PATH ==========
|
|
if (stripos($pageHtml, 'MenuPageItem') !== false && stripos($pageHtml, 'MenuPageItemList') !== false) {
|
|
$response['steps'][] = "DoorDash/order.online site detected - extracting embedded data";
|
|
try {
|
|
// Ampersands arrive escaped in the embedded payload, either as a JSON unicode
// escape or as an HTML entity. Decode both forms the same way everywhere so the
// carousel image map and the menu items agree on the spelling of a name.
$ddDecodeAmp = static fn ($text) => str_replace(['\\u0026', '&amp;'], '&', $text);

// Build image map (decoded item name => image URL) from StorePageCarouselItem entries.
// Each entry is the text between its __typename marker and the next __typename marker.
$ddImageMap = [];
$carouselMarker = $BQ . '__typename' . $BQ . ':' . $BQ . 'StorePageCarouselItem' . $BQ;
$searchPos = 0;
while (true) {
    $searchPos = stripos($pageHtml, $carouselMarker, $searchPos);
    if ($searchPos === false) break;
    $nextMarker = stripos($pageHtml, $BQ . '__typename' . $BQ, $searchPos + strlen($carouselMarker));
    if ($nextMarker === false) $nextMarker = strlen($pageHtml);
    $entryText = substr($pageHtml, $searchPos, $nextMarker - $searchPos);

    $cpName = $extractBqValue($entryText, 'name');
    if ($cpName !== null) {
        $cpImg = $extractBqValue($entryText, 'imgUrl');
        if ($cpImg !== null && $cpImg !== 'null' && stripos($cpImg, 'http') !== false) {
            // Request a 600x600 rendition when the URL carries size parameters.
            if (stripos($cpImg, 'width=') !== false) {
                $cpImg = preg_replace('/width=\d+/i', 'width=600', $cpImg);
                $cpImg = preg_replace('/height=\d+/i', 'height=600', $cpImg);
            }
            // Key by the decoded name: items are looked up by their decoded name below.
            $ddImageMap[$ddDecodeAmp($cpName)] = $cpImg;
        }
    }
    $searchPos += strlen($carouselMarker);
}
$response['steps'][] = "Built image map with " . count($ddImageMap) . " entries from carousel";

// Extract menu from MenuPageItemList: each list is a category, each MenuPageItem
// inside its text span is an item of that category.
$ddCategories = [];
$ddCatSeen = [];
$ddItems = [];
$ddItemSeen = [];
$ddItemCounter = 0;

$catMarker = $BQ . '__typename' . $BQ . ':' . $BQ . 'MenuPageItemList' . $BQ;
$itemMarker = $BQ . '__typename' . $BQ . ':' . $BQ . 'MenuPageItem' . $BQ;

$catPos = 0;
while (true) {
    $catPos = stripos($pageHtml, $catMarker, $catPos);
    if ($catPos === false) break;

    // A category section runs until the next category marker (or end of page).
    $nextCatPos = stripos($pageHtml, $catMarker, $catPos + strlen($catMarker));
    if ($nextCatPos === false) $nextCatPos = strlen($pageHtml);
    $catSection = substr($pageHtml, $catPos, $nextCatPos - $catPos);

    $catName = $extractBqValue($catSection, 'name');
    if ($catName === null) { $catPos += strlen($catMarker); continue; }
    $catName = $ddDecodeAmp($catName);

    // "Most Ordered" and categories already seen are skipped.
    if ($catName === 'Most Ordered' || isset($ddCatSeen[$catName])) {
        $catPos += strlen($catMarker);
        continue;
    }
    $ddCatSeen[$catName] = true;
    $ddCategories[] = ['name' => $catName, 'parentCategoryName' => ''];

    // Items within category
    $itemPos = 0;
    while (true) {
        $itemPos = stripos($catSection, $itemMarker, $itemPos);
        if ($itemPos === false) break;
        $nextItemPos = stripos($catSection, $itemMarker, $itemPos + strlen($itemMarker));
        if ($nextItemPos === false) $nextItemPos = strlen($catSection);
        $itemEntry = substr($catSection, $itemPos, $nextItemPos - $itemPos);

        $ddItemId = $extractBqValue($itemEntry, 'id') ?? '';
        $ipName = $extractBqValue($itemEntry, 'name');
        if ($ipName === null) { $itemPos += strlen($itemMarker); continue; }
        $ipName = $ddDecodeAmp($ipName);
        // Items are de-duplicated by name across all categories.
        if (isset($ddItemSeen[$ipName])) { $itemPos += strlen($itemMarker); continue; }
        $ddItemSeen[$ipName] = true;

        $ipDesc = $ddDecodeAmp($extractBqValue($itemEntry, 'description') ?? '');

        // Keep digits and the decimal point only, e.g. "$12.50" -> 12.5.
        $ipPriceStr = $extractBqValue($itemEntry, 'displayPrice') ?? '';
        $ipPrice = (float)preg_replace('/[^0-9.]/', '', $ipPriceStr);

        // Image from carousel map or item entry
        $ipImg = $ddImageMap[$ipName] ?? '';
        if (empty($ipImg)) {
            $ipImg = $extractBqValue($itemEntry, 'imageUrl') ?? '';
            if ($ipImg === 'null' || stripos($ipImg, 'http') === false) $ipImg = '';
            if (strlen($ipImg) && stripos($ipImg, 'width=') !== false) {
                $ipImg = preg_replace('/width=\d+/i', 'width=600', $ipImg);
                $ipImg = preg_replace('/height=\d+/i', 'height=600', $ipImg);
            }
        }

        $ddItemCounter++;
        $ddItem = [
            'name' => $ipName,
            'description' => $ipDesc,
            'price' => $ipPrice,
            'category' => $catName,
            'modifiers' => [],
            'id' => 'item_' . $ddItemCounter,
            'ddItemId' => $ddItemId,
            'imageUrl' => $ipImg,
            'imageSrc' => $ipImg,
        ];
        if (strlen($ipImg)) $ddItem['imageFilename'] = basename(parse_url($ipImg, PHP_URL_PATH) ?: $ipImg);
        $ddItems[] = $ddItem;

        $itemPos += strlen($itemMarker);
    }
    $catPos += strlen($catMarker);
}

$ddItemsWithImg = 0;
foreach ($ddItems as $ddi) { if (!empty($ddi['imageUrl'])) $ddItemsWithImg++; }
$response['steps'][] = "Found " . count($ddCategories) . " categories, " . count($ddItems) . " items ($ddItemsWithImg with images)";
|
|
|
|
// Extract business info (name, street/address, phone, hours) from the page.
$ddBusiness = [];

// Name: the <title> text up to the first "-" or "|".
if (preg_match('#<title>([^<]+)</title>#i', $pageHtml, $ddTm)) {
    $ddTitle = preg_replace('#\s*[-|].*#', '', trim($ddTm[1]));
    if (strlen($ddTitle)) {
        $ddBusiness['name'] = $ddTitle;
    }
}

// Returns the text from the first occurrence of $marker up to the next
// __typename marker, or at most $maxLen bytes when no further marker exists.
// Returns null when $marker does not occur at all.
$ddTypenameMarker = $BQ . '__typename' . $BQ;
$ddSectionAfter = static function ($html, $marker, $maxLen) use ($ddTypenameMarker) {
    $from = stripos($html, $marker);
    if ($from === false) {
        return null;
    }
    $to = stripos($html, $ddTypenameMarker, $from + strlen($marker));
    if ($to === false) {
        $to = min($from + $maxLen, strlen($html));
    }
    return substr($html, $from, $to - $from);
};

// Address from StoreHeaderAddress
$ddAddrMarker = $ddTypenameMarker . ':' . $BQ . 'StoreHeaderAddress' . $BQ;
$ddAddrSection = $ddSectionAfter($pageHtml, $ddAddrMarker, 2000);
if ($ddAddrSection !== null) {
    $street = $extractBqValue($ddAddrSection, 'street');
    if ($street !== null) {
        $ddBusiness['street'] = $street;
    }
    $displayAddr = $extractBqValue($ddAddrSection, 'displayAddress');
    if ($displayAddr !== null) {
        $ddBusiness['address'] = $displayAddr;
    }
}

// Phone from StoreHeaderPhoneNumber
$ddPhoneMarker = $ddTypenameMarker . ':' . $BQ . 'StoreHeaderPhoneNumber' . $BQ;
$ddPhoneSection = $ddSectionAfter($pageHtml, $ddPhoneMarker, 1000);
if ($ddPhoneSection !== null) {
    $phone = $extractBqValue($ddPhoneSection, 'phoneNumber');
    if ($phone !== null) {
        $ddBusiness['phone'] = $phone;
    }
}

// Hours from StoreOperationHoursRange: one "dayRange: timeRange" entry per
// marker, joined with "; ". A section ends at the next hours marker, or after
// 500 bytes when it is the last one.
$ddHoursMarker = $ddTypenameMarker . ':' . $BQ . 'StoreOperationHoursRange' . $BQ;
$ddHoursMarkerLen = strlen($ddHoursMarker);
$ddHoursArr = [];
for (
    $hPos = stripos($pageHtml, $ddHoursMarker);
    $hPos !== false;
    $hPos = stripos($pageHtml, $ddHoursMarker, $hPos + $ddHoursMarkerLen)
) {
    $hNext = stripos($pageHtml, $ddHoursMarker, $hPos + $ddHoursMarkerLen);
    if ($hNext === false) {
        $hNext = min($hPos + 500, strlen($pageHtml));
    }
    $hSection = substr($pageHtml, $hPos, $hNext - $hPos);

    $dayRange = $extractBqValue($hSection, 'dayRange');
    $timeRange = $extractBqValue($hSection, 'timeRange');
    if ($dayRange === null || $timeRange === null) {
        continue;
    }
    $ddHoursArr[] = "$dayRange: $timeRange";
}
if (!empty($ddHoursArr)) {
    $ddBusiness['hours'] = implode('; ', $ddHoursArr);
}
|
|
|
|
// With at least one item extracted: try to enrich items with modifier groups via
// the stealth Playwright script (best effort), then send the DoorDash response.
if (!empty($ddItems)) {
    // Playwright modifier extraction
    $ddModifiers = [];
    $ddItemModMap = [];
    $ddTempFile = null;
    try {
        $response['steps'][] = "Running stealth Playwright for modifier extraction...";

        // The script receives the item ids/names through a temporary JSON file.
        $ddItemsForPw = [];
        foreach ($ddItems as $ddi) {
            $ddItemsForPw[] = ['id' => $ddi['ddItemId'], 'name' => $ddi['name']];
        }
        $ddTempFile = '/tmp/dd-items-' . generateUUID() . '.json';
        if (file_put_contents($ddTempFile, json_encode($ddItemsForPw)) === false) {
            throw new Exception("could not write item list to $ddTempFile");
        }

        $ddModOutput = shell_exec("/opt/playwright/run-doordash-modifiers.sh " . escapeshellarg($targetUrl) . " " . escapeshellarg($ddTempFile) . " 2>&1");

        if (!empty(trim($ddModOutput ?? ''))) {
            $ddModData = json_decode(trim($ddModOutput), true);
            if (!empty($ddModData['modifiers']) && is_array($ddModData['modifiers'])) {
                $ddModifiers = $ddModData['modifiers'];
                // A group limited to one selection is a single-choice "select"; everything else is a "checkbox".
                foreach ($ddModifiers as &$ddMod) {
                    $ddMod['type'] = (!empty($ddMod['maxSelections']) && $ddMod['maxSelections'] == 1) ? 'select' : 'checkbox';
                }
                unset($ddMod);
            }
            if (!empty($ddModData['itemModifierMap']) && is_array($ddModData['itemModifierMap'])) {
                // The map is keyed by item name.
                $ddItemModMap = $ddModData['itemModifierMap'];
                foreach ($ddItems as $ddIdx => $ddi) {
                    if (isset($ddItemModMap[$ddi['name']])) {
                        $ddItems[$ddIdx]['modifiers'] = $ddItemModMap[$ddi['name']];
                    }
                }
            }
            $response['steps'][] = "Modifier extraction: " . count($ddModifiers) . " groups, " . count($ddItemModMap) . " items mapped";
        }
    } catch (Exception $e) {
        $response['steps'][] = "Modifier extraction failed (non-fatal): " . $e->getMessage();
    } finally {
        // Always remove the temp file, whichever way the extraction ended.
        if ($ddTempFile !== null && is_file($ddTempFile)) {
            unlink($ddTempFile);
        }
    }

    $ddImageUrls = [];
    foreach ($ddItems as $ddI) {
        if (!empty($ddI['imageUrl'])) $ddImageUrls[] = $ddI['imageUrl'];
    }

    $response['OK'] = true;
    $response['DATA'] = [
        'business' => $ddBusiness,
        'categories' => $ddCategories,
        'modifiers' => $ddModifiers,
        'items' => $ddItems,
        'imageUrls' => $ddImageUrls,
        'headerCandidateIndices' => [],
    ];
    $response['sourceUrl'] = $targetUrl;
    $response['parsedVia'] = 'doordash_embedded';
    $response['imagesFound'] = count($ddImageUrls);
    $response['playwrightImagesCount'] = count($playwrightImages);
    jsonResponse($response);
}
|
|
} catch (Exception $e) {
|
|
$response['steps'][] = "DoorDash extraction failed: " . $e->getMessage() . " - falling through to Claude";
|
|
}
|
|
}
|
|
// ========== END DOORDASH FAST PATH ==========
|
|
|
|
// Extract base URL for relative links:
// $baseUrl is scheme + host; $basePath is the directory part of the URL
// (query string removed, always ending in "/").
if (preg_match('#^(https?://[^/]+)#', $targetUrl, $bm)) {
    $baseUrl = $bm[1];
}
$basePath = preg_replace('#\?.*$#', '', $targetUrl);
if ($baseUrl !== '' && $basePath === $baseUrl) {
    // Bare origin such as "https://example.com": stripping the last path segment
    // would cut the host off and leave "https://", so just append the slash.
    $basePath .= '/';
} elseif (!preg_match('#/$#', $basePath)) {
    $basePath = preg_replace('#/[^/]*$#', '/', $basePath);
}
|
|
}
|
|
} else {
|
|
throw new Exception("Either 'url' or 'html' content is required");
|
|
}
|
|
|
|
// Menu pages array
$menuPages = [['url' => !empty($targetUrl) ? $targetUrl : 'uploaded', 'html' => $pageHtml]];

// Extract images from all pages.
// $imageUrls is used as a set (absolute URL => true); $imageMappings records
// filename/alt/src for every <img> that has alt text.
$imageUrls = [];
$imageMappings = [];

// Add Playwright-captured images
foreach ($playwrightImages as $pwImg) {
    if (!preg_match('#(icon|favicon|logo|sprite|pixel|tracking|badge|button|\.svg)#i', $pwImg)) {
        $imageUrls[$pwImg] = true;
    }
}

foreach ($menuPages as $menuPage) {
    if (preg_match_all('#<img[^>]+src=["\']([^"\']+)["\'][^>]*>#i', $menuPage['html'], $imgMatches, PREG_SET_ORDER)) {
        foreach ($imgMatches as $imgMatch) {
            $imgTag = $imgMatch[0];
            $imgSrc = $imgMatch[1];

            // Extract alt text
            $imgAlt = '';
            if (preg_match('#alt=["\']([^"\']+)["\']#i', $imgTag, $altM)) {
                $imgAlt = $altM[1];
            }

            // Image mapping for local uploads (uses the src exactly as written in the HTML)
            $imgFilename = basename($imgSrc);
            if (strlen($imgFilename) && strlen($imgAlt) && !preg_match('#(icon|favicon|logo|sprite|pixel|tracking|badge|button)#i', $imgSrc)) {
                $imageMappings[] = ['filename' => $imgFilename, 'alt' => $imgAlt, 'src' => $imgSrc];
            }

            // Resolve relative URLs
            if (str_starts_with($imgSrc, '//')) {
                // Protocol-relative URL ("//cdn.example.com/a.jpg"): it already names
                // its host, so only the scheme is missing. Must be tested before the
                // single-slash case, which would glue it onto $baseUrl.
                $imgScheme = parse_url($baseUrl, PHP_URL_SCHEME) ?: 'https';
                $imgSrc = $imgScheme . ':' . $imgSrc;
            } elseif (str_starts_with($imgSrc, '/')) {
                $imgSrc = $baseUrl . $imgSrc;
            } elseif (!preg_match('#^https?://#i', $imgSrc) && !str_starts_with($imgSrc, 'data:')) {
                $imgSrc = $basePath . $imgSrc;
            }

            // Only absolute http(s) URLs that do not look like UI chrome are kept.
            if (preg_match('#^https?://#i', $imgSrc) && !isset($imageUrls[$imgSrc])) {
                if (!preg_match('#(icon|favicon|logo|sprite|pixel|tracking|badge|button)#i', $imgSrc)) {
                    $imageUrls[$imgSrc] = true;
                }
            }
        }
    }
}

$response['steps'][] = "Found " . count($imageUrls) . " unique images";
|
|
|
|
// Check for local scan (ZIP upload): the target URL then points below
// /temp/menu-import/. In that case resolve the import directory on disk.
$isLocalScan = false;
$localBasePath = '';
if (!empty($targetUrl) && stripos($targetUrl, '/temp/menu-import/') !== false) {
    $isLocalScan = true;
    // Keep only "/temp/menu-import/<first segment>/" of the URL.
    $localUrlPath = preg_replace('#https?://[^/]+(/temp/menu-import/[^/]+/).*#i', '$1', $targetUrl);
    $localBasePath = $expandPath($localUrlPath);
    $response['steps'][] = "Local scan detected, base path: $localBasePath";
}
|
|
|
|
// Download/read images (limit to 20).
// Each collected URL is read from local disk (ZIP upload) or fetched over HTTP.
// Payloads of 5000 bytes or less are discarded; accepted ones are base64-encoded
// into image blocks.
$imageDataArray = [];
$downloadedCount = 0;
$localReadCount = 0;

foreach (array_keys($imageUrls) as $imgUrl) {
    if ($downloadedCount >= 20) {
        break;
    }
    try {
        $imgBytes = 0;
        $imgContent = '';
        $mediaType = 'image/jpeg';

        $readFromDisk = $isLocalScan && stripos($imgUrl, '/temp/menu-import/') !== false;
        if ($readFromDisk) {
            $localPath = $expandPath(preg_replace('#https?://[^/]+(/temp/menu-import/.*)#i', '$1', $imgUrl));
            if (file_exists($localPath)) {
                $imgContent = file_get_contents($localPath);
                $imgBytes = strlen($imgContent);
                $ext = strtolower(pathinfo($localPath, PATHINFO_EXTENSION));
                $mediaType = match ($ext) {
                    'png' => 'image/png',
                    'gif' => 'image/gif',
                    'webp' => 'image/webp',
                    default => $mediaType,
                };
                $localReadCount++;
            }
        } else {
            $result = $httpGet($imgUrl, [], 10);
            if ($result['code'] === 200 && !empty($result['body'])) {
                $ct = $result['contentType'];
                // Only responses declaring a supported image content type are used.
                if (preg_match('#image/(jpeg|jpg|png|gif|webp)#i', $ct)) {
                    $imgContent = $result['body'];
                    $imgBytes = strlen($imgContent);
                    $mediaType = match (true) {
                        stripos($ct, 'png') !== false => 'image/png',
                        stripos($ct, 'gif') !== false => 'image/gif',
                        stripos($ct, 'webp') !== false => 'image/webp',
                        default => $mediaType,
                    };
                }
            }
        }

        if ($imgBytes > 5000) {
            $base64Content = base64_encode($imgContent);
            // The media type finally sent is the one reported by $detectMediaType.
            $mediaType = $detectMediaType($base64Content);

            $imageDataArray[] = [
                'type' => 'image',
                'source' => ['type' => 'base64', 'media_type' => $mediaType, 'data' => $base64Content],
                'url' => $imgUrl,
            ];
            $downloadedCount++;
        }
    } catch (Exception $e) {
        // Skip failed downloads
    }
}

$response['steps'][] = "Loaded " . count($imageDataArray) . " valid images ($localReadCount from local disk)";
|
|
|
|
// ============================================================
|
|
// TOAST FAST PATH: Parse __OO_STATE__ directly instead of Claude
|
|
// ============================================================
|
|
if (stripos($pageHtml, 'window.__OO_STATE__') !== false && stripos($pageHtml, 'toasttab') !== false) {
|
|
$response['steps'][] = "Toast page detected - extracting menu data from __OO_STATE__";
|
|
try {
|
|
$ooJson = $extractOoState($pageHtml);
|
|
if ($ooJson !== null) {
|
|
$ooState = json_decode($ooJson, true);
|
|
if (!is_array($ooState)) throw new Exception("Failed to parse __OO_STATE__ JSON");
|
|
|
|
$toastBusiness = [];
|
|
$toastCategories = [];
|
|
$toastItems = [];
|
|
$categorySet = [];
|
|
$itemId = 1;
|
|
$menuNames = [];
|
|
|
|
// Extract restaurant info from ROOT_QUERY: every entry whose key contains
// "restaurantV2By" or "restaurantV2(" may contribute name, description,
// location, brand colour and opening hours to $toastBusiness.
$rootQuery = $ooState['ROOT_QUERY'] ?? null;
if (!empty($rootQuery) && is_array($rootQuery)) {
    foreach ($rootQuery as $rqKey => $rqVal) {
        $isRestaurantKey = stripos($rqKey, 'restaurantV2By') !== false
            || stripos($rqKey, 'restaurantV2(') !== false;
        if (!$isRestaurantKey || !is_array($rqVal)) {
            continue;
        }

        // The first non-empty name wins; later entries do not replace it.
        if (!empty($rqVal['name']) && empty($toastBusiness['name'])) {
            $toastBusiness['name'] = $rqVal['name'];
        }
        if (!empty($rqVal['description'])) {
            $rqDescription = trim((string)$rqVal['description']);
            if (strlen($rqDescription)) {
                $toastBusiness['description'] = $rqDescription;
            }
        }

        if (!empty($rqVal['location']) && is_array($rqVal['location'])) {
            $loc = $rqVal['location'];
            if (!empty($loc['address1'])) {
                // 'address' is the one-line form: "address1, city, state zip".
                $toastBusiness['addressLine1'] = $loc['address1'];
                $toastBusiness['address'] = $loc['address1'];
                if (!empty($loc['city'])) {
                    $toastBusiness['city'] = $loc['city'];
                    $toastBusiness['address'] .= ', ' . $loc['city'];
                }
                if (!empty($loc['state'])) {
                    $toastBusiness['state'] = $loc['state'];
                    $toastBusiness['address'] .= ', ' . $loc['state'];
                }
                $zip = $loc['zip'] ?? $loc['zipCode'] ?? null;
                if (!empty($zip)) {
                    $toastBusiness['zip'] = $zip;
                    $toastBusiness['address'] .= ' ' . $zip;
                }
            }
            if (!empty($loc['phone'])) {
                $toastBusiness['phone'] = $loc['phone'];
            }
            $hasLatitude = !empty($loc['latitude']) && is_numeric($loc['latitude']);
            $hasLongitude = !empty($loc['longitude']) && is_numeric($loc['longitude']);
            if ($hasLatitude && $hasLongitude) {
                $toastBusiness['latitude'] = $loc['latitude'];
                $toastBusiness['longitude'] = $loc['longitude'];
            }
        }

        if (!empty($rqVal['brandColor'])) {
            // Stored without the leading "#".
            $toastBusiness['brandColor'] = str_replace('#', '', $rqVal['brandColor']);
        }

        // Hours from schedule: the first service period of each dated daily schedule.
        $dailySchedules = $rqVal['schedule']['upcomingSchedules'][0]['dailySchedules'] ?? null;
        if (empty($dailySchedules)) {
            continue;
        }

        $dayHours = [];
        foreach ($dailySchedules as $ds) {
            if (empty($ds['date']) || empty($ds['servicePeriods'][0]['startTime'])) {
                continue;
            }
            $dow = (int)date('w', strtotime($ds['date'])) + 1; // 1=Sun
            $sp = $ds['servicePeriods'][0];
            // Keep the leading "HH:MM" of each time string.
            $dayHours[$dow] = ['open' => substr($sp['startTime'], 0, 5), 'close' => substr($sp['endTime'], 0, 5)];
        }

        // Emitted Mon..Sun; keys use the 1=Sun numbering from above.
        $dayNames = [2 => 'Mon', 3 => 'Tue', 4 => 'Wed', 5 => 'Thu', 6 => 'Fri', 7 => 'Sat', 1 => 'Sun'];
        $hoursParts = [];
        foreach ($dayNames as $dIdx => $dayLabel) {
            if (!isset($dayHours[$dIdx])) {
                continue;
            }
            $dh = $dayHours[$dIdx];
            $op = explode(':', $dh['open']);
            $cp = explode(':', $dh['close']);
            $openStr = $formatTime12h((int)$op[0], (int)($op[1] ?? 0));
            $closeStr = $formatTime12h((int)$cp[0], (int)($cp[1] ?? 0));
            $hoursParts[] = $dayLabel . ' ' . $openStr . '-' . $closeStr;
        }
        if (!empty($hoursParts)) {
            $toastBusiness['hours'] = implode(', ', $hoursParts);
        }
    }
}
|
|
|
|
// Builds one output item from a Toast item record and advances the running
// item id. Used for items of groups and of subgroups alike.
$buildToastItem = function ($src, $categoryName) use (&$itemId, $extractToastPrice, $extractToastImage) {
    $entry = [
        'id' => 'item_' . $itemId,
        'name' => trim($src['name']),
        'category' => $categoryName,
        'modifiers' => [],
        'hasModifiers' => !empty($src['hasModifiers']),
        'guid' => $src['guid'] ?? '',
        'itemGroupGuid' => $src['itemGroupGuid'] ?? '',
        'description' => isset($src['description']) ? trim((string)$src['description']) : '',
        'price' => $extractToastPrice($src),
        'imageUrl' => '',
    ];
    $imgUrl = $extractToastImage($src);
    if (strlen($imgUrl)) {
        $entry['imageUrl'] = $imgUrl;
        $entry['imageSrc'] = $imgUrl;
        $entry['imageFilename'] = basename($imgUrl);
    }
    $itemId++;
    return $entry;
};

// Walk every top-level key of the state object.
foreach ($ooState as $ooKey => $ooVal) {
    // Also check Restaurant: keys (older format) - only while no name is known yet.
    if (str_starts_with($ooKey, 'Restaurant:') && empty($toastBusiness['name']) && is_array($ooVal)) {
        if (!empty($ooVal['name'])) {
            $toastBusiness['name'] = $ooVal['name'];
        }
        if (!empty($ooVal['location']) && is_array($ooVal['location'])) {
            $loc = $ooVal['location'];
            if (!empty($loc['address1'])) {
                $toastBusiness['address'] = $loc['address1'];
                if (!empty($loc['city'])) {
                    $toastBusiness['address'] .= ', ' . $loc['city'];
                }
                if (!empty($loc['state'])) {
                    $toastBusiness['address'] .= ', ' . $loc['state'];
                }
                if (!empty($loc['zipCode'])) {
                    $toastBusiness['address'] .= ' ' . $loc['zipCode'];
                }
            }
            if (!empty($loc['phone'])) {
                $toastBusiness['phone'] = $loc['phone'];
            }
        }
        if (!empty($ooVal['brandColor'])) {
            $toastBusiness['brandColor'] = str_replace('#', '', $ooVal['brandColor']);
        }
    }

    // Menu data: only "Menu:" entries that carry a non-empty list of groups.
    $isMenuEntry = str_starts_with($ooKey, 'Menu:')
        && is_array($ooVal)
        && !empty($ooVal['groups'])
        && is_array($ooVal['groups']);
    if (!$isMenuEntry) {
        continue;
    }

    $menuName = $ooVal['name'] ?? '';
    if (strlen($menuName)) {
        $menuNames[] = $menuName;
    }

    foreach ($ooVal['groups'] as $group) {
        // Each group is a category; a category name is registered once.
        $groupName = trim($group['name'] ?? 'Menu');
        if (!isset($categorySet[$groupName])) {
            $categorySet[$groupName] = true;
            $toastCategories[] = ['name' => $groupName, 'itemCount' => 0, 'menuName' => $menuName];
        }

        // Items from group (entries without a usable name are skipped)
        if (!empty($group['items']) && is_array($group['items'])) {
            foreach ($group['items'] as $item) {
                if (empty($item['name']) || !strlen(trim($item['name']))) {
                    continue;
                }
                $toastItems[] = $buildToastItem($item, $groupName);
            }
        }

        // Subgroups: the payload may name them subgroups, children or childGroups.
        $subgroups = $group['subgroups'] ?? $group['children'] ?? $group['childGroups'] ?? [];
        if (empty($subgroups) || !is_array($subgroups)) {
            continue;
        }
        foreach ($subgroups as $sg) {
            $subName = trim($sg['name'] ?? $groupName);
            if (strlen($subName) && !isset($categorySet[$subName])) {
                $categorySet[$subName] = true;
                $toastCategories[] = ['name' => $subName, 'parentCategoryName' => $groupName, 'itemCount' => 0];
            }
            if (empty($sg['items']) || !is_array($sg['items'])) {
                continue;
            }
            foreach ($sg['items'] as $subItem) {
                if (empty($subItem['name']) || !strlen(trim($subItem['name']))) {
                    continue;
                }
                $toastItems[] = $buildToastItem($subItem, $subName);
            }
        }
    }
}
|
|
|
|
// Fallback: business name from title (text before the first "|", with a
// trailing " - Menu/Order/Online..." suffix removed).
if (empty($toastBusiness['name'])) {
    if (preg_match('#<title[^>]*>([^<]+)</title>#i', $pageHtml, $tm)) {
        $titleText = trim($tm[1]);
        if (str_contains($titleText, '|')) $titleText = trim(explode('|', $titleText)[0]);
        $titleText = preg_replace('#\s*-\s*(Menu|Order|Online).*$#i', '', $titleText);
        if (strlen($titleText)) $toastBusiness['name'] = $titleText;
    }
}

// Clean business name: strip ordering-site suffixes, the street address and
// leading/trailing separator characters.
if (!empty($toastBusiness['name'])) {
    $bizName = $toastBusiness['name'];
    // "&" / "and" is written as a real alternation: the former [&and]+ was a
    // character class that matched any run of the characters &, a, n and d.
    // The HTML entity form is accepted too because the name can come straight
    // from the raw <title> markup.
    $bizName = preg_replace('#\s*[-|]+\s*(Order\s+(pickup|online|delivery|food)|Online\s+Order|Delivery\s*(?:&amp;|&|and)\s*Takeout|Takeout\s*(?:&amp;|&|and)\s*Delivery|Menu\s*(?:&amp;|&|and)\s*Order).*$#i', '', $bizName);
    if (!empty($toastBusiness['addressLine1']) && stripos($bizName, $toastBusiness['addressLine1']) !== false) {
        $bizName = trim(str_ireplace($toastBusiness['addressLine1'], '', $bizName));
    }
    if (!empty($toastBusiness['address'])) {
        // First comma-separated part of the one-line address, i.e. the street.
        $addrFirst = trim(explode(',', $toastBusiness['address'])[0]);
        if (strlen($addrFirst) && stripos($bizName, $addrFirst) !== false) {
            $bizName = trim(str_ireplace($addrFirst, '', $bizName));
        }
    }
    $bizName = trim(preg_replace('#[-|]+$#', '', trim($bizName)));
    $bizName = trim(preg_replace('#^[-|]+#', '', $bizName));
    $toastBusiness['name'] = trim($bizName);
}

// Clean city: keep only the part before the first comma.
if (!empty($toastBusiness['city']) && str_contains($toastBusiness['city'], ',')) {
    $toastBusiness['city'] = trim(explode(',', $toastBusiness['city'])[0]);
}
|
|
|
|
// Multi-menu hierarchy: with more than one named menu, every menu becomes a
// top-level category and its groups are re-parented under it.
if (count($menuNames) > 1) {
    $hierarchicalCategories = [];
    $placedCategories = []; // index in $toastCategories => true once emitted

    // array_unique(): a menu name seen twice must not emit its subtree twice.
    foreach (array_unique($menuNames) as $mn) {
        $hierarchicalCategories[] = ['name' => $mn, 'itemCount' => 0];
        foreach ($toastCategories as $tcIdx => $tc) {
            if (isset($placedCategories[$tcIdx]) || ($tc['menuName'] ?? '') !== $mn) {
                continue;
            }
            $tc['parentCategoryName'] = $mn;
            $hierarchicalCategories[] = $tc;
            $placedCategories[$tcIdx] = true;

            // Categories created from subgroups carry no menuName; keep them
            // directly after their parent group instead of dropping them.
            foreach ($toastCategories as $subIdx => $subCat) {
                if (
                    !isset($placedCategories[$subIdx])
                    && !isset($subCat['menuName'])
                    && ($subCat['parentCategoryName'] ?? '') === $tc['name']
                ) {
                    $hierarchicalCategories[] = $subCat;
                    $placedCategories[$subIdx] = true;
                }
            }
        }
    }

    // Anything still unplaced (e.g. groups of a menu without a name) is kept
    // unchanged at the end so that no category referenced by an item is lost.
    foreach ($toastCategories as $tcIdx => $tc) {
        if (!isset($placedCategories[$tcIdx])) {
            $hierarchicalCategories[] = $tc;
        }
    }

    $toastCategories = $hierarchicalCategories;
}
|
|
|
|
// Update category item counts: a category's itemCount is the number of items
// whose 'category' is strictly equal to the category name.
foreach (array_keys($toastCategories) as $ci) {
    $categoryName = $toastCategories[$ci]['name'];
    $itemsInCategory = array_filter(
        $toastItems,
        static fn ($ti) => $ti['category'] === $categoryName
    );
    $toastCategories[$ci]['itemCount'] = count($itemsInCategory);
}

$response['steps'][] = "Extracted " . count($toastItems) . " items from " . count($toastCategories) . " categories via __OO_STATE__";
|
|
|
|
// Toast modifier extraction via Playwright. Runs only when at least one item is
// flagged hasModifiers; any failure is logged and the import continues without
// modifiers.
$toastModifiers = [];
$modifierItemCount = count(array_filter($toastItems, static fn ($ti) => !empty($ti['hasModifiers'])));

if ($modifierItemCount > 0) {
    $response['steps'][] = "$modifierItemCount items have modifiers - extracting via Playwright";
    try {
        // Work out the Toast ordering URL: the target URL itself when it is on
        // toasttab.com, otherwise a shortUrl or gift-card link found in the HTML.
        $toastUrl = '';
        if (!empty($targetUrl) && preg_match('#toasttab\.com#i', $targetUrl)) {
            $toastUrl = $targetUrl;
        } elseif (preg_match('#"shortUrl"\s*:\s*"([^"]+)"#i', $pageHtml, $sm)) {
            // Try shortUrl from HTML
            $toastUrl = 'https://www.toasttab.com/local/order/' . $sm[1];
        } elseif (preg_match('#toasttab\.com/([a-zA-Z0-9_-]+)/giftcards#i', $pageHtml, $gm)) {
            $toastUrl = 'https://www.toasttab.com/local/order/' . $gm[1];
        }

        if (!strlen($toastUrl)) {
            $response['steps'][] = "Could not determine Toast URL for modifier extraction";
        } else {
            $response['steps'][] = "Fetching modifiers from: $toastUrl";
            $modOutput = shell_exec("/opt/playwright/run-toast-modifiers.sh " . escapeshellarg($toastUrl) . " 2>&1");

            if (empty(trim($modOutput ?? ''))) {
                $response['steps'][] = "Playwright modifier script returned empty output";
            } else {
                $modResult = json_decode($modOutput, true);

                if (!empty($modResult['modifiers']) && is_array($modResult['modifiers'])) {
                    $toastModifiers = $modResult['modifiers'];
                    $response['steps'][] = "Extracted " . count($toastModifiers) . " unique modifier groups";
                }

                if (!empty($modResult['itemModifierMap']) && is_array($modResult['itemModifierMap'])) {
                    // The map is keyed by item name.
                    $modMap = $modResult['itemModifierMap'];
                    foreach (array_keys($toastItems) as $mi) {
                        $mappedName = $toastItems[$mi]['name'];
                        if (isset($modMap[$mappedName])) {
                            $toastItems[$mi]['modifiers'] = $modMap[$mappedName];
                        }
                    }
                    $response['steps'][] = "Mapped modifiers to " . count($modMap) . " items";
                }

                if (!empty($modResult['stats'])) {
                    $response['steps'][] = "Modifier stats: " . json_encode($modResult['stats']);
                }
            }
        }
    } catch (Exception $e) {
        $response['steps'][] = "Modifier extraction failed: " . $e->getMessage() . " - continuing without modifiers";
    }
}
|
|
|
|
// Return directly if we have items: publish the parsed Toast data and send the
// response. With no items nothing is set here and execution continues.
if (!empty($toastItems)) {
    $toastPayload = [
        'OK' => true,
        'DATA' => [
            'business' => $toastBusiness,
            'categories' => $toastCategories,
            'items' => $toastItems,
            'modifiers' => $toastModifiers,
            'imageUrls' => [],
            'imageMappings' => $imageMappings,
            'headerCandidateIndices' => [],
        ],
        'sourceUrl' => !empty($targetUrl) ? $targetUrl : 'uploaded',
        'pagesProcessed' => 1,
        'imagesFound' => count($imageDataArray),
        'playwrightImagesCount' => count($playwrightImages),
        'parsedVia' => 'toast_oo_state',
    ];
    foreach ($toastPayload as $respKey => $respVal) {
        $response[$respKey] = $respVal;
    }
    jsonResponse($response);
}
|
|
}
|
|
} catch (Exception $e) {
|
|
$toastError = "Toast __OO_STATE__ parsing failed: " . $e->getMessage();
|
|
$response['steps'][] = "$toastError - falling back to Claude";
|
|
$response['DEBUG_TOAST_ERROR'] = $toastError;
|
|
}
|
|
}
|
|
|
|
// ============================================================
// Look for embedded JSON data (__NEXT_DATA__, window state, etc.)
// ============================================================
$embeddedJsonData = '';

// One entry per embedded-data source, tried in this order for every page.
//   group    - which preg_match_all group is appended (0 = whole match)
//   menuOnly - keep a snippet only if it mentions "menu" (case-insensitive)
$embeddedSources = [
    [
        'pattern'  => '#<script[^>]*id=["\']__NEXT_DATA__["\'][^>]*>([^<]+)</script>#i',
        'group'    => 1,
        'label'    => '__NEXT_DATA__',
        'menuOnly' => false,
    ],
    [
        'pattern'  => '#window\.__[A-Z_]+__\s*=\s*(\{[^;]+\});#',
        'group'    => 0,
        'label'    => 'WINDOW_STATE',
        'menuOnly' => false,
    ],
    [
        'pattern'  => '#data-(?:props|page|state)=["\'](\{[^"\']+\})["\']#i',
        'group'    => 0,
        'label'    => 'DATA_PROPS',
        'menuOnly' => false,
    ],
    [
        'pattern'  => '#<script[^>]*type=["\']application/ld\+json["\'][^>]*>([^<]+)</script>#i',
        'group'    => 1,
        'label'    => 'JSON_LD_MENU',
        'menuOnly' => true,
    ],
];

foreach ($menuPages as $menuPage) {
    foreach ($embeddedSources as $embeddedSource) {
        if (!preg_match_all($embeddedSource['pattern'], $menuPage['html'], $embeddedHits)) {
            continue;
        }
        foreach ($embeddedHits[$embeddedSource['group']] as $snippet) {
            // A case-insensitive search for "menu" also covers "MenuItem".
            if ($embeddedSource['menuOnly'] && stripos($snippet, 'menu') === false) {
                continue;
            }
            $embeddedJsonData .= "\n--- " . $embeddedSource['label'] . " ---\n" . $snippet;
        }
    }
}

// Report in the response whether anything was found (and how much).
$embeddedJsonLength = strlen($embeddedJsonData);
$response['DEBUG_EMBEDDED_JSON_FOUND'] = $embeddedJsonLength > 0;
if ($embeddedJsonLength > 0) {
    $response['DEBUG_EMBEDDED_JSON_LENGTH'] = $embeddedJsonLength;
}
|
|
|
|
// Combine HTML, strip scripts/styles
// Builds one text blob from every menu page (scripts, styles and HTML comments
// removed), appends any embedded JSON found earlier, and caps it at 100000 bytes.
$combinedHtml = '';
$stripPatterns = [
    '#<script[^>]*>.*?</script>#is',
    '#<style[^>]*>.*?</style>#is',
    '#<!--.*?-->#s',
];
foreach ($menuPages as $menuPage) {
    $cleanHtml = $menuPage['html'];
    foreach ($stripPatterns as $stripPattern) {
        // preg_replace() returns null when PCRE fails (for example when the
        // backtrack limit is hit on a very large page). Keep the unstripped
        // text in that case instead of silently dropping the whole page.
        $strippedHtml = preg_replace($stripPattern, '', $cleanHtml);
        if ($strippedHtml !== null) {
            $cleanHtml = $strippedHtml;
        }
    }
    $combinedHtml .= "\n--- PAGE: " . $menuPage['url'] . " ---\n" . $cleanHtml;
}

if (strlen($embeddedJsonData)) {
    $combinedHtml .= "\n\n=== EMBEDDED JSON DATA (may contain full menu) ===\n" . $embeddedJsonData;
}

// NOTE: the embedded JSON is appended last, so it is the first thing lost when
// the cap below applies.
if (strlen($combinedHtml) > 100000) {
    $combinedHtml = substr($combinedHtml, 0, 100000);

    // substr() cuts on a byte boundary and can leave the first bytes of a
    // multi-byte UTF-8 character at the end. Drop such a dangling lead
    // sequence so the text stays encodable as JSON.
    $withoutPartialChar = preg_replace(
        '/(?:[\xC2-\xDF]|[\xE0-\xEF][\x80-\xBF]?|[\xF0-\xF4][\x80-\xBF]{0,2})\z/',
        '',
        $combinedHtml
    );
    if ($withoutPartialChar !== null) {
        $combinedHtml = $withoutPartialChar;
    }
}
|
|
|
|
// Server-side heading hierarchy detection
// Walks the combined HTML in document order and records, for every usable <h2>,
// the <h3> headings that follow it (until the next <h2>).
$headingHierarchy = [];
$hierarchyDesc = '';
$scanPos = 0;
$currentH2 = '';
while ($scanPos < strlen($combinedHtml)) {
    $nextH2 = preg_match('#<h2[^>]*>#i', $combinedHtml, $m2, PREG_OFFSET_CAPTURE, $scanPos) ? $m2[0][1] : false;
    $nextH3 = preg_match('#<h3[^>]*>#i', $combinedHtml, $m3, PREG_OFFSET_CAPTURE, $scanPos) ? $m3[0][1] : false;

    if ($nextH2 === false && $nextH3 === false) {
        break;
    }

    // Whichever heading opens first is handled in this pass.
    $isH2 = $nextH2 !== false && ($nextH3 === false || $nextH2 < $nextH3);
    $openPos = $isH2 ? $nextH2 : $nextH3;
    $closePos = stripos($combinedHtml, $isH2 ? '</h2>' : '</h3>', $openPos);
    if ($closePos === false) {
        break; // unterminated heading: stop scanning
    }

    // Visible text of the heading, including its open/close tags before stripping.
    $headingText = trim(strip_tags(substr($combinedHtml, $openPos, $closePos + 5 - $openPos)));
    $scanPos = $closePos + 5;

    if ($isH2) {
        // An <h2> that is empty, just "MENU", or mentions copyright resets the
        // current parent so following <h3>s are not attached to it.
        $h2Clean = trim(preg_replace('/[^a-zA-Z0-9 ]/', '', $headingText));
        $isUsableParent = strlen($h2Clean)
            && strtoupper($h2Clean) !== 'MENU'
            && stripos($h2Clean, 'copyright') === false;
        $currentH2 = $isUsableParent ? $headingText : '';
        continue;
    }

    if (strlen($currentH2) && strlen($headingText)) {
        $headingHierarchy[$currentH2][] = $headingText;
    }
}

// Describe the detected hierarchy in plain text for the prompt and log a step.
if (!empty($headingHierarchy)) {
    foreach ($headingHierarchy as $hParent => $hChildren) {
        $hierarchyDesc .= sprintf("- \"%s\" contains subsections: %s\n", $hParent, implode(', ', $hChildren));
    }
    $response['steps'][] = 'Detected ' . count($headingHierarchy) . ' parent categories with subcategories from h2/h3 structure';
}
|
|
|
|
// ============================================================
|
|
// Claude API call for generic pages
|
|
// ============================================================
|
|
// System prompt for the generic-page Claude request; it is passed as the
// 'system' field of the request body built below. It instructs the model to
// return JSON with business, menus, categories, modifiers and items keys, and
// spells out the menu/category/item distinctions, the categories format,
// subcategory handling, image extraction and JSON-escaping rules.
// NOTE: the literal continues across a line break, so the prompt text contains
// a newline before "SUBCATEGORY DETECTION".
$systemPrompt = 'You are an expert at extracting structured menu data from restaurant website HTML. Extract ALL menu data visible in the HTML. Return valid JSON with these keys: business (object with name, address, phone, hours, brandColor), menus (array of objects — see below), categories (array), modifiers (array), items (array with name, description, price, category, menu, modifiers array, and imageUrl). MENUS vs CATEGORIES (CRITICAL): A MENU is a distinct time-based or themed menu that a restaurant offers separately — e.g., "Brunch", "Lunch", "Dinner", "Happy Hour", "Late Night", "Kids Menu". If a restaurant has multiple menus, return a "menus" array of objects like [{"name": "Brunch"}, {"name": "Lunch"}, {"name": "Dinner"}]. Each item should have a "menu" field set to which menu it belongs to. If the restaurant only has one menu or the sections are food-type categories (not time/theme based), omit the "menus" key entirely and treat everything as categories within a single menu. CATEGORIES vs ITEMS (CRITICAL): A CATEGORY is a broad section heading that groups multiple items (e.g., \'Appetizers\', \'Tacos\', \'Drinks\', \'Desserts\'). An ITEM is an individual food or drink product with a name, description, and price. Do NOT create a category for each individual item. A typical restaurant has 5-15 categories and 30-150 items. If you find yourself creating more categories than items, you are wrong - those are items, not categories. Each item must have a \'category\' field set to the category it belongs to. CATEGORIES FORMAT: Each entry in the categories array can be either a simple string (for flat categories) OR an object with \'name\' and optional \'subcategories\' array. Example: ["Appetizers", {"name": "Drinks", "subcategories": ["Hot Drinks", "Cold Drinks"]}, "Desserts"]. 
SUBCATEGORY DETECTION: If a section header contains nested titled sections beneath it (sub-headers with their own items), the outer section is the PARENT and inner sections are SUBCATEGORIES. For items in subcategories, set their \'category\' field to the SUBCATEGORY name (not the parent). CRITICAL FOR IMAGES: Each menu item in the HTML is typically in a container (div, li, article) that also contains an img tag. Extract the img src URL and include it as \'imageUrl\' for that item. Look for img tags that are siblings or children within the same menu-item container. The image URL should be the full or relative src value from the img tag - NOT the alt text. CRITICAL: Extract EVERY menu item from ALL sources including embedded JSON (__NEXT_DATA__, window state, JSON-LD). For brandColor: suggest a vibrant hex (6 digits, no hash). For prices: numbers (e.g., 12.99). CRITICAL: Return ONLY valid JSON. All special characters in strings must be properly escaped. Never use smart/curly quotes. Use only ASCII double quotes for JSON string delimiters and backslash-escape any literal double quotes inside values.';
|
|
|
|
// Build message content
// The user turn carries up to 10 page images followed by one text part that
// holds the instructions, the detected heading hierarchy (if any) and the HTML.
$messagesContent = [];

// Add images (up to 10)
$imgLimit = min(count($imageDataArray), 10);
for ($i = 0; $i < $imgLimit; $i++) {
    $messagesContent[] = ['type' => 'image', 'source' => $imageDataArray[$i]['source']];
}

// Add HTML text
$userText = "Extract menu data from this restaurant website HTML. The images above are from the same website - identify which ones are food photos that could be used as item images, and which could be header/banner images.";
if (strlen($hierarchyDesc)) {
    $userText .= "\n\nIMPORTANT - DETECTED SECTION HIERARCHY FROM HTML HEADINGS:\n"
        . "The following h2 sections contain h3 sub-sections. Use these as parent-subcategory relationships in your categories output:\n"
        . $hierarchyDesc
        . "For each parent above, include it in the categories array as an OBJECT with 'name' and 'subcategories' array. Items belonging to a subsection should have their 'category' field set to the SUBCATEGORY name (not the parent).";
}
$userText .= "\n\nHere is the HTML content:\n\n" . $combinedHtml;
$messagesContent[] = ['type' => 'text', 'text' => $userText];

$requestBody = [
    'model' => 'claude-sonnet-4-20250514',
    'max_tokens' => 16384,
    'temperature' => 0,
    'system' => $systemPrompt,
    'messages' => [['role' => 'user', 'content' => $messagesContent]],
];

// The page text is scraped bytes (and is byte-truncated upstream), so it may
// contain invalid UTF-8. Plain json_encode() returns false in that case, which
// would post an empty body. Substitute bad bytes where the flag is available
// and fail loudly if encoding still does not succeed.
$encodeFlags = defined('JSON_INVALID_UTF8_SUBSTITUTE') ? JSON_INVALID_UTF8_SUBSTITUTE : 0;
$requestJson = json_encode($requestBody, $encodeFlags);
if ($requestJson === false) {
    throw new Exception('Failed to encode Claude request: ' . json_last_error_msg());
}

$response['steps'][] = "Sending to Claude API...";

// $httpPost is defined earlier in this file; it is used here as
// (url, body, headers, timeout-in-seconds) returning ['code' => ..., 'body' => ...].
$claudeResult = $httpPost(
    'https://api.anthropic.com/v1/messages',
    $requestJson,
    ['Content-Type: application/json', "x-api-key: $CLAUDE_API_KEY", 'anthropic-version: 2023-06-01'],
    120
);

if ($claudeResult['code'] !== 200) {
    // Prefer the API's structured error message; fall back to a body excerpt.
    $errorDetail = '';
    $errData = json_decode($claudeResult['body'], true);
    if (!empty($errData['error']['message'])) {
        $errorDetail = $errData['error']['message'];
    } else {
        $errorDetail = substr($claudeResult['body'], 0, 500);
    }
    throw new Exception("Claude API error: {$claudeResult['code']} - $errorDetail");
}

$claudeResponse = json_decode($claudeResult['body'], true);
if (empty($claudeResponse['content'])) throw new Exception("Empty response from Claude");

// A response cut off at max_tokens usually ends mid-JSON; record that so a
// following parse error can be told apart from genuinely malformed output.
if (($claudeResponse['stop_reason'] ?? '') === 'max_tokens') {
    $response['steps'][] = "Claude response hit the max_tokens limit - output may be truncated";
}

// Use the first text block of the reply.
$responseText = '';
foreach ($claudeResponse['content'] as $block) {
    if (($block['type'] ?? '') === 'text') {
        $responseText = $block['text'];
        break;
    }
}

// $cleanClaudeJson is defined earlier in this file.
$responseText = $cleanClaudeJson($responseText);
$response['DEBUG_RAW_CLAUDE'] = $responseText;

$menuData = json_decode($responseText, true);
if (!is_array($menuData)) {
    $response['OK'] = false;
    $response['MESSAGE'] = 'JSON parse error';
    $response['DEBUG_RAW_RESPONSE'] = substr($responseText, 0, 3000);
    // NOTE(review): the code below assumes jsonResponse() ends the request - confirm in helpers.php.
    jsonResponse($response);
}
|
|
|
|
// Build image URL list
$imageUrlList = [];
foreach ($imageDataArray as $imgData) {
    if (!empty($imgData['url'])) $imageUrlList[] = $imgData['url'];
}

// Ensure expected structure
// $menuData is decoded model output, so a key can be present with the wrong
// type. 'business' only needs to exist; the three list keys are counted and
// iterated below, and count()/foreach on a non-array raises a TypeError or a
// warning that the surrounding catch (Exception) does not handle.
if (!isset($menuData['business'])) $menuData['business'] = [];
foreach (['categories', 'modifiers', 'items'] as $listKey) {
    if (!isset($menuData[$listKey]) || !is_array($menuData[$listKey])) {
        $menuData[$listKey] = [];
    }
}

// Pass through menus array if Claude detected multiple menus
if (!empty($menuData['menus']) && is_array($menuData['menus']) && count($menuData['menus']) > 1) {
    // Accept both {"name": "..."} objects and bare strings when listing names.
    $menuNames = [];
    foreach ($menuData['menus'] as $menuEntry) {
        if (is_string($menuEntry)) {
            $menuNames[] = $menuEntry;
        } elseif (is_array($menuEntry) && isset($menuEntry['name']) && is_scalar($menuEntry['name'])) {
            $menuNames[] = (string)$menuEntry['name'];
        }
    }
    $response['steps'][] = "Detected " . count($menuData['menus']) . " separate menus: " . implode(', ', $menuNames);
}
|
|
|
|
// Convert categories to expected format
// Each entry becomes ['name' => ..., 'itemCount' => 0]; subcategories are
// flattened into the same list with a 'parentCategoryName' pointing at their parent.
$formattedCategories = [];
foreach ($menuData['categories'] as $categoryEntry) {
    // Flat category given as a plain string.
    if (is_string($categoryEntry)) {
        $formattedCategories[] = ['name' => $categoryEntry, 'itemCount' => 0];
        continue;
    }
    if (!is_array($categoryEntry)) {
        continue;
    }

    // Object form: a parent with optional subcategories.
    $parentName = $categoryEntry['name'] ?? '';
    if (!strlen($parentName)) {
        continue;
    }
    $formattedCategories[] = ['name' => $parentName, 'itemCount' => 0];

    if (empty($categoryEntry['subcategories']) || !is_array($categoryEntry['subcategories'])) {
        continue;
    }
    foreach ($categoryEntry['subcategories'] as $childEntry) {
        $childName = is_string($childEntry) ? $childEntry : ($childEntry['name'] ?? '');
        if (!strlen($childName)) {
            continue;
        }
        $formattedCategories[] = [
            'name' => $childName,
            'parentCategoryName' => $parentName,
            'itemCount' => 0,
        ];
    }
}
$menuData['categories'] = $formattedCategories;
|
|
|
|
// Fix "every item is a category" pattern
// When there are many categories relative to items, and most categories hold
// exactly one item while some hold none, the empty ones are treated as the real
// section headings and every other category's items are folded into the
// nearest preceding empty category.
$totalItems = count($menuData['items']);
$totalCats = count($formattedCategories);
if ($totalCats > 10 && $totalItems > 0 && $totalCats > $totalItems * 0.5) {
    $zeroCats = [];
    $singleCats = [];
    foreach ($formattedCategories as $fc) {
        $fcCount = 0;
        foreach ($menuData['items'] as $fi) {
            // Items are model output and may lack 'category'; ?? avoids
            // undefined-key warnings on such entries.
            if (($fi['category'] ?? null) === $fc['name']) $fcCount++;
        }
        if ($fcCount === 0) $zeroCats[] = $fc['name'];
        elseif ($fcCount === 1) $singleCats[] = $fc['name'];
    }

    if (count($singleCats) > $totalCats * 0.6 && !empty($zeroCats)) {
        $response['steps'][] = "Detected 'every item is a category' pattern (" . count($singleCats) . " single-item cats, " . count($zeroCats) . " empty cats) - collapsing";

        // Categories are walked in order; items seen before the first empty
        // category fall under the first empty category.
        $currentParent = $zeroCats[0];
        foreach ($formattedCategories as $fc) {
            if (in_array($fc['name'], $zeroCats, true)) {
                $currentParent = $fc['name'];
                continue;
            }
            foreach ($menuData['items'] as $itemKey => $fi) {
                if (($fi['category'] ?? null) === $fc['name']) {
                    $menuData['items'][$itemKey]['category'] = $currentParent;
                }
            }
        }

        // Only the former empty categories survive, now with real item counts.
        $fixedCategories = [];
        foreach ($zeroCats as $zc) {
            $zcCount = 0;
            foreach ($menuData['items'] as $fi) {
                if (($fi['category'] ?? null) === $zc) $zcCount++;
            }
            $fixedCategories[] = ['name' => $zc, 'itemCount' => $zcCount];
        }
        $menuData['categories'] = $fixedCategories;
        $formattedCategories = $fixedCategories;
        $response['steps'][] = "Collapsed to " . count($fixedCategories) . " categories";
    }
}
|
|
|
|
// Server-side hierarchy enforcement from HTML heading structure
// Categories that have no parent yet, but whose name matches an <h3> found
// under an <h2>, are attached to the category matching that <h2>.
if (!empty($headingHierarchy)) {
    // Lower-cased, trimmed <h3> text => the <h2> text it appeared under.
    $h3ToParent = [];
    foreach ($headingHierarchy as $hParentName => $hChildren) {
        foreach ($hChildren as $hChild) {
            $h3ToParent[strtolower(trim($hChild))] = $hParentName;
        }
    }

    $hierarchyApplied = 0;
    foreach (array_keys($formattedCategories) as $catIdx) {
        if (!empty($formattedCategories[$catIdx]['parentCategoryName'])) {
            continue; // already has a parent
        }
        $catLower = strtolower(trim($formattedCategories[$catIdx]['name']));
        if (!isset($h3ToParent[$catLower])) {
            continue; // not one of the detected <h3> headings
        }

        // Normalise the heading text once: alphanumerics and spaces only,
        // lower-cased, trailing "menu" removed.
        $rawParent = $h3ToParent[$catLower];
        $parentNorm = strtolower(preg_replace('/[^a-zA-Z0-9 ]/', '', $rawParent));
        $parentNorm = trim(preg_replace('/\s*menu\s*$/i', '', $parentNorm));
        $rawParentLower = strtolower($rawParent);

        // First category whose name matches the parent heading, either after
        // normalisation or by plain case-insensitive equality.
        $matchedParent = '';
        foreach ($formattedCategories as $pcat) {
            $pcatLower = strtolower($pcat['name']);
            $pcatNorm = trim(preg_replace('/\s*menu\s*$/i', '', $pcatLower));
            if ($pcatNorm === $parentNorm || $pcatLower === $rawParentLower) {
                $matchedParent = $pcat['name'];
                break;
            }
        }
        if (!strlen($matchedParent)) {
            continue;
        }

        $formattedCategories[$catIdx]['parentCategoryName'] = $matchedParent;
        $hierarchyApplied++;
    }

    if ($hierarchyApplied > 0) {
        $menuData['categories'] = $formattedCategories;
        $response['steps'][] = "Server-side hierarchy: applied $hierarchyApplied parent-child relationships";
    }
}
|
|
|
|
// Items with subcategory field from Claude, then add item IDs
// The items come from decoded model output. If they decode as a string-keyed
// object rather than a list, indexing by position writes to keys that do not
// exist, which grows the array on every pass so a count()-bounded loop never
// finishes. Normalise to a 0-based list of arrays first.
$menuData['items'] = is_array($menuData['items'] ?? null)
    ? array_values(array_filter($menuData['items'], 'is_array'))
    : [];

foreach ($menuData['items'] as $itemIndex => $itemRow) {
    // A non-empty 'subcategory' takes precedence over 'category'.
    if (!empty($itemRow['subcategory'])) {
        $menuData['items'][$itemIndex]['category'] = $itemRow['subcategory'];
    }
    // IDs are 1-based and follow list order: item_1, item_2, ...
    $menuData['items'][$itemIndex]['id'] = 'item_' . ($itemIndex + 1);
}
|
|
|
|
// Process item images
// Derives imageSrc / imageFilename (and imageFilenames per size) for each item
// from, in order of preference: an 'images' map, 'imageUrl', or 'imageSrc'.
// Values are model output, so each one is checked with is_scalar() before it
// reaches basename(), which throws a TypeError on arrays.
$itemsWithImages = 0;
foreach ($menuData['items'] as $itemKey => $item) {
    if (!is_array($item)) {
        continue;
    }

    if (!empty($item['images']) && is_array($item['images'])) {
        $imgObj = $item['images'];
        $itemsWithImages++;

        // One filename per size key that holds a non-blank scalar URL.
        $filenames = [];
        foreach ($imgObj as $sizeKey => $imgUrl) {
            if (is_scalar($imgUrl) && strlen(trim((string)$imgUrl))) {
                $filenames[$sizeKey] = basename((string)$imgUrl);
            }
        }
        $menuData['items'][$itemKey]['imageFilenames'] = $filenames;

        // Primary image: first of src, large, medium, small that is set.
        $primarySrc = $imgObj['src'] ?? $imgObj['large'] ?? $imgObj['medium'] ?? $imgObj['small'] ?? null;
        if ($primarySrc && is_scalar($primarySrc)) {
            $menuData['items'][$itemKey]['imageSrc'] = $primarySrc;
            $menuData['items'][$itemKey]['imageFilename'] = basename((string)$primarySrc);
        }
    } elseif (!empty($item['imageUrl']) && is_scalar($item['imageUrl'])) {
        $menuData['items'][$itemKey]['imageSrc'] = $item['imageUrl'];
        $menuData['items'][$itemKey]['imageFilename'] = basename((string)$item['imageUrl']);
        $itemsWithImages++;
    } elseif (!empty($item['imageSrc']) && is_scalar($item['imageSrc'])) {
        $menuData['items'][$itemKey]['imageFilename'] = basename((string)$item['imageSrc']);
        $itemsWithImages++;
    }
}
$response['steps'][] = "Found images for $itemsWithImages of " . count($menuData['items']) . " items";
|
|
|
|
// Attach the image bookkeeping to the menu payload. array_replace() overwrites
// keys that already exist and appends the rest in the order listed.
$menuData = array_replace($menuData, [
    'imageUrls' => $imageUrlList,
    'headerCandidateIndices' => [],
    'imageMappings' => $imageMappings,
]);

// Success response: the payload plus summary counters for the caller.
$response = array_replace($response, [
    'OK' => true,
    'DATA' => $menuData,
    'sourceUrl' => empty($targetUrl) ? 'uploaded' : $targetUrl,
    'pagesProcessed' => count($menuPages),
    'imagesFound' => count($imageDataArray),
    'playwrightImagesCount' => count($playwrightImages),
]);
|
|
|
|
} catch (Exception $e) {
|
|
$response['MESSAGE'] = $e->getMessage();
|
|
}
|
|
|
|
jsonResponse($response);
|