false];
try {
// Resolve the Claude API key from config/claude.json, located two directories
// above this script. A missing file, undecodable JSON, or an empty "apiKey"
// entry all end in the same "not configured" exception below.
$CLAUDE_API_KEY = '';
$configPath = realpath(__DIR__ . '/../../config/claude.json');
if ($configPath && file_exists($configPath)) {
    $configData = json_decode(file_get_contents($configPath), true);
    $CLAUDE_API_KEY = empty($configData['apiKey']) ? '' : $configData['apiKey'];
}
if (empty($CLAUDE_API_KEY)) {
    throw new Exception('Claude API key not configured');
}
// Read the decoded request body; an empty payload is treated as a hard error.
$data = readJsonBody();
if (empty($data)) {
    throw new Exception('No request body provided');
}

// Progress log plus a small diagnostic summary of what the client sent.
$response['steps'] = [];
$response['debug'] = [
    'hasHtmlKey' => isset($data['html']),
    'hasUrlKey' => isset($data['url']),
    'htmlLength' => isset($data['html']) ? strlen($data['html']) : 0,
    'urlValue' => $data['url'] ?? '',
];

// Working state filled in by the request-parsing branches further down.
$pageHtml = $baseUrl = $basePath = $targetUrl = '';
$playwrightImages = [];
// Helper: webroot path
// isDev() is defined outside this chunk; presumably it reports whether this is
// the development environment (confirm against its definition).
$webroot = isDev()
    ? '/opt/lucee/tomcat/webapps/ROOT'
    : '/var/www/biz.payfrit.com';
// Helper: expand a URL path to a local file path
//
// @param string $urlPath Site-relative URL path, e.g. "/temp/menu-import/<id>/index.html".
// @return string $webroot with $urlPath appended.
// @throws Exception When $urlPath contains a NUL byte or a ".." segment. Callers
//                   build $urlPath from the request URL and then read the file,
//                   so parent-directory segments would allow reads outside the
//                   webroot.
$expandPath = function(string $urlPath) use ($webroot): string {
    if (str_contains($urlPath, "\0")) {
        throw new Exception('Invalid path: NUL byte is not allowed');
    }
    // Treat both slash styles as separators when looking for ".." segments.
    $segments = explode('/', str_replace('\\', '/', $urlPath));
    if (in_array('..', $segments, true)) {
        throw new Exception('Invalid path: parent-directory segments are not allowed');
    }
    return $webroot . $urlPath;
};
// Helper: convert 24h time to 12h format string
//
// @param int $h Hour on a 24-hour clock; values outside 0-23 are wrapped (24 => 0).
// @param int $m Minute; omitted from the output when it is 0 or negative.
// @return string e.g. "9am", "1:05pm", "12am" (midnight), "12:30pm".
$formatTime12h = function(int $h, int $m): string {
    // Wrap into 0-23 so an end-of-day "24:00" is rendered as midnight, not "12pm".
    $h = (($h % 24) + 24) % 24;
    $ampm = $h >= 12 ? 'pm' : 'am';
    $h12 = $h % 12;
    if ($h12 === 0) $h12 = 12;
    // str_pad() takes a string; cast explicitly so this also works under strict_types.
    $minutes = $m > 0 ? ':' . str_pad((string)$m, 2, '0', STR_PAD_LEFT) : '';
    return $h12 . $minutes . $ampm;
};
// Helper: pull a string value out of escaped JSON embedded in HTML, where every
// quote appears as backslash-quote (\").
$BQ = "\\\""; // backslash-quote as it appears in HTML
// @param string $text     Text containing \"key\":\"value\" pairs.
// @param string $key      Key to look for (matched case-insensitively).
// @param int    $startPos Offset in $text at which to begin searching.
// @return string|null The text between the opening and closing \" of the value,
//                     or null when the key is absent, the value is unterminated,
//                     or the value is empty.
$extractBqValue = function(string $text, string $key, int $startPos = 0) use ($BQ): ?string {
    $needle = $BQ . $key . $BQ . ':' . $BQ;
    $hit = stripos($text, $needle, $startPos);
    if ($hit === false) {
        return null;
    }
    $from = $hit + strlen($needle);
    $to = strpos($text, $BQ, $from);
    // An immediately-following terminator means an empty value, reported as null.
    return ($to !== false && $to > $from)
        ? substr($text, $from, $to - $from)
        : null;
};
// Helper: extract __OO_STATE__ JSON using brace-counting
//
// Finds "window.__OO_STATE__" (case-insensitively), then returns the text from the
// first "{" after it up to the matching "}". Braces inside double-quoted strings
// are ignored, and backslash escapes inside strings are honoured.
//
// @param string $html Page source.
// @return string|null The object literal with HTML entities decoded, or null when
//                     the marker, an opening brace, or the matching closing brace
//                     cannot be found.
$extractOoState = function(string $html): ?string {
    $ooStart = stripos($html, 'window.__OO_STATE__');
    if ($ooStart === false) return null;
    $braceStart = strpos($html, '{', $ooStart);
    if ($braceStart === false) return null;
    $depth = 0;
    $inStr = false;
    $esc = false;
    $totalLen = strlen($html);
    // 0 doubles as "not found": the opening brace always sits after the marker,
    // so a real closing brace can never be at offset 0.
    $braceEnd = 0;
    for ($i = $braceStart; $i < $totalLen; $i++) {
        $ch = $html[$i];
        if ($esc) { $esc = false; continue; }           // character after a backslash
        if ($ch === '\\' && $inStr) { $esc = true; continue; }
        if ($ch === '"') { $inStr = !$inStr; continue; }
        if (!$inStr) {
            if ($ch === '{') $depth++;
            elseif ($ch === '}') {
                $depth--;
                if ($depth === 0) { $braceEnd = $i; break; }
            }
        }
    }
    if ($braceEnd === 0) return null;
    $json = substr($html, $braceStart, $braceEnd - $braceStart + 1);
    // Decode HTML entities from View Source (&amp; &lt; &gt; &quot;). A single-pass
    // decode is used so "&amp;lt;" becomes "&lt;" rather than being decoded twice.
    $json = htmlspecialchars_decode($json, ENT_COMPAT);
    return $json;
};
// Helper: resolve a Toast item's price, trying each known field in priority order:
// prices[0], price, unitPrice, basePrice, then the digits of displayPrice.
//
// @param array $item Decoded Toast item.
// @return float The first numeric price found, or 0.0 when none is usable.
$extractToastPrice = function(array $item): float {
    $tiers = $item['prices'] ?? null;
    if (!empty($tiers) && is_array($tiers) && is_numeric($tiers[0] ?? null)) {
        return (float)$tiers[0];
    }

    foreach (['price', 'unitPrice', 'basePrice'] as $field) {
        if (isset($item[$field]) && is_numeric($item[$field])) {
            return (float)$item[$field];
        }
    }

    if (isset($item['displayPrice'])) {
        $label = (string)$item['displayPrice'];
        if (strlen(trim($label))) {
            // Keep digits and dots only, e.g. "$12.99" -> "12.99".
            $digits = preg_replace('/[^0-9.]/', '', $label);
            if (strlen($digits) && is_numeric($digits)) {
                return (float)$digits;
            }
        }
    }

    return 0.0;
};
// Helper: pick a Toast item's image URL, preferring the medium size, then large,
// then small.
//
// @param array $item Decoded Toast item.
// @return string The first non-null size found under 'imageUrls', or '' when
//                'imageUrls' is missing, not an array, or has none of the sizes.
$extractToastImage = function(array $item): string {
    $sizes = $item['imageUrls'] ?? null;
    if (!is_array($sizes)) {
        return '';
    }
    foreach (['medium', 'large', 'small'] as $size) {
        if (isset($sizes[$size])) {
            return $sizes[$size];
        }
    }
    return '';
};
// Helper: clean JSON from Claude response
//
// Normalises a model reply so it has a better chance of passing json_decode():
// strips markdown code fences, drops any text before the first "{" and after the
// last "}", removes trailing commas and control characters, and converts smart
// single quotes, dashes and ellipses to ASCII.
//
// @param string $text Raw text returned by the model.
// @return string The cleaned text (not guaranteed to be valid JSON).
$cleanClaudeJson = function(string $text): string {
    $text = trim($text);
    // Strip markdown code fences
    if (str_starts_with($text, '```json')) $text = substr($text, 7);
    if (str_starts_with($text, '```')) $text = substr($text, 3);
    if (str_ends_with($text, '```')) $text = substr($text, 0, -3);
    $text = trim($text);
    // Keep only the outermost {...}: drop any preamble before the first "{" and any
    // commentary after the last "}", either of which would break json_decode().
    $jsonStart = strpos($text, '{');
    if ($jsonStart !== false) {
        $jsonEnd = strrpos($text, '}');
        $text = ($jsonEnd !== false && $jsonEnd > $jsonStart)
            ? substr($text, $jsonStart, $jsonEnd - $jsonStart + 1)
            : substr($text, $jsonStart);
        $text = trim($text);
        // Only reachable when there is no closing brace: strip a dangling fence.
        if (str_ends_with($text, '```')) {
            $text = trim(substr($text, 0, -3));
        }
    }
    // Remove trailing commas before ] or }
    // (preg_replace() returns null on a PCRE error; keep the input in that case)
    $text = preg_replace('/,(\s*[\]\}])/', '$1', $text) ?? $text;
    // Remove control characters (tab, LF and CR are kept)
    $text = preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F]/', '', $text) ?? $text;
    // Clean smart quotes/dashes
    $text = str_replace(["\xe2\x80\x98", "\xe2\x80\x99"], "'", $text); // smart single quotes
    $text = str_replace(["\xe2\x80\x93", "\xe2\x80\x94"], "-", $text); // en/em dash
    $text = str_replace("\xe2\x80\xa6", "...", $text); // ellipsis
    return $text;
};
// Helper: guess an image's media type from the leading characters of its base64
// encoding. Anything that matches none of the known prefixes is reported as JPEG.
//
// @param string $base64 Base64-encoded image data.
// @return string One of image/png, image/gif, image/webp, image/jpeg.
$detectMediaType = function(string $base64): string {
    $signatures = [
        'iVBO' => 'image/png',
        'R0lGOD' => 'image/gif',
        'UklGR' => 'image/webp',
    ];
    foreach ($signatures as $prefix => $mediaType) {
        if (str_starts_with($base64, $prefix)) {
            return $mediaType;
        }
    }
    return 'image/jpeg';
};
// Helper: HTTP GET with curl (redirects are followed)
//
// @param string $url     Absolute URL to fetch.
// @param array  $headers Raw header lines, e.g. ["Authorization: Bearer ..."].
// @param int    $timeout Total request timeout in seconds.
// @return array ['body' => curl_exec() result, 'code' => HTTP status code,
//                'contentType' => Content-Type value, or '' when it is null]
$httpGet = function(string $url, array $headers = [], int $timeout = 30): array {
    $options = [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT => $timeout,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_HTTPHEADER => $headers,
    ];

    $handle = curl_init($url);
    curl_setopt_array($handle, $options);
    $payload = curl_exec($handle);
    $status = curl_getinfo($handle, CURLINFO_HTTP_CODE);
    $mime = curl_getinfo($handle, CURLINFO_CONTENT_TYPE);
    curl_close($handle);

    return [
        'body' => $payload,
        'code' => $status,
        'contentType' => $mime ?? '',
    ];
};
// Helper: HTTP POST with curl
//
// @param string $url     Absolute URL to post to.
// @param string $body    Raw request body, sent as-is.
// @param array  $headers Raw header lines, e.g. ["Content-Type: application/json"].
// @param int    $timeout Total request timeout in seconds.
// @return array ['body' => curl_exec() result, 'code' => HTTP status code]
$httpPost = function(string $url, string $body, array $headers = [], int $timeout = 30): array {
    $options = [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST => true,
        CURLOPT_POSTFIELDS => $body,
        CURLOPT_TIMEOUT => $timeout,
        CURLOPT_HTTPHEADER => $headers,
    ];

    $handle = curl_init($url);
    curl_setopt_array($handle, $options);
    $payload = curl_exec($handle);
    $status = curl_getinfo($handle, CURLINFO_HTTP_CODE);
    curl_close($handle);

    return [
        'body' => $payload,
        'code' => $status,
    ];
};
// ============================================================
// Parse request: HTML content or URL
// ============================================================
if (!empty($data['html'])) {
$pageHtml = trim($data['html']);
$response['steps'][] = "Using provided HTML content: " . strlen($pageHtml) . " bytes";
} elseif (!empty($data['url'])) {
$targetUrl = trim($data['url']);
if (!preg_match('#^https?://#i', $targetUrl)) {
$targetUrl = 'https://' . $targetUrl;
}
// ========== GRUBHUB FAST PATH ==========
// Grubhub restaurant URLs are answered from Grubhub's JSON API (anonymous token,
// then the restaurant endpoint) instead of parsing page HTML. On success the parsed
// business info, categories, items and modifier groups are passed to
// jsonResponse(). Any failure along the way throws an Exception.
if (preg_match('#grubhub\.com/restaurant/#i', $targetUrl)) {
    $response['steps'][] = "Grubhub URL detected - using API";
    // Extract restaurant ID: an all-digit path segment at the end of the path,
    // tolerating a trailing slash, a query string, or a #fragment after it.
    if (!preg_match('#/(\d+)/?(?:[?\#]|$)#', $targetUrl, $ghIdMatch)) {
        throw new Exception('Could not extract Grubhub restaurant ID from URL');
    }
    $ghRestaurantId = $ghIdMatch[1];
    $response['steps'][] = "Grubhub restaurant ID: $ghRestaurantId";
    // Get anonymous access token
    $ghAuth = $httpPost(
        'https://api-gtm.grubhub.com/auth',
        '{"brand":"GRUBHUB","client_id":"beta_UmWlpstzQSFmocLy3h1UieYcVST","scope":"anonymous"}',
        ['Content-Type: application/json'],
        15
    );
    if ($ghAuth['code'] !== 200) throw new Exception("Grubhub auth failed: {$ghAuth['code']}");
    $ghAuthData = json_decode((string)$ghAuth['body'], true);
    // A 200 with an unexpected body must not turn into an empty "Bearer " header.
    $ghToken = $ghAuthData['session_handle']['access_token'] ?? '';
    if (!is_string($ghToken) || $ghToken === '') {
        throw new Exception('Grubhub auth response did not contain an access token');
    }
    $response['steps'][] = "Got Grubhub anonymous token";
    // Fetch restaurant with full menu data
    $ghMenu = $httpGet(
        "https://api-gtm.grubhub.com/restaurants/$ghRestaurantId?hideChoiceCategories=false&version=4&orderType=standard&hideUnavailableMenuItems=false&hideMenuItems=false",
        ["Authorization: Bearer $ghToken"],
        30
    );
    if ($ghMenu['code'] !== 200) throw new Exception("Grubhub restaurant fetch failed: {$ghMenu['code']}");
    $ghData = json_decode((string)$ghMenu['body'], true);
    $ghRestaurant = $ghData['restaurant'] ?? null;
    if (!is_array($ghRestaurant)) {
        throw new Exception('Grubhub restaurant response did not contain restaurant data');
    }
    $response['steps'][] = "Fetched Grubhub restaurant data (" . strlen((string)$ghMenu['body']) . " bytes)";
    // Parse business info
    $ghBusiness = ['name' => $ghRestaurant['name'] ?? ''];
    if (!empty($ghRestaurant['address']) && is_array($ghRestaurant['address'])) {
        $ghAddr = $ghRestaurant['address'];
        if (isset($ghAddr['street_address'])) $ghBusiness['addressLine1'] = $ghAddr['street_address'];
        if (isset($ghAddr['locality'])) $ghBusiness['city'] = $ghAddr['locality'];
        if (isset($ghAddr['region'])) $ghBusiness['state'] = $ghAddr['region'];
        if (isset($ghAddr['zip'])) $ghBusiness['zip'] = $ghAddr['zip'];
        $ghBusiness['address'] = ($ghBusiness['addressLine1'] ?? '') . ', ' . ($ghBusiness['city'] ?? '') . ', ' . ($ghBusiness['state'] ?? '') . ' ' . ($ghBusiness['zip'] ?? '');
    }
    if (isset($ghRestaurant['latitude']) && is_numeric($ghRestaurant['latitude'])) $ghBusiness['latitude'] = $ghRestaurant['latitude'];
    if (isset($ghRestaurant['longitude']) && is_numeric($ghRestaurant['longitude'])) $ghBusiness['longitude'] = $ghRestaurant['longitude'];
    // Phone is reduced to digits only.
    if (!empty($ghRestaurant['phone_number'])) $ghBusiness['phone'] = preg_replace('/[^0-9]/', '', $ghRestaurant['phone_number']);
    if (!empty(trim($ghRestaurant['description'] ?? ''))) $ghBusiness['description'] = trim($ghRestaurant['description']);
    // Hours: one "Mon 9am-5pm" entry per listed day, joined with ", ".
    $ghHoursParts = [];
    $ghDayOrder = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'];
    $ghDayAbbrev = ['Mon','Tue','Wed','Thu','Fri','Sat','Sun'];
    if (!empty($ghRestaurant['restaurant_managed_hours_list_v2']) && is_array($ghRestaurant['restaurant_managed_hours_list_v2'])) {
        foreach ($ghRestaurant['restaurant_managed_hours_list_v2'] as $ghDayHours) {
            if (isset($ghDayHours['day'], $ghDayHours['start_time'], $ghDayHours['end_time'])) {
                // Strict search so a non-string value can never loosely match a day name.
                $ghDayIdx = array_search($ghDayHours['day'], $ghDayOrder, true);
                if ($ghDayIdx !== false) {
                    $parts = explode(':', $ghDayHours['start_time']);
                    $openStr = $formatTime12h((int)$parts[0], (int)($parts[1] ?? 0));
                    $parts = explode(':', $ghDayHours['end_time']);
                    $closeStr = $formatTime12h((int)$parts[0], (int)($parts[1] ?? 0));
                    $ghHoursParts[] = $ghDayAbbrev[$ghDayIdx] . " $openStr-$closeStr";
                }
            }
        }
    }
    if (!empty($ghHoursParts)) $ghBusiness['hours'] = implode(', ', $ghHoursParts);
    if (isset($ghData['restaurant_availability']['sales_tax'])) $ghBusiness['taxRate'] = $ghData['restaurant_availability']['sales_tax'];
    // Parse categories and items
    $ghCategories = [];
    $ghItems = [];
    $ghItemId = 1;
    $ghModifierGroups = []; // keyed by group name; the first definition seen wins
    $ghImageMappings = [];
    if (!empty($ghRestaurant['menu_category_list']) && is_array($ghRestaurant['menu_category_list'])) {
        foreach ($ghRestaurant['menu_category_list'] as $ghCat) {
            $ghCatName = trim($ghCat['name'] ?? 'Menu');
            $ghCatItemCount = 0;
            if (!empty($ghCat['menu_item_list']) && is_array($ghCat['menu_item_list'])) {
                foreach ($ghCat['menu_item_list'] as $ghItem) {
                    $ghItemName = trim($ghItem['name'] ?? '');
                    if (empty($ghItemName)) continue;
                    // The amount is divided by 100 (presumably cents -> dollars; confirm against the API).
                    $ghPrice = 0;
                    if (!empty($ghItem['price']['amount'])) $ghPrice = (float)$ghItem['price']['amount'] / 100;
                    $ghDesc = trim($ghItem['description'] ?? '');
                    // Image URL
                    $ghImageUrl = '';
                    if (!empty($ghItem['media_image']) && is_array($ghItem['media_image'])) {
                        $gi = $ghItem['media_image'];
                        if (!empty($gi['base_url']) && !empty($gi['public_id']) && !empty($gi['format'])) {
                            $ghImageUrl = $gi['base_url'] . 'w_400,h_400,c_fill/' . $gi['public_id'] . '.' . $gi['format'];
                        }
                    }
                    // Modifiers
                    $ghItemModifiers = [];
                    if (!empty($ghItem['choice_category_list']) && is_array($ghItem['choice_category_list'])) {
                        foreach ($ghItem['choice_category_list'] as $ghChoiceCat) {
                            $ghModName = trim($ghChoiceCat['name'] ?? '');
                            if (empty($ghModName)) continue;
                            $ghItemModifiers[] = $ghModName;
                            if (!isset($ghModifierGroups[$ghModName])) {
                                $ghModOptions = [];
                                if (!empty($ghChoiceCat['choice_option_list']) && is_array($ghChoiceCat['choice_option_list'])) {
                                    foreach ($ghChoiceCat['choice_option_list'] as $ghOpt) {
                                        $optName = trim($ghOpt['description'] ?? '');
                                        $optPrice = !empty($ghOpt['price']['amount']) ? (float)$ghOpt['price']['amount'] / 100 : 0;
                                        if (strlen($optName)) $ghModOptions[] = ['name' => $optName, 'price' => $optPrice];
                                    }
                                }
                                $ghMinSel = (int)($ghChoiceCat['min_choice_options'] ?? 0);
                                $ghMaxSel = (int)($ghChoiceCat['max_choice_options'] ?? 0);
                                $ghModifierGroups[$ghModName] = [
                                    'name' => $ghModName,
                                    'required' => $ghMinSel > 0,
                                    'minSelections' => $ghMinSel,
                                    'maxSelections' => $ghMaxSel,
                                    'options' => $ghModOptions,
                                ];
                            }
                        }
                    }
                    $ghItems[] = [
                        'id' => 'item_' . $ghItemId,
                        'name' => $ghItemName,
                        'price' => $ghPrice,
                        'description' => $ghDesc,
                        'category' => $ghCatName,
                        'imageUrl' => $ghImageUrl,
                        'hasModifiers' => count($ghItemModifiers) > 0,
                        'modifiers' => $ghItemModifiers,
                    ];
                    if (strlen($ghImageUrl)) $ghImageMappings[] = ['itemId' => 'item_' . $ghItemId, 'url' => $ghImageUrl];
                    $ghCatItemCount++;
                    $ghItemId++;
                }
            }
            $ghCategories[] = ['name' => $ghCatName, 'itemCount' => $ghCatItemCount];
        }
    }
    $ghModifiers = array_values($ghModifierGroups);
    $response['steps'][] = "Parsed " . count($ghItems) . " items in " . count($ghCategories) . " categories with " . count($ghModifiers) . " modifier groups";
    $response['OK'] = true;
    $response['DATA'] = [
        'business' => $ghBusiness,
        'categories' => $ghCategories,
        'items' => $ghItems,
        'modifiers' => $ghModifiers,
        'imageUrls' => [],
        'imageMappings' => $ghImageMappings,
        'headerCandidateIndices' => [],
    ];
    $response['sourceUrl'] = $targetUrl;
    $response['pagesProcessed'] = 1;
    $response['imagesFound'] = count($ghImageMappings);
    $response['parsedVia'] = 'grubhub_api';
    jsonResponse($response);
}
// ========== END GRUBHUB FAST PATH ==========
// Check if this is a local temp file (ZIP upload) - read directly
if (stripos($targetUrl, '/temp/menu-import/') !== false) {
$localUrlPath = preg_replace('#https?://[^/]+(/temp/menu-import/.*)#i', '$1', $targetUrl);
$localFilePath = $expandPath($localUrlPath);
$response['steps'][] = "Local temp file detected: $localFilePath";
if (!file_exists($localFilePath)) {
throw new Exception("Local file not found: $localFilePath");
}
$pageHtml = file_get_contents($localFilePath);
$playwrightImages = [];
$response['steps'][] = "Read " . strlen($pageHtml) . " bytes from local file";
$localDir = dirname($localFilePath);
$basePath = preg_replace('#/[^/]*$#', '/', $targetUrl);
// Check for Toast menu page - extract from visible HTML
if (stripos($pageHtml, 'class="headerText"') !== false && stripos($pageHtml, 'toasttab') !== false) {
$response['steps'][] = "Toast menu detected - parsing visible HTML items";
try {
$toastBusiness = [];
$toastCategories = [];
$toastItems = [];
$categorySet = [];
$itemNameSet = [];
$itemId = 1;
// Find category headers
if (preg_match_all('#
]*class="[^"]*groupHeader[^"]*"[^>]*>([^<]+)
#i', $pageHtml, $catMatches)) {
foreach ($catMatches[1] as $catName) {
$catName = trim($catName);
if (strlen($catName) && !isset($categorySet[$catName])) {
$categorySet[$catName] = true;
$toastCategories[] = ['name' => $catName, 'itemCount' => 0];
}
}
}
// Extract item blocks
if (preg_match_all('#]*class="[^"]*item[^"]*"[^>]*>.*?#is', $pageHtml, $blockMatches)) {
$response['steps'][] = "Found " . count($blockMatches[0]) . " item blocks in HTML";
foreach ($blockMatches[0] as $block) {
if (preg_match('##i', $block, $nm)) {
$itemName = trim($nm[1]);
if (strlen($itemName) && !isset($itemNameSet[$itemName])) {
$itemNameSet[$itemName] = true;
$itemStruct = ['id' => 'item_' . $itemId, 'name' => $itemName, 'modifiers' => [], 'price' => 0, 'description' => ''];
// Price
if (preg_match('#\$([0-9]+\.?[0-9]*)#', $block, $pm)) {
$p = (float)$pm[1];
if ($p > 0) $itemStruct['price'] = $p;
}
// Description
if (preg_match('#]*class="[^"]*description[^"]*"[^>]*>([^<]+)
#i', $block, $dm)) {
$itemStruct['description'] = trim($dm[1]);
}
// Image
if (preg_match('#src="(Menu_files/[^"]+)"#i', $block, $im)) {
$itemStruct['imageUrl'] = $basePath . $im[1];
$itemStruct['imageSrc'] = $basePath . $im[1];
$itemStruct['imageFilename'] = basename($im[1]);
}
$itemStruct['category'] = !empty($toastCategories) ? $toastCategories[0]['name'] : 'Menu';
$toastItems[] = $itemStruct;
$itemId++;
}
}
}
}
// Fallback: simpler headerText extraction
if (empty($toastItems)) {
if (preg_match_all('##i', $pageHtml, $nameMatches)) {
foreach ($nameMatches[1] as $nm) {
$nm = trim($nm);
if (strlen($nm) && !isset($itemNameSet[$nm])) {
$itemNameSet[$nm] = true;
$toastItems[] = ['id' => 'item_' . $itemId, 'name' => $nm, 'price' => 0, 'description' => '', 'category' => 'Menu', 'modifiers' => []];
$itemId++;
}
}
}
}
// Try business name from title
if (preg_match('#]*>([^<]+)#i', $pageHtml, $tm)) {
$titleText = trim($tm[1]);
if (strpos($titleText, '|') !== false) $titleText = trim(explode('|', $titleText)[0]);
$titleText = preg_replace('#\s*-\s*(Menu|Order|Online).*$#i', '', $titleText);
if (strlen($titleText) && !isset($toastBusiness['name'])) {
$toastBusiness['name'] = $titleText;
}
}
// Try og:title/og:site_name
if (empty($toastBusiness['name'])) {
if (preg_match('#]*property=["\']og:(site_name|title)["\'][^>]*content=["\']([^"\']+)["\']#i', $pageHtml, $ogm)) {
$ogText = trim($ogm[2]);
if (strpos($ogText, '|') !== false) $ogText = trim(explode('|', $ogText)[0]);
if (strlen($ogText)) $toastBusiness['name'] = $ogText;
} elseif (preg_match('#]*content=["\']([^"\']+)["\'][^>]*property=["\']og:(site_name|title)["\']#i', $pageHtml, $ogm)) {
$ogText = trim($ogm[1]);
if (strpos($ogText, '|') !== false) $ogText = trim(explode('|', $ogText)[0]);
if (strlen($ogText)) $toastBusiness['name'] = $ogText;
}
}
// Try header element
if (empty($toastBusiness['name'])) {
if (preg_match('#<(?:h1|div)[^>]*class="[^"]*(?:restaurant|location|brand)[^"]*"[^>]*>([^<]+)<#i', $pageHtml, $hm)) {
$ht = trim($hm[1]);
if (strlen($ht) && strlen($ht) < 100) $toastBusiness['name'] = $ht;
}
}
// Try first h1
if (empty($toastBusiness['name'])) {
if (preg_match('#]*>([^<]+)
#i', $pageHtml, $h1m)) {
$h1t = trim($h1m[1]);
if (strlen($h1t) && strlen($h1t) < 100) $toastBusiness['name'] = $h1t;
}
}
// Try address from HTML
if (empty($toastBusiness['addressLine1'])) {
if (preg_match('#<[^>]*class="[^"]*address[^"]*"[^>]*>([^<]+)[^>]+>#i', $pageHtml, $am)) {
$at = trim($am[1]);
if (strlen($at) && strlen($at) < 200) $toastBusiness['addressLine1'] = $at;
}
}
// Try phone from HTML
if (empty($toastBusiness['phone'])) {
if (preg_match('#(?:tel:|phone[^"]*">)\s*\(?(\d{3})\)?[-.\s]?(\d{3})[-.\s]?(\d{4})#i', $pageHtml, $phm)) {
$toastBusiness['phone'] = $phm[1] . '-' . $phm[2] . '-' . $phm[3];
}
}
// Check __OO_STATE__ for images, categories, prices, business info
if (stripos($pageHtml, 'window.__OO_STATE__') !== false) {
$ooJson = $extractOoState($pageHtml);
if ($ooJson !== null) {
try {
$ooState = json_decode($ooJson, true);
if (is_array($ooState)) {
$imageMap = [];
$itemCategoryMap = [];
$itemPriceMap = [];
foreach ($ooState as $key => $val) {
// Restaurant info
if (str_starts_with($key, 'Restaurant:') && is_array($val)) {
if (!empty($val['name'])) $toastBusiness['name'] = $val['name'];
if (!empty($val['location']) && is_array($val['location'])) {
$loc = $val['location'];
if (!empty($loc['address1'])) $toastBusiness['addressLine1'] = $loc['address1'];
if (!empty($loc['city'])) $toastBusiness['city'] = $loc['city'];
if (!empty($loc['state'])) $toastBusiness['state'] = $loc['state'];
if (!empty($loc['zipCode'])) $toastBusiness['zip'] = $loc['zipCode'];
if (!empty($loc['phone'])) $toastBusiness['phone'] = $loc['phone'];
}
if (!empty($val['brandColor'])) $toastBusiness['brandColor'] = str_replace('#', '', $val['brandColor']);
}
// Menu items
if (str_starts_with($key, 'Menu:') && is_array($val) && !empty($val['groups']) && is_array($val['groups'])) {
foreach ($val['groups'] as $group) {
$groupName = trim($group['name'] ?? '');
if (strlen($groupName) && !isset($categorySet[$groupName])) {
$categorySet[$groupName] = true;
$toastCategories[] = ['name' => $groupName, 'itemCount' => 0];
}
// Check for subgroups
$subgroups = $group['subgroups'] ?? $group['children'] ?? $group['childGroups'] ?? [];
if (!empty($subgroups) && is_array($subgroups)) {
foreach ($subgroups as $sg) {
$sgName = trim($sg['name'] ?? '');
if (strlen($sgName) && !isset($categorySet[$sgName])) {
$categorySet[$sgName] = true;
$toastCategories[] = ['name' => $sgName, 'parentCategoryName' => $groupName, 'itemCount' => 0];
}
if (!empty($sg['items']) && is_array($sg['items'])) {
$effectiveName = strlen($sgName) ? $sgName : $groupName;
foreach ($sg['items'] as $item) {
if (!empty($item['name'])) {
$itemCategoryMap[$item['name']] = $effectiveName;
$p = $extractToastPrice($item);
if ($p > 0) $itemPriceMap[$item['name']] = $p;
$img = $extractToastImage($item);
if (strlen($img)) $imageMap[$item['name']] = $img;
}
}
}
}
}
// Direct items
if (!empty($group['items']) && is_array($group['items'])) {
foreach ($group['items'] as $item) {
if (!empty($item['name'])) {
if (strlen($groupName)) $itemCategoryMap[$item['name']] = $groupName;
$p = $extractToastPrice($item);
if ($p > 0) $itemPriceMap[$item['name']] = $p;
$img = $extractToastImage($item);
if (strlen($img)) $imageMap[$item['name']] = $img;
}
}
}
}
}
}
// Apply to items
$imagesMatched = $categoriesMatched = $pricesMatched = 0;
for ($i = 0; $i < count($toastItems); $i++) {
$name = $toastItems[$i]['name'];
if (isset($imageMap[$name])) {
$toastItems[$i]['imageUrl'] = $imageMap[$name];
$toastItems[$i]['imageSrc'] = $imageMap[$name];
$toastItems[$i]['imageFilename'] = basename($imageMap[$name]);
$imagesMatched++;
}
if (isset($itemCategoryMap[$name])) {
$toastItems[$i]['category'] = $itemCategoryMap[$name];
$categoriesMatched++;
}
if (isset($itemPriceMap[$name]) && ($toastItems[$i]['price'] ?? 0) == 0) {
$toastItems[$i]['price'] = $itemPriceMap[$name];
$pricesMatched++;
}
}
$response['steps'][] = "Matched $imagesMatched images, $categoriesMatched categories, $pricesMatched prices from __OO_STATE__";
}
} catch (Exception $e) {
// OO_STATE parse failed, continue
}
}
}
// Default category if none
if (!empty($toastItems) && empty($toastCategories)) {
$toastCategories[] = ['name' => 'Menu', 'itemCount' => count($toastItems)];
}
// Scan ALL HTML files in the ZIP for business info
$extractUrlPath = preg_replace('#https?://[^/]+(/temp/menu-import/[a-f0-9]+/).*#i', '$1', $targetUrl);
$extractDir = $expandPath($extractUrlPath);
try {
$allHtmlFiles = [];
$it = new RecursiveDirectoryIterator($extractDir, RecursiveDirectoryIterator::SKIP_DOTS);
$files = new RecursiveIteratorIterator($it);
foreach ($files as $file) {
if (preg_match('/\.html?$/i', $file->getFilename())) {
$allHtmlFiles[] = $file->getRealPath();
}
}
$response['steps'][] = "Found " . count($allHtmlFiles) . " HTML files in ZIP";
foreach ($allHtmlFiles as $otherFile) {
if ($otherFile === $localFilePath) continue;
try {
$otherHtml = file_get_contents($otherFile);
// Business name from title
if (empty($toastBusiness['name'])) {
if (preg_match('#]*>([^<]+)#i', $otherHtml, $otm)) {
$ot = trim($otm[1]);
if (strlen($ot) && !preg_match('#^(Menu|Home|About|Contact|Order|Online)$#i', $ot)) {
if (strpos($ot, '|') !== false) $ot = trim(explode('|', $ot)[0]);
$ot = preg_replace('#\s*-\s*(Menu|Order|Online).*$#i', '', $ot);
if (strlen($ot) && strlen($ot) < 100) $toastBusiness['name'] = $ot;
}
}
}
// Address from other files
if (empty($toastBusiness['addressLine1'])) {
if (preg_match('#(\d+\s+[A-Za-z0-9\s]+(?:St(?:reet)?|Ave(?:nue)?|Rd|Road|Blvd|Boulevard|Dr(?:ive)?|Ln|Lane|Way|Ct|Court|Pl(?:ace)?|Pkwy|Parkway)[.,]?\s*(?:Suite|Ste|#|Unit|Apt)?\s*[A-Za-z0-9\-]*)#i', $otherHtml, $adm)) {
$at = trim($adm[1]);
if (strlen($at) > 5 && strlen($at) < 100) $toastBusiness['addressLine1'] = $at;
}
}
// Phone from other files
if (empty($toastBusiness['phone'])) {
if (preg_match('#\(?(\d{3})\)?[-.\s]?(\d{3})[-.\s]?(\d{4})#', $otherHtml, $phm)) {
$toastBusiness['phone'] = $phm[1] . '-' . $phm[2] . '-' . $phm[3];
}
}
// Check __OO_STATE__ in other files
if (stripos($otherHtml, 'window.__OO_STATE__') !== false) {
$otherOoJson = $extractOoState($otherHtml);
if ($otherOoJson !== null) {
try {
$otherOo = json_decode($otherOoJson, true);
if (is_array($otherOo)) {
foreach ($otherOo as $oKey => $oVal) {
if (str_starts_with($oKey, 'Restaurant:') && is_array($oVal)) {
if (!empty($oVal['name']) && empty($toastBusiness['name'])) $toastBusiness['name'] = $oVal['name'];
if (!empty($oVal['location']) && is_array($oVal['location'])) {
$ol = $oVal['location'];
if (!empty($ol['address1']) && empty($toastBusiness['addressLine1'])) $toastBusiness['addressLine1'] = $ol['address1'];
if (!empty($ol['city']) && empty($toastBusiness['city'])) $toastBusiness['city'] = $ol['city'];
if (!empty($ol['state']) && empty($toastBusiness['state'])) $toastBusiness['state'] = $ol['state'];
if (!empty($ol['zipCode']) && empty($toastBusiness['zip'])) $toastBusiness['zip'] = $ol['zipCode'];
if (!empty($ol['phone']) && empty($toastBusiness['phone'])) $toastBusiness['phone'] = $ol['phone'];
}
if (!empty($oVal['brandColor']) && empty($toastBusiness['brandColor'])) $toastBusiness['brandColor'] = str_replace('#', '', $oVal['brandColor']);
}
}
}
} catch (Exception $e) { /* skip */ }
}
}
} catch (Exception $e) { /* skip unreadable files */ }
}
} catch (Exception $e) {
$response['steps'][] = "Could not scan other HTML files: " . $e->getMessage();
}
$response['steps'][] = "Extracted " . count($toastItems) . " unique items from " . count($toastCategories) . " categories";
// Scan ZIP images and analyze for business info via Claude
try {
$zipImageFiles = [];
$it = new RecursiveDirectoryIterator($extractDir, RecursiveDirectoryIterator::SKIP_DOTS);
$files = new RecursiveIteratorIterator($it);
$imageExtensions = ['jpg','jpeg','png','gif','webp'];
foreach ($files as $file) {
if (!$file->isFile()) continue;
$ext = strtolower(pathinfo($file->getFilename(), PATHINFO_EXTENSION));
if (in_array($ext, $imageExtensions) && $file->getSize() > 10000 && stripos($file->getPath(), '_files') === false) {
$zipImageFiles[] = $file->getRealPath();
}
}
if (!empty($zipImageFiles)) {
$response['steps'][] = "Found " . count($zipImageFiles) . " images in ZIP to analyze for business info";
$imgLimit = min(count($zipImageFiles), 3);
for ($imgIdx = 0; $imgIdx < $imgLimit; $imgIdx++) {
try {
$imgContent = file_get_contents($zipImageFiles[$imgIdx]);
$base64Img = base64_encode($imgContent);
$mediaType = $detectMediaType($base64Img);
$imgRequest = [
'model' => 'claude-sonnet-4-20250514',
'max_tokens' => 1024,
'temperature' => 0,
'messages' => [[
'role' => 'user',
'content' => [
['type' => 'image', 'source' => ['type' => 'base64', 'media_type' => $mediaType, 'data' => $base64Img]],
['type' => 'text', 'text' => 'Extract ALL business information visible in this image. Look carefully for: 1) Business NAME (the restaurant/store name), 2) PHONE number (format: xxx-xxx-xxxx), 3) Full ADDRESS (street, city, state, zip), 4) HOURS of operation (all days shown). Return JSON: {"name":"","addressLine1":"","city":"","state":"","zip":"","phone":"","hours":"","brandColor":""}. For hours, format as single string like \'Mon-Thu 7am-10pm, Fri-Sat 7am-11pm\'. Return ONLY valid JSON.'],
],
]],
];
$imgResp = $httpPost(
'https://api.anthropic.com/v1/messages',
json_encode($imgRequest),
['Content-Type: application/json', "x-api-key: $CLAUDE_API_KEY", 'anthropic-version: 2023-06-01'],
60
);
if ($imgResp['code'] === 200) {
$imgData = json_decode($imgResp['body'], true);
if (!empty($imgData['content'][0]['text'])) {
$imgText = $cleanClaudeJson($imgData['content'][0]['text']);
$imgBiz = json_decode($imgText, true);
if (is_array($imgBiz)) {
foreach (['name','addressLine1','city','state','zip','phone','hours','brandColor'] as $field) {
if (!empty($imgBiz[$field]) && is_scalar($imgBiz[$field])) {
$toastBusiness[$field] = trim($imgBiz[$field]);
}
}
}
}
}
} catch (Exception $e) {
$response['steps'][] = "Error analyzing image: " . $e->getMessage();
}
}
}
} catch (Exception $e) {
$response['steps'][] = "Could not scan ZIP for images: " . $e->getMessage();
}
// Return directly
$response['OK'] = true;
$response['DATA'] = [
'business' => $toastBusiness,
'categories' => $toastCategories,
'modifiers' => [],
'items' => $toastItems,
'imageUrls' => [],
'headerCandidateIndices' => [],
'imageMappings' => [],
];
$response['sourceUrl'] = $targetUrl;
$response['pagesProcessed'] = 1;
$response['imagesFound'] = 0;
$response['playwrightImagesCount'] = 0;
$response['toastDirect'] = true;
jsonResponse($response);
} catch (Exception $e) {
$response['steps'][] = "Toast HTML parse failed: " . $e->getMessage() . " - falling back to Claude";
}
}
// Extract base URL for relative links (local temp file case)
if (preg_match('#^(https?://[^/]+)#', $targetUrl, $bm)) {
$baseUrl = $bm[1];
}
$basePath = preg_replace('#/[^/]*$#', '/', preg_replace('#\?.*$#', '', $targetUrl));
} else {
// Remote URL - use Playwright for JS-rendered content
$response['steps'][] = "Fetching URL with Playwright: $targetUrl";
$pwOutput = shell_exec("/opt/playwright/run.sh " . escapeshellarg($targetUrl) . " 10000 2>&1");
if (empty(trim($pwOutput ?? ''))) {
throw new Exception("Playwright returned empty response");
}
$pwResult = json_decode($pwOutput, true);
if (isset($pwResult['error'])) {
throw new Exception("Playwright error: " . $pwResult['error']);
}
$pageHtml = $pwResult['html'] ?? '';
$playwrightImages = $pwResult['images'] ?? [];
$response['steps'][] = "Fetched " . strlen($pageHtml) . " bytes via Playwright, " . count($playwrightImages) . " images captured";
// Capture platform image map (ordering site food photos matched to item names)
$platformImageMap = [];
if (!empty($pwResult['platformImageMap']) && is_array($pwResult['platformImageMap'])) {
$platformImageMap = $pwResult['platformImageMap'];
$response['steps'][] = "Found " . count($platformImageMap) . " item images from ordering platform";
}
if (!empty($pwResult['subPagesVisited']) && is_array($pwResult['subPagesVisited'])) {
$response['steps'][] = "Visited " . count($pwResult['subPagesVisited']) . " menu sub-pages: " . implode(', ', $pwResult['subPagesVisited']);
}
if (!empty($pwResult['platformPagesVisited']) && is_array($pwResult['platformPagesVisited'])) {
$response['steps'][] = "Visited " . count($pwResult['platformPagesVisited']) . " ordering platforms for photos: " . implode(', ', $pwResult['platformPagesVisited']);
}
// ========== WOOCOMMERCE FAST PATH ==========
// When the fetched HTML carries WooCommerce markers, run the dedicated Playwright
// script against the site origin and, if it yields items, pass them to
// jsonResponse(). An empty or unusable result only adds a step message and leaves
// the request to the code that follows this block.
if (stripos($pageHtml, 'woocommerce') !== false || stripos($pageHtml, 'wc-add-to-cart') !== false || stripos($pageHtml, 'tm-extra-product-options') !== false) {
    $response['steps'][] = "WooCommerce site detected - running modifier extraction";
    // Reduce the target URL to scheme + host.
    $wooUrl = preg_replace('#(https?://[^/]+).*#', '$1', $targetUrl);
    try {
        // shell_exec() may return false or null; normalise to a string first.
        $wooOutput = (string)shell_exec("/opt/playwright/run-woo-modifiers.sh " . escapeshellarg($wooUrl) . " 2>&1");
        if (trim($wooOutput) !== '') {
            $wooResult = json_decode($wooOutput, true);
            if (is_array($wooResult) && !empty($wooResult['items']) && is_array($wooResult['items'])) {
                // count() throws a TypeError on non-arrays, which catch (Exception)
                // below would not intercept, so coerce unexpected shapes to [].
                $wooModifiers = is_array($wooResult['modifiers'] ?? null) ? $wooResult['modifiers'] : [];
                $wooModMap = is_array($wooResult['itemModifierMap'] ?? null) ? $wooResult['itemModifierMap'] : [];
                $response['steps'][] = "WooCommerce extraction: " . count($wooResult['items']) . " items, " . count($wooModifiers) . " modifier groups";
                $wooCats = [];
                $wooItems = [];
                foreach ($wooResult['items'] as $wItem) {
                    // Skip entries without a usable name, as the Grubhub path does.
                    $wName = is_array($wItem) ? ($wItem['name'] ?? null) : null;
                    if (!is_scalar($wName) || trim((string)$wName) === '') continue;
                    $catName = !empty($wItem['category']) ? trim($wItem['category']) : 'Menu';
                    if (!isset($wooCats[$catName])) $wooCats[$catName] = 0;
                    $wooCats[$catName]++;
                    $itemMods = $wooModMap[$wName] ?? [];
                    if (!is_array($itemMods)) $itemMods = [];
                    $wooItems[] = [
                        // Sequential ids; identical to index+1 when no entry was skipped.
                        'id' => 'item_' . (count($wooItems) + 1),
                        'name' => $wName,
                        'price' => (float)($wItem['price'] ?? 0),
                        'description' => $wItem['description'] ?? '',
                        'category' => $catName,
                        'modifiers' => $itemMods,
                        'hasModifiers' => count($itemMods) > 0,
                        'imageUrl' => trim($wItem['imageUrl'] ?? ''),
                    ];
                }
                $wooCategories = [];
                foreach ($wooCats as $wcName => $wcCount) {
                    $wooCategories[] = ['name' => $wcName, 'itemCount' => $wcCount];
                }
                $wooBiz = $wooResult['business'] ?? [];
                $response['OK'] = true;
                $response['DATA'] = [
                    'business' => [
                        'name' => $wooBiz['name'] ?? '',
                        'address' => $wooBiz['address'] ?? '',
                        'phone' => $wooBiz['phone'] ?? '',
                        'hours' => $wooBiz['hours'] ?? '',
                    ],
                    'categories' => $wooCategories,
                    'items' => $wooItems,
                    'modifiers' => $wooModifiers,
                    'imageUrls' => [],
                    'imageMappings' => [],
                    'headerCandidateIndices' => [],
                ];
                $response['sourceUrl'] = $targetUrl;
                $response['parsedVia'] = 'woocommerce_playwright';
                jsonResponse($response);
            }
        }
        $response['steps'][] = "WooCommerce extraction returned no items - falling through to Claude";
    } catch (Exception $e) {
        $response['steps'][] = "WooCommerce extraction failed: " . $e->getMessage() . " - falling through to Claude";
    }
}
// ========== END WOOCOMMERCE FAST PATH ==========
// ========== DOORDASH / ORDER.ONLINE FAST PATH ==========
if (stripos($pageHtml, 'MenuPageItem') !== false && stripos($pageHtml, 'MenuPageItemList') !== false) {
$response['steps'][] = "DoorDash/order.online site detected - extracting embedded data";
try {
// Carousel image map: item name => image URL, one entry per
// StorePageCarouselItem record found in the escaped JSON of the page.
$ddImageMap = [];
$typenameKey = $BQ . '__typename' . $BQ;
$carouselMarker = $typenameKey . ':' . $BQ . 'StorePageCarouselItem' . $BQ;
$carouselMarkerLen = strlen($carouselMarker);
$searchPos = stripos($pageHtml, $carouselMarker, 0);
while ($searchPos !== false) {
    // A record runs from its own marker up to the next __typename key
    // (or to the end of the document when none follows).
    $afterMarker = $searchPos + $carouselMarkerLen;
    $nextMarker = stripos($pageHtml, $typenameKey, $afterMarker);
    $entryEnd = ($nextMarker === false) ? strlen($pageHtml) : $nextMarker;
    $entryText = substr($pageHtml, $searchPos, $entryEnd - $searchPos);

    $cpName = $extractBqValue($entryText, 'name');
    $cpImg = ($cpName === null) ? null : $extractBqValue($entryText, 'imgUrl');
    $imgUsable = $cpImg !== null && $cpImg !== 'null' && stripos($cpImg, 'http') !== false;
    if ($imgUsable) {
        if (stripos($cpImg, 'width=') !== false) {
            // Request a 600x600 rendition when the URL carries size parameters.
            $cpImg = preg_replace(
                ['/width=\d+/i', '/height=\d+/i'],
                ['width=600', 'height=600'],
                $cpImg
            );
        }
        $ddImageMap[$cpName] = $cpImg;
    }

    $searchPos = stripos($pageHtml, $carouselMarker, $afterMarker);
}
$response['steps'][] = "Built image map with " . count($ddImageMap) . " entries from carousel";
// Extract menu from MenuPageItemList: each list marker opens a category section
// that runs to the next list marker; every MenuPageItem inside it is one item.
$ddCategories = [];
$ddCatSeen = [];
$ddItems = [];
$ddItemSeen = [];
$ddItemCounter = 0;
$catMarker = $BQ . '__typename' . $BQ . ':' . $BQ . 'MenuPageItemList' . $BQ;
$itemMarker = $BQ . '__typename' . $BQ . ':' . $BQ . 'MenuPageItem' . $BQ;
$catMarkerLen = strlen($catMarker);
$itemMarkerLen = strlen($itemMarker);
// Ampersands arrive either as a JSON unicode escape or as an HTML entity.
$ddDecodeAmp = static function (string $text): string {
    return str_replace(['\\u0026', '&amp;'], '&', $text);
};
// Request a 600x600 rendition when the URL carries size parameters.
$ddResizeImg = static function (string $url): string {
    if ($url === '' || stripos($url, 'width=') === false) {
        return $url;
    }
    return preg_replace(['/width=\d+/i', '/height=\d+/i'], ['width=600', 'height=600'], $url);
};
$catPos = 0;
while (($catStart = stripos($pageHtml, $catMarker, $catPos)) !== false) {
    // Advance the cursor first so every `continue` below moves on.
    $catPos = $catStart + $catMarkerLen;
    $nextCatPos = stripos($pageHtml, $catMarker, $catPos);
    if ($nextCatPos === false) $nextCatPos = strlen($pageHtml);
    $catSection = substr($pageHtml, $catStart, $nextCatPos - $catStart);
    $catName = $extractBqValue($catSection, 'name');
    if ($catName === null) continue;
    $catName = $ddDecodeAmp($catName);
    // "Most Ordered" is excluded and each category name is kept only once.
    if ($catName === 'Most Ordered' || isset($ddCatSeen[$catName])) continue;
    $ddCatSeen[$catName] = true;
    $ddCategories[] = ['name' => $catName, 'parentCategoryName' => ''];
    // Items within category
    $itemPos = 0;
    while (($itemStart = stripos($catSection, $itemMarker, $itemPos)) !== false) {
        $itemPos = $itemStart + $itemMarkerLen;
        $nextItemPos = stripos($catSection, $itemMarker, $itemPos);
        if ($nextItemPos === false) $nextItemPos = strlen($catSection);
        $itemEntry = substr($catSection, $itemStart, $nextItemPos - $itemStart);
        $ddItemId = $extractBqValue($itemEntry, 'id') ?? '';
        $ipRawName = $extractBqValue($itemEntry, 'name');
        if ($ipRawName === null) continue;
        $ipName = $ddDecodeAmp($ipRawName);
        // Items are de-duplicated by name across all categories.
        if (isset($ddItemSeen[$ipName])) continue;
        $ddItemSeen[$ipName] = true;
        $ipDesc = $ddDecodeAmp($extractBqValue($itemEntry, 'description') ?? '');
        $ipPriceStr = $extractBqValue($itemEntry, 'displayPrice') ?? '';
        $ipPrice = (float)preg_replace('/[^0-9.]/', '', $ipPriceStr);
        // Image from carousel map or item entry. The carousel map is keyed by
        // the raw (still escaped) name, so try that before the decoded one.
        $ipImg = $ddImageMap[$ipRawName] ?? $ddImageMap[$ipName] ?? '';
        if (empty($ipImg)) {
            $ipImg = $extractBqValue($itemEntry, 'imageUrl') ?? '';
            if ($ipImg === 'null' || stripos($ipImg, 'http') === false) $ipImg = '';
            $ipImg = $ddResizeImg($ipImg);
        }
        $ddItemCounter++;
        $ddItem = [
            'name' => $ipName,
            'description' => $ipDesc,
            'price' => $ipPrice,
            'category' => $catName,
            'modifiers' => [],
            'id' => 'item_' . $ddItemCounter,
            'ddItemId' => $ddItemId,
            'imageUrl' => $ipImg,
            'imageSrc' => $ipImg,
        ];
        if (strlen($ipImg)) $ddItem['imageFilename'] = basename(parse_url($ipImg, PHP_URL_PATH) ?: $ipImg);
        $ddItems[] = $ddItem;
    }
}
$ddItemsWithImg = 0;
foreach ($ddItems as $ddi) { if (!empty($ddi['imageUrl'])) $ddItemsWithImg++; }
$response['steps'][] = "Found " . count($ddCategories) . " categories, " . count($ddItems) . " items ($ddItemsWithImg with images)";
// Extract business info (name, address, phone, hours) from the page.
$ddBusiness = [];
// Name: text of the <title> element, cut at the first "|" or at a hyphen that
// is surrounded by whitespace, so hyphenated names such as "Tex-Mex" survive.
if (preg_match('#<title[^>]*>([^<]+)</title>#i', $pageHtml, $ddTm)) {
    $ddTitle = trim(preg_replace('#(?:\s*\||\s+-\s).*#', '', trim($ddTm[1])));
    if (strlen($ddTitle)) $ddBusiness['name'] = $ddTitle;
}
// Address from StoreHeaderAddress: the record runs to the next __typename key,
// or for at most 2000 bytes when no further key exists.
$ddAddrMarker = $BQ . '__typename' . $BQ . ':' . $BQ . 'StoreHeaderAddress' . $BQ;
$ddAddrPos = stripos($pageHtml, $ddAddrMarker);
if ($ddAddrPos !== false) {
    $ddAddrEnd = stripos($pageHtml, $BQ . '__typename' . $BQ, $ddAddrPos + strlen($ddAddrMarker));
    if ($ddAddrEnd === false) $ddAddrEnd = min($ddAddrPos + 2000, strlen($pageHtml));
    $ddAddrSection = substr($pageHtml, $ddAddrPos, $ddAddrEnd - $ddAddrPos);
    $street = $extractBqValue($ddAddrSection, 'street');
    if ($street !== null) $ddBusiness['street'] = $street;
    $displayAddr = $extractBqValue($ddAddrSection, 'displayAddress');
    if ($displayAddr !== null) $ddBusiness['address'] = $displayAddr;
}
// Phone from StoreHeaderPhoneNumber (same windowing, capped at 1000 bytes).
$ddPhoneMarker = $BQ . '__typename' . $BQ . ':' . $BQ . 'StoreHeaderPhoneNumber' . $BQ;
$ddPhonePos = stripos($pageHtml, $ddPhoneMarker);
if ($ddPhonePos !== false) {
    $ddPhoneEnd = stripos($pageHtml, $BQ . '__typename' . $BQ, $ddPhonePos + strlen($ddPhoneMarker));
    if ($ddPhoneEnd === false) $ddPhoneEnd = min($ddPhonePos + 1000, strlen($pageHtml));
    $ddPhoneSection = substr($pageHtml, $ddPhonePos, $ddPhoneEnd - $ddPhonePos);
    $phone = $extractBqValue($ddPhoneSection, 'phoneNumber');
    if ($phone !== null) $ddBusiness['phone'] = $phone;
}
// Hours from StoreOperationHoursRange: one "dayRange: timeRange" entry per
// record, joined with "; ". A record runs to the next hours marker, or for at
// most 500 bytes after the last one.
$ddHoursMarker = $BQ . '__typename' . $BQ . ':' . $BQ . 'StoreOperationHoursRange' . $BQ;
$ddHoursMarkerLen = strlen($ddHoursMarker);
$ddHoursArr = [];
$hPos = 0;
while (($hStart = stripos($pageHtml, $ddHoursMarker, $hPos)) !== false) {
    $hPos = $hStart + $ddHoursMarkerLen;
    $hNext = stripos($pageHtml, $ddHoursMarker, $hPos);
    if ($hNext === false) $hNext = min($hStart + 500, strlen($pageHtml));
    $hSection = substr($pageHtml, $hStart, $hNext - $hStart);
    $dayRange = $extractBqValue($hSection, 'dayRange');
    $timeRange = $extractBqValue($hSection, 'timeRange');
    if ($dayRange !== null && $timeRange !== null) {
        $ddHoursArr[] = "$dayRange: $timeRange";
    }
}
if (!empty($ddHoursArr)) $ddBusiness['hours'] = implode('; ', $ddHoursArr);
// With at least one item extracted, try to enrich items with modifier groups
// via the Playwright script (best effort), then return the DoorDash result.
if (!empty($ddItems)) {
    // Playwright modifier extraction
    $ddModifiers = [];
    $ddItemModMap = [];
    try {
        $response['steps'][] = "Running stealth Playwright for modifier extraction...";
        $ddItemsForPw = [];
        foreach ($ddItems as $ddi) {
            $ddItemsForPw[] = ['id' => $ddi['ddItemId'], 'name' => $ddi['name']];
        }
        // The script reads the item list from a temp file passed as 2nd argument.
        $ddTempFile = '/tmp/dd-items-' . generateUUID() . '.json';
        // Scraped names may contain invalid UTF-8; substitute rather than fail.
        $ddPayload = json_encode($ddItemsForPw, JSON_INVALID_UTF8_SUBSTITUTE);
        if ($ddPayload === false) {
            throw new Exception('could not encode item list: ' . json_last_error_msg());
        }
        try {
            if (file_put_contents($ddTempFile, $ddPayload) === false) {
                throw new Exception('could not write item list to ' . $ddTempFile);
            }
            // NOTE(review): a per-run timeout (180s + 2s per item, capped at 600s)
            // used to be computed here but was never applied; shell_exec() takes
            // no timeout, so the script's run time is not bounded from PHP.
            $ddModOutput = shell_exec("/opt/playwright/run-doordash-modifiers.sh " . escapeshellarg($targetUrl) . " " . escapeshellarg($ddTempFile) . " 2>&1");
        } finally {
            // Always remove the temp file, including when the write failed part-way.
            @unlink($ddTempFile);
        }
        if (!empty(trim($ddModOutput ?? ''))) {
            $ddModData = json_decode(trim($ddModOutput), true);
            if (!is_array($ddModData)) {
                $response['steps'][] = "Modifier extraction returned unparseable output - continuing without modifiers";
            } else {
                if (!empty($ddModData['modifiers']) && is_array($ddModData['modifiers'])) {
                    $ddModifiers = $ddModData['modifiers'];
                    foreach ($ddModifiers as &$ddMod) {
                        // Exactly one allowed selection => single-choice control.
                        $ddMod['type'] = (!empty($ddMod['maxSelections']) && $ddMod['maxSelections'] == 1) ? 'select' : 'checkbox';
                    }
                    unset($ddMod);
                }
                if (!empty($ddModData['itemModifierMap']) && is_array($ddModData['itemModifierMap'])) {
                    $ddItemModMap = $ddModData['itemModifierMap'];
                    foreach ($ddItems as &$ddItemRef) {
                        if (isset($ddItemModMap[$ddItemRef['name']])) {
                            $ddItemRef['modifiers'] = $ddItemModMap[$ddItemRef['name']];
                        }
                    }
                    unset($ddItemRef);
                }
                $response['steps'][] = "Modifier extraction: " . count($ddModifiers) . " groups, " . count($ddItemModMap) . " items mapped";
            }
        }
    } catch (Exception $e) {
        $response['steps'][] = "Modifier extraction failed (non-fatal): " . $e->getMessage();
    }
    $ddImageUrls = [];
    foreach ($ddItems as $ddI) {
        if (!empty($ddI['imageUrl'])) $ddImageUrls[] = $ddI['imageUrl'];
    }
    $response['OK'] = true;
    $response['DATA'] = [
        'business' => $ddBusiness,
        'categories' => $ddCategories,
        'modifiers' => $ddModifiers,
        'items' => $ddItems,
        'imageUrls' => $ddImageUrls,
        'headerCandidateIndices' => [],
    ];
    $response['sourceUrl'] = $targetUrl;
    $response['parsedVia'] = 'doordash_embedded';
    $response['imagesFound'] = count($ddImageUrls);
    $response['playwrightImagesCount'] = count($playwrightImages);
    jsonResponse($response);
}
} catch (Exception $e) {
$response['steps'][] = "DoorDash extraction failed: " . $e->getMessage() . " - falling through to Claude";
}
}
// ========== END DOORDASH FAST PATH ==========
// Extract base URL for relative links:
//   $baseUrl  = scheme + host, used for root-relative paths ("/img/a.png")
//   $basePath = directory of the target URL, used for document-relative paths
if (preg_match('#^(https?://[^/]+)#', $targetUrl, $bm)) {
    $baseUrl = $bm[1];
}
$basePath = preg_replace('#\?.*$#', '', $targetUrl);
if (!preg_match('#/$#', $basePath)) {
    if (preg_match('#^https?://[^/]+$#i', $basePath)) {
        // Host-only URL ("https://example.com"): stripping the last path segment
        // would eat the host and leave "https://", so append the root slash.
        $basePath .= '/';
    } else {
        // Drop the final path segment (the document name), keep its directory.
        $basePath = preg_replace('#/[^/]*$#', '/', $basePath);
    }
}
}
} else {
throw new Exception("Either 'url' or 'html' content is required");
}
// Menu pages array
$menuPages = [['url' => !empty($targetUrl) ? $targetUrl : 'uploaded', 'html' => $pageHtml]];
// Extract images from all pages.
//   $imageUrls     : set of absolute image URLs (URL => true)
//   $imageMappings : filename / alt / src triples for images that carry alt text
$imageUrls = [];
$imageMappings = [];
// URLs containing any of these words are skipped.
$nonContentImage = '#(icon|favicon|logo|sprite|pixel|tracking|badge|button)#i';
// Add Playwright-captured images (SVGs are skipped here as well)
foreach ($playwrightImages as $pwImg) {
    if (!preg_match('#(icon|favicon|logo|sprite|pixel|tracking|badge|button|\.svg)#i', $pwImg)) {
        $imageUrls[$pwImg] = true;
    }
}
// Scheme used to complete protocol-relative ("//host/path") sources.
$pageScheme = parse_url($baseUrl, PHP_URL_SCHEME) ?: 'https';
foreach ($menuPages as $menuPage) {
    if (!preg_match_all('#<img[^>]+src=["\']([^"\']+)["\'][^>]*>#i', $menuPage['html'], $imgMatches, PREG_SET_ORDER)) {
        continue;
    }
    foreach ($imgMatches as $imgMatch) {
        $imgTag = $imgMatch[0];
        $imgSrc = $imgMatch[1];
        // Extract alt text
        $imgAlt = '';
        if (preg_match('#alt=["\']([^"\']+)["\']#i', $imgTag, $altM)) {
            $imgAlt = $altM[1];
        }
        // Image mapping for local uploads (uses the src exactly as written)
        $imgFilename = basename($imgSrc);
        if (strlen($imgFilename) && strlen($imgAlt) && !preg_match($nonContentImage, $imgSrc)) {
            $imageMappings[] = ['filename' => $imgFilename, 'alt' => $imgAlt, 'src' => $imgSrc];
        }
        // Resolve relative URLs
        if (str_starts_with($imgSrc, '//')) {
            // Protocol-relative: only the scheme is missing, the host is present.
            $imgSrc = $pageScheme . ':' . $imgSrc;
        } elseif (str_starts_with($imgSrc, '/')) {
            $imgSrc = $baseUrl . $imgSrc;
        } elseif (!preg_match('#^https?://#i', $imgSrc) && !str_starts_with($imgSrc, 'data:')) {
            $imgSrc = $basePath . $imgSrc;
        }
        if (preg_match('#^https?://#i', $imgSrc) && !isset($imageUrls[$imgSrc]) && !preg_match($nonContentImage, $imgSrc)) {
            $imageUrls[$imgSrc] = true;
        }
    }
}
$response['steps'][] = "Found " . count($imageUrls) . " unique images";
// Local scan (ZIP upload): the target URL points below /temp/menu-import/.
// In that case work out the matching directory under the webroot.
$localBasePath = '';
$isLocalScan = false;
if (!empty($targetUrl)) {
    $isLocalScan = stripos($targetUrl, '/temp/menu-import/') !== false;
}
if ($isLocalScan) {
    // Keep only "/temp/menu-import/<first segment>/" from the URL.
    $localUrlPath = preg_replace(
        '#https?://[^/]+(/temp/menu-import/[^/]+/).*#i',
        '$1',
        $targetUrl
    );
    $localBasePath = $expandPath($localUrlPath);
    $response['steps'][] = "Local scan detected, base path: $localBasePath";
}
// Download/read images (limit to 20). Each accepted image becomes a base64
// image entry in $imageDataArray; images of 5000 bytes or less are ignored.
$imageDataArray = [];
$downloadedCount = 0;
$localReadCount = 0;
// Canonical directory that local reads must stay inside. Image URLs come from
// scraped/uploaded HTML, so a "/temp/menu-import/../.." src must not be able to
// read arbitrary files from disk.
$localImportRoot = $isLocalScan ? realpath($expandPath('/temp/menu-import')) : false;
foreach (array_keys($imageUrls) as $imgUrl) {
    if ($downloadedCount >= 20) break;
    try {
        $imgContent = '';
        if ($isLocalScan && stripos($imgUrl, '/temp/menu-import/') !== false) {
            $localPath = realpath($expandPath(preg_replace('#https?://[^/]+(/temp/menu-import/.*)#i', '$1', $imgUrl)));
            $insideImportRoot = $localPath !== false
                && $localImportRoot !== false
                && str_starts_with($localPath, $localImportRoot . DIRECTORY_SEPARATOR);
            if ($insideImportRoot && is_file($localPath)) {
                $localBytes = file_get_contents($localPath);
                if ($localBytes !== false) {
                    $imgContent = $localBytes;
                    $localReadCount++;
                }
            }
        } else {
            $result = $httpGet($imgUrl, [], 10);
            $ct = (string)($result['contentType'] ?? '');
            // Only accept responses that declare a supported raster image type.
            if (($result['code'] ?? null) === 200 && !empty($result['body']) && preg_match('#image/(jpeg|jpg|png|gif|webp)#i', $ct)) {
                $imgContent = $result['body'];
            }
        }
        $imgBytes = strlen($imgContent);
        if ($imgBytes > 5000) {
            $base64Content = base64_encode($imgContent);
            // The media type is taken from the content itself; extension or
            // header based guesses would be overwritten here in any case.
            $mediaType = $detectMediaType($base64Content);
            $imageDataArray[] = [
                'type' => 'image',
                'source' => ['type' => 'base64', 'media_type' => $mediaType, 'data' => $base64Content],
                'url' => $imgUrl,
            ];
            $downloadedCount++;
        }
    } catch (Exception $e) {
        // Skip failed downloads (best effort: one bad image must not abort the scan)
    }
}
$response['steps'][] = "Loaded " . count($imageDataArray) . " valid images ($localReadCount from local disk)";
// ============================================================
// TOAST FAST PATH: Parse __OO_STATE__ directly instead of Claude
// ============================================================
if (stripos($pageHtml, 'window.__OO_STATE__') !== false && stripos($pageHtml, 'toasttab') !== false) {
$response['steps'][] = "Toast page detected - extracting menu data from __OO_STATE__";
try {
$ooJson = $extractOoState($pageHtml);
if ($ooJson !== null) {
// Decode the __OO_STATE__ blob and initialise the Toast accumulators.
$ooState = json_decode($ooJson, true);
if (!is_array($ooState)) throw new Exception("Failed to parse __OO_STATE__ JSON");
$toastBusiness = [];
$toastCategories = [];
$toastItems = [];
$categorySet = [];
$itemId = 1;
$menuNames = [];
// Extract restaurant info from ROOT_QUERY: entries whose key mentions
// restaurantV2By... or restaurantV2(...) carry name, location, colour, schedule.
if (!empty($ooState['ROOT_QUERY']) && is_array($ooState['ROOT_QUERY'])) {
    foreach ($ooState['ROOT_QUERY'] as $rqKey => $rqVal) {
        $isRestaurantEntry = (stripos($rqKey, 'restaurantV2By') !== false || stripos($rqKey, 'restaurantV2(') !== false) && is_array($rqVal);
        if (!$isRestaurantEntry) continue;
        // The first entry with a name wins; later entries do not overwrite it.
        if (!empty($rqVal['name']) && empty($toastBusiness['name'])) $toastBusiness['name'] = $rqVal['name'];
        if (!empty($rqVal['description']) && strlen(trim((string)$rqVal['description']))) {
            $toastBusiness['description'] = trim((string)$rqVal['description']);
        }
        if (!empty($rqVal['location']) && is_array($rqVal['location'])) {
            $loc = $rqVal['location'];
            if (!empty($loc['address1'])) {
                $toastBusiness['addressLine1'] = $loc['address1'];
                // Display address: "street, city, state zip"
                $toastBusiness['address'] = $loc['address1'];
                if (!empty($loc['city'])) { $toastBusiness['city'] = $loc['city']; $toastBusiness['address'] .= ', ' . $loc['city']; }
                if (!empty($loc['state'])) { $toastBusiness['state'] = $loc['state']; $toastBusiness['address'] .= ', ' . $loc['state']; }
                $zip = $loc['zip'] ?? $loc['zipCode'] ?? null;
                if (!empty($zip)) { $toastBusiness['zip'] = $zip; $toastBusiness['address'] .= ' ' . $zip; }
            }
            if (!empty($loc['phone'])) $toastBusiness['phone'] = $loc['phone'];
            if (!empty($loc['latitude']) && is_numeric($loc['latitude']) && !empty($loc['longitude']) && is_numeric($loc['longitude'])) {
                $toastBusiness['latitude'] = $loc['latitude'];
                $toastBusiness['longitude'] = $loc['longitude'];
            }
        }
        if (!empty($rqVal['brandColor'])) $toastBusiness['brandColor'] = str_replace('#', '', $rqVal['brandColor']);
        // Hours from schedule: first service period of each dated daily schedule.
        $dailySchedules = $rqVal['schedule']['upcomingSchedules'][0]['dailySchedules'] ?? null;
        if (!empty($dailySchedules) && is_array($dailySchedules)) {
            $dayHours = [];
            foreach ($dailySchedules as $ds) {
                $sp = $ds['servicePeriods'][0] ?? null;
                // Both ends of the period are required to print "open-close".
                if (empty($ds['date']) || empty($sp['startTime']) || empty($sp['endTime'])) continue;
                $dayTs = strtotime((string)$ds['date']);
                if ($dayTs === false) continue; // unparseable date
                $dow = (int)date('w', $dayTs) + 1; // 1=Sun
                $dayHours[$dow] = [
                    'open' => substr((string)$sp['startTime'], 0, 5),
                    'close' => substr((string)$sp['endTime'], 0, 5),
                ];
            }
            $dayNames = [1=>'Sun',2=>'Mon',3=>'Tue',4=>'Wed',5=>'Thu',6=>'Fri',7=>'Sat'];
            $dayOrder = [2,3,4,5,6,7,1]; // Mon-Sun
            $hoursParts = [];
            foreach ($dayOrder as $dIdx) {
                if (!isset($dayHours[$dIdx])) continue;
                $dh = $dayHours[$dIdx];
                $op = explode(':', $dh['open']);
                $cp = explode(':', $dh['close']);
                $openStr = $formatTime12h((int)$op[0], (int)($op[1] ?? 0));
                $closeStr = $formatTime12h((int)$cp[0], (int)($cp[1] ?? 0));
                $hoursParts[] = $dayNames[$dIdx] . " $openStr-$closeStr";
            }
            if (!empty($hoursParts)) $toastBusiness['hours'] = implode(', ', $hoursParts);
        }
    }
}
// Walk every top-level state entry:
//   "Restaurant:*" entries (older format) supply business info when no name
//   has been found yet; "Menu:*" entries supply groups, subgroups and items.

// Builds one item record from a raw Toast item, filed under $categoryName.
$makeToastItem = function ($rawItem, $categoryName, $seq) use ($extractToastPrice, $extractToastImage) {
    $record = [
        'id' => 'item_' . $seq,
        'name' => trim($rawItem['name']),
        'category' => $categoryName,
        'modifiers' => [],
        'hasModifiers' => !empty($rawItem['hasModifiers']),
        'guid' => $rawItem['guid'] ?? '',
        'itemGroupGuid' => $rawItem['itemGroupGuid'] ?? '',
        'description' => isset($rawItem['description']) ? trim((string)$rawItem['description']) : '',
        'price' => $extractToastPrice($rawItem),
        'imageUrl' => '',
    ];
    $picture = $extractToastImage($rawItem);
    if (strlen($picture)) {
        $record['imageUrl'] = $picture;
        $record['imageSrc'] = $picture;
        $record['imageFilename'] = basename($picture);
    }
    return $record;
};

foreach ($ooState as $stateKey => $stateVal) {
    if (str_starts_with($stateKey, 'Restaurant:') && empty($toastBusiness['name']) && is_array($stateVal)) {
        if (!empty($stateVal['name'])) {
            $toastBusiness['name'] = $stateVal['name'];
        }
        if (!empty($stateVal['location']) && is_array($stateVal['location'])) {
            $place = $stateVal['location'];
            if (!empty($place['address1'])) {
                $addressText = $place['address1'];
                if (!empty($place['city'])) {
                    $addressText .= ', ' . $place['city'];
                }
                if (!empty($place['state'])) {
                    $addressText .= ', ' . $place['state'];
                }
                if (!empty($place['zipCode'])) {
                    $addressText .= ' ' . $place['zipCode'];
                }
                $toastBusiness['address'] = $addressText;
            }
            if (!empty($place['phone'])) {
                $toastBusiness['phone'] = $place['phone'];
            }
        }
        if (!empty($stateVal['brandColor'])) {
            $toastBusiness['brandColor'] = str_replace('#', '', $stateVal['brandColor']);
        }
    }

    // Menu data
    $isMenuEntry = str_starts_with($stateKey, 'Menu:')
        && is_array($stateVal)
        && !empty($stateVal['groups'])
        && is_array($stateVal['groups']);
    if (!$isMenuEntry) {
        continue;
    }

    $menuName = $stateVal['name'] ?? '';
    if (strlen($menuName)) {
        $menuNames[] = $menuName;
    }

    foreach ($stateVal['groups'] as $menuGroup) {
        $groupName = trim($menuGroup['name'] ?? 'Menu');
        if (!isset($categorySet[$groupName])) {
            $categorySet[$groupName] = true;
            $toastCategories[] = ['name' => $groupName, 'itemCount' => 0, 'menuName' => $menuName];
        }

        // Items placed directly in the group
        if (!empty($menuGroup['items']) && is_array($menuGroup['items'])) {
            foreach ($menuGroup['items'] as $groupItem) {
                if (empty($groupItem['name']) || !strlen(trim($groupItem['name']))) {
                    continue;
                }
                $toastItems[] = $makeToastItem($groupItem, $groupName, $itemId);
                $itemId++;
            }
        }

        // Subgroups (the key name varies: subgroups / children / childGroups)
        $nestedGroups = $menuGroup['subgroups'] ?? $menuGroup['children'] ?? $menuGroup['childGroups'] ?? [];
        if (empty($nestedGroups) || !is_array($nestedGroups)) {
            continue;
        }
        foreach ($nestedGroups as $nested) {
            $subName = trim($nested['name'] ?? $groupName);
            if (strlen($subName) && !isset($categorySet[$subName])) {
                $categorySet[$subName] = true;
                $toastCategories[] = ['name' => $subName, 'parentCategoryName' => $groupName, 'itemCount' => 0];
            }
            if (empty($nested['items']) || !is_array($nested['items'])) {
                continue;
            }
            foreach ($nested['items'] as $nestedItem) {
                if (empty($nestedItem['name']) || !strlen(trim($nestedItem['name']))) {
                    continue;
                }
                $toastItems[] = $makeToastItem($nestedItem, $subName, $itemId);
                $itemId++;
            }
        }
    }
}
// Fallback: business name from the page <title> when the state held none.
if (empty($toastBusiness['name'])) {
    if (preg_match('#<title[^>]*>([^<]+)</title>#i', $pageHtml, $tm)) {
        $titleText = trim($tm[1]);
        // Keep only the part before the first "|" ...
        if (strpos($titleText, '|') !== false) $titleText = trim(explode('|', $titleText)[0]);
        // ... and drop a trailing " - Menu/Order/Online ..." suffix.
        $titleText = preg_replace('#\s*-\s*(Menu|Order|Online).*$#i', '', $titleText);
        if (strlen($titleText)) $toastBusiness['name'] = $titleText;
    }
}
// Clean business name: strip ordering boilerplate and any embedded street
// address, then trim leftover separators.
if (!empty($toastBusiness['name'])) {
    $bizName = $toastBusiness['name'];
    // "(?:&|and)" matches the literal joiners; a character class such as
    // [&and]+ would accept any mix of the letters a, n, d and "&".
    $bizName = preg_replace('#\s*[-|]+\s*(Order\s+(pickup|online|delivery|food)|Online\s+Order|Delivery\s*(?:&|and)\s*Takeout|Takeout\s*(?:&|and)\s*Delivery|Menu\s*(?:&|and)\s*Order).*$#i', '', $bizName);
    if (!empty($toastBusiness['addressLine1']) && stripos($bizName, $toastBusiness['addressLine1']) !== false) {
        $bizName = trim(str_ireplace($toastBusiness['addressLine1'], '', $bizName));
    }
    if (!empty($toastBusiness['address'])) {
        // First comma-separated part of the display address (the street).
        $addrFirst = trim(explode(',', $toastBusiness['address'])[0]);
        if (strlen($addrFirst) && stripos($bizName, $addrFirst) !== false) {
            $bizName = trim(str_ireplace($addrFirst, '', $bizName));
        }
    }
    // Removing the address can leave "Name - | " or doubled spaces behind:
    // strip separator runs (including the whitespace between them) at both
    // ends and collapse inner whitespace.
    $bizName = preg_replace('#[-|\s]+$#', '', $bizName);
    $bizName = preg_replace('#^[-|\s]+#', '', $bizName);
    $bizName = preg_replace('#\s{2,}#', ' ', $bizName);
    $toastBusiness['name'] = trim($bizName);
}
// Clean city: keep only the part before the first comma.
if (!empty($toastBusiness['city']) && strpos($toastBusiness['city'], ',') !== false) {
    $toastBusiness['city'] = trim(explode(',', $toastBusiness['city'])[0]);
}
// Multi-menu hierarchy: with more than one menu, each menu becomes a top-level
// category, its groups become children of the menu, and each group's
// subcategories follow it (still parented to the group).
if (count($menuNames) > 1) {
    // Subcategory entries carry 'parentCategoryName' but no 'menuName'; index
    // them by parent so they are not lost when regrouping by menu.
    $subcategoriesByParent = [];
    foreach ($toastCategories as $tc) {
        if (!array_key_exists('menuName', $tc) && ($tc['parentCategoryName'] ?? '') !== '') {
            $subcategoriesByParent[$tc['parentCategoryName']][] = $tc;
        }
    }
    $hierarchicalCategories = [];
    foreach ($menuNames as $mn) {
        $hierarchicalCategories[] = ['name' => $mn, 'itemCount' => 0];
        foreach ($toastCategories as $tc) {
            if (($tc['menuName'] ?? '') !== $mn) continue;
            $tc['parentCategoryName'] = $mn;
            $hierarchicalCategories[] = $tc;
            foreach ($subcategoriesByParent[$tc['name']] ?? [] as $subCat) {
                $hierarchicalCategories[] = $subCat;
            }
        }
    }
    $toastCategories = $hierarchicalCategories;
}
// Update category item counts: one pass over the items, then a lookup per category.
$itemsPerCategory = array_count_values(array_column($toastItems, 'category'));
foreach ($toastCategories as $ci => $tc) {
    $toastCategories[$ci]['itemCount'] = $itemsPerCategory[$tc['name']] ?? 0;
}
$response['steps'][] = "Extracted " . count($toastItems) . " items from " . count($toastCategories) . " categories via __OO_STATE__";
// Toast modifier extraction via Playwright (best effort): only attempted when
// at least one item is flagged as having modifiers.
$toastModifiers = [];
$modifierItemCount = 0;
foreach ($toastItems as $ti) {
    if (!empty($ti['hasModifiers'])) $modifierItemCount++;
}
if ($modifierItemCount > 0) {
    $response['steps'][] = "$modifierItemCount items have modifiers - extracting via Playwright";
    try {
        // Work out which Toast ordering URL to hand to the script.
        $toastUrl = '';
        if (!empty($targetUrl) && preg_match('#toasttab\.com#i', $targetUrl)) {
            $toastUrl = $targetUrl;
        } else {
            // Try shortUrl from HTML
            if (preg_match('#"shortUrl"\s*:\s*"([^"]+)"#i', $pageHtml, $sm)) {
                $toastUrl = 'https://www.toasttab.com/local/order/' . $sm[1];
            }
            // Otherwise take the slug from a gift-card link.
            if (empty($toastUrl) && preg_match('#toasttab\.com/([a-zA-Z0-9_-]+)/giftcards#i', $pageHtml, $gm)) {
                $toastUrl = 'https://www.toasttab.com/local/order/' . $gm[1];
            }
        }
        if (strlen($toastUrl)) {
            $response['steps'][] = "Fetching modifiers from: $toastUrl";
            $modOutput = shell_exec("/opt/playwright/run-toast-modifiers.sh " . escapeshellarg($toastUrl) . " 2>&1");
            if (!empty(trim($modOutput ?? ''))) {
                $modResult = json_decode($modOutput, true);
                if (!is_array($modResult)) {
                    // Output that is not JSON used to be dropped without a trace.
                    $response['steps'][] = "Playwright modifier script returned unparseable output";
                } else {
                    if (!empty($modResult['modifiers']) && is_array($modResult['modifiers'])) {
                        $toastModifiers = $modResult['modifiers'];
                        $response['steps'][] = "Extracted " . count($toastModifiers) . " unique modifier groups";
                    }
                    if (!empty($modResult['itemModifierMap']) && is_array($modResult['itemModifierMap'])) {
                        // The map is keyed by item name.
                        $modMap = $modResult['itemModifierMap'];
                        foreach ($toastItems as &$toastItemRef) {
                            if (isset($modMap[$toastItemRef['name']])) {
                                $toastItemRef['modifiers'] = $modMap[$toastItemRef['name']];
                            }
                        }
                        unset($toastItemRef);
                        $response['steps'][] = "Mapped modifiers to " . count($modMap) . " items";
                    }
                    if (!empty($modResult['stats'])) {
                        $response['steps'][] = "Modifier stats: " . json_encode($modResult['stats']);
                    }
                }
            } else {
                $response['steps'][] = "Playwright modifier script returned empty output";
            }
        } else {
            $response['steps'][] = "Could not determine Toast URL for modifier extraction";
        }
    } catch (Exception $e) {
        $response['steps'][] = "Modifier extraction failed: " . $e->getMessage() . " - continuing without modifiers";
    }
}
// Return directly if we have items
if (!empty($toastItems)) {
    $response['OK'] = true;
    $response['DATA'] = [
        'business' => $toastBusiness,
        'categories' => $toastCategories,
        'items' => $toastItems,
        'modifiers' => $toastModifiers,
        'imageUrls' => [],
        'imageMappings' => $imageMappings,
        'headerCandidateIndices' => [],
    ];
    $response['sourceUrl'] = !empty($targetUrl) ? $targetUrl : 'uploaded';
    $response['pagesProcessed'] = 1;
    $response['imagesFound'] = count($imageDataArray);
    $response['playwrightImagesCount'] = count($playwrightImages);
    $response['parsedVia'] = 'toast_oo_state';
    jsonResponse($response);
}
}
} catch (Exception $e) {
$toastError = "Toast __OO_STATE__ parsing failed: " . $e->getMessage();
$response['steps'][] = "$toastError - falling back to Claude";
$response['DEBUG_TOAST_ERROR'] = $toastError;
}
}
// ============================================================
// Look for embedded JSON data (__NEXT_DATA__, window state, etc.)
// ============================================================
// Every hit is appended to $embeddedJsonData under a labelled separator.
$embeddedJsonData = '';
foreach ($menuPages as $menuPage) {
    // <script id="__NEXT_DATA__">...</script>: capture the script body.
    if (preg_match_all('#<script[^>]*id=["\']__NEXT_DATA__["\'][^>]*>(.*?)</script>#is', $menuPage['html'], $ndm)) {
        foreach ($ndm[1] as $sc) $embeddedJsonData .= "\n--- __NEXT_DATA__ ---\n$sc";
    }
    // window.__SOMETHING__ = {...}; assignments (whole statement kept).
    if (preg_match_all('#window\.__[A-Z_]+__\s*=\s*(\{[^;]+\});#', $menuPage['html'], $stm)) {
        foreach ($stm[0] as $sm) $embeddedJsonData .= "\n--- WINDOW_STATE ---\n$sm";
    }
    // data-props / data-page / data-state attributes holding a JSON object.
    if (preg_match_all('#data-(?:props|page|state)=["\'](\{[^"\']+\})["\']#i', $menuPage['html'], $dpm)) {
        foreach ($dpm[0] as $dp) $embeddedJsonData .= "\n--- DATA_PROPS ---\n$dp";
    }
    // JSON-LD blocks, kept only when they mention a menu.
    if (preg_match_all('#<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>#is', $menuPage['html'], $ldm)) {
        foreach ($ldm[1] as $sc) {
            if (stripos($sc, 'menu') !== false || stripos($sc, 'MenuItem') !== false) {
                $embeddedJsonData .= "\n--- JSON_LD_MENU ---\n$sc";
            }
        }
    }
}
$response['DEBUG_EMBEDDED_JSON_FOUND'] = strlen($embeddedJsonData) > 0;
if ($response['DEBUG_EMBEDDED_JSON_FOUND']) {
    $response['DEBUG_EMBEDDED_JSON_LENGTH'] = strlen($embeddedJsonData);
}
// Combine HTML, strip scripts/styles/comments, append any embedded JSON that
// was found, and cap the result at 100000 bytes.
$combinedHtml = '';
$stripPatterns = [
    '#<script[^>]*>.*?</script>#is',
    '#<style[^>]*>.*?</style>#is',
    '#<!--.*?-->#s',
];
foreach ($menuPages as $menuPage) {
    $cleanHtml = $menuPage['html'];
    foreach ($stripPatterns as $stripPattern) {
        // preg_replace() returns null on a PCRE error (e.g. backtrack limit on
        // a very large page); keep the unstripped text rather than losing it.
        $strippedHtml = preg_replace($stripPattern, '', $cleanHtml);
        if ($strippedHtml !== null) $cleanHtml = $strippedHtml;
    }
    $combinedHtml .= "\n--- PAGE: " . $menuPage['url'] . " ---\n" . $cleanHtml;
}
if (strlen($embeddedJsonData)) {
    $combinedHtml .= "\n\n=== EMBEDDED JSON DATA (may contain full menu) ===\n" . $embeddedJsonData;
}
if (strlen($combinedHtml) > 100000) {
    // Cut on a character boundary when mbstring is available, so the limit
    // cannot leave half of a multi-byte UTF-8 character at the end.
    $combinedHtml = function_exists('mb_strcut')
        ? mb_strcut($combinedHtml, 0, 100000, 'UTF-8')
        : substr($combinedHtml, 0, 100000);
}
// Server-side heading hierarchy detection: scan the combined HTML in document
// order and record, for each h2, the h3 headings that follow it.
//   $headingHierarchy : h2 text => list of h3 texts
//   $hierarchyDesc    : the same as prompt-ready bullet lines
$headingHierarchy = [];
$hierarchyDesc = '';
$scanPos = 0;
$currentH2 = '';
$combinedLen = strlen($combinedHtml);
while ($scanPos < $combinedLen) {
    $nextH2 = preg_match('#<h2\b[^>]*>#i', $combinedHtml, $m2, PREG_OFFSET_CAPTURE, $scanPos) ? $m2[0][1] : false;
    $nextH3 = preg_match('#<h3\b[^>]*>#i', $combinedHtml, $m3, PREG_OFFSET_CAPTURE, $scanPos) ? $m3[0][1] : false;
    if ($nextH2 === false && $nextH3 === false) break;
    if ($nextH2 !== false && ($nextH3 === false || $nextH2 < $nextH3)) {
        // An h2 comes first: it becomes the current parent, unless it is empty,
        // the generic word "MENU", or a copyright line (which clears the parent).
        $closePos = stripos($combinedHtml, '</h2>', $nextH2);
        if ($closePos === false) break;
        // +5 == strlen('</h2>'), so the slice includes the closing tag.
        $tagContent = substr($combinedHtml, $nextH2, $closePos + 5 - $nextH2);
        $h2Raw = trim(strip_tags($tagContent));
        $h2Clean = trim(preg_replace('/[^a-zA-Z0-9 ]/', '', $h2Raw));
        if (strlen($h2Clean) && strtoupper($h2Clean) !== 'MENU' && stripos($h2Clean, 'copyright') === false) {
            $currentH2 = $h2Raw;
        } else {
            $currentH2 = '';
        }
        $scanPos = $closePos + 5;
    } else {
        // An h3 comes first: file it under the current h2, if there is one.
        $closePos = stripos($combinedHtml, '</h3>', $nextH3);
        if ($closePos === false) break;
        $tagContent = substr($combinedHtml, $nextH3, $closePos + 5 - $nextH3);
        $h3Text = trim(strip_tags($tagContent));
        if (strlen($currentH2) && strlen($h3Text)) {
            if (!isset($headingHierarchy[$currentH2])) $headingHierarchy[$currentH2] = [];
            $headingHierarchy[$currentH2][] = $h3Text;
        }
        $scanPos = $closePos + 5;
    }
}
if (!empty($headingHierarchy)) {
    foreach ($headingHierarchy as $hParent => $hChildren) {
        $hierarchyDesc .= "- \"$hParent\" contains subsections: " . implode(', ', $hChildren) . "\n";
    }
    $response['steps'][] = "Detected " . count($headingHierarchy) . " parent categories with subcategories from h2/h3 structure";
}
// ============================================================
// Claude API call for generic pages
// ============================================================
$systemPrompt = 'You are an expert at extracting structured menu data from restaurant website HTML. Extract ALL menu data visible in the HTML. Return valid JSON with these keys: business (object with name, address, phone, hours, brandColor), menus (array of objects — see below), categories (array), modifiers (array), items (array with name, description, price, category, menu, modifiers array, and imageUrl). MENUS vs CATEGORIES (CRITICAL): A MENU is a distinct time-based or themed menu that a restaurant offers separately — e.g., "Brunch", "Lunch", "Dinner", "Happy Hour", "Late Night", "Kids Menu". If a restaurant has multiple menus, return a "menus" array of objects like [{"name": "Brunch"}, {"name": "Lunch"}, {"name": "Dinner"}]. Each item should have a "menu" field set to which menu it belongs to. If the restaurant only has one menu or the sections are food-type categories (not time/theme based), omit the "menus" key entirely and treat everything as categories within a single menu. CATEGORIES vs ITEMS (CRITICAL): A CATEGORY is a broad section heading that groups multiple items (e.g., \'Appetizers\', \'Tacos\', \'Drinks\', \'Desserts\'). An ITEM is an individual food or drink product with a name, description, and price. Do NOT create a category for each individual item. A typical restaurant has 5-15 categories and 30-150 items. If you find yourself creating more categories than items, you are wrong - those are items, not categories. Each item must have a \'category\' field set to the category it belongs to. CATEGORIES FORMAT: Each entry in the categories array can be either a simple string (for flat categories) OR an object with \'name\' and optional \'subcategories\' array. Example: ["Appetizers", {"name": "Drinks", "subcategories": ["Hot Drinks", "Cold Drinks"]}, "Desserts"]. 
SUBCATEGORY DETECTION: If a section header contains nested titled sections beneath it (sub-headers with their own items), the outer section is the PARENT and inner sections are SUBCATEGORIES. For items in subcategories, set their \'category\' field to the SUBCATEGORY name (not the parent). CRITICAL FOR IMAGES: Each menu item in the HTML is typically in a container (div, li, article) that also contains an img tag. Extract the img src URL and include it as \'imageUrl\' for that item. Look for img tags that are siblings or children within the same menu-item container. The image URL should be the full or relative src value from the img tag - NOT the alt text. CRITICAL: Extract EVERY menu item from ALL sources including embedded JSON (__NEXT_DATA__, window state, JSON-LD). For brandColor: suggest a vibrant hex (6 digits, no hash). For prices: numbers (e.g., 12.99). CRITICAL: Return ONLY valid JSON. All special characters in strings must be properly escaped. Never use smart/curly quotes. Use only ASCII double quotes for JSON string delimiters and backslash-escape any literal double quotes inside values.';
// Assemble the content blocks of the single user message: image blocks first, then one
// text block carrying the instructions and the HTML.
$messagesContent = [];
// Forward at most the first ten images; each entry's 'source' is passed through as-is.
foreach (array_slice($imageDataArray, 0, 10) as $imageEntry) {
    $messagesContent[] = ['type' => 'image', 'source' => $imageEntry['source']];
}
// The text block is built from up to three pieces: the fixed instruction, an optional
// heading-hierarchy hint (only when $hierarchyDesc is non-empty), and the combined HTML.
$textParts = [
    "Extract menu data from this restaurant website HTML. The images above are from the same website - identify which ones are food photos that could be used as item images, and which could be header/banner images.",
];
if (strlen($hierarchyDesc) > 0) {
    $textParts[] = "\n\nIMPORTANT - DETECTED SECTION HIERARCHY FROM HTML HEADINGS:\n"
        . "The following h2 sections contain h3 sub-sections. Use these as parent-subcategory relationships in your categories output:\n"
        . $hierarchyDesc
        . "For each parent above, include it in the categories array as an OBJECT with 'name' and 'subcategories' array. Items belonging to a subsection should have their 'category' field set to the SUBCATEGORY name (not the parent).";
}
$textParts[] = "\n\nHere is the HTML content:\n\n" . $combinedHtml;
$userText = implode('', $textParts);
$messagesContent[] = ['type' => 'text', 'text' => $userText];
// Payload for the Messages API call: the system prompt plus one user message made of the
// image and text blocks assembled above.
$requestBody = [
    'model' => 'claude-sonnet-4-20250514',
    'max_tokens' => 16384,
    'temperature' => 0,
    'system' => $systemPrompt,
    'messages' => [['role' => 'user', 'content' => $messagesContent]],
];
$response['steps'][] = "Sending to Claude API...";
// The payload embeds scraped HTML, which may contain byte sequences that are not valid
// UTF-8. json_encode() returns false for such input unless told to substitute, and the
// unchecked false would otherwise be posted as the request body. Substitute invalid bytes
// with U+FFFD and fail loudly if encoding still does not succeed.
$requestJson = json_encode($requestBody, JSON_INVALID_UTF8_SUBSTITUTE);
if ($requestJson === false) {
    throw new Exception('Failed to encode Claude request body: ' . json_last_error_msg());
}
// Last argument is presumably the timeout in seconds — $httpPost is defined outside this
// section; confirm against its definition.
$claudeResult = $httpPost(
    'https://api.anthropic.com/v1/messages',
    $requestJson,
    ['Content-Type: application/json', "x-api-key: $CLAUDE_API_KEY", 'anthropic-version: 2023-06-01'],
    120
);
// Any status other than 200 aborts the run. The exception text carries the API's own
// error message when the body decodes to one, otherwise the first 500 bytes of the body.
if ($claudeResult['code'] !== 200) {
    $errData = json_decode($claudeResult['body'], true);
    $errorDetail = empty($errData['error']['message'])
        ? substr($claudeResult['body'], 0, 500)
        : $errData['error']['message'];
    throw new Exception("Claude API error: {$claudeResult['code']} - $errorDetail");
}
$claudeResponse = json_decode($claudeResult['body'], true);
if (empty($claudeResponse['content'])) {
    throw new Exception("Empty response from Claude");
}
// Use the text of the first content block whose type is "text"; stays '' if there is none.
$responseText = '';
foreach ($claudeResponse['content'] as $contentBlock) {
    if (($contentBlock['type'] ?? '') !== 'text') {
        continue;
    }
    $responseText = $contentBlock['text'];
    break;
}
// Clean the model output (helper defined outside this section), keep it for debugging,
// then decode it as the menu payload.
$responseText = $cleanClaudeJson($responseText);
$response['DEBUG_RAW_CLAUDE'] = $responseText;
$menuData = json_decode($responseText, true);
if (!is_array($menuData)) {
    // Decoding failed or produced a non-array: report it with the first 3000 bytes of the
    // cleaned text. NOTE(review): the code below assumes jsonResponse() ends the request
    // here — confirm against its definition.
    $response['OK'] = false;
    $response['MESSAGE'] = 'JSON parse error';
    $response['DEBUG_RAW_RESPONSE'] = substr($responseText, 0, 3000);
    jsonResponse($response);
}
// Build image URL list: every non-empty 'url' from the collected images, in order.
$imageUrlList = [];
foreach ($imageDataArray as $imgData) {
    if (!empty($imgData['url'])) {
        $imageUrlList[] = $imgData['url'];
    }
}
// Ensure expected structure. json_decode() only guarantees that the top level is an
// array; a section that is missing, null, or of the wrong type (e.g. "items": "none") is
// reset to an empty array so the count()/foreach/offset code below cannot raise a
// TypeError, which the surrounding catch (Exception) would not intercept.
foreach (['business', 'categories', 'modifiers', 'items'] as $sectionKey) {
    if (!isset($menuData[$sectionKey]) || !is_array($menuData[$sectionKey])) {
        $menuData[$sectionKey] = [];
    }
}
// Server-side address parsing: split combined address into components.
// USPS two-letter codes (states, DC, territories). Used to validate the trailing
// two-letter token so street suffixes such as "St" or "Dr" are not taken for a state.
$usStateCodes = array_fill_keys([
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA', 'HI', 'ID', 'IL',
    'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE',
    'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD',
    'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY', 'PR', 'VI', 'GU', 'AS', 'MP',
], true);
// Helper: split a one-line US address into components.
// Returns only the keys it could determine, in this order: 'zip', 'state',
// 'addressLine1', 'city'. An address without a comma yields no addressLine1/city.
$splitUsAddress = static function (string $address) use ($usStateCodes): array {
    $found = [];
    // Strip a trailing country name. The \b keeps words that merely end in "us"
    // (e.g. "Columbus") intact, and "U.S." is accepted as well as "U.S.A.".
    $addr = trim((string)preg_replace('/,?\s*\b(United States|USA|US|U\.S\.(?:A\.?)?)\s*$/i', '', $address));
    // Extract ZIP (5 digits, optional +4 which is dropped).
    if (preg_match('/\b(\d{5})(?:-\d{4})?\s*$/', $addr, $zm)) {
        $found['zip'] = $zm[1];
        // The match is anchored at the end, so cutting its length removes exactly it.
        $addr = trim(substr($addr, 0, strlen($addr) - strlen($zm[0])));
    }
    // Extract state: trailing two letters, accepted only if they form a real USPS code.
    if (preg_match('/\b([A-Z]{2})\s*$/i', $addr, $sm) && isset($usStateCodes[strtoupper($sm[1])])) {
        $found['state'] = strtoupper($sm[1]);
        $addr = trim(substr($addr, 0, strlen($addr) - strlen($sm[0])));
    }
    // Split remaining into addressLine1 and city by comma.
    $addr = rtrim($addr, ', ');
    if (strpos($addr, ',') !== false) {
        $segments = array_map('trim', explode(',', $addr));
        $found['addressLine1'] = $segments[0];
        $found['city'] = $segments[1] ?? '';
    }
    return $found;
};
$biz = &$menuData['business'];
// Only parse when a combined string address exists and no addressLine1 is present yet.
if (!empty($biz['address']) && is_string($biz['address']) && empty($biz['addressLine1'])) {
    foreach ($splitUsAddress($biz['address']) as $field => $value) {
        $biz[$field] = $value;
    }
}
// Clean city if it still has state/zip/country in it
if (!empty($biz['city']) && is_string($biz['city']) && strpos($biz['city'], ',') !== false) {
    $biz['city'] = trim(explode(',', $biz['city'])[0]);
}
// Drop the reference so $menuData['business'] is a plain value again when $menuData is
// copied into the response later.
unset($biz);
// When more than one menu was returned, record their names in the step log. The menus
// array itself stays inside $menuData untouched.
if (!empty($menuData['menus']) && is_array($menuData['menus']) && count($menuData['menus']) > 1) {
    $menuNames = array_column($menuData['menus'], 'name');
    $response['steps'][] = "Detected " . count($menuData['menus']) . " separate menus: " . implode(', ', $menuNames);
}
// Flatten the categories into a uniform list of ['name', 'itemCount'] rows. Plain strings
// become top-level rows; objects become a parent row followed by one row per subcategory,
// each tagged with 'parentCategoryName'. Entries of any other shape, and objects without
// a name, are dropped.
$formattedCategories = [];
foreach ($menuData['categories'] as $catEntry) {
    if (is_string($catEntry)) {
        $formattedCategories[] = ['name' => $catEntry, 'itemCount' => 0];
        continue;
    }
    if (!is_array($catEntry)) {
        continue;
    }
    $topName = $catEntry['name'] ?? '';
    if (strlen($topName) === 0) {
        continue;
    }
    $formattedCategories[] = ['name' => $topName, 'itemCount' => 0];
    $children = $catEntry['subcategories'] ?? null;
    if (empty($children) || !is_array($children)) {
        continue;
    }
    foreach ($children as $child) {
        // A subcategory may be given as a bare string or as an object with 'name'.
        $childName = is_string($child) ? $child : ($child['name'] ?? '');
        if (strlen($childName) === 0) {
            continue;
        }
        $formattedCategories[] = ['name' => $childName, 'parentCategoryName' => $topName, 'itemCount' => 0];
    }
}
$menuData['categories'] = $formattedCategories;
// Fix "every item is a category" pattern: when the model emitted (nearly) one category per
// item, the categories with no items are the real section headings and the single-item
// ones are really items. Each non-empty category is folded into the most recent empty
// category that precedes it in list order (or the first empty one, if none precedes it).
$totalItems = count($menuData['items']);
$totalCats = count($formattedCategories);
if ($totalCats > 10 && $totalItems > 0 && $totalCats > $totalItems * 0.5) {
    // Tally items per category name in a single pass. Items without a string 'category'
    // are simply not counted (no undefined-key access).
    $itemsPerCategory = [];
    foreach ($menuData['items'] as $fi) {
        $fiCategory = is_array($fi) ? ($fi['category'] ?? null) : null;
        if (is_string($fiCategory)) {
            $itemsPerCategory[$fiCategory] = ($itemsPerCategory[$fiCategory] ?? 0) + 1;
        }
    }
    $zeroCats = [];
    $singleCats = [];
    foreach ($formattedCategories as $fc) {
        $fcCount = $itemsPerCategory[$fc['name']] ?? 0;
        if ($fcCount === 0) {
            $zeroCats[] = $fc['name'];
        } elseif ($fcCount === 1) {
            $singleCats[] = $fc['name'];
        }
    }
    if (count($singleCats) > $totalCats * 0.6 && !empty($zeroCats)) {
        $response['steps'][] = "Detected 'every item is a category' pattern (" . count($singleCats) . " single-item cats, " . count($zeroCats) . " empty cats) - collapsing";
        // Walk the categories in order and note, for every non-empty category, which
        // empty category it collapses into. The first occurrence of a name wins.
        $isZeroCat = array_fill_keys($zeroCats, true);
        $collapseInto = [];
        $currentParent = $zeroCats[0];
        foreach ($formattedCategories as $fc) {
            $fcName = $fc['name'];
            if (isset($isZeroCat[$fcName])) {
                $currentParent = $fcName;
            } elseif (!isset($collapseInto[$fcName])) {
                $collapseInto[$fcName] = $currentParent;
            }
        }
        // Re-home the items and count the new category sizes in the same pass.
        $collapsedCounts = [];
        foreach ($menuData['items'] as $itemKey => $fi) {
            $fiCategory = is_array($fi) ? ($fi['category'] ?? null) : null;
            if (!is_string($fiCategory)) {
                continue;
            }
            if (isset($collapseInto[$fiCategory])) {
                $fiCategory = $collapseInto[$fiCategory];
                $menuData['items'][$itemKey]['category'] = $fiCategory;
            }
            $collapsedCounts[$fiCategory] = ($collapsedCounts[$fiCategory] ?? 0) + 1;
        }
        // Only the formerly empty categories survive, now carrying their item counts.
        $fixedCategories = [];
        foreach ($zeroCats as $zc) {
            $fixedCategories[] = ['name' => $zc, 'itemCount' => $collapsedCounts[$zc] ?? 0];
        }
        $menuData['categories'] = $fixedCategories;
        $formattedCategories = $fixedCategories;
        $response['steps'][] = "Collapsed to " . count($fixedCategories) . " categories";
    }
}
// Server-side hierarchy enforcement from HTML heading structure: categories that have no
// parent yet but whose name matches a detected sub-heading are attached to the category
// matching that sub-heading's parent heading.
if (!empty($headingHierarchy)) {
    // Lower-cased, trimmed child heading => parent heading text.
    $h3ToParent = [];
    foreach ($headingHierarchy as $hParentName => $hChildren) {
        foreach ($hChildren as $hChild) {
            $h3ToParent[strtolower(trim($hChild))] = $hParentName;
        }
    }
    $hierarchyApplied = 0;
    $categoryCount = count($formattedCategories);
    for ($i = 0; $i < $categoryCount; $i++) {
        if (!empty($formattedCategories[$i]['parentCategoryName'])) {
            continue;
        }
        $ownName = $formattedCategories[$i]['name'];
        $catLower = strtolower(trim($ownName));
        if (!isset($h3ToParent[$catLower])) {
            continue;
        }
        $rawParent = (string)$h3ToParent[$catLower];
        // Normalised parent heading: punctuation stripped, trailing "menu" removed.
        // Computed once per category rather than once per candidate.
        $rawParentLower = strtolower($rawParent);
        $parentNorm = strtolower(preg_replace('/[^a-zA-Z0-9 ]/', '', $rawParent));
        $parentNorm = trim(preg_replace('/\s*menu\s*$/i', '', $parentNorm));
        $matchedParent = '';
        foreach ($formattedCategories as $pcat) {
            // A category must never become its own parent (happens when a sub-heading
            // repeats its section heading, e.g. "Drinks Menu" > "Drinks").
            if ($pcat['name'] === $ownName) {
                continue;
            }
            $pcatLower = strtolower($pcat['name']);
            $pcatNorm = trim(preg_replace('/\s*menu\s*$/i', '', $pcatLower));
            if ($pcatNorm === $parentNorm || $pcatLower === $rawParentLower) {
                $matchedParent = $pcat['name'];
                break;
            }
        }
        if (strlen($matchedParent)) {
            $formattedCategories[$i]['parentCategoryName'] = $matchedParent;
            $hierarchyApplied++;
        }
    }
    if ($hierarchyApplied > 0) {
        $menuData['categories'] = $formattedCategories;
        $response['steps'][] = "Server-side hierarchy: applied $hierarchyApplied parent-child relationships";
    }
}
// Normalise the item list before indexing into it: keep only array entries and re-index
// from 0. With non-sequential keys the index-based ID assignment would create new elements
// while the loop bound grows, and a non-array entry cannot take an 'id' offset at all.
$menuData['items'] = is_array($menuData['items'])
    ? array_values(array_filter($menuData['items'], 'is_array'))
    : [];
foreach ($menuData['items'] as $i => $item) {
    // Items with subcategory field from Claude: the subcategory becomes the category.
    if (!empty($item['subcategory'])) {
        $menuData['items'][$i]['category'] = $item['subcategory'];
    }
    // Add item IDs: 1-based position in the list.
    $menuData['items'][$i]['id'] = 'item_' . ($i + 1);
}
// Process item images. Per item, in priority order:
//   1. an 'images' map (size key => URL): record every basename in 'imageFilenames' and
//      take the first usable URL among src/large/medium/small as 'imageSrc';
//   2. a single 'imageUrl': copy it to 'imageSrc';
//   3. an existing 'imageSrc': only derive 'imageFilename'.
// $itemsWithImages counts items that matched any of the three.
$itemsWithImages = 0;
foreach ($menuData['items'] as $i => $item) {
    if (!is_array($item)) {
        continue;
    }
    if (!empty($item['images']) && is_array($item['images'])) {
        $imgObj = $item['images'];
        $itemsWithImages++;
        $filenames = [];
        foreach ($imgObj as $sizeKey => $imgUrl) {
            if (is_scalar($imgUrl) && strlen(trim((string)$imgUrl))) {
                $filenames[$sizeKey] = basename((string)$imgUrl);
            }
        }
        $menuData['items'][$i]['imageFilenames'] = $filenames;
        // Pick the first preferred size that actually holds a URL. A key that is present
        // but blank (or not a string) must not hide the next size.
        $primarySrc = null;
        foreach (['src', 'large', 'medium', 'small'] as $preferredKey) {
            $candidate = $imgObj[$preferredKey] ?? null;
            if (is_string($candidate) && trim($candidate) !== '') {
                $primarySrc = $candidate;
                break;
            }
        }
        if ($primarySrc !== null) {
            $menuData['items'][$i]['imageSrc'] = $primarySrc;
            $menuData['items'][$i]['imageFilename'] = basename($primarySrc);
        }
        continue;
    }
    // Only string values are usable here; basename() would throw on anything else.
    $imageUrl = $item['imageUrl'] ?? null;
    $imageSrc = $item['imageSrc'] ?? null;
    if (is_string($imageUrl) && trim($imageUrl) !== '') {
        $menuData['items'][$i]['imageSrc'] = $imageUrl;
        $menuData['items'][$i]['imageFilename'] = basename($imageUrl);
        $itemsWithImages++;
    } elseif (is_string($imageSrc) && trim($imageSrc) !== '') {
        $menuData['items'][$i]['imageFilename'] = basename($imageSrc);
        $itemsWithImages++;
    }
}
$response['steps'][] = "Found images for $itemsWithImages of " . count($menuData['items']) . " items";
// Attach the image bookkeeping to the payload first, because the payload is copied into
// the response right after.
$menuData['imageUrls'] = $imageUrlList;
$menuData['headerCandidateIndices'] = [];
$menuData['imageMappings'] = $imageMappings;
// Success fields, written in this order. 'sourceUrl' falls back to the literal
// 'uploaded' when no target URL was set.
$successFields = [
    'OK' => true,
    'DATA' => $menuData,
    'sourceUrl' => empty($targetUrl) ? 'uploaded' : $targetUrl,
    'pagesProcessed' => count($menuPages),
    'imagesFound' => count($imageDataArray),
    'playwrightImagesCount' => count($playwrightImages),
];
foreach ($successFields as $fieldName => $fieldValue) {
    $response[$fieldName] = $fieldValue;
}
} catch (Exception $e) {
// Failure path: the exception message becomes the response MESSAGE. The success fields
// assigned at the end of the try block are skipped when an exception is thrown earlier.
// NOTE(review): only Exception is caught, so PHP Error/TypeError instances would not be
// converted into a MESSAGE by this handler — confirm whether that is intended.
$response['MESSAGE'] = $e->getMessage();
}
// Send whatever $response holds at this point.
jsonResponse($response);