From 571930ed2547d9a0bc29c38de77b3e2fcc00cc10 Mon Sep 17 00:00:00 2001
From: John Mizerek
Date: Sat, 14 Mar 2026 17:34:55 -0700
Subject: [PATCH] Extract business info during discovery phase

- Parse JSON-LD structured data (Restaurant, FoodEstablishment, etc.)
- Extract phone from tel: links, address from og: meta tags
- Return businessInfo in discovery response so sub-pages don't need it

Co-Authored-By: Claude Opus 4.6
---
 api/setup/analyzeMenuUrl.php | 105 ++++++++++++++++++++++++++++++++++--
 1 file changed, 102 insertions(+), 3 deletions(-)

diff --git a/api/setup/analyzeMenuUrl.php b/api/setup/analyzeMenuUrl.php
index 4196d56..7f0c222 100644
--- a/api/setup/analyzeMenuUrl.php
+++ b/api/setup/analyzeMenuUrl.php
@@ -53,13 +53,111 @@ try {
     $subPages = $pwResult['subPagesVisited'] ?? [];
     $platformPages = $pwResult['platformPagesVisited'] ?? [];
 
-    // Extract business name from title
+    // Extract business info (name, phone, address, hours) from the main page HTML.
+    // Sources are tried from most to least reliable; each fallback only fills
+    // fields that an earlier source left empty.
     $html = $pwResult['html'] ?? '';
-    $siteName = '';
-    if (preg_match('#<title[^>]*>([^<]+)#i', $html, $tm)) {
+    $bizInfo = [];
+
+    // 1. Try JSON-LD structured data (most reliable)
+    $bizTypes = ['Restaurant', 'FoodEstablishment', 'LocalBusiness', 'CafeOrCoffeeShop', 'BarOrPub'];
+    // Lazy match up to </script> so a '<' inside a JSON string does not drop the whole block.
+    if (preg_match_all('#<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>#is', $html, $ldMatches)) {
+        foreach ($ldMatches[1] as $ldJson) {
+            $ld = json_decode($ldJson, true);
+            if (!is_array($ld)) continue;
+            // A document may be a single node, an @graph wrapper, or a top-level list of nodes.
+            if (isset($ld['@graph']) && is_array($ld['@graph'])) {
+                $entries = $ld['@graph'];
+            } elseif (isset($ld[0])) {
+                $entries = $ld;
+            } else {
+                $entries = [$ld];
+            }
+            foreach ($entries as $entry) {
+                if (!is_array($entry)) continue;
+                // @type may be a single string or a list of types.
+                $types = array_filter((array)($entry['@type'] ?? []), 'is_string');
+                if (!array_intersect($types, $bizTypes)) continue;
+
+                if (!empty($entry['name'])) $bizInfo['name'] = $entry['name'];
+                if (!empty($entry['telephone'])) $bizInfo['phone'] = $entry['telephone'];
+                if (!empty($entry['address'])) {
+                    $a = $entry['address'];
+                    if (is_string($a)) {
+                        $bizInfo['address'] = $a;
+                    } elseif (is_array($a)) {
+                        if (!empty($a['streetAddress'])) $bizInfo['addressLine1'] = $a['streetAddress'];
+                        if (!empty($a['addressLocality'])) $bizInfo['city'] = $a['addressLocality'];
+                        if (!empty($a['addressRegion'])) $bizInfo['state'] = $a['addressRegion'];
+                        if (!empty($a['postalCode'])) $bizInfo['zip'] = $a['postalCode'];
+                        $bizInfo['address'] = trim(implode(', ', array_filter([
+                            $a['streetAddress'] ?? '', $a['addressLocality'] ?? '',
+                            $a['addressRegion'] ?? '', $a['postalCode'] ?? ''
+                        ])));
+                    }
+                }
+                if (!empty($entry['openingHours'])) {
+                    $bizInfo['hours'] = is_array($entry['openingHours'])
+                        ? implode(', ', $entry['openingHours'])
+                        : $entry['openingHours'];
+                }
+                // openingHoursSpecification, when it yields usable rows, overrides openingHours.
+                if (!empty($entry['openingHoursSpecification']) && is_array($entry['openingHoursSpecification'])) {
+                    $dayMap = ['Monday'=>'Mon','Tuesday'=>'Tue','Wednesday'=>'Wed','Thursday'=>'Thu','Friday'=>'Fri','Saturday'=>'Sat','Sunday'=>'Sun'];
+                    $hParts = [];
+                    foreach ($entry['openingHoursSpecification'] as $spec) {
+                        $days = $spec['dayOfWeek'] ?? [];
+                        if (is_string($days)) $days = [$days];
+                        $open = $spec['opens'] ?? '';
+                        $close = $spec['closes'] ?? '';
+                        // basename() keeps only the last path segment, so URL-style day values map too.
+                        $dayAbbrs = array_map(fn($d) => $dayMap[basename($d)] ?? $d, $days);
+                        if ($open && $close) $hParts[] = implode('/', $dayAbbrs) . " $open-$close";
+                    }
+                    if (!empty($hParts)) $bizInfo['hours'] = implode(', ', $hParts);
+                }
+            }
+        }
+    }
+
+    // 2. Business name from title (fallback)
+    $siteName = $bizInfo['name'] ?? '';
+    if (empty($siteName) && preg_match('#<title[^>]*>([^<]+)#i', $html, $tm)) {
         $siteName = trim($tm[1]);
         $siteName = preg_replace('#\s*[-|]+\s*(Menu|Order|Online|Home|Welcome|Restaurant).*$#i', '', $siteName);
         $siteName = trim($siteName);
+        $bizInfo['name'] = $siteName;
+    }
+
+    // 3. Phone from tel: links (fallback)
+    if (empty($bizInfo['phone']) && preg_match('#href=["\']tel:([^"\']+)["\']#i', $html, $pm)) {
+        // Keep digits, '+', parentheses, whitespace and '-'. The hyphen must be last in the
+        // class: PCRE2 (PHP 7.3+) rejects "-\s" as an invalid range in a character class.
+        $bizInfo['phone'] = trim(preg_replace('/[^\d+()\s-]/', '', $pm[1]));
+    }
+
+    // 4. Address from meta tags (fallback)
+    if (empty($bizInfo['address'])) {
+        $metaFields = [
+            'addressLine1' => 'og:street-address|business:contact_data:street_address',
+            'city' => 'og:locality|business:contact_data:locality',
+            'state' => 'og:region|business:contact_data:region',
+            'zip' => 'og:postal-code|business:contact_data:postal_code',
+        ];
+        foreach ($metaFields as $field => $names) {
+            $metaRe = '#<meta[^>]+(?:property|name)=["\'](?:' . $names . ')["\'][^>]+content=["\']([^"\']+)["\']#i';
+            if (preg_match($metaRe, $html, $mm)) {
+                $bizInfo[$field] = trim($mm[1]);
+            }
+        }
+        // Build the combined address the same way as the JSON-LD branch so 'address' is
+        // populated whenever any component is known.
+        $addrJoined = implode(', ', array_filter([
+            $bizInfo['addressLine1'] ?? '', $bizInfo['city'] ?? '',
+            $bizInfo['state'] ?? '', $bizInfo['zip'] ?? ''
+        ]));
+        if ($addrJoined !== '') $bizInfo['address'] = $addrJoined;
     }
 
     // Extract menu names from sub-page URLs
@@ -79,6 +177,7 @@ try {
         'OK' => true,
         'mode' => 'discover',
         'siteName' => $siteName,
+        'businessInfo' => $bizInfo,
         'mainUrl' => $discoverUrl,
         'menuPages' => $menuPages,
         'platformPages' => $platformPages,