Extract business info during discovery phase

- Parse JSON-LD structured data (Restaurant, FoodEstablishment, etc.)
- Extract phone from tel: links, address from og: / business:contact_data: meta tags
- Return businessInfo in the discovery response so it does not have to be extracted again for sub-pages

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
John Mizerek 2026-03-14 17:34:55 -07:00
parent 4ac13de09d
commit 571930ed25

View file

@ -53,13 +53,89 @@ try {
$subPages = $pwResult['subPagesVisited'] ?? []; $subPages = $pwResult['subPagesVisited'] ?? [];
$platformPages = $pwResult['platformPagesVisited'] ?? []; $platformPages = $pwResult['platformPagesVisited'] ?? [];
// Extract business name from title // Extract business info from main page
$html = $pwResult['html'] ?? ''; $html = $pwResult['html'] ?? '';
$siteName = ''; $bizInfo = [];
if (preg_match('#<title[^>]*>([^<]+)</title>#i', $html, $tm)) {
// 1. Try JSON-LD structured data (most reliable)
// Reads every <script type="application/ld+json"> body in $html and copies name, phone,
// address and opening hours of the business-type entries into $bizInfo. Later matching
// entries overwrite earlier ones. The capture is lazy and dot-all so that a "<" inside a
// JSON string value does not make the whole script block unmatchable.
if (preg_match_all('#<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>#is', $html, $ldMatches)) {
    $ldBusinessTypes = ['Restaurant', 'FoodEstablishment', 'LocalBusiness', 'CafeOrCoffeeShop', 'BarOrPub'];
    $dayMap = ['Monday'=>'Mon','Tuesday'=>'Tue','Wednesday'=>'Wed','Thursday'=>'Thu','Friday'=>'Fri','Saturday'=>'Sat','Sunday'=>'Sun'];
    foreach ($ldMatches[1] as $ldJson) {
        $ld = json_decode($ldJson, true);
        if (!is_array($ld)) continue;
        // Normalise to a list of candidate entries: an @graph wrapper, a top-level
        // JSON array of objects, or a single object.
        $entries = $ld['@graph'] ?? $ld;
        if (!is_array($entries)) continue;
        if (!isset($entries[0])) $entries = [$entries];
        foreach ($entries as $entry) {
            if (!is_array($entry)) continue;
            // @type may be one string or a list of strings; keep only string values.
            $types = array_filter((array) ($entry['@type'] ?? []), 'is_string');
            if (!array_intersect($types, $ldBusinessTypes)) continue;

            if (!empty($entry['name'])) $bizInfo['name'] = $entry['name'];
            if (!empty($entry['telephone'])) $bizInfo['phone'] = $entry['telephone'];
            if (!empty($entry['address'])) {
                $a = $entry['address'];
                if (is_string($a)) {
                    $bizInfo['address'] = $a;
                } elseif (is_array($a)) {
                    if (!empty($a['streetAddress'])) $bizInfo['addressLine1'] = $a['streetAddress'];
                    if (!empty($a['addressLocality'])) $bizInfo['city'] = $a['addressLocality'];
                    if (!empty($a['addressRegion'])) $bizInfo['state'] = $a['addressRegion'];
                    if (!empty($a['postalCode'])) $bizInfo['zip'] = $a['postalCode'];
                    // Single-line form built from whichever parts are present.
                    $bizInfo['address'] = trim(implode(', ', array_filter([
                        $a['streetAddress'] ?? '', $a['addressLocality'] ?? '',
                        $a['addressRegion'] ?? '', $a['postalCode'] ?? ''
                    ])));
                }
            }
            if (!empty($entry['openingHours'])) {
                $bizInfo['hours'] = is_array($entry['openingHours'])
                    ? implode(', ', $entry['openingHours'])
                    : $entry['openingHours'];
            }
            // openingHoursSpecification, when usable, takes precedence over openingHours.
            if (!empty($entry['openingHoursSpecification']) && is_array($entry['openingHoursSpecification'])) {
                $specs = $entry['openingHoursSpecification'];
                // A single specification object (not wrapped in a list) is treated as a one-item list.
                if (isset($specs['opens']) || isset($specs['dayOfWeek'])) $specs = [$specs];
                $hParts = [];
                foreach ($specs as $spec) {
                    if (!is_array($spec)) continue;
                    $days = $spec['dayOfWeek'] ?? [];
                    if (is_string($days)) $days = [$days];
                    // Ignore anything that is not a day string so basename() only sees strings.
                    $days = is_array($days) ? array_filter($days, 'is_string') : [];
                    $open = $spec['opens'] ?? '';
                    $close = $spec['closes'] ?? '';
                    // basename() reduces URL-style values such as ".../Monday" to "Monday".
                    $dayAbbrs = array_map(fn($d) => $dayMap[basename($d)] ?? $d, $days);
                    if ($open && $close) $hParts[] = implode('/', $dayAbbrs) . " $open-$close";
                }
                if (!empty($hParts)) $bizInfo['hours'] = implode(', ', $hParts);
            }
        }
    }
}
// 2. Business name from title (fallback)
// Prefer the name already stored in $bizInfo; the <title> tag is parsed only when that is empty.
$siteName = $bizInfo['name'] ?? '';
if (empty($siteName) && preg_match('#<title[^>]*>([^<]+)</title>#i', $html, $tm)) {
$siteName = trim($tm[1]); $siteName = trim($tm[1]);
// Cut the title at a run of "-" or "|" that is followed by one of the generic page words
// (Menu, Order, Online, Home, Welcome, Restaurant; case-insensitive), dropping the rest.
$siteName = preg_replace('#\s*[-|]+\s*(Menu|Order|Online|Home|Welcome|Restaurant).*$#i', '', $siteName); $siteName = preg_replace('#\s*[-|]+\s*(Menu|Order|Online|Home|Welcome|Restaurant).*$#i', '', $siteName);
$siteName = trim($siteName); $siteName = trim($siteName);
// Store the title-derived name in $bizInfo as well, so both variables agree.
$bizInfo['name'] = $siteName;
}
// 3. Phone from tel: links (fallback)
// Used only when no phone has been found yet. Takes the first tel: href in $html and keeps
// digits, "+", parentheses, hyphens and whitespace. The hyphen is placed last in the
// character class so it is a literal; written before \s it forms an invalid range that
// PCRE2 refuses to compile, which made preg_replace() return null.
if (empty($bizInfo['phone']) && preg_match('#href=["\']tel:([^"\']+)["\']#i', $html, $pm)) {
    $bizInfo['phone'] = trim(preg_replace('/[^\d+()\s-]/', '', $pm[1]));
}
// 4. Address from common patterns (fallback)
// Runs only when $bizInfo['address'] is empty. Each lookup takes the first matching
// <meta> tag and stores its trimmed content value under the corresponding key.
// NOTE(review): every pattern requires the property/name attribute to appear before
// content inside the tag; tags written with content first are not matched.
// NOTE(review): this branch fills addressLine1/city/state/zip but never sets
// $bizInfo['address'] itself.
if (empty($bizInfo['address'])) {
// Look for address in meta tags
// Street address: og:street-address or business:contact_data:street_address
if (preg_match('#<meta[^>]+(?:property|name)=["\'](?:og:street-address|business:contact_data:street_address)["\'][^>]+content=["\']([^"\']+)["\']#i', $html, $am)) {
$bizInfo['addressLine1'] = trim($am[1]);
}
// City: og:locality or business:contact_data:locality
if (preg_match('#<meta[^>]+(?:property|name)=["\'](?:og:locality|business:contact_data:locality)["\'][^>]+content=["\']([^"\']+)["\']#i', $html, $cm)) {
$bizInfo['city'] = trim($cm[1]);
}
// State/region: og:region or business:contact_data:region
if (preg_match('#<meta[^>]+(?:property|name)=["\'](?:og:region|business:contact_data:region)["\'][^>]+content=["\']([^"\']+)["\']#i', $html, $sm)) {
$bizInfo['state'] = trim($sm[1]);
}
// Postal code: og:postal-code or business:contact_data:postal_code
if (preg_match('#<meta[^>]+(?:property|name)=["\'](?:og:postal-code|business:contact_data:postal_code)["\'][^>]+content=["\']([^"\']+)["\']#i', $html, $zm)) {
$bizInfo['zip'] = trim($zm[1]);
}
} }
// Extract menu names from sub-page URLs // Extract menu names from sub-page URLs
@ -79,6 +155,7 @@ try {
'OK' => true, 'OK' => true,
'mode' => 'discover', 'mode' => 'discover',
'siteName' => $siteName, 'siteName' => $siteName,
'businessInfo' => $bizInfo,
'mainUrl' => $discoverUrl, 'mainUrl' => $discoverUrl,
'menuPages' => $menuPages, 'menuPages' => $menuPages,
'platformPages' => $platformPages, 'platformPages' => $platformPages,