Extract business info during discovery phase
- Parse JSON-LD structured data (Restaurant, FoodEstablishment, etc.)
- Extract phone from tel: links, address from og: meta tags
- Return businessInfo in discovery response so sub-pages don't need it

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
4ac13de09d
commit
571930ed25
1 changed files with 80 additions and 3 deletions
|
|
@ -53,13 +53,89 @@ try {
|
|||
$subPages = $pwResult['subPagesVisited'] ?? [];
|
||||
$platformPages = $pwResult['platformPagesVisited'] ?? [];
|
||||
|
||||
// Extract business name from title
|
||||
// Extract business info from main page
|
||||
$html = $pwResult['html'] ?? '';
|
||||
$siteName = '';
|
||||
if (preg_match('#<title[^>]*>([^<]+)</title>#i', $html, $tm)) {
|
||||
$bizInfo = [];
|
||||
|
||||
// 1. Try JSON-LD structured data (most reliable).
// The payload is captured lazily up to the closing </script> tag (with the "s"
// flag so it may span lines); a [^<]+ capture would skip any JSON-LD block
// whose text contains a literal "<".
if (preg_match_all('#<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>#is', $html, $ldMatches)) {
    // schema.org types treated as "this node describes the business".
    $bizTypes = ['Restaurant', 'FoodEstablishment', 'LocalBusiness', 'CafeOrCoffeeShop', 'BarOrPub'];
    // Full day names (optionally given as schema.org URLs) to short labels.
    $dayMap = ['Monday'=>'Mon','Tuesday'=>'Tue','Wednesday'=>'Wed','Thursday'=>'Thu','Friday'=>'Fri','Saturday'=>'Sat','Sunday'=>'Sun'];

    foreach ($ldMatches[1] as $ldJson) {
        $ld = json_decode(trim($ldJson), true);
        if (!is_array($ld)) continue;

        // A document may be a single node, a top-level list of nodes, or an
        // @graph wrapper; normalise all three to a list of candidate nodes.
        $graph = $ld['@graph'] ?? null;
        if (is_array($graph)) {
            $entries = isset($graph[0]) ? $graph : [$graph];
        } elseif (isset($ld[0])) {
            $entries = $ld;
        } else {
            $entries = [$ld];
        }

        foreach ($entries as $entry) {
            if (!is_array($entry)) continue;

            // @type may be a single string or a list of strings; compare as
            // strings only so a non-string value can never match loosely.
            $types = array_filter((array) ($entry['@type'] ?? []), 'is_string');
            if (!array_intersect($types, $bizTypes)) continue;

            // Later matching nodes overwrite earlier ones (last one wins).
            if (!empty($entry['name'])) $bizInfo['name'] = $entry['name'];
            if (!empty($entry['telephone'])) $bizInfo['phone'] = $entry['telephone'];

            if (!empty($entry['address'])) {
                $a = $entry['address'];
                if (is_string($a)) {
                    $bizInfo['address'] = $a;
                } elseif (is_array($a)) {
                    if (!empty($a['streetAddress'])) $bizInfo['addressLine1'] = $a['streetAddress'];
                    if (!empty($a['addressLocality'])) $bizInfo['city'] = $a['addressLocality'];
                    if (!empty($a['addressRegion'])) $bizInfo['state'] = $a['addressRegion'];
                    if (!empty($a['postalCode'])) $bizInfo['zip'] = $a['postalCode'];
                    // One-line form built from whichever parts are present.
                    $bizInfo['address'] = trim(implode(', ', array_filter([
                        $a['streetAddress'] ?? '', $a['addressLocality'] ?? '',
                        $a['addressRegion'] ?? '', $a['postalCode'] ?? ''
                    ])));
                }
            }

            if (!empty($entry['openingHours'])) {
                $bizInfo['hours'] = is_array($entry['openingHours'])
                    ? implode(', ', $entry['openingHours'])
                    : $entry['openingHours'];
            }

            // openingHoursSpecification, when usable, replaces the plain
            // openingHours string set just above.
            $specs = $entry['openingHoursSpecification'] ?? null;
            if (!empty($specs) && is_array($specs)) {
                // A single specification object is wrapped into a list.
                if (!isset($specs[0])) $specs = [$specs];
                $hParts = [];
                foreach ($specs as $spec) {
                    if (!is_array($spec)) continue;
                    // dayOfWeek may be one string or a list; ignore non-strings.
                    $days = array_filter((array) ($spec['dayOfWeek'] ?? []), 'is_string');
                    $open = $spec['opens'] ?? '';
                    $close = $spec['closes'] ?? '';
                    // basename() reduces "https://schema.org/Monday" to "Monday".
                    $dayAbbrs = array_map(fn($d) => $dayMap[basename($d)] ?? $d, $days);
                    if ($open && $close) $hParts[] = implode('/', $dayAbbrs) . " $open-$close";
                }
                if (!empty($hParts)) $bizInfo['hours'] = implode(', ', $hParts);
            }
        }
    }
}
|
||||
|
||||
// 2. Business name from title (fallback)
// Prefer the name found in structured data; otherwise derive it from <title>.
$siteName = $bizInfo['name'] ?? '';
if (empty($siteName) && preg_match('#<title[^>]*>([^<]+)</title>#i', $html, $tm)) {
    // The title is raw markup text, so decode entities such as &amp; or
    // &#039; before using it as a display name.
    $siteName = trim(html_entity_decode($tm[1], ENT_QUOTES | ENT_HTML5, 'UTF-8'));
    // Drop a trailing "- Menu ...", "| Order Online ..." style suffix.
    $siteName = preg_replace('#\s*[-|]+\s*(Menu|Order|Online|Home|Welcome|Restaurant).*$#i', '', $siteName);
    $siteName = trim($siteName);
    $bizInfo['name'] = $siteName;
}
|
||||
|
||||
// 3. Phone from tel: links (fallback)
// Only consulted when structured data did not already supply a phone number.
if (empty($bizInfo['phone']) && preg_match('#href=["\']tel:([^"\']+)["\']#i', $html, $pm)) {
    // tel: URIs may be percent-encoded (e.g. %2B for "+"), so decode before
    // filtering. The hyphen is placed last in the character class so it is an
    // unambiguous literal; written as "()-\s" it sits in range position before
    // a class escape.
    $bizInfo['phone'] = trim(preg_replace('/[^\d+()\s-]/', '', rawurldecode($pm[1])));
}
|
||||
|
||||
// 4. Address from common patterns (fallback)
// Only consulted when structured data did not already supply an address.
if (empty($bizInfo['address'])) {
    // businessInfo key => alternation of meta property/name values to look for.
    $metaFields = [
        'addressLine1' => 'og:street-address|business:contact_data:street_address',
        'city'         => 'og:locality|business:contact_data:locality',
        'state'        => 'og:region|business:contact_data:region',
        'zip'          => 'og:postal-code|business:contact_data:postal_code',
    ];

    foreach ($metaFields as $field => $names) {
        // The content value is captured up to the SAME quote character that
        // opened it, so an apostrophe inside a double-quoted value (e.g.
        // "St. John's Road") no longer truncates the match.
        $pattern = '#<meta[^>]+(?:property|name)=["\'](?:' . $names . ')["\'][^>]+content=(["\'])(.*?)\1#is';
        if (preg_match($pattern, $html, $mm)) {
            $value = trim(html_entity_decode($mm[2], ENT_QUOTES | ENT_HTML5, 'UTF-8'));
            if ($value !== '') $bizInfo[$field] = $value;
        }
    }

    // Build the one-line address from the parts, in the same
    // "street, city, state, zip" form the JSON-LD branch produces.
    $addrParts = array_filter([
        $bizInfo['addressLine1'] ?? '', $bizInfo['city'] ?? '',
        $bizInfo['state'] ?? '', $bizInfo['zip'] ?? ''
    ]);
    if (!empty($addrParts)) $bizInfo['address'] = implode(', ', $addrParts);
}
|
||||
|
||||
// Extract menu names from sub-page URLs
|
||||
|
|
@ -79,6 +155,7 @@ try {
|
|||
'OK' => true,
|
||||
'mode' => 'discover',
|
||||
'siteName' => $siteName,
|
||||
'businessInfo' => $bizInfo,
|
||||
'mainUrl' => $discoverUrl,
|
||||
'menuPages' => $menuPages,
|
||||
'platformPages' => $platformPages,
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue