Fix discovery business info extraction
- Prefer the title tag for the name over JSON-LD (sites often put the address in the LD name)
- Parse the full address string into components (addressLine1, city, state, zip)
- Handle newlines in addresses (Squarespace puts newlines in JSON-LD)
- Convert 24h hours to 12h format
- Strip the country suffix from addresses

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
a14213151f
commit
1df69463a8
1 changed files with 43 additions and 6 deletions
|
|
@ -107,13 +107,50 @@ try {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2. Business name from title (fallback)
|
// 2. Business name from title — prefer over JSON-LD since many sites put address in LD name
|
||||||
|
if (preg_match('#<title[^>]*>([^<]+)</title>#i', $html, $tm)) {
|
||||||
|
$titleName = trim($tm[1]);
|
||||||
|
$titleName = preg_replace('#\s*[-|]+\s*(Menu|Order|Online|Home|Welcome|Restaurant).*$#i', '', $titleName);
|
||||||
|
$titleName = trim($titleName);
|
||||||
|
if (strlen($titleName)) {
|
||||||
|
$ldName = $bizInfo['name'] ?? '';
|
||||||
|
// Use title if JSON-LD name looks like an address (starts with number or contains comma)
|
||||||
|
if (empty($ldName) || preg_match('/^\d/', $ldName) || strpos($ldName, ',') !== false) {
|
||||||
|
$bizInfo['name'] = $titleName;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
$siteName = $bizInfo['name'] ?? '';
|
$siteName = $bizInfo['name'] ?? '';
|
||||||
if (empty($siteName) && preg_match('#<title[^>]*>([^<]+)</title>#i', $html, $tm)) {
|
|
||||||
$siteName = trim($tm[1]);
|
// Parse address into components if only full string
|
||||||
$siteName = preg_replace('#\s*[-|]+\s*(Menu|Order|Online|Home|Welcome|Restaurant).*$#i', '', $siteName);
|
if (!empty($bizInfo['address']) && empty($bizInfo['addressLine1'])) {
|
||||||
$siteName = trim($siteName);
|
$addr = trim(preg_replace('/,?\s*(United States|USA|US|U\.S\.A?\.)\s*$/i', '', $bizInfo['address']));
|
||||||
$bizInfo['name'] = $siteName;
|
$addr = preg_replace('/\n+/', ', ', $addr); // newlines to commas
|
||||||
|
if (preg_match('/\b(\d{5})(?:-\d{4})?\s*$/', $addr, $zm)) {
|
||||||
|
$bizInfo['zip'] = $zm[1];
|
||||||
|
$addr = trim(substr($addr, 0, strrpos($addr, $zm[0])));
|
||||||
|
}
|
||||||
|
if (preg_match('/\b([A-Z]{2})\s*$/i', $addr, $sm)) {
|
||||||
|
$bizInfo['state'] = strtoupper($sm[1]);
|
||||||
|
$addr = trim(substr($addr, 0, strrpos($addr, $sm[0])));
|
||||||
|
}
|
||||||
|
$addr = rtrim($addr, ', ');
|
||||||
|
if (strpos($addr, ',') !== false) {
|
||||||
|
$parts = array_map('trim', explode(',', $addr));
|
||||||
|
$bizInfo['addressLine1'] = $parts[0];
|
||||||
|
$bizInfo['city'] = $parts[count($parts) - 1]; // last part before state is city
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert 24h hours to 12h format
|
||||||
|
if (!empty($bizInfo['hours'])) {
|
||||||
|
$bizInfo['hours'] = preg_replace_callback('/(\d{1,2}):(\d{2})/', function($m) {
|
||||||
|
$h = (int)$m[1]; $min = $m[2];
|
||||||
|
$ampm = $h >= 12 ? 'pm' : 'am';
|
||||||
|
if ($h > 12) $h -= 12;
|
||||||
|
if ($h === 0) $h = 12;
|
||||||
|
return $h . ($min !== '00' ? ":$min" : '') . $ampm;
|
||||||
|
}, $bizInfo['hours']);
|
||||||
}
|
}
|
||||||
|
|
||||||
// 3. Phone from tel: links (fallback)
|
// 3. Phone from tel: links (fallback)
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue