Fix discovery business info extraction

- Prefer title tag for name over JSON-LD (sites often put address in LD name)
- Parse full address string into components (addressLine1, city, state, zip)
- Handle newlines in addresses (Squarespace puts newlines in JSON-LD)
- Convert 24h hours to 12h format
- Strip country suffix from addresses

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
John Mizerek 2026-03-14 18:09:54 -07:00
parent a14213151f
commit 1df69463a8

View file

@ -107,13 +107,50 @@ try {
}
}
// 2. Business name from title (fallback)
// 2. Business name from title — prefer over JSON-LD since many sites put address in LD name
// Turns the raw inner text of a <title> element into a display name:
//   1. decodes HTML entities (the text comes straight out of markup, so "&amp;",
//      "&#039;" or an entity-encoded dash such as "&#8211;" would otherwise be kept literally),
//   2. cuts a trailing " - Menu", " | Home", ... segment together with everything after it.
// Returns the trimmed result, which may be an empty string.
$cleanTitle = static function (string $rawTitle): string {
    $text = trim(html_entity_decode($rawTitle, ENT_QUOTES | ENT_HTML5, 'UTF-8'));
    $suffixWords = '(Menu|Order|Online|Home|Welcome|Restaurant)';
    // Separator may be a hyphen, pipe, en dash (U+2013) or em dash (U+2014).
    $stripped = preg_replace('#\s*[-|\x{2013}\x{2014}]+\s*' . $suffixWords . '.*$#iu', '', $text);
    if ($stripped === null) {
        // The /u pattern fails on invalid UTF-8; retry bytewise with the ASCII separators only.
        $stripped = preg_replace('#\s*[-|]+\s*' . $suffixWords . '.*$#i', '', $text);
    }
    return trim($stripped ?? $text);
};
if (preg_match('#<title[^>]*>([^<]+)</title>#i', $html, $tm)) {
    $titleName = $cleanTitle($tm[1]);
    if ($titleName !== '') {
        $ldName = $bizInfo['name'] ?? '';
        // Use title if the existing name is missing or looks like an address
        // (starts with a digit or contains a comma); otherwise keep the existing name.
        if (empty($ldName) || preg_match('/^\d/', $ldName) || strpos($ldName, ',') !== false) {
            $bizInfo['name'] = $titleName;
        }
    }
}
// Start from whatever name is already stored in $bizInfo (empty string when none is set).
$siteName = $bizInfo['name'] ?? '';
// Only when that name is empty, fall back to the text of the page's <title> element.
// NOTE(review): the closing brace of this if is not visible in this excerpt.
if (empty($siteName) && preg_match('#<title[^>]*>([^<]+)</title>#i', $html, $tm)) {
$siteName = trim($tm[1]);
// Cut a trailing " - Menu", " | Home", etc. (and everything after it) off the title.
$siteName = preg_replace('#\s*[-|]+\s*(Menu|Order|Online|Home|Welcome|Restaurant).*$#i', '', $siteName);
$siteName = trim($siteName);
$bizInfo['name'] = $siteName;
// Parse address into components if only full string
//
// $splitUsAddress takes one free-form address string and returns an array holding
// whichever of 'zip', 'state', 'addressLine1', 'city' could be recognised
// (keys that could not be recognised are simply absent).
$splitUsAddress = static function (string $full): array {
    $found = [];

    // Drop a trailing country name. The country must be the whole string or be preceded by
    // a comma or whitespace, so a city such as "Columbus" does not lose its final "us".
    $addr = preg_replace('/(?:^|,\s*|\s+)(?:United States|USA|US|U\.S\.A?\.)\s*$/i', '', $full) ?? $full;
    $addr = trim($addr);

    // Line breaks (LF, CRLF or CR) separate address lines; turn them into commas.
    $addr = preg_replace('/\s*[\r\n]+\s*/', ', ', $addr) ?? $addr;

    // Trailing 5-digit ZIP, optionally followed by a "-1234" extension that is discarded.
    if (preg_match('/\b(\d{5})(?:-\d{4})?\s*$/', $addr, $zm, PREG_OFFSET_CAPTURE)) {
        $found['zip'] = $zm[1][0];
        $addr = rtrim(substr($addr, 0, $zm[0][1]), ", \t\r\n");
    }

    // Trailing two-letter token counts as a state only if it is a real USPS code;
    // otherwise street suffixes like "St", "Rd" or "Dr" would be reported as states.
    // NOTE(review): "Ct" and "La" are still ambiguous (Connecticut / Louisiana).
    $stateCodes = [
        'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA',
        'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
        'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT',
        'VA', 'WA', 'WV', 'WI', 'WY', 'DC', 'PR', 'GU', 'VI', 'AS', 'MP', 'AA', 'AE', 'AP',
    ];
    if (preg_match('/(?:^|[\s,])([A-Za-z]{2})$/', $addr, $sm, PREG_OFFSET_CAPTURE)) {
        $code = strtoupper($sm[1][0]);
        if (in_array($code, $stateCodes, true)) {
            $found['state'] = $code;
            $addr = substr($addr, 0, $sm[1][1]);
        }
    }

    $addr = rtrim($addr, ", \t\r\n");
    if (strpos($addr, ',') !== false) {
        $parts = array_map('trim', explode(',', $addr));
        $found['addressLine1'] = $parts[0];
        // Last part before state is city. Segments between the first and the last
        // (e.g. a suite number) are not stored anywhere.
        $found['city'] = $parts[count($parts) - 1];
    }

    return $found;
};
// Only split when a full address string exists and no street line has been filled in yet.
if (!empty($bizInfo['address']) && is_string($bizInfo['address']) && empty($bizInfo['addressLine1'])) {
    foreach ($splitUsAddress($bizInfo['address']) as $component => $value) {
        $bizInfo[$component] = $value;
    }
}
// Convert 24h hours to 12h format
//
// $convertHours rewrites every 24-hour clock time ("09:00", "17:30", "21:00:00") inside
// $hours as a compact 12-hour time ("9am", "5:30pm", "9pm") and returns the result.
// $hours may be a string or an array of strings (preg_replace_callback accepts both).
//  - Times that already carry an am/pm marker are left untouched, so "9:00pm" does not
//    become "9ampm".
//  - A trailing ":SS" seconds field is consumed and dropped.
//  - "24:00" is treated as midnight; other out-of-range values are left as written.
$convertHours = static function ($hours) {
    $timePattern = '/(?<![\d:])(\d{1,2}):(\d{2})(?::\d{2})?(?![\d:])(?!\s*[ap]\.?m\b)/i';
    $converted = preg_replace_callback($timePattern, static function (array $m): string {
        $hour = (int) $m[1];
        $minutes = $m[2];
        if ($hour > 24 || ($hour === 24 && $minutes !== '00') || (int) $minutes > 59) {
            return $m[0]; // not a valid clock time; keep the original text
        }
        $hour %= 24; // 24:00 -> 00:00
        $suffix = $hour >= 12 ? 'pm' : 'am';
        $hour = $hour % 12 === 0 ? 12 : $hour % 12;
        // Whole hours are written without ":00".
        return $hour . ($minutes !== '00' ? ":$minutes" : '') . $suffix;
    }, $hours);
    // preg_replace_callback yields null on a PCRE error; keep the input in that case.
    return $converted ?? $hours;
};
if (!empty($bizInfo['hours'])) {
    $bizInfo['hours'] = $convertHours($bizInfo['hours']);
}
// 3. Phone from tel: links (fallback)