Fetch contact/about page during discovery for business info

- Detect contact/about/location/hours links on main page
- Fetch contact page and extract phone, address, hours
- Phone: regex for US phone formats + tel: links
- Address: US street address pattern (number + street type)
- Hours: day + time range patterns from plain text
- Overrides bad JSON-LD data with actual contact page info

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
John Mizerek 2026-03-14 18:26:19 -07:00
parent b48f20011d
commit 24849c01e4

View file

@ -188,6 +188,68 @@ try {
}
}
// 5. Fetch contact/about page for better business info.
// Scans the main page's anchors for the first same-origin link that looks like a
// contact/about/location/hours page, downloads it, and fills $bizInfo (phone,
// addressLine1/city/state/zip, hours) from plain-text heuristics.
$baseOrigin = preg_replace('#^(https?://[^/]+).*#', '$1', $discoverUrl);
$baseScheme = (string) parse_url((string) $baseOrigin, PHP_URL_SCHEME);
$contactUrl = '';
if (preg_match_all('#<a[^>]+href=["\']([^"\']+)["\'][^>]*>([^<]*)</a>#i', $html, $linkMatches, PREG_SET_ORDER)) {
    foreach ($linkMatches as $lm) {
        $href = $lm[1];
        $text = strtolower(trim($lm[2]));
        $looksLikeContact = preg_match('/\b(contact|about|location|find.?us|visit|hours)\b/i', $text)
            || preg_match('#/(contact|about|location|find-us|visit|hours)/?$#i', $href);
        if (!$looksLikeContact) {
            continue;
        }
        if (str_starts_with($href, '//')) {
            // Protocol-relative link: it names its own host, so only borrow the scheme.
            // Appending it to the origin would produce a bogus "origin//host/path" URL.
            if ($baseScheme !== '') {
                $href = $baseScheme . ':' . $href;
            }
        } elseif (str_starts_with($href, '/')) {
            // Root-relative path: resolve against the site origin.
            $href = $baseOrigin . $href;
        }
        // Same-origin only. A bare prefix test would also accept look-alike hosts
        // such as "https://example.com.evil.test", so the character right after
        // the origin must end the authority part (or the URL must be the origin itself).
        $afterOrigin = str_starts_with($href, $baseOrigin) ? substr($href, strlen($baseOrigin), 1) : null;
        if ($afterOrigin !== null && ($afterOrigin === '' || in_array($afterOrigin, ['/', '?', '#'], true))) {
            $contactUrl = $href;
            break;
        }
    }
}
if ($contactUrl !== '') {
    $contactHtml = false;
    $ch = curl_init($contactUrl);
    if ($ch !== false) {
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT => 10,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => 5, // bound redirect chains coming from an untrusted site
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        ]);
        $contactHtml = curl_exec($ch);
        curl_close($ch);
    }
    if (is_string($contactHtml) && strlen($contactHtml) > 100) {
        // Strip scripts/styles for cleaner parsing. preg_replace() returns null on a
        // PCRE error (e.g. backtrack limit on a huge page); fall back to the input then.
        $contactClean = preg_replace('#<script[^>]*>.*?</script>#is', '', $contactHtml) ?? $contactHtml;
        $contactClean = preg_replace('#<style[^>]*>.*?</style>#is', '', $contactClean) ?? $contactClean;
        $contactText = strip_tags($contactClean);

        // Phone: (xxx) xxx-xxxx or xxx-xxx-xxxx or xxx.xxx.xxxx, optionally prefixed
        // with "P:" / "Phone:". A match here replaces any phone already in $bizInfo.
        if (preg_match('/(?:P(?:hone)?:?\s*)?\(?\d{3}\)?[\s.-]\d{3}[\s.-]\d{4}/', $contactText, $cpm)) {
            // Clean prefix like "P:" or "Phone:"
            $bizInfo['phone'] = preg_replace('/^P(?:hone)?:?\s*/i', '', trim($cpm[0]));
        }
        // Also check tel: links on the contact page, but only when no phone is known yet.
        if (empty($bizInfo['phone']) && preg_match('#href=["\']tel:([^"\']+)["\']#i', $contactHtml, $cpm2)) {
            // The hyphen must sit last in the class: ")-\s" is an invalid range under
            // PCRE2 and makes preg_replace() fail, leaving the phone empty.
            $bizInfo['phone'] = trim((string) preg_replace('/[^\d+()\s-]/', '', $cpm2[1]));
        }

        // Address: US street address pattern (number + name + street type), optionally
        // followed by "City, ST 12345". Only the parts that matched are written.
        if (preg_match('/(\d+\s+[A-Z][a-zA-Z\s]+(?:Street|St|Avenue|Ave|Boulevard|Blvd|Road|Rd|Drive|Dr|Lane|Ln|Way|Place|Pl|Court|Ct)\.?)(?:\s*[,\n]?\s*([A-Z][a-zA-Z\s]+),\s*([A-Z]{2})\s*,?\s*(\d{5}))?/m', $contactText, $cam)) {
            if (!empty($cam[1])) {
                $bizInfo['addressLine1'] = trim($cam[1]);
            }
            if (!empty($cam[2])) {
                $bizInfo['city'] = trim($cam[2]);
            }
            if (!empty($cam[3])) {
                $bizInfo['state'] = strtoupper($cam[3]);
            }
            if (!empty($cam[4])) {
                $bizInfo['zip'] = $cam[4];
            }
        }

        // Hours: "<day> <time> - <time>" occurrences, joined as "Day start-end, ...".
        $dayPattern = '(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday|Mon|Tue|Wed|Thu|Fri|Sat|Sun)';
        $timePattern = '\d{1,2}(?::\d{2})?\s*(?:am|pm)';
        if (preg_match_all("/($dayPattern)\s*($timePattern)\s*-\s*($timePattern)/i", $contactText, $hm, PREG_SET_ORDER)) {
            $hourParts = [];
            foreach ($hm as $h) {
                $hourParts[] = $h[1] . ' ' . $h[2] . '-' . $h[3];
            }
            if (!empty($hourParts)) {
                $bizInfo['hours'] = implode(', ', $hourParts);
            }
        }
    }
}
// Extract menu names from sub-page URLs
$menuPages = [];
foreach ($subPages as $spUrl) {