Fetch contact/about page during discovery for business info
- Detect contact/about/location/hours links on the main page
- Fetch the contact page and extract phone, address, and hours
- Phone: regex for US phone formats, plus tel: links
- Address: US street address pattern (number + street type)
- Hours: day + time-range patterns from plain text
- Overrides bad JSON-LD data with actual contact page info

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
b48f20011d
commit
24849c01e4
1 changed file with 62 additions and 0 deletions
|
|
@ -188,6 +188,68 @@ try {
|
|||
}
|
||||
}
|
||||
|
||||
// 5. Fetch contact/about page for better business info
//
// Flow: pick the first same-site contact/about/location/hours link on the
// main page, download it, and merge any phone / address / hours found there
// into $bizInfo (values found on the contact page replace earlier ones).
// The parsing lives in two pure closures so it can be exercised without any
// network access.

/**
 * Pick the first same-origin link that looks like a contact/about page.
 *
 * Only root-relative ("/contact"), scheme-relative ("//host/contact") and
 * absolute hrefs are considered; document-relative hrefs ("contact.html")
 * are skipped, as before.
 *
 * @param string $pageHtml HTML of the main page.
 * @param string $origin   Origin of the main page (scheme://host[:port], or a bare host).
 * @return string Absolute URL of the chosen page, or '' when none qualifies.
 */
$findContactUrl = static function (string $pageHtml, string $origin): string {
    // With an empty origin every href would pass the prefix test below.
    if ($origin === '') {
        return '';
    }

    $anchorPattern = '#<a[^>]+href=["\']([^"\']+)["\'][^>]*>([^<]*)</a>#i';
    if (!preg_match_all($anchorPattern, $pageHtml, $anchors, PREG_SET_ORDER)) {
        return '';
    }

    $originLength = strlen($origin);
    $scheme = (string) parse_url($origin, PHP_URL_SCHEME);

    foreach ($anchors as $anchor) {
        $href = trim($anchor[1]);

        // Either the visible link text or the last path segment must hint at a contact page.
        $looksRelevant = preg_match('/\b(contact|about|location|find.?us|visit|hours)\b/i', $anchor[2])
            || preg_match('#/(contact|about|location|find-us|visit|hours)/?$#i', $href);
        if (!$looksRelevant) {
            continue;
        }

        if (str_starts_with($href, '//')) {
            // Scheme-relative URL: borrow the scheme, then let the origin check decide.
            if ($scheme === '') {
                continue;
            }
            $href = $scheme . ':' . $href;
        } elseif (str_starts_with($href, '/')) {
            $href = $origin . $href;
        }

        // Same-origin check. A bare prefix test would also accept
        // "https://example.com.evil.test/..." for origin "https://example.com",
        // so the character right after the origin must end the authority part.
        if (strncasecmp($href, $origin, $originLength) !== 0) {
            continue;
        }
        if (in_array(substr($href, $originLength, 1), ['', '/', '?', '#'], true)) {
            return $href;
        }
    }

    return '';
};

/**
 * Extract phone, address and opening hours from a contact page and merge them
 * into the given business-info array.
 *
 * @param string               $pageHtml Raw HTML of the contact page.
 * @param array<string, mixed> $info     Business info collected so far.
 * @return array<string, mixed> $info with 'phone', 'addressLine1', 'city', 'state',
 *                              'zip' and 'hours' set for whatever was found.
 */
$mergeContactPageInfo = static function (string $pageHtml, array $info): array {
    // Very short responses are treated as empty/error pages.
    if (strlen($pageHtml) <= 100) {
        return $info;
    }

    // Strip scripts/styles for cleaner parsing; keep the input if PCRE gives up (null).
    $markup = preg_replace('#<script[^>]*>.*?</script>#is', '', $pageHtml) ?? $pageHtml;
    $markup = preg_replace('#<style[^>]*>.*?</style>#is', '', $markup) ?? $markup;

    // strip_tags() removes tags without leaving a separator, which would glue
    // "…4567</p><p>123 Main St" into "…4567123 Main St". Put a space in front of
    // every tag first, then decode entities so "&ndash;" / "&nbsp;" become real characters.
    $text = html_entity_decode(strip_tags(str_replace('<', ' <', $markup)), ENT_QUOTES | ENT_HTML5, 'UTF-8');
    $text = str_replace("\u{00A0}", ' ', $text);

    // Phone: (xxx) xxx-xxxx, (xxx)xxx-xxxx, xxx-xxx-xxxx, xxx.xxx.xxxx or xxx xxx xxxx.
    // The look-arounds stop the pattern from biting into a longer run of digits.
    if (preg_match('/(?<!\d)(?:\(\d{3}\)[\s.-]?|\d{3}[\s.-])\d{3}[\s.-]\d{4}(?!\d)/', $text, $phoneMatch)) {
        $info['phone'] = $phoneMatch[0];
    }

    // Fallback: tel: links on the contact page, used only when no phone is known yet.
    if (empty($info['phone']) && preg_match('#href=["\']tel:([^"\']+)["\']#i', $pageHtml, $telMatch)) {
        // The hyphen must be last in the class: PCRE2 rejects "-" placed directly
        // before a class escape such as \s ("invalid range in character class").
        $info['phone'] = trim((string) preg_replace('/[^\d+()\s-]/', '', $telMatch[1]));
    }

    // Address: US street address (number + name + street type), optionally followed
    // by "City, ST 12345". The \b after the street type keeps "2 Great Places" from
    // matching as "2 Great Pl".
    $addressPattern = '/\b(\d+\s+[A-Z][a-zA-Z\s]+'
        . '(?:Street|St|Avenue|Ave|Boulevard|Blvd|Road|Rd|Drive|Dr|Lane|Ln|Way|Place|Pl|Court|Ct)\b\.?)'
        . '(?:\s*[,\n]?\s*([A-Z][a-zA-Z\s]+),\s*([A-Z]{2})\s*,?\s*(\d{5}))?/';
    if (preg_match($addressPattern, $text, $addr)) {
        if (!empty($addr[1])) {
            $info['addressLine1'] = trim($addr[1]);
        }
        if (!empty($addr[2])) {
            $info['city'] = trim($addr[2]);
        }
        if (!empty($addr[3])) {
            $info['state'] = $addr[3];
        }
        if (!empty($addr[4])) {
            $info['zip'] = $addr[4];
        }
    }

    // Hours: "<day or day range>[:] <time> <sep> <time>", e.g. "Mon-Fri 9am - 5pm"
    // or "Sat: 10:00 am – 2:00 pm". Capturing the whole day range matters: taking
    // only the day next to the time would record "Mon-Fri 9am-5pm" as "Fri 9am-5pm".
    $day = '(?:Monday|Tuesday|Tues|Wednesday|Thursday|Thurs|Thur|Friday|Saturday|Sunday'
        . '|Mon|Tue|Wed|Thu|Fri|Sat|Sun)';
    $time = '\d{1,2}(?::\d{2})?\s*(?:am|pm)';
    // Hyphen, en dash, em dash (as UTF-8 bytes) or a connecting word.
    $sep = '(?:-|\xE2\x80\x93|\xE2\x80\x94|to|through|thru)';
    $hoursPattern = '/\b(' . $day . '(?:\s*' . $sep . '\s*' . $day . ')?)\b\.?:?\s*'
        . '(' . $time . ')\s*' . $sep . '\s*(' . $time . ')/i';
    if (preg_match_all($hoursPattern, $text, $hourMatches, PREG_SET_ORDER)) {
        $hourParts = [];
        foreach ($hourMatches as $match) {
            $days = (string) preg_replace('/\s+/', ' ', $match[1]);
            $hourParts[] = $days . ' ' . $match[2] . '-' . $match[3];
        }
        $info['hours'] = implode(', ', $hourParts);
    }

    return $info;
};

// Origin of the discovered URL: optional scheme plus authority, cut at the first
// "/", "?" or "#" so "https://example.com?x=1" does not keep its query string.
$baseOrigin = (string) preg_replace('~^((?:https?://)?[^/?#]+).*~is', '$1', (string) ($discoverUrl ?? ''));
$contactUrl = $findContactUrl((string) ($html ?? ''), $baseOrigin);

if ($contactUrl !== '') {
    $contactHtml = false;
    $ch = curl_init($contactUrl);
    if ($ch !== false) {
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT => 10,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => 5,
            // The URL comes from scraped markup: allow HTTP(S) only, also across redirects.
            // NOTE(review): redirects may still leave the original host.
            CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
            CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
            // Make curl_exec() fail on HTTP >= 400 so error pages are never parsed.
            CURLOPT_FAILONERROR => true,
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        ]);
        $contactHtml = curl_exec($ch);
        curl_close($ch);
    }

    if (is_string($contactHtml)) {
        $bizInfo = $mergeContactPageInfo($contactHtml, $bizInfo ?? []);
    }
}
|
||||
|
||||
// Extract menu names from sub-page URLs
|
||||
$menuPages = [];
|
||||
foreach ($subPages as $spUrl) {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue