Fetch contact/about page during discovery for business info

- Detect contact/about/location/hours links on main page
- Fetch contact page and extract phone, address, hours
- Phone: regex for US phone formats + tel: links
- Address: US street address pattern (number + street type)
- Hours: day + time range patterns from plain text
- Overrides bad JSON-LD data with actual contact page info

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-14 18:26:19 -07:00 · 2026-03-14 18:26:19 -07:00 · 24849c01e4
commit 24849c01e4
parent b48f20011d
1 changed files with 62 additions and 0 deletions
--- a/api/setup/analyzeMenuUrl.php
+++ b/api/setup/analyzeMenuUrl.php
@@ -188,6 +188,68 @@ try {
            }
        }

+        // 5. Fetch a same-origin contact/about page; phone, address and hours found there
+        //    override what was collected earlier (e.g. JSON-LD on the main page can be wrong).
+        $baseOrigin = preg_replace('#^(https?://[^/]+).*#', '$1', $discoverUrl);
+        $contactUrl = '';
+        if (preg_match_all('#<a[^>]+href=["\']([^"\']+)["\'][^>]*>([^<]*)</a>#i', $html, $linkMatches, PREG_SET_ORDER)) {
+            foreach ($linkMatches as $lm) {
+                $href = trim(html_entity_decode($lm[1]));
+                // A link qualifies by its visible text or by the last segment of its path.
+                if (!preg_match('/\b(contact|about|location|find.?us|visit|hours)\b/i', $lm[2]) && !preg_match('#/(contact|about|location|find-us|visit|hours)/?$#i', $href)) continue;
+                // Root-relative only: "//host/path" is protocol-relative and must not be glued onto our origin.
+                if (str_starts_with($href, '/') && !str_starts_with($href, '//')) $href = $baseOrigin . $href;
+                // Demand a "/" after the origin so "https://site.com.evil.tld/" does not pass as same-origin.
+                if ($href === $baseOrigin || str_starts_with($href, $baseOrigin . '/')) {
+                    $contactUrl = $href;
+                    break;
+                }
+            }
+        }
+
+        $contactHtml = false;
+        if ($contactUrl && ($ch = curl_init($contactUrl)) !== false) {
+            curl_setopt_array($ch, [CURLOPT_RETURNTRANSFER => true, CURLOPT_TIMEOUT => 10,
+                CURLOPT_FOLLOWLOCATION => true, CURLOPT_MAXREDIRS => 5,
+                CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36']);
+            $contactHtml = curl_exec($ch);
+            // A 404/500 error page is still HTML; do not let its text overwrite good data.
+            $contactStatus = (int) curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
+            curl_close($ch);
+            if ($contactStatus < 200 || $contactStatus >= 300) $contactHtml = false;
+        }
+
+        if (is_string($contactHtml) && strlen($contactHtml) > 100) {
+            // Strip scripts/styles before flattening to text; keep the input if PCRE fails (null result).
+            $contactClean = preg_replace('#<script[^>]*>.*?</script>#is', '', $contactHtml) ?? $contactHtml;
+            $contactClean = preg_replace('#<style[^>]*>.*?</style>#is', '', $contactClean) ?? $contactClean;
+            $contactText = strip_tags($contactClean);
+
+            // Phone: (xxx) xxx-xxxx, xxx-xxx-xxxx or xxx.xxx.xxxx, not embedded in a longer digit run.
+            if (preg_match('/(?<!\d)\(?\d{3}\)?[\s.-]\d{3}[\s.-]\d{4}(?!\d)/', $contactText, $cpm)) {
+                $bizInfo['phone'] = $cpm[0];
+            } elseif (empty($bizInfo['phone']) && preg_match('#href=["\']tel:([^"\']+)["\']#i', $contactHtml, $cpm2)) {
+                // Fallback: tel: link. The "-" must be last in the class; PCRE2 rejects a hyphen placed
+                // before \s as an invalid range, which makes preg_replace() return null.
+                $bizInfo['phone'] = trim(preg_replace('/[^\d+()\s-]/', '', $cpm2[1]) ?? '');
+            }
+
+            // Address: US street pattern; \b after the suffix keeps "St"/"Dr"/"Ct" from matching inside words.
+            if (preg_match('/(\d+\s+[A-Z][a-zA-Z\s]+(?:Street|St|Avenue|Ave|Boulevard|Blvd|Road|Rd|Drive|Dr|Lane|Ln|Way|Place|Pl|Court|Ct)\b\.?)(?:\s*[,\n]?\s*([A-Z][a-zA-Z\s]+),\s*([A-Z]{2})\s*,?\s*(\d{5}))?/', $contactText, $cam)) {
+                if (!empty($cam[1])) $bizInfo['addressLine1'] = trim($cam[1]);
+                if (!empty($cam[2])) $bizInfo['city'] = trim($cam[2]);
+                if (!empty($cam[3])) $bizInfo['state'] = strtoupper($cam[3]);
+                if (!empty($cam[4])) $bizInfo['zip'] = $cam[4];
+            }
+
+            // Hours: "<day>[:] <time> - <time>"; hyphen, en dash or em dash accepted, output uses "-".
+            $dayPattern = '(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday|Mon|Tue|Wed|Thu|Fri|Sat|Sun)';
+            $timePattern = '\d{1,2}(?::\d{2})?\s*(?:am|pm)';
+            if (preg_match_all("/($dayPattern):?\s*($timePattern)\s*(?:-|–|—)\s*($timePattern)/i", $contactText, $hm, PREG_SET_ORDER)) {
+                $bizInfo['hours'] = implode(', ', array_map(fn ($h) => $h[1] . ' ' . $h[2] . '-' . $h[3], $hm));
+            }
+        }
+
        // Extract menu names from sub-page URLs
        $menuPages = [];
        foreach ($subPages as $spUrl) {