diff --git a/api/setup/analyzeMenuUrl.php b/api/setup/analyzeMenuUrl.php
index d6b0ca7..15ffb5e 100644
--- a/api/setup/analyzeMenuUrl.php
+++ b/api/setup/analyzeMenuUrl.php
@@ -188,6 +188,100 @@ try {
         }
     }
 
+    // 5. Fetch a contact/about page for better business info. Anything found there
+    //    overwrites the phone/address/hours values already present in $bizInfo.
+    $baseOrigin = preg_match('~^https?://[^/?#]+~i', $discoverUrl, $originMatch) ? $originMatch[0] : '';
+    $contactUrl = '';
+    // Group 1 is the href, group 2 the link text up to the first nested tag.
+    if ($baseOrigin !== '' && preg_match_all('#<a\s[^>]*href=["\']([^"\']+)["\'][^>]*>([^<]*)#i', $html, $linkMatches, PREG_SET_ORDER)) {
+        $scheme = strstr($baseOrigin, ':', true);
+        foreach ($linkMatches as $lm) {
+            $href = $lm[1];
+            $text = strtolower(trim($lm[2]));
+            $looksLikeContact = preg_match('/\b(contact|about|location|find.?us|visit|hours)\b/i', $text)
+                || preg_match('#/(contact|about|location|find-us|visit|hours)/?$#i', $href);
+            if (!$looksLikeContact) {
+                continue;
+            }
+            if (str_starts_with($href, '//')) {
+                // Protocol-relative link: inherit the scheme so the origin check below sees the real host.
+                $href = $scheme . ':' . $href;
+            } elseif (str_starts_with($href, '/')) {
+                $href = $baseOrigin . $href;
+            }
+            // Same-origin only. A bare prefix test would also accept hosts such as
+            // "https://example.com.evil.test", so the origin must be followed by "/", "?", "#" or nothing.
+            if (!str_starts_with($href, $baseOrigin)) {
+                continue;
+            }
+            $afterOrigin = substr($href, strlen($baseOrigin));
+            if ($afterOrigin === '' || in_array($afterOrigin[0], ['/', '?', '#'], true)) {
+                $contactUrl = $href;
+                break;
+            }
+        }
+    }
+    if ($contactUrl !== '') {
+        $contactHtml = false;
+        $ch = curl_init($contactUrl);
+        if ($ch !== false) {
+            curl_setopt_array($ch, [
+                CURLOPT_RETURNTRANSFER => true,
+                CURLOPT_TIMEOUT => 10,
+                CURLOPT_FOLLOWLOCATION => true,
+                CURLOPT_MAXREDIRS => 5,
+                // Only ever speak HTTP(S), including after redirects.
+                CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
+                CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
+                CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
+            ]);
+            $contactHtml = curl_exec($ch);
+            curl_close($ch);
+        }
+
+        if (is_string($contactHtml) && strlen($contactHtml) > 100) {
+            // Drop <script>/<style> elements (tag and body) so their contents never reach the text
+            // heuristics; fall back to the raw HTML if PCRE gives up (preg_replace returns null).
+            $contactClean = preg_replace('#<(script|style)\b[^>]*>.*?</\1\s*>#is', '', $contactHtml) ?? $contactHtml;
+            $contactText = strip_tags($contactClean);
+
+            // Phone: (xxx) xxx-xxxx or xxx-xxx-xxxx or xxx.xxx.xxxx. A leading "P:"/"Phone:" label is not captured.
+            if (preg_match('/\(?\d{3}\)?[\s.-]\d{3}[\s.-]\d{4}/', $contactText, $cpm)) {
+                $bizInfo['phone'] = $cpm[0];
+            }
+
+            // Fall back to a tel: link on the contact page when no phone is known yet.
+            if (empty($bizInfo['phone']) && preg_match('#href=["\']tel:([^"\']+)["\']#i', $contactHtml, $cpm2)) {
+                // The hyphen sits last in the class: written as ")-\s" it is an invalid range and fails to compile.
+                $telDigits = trim(preg_replace('/[^\d+()\s-]/', '', $cpm2[1]) ?? '');
+                if ($telDigits !== '') {
+                    $bizInfo['phone'] = $telDigits;
+                }
+            }
+
+            // Address: look for a US street address, optionally followed by "City, ST 12345".
+            if (preg_match('/(\d+\s+[A-Z][a-zA-Z\s]+(?:Street|St|Avenue|Ave|Boulevard|Blvd|Road|Rd|Drive|Dr|Lane|Ln|Way|Place|Pl|Court|Ct)\.?)(?:\s*[,\n]?\s*([A-Z][a-zA-Z\s]+),\s*([A-Z]{2})\s*,?\s*(\d{5}))?/', $contactText, $cam)) {
+                if (!empty($cam[1])) $bizInfo['addressLine1'] = trim($cam[1]);
+                if (!empty($cam[2])) $bizInfo['city'] = trim($cam[2]);
+                if (!empty($cam[3])) $bizInfo['state'] = strtoupper($cam[3]);
+                if (!empty($cam[4])) $bizInfo['zip'] = $cam[4];
+            }
+
+            // Hours: look for "Day[:] time - time" patterns and join them as "Day time-time, ...".
+            $dayPattern = '(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday|Mon|Tue|Wed|Thu|Fri|Sat|Sun)';
+            $timePattern = '\d{1,2}(?::\d{2})?\s*(?:am|pm)';
+            // Hyphen, en dash or em dash (spelled as raw UTF-8 bytes; the pattern is not in /u mode).
+            $rangeSep = '(?:-|\xE2\x80\x93|\xE2\x80\x94)';
+            if (preg_match_all("/({$dayPattern})\s*:?\s*({$timePattern})\s*{$rangeSep}\s*({$timePattern})/i", $contactText, $hm, PREG_SET_ORDER)) {
+                $hourParts = [];
+                foreach ($hm as $h) {
+                    $hourParts[] = $h[1] . ' ' . $h[2] . '-' . $h[3];
+                }
+                $bizInfo['hours'] = implode(', ', $hourParts);
+            }
+        }
+    }
+
     // Extract menu names from sub-page URLs
     $menuPages = [];
     foreach ($subPages as $spUrl) {