Extract og:image as header during wizard discovery

- analyzeMenuUrl.php: Extract og:image and JSON-LD image during discovery, return as headerImageUrl
- downloadImages.php: Add User-Agent header, detect image format from content-type/magic bytes, update HeaderImageExtension in DB

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
John Mizerek 2026-03-15 09:36:43 -07:00
parent add3842db3
commit d7cd6774c7
2 changed files with 51 additions and 9 deletions

View file

@ -292,6 +292,31 @@ JSEOF;
} }
$siteName = $bizInfo['name'] ?? ''; $siteName = $bizInfo['name'] ?? '';
// 3. Extract header image from og:image or JSON-LD image.
// The lookup lives in a closure so it has a single input (the page HTML) and
// a single output (an image URL, or '' when nothing usable is found).
$extractHeaderImage = static function (string $html): string {
    // Turns a JSON-LD "image" value into a URL string ('' when it has none).
    // Accepted shapes: "url", {"url": ...}, {"contentUrl": ...}, or a list of those.
    $imageToUrl = static function ($img): string {
        if (is_array($img) && empty($img['url']) && isset($img[0])) {
            $img = $img[0]; // list of images: use the first one
        }
        if (is_array($img)) {
            $img = $img['url'] ?? $img['contentUrl'] ?? '';
        }
        return is_string($img) ? trim($img) : '';
    };

    // Try og:image meta tag first (most common for restaurants).
    // Two patterns because the attribute order varies between sites.
    $ogPatterns = [
        '#<meta\s+(?:property|name)=["\']og:image["\']\s+content=["\']([^"\']+)["\']#i',
        '#<meta\s+content=["\']([^"\']+)["\']\s+(?:property|name)=["\']og:image["\']#i',
    ];
    foreach ($ogPatterns as $ogPattern) {
        if (preg_match($ogPattern, $html, $ogm)) {
            // Attribute values are HTML-encoded ("&amp;" in query strings), so
            // decode before the value is used as a URL.
            $ogUrl = trim(html_entity_decode($ogm[1], ENT_QUOTES | ENT_HTML5, 'UTF-8'));
            if ($ogUrl !== '') {
                return $ogUrl;
            }
        }
    }

    // Fallback: JSON-LD image field. The body is matched lazily with the "s"
    // flag so JSON that contains "<" (e.g. HTML inside a description) is kept.
    $ldPattern = '#<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>#is';
    if (!preg_match_all($ldPattern, $html, $ldImgMatches)) {
        return '';
    }
    foreach ($ldImgMatches[1] as $ldJson) {
        $ld = json_decode($ldJson, true);
        if (!is_array($ld)) {
            continue;
        }
        // A JSON-LD document may be one object, an object wrapping "@graph",
        // or a top-level list of objects.
        if (isset($ld['@graph']) && is_array($ld['@graph'])) {
            $entries = $ld['@graph'];
        } elseif (isset($ld[0])) {
            $entries = $ld;
        } else {
            $entries = [$ld];
        }
        foreach ($entries as $entry) {
            if (!is_array($entry) || empty($entry['image'])) {
                continue;
            }
            $ldUrl = $imageToUrl($entry['image']);
            // Keep scanning when this entry had an image node but no usable URL.
            if ($ldUrl !== '') {
                return $ldUrl;
            }
        }
    }
    return '';
};
$headerImageUrl = $extractHeaderImage(is_string($html ?? null) ? $html : '');
// Parse address into components if only full string // Parse address into components if only full string
if (!empty($bizInfo['address']) && empty($bizInfo['addressLine1'])) { if (!empty($bizInfo['address']) && empty($bizInfo['addressLine1'])) {
$addr = $bizInfo['address']; $addr = $bizInfo['address'];
@ -443,6 +468,7 @@ JSEOF;
'platformPages' => $platformPages, 'platformPages' => $platformPages,
'hasPlatform' => $hasPlatform, 'hasPlatform' => $hasPlatform,
'totalPagesFound' => count($menuPages), 'totalPagesFound' => count($menuPages),
'headerImageUrl' => $headerImageUrl,
]); ]);
} }

View file

@ -81,12 +81,6 @@ try {
// Download header // Download header
if (!empty($data['headerUrl'])) { if (!empty($data['headerUrl'])) {
$headerUrl = $data['headerUrl']; $headerUrl = $data['headerUrl'];
$ext = '.jpg';
if (stripos($headerUrl, '.png') !== false) $ext = '.png';
elseif (stripos($headerUrl, '.gif') !== false) $ext = '.gif';
elseif (stripos($headerUrl, '.webp') !== false) $ext = '.webp';
$headerFile = "$headersPath/$businessID$ext";
try { try {
$ch = curl_init($headerUrl); $ch = curl_init($headerUrl);
@ -94,24 +88,46 @@ try {
CURLOPT_RETURNTRANSFER => true, CURLOPT_RETURNTRANSFER => true,
CURLOPT_TIMEOUT => 30, CURLOPT_TIMEOUT => 30,
CURLOPT_FOLLOWLOCATION => true, CURLOPT_FOLLOWLOCATION => true,
CURLOPT_HTTPHEADER => [
'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
],
]); ]);
$content = curl_exec($ch); $content = curl_exec($ch);
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE); $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
$contentType = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
curl_close($ch); curl_close($ch);
if ($httpCode === 200 && $content !== false) { if ($httpCode === 200 && $content !== false && strlen($content) > 100) {
// Detect actual format. The file signature (magic bytes) is checked first
// because servers and CDNs frequently send a wrong or generic Content-Type;
// the header is only consulted when the signature is not recognized.
$detectImageExt = static function (string $bytes, string $mime): string {
    if (str_starts_with($bytes, "\x89PNG")) {
        return 'png';
    }
    if (str_starts_with($bytes, 'GIF8')) {
        return 'gif';
    }
    // "RIFF" alone also matches WAV/AVI; WebP carries "WEBP" at offset 8.
    if (str_starts_with($bytes, 'RIFF') && substr($bytes, 8, 4) === 'WEBP') {
        return 'webp';
    }
    if (str_starts_with($bytes, "\xFF\xD8\xFF")) {
        return 'jpg';
    }
    // Unrecognized signature: fall back to the Content-Type header.
    $mime = strtolower($mime);
    foreach (['png', 'gif', 'webp'] as $candidate) {
        if (str_contains($mime, $candidate)) {
            return $candidate;
        }
    }
    return 'jpg';
};
// curl_getinfo() may report no content type at all, so normalize to a string
// before it reaches the string functions above.
$ext = $detectImageExt(
    is_string($content ?? null) ? $content : '',
    is_string($contentType ?? null) ? $contentType : ''
);
$headerFile = "$headersPath/$businessID.$ext";
file_put_contents($headerFile, $content); file_put_contents($headerFile, $content);
// Update database
queryTimed("UPDATE Businesses SET HeaderImageExtension = ? WHERE ID = ?", [$ext, $businessID]);
$response['downloaded'][] = [ $response['downloaded'][] = [
'type' => 'header', 'type' => 'header',
'url' => $headerUrl, 'url' => $headerUrl,
'savedTo' => "/uploads/headers/$businessID$ext", 'savedTo' => "/uploads/headers/$businessID.$ext",
'size' => strlen($content), 'size' => strlen($content),
]; ];
} else { } else {
$response['downloaded'][] = [ $response['downloaded'][] = [
'type' => 'header', 'type' => 'header',
'url' => $headerUrl, 'url' => $headerUrl,
'error' => "HTTP $httpCode", 'error' => "HTTP $httpCode (size: " . strlen($content ?: '') . ")",
]; ];
} }
} catch (Exception $e) { } catch (Exception $e) {