Extract og:image as header during wizard discovery
- analyzeMenuUrl.php: Extract og:image and JSON-LD image during discovery, return as headerImageUrl
- downloadImages.php: Add User-Agent header, detect image format from content-type/magic bytes, update HeaderImageExtension in DB

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
add3842db3
commit
d7cd6774c7
2 changed files with 51 additions and 9 deletions
|
|
@ -292,6 +292,31 @@ JSEOF;
|
|||
}
|
||||
$siteName = $bizInfo['name'] ?? '';
|
||||
|
||||
// 3. Extract header image from og:image or JSON-LD image
|
||||
$headerImageUrl = '';
|
||||
// Try og:image meta tag first (most common for restaurants)
|
||||
if (preg_match('#<meta\s+(?:property|name)=["\']og:image["\']\s+content=["\']([^"\']+)["\']#i', $html, $ogm)) {
|
||||
$headerImageUrl = trim($ogm[1]);
|
||||
} elseif (preg_match('#<meta\s+content=["\']([^"\']+)["\']\s+(?:property|name)=["\']og:image["\']#i', $html, $ogm)) {
|
||||
$headerImageUrl = trim($ogm[1]);
|
||||
}
|
||||
// Fallback: JSON-LD image field
|
||||
if (empty($headerImageUrl) && preg_match_all('#<script[^>]*type=["\']application/ld\+json["\'][^>]*>([^<]+)</script>#i', $html, $ldImgMatches)) {
|
||||
foreach ($ldImgMatches[1] as $ldJson) {
|
||||
$ld = json_decode($ldJson, true);
|
||||
if (!is_array($ld)) continue;
|
||||
$entries = isset($ld['@graph']) ? $ld['@graph'] : [$ld];
|
||||
foreach ($entries as $entry) {
|
||||
if (!empty($entry['image'])) {
|
||||
$img = $entry['image'];
|
||||
if (is_string($img)) { $headerImageUrl = $img; break 2; }
|
||||
if (is_array($img) && !empty($img['url'])) { $headerImageUrl = $img['url']; break 2; }
|
||||
if (is_array($img) && isset($img[0])) { $headerImageUrl = is_string($img[0]) ? $img[0] : ($img[0]['url'] ?? ''); break 2; }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Parse address into components if only full string
|
||||
if (!empty($bizInfo['address']) && empty($bizInfo['addressLine1'])) {
|
||||
$addr = $bizInfo['address'];
|
||||
|
|
@ -443,6 +468,7 @@ JSEOF;
|
|||
'platformPages' => $platformPages,
|
||||
'hasPlatform' => $hasPlatform,
|
||||
'totalPagesFound' => count($menuPages),
|
||||
'headerImageUrl' => $headerImageUrl,
|
||||
]);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -81,12 +81,6 @@ try {
|
|||
// Download header
|
||||
if (!empty($data['headerUrl'])) {
|
||||
$headerUrl = $data['headerUrl'];
|
||||
$ext = '.jpg';
|
||||
if (stripos($headerUrl, '.png') !== false) $ext = '.png';
|
||||
elseif (stripos($headerUrl, '.gif') !== false) $ext = '.gif';
|
||||
elseif (stripos($headerUrl, '.webp') !== false) $ext = '.webp';
|
||||
|
||||
$headerFile = "$headersPath/$businessID$ext";
|
||||
|
||||
try {
|
||||
$ch = curl_init($headerUrl);
|
||||
|
|
@ -94,24 +88,46 @@ try {
|
|||
CURLOPT_RETURNTRANSFER => true,
|
||||
CURLOPT_TIMEOUT => 30,
|
||||
CURLOPT_FOLLOWLOCATION => true,
|
||||
CURLOPT_HTTPHEADER => [
|
||||
'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
|
||||
],
|
||||
]);
|
||||
$content = curl_exec($ch);
|
||||
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
|
||||
$contentType = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
|
||||
curl_close($ch);
|
||||
|
||||
if ($httpCode === 200 && $content !== false) {
|
||||
if ($httpCode === 200 && $content !== false && strlen($content) > 100) {
|
||||
// Detect actual format from content-type or magic bytes
|
||||
$ext = 'jpg';
|
||||
if (stripos($contentType, 'png') !== false) $ext = 'png';
|
||||
elseif (stripos($contentType, 'gif') !== false) $ext = 'gif';
|
||||
elseif (stripos($contentType, 'webp') !== false) $ext = 'webp';
|
||||
else {
|
||||
// Fallback: check magic bytes
|
||||
$hex = strtoupper(bin2hex(substr($content, 0, 8)));
|
||||
if (str_starts_with($hex, '89504E47')) $ext = 'png';
|
||||
elseif (str_starts_with($hex, '474946')) $ext = 'gif';
|
||||
elseif (str_starts_with($hex, '52494646')) $ext = 'webp';
|
||||
}
|
||||
|
||||
$headerFile = "$headersPath/$businessID.$ext";
|
||||
file_put_contents($headerFile, $content);
|
||||
|
||||
// Update database
|
||||
queryTimed("UPDATE Businesses SET HeaderImageExtension = ? WHERE ID = ?", [$ext, $businessID]);
|
||||
|
||||
$response['downloaded'][] = [
|
||||
'type' => 'header',
|
||||
'url' => $headerUrl,
|
||||
'savedTo' => "/uploads/headers/$businessID$ext",
|
||||
'savedTo' => "/uploads/headers/$businessID.$ext",
|
||||
'size' => strlen($content),
|
||||
];
|
||||
} else {
|
||||
$response['downloaded'][] = [
|
||||
'type' => 'header',
|
||||
'url' => $headerUrl,
|
||||
'error' => "HTTP $httpCode",
|
||||
'error' => "HTTP $httpCode (size: " . strlen($content ?: '') . ")",
|
||||
];
|
||||
}
|
||||
} catch (Exception $e) {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue