Extract og:image as header during wizard discovery
- analyzeMenuUrl.php: Extract og:image and JSON-LD image during discovery, return as headerImageUrl
- downloadImages.php: Add User-Agent header, detect image format from content-type/magic bytes, update HeaderImageExtension in DB

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
add3842db3
commit
d7cd6774c7
2 changed files with 51 additions and 9 deletions
|
|
@ -292,6 +292,31 @@ JSEOF;
|
||||||
}
|
}
|
||||||
$siteName = $bizInfo['name'] ?? '';
|
$siteName = $bizInfo['name'] ?? '';
|
||||||
|
|
||||||
|
// 3. Extract header image from og:image or JSON-LD image
|
||||||
|
$headerImageUrl = '';
|
||||||
|
// Try og:image meta tag first (most common for restaurants)
|
||||||
|
if (preg_match('#<meta\s+(?:property|name)=["\']og:image["\']\s+content=["\']([^"\']+)["\']#i', $html, $ogm)) {
|
||||||
|
$headerImageUrl = trim($ogm[1]);
|
||||||
|
} elseif (preg_match('#<meta\s+content=["\']([^"\']+)["\']\s+(?:property|name)=["\']og:image["\']#i', $html, $ogm)) {
|
||||||
|
$headerImageUrl = trim($ogm[1]);
|
||||||
|
}
|
||||||
|
// Fallback: JSON-LD image field
|
||||||
|
if (empty($headerImageUrl) && preg_match_all('#<script[^>]*type=["\']application/ld\+json["\'][^>]*>([^<]+)</script>#i', $html, $ldImgMatches)) {
|
||||||
|
foreach ($ldImgMatches[1] as $ldJson) {
|
||||||
|
$ld = json_decode($ldJson, true);
|
||||||
|
if (!is_array($ld)) continue;
|
||||||
|
$entries = isset($ld['@graph']) ? $ld['@graph'] : [$ld];
|
||||||
|
foreach ($entries as $entry) {
|
||||||
|
if (!empty($entry['image'])) {
|
||||||
|
$img = $entry['image'];
|
||||||
|
if (is_string($img)) { $headerImageUrl = $img; break 2; }
|
||||||
|
if (is_array($img) && !empty($img['url'])) { $headerImageUrl = $img['url']; break 2; }
|
||||||
|
if (is_array($img) && isset($img[0])) { $headerImageUrl = is_string($img[0]) ? $img[0] : ($img[0]['url'] ?? ''); break 2; }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Parse address into components if only full string
|
// Parse address into components if only full string
|
||||||
if (!empty($bizInfo['address']) && empty($bizInfo['addressLine1'])) {
|
if (!empty($bizInfo['address']) && empty($bizInfo['addressLine1'])) {
|
||||||
$addr = $bizInfo['address'];
|
$addr = $bizInfo['address'];
|
||||||
|
|
@ -443,6 +468,7 @@ JSEOF;
|
||||||
'platformPages' => $platformPages,
|
'platformPages' => $platformPages,
|
||||||
'hasPlatform' => $hasPlatform,
|
'hasPlatform' => $hasPlatform,
|
||||||
'totalPagesFound' => count($menuPages),
|
'totalPagesFound' => count($menuPages),
|
||||||
|
'headerImageUrl' => $headerImageUrl,
|
||||||
]);
|
]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -81,12 +81,6 @@ try {
|
||||||
// Download header
|
// Download header
|
||||||
if (!empty($data['headerUrl'])) {
|
if (!empty($data['headerUrl'])) {
|
||||||
$headerUrl = $data['headerUrl'];
|
$headerUrl = $data['headerUrl'];
|
||||||
$ext = '.jpg';
|
|
||||||
if (stripos($headerUrl, '.png') !== false) $ext = '.png';
|
|
||||||
elseif (stripos($headerUrl, '.gif') !== false) $ext = '.gif';
|
|
||||||
elseif (stripos($headerUrl, '.webp') !== false) $ext = '.webp';
|
|
||||||
|
|
||||||
$headerFile = "$headersPath/$businessID$ext";
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
$ch = curl_init($headerUrl);
|
$ch = curl_init($headerUrl);
|
||||||
|
|
@ -94,24 +88,46 @@ try {
|
||||||
CURLOPT_RETURNTRANSFER => true,
|
CURLOPT_RETURNTRANSFER => true,
|
||||||
CURLOPT_TIMEOUT => 30,
|
CURLOPT_TIMEOUT => 30,
|
||||||
CURLOPT_FOLLOWLOCATION => true,
|
CURLOPT_FOLLOWLOCATION => true,
|
||||||
|
CURLOPT_HTTPHEADER => [
|
||||||
|
'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
|
||||||
|
],
|
||||||
]);
|
]);
|
||||||
$content = curl_exec($ch);
|
$content = curl_exec($ch);
|
||||||
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
|
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
|
||||||
|
$contentType = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
|
||||||
curl_close($ch);
|
curl_close($ch);
|
||||||
|
|
||||||
if ($httpCode === 200 && $content !== false) {
|
if ($httpCode === 200 && $content !== false && strlen($content) > 100) {
|
||||||
|
// Detect actual format from content-type or magic bytes
|
||||||
|
$ext = 'jpg';
|
||||||
|
if (stripos($contentType, 'png') !== false) $ext = 'png';
|
||||||
|
elseif (stripos($contentType, 'gif') !== false) $ext = 'gif';
|
||||||
|
elseif (stripos($contentType, 'webp') !== false) $ext = 'webp';
|
||||||
|
else {
|
||||||
|
// Fallback: check magic bytes
|
||||||
|
$hex = strtoupper(bin2hex(substr($content, 0, 8)));
|
||||||
|
if (str_starts_with($hex, '89504E47')) $ext = 'png';
|
||||||
|
elseif (str_starts_with($hex, '474946')) $ext = 'gif';
|
||||||
|
elseif (str_starts_with($hex, '52494646')) $ext = 'webp';
|
||||||
|
}
|
||||||
|
|
||||||
|
$headerFile = "$headersPath/$businessID.$ext";
|
||||||
file_put_contents($headerFile, $content);
|
file_put_contents($headerFile, $content);
|
||||||
|
|
||||||
|
// Update database
|
||||||
|
queryTimed("UPDATE Businesses SET HeaderImageExtension = ? WHERE ID = ?", [$ext, $businessID]);
|
||||||
|
|
||||||
$response['downloaded'][] = [
|
$response['downloaded'][] = [
|
||||||
'type' => 'header',
|
'type' => 'header',
|
||||||
'url' => $headerUrl,
|
'url' => $headerUrl,
|
||||||
'savedTo' => "/uploads/headers/$businessID$ext",
|
'savedTo' => "/uploads/headers/$businessID.$ext",
|
||||||
'size' => strlen($content),
|
'size' => strlen($content),
|
||||||
];
|
];
|
||||||
} else {
|
} else {
|
||||||
$response['downloaded'][] = [
|
$response['downloaded'][] = [
|
||||||
'type' => 'header',
|
'type' => 'header',
|
||||||
'url' => $headerUrl,
|
'url' => $headerUrl,
|
||||||
'error' => "HTTP $httpCode",
|
'error' => "HTTP $httpCode (size: " . strlen($content ?: '') . ")",
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
} catch (Exception $e) {
|
} catch (Exception $e) {
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue