Add platform_images mode with stealth Playwright for order.online
- Stealth Playwright bypasses Cloudflare bot protection
- Multiple selector strategies for menu item cards
- Fuzzy name matching (exact, normalized, partial)
- Background-image extraction as fallback
- Script auto-created at /opt/playwright/platform-images.js

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
24849c01e4
commit
3d9084d848
1 changed files with 170 additions and 0 deletions
|
|
@ -36,6 +36,176 @@ try {
|
||||||
$data = readJsonBody();
|
$data = readJsonBody();
|
||||||
if (empty($data)) throw new Exception('No request body provided');
|
if (empty($data)) throw new Exception('No request body provided');
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// PLATFORM IMAGES MODE: Use stealth Playwright to grab food photos
|
||||||
|
// ============================================================
|
||||||
|
if (!empty($data['mode']) && $data['mode'] === 'platform_images' && !empty($data['url'])) {
|
||||||
|
// Normalise the caller-supplied URL; a non-string value is treated as missing
// instead of being handed to trim().
$platformUrl = is_string($data['url']) ? trim($data['url']) : '';

// Only http(s) pages make sense for this mode. Rejecting every other scheme
// keeps file:// and similar URLs away from the headless browser. Throwing
// mirrors the request-body check earlier in this try block.
$platformUrlScheme = strtolower((string) parse_url($platformUrl, PHP_URL_SCHEME));
if (!in_array($platformUrlScheme, ['http', 'https'], true)) {
    throw new Exception('platform_images mode requires an http(s) url');
}

// Keep only non-empty string names. The extraction script lower-cases and
// compares each entry as text, so any other value would abort the whole run.
// Names are passed through unmodified so the response keys match the request.
$itemNames = [];
if (isset($data['items']) && is_array($data['items'])) {
    foreach ($data['items'] as $platformItemName) {
        if (is_string($platformItemName) && trim($platformItemName) !== '') {
            $itemNames[] = $platformItemName;
        }
    }
}

// Write item names to a temp file for the stealth script.
// tempnam() creates the file itself with a unique name, which avoids the
// guessable, collision-prone names produced by uniqid().
$platformItemsJson = json_encode($itemNames);
if ($platformItemsJson === false) {
    // Names that cannot be encoded (e.g. invalid UTF-8) fall back to an empty list.
    $platformItemsJson = '[]';
}
$tempFile = tempnam('/tmp', 'platform-items-');
if ($tempFile === false) {
    throw new Exception('Unable to create temp file for platform item names');
}
if (file_put_contents($tempFile, $platformItemsJson) === false) {
    @unlink($tempFile);
    throw new Exception('Unable to write platform item names');
}
|
||||||
|
|
||||||
|
// Use stealth Playwright to bypass Cloudflare
|
||||||
|
$scriptPath = '/opt/playwright/platform-images.js';
|
||||||
|
if (!file_exists($scriptPath)) {
|
||||||
|
// Create the stealth image extraction script
|
||||||
|
$script = <<<'JSEOF'
|
||||||
|
"use strict";

/*
 * Stealth image extractor.
 *
 * Usage: node platform-images.js <url> <itemsFile>
 *   url        page to open in headless Chromium
 *   itemsFile  path to a JSON array of menu item names
 *
 * Prints exactly one JSON line on stdout:
 *   { imageMap: { [itemName]: imageUrl }, totalFound: number, error?: string }
 * Progress and diagnostics go to stderr, prefixed with "[platform-img]".
 */
const { chromium } = require("playwright-extra");
const stealth = require("puppeteer-extra-plugin-stealth");
const fs = require("fs");

chromium.use(stealth());

const USER_AGENT =
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";

const log = (msg) => process.stderr.write("[platform-img] " + msg + "\n");

/** Reduce a name to lowercase ASCII letters and digits so spacing and punctuation do not block a match. */
const normalize = (s) => String(s).toLowerCase().replace(/[^a-z0-9]/g, "");

/**
 * Read the list of item names from the JSON file written by the caller.
 * Returns [] when the file is missing, unreadable, or does not hold an array;
 * entries that are not non-empty strings are dropped.
 */
function loadItemNames(itemsFile) {
  if (!itemsFile || !fs.existsSync(itemsFile)) return [];
  try {
    const parsed = JSON.parse(fs.readFileSync(itemsFile, "utf8"));
    if (!Array.isArray(parsed)) {
      log("Items file does not contain a JSON array; ignoring it");
      return [];
    }
    return parsed.filter((name) => typeof name === "string" && name.trim() !== "");
  } catch (e) {
    log("Could not read items file: " + e.message);
    return [];
  }
}

/**
 * Runs INSIDE the page (passed to page.evaluate), so it must not reference
 * anything from this file. Returns a plain object of { visibleName: imageUrl }.
 */
function collectImagePairs() {
  const pairs = {};
  const stripCssUrl = (value) => value.replace(/url\(["']?/, '').replace(/["']?\)/, '');

  // Strategy 1: Cards with name + image
  const selectors = [
    '[data-testid*="MenuItem"]', '[class*="menu-item"]', '[class*="MenuItem"]',
    '[class*="product-card"]', '[class*="item-card"]', 'article', '.menuItem',
    '[class*="StoreMenuItem"]', '[class*="menu_item"]',
    '[class*="MenuProduct"]', '[class*="menuProduct"]',
    '[class*="sc-"]' // styled-components
  ];
  for (const sel of selectors) {
    for (const card of document.querySelectorAll(sel)) {
      const nameEl = card.querySelector('h3, h4, h2, [class*="name"], [class*="title"], [class*="Name"], [class*="Title"], span[class]');
      const imgEl = card.querySelector('img[src*="http"], [style*="background-image"]');
      if (!nameEl || !imgEl) continue;

      const name = nameEl.textContent.trim();
      let imgUrl = imgEl.src || '';
      if (!imgUrl && imgEl.style.backgroundImage) {
        imgUrl = stripCssUrl(imgEl.style.backgroundImage);
      }
      if (name && imgUrl && imgUrl.startsWith('http')) {
        pairs[name] = imgUrl;
      }
    }
    // Stop at the first selector that yields a reasonable number of cards.
    if (Object.keys(pairs).length > 5) break;
  }

  // Strategy 2: All images with alt text
  if (Object.keys(pairs).length === 0) {
    for (const img of document.querySelectorAll('img[src*="http"]')) {
      if (/(icon|favicon|logo|sprite|pixel|tracking|badge|button|\.svg|avatar)/i.test(img.src)) continue;
      if (img.naturalWidth > 80 && img.naturalHeight > 80 && img.alt && img.alt.length > 2) {
        pairs[img.alt.trim()] = img.src;
      }
    }
  }

  // Strategy 3: Background images in divs near text
  if (Object.keys(pairs).length === 0) {
    for (const div of document.querySelectorAll('div[style*="background-image"]')) {
      const bgUrl = stripCssUrl(div.style.backgroundImage);
      if (!bgUrl.startsWith('http')) continue;
      const textEl = div.closest('[class]')?.querySelector('h3, h4, span, p');
      if (!textEl) continue;
      const name = textEl.textContent.trim();
      if (name.length > 2 && name.length < 60) {
        pairs[name] = bgUrl;
      }
    }
  }

  return pairs;
}

/**
 * Match requested item names to scraped images: exact name first, then equal
 * normalized names, then one normalized name containing the other.
 *
 * Names that normalize to "" (punctuation-only or non-ASCII-only) are only
 * eligible for the exact match: every string "includes" the empty string, so
 * letting them into the fuzzy passes would pair them with an arbitrary image.
 */
function matchItemsToImages(itemNames, imageMap) {
  const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);
  const candidates = Object.keys(imageMap)
    .map((key) => ({ key, norm: normalize(key) }))
    .filter((candidate) => candidate.norm !== "");

  const matched = {};
  for (const itemName of itemNames) {
    // Own-property check keeps names like "constructor" from hitting Object.prototype.
    if (hasOwn(imageMap, itemName) && imageMap[itemName]) {
      matched[itemName] = imageMap[itemName];
      continue;
    }

    const normItem = normalize(itemName);
    if (normItem === "") continue;

    const hit =
      candidates.find((c) => c.norm === normItem) ||
      candidates.find((c) => c.norm.includes(normItem) || normItem.includes(c.norm));
    if (hit) matched[itemName] = imageMap[hit.key];
  }
  return matched;
}

/** Open the page, scrape name/image pairs, and print the JSON result line. */
async function main() {
  const url = process.argv[2];
  const itemNames = loadItemNames(process.argv[3]);
  log("Looking for images for " + itemNames.length + " items at " + url);

  let browser;
  let payload;
  try {
    browser = await chromium.launch({ headless: true });
    const page = await browser.newPage({ userAgent: USER_AGENT });

    await page.goto(url, { waitUntil: "domcontentloaded", timeout: 30000 });
    // Fixed pause after load, presumably to let the bot-protection check and client-side rendering finish.
    await page.waitForTimeout(8000);

    // Scroll to load lazy images
    for (let screen = 1; screen <= 10; screen++) {
      await page.evaluate((s) => window.scrollTo(0, s * window.innerHeight), screen);
      await page.waitForTimeout(500);
    }
    await page.evaluate(() => window.scrollTo(0, 0));
    await page.waitForTimeout(2000);

    // Try multiple selector strategies for menu item cards
    const imageMap = await page.evaluate(collectImagePairs);
    const totalFound = Object.keys(imageMap).length;
    log("Found " + totalFound + " image pairs");

    // Fuzzy match item names to found images
    const result = matchItemsToImages(itemNames, imageMap);
    log("Matched " + Object.keys(result).length + " items to images");

    payload = { imageMap: result, totalFound };
  } catch (e) {
    log("Error: " + e.message);
    payload = { imageMap: {}, totalFound: 0, error: e.message };
  } finally {
    // Always release the browser, including when launch succeeded but a later step threw.
    if (browser) {
      await browser.close().catch((e) => log("Error closing browser: " + e.message));
    }
  }

  console.log(JSON.stringify(payload));
}

main().catch((e) => {
  log("Fatal: " + (e && e.message ? e.message : e));
  process.exitCode = 1;
});
|
||||||
|
JSEOF;
|
||||||
|
file_put_contents($scriptPath, $script);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run the stealth script. stderr is merged into stdout (2>&1) so that crashes
// and the script's "[platform-img]" progress lines are captured as well.
$platformCommand = 'PLAYWRIGHT_BROWSERS_PATH=/opt/playwright/browsers /usr/bin/node '
    . escapeshellarg($scriptPath) . ' '
    . escapeshellarg($platformUrl) . ' '
    . escapeshellarg($tempFile) . ' 2>&1';
$output = shell_exec($platformCommand);
@unlink($tempFile);

// Because the streams are merged, the buffer holds log lines plus a single JSON
// result line, so decoding the buffer as a whole can never succeed. Scan
// from the end for the last line that decodes to the expected result object.
$result = null;
$platformOutputLines = preg_split('/\r\n|\r|\n/', trim((string) $output));
foreach (array_reverse($platformOutputLines) as $platformOutputLine) {
    $platformOutputLine = trim($platformOutputLine);
    if ($platformOutputLine === '' || $platformOutputLine[0] !== '{') {
        continue;
    }
    $platformDecoded = json_decode($platformOutputLine, true);
    if (is_array($platformDecoded) && array_key_exists('imageMap', $platformDecoded)) {
        $result = $platformDecoded;
        break;
    }
}

// No usable result line (script crashed, node missing, ...): report an empty match set.
if (!is_array($result)) {
    $result = ['imageMap' => [], 'totalFound' => 0];
}

jsonResponse([
    'OK' => true,
    'mode' => 'platform_images',
    'imageMap' => $result['imageMap'] ?? [],
    'totalFound' => $result['totalFound'] ?? 0,
]);
|
||||||
|
}
|
||||||
|
|
||||||
// ============================================================
|
// ============================================================
|
||||||
// DISCOVERY MODE: Crawl site, detect menu sub-pages, return without Claude
|
// DISCOVERY MODE: Crawl site, detect menu sub-pages, return without Claude
|
||||||
// ============================================================
|
// ============================================================
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue