Improve DoorDash modifier extraction: pass item names to Playwright
- Pass extracted item names via a temp JSON file so Playwright knows exactly what to click instead of guessing from DOM selectors (7 → 171 items)
- Use TreeWalker for exact text matching, and scroll aggressively so DoorDash renders all items
- Better price parsing: handle cents (int), dollars (string), displayPrice
- Improved modal dismissal with overlay click fallback

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
b14f26ed47
commit
f5974a5fa2
2 changed files with 143 additions and 108 deletions
|
|
@ -1442,12 +1442,22 @@
|
||||||
<cfset ddItemModMap = structNew()>
|
<cfset ddItemModMap = structNew()>
|
||||||
<cftry>
|
<cftry>
|
||||||
<cfset arrayAppend(response.steps, "Running stealth Playwright for modifier extraction...")>
|
<cfset arrayAppend(response.steps, "Running stealth Playwright for modifier extraction...")>
|
||||||
|
<!--- Write item names to temp file so Playwright knows what to click --->
|
||||||
|
<cfset ddItemNames = arrayNew(1)>
|
||||||
|
<cfloop array="#ddItems#" index="ddi">
|
||||||
|
<cfset arrayAppend(ddItemNames, ddi.name)>
|
||||||
|
</cfloop>
|
||||||
|
<cfset ddTempFile = "/tmp/dd-items-#createUUID()#.json">
|
||||||
|
<cffile action="write" file="#ddTempFile#" output="#serializeJSON(ddItemNames)#" charset="utf-8">
|
||||||
|
|
||||||
<cfset modTimeout = 180000 + (arrayLen(ddItems) * 1500)>
|
<cfset modTimeout = 180000 + (arrayLen(ddItems) * 1500)>
|
||||||
<cfif modTimeout GT 600000><cfset modTimeout = 600000></cfif>
|
<cfif modTimeout GT 600000><cfset modTimeout = 600000></cfif>
|
||||||
<cfexecute name="/opt/playwright/run-doordash-modifiers.sh"
|
<cfexecute name="/opt/playwright/run-doordash-modifiers.sh"
|
||||||
arguments="#targetUrl#"
|
arguments="#targetUrl# #ddTempFile#"
|
||||||
timeout="#int(modTimeout / 1000)#"
|
timeout="#int(modTimeout / 1000)#"
|
||||||
variable="ddModResult" />
|
variable="ddModResult" />
|
||||||
|
<!--- Clean up temp file --->
|
||||||
|
<cftry><cffile action="delete" file="#ddTempFile#"><cfcatch></cfcatch></cftry>
|
||||||
|
|
||||||
<cfif len(trim(ddModResult))>
|
<cfif len(trim(ddModResult))>
|
||||||
<cfset ddModData = deserializeJSON(trim(ddModResult))>
|
<cfset ddModData = deserializeJSON(trim(ddModResult))>
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,11 @@
|
||||||
const { chromium } = require("playwright-extra");
|
const { chromium } = require("playwright-extra");
|
||||||
const stealth = require("puppeteer-extra-plugin-stealth");
|
const stealth = require("puppeteer-extra-plugin-stealth");
|
||||||
|
const fs = require("fs");
|
||||||
chromium.use(stealth());
|
chromium.use(stealth());
|
||||||
|
|
||||||
(async () => {
|
(async () => {
|
||||||
const url = process.argv[2];
|
const url = process.argv[2];
|
||||||
|
const itemNamesFile = process.argv[3]; // Optional: JSON file with array of item names
|
||||||
if (!url) {
|
if (!url) {
|
||||||
console.log(JSON.stringify({ error: "URL required", modifiers: [], itemModifierMap: {} }));
|
console.log(JSON.stringify({ error: "URL required", modifiers: [], itemModifierMap: {} }));
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
|
|
@ -11,6 +13,17 @@ chromium.use(stealth());
|
||||||
|
|
||||||
const log = (msg) => process.stderr.write("[dd-mod] " + msg + "\n");
|
const log = (msg) => process.stderr.write("[dd-mod] " + msg + "\n");
|
||||||
|
|
||||||
|
// Load item names if provided (from CFML fast-path)
|
||||||
|
let knownItemNames = [];
|
||||||
|
if (itemNamesFile && fs.existsSync(itemNamesFile)) {
|
||||||
|
try {
|
||||||
|
knownItemNames = JSON.parse(fs.readFileSync(itemNamesFile, "utf8"));
|
||||||
|
log("Loaded " + knownItemNames.length + " item names from file");
|
||||||
|
} catch (e) {
|
||||||
|
log("Failed to load item names file: " + e.message);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let browser;
|
let browser;
|
||||||
try {
|
try {
|
||||||
browser = await chromium.launch({ headless: true });
|
browser = await chromium.launch({ headless: true });
|
||||||
|
|
@ -31,12 +44,10 @@ chromium.use(stealth());
|
||||||
const ct = response.headers()["content-type"] || "";
|
const ct = response.headers()["content-type"] || "";
|
||||||
if (ct.includes("json")) {
|
if (ct.includes("json")) {
|
||||||
const body = await response.json();
|
const body = await response.json();
|
||||||
// DoorDash itemPage response structure
|
|
||||||
if (body && body.data && body.data.itemPage) {
|
if (body && body.data && body.data.itemPage) {
|
||||||
latestItemPage = body.data.itemPage;
|
latestItemPage = body.data.itemPage;
|
||||||
responseCount++;
|
responseCount++;
|
||||||
}
|
}
|
||||||
// Some DoorDash endpoints wrap in array
|
|
||||||
if (Array.isArray(body)) {
|
if (Array.isArray(body)) {
|
||||||
for (const entry of body) {
|
for (const entry of body) {
|
||||||
if (entry && entry.data && entry.data.itemPage) {
|
if (entry && entry.data && entry.data.itemPage) {
|
||||||
|
|
@ -54,49 +65,75 @@ chromium.use(stealth());
|
||||||
await page.goto(url, { waitUntil: "load", timeout: 60000 });
|
await page.goto(url, { waitUntil: "load", timeout: 60000 });
|
||||||
await page.waitForTimeout(5000);
|
await page.waitForTimeout(5000);
|
||||||
|
|
||||||
// Scroll to load all items
|
// Aggressive scroll to force DoorDash to render all items
|
||||||
const scrollHeight = await page.evaluate(() => document.body.scrollHeight);
|
log("Scrolling to load all items...");
|
||||||
const viewportHeight = await page.evaluate(() => window.innerHeight);
|
let lastHeight = 0;
|
||||||
const scrollSteps = Math.min(Math.ceil(scrollHeight / viewportHeight), 20);
|
for (let round = 0; round < 3; round++) {
|
||||||
|
const scrollHeight = await page.evaluate(() => document.body.scrollHeight);
|
||||||
|
const viewportHeight = await page.evaluate(() => window.innerHeight);
|
||||||
|
const scrollSteps = Math.ceil(scrollHeight / viewportHeight);
|
||||||
|
|
||||||
for (let i = 0; i < scrollSteps; i++) {
|
for (let i = 0; i <= scrollSteps; i++) {
|
||||||
await page.evaluate((step) => {
|
await page.evaluate((y) => window.scrollTo(0, y), i * viewportHeight);
|
||||||
window.scrollTo(0, step * window.innerHeight);
|
await page.waitForTimeout(250);
|
||||||
}, i + 1);
|
}
|
||||||
await page.waitForTimeout(300);
|
await page.waitForTimeout(500);
|
||||||
|
|
||||||
|
const newHeight = await page.evaluate(() => document.body.scrollHeight);
|
||||||
|
if (newHeight === lastHeight) break;
|
||||||
|
lastHeight = newHeight;
|
||||||
}
|
}
|
||||||
await page.evaluate(() => window.scrollTo(0, 0));
|
await page.evaluate(() => window.scrollTo(0, 0));
|
||||||
await page.waitForTimeout(1000);
|
await page.waitForTimeout(500);
|
||||||
|
|
||||||
// Find all clickable menu item elements
|
// If we have known item names, use them to find and click elements
|
||||||
// DoorDash renders items as buttons/anchors with item names and images
|
// Otherwise fall back to DOM auto-detection
|
||||||
const itemElements = await page.evaluate(() => {
|
let itemsToClick = [];
|
||||||
const items = [];
|
|
||||||
const seen = new Set();
|
|
||||||
|
|
||||||
// Strategy 1: Look for item cards with data-anchor-id containing "MenuItem"
|
if (knownItemNames.length > 0) {
|
||||||
document.querySelectorAll('[data-anchor-id*="MenuItem"]').forEach(el => {
|
// Find each known item in the DOM by text content
|
||||||
const nameEl = el.querySelector('[data-telemetry-id="storeMenuItem.title"]') ||
|
log("Searching DOM for " + knownItemNames.length + " known items...");
|
||||||
el.querySelector('span[class*="Text"]') ||
|
itemsToClick = await page.evaluate((names) => {
|
||||||
el.querySelector('h3') ||
|
const found = [];
|
||||||
el.querySelector('span');
|
const allElements = document.querySelectorAll('span, h3, h4, p, div');
|
||||||
if (nameEl) {
|
|
||||||
const name = nameEl.textContent.trim();
|
// Build a map of text -> element for fast lookup
|
||||||
if (name && !seen.has(name) && name.length > 1 && name.length < 200) {
|
const textMap = new Map();
|
||||||
seen.add(name);
|
for (const el of allElements) {
|
||||||
const rect = el.getBoundingClientRect();
|
// Only use leaf-ish elements (avoid containers that contain the whole menu)
|
||||||
if (rect.width > 0 && rect.height > 0) {
|
if (el.children.length > 5) continue;
|
||||||
items.push({ name, x: rect.x + rect.width / 2, y: rect.y + rect.height / 2 });
|
const text = el.textContent.trim();
|
||||||
|
if (text.length > 1 && text.length < 200 && !textMap.has(text)) {
|
||||||
|
textMap.set(text, el);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const name of names) {
|
||||||
|
const el = textMap.get(name);
|
||||||
|
if (el) {
|
||||||
|
// Find the clickable parent (the item card)
|
||||||
|
const clickable = el.closest('a, button, [role="button"], [tabindex="0"], [data-anchor-id]')
|
||||||
|
|| el.parentElement?.closest('a, button, [role="button"], [tabindex="0"], [data-anchor-id]')
|
||||||
|
|| el.parentElement;
|
||||||
|
if (clickable) {
|
||||||
|
const rect = clickable.getBoundingClientRect();
|
||||||
|
if (rect.width > 0 && rect.height > 0) {
|
||||||
|
found.push({ name, y: rect.y + window.scrollY });
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
return found;
|
||||||
|
}, knownItemNames);
|
||||||
|
|
||||||
// Strategy 2: Look for buttons/divs containing item names with prices
|
log("Found " + itemsToClick.length + "/" + knownItemNames.length + " items in DOM");
|
||||||
if (items.length === 0) {
|
} else {
|
||||||
document.querySelectorAll('button, [role="button"], [data-testid*="item"], [data-testid*="menu"]').forEach(el => {
|
// Auto-detect from DOM (fallback)
|
||||||
|
itemsToClick = await page.evaluate(() => {
|
||||||
|
const items = [];
|
||||||
|
const seen = new Set();
|
||||||
|
document.querySelectorAll('[data-anchor-id*="MenuItem"], button, [role="button"]').forEach(el => {
|
||||||
const text = el.textContent || "";
|
const text = el.textContent || "";
|
||||||
// Items typically have a price like $X.XX
|
|
||||||
if (text.match(/\$\d+\.\d{2}/) && text.length < 500) {
|
if (text.match(/\$\d+\.\d{2}/) && text.length < 500) {
|
||||||
const lines = text.split("\n").map(l => l.trim()).filter(l => l.length > 0);
|
const lines = text.split("\n").map(l => l.trim()).filter(l => l.length > 0);
|
||||||
const name = lines[0];
|
const name = lines[0];
|
||||||
|
|
@ -104,88 +141,66 @@ chromium.use(stealth());
|
||||||
seen.add(name);
|
seen.add(name);
|
||||||
const rect = el.getBoundingClientRect();
|
const rect = el.getBoundingClientRect();
|
||||||
if (rect.width > 0 && rect.height > 0) {
|
if (rect.width > 0 && rect.height > 0) {
|
||||||
items.push({ name, x: rect.x + rect.width / 2, y: rect.y + rect.height / 2 });
|
items.push({ name, y: rect.y + window.scrollY });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
return items;
|
||||||
|
});
|
||||||
|
log("Auto-detected " + itemsToClick.length + " clickable items");
|
||||||
|
}
|
||||||
|
|
||||||
// Strategy 3: Generic - find any clickable element with an image and text nearby
|
if (itemsToClick.length === 0) {
|
||||||
if (items.length === 0) {
|
|
||||||
document.querySelectorAll('img[src*="cdn4dd"]').forEach(img => {
|
|
||||||
const parent = img.closest('a, button, [role="button"], [tabindex="0"]') || img.parentElement.parentElement;
|
|
||||||
if (parent) {
|
|
||||||
const nameEl = parent.querySelector('span, h3, h4, p');
|
|
||||||
if (nameEl) {
|
|
||||||
const name = nameEl.textContent.trim();
|
|
||||||
if (name && !seen.has(name) && name.length > 1 && name.length < 200) {
|
|
||||||
seen.add(name);
|
|
||||||
const rect = parent.getBoundingClientRect();
|
|
||||||
if (rect.width > 0 && rect.height > 0) {
|
|
||||||
items.push({ name, x: rect.x + rect.width / 2, y: rect.y + rect.height / 2 });
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
return items;
|
|
||||||
});
|
|
||||||
|
|
||||||
log("Found " + itemElements.length + " clickable items on page");
|
|
||||||
|
|
||||||
if (itemElements.length === 0) {
|
|
||||||
log("No clickable items found, trying fallback...");
|
|
||||||
// Take a screenshot for debugging
|
|
||||||
console.log(JSON.stringify({ error: "No clickable items found", modifiers: [], itemModifierMap: {} }));
|
console.log(JSON.stringify({ error: "No clickable items found", modifiers: [], itemModifierMap: {} }));
|
||||||
await browser.close();
|
await browser.close();
|
||||||
process.exit(0);
|
process.exit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Click each item, capture modifier data
|
// Click each item, capture modifier data
|
||||||
const allModifierGroups = new Map(); // name -> modifier group data
|
const allModifierGroups = new Map();
|
||||||
const itemModifierMap = {}; // item name -> [modifier group names]
|
const itemModifierMap = {};
|
||||||
let clickedCount = 0;
|
let clickedCount = 0;
|
||||||
let modItemCount = 0;
|
let modItemCount = 0;
|
||||||
|
let noModCount = 0;
|
||||||
|
|
||||||
// Limit to prevent timeouts (DoorDash has many items)
|
const maxClicks = Math.min(itemsToClick.length, 250);
|
||||||
const maxClicks = Math.min(itemElements.length, 200);
|
|
||||||
|
|
||||||
for (let i = 0; i < maxClicks; i++) {
|
for (let i = 0; i < maxClicks; i++) {
|
||||||
const item = itemElements[i];
|
const item = itemsToClick[i];
|
||||||
try {
|
try {
|
||||||
latestItemPage = null;
|
latestItemPage = null;
|
||||||
|
|
||||||
// Scroll item into view and click by coordinates
|
// Scroll to item position
|
||||||
await page.evaluate((y) => window.scrollTo(0, y - 300), item.y);
|
await page.evaluate((y) => window.scrollTo(0, Math.max(0, y - 300)), item.y);
|
||||||
await page.waitForTimeout(200);
|
await page.waitForTimeout(200);
|
||||||
|
|
||||||
// Recalculate position after scroll
|
// Find the item element by name and click it
|
||||||
const freshPos = await page.evaluate((itemName) => {
|
const clicked = await page.evaluate((itemName) => {
|
||||||
const els = document.querySelectorAll('[data-anchor-id*="MenuItem"], button, [role="button"]');
|
// Find the text node with this exact name
|
||||||
for (const el of els) {
|
const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT);
|
||||||
if (el.textContent.includes(itemName)) {
|
while (walker.nextNode()) {
|
||||||
const rect = el.getBoundingClientRect();
|
if (walker.currentNode.textContent.trim() === itemName) {
|
||||||
|
const el = walker.currentNode.parentElement;
|
||||||
|
const clickable = el.closest('a, button, [role="button"], [tabindex="0"], [data-anchor-id]')
|
||||||
|
|| el.parentElement?.closest('a, button, [role="button"], [tabindex="0"], [data-anchor-id]')
|
||||||
|
|| el;
|
||||||
|
const rect = clickable.getBoundingClientRect();
|
||||||
if (rect.width > 0 && rect.height > 0) {
|
if (rect.width > 0 && rect.height > 0) {
|
||||||
return { x: rect.x + rect.width / 2, y: rect.y + rect.height / 2, found: true };
|
clickable.click();
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return { found: false };
|
return false;
|
||||||
}, item.name);
|
}, item.name);
|
||||||
|
|
||||||
if (!freshPos.found) {
|
if (!clicked) continue;
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
await page.mouse.click(freshPos.x, freshPos.y);
|
|
||||||
clickedCount++;
|
clickedCount++;
|
||||||
|
|
||||||
// Wait for GraphQL response (up to 4s)
|
// Wait for GraphQL response (up to 5s)
|
||||||
const startTime = Date.now();
|
const startTime = Date.now();
|
||||||
while (!latestItemPage && Date.now() - startTime < 4000) {
|
while (!latestItemPage && Date.now() - startTime < 5000) {
|
||||||
await page.waitForTimeout(150);
|
await page.waitForTimeout(150);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -199,11 +214,19 @@ chromium.use(stealth());
|
||||||
const options = [];
|
const options = [];
|
||||||
if (ol.options && Array.isArray(ol.options)) {
|
if (ol.options && Array.isArray(ol.options)) {
|
||||||
for (const opt of ol.options) {
|
for (const opt of ol.options) {
|
||||||
const price = opt.price ? (typeof opt.price === "number" ? opt.price / 100 : parseFloat(opt.price) || 0) : 0;
|
// DoorDash prices: sometimes cents (int), sometimes dollars (string like "$1.50")
|
||||||
options.push({
|
let price = 0;
|
||||||
name: opt.name || "",
|
if (opt.price) {
|
||||||
price: price
|
if (typeof opt.price === "number") {
|
||||||
});
|
price = opt.price > 100 ? opt.price / 100 : opt.price;
|
||||||
|
} else if (typeof opt.price === "string") {
|
||||||
|
price = parseFloat(opt.price.replace(/[^0-9.]/g, "")) || 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (opt.displayPrice) {
|
||||||
|
price = parseFloat(String(opt.displayPrice).replace(/[^0-9.]/g, "")) || price;
|
||||||
|
}
|
||||||
|
options.push({ name: opt.name || "", price: price });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
allModifierGroups.set(olName, {
|
allModifierGroups.set(olName, {
|
||||||
|
|
@ -218,22 +241,24 @@ chromium.use(stealth());
|
||||||
}
|
}
|
||||||
itemModifierMap[item.name] = modNames;
|
itemModifierMap[item.name] = modNames;
|
||||||
modItemCount++;
|
modItemCount++;
|
||||||
|
} else {
|
||||||
|
noModCount++;
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
noModCount++;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Close modal (press Escape or click outside)
|
// Close modal
|
||||||
await page.keyboard.press("Escape");
|
await page.keyboard.press("Escape");
|
||||||
await page.waitForTimeout(400);
|
await page.waitForTimeout(350);
|
||||||
|
|
||||||
// Check if modal is still open, click overlay if so
|
// Double-check modal is closed
|
||||||
const modalStillOpen = await page.evaluate(() => {
|
const stillOpen = await page.evaluate(() => {
|
||||||
const overlay = document.querySelector('[data-testid="modal-overlay"], [class*="ModalOverlay"], [class*="overlay"]');
|
const overlay = document.querySelector('[data-testid="modal-overlay"], [class*="ModalOverlay"], [class*="Overlay"]');
|
||||||
return !!overlay;
|
if (overlay) { overlay.click(); return true; }
|
||||||
|
return false;
|
||||||
});
|
});
|
||||||
if (modalStillOpen) {
|
if (stillOpen) await page.waitForTimeout(300);
|
||||||
await page.mouse.click(10, 10);
|
|
||||||
await page.waitForTimeout(300);
|
|
||||||
}
|
|
||||||
|
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
log("Error clicking " + item.name + ": " + e.message);
|
log("Error clicking " + item.name + ": " + e.message);
|
||||||
|
|
@ -241,13 +266,12 @@ chromium.use(stealth());
|
||||||
await page.waitForTimeout(300);
|
await page.waitForTimeout(300);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Progress log every 20 items
|
if ((i + 1) % 25 === 0) {
|
||||||
if ((i + 1) % 20 === 0) {
|
log("Progress: " + (i + 1) + "/" + maxClicks + " | " + modItemCount + " with mods, " + noModCount + " without");
|
||||||
log("Progress: " + (i + 1) + "/" + maxClicks + " clicked, " + modItemCount + " with modifiers");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
log("Done: " + clickedCount + " clicked, " + modItemCount + " items with modifiers, " + allModifierGroups.size + " unique modifier groups");
|
log("Done: " + clickedCount + "/" + maxClicks + " clicked, " + modItemCount + " with modifiers, " + allModifierGroups.size + " unique groups");
|
||||||
|
|
||||||
const modifiers = Array.from(allModifierGroups.values());
|
const modifiers = Array.from(allModifierGroups.values());
|
||||||
|
|
||||||
|
|
@ -255,9 +279,10 @@ chromium.use(stealth());
|
||||||
modifiers: modifiers,
|
modifiers: modifiers,
|
||||||
itemModifierMap: itemModifierMap,
|
itemModifierMap: itemModifierMap,
|
||||||
stats: {
|
stats: {
|
||||||
clickableItems: itemElements.length,
|
totalItems: itemsToClick.length,
|
||||||
clickedCount: clickedCount,
|
clickedCount: clickedCount,
|
||||||
itemsWithModifiers: modItemCount,
|
itemsWithModifiers: modItemCount,
|
||||||
|
itemsWithoutModifiers: noModCount,
|
||||||
uniqueModifierGroups: modifiers.length
|
uniqueModifierGroups: modifiers.length
|
||||||
}
|
}
|
||||||
}));
|
}));
|
||||||
|
|
|
||||||
Reference in a new issue