Add DoorDash modifier extraction via stealth Playwright
- New doordash-modifiers.js: stealth Playwright script that clicks each menu item on a DoorDash page, captures itemPage GraphQL responses, and extracts optionLists (modifier groups with options and prices)
- Wire modifier extraction into the DoorDash fast path in analyzeMenuUrl.cfm: after parsing items/categories, it runs the modifier script and maps the results onto the items
- Improved business info extraction: address, phone, and hours now use position-based parsing of the embedded StoreHeaderAddress, StoreHeaderPhoneNumber, and StoreOperationHoursRange data (fixes intermittently missing info)
- Add playwright-extra and the stealth plugin to package.json

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
8be3a3d802
commit
b14f26ed47
4 changed files with 406 additions and 6 deletions
|
|
@ -1352,13 +1352,137 @@
|
|||
<cfset ddBusiness["name"] = ddTitle>
|
||||
</cfif>
|
||||
</cfif>
|
||||
<cfset ddAddrMatch = reMatchNoCase('\\"__typename\\":\\"StoreHeaderAddress\\",\\"street\\":\\"([^\\"]+)\\",\\"displayAddress\\":\\"([^\\"]+)\\"', pageHtml)>
|
||||
<cfif arrayLen(ddAddrMatch)>
|
||||
<cfset ddAddr = reReplaceNoCase(ddAddrMatch[1], '.*\\"displayAddress\\":\\"([^\\"]+)\\".*', '\1')>
|
||||
<cfset ddBusiness["address"] = ddAddr>
|
||||
|
||||
<cfscript>
// Address: find the embedded StoreHeaderAddress object in pageHtml and read its
// "street" and "displayAddress" values by position instead of with one regex.
// BQ is defined earlier in the template (presumably the escaped-quote sequence
// used inside the embedded JSON; confirm against its definition).
ddAddrMarker = BQ & "__typename" & BQ & ":" & BQ & "StoreHeaderAddress" & BQ;
ddAddrPos = findNoCase(ddAddrMarker, pageHtml);

if (ddAddrPos > 0) {
    // The section ends at the next __typename key; when none follows, take at
    // most 2000 characters (bounded by the end of the page).
    ddAddrEnd = findNoCase(BQ & "__typename" & BQ, pageHtml, ddAddrPos + len(ddAddrMarker));
    if (ddAddrEnd == 0) {
        ddAddrEnd = min(ddAddrPos + 2000, len(pageHtml));
    }
    ddAddrSection = mid(pageHtml, ddAddrPos, ddAddrEnd - ddAddrPos);

    // Street: value runs from just after the key up to the next BQ.
    streetKey = BQ & "street" & BQ & ":" & BQ;
    stPos = findNoCase(streetKey, ddAddrSection);
    if (stPos > 0) {
        stStart = stPos + len(streetKey);
        stEnd = find(BQ, ddAddrSection, stStart);
        // Skipped when the value is empty or the closing BQ is missing.
        if (stEnd > stStart) {
            ddBusiness["street"] = mid(ddAddrSection, stStart, stEnd - stStart);
        }
    }

    // Display address (city, state, zip), read the same way.
    daKey = BQ & "displayAddress" & BQ & ":" & BQ;
    daPos = findNoCase(daKey, ddAddrSection);
    if (daPos > 0) {
        daStart = daPos + len(daKey);
        daEnd = find(BQ, ddAddrSection, daStart);
        if (daEnd > daStart) {
            ddBusiness["address"] = mid(ddAddrSection, daStart, daEnd - daStart);
        }
    }
}
</cfscript>
|
||||
|
||||
<cfscript>
// Phone: find the embedded StoreHeaderPhoneNumber object in pageHtml and read
// its "phoneNumber" value by position.
ddPhoneMarker = BQ & "__typename" & BQ & ":" & BQ & "StoreHeaderPhoneNumber" & BQ;
ddPhonePos = findNoCase(ddPhoneMarker, pageHtml);

if (ddPhonePos > 0) {
    // The section ends at the next __typename key; when none follows, take at
    // most 1000 characters (bounded by the end of the page).
    ddPhoneEnd = findNoCase(BQ & "__typename" & BQ, pageHtml, ddPhonePos + len(ddPhoneMarker));
    if (ddPhoneEnd == 0) {
        ddPhoneEnd = min(ddPhonePos + 1000, len(pageHtml));
    }
    ddPhoneSection = mid(pageHtml, ddPhonePos, ddPhoneEnd - ddPhonePos);

    // Value runs from just after the key up to the next BQ.
    phoneValKey = BQ & "phoneNumber" & BQ & ":" & BQ;
    phPos = findNoCase(phoneValKey, ddPhoneSection);
    if (phPos > 0) {
        phStart = phPos + len(phoneValKey);
        phEnd = find(BQ, ddPhoneSection, phStart);
        // Skipped when the value is empty or the closing BQ is missing.
        if (phEnd > phStart) {
            ddBusiness["phone"] = mid(ddPhoneSection, phStart, phEnd - phStart);
        }
    }
}
</cfscript>
|
||||
|
||||
<cfscript>
// Hours: walk every embedded StoreOperationHoursRange object in pageHtml and
// collect "dayRange: timeRange" strings, joined with "; " into ddBusiness.hours.
ddHoursMarker = BQ & "__typename" & BQ & ":" & BQ & "StoreOperationHoursRange" & BQ;
ddHoursPos = findNoCase(ddHoursMarker, pageHtml);

if (ddHoursPos > 0) {
    ddHoursArr = arrayNew(1);
    // The keys are the same for every entry, so build them once up front.
    dayKey = BQ & "dayRange" & BQ & ":" & BQ;
    timeKey = BQ & "timeRange" & BQ & ":" & BQ;

    hPos = 1;
    while (true) {
        hPos = findNoCase(ddHoursMarker, pageHtml, hPos);
        if (hPos == 0) {
            break;
        }

        // An entry spans up to the next marker; the last one is capped at
        // 500 characters (bounded by the end of the page).
        hNext = findNoCase(ddHoursMarker, pageHtml, hPos + len(ddHoursMarker));
        if (hNext == 0) {
            hNext = min(hPos + 500, len(pageHtml));
        }
        hSection = mid(pageHtml, hPos, hNext - hPos);

        dPos = findNoCase(dayKey, hSection);
        tPos = findNoCase(timeKey, hSection);
        if (dPos > 0 AND tPos > 0) {
            dStart = dPos + len(dayKey);
            dEnd = find(BQ, hSection, dStart);
            tStart = tPos + len(timeKey);
            tEnd = find(BQ, hSection, tStart);
            // Both values must be non-empty and properly terminated.
            if (dEnd > dStart AND tEnd > tStart) {
                arrayAppend(ddHoursArr, mid(hSection, dStart, dEnd - dStart) & ": " & mid(hSection, tStart, tEnd - tStart));
            }
        }

        // Step past this marker so the next search finds the following entry.
        hPos = hPos + len(ddHoursMarker);
    }

    if (arrayLen(ddHoursArr)) {
        ddBusiness["hours"] = arrayToList(ddHoursArr, "; ");
    }
}
</cfscript>
|
||||
|
||||
<cfif arrayLen(ddItems) GT 0>
|
||||
<!---
    Modifier extraction (best effort).
    Runs the stealth Playwright wrapper script against targetUrl, parses the JSON
    it prints on stdout, and fills:
      ddModifiers  - array of modifier groups, each tagged with a wizard "type"
      ddItemModMap - struct mapping item name to an array of modifier group names
    Any failure is recorded in response.steps and leaves both values empty; it
    never aborts the surrounding DoorDash fast path.
--->
<cfset ddModifiers = arrayNew(1)>
<cfset ddItemModMap = structNew()>
<cftry>
    <cfset arrayAppend(response.steps, "Running stealth Playwright for modifier extraction...")>

    <!--- Budget in milliseconds: 180s base plus 1.5s per item, capped at 600s. --->
    <cfset modTimeout = min(180000 + (arrayLen(ddItems) * 1500), 600000)>

    <!---
        Pass the URL as a one-element array rather than a string. Per the
        cfexecute documentation a string is tokenized on spaces on UNIX, so a URL
        containing whitespace or quotes could be split into several arguments.
    --->
    <cfset ddModArgs = [targetUrl]>
    <cfexecute name="/opt/playwright/run-doordash-modifiers.sh"
        arguments="#ddModArgs#"
        timeout="#int(modTimeout / 1000)#"
        variable="ddModResult" />

    <cfif len(trim(ddModResult))>
        <cfset ddModData = deserializeJSON(trim(ddModResult))>

        <!--- The script always prints a JSON object; anything else is treated as a failure. --->
        <cfif NOT isStruct(ddModData)>
            <cfthrow message="Unexpected modifier script output (not a JSON object)">
        </cfif>

        <!--- Surface an error reported by the script itself instead of dropping it. --->
        <cfif structKeyExists(ddModData, "error") AND isSimpleValue(ddModData.error) AND len(ddModData.error)>
            <cfset arrayAppend(response.steps, "Modifier script reported: " & ddModData.error)>
        </cfif>

        <cfif structKeyExists(ddModData, "modifiers") AND isArray(ddModData.modifiers) AND arrayLen(ddModData.modifiers) GT 0>
            <cfset ddModifiers = ddModData.modifiers>
            <!--- Convert modifier options to wizard format: exactly one allowed selection is a select, anything else a checkbox. --->
            <cfloop array="#ddModifiers#" index="ddMod">
                <cfif structKeyExists(ddMod, "maxSelections") AND ddMod.maxSelections EQ 1>
                    <cfset ddMod["type"] = "select">
                <cfelse>
                    <cfset ddMod["type"] = "checkbox">
                </cfif>
            </cfloop>
        </cfif>

        <!--- Map modifiers to items: attach group names to every parsed item the script matched by name. --->
        <cfif structKeyExists(ddModData, "itemModifierMap") AND isStruct(ddModData.itemModifierMap)>
            <cfset ddItemModMap = ddModData.itemModifierMap>
            <cfloop array="#ddItems#" index="ddi">
                <cfif structKeyExists(ddItemModMap, ddi.name)>
                    <cfset ddi["modifiers"] = ddItemModMap[ddi.name]>
                </cfif>
            </cfloop>
        </cfif>

        <!--- Kept for any later use in the template; not read within this section. --->
        <cfset modStats = structKeyExists(ddModData, "stats") ? ddModData.stats : {}>
        <cfset arrayAppend(response.steps, "Modifier extraction: " & arrayLen(ddModifiers) & " groups, " & structCount(ddItemModMap) & " items mapped")>
    </cfif>
<cfcatch>
    <cfset arrayAppend(response.steps, "Modifier extraction failed (non-fatal): " & cfcatch.message)>
</cfcatch>
</cftry>
|
||||
|
||||
<!--- Build image URL list --->
|
||||
<cfset ddImageUrls = arrayNew(1)>
|
||||
<cfloop array="#ddItems#" index="ddI">
|
||||
|
|
@ -1370,7 +1494,7 @@
|
|||
<cfset menuData = {
|
||||
"business": ddBusiness,
|
||||
"categories": ddCategories,
|
||||
"modifiers": arrayNew(1),
|
||||
"modifiers": ddModifiers,
|
||||
"items": ddItems,
|
||||
"imageUrls": ddImageUrls,
|
||||
"headerCandidateIndices": arrayNew(1)
|
||||
|
|
|
|||
271
playwright/doordash-modifiers.js
Normal file
271
playwright/doordash-modifiers.js
Normal file
|
|
@ -0,0 +1,271 @@
|
|||
const { chromium } = require("playwright-extra");
|
||||
const stealth = require("puppeteer-extra-plugin-stealth");
|
||||
chromium.use(stealth());
|
||||
|
||||
/** Upper bound on item clicks per run (DoorDash menus can be very large). */
const MAX_ITEM_CLICKS = 200;

/** How long to wait for an itemPage response after clicking an item, in ms. */
const ITEM_RESPONSE_TIMEOUT_MS = 4000;

/**
 * Writes a diagnostic line to stderr. stdout carries only the JSON result.
 * @param {string} msg
 */
function log(msg) {
  process.stderr.write("[dd-mod] " + msg + "\n");
}

/**
 * Collects every `itemPage` payload from a parsed response body.
 * Accepts a single `{ data: { itemPage } }` object or an array of such entries.
 * @param {*} body - Parsed JSON body.
 * @returns {Array<object>} itemPage objects in the order they appear (possibly empty).
 */
function extractItemPages(body) {
  const entries = Array.isArray(body) ? body : [body];
  return entries
    .filter((entry) => entry && entry.data && entry.data.itemPage)
    .map((entry) => entry.data.itemPage);
}

/**
 * Normalizes an option price: numbers are divided by 100, other values are
 * parsed with parseFloat, and anything missing or unparseable becomes 0.
 * @param {*} raw
 * @returns {number}
 */
function parseOptionPrice(raw) {
  if (!raw) return 0;
  if (typeof raw === "number") return raw / 100;
  return parseFloat(raw) || 0;
}

/**
 * Converts one optionList from an itemPage payload into the output group shape.
 * @param {object} optionList
 * @returns {{name: string, required: boolean, minSelections: number, maxSelections: number, options: Array<{name: string, price: number}>}}
 */
function toModifierGroup(optionList) {
  const options = Array.isArray(optionList.options)
    ? optionList.options.map((opt) => ({ name: opt.name || "", price: parseOptionPrice(opt.price) }))
    : [];
  return {
    name: optionList.name || "Options",
    required: optionList.isRequired || false,
    minSelections: optionList.minNumOptions || 0,
    maxSelections: optionList.maxNumOptions || 0,
    options: options
  };
}

/**
 * Records the modifier groups of one item. The first group seen under a given
 * name wins; later groups with the same name are only referenced by name.
 * @param {string} itemName
 * @param {object|null} itemPage - Captured itemPage payload, or null if none arrived.
 * @param {Map<string, object>} groupsByName - Accumulates unique groups (mutated).
 * @param {object} itemModifierMap - item name -> group names (mutated).
 * @returns {boolean} true when the item had at least one option list.
 */
function recordItemModifiers(itemName, itemPage, groupsByName, itemModifierMap) {
  const optionLists = itemPage && itemPage.optionLists;
  if (!Array.isArray(optionLists) || optionLists.length === 0) return false;

  const groupNames = [];
  for (const optionList of optionLists) {
    const groupName = optionList.name || "Options";
    if (!groupsByName.has(groupName)) {
      groupsByName.set(groupName, toModifierGroup(optionList));
    }
    groupNames.push(groupName);
  }
  itemModifierMap[itemName] = groupNames;
  return true;
}

/**
 * Runs inside the page (via page.evaluate), so it must not reference anything
 * from the Node scope. Finds clickable menu items with three strategies, each
 * tried only when the previous ones found nothing.
 * @returns {Array<{name: string, x: number, y: number}>} x/y are page coordinates of the element centre.
 */
function findMenuItemsInPage() {
  const items = [];
  const seen = new Set();

  const addCandidate = (rawName, el) => {
    const name = (rawName || "").trim();
    if (!name || seen.has(name) || name.length <= 1 || name.length >= 200) return;
    const rect = el.getBoundingClientRect();
    if (rect.width <= 0 || rect.height <= 0) return;
    // Mark as seen only once a visible element is found, so a hidden duplicate
    // cannot suppress the visible card with the same name.
    seen.add(name);
    // Add the scroll offset: these coordinates are used later as scroll targets.
    items.push({
      name,
      x: rect.x + rect.width / 2 + window.scrollX,
      y: rect.y + rect.height / 2 + window.scrollY
    });
  };

  // Strategy 1: item cards with data-anchor-id containing "MenuItem"
  document.querySelectorAll('[data-anchor-id*="MenuItem"]').forEach((el) => {
    const nameEl = el.querySelector('[data-telemetry-id="storeMenuItem.title"]') ||
      el.querySelector('span[class*="Text"]') ||
      el.querySelector('h3') ||
      el.querySelector('span');
    if (nameEl) addCandidate(nameEl.textContent, el);
  });

  // Strategy 2: buttons/divs whose text contains a price like $X.XX
  if (items.length === 0) {
    document.querySelectorAll('button, [role="button"], [data-testid*="item"], [data-testid*="menu"]').forEach((el) => {
      const text = el.textContent || "";
      if (text.match(/\$\d+\.\d{2}/) && text.length < 500) {
        const lines = text.split("\n").map((l) => l.trim()).filter((l) => l.length > 0);
        const name = lines[0];
        if (name && !name.startsWith("$")) addCandidate(name, el);
      }
    });
  }

  // Strategy 3: any clickable ancestor of a CDN image with text nearby
  if (items.length === 0) {
    document.querySelectorAll('img[src*="cdn4dd"]').forEach((img) => {
      const parent = img.closest('a, button, [role="button"], [tabindex="0"]') || img.parentElement.parentElement;
      if (parent) {
        const nameEl = parent.querySelector('span, h3, h4, p');
        if (nameEl) addCandidate(nameEl.textContent, parent);
      }
    });
  }

  return items;
}

/**
 * Runs inside the page (via page.evaluate). Returns the viewport coordinates of
 * the element to click for the given item name.
 * @param {string} itemName
 * @returns {{found: true, x: number, y: number} | {found: false}}
 */
function locateItemInPage(itemName) {
  const centerOf = (el) => {
    const rect = el.getBoundingClientRect();
    if (rect.width <= 0 || rect.height <= 0) return null;
    return { x: rect.x + rect.width / 2, y: rect.y + rect.height / 2, found: true };
  };

  // Prefer a card whose title is exactly the item name, so "Fries" does not
  // resolve to an earlier card that merely mentions fries.
  for (const card of document.querySelectorAll('[data-anchor-id*="MenuItem"]')) {
    const title = card.querySelector('[data-telemetry-id="storeMenuItem.title"]');
    if (title && title.textContent.trim() === itemName) {
      const point = centerOf(card);
      if (point) return point;
    }
  }

  // Fallback: first visible clickable element whose text contains the name.
  for (const el of document.querySelectorAll('[data-anchor-id*="MenuItem"], button, [role="button"]')) {
    if (el.textContent.includes(itemName)) {
      const point = centerOf(el);
      if (point) return point;
    }
  }
  return { found: false };
}

/**
 * Scrolls down the page one viewport at a time (at most 20 steps) so lazily
 * rendered items load, then returns to the top.
 * @param {object} page - Playwright page.
 */
async function scrollThroughPage(page) {
  const scrollHeight = await page.evaluate(() => document.body.scrollHeight);
  const viewportHeight = await page.evaluate(() => window.innerHeight);
  const scrollSteps = Math.min(Math.ceil(scrollHeight / viewportHeight), 20);

  for (let step = 1; step <= scrollSteps; step++) {
    await page.evaluate((s) => {
      window.scrollTo(0, s * window.innerHeight);
    }, step);
    await page.waitForTimeout(300);
  }
  await page.evaluate(() => window.scrollTo(0, 0));
  await page.waitForTimeout(1000);
}

/**
 * Dismisses the item modal: presses Escape, then clicks near the top-left
 * corner if an overlay element is still present.
 * @param {object} page - Playwright page.
 */
async function closeItemModal(page) {
  await page.keyboard.press("Escape");
  await page.waitForTimeout(400);

  const modalStillOpen = await page.evaluate(() => {
    const overlay = document.querySelector('[data-testid="modal-overlay"], [class*="ModalOverlay"], [class*="overlay"]');
    return !!overlay;
  });
  if (modalStillOpen) {
    await page.mouse.click(10, 10);
    await page.waitForTimeout(300);
  }
}

/**
 * Clicks one menu item, waits for its itemPage response, records its modifier
 * groups and closes the modal again.
 * @param {object} page - Playwright page.
 * @param {{itemPage: object|null}} capture - Shared slot written by the response listener.
 * @param {{name: string, y: number}} item
 * @param {Map<string, object>} groupsByName
 * @param {object} itemModifierMap
 * @param {{clicked: number, withModifiers: number}} stats - Counters (mutated).
 */
async function visitItem(page, capture, item, groupsByName, itemModifierMap, stats) {
  // Scroll so the item sits roughly 300px below the top of the viewport.
  await page.evaluate((y) => window.scrollTo(0, y - 300), item.y);
  await page.waitForTimeout(200);

  // Positions shift after scrolling, so look the element up again.
  const target = await page.evaluate(locateItemInPage, item.name);
  if (!target.found) return;

  // Reset right before the click so a late response to the previous item,
  // arriving while we scrolled, is not attributed to this one.
  capture.itemPage = null;
  await page.mouse.click(target.x, target.y);
  stats.clicked++;

  const deadline = Date.now() + ITEM_RESPONSE_TIMEOUT_MS;
  while (!capture.itemPage && Date.now() < deadline) {
    await page.waitForTimeout(150);
  }

  if (recordItemModifiers(item.name, capture.itemPage, groupsByName, itemModifierMap)) {
    stats.withModifiers++;
  }
  await closeItemModal(page);
}

/**
 * Entry point. Reads the store URL from argv[2], clicks through the menu and
 * prints one JSON object on stdout:
 *   { modifiers, itemModifierMap, stats }  on success, or
 *   { error, modifiers: [], itemModifierMap: {} }  on failure.
 * The exit code is 1 only when the URL argument is missing.
 */
async function main() {
  const url = process.argv[2];
  if (!url) {
    console.log(JSON.stringify({ error: "URL required", modifiers: [], itemModifierMap: {} }));
    process.exitCode = 1;
    return;
  }

  let browser;
  try {
    browser = await chromium.launch({ headless: true });
    const context = await browser.newContext({
      userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
      viewport: { width: 1280, height: 900 }
    });
    const page = await context.newPage();

    // Most recent itemPage payload seen on the wire; reset before every click.
    const capture = { itemPage: null };

    page.on("response", async (response) => {
      try {
        const responseUrl = response.url();
        if (!responseUrl.includes("graphql") && !responseUrl.includes("api/v2")) return;
        const contentType = response.headers()["content-type"] || "";
        if (!contentType.includes("json")) return;

        const itemPages = extractItemPages(await response.json());
        if (itemPages.length > 0) {
          capture.itemPage = itemPages[itemPages.length - 1];
        }
      } catch (e) {
        // Best effort: an unreadable body must not abort the run, but say so.
        log("Skipping unreadable response: " + e.message);
      }
    });

    log("Navigating to " + url);
    await page.goto(url, { waitUntil: "load", timeout: 60000 });
    await page.waitForTimeout(5000);

    await scrollThroughPage(page);

    const itemElements = await page.evaluate(findMenuItemsInPage);
    log("Found " + itemElements.length + " clickable items on page");

    if (itemElements.length === 0) {
      console.log(JSON.stringify({ error: "No clickable items found", modifiers: [], itemModifierMap: {} }));
      return; // the finally block closes the browser
    }

    const allModifierGroups = new Map(); // name -> modifier group data
    // Null prototype so an item named e.g. "__proto__" is stored as a normal key.
    const itemModifierMap = Object.create(null); // item name -> [modifier group names]
    const stats = { clicked: 0, withModifiers: 0 };
    const maxClicks = Math.min(itemElements.length, MAX_ITEM_CLICKS);

    for (let i = 0; i < maxClicks; i++) {
      const item = itemElements[i];
      try {
        await visitItem(page, capture, item, allModifierGroups, itemModifierMap, stats);
      } catch (e) {
        log("Error clicking " + item.name + ": " + e.message);
        try {
          await page.keyboard.press("Escape");
        } catch (e2) {
          // Already recovering from a failure; report it and move on.
          log("Could not dismiss modal: " + e2.message);
        }
        await page.waitForTimeout(300);
      }

      // Progress log every 20 items
      if ((i + 1) % 20 === 0) {
        log("Progress: " + (i + 1) + "/" + maxClicks + " clicked, " + stats.withModifiers + " with modifiers");
      }
    }

    log("Done: " + stats.clicked + " clicked, " + stats.withModifiers + " items with modifiers, " + allModifierGroups.size + " unique modifier groups");

    const modifiers = Array.from(allModifierGroups.values());
    console.log(JSON.stringify({
      modifiers: modifiers,
      itemModifierMap: itemModifierMap,
      stats: {
        clickableItems: itemElements.length,
        clickedCount: stats.clicked,
        itemsWithModifiers: stats.withModifiers,
        uniqueModifierGroups: modifiers.length
      }
    }));
  } catch (e) {
    log("Fatal error: " + e.message);
    console.log(JSON.stringify({ error: e.message, modifiers: [], itemModifierMap: {} }));
  } finally {
    // Always release the browser, including on the early return above.
    if (browser) {
      try {
        await browser.close();
      } catch (closeErr) {
        log("Error closing browser: " + closeErr.message);
      }
    }
  }
}

// Run only when executed directly (node doordash-modifiers.js <url>), so the
// helpers above can be loaded without launching a browser.
if (typeof require !== "undefined" && require.main === module) {
  main().catch((err) => {
    log("Unhandled error: " + err.message);
    process.exitCode = 1;
  });
}
|
||||
|
|
@ -10,6 +10,8 @@
|
|||
"license": "ISC",
|
||||
"description": "",
|
||||
"dependencies": {
|
||||
"playwright": "^1.58.2"
|
||||
"playwright": "^1.58.2",
|
||||
"playwright-extra": "^4.3.6",
|
||||
"puppeteer-extra-plugin-stealth": "^2.11.2"
|
||||
}
|
||||
}
|
||||
|
|
|
|||
3
playwright/run-doordash-modifiers.sh
Normal file
3
playwright/run-doordash-modifiers.sh
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
#!/bin/bash
# Wrapper for the DoorDash modifier scraper.
# Sets PLAYWRIGHT_BROWSERS_PATH to /opt/playwright/browsers, then replaces this
# shell with node running doordash-modifiers.js, forwarding every argument
# unchanged.
PLAYWRIGHT_BROWSERS_PATH=/opt/playwright/browsers
export PLAYWRIGHT_BROWSERS_PATH
exec /usr/bin/node /opt/playwright/doordash-modifiers.js "$@"
|
||||
Reference in a new issue