Add DoorDash/order.online fast-path parser

Extract menu data directly from embedded JSON in DoorDash HTML:
- Categories from MenuBookCategory entries
- Items with names, descriptions, prices, and image URLs from StorePageCarouselItem
- Business info from page title and StoreHeaderAddress
- Uses Claude to assign items to categories
- Upgrades image URLs to 600px for better quality

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
John Mizerek 2026-03-10 11:44:24 -07:00
parent dd2a508680
commit 67e2079550

View file

@ -1147,6 +1147,182 @@
</cfif>
<!--- ========== END WOOCOMMERCE FAST PATH ========== --->
<!--- ========== DOORDASH / ORDER.ONLINE FAST PATH ========== --->
<!---
    Fast path for DoorDash / order.online storefronts.

    These pages carry their menu as escaped JSON inside the HTML
    (\"__typename\":\"MenuBookCategory\" and so on), so categories, items and the
    store address are extracted with regular expressions. Claude is only asked to
    map the extracted items onto the extracted category names.

    Reads:  pageHtml, targetUrl, playwrightImages, CLAUDE_API_KEY
    Writes: response.steps, and on success response.OK / DATA / sourceUrl /
            parsedVia / imagesFound / playwrightImagesCount

    When at least one item is found the JSON response is written and the request
    is aborted. When no items are found, or extraction throws, execution continues
    with the code after this block.
--->
<cfif findNoCase("StorePageCarouselItem", pageHtml) AND findNoCase("MenuBookCategory", pageHtml)>
    <cfset arrayAppend(response.steps, "DoorDash/order.online site detected - extracting embedded data")>
    <cftry>
        <!---
            The field patterns below use [^\\"] (no backslash, no quote), so a value that
            still contains the \u0026 escape for "&" would never match and the whole entry
            would be skipped. Decode that escape once, on a working copy, before matching.
            Values containing other backslash escapes are still skipped.
        --->
        <cfset ddHtml = replace(pageHtml, '\u0026', '&', 'all')>

        <!--- ---------- Categories (MenuBookCategory) ---------- --->
        <cfset ddCatMatches = reMatchNoCase('\\"__typename\\":\\"MenuBookCategory\\",\\"id\\":\\"([^\\"]+)\\",\\"name\\":\\"([^\\"]+)\\",\\"numItems\\":(\d+)', ddHtml)>
        <cfset ddCategories = arrayNew(1)>
        <!--- Maps a category name (struct keys are case-insensitive) to its canonical spelling --->
        <cfset ddCatSeen = structNew()>
        <cfloop array="#ddCatMatches#" index="ddCatMatch">
            <cfset ddCatName = reReplaceNoCase(ddCatMatch, '.*\\"name\\":\\"([^\\"]+)\\".*', '\1')>
            <cfset ddCatName = replace(ddCatName, '&amp;', '&', 'all')>
            <!--- Keep the first occurrence of each name; "Most Ordered" is deliberately excluded --->
            <cfif NOT structKeyExists(ddCatSeen, ddCatName) AND ddCatName NEQ "Most Ordered">
                <cfset ddCatSeen[ddCatName] = ddCatName>
                <cfset arrayAppend(ddCategories, { "name": ddCatName, "parentCategoryName": "" })>
            </cfif>
        </cfloop>
        <cfset arrayAppend(response.steps, "Found " & arrayLen(ddCategories) & " DoorDash categories")>

        <!--- ---------- Items (StorePageCarouselItem) ---------- --->
        <cfset ddItemMatches = reMatchNoCase('\\"__typename\\":\\"StorePageCarouselItem\\",\\"id\\":\\"(\d+)\\",\\"name\\":\\"([^\\"]+)\\",\\"description\\":\\"([^\\"]*)\\",\\"displayPrice\\":\\"([^\\"]*)\\",\\"displayStrikethroughPrice\\":\\"[^\\"]*\\",\\"imgUrl\\":\\"([^\\"]*?)\\"', ddHtml)>
        <cfset ddItems = arrayNew(1)>
        <cfset ddItemSeen = structNew()>
        <cfloop array="#ddItemMatches#" index="ddItemMatch">
            <cfset ddItemName = reReplaceNoCase(ddItemMatch, '.*\\"name\\":\\"([^\\"]+)\\".*', '\1')>
            <cfset ddItemName = replace(ddItemName, '&amp;', '&', 'all')>
            <!--- The same item can be embedded more than once; keep the first occurrence per name --->
            <cfif structKeyExists(ddItemSeen, ddItemName)>
                <cfcontinue>
            </cfif>
            <cfset ddItemSeen[ddItemName] = true>

            <cfset ddItemDesc = reReplaceNoCase(ddItemMatch, '.*\\"description\\":\\"([^\\"]*)\\"\s*,\s*\\"displayPrice.*', '\1')>
            <cfset ddItemPrice = reReplaceNoCase(ddItemMatch, '.*\\"displayPrice\\":\\"([^\\"]*)\\"\s*,.*', '\1')>
            <!--- Drop currency symbols and separators so val() sees a plain number --->
            <cfset ddItemPrice = reReplace(ddItemPrice, '[^0-9.]', '', 'all')>
            <cfset ddItemImg = reReplaceNoCase(ddItemMatch, '.*\\"imgUrl\\":\\"([^\\"]*)\\"\s*,?.*', '\1')>

            <!--- Upgrade image to larger size: rewrite the first width= / height= values to 600 --->
            <cfif len(ddItemImg) AND findNoCase("width=", ddItemImg)>
                <cfset ddItemImg = reReplaceNoCase(ddItemImg, 'width=\d+', 'width=600')>
                <cfset ddItemImg = reReplaceNoCase(ddItemImg, 'height=\d+', 'height=600')>
            </cfif>

            <cfset ddItem = structNew()>
            <cfset ddItem["name"] = ddItemName>
            <cfset ddItem["description"] = ddItemDesc>
            <cfset ddItem["price"] = val(ddItemPrice)>
            <cfset ddItem["imageUrl"] = ddItemImg>
            <cfset ddItem["imageSrc"] = ddItemImg>
            <cfif len(ddItemImg)>
                <!--- Last path segment, without any query string --->
                <cfset ddItem["imageFilename"] = listFirst(listLast(ddItemImg, "/"), "?")>
            </cfif>
            <!--- Category stays blank here; it is filled in by the Claude step below --->
            <cfset ddItem["category"] = "">
            <cfset ddItem["modifiers"] = arrayNew(1)>
            <!--- 1-based sequential id: item_1, item_2, ... --->
            <cfset ddItem["id"] = "item_" & (arrayLen(ddItems) + 1)>
            <cfset arrayAppend(ddItems, ddItem)>
        </cfloop>
        <cfset arrayAppend(response.steps, "Found " & arrayLen(ddItems) & " DoorDash items with images")>

        <!--- ---------- Business info ---------- --->
        <cfset ddBusiness = structNew()>
        <!--- Business name from the page title (the title tag may carry attributes) --->
        <cfset ddTitleMatch = reMatchNoCase('<title[^>]*>([^<]+)</title>', pageHtml)>
        <cfif arrayLen(ddTitleMatch)>
            <cfset ddTitle = reReplaceNoCase(ddTitleMatch[1], '<title[^>]*>([^<]+)</title>', '\1')>
            <cfset ddTitle = replace(ddTitle, '&amp;', '&', 'all')>
            <!---
                Strip the trailing site suffix: everything from the first "|", or from the
                first hyphen that has whitespace on both sides. A hyphen inside a word is
                kept so names such as "Chick-fil-A" are not truncated.
            --->
            <cfset ddTitle = reReplace(ddTitle, '\s*\|.*|\s+-\s.*', '')>
            <cfset ddTitle = trim(ddTitle)>
            <cfif len(ddTitle)>
                <cfset ddBusiness["name"] = ddTitle>
            </cfif>
        </cfif>
        <!--- Address from StoreHeaderAddress (displayAddress field) --->
        <cfset ddAddrMatch = reMatchNoCase('\\"__typename\\":\\"StoreHeaderAddress\\",\\"street\\":\\"([^\\"]+)\\",\\"displayAddress\\":\\"([^\\"]+)\\"', ddHtml)>
        <cfif arrayLen(ddAddrMatch)>
            <cfset ddAddr = reReplaceNoCase(ddAddrMatch[1], '.*\\"displayAddress\\":\\"([^\\"]+)\\".*', '\1')>
            <cfset ddBusiness["address"] = ddAddr>
        </cfif>

        <cfif arrayLen(ddItems) GT 0>
            <!---
                The embedded carousel items carry no category reference, so Claude is asked
                to map each item onto one of the extracted category names. Any failure here
                is logged and leaves the affected items with a blank category.
            --->
            <cfif arrayLen(ddCategories) GT 0>
                <cfset ddCatNames = arrayNew(1)>
                <cfloop array="#ddCategories#" index="ddCat">
                    <cfset arrayAppend(ddCatNames, ddCat.name)>
                </cfloop>
                <cfset ddItemNames = arrayNew(1)>
                <cfloop array="#ddItems#" index="ddI">
                    <cfset arrayAppend(ddItemNames, ddI.name & " - " & left(ddI.description, 60))>
                </cfloop>
                <!--- Categories are sent as a JSON array so names containing commas stay unambiguous --->
                <cfset ddCatPrompt = "Given these restaurant menu categories: " & serializeJSON(ddCatNames) & chr(10) & chr(10) & "Assign each of these items to the best matching category. Use the category names exactly as written. Return ONLY a JSON array of category names in the same order as the items:" & chr(10) & serializeJSON(ddItemNames)>
                <cfset ddCatRequest = {
                    "model": "claude-sonnet-4-20250514",
                    "max_tokens": 4096,
                    "temperature": 0,
                    "messages": [{ "role": "user", "content": ddCatPrompt }]
                }>
                <cftry>
                    <cfhttp url="https://api.anthropic.com/v1/messages" method="POST" timeout="60" result="ddCatResult">
                        <cfhttpparam type="header" name="Content-Type" value="application/json">
                        <cfhttpparam type="header" name="x-api-key" value="#CLAUDE_API_KEY#">
                        <cfhttpparam type="header" name="anthropic-version" value="2023-06-01">
                        <cfhttpparam type="body" value="#serializeJSON(ddCatRequest)#">
                    </cfhttp>
                    <!--- statusCode looks like "200 OK"; val() reads the leading number --->
                    <cfif val(ddCatResult.statusCode) EQ 200>
                        <cfset ddCatResponse = deserializeJSON(ddCatResult.fileContent)>
                        <cfset ddCatText = trim(ddCatResponse.content[1].text)>
                        <!---
                            Keep only the span from the first "[" to the last "]". This removes
                            code fences and any prose placed around the array.
                        --->
                        <cfset ddArrStart = find("[", ddCatText)>
                        <cfset ddArrTail = find("]", reverse(ddCatText))>
                        <cfif ddArrStart GT 0 AND ddArrTail GT 0>
                            <cfset ddArrEnd = len(ddCatText) - ddArrTail + 1>
                            <cfif ddArrEnd GT ddArrStart>
                                <cfset ddCatText = mid(ddCatText, ddArrStart, ddArrEnd - ddArrStart + 1)>
                            </cfif>
                        </cfif>
                        <cfset ddCatAssignments = deserializeJSON(ddCatText)>
                        <cfif isArray(ddCatAssignments) AND arrayLen(ddCatAssignments) EQ arrayLen(ddItems)>
                            <cfset ddUnmatched = 0>
                            <cfloop from="1" to="#arrayLen(ddItems)#" index="ddIdx">
                                <cfset ddAssigned = "">
                                <cfif arrayIsDefined(ddCatAssignments, ddIdx) AND isSimpleValue(ddCatAssignments[ddIdx])>
                                    <cfset ddAssigned = trim(ddCatAssignments[ddIdx])>
                                </cfif>
                                <!--- Accept only names that were actually extracted; store the canonical spelling --->
                                <cfif len(ddAssigned) AND structKeyExists(ddCatSeen, ddAssigned)>
                                    <cfset ddItems[ddIdx]["category"] = ddCatSeen[ddAssigned]>
                                <cfelse>
                                    <cfset ddUnmatched = ddUnmatched + 1>
                                </cfif>
                            </cfloop>
                            <cfif ddUnmatched EQ 0>
                                <cfset arrayAppend(response.steps, "Claude assigned categories to all items")>
                            <cfelse>
                                <cfset arrayAppend(response.steps, "Claude assigned categories to " & (arrayLen(ddItems) - ddUnmatched) & " of " & arrayLen(ddItems) & " items")>
                            </cfif>
                        <cfelse>
                            <cfset arrayAppend(response.steps, "Category assignment skipped: Claude response did not match the item list")>
                        </cfif>
                    <cfelse>
                        <cfset arrayAppend(response.steps, "Category assignment skipped: Claude API returned " & ddCatResult.statusCode)>
                    </cfif>
                    <cfcatch>
                        <cfset arrayAppend(response.steps, "Category assignment failed: " & cfcatch.message)>
                    </cfcatch>
                </cftry>
            <cfelse>
                <cfset arrayAppend(response.steps, "No DoorDash categories found - skipping category assignment")>
            </cfif>

            <!--- Build image URL list (items without an image are left out) --->
            <cfset ddImageUrls = arrayNew(1)>
            <cfloop array="#ddItems#" index="ddI">
                <cfif len(ddI.imageUrl)>
                    <cfset arrayAppend(ddImageUrls, ddI.imageUrl)>
                </cfif>
            </cfloop>

            <cfset menuData = {
                "business": ddBusiness,
                "categories": ddCategories,
                "modifiers": arrayNew(1),
                "items": ddItems,
                "imageUrls": ddImageUrls,
                "headerCandidateIndices": arrayNew(1)
            }>
            <cfset response["OK"] = true>
            <cfset response["DATA"] = menuData>
            <cfset response["sourceUrl"] = targetUrl>
            <cfset response["parsedVia"] = "doordash_embedded">
            <cfset response["imagesFound"] = arrayLen(ddImageUrls)>
            <cfset response["playwrightImagesCount"] = arrayLen(playwrightImages)>
            <!--- Send the result and stop the request here; nothing after this block runs --->
            <cfcontent type="application/json" reset="true">
            <cfoutput>#serializeJSON(response)#</cfoutput>
            <cfabort>
        </cfif>
        <cfcatch>
            <cfset arrayAppend(response.steps, "DoorDash extraction failed: " & cfcatch.message & " - falling through to Claude")>
        </cfcatch>
    </cftry>
</cfif>
<!--- ========== END DOORDASH FAST PATH ========== --->
<!--- Extract base URL for resolving relative links --->
<!--- baseUrl: scheme and host only, i.e. everything before the first "/" that follows the host. --->
<cfset baseUrl = reReplace(targetUrl, "(https?://[^/]+).*", "\1")>
<!---
    basePath: scheme, host and path up to (not including) the first "?".
    The pattern needs a "/" after the host; when targetUrl has none, nothing matches
    and basePath is targetUrl unchanged.
    NOTE(review): the captured path keeps the last path segment (it is not trimmed
    back to the containing directory) - confirm this is what the code using basePath expects.
--->
<cfset basePath = reReplace(targetUrl, "(https?://[^/]+/[^?]*/?).*", "\1")>