Add DoorDash/order.online fast-path parser
Extract menu data directly from embedded JSON in DoorDash HTML: - Categories from MenuBookCategory entries - Items with names, descriptions, prices, and image URLs from StorePageCarouselItem - Business info from page title and StoreHeaderAddress - Uses Claude to assign items to categories - Upgrades image URLs to 600px for better quality Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
dd2a508680
commit
67e2079550
1 changed files with 176 additions and 0 deletions
|
|
@ -1147,6 +1147,182 @@
|
|||
</cfif>
|
||||
<!--- ========== END WOOCOMMERCE FAST PATH ========== --->
|
||||
|
||||
<!--- ========== DOORDASH / ORDER.ONLINE FAST PATH ========== --->
|
||||
<cfif findNoCase("StorePageCarouselItem", pageHtml) AND findNoCase("MenuBookCategory", pageHtml)>
|
||||
<cfset arrayAppend(response.steps, "DoorDash/order.online site detected - extracting embedded data")>
|
||||
<cftry>
|
||||
<!---
    Collect menu category names from the embedded MenuBookCategory records.
    The page HTML carries this data as escaped JSON, e.g.
        \"__typename\":\"MenuBookCategory\",\"id\":\"...\",\"name\":\"...\",\"numItems\":N
    so every quote in the patterns below is preceded by a literal backslash.
    Result: ddCategories, an array of { name, parentCategoryName } structs in page order.
--->
<cfset ddCatMatches = reMatchNoCase('\\"__typename\\":\\"MenuBookCategory\\",\\"id\\":\\"([^\\"]+)\\",\\"name\\":\\"([^\\"]+)\\",\\"numItems\\":(\d+)', pageHtml)>
<cfset ddCategories = arrayNew(1)>
<!--- Names already emitted; the same category can appear more than once in the page data --->
<cfset ddCatSeen = structNew()>
<cfloop array="#ddCatMatches#" index="ddCatMatch">
    <!--- reMatchNoCase returns whole matches, so the name is pulled back out of each one --->
    <cfset ddCatName = reReplaceNoCase(ddCatMatch, '.*\\"name\\":\\"([^\\"]+)\\".*', '\1')>
    <!--- Decode an ampersand written as a JSON unicode escape --->
    <cfset ddCatName = replace(ddCatName, '\u0026', '&', 'all')>
    <!--- Decode the HTML entity form as well (this replace used to map '&' to itself, a no-op) --->
    <cfset ddCatName = replace(ddCatName, '&amp;', '&', 'all')>
    <!--- "Most Ordered" is skipped; presumably a generated highlights row rather than a real menu section (confirm) --->
    <cfif NOT structKeyExists(ddCatSeen, ddCatName) AND ddCatName NEQ "Most Ordered">
        <cfset ddCatSeen[ddCatName] = true>
        <cfset arrayAppend(ddCategories, { "name": ddCatName, "parentCategoryName": "" })>
    </cfif>
</cfloop>
<cfset arrayAppend(response.steps, "Found " & arrayLen(ddCategories) & " DoorDash categories")>
|
||||
|
||||
<!---
    Build ddItems from the embedded StorePageCarouselItem records (escaped JSON, same
    quoting convention as the category records). Each match is expected to contain, in
    order: id, name, description, displayPrice, displayStrikethroughPrice, imgUrl.
    Items are de-duplicated by decoded name; the first occurrence wins.
--->
<cfset ddItemMatches = reMatchNoCase('\\"__typename\\":\\"StorePageCarouselItem\\",\\"id\\":\\"(\d+)\\",\\"name\\":\\"([^\\"]+)\\",\\"description\\":\\"([^\\"]*)\\",\\"displayPrice\\":\\"([^\\"]*)\\",\\"displayStrikethroughPrice\\":\\"[^\\"]*\\",\\"imgUrl\\":\\"([^\\"]*?)\\"', pageHtml)>
<cfset ddItems = arrayNew(1)>
<cfset ddItemSeen = structNew()>
<cfloop array="#ddItemMatches#" index="ddItemMatch">
    <!--- Name: decode both ampersand encodings (the entity replace used to be a no-op) --->
    <cfset ddItemName = reReplaceNoCase(ddItemMatch, '.*\\"name\\":\\"([^\\"]+)\\".*', '\1')>
    <cfset ddItemName = replace(ddItemName, '\u0026', '&', 'all')>
    <cfset ddItemName = replace(ddItemName, '&amp;', '&', 'all')>
    <cfif structKeyExists(ddItemSeen, ddItemName)>
        <cfcontinue>
    </cfif>
    <cfset ddItemSeen[ddItemName] = true>

    <!--- Description: may be empty; decoded the same way as the name for consistency --->
    <cfset ddItemDesc = reReplaceNoCase(ddItemMatch, '.*\\"description\\":\\"([^\\"]*)\\"\s*,\s*\\"displayPrice.*', '\1')>
    <cfset ddItemDesc = replace(ddItemDesc, '\u0026', '&', 'all')>
    <cfset ddItemDesc = replace(ddItemDesc, '&amp;', '&', 'all')>

    <!--- Price: keep digits and the decimal point only, dropping currency symbols and suffixes --->
    <cfset ddItemPrice = reReplaceNoCase(ddItemMatch, '.*\\"displayPrice\\":\\"([^\\"]*)\\"\s*,.*', '\1')>
    <cfset ddItemPrice = reReplace(ddItemPrice, '[^0-9.]', '', 'all')>

    <cfset ddItemImg = reReplaceNoCase(ddItemMatch, '.*\\"imgUrl\\":\\"([^\\"]*)\\"\s*,?.*', '\1')>
    <!--- Request a 600px rendition when the URL carries width/height parameters --->
    <cfif len(ddItemImg) AND findNoCase("width=", ddItemImg)>
        <cfset ddItemImg = reReplaceNoCase(ddItemImg, 'width=\d+', 'width=600')>
        <cfset ddItemImg = reReplaceNoCase(ddItemImg, 'height=\d+', 'height=600')>
    </cfif>

    <cfset ddItem = structNew()>
    <cfset ddItem["name"] = ddItemName>
    <cfset ddItem["description"] = ddItemDesc>
    <!--- val() yields 0 when no numeric price text was found --->
    <cfset ddItem["price"] = val(ddItemPrice)>
    <cfset ddItem["imageUrl"] = ddItemImg>
    <cfset ddItem["imageSrc"] = ddItemImg>
    <!--- imageFilename is only set when the item has an image --->
    <cfif len(ddItemImg)>
        <cfset ddItem["imageFilename"] = listLast(ddItemImg, "/")>
    </cfif>
    <!--- Category is left blank here; it is filled in later if category assignment succeeds --->
    <cfset ddItem["category"] = "">
    <cfset ddItem["modifiers"] = arrayNew(1)>
    <!--- Sequential id (item_1, item_2, ...); parentheses make the arithmetic-before-concatenation order explicit --->
    <cfset ddItem["id"] = "item_" & (arrayLen(ddItems) + 1)>
    <cfset arrayAppend(ddItems, ddItem)>
</cfloop>
<cfset arrayAppend(response.steps, "Found " & arrayLen(ddItems) & " DoorDash items with images")>
|
||||
|
||||
<!---
    Carousel items carry no category reference of their own, so no item-to-category
    mapping is derived from the HTML; that mapping is requested from Claude once the
    business info below has been collected.
--->

<!--- Extract business info into ddBusiness (keys are only set when a value was found) --->
<cfset ddBusiness = structNew()>

<!--- Business name: text of the page title, cut at the first " - " or " | " separator --->
<cfset ddTitleMatch = reMatchNoCase('<title>([^<]+)</title>', pageHtml)>
<cfif arrayLen(ddTitleMatch)>
    <cfset ddTitle = reReplaceNoCase(ddTitleMatch[1], '<title>([^<]+)</title>', '\1')>
    <!---
        The separator must be surrounded by whitespace. Cutting at any bare hyphen
        truncated hyphenated names (for example "Chick-fil-A" became "Chick").
    --->
    <cfset ddTitle = reReplace(ddTitle, '\s+[-|]\s.*', '')>
    <!--- Title text comes straight from HTML, so decode the ampersand entity --->
    <cfset ddTitle = replace(ddTitle, '&amp;', '&', 'all')>
    <cfset ddTitle = trim(ddTitle)>
    <cfif len(ddTitle)>
        <cfset ddBusiness["name"] = ddTitle>
    </cfif>
</cfif>

<!--- Address: displayAddress field of the embedded StoreHeaderAddress record (escaped JSON) --->
<cfset ddAddrMatch = reMatchNoCase('\\"__typename\\":\\"StoreHeaderAddress\\",\\"street\\":\\"([^\\"]+)\\",\\"displayAddress\\":\\"([^\\"]+)\\"', pageHtml)>
<cfif arrayLen(ddAddrMatch)>
    <cfset ddAddr = reReplaceNoCase(ddAddrMatch[1], '.*\\"displayAddress\\":\\"([^\\"]+)\\".*', '\1')>
    <cfset ddBusiness["address"] = ddAddr>
</cfif>
|
||||
|
||||
<cfif arrayLen(ddItems) GT 0>
    <!---
        At least one item was extracted: ask Claude to map items to categories, then
        emit the JSON response and stop the request. Category assignment is best-effort;
        any failure is recorded in response.steps and items keep an empty category.
    --->
    <cfset ddCatNames = arrayNew(1)>
    <cfloop array="#ddCategories#" index="ddCat">
        <cfset arrayAppend(ddCatNames, ddCat.name)>
    </cfloop>
    <!--- One prompt line per item: name plus the first 60 characters of its description --->
    <cfset ddItemNames = arrayNew(1)>
    <cfloop array="#ddItems#" index="ddI">
        <cfset arrayAppend(ddItemNames, ddI.name & " - " & left(ddI.description, 60))>
    </cfloop>

    <cfset ddCatPrompt = "Given these restaurant menu categories: " & arrayToList(ddCatNames, ", ") & chr(10) & chr(10) & "Assign each of these items to the best matching category. Return ONLY a JSON array of category names in the same order as the items:" & chr(10) & serializeJSON(ddItemNames)>

    <cfset ddCatRequest = {
        "model": "claude-sonnet-4-20250514",
        "max_tokens": 4096,
        "temperature": 0,
        "messages": [{ "role": "user", "content": ddCatPrompt }]
    }>

    <cftry>
        <cfhttp url="https://api.anthropic.com/v1/messages" method="POST" timeout="60" result="ddCatResult">
            <cfhttpparam type="header" name="Content-Type" value="application/json">
            <cfhttpparam type="header" name="x-api-key" value="#CLAUDE_API_KEY#">
            <cfhttpparam type="header" name="anthropic-version" value="2023-06-01">
            <cfhttpparam type="body" value="#serializeJSON(ddCatRequest)#">
        </cfhttp>

        <!--- Compare the leading status code only, so "200" elsewhere in the status text cannot count as success --->
        <cfif left(trim(ddCatResult.statusCode), 3) EQ "200">
            <cfset ddCatResponse = deserializeJSON(ddCatResult.fileContent)>
            <cfset ddCatText = trim(ddCatResponse.content[1].text)>
            <!--- Strip a leading ```json or ``` fence and a trailing ``` fence, if present --->
            <cfif left(ddCatText, 7) EQ "```json">
                <cfset ddCatText = mid(ddCatText, 8, len(ddCatText) - 7)>
            </cfif>
            <cfif left(ddCatText, 3) EQ "```">
                <cfset ddCatText = mid(ddCatText, 4, len(ddCatText) - 3)>
            </cfif>
            <cfif right(ddCatText, 3) EQ "```">
                <cfset ddCatText = left(ddCatText, len(ddCatText) - 3)>
            </cfif>
            <cfset ddCatText = trim(ddCatText)>
            <cfset ddCatAssignments = deserializeJSON(ddCatText)>
            <!--- Assignments are positional, so they are only usable when there is exactly one per item --->
            <cfif isArray(ddCatAssignments) AND arrayLen(ddCatAssignments) EQ arrayLen(ddItems)>
                <cfloop from="1" to="#arrayLen(ddItems)#" index="ddIdx">
                    <!--- Ignore non-string entries (objects, arrays) rather than storing them as a category --->
                    <cfif isSimpleValue(ddCatAssignments[ddIdx])>
                        <cfset ddItems[ddIdx]["category"] = ddCatAssignments[ddIdx]>
                    </cfif>
                </cfloop>
                <cfset arrayAppend(response.steps, "Claude assigned categories to all items")>
            <cfelse>
                <cfset arrayAppend(response.steps, "Category assignment ignored: Claude response was not an array with one entry per item")>
            </cfif>
        <cfelse>
            <cfset arrayAppend(response.steps, "Category assignment skipped: Claude API returned " & ddCatResult.statusCode)>
        </cfif>
        <cfcatch>
            <cfset arrayAppend(response.steps, "Category assignment failed: " & cfcatch.message)>
        </cfcatch>
    </cftry>

    <!--- Build image URL list (items without an image are left out) --->
    <cfset ddImageUrls = arrayNew(1)>
    <cfloop array="#ddItems#" index="ddI">
        <cfif len(ddI.imageUrl)>
            <cfset arrayAppend(ddImageUrls, ddI.imageUrl)>
        </cfif>
    </cfloop>

    <cfset menuData = {
        "business": ddBusiness,
        "categories": ddCategories,
        "modifiers": arrayNew(1),
        "items": ddItems,
        "imageUrls": ddImageUrls,
        "headerCandidateIndices": arrayNew(1)
    }>

    <cfset response["OK"] = true>
    <cfset response["DATA"] = menuData>
    <cfset response["sourceUrl"] = targetUrl>
    <cfset response["parsedVia"] = "doordash_embedded">
    <cfset response["imagesFound"] = arrayLen(ddImageUrls)>
    <cfset response["playwrightImagesCount"] = arrayLen(playwrightImages)>
    <!--- Send the JSON response and end the request here; nothing after this block runs on the fast path --->
    <cfcontent type="application/json" reset="true">
    <cfoutput>#serializeJSON(response)#</cfoutput>
    <cfabort>
</cfif>
|
||||
<cfcatch>
|
||||
<cfset arrayAppend(response.steps, "DoorDash extraction failed: " & cfcatch.message & " - falling through to Claude")>
|
||||
</cfcatch>
|
||||
</cftry>
|
||||
</cfif>
|
||||
<!--- ========== END DOORDASH FAST PATH ========== --->
|
||||
|
||||
<!--- Extract base URL for resolving relative links --->
|
||||
<!--- baseUrl: scheme and host only; everything after the host is dropped. targetUrl is left unchanged if the pattern does not match. --->
<cfset baseUrl = reReplace(targetUrl, "(https?://[^/]+).*", "\1")>
|
||||
<!--- basePath: scheme, host and path; text from the first "?" onward is dropped. The pattern needs a "/" after the host, otherwise targetUrl is left unchanged. --->
<cfset basePath = reReplace(targetUrl, "(https?://[^/]+/[^?]*/?).*", "\1")>
|
||||
|
|
|
|||
Reference in a new issue