Fix DoorDash parser: use find() loops instead of listToArray
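listToArray treats each character of its delimiter argument as a separate delimiter instead of matching the delimiter as a whole string, so the multi-character markers split the page in the wrong places. The parser is rewritten to walk the HTML with position-based find() calls, which split correctly on multi-character delimiters. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>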
This commit is contained in:
parent
33040c9cd3
commit
a830a0820a
1 changed file with 105 additions and 68 deletions
|
|
@ -1152,127 +1152,160 @@
|
||||||
<cfset arrayAppend(response.steps, "DoorDash/order.online site detected - extracting embedded data")>
|
<cfset arrayAppend(response.steps, "DoorDash/order.online site detected - extracting embedded data")>
|
||||||
<cftry>
|
<cftry>
|
||||||
<!--- DoorDash embeds menu data as escaped JSON in script tags --->
|
<!--- DoorDash embeds menu data as escaped JSON in script tags --->
|
||||||
<!--- The backslash-quote delimiter used throughout --->
|
<!--- Backslash-quote as it appears in the HTML: chr(92) & chr(34) --->
|
||||||
<cfset BQ = '\"'><!--- literal backslash-quote as it appears in the HTML --->
|
<cfset BQ = chr(92) & chr(34)>
|
||||||
|
|
||||||
<!--- Build an image map from StorePageCarouselItem entries (these have imgUrl) --->
|
<!--- Extraction pattern used inline below: locate a \"key\":\" marker, then read the value that follows it --->
|
||||||
|
<!--- The extracted value is the text between the opening \" and the next closing \" --->
|
||||||
|
|
||||||
|
<!--- Build image map from StorePageCarouselItem entries --->
|
||||||
<cfset ddImageMap = structNew()>
|
<cfset ddImageMap = structNew()>
|
||||||
<cfset ddCarouselParts = listToArray(pageHtml, BQ & "__typename" & BQ & ":" & BQ & "StorePageCarouselItem" & BQ & "," & BQ & "id" & BQ & ":" & BQ)>
|
<cfset carouselMarker = BQ & "__typename" & BQ & ":" & BQ & "StorePageCarouselItem" & BQ>
|
||||||
<cfloop from="2" to="#arrayLen(ddCarouselParts)#" index="cpIdx">
|
<cfset searchPos = 1>
|
||||||
<cfset cp = ddCarouselParts[cpIdx]>
|
<cfloop condition="true">
|
||||||
|
<cfset searchPos = findNoCase(carouselMarker, pageHtml, searchPos)>
|
||||||
|
<cfif searchPos EQ 0><cfbreak></cfif>
|
||||||
|
<!--- Find the end of this entry (next typename marker or reasonable limit) --->
|
||||||
|
<cfset nextMarker = findNoCase(BQ & "__typename" & BQ, pageHtml, searchPos + len(carouselMarker))>
|
||||||
|
<cfif nextMarker EQ 0><cfset nextMarker = len(pageHtml)></cfif>
|
||||||
|
<cfset entryText = mid(pageHtml, searchPos, nextMarker - searchPos)>
|
||||||
|
|
||||||
<!--- Extract name --->
|
<!--- Extract name --->
|
||||||
<cfset cpNameStart = findNoCase(BQ & "name" & BQ & ":" & BQ, cp)>
|
<cfset nameKey = BQ & "name" & BQ & ":" & BQ>
|
||||||
<cfif cpNameStart GT 0>
|
<cfset nPos = findNoCase(nameKey, entryText)>
|
||||||
<cfset cpNameStart = cpNameStart + len(BQ & "name" & BQ & ":" & BQ)>
|
<cfif nPos GT 0>
|
||||||
<cfset cpNameEnd = find(BQ, cp, cpNameStart)>
|
<cfset nStart = nPos + len(nameKey)>
|
||||||
<cfif cpNameEnd GT cpNameStart>
|
<cfset nEnd = find(BQ, entryText, nStart)>
|
||||||
<cfset cpName = mid(cp, cpNameStart, cpNameEnd - cpNameStart)>
|
<cfif nEnd GT nStart>
|
||||||
|
<cfset cpName = mid(entryText, nStart, nEnd - nStart)>
|
||||||
<!--- Extract imgUrl --->
|
<!--- Extract imgUrl --->
|
||||||
<cfset cpImgStart = findNoCase(BQ & "imgUrl" & BQ & ":" & BQ, cp)>
|
<cfset imgKey = BQ & "imgUrl" & BQ & ":" & BQ>
|
||||||
<cfif cpImgStart GT 0>
|
<cfset iPos = findNoCase(imgKey, entryText)>
|
||||||
<cfset cpImgStart = cpImgStart + len(BQ & "imgUrl" & BQ & ":" & BQ)>
|
<cfif iPos GT 0>
|
||||||
<cfset cpImgEnd = find(BQ, cp, cpImgStart)>
|
<cfset iStart = iPos + len(imgKey)>
|
||||||
<cfif cpImgEnd GT cpImgStart>
|
<cfset iEnd = find(BQ, entryText, iStart)>
|
||||||
<cfset cpImgUrl = mid(cp, cpImgStart, cpImgEnd - cpImgStart)>
|
<cfif iEnd GT iStart>
|
||||||
<cfif len(cpImgUrl) AND cpImgUrl NEQ "null">
|
<cfset cpImg = mid(entryText, iStart, iEnd - iStart)>
|
||||||
<!--- Upgrade to larger size --->
|
<cfif len(cpImg) AND cpImg NEQ "null" AND findNoCase("http", cpImg)>
|
||||||
<cfif findNoCase("width=", cpImgUrl)>
|
<cfif findNoCase("width=", cpImg)>
|
||||||
<cfset cpImgUrl = reReplaceNoCase(cpImgUrl, 'width=\d+', 'width=600')>
|
<cfset cpImg = reReplaceNoCase(cpImg, 'width=\d+', 'width=600')>
|
||||||
<cfset cpImgUrl = reReplaceNoCase(cpImgUrl, 'height=\d+', 'height=600')>
|
<cfset cpImg = reReplaceNoCase(cpImg, 'height=\d+', 'height=600')>
|
||||||
</cfif>
|
</cfif>
|
||||||
<cfset ddImageMap[cpName] = cpImgUrl>
|
<cfset ddImageMap[cpName] = cpImg>
|
||||||
</cfif>
|
</cfif>
|
||||||
</cfif>
|
</cfif>
|
||||||
</cfif>
|
</cfif>
|
||||||
</cfif>
|
</cfif>
|
||||||
</cfif>
|
</cfif>
|
||||||
|
<cfset searchPos = searchPos + len(carouselMarker)>
|
||||||
</cfloop>
|
</cfloop>
|
||||||
<cfset arrayAppend(response.steps, "Built image map with " & structCount(ddImageMap) & " entries from carousel")>
|
<cfset arrayAppend(response.steps, "Built image map with " & structCount(ddImageMap) & " entries from carousel")>
|
||||||
|
|
||||||
<!--- Extract full menu from MenuPageItemList (categories) and MenuPageItem (items) --->
|
<!--- Extract full menu from MenuPageItemList (categories with items) --->
|
||||||
<cfset ddCategories = arrayNew(1)>
|
<cfset ddCategories = arrayNew(1)>
|
||||||
<cfset ddCatSeen = structNew()>
|
<cfset ddCatSeen = structNew()>
|
||||||
<cfset ddItems = arrayNew(1)>
|
<cfset ddItems = arrayNew(1)>
|
||||||
<cfset ddItemSeen = structNew()>
|
<cfset ddItemSeen = structNew()>
|
||||||
<cfset ddItemCounter = 0>
|
<cfset ddItemCounter = 0>
|
||||||
|
|
||||||
<!--- Split on MenuPageItemList to get category sections --->
|
<cfset catMarker = BQ & "__typename" & BQ & ":" & BQ & "MenuPageItemList" & BQ>
|
||||||
<cfset ddCatParts = listToArray(pageHtml, BQ & "__typename" & BQ & ":" & BQ & "MenuPageItemList" & BQ & "," & BQ & "id" & BQ & ":" & BQ)>
|
<cfset itemMarker = BQ & "__typename" & BQ & ":" & BQ & "MenuPageItem" & BQ>
|
||||||
<cfloop from="2" to="#arrayLen(ddCatParts)#" index="catIdx">
|
<cfset nameKey = BQ & "name" & BQ & ":" & BQ>
|
||||||
<cfset catPart = ddCatParts[catIdx]>
|
<cfset descKey = BQ & "description" & BQ & ":" & BQ>
|
||||||
|
<cfset priceKey = BQ & "displayPrice" & BQ & ":" & BQ>
|
||||||
|
<cfset imgKey = BQ & "imgUrl" & BQ & ":" & BQ>
|
||||||
|
|
||||||
|
<!--- Find each MenuPageItemList section --->
|
||||||
|
<cfset catPos = 1>
|
||||||
|
<cfloop condition="true">
|
||||||
|
<cfset catPos = findNoCase(catMarker, pageHtml, catPos)>
|
||||||
|
<cfif catPos EQ 0><cfbreak></cfif>
|
||||||
|
|
||||||
|
<!--- Find end of this category section (next MenuPageItemList or end) --->
|
||||||
|
<cfset nextCatPos = findNoCase(catMarker, pageHtml, catPos + len(catMarker))>
|
||||||
|
<cfif nextCatPos EQ 0><cfset nextCatPos = len(pageHtml)></cfif>
|
||||||
|
<cfset catSection = mid(pageHtml, catPos, nextCatPos - catPos)>
|
||||||
|
|
||||||
<!--- Extract category name --->
|
<!--- Extract category name --->
|
||||||
<cfset catNameStart = findNoCase(BQ & "name" & BQ & ":" & BQ, catPart)>
|
<cfset cnPos = findNoCase(nameKey, catSection)>
|
||||||
<cfif catNameStart EQ 0><cfcontinue></cfif>
|
<cfif cnPos EQ 0><cfset catPos = catPos + len(catMarker)><cfcontinue></cfif>
|
||||||
<cfset catNameStart = catNameStart + len(BQ & "name" & BQ & ":" & BQ)>
|
<cfset cnStart = cnPos + len(nameKey)>
|
||||||
<cfset catNameEnd = find(BQ, catPart, catNameStart)>
|
<cfset cnEnd = find(BQ, catSection, cnStart)>
|
||||||
<cfif catNameEnd LTE catNameStart><cfcontinue></cfif>
|
<cfif cnEnd LTE cnStart><cfset catPos = catPos + len(catMarker)><cfcontinue></cfif>
|
||||||
<cfset catName = mid(catPart, catNameStart, catNameEnd - catNameStart)>
|
<cfset catName = mid(catSection, cnStart, cnEnd - cnStart)>
|
||||||
<cfset catName = replace(catName, '\u0026', '&', 'all')>
|
<cfset catName = replace(catName, '\u0026', '&', 'all')>
|
||||||
<cfset catName = replace(catName, '&amp;', '&', 'all')>
|
<cfset catName = replace(catName, '&amp;', '&', 'all')>
|
||||||
|
|
||||||
<!--- Skip "Most Ordered" and duplicates --->
|
<!--- Skip "Most Ordered" and duplicates --->
|
||||||
<cfif catName EQ "Most Ordered" OR structKeyExists(ddCatSeen, catName)>
|
<cfif catName EQ "Most Ordered" OR structKeyExists(ddCatSeen, catName)>
|
||||||
|
<cfset catPos = catPos + len(catMarker)>
|
||||||
<cfcontinue>
|
<cfcontinue>
|
||||||
</cfif>
|
</cfif>
|
||||||
<cfset ddCatSeen[catName] = true>
|
<cfset ddCatSeen[catName] = true>
|
||||||
<cfset arrayAppend(ddCategories, { "name": catName, "parentCategoryName": "" })>
|
<cfset arrayAppend(ddCategories, { "name": catName, "parentCategoryName": "" })>
|
||||||
|
|
||||||
<!--- Extract items within this category section --->
|
<!--- Find all MenuPageItem entries within this category section --->
|
||||||
<cfset itemParts = listToArray(catPart, BQ & "__typename" & BQ & ":" & BQ & "MenuPageItem" & BQ & "," & BQ & "id" & BQ & ":" & BQ)>
|
<cfset itemPos = 1>
|
||||||
<cfloop from="2" to="#arrayLen(itemParts)#" index="ipIdx">
|
<cfloop condition="true">
|
||||||
<cfset ip = itemParts[ipIdx]>
|
<cfset itemPos = findNoCase(itemMarker, catSection, itemPos)>
|
||||||
|
<cfif itemPos EQ 0><cfbreak></cfif>
|
||||||
|
|
||||||
|
<!--- Find end of this item entry --->
|
||||||
|
<cfset nextItemPos = findNoCase(itemMarker, catSection, itemPos + len(itemMarker))>
|
||||||
|
<cfif nextItemPos EQ 0><cfset nextItemPos = len(catSection)></cfif>
|
||||||
|
<cfset itemEntry = mid(catSection, itemPos, nextItemPos - itemPos)>
|
||||||
|
|
||||||
<!--- Extract item name --->
|
<!--- Extract item name --->
|
||||||
<cfset ipNameStart = findNoCase(BQ & "name" & BQ & ":" & BQ, ip)>
|
<cfset inPos = findNoCase(nameKey, itemEntry)>
|
||||||
<cfif ipNameStart EQ 0><cfcontinue></cfif>
|
<cfif inPos EQ 0><cfset itemPos = itemPos + len(itemMarker)><cfcontinue></cfif>
|
||||||
<cfset ipNameStart = ipNameStart + len(BQ & "name" & BQ & ":" & BQ)>
|
<cfset inStart = inPos + len(nameKey)>
|
||||||
<cfset ipNameEnd = find(BQ, ip, ipNameStart)>
|
<cfset inEnd = find(BQ, itemEntry, inStart)>
|
||||||
<cfif ipNameEnd LTE ipNameStart><cfcontinue></cfif>
|
<cfif inEnd LTE inStart><cfset itemPos = itemPos + len(itemMarker)><cfcontinue></cfif>
|
||||||
<cfset ipName = mid(ip, ipNameStart, ipNameEnd - ipNameStart)>
|
<cfset ipName = mid(itemEntry, inStart, inEnd - inStart)>
|
||||||
<cfset ipName = replace(ipName, '\u0026', '&', 'all')>
|
<cfset ipName = replace(ipName, '\u0026', '&', 'all')>
|
||||||
|
|
||||||
<!--- Skip duplicates --->
|
<!--- Skip duplicates --->
|
||||||
<cfif structKeyExists(ddItemSeen, ipName)><cfcontinue></cfif>
|
<cfif structKeyExists(ddItemSeen, ipName)>
|
||||||
|
<cfset itemPos = itemPos + len(itemMarker)>
|
||||||
|
<cfcontinue>
|
||||||
|
</cfif>
|
||||||
<cfset ddItemSeen[ipName] = true>
|
<cfset ddItemSeen[ipName] = true>
|
||||||
|
|
||||||
<!--- Extract description --->
|
<!--- Extract description --->
|
||||||
<cfset ipDesc = "">
|
<cfset ipDesc = "">
|
||||||
<cfset ipDescStart = findNoCase(BQ & "description" & BQ & ":" & BQ, ip)>
|
<cfset idPos = findNoCase(descKey, itemEntry)>
|
||||||
<cfif ipDescStart GT 0>
|
<cfif idPos GT 0>
|
||||||
<cfset ipDescStart = ipDescStart + len(BQ & "description" & BQ & ":" & BQ)>
|
<cfset idStart = idPos + len(descKey)>
|
||||||
<cfset ipDescEnd = find(BQ, ip, ipDescStart)>
|
<cfset idEnd = find(BQ, itemEntry, idStart)>
|
||||||
<cfif ipDescEnd GT ipDescStart>
|
<cfif idEnd GT idStart>
|
||||||
<cfset ipDesc = mid(ip, ipDescStart, ipDescEnd - ipDescStart)>
|
<cfset ipDesc = mid(itemEntry, idStart, idEnd - idStart)>
|
||||||
<cfset ipDesc = replace(ipDesc, '\u0026', '&', 'all')>
|
<cfset ipDesc = replace(ipDesc, '\u0026', '&', 'all')>
|
||||||
</cfif>
|
</cfif>
|
||||||
</cfif>
|
</cfif>
|
||||||
|
|
||||||
<!--- Extract displayPrice --->
|
<!--- Extract displayPrice --->
|
||||||
<cfset ipPrice = 0>
|
<cfset ipPrice = 0>
|
||||||
<cfset ipPriceStart = findNoCase(BQ & "displayPrice" & BQ & ":" & BQ, ip)>
|
<cfset ipPos = findNoCase(priceKey, itemEntry)>
|
||||||
<cfif ipPriceStart GT 0>
|
<cfif ipPos GT 0>
|
||||||
<cfset ipPriceStart = ipPriceStart + len(BQ & "displayPrice" & BQ & ":" & BQ)>
|
<cfset ipStart = ipPos + len(priceKey)>
|
||||||
<cfset ipPriceEnd = find(BQ, ip, ipPriceStart)>
|
<cfset ipEnd = find(BQ, itemEntry, ipStart)>
|
||||||
<cfif ipPriceEnd GT ipPriceStart>
|
<cfif ipEnd GT ipStart>
|
||||||
<cfset ipPriceStr = mid(ip, ipPriceStart, ipPriceEnd - ipPriceStart)>
|
<cfset ipPriceStr = mid(itemEntry, ipStart, ipEnd - ipStart)>
|
||||||
<cfset ipPriceStr = reReplace(ipPriceStr, '[^0-9.]', '', 'all')>
|
<cfset ipPriceStr = reReplace(ipPriceStr, '[^0-9.]', '', 'all')>
|
||||||
<cfset ipPrice = val(ipPriceStr)>
|
<cfset ipPrice = val(ipPriceStr)>
|
||||||
</cfif>
|
</cfif>
|
||||||
</cfif>
|
</cfif>
|
||||||
|
|
||||||
<!--- Look up image from carousel image map --->
|
<!--- Look up image from carousel map, or check imgUrl on item --->
|
||||||
<cfset ipImg = structKeyExists(ddImageMap, ipName) ? ddImageMap[ipName] : "">
|
<cfset ipImg = structKeyExists(ddImageMap, ipName) ? ddImageMap[ipName] : "">
|
||||||
|
|
||||||
<!--- Also check for imgUrl directly on this MenuPageItem --->
|
|
||||||
<cfif NOT len(ipImg)>
|
<cfif NOT len(ipImg)>
|
||||||
<cfset ipImgStart = findNoCase(BQ & "imgUrl" & BQ & ":" & BQ, ip)>
|
<cfset iiPos = findNoCase(imgKey, itemEntry)>
|
||||||
<cfif ipImgStart GT 0>
|
<cfif iiPos GT 0>
|
||||||
<cfset ipImgStart = ipImgStart + len(BQ & "imgUrl" & BQ & ":" & BQ)>
|
<cfset iiStart = iiPos + len(imgKey)>
|
||||||
<cfset ipImgEnd = find(BQ, ip, ipImgStart)>
|
<cfset iiEnd = find(BQ, itemEntry, iiStart)>
|
||||||
<cfif ipImgEnd GT ipImgStart>
|
<cfif iiEnd GT iiStart>
|
||||||
<cfset ipImg = mid(ip, ipImgStart, ipImgEnd - ipImgStart)>
|
<cfset ipImg = mid(itemEntry, iiStart, iiEnd - iiStart)>
|
||||||
<cfif ipImg EQ "null"><cfset ipImg = ""></cfif>
|
<cfif ipImg EQ "null" OR NOT findNoCase("http", ipImg)><cfset ipImg = ""></cfif>
|
||||||
<cfif len(ipImg) AND findNoCase("width=", ipImg)>
|
<cfif len(ipImg) AND findNoCase("width=", ipImg)>
|
||||||
<cfset ipImg = reReplaceNoCase(ipImg, 'width=\d+', 'width=600')>
|
<cfset ipImg = reReplaceNoCase(ipImg, 'width=\d+', 'width=600')>
|
||||||
<cfset ipImg = reReplaceNoCase(ipImg, 'height=\d+', 'height=600')>
|
<cfset ipImg = reReplaceNoCase(ipImg, 'height=\d+', 'height=600')>
|
||||||
|
|
@ -1295,7 +1328,11 @@
|
||||||
<cfset ddItem["imageFilename"] = listLast(ipImg, "/")>
|
<cfset ddItem["imageFilename"] = listLast(ipImg, "/")>
|
||||||
</cfif>
|
</cfif>
|
||||||
<cfset arrayAppend(ddItems, ddItem)>
|
<cfset arrayAppend(ddItems, ddItem)>
|
||||||
|
|
||||||
|
<cfset itemPos = itemPos + len(itemMarker)>
|
||||||
</cfloop>
|
</cfloop>
|
||||||
|
|
||||||
|
<cfset catPos = catPos + len(catMarker)>
|
||||||
</cfloop>
|
</cfloop>
|
||||||
|
|
||||||
<cfset ddItemsWithImg = 0>
|
<cfset ddItemsWithImg = 0>
|
||||||
|
|
|
||||||
Reference in a new issue