Parse Toast menu from visible HTML for complete item extraction
- Extract items from visible HTML instead of just __OO_STATE__ JSON
- Parse headerText spans for item names, price spans for prices
- Extract images from Menu_files/ src attributes
- Fall back to simpler headerText matching if block parsing fails
- Also extract images from __OO_STATE__ and match to items by name
- Fixes issue where only 116 items were extracted instead of 163+

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
b5abbe43b4
commit
d8e6f619ac
1 changed files with 155 additions and 87 deletions
|
|
@ -68,107 +68,175 @@
|
||||||
<cfset playwrightImages = arrayNew(1)>
|
<cfset playwrightImages = arrayNew(1)>
|
||||||
<cfset arrayAppend(response.steps, "Read " & len(pageHtml) & " bytes from local file")>
|
<cfset arrayAppend(response.steps, "Read " & len(pageHtml) & " bytes from local file")>
|
||||||
|
|
||||||
<!--- Check for Toast __OO_STATE__ - parse directly, skip Claude --->
|
<!--- Check for Toast menu page - extract from visible HTML for most complete data --->
|
||||||
<cfif findNoCase("window.__OO_STATE__", pageHtml)>
|
<cfif findNoCase("class=""headerText""", pageHtml) AND findNoCase("toasttab", pageHtml)>
|
||||||
<cfset arrayAppend(response.steps, "Toast menu detected - parsing __OO_STATE__ directly")>
|
<cfset arrayAppend(response.steps, "Toast menu detected - parsing visible HTML items")>
|
||||||
|
|
||||||
<!--- Extract the JSON using regex --->
|
<cftry>
|
||||||
<cfset ooStateMatch = reMatchNoCase("window\.__OO_STATE__\s*=\s*(\{.*?\});\s*window\.", pageHtml)>
|
<!--- Extract visible items from rendered HTML (most complete) --->
|
||||||
<cfif arrayLen(ooStateMatch)>
|
<cfset toastBusiness = structNew()>
|
||||||
<cfset ooStateJson = reReplaceNoCase(ooStateMatch[1], "window\.__OO_STATE__\s*=\s*", "")>
|
<cfset toastCategories = arrayNew(1)>
|
||||||
<cfset ooStateJson = reReplace(ooStateJson, ";\s*window\.$", "")>
|
<cfset toastItems = arrayNew(1)>
|
||||||
|
<cfset categorySet = structNew()>
|
||||||
|
<cfset itemNameSet = structNew()>
|
||||||
|
<cfset itemId = 1>
|
||||||
|
<cfset currentCategory = "Menu">
|
||||||
|
|
||||||
<cftry>
|
<!--- Find category headers (h2 with specific Toast patterns) --->
|
||||||
<cfset ooState = deserializeJSON(ooStateJson)>
|
<cfset categoryMatches = reMatchNoCase('<h2[^>]*class="[^"]*groupHeader[^"]*"[^>]*>([^<]+)</h2>', pageHtml)>
|
||||||
<cfset arrayAppend(response.steps, "Parsed __OO_STATE__ JSON successfully")>
|
<cfloop array="#categoryMatches#" index="catMatch">
|
||||||
|
<cfset catName = reReplaceNoCase(catMatch, '.*>([^<]+)</h2>.*', '\1')>
|
||||||
|
<cfset catName = trim(catName)>
|
||||||
|
<cfif len(catName) AND NOT structKeyExists(categorySet, catName)>
|
||||||
|
<cfset categorySet[catName] = true>
|
||||||
|
<cfset arrayAppend(toastCategories, { "name": catName, "itemCount": 0 })>
|
||||||
|
</cfif>
|
||||||
|
</cfloop>
|
||||||
|
|
||||||
<!--- Extract menus, categories, and items --->
|
<!--- Extract item blocks with name, price, description, image --->
|
||||||
<cfset toastBusiness = structNew()>
|
<!--- Toast pattern: li.item containing headerText for name, price span, itemImage img --->
|
||||||
<cfset toastCategories = arrayNew(1)>
|
<cfset itemBlocks = reMatchNoCase('<li[^>]*class="[^"]*item[^"]*"[^>]*>.*?</li>', pageHtml)>
|
||||||
<cfset toastItems = arrayNew(1)>
|
<cfset arrayAppend(response.steps, "Found " & arrayLen(itemBlocks) & " item blocks in HTML")>
|
||||||
<cfset categorySet = structNew()>
|
|
||||||
<cfset itemId = 1>
|
|
||||||
|
|
||||||
<cfloop collection="#ooState#" item="key">
|
<cfloop array="#itemBlocks#" index="block">
|
||||||
<cfif left(key, 5) EQ "Menu:">
|
<!--- Extract item name --->
|
||||||
<cfset menu = ooState[key]>
|
<cfset nameMatch = reMatchNoCase('<span class="headerText">([^<]+)</span>', block)>
|
||||||
<cfif structKeyExists(menu, "groups") AND isArray(menu.groups)>
|
<cfif arrayLen(nameMatch)>
|
||||||
<cfloop array="#menu.groups#" index="group">
|
<cfset itemName = reReplaceNoCase(nameMatch[1], '.*>([^<]+)</span>.*', '\1')>
|
||||||
<cfset groupName = structKeyExists(group, "name") ? group.name : "">
|
<cfset itemName = trim(itemName)>
|
||||||
<cfif len(groupName) AND NOT structKeyExists(categorySet, groupName)>
|
|
||||||
<cfset categorySet[groupName] = true>
|
|
||||||
<cfset arrayAppend(toastCategories, { "name": groupName, "itemCount": 0 })>
|
|
||||||
</cfif>
|
|
||||||
|
|
||||||
<cfif structKeyExists(group, "items") AND isArray(group.items)>
|
<!--- Skip duplicates --->
|
||||||
<cfloop array="#group.items#" index="item">
|
<cfif len(itemName) AND NOT structKeyExists(itemNameSet, itemName)>
|
||||||
<cfset itemStruct = structNew()>
|
<cfset itemNameSet[itemName] = true>
|
||||||
<cfset itemStruct["id"] = "item_" & itemId>
|
|
||||||
<cfset itemStruct["name"] = structKeyExists(item, "name") ? item.name : "">
|
|
||||||
<cfset itemStruct["description"] = structKeyExists(item, "description") ? item.description : "">
|
|
||||||
<cfset itemStruct["category"] = groupName>
|
|
||||||
<cfset itemStruct["modifiers"] = arrayNew(1)>
|
|
||||||
|
|
||||||
<!--- Handle prices (array for sizes) --->
|
<cfset itemStruct = structNew()>
|
||||||
<cfif structKeyExists(item, "prices") AND isArray(item.prices) AND arrayLen(item.prices)>
|
<cfset itemStruct["id"] = "item_" & itemId>
|
||||||
<cfset itemStruct["price"] = item.prices[1]>
|
<cfset itemStruct["name"] = itemName>
|
||||||
<cfelseif structKeyExists(item, "price")>
|
<cfset itemStruct["modifiers"] = arrayNew(1)>
|
||||||
<cfset itemStruct["price"] = item.price>
|
|
||||||
<cfelse>
|
|
||||||
<cfset itemStruct["price"] = 0>
|
|
||||||
</cfif>
|
|
||||||
|
|
||||||
<!--- Handle images --->
|
<!--- Extract price --->
|
||||||
<cfif structKeyExists(item, "imageUrls") AND isStruct(item.imageUrls)>
|
<cfset priceMatch = reMatchNoCase('<span[^>]*class="price"[^>]*>\$?([0-9.]+)</span>', block)>
|
||||||
<cfset imgUrls = item.imageUrls>
|
<cfif arrayLen(priceMatch)>
|
||||||
<cfif structKeyExists(imgUrls, "medium")>
|
<!--- Pull the numeric part out of the matched price span. CFML string literals do not treat backslash as an escape, so a single "\$" is the regex for a literal dollar sign; a doubled backslash would demand a literal "\" in the HTML, never match, and leave every price at 0. --->
<cfset priceStr = reReplaceNoCase(priceMatch[1], '.*>\$?([0-9.]+)</span>.*', '\1')>
|
||||||
<cfset itemStruct["imageUrl"] = imgUrls.medium>
|
<cfset itemStruct["price"] = val(priceStr)>
|
||||||
<cfset itemStruct["imageSrc"] = imgUrls.medium>
|
<cfelse>
|
||||||
<cfset itemStruct["imageFilename"] = listLast(imgUrls.medium, "/")>
|
<cfset itemStruct["price"] = 0>
|
||||||
<cfelseif structKeyExists(imgUrls, "large")>
|
</cfif>
|
||||||
<cfset itemStruct["imageUrl"] = imgUrls.large>
|
|
||||||
<cfset itemStruct["imageSrc"] = imgUrls.large>
|
|
||||||
<cfset itemStruct["imageFilename"] = listLast(imgUrls.large, "/")>
|
|
||||||
</cfif>
|
|
||||||
</cfif>
|
|
||||||
|
|
||||||
<cfif len(itemStruct.name)>
|
<!--- Extract description --->
|
||||||
<cfset arrayAppend(toastItems, itemStruct)>
|
<cfset descMatch = reMatchNoCase('<div[^>]*class="[^"]*description[^"]*"[^>]*>([^<]+)</div>', block)>
|
||||||
<cfset itemId++>
|
<cfif arrayLen(descMatch)>
|
||||||
|
<cfset itemStruct["description"] = trim(reReplaceNoCase(descMatch[1], '.*>([^<]+)</div>.*', '\1'))>
|
||||||
|
<cfelse>
|
||||||
|
<cfset itemStruct["description"] = "">
|
||||||
|
</cfif>
|
||||||
|
|
||||||
|
<!--- Extract image URL from srcset or src --->
|
||||||
|
<cfset imgMatch = reMatchNoCase('src="(Menu_files/[^"]+)"', block)>
|
||||||
|
<cfif arrayLen(imgMatch)>
|
||||||
|
<cfset imgSrc = reReplaceNoCase(imgMatch[1], '.*src="([^"]+)".*', '\1')>
|
||||||
|
<!--- Convert to full URL --->
|
||||||
|
<cfset itemStruct["imageUrl"] = basePath & imgSrc>
|
||||||
|
<cfset itemStruct["imageSrc"] = basePath & imgSrc>
|
||||||
|
<cfset itemStruct["imageFilename"] = listLast(imgSrc, "/")>
|
||||||
|
</cfif>
|
||||||
|
|
||||||
|
<!--- Try to determine category from nearby h2 or default --->
|
||||||
|
<cfset itemStruct["category"] = arrayLen(toastCategories) ? toastCategories[1].name : "Menu">
|
||||||
|
|
||||||
|
<cfset arrayAppend(toastItems, itemStruct)>
|
||||||
|
<cfset itemId++>
|
||||||
|
</cfif>
|
||||||
|
</cfif>
|
||||||
|
</cfloop>
|
||||||
|
|
||||||
|
<!--- If no items found from blocks, try simpler headerText extraction --->
|
||||||
|
<cfif arrayLen(toastItems) EQ 0>
|
||||||
|
<cfset nameMatches = reMatchNoCase('<span class="headerText">([^<]+)</span>', pageHtml)>
|
||||||
|
<cfloop array="#nameMatches#" index="nameMatch">
|
||||||
|
<cfset itemName = reReplaceNoCase(nameMatch, '.*>([^<]+)</span>.*', '\1')>
|
||||||
|
<cfset itemName = trim(itemName)>
|
||||||
|
<cfif len(itemName) AND NOT structKeyExists(itemNameSet, itemName)>
|
||||||
|
<cfset itemNameSet[itemName] = true>
|
||||||
|
<cfset itemStruct = { "id": "item_" & itemId, "name": itemName, "price": 0, "description": "", "category": "Menu", "modifiers": [] }>
|
||||||
|
<cfset arrayAppend(toastItems, itemStruct)>
|
||||||
|
<cfset itemId++>
|
||||||
|
</cfif>
|
||||||
|
</cfloop>
|
||||||
|
</cfif>
|
||||||
|
|
||||||
|
<!--- Also try to extract from __OO_STATE__ for images --->
|
||||||
|
<cfif findNoCase("window.__OO_STATE__", pageHtml)>
|
||||||
|
<cfset ooStateMatch = reMatchNoCase("window\.__OO_STATE__\s*=\s*(\{.*?\});\s*window\.", pageHtml)>
|
||||||
|
<cfif arrayLen(ooStateMatch)>
|
||||||
|
<cfset ooStateJson = reReplaceNoCase(ooStateMatch[1], "window\.__OO_STATE__\s*=\s*", "")>
|
||||||
|
<cfset ooStateJson = reReplace(ooStateJson, ";\s*window\.$", "")>
|
||||||
|
<cftry>
|
||||||
|
<cfset ooState = deserializeJSON(ooStateJson)>
|
||||||
|
<!--- Build name -> image URL map from OO_STATE --->
|
||||||
|
<cfset imageMap = structNew()>
|
||||||
|
<cfloop collection="#ooState#" item="key">
|
||||||
|
<cfif left(key, 5) EQ "Menu:">
|
||||||
|
<cfset menu = ooState[key]>
|
||||||
|
<cfif structKeyExists(menu, "groups") AND isArray(menu.groups)>
|
||||||
|
<cfloop array="#menu.groups#" index="group">
|
||||||
|
<cfif structKeyExists(group, "items") AND isArray(group.items)>
|
||||||
|
<cfloop array="#group.items#" index="item">
|
||||||
|
<cfif structKeyExists(item, "name") AND structKeyExists(item, "imageUrls")>
|
||||||
|
<cfset imgUrls = item.imageUrls>
|
||||||
|
<cfif structKeyExists(imgUrls, "medium")>
|
||||||
|
<cfset imageMap[item.name] = imgUrls.medium>
|
||||||
|
<cfelseif structKeyExists(imgUrls, "large")>
|
||||||
|
<cfset imageMap[item.name] = imgUrls.large>
|
||||||
|
</cfif>
|
||||||
|
</cfif>
|
||||||
|
</cfloop>
|
||||||
</cfif>
|
</cfif>
|
||||||
</cfloop>
|
</cfloop>
|
||||||
</cfif>
|
</cfif>
|
||||||
</cfloop>
|
</cfif>
|
||||||
</cfif>
|
</cfloop>
|
||||||
</cfif>
|
<!--- Apply images to items --->
|
||||||
</cfloop>
|
<cfset imagesMatched = 0>
|
||||||
|
<cfloop from="1" to="#arrayLen(toastItems)#" index="i">
|
||||||
|
<cfif structKeyExists(imageMap, toastItems[i].name)>
|
||||||
|
<cfset toastItems[i]["imageUrl"] = imageMap[toastItems[i].name]>
|
||||||
|
<cfset toastItems[i]["imageSrc"] = imageMap[toastItems[i].name]>
|
||||||
|
<cfset toastItems[i]["imageFilename"] = listLast(imageMap[toastItems[i].name], "/")>
|
||||||
|
<cfset imagesMatched++>
|
||||||
|
</cfif>
|
||||||
|
</cfloop>
|
||||||
|
<cfset arrayAppend(response.steps, "Matched " & imagesMatched & " images from __OO_STATE__")>
|
||||||
|
<cfcatch type="any">
<!--- Image enrichment from __OO_STATE__ is best-effort: keep the items already parsed, but record why the enrichment was skipped instead of hiding the failure. --->
<cfset arrayAppend(response.steps, "__OO_STATE__ image extraction skipped: " & cfcatch.message)>
</cfcatch>
|
||||||
|
</cftry>
|
||||||
|
</cfif>
|
||||||
|
</cfif>
|
||||||
|
|
||||||
<cfset arrayAppend(response.steps, "Extracted " & arrayLen(toastItems) & " items from " & arrayLen(toastCategories) & " categories")>
|
<cfset arrayAppend(response.steps, "Extracted " & arrayLen(toastItems) & " unique items from " & arrayLen(toastCategories) & " categories")>
|
||||||
|
|
||||||
<!--- Return directly without Claude --->
|
<!--- Return directly without Claude --->
|
||||||
<cfset response["OK"] = true>
|
<cfset response["OK"] = true>
|
||||||
<cfset response["DATA"] = {
|
<cfset response["DATA"] = {
|
||||||
"business": toastBusiness,
|
"business": toastBusiness,
|
||||||
"categories": toastCategories,
|
"categories": toastCategories,
|
||||||
"modifiers": arrayNew(1),
|
"modifiers": arrayNew(1),
|
||||||
"items": toastItems,
|
"items": toastItems,
|
||||||
"imageUrls": arrayNew(1),
|
"imageUrls": arrayNew(1),
|
||||||
"headerCandidateIndices": arrayNew(1),
|
"headerCandidateIndices": arrayNew(1),
|
||||||
"imageMappings": arrayNew(1)
|
"imageMappings": arrayNew(1)
|
||||||
}>
|
}>
|
||||||
<cfset response["sourceUrl"] = targetUrl>
|
<cfset response["sourceUrl"] = targetUrl>
|
||||||
<cfset response["pagesProcessed"] = 1>
|
<cfset response["pagesProcessed"] = 1>
|
||||||
<cfset response["imagesFound"] = 0>
|
<cfset response["imagesFound"] = 0>
|
||||||
<cfset response["playwrightImagesCount"] = 0>
|
<cfset response["playwrightImagesCount"] = 0>
|
||||||
<cfset response["toastDirect"] = true>
|
<cfset response["toastDirect"] = true>
|
||||||
<cfoutput>#serializeJSON(response)#</cfoutput>
|
<cfoutput>#serializeJSON(response)#</cfoutput>
|
||||||
<cfabort>
|
<cfabort>
|
||||||
|
|
||||||
<cfcatch type="any">
|
<cfcatch type="any">
|
||||||
<cfset arrayAppend(response.steps, "Toast JSON parse failed: " & cfcatch.message & " - falling back to Claude")>
|
<cfset arrayAppend(response.steps, "Toast HTML parse failed: " & cfcatch.message & " - falling back to Claude")>
|
||||||
</cfcatch>
|
</cfcatch>
|
||||||
</cftry>
|
</cftry>
|
||||||
</cfif>
|
|
||||||
</cfif>
|
</cfif>
|
||||||
|
|
||||||
<!--- Extract base URL for resolving relative links --->
|
<!--- Extract base URL for resolving relative links --->
|
||||||
|
|
|
||||||
Reference in a new issue