Look for embedded JSON data in menu pages
This commit is contained in:
parent
361e54c17a
commit
a1b557cdc7
1 changed file with 47 additions and 1 deletion
|
|
@ -226,6 +226,47 @@
|
|||
|
||||
<cfset arrayAppend(response.steps, "Downloaded #arrayLen(imageDataArray)# valid images")>
|
||||
|
||||
<!---
    Look for embedded JSON data (Next.js __NEXT_DATA__, Toast state, etc.).

    Each page in menuPages is scanned with four patterns. Every hit is appended
    to embeddedJsonData as a "--- LABEL ---" header line followed by the matched
    text. The result stays an empty string when nothing is found, which is what
    the later len(embeddedJsonData) checks rely on.

    Note: reMatchNoCase returns the full matched text, not the capture groups,
    so script-tag matches are unwrapped with a second reReplaceNoCase call.
--->
<cfset embeddedJsonData = "">
<cfset embeddedMenuItems = arrayNew(1)>
<cfloop array="#menuPages#" index="menuPage">
    <!---
        __NEXT_DATA__ (Next.js apps). Only the first matching script tag on the
        page is used. The body pattern [^<]+ stops at the first "<", so a script
        whose text contains a literal "<" is not matched.
    --->
    <cfset nextDataMatch = reMatchNoCase('<script[^>]*id=["'']__NEXT_DATA__["''][^>]*>([^<]+)</script>', menuPage.html)>
    <cfif arrayLen(nextDataMatch)>
        <!--- Strip the surrounding <script> tag, keeping only its text content --->
        <cfset scriptContent = reReplaceNoCase(nextDataMatch[1], '<script[^>]*>([^<]+)</script>', '\1')>
        <cfset embeddedJsonData = embeddedJsonData & chr(10) & "--- __NEXT_DATA__ ---" & chr(10) & scriptContent>
    </cfif>

    <!---
        window.__INITIAL_STATE__ or similar assignments. The whole assignment
        (including the "window.__X__ =" prefix) is appended. The object pattern
        [^;]+ cannot cross a semicolon, so state objects containing ";" inside
        string values may be missed or cut short.
    --->
    <cfset stateMatches = reMatchNoCase('window\.__[A-Z_]+__\s*=\s*(\{[^;]+\});', menuPage.html)>
    <cfloop array="#stateMatches#" index="stateMatch">
        <cfset embeddedJsonData = embeddedJsonData & chr(10) & "--- WINDOW_STATE ---" & chr(10) & stateMatch>
    </cfloop>

    <!---
        data-props / data-page / data-state attributes holding a JSON object.
        Each quote style is matched separately so the value may contain the
        other quote character: a single-quoted attribute with ordinary JSON
        inside (data-props='{"a":1}') is found, as is a double-quoted attribute
        whose JSON is entity-encoded. The whole attribute text is appended.
    --->
    <cfset dataPropsMatches = reMatchNoCase('data-(?:props|page|state)=(?:"\{[^"]+\}"|''\{[^'']+\}'')', menuPage.html)>
    <cfloop array="#dataPropsMatches#" index="propsMatch">
        <cfset embeddedJsonData = embeddedJsonData & chr(10) & "--- DATA_PROPS ---" & chr(10) & propsMatch>
    </cfloop>

    <!---
        JSON-LD structured data (schema.org Menu). Only blocks that mention
        "menu" are kept; the test is case-insensitive, so it also covers
        "MenuItem" and no separate check is needed.
    --->
    <cfset jsonLdMatches = reMatchNoCase('<script[^>]*type=["'']application/ld\+json["''][^>]*>([^<]+)</script>', menuPage.html)>
    <cfloop array="#jsonLdMatches#" index="jsonLdMatch">
        <cfset scriptContent = reReplaceNoCase(jsonLdMatch, '<script[^>]*>([^<]+)</script>', '\1')>
        <cfif findNoCase("menu", scriptContent)>
            <cfset embeddedJsonData = embeddedJsonData & chr(10) & "--- JSON_LD_MENU ---" & chr(10) & scriptContent>
        </cfif>
    </cfloop>
</cfloop>
|
||||
|
||||
<!---
    Record in the response whether any embedded JSON was collected. When some
    was found, also report its total length and its first 2000 characters.
--->
<cfif NOT len(embeddedJsonData)>
    <cfset response["DEBUG_EMBEDDED_JSON_FOUND"] = false>
<cfelse>
    <cfset response["DEBUG_EMBEDDED_JSON_FOUND"] = true>
    <cfset response["DEBUG_EMBEDDED_JSON_LENGTH"] = len(embeddedJsonData)>
    <cfset response["DEBUG_EMBEDDED_JSON_PREVIEW"] = left(embeddedJsonData, 2000)>
</cfif>
|
||||
|
||||
<!--- Combine all page HTML into one text block --->
|
||||
<cfset combinedHtml = "">
|
||||
<cfloop array="#menuPages#" index="menuPage">
|
||||
|
|
@ -237,6 +278,11 @@
|
|||
<cfset combinedHtml = combinedHtml & chr(10) & "--- PAGE: " & menuPage.url & " ---" & chr(10) & cleanHtml>
|
||||
</cfloop>
|
||||
|
||||
<!---
    If we found embedded JSON, append it to help Claude find all menu items.

    The combined text is cut to its first 100000 characters immediately after
    this step. Appending the JSON to an already-large page would therefore push
    it past the cut-off and discard it. To prevent that, the page HTML is
    trimmed here so that HTML plus JSON section fit within the same limit:
      - the JSON section may use all space the HTML leaves free, and is always
        allowed at least half of the limit;
      - the HTML is trimmed only when it would otherwise crowd the JSON out.
--->
<cfif len(embeddedJsonData)>
    <cfset combinedHtmlLimit = 100000>
    <cfset embeddedJsonSection = chr(10) & chr(10) & "=== EMBEDDED JSON DATA (may contain full menu) ===" & chr(10) & embeddedJsonData>

    <!--- Always at least half the limit, so the count passed to left() is positive --->
    <cfset embeddedJsonBudget = max(int(combinedHtmlLimit / 2), combinedHtmlLimit - len(combinedHtml))>
    <cfset embeddedJsonSection = left(embeddedJsonSection, embeddedJsonBudget)>

    <!--- Trim the HTML only when it does not fit beside the JSON section --->
    <cfset combinedHtmlBudget = combinedHtmlLimit - len(embeddedJsonSection)>
    <cfif len(combinedHtml) GT combinedHtmlBudget>
        <cfset combinedHtml = left(combinedHtml, combinedHtmlBudget)>
    </cfif>

    <cfset combinedHtml = combinedHtml & embeddedJsonSection>
</cfif>
|
||||
|
||||
<!--- Limit HTML size for Claude --->
|
||||
<cfif len(combinedHtml) GT 100000>
|
||||
<cfset combinedHtml = left(combinedHtml, 100000)>
|
||||
|
|
@ -283,7 +329,7 @@
|
|||
<cfset arrayAppend(response.steps, "Found " & arrayLen(h3Texts) & " h3 and " & arrayLen(h4Texts) & " h4 tags")>
|
||||
|
||||
<!--- System prompt for URL analysis --->
|
||||
<cfset systemPrompt = "You are an expert at extracting structured menu data from restaurant website HTML. Extract ALL menu data visible in the HTML. Return valid JSON with these keys: business (object with name, address, phone, hours, brandColor), categories (array of category names), modifiers (array), items (array with name, description, price, category, subcategory, modifiers array, and imageUrl if found). CRITICAL: Extract EVERY menu item. SUBCATEGORY RULE: If a section header (like h3) has NO menu items directly below it, but contains NESTED sections (each with their own h3 and items), then: the outer section is the PARENT CATEGORY, the inner sections are SUBCATEGORIES. For items in subcategories, set category to the PARENT name and subcategory to the inner section name. Example: outer h3 says 'Drinks', inner h3s say 'Beer' and 'Wine' with items under them - those items should have category='Drinks' and subcategory='Beer' or 'Wine'. For brandColor: suggest a vibrant hex (6 digits, no hash). For prices: numbers (e.g., 12.99). Return ONLY valid JSON.">
|
||||
<cfset systemPrompt = "You are an expert at extracting structured menu data from restaurant website HTML. Extract ALL menu data visible in the HTML. Return valid JSON with these keys: business (object with name, address, phone, hours, brandColor), categories (array of category names), modifiers (array), items (array with name, description, price, category, subcategory, modifiers array, and imageUrl if found). CRITICAL: Extract EVERY menu item from ALL sources. IMPORTANT: Look for embedded JSON data in the HTML - this often contains the COMPLETE menu including items in collapsed/lazy-loaded sections. Check for __NEXT_DATA__, window state objects, JSON-LD schema, and data attributes. Combine items from both the visible HTML AND any embedded JSON. SUBCATEGORY RULE: If a section header has NO menu items directly below it but contains NESTED sections (each with their own header and items), then: the outer section is the PARENT CATEGORY, the inner sections are SUBCATEGORIES. For items in subcategories, set category to the PARENT name and subcategory to the inner section name. For brandColor: suggest a vibrant hex (6 digits, no hash). For prices: numbers (e.g., 12.99). Return ONLY valid JSON.">
|
||||
|
||||
<!--- Build message content --->
|
||||
<cfset messagesContent = arrayNew(1)>
|
||||
|
|
|
|||
Reference in a new issue