Add server-side h2/h3 hierarchy detection for subcategory discovery
- Parse the HTML heading structure to detect h2 parents with h3 subcategories
- Append the detected hierarchy to the Claude prompt as an explicit hint
- Post-process the Claude response to enforce the hierarchy even if Claude returns a flat category list

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
495b03c76d
commit
9acf4aa511
1 changed files with 104 additions and 1 deletions
|
|
@ -1036,6 +1036,59 @@
|
|||
</cfif>
|
||||
<cfset arrayAppend(response.steps, "Found " & arrayLen(h3Texts) & " h3 and " & arrayLen(h4Texts) & " h4 tags")>
|
||||
|
||||
<!---
    Server-side heading hierarchy detection from the HTML h2/h3 structure.

    Reads:  combinedHtml (the HTML string being analysed), response.steps
    Writes: headingHierarchy - struct keyed by h2 text; each value is an array
                               of the h3 texts found before the next h2
            hierarchyDesc    - one "- "<h2>" contains subsections: ..." line
                               per parent, later appended to the Claude prompt
            response.DEBUG_HEADING_HIERARCHY and a response.steps entry when
            at least one parent with subsections was found

    Walks the document left to right, always taking whichever of the next
    <h2> / <h3> opening tags comes first. An h2 becomes the "current parent"
    unless it is empty after cleaning, is just "MENU", or mentions copyright;
    every h3 that follows is recorded under the current parent.
--->
<cfset headingHierarchy = structNew()>
<cfset hierarchyDesc = "">
<cfset scanPos = 1>
<cfset currentH2 = "">
<!--- Parent/child pairs already recorded, so repeated markup is listed once --->
<cfset seenHeadingPairs = structNew()>

<cfloop condition="scanPos LT len(combinedHtml)">
    <cfset nextH2 = reFindNoCase("<h2[^>]*>", combinedHtml, scanPos)>
    <cfset nextH3 = reFindNoCase("<h3[^>]*>", combinedHtml, scanPos)>

    <cfif nextH2 EQ 0 AND nextH3 EQ 0><cfbreak></cfif>

    <!--- Pick the heading that appears first in the remaining HTML --->
    <cfif nextH2 GT 0 AND (nextH3 EQ 0 OR nextH2 LT nextH3)>
        <cfset headingLevel = 2>
        <cfset headingPos = nextH2>
    <cfelse>
        <cfset headingLevel = 3>
        <cfset headingPos = nextH3>
    </cfif>

    <cfset closePos = findNoCase("</h" & headingLevel & ">", combinedHtml, headingPos)>

    <cfif closePos EQ 0>
        <!---
            Unclosed heading: step past its opening tag and keep scanning
            instead of abandoning every heading that follows. The regex match
            above guarantees a ">" at or after headingPos, so scanPos always
            advances. An unclosed h2 cannot be trusted as a parent.
        --->
        <cfif headingLevel EQ 2>
            <cfset currentH2 = "">
        </cfif>
        <cfset scanPos = find(">", combinedHtml, headingPos) + 1>
    <cfelse>
        <!--- closePos + 5 covers the 5-character closing tag ("</h2>" / "</h3>") --->
        <cfset tagContent = mid(combinedHtml, headingPos, closePos + 5 - headingPos)>
        <cfset headingText = reReplaceNoCase(tagContent, "<[^>]+>", "", "all")>

        <!---
            Decode the most common entities so the text reads the way it is
            rendered. "&amp;" is decoded last so "&amp;nbsp;" is not decoded twice.
        --->
        <cfset headingText = replaceNoCase(headingText, "&nbsp;", " ", "all")>
        <cfset headingText = replaceNoCase(headingText, "&quot;", """", "all")>
        <cfset headingText = replaceNoCase(headingText, "&##39;", "'", "all")>
        <cfset headingText = replaceNoCase(headingText, "&##039;", "'", "all")>
        <cfset headingText = replaceNoCase(headingText, "&apos;", "'", "all")>
        <cfset headingText = replaceNoCase(headingText, "&amp;", "&", "all")>
        <!--- Collapse line breaks / runs of whitespace left by nested markup --->
        <cfset headingText = trim(reReplace(headingText, "\s+", " ", "all"))>

        <cfif headingLevel EQ 2>
            <!--- Clean: strip decorative dashes and other punctuation before the skip checks --->
            <cfset h2Clean = trim(reReplace(headingText, "[^a-zA-Z0-9 ]", "", "all"))>
            <!--- Skip non-category h2s; the original (uncleaned) text is kept as the parent name --->
            <cfif len(h2Clean) AND h2Clean NEQ "MENU" AND NOT findNoCase("copyright", h2Clean)>
                <cfset currentH2 = headingText>
            <cfelse>
                <cfset currentH2 = "">
            </cfif>
        <cfelseif len(currentH2) AND len(headingText)>
            <!--- h3 under an accepted h2: record it once per parent --->
            <!--- chr(10) cannot occur in either part after the whitespace collapse above --->
            <cfset pairKey = currentH2 & chr(10) & headingText>
            <cfif NOT structKeyExists(seenHeadingPairs, pairKey)>
                <cfset seenHeadingPairs[pairKey] = true>
                <cfif NOT structKeyExists(headingHierarchy, currentH2)>
                    <cfset headingHierarchy[currentH2] = arrayNew(1)>
                </cfif>
                <cfset arrayAppend(headingHierarchy[currentH2], headingText)>
            </cfif>
        </cfif>

        <cfset scanPos = closePos + 5>
    </cfif>
</cfloop>

<cfif structCount(headingHierarchy) GT 0>
    <!--- One descriptive line per parent; used as the prompt hint --->
    <cfloop collection="#headingHierarchy#" item="hParent">
        <cfset hierarchyDesc = hierarchyDesc & "- """ & hParent & """ contains subsections: " & arrayToList(headingHierarchy[hParent], ", ") & chr(10)>
    </cfloop>
    <cfset response["DEBUG_HEADING_HIERARCHY"] = headingHierarchy>
    <cfset arrayAppend(response.steps, "Detected " & structCount(headingHierarchy) & " parent categories with subcategories from h2/h3 structure")>
</cfif>
|
||||
|
||||
<!--- System prompt for URL analysis --->
|
||||
<cfset systemPrompt = "You are an expert at extracting structured menu data from restaurant website HTML. Extract ALL menu data visible in the HTML. Return valid JSON with these keys: business (object with name, address, phone, hours, brandColor), categories (array), modifiers (array), items (array with name, description, price, category, modifiers array, and imageUrl). CATEGORIES FORMAT: Each entry in the categories array can be either a simple string (for flat categories) OR an object with 'name' and optional 'subcategories' array. Example: [""Appetizers"", {""name"": ""Drinks"", ""subcategories"": [""Hot Drinks"", ""Cold Drinks""]}, ""Desserts""]. SUBCATEGORY DETECTION: If a section header contains nested titled sections beneath it (sub-headers with their own items), the outer section is the PARENT and inner sections are SUBCATEGORIES. For items in subcategories, set their 'category' field to the SUBCATEGORY name (not the parent). CRITICAL FOR IMAGES: Each menu item in the HTML is typically in a container (div, li, article) that also contains an img tag. Extract the img src URL and include it as 'imageUrl' for that item. Look for img tags that are siblings or children within the same menu-item container. The image URL should be the full or relative src value from the img tag - NOT the alt text. CRITICAL: Extract EVERY menu item from ALL sources including embedded JSON (__NEXT_DATA__, window state, JSON-LD). For brandColor: suggest a vibrant hex (6 digits, no hash). For prices: numbers (e.g., 12.99). Return ONLY valid JSON.">
|
||||
|
||||
|
|
@ -1055,7 +1108,13 @@
|
|||
<!---
    Add HTML text.

    Builds the text part of the user message and appends it to messagesContent.
    The prompt is assembled in userText in three steps: the fixed instruction,
    an optional heading-hierarchy hint (only when hierarchyDesc is non-empty),
    and finally the HTML itself. textBlock["text"] is assigned exactly once,
    from the finished userText, so the HTML is concatenated a single time.
--->
<cfset textBlock = structNew()>
<cfset textBlock["type"] = "text">
<cfset userText = "Extract menu data from this restaurant website HTML. The images above are from the same website - identify which ones are food photos that could be used as item images, and which could be header/banner images.">
<!--- Append heading hierarchy hint if detected --->
<cfif len(hierarchyDesc)>
    <cfset userText = userText & chr(10) & chr(10) & "IMPORTANT - DETECTED SECTION HIERARCHY FROM HTML HEADINGS:" & chr(10) & "The following h2 sections contain h3 sub-sections. Use these as parent-subcategory relationships in your categories output:" & chr(10) & hierarchyDesc & "For each parent above, include it in the categories array as an OBJECT with 'name' and 'subcategories' array. Items belonging to a subsection should have their 'category' field set to the SUBCATEGORY name (not the parent).">
</cfif>
<!--- The HTML goes last so the instructions and hint precede the document --->
<cfset userText = userText & chr(10) & chr(10) & "Here is the HTML content:" & chr(10) & chr(10) & combinedHtml>
<cfset textBlock["text"] = userText>
<cfset arrayAppend(messagesContent, textBlock)>
|
||||
|
||||
<cfset userMessage = structNew()>
|
||||
|
|
@ -1215,6 +1274,50 @@
|
|||
</cfloop>
|
||||
<cfset menuData["categories"] = formattedCategories>
|
||||
|
||||
<!---
    Server-side hierarchy enforcement from the HTML heading structure
    (backup if Claude returns flat categories).

    Reads:  headingHierarchy (h2 text -> array of h3 texts), formattedCategories
    Writes: formattedCategories[i].parentCategoryName for every category that
            has no parent yet, whose name equals a detected h3, and whose h2
            parent can be found among the other categories; menuData.categories
            and a response.steps entry when at least one link was applied.
--->
<cfif structCount(headingHierarchy) GT 0>
    <!--- Build reverse map: lowercase h3 name -> raw h2 parent name --->
    <!--- NOTE(review): an h3 name that appears under several h2s keeps only the last parent visited --->
    <cfset h3ToParent = structNew()>
    <cfloop collection="#headingHierarchy#" item="hParentName">
        <cfloop array="#headingHierarchy[hParentName]#" index="hChildName">
            <cfset h3ToParent[lCase(trim(hChildName))] = hParentName>
        </cfloop>
    </cfloop>

    <!--- Check if any categories match h3 names but lack parentCategoryName --->
    <cfset hierarchyApplied = 0>
    <cfloop from="1" to="#arrayLen(formattedCategories)#" index="i">
        <cfset cat = formattedCategories[i]>
        <cfif NOT structKeyExists(cat, "parentCategoryName") OR NOT len(cat.parentCategoryName)>
            <cfset catLower = lCase(trim(cat.name))>
            <cfif structKeyExists(h3ToParent, catLower)>
                <cfset rawParent = h3ToParent[catLower]>
                <cfset rawParentLower = lCase(trim(rawParent))>

                <!---
                    Normalized parent name, computed once per child rather than
                    once per candidate: drop HTML entities and punctuation,
                    drop a trailing "menu", collapse whitespace.
                --->
                <cfset parentNorm = reReplaceNoCase(rawParent, "&[a-zA-Z0-9##]+;", " ", "all")>
                <cfset parentNorm = lCase(reReplace(parentNorm, "[^a-zA-Z0-9 ]", "", "all"))>
                <cfset parentNorm = reReplaceNoCase(parentNorm, "\s*menu\s*$", "")>
                <cfset parentNorm = trim(reReplace(parentNorm, "\s+", " ", "all"))>

                <!--- Find matching parent category in the list --->
                <cfset matchedParent = "">
                <cfloop array="#formattedCategories#" index="pcat">
                    <cfset pcatLower = lCase(trim(pcat.name))>
                    <!--- A category must never become its own parent --->
                    <cfif pcatLower NEQ catLower>
                        <!--- Same normalization as the parent so both sides are comparable --->
                        <cfset pcatNorm = reReplaceNoCase(pcatLower, "&[a-zA-Z0-9##]+;", " ", "all")>
                        <cfset pcatNorm = reReplace(pcatNorm, "[^a-zA-Z0-9 ]", "", "all")>
                        <cfset pcatNorm = reReplaceNoCase(pcatNorm, "\s*menu\s*$", "")>
                        <cfset pcatNorm = trim(reReplace(pcatNorm, "\s+", " ", "all"))>
                        <!--- Exact (case-insensitive) match first; normalized match only when non-empty --->
                        <cfif pcatLower EQ rawParentLower OR (len(parentNorm) AND pcatNorm EQ parentNorm)>
                            <cfset matchedParent = pcat.name>
                            <cfbreak>
                        </cfif>
                    </cfif>
                </cfloop>

                <cfif len(matchedParent)>
                    <cfset formattedCategories[i]["parentCategoryName"] = matchedParent>
                    <cfset hierarchyApplied = hierarchyApplied + 1>
                </cfif>
            </cfif>
        </cfif>
    </cfloop>

    <cfif hierarchyApplied GT 0>
        <cfset menuData["categories"] = formattedCategories>
        <cfset arrayAppend(response.steps, "Server-side hierarchy: applied " & hierarchyApplied & " parent-child relationships")>
    </cfif>
</cfif>
|
||||
|
||||
<!--- For items with subcategory field from Claude, set their category to the subcategory name --->
|
||||
<cfloop from="1" to="#arrayLen(menuData.items)#" index="i">
|
||||
<cfset item = menuData.items[i]>
|
||||
|
|
|
|||
Reference in a new issue