Harden JSON parsing for Claude API responses
- Add smart quote/dash replacement for PDF-sourced text
- Add Jackson fallback parser for when Lucee's deserializeJSON fails
- Strengthen prompt to request properly escaped JSON
- Clean control characters more selectively

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
d288b2b71c
commit
4240fe76cc
1 changed files with 27 additions and 12 deletions
|
|
@ -2114,7 +2114,7 @@
|
|||
</cfif>
|
||||
|
||||
<!--- System prompt for URL analysis --->
|
||||
<cfset systemPrompt = "You are an expert at extracting structured menu data from restaurant website HTML. Extract ALL menu data visible in the HTML. Return valid JSON with these keys: business (object with name, address, phone, hours, brandColor), categories (array), modifiers (array), items (array with name, description, price, category, modifiers array, and imageUrl). CATEGORIES vs ITEMS (CRITICAL): A CATEGORY is a broad section heading that groups multiple items (e.g., 'Appetizers', 'Tacos', 'Drinks', 'Desserts'). An ITEM is an individual food or drink product with a name, description, and price. Do NOT create a category for each individual item. A typical restaurant has 5-15 categories and 30-150 items. If you find yourself creating more categories than items, you are wrong - those are items, not categories. Each item must have a 'category' field set to the category it belongs to. CATEGORIES FORMAT: Each entry in the categories array can be either a simple string (for flat categories) OR an object with 'name' and optional 'subcategories' array. Example: [""Appetizers"", {""name"": ""Drinks"", ""subcategories"": [""Hot Drinks"", ""Cold Drinks""]}, ""Desserts""]. SUBCATEGORY DETECTION: If a section header contains nested titled sections beneath it (sub-headers with their own items), the outer section is the PARENT and inner sections are SUBCATEGORIES. For items in subcategories, set their 'category' field to the SUBCATEGORY name (not the parent). CRITICAL FOR IMAGES: Each menu item in the HTML is typically in a container (div, li, article) that also contains an img tag. Extract the img src URL and include it as 'imageUrl' for that item. Look for img tags that are siblings or children within the same menu-item container. The image URL should be the full or relative src value from the img tag - NOT the alt text. CRITICAL: Extract EVERY menu item from ALL sources including embedded JSON (__NEXT_DATA__, window state, JSON-LD). 
For brandColor: suggest a vibrant hex (6 digits, no hash). For prices: numbers (e.g., 12.99). Return ONLY valid JSON.">
|
||||
<cfset systemPrompt = "You are an expert at extracting structured menu data from restaurant website HTML. Extract ALL menu data visible in the HTML. Return valid JSON with these keys: business (object with name, address, phone, hours, brandColor), categories (array), modifiers (array), items (array with name, description, price, category, modifiers array, and imageUrl). CATEGORIES vs ITEMS (CRITICAL): A CATEGORY is a broad section heading that groups multiple items (e.g., 'Appetizers', 'Tacos', 'Drinks', 'Desserts'). An ITEM is an individual food or drink product with a name, description, and price. Do NOT create a category for each individual item. A typical restaurant has 5-15 categories and 30-150 items. If you find yourself creating more categories than items, you are wrong - those are items, not categories. Each item must have a 'category' field set to the category it belongs to. CATEGORIES FORMAT: Each entry in the categories array can be either a simple string (for flat categories) OR an object with 'name' and optional 'subcategories' array. Example: [""Appetizers"", {""name"": ""Drinks"", ""subcategories"": [""Hot Drinks"", ""Cold Drinks""]}, ""Desserts""]. SUBCATEGORY DETECTION: If a section header contains nested titled sections beneath it (sub-headers with their own items), the outer section is the PARENT and inner sections are SUBCATEGORIES. For items in subcategories, set their 'category' field to the SUBCATEGORY name (not the parent). CRITICAL FOR IMAGES: Each menu item in the HTML is typically in a container (div, li, article) that also contains an img tag. Extract the img src URL and include it as 'imageUrl' for that item. Look for img tags that are siblings or children within the same menu-item container. The image URL should be the full or relative src value from the img tag - NOT the alt text. CRITICAL: Extract EVERY menu item from ALL sources including embedded JSON (__NEXT_DATA__, window state, JSON-LD). 
For brandColor: suggest a vibrant hex (6 digits, no hash). For prices: numbers (e.g., 12.99). CRITICAL: Return ONLY valid JSON. All special characters in strings must be properly escaped. Never use smart/curly quotes. Use only ASCII double quotes for JSON string delimiters and backslash-escape any literal double quotes inside values.">
|
||||
|
||||
<!--- Build message content --->
|
||||
<cfset messagesContent = arrayNew(1)>
|
||||
|
|
@ -2229,22 +2229,37 @@
|
|||
</cfif>
|
||||
<!--- Remove trailing commas before ] or } --->
|
||||
<cfset responseText = reReplace(responseText, ",(\s*[\]\}])", "\1", "all")>
|
||||
<!--- Remove control characters that break JSON --->
|
||||
<cfset responseText = reReplace(responseText, "[\x00-\x1F]", " ", "all")>
|
||||
<!--- Strip control characters that break JSON, but keep tab (\x09), LF (\x0A) and CR (\x0D), which are legal JSON whitespace between tokens --->
|
||||
<cfset responseText = reReplace(responseText, "[\x00-\x08\x0B\x0C\x0E-\x1F]", "", "all")>
|
||||
|
||||
<!--- Clean smart quotes/dashes from PDF text that Claude preserves --->
|
||||
<!--- Replace smart typography from PDFs --->
|
||||
<cfset responseText = replace(responseText, chr(8216), "'", 'all')>
|
||||
<cfset responseText = replace(responseText, chr(8217), "'", 'all')>
|
||||
<cfset responseText = replace(responseText, chr(8211), "-", 'all')>
|
||||
<cfset responseText = replace(responseText, chr(8212), "-", 'all')>
|
||||
<cfset responseText = replace(responseText, chr(8230), "...", 'all')>
|
||||
|
||||
<!--- Try to parse JSON with error handling --->
|
||||
<cftry>
|
||||
<cfset menuData = deserializeJSON(responseText)>
|
||||
<cfcatch type="any">
|
||||
<!--- JSON parsing failed - try to extract what we can --->
|
||||
<!--- Return the raw response for debugging --->
|
||||
<cfset response["success"] = false>
|
||||
<cfset response["error"] = "JSON parse error: #cfcatch.message#">
|
||||
<cfset response["DEBUG_RAW_RESPONSE"] = left(responseText, 2000)>
|
||||
<cfset response["DEBUG_RESPONSE_LENGTH"] = len(responseText)>
|
||||
<cfcontent type="application/json" reset="true">
|
||||
<cfoutput>#serializeJSON(response)#</cfoutput>
|
||||
<cfabort>
|
||||
<!--- Lucee failed - try Jackson (bundled with Lucee) which is more lenient --->
|
||||
<cftry>
|
||||
<cfset objectMapper = createObject("java", "com.fasterxml.jackson.databind.ObjectMapper")>
|
||||
<cfset javaMap = objectMapper.readValue(responseText, createObject("java", "java.util.LinkedHashMap").getClass())>
|
||||
<cfset menuData = deserializeJSON(objectMapper.writeValueAsString(javaMap))>
|
||||
<cfcatch type="any">
|
||||
<!--- Both parsers failed --->
|
||||
<cfset response["success"] = false>
|
||||
<cfset response["error"] = "JSON parse error: #cfcatch.message#">
|
||||
<cfset response["DEBUG_RAW_RESPONSE"] = left(responseText, 2000)>
|
||||
<cfset response["DEBUG_RESPONSE_LENGTH"] = len(responseText)>
|
||||
<cfcontent type="application/json" reset="true">
|
||||
<cfoutput>#serializeJSON(response)#</cfoutput>
|
||||
<cfabort>
|
||||
</cfcatch>
|
||||
</cftry>
|
||||
</cfcatch>
|
||||
</cftry>
|
||||
|
||||
|
|
|
|||
Reference in a new issue