368 lines
16 KiB
Text
368 lines
16 KiB
Text
<cfsetting showdebugoutput="false">
|
|
<cfsetting enablecfoutputonly="true">
|
|
<cfsetting requesttimeout="300">
|
|
<cfcontent type="application/json; charset=utf-8" reset="true">
|
|
|
|
<cfset response = structNew()>
|
|
<cfset response["OK"] = false>
|
|
|
|
<cftry>
|
|
<!--- Load API Key --->
|
|
<cfset CLAUDE_API_KEY = "">
|
|
<cfset configPath = getDirectoryFromPath(getCurrentTemplatePath()) & "../../config/claude.json">
|
|
<cfif fileExists(configPath)>
|
|
<cfset configData = deserializeJSON(fileRead(configPath))>
|
|
<cfif structKeyExists(configData, "apiKey")>
|
|
<cfset CLAUDE_API_KEY = configData.apiKey>
|
|
</cfif>
|
|
</cfif>
|
|
|
|
<cfif NOT len(CLAUDE_API_KEY)>
|
|
<cfthrow message="Claude API key not configured">
|
|
</cfif>
|
|
|
|
<!--- Get URL from request --->
|
|
<cfset requestBody = toString(getHttpRequestData().content)>
|
|
<cfif NOT len(requestBody)>
|
|
<cfthrow message="No request body provided">
|
|
</cfif>
|
|
|
|
<cfset requestData = deserializeJSON(requestBody)>
|
|
<cfif NOT structKeyExists(requestData, "url") OR NOT len(trim(requestData.url))>
|
|
<cfthrow message="URL is required">
|
|
</cfif>
|
|
|
|
<cfset targetUrl = trim(requestData.url)>
|
|
|
|
<!--- Validate URL format --->
|
|
<cfif NOT reFindNoCase("^https?://", targetUrl)>
|
|
<cfset targetUrl = "https://" & targetUrl>
|
|
</cfif>
|
|
|
|
<cfset response["steps"] = arrayNew(1)>
|
|
<cfset arrayAppend(response.steps, "Fetching URL: " & targetUrl)>
|
|
|
|
<!--- Fetch the main page --->
|
|
<cfhttp url="#targetUrl#" method="GET" timeout="30" result="mainPage" useragent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36">
|
|
<cfhttpparam type="header" name="Accept" value="text/html,application/xhtml+xml">
|
|
</cfhttp>
|
|
|
|
<cfif mainPage.statusCode NEQ "200 OK" AND NOT findNoCase("200", mainPage.statusCode)>
|
|
<cfthrow message="Failed to fetch URL: #mainPage.statusCode#">
|
|
</cfif>
|
|
|
|
<cfset pageHtml = mainPage.fileContent>
|
|
<cfset arrayAppend(response.steps, "Fetched #len(pageHtml)# bytes")>
|
|
|
|
<!--- Extract base URL for resolving relative links --->
|
|
<cfset baseUrl = reReplace(targetUrl, "(https?://[^/]+).*", "\1")>
|
|
<cfset basePath = reReplace(targetUrl, "(https?://[^/]+/[^?]*/?).*", "\1")>
|
|
<cfif NOT reFindNoCase("/$", basePath)>
|
|
<cfset basePath = reReplace(basePath, "/[^/]*$", "/")>
|
|
</cfif>
|
|
|
|
<!--- Find menu links and fetch them too --->
|
|
<cfset menuPages = arrayNew(1)>
|
|
<cfset arrayAppend(menuPages, { url: targetUrl, html: pageHtml })>
|
|
|
|
<!--- Look for menu links in the page --->
|
|
<cfset menuLinkPatterns = 'href=["'']([^"'']*(?:menu|food|dishes|order)[^"'']*)["'']'>
|
|
<cfset menuLinks = reMatchNoCase(menuLinkPatterns, pageHtml)>
|
|
|
|
<cfloop array="#menuLinks#" index="linkMatch">
|
|
<cfset linkUrl = reReplaceNoCase(linkMatch, 'href=["'']([^"'']*)["'']', "\1")>
|
|
|
|
<!--- Resolve relative URLs --->
|
|
<cfif left(linkUrl, 1) EQ "/">
|
|
<cfset linkUrl = baseUrl & linkUrl>
|
|
<cfelseif NOT reFindNoCase("^https?://", linkUrl)>
|
|
<cfset linkUrl = basePath & linkUrl>
|
|
</cfif>
|
|
|
|
<!--- Skip if same as main page or external domain --->
|
|
<cfif linkUrl NEQ targetUrl AND findNoCase(baseUrl, linkUrl)>
|
|
<cftry>
|
|
<cfhttp url="#linkUrl#" method="GET" timeout="15" result="subPage" useragent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36">
|
|
<cfhttpparam type="header" name="Accept" value="text/html,application/xhtml+xml">
|
|
</cfhttp>
|
|
|
|
<cfif findNoCase("200", subPage.statusCode)>
|
|
<cfset arrayAppend(menuPages, { url: linkUrl, html: subPage.fileContent })>
|
|
<cfset arrayAppend(response.steps, "Found menu page: " & linkUrl)>
|
|
</cfif>
|
|
<cfcatch>
|
|
<!--- Skip failed requests --->
|
|
</cfcatch>
|
|
</cftry>
|
|
</cfif>
|
|
|
|
<!--- Limit to 5 pages max --->
|
|
<cfif arrayLen(menuPages) GTE 5>
|
|
<cfbreak>
|
|
</cfif>
|
|
</cfloop>
|
|
|
|
<!--- Extract images from all pages --->
|
|
<cfset allImages = arrayNew(1)>
|
|
<cfset imageUrls = structNew()>
|
|
|
|
<cfloop array="#menuPages#" index="menuPage">
|
|
<!--- Find all img tags --->
|
|
<cfset imgMatches = reMatchNoCase('<img[^>]+src=["'']([^"'']+)["''][^>]*>', menuPage.html)>
|
|
|
|
<cfloop array="#imgMatches#" index="imgTag">
|
|
<cfset imgSrc = reReplaceNoCase(imgTag, '.*src=["'']([^"'']+)["''].*', "\1")>
|
|
|
|
<!--- Resolve relative URLs --->
|
|
<cfif left(imgSrc, 1) EQ "/">
|
|
<cfset imgSrc = baseUrl & imgSrc>
|
|
<cfelseif NOT reFindNoCase("^https?://", imgSrc) AND NOT reFindNoCase("^data:", imgSrc)>
|
|
<cfset imgSrc = basePath & imgSrc>
|
|
</cfif>
|
|
|
|
<!--- Skip data URLs, icons, and already-processed images --->
|
|
<cfif reFindNoCase("^https?://", imgSrc) AND NOT structKeyExists(imageUrls, imgSrc)>
|
|
<!--- Skip common icon/logo patterns that are too small --->
|
|
<cfif NOT reFindNoCase("(icon|favicon|logo|sprite|pixel|tracking|badge|button)", imgSrc)>
|
|
<cfset imageUrls[imgSrc] = true>
|
|
</cfif>
|
|
</cfif>
|
|
</cfloop>
|
|
</cfloop>
|
|
|
|
<cfset arrayAppend(response.steps, "Found #structCount(imageUrls)# unique images")>
|
|
|
|
<!--- Download images (limit to 20) --->
|
|
<cfset imageDataArray = arrayNew(1)>
|
|
<cfset downloadedCount = 0>
|
|
|
|
<cfloop collection="#imageUrls#" item="imgUrl">
|
|
<cfif downloadedCount GTE 20>
|
|
<cfbreak>
|
|
</cfif>
|
|
|
|
<cftry>
|
|
<cfhttp url="#imgUrl#" method="GET" timeout="10" result="imgResult" getasbinary="yes">
|
|
</cfhttp>
|
|
|
|
<cfif findNoCase("200", imgResult.statusCode) AND isBinary(imgResult.fileContent)>
|
|
<!--- Check content type --->
|
|
<cfset contentType = structKeyExists(imgResult.responseHeader, "Content-Type") ? imgResult.responseHeader["Content-Type"] : "">
|
|
|
|
<cfif reFindNoCase("image/(jpeg|jpg|png|gif|webp)", contentType)>
|
|
<!--- Check image size (skip tiny images) --->
|
|
<cfset imgBytes = len(imgResult.fileContent)>
|
|
|
|
<cfif imgBytes GT 5000>
|
|
<cfset base64Content = toBase64(imgResult.fileContent)>
|
|
|
|
<cfset mediaType = "image/jpeg">
|
|
<cfif findNoCase("png", contentType)><cfset mediaType = "image/png"></cfif>
|
|
<cfif findNoCase("gif", contentType)><cfset mediaType = "image/gif"></cfif>
|
|
<cfif findNoCase("webp", contentType)><cfset mediaType = "image/webp"></cfif>
|
|
|
|
<cfset imgSource = structNew()>
|
|
<cfset imgSource["type"] = "base64">
|
|
<cfset imgSource["media_type"] = mediaType>
|
|
<cfset imgSource["data"] = base64Content>
|
|
|
|
<cfset imgStruct = structNew()>
|
|
<cfset imgStruct["type"] = "image">
|
|
<cfset imgStruct["source"] = imgSource>
|
|
<cfset imgStruct["url"] = imgUrl>
|
|
|
|
<cfset arrayAppend(imageDataArray, imgStruct)>
|
|
<cfset downloadedCount = downloadedCount + 1>
|
|
</cfif>
|
|
</cfif>
|
|
</cfif>
|
|
<cfcatch>
|
|
<!--- Skip failed downloads --->
|
|
</cfcatch>
|
|
</cftry>
|
|
</cfloop>
|
|
|
|
<cfset arrayAppend(response.steps, "Downloaded #arrayLen(imageDataArray)# valid images")>
|
|
|
|
<!--- Combine all page HTML into one text block --->
|
|
<cfset combinedHtml = "">
|
|
<cfloop array="#menuPages#" index="menuPage">
|
|
<!--- Strip scripts, styles, and extract text content --->
|
|
<cfset cleanHtml = menuPage.html>
|
|
<cfset cleanHtml = reReplaceNoCase(cleanHtml, "<script[^>]*>.*?</script>", "", "all")>
|
|
<cfset cleanHtml = reReplaceNoCase(cleanHtml, "<style[^>]*>.*?</style>", "", "all")>
|
|
<cfset cleanHtml = reReplaceNoCase(cleanHtml, "<!--.*?-->", "", "all")>
|
|
<cfset combinedHtml = combinedHtml & chr(10) & "--- PAGE: " & menuPage.url & " ---" & chr(10) & cleanHtml>
|
|
</cfloop>
|
|
|
|
<!--- Limit HTML size for Claude --->
|
|
<cfif len(combinedHtml) GT 100000>
|
|
<cfset combinedHtml = left(combinedHtml, 100000)>
|
|
</cfif>
|
|
|
|
<!--- System prompt for URL analysis --->
|
|
<cfset systemPrompt = "You are an expert at extracting structured menu data from restaurant website HTML. Extract ALL menu data visible in the HTML. Return valid JSON with these keys: business (object with name, address, phone, hours, brandColor), categories (array of category names), modifiers (array of modifier templates with name, required boolean, appliesTo, categoryName if applicable, and options array), items (array with name, description, price, category, modifiers array, and imageUrl if found). For brandColor: suggest a vibrant hex color (6 digits, no ##) based on the restaurant style. For hours: format as ""Mon-Fri 10:30am-10pm, Sat 11am-10pm, Sun 11am-9pm"". Include ALL days visible. For prices: extract as numbers (e.g., 12.99). For modifier options: use format {""name"": ""option"", ""price"": 0}. Return ONLY valid JSON, no markdown, no explanation.">
|
|
|
|
<!--- Build message content --->
|
|
<cfset messagesContent = arrayNew(1)>
|
|
|
|
<!--- Add images first (up to 10 for analysis) --->
|
|
<cfset imgLimit = min(arrayLen(imageDataArray), 10)>
|
|
<cfloop from="1" to="#imgLimit#" index="i">
|
|
<cfset imgData = imageDataArray[i]>
|
|
<cfset imgContent = structNew()>
|
|
<cfset imgContent["type"] = "image">
|
|
<cfset imgContent["source"] = imgData.source>
|
|
<cfset arrayAppend(messagesContent, imgContent)>
|
|
</cfloop>
|
|
|
|
<!--- Add HTML text --->
|
|
<cfset textBlock = structNew()>
|
|
<cfset textBlock["type"] = "text">
|
|
<cfset textBlock["text"] = "Extract menu data from this restaurant website HTML. The images above are from the same website - identify which ones are food photos that could be used as item images, and which could be header/banner images. Here is the HTML content:" & chr(10) & chr(10) & combinedHtml>
|
|
<cfset arrayAppend(messagesContent, textBlock)>
|
|
|
|
<cfset userMessage = structNew()>
|
|
<cfset userMessage["role"] = "user">
|
|
<cfset userMessage["content"] = messagesContent>
|
|
|
|
<cfset requestBody = structNew()>
|
|
<cfset requestBody["model"] = "claude-sonnet-4-20250514">
|
|
<cfset requestBody["max_tokens"] = 8192>
|
|
<cfset requestBody["temperature"] = 0>
|
|
<cfset requestBody["system"] = systemPrompt>
|
|
<cfset requestBody["messages"] = arrayNew(1)>
|
|
<cfset arrayAppend(requestBody["messages"], userMessage)>
|
|
|
|
<cfset arrayAppend(response.steps, "Sending to Claude API...")>
|
|
|
|
<!--- Call Claude API --->
|
|
<cfhttp url="https://api.anthropic.com/v1/messages" method="POST" timeout="120" result="httpResult">
|
|
<cfhttpparam type="header" name="Content-Type" value="application/json">
|
|
<cfhttpparam type="header" name="x-api-key" value="#CLAUDE_API_KEY#">
|
|
<cfhttpparam type="header" name="anthropic-version" value="2023-06-01">
|
|
<cfhttpparam type="body" value="#serializeJSON(requestBody)#">
|
|
</cfhttp>
|
|
|
|
<cfset httpStatusCode = httpResult.statusCode>
|
|
<cfif isNumeric(httpStatusCode)>
|
|
<cfset httpStatusCode = int(httpStatusCode)>
|
|
<cfelseif findNoCase("200", httpStatusCode)>
|
|
<cfset httpStatusCode = 200>
|
|
<cfelse>
|
|
<cfset httpStatusCode = 0>
|
|
</cfif>
|
|
|
|
<cfif httpStatusCode NEQ 200>
|
|
<cfset errorDetail = "">
|
|
<cftry>
|
|
<cfset errorResponse = deserializeJSON(httpResult.fileContent)>
|
|
<cfif structKeyExists(errorResponse, "error") AND structKeyExists(errorResponse.error, "message")>
|
|
<cfset errorDetail = errorResponse.error.message>
|
|
<cfelse>
|
|
<cfset errorDetail = httpResult.fileContent>
|
|
</cfif>
|
|
<cfcatch>
|
|
<cfset errorDetail = httpResult.fileContent>
|
|
</cfcatch>
|
|
</cftry>
|
|
<cfthrow message="Claude API error: #httpResult.statusCode# - #errorDetail#">
|
|
</cfif>
|
|
|
|
<!--- Parse response --->
|
|
<cfset claudeResponse = deserializeJSON(httpResult.fileContent)>
|
|
<cfif NOT structKeyExists(claudeResponse, "content") OR NOT arrayLen(claudeResponse.content)>
|
|
<cfthrow message="Empty response from Claude">
|
|
</cfif>
|
|
|
|
<cfset responseText = "">
|
|
<cfloop array="#claudeResponse.content#" index="block">
|
|
<cfif structKeyExists(block, "type") AND block.type EQ "text">
|
|
<cfset responseText = block.text>
|
|
<cfbreak>
|
|
</cfif>
|
|
</cfloop>
|
|
|
|
<!--- Clean up JSON response --->
|
|
<cfset responseText = trim(responseText)>
|
|
<cfif left(responseText, 7) EQ "```json">
|
|
<cfset responseText = mid(responseText, 8, len(responseText) - 7)>
|
|
</cfif>
|
|
<cfif left(responseText, 3) EQ "```">
|
|
<cfset responseText = mid(responseText, 4, len(responseText) - 3)>
|
|
</cfif>
|
|
<cfif right(responseText, 3) EQ "```">
|
|
<cfset responseText = left(responseText, len(responseText) - 3)>
|
|
</cfif>
|
|
<cfset responseText = trim(responseText)>
|
|
<cfset responseText = reReplace(responseText, ",(\s*[\]\}])", "\1", "all")>
|
|
|
|
<cfset menuData = deserializeJSON(responseText)>
|
|
|
|
<!--- Build image URL list for the wizard to use --->
|
|
<cfset imageUrlList = arrayNew(1)>
|
|
<cfloop array="#imageDataArray#" index="imgData">
|
|
<cfif structKeyExists(imgData, "url")>
|
|
<cfset arrayAppend(imageUrlList, imgData.url)>
|
|
</cfif>
|
|
</cfloop>
|
|
|
|
<!--- Ensure expected structure --->
|
|
<cfif NOT structKeyExists(menuData, "business")>
|
|
<cfset menuData["business"] = structNew()>
|
|
</cfif>
|
|
<cfif NOT structKeyExists(menuData, "categories")>
|
|
<cfset menuData["categories"] = arrayNew(1)>
|
|
</cfif>
|
|
<cfif NOT structKeyExists(menuData, "modifiers")>
|
|
<cfset menuData["modifiers"] = arrayNew(1)>
|
|
</cfif>
|
|
<cfif NOT structKeyExists(menuData, "items")>
|
|
<cfset menuData["items"] = arrayNew(1)>
|
|
</cfif>
|
|
|
|
<!--- Convert categories to expected format if needed --->
|
|
<cfset formattedCategories = arrayNew(1)>
|
|
<cfloop array="#menuData.categories#" index="cat">
|
|
<cfif isSimpleValue(cat)>
|
|
<cfset catObj = structNew()>
|
|
<cfset catObj["name"] = cat>
|
|
<cfset catObj["itemCount"] = 0>
|
|
<cfset arrayAppend(formattedCategories, catObj)>
|
|
<cfelseif isStruct(cat)>
|
|
<cfif NOT structKeyExists(cat, "itemCount")>
|
|
<cfset cat["itemCount"] = 0>
|
|
</cfif>
|
|
<cfset arrayAppend(formattedCategories, cat)>
|
|
</cfif>
|
|
</cfloop>
|
|
<cfset menuData["categories"] = formattedCategories>
|
|
|
|
<!--- Add item IDs --->
|
|
<cfloop from="1" to="#arrayLen(menuData.items)#" index="i">
|
|
<cfset menuData.items[i]["id"] = "item_" & i>
|
|
</cfloop>
|
|
|
|
<!--- Add image URLs to response --->
|
|
<cfset menuData["imageUrls"] = imageUrlList>
|
|
<cfset menuData["headerCandidateIndices"] = arrayNew(1)>
|
|
|
|
<cfset response["OK"] = true>
|
|
<cfset response["DATA"] = menuData>
|
|
<cfset response["sourceUrl"] = targetUrl>
|
|
<cfset response["pagesProcessed"] = arrayLen(menuPages)>
|
|
<cfset response["imagesFound"] = arrayLen(imageDataArray)>
|
|
|
|
<cfcatch type="any">
|
|
<cfset response["MESSAGE"] = cfcatch.message>
|
|
<cfif len(cfcatch.detail)>
|
|
<cfset response["DETAIL"] = cfcatch.detail>
|
|
</cfif>
|
|
<cfif structKeyExists(cfcatch, "tagContext") AND arrayLen(cfcatch.tagContext) GT 0>
|
|
<cfset response["DEBUG_LINE"] = cfcatch.tagContext[1].line>
|
|
<cfset response["DEBUG_TEMPLATE"] = cfcatch.tagContext[1].template>
|
|
</cfif>
|
|
</cfcatch>
|
|
</cftry>
|
|
|
|
<cfoutput>#serializeJSON(response)#</cfoutput>
|