Use Playwright for JS-rendered menu scraping

- Replace cfhttp with Playwright headless browser
- Capture images from network requests during page render
- No longer needs to fetch subpages (JS renders everything)
- Should capture subcategory items that load dynamically

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
John Mizerek 2026-02-12 21:43:37 -08:00
parent 22fc113461
commit 5c50ce2cf9

View file

@ -55,26 +55,24 @@
<cfset targetUrl = "https://" & targetUrl>
</cfif>
<cfset arrayAppend(response.steps, "Fetching URL: " & targetUrl)>
<cfset arrayAppend(response.steps, "Fetching URL with Playwright: " & targetUrl)>
<!--- Fetch the main page with browser-like headers --->
<cfhttp url="#targetUrl#" method="GET" timeout="30" result="mainPage" useragent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36">
<cfhttpparam type="header" name="Accept" value="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8">
<cfhttpparam type="header" name="Accept-Language" value="en-US,en;q=0.9">
<cfhttpparam type="header" name="Accept-Encoding" value="gzip, deflate, br">
<cfhttpparam type="header" name="Sec-Fetch-Dest" value="document">
<cfhttpparam type="header" name="Sec-Fetch-Mode" value="navigate">
<cfhttpparam type="header" name="Sec-Fetch-Site" value="none">
<cfhttpparam type="header" name="Sec-Fetch-User" value="?1">
<cfhttpparam type="header" name="Upgrade-Insecure-Requests" value="1">
</cfhttp>
<!---
    Render the page with Playwright (headless browser) so JS-generated content is present.
    Runs: node /opt/playwright/render.js '<targetUrl>' 4000 and captures stdout in
    playwrightOutput. The meaning of the trailing 4000 is defined by render.js
    (presumably a wait in milliseconds - TODO confirm).

    targetUrl is interpolated into a single-quoted argument inside one argument string.
    NOTE(review): targetUrl appears to be caller-supplied - confirm. A quote, backslash or
    whitespace in it could end the quoted argument early and pass extra arguments to node,
    so such URLs are rejected before anything is executed. Legitimate URLs carry these
    characters percent-encoded.
--->
<cfif reFind("['""\\\s]", targetUrl)>
    <cfthrow message="Invalid URL: quotes, backslashes and whitespace are not allowed">
</cfif>
<cfset playwrightOutput = "">
<cfexecute name="node" arguments="/opt/playwright/render.js '#targetUrl#' 4000" timeout="90" variable="playwrightOutput" />
<cfif mainPage.statusCode NEQ "200 OK" AND NOT findNoCase("200", mainPage.statusCode)>
<cfthrow message="Failed to fetch URL: #mainPage.statusCode#">
<cfif NOT len(trim(playwrightOutput))>
<cfthrow message="Playwright returned empty response">
</cfif>
<cfset pageHtml = mainPage.fileContent>
<cfset arrayAppend(response.steps, "Fetched " & len(pageHtml) & " bytes")>
<!---
    Decode the renderer output. The code below reads three keys from the decoded struct:
    "error" (failure message), "html" (rendered markup) and the optional "images" array.
    Anything else printed by the process (e.g. a stack trace) is not valid JSON, so report
    it with a short excerpt instead of surfacing a bare JSON parser error.
--->
<cftry>
    <cfset playwrightResult = deserializeJSON(playwrightOutput)>
    <cfcatch type="any">
        <cfthrow message="Playwright returned invalid JSON: #left(trim(playwrightOutput), 200)#">
    </cfcatch>
</cftry>
<cfif NOT isStruct(playwrightResult)>
    <cfthrow message="Playwright returned unexpected JSON (expected an object)">
</cfif>
<cfif structKeyExists(playwrightResult, "error")>
    <cfthrow message="Playwright error: #playwrightResult.error#">
</cfif>
<!--- Without this check a missing "html" key fails later with an unrelated "key not found" error --->
<cfif NOT structKeyExists(playwrightResult, "html")>
    <cfthrow message="Playwright response is missing 'html'">
</cfif>
<cfset pageHtml = playwrightResult.html>
<!--- "images" is optional; anything that is not an array is treated as no captured images so arrayLen() is safe --->
<cfset playwrightImages = (structKeyExists(playwrightResult, "images") AND isArray(playwrightResult.images)) ? playwrightResult.images : arrayNew(1)>
<cfset arrayAppend(response.steps, "Fetched " & len(pageHtml) & " bytes via Playwright, " & arrayLen(playwrightImages) & " images captured")>
<!--- Extract base URL for resolving relative links --->
<cfset baseUrl = reReplace(targetUrl, "(https?://[^/]+).*", "\1")>
@ -86,52 +84,27 @@
<cfthrow message="Either 'url' or 'html' content is required">
</cfif>
<!--- Find menu links and fetch them too --->
<!---
    Default playwrightImages to an empty array when it has not been assigned yet
    (HTML upload case, where no Playwright render takes place), so that code which
    loops over it or calls arrayLen() on it does not fail on an undefined variable.
--->
<cfif NOT isDefined("playwrightImages")>
<cfset playwrightImages = arrayNew(1)>
</cfif>
<!--- Menu pages array - Playwright renders JS so we get everything in one page --->
<cfset menuPages = arrayNew(1)>
<cfset arrayAppend(menuPages, { url: targetUrl, html: pageHtml })>
<!--- Look for menu links in the page --->
<cfset menuLinkPatterns = 'href=["'']([^"'']*(?:menu|food|dishes|order)[^"'']*)["'']'>
<cfset menuLinks = reMatchNoCase(menuLinkPatterns, pageHtml)>
<cfloop array="#menuLinks#" index="linkMatch">
<cfset linkUrl = reReplaceNoCase(linkMatch, 'href=["'']([^"'']*)["'']', "\1")>
<!--- Resolve relative URLs --->
<cfif left(linkUrl, 1) EQ "/">
<cfset linkUrl = baseUrl & linkUrl>
<cfelseif NOT reFindNoCase("^https?://", linkUrl)>
<cfset linkUrl = basePath & linkUrl>
</cfif>
<!--- Skip if same as main page or external domain --->
<cfif linkUrl NEQ targetUrl AND findNoCase(baseUrl, linkUrl)>
<cftry>
<cfhttp url="#linkUrl#" method="GET" timeout="15" result="subPage" useragent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36">
<cfhttpparam type="header" name="Accept" value="text/html,application/xhtml+xml">
</cfhttp>
<cfif findNoCase("200", subPage.statusCode)>
<cfset arrayAppend(menuPages, { url: linkUrl, html: subPage.fileContent })>
<cfset arrayAppend(response.steps, "Found menu page: " & linkUrl)>
</cfif>
<cfcatch>
<!--- Skip failed requests --->
</cfcatch>
</cftry>
</cfif>
<!--- Limit to 5 pages max --->
<cfif arrayLen(menuPages) GTE 5>
<cfbreak>
</cfif>
</cfloop>
<cfset arrayAppend(menuPages, { url: isDefined("targetUrl") ? targetUrl : "uploaded", html: pageHtml })>
<!--- Extract images from all pages --->
<cfset allImages = arrayNew(1)>
<cfset imageUrls = structNew()>
<cfset imageMappings = arrayNew(1)><!--- For local HTML: filename -> alt text mappings --->
<!---
    Seed imageUrls with the image URLs Playwright captured from network requests.
    URLs matching the pattern below (icons, logos, sprites, tracking pixels, badges,
    buttons, SVG files) are skipped; every other URL becomes a key in imageUrls.
--->
<cfscript>
for (pwImg in playwrightImages) {
    // reFindNoCase returns 0 when nothing matches; any match marks the URL as skipped
    if (reFindNoCase("(icon|favicon|logo|sprite|pixel|tracking|badge|button|\.svg)", pwImg) NEQ 0) {
        continue;
    }
    imageUrls[pwImg] = true;
}
</cfscript>
<cfloop array="#menuPages#" index="menuPage">
<!--- Find all img tags --->
<cfset imgMatches = reMatchNoCase('<img[^>]+src=["'']([^"'']+)["''][^>]*>', menuPage.html)>
@ -579,11 +552,13 @@
<cfset response["OK"] = true>
<cfset response["DATA"] = menuData>
<cfset response["sourceUrl"] = targetUrl>
<cfset response["sourceUrl"] = isDefined("targetUrl") ? targetUrl : "uploaded">
<cfset response["pagesProcessed"] = arrayLen(menuPages)>
<cfset response["imagesFound"] = arrayLen(imageDataArray)>
<cfset response["playwrightImagesCount"] = arrayLen(playwrightImages)>
<!--- Debug: show subcategory mapping --->
<cfset response["DEBUG_SUBCAT_MAP"] = subcatToParentMap>
<cfset response["DEBUG_PLAYWRIGHT_IMAGES"] = playwrightImages>
<cfset response["DEBUG_RAW_CATEGORIES"] = menuData.categories>
<cfcatch type="any">