Use Playwright for JS-rendered menu scraping
- Replace cfhttp with Playwright headless browser
- Capture images from network requests during page render
- No longer needs to fetch subpages (JS renders everything)
- Should capture subcategory items that load dynamically

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
22fc113461
commit
5c50ce2cf9
1 changed file with 31 additions and 56 deletions
|
|
@@ -55,26 +55,24 @@
|
||||||
<cfset targetUrl = "https://" & targetUrl>
|
<cfset targetUrl = "https://" & targetUrl>
|
||||||
</cfif>
|
</cfif>
|
||||||
|
|
||||||
<cfset arrayAppend(response.steps, "Fetching URL: " & targetUrl)>
|
<cfset arrayAppend(response.steps, "Fetching URL with Playwright: " & targetUrl)>
|
||||||
|
|
||||||
<!--- Fetch the main page with browser-like headers --->
|
<!--- Use Playwright for JS-rendered content --->
|
||||||
<cfhttp url="#targetUrl#" method="GET" timeout="30" result="mainPage" useragent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36">
|
<cfset playwrightOutput = "">
|
||||||
<cfhttpparam type="header" name="Accept" value="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8">
|
<cfexecute name="node" arguments="/opt/playwright/render.js '#targetUrl#' 4000" timeout="90" variable="playwrightOutput" />
|
||||||
<cfhttpparam type="header" name="Accept-Language" value="en-US,en;q=0.9">
|
|
||||||
<cfhttpparam type="header" name="Accept-Encoding" value="gzip, deflate, br">
|
|
||||||
<cfhttpparam type="header" name="Sec-Fetch-Dest" value="document">
|
|
||||||
<cfhttpparam type="header" name="Sec-Fetch-Mode" value="navigate">
|
|
||||||
<cfhttpparam type="header" name="Sec-Fetch-Site" value="none">
|
|
||||||
<cfhttpparam type="header" name="Sec-Fetch-User" value="?1">
|
|
||||||
<cfhttpparam type="header" name="Upgrade-Insecure-Requests" value="1">
|
|
||||||
</cfhttp>
|
|
||||||
|
|
||||||
<cfif mainPage.statusCode NEQ "200 OK" AND NOT findNoCase("200", mainPage.statusCode)>
|
<cfif NOT len(trim(playwrightOutput))>
|
||||||
<cfthrow message="Failed to fetch URL: #mainPage.statusCode#">
|
<cfthrow message="Playwright returned empty response">
|
||||||
</cfif>
|
</cfif>
|
||||||
|
|
||||||
<cfset pageHtml = mainPage.fileContent>
|
<cfset playwrightResult = deserializeJSON(playwrightOutput)>
|
||||||
<cfset arrayAppend(response.steps, "Fetched " & len(pageHtml) & " bytes")>
|
<cfif structKeyExists(playwrightResult, "error")>
|
||||||
|
<cfthrow message="Playwright error: #playwrightResult.error#">
|
||||||
|
</cfif>
|
||||||
|
|
||||||
|
<cfset pageHtml = playwrightResult.html>
|
||||||
|
<cfset playwrightImages = structKeyExists(playwrightResult, "images") ? playwrightResult.images : arrayNew(1)>
|
||||||
|
<cfset arrayAppend(response.steps, "Fetched " & len(pageHtml) & " bytes via Playwright, " & arrayLen(playwrightImages) & " images captured")>
|
||||||
|
|
||||||
<!--- Extract base URL for resolving relative links --->
|
<!--- Extract base URL for resolving relative links --->
|
||||||
<cfset baseUrl = reReplace(targetUrl, "(https?://[^/]+).*", "\1")>
|
<cfset baseUrl = reReplace(targetUrl, "(https?://[^/]+).*", "\1")>
|
||||||
|
|
@@ -86,52 +84,27 @@
|
||||||
<cfthrow message="Either 'url' or 'html' content is required">
|
<cfthrow message="Either 'url' or 'html' content is required">
|
||||||
</cfif>
|
</cfif>
|
||||||
|
|
||||||
<!--- Find menu links and fetch them too --->
|
<!--- Initialize playwrightImages if not set (HTML upload case) --->
|
||||||
|
<cfif NOT isDefined("playwrightImages")>
|
||||||
|
<cfset playwrightImages = arrayNew(1)>
|
||||||
|
</cfif>
|
||||||
|
|
||||||
|
<!--- Menu pages array - Playwright renders JS so we get everything in one page --->
|
||||||
<cfset menuPages = arrayNew(1)>
|
<cfset menuPages = arrayNew(1)>
|
||||||
<cfset arrayAppend(menuPages, { url: targetUrl, html: pageHtml })>
|
<cfset arrayAppend(menuPages, { url: isDefined("targetUrl") ? targetUrl : "uploaded", html: pageHtml })>
|
||||||
|
|
||||||
<!--- Look for menu links in the page --->
|
|
||||||
<cfset menuLinkPatterns = 'href=["'']([^"'']*(?:menu|food|dishes|order)[^"'']*)["'']'>
|
|
||||||
<cfset menuLinks = reMatchNoCase(menuLinkPatterns, pageHtml)>
|
|
||||||
|
|
||||||
<cfloop array="#menuLinks#" index="linkMatch">
|
|
||||||
<cfset linkUrl = reReplaceNoCase(linkMatch, 'href=["'']([^"'']*)["'']', "\1")>
|
|
||||||
|
|
||||||
<!--- Resolve relative URLs --->
|
|
||||||
<cfif left(linkUrl, 1) EQ "/">
|
|
||||||
<cfset linkUrl = baseUrl & linkUrl>
|
|
||||||
<cfelseif NOT reFindNoCase("^https?://", linkUrl)>
|
|
||||||
<cfset linkUrl = basePath & linkUrl>
|
|
||||||
</cfif>
|
|
||||||
|
|
||||||
<!--- Skip if same as main page or external domain --->
|
|
||||||
<cfif linkUrl NEQ targetUrl AND findNoCase(baseUrl, linkUrl)>
|
|
||||||
<cftry>
|
|
||||||
<cfhttp url="#linkUrl#" method="GET" timeout="15" result="subPage" useragent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36">
|
|
||||||
<cfhttpparam type="header" name="Accept" value="text/html,application/xhtml+xml">
|
|
||||||
</cfhttp>
|
|
||||||
|
|
||||||
<cfif findNoCase("200", subPage.statusCode)>
|
|
||||||
<cfset arrayAppend(menuPages, { url: linkUrl, html: subPage.fileContent })>
|
|
||||||
<cfset arrayAppend(response.steps, "Found menu page: " & linkUrl)>
|
|
||||||
</cfif>
|
|
||||||
<cfcatch>
|
|
||||||
<!--- Skip failed requests --->
|
|
||||||
</cfcatch>
|
|
||||||
</cftry>
|
|
||||||
</cfif>
|
|
||||||
|
|
||||||
<!--- Limit to 5 pages max --->
|
|
||||||
<cfif arrayLen(menuPages) GTE 5>
|
|
||||||
<cfbreak>
|
|
||||||
</cfif>
|
|
||||||
</cfloop>
|
|
||||||
|
|
||||||
<!--- Extract images from all pages --->
|
<!--- Extract images from all pages --->
|
||||||
<cfset allImages = arrayNew(1)>
|
<cfset allImages = arrayNew(1)>
|
||||||
<cfset imageUrls = structNew()>
|
<cfset imageUrls = structNew()>
|
||||||
<cfset imageMappings = arrayNew(1)><!--- For local HTML: filename -> alt text mappings --->
|
<cfset imageMappings = arrayNew(1)><!--- For local HTML: filename -> alt text mappings --->
|
||||||
|
|
||||||
|
<!--- Add images captured by Playwright (network requests) --->
|
||||||
|
<cfloop array="#playwrightImages#" index="pwImg">
|
||||||
|
<cfif NOT reFindNoCase("(icon|favicon|logo|sprite|pixel|tracking|badge|button|\.svg)", pwImg)>
|
||||||
|
<cfset imageUrls[pwImg] = true>
|
||||||
|
</cfif>
|
||||||
|
</cfloop>
|
||||||
|
|
||||||
<cfloop array="#menuPages#" index="menuPage">
|
<cfloop array="#menuPages#" index="menuPage">
|
||||||
<!--- Find all img tags --->
|
<!--- Find all img tags --->
|
||||||
<cfset imgMatches = reMatchNoCase('<img[^>]+src=["'']([^"'']+)["''][^>]*>', menuPage.html)>
|
<cfset imgMatches = reMatchNoCase('<img[^>]+src=["'']([^"'']+)["''][^>]*>', menuPage.html)>
|
||||||
|
|
@@ -579,11 +552,13 @@
|
||||||
|
|
||||||
<cfset response["OK"] = true>
|
<cfset response["OK"] = true>
|
||||||
<cfset response["DATA"] = menuData>
|
<cfset response["DATA"] = menuData>
|
||||||
<cfset response["sourceUrl"] = targetUrl>
|
<cfset response["sourceUrl"] = isDefined("targetUrl") ? targetUrl : "uploaded">
|
||||||
<cfset response["pagesProcessed"] = arrayLen(menuPages)>
|
<cfset response["pagesProcessed"] = arrayLen(menuPages)>
|
||||||
<cfset response["imagesFound"] = arrayLen(imageDataArray)>
|
<cfset response["imagesFound"] = arrayLen(imageDataArray)>
|
||||||
|
<cfset response["playwrightImagesCount"] = arrayLen(playwrightImages)>
|
||||||
<!--- Debug: show subcategory mapping --->
|
<!--- Debug: show subcategory mapping --->
|
||||||
<cfset response["DEBUG_SUBCAT_MAP"] = subcatToParentMap>
|
<cfset response["DEBUG_SUBCAT_MAP"] = subcatToParentMap>
|
||||||
|
<cfset response["DEBUG_PLAYWRIGHT_IMAGES"] = playwrightImages>
|
||||||
<cfset response["DEBUG_RAW_CATEGORIES"] = menuData.categories>
|
<cfset response["DEBUG_RAW_CATEGORIES"] = menuData.categories>
|
||||||
|
|
||||||
<cfcatch type="any">
|
<cfcatch type="any">
|
||||||
|
|
|
||||||
Reference in a new issue