Fix __OO_STATE__ extraction for Chrome View Source HTML
A page saved from Chrome's View Source (Ctrl+U) wraps its content in <span> tags, which breaks the regex termination pattern `;\s*window\.` because HTML tags appear between the `;` and the next `window.` variable. Strip HTML tags from a working copy before regex extraction when the View Source format is detected (i.e., the page contains tags beginning with `<span id="line`). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
4684936595
commit
22e89b2dd3
1 changed file with 18 additions and 3 deletions
|
|
@ -278,7 +278,12 @@
|
|||
|
||||
<!--- Also try to extract from __OO_STATE__ for images and business info --->
|
||||
<cfif hasOoState>
|
||||
<cfset ooStateMatch = reMatchNoCase("window\.__OO_STATE__\s*=\s*(\{.*?\});\s*window\.", pageHtml)>
|
||||
<!--- Strip View Source HTML tags if present (Chrome Ctrl+U) --->
|
||||
<cfset ooExtractHtml2 = pageHtml>
|
||||
<cfif findNoCase('<span id="line', pageHtml)>
|
||||
<cfset ooExtractHtml2 = reReplace(pageHtml, "<[^>]+>", "", "all")>
|
||||
</cfif>
|
||||
<cfset ooStateMatch = reMatchNoCase("window\.__OO_STATE__\s*=\s*(\{.*?\});\s*window\.", ooExtractHtml2)>
|
||||
<cfif arrayLen(ooStateMatch)>
|
||||
<cfset ooStateJson = reReplaceNoCase(ooStateMatch[1], "window\.__OO_STATE__\s*=\s*", "")>
|
||||
<cfset ooStateJson = reReplace(ooStateJson, ";\s*window\.$", "")>
|
||||
|
|
@ -557,7 +562,12 @@
|
|||
|
||||
<!--- Check for __OO_STATE__ in other files too (might have Restaurant info) --->
|
||||
<cfif findNoCase("window.__OO_STATE__", otherHtml)>
|
||||
<cfset otherOoMatch = reMatchNoCase("window\.__OO_STATE__\s*=\s*(\{.*?\});\s*window\.", otherHtml)>
|
||||
<!--- Strip View Source HTML tags if present --->
|
||||
<cfset otherCleanHtml = otherHtml>
|
||||
<cfif findNoCase('<span id="line', otherHtml)>
|
||||
<cfset otherCleanHtml = reReplace(otherHtml, "<[^>]+>", "", "all")>
|
||||
</cfif>
|
||||
<cfset otherOoMatch = reMatchNoCase("window\.__OO_STATE__\s*=\s*(\{.*?\});\s*window\.", otherCleanHtml)>
|
||||
<cfif arrayLen(otherOoMatch)>
|
||||
<cfset otherOoJson = reReplaceNoCase(otherOoMatch[1], "window\.__OO_STATE__\s*=\s*", "")>
|
||||
<cfset otherOoJson = reReplace(otherOoJson, ";\s*window\.$", "")>
|
||||
|
|
@ -940,7 +950,12 @@
|
|||
<cfif findNoCase("window.__OO_STATE__", pageHtml) AND findNoCase("toasttab", pageHtml)>
|
||||
<cfset arrayAppend(response.steps, "Toast page detected - extracting menu data from __OO_STATE__")>
|
||||
<cftry>
|
||||
<cfset ooStateMatch = reMatchNoCase("window\.__OO_STATE__\s*=\s*(\{.*?\});\s*window\.", pageHtml)>
|
||||
<!--- Strip View Source HTML tags (Chrome Ctrl+U saves with <span> wrappers that break regex) --->
|
||||
<cfset ooExtractHtml = pageHtml>
|
||||
<cfif findNoCase('<span id="line', pageHtml)>
|
||||
<cfset ooExtractHtml = reReplace(pageHtml, "<[^>]+>", "", "all")>
|
||||
</cfif>
|
||||
<cfset ooStateMatch = reMatchNoCase("window\.__OO_STATE__\s*=\s*(\{.*?\});\s*window\.", ooExtractHtml)>
|
||||
<cfif arrayLen(ooStateMatch)>
|
||||
<cfset ooStateJson = reReplaceNoCase(ooStateMatch[1], "window\.__OO_STATE__\s*=\s*", "")>
|
||||
<cfset ooStateJson = reReplace(ooStateJson, ";\s*window\.$", "")>
|
||||
|
|
|
|||
Reference in a new issue