Fix __OO_STATE__ extraction for Chrome View Source HTML

Pages saved from Chrome's Ctrl+U (View Source) view wrap content in
<span> tags, which breaks the regex termination pattern ;\s*window\.
because HTML tags appear between ; and the next window. variable.

Strip HTML tags from a working copy before regex extraction when the
View Source format is detected (i.e. the HTML contains <span id="line).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
John Mizerek 2026-03-01 18:45:34 -08:00
parent 4684936595
commit 22e89b2dd3

View file

@ -278,7 +278,12 @@
<!--- Also try to extract from __OO_STATE__ for images and business info --->
<cfif hasOoState>
<cfset ooStateMatch = reMatchNoCase("window\.__OO_STATE__\s*=\s*(\{.*?\});\s*window\.", pageHtml)>
<!--- Strip View Source HTML tags if present (Chrome Ctrl+U) --->
<cfset ooExtractHtml2 = pageHtml>
<cfif findNoCase('<span id="line', pageHtml)>
<cfset ooExtractHtml2 = reReplace(pageHtml, "<[^>]+>", "", "all")>
</cfif>
<cfset ooStateMatch = reMatchNoCase("window\.__OO_STATE__\s*=\s*(\{.*?\});\s*window\.", ooExtractHtml2)>
<cfif arrayLen(ooStateMatch)>
<cfset ooStateJson = reReplaceNoCase(ooStateMatch[1], "window\.__OO_STATE__\s*=\s*", "")>
<cfset ooStateJson = reReplace(ooStateJson, ";\s*window\.$", "")>
@ -557,7 +562,12 @@
<!--- Check for __OO_STATE__ in other files too (might have Restaurant info) --->
<cfif findNoCase("window.__OO_STATE__", otherHtml)>
<cfset otherOoMatch = reMatchNoCase("window\.__OO_STATE__\s*=\s*(\{.*?\});\s*window\.", otherHtml)>
<!--- Strip View Source HTML tags if present --->
<cfset otherCleanHtml = otherHtml>
<cfif findNoCase('<span id="line', otherHtml)>
<cfset otherCleanHtml = reReplace(otherHtml, "<[^>]+>", "", "all")>
</cfif>
<cfset otherOoMatch = reMatchNoCase("window\.__OO_STATE__\s*=\s*(\{.*?\});\s*window\.", otherCleanHtml)>
<cfif arrayLen(otherOoMatch)>
<cfset otherOoJson = reReplaceNoCase(otherOoMatch[1], "window\.__OO_STATE__\s*=\s*", "")>
<cfset otherOoJson = reReplace(otherOoJson, ";\s*window\.$", "")>
@ -940,7 +950,12 @@
<cfif findNoCase("window.__OO_STATE__", pageHtml) AND findNoCase("toasttab", pageHtml)>
<cfset arrayAppend(response.steps, "Toast page detected - extracting menu data from __OO_STATE__")>
<cftry>
<cfset ooStateMatch = reMatchNoCase("window\.__OO_STATE__\s*=\s*(\{.*?\});\s*window\.", pageHtml)>
<!--- Strip View Source HTML tags (Chrome Ctrl+U saves with <span> wrappers that break regex) --->
<cfset ooExtractHtml = pageHtml>
<cfif findNoCase('<span id="line', pageHtml)>
<cfset ooExtractHtml = reReplace(pageHtml, "<[^>]+>", "", "all")>
</cfif>
<cfset ooStateMatch = reMatchNoCase("window\.__OO_STATE__\s*=\s*(\{.*?\});\s*window\.", ooExtractHtml)>
<cfif arrayLen(ooStateMatch)>
<cfset ooStateJson = reReplaceNoCase(ooStateMatch[1], "window\.__OO_STATE__\s*=\s*", "")>
<cfset ooStateJson = reReplace(ooStateJson, ";\s*window\.$", "")>