Replace regex extraction with brace-counting for __OO_STATE__

The regex .*? (non-greedy) fails on 500K+ JSON due to Java regex
backtracking limits, causing truncated data (only 3 of 6 menus
extracted). Replace all 3 extraction points with cfscript
brace-counting that reliably handles any JSON size.

Also decode HTML entities (&amp; -> &, &lt; -> <, etc.) from
Chrome View Source saves.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
John Mizerek 2026-03-01 19:18:17 -08:00
parent 22e89b2dd3
commit e7aaae58b7

View file

@ -283,12 +283,36 @@
<cfif findNoCase('<span id="line', pageHtml)> <cfif findNoCase('<span id="line', pageHtml)>
<cfset ooExtractHtml2 = reReplace(pageHtml, "<[^>]+>", "", "all")> <cfset ooExtractHtml2 = reReplace(pageHtml, "<[^>]+>", "", "all")>
</cfif> </cfif>
<cfset ooStateMatch = reMatchNoCase("window\.__OO_STATE__\s*=\s*(\{.*?\});\s*window\.", ooExtractHtml2)> <cfscript>
<cfif arrayLen(ooStateMatch)> ooStateJson2 = "";
<cfset ooStateJson = reReplaceNoCase(ooStateMatch[1], "window\.__OO_STATE__\s*=\s*", "")> ooStart2 = findNoCase("window.__OO_STATE__", ooExtractHtml2);
<cfset ooStateJson = reReplace(ooStateJson, ";\s*window\.$", "")> if (ooStart2 > 0) {
// Find the first "{" at or after the window.__OO_STATE__ marker; find() returns 0 when there is none.
var bs2 = find("{", ooExtractHtml2, ooStart2);
if (bs2 > 0) {
    // d2   = current brace nesting depth
    // inS2 = true while the scan is inside a double-quoted string
    // esc2 = true when the previous character was a backslash inside a string
    // be2  = position of the "}" that balances the "{" at bs2 (0 until found)
    var d2 = 0; var inS2 = false; var esc2 = false; var be2 = 0;
    // Measure the text once; the string is not modified inside the loop, so
    // re-evaluating len() on every iteration was wasted work.
    var totalLen2 = len(ooExtractHtml2);
    for (var i2 = bs2; i2 <= totalLen2; i2++) {
        var c2 = mid(ooExtractHtml2, i2, 1);
        // A character that follows a backslash inside a string is skipped unexamined.
        if (esc2) { esc2 = false; continue; }
        // chr(92) is the backslash; it only acts as an escape while inside a string.
        if (c2 == chr(92) && inS2) { esc2 = true; continue; }
        // An unescaped double quote toggles the in-string state.
        if (c2 == '"') { inS2 = !inS2; continue; }
        // Braces are counted only outside strings.
        if (!inS2) {
            if (c2 == "{") d2 = d2 + 1;
            else if (c2 == "}") { d2 = d2 - 1; if (d2 == 0) { be2 = i2; break; } }
        }
    }
    // Copy the balanced {...} span, both braces included; stays empty if no balance point was reached.
    if (be2 > 0) ooStateJson2 = mid(ooExtractHtml2, bs2, be2 - bs2 + 1);
}
}
// Decode the HTML entities found in the extracted JSON text (only when something was extracted).
// "&amp;" must be decoded LAST. Decoding it first turns an escaped entity such as
// "&amp;quot;" (which stands for the literal text "&quot;") into "&quot;", and the
// later passes then decode it a second time into a raw double quote, corrupting the JSON.
if (len(ooStateJson2)) {
    ooStateJson2 = replace(ooStateJson2, "&lt;", "<", "all");
    ooStateJson2 = replace(ooStateJson2, "&gt;", ">", "all");
    ooStateJson2 = replace(ooStateJson2, "&quot;", '"', "all");
    ooStateJson2 = replace(ooStateJson2, "&amp;", "&", "all");
}
</cfscript>
<cfif len(ooStateJson2)>
<cftry> <cftry>
<cfset ooState = deserializeJSON(ooStateJson)> <cfset ooState = deserializeJSON(ooStateJson2)>
<!--- Debug: log all top-level keys in OO_STATE ---> <!--- Debug: log all top-level keys in OO_STATE --->
<cfset ooStateKeys = structKeyList(ooState)> <cfset ooStateKeys = structKeyList(ooState)>
<cfset arrayAppend(response.steps, "OO_STATE keys: " & left(ooStateKeys, 500))> <cfset arrayAppend(response.steps, "OO_STATE keys: " & left(ooStateKeys, 500))>
@ -567,10 +591,34 @@
<cfif findNoCase('<span id="line', otherHtml)> <cfif findNoCase('<span id="line', otherHtml)>
<cfset otherCleanHtml = reReplace(otherHtml, "<[^>]+>", "", "all")> <cfset otherCleanHtml = reReplace(otherHtml, "<[^>]+>", "", "all")>
</cfif> </cfif>
<cfset otherOoMatch = reMatchNoCase("window\.__OO_STATE__\s*=\s*(\{.*?\});\s*window\.", otherCleanHtml)> <cfscript>
<cfif arrayLen(otherOoMatch)> otherOoJson = "";
<cfset otherOoJson = reReplaceNoCase(otherOoMatch[1], "window\.__OO_STATE__\s*=\s*", "")> var otherOoStart = findNoCase("window.__OO_STATE__", otherCleanHtml);
<cfset otherOoJson = reReplace(otherOoJson, ";\s*window\.$", "")> if (otherOoStart > 0) {
// Find the first "{" at or after the window.__OO_STATE__ marker; find() returns 0 when there is none.
var obs = find("{", otherCleanHtml, otherOoStart);
if (obs > 0) {
    // od   = current brace nesting depth
    // ois  = true while the scan is inside a double-quoted string
    // oesc = true when the previous character was a backslash inside a string
    // obe  = position of the "}" that balances the "{" at obs (0 until found)
    var od = 0; var ois = false; var oesc = false; var obe = 0;
    // Measure the text once; the string is not modified inside the loop, so
    // re-evaluating len() on every iteration was wasted work.
    var otherTotalLen = len(otherCleanHtml);
    for (var oi = obs; oi <= otherTotalLen; oi++) {
        var oc = mid(otherCleanHtml, oi, 1);
        // A character that follows a backslash inside a string is skipped unexamined.
        if (oesc) { oesc = false; continue; }
        // chr(92) is the backslash; it only acts as an escape while inside a string.
        if (oc == chr(92) && ois) { oesc = true; continue; }
        // An unescaped double quote toggles the in-string state.
        if (oc == '"') { ois = !ois; continue; }
        // Braces are counted only outside strings.
        if (!ois) {
            if (oc == "{") od = od + 1;
            else if (oc == "}") { od = od - 1; if (od == 0) { obe = oi; break; } }
        }
    }
    // Copy the balanced {...} span, both braces included; stays empty if no balance point was reached.
    if (obe > 0) otherOoJson = mid(otherCleanHtml, obs, obe - obs + 1);
}
}
// Decode the HTML entities found in the extracted JSON text (only when something was extracted).
// "&amp;" must be decoded LAST. Decoding it first turns an escaped entity such as
// "&amp;quot;" (which stands for the literal text "&quot;") into "&quot;", and the
// later passes then decode it a second time into a raw double quote, corrupting the JSON.
if (len(otherOoJson)) {
    otherOoJson = replace(otherOoJson, "&lt;", "<", "all");
    otherOoJson = replace(otherOoJson, "&gt;", ">", "all");
    otherOoJson = replace(otherOoJson, "&quot;", '"', "all");
    otherOoJson = replace(otherOoJson, "&amp;", "&", "all");
}
</cfscript>
<cfif len(otherOoJson)>
<cftry> <cftry>
<cfset otherOoState = deserializeJSON(otherOoJson)> <cfset otherOoState = deserializeJSON(otherOoJson)>
<cfloop collection="#otherOoState#" item="otherKey"> <cfloop collection="#otherOoState#" item="otherKey">
@ -955,10 +1003,45 @@
<cfif findNoCase('<span id="line', pageHtml)> <cfif findNoCase('<span id="line', pageHtml)>
<cfset ooExtractHtml = reReplace(pageHtml, "<[^>]+>", "", "all")> <cfset ooExtractHtml = reReplace(pageHtml, "<[^>]+>", "", "all")>
</cfif> </cfif>
<cfset ooStateMatch = reMatchNoCase("window\.__OO_STATE__\s*=\s*(\{.*?\});\s*window\.", ooExtractHtml)> <!--- Extract JSON using brace-counting (regex .*? fails on 500K+ JSON due to backtracking) --->
<cfif arrayLen(ooStateMatch)> <cfscript>
<cfset ooStateJson = reReplaceNoCase(ooStateMatch[1], "window\.__OO_STATE__\s*=\s*", "")> ooStateJson = "";
<cfset ooStateJson = reReplace(ooStateJson, ";\s*window\.$", "")> ooStartPos = findNoCase("window.__OO_STATE__", ooExtractHtml);
// Scan only when the window.__OO_STATE__ marker was located.
if (ooStartPos > 0) {
    // First "{" at or after the marker; find() returns 0 when there is none.
    braceStart = find("{", ooExtractHtml, ooStartPos);
    if (braceStart > 0) {
        var depth = 0;       // current brace nesting depth
        var inStr = false;   // inside a double-quoted string
        var esc = false;     // previous character was a backslash inside a string
        var braceEnd = 0;    // position of the balancing "}" (0 until found)
        var totalLen = len(ooExtractHtml);
        for (var ci = braceStart; ci <= totalLen; ci++) {
            var ch = mid(ooExtractHtml, ci, 1);
            if (esc) {
                // Character after a backslash: skip it without examining it.
                esc = false;
            } else if (inStr && ch == chr(92)) {
                // chr(92) is the backslash; it escapes the next character inside a string.
                esc = true;
            } else if (ch == '"') {
                // Unescaped double quote: enter or leave a string.
                inStr = !inStr;
            } else if (!inStr) {
                // Braces are counted only outside strings.
                if (ch == "{") {
                    depth++;
                } else if (ch == "}") {
                    depth--;
                    if (depth == 0) {
                        // The opening brace at braceStart is now balanced.
                        braceEnd = ci;
                        break;
                    }
                }
            }
        }
        // Copy the balanced {...} span, both braces included.
        if (braceEnd > 0) {
            ooStateJson = mid(ooExtractHtml, braceStart, braceEnd - braceStart + 1);
        }
    }
}
// Decode HTML entities from View Source (Chrome encodes & as &amp; etc.)
// "&amp;" must be decoded LAST. Decoding it first turns an escaped entity such as
// "&amp;quot;" (which stands for the literal text "&quot;") into "&quot;", and the
// later passes then decode it a second time into a raw double quote, corrupting the JSON.
if (len(ooStateJson)) {
    ooStateJson = replace(ooStateJson, "&lt;", "<", "all");
    ooStateJson = replace(ooStateJson, "&gt;", ">", "all");
    ooStateJson = replace(ooStateJson, "&quot;", '"', "all");
    ooStateJson = replace(ooStateJson, "&amp;", "&", "all");
}
</cfscript>
<cfif len(ooStateJson)>
<cfset ooState = deserializeJSON(ooStateJson)> <cfset ooState = deserializeJSON(ooStateJson)>
<cfset toastBusiness = structNew()> <cfset toastBusiness = structNew()>