Replace regex extraction with brace-counting for __OO_STATE__
The regex .*? (non-greedy) fails on 500K+ JSON due to Java regex backtracking limits, causing truncated data (only 3 of 6 menus extracted). Replace all 3 extraction points with cfscript brace-counting that reliably handles any JSON size. Also decode HTML entities (&amp; -> &, &lt; -> <, etc.) from Chrome View Source saves. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
22e89b2dd3
commit
e7aaae58b7
1 changed files with 96 additions and 13 deletions
|
|
@ -283,12 +283,36 @@
|
||||||
<cfif findNoCase('<span id="line', pageHtml)>
|
<cfif findNoCase('<span id="line', pageHtml)>
|
||||||
<cfset ooExtractHtml2 = reReplace(pageHtml, "<[^>]+>", "", "all")>
|
<cfset ooExtractHtml2 = reReplace(pageHtml, "<[^>]+>", "", "all")>
|
||||||
</cfif>
|
</cfif>
|
||||||
<cfset ooStateMatch = reMatchNoCase("window\.__OO_STATE__\s*=\s*(\{.*?\});\s*window\.", ooExtractHtml2)>
|
<cfscript>
|
||||||
<cfif arrayLen(ooStateMatch)>
|
ooStateJson2 = "";
|
||||||
<cfset ooStateJson = reReplaceNoCase(ooStateMatch[1], "window\.__OO_STATE__\s*=\s*", "")>
|
ooStart2 = findNoCase("window.__OO_STATE__", ooExtractHtml2);
|
||||||
<cfset ooStateJson = reReplace(ooStateJson, ";\s*window\.$", "")>
|
// Starting at the window.__OO_STATE__ marker, find the first "{" and scan forward to
// its matching "}". Braces inside double-quoted strings are not counted, and a
// backslash inside a string causes the following character to be skipped.
// On success ooStateJson2 receives the text from the opening to the closing brace.
if (ooStart2 > 0) {
    var bs2 = find("{", ooExtractHtml2, ooStart2);
    if (bs2 > 0) {
        var d2 = 0;        // current brace nesting depth
        var inS2 = false;  // true while the scan is inside a double-quoted string
        var esc2 = false;  // true when the previous character was a backslash in a string
        var be2 = 0;       // position of the matching closing brace; 0 means not found
        for (var i2 = bs2; i2 <= len(ooExtractHtml2); i2++) {
            var c2 = mid(ooExtractHtml2, i2, 1);
            if (esc2) {
                // This character was escaped by the preceding backslash: ignore it.
                esc2 = false;
            } else if (inS2 && c2 == chr(92)) {
                esc2 = true;
            } else if (c2 == '"') {
                inS2 = !inS2;
            } else if (!inS2) {
                if (c2 == "{") {
                    d2 = d2 + 1;
                } else if (c2 == "}") {
                    d2 = d2 - 1;
                    if (d2 == 0) {
                        // Depth returned to zero: this brace closes the object.
                        be2 = i2;
                        break;
                    }
                }
            }
        }
        if (be2 > 0) {
            ooStateJson2 = mid(ooExtractHtml2, bs2, be2 - bs2 + 1);
        }
    }
}
|
||||||
|
// Undo the HTML escaping present in pages saved from a "View Source" tab.
// "&amp;" is decoded LAST on purpose: text that was already an entity in the
// original JSON (e.g. "&quot;") is saved as "&amp;quot;". Decoding "&amp;" first
// would turn it into "&quot;" and then into a raw double quote, corrupting the
// JSON string it sits in. Decoding it last leaves the original "&quot;" intact.
if (len(ooStateJson2)) {
    ooStateJson2 = replace(ooStateJson2, "&lt;", "<", "all");
    ooStateJson2 = replace(ooStateJson2, "&gt;", ">", "all");
    ooStateJson2 = replace(ooStateJson2, "&quot;", '"', "all");
    ooStateJson2 = replace(ooStateJson2, "&amp;", "&", "all");
}
|
||||||
|
</cfscript>
|
||||||
|
<cfif len(ooStateJson2)>
|
||||||
<cftry>
|
<cftry>
|
||||||
<cfset ooState = deserializeJSON(ooStateJson)>
|
<cfset ooState = deserializeJSON(ooStateJson2)>
|
||||||
<!--- Debug: log all top-level keys in OO_STATE --->
|
<!--- Debug: log all top-level keys in OO_STATE --->
|
||||||
<cfset ooStateKeys = structKeyList(ooState)>
|
<cfset ooStateKeys = structKeyList(ooState)>
|
||||||
<cfset arrayAppend(response.steps, "OO_STATE keys: " & left(ooStateKeys, 500))>
|
<cfset arrayAppend(response.steps, "OO_STATE keys: " & left(ooStateKeys, 500))>
|
||||||
|
|
@ -567,10 +591,34 @@
|
||||||
<cfif findNoCase('<span id="line', otherHtml)>
|
<cfif findNoCase('<span id="line', otherHtml)>
|
||||||
<cfset otherCleanHtml = reReplace(otherHtml, "<[^>]+>", "", "all")>
|
<cfset otherCleanHtml = reReplace(otherHtml, "<[^>]+>", "", "all")>
|
||||||
</cfif>
|
</cfif>
|
||||||
<cfset otherOoMatch = reMatchNoCase("window\.__OO_STATE__\s*=\s*(\{.*?\});\s*window\.", otherCleanHtml)>
|
<cfscript>
|
||||||
<cfif arrayLen(otherOoMatch)>
|
otherOoJson = "";
|
||||||
<cfset otherOoJson = reReplaceNoCase(otherOoMatch[1], "window\.__OO_STATE__\s*=\s*", "")>
|
var otherOoStart = findNoCase("window.__OO_STATE__", otherCleanHtml);
|
||||||
<cfset otherOoJson = reReplace(otherOoJson, ";\s*window\.$", "")>
|
// Starting at the window.__OO_STATE__ marker in otherCleanHtml, find the first "{"
// and scan forward to its matching "}". Braces inside double-quoted strings are not
// counted, and a backslash inside a string causes the following character to be
// skipped. On success otherOoJson receives the text between the two braces, inclusive.
if (otherOoStart > 0) {
    var obs = find("{", otherCleanHtml, otherOoStart);
    if (obs > 0) {
        var od = 0;        // current brace nesting depth
        var ois = false;   // true while the scan is inside a double-quoted string
        var oesc = false;  // true when the previous character was a backslash in a string
        var obe = 0;       // position of the matching closing brace; 0 means not found
        for (var oi = obs; oi <= len(otherCleanHtml); oi++) {
            var oc = mid(otherCleanHtml, oi, 1);
            if (oesc) {
                // This character was escaped by the preceding backslash: ignore it.
                oesc = false;
            } else if (ois && oc == chr(92)) {
                oesc = true;
            } else if (oc == '"') {
                ois = !ois;
            } else if (!ois) {
                if (oc == "{") {
                    od = od + 1;
                } else if (oc == "}") {
                    od = od - 1;
                    if (od == 0) {
                        // Depth returned to zero: this brace closes the object.
                        obe = oi;
                        break;
                    }
                }
            }
        }
        if (obe > 0) {
            otherOoJson = mid(otherCleanHtml, obs, obe - obs + 1);
        }
    }
}
|
||||||
|
// Undo the HTML escaping present in pages saved from a "View Source" tab.
// "&amp;" is decoded LAST on purpose: text that was already an entity in the
// original JSON (e.g. "&quot;") is saved as "&amp;quot;". Decoding "&amp;" first
// would turn it into "&quot;" and then into a raw double quote, corrupting the
// JSON string it sits in. Decoding it last leaves the original "&quot;" intact.
if (len(otherOoJson)) {
    otherOoJson = replace(otherOoJson, "&lt;", "<", "all");
    otherOoJson = replace(otherOoJson, "&gt;", ">", "all");
    otherOoJson = replace(otherOoJson, "&quot;", '"', "all");
    otherOoJson = replace(otherOoJson, "&amp;", "&", "all");
}
|
||||||
|
</cfscript>
|
||||||
|
<cfif len(otherOoJson)>
|
||||||
<cftry>
|
<cftry>
|
||||||
<cfset otherOoState = deserializeJSON(otherOoJson)>
|
<cfset otherOoState = deserializeJSON(otherOoJson)>
|
||||||
<cfloop collection="#otherOoState#" item="otherKey">
|
<cfloop collection="#otherOoState#" item="otherKey">
|
||||||
|
|
@ -955,10 +1003,45 @@
|
||||||
<cfif findNoCase('<span id="line', pageHtml)>
|
<cfif findNoCase('<span id="line', pageHtml)>
|
||||||
<cfset ooExtractHtml = reReplace(pageHtml, "<[^>]+>", "", "all")>
|
<cfset ooExtractHtml = reReplace(pageHtml, "<[^>]+>", "", "all")>
|
||||||
</cfif>
|
</cfif>
|
||||||
<cfset ooStateMatch = reMatchNoCase("window\.__OO_STATE__\s*=\s*(\{.*?\});\s*window\.", ooExtractHtml)>
|
<!--- Extract JSON using brace-counting (regex .*? fails on 500K+ JSON due to backtracking) --->
|
||||||
<cfif arrayLen(ooStateMatch)>
|
<cfscript>
|
||||||
<cfset ooStateJson = reReplaceNoCase(ooStateMatch[1], "window\.__OO_STATE__\s*=\s*", "")>
|
ooStateJson = "";
|
||||||
<cfset ooStateJson = reReplace(ooStateJson, ";\s*window\.$", "")>
|
ooStartPos = findNoCase("window.__OO_STATE__", ooExtractHtml);
|
||||||
|
// Starting at the window.__OO_STATE__ marker in ooExtractHtml, find the first "{"
// and scan forward to its matching "}". Braces inside double-quoted strings are not
// counted, and a backslash inside a string causes the following character to be
// skipped. On success ooStateJson receives the text between the two braces, inclusive.
if (ooStartPos > 0) {
    braceStart = find("{", ooExtractHtml, ooStartPos);
    if (braceStart > 0) {
        var depth = 0;      // current brace nesting depth
        var inStr = false;  // true while the scan is inside a double-quoted string
        var esc = false;    // true when the previous character was a backslash in a string
        var braceEnd = 0;   // position of the matching closing brace; 0 means not found
        var totalLen = len(ooExtractHtml);
        for (var ci = braceStart; ci <= totalLen; ci++) {
            var ch = mid(ooExtractHtml, ci, 1);
            if (esc) {
                // This character was escaped by the preceding backslash: ignore it.
                esc = false;
            } else if (inStr && ch == chr(92)) {
                esc = true;
            } else if (ch == '"') {
                inStr = !inStr;
            } else if (!inStr) {
                if (ch == "{") {
                    depth++;
                } else if (ch == "}") {
                    depth--;
                    if (depth == 0) {
                        // Depth returned to zero: this brace closes the object.
                        braceEnd = ci;
                        break;
                    }
                }
            }
        }
        if (braceEnd > 0) {
            ooStateJson = mid(ooExtractHtml, braceStart, braceEnd - braceStart + 1);
        }
    }
}
|
||||||
|
// Decode HTML entities from View Source (Chrome encodes & as &amp; etc.).
// "&amp;" is decoded LAST on purpose: text that was already an entity in the
// original JSON (e.g. "&quot;") is saved as "&amp;quot;". Decoding "&amp;" first
// would turn it into "&quot;" and then into a raw double quote, corrupting the
// JSON string it sits in. Decoding it last leaves the original "&quot;" intact.
if (len(ooStateJson)) {
    ooStateJson = replace(ooStateJson, "&lt;", "<", "all");
    ooStateJson = replace(ooStateJson, "&gt;", ">", "all");
    ooStateJson = replace(ooStateJson, "&quot;", '"', "all");
    ooStateJson = replace(ooStateJson, "&amp;", "&", "all");
}
|
||||||
|
</cfscript>
|
||||||
|
<cfif len(ooStateJson)>
|
||||||
<cfset ooState = deserializeJSON(ooStateJson)>
|
<cfset ooState = deserializeJSON(ooStateJson)>
|
||||||
|
|
||||||
<cfset toastBusiness = structNew()>
|
<cfset toastBusiness = structNew()>
|
||||||
|
|
|
||||||
Reference in a new issue