Scan all HTML files in ZIP for business info
- Extract directory and scan all .htm/.html files recursively
- Look for business name in title tags (skip generic titles)
- Extract street addresses with regex patterns
- Extract phone numbers
- Check __OO_STATE__ in other pages for Restaurant data
- Merge found info into toastBusiness (first found wins)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
90ed78fa96
commit
cf34636879
1 changed files with 118 additions and 0 deletions
|
|
@ -386,6 +386,124 @@
|
||||||
<cfset arrayAppend(response.steps, "Added default 'Menu' category for " & arrayLen(toastItems) & " items")>
|
<cfset arrayAppend(response.steps, "Added default 'Menu' category for " & arrayLen(toastItems) & " items")>
|
||||||
</cfif>
|
</cfif>
|
||||||
|
|
||||||
|
<!--- Scan ALL HTML files in the ZIP for business info --->
<!--- Start from the directory that contains the main file we were given. --->
<cfset extractDir = getDirectoryFromPath(localFilePath)>
<!---
    If the last path segment does not contain "menu-import", step up one
    directory (e.g. when the main file sits in a subfolder such as Menu_files).
    The length guard keeps left() from being called with a count of 0 when
    extractDir is a single-character root path, which would raise an error
    here, outside any cftry.
--->
<cfif len(extractDir) GT 1 AND NOT findNoCase("menu-import", listLast(extractDir, "/\"))>
    <!--- Drop the trailing separator so getDirectoryFromPath returns the parent. --->
    <cfset extractDir = getDirectoryFromPath(left(extractDir, len(extractDir) - 1))>
</cfif>
|
||||||
|
<!---
    Best-effort scan of every *.htm* file under extractDir (recursively) for
    business details, merged into toastBusiness. A field is only written when
    it is missing or empty, so the first value found wins.

    Reads:  extractDir, localFilePath, toastBusiness
    Writes: toastBusiness (name, addressLine1, city, state, zip, phone,
            brandColor) and response.steps (progress / diagnostic messages)

    Nothing here is allowed to abort the import: per-file and per-JSON
    failures are recorded in response.steps and the scan continues.
--->
<cftry>
    <cfdirectory action="list" directory="#extractDir#" name="allHtmlFiles" filter="*.htm*" recurse="true" type="file">
    <cfset arrayAppend(response.steps, "Found " & allHtmlFiles.recordCount & " HTML files in ZIP")>

    <!---
        Normalise the main file path once (forward slashes, no doubled
        separators) so the "already processed" comparison below also works
        when the directory listing uses backslashes or a trailing separator.
    --->
    <cfset mainFilePathNorm = reReplace(replace(localFilePath, "\", "/", "all"), "/+", "/", "all")>

    <cfloop query="allHtmlFiles">
        <!--- Skip the main menu file we already processed --->
        <cfset otherFilePath = "#allHtmlFiles.directory#/#allHtmlFiles.name#">
        <cfset otherFilePathNorm = reReplace(replace(otherFilePath, "\", "/", "all"), "/+", "/", "all")>
        <cfif otherFilePathNorm EQ mainFilePathNorm>
            <cfcontinue>
        </cfif>

        <cftry>
            <cfset otherHtml = fileRead(otherFilePath, "utf-8")>
            <cfset arrayAppend(response.steps, "Scanning " & allHtmlFiles.name & " for business info...")>

            <!--- Extract business name from title tag --->
            <cfif NOT structKeyExists(toastBusiness, "name") OR NOT len(toastBusiness.name)>
                <cfset otherTitleMatch = reMatchNoCase('<title[^>]*>([^<]+)</title>', otherHtml)>
                <cfif arrayLen(otherTitleMatch)>
                    <!--- reMatch returns the whole tag; pull out just the text between the tags. --->
                    <cfset otherTitle = trim(reReplaceNoCase(otherTitleMatch[1], '.*<title[^>]*>([^<]+)</title>.*', '\1'))>
                    <!--- Skip generic titles --->
                    <cfif len(otherTitle) AND NOT reFindNoCase("^(Menu|Home|About|Contact|Order|Online)$", otherTitle)>
                        <!--- Keep only the part before a "|" separator. --->
                        <cfif findNoCase("|", otherTitle)>
                            <cfset otherTitle = trim(listFirst(otherTitle, "|"))>
                        </cfif>
                        <!--- Strip a trailing " - Menu..." / " - Order..." / " - Online..." suffix. --->
                        <cfset otherTitle = reReplaceNoCase(otherTitle, "\s*-\s*(Menu|Order|Online).*$", "")>
                        <!---
                            Re-check for a generic title after the cleanup: "Menu | Foo"
                            passes the first check but collapses to "Menu" here.
                        --->
                        <cfif len(otherTitle) AND len(otherTitle) LT 100
                              AND NOT reFindNoCase("^(Menu|Home|About|Contact|Order|Online)$", otherTitle)>
                            <cfset toastBusiness["name"] = otherTitle>
                            <cfset arrayAppend(response.steps, "Found business name in " & allHtmlFiles.name & ": " & otherTitle)>
                        </cfif>
                    </cfif>
                </cfif>
            </cfif>

            <!--- Extract address - look for common patterns --->
            <cfif NOT structKeyExists(toastBusiness, "addressLine1") OR NOT len(toastBusiness.addressLine1)>
                <!---
                    Street address pattern: number + street name + street-type word,
                    optionally followed by a unit designator. The \b anchors require
                    the street type to be a whole word so that e.g. "2024 Toast" is
                    not accepted because it happens to end in "st".
                --->
                <cfset addrMatch = reMatchNoCase('(\d+\s+[A-Za-z0-9\s]+\b(?:St(?:reet)?|Ave(?:nue)?|Rd|Road|Blvd|Boulevard|Dr(?:ive)?|Ln|Lane|Way|Ct|Court|Pl(?:ace)?|Pkwy|Parkway)\b[.,]?\s*(?:Suite|Ste|##|Unit|Apt)?\s*[A-Za-z0-9\-]*)', otherHtml)>
                <!--- Take the first candidate of plausible length rather than giving up if the first hit is bad. --->
                <cfloop array="#addrMatch#" index="addrCandidate">
                    <cfset addrText = trim(addrCandidate)>
                    <cfif len(addrText) GT 5 AND len(addrText) LT 100>
                        <cfset toastBusiness["addressLine1"] = addrText>
                        <cfset arrayAppend(response.steps, "Found address in " & allHtmlFiles.name & ": " & addrText)>
                        <cfbreak>
                    </cfif>
                </cfloop>
            </cfif>

            <!--- Extract phone number --->
            <cfif NOT structKeyExists(toastBusiness, "phone") OR NOT len(toastBusiness.phone)>
                <cfset phoneMatch = reMatchNoCase('\(?(\d{3})\)?[-.\s]?(\d{3})[-.\s]?(\d{4})', otherHtml)>
                <cfif arrayLen(phoneMatch)>
                    <!--- Re-format the first match as NNN-NNN-NNNN. --->
                    <cfset phoneText = reReplaceNoCase(phoneMatch[1], '.*\(?(\d{3})\)?[-.\s]?(\d{3})[-.\s]?(\d{4}).*', '\1-\2-\3')>
                    <cfif len(phoneText) GTE 10>
                        <cfset toastBusiness["phone"] = phoneText>
                        <cfset arrayAppend(response.steps, "Found phone in " & allHtmlFiles.name & ": " & phoneText)>
                    </cfif>
                </cfif>
            </cfif>

            <!--- Check for __OO_STATE__ in other files too (might have Restaurant info) --->
            <cfif findNoCase("window.__OO_STATE__", otherHtml)>
                <cfset otherOoMatch = reMatchNoCase("window\.__OO_STATE__\s*=\s*(\{.*?\});\s*window\.", otherHtml)>
                <cfif arrayLen(otherOoMatch)>
                    <!--- Trim the assignment prefix and the trailing ";window." so only the JSON object remains. --->
                    <cfset otherOoJson = reReplaceNoCase(otherOoMatch[1], "window\.__OO_STATE__\s*=\s*", "")>
                    <cfset otherOoJson = reReplace(otherOoJson, ";\s*window\.$", "")>
                    <cftry>
                        <cfset otherOoState = deserializeJSON(otherOoJson)>
                        <!--- Only a JSON object can be iterated by key; anything else carries no Restaurant entries. --->
                        <cfif isStruct(otherOoState)>
                            <cfloop collection="#otherOoState#" item="otherKey">
                                <!--- The isStruct guard keeps one malformed entry from aborting the whole merge. --->
                                <cfif left(otherKey, 11) EQ "Restaurant:" AND structKeyExists(otherOoState, otherKey) AND isStruct(otherOoState[otherKey])>
                                    <cfset otherRest = otherOoState[otherKey]>

                                    <cfif structKeyExists(otherRest, "name") AND isSimpleValue(otherRest.name)
                                          AND (NOT structKeyExists(toastBusiness, "name") OR NOT len(toastBusiness.name))>
                                        <cfset toastBusiness["name"] = otherRest.name>
                                        <cfset arrayAppend(response.steps, "Found business name in " & allHtmlFiles.name & " __OO_STATE__: " & otherRest.name)>
                                    </cfif>

                                    <cfif structKeyExists(otherRest, "location") AND isStruct(otherRest.location)>
                                        <cfset otherLoc = otherRest.location>
                                        <!--- Each pair is sourceKey:targetKey (location field : toastBusiness field). --->
                                        <cfloop list="address1:addressLine1,city:city,state:state,zipCode:zip,phone:phone" index="otherFieldPair">
                                            <cfset otherSrcKey = listFirst(otherFieldPair, ":")>
                                            <cfset otherDestKey = listLast(otherFieldPair, ":")>
                                            <cfif structKeyExists(otherLoc, otherSrcKey) AND isSimpleValue(otherLoc[otherSrcKey])
                                                  AND (NOT structKeyExists(toastBusiness, otherDestKey) OR NOT len(toastBusiness[otherDestKey]))>
                                                <cfset toastBusiness[otherDestKey] = otherLoc[otherSrcKey]>
                                            </cfif>
                                        </cfloop>
                                    </cfif>

                                    <cfif structKeyExists(otherRest, "brandColor") AND isSimpleValue(otherRest.brandColor)
                                          AND (NOT structKeyExists(toastBusiness, "brandColor") OR NOT len(toastBusiness.brandColor))>
                                        <!--- Store the colour without its first "#" character. --->
                                        <cfset toastBusiness["brandColor"] = replace(otherRest.brandColor, "##", "")>
                                    </cfif>
                                </cfif>
                            </cfloop>
                        </cfif>
                        <cfcatch>
                            <!--- Unparseable or unexpected JSON: record it and carry on with the next file. --->
                            <cfset arrayAppend(response.steps, "Could not read __OO_STATE__ in " & allHtmlFiles.name & ": " & cfcatch.message)>
                        </cfcatch>
                    </cftry>
                </cfif>
            </cfif>

            <cfcatch>
                <!--- Skip files that can't be read, but leave a trace of why. --->
                <cfset arrayAppend(response.steps, "Skipped " & allHtmlFiles.name & ": " & cfcatch.message)>
            </cfcatch>
        </cftry>
    </cfloop>

    <cfcatch>
        <cfset arrayAppend(response.steps, "Could not scan other HTML files: " & cfcatch.message)>
    </cfcatch>
</cftry>
|
||||||
|
|
||||||
<cfset arrayAppend(response.steps, "Extracted " & arrayLen(toastItems) & " unique items from " & arrayLen(toastCategories) & " categories")>
|
<cfset arrayAppend(response.steps, "Extracted " & arrayLen(toastItems) & " unique items from " & arrayLen(toastCategories) & " categories")>
|
||||||
<!--- Summary of business info found --->
|
<!--- Summary of business info found --->
|
||||||
<cfset bizKeys = structKeyList(toastBusiness)>
|
<cfset bizKeys = structKeyList(toastBusiness)>
|
||||||
|
|
|
||||||
Reference in a new issue