// Previously only lived on servers at /opt/playwright/. Now tracked in repo.
// Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
// (source listing: 497 lines, 21 KiB, JavaScript)
const { chromium } = require("playwright");

/**
 * WooCommerce product + modifier scraper.
 *
 * Usage: node <this-script> <shop-url>
 *
 * Opens the given URL in headless Chromium, collects products (inline product
 * cards first, individual product links as a fallback), then visits each
 * product page to read its option groups ("modifiers").
 *
 * Output contract:
 *   - stdout: exactly one JSON document,
 *       { business, items, modifiers, itemModifierMap, stats } on success,
 *       { error } on failure (exit status 1).
 *   - stderr: progress lines prefixed with "[woo-mod] ".
 */

const USER_AGENT =
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";

/** Writes a progress line to stderr so stdout stays reserved for the JSON result. */
const log = (msg) => process.stderr.write("[woo-mod] " + msg + "\n");

(async () => {
  const url = process.argv[2];
  if (!url) {
    console.log(JSON.stringify({ error: "URL required" }));
    process.exitCode = 1;
    return;
  }

  let browser;
  try {
    // Base used when a product permalink has to be guessed. Query string and
    // hash are dropped so "/?p=<id>" and "/product/<slug>/" append cleanly.
    const parsedUrl = new URL(url);
    const siteBase = parsedUrl.origin + parsedUrl.pathname.replace(/\/$/, "");

    browser = await chromium.launch({ headless: true });
    const context = await browser.newContext({ userAgent: USER_AGENT });
    const page = await context.newPage();

    log("Navigating to " + url);
    await page.goto(url, { waitUntil: "networkidle", timeout: 60000 });
    await page.waitForTimeout(3000);

    // Close any popups/modals (best effort: a failure here must not abort the scrape).
    try {
      const closeButtons = await page.$$('.close, .modal .close, [aria-label="Close"]');
      for (const btn of closeButtons) {
        if (await btn.isVisible()) await btn.click().catch(() => {});
      }
    } catch (e) {
      log("Popup dismissal skipped: " + e.message);
    }

    // Extract business info from the page.
    // NOTE: everything inside page.evaluate() runs in the browser and cannot
    // see Node-side variables.
    const businessInfo = await page.evaluate(() => {
      const info = { name: '', address: '', phone: '', hours: '' };

      // Strip taglines. Pipes and en/em dashes always separate; a plain hyphen
      // only when surrounded by whitespace, so "Bar-B-Que" stays intact.
      const stripTagline = (s) => s.replace(/\s*[–—|].*$|\s+-\s.*$/, '').trim();

      // Selectors are tried one at a time, in priority order. A single
      // comma-separated querySelector() returns the first match in DOCUMENT
      // order, which would always be <title> (it lives in <head>).
      const nameSelectors = ['.site-title', '.logo-text', '.custom-logo-link img', 'title'];
      for (const sel of nameSelectors) {
        const el = document.querySelector(sel);
        if (!el) continue;
        const cleaned = stripTagline(el.alt || el.textContent || '');
        if (cleaned) {
          info.name = cleaned;
          break;
        }
      }
      // Page title as a last resort
      if (!info.name && document.title) {
        info.name = stripTagline(document.title);
      }

      // Look for address/phone anywhere in the visible text
      const bodyText = document.body ? document.body.innerText : '';

      // Phone: labelled number first, then any bare 3-3-4 digit pattern
      const phoneMatch =
        bodyText.match(/(?:Call|Phone|Tel)[:\s]*\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}/i) ||
        bodyText.match(/\(?\d{3}\)?[\s.-]\d{3}[\s.-]\d{4}/);
      if (phoneMatch) info.phone = phoneMatch[0].replace(/^(?:Call|Phone|Tel)[:\s]*/i, '').trim();

      // Address - look for street patterns
      const addrMatch = bodyText.match(/\d{1,5}\s+[A-Z][a-zA-Z\s]+(?:St|Ave|Blvd|Dr|Rd|Ln|Way|Ct|Pl|Cir)[.,]?\s*(?:[A-Z][a-zA-Z\s]+,?\s*[A-Z]{2}\s*\d{5})?/);
      if (addrMatch) info.address = addrMatch[0].trim();

      return info;
    });
    log("Business: " + businessInfo.name + " | " + businessInfo.address + " | " + businessInfo.phone);

    // Strategy 1: Products displayed inline on the page (custom WooCommerce themes).
    // Products are de-duplicated by name across all tabs/pages.
    const allProducts = [];
    const seenProductNames = new Set();
    const addUnique = (products) => {
      for (const p of products) {
        if (seenProductNames.has(p.name)) continue;
        seenProductNames.add(p.name);
        allProducts.push(p);
      }
    };

    // Check for category tabs (clickable, same-page) vs category links (separate pages)
    const categoryTabs = await page.evaluate(() => {
      const tabs = document.querySelectorAll('li.tabs, .category-tab, [data-filter]');
      return [...tabs]
        .map((t, i) => ({ index: i, name: t.textContent.trim(), active: t.classList.contains('active') }))
        .filter((t) => t.name.length > 0 && t.name.length < 60);
    });

    // Links are only followed when no tabs were found — some themes use links that 404
    const categoryLinks = await page.evaluate(() => {
      const cats = document.querySelectorAll('.product-category a, .product_cat a');
      return [...cats]
        .map((a) => ({ href: a.href, name: a.textContent.trim() }))
        .filter((c) => c.name.length > 0 && c.name.length < 60);
    });

    const useTabs = categoryTabs.length > 0;
    log("Found " + categoryTabs.length + " category tabs, " + categoryLinks.length + " category links" + (useTabs ? " (using tabs)" : ""));

    const visitedUrls = new Set([page.url()]);

    /**
     * Reads every product card currently in the DOM.
     *
     * @param {string|null} catOverride Category to assign to every product; when
     *   falsy the category is derived from a `product_cat-<slug>` class if present.
     * @returns {Promise<Array<{name: string, price: number, description: string,
     *   imageUrl: string, category: string, postId: string, productUrl: string}>>}
     */
    const scrapeInlineProducts = async (catOverride) => {
      return page.evaluate((catName) => {
        // First number in the text; "1,299.00" style thousands separators are honoured.
        const parsePrice = (text) => {
          const m = (text || '').match(/\d{1,3}(?:,\d{3})+(?:\.\d+)?|[\d.]+/);
          return m ? parseFloat(m[0].replace(/,/g, '')) || 0 : 0;
        };

        const products = [];
        const productEls = document.querySelectorAll('.product-con-box, li.product, .type-product, .product-item');
        productEls.forEach((el) => {
          const nameEl = el.querySelector('.woocommerce-loop-product__title, h2, h3, .product-title');
          if (!nameEl) return;
          const name = nameEl.textContent.trim();
          if (!name) return;

          const descEl = el.querySelector('.woocommerce-product-details__short-description, .description, .short-description');
          const description = descEl ? descEl.textContent.trim().substring(0, 200) : '';

          // Tried in priority order so a sale price (<ins>) beats the
          // struck-out one; a combined selector would resolve in document
          // order and always return the outer `.price` container.
          const priceEl = ['.price ins .amount', '.price .woocommerce-Price-amount', '.price']
            .map((sel) => el.querySelector(sel))
            .find(Boolean);
          const price = priceEl ? parsePrice(priceEl.textContent) : 0;

          const imgEl = el.querySelector('img');
          const imageUrl = imgEl ? (imgEl.src || imgEl.dataset.src || '') : '';

          const classes = el.getAttribute('class') || '';

          // Try to get category from element classes
          let category = catName || '';
          if (!category) {
            const catMatch = classes.match(/product_cat-([a-z0-9-]+)/);
            if (catMatch) {
              category = catMatch[1].replace(/-/g, ' ').replace(/\b\w/g, (c) => c.toUpperCase());
            }
          }

          // Post ID and permalink are used later to open the product page
          const idMatch = classes.match(/post-(\d+)/);
          const postId = idMatch ? idMatch[1] : '';
          const linkEl = el.querySelector('a.woocommerce-LoopProduct-link, a[href*="/product/"]');
          const productUrl = linkEl ? linkEl.href : '';

          products.push({ name, price, description, imageUrl, category, postId, productUrl });
        });
        return products;
      }, catOverride);
    };

    // Scrape products — either by clicking tabs or visiting category pages
    if (useTabs) {
      // Click each tab (prevent navigation) and scrape products that appear
      for (const tab of categoryTabs) {
        try {
          log("Clicking tab: " + tab.name);
          const clicked = await page.evaluate((name) => {
            // Anchors inside tabs first, then the tab elements themselves
            // (covers tabs found only through [data-filter]).
            const candidates = [
              ...document.querySelectorAll('li.tabs a, a.catabtn, .category-tab a'),
              ...document.querySelectorAll('li.tabs, .category-tab, [data-filter]'),
            ];
            const target = candidates.find((el) => el.textContent.trim() === name);
            if (!target) return false;
            target.addEventListener('click', (e) => e.preventDefault(), { once: true });
            target.click();
            return true;
          }, tab.name);

          if (!clicked) {
            // Without a click the visible products belong to some other tab;
            // scraping now would label them with the wrong category.
            log(" -> tab not clickable, skipped");
            continue;
          }
          await page.waitForTimeout(2500);

          const catProducts = await scrapeInlineProducts(tab.name);
          addUnique(catProducts);
          log(" -> " + catProducts.length + " products");
        } catch (e) {
          log("Error on tab " + tab.name + ": " + e.message);
        }
      }

      if (allProducts.length === 0) {
        // No tab yielded anything: take whatever is on the page as-is.
        addUnique(await scrapeInlineProducts(null));
      }
    } else {
      // Scrape homepage products first
      const homeProducts = await scrapeInlineProducts(null);
      log("Found " + homeProducts.length + " products on homepage");
      addUnique(homeProducts);

      // Visit each category page, following pagination for at most 10 pages
      for (const cat of categoryLinks) {
        if (visitedUrls.has(cat.href)) continue;
        visitedUrls.add(cat.href);
        try {
          log("Visiting category: " + cat.name);
          await page.goto(cat.href, { waitUntil: "networkidle", timeout: 30000 });
          await page.waitForTimeout(2000);

          let pageNum = 1;
          while (pageNum <= 10) {
            addUnique(await scrapeInlineProducts(cat.name));

            const nextUrl = await page.evaluate(() => {
              const next = document.querySelector('.woocommerce-pagination .next, a.next.page-numbers');
              return next ? next.href : null;
            });
            if (!nextUrl) break;
            pageNum++;
            await page.goto(nextUrl, { waitUntil: "networkidle", timeout: 30000 });
            await page.waitForTimeout(1500);
          }
        } catch (e) {
          log("Error on category " + cat.name + ": " + e.message);
        }
      }
    }

    log("Total unique products: " + allProducts.length);

    // Strategy 2: If no inline products found, try standard product links
    if (allProducts.length === 0) {
      log("No inline products - trying product link approach");
      await page.goto(url, { waitUntil: "networkidle", timeout: 30000 });
      await page.waitForTimeout(3000);

      const productLinks = await page.evaluate(() => {
        const anchors = document.querySelectorAll('a[href*="/product/"], a.woocommerce-LoopProduct-link');
        // Drop fragments so "…/pizza/" and "…/pizza/#reviews" count once
        return [...new Set([...anchors].map((a) => a.href.split('#')[0]))];
      });

      log("Found " + productLinks.length + " product links");

      for (const link of productLinks) {
        try {
          await page.goto(link, { waitUntil: "domcontentloaded", timeout: 30000 });
          await page.waitForTimeout(1000);

          const pd = await page.evaluate(() => {
            const parsePrice = (text) => {
              const m = (text || '').match(/\d{1,3}(?:,\d{3})+(?:\.\d+)?|[\d.]+/);
              return m ? parseFloat(m[0].replace(/,/g, '')) || 0 : 0;
            };
            const nameEl = document.querySelector('.product_title, h1.entry-title');
            const name = nameEl ? nameEl.textContent.trim() : '';
            // Sale price (<ins>) first, then the regular amount
            const priceEl =
              document.querySelector('.summary .price ins .woocommerce-Price-amount') ||
              document.querySelector('.summary .price .woocommerce-Price-amount');
            const price = priceEl ? parsePrice(priceEl.textContent) : 0;
            const descEl = document.querySelector('.woocommerce-product-details__short-description');
            const desc = descEl ? descEl.textContent.trim().substring(0, 200) : '';
            const imgEl = document.querySelector('.woocommerce-product-gallery__image img');
            const img = imgEl ? (imgEl.src || '') : '';
            const catEl = document.querySelector('.posted_in a');
            const cat = catEl ? catEl.textContent.trim() : '';
            return { name, price, description: desc, imageUrl: img, category: cat, postId: '' };
          });

          if (pd.name) {
            // Keep the real permalink so the modifier pass does not have to guess it
            pd.productUrl = link;
            addUnique([pd]);
          }
        } catch (e) {
          log("Error on product link: " + e.message);
        }
      }
    }

    /**
     * Browser-side: reads option groups from the current product page.
     * Passed to page.evaluate(), so it must be self-contained.
     *
     * @returns {Array<{name: string, type: string, options: Array<{name: string,
     *   price: number, selected: boolean}>, required: boolean}>}
     */
    const extractModifierGroups = () => {
      const results = [];
      const seen = new Set();
      const LABEL_SELECTOR = '.tm-epo-field-label label, .tm-epo-element-label, h3.tm-epo-field-label, label:first-of-type';

      const parsePrice = (text) => {
        const m = (text || '').match(/\d{1,3}(?:,\d{3})+(?:\.\d+)?|[\d.]+/);
        return m ? parseFloat(m[0].replace(/,/g, '')) || 0 : 0;
      };

      // Helper: clean TMEPO value strings like "Small_0" -> "Small"
      const cleanVal = (v) => (v ? v.replace(/_\d+$/, '').trim() : '');

      // Helper: price from the first entry of the data-rules JSON array, else data-price
      const parseRulesPrice = (input) => {
        if (!input) return 0;
        const rules = input.getAttribute('data-rules');
        if (rules) {
          try {
            const arr = JSON.parse(rules);
            if (Array.isArray(arr) && arr.length > 0) {
              const v = parseFloat(arr[0]);
              if (!isNaN(v) && v > 0) return v;
            }
          } catch (e) {
            // Malformed data-rules: fall through to data-price
          }
        }
        const dp = input.getAttribute('data-price');
        if (dp) {
          const v = parseFloat(dp);
          if (!isNaN(v) && v > 0) return v;
        }
        return 0;
      };

      const pushUnique = (list, entry) => {
        if (!list.some((o) => o.name === entry.name)) list.push(entry);
      };

      // Prefer granular containers (individual field cells) over broad row containers
      const granular = document.querySelectorAll('.tm-cell[class*="cpf-type-"], .tc-cell.tcell');
      const broad = document.querySelectorAll('.tc-row, .tm-row, .cpf-section');
      // Use granular if they have labels, otherwise fall back to broad
      const granularWithLabels = [...granular].filter((el) => el.querySelector(LABEL_SELECTOR));
      const elements = granularWithLabels.length > 0 ? granularWithLabels : (broad.length > 0 ? broad : granular);

      elements.forEach((section) => {
        const labelEl = section.querySelector(LABEL_SELECTOR);
        if (!labelEl) return;

        const groupName = labelEl.textContent.trim();
        if (!groupName || groupName.length > 80 || seen.has(groupName)) return;
        // Free-text / quantity fields are not modifier groups
        if (/special request|sandwich name|your name|instructions|quantity/i.test(groupName)) return;
        seen.add(groupName);

        const options = [];
        let groupType = 'select'; // default

        const radios = section.querySelectorAll('input[type="radio"]');
        const checkboxes = section.querySelectorAll('input[type="checkbox"]');
        if (radios.length > 0) groupType = 'radio';
        else if (checkboxes.length > 0) groupType = 'checkbox';

        // A checkbox group holding both preselected and non-preselected boxes is
        // split in two: preselected ones stay in `options`, the rest go to
        // `additionOptions`.
        const hasPreselected = checkboxes.length > 0 && [...checkboxes].some((c) => c.className.includes('custom-preselected'));
        const hasAdditions = checkboxes.length > 0 && [...checkboxes].some((c) => !c.className.includes('custom-preselected'));
        const shouldSplit = hasPreselected && hasAdditions;
        const additionOptions = []; // only filled when splitting

        section.querySelectorAll('.tmcp-field-wrap, .tm-field-wrap, label.tm-epo-field-label-wrap').forEach((wrap) => {
          const input = wrap.querySelector('input[type="radio"], input[type="checkbox"]');
          if (input) {
            // Radio/checkbox: name from input.value, price from data-rules/data-price
            const optName = cleanVal(input.value);
            if (!optName || optName.length > 80) return;
            // Skip disabled duplicates (size variants)
            if (input.disabled || input.className.includes('tcdisabled')) return;

            const entry = {
              name: optName,
              price: parseRulesPrice(input),
              selected: input.checked || wrap.classList.contains('tc-active'),
            };
            const isPreselected = input.className.includes('custom-preselected');
            pushUnique(shouldSplit && !isPreselected ? additionOptions : options, entry);
            return;
          }

          // Fallback: no input in this wrapper, use label text and a price span
          const lbl = wrap.querySelector('.tm-label, .tm-value, label span:not(.tm-price)');
          if (lbl) {
            const optName = lbl.textContent.replace(/[\n\r\t]+/g, ' ').trim();
            if (!optName || optName.length > 80) return;
            const priceSpan = wrap.querySelector('.tm-price, .price .amount, [class*="price"]');
            const optPrice = priceSpan ? parsePrice(priceSpan.textContent) : 0;
            pushUnique(options, { name: optName, price: optPrice, selected: false });
          }
        });

        // Select dropdowns — name from option.value/text, price from data-price or text
        if (options.length === 0) {
          section.querySelectorAll('select option').forEach((opt) => {
            if (!opt.value) return;
            let optName = cleanVal(opt.value);
            if (!optName || optName.length > 80) return;
            let optPrice = 0;
            const dp = opt.getAttribute('data-price');
            if (dp) {
              const v = parseFloat(dp);
              if (!isNaN(v) && v > 0) optPrice = v;
            }
            const text = opt.textContent.trim();
            if (text && text.length < 80 && text !== optName) {
              // Prefer a trailing "(+$2.00)" so "Large 16oz (+$2.00)" is not read as 16
              const trailing = text.match(/\(\+?\$?([\d.,]+)\)\s*$/);
              optPrice = optPrice || parsePrice(trailing ? trailing[1] : text);
              optName = text.replace(/\s*\(\+?\$?[\d.]+\)\s*$/, '').trim() || optName;
            }
            options.push({ name: optName, price: optPrice, selected: opt.selected });
          });
        }

        if (options.length > 0) {
          const required = section.querySelector('.required, [data-required="1"]') !== null;
          results.push({ name: groupName, type: groupType, options, required });
        }

        // Emit the non-preselected half of a split group. Previously these
        // options were collected and then dropped.
        if (additionOptions.length > 0) {
          const additionsName = groupName + ' (Additions)';
          if (!seen.has(additionsName)) {
            seen.add(additionsName);
            results.push({ name: additionsName, type: 'checkbox', options: additionOptions, required: false });
          }
        }
      });

      // Standard WooCommerce variations fallback
      if (results.length === 0) {
        const vForm = document.querySelector('.variations_form');
        if (vForm) {
          vForm.querySelectorAll('.variations tr').forEach((row) => {
            const lbl = row.querySelector('th label, .label label');
            const sel = row.querySelector('select');
            if (!lbl || !sel) return;
            const opts = [...sel.querySelectorAll('option')]
              .filter((o) => o.value)
              .map((o) => ({ name: o.textContent.trim(), price: 0, selected: o.selected }));
            // type is set explicitly so the group is treated as single-choice downstream
            if (opts.length > 0) results.push({ name: lbl.textContent.trim(), type: 'select', options: opts, required: true });
          });
        }
      }

      return results;
    };

    /**
     * Browser-side: base price shown on the current product page, or 0.
     * Passed to page.evaluate(), so it must be self-contained.
     */
    const extractPagePrice = () => {
      const parsePrice = (text) => {
        const m = (text || '').match(/\d{1,3}(?:,\d{3})+(?:\.\d+)?|[\d.]+/);
        return m ? parseFloat(m[0].replace(/,/g, '')) || 0 : 0;
      };
      const selectors = [
        '.summary .price ins .amount',
        '.summary .price .woocommerce-Price-amount',
        '.summary .price .amount',
        '.product .price .amount',
      ];
      for (const sel of selectors) {
        const el = document.querySelector(sel);
        if (!el) continue;
        const v = parsePrice(el.textContent);
        if (v > 0) return v;
      }
      const hid = document.querySelector('input.cpf-product-price');
      if (hid && hid.value) {
        const v = parseFloat(hid.value);
        if (v > 0) return v;
      }
      return 0;
    };

    /** Known permalink if available, else a guess from post ID, else from a name slug. */
    const resolveProductUrl = (prod) => {
      if (prod.productUrl) return prod.productUrl;
      if (prod.postId) return siteBase + "/?p=" + prod.postId;
      const slug = prod.name.toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-|-$/g, "");
      return siteBase + "/product/" + slug + "/";
    };

    // Now extract modifiers by visiting individual product pages.
    // Null-prototype objects: keys are scraped text, so names such as
    // "constructor" or "__proto__" must behave like ordinary keys.
    const modifierGroupsMap = Object.create(null);
    const itemModifierMap = Object.create(null);

    for (const [i, prod] of allProducts.entries()) {
      log(`[${i + 1}/${allProducts.length}] Extracting modifiers for: ${prod.name}`);
      const productUrl = resolveProductUrl(prod);

      try {
        const response = await page.goto(productUrl, { waitUntil: "domcontentloaded", timeout: 20000 });
        if (response && response.status() >= 400) {
          // Guessed URLs often miss; do not read prices off an error page.
          log(" -> Skipped: HTTP " + response.status());
          continue;
        }
        await page.waitForTimeout(1500);

        const groups = await page.evaluate(extractModifierGroups);

        // If item has no price, try to get it from the product page
        if (prod.price === 0) {
          const pagePrice = await page.evaluate(extractPagePrice);
          if (pagePrice > 0) {
            prod.price = pagePrice;
          } else {
            // Use lowest price from first modifier group that has prices
            for (const g of groups) {
              const prices = g.options.map((o) => o.price).filter((p) => p > 0);
              if (prices.length > 0) {
                prod.price = Math.min(...prices);
                break;
              }
            }
          }
        }

        // Convert any modifier group with absolute prices to relative (subtract base price)
        if (prod.price > 0) {
          for (const g of groups) {
            const prices = g.options.map((o) => o.price).filter((p) => p > 0);
            if (prices.length === 0) continue;
            // Only convert if options are priced near or above the item base price (absolute pricing)
            if (Math.min(...prices) < prod.price * 0.8) continue;
            for (const opt of g.options) {
              if (opt.price > 0) {
                opt.price = Math.max(0, Math.round((opt.price - prod.price) * 100) / 100);
              }
            }
          }
        }

        if (groups.length > 0) {
          const itemModGroups = [];
          for (const g of groups) {
            // First product to define a group name wins; later products reuse it
            if (!modifierGroupsMap[g.name]) {
              const type = g.type || 'select';
              modifierGroupsMap[g.name] = {
                name: g.name,
                type,
                options: g.options,
                required: g.required,
                minSelections: g.required ? 1 : 0,
                maxSelections: type === 'radio' || type === 'select' ? 1 : 0,
              };
            }
            itemModGroups.push(g.name);
          }
          itemModifierMap[prod.name] = itemModGroups;
          log(" -> " + groups.length + " modifier groups" + (prod.price > 0 ? " ($" + prod.price + ")" : ""));
        }
      } catch (e) {
        log(" -> Error: " + e.message);
      }
    }

    const modifiers = Object.values(modifierGroupsMap);
    const stats = {
      totalProducts: allProducts.length,
      itemsExtracted: allProducts.length,
      modifierGroups: modifiers.length,
      itemsWithModifiers: Object.keys(itemModifierMap).length,
    };

    log("Done: " + stats.itemsExtracted + " items, " + stats.modifierGroups + " modifier groups, " + stats.itemsWithModifiers + " items with modifiers");

    console.log(JSON.stringify({
      business: businessInfo,
      items: allProducts,
      modifiers,
      itemModifierMap,
      stats,
    }));
  } catch (err) {
    log("Fatal: " + err.message);
    console.log(JSON.stringify({ error: err.message }));
    // exitCode rather than process.exit(): exit() would end the process
    // before the `finally` block gets to close the browser.
    process.exitCode = 1;
  } finally {
    if (browser) {
      await browser.close().catch((e) => log("Browser close failed: " + e.message));
    }
  }
})();