import puppeteer from "puppeteer-extra";
import StealthPlugin from "puppeteer-extra-plugin-stealth";
import { faker } from "@faker-js/faker";
import axios from "axios";
import { JSDOM } from "jsdom";

// Optional Tor routing, controlled by the USE_TOR / TOR_SOCKS_PROXY environment variables.
const useTor = process.env.USE_TOR === "true";
const torProxy = process.env.TOR_SOCKS_PROXY || "socks5://192.168.1.119:9050";

// Apply stealth plugin
puppeteer.use(StealthPlugin());

/**
 * One entry extracted from a `.issue` element of a pull-list page.
 * Every field is `null` when the corresponding element or attribute is missing.
 */
export interface WeeklyPullListIssue {
  /** Value of the `data-sorting` attribute on `div.title`. */
  issueName: string | null;
  /** Cover image URL with a `/medium-<digits>.jpg` segment rewritten to `/large-<digits>.jpg`. */
  coverImageUrl: string | null;
  /** Value of the `href` attribute on the `.cover a` link, as written in the markup. */
  issueUrl: string | null;
  /** Trimmed text content of `div.publisher`. */
  publisher: string | null;
  /** UTC calendar date (`YYYY-MM-DD`) derived from the `data-date` attribute of `.date`. */
  publicationDate: string | null;
}

/**
 * Loads `url` in a headless browser and extracts one record per `.issue` element.
 *
 * The browser is launched with a randomized user agent and viewport. When
 * `USE_TOR` is `"true"`, traffic is sent through the proxy in `TOR_SOCKS_PROXY`.
 *
 * @param url - Address of the page that lists the issues.
 * @returns The extracted issues, in document order.
 * @throws Rethrows any navigation, timeout, or evaluation error after logging it.
 *   The browser is closed in every case.
 */
export const getWeeklyPullList = async (
  url: string,
): Promise<WeeklyPullListIssue[]> => {
  const browser = await puppeteer.launch({
    headless: true,
    slowMo: 50,
    // NOTE(review): the sandbox flags are only passed when Tor is disabled —
    // confirm this is intentional before running the Tor path in a container.
    args: useTor
      ? [`--proxy-server=${torProxy}`]
      : ["--no-sandbox", "--disable-setuid-sandbox"],
  });

  try {
    // Page setup lives inside the try block so that a failure here still
    // reaches the finally clause and closes the browser.
    const page = await browser.newPage();

    await page.setExtraHTTPHeaders({
      "Accept-Language": "en-US,en;q=0.9",
      "Referer": "https://leagueofcomicgeeks.com/",
    });
    await page.setUserAgent(faker.internet.userAgent());
    await page.setViewport({
      width: faker.number.int({ min: 1024, max: 1920 }),
      height: faker.number.int({ min: 768, max: 1080 }),
    });

    await page.goto(url, {
      // Return as soon as the DOM is parsed; the selector wait below covers
      // content that is rendered afterwards.
      waitUntil: "domcontentloaded",
      timeout: 30000, // Give it time on Tor or slow networks
    });

    await page.waitForSelector(".issue", { timeout: 30000 });
    console.log("✅ Found .issue blocks");

    // The callback is serialized and executed inside the page, so it must be
    // self-contained: it cannot reference any binding from this module.
    return await page.evaluate(() => {
      const issues = Array.from(document.querySelectorAll(".issue"));

      return issues.map((issue) => {
        const issueUrlElement = issue.querySelector(".cover a");
        const coverImageElement = issue.querySelector(".cover img.lazy");

        const publisherText =
          issue.querySelector("div.publisher")?.textContent?.trim() || null;
        const issueName =
          issue.querySelector("div.title")?.getAttribute("data-sorting") ||
          null;

        // Convert Unix timestamp (in seconds) to YYYY-MM-DD. A non-numeric or
        // out-of-range value produces an invalid Date, on which toISOString()
        // would throw; such values are reported as null instead of aborting
        // the extraction of every issue on the page.
        const publicationDateRaw = issue
          .querySelector(".date")
          ?.getAttribute("data-date");
        let publicationDate: string | null = null;
        if (publicationDateRaw) {
          const parsedDate = new Date(parseInt(publicationDateRaw, 10) * 1000);
          if (!Number.isNaN(parsedDate.getTime())) {
            publicationDate = parsedDate.toISOString().split("T")[0] ?? null;
          }
        }

        // Lazy-loaded images keep the real address in data-src.
        const imageUrl =
          coverImageElement?.getAttribute("data-src") ||
          coverImageElement?.getAttribute("src") ||
          null;
        const coverImageUrl = imageUrl
          ? imageUrl.replace(/\/medium-(\d+\.jpg)/, "/large-$1")
          : null;

        const issueUrl = issueUrlElement?.getAttribute("href") || null;

        return {
          issueName,
          coverImageUrl,
          issueUrl,
          publisher: publisherText,
          publicationDate,
        };
      });
    });
  } catch (err) {
    console.error("❌ Scraper error:", err);
    throw err;
  } finally {
    await browser.close();
  }
};

/**
 * Fetches an issue page over HTTP and returns the link to its series.
 *
 * @param url - Address of the issue page.
 * @returns The `href` attribute of the `div.series-pagination > a.series`
 *   element, or `null` when the element or the attribute is absent.
 * @throws Propagates any error raised by the HTTP request.
 */
export const scrapeIssuePage = async (url: string): Promise<string | null> => {
  const response = await axios(url);
  const dom = new JSDOM(response.data, {
    url,
    referrer: url,
    contentType: "text/html",
    includeNodeLocations: true,
    storageQuota: 10000000,
  });

  // querySelector returns null when nothing matches; guard it so a page
  // without the series link yields null instead of a TypeError.
  const seriesLink = dom.window.document.querySelector(
    "div.series-pagination > a.series",
  );
  return seriesLink?.getAttribute("href") ?? null;
};