🔧 Fixed LoCG scraping
This commit is contained in:
@@ -1,64 +1,142 @@
|
||||
import jsdom from "jsdom";
|
||||
import puppeteer from "puppeteer-extra";
|
||||
import StealthPlugin from "puppeteer-extra-plugin-stealth";
|
||||
import { faker } from "@faker-js/faker";
|
||||
import axios from "axios";
|
||||
const { JSDOM } = jsdom;
|
||||
import { JSDOM } from "jsdom";
|
||||
|
||||
export const scrapeIssuesFromSeriesPage = async (url: string) => {
|
||||
const response = await axios(url);
|
||||
const dom = new JSDOM(response.data, {
|
||||
url,
|
||||
referrer: url,
|
||||
contentType: "text/html",
|
||||
includeNodeLocations: true,
|
||||
storageQuota: 10000000,
|
||||
// Optional Tor
|
||||
const useTor = process.env.USE_TOR === "true";
|
||||
const torProxy = process.env.TOR_SOCKS_PROXY || "socks5://192.168.1.119:9050";
|
||||
|
||||
// Apply stealth plugin
|
||||
puppeteer.use(StealthPlugin());
|
||||
|
||||
export const getWeeklyPullList = async (url: string) => {
|
||||
const browser = await puppeteer.launch({
|
||||
headless: true,
|
||||
slowMo: 50,
|
||||
args: useTor
|
||||
? [`--proxy-server=${torProxy}`]
|
||||
: ["--no-sandbox", "--disable-setuid-sandbox"],
|
||||
});
|
||||
const seriesId = dom.window.document
|
||||
.querySelector("#comic-list-block")
|
||||
.getAttribute("data-series-id");
|
||||
const issueNodes = dom.window.document.querySelectorAll(
|
||||
"ul.comic-list-thumbs > li"
|
||||
);
|
||||
|
||||
const issues: any = [];
|
||||
issueNodes.forEach(node => {
|
||||
const comicHref = node.querySelector("a").getAttribute("href");
|
||||
const issueCoverImage = node.querySelector("img").getAttribute("src");
|
||||
const issueDetails = node.querySelector("img").getAttribute("alt");
|
||||
const issueDate = node.querySelector("span.date").getAttribute("data-date");
|
||||
const formattedIssueDate = node.querySelector("span.date").textContent.trim();
|
||||
const publisher = node.querySelector("div.publisher").textContent.trim();
|
||||
const page = await browser.newPage();
|
||||
|
||||
issues.push({
|
||||
comicHref,
|
||||
issueCoverImage,
|
||||
issueDetails,
|
||||
issueDate,
|
||||
formattedIssueDate,
|
||||
publisher,
|
||||
await page.setExtraHTTPHeaders({
|
||||
"Accept-Language": "en-US,en;q=0.9",
|
||||
"Referer": "https://leagueofcomicgeeks.com/",
|
||||
});
|
||||
|
||||
await page.setUserAgent(faker.internet.userAgent());
|
||||
|
||||
await page.setViewport({
|
||||
width: faker.number.int({ min: 1024, max: 1920 }),
|
||||
height: faker.number.int({ min: 768, max: 1080 }),
|
||||
});
|
||||
|
||||
try {
|
||||
await page.goto(url, {
|
||||
waitUntil: "domcontentloaded", // faster and more reliable for JS-rendered content
|
||||
timeout: 30000, // give it time on Tor or slow networks
|
||||
});
|
||||
});
|
||||
return {
|
||||
seriesId,
|
||||
issues,
|
||||
};
|
||||
|
||||
await page.waitForSelector(".issue", { timeout: 30000 });
|
||||
console.log("✅ Found .issue blocks");
|
||||
|
||||
return await page.evaluate(() => {
|
||||
const issues = Array.from(document.querySelectorAll(".issue"));
|
||||
|
||||
return issues.map(issue => {
|
||||
const issueUrlElement = issue.querySelector(".cover a");
|
||||
const coverImageElement =
|
||||
issue.querySelector(".cover img.lazy");
|
||||
const publisherText =
|
||||
issue.querySelector("div.publisher")?.textContent?.trim() ||
|
||||
null;
|
||||
const issueName =
|
||||
issue
|
||||
.querySelector("div.title")
|
||||
?.getAttribute("data-sorting") || null;
|
||||
|
||||
// Convert Unix timestamp (in seconds) to YYYY-MM-DD
|
||||
const publicationDateRaw = issue
|
||||
.querySelector(".date")
|
||||
?.getAttribute("data-date");
|
||||
const publicationDate = publicationDateRaw
|
||||
? new Date(parseInt(publicationDateRaw, 10) * 1000)
|
||||
.toISOString()
|
||||
.split("T")[0]
|
||||
: null;
|
||||
|
||||
const imageUrl =
|
||||
coverImageElement?.getAttribute("data-src") ||
|
||||
coverImageElement?.getAttribute("src") ||
|
||||
null;
|
||||
|
||||
const coverImageUrl = imageUrl
|
||||
? imageUrl.replace(/\/medium-(\d+\.jpg)/, "/large-$1")
|
||||
: null;
|
||||
|
||||
const issueUrl = issueUrlElement?.getAttribute("href") || null;
|
||||
|
||||
return {
|
||||
issueName,
|
||||
coverImageUrl,
|
||||
issueUrl,
|
||||
publisher: publisherText,
|
||||
publicationDate,
|
||||
};
|
||||
});
|
||||
});
|
||||
} catch (err) {
|
||||
console.error("❌ Scraper error:", err);
|
||||
throw err;
|
||||
} finally {
|
||||
await browser.close();
|
||||
}
|
||||
};
|
||||
|
||||
// export const scrapeIssuesFromSeriesPage = async (url: string) => {
|
||||
// const response = await axios(url);
|
||||
// const dom = new JSDOM(response.data, {
|
||||
// url,
|
||||
// referrer: url,
|
||||
// contentType: "text/html",
|
||||
// includeNodeLocations: true,
|
||||
// storageQuota: 10000000,
|
||||
// });
|
||||
// const seriesId = dom.window.document
|
||||
// .querySelector("#comic-list-block")
|
||||
// .getAttribute("data-series-id");
|
||||
// const issueNodes = dom.window.document.querySelectorAll(
|
||||
// "ul.comic-list-thumbs > li"
|
||||
// );
|
||||
|
||||
// const issues: any = [];
|
||||
// issueNodes.forEach(node => {
|
||||
// const comicHref = node.querySelector("a").getAttribute("href");
|
||||
// const issueCoverImage = node.querySelector("img").getAttribute("src");
|
||||
// const issueDetails = node.querySelector("img").getAttribute("alt");
|
||||
// const issueDate = node.querySelector("span.date").getAttribute("data-date");
|
||||
// const formattedIssueDate = node.querySelector("span.date").textContent.trim();
|
||||
// const publisher = node.querySelector("div.publisher").textContent.trim();
|
||||
|
||||
// issues.push({
|
||||
// comicHref,
|
||||
// issueCoverImage,
|
||||
// issueDetails,
|
||||
// issueDate,
|
||||
// formattedIssueDate,
|
||||
// publisher,
|
||||
// });
|
||||
// });
|
||||
// return {
|
||||
// seriesId,
|
||||
// issues,
|
||||
// };
|
||||
// };
|
||||
|
||||
export const scrapeIssuePage = async (url: string) => {
|
||||
const response = await axios(url);
|
||||
const dom = new JSDOM(response.data, {
|
||||
url,
|
||||
referrer: url,
|
||||
contentType: "text/html",
|
||||
includeNodeLocations: true,
|
||||
storageQuota: 10000000,
|
||||
});
|
||||
const seriesDOMElement = dom.window.document
|
||||
.querySelector("div.series-pagination > a.series").getAttribute("href");
|
||||
return seriesDOMElement;
|
||||
};
|
||||
|
||||
|
||||
export const getWeeklyPullList = async () => {
|
||||
const url = "https://www.tfaw.com/comics/new-releases.html";
|
||||
const response = await axios(url);
|
||||
const dom = new JSDOM(response.data, {
|
||||
url,
|
||||
@@ -67,22 +145,8 @@ export const getWeeklyPullList = async () => {
|
||||
includeNodeLocations: true,
|
||||
storageQuota: 10000000,
|
||||
});
|
||||
|
||||
const pullList: any[] = [];
|
||||
// Node for the comics container
|
||||
const issueNodes = dom.window.document.querySelectorAll("ol.products > li");
|
||||
|
||||
issueNodes.forEach(node => {
|
||||
const coverImageUrl = node.querySelector("img.photo").getAttribute("data-src");
|
||||
const name = node.querySelector("div.product > a.product").textContent.trim();
|
||||
const publicationDate = node.querySelector("div.product-item-date").textContent.trim();
|
||||
pullList.push({
|
||||
coverImageUrl,
|
||||
name,
|
||||
publicationDate,
|
||||
});
|
||||
});
|
||||
|
||||
return pullList;
|
||||
|
||||
const seriesDOMElement = dom.window.document
|
||||
.querySelector("div.series-pagination > a.series")
|
||||
.getAttribute("href");
|
||||
return seriesDOMElement;
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user