5 Commits

Author SHA1 Message Date
cad3326417 ✏️ Scraping tools updated to support LoCG 2025-07-14 12:10:27 -04:00
8d5283402e 🔧 Fixed LoCG scraping 2025-05-06 18:17:01 -04:00
f337c0f3e6 🐳 Updated Dockerfile to use a builder step 2025-02-25 16:20:44 -05:00
69ed09180a 🔧 Fixed error handling for temp directory 2025-02-20 17:33:17 -05:00
aa350ef307 Merge pull request #2 from rishighan/comicvine-improvements (Comicvine improvements) 2025-02-20 12:38:56 -05:00
7 changed files with 3591 additions and 942 deletions

View File

@@ -1,20 +1,37 @@
FROM node:12-alpine
# Use Node 21 as the base image for the builder stage
FROM node:21-alpine AS builder
LABEL maintainer="Rishi Ghan <rishi.ghan@gmail.com>"
# Working directory
# Set the working directory
WORKDIR /metadata-service
# Install dependencies
# Copy and install dependencies
COPY package.json package-lock.json ./
RUN npm ci --silent
# Copy source
# Copy source code and build the application
COPY . .
RUN npm run build
# Build and cleanup
# Clean up development dependencies
RUN npm prune --production
# Final image using Node 21
FROM node:21-alpine
LABEL maintainer="Rishi Ghan <rishi.ghan@gmail.com>"
# Set the working directory
WORKDIR /metadata-service
# Copy the necessary files from the builder image
COPY --from=builder /metadata-service /metadata-service
# Set environment variables
ENV NODE_ENV=production
RUN npm run build \
&& npm prune
# Expose the application's port
EXPOSE 3080
# Start server
CMD ["npm", "start"]
# Start the application
CMD ["npm", "start"]

package-lock.json (generated) — 4258 lines changed

File diff suppressed because it is too large.

View File

@@ -20,6 +20,7 @@
],
"author": "",
"devDependencies": {
"@faker-js/faker": "^9.7.0",
"@types/jsdom": "^16.2.14",
"@types/lodash": "^4.14.171",
"@types/string-similarity": "^4.0.0",
@@ -31,6 +32,10 @@
"jest": "^25.1.0",
"jest-cli": "^25.1.0",
"moleculer-repl": "^0.6.2",
"puppeteer": "^24.7.1",
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-stealth": "^2.11.2",
"telnet-client": "^2.2.5",
"threetwo-ui-typings": "^1.0.14",
"ts-jest": "^25.3.0",
"ts-node": "^8.8.1"

View File

@@ -5,11 +5,7 @@ import axios from "axios";
import { isNil, isUndefined } from "lodash";
import { fetchReleases, FilterTypes, SortTypes } from "comicgeeks";
import { matchScorer, rankVolumes } from "../utils/searchmatchscorer.utils";
import {
scrapeIssuesFromSeriesPage,
scrapeIssuePage,
getWeeklyPullList,
} from "../utils/scraping.utils";
import { scrapeIssuePage, getWeeklyPullList } from "../utils/scraping.utils";
const { calculateLimitAndOffset, paginate } = require("paginate-info");
const { MoleculerError } = require("moleculer").Errors;
@@ -108,22 +104,10 @@ export default class ComicVineService extends Service {
return issues.data;
},
},
scrapeLOCGForSeries: {
rest: "POST /scrapeLOCGForSeries",
params: {},
handler: async (ctx: Context<{}>) => {
const seriesURIFragment = await scrapeIssuePage(
"https://leagueofcomicgeeks.com/comic/5878833/hulk-4"
);
return await scrapeIssuesFromSeriesPage(
`https://leagueofcomicgeeks.com/${seriesURIFragment}`
);
},
},
getWeeklyPullList: {
rest: "GET /getWeeklyPullList",
rest: "POST /scrapeLOCGForSeries",
timeout: 30000,
params: {},
timeout: 10000000,
handler: async (
ctx: Context<{
startDate: string;
@@ -131,26 +115,32 @@ export default class ComicVineService extends Service {
pageSize: string;
}>
) => {
const { currentPage, pageSize } = ctx.params;
const { currentPage, pageSize, startDate } = ctx.params;
console.log(`date for the pull list: ${startDate}`);
const { limit, offset } = calculateLimitAndOffset(
currentPage,
pageSize
parseInt(currentPage, 10),
parseInt(pageSize, 10)
);
const response = await getWeeklyPullList();
console.log(JSON.stringify(response, null, 4));
const url = `https://leagueofcomicgeeks.com/comics/new-comics/${startDate}`;
const issues = await getWeeklyPullList(url);
const count = response.length;
const paginatedData = response.slice(
const count = issues.length;
const paginatedData = issues.slice(
offset,
offset + limit
);
const paginationInfo = paginate(
currentPage,
parseInt(currentPage, 10),
count,
paginatedData
);
return { result: paginatedData, meta: paginationInfo };
return {
result: paginatedData,
meta: paginationInfo,
};
},
},
getResource: {
@@ -186,7 +176,7 @@ export default class ComicVineService extends Service {
field_list: `${fieldList}`,
},
headers: {
Accept: "application/json",
"Accept": "application/json",
"User-Agent": "ThreeTwo",
},
});
@@ -289,7 +279,7 @@ export default class ComicVineService extends Service {
filter: filterString,
},
headers: {
Accept: "application/json",
"Accept": "application/json",
"User-Agent": "ThreeTwo",
},
});
@@ -345,7 +335,7 @@ export default class ComicVineService extends Service {
rest: "POST /getComicVineMatchScores",
handler: async (
ctx: Context<{
finalMatches: Array<any>;
finalMatches: any[];
rawFileDetails: any;
scorerConfiguration: any;
}>
@@ -382,7 +372,7 @@ export default class ComicVineService extends Service {
resources: "volumes",
},
headers: {
Accept: "application/json",
"Accept": "application/json",
"User-Agent": "ThreeTwo",
},
});
@@ -401,7 +391,7 @@ export default class ComicVineService extends Service {
format: "json",
},
headers: {
Accept: "application/json",
"Accept": "application/json",
"User-Agent":
"ThreeTwo",
},
@@ -493,7 +483,7 @@ export default class ComicVineService extends Service {
limit: 100,
},
headers: {
Accept: "application/json",
"Accept": "application/json",
"User-Agent": "ThreeTwo",
},
});
@@ -508,7 +498,7 @@ export default class ComicVineService extends Service {
: null; // Extract the year from cover_date
return {
...issue,
year: year,
year,
description: issue.description || "",
image: issue.image || {},
};
@@ -548,7 +538,7 @@ export default class ComicVineService extends Service {
resources,
},
headers: {
Accept: "application/json",
"Accept": "application/json",
"User-Agent": "ThreeTwo",
},
});

View File

@@ -8,7 +8,8 @@
"sourceMap": true,
"pretty": true,
"target": "es6",
"outDir": "dist"
"outDir": "dist",
"skipLibCheck": true,
},
"include": ["./**/*"],
"exclude": [

View File

@@ -1,64 +1,103 @@
import jsdom from "jsdom";
import puppeteer from "puppeteer-extra";
import StealthPlugin from "puppeteer-extra-plugin-stealth";
import { faker } from "@faker-js/faker";
import axios from "axios";
const { JSDOM } = jsdom;
import { JSDOM } from "jsdom";
export const scrapeIssuesFromSeriesPage = async (url: string) => {
const response = await axios(url);
const dom = new JSDOM(response.data, {
url,
referrer: url,
contentType: "text/html",
includeNodeLocations: true,
storageQuota: 10000000,
// Optional Tor
const useTor = process.env.USE_TOR === "true";
const torProxy = process.env.TOR_SOCKS_PROXY || "socks5://192.168.1.119:9050";
// Apply stealth plugin
puppeteer.use(StealthPlugin());
export const getWeeklyPullList = async (url: string) => {
const browser = await puppeteer.launch({
headless: true,
slowMo: 50,
args: useTor
? [`--proxy-server=${torProxy}`]
: ["--no-sandbox", "--disable-setuid-sandbox"],
});
const seriesId = dom.window.document
.querySelector("#comic-list-block")
.getAttribute("data-series-id");
const issueNodes = dom.window.document.querySelectorAll(
"ul.comic-list-thumbs > li"
);
const issues: any = [];
issueNodes.forEach(node => {
const comicHref = node.querySelector("a").getAttribute("href");
const issueCoverImage = node.querySelector("img").getAttribute("src");
const issueDetails = node.querySelector("img").getAttribute("alt");
const issueDate = node.querySelector("span.date").getAttribute("data-date");
const formattedIssueDate = node.querySelector("span.date").textContent.trim();
const publisher = node.querySelector("div.publisher").textContent.trim();
const page = await browser.newPage();
issues.push({
comicHref,
issueCoverImage,
issueDetails,
issueDate,
formattedIssueDate,
publisher,
await page.setExtraHTTPHeaders({
"Accept-Language": "en-US,en;q=0.9",
"Referer": "https://leagueofcomicgeeks.com/",
});
await page.setUserAgent(faker.internet.userAgent());
await page.setViewport({
width: faker.number.int({ min: 1024, max: 1920 }),
height: faker.number.int({ min: 768, max: 1080 }),
});
try {
await page.goto(url, {
waitUntil: "domcontentloaded", // Faster and more reliable for JS-rendered content
timeout: 30000, // Give it time on Tor or slow networks
});
});
return {
seriesId,
issues,
};
await page.waitForSelector(".issue", { timeout: 30000 });
console.log("✅ Found .issue blocks");
return await page.evaluate(() => {
const issues = Array.from(document.querySelectorAll(".issue"));
return issues.map(issue => {
const issueUrlElement = issue.querySelector(".cover a");
const coverImageElement =
issue.querySelector(".cover img.lazy");
const publisherText =
issue.querySelector("div.publisher")?.textContent?.trim() ||
null;
const issueName =
issue
.querySelector("div.title")
?.getAttribute("data-sorting") || null;
// Convert Unix timestamp (in seconds) to YYYY-MM-DD
const publicationDateRaw = issue
.querySelector(".date")
?.getAttribute("data-date");
const publicationDate = publicationDateRaw
? new Date(parseInt(publicationDateRaw, 10) * 1000)
.toISOString()
.split("T")[0]
: null;
const imageUrl =
coverImageElement?.getAttribute("data-src") ||
coverImageElement?.getAttribute("src") ||
null;
const coverImageUrl = imageUrl
? imageUrl.replace(/\/medium-(\d+\.jpg)/, "/large-$1")
: null;
const issueUrl = issueUrlElement?.getAttribute("href") || null;
return {
issueName,
coverImageUrl,
issueUrl,
publisher: publisherText,
publicationDate,
};
});
});
} catch (err) {
console.error("❌ Scraper error:", err);
throw err;
} finally {
await browser.close();
}
};
export const scrapeIssuePage = async (url: string) => {
const response = await axios(url);
const dom = new JSDOM(response.data, {
url,
referrer: url,
contentType: "text/html",
includeNodeLocations: true,
storageQuota: 10000000,
});
const seriesDOMElement = dom.window.document
.querySelector("div.series-pagination > a.series").getAttribute("href");
return seriesDOMElement;
};
export const getWeeklyPullList = async () => {
const url = "https://www.tfaw.com/comics/new-releases.html";
const response = await axios(url);
const dom = new JSDOM(response.data, {
url,
@@ -67,22 +106,8 @@ export const getWeeklyPullList = async () => {
includeNodeLocations: true,
storageQuota: 10000000,
});
const pullList: any[] = [];
// Node for the comics container
const issueNodes = dom.window.document.querySelectorAll("ol.products > li");
issueNodes.forEach(node => {
const coverImageUrl = node.querySelector("img.photo").getAttribute("data-src");
const name = node.querySelector("div.product > a.product").textContent.trim();
const publicationDate = node.querySelector("div.product-item-date").textContent.trim();
pullList.push({
coverImageUrl,
name,
publicationDate,
});
});
return pullList;
const seriesDOMElement = dom.window.document
.querySelector("div.series-pagination > a.series")
.getAttribute("href");
return seriesDOMElement;
};

View File

@@ -152,8 +152,9 @@ const calculateLevenshteinDistance = async (match: any, rawFileDetails: any) =>
console.log(rawFileDetails.cover.filePath);
const fileName = match.id + "_" + rawFileDetails.name + ".jpg";
// Ensure the `temporary` directory exists
if (!existsSync("temporary")) {
mkdirSync("temporary", { recursive: true });
const tempDir = path.join(`${process.env.USERDATA_DIRECTORY}`, "temporary");
if (!existsSync(tempDir)) {
mkdirSync(tempDir, { recursive: true });
}
const file = createWriteStream(
`${process.env.USERDATA_DIRECTORY}/temporary/${fileName}`