5 Commits

Author SHA1 Message Date
cad3326417 ✏️ Scraping tools updated to support LoCG 2025-07-14 12:10:27 -04:00
8d5283402e 🔧 Fixed LoCG scraping 2025-05-06 18:17:01 -04:00
f337c0f3e6 🐳 Updated Dockerfile to use a builder step 2025-02-25 16:20:44 -05:00
69ed09180a 🔧 Fixed error handling for temp directory 2025-02-20 17:33:17 -05:00
aa350ef307 Merge pull request #2 from rishighan/comicvine-improvements
Comicvine improvements
2025-02-20 12:38:56 -05:00
7 changed files with 3591 additions and 942 deletions

View File

@@ -1,20 +1,37 @@
FROM node:12-alpine # Use Node 21 as the base image for the builder stage
FROM node:21-alpine AS builder
LABEL maintainer="Rishi Ghan <rishi.ghan@gmail.com>" LABEL maintainer="Rishi Ghan <rishi.ghan@gmail.com>"
# Working directory # Set the working directory
WORKDIR /metadata-service WORKDIR /metadata-service
# Install dependencies
# Copy and install dependencies
COPY package.json package-lock.json ./ COPY package.json package-lock.json ./
RUN npm ci --silent RUN npm ci --silent
# Copy source # Copy source code and build the application
COPY . . COPY . .
RUN npm run build
# Build and cleanup # Clean up development dependencies
RUN npm prune --production
# Final image using Node 21
FROM node:21-alpine
LABEL maintainer="Rishi Ghan <rishi.ghan@gmail.com>"
# Set the working directory
WORKDIR /metadata-service
# Copy the necessary files from the builder image
COPY --from=builder /metadata-service /metadata-service
# Set environment variables
ENV NODE_ENV=production ENV NODE_ENV=production
RUN npm run build \
&& npm prune
# Expose the application's port
EXPOSE 3080 EXPOSE 3080
# Start server
# Start the application
CMD ["npm", "start"] CMD ["npm", "start"]

4258
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -20,6 +20,7 @@
], ],
"author": "", "author": "",
"devDependencies": { "devDependencies": {
"@faker-js/faker": "^9.7.0",
"@types/jsdom": "^16.2.14", "@types/jsdom": "^16.2.14",
"@types/lodash": "^4.14.171", "@types/lodash": "^4.14.171",
"@types/string-similarity": "^4.0.0", "@types/string-similarity": "^4.0.0",
@@ -31,6 +32,10 @@
"jest": "^25.1.0", "jest": "^25.1.0",
"jest-cli": "^25.1.0", "jest-cli": "^25.1.0",
"moleculer-repl": "^0.6.2", "moleculer-repl": "^0.6.2",
"puppeteer": "^24.7.1",
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-stealth": "^2.11.2",
"telnet-client": "^2.2.5",
"threetwo-ui-typings": "^1.0.14", "threetwo-ui-typings": "^1.0.14",
"ts-jest": "^25.3.0", "ts-jest": "^25.3.0",
"ts-node": "^8.8.1" "ts-node": "^8.8.1"

View File

@@ -5,11 +5,7 @@ import axios from "axios";
import { isNil, isUndefined } from "lodash"; import { isNil, isUndefined } from "lodash";
import { fetchReleases, FilterTypes, SortTypes } from "comicgeeks"; import { fetchReleases, FilterTypes, SortTypes } from "comicgeeks";
import { matchScorer, rankVolumes } from "../utils/searchmatchscorer.utils"; import { matchScorer, rankVolumes } from "../utils/searchmatchscorer.utils";
import { import { scrapeIssuePage, getWeeklyPullList } from "../utils/scraping.utils";
scrapeIssuesFromSeriesPage,
scrapeIssuePage,
getWeeklyPullList,
} from "../utils/scraping.utils";
const { calculateLimitAndOffset, paginate } = require("paginate-info"); const { calculateLimitAndOffset, paginate } = require("paginate-info");
const { MoleculerError } = require("moleculer").Errors; const { MoleculerError } = require("moleculer").Errors;
@@ -108,22 +104,10 @@ export default class ComicVineService extends Service {
return issues.data; return issues.data;
}, },
}, },
scrapeLOCGForSeries: {
rest: "POST /scrapeLOCGForSeries",
params: {},
handler: async (ctx: Context<{}>) => {
const seriesURIFragment = await scrapeIssuePage(
"https://leagueofcomicgeeks.com/comic/5878833/hulk-4"
);
return await scrapeIssuesFromSeriesPage(
`https://leagueofcomicgeeks.com/${seriesURIFragment}`
);
},
},
getWeeklyPullList: { getWeeklyPullList: {
rest: "GET /getWeeklyPullList", rest: "POST /scrapeLOCGForSeries",
timeout: 30000,
params: {}, params: {},
timeout: 10000000,
handler: async ( handler: async (
ctx: Context<{ ctx: Context<{
startDate: string; startDate: string;
@@ -131,26 +115,32 @@ export default class ComicVineService extends Service {
pageSize: string; pageSize: string;
}> }>
) => { ) => {
const { currentPage, pageSize } = ctx.params; const { currentPage, pageSize, startDate } = ctx.params;
console.log(`date for the pull list: ${startDate}`);
const { limit, offset } = calculateLimitAndOffset( const { limit, offset } = calculateLimitAndOffset(
currentPage, parseInt(currentPage, 10),
pageSize parseInt(pageSize, 10)
); );
const response = await getWeeklyPullList(); const url = `https://leagueofcomicgeeks.com/comics/new-comics/${startDate}`;
console.log(JSON.stringify(response, null, 4)); const issues = await getWeeklyPullList(url);
const count = response.length; const count = issues.length;
const paginatedData = response.slice( const paginatedData = issues.slice(
offset, offset,
offset + limit offset + limit
); );
const paginationInfo = paginate( const paginationInfo = paginate(
currentPage, parseInt(currentPage, 10),
count, count,
paginatedData paginatedData
); );
return { result: paginatedData, meta: paginationInfo };
return {
result: paginatedData,
meta: paginationInfo,
};
}, },
}, },
getResource: { getResource: {
@@ -186,7 +176,7 @@ export default class ComicVineService extends Service {
field_list: `${fieldList}`, field_list: `${fieldList}`,
}, },
headers: { headers: {
Accept: "application/json", "Accept": "application/json",
"User-Agent": "ThreeTwo", "User-Agent": "ThreeTwo",
}, },
}); });
@@ -289,7 +279,7 @@ export default class ComicVineService extends Service {
filter: filterString, filter: filterString,
}, },
headers: { headers: {
Accept: "application/json", "Accept": "application/json",
"User-Agent": "ThreeTwo", "User-Agent": "ThreeTwo",
}, },
}); });
@@ -345,7 +335,7 @@ export default class ComicVineService extends Service {
rest: "POST /getComicVineMatchScores", rest: "POST /getComicVineMatchScores",
handler: async ( handler: async (
ctx: Context<{ ctx: Context<{
finalMatches: Array<any>; finalMatches: any[];
rawFileDetails: any; rawFileDetails: any;
scorerConfiguration: any; scorerConfiguration: any;
}> }>
@@ -382,7 +372,7 @@ export default class ComicVineService extends Service {
resources: "volumes", resources: "volumes",
}, },
headers: { headers: {
Accept: "application/json", "Accept": "application/json",
"User-Agent": "ThreeTwo", "User-Agent": "ThreeTwo",
}, },
}); });
@@ -401,7 +391,7 @@ export default class ComicVineService extends Service {
format: "json", format: "json",
}, },
headers: { headers: {
Accept: "application/json", "Accept": "application/json",
"User-Agent": "User-Agent":
"ThreeTwo", "ThreeTwo",
}, },
@@ -493,7 +483,7 @@ export default class ComicVineService extends Service {
limit: 100, limit: 100,
}, },
headers: { headers: {
Accept: "application/json", "Accept": "application/json",
"User-Agent": "ThreeTwo", "User-Agent": "ThreeTwo",
}, },
}); });
@@ -508,7 +498,7 @@ export default class ComicVineService extends Service {
: null; // Extract the year from cover_date : null; // Extract the year from cover_date
return { return {
...issue, ...issue,
year: year, year,
description: issue.description || "", description: issue.description || "",
image: issue.image || {}, image: issue.image || {},
}; };
@@ -548,7 +538,7 @@ export default class ComicVineService extends Service {
resources, resources,
}, },
headers: { headers: {
Accept: "application/json", "Accept": "application/json",
"User-Agent": "ThreeTwo", "User-Agent": "ThreeTwo",
}, },
}); });

View File

@@ -8,7 +8,8 @@
"sourceMap": true, "sourceMap": true,
"pretty": true, "pretty": true,
"target": "es6", "target": "es6",
"outDir": "dist" "outDir": "dist",
"skipLibCheck": true,
}, },
"include": ["./**/*"], "include": ["./**/*"],
"exclude": [ "exclude": [

View File

@@ -1,64 +1,103 @@
import jsdom from "jsdom"; import puppeteer from "puppeteer-extra";
import StealthPlugin from "puppeteer-extra-plugin-stealth";
import { faker } from "@faker-js/faker";
import axios from "axios"; import axios from "axios";
const { JSDOM } = jsdom; import { JSDOM } from "jsdom";
export const scrapeIssuesFromSeriesPage = async (url: string) => { // Optional Tor
const response = await axios(url); const useTor = process.env.USE_TOR === "true";
const dom = new JSDOM(response.data, { const torProxy = process.env.TOR_SOCKS_PROXY || "socks5://192.168.1.119:9050";
url,
referrer: url, // Apply stealth plugin
contentType: "text/html", puppeteer.use(StealthPlugin());
includeNodeLocations: true,
storageQuota: 10000000, export const getWeeklyPullList = async (url: string) => {
const browser = await puppeteer.launch({
headless: true,
slowMo: 50,
args: useTor
? [`--proxy-server=${torProxy}`]
: ["--no-sandbox", "--disable-setuid-sandbox"],
}); });
const seriesId = dom.window.document
.querySelector("#comic-list-block")
.getAttribute("data-series-id");
const issueNodes = dom.window.document.querySelectorAll(
"ul.comic-list-thumbs > li"
);
const issues: any = []; const page = await browser.newPage();
issueNodes.forEach(node => {
const comicHref = node.querySelector("a").getAttribute("href");
const issueCoverImage = node.querySelector("img").getAttribute("src");
const issueDetails = node.querySelector("img").getAttribute("alt");
const issueDate = node.querySelector("span.date").getAttribute("data-date");
const formattedIssueDate = node.querySelector("span.date").textContent.trim();
const publisher = node.querySelector("div.publisher").textContent.trim();
issues.push({ await page.setExtraHTTPHeaders({
comicHref, "Accept-Language": "en-US,en;q=0.9",
issueCoverImage, "Referer": "https://leagueofcomicgeeks.com/",
issueDetails, });
issueDate,
formattedIssueDate, await page.setUserAgent(faker.internet.userAgent());
publisher,
await page.setViewport({
width: faker.number.int({ min: 1024, max: 1920 }),
height: faker.number.int({ min: 768, max: 1080 }),
});
try {
await page.goto(url, {
waitUntil: "domcontentloaded", // Faster and more reliable for JS-rendered content
timeout: 30000, // Give it time on Tor or slow networks
}); });
});
return { await page.waitForSelector(".issue", { timeout: 30000 });
seriesId, console.log("✅ Found .issue blocks");
issues,
}; return await page.evaluate(() => {
const issues = Array.from(document.querySelectorAll(".issue"));
return issues.map(issue => {
const issueUrlElement = issue.querySelector(".cover a");
const coverImageElement =
issue.querySelector(".cover img.lazy");
const publisherText =
issue.querySelector("div.publisher")?.textContent?.trim() ||
null;
const issueName =
issue
.querySelector("div.title")
?.getAttribute("data-sorting") || null;
// Convert Unix timestamp (in seconds) to YYYY-MM-DD
const publicationDateRaw = issue
.querySelector(".date")
?.getAttribute("data-date");
const publicationDate = publicationDateRaw
? new Date(parseInt(publicationDateRaw, 10) * 1000)
.toISOString()
.split("T")[0]
: null;
const imageUrl =
coverImageElement?.getAttribute("data-src") ||
coverImageElement?.getAttribute("src") ||
null;
const coverImageUrl = imageUrl
? imageUrl.replace(/\/medium-(\d+\.jpg)/, "/large-$1")
: null;
const issueUrl = issueUrlElement?.getAttribute("href") || null;
return {
issueName,
coverImageUrl,
issueUrl,
publisher: publisherText,
publicationDate,
};
});
});
} catch (err) {
console.error("❌ Scraper error:", err);
throw err;
} finally {
await browser.close();
}
}; };
export const scrapeIssuePage = async (url: string) => { export const scrapeIssuePage = async (url: string) => {
const response = await axios(url);
const dom = new JSDOM(response.data, {
url,
referrer: url,
contentType: "text/html",
includeNodeLocations: true,
storageQuota: 10000000,
});
const seriesDOMElement = dom.window.document
.querySelector("div.series-pagination > a.series").getAttribute("href");
return seriesDOMElement;
};
export const getWeeklyPullList = async () => {
const url = "https://www.tfaw.com/comics/new-releases.html";
const response = await axios(url); const response = await axios(url);
const dom = new JSDOM(response.data, { const dom = new JSDOM(response.data, {
url, url,
@@ -67,22 +106,8 @@ export const getWeeklyPullList = async () => {
includeNodeLocations: true, includeNodeLocations: true,
storageQuota: 10000000, storageQuota: 10000000,
}); });
const seriesDOMElement = dom.window.document
const pullList: any[] = []; .querySelector("div.series-pagination > a.series")
// Node for the comics container .getAttribute("href");
const issueNodes = dom.window.document.querySelectorAll("ol.products > li"); return seriesDOMElement;
issueNodes.forEach(node => {
const coverImageUrl = node.querySelector("img.photo").getAttribute("data-src");
const name = node.querySelector("div.product > a.product").textContent.trim();
const publicationDate = node.querySelector("div.product-item-date").textContent.trim();
pullList.push({
coverImageUrl,
name,
publicationDate,
});
});
return pullList;
}; };

View File

@@ -152,8 +152,9 @@ const calculateLevenshteinDistance = async (match: any, rawFileDetails: any) =>
console.log(rawFileDetails.cover.filePath); console.log(rawFileDetails.cover.filePath);
const fileName = match.id + "_" + rawFileDetails.name + ".jpg"; const fileName = match.id + "_" + rawFileDetails.name + ".jpg";
// Ensure the `temporary` directory exists // Ensure the `temporary` directory exists
if (!existsSync("temporary")) { const tempDir = path.join(`${process.env.USERDATA_DIRECTORY}`, "temporary");
mkdirSync("temporary", { recursive: true }); if (!existsSync(tempDir)) {
mkdirSync(tempDir, { recursive: true });
} }
const file = createWriteStream( const file = createWriteStream(
`${process.env.USERDATA_DIRECTORY}/temporary/${fileName}` `${process.env.USERDATA_DIRECTORY}/temporary/${fileName}`