Compare commits
5 Commits
comicvine-
...
graphql-re
| Author | SHA1 | Date | |
|---|---|---|---|
| cad3326417 | |||
| 8d5283402e | |||
| f337c0f3e6 | |||
| 69ed09180a | |||
| aa350ef307 |
33
Dockerfile
33
Dockerfile
@@ -1,20 +1,37 @@
|
|||||||
FROM node:12-alpine
|
# Use Node 21 as the base image for the builder stage
|
||||||
|
FROM node:21-alpine AS builder
|
||||||
LABEL maintainer="Rishi Ghan <rishi.ghan@gmail.com>"
|
LABEL maintainer="Rishi Ghan <rishi.ghan@gmail.com>"
|
||||||
|
|
||||||
# Working directory
|
# Set the working directory
|
||||||
WORKDIR /metadata-service
|
WORKDIR /metadata-service
|
||||||
# Install dependencies
|
|
||||||
|
# Copy and install dependencies
|
||||||
COPY package.json package-lock.json ./
|
COPY package.json package-lock.json ./
|
||||||
RUN npm ci --silent
|
RUN npm ci --silent
|
||||||
|
|
||||||
# Copy source
|
# Copy source code and build the application
|
||||||
COPY . .
|
COPY . .
|
||||||
|
RUN npm run build
|
||||||
|
|
||||||
# Build and cleanup
|
# Clean up development dependencies
|
||||||
|
RUN npm prune --production
|
||||||
|
|
||||||
|
# Final image using Node 21
|
||||||
|
FROM node:21-alpine
|
||||||
|
|
||||||
|
LABEL maintainer="Rishi Ghan <rishi.ghan@gmail.com>"
|
||||||
|
|
||||||
|
# Set the working directory
|
||||||
|
WORKDIR /metadata-service
|
||||||
|
|
||||||
|
# Copy the necessary files from the builder image
|
||||||
|
COPY --from=builder /metadata-service /metadata-service
|
||||||
|
|
||||||
|
# Set environment variables
|
||||||
ENV NODE_ENV=production
|
ENV NODE_ENV=production
|
||||||
RUN npm run build \
|
|
||||||
&& npm prune
|
|
||||||
|
|
||||||
|
# Expose the application's port
|
||||||
EXPOSE 3080
|
EXPOSE 3080
|
||||||
# Start server
|
|
||||||
|
# Start the application
|
||||||
CMD ["npm", "start"]
|
CMD ["npm", "start"]
|
||||||
4258
package-lock.json
generated
4258
package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -20,6 +20,7 @@
|
|||||||
],
|
],
|
||||||
"author": "",
|
"author": "",
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
|
"@faker-js/faker": "^9.7.0",
|
||||||
"@types/jsdom": "^16.2.14",
|
"@types/jsdom": "^16.2.14",
|
||||||
"@types/lodash": "^4.14.171",
|
"@types/lodash": "^4.14.171",
|
||||||
"@types/string-similarity": "^4.0.0",
|
"@types/string-similarity": "^4.0.0",
|
||||||
@@ -31,6 +32,10 @@
|
|||||||
"jest": "^25.1.0",
|
"jest": "^25.1.0",
|
||||||
"jest-cli": "^25.1.0",
|
"jest-cli": "^25.1.0",
|
||||||
"moleculer-repl": "^0.6.2",
|
"moleculer-repl": "^0.6.2",
|
||||||
|
"puppeteer": "^24.7.1",
|
||||||
|
"puppeteer-extra": "^3.3.6",
|
||||||
|
"puppeteer-extra-plugin-stealth": "^2.11.2",
|
||||||
|
"telnet-client": "^2.2.5",
|
||||||
"threetwo-ui-typings": "^1.0.14",
|
"threetwo-ui-typings": "^1.0.14",
|
||||||
"ts-jest": "^25.3.0",
|
"ts-jest": "^25.3.0",
|
||||||
"ts-node": "^8.8.1"
|
"ts-node": "^8.8.1"
|
||||||
|
|||||||
@@ -5,11 +5,7 @@ import axios from "axios";
|
|||||||
import { isNil, isUndefined } from "lodash";
|
import { isNil, isUndefined } from "lodash";
|
||||||
import { fetchReleases, FilterTypes, SortTypes } from "comicgeeks";
|
import { fetchReleases, FilterTypes, SortTypes } from "comicgeeks";
|
||||||
import { matchScorer, rankVolumes } from "../utils/searchmatchscorer.utils";
|
import { matchScorer, rankVolumes } from "../utils/searchmatchscorer.utils";
|
||||||
import {
|
import { scrapeIssuePage, getWeeklyPullList } from "../utils/scraping.utils";
|
||||||
scrapeIssuesFromSeriesPage,
|
|
||||||
scrapeIssuePage,
|
|
||||||
getWeeklyPullList,
|
|
||||||
} from "../utils/scraping.utils";
|
|
||||||
const { calculateLimitAndOffset, paginate } = require("paginate-info");
|
const { calculateLimitAndOffset, paginate } = require("paginate-info");
|
||||||
const { MoleculerError } = require("moleculer").Errors;
|
const { MoleculerError } = require("moleculer").Errors;
|
||||||
|
|
||||||
@@ -108,22 +104,10 @@ export default class ComicVineService extends Service {
|
|||||||
return issues.data;
|
return issues.data;
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
scrapeLOCGForSeries: {
|
|
||||||
rest: "POST /scrapeLOCGForSeries",
|
|
||||||
params: {},
|
|
||||||
handler: async (ctx: Context<{}>) => {
|
|
||||||
const seriesURIFragment = await scrapeIssuePage(
|
|
||||||
"https://leagueofcomicgeeks.com/comic/5878833/hulk-4"
|
|
||||||
);
|
|
||||||
return await scrapeIssuesFromSeriesPage(
|
|
||||||
`https://leagueofcomicgeeks.com/${seriesURIFragment}`
|
|
||||||
);
|
|
||||||
},
|
|
||||||
},
|
|
||||||
getWeeklyPullList: {
|
getWeeklyPullList: {
|
||||||
rest: "GET /getWeeklyPullList",
|
rest: "POST /scrapeLOCGForSeries",
|
||||||
|
timeout: 30000,
|
||||||
params: {},
|
params: {},
|
||||||
timeout: 10000000,
|
|
||||||
handler: async (
|
handler: async (
|
||||||
ctx: Context<{
|
ctx: Context<{
|
||||||
startDate: string;
|
startDate: string;
|
||||||
@@ -131,26 +115,32 @@ export default class ComicVineService extends Service {
|
|||||||
pageSize: string;
|
pageSize: string;
|
||||||
}>
|
}>
|
||||||
) => {
|
) => {
|
||||||
const { currentPage, pageSize } = ctx.params;
|
const { currentPage, pageSize, startDate } = ctx.params;
|
||||||
|
console.log(`date for the pull list: ${startDate}`);
|
||||||
const { limit, offset } = calculateLimitAndOffset(
|
const { limit, offset } = calculateLimitAndOffset(
|
||||||
currentPage,
|
parseInt(currentPage, 10),
|
||||||
pageSize
|
parseInt(pageSize, 10)
|
||||||
);
|
);
|
||||||
|
|
||||||
const response = await getWeeklyPullList();
|
const url = `https://leagueofcomicgeeks.com/comics/new-comics/${startDate}`;
|
||||||
console.log(JSON.stringify(response, null, 4));
|
const issues = await getWeeklyPullList(url);
|
||||||
|
|
||||||
const count = response.length;
|
const count = issues.length;
|
||||||
const paginatedData = response.slice(
|
const paginatedData = issues.slice(
|
||||||
offset,
|
offset,
|
||||||
offset + limit
|
offset + limit
|
||||||
);
|
);
|
||||||
|
|
||||||
const paginationInfo = paginate(
|
const paginationInfo = paginate(
|
||||||
currentPage,
|
parseInt(currentPage, 10),
|
||||||
count,
|
count,
|
||||||
paginatedData
|
paginatedData
|
||||||
);
|
);
|
||||||
return { result: paginatedData, meta: paginationInfo };
|
|
||||||
|
return {
|
||||||
|
result: paginatedData,
|
||||||
|
meta: paginationInfo,
|
||||||
|
};
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
getResource: {
|
getResource: {
|
||||||
@@ -186,7 +176,7 @@ export default class ComicVineService extends Service {
|
|||||||
field_list: `${fieldList}`,
|
field_list: `${fieldList}`,
|
||||||
},
|
},
|
||||||
headers: {
|
headers: {
|
||||||
Accept: "application/json",
|
"Accept": "application/json",
|
||||||
"User-Agent": "ThreeTwo",
|
"User-Agent": "ThreeTwo",
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
@@ -289,7 +279,7 @@ export default class ComicVineService extends Service {
|
|||||||
filter: filterString,
|
filter: filterString,
|
||||||
},
|
},
|
||||||
headers: {
|
headers: {
|
||||||
Accept: "application/json",
|
"Accept": "application/json",
|
||||||
"User-Agent": "ThreeTwo",
|
"User-Agent": "ThreeTwo",
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
@@ -345,7 +335,7 @@ export default class ComicVineService extends Service {
|
|||||||
rest: "POST /getComicVineMatchScores",
|
rest: "POST /getComicVineMatchScores",
|
||||||
handler: async (
|
handler: async (
|
||||||
ctx: Context<{
|
ctx: Context<{
|
||||||
finalMatches: Array<any>;
|
finalMatches: any[];
|
||||||
rawFileDetails: any;
|
rawFileDetails: any;
|
||||||
scorerConfiguration: any;
|
scorerConfiguration: any;
|
||||||
}>
|
}>
|
||||||
@@ -382,7 +372,7 @@ export default class ComicVineService extends Service {
|
|||||||
resources: "volumes",
|
resources: "volumes",
|
||||||
},
|
},
|
||||||
headers: {
|
headers: {
|
||||||
Accept: "application/json",
|
"Accept": "application/json",
|
||||||
"User-Agent": "ThreeTwo",
|
"User-Agent": "ThreeTwo",
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
@@ -401,7 +391,7 @@ export default class ComicVineService extends Service {
|
|||||||
format: "json",
|
format: "json",
|
||||||
},
|
},
|
||||||
headers: {
|
headers: {
|
||||||
Accept: "application/json",
|
"Accept": "application/json",
|
||||||
"User-Agent":
|
"User-Agent":
|
||||||
"ThreeTwo",
|
"ThreeTwo",
|
||||||
},
|
},
|
||||||
@@ -493,7 +483,7 @@ export default class ComicVineService extends Service {
|
|||||||
limit: 100,
|
limit: 100,
|
||||||
},
|
},
|
||||||
headers: {
|
headers: {
|
||||||
Accept: "application/json",
|
"Accept": "application/json",
|
||||||
"User-Agent": "ThreeTwo",
|
"User-Agent": "ThreeTwo",
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
@@ -508,7 +498,7 @@ export default class ComicVineService extends Service {
|
|||||||
: null; // Extract the year from cover_date
|
: null; // Extract the year from cover_date
|
||||||
return {
|
return {
|
||||||
...issue,
|
...issue,
|
||||||
year: year,
|
year,
|
||||||
description: issue.description || "",
|
description: issue.description || "",
|
||||||
image: issue.image || {},
|
image: issue.image || {},
|
||||||
};
|
};
|
||||||
@@ -548,7 +538,7 @@ export default class ComicVineService extends Service {
|
|||||||
resources,
|
resources,
|
||||||
},
|
},
|
||||||
headers: {
|
headers: {
|
||||||
Accept: "application/json",
|
"Accept": "application/json",
|
||||||
"User-Agent": "ThreeTwo",
|
"User-Agent": "ThreeTwo",
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -8,7 +8,8 @@
|
|||||||
"sourceMap": true,
|
"sourceMap": true,
|
||||||
"pretty": true,
|
"pretty": true,
|
||||||
"target": "es6",
|
"target": "es6",
|
||||||
"outDir": "dist"
|
"outDir": "dist",
|
||||||
|
"skipLibCheck": true,
|
||||||
},
|
},
|
||||||
"include": ["./**/*"],
|
"include": ["./**/*"],
|
||||||
"exclude": [
|
"exclude": [
|
||||||
|
|||||||
@@ -1,64 +1,103 @@
|
|||||||
import jsdom from "jsdom";
|
import puppeteer from "puppeteer-extra";
|
||||||
|
import StealthPlugin from "puppeteer-extra-plugin-stealth";
|
||||||
|
import { faker } from "@faker-js/faker";
|
||||||
import axios from "axios";
|
import axios from "axios";
|
||||||
const { JSDOM } = jsdom;
|
import { JSDOM } from "jsdom";
|
||||||
|
|
||||||
export const scrapeIssuesFromSeriesPage = async (url: string) => {
|
// Optional Tor
|
||||||
const response = await axios(url);
|
const useTor = process.env.USE_TOR === "true";
|
||||||
const dom = new JSDOM(response.data, {
|
const torProxy = process.env.TOR_SOCKS_PROXY || "socks5://192.168.1.119:9050";
|
||||||
url,
|
|
||||||
referrer: url,
|
// Apply stealth plugin
|
||||||
contentType: "text/html",
|
puppeteer.use(StealthPlugin());
|
||||||
includeNodeLocations: true,
|
|
||||||
storageQuota: 10000000,
|
export const getWeeklyPullList = async (url: string) => {
|
||||||
|
const browser = await puppeteer.launch({
|
||||||
|
headless: true,
|
||||||
|
slowMo: 50,
|
||||||
|
args: useTor
|
||||||
|
? [`--proxy-server=${torProxy}`]
|
||||||
|
: ["--no-sandbox", "--disable-setuid-sandbox"],
|
||||||
});
|
});
|
||||||
const seriesId = dom.window.document
|
|
||||||
.querySelector("#comic-list-block")
|
|
||||||
.getAttribute("data-series-id");
|
|
||||||
const issueNodes = dom.window.document.querySelectorAll(
|
|
||||||
"ul.comic-list-thumbs > li"
|
|
||||||
);
|
|
||||||
|
|
||||||
const issues: any = [];
|
const page = await browser.newPage();
|
||||||
issueNodes.forEach(node => {
|
|
||||||
const comicHref = node.querySelector("a").getAttribute("href");
|
|
||||||
const issueCoverImage = node.querySelector("img").getAttribute("src");
|
|
||||||
const issueDetails = node.querySelector("img").getAttribute("alt");
|
|
||||||
const issueDate = node.querySelector("span.date").getAttribute("data-date");
|
|
||||||
const formattedIssueDate = node.querySelector("span.date").textContent.trim();
|
|
||||||
const publisher = node.querySelector("div.publisher").textContent.trim();
|
|
||||||
|
|
||||||
issues.push({
|
await page.setExtraHTTPHeaders({
|
||||||
comicHref,
|
"Accept-Language": "en-US,en;q=0.9",
|
||||||
issueCoverImage,
|
"Referer": "https://leagueofcomicgeeks.com/",
|
||||||
issueDetails,
|
});
|
||||||
issueDate,
|
|
||||||
formattedIssueDate,
|
await page.setUserAgent(faker.internet.userAgent());
|
||||||
publisher,
|
|
||||||
|
await page.setViewport({
|
||||||
|
width: faker.number.int({ min: 1024, max: 1920 }),
|
||||||
|
height: faker.number.int({ min: 768, max: 1080 }),
|
||||||
|
});
|
||||||
|
|
||||||
|
try {
|
||||||
|
await page.goto(url, {
|
||||||
|
waitUntil: "domcontentloaded", // Faster and more reliable for JS-rendered content
|
||||||
|
timeout: 30000, // Give it time on Tor or slow networks
|
||||||
});
|
});
|
||||||
});
|
|
||||||
return {
|
await page.waitForSelector(".issue", { timeout: 30000 });
|
||||||
seriesId,
|
console.log("✅ Found .issue blocks");
|
||||||
issues,
|
|
||||||
};
|
return await page.evaluate(() => {
|
||||||
|
const issues = Array.from(document.querySelectorAll(".issue"));
|
||||||
|
|
||||||
|
return issues.map(issue => {
|
||||||
|
const issueUrlElement = issue.querySelector(".cover a");
|
||||||
|
const coverImageElement =
|
||||||
|
issue.querySelector(".cover img.lazy");
|
||||||
|
const publisherText =
|
||||||
|
issue.querySelector("div.publisher")?.textContent?.trim() ||
|
||||||
|
null;
|
||||||
|
const issueName =
|
||||||
|
issue
|
||||||
|
.querySelector("div.title")
|
||||||
|
?.getAttribute("data-sorting") || null;
|
||||||
|
|
||||||
|
// Convert Unix timestamp (in seconds) to YYYY-MM-DD
|
||||||
|
const publicationDateRaw = issue
|
||||||
|
.querySelector(".date")
|
||||||
|
?.getAttribute("data-date");
|
||||||
|
const publicationDate = publicationDateRaw
|
||||||
|
? new Date(parseInt(publicationDateRaw, 10) * 1000)
|
||||||
|
.toISOString()
|
||||||
|
.split("T")[0]
|
||||||
|
: null;
|
||||||
|
|
||||||
|
const imageUrl =
|
||||||
|
coverImageElement?.getAttribute("data-src") ||
|
||||||
|
coverImageElement?.getAttribute("src") ||
|
||||||
|
null;
|
||||||
|
|
||||||
|
const coverImageUrl = imageUrl
|
||||||
|
? imageUrl.replace(/\/medium-(\d+\.jpg)/, "/large-$1")
|
||||||
|
: null;
|
||||||
|
|
||||||
|
const issueUrl = issueUrlElement?.getAttribute("href") || null;
|
||||||
|
|
||||||
|
return {
|
||||||
|
issueName,
|
||||||
|
coverImageUrl,
|
||||||
|
issueUrl,
|
||||||
|
publisher: publisherText,
|
||||||
|
publicationDate,
|
||||||
|
};
|
||||||
|
});
|
||||||
|
});
|
||||||
|
} catch (err) {
|
||||||
|
console.error("❌ Scraper error:", err);
|
||||||
|
throw err;
|
||||||
|
} finally {
|
||||||
|
await browser.close();
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
export const scrapeIssuePage = async (url: string) => {
|
export const scrapeIssuePage = async (url: string) => {
|
||||||
const response = await axios(url);
|
|
||||||
const dom = new JSDOM(response.data, {
|
|
||||||
url,
|
|
||||||
referrer: url,
|
|
||||||
contentType: "text/html",
|
|
||||||
includeNodeLocations: true,
|
|
||||||
storageQuota: 10000000,
|
|
||||||
});
|
|
||||||
const seriesDOMElement = dom.window.document
|
|
||||||
.querySelector("div.series-pagination > a.series").getAttribute("href");
|
|
||||||
return seriesDOMElement;
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
export const getWeeklyPullList = async () => {
|
|
||||||
const url = "https://www.tfaw.com/comics/new-releases.html";
|
|
||||||
const response = await axios(url);
|
const response = await axios(url);
|
||||||
const dom = new JSDOM(response.data, {
|
const dom = new JSDOM(response.data, {
|
||||||
url,
|
url,
|
||||||
@@ -67,22 +106,8 @@ export const getWeeklyPullList = async () => {
|
|||||||
includeNodeLocations: true,
|
includeNodeLocations: true,
|
||||||
storageQuota: 10000000,
|
storageQuota: 10000000,
|
||||||
});
|
});
|
||||||
|
const seriesDOMElement = dom.window.document
|
||||||
const pullList: any[] = [];
|
.querySelector("div.series-pagination > a.series")
|
||||||
// Node for the comics container
|
.getAttribute("href");
|
||||||
const issueNodes = dom.window.document.querySelectorAll("ol.products > li");
|
return seriesDOMElement;
|
||||||
|
|
||||||
issueNodes.forEach(node => {
|
|
||||||
const coverImageUrl = node.querySelector("img.photo").getAttribute("data-src");
|
|
||||||
const name = node.querySelector("div.product > a.product").textContent.trim();
|
|
||||||
const publicationDate = node.querySelector("div.product-item-date").textContent.trim();
|
|
||||||
pullList.push({
|
|
||||||
coverImageUrl,
|
|
||||||
name,
|
|
||||||
publicationDate,
|
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
return pullList;
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -152,8 +152,9 @@ const calculateLevenshteinDistance = async (match: any, rawFileDetails: any) =>
|
|||||||
console.log(rawFileDetails.cover.filePath);
|
console.log(rawFileDetails.cover.filePath);
|
||||||
const fileName = match.id + "_" + rawFileDetails.name + ".jpg";
|
const fileName = match.id + "_" + rawFileDetails.name + ".jpg";
|
||||||
// Ensure the `temporary` directory exists
|
// Ensure the `temporary` directory exists
|
||||||
if (!existsSync("temporary")) {
|
const tempDir = path.join(`${process.env.USERDATA_DIRECTORY}`, "temporary");
|
||||||
mkdirSync("temporary", { recursive: true });
|
if (!existsSync(tempDir)) {
|
||||||
|
mkdirSync(tempDir, { recursive: true });
|
||||||
}
|
}
|
||||||
const file = createWriteStream(
|
const file = createWriteStream(
|
||||||
`${process.env.USERDATA_DIRECTORY}/temporary/${fileName}`
|
`${process.env.USERDATA_DIRECTORY}/temporary/${fileName}`
|
||||||
|
|||||||
Reference in New Issue
Block a user