🏗️ Refactored CV search match scorer

This commit is contained in:
2024-01-04 23:33:02 -05:00
parent 395e25aa37
commit 5004dd5990
2 changed files with 107 additions and 55 deletions

View File

@@ -190,6 +190,8 @@ export default class ComicVineService extends Service {
"Searching against: ", "Searching against: ",
ctx.params.scorerConfiguration.searchParams ctx.params.scorerConfiguration.searchParams
); );
const { rawFileDetails, scorerConfiguration } =
ctx.params;
const results: any = []; const results: any = [];
console.log( console.log(
"passed to fetchVolumesFromCV", "passed to fetchVolumesFromCV",
@@ -290,12 +292,44 @@ export default class ComicVineService extends Service {
return issue; return issue;
} }
); );
// Score the final matches
const foo = await this.broker.call(
"comicvine.getComicVineMatchScores",
{
finalMatches,
rawFileDetails,
scorerConfiguration,
}
);
return Promise.all(finalMatches); return Promise.all(finalMatches);
} catch (error) { } catch (error) {
console.log(error); console.log(error);
} }
}, },
}, },
/**
 * Action that scores a list of candidate ComicVine matches against the
 * details of a raw file.
 *
 * The `rest` field declares the route string "POST /getComicVineMatchScores".
 *
 * Expected `ctx.params`:
 *  - finalMatches: the candidate matches to score
 *  - rawFileDetails: details of the file being matched
 *  - scorerConfiguration: only its `searchParams` property is read here
 *
 * Resolves with whatever `matchScorer` resolves with; any rejection from
 * `matchScorer` propagates to the caller unchanged.
 */
getComicVineMatchScores: {
	rest: "POST /getComicVineMatchScores",
	handler: async (
		ctx: Context<{
			finalMatches: Array<any>;
			rawFileDetails: any;
			scorerConfiguration: any;
		}>
	) => {
		const requestParams = ctx.params;
		const candidateMatches = requestParams.finalMatches;
		const fileDetails = requestParams.rawFileDetails;
		const scorerSettings = requestParams.scorerConfiguration;

		// Dump the incoming parameters before scoring starts.
		console.log(requestParams);

		// The scorer only needs the search parameters, not the whole
		// scorer configuration.
		const scored = await matchScorer(
			candidateMatches,
			scorerSettings.searchParams,
			fileDetails
		);
		return scored;
	},
},
}, },
methods: { methods: {
fetchVolumesFromCV: async (payload, output: any[] = []) => { fetchVolumesFromCV: async (payload, output: any[] = []) => {

View File

@@ -42,60 +42,57 @@ import { isAfter, isSameYear, parseISO } from "date-fns";
const imghash = require("imghash"); const imghash = require("imghash");
export const matchScorer = async ( export const matchScorer = async (
searchMatches: any, searchMatches: Promise<any>[],
searchQuery: any, searchQuery: any,
rawFileDetails: any rawFileDetails: any
): Promise<any> => { ): Promise<any> => {
// 1. Check if it exists in the db (score: 0) const scoredMatches: any = [];
// 2. Check if issue name matches strongly (score: ++)
// 3. Check if issue number matches strongly (score: ++) try {
// 4. Check if issue covers hash match strongly (score: +++) const matches = await Promise.all(searchMatches);
// 5. Check if issue year matches strongly (score: +)
const scoredMatches = map(searchMatches, async (match, idx) => { for (const match of matches) {
match.score = 0; match.score = 0;
// Check for the issue name match // Check for the issue name match
if ( if (!isNil(searchQuery.name) && !isNil(match.name)) {
!isNil(searchQuery.issue.searchParams.searchTerms.name) &&
!isNil(match.name)
) {
const issueNameScore = stringSimilarity.compareTwoStrings( const issueNameScore = stringSimilarity.compareTwoStrings(
searchQuery.issue.searchParams.searchTerms.name, searchQuery.name,
match.name match.name
); );
match.score = issueNameScore; match.score = issueNameScore;
} }
// Issue number matches // Issue number matches
if (!isNil(searchQuery.number) && !isNil(match.issue_number)) {
if ( if (
!isNil(searchQuery.issue.searchParams.searchTerms.number) && parseInt(searchQuery.number, 10) ===
!isNil(match.issue_number) parseInt(match.issue_number, 10)
) {
if (
parseInt(
searchQuery.issue.searchParams.searchTerms.number,
10
) === parseInt(match.issue_number, 10)
) { ) {
match.score += 1; match.score += 1;
} }
} }
// Cover image hash match // Cover image hash match
return await calculateLevenshteinDistance(match, rawFileDetails); scoredMatches.push(
}); await calculateLevenshteinDistance(match, rawFileDetails)
return Promise.all(scoredMatches); );
}
return scoredMatches;
} catch (error) {
// Handle errors here
console.error("Error in matchScorer:", error);
throw error;
}
}; };
export const rankVolumes = (volumes: any, scorerConfiguration: any) => { export const rankVolumes = (volumes: any, scorerConfiguration: any) => {
// Iterate over volumes, checking to see: // Iterate over volumes, checking to see:
// 1. If the detected year of the issue falls in the range (end_year >= {detected year for issue} >= start_year ) // 1. If the detected year of the issue falls in the range (end_year >= {detected year for issue} >= start_year )
// 2. If there is a strong string comparison between the volume name and the issue name ?? // 2. If there is a strong string comparison between the volume name and the issue name ??
const issueNumber = parseInt( const issueNumber = parseInt(scorerConfiguration.searchParams.number, 10);
scorerConfiguration.searchParams.number, const issueYear = parseISO(scorerConfiguration.searchParams.year);
10
);
const issueYear = parseISO(
scorerConfiguration.searchParams.year
);
const foo = volumes.map((volume: any, idx: number) => { const foo = volumes.map((volume: any, idx: number) => {
let volumeMatchScore = 0; let volumeMatchScore = 0;
const volumeStartYear = !isNil(volume.start_year) const volumeStartYear = !isNil(volume.start_year)
@@ -115,21 +112,29 @@ export const rankVolumes = (volumes: any, scorerConfiguration: any) => {
// If not, move on. // If not, move on.
let subtitleMatchScore = 0; let subtitleMatchScore = 0;
if (!isNil(scorerConfiguration.searchParams.subtitle)) { if (!isNil(scorerConfiguration.searchParams.subtitle)) {
subtitleMatchScore = stringSimilarity.compareTwoStrings(scorerConfiguration.searchParams.subtitle, volume.name); subtitleMatchScore = stringSimilarity.compareTwoStrings(
scorerConfiguration.searchParams.subtitle,
volume.name
);
if (subtitleMatchScore > 0.1) { if (subtitleMatchScore > 0.1) {
issueNameMatchScore += subtitleMatchScore; issueNameMatchScore += subtitleMatchScore;
} }
} }
// 2. If issue year starts after the candidate volume's start year or is the same year, +2 to volumeMatchScore // 2. If issue year starts after the candidate volume's start year or is the same year, +2 to volumeMatchScore
if (!isNil(volumeStartYear)) { if (!isNil(volumeStartYear)) {
if (isSameYear(issueYear, volumeStartYear) || if (
isAfter(issueYear, volumeStartYear)) { isSameYear(issueYear, volumeStartYear) ||
isAfter(issueYear, volumeStartYear)
) {
volumeMatchScore += 2; volumeMatchScore += 2;
} }
} }
// 3. If issue number falls in the range of candidate volume's first issue # and last issue #, +3 to volumeMatchScore // 3. If issue number falls in the range of candidate volume's first issue # and last issue #, +3 to volumeMatchScore
if (!isNil(firstIssueNumber) && !isNil(lastIssueNumber)) { if (!isNil(firstIssueNumber) && !isNil(lastIssueNumber)) {
if(firstIssueNumber <= issueNumber || issueNumber <= lastIssueNumber) { if (
firstIssueNumber <= issueNumber ||
issueNumber <= lastIssueNumber
) {
volumeMatchScore += 3; volumeMatchScore += 3;
} }
} }
@@ -144,17 +149,30 @@ export const rankVolumes = (volumes: any, scorerConfiguration: any) => {
const calculateLevenshteinDistance = async (match: any, rawFileDetails: any) => const calculateLevenshteinDistance = async (match: any, rawFileDetails: any) =>
new Promise((resolve, reject) => { new Promise((resolve, reject) => {
https.get(match.image.small_url, (response: any) => { https.get(match.image.small_url, (response: any) => {
console.log(rawFileDetails.cover.filePath);
const fileName = match.id + "_" + rawFileDetails.name + ".jpg"; const fileName = match.id + "_" + rawFileDetails.name + ".jpg";
const file = createWriteStream(`./userdata/temporary/${fileName}`); const file = createWriteStream(
`${process.env.USERDATA_DIRECTORY}/temporary/${fileName}`
);
const fileStream = response.pipe(file); const fileStream = response.pipe(file);
fileStream.on("finish", async () => { fileStream.on("finish", async () => {
// 1. hash of the cover image we have on hand // 1. hash of the cover image we have on hand
const coverFileName = rawFileDetails.cover.filePath
.split("/")
.at(-1);
const coverDirectory = rawFileDetails.containedIn
.split("/")
.at(-1);
const hash1 = await imghash.hash( const hash1 = await imghash.hash(
path.resolve(rawFileDetails.cover.filePath) path.resolve(
`${process.env.USERDATA_DIRECTORY}/covers/${coverDirectory}/${coverFileName}`
)
); );
// 2. hash of the cover of the potential match // 2. hash of the cover of the potential match
const hash2 = await imghash.hash( const hash2 = await imghash.hash(
path.resolve(`./userdata/temporary/${fileName}`) path.resolve(
`${process.env.USERDATA_DIRECTORY}/temporary/${fileName}`
)
); );
if (!isUndefined(hash1) && !isUndefined(hash2)) { if (!isUndefined(hash1) && !isUndefined(hash2)) {
const levenshteinDistance = leven(hash1, hash2); const levenshteinDistance = leven(hash1, hash2);