🤼‍♀️ Comic Vine Match algorithm, 2nd draft

This commit is contained in:
2021-07-22 16:07:52 -07:00
parent 96a6438fb4
commit 57f7621c0e
3 changed files with 37 additions and 26 deletions

View File

@@ -1,5 +1,9 @@
import axios from "axios"; import axios from "axios";
import { IFolderData, IExtractedComicBookCoverFile } from "threetwo-ui-typings"; import {
IFolderData,
IExtractedComicBookCoverFile,
IComicVineSearchQuery,
} from "threetwo-ui-typings";
import { API_BASE_URI, SOCKET_BASE_URI } from "../constants/endpoints"; import { API_BASE_URI, SOCKET_BASE_URI } from "../constants/endpoints";
import { io } from "socket.io-client"; import { io } from "socket.io-client";
import { import {
@@ -105,8 +109,8 @@ export const getRecentlyImportedComicBooks = (options) => async (dispatch) => {
export const fetchComicVineMatches = (searchPayload) => (dispatch) => { export const fetchComicVineMatches = (searchPayload) => (dispatch) => {
try { try {
const issueString = searchPayload.rawFileDetails.path.split("/").pop(); const issueString = searchPayload.rawFileDetails.path.split("/").pop();
let seriesSearchQuery = {}; const issueSearchQuery: IComicVineSearchQuery = refineQuery(issueString);
const issueSearchQuery = refineQuery(issueString); let seriesSearchQuery: IComicVineSearchQuery = {} as IComicVineSearchQuery;
if (searchPayload.rawFileDetails.containedIn !== "comics") { if (searchPayload.rawFileDetails.containedIn !== "comics") {
seriesSearchQuery = refineQuery( seriesSearchQuery = refineQuery(
searchPayload.rawFileDetails.containedIn.split("/").pop(), searchPayload.rawFileDetails.containedIn.split("/").pop(),

View File

@@ -3,8 +3,9 @@ import { default as dates } from "compromise-dates";
import { default as sentences } from "compromise-sentences"; import { default as sentences } from "compromise-sentences";
import { default as numbers } from "compromise-numbers"; import { default as numbers } from "compromise-numbers";
import xregexp from "xregexp"; import xregexp from "xregexp";
import { MatchArray } from "xregexp/types";
import voca from "voca"; import voca from "voca";
import { map, xor, isEmpty, isNull } from "lodash"; import { xor, isEmpty, isNull } from "lodash";
nlp.extend(sentences); nlp.extend(sentences);
nlp.extend(numbers); nlp.extend(numbers);
@@ -72,18 +73,12 @@ export const tokenize = (inputString: string) => {
// regexes to match constituent parts of the search string // regexes to match constituent parts of the search string
// and isolate the search terms // and isolate the search terms
const chapters = inputString.replace( inputString.replace(/ch(a?p?t?e?r?)(\W?)(\_?)(\#?)(\d)/gi, "");
/ch(a?p?t?e?r?)(\W?)(\_?)(\#?)(\d)/gi, inputString.replace(
"",
);
const volumes = inputString.replace(
/(\b(vo?l?u?m?e?)\.?)(\s*-|\s*_)?(\s*[0-9]+[.0-9a-z]*)/gi, /(\b(vo?l?u?m?e?)\.?)(\s*-|\s*_)?(\s*[0-9]+[.0-9a-z]*)/gi,
"", "",
); );
const pageCounts = inputString.replace( inputString.replace(/\b[.,]?\s*\d+\s*(p|pg|pgs|pages)\b\s*/gi, "");
/\b[.,]?\s*\d+\s*(p|pg|pgs|pages)\b\s*/gi,
"",
);
// if the name has things like "4 of 5", remove the " of 5" part // if the name has things like "4 of 5", remove the " of 5" part
// also, if the name has 3-6, remove the -6 part. note that we'll // also, if the name has 3-6, remove the -6 part. note that we'll
@@ -103,19 +98,29 @@ export const tokenize = (inputString: string) => {
let issueNumber = hyphenatedIssueRange[0]; let issueNumber = hyphenatedIssueRange[0];
} }
if (voca.includes(inputString, "_") && !voca.includes(inputString, " ")) {
inputString.replace(/[-_#]/gi, "");
}
const readingListIndicators = inputString.match( const readingListIndicators = inputString.match(
/^\s*\d+(\.\s+?|\s*-?\s*)/gim, /^\s*\d+(\.\s+?|\s*-?\s*)/gim,
); );
let issueNumbers = ""; let issueNumbers = "";
let parsedIssueNumber = "";
const issues = inputString.match(/(^|[_\s#])(-?\d*\.?\d\w*)/gi); const issues = inputString.match(/(^|[_\s#])(-?\d*\.?\d\w*)/gi);
if (!isEmpty(issues)) {
if (!isEmpty(issues) && !isNull(issues)) {
issueNumbers = issues[0].trim(); issueNumbers = issues[0].trim();
const matches = extractNumerals(issueNumbers);
// if we parsed out some potential issue numbers, designate the LAST
// (rightmost) one as the actual issue number, and remove it from the name
if (matches.length > 0) {
parsedIssueNumber = matches[0].pop();
}
} }
// const issueHashes = inputString.match(/\#\d/gi);
inputString = voca.replace(inputString, parsedIssueNumber, "");
inputString = voca.replace(inputString, /_.-# /gi, "");
inputString = nlp(inputString).text("normal").trim();
const yearMatches = inputString.match(/\d{4}/gi); const yearMatches = inputString.match(/\d{4}/gi);
const sentenceToProcess = sentence[0].normal.replace(/_/g, " "); const sentenceToProcess = sentence[0].normal.replace(/_/g, " ");
@@ -126,12 +131,8 @@ export const tokenize = (inputString: string) => {
const queryObject = { const queryObject = {
comicbook_identifier_tokens: { comicbook_identifier_tokens: {
issueNumbers, inputString,
chapters, parsedIssueNumber,
pageCounts,
readingListIndicators,
volumes,
}, },
years: { years: {
yearMatches, yearMatches,
@@ -144,14 +145,20 @@ export const tokenize = (inputString: string) => {
return queryObject; return queryObject;
}; };
/**
 * Collects every "issue number-like" token in `inputString`, scanning the
 * string left to right.
 *
 * A token is an optionally negative, optionally decimal number with an
 * optional word-character suffix that appears at the start of a line or
 * directly after `_`, whitespace or `#`. Examples of matched substrings:
 * 3, #4, 5a, 6.00, 10.0b, .5, -1.0
 *
 * @param inputString - Text to scan for numerals.
 * @returns One match array per token, in order of appearance. Within each
 *   match, index 0 is the full match, index 1 is the leading delimiter
 *   (empty when the token sits at the start of a line) and index 2 is the
 *   numeral itself. The array is empty when nothing matches.
 */
export const extractNumerals = (inputString: string): MatchArray[] => {
  // Typed as an array of matches so that `.push`, `.length` and indexing are
  // checked by the compiler.
  const matches: MatchArray[] = [];
  xregexp.forEach(inputString, /(^|[_\s#])(-?\d*\.?\d\w*)/gmu, (match) => {
    matches.push(match);
  });
  return matches;
};
export const refineQuery = (inputString) => { export const refineQuery = (inputString) => {
const queryObj = tokenize(inputString); const queryObj = tokenize(inputString);
console.log("QWEQWEQWE", queryObj);
const removedYears = xor( const removedYears = xor(
queryObj.sentence_tokens.normalized, queryObj.sentence_tokens.normalized,
queryObj.years.yearMatches, queryObj.years.yearMatches,
@@ -162,7 +169,6 @@ export const refineQuery = (inputString) => {
name: queryObj.sentence_tokens.detailed[0].text, name: queryObj.sentence_tokens.detailed[0].text,
number: queryObj.comicbook_identifier_tokens.issueNumbers, number: queryObj.comicbook_identifier_tokens.issueNumbers,
}, },
year: queryObj.years,
}, },
meta: { meta: {
queryObj, queryObj,

View File

@@ -111,6 +111,7 @@ interface SearchInstance {
searches_sent_ago: number; searches_sent_ago: number;
} }
app.use(opdsRouter()); app.use(opdsRouter());
const foo = SocketService.connect("admin", "password"); const foo = SocketService.connect("admin", "password");
foo.then(async (data) => { foo.then(async (data) => {
const instance: SearchInstance = await SocketService.post("search"); const instance: SearchInstance = await SocketService.post("search");