🤼‍♀️ Comic Vine Match algorithm, 2nd draft

2021-07-22 16:07:52 -07:00
parent 96a6438fb4
commit 57f7621c0e
3 changed files with 37 additions and 26 deletions
--- a/src/client/actions/fileops.actions.tsx
+++ b/src/client/actions/fileops.actions.tsx
@@ -1,5 +1,9 @@
 import axios from "axios";
-import { IFolderData, IExtractedComicBookCoverFile } from "threetwo-ui-typings";
+import {
  IFolderData,
  IExtractedComicBookCoverFile,
  IComicVineSearchQuery,
 } from "threetwo-ui-typings";
 import { API_BASE_URI, SOCKET_BASE_URI } from "../constants/endpoints";
 import { io } from "socket.io-client";
 import {
@@ -105,8 +109,8 @@ export const getRecentlyImportedComicBooks = (options) => async (dispatch) => {
 export const fetchComicVineMatches = (searchPayload) => (dispatch) => {
  try {
    const issueString = searchPayload.rawFileDetails.path.split("/").pop();
-    let seriesSearchQuery = {};
+    const issueSearchQuery: IComicVineSearchQuery = refineQuery(issueString);
-    const issueSearchQuery = refineQuery(issueString);
+    let seriesSearchQuery: IComicVineSearchQuery = {} as IComicVineSearchQuery;
    if (searchPayload.rawFileDetails.containedIn !== "comics") {
      seriesSearchQuery = refineQuery(
        searchPayload.rawFileDetails.containedIn.split("/").pop(),
--- a/src/client/shared/utils/filenameparser.utils.ts
+++ b/src/client/shared/utils/filenameparser.utils.ts
@@ -3,8 +3,9 @@ import { default as dates } from "compromise-dates";
 import { default as sentences } from "compromise-sentences";
 import { default as numbers } from "compromise-numbers";
 import xregexp from "xregexp";
 import { MatchArray } from "xregexp/types";
 import voca from "voca";
-import { map, xor, isEmpty, isNull } from "lodash";
+import { xor, isEmpty, isNull } from "lodash";
 nlp.extend(sentences);
 nlp.extend(numbers);
@@ -72,18 +73,12 @@ export const tokenize = (inputString: string) => {
  // regexes to match constituent parts of the search string
  // and isolate the search terms
-  const chapters = inputString.replace(
+  inputString.replace(/ch(a?p?t?e?r?)(\W?)(\_?)(\#?)(\d)/gi, "");
-    /ch(a?p?t?e?r?)(\W?)(\_?)(\#?)(\d)/gi,
+  inputString.replace(
    "",
  );
  const volumes = inputString.replace(
    /(\b(vo?l?u?m?e?)\.?)(\s*-|\s*_)?(\s*[0-9]+[.0-9a-z]*)/gi,
    "",
  );
-  const pageCounts = inputString.replace(
+  inputString.replace(/\b[.,]?\s*\d+\s*(p|pg|pgs|pages)\b\s*/gi, "");
    /\b[.,]?\s*\d+\s*(p|pg|pgs|pages)\b\s*/gi,
    "",
  );
  // if the name has things like "4 of 5", remove the " of 5" part
  // also, if the name has 3-6, remove the -6 part.  note that we'll
@@ -103,19 +98,29 @@ export const tokenize = (inputString: string) => {
    let issueNumber = hyphenatedIssueRange[0];
  }
  if (voca.includes(inputString, "_") && !voca.includes(inputString, " ")) {
    inputString.replace(/[-_#]/gi, "");
  }
  const readingListIndicators = inputString.match(
    /^\s*\d+(\.\s+?|\s*-?\s*)/gim,
  );
  let issueNumbers = "";
  let parsedIssueNumber = "";
  const issues = inputString.match(/(^|[_\s#])(-?\d*\.?\d\w*)/gi);
-  if (!isEmpty(issues)) {
+
  if (!isEmpty(issues) && !isNull(issues)) {
    issueNumbers = issues[0].trim();
    const matches = extractNumerals(issueNumbers);
    // if we parsed out some potential issue numbers, designate the LAST
    // (rightmost) one as the actual issue number, and remove it from the name
    if (matches.length > 0) {
      parsedIssueNumber = matches[0].pop();
    }
  }
-  // const issueHashes = inputString.match(/\#\d/gi);
+
  inputString = voca.replace(inputString, parsedIssueNumber, "");
  inputString = voca.replace(inputString, /_.-# /gi, "");
  inputString = nlp(inputString).text("normal").trim();
  const yearMatches = inputString.match(/\d{4}/gi);
  const sentenceToProcess = sentence[0].normal.replace(/_/g, " ");
@@ -126,12 +131,8 @@ export const tokenize = (inputString: string) => {
  const queryObject = {
    comicbook_identifier_tokens: {
-      issueNumbers,
+      inputString,
-      chapters,
+      parsedIssueNumber,
      pageCounts,
      readingListIndicators,
      volumes,
    },
    years: {
      yearMatches,
@@ -144,14 +145,20 @@ export const tokenize = (inputString: string) => {
  return queryObject;
 };
-export const extractNumerals = (inputString: string): string => {
+export const extractNumerals = (inputString: string): MatchArray[string] => {
  // Scans the given string left-to-right and collects, in order, every
  //  "issue number-like" regex match.  For example, this method matches
  //  substrings like:  3, #4, 5a, 6.00, 10.0b, .5, -1.0
  const matches: MatchArray[string] = [];
  xregexp.forEach(inputString, /(^|[_\s#])(-?\d*\.?\d\w*)/gmu, (match) => {
    matches.push(match);
  });
  return matches;
 };
 export const refineQuery = (inputString) => {
  const queryObj = tokenize(inputString);
  console.log("QWEQWEQWE", queryObj);
  const removedYears = xor(
    queryObj.sentence_tokens.normalized,
    queryObj.years.yearMatches,
@@ -162,7 +169,6 @@ export const refineQuery = (inputString) => {
        name: queryObj.sentence_tokens.detailed[0].text,
        number: queryObj.comicbook_identifier_tokens.issueNumbers,
      },
      year: queryObj.years,
    },
    meta: {
      queryObj,
--- a/src/server/index.ts
+++ b/src/server/index.ts
@@ -111,6 +111,7 @@ interface SearchInstance {
  searches_sent_ago: number;
 }
 app.use(opdsRouter());
 const foo = SocketService.connect("admin", "password");
 foo.then(async (data) => {
  const instance: SearchInstance = await SocketService.post("search");