🔧 Abstracted filename-parser into its own npm package

This commit is contained in:
2022-01-31 08:36:29 -08:00
parent 27bd383f00
commit 3cedb9238b
5 changed files with 272 additions and 486 deletions

View File

@@ -43,6 +43,7 @@
"ellipsize": "^0.1.0",
"express": "^4.17.1",
"fastest-validator": "^1.11.0",
"filename-parser": "^1.0.0",
"final-form": "^4.20.2",
"final-form-arrays": "^3.0.2",
"html-to-text": "^8.1.0",

View File

@@ -3,7 +3,7 @@ import React, { ReactElement, useCallback } from "react";
import { useSelector, useDispatch } from "react-redux";
import Select, { components } from "react-select";
import { fetchComicVineMatches } from "../../../actions/fileops.actions";
import { refineQuery } from "../../../shared/utils/filenameparser.utils";
import { refineQuery } from "filename-parser";
export const Menu = (props): ReactElement => {
const { data } = props;

View File

@@ -1,4 +1,4 @@
import { isUndefined } from "lodash";
import { isEmpty, isNil, isUndefined } from "lodash";
import React, { useEffect, ReactElement } from "react";
import { useDispatch, useSelector } from "react-redux";
import { useParams } from "react-router";
@@ -58,12 +58,17 @@ const VolumeDetails = (props): ReactElement => {
>
{issues.map((issue) => {
return (
<Card
key={issue.id}
imageUrl={issue.image.thumb_url}
orientation={"vertical"}
hasDetails={false}
/>
<>
<Card
key={issue.id}
imageUrl={issue.image.thumb_url}
orientation={"vertical"}
hasDetails={false}
/>
{!isEmpty(issue.potentialMatches)
? "matches available"
: null}
</>
);
})}
</Masonry>

View File

@@ -1,216 +0,0 @@
import { default as nlp } from "compromise";
import { default as dates } from "compromise-dates";
import { default as sentences } from "compromise-sentences";
import { default as numbers } from "compromise-numbers";
import xregexp from "xregexp";
import { MatchArray } from "xregexp/types";
import voca from "voca";
import { xor, isEmpty, isNull, isNil } from "lodash";
nlp.extend(sentences);
nlp.extend(numbers);
nlp.extend(dates);
interface M {
start: number;
end: number;
value: string;
}
const replaceRecursive = (
text: string,
left: string,
right: string,
replacer: (match: string) => string,
): string => {
const r: M[] = xregexp.matchRecursive(text, left, right, "g", {
valueNames: [null, null, "match", null],
});
let offset = 0;
for (const m of r) {
const replacement = replacer(m.value);
text = replaceAt(text, m.start + offset, m.value.length, replacement);
offset += replacement.length - m.value.length;
}
return text;
};
function replaceAt(
string: string,
index: number,
length: number,
replacement: string,
): string {
return string.substr(0, index) + replacement + string.substr(index + length);
}
export const preprocess = (inputString: string) => {
// see if the comic matches the following format, and if so, remove everything
// after the first number:
// "nnn series name #xx (etc) (etc)" -> "series name #xx (etc) (etc)"
const format1 = inputString.match(/^\s*(\d+)[\s._-]+?([^#]+)(\W+.*)/gim);
// see if the comic matches the following format, and if so, remove everything
// after the first number that isn't in brackets:
// "series name #xxx - title (etc) (etc)" -> "series name #xxx (etc) (etc)
const format2 = inputString.match(
/^((?:[a-zA-Z,.-]+\s)+)(\#?(?:\d+[.0-9*])\s*(?:-))(.*((\(.*)?))$/gis,
);
return {
matches: {
format1,
format2,
},
};
};
/**
* Tokenizes a search string
* @function
* @param {string} inputString - The string used to search against CV, Shortboxed, and other APIs.
*/
export const tokenize = (inputString: string) => {
const doc = nlp(inputString);
const sentence = doc.sentences().json();
const yearMatches = extractYears(inputString);
const hyphenatedIssueRange = inputString.match(/(\d)(-\d+)/gi);
if (!isNull(hyphenatedIssueRange) && hyphenatedIssueRange.length > 2) {
const issueNumber = hyphenatedIssueRange[0];
}
const readingListIndicators = inputString.match(
/^\s*\d+(\.\s+?|\s*-?\s*)/gim,
);
// Issue numbers
let issueNumbers = "";
let parsedIssueNumber = "";
// https://regex101.com/r/fgmd22/1
const issues = inputString.match(/(^|[_\s#])(-?\d*\.?\d\w*)/gi);
const tpbIssueNumber = inputString.match(/((\s|\|-|:)v?\d?\s)/gim);
inputString.replace(
/(\b(vo?l?u?m?e?)\.?)(\s*-|\s*_)?(\s*[0-9]+[.0-9a-z]*)/gi,
"",
);
// find the matches for a tpb "issue" number such as v2
if (!isNil(tpbIssueNumber)) {
parsedIssueNumber = tpbIssueNumber[0].trim();
}
if (!isEmpty(issues) && !isNull(issues)) {
issueNumbers = issues[0].trim();
const matches = extractNumerals(issueNumbers);
// if we parsed out some potential issue numbers, designate the LAST
// (rightmost) one as the actual issue number, and remove it from the name
if (matches.length > 0) {
parsedIssueNumber = matches[0].pop();
}
}
inputString = voca.replace(inputString, parsedIssueNumber, "");
// filter out anything at the end of the title in parantheses
inputString = inputString.replace(/\((.*?)\)$/gi, "");
// get a subtitle for titles such as:
// Commando 4779 - Evil in the East (2015) (Digital) (DR & Quinch-Empire)
// will match "Evil in the East (2015) (Digital) (DR & Quinch-Empire)"
const subtitleMatch = inputString.match(/\s\-\s(.*)/gm);
let subtitle = "";
if (!isNil(subtitleMatch)) {
subtitle = subtitleMatch[0].replace(/[^a-zA-Z0-9 ]/gm, "");
subtitle = subtitle.trim();
// Remove the subtitle from the main input string
// Commando 4779 - Evil in the East (2015) (Digital) (DR & Quinch-Empire)
// will return "Commando 4779"
inputString = inputString.replace(/\s\-\s(.*)/gm, "");
}
// replace special characters with... nothing
inputString = inputString.replace(/[^a-zA-Z0-9(\+?\s?\-?\'?)]/gm, "");
// regexes to match constituent parts of the search string
// and isolate the search terms
inputString.replace(/ch(a?p?t?e?r?)(\W?)(\_?)(\#?)(\d)/gi, "");
inputString.replace(/\b[.,]?\s*\d+\s*(p|pg|pgs|pages)\b\s*/gi, "");
// if the name has things like "4 of 5", remove the " of 5" part
// also, if the name has 3-6, remove the -6 part. note that we'll
// try to handle the word "of" in a few common languages, like french/
// spanish (de), italian (di), german (von), dutch (van) or polish (z)
replaceRecursive(inputString, "\\(", "\\)", () => "");
replaceRecursive(inputString, "\\[", "\\]", () => "");
replaceRecursive(inputString, "\\{", "\\}", () => "");
inputString.replace(/\([^\(]*?\)/gi, "");
inputString.replace(/\{[^\{]*?\}/gi, "");
inputString.replace(/\[[^\[]*?\]/gi, "");
inputString.replace(/([^\d]+)(\s*(of|de|di|von|van|z)\s*#*\d+)/gi, "");
// inputString = voca.replace(inputString, /_.-# /gi, "");
// inputString = nlp(inputString).text("normal").trim();
const sentenceToProcess = sentence[0].normal.replace(/_/g, " ");
const normalizedSentence = nlp(sentenceToProcess)
.text("normal")
.trim()
.split(" ");
console.log(inputString)
const queryObject = {
comicbook_identifier_tokens: {
inputString,
parsedIssueNumber: Number(parsedIssueNumber),
subtitle,
},
years: yearMatches,
sentence_tokens: {
detailed: sentence,
normalized: normalizedSentence,
},
};
return queryObject;
};
export const extractNumerals = (inputString: string): MatchArray[string] => {
// Searches through the given string left-to-right, building an ordered list of
// "issue number-like" re.match objects. For example, this method finds
// matches substrings like: 3, #4, 5a, 6.00, 10.0b, .5, -1.0
const matches: MatchArray[string] = [];
xregexp.forEach(inputString, /(^|[_\s#v?])(-?\d*\.?\d\w*)/gmu, (match) => {
matches.push(match);
});
return matches;
};
export const extractYears = (inputString: string): RegExpMatchArray | null => {
// Searches through the given string left-to-right, seeing if an intelligible
// publication year can be extracted.
const yearRegex = /(?:19|20)\d{2}/gm;
return inputString.match(yearRegex);
};
export const refineQuery = (inputString: string) => {
const queryObj = tokenize(inputString);
const removedYears = xor(queryObj.sentence_tokens.normalized, queryObj.years);
return {
searchParams: {
searchTerms: {
name: queryObj.comicbook_identifier_tokens.inputString.trim(),
number: queryObj.comicbook_identifier_tokens.parsedIssueNumber,
year: queryObj.years?.toString(),
subtitle: queryObj.comicbook_identifier_tokens.subtitle,
},
},
meta: {
queryObj,
tokenized: removedYears,
normalized: removedYears.join(" "),
},
};
};

520
yarn.lock

File diff suppressed because it is too large Load Diff