🔧 Abstracted filename-parser into its own npm package
This commit is contained in:
@@ -43,6 +43,7 @@
|
|||||||
"ellipsize": "^0.1.0",
|
"ellipsize": "^0.1.0",
|
||||||
"express": "^4.17.1",
|
"express": "^4.17.1",
|
||||||
"fastest-validator": "^1.11.0",
|
"fastest-validator": "^1.11.0",
|
||||||
|
"filename-parser": "^1.0.0",
|
||||||
"final-form": "^4.20.2",
|
"final-form": "^4.20.2",
|
||||||
"final-form-arrays": "^3.0.2",
|
"final-form-arrays": "^3.0.2",
|
||||||
"html-to-text": "^8.1.0",
|
"html-to-text": "^8.1.0",
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ import React, { ReactElement, useCallback } from "react";
|
|||||||
import { useSelector, useDispatch } from "react-redux";
|
import { useSelector, useDispatch } from "react-redux";
|
||||||
import Select, { components } from "react-select";
|
import Select, { components } from "react-select";
|
||||||
import { fetchComicVineMatches } from "../../../actions/fileops.actions";
|
import { fetchComicVineMatches } from "../../../actions/fileops.actions";
|
||||||
import { refineQuery } from "../../../shared/utils/filenameparser.utils";
|
import { refineQuery } from "filename-parser";
|
||||||
|
|
||||||
export const Menu = (props): ReactElement => {
|
export const Menu = (props): ReactElement => {
|
||||||
const { data } = props;
|
const { data } = props;
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
import { isUndefined } from "lodash";
|
import { isEmpty, isNil, isUndefined } from "lodash";
|
||||||
import React, { useEffect, ReactElement } from "react";
|
import React, { useEffect, ReactElement } from "react";
|
||||||
import { useDispatch, useSelector } from "react-redux";
|
import { useDispatch, useSelector } from "react-redux";
|
||||||
import { useParams } from "react-router";
|
import { useParams } from "react-router";
|
||||||
@@ -58,12 +58,17 @@ const VolumeDetails = (props): ReactElement => {
|
|||||||
>
|
>
|
||||||
{issues.map((issue) => {
|
{issues.map((issue) => {
|
||||||
return (
|
return (
|
||||||
<Card
|
<>
|
||||||
key={issue.id}
|
<Card
|
||||||
imageUrl={issue.image.thumb_url}
|
key={issue.id}
|
||||||
orientation={"vertical"}
|
imageUrl={issue.image.thumb_url}
|
||||||
hasDetails={false}
|
orientation={"vertical"}
|
||||||
/>
|
hasDetails={false}
|
||||||
|
/>
|
||||||
|
{!isEmpty(issue.potentialMatches)
|
||||||
|
? "matches available"
|
||||||
|
: null}
|
||||||
|
</>
|
||||||
);
|
);
|
||||||
})}
|
})}
|
||||||
</Masonry>
|
</Masonry>
|
||||||
|
|||||||
@@ -1,216 +0,0 @@
|
|||||||
import { default as nlp } from "compromise";
|
|
||||||
import { default as dates } from "compromise-dates";
|
|
||||||
import { default as sentences } from "compromise-sentences";
|
|
||||||
import { default as numbers } from "compromise-numbers";
|
|
||||||
import xregexp from "xregexp";
|
|
||||||
import { MatchArray } from "xregexp/types";
|
|
||||||
import voca from "voca";
|
|
||||||
import { xor, isEmpty, isNull, isNil } from "lodash";
|
|
||||||
|
|
||||||
// Register the compromise plugins this module relies on, in the same order
// as before: sentences, then numbers, then dates.
for (const plugin of [sentences, numbers, dates]) {
  nlp.extend(plugin);
}
|
|
||||||
|
|
||||||
/** One entry of the result list returned by `xregexp.matchRecursive`. */
interface M {
  /** Position of the entry in the string that was searched. */
  start: number;
  /** End position reported for the entry (not read by this module). */
  end: number;
  /** The matched text itself. */
  value: string;
}

/**
 * Rewrites the regions of `text` that `xregexp.matchRecursive` reports for the
 * given delimiter pair, substituting each one with the result of `replacer`.
 *
 * Only the "match" slot of `valueNames` is named, so presumably only the text
 * between the delimiters is reported and replaced, while the delimiters
 * themselves stay in place (confirm against the XRegExp documentation).
 *
 * @param text - The string to process.
 * @param left - Pattern source for the opening delimiter.
 * @param right - Pattern source for the closing delimiter.
 * @param replacer - Receives each reported value and returns its substitute.
 * @returns A new string with every reported region substituted.
 */
const replaceRecursive = (
  text: string,
  left: string,
  right: string,
  replacer: (match: string) => string,
): string => {
  const found: M[] = xregexp.matchRecursive(text, left, right, "g", {
    valueNames: [null, null, "match", null],
  });

  // Positions in `found` refer to the original text. Each substitution may
  // lengthen or shorten the working copy, so keep a running correction.
  let shift = 0;
  let result = text;
  found.forEach((hit) => {
    const substitute = replacer(hit.value);
    result = replaceAt(result, hit.start + shift, hit.value.length, substitute);
    shift += substitute.length - hit.value.length;
  });

  return result;
};
|
|
||||||
|
|
||||||
/**
 * Returns a copy of `string` in which the `length` characters starting at
 * `index` are replaced by `replacement`.
 *
 * @param string - The source text.
 * @param index - Position of the first character to replace.
 * @param length - Number of characters to drop at `index`.
 * @param replacement - Text inserted in place of the dropped characters.
 * @returns The text before `index`, then `replacement`, then the text from
 * `index + length` onwards.
 */
function replaceAt(
  string: string,
  index: number,
  length: number,
  replacement: string,
): string {
  // `slice` replaces the deprecated `substr`. Clamping the head's end at 0
  // keeps the old result for a negative `index` (an empty head), because
  // `slice(0, -n)` would otherwise count back from the end of the string.
  const head = string.slice(0, Math.max(index, 0));
  const tail = string.slice(index + length);
  return head + replacement + tail;
}
|
|
||||||
|
|
||||||
/**
 * Tests a comic file name against two known naming layouts.
 *
 * Nothing is removed or rewritten here: the function only reports what each
 * pattern matched so that a caller can decide how to clean the name.
 *
 * @param inputString - The raw file name or title.
 * @returns `matches.format1` and `matches.format2`, each the result of a
 * global `String.prototype.match` call (an array of matches, or `null`).
 */
export const preprocess = (inputString: string) => {
  // Layout 1 - a leading number in front of the series name:
  // "nnn series name #xx (etc) (etc)"
  const leadingNumberLayout = /^\s*(\d+)[\s._-]+?([^#]+)(\W+.*)/gim;

  // Layout 2 - an issue number followed by a dash and a title:
  // "series name #xxx - title (etc) (etc)"
  const issueThenTitleLayout =
    /^((?:[a-zA-Z,.-]+\s)+)(\#?(?:\d+[.0-9*])\s*(?:-))(.*((\(.*)?))$/gis;

  const matches = {
    format1: inputString.match(leadingNumberLayout),
    format2: inputString.match(issueThenTitleLayout),
  };

  return { matches };
};
|
|
||||||
|
|
||||||
/**
 * Tokenizes a search string.
 *
 * Pulls publication years, an issue number and an optional subtitle out of
 * `inputString`, removes the issue number, subtitle and most punctuation from
 * the remaining name, and returns those pieces together with the sentence data
 * produced by compromise.
 *
 * @param inputString - The string used to search against CV, Shortboxed, and other APIs.
 * @returns An object with:
 * - `comicbook_identifier_tokens`: the cleaned name (`inputString`), the issue
 *   number converted with `Number()` (`parsedIssueNumber`) and the `subtitle`;
 * - `years`: the result of `extractYears` on the untouched input;
 * - `sentence_tokens`: the `detailed` sentence JSON and the `normalized` words.
 */
export const tokenize = (inputString: string) => {
  // Both values are derived from the untouched input, before any stripping.
  const sentence = nlp(inputString).sentences().json();
  const yearMatches = extractYears(inputString);

  // Issue numbers
  let parsedIssueNumber = "";

  // https://regex101.com/r/fgmd22/1
  const issues = inputString.match(/(^|[_\s#])(-?\d*\.?\d\w*)/gi);
  const tpbIssueNumber = inputString.match(/((\s|\|-|:)v?\d?\s)/gim);

  // find the matches for a tpb "issue" number such as v2
  if (!isNil(tpbIssueNumber)) {
    parsedIssueNumber = tpbIssueNumber[0].trim();
  }

  // A plain issue-like token, when present, overrides the tpb number above.
  if (!isNil(issues) && !isEmpty(issues)) {
    const matches = extractNumerals(issues[0].trim());
    // Only the first issue-like token is inspected. Popping its match array
    // yields the last capture group, i.e. the number without its prefix.
    if (matches.length > 0) {
      parsedIssueNumber = matches[0].pop();
    }
  }

  // Work on a local copy instead of reassigning the parameter.
  let name = voca.replace(inputString, parsedIssueNumber, "");

  // filter out anything at the end of the title in parentheses
  name = name.replace(/\((.*?)\)$/gi, "");

  // A " - " separator introduces a subtitle, e.g. "Commando - Evil in the East".
  // Keep its letters, digits and spaces as the subtitle, then cut it (and
  // everything after it) from the name.
  const subtitleMatch = name.match(/\s\-\s(.*)/gm);
  let subtitle = "";
  if (!isNil(subtitleMatch)) {
    subtitle = subtitleMatch[0].replace(/[^a-zA-Z0-9 ]/gm, "").trim();
    name = name.replace(/\s\-\s(.*)/gm, "");
  }

  // replace special characters with... nothing
  name = name.replace(/[^a-zA-Z0-9(\+?\s?\-?\'?)]/gm, "");

  // NOTE(review): the previous implementation also called `replace` /
  // `replaceRecursive` here to strip volume markers, chapter and page counts,
  // "N of M" suffixes and bracketed groups, but discarded every result, so
  // none of them ever affected the output. They were removed rather than
  // applied because applying them would change the returned name. The
  // `replaceRecursive` calls could additionally throw on unbalanced
  // parentheses (XRegExp's documented default - confirm for the pinned version).

  // Guard against inputs for which compromise reports no sentence at all
  // (presumably e.g. an empty string) instead of failing on `undefined`.
  const sentenceToProcess: string = (sentence[0]?.normal ?? "").replace(
    /_/g,
    " ",
  );
  const normalizedSentence = nlp(sentenceToProcess)
    .text("normal")
    .trim()
    .split(" ");

  return {
    comicbook_identifier_tokens: {
      inputString: name,
      parsedIssueNumber: Number(parsedIssueNumber),
      subtitle,
    },
    years: yearMatches,
    sentence_tokens: {
      detailed: sentence,
      normalized: normalizedSentence,
    },
  };
};
|
|
||||||
|
|
||||||
/**
 * Scans the given string left-to-right and collects, in order, every
 * "issue number-like" match, e.g. 3, #4, 5a, 6.00, 10.0b, .5, -1.0.
 *
 * @param inputString - The text to scan.
 * @returns The match objects handed over by `xregexp.forEach`, in the order
 * they were found; empty when nothing matched.
 */
export const extractNumerals = (inputString: string): MatchArray[string] => {
  const numeralPattern = /(^|[_\s#v?])(-?\d*\.?\d\w*)/gmu;
  const collected: MatchArray[string] = [];

  xregexp.forEach(inputString, numeralPattern, (hit) => {
    collected.push(hit);
  });

  return collected;
};
|
|
||||||
|
|
||||||
/**
 * Looks for publication-year candidates in the given string: every run of
 * four digits that starts with 19 or 20, found left-to-right.
 *
 * @param inputString - The text to scan.
 * @returns All candidates in order of appearance, or `null` when there are none.
 */
export const extractYears = (inputString: string): RegExpMatchArray | null =>
  inputString.match(/(?:19|20)\d{2}/gm);
|
|
||||||
|
|
||||||
/**
 * Builds search parameters from a raw file name or title.
 *
 * Runs `tokenize` on the input and reshapes its result into the search terms
 * (`name`, `number`, `year`, `subtitle`) plus metadata about the tokenization.
 *
 * @param inputString - The raw string to turn into a query.
 * @returns `searchParams.searchTerms` for the lookup, and `meta` holding the
 * full `tokenize` result, the token list (`tokenized`) and that list joined
 * with single spaces (`normalized`).
 */
export const refineQuery = (inputString: string) => {
  const queryObj = tokenize(inputString);
  const {
    comicbook_identifier_tokens: identifiers,
    sentence_tokens: sentenceTokens,
    years,
  } = queryObj;

  // lodash `xor` is a symmetric difference: tokens that appear in exactly one
  // of the two lists. Presumably meant to drop the years from the words.
  const tokenized = xor(sentenceTokens.normalized, years);

  const searchTerms = {
    name: identifiers.inputString.trim(),
    number: identifiers.parsedIssueNumber,
    year: years?.toString(),
    subtitle: identifiers.subtitle,
  };

  return {
    searchParams: { searchTerms },
    meta: {
      queryObj,
      tokenized,
      normalized: tokenized.join(" "),
    },
  };
};
|
|
||||||
Reference in New Issue
Block a user