🪛 Refactoring and validation
This commit is contained in:
173
src/client/shared/utils/filenameparser.utils.ts
Normal file
173
src/client/shared/utils/filenameparser.utils.ts
Normal file
@@ -0,0 +1,173 @@
|
||||
import { default as nlp } from "compromise";
|
||||
import { default as dates } from "compromise-dates";
|
||||
import { default as sentences } from "compromise-sentences";
|
||||
import { default as numbers } from "compromise-numbers";
|
||||
import xregexp from "xregexp";
|
||||
import voca from "voca";
|
||||
import { map, xor, isEmpty, isNull } from "lodash";
|
||||
|
||||
nlp.extend(sentences);
|
||||
nlp.extend(numbers);
|
||||
nlp.extend(dates);
|
||||
|
||||
interface M {
|
||||
start: number;
|
||||
end: number;
|
||||
value: string;
|
||||
}
|
||||
|
||||
function replaceRecursive(
|
||||
text: string,
|
||||
left: string,
|
||||
right: string,
|
||||
replacer: (match: string) => string,
|
||||
): string {
|
||||
const r: M[] = xregexp.matchRecursive(text, left, right, "g", {
|
||||
valueNames: [null, null, "match", null],
|
||||
});
|
||||
let offset = 0;
|
||||
for (const m of r) {
|
||||
const replacement = replacer(m.value);
|
||||
text = replaceAt(text, m.start + offset, m.value.length, replacement);
|
||||
offset += replacement.length - m.value.length;
|
||||
}
|
||||
return text;
|
||||
}
|
||||
|
||||
function replaceAt(
|
||||
string: string,
|
||||
index: number,
|
||||
length: number,
|
||||
replacement: string,
|
||||
): string {
|
||||
return string.substr(0, index) + replacement + string.substr(index + length);
|
||||
}
|
||||
|
||||
export const preprocess = (inputString: string) => {
|
||||
// see if the comic matches the following format, and if so, remove everything
|
||||
// after the first number:
|
||||
// "nnn series name #xx (etc) (etc)" -> "series name #xx (etc) (etc)"
|
||||
const format1 = "124 series name #xx (etc) (etc)".match(
|
||||
/^\s*(\d+)[\s._-]+?([^#]+)(\W+.*)/,
|
||||
);
|
||||
|
||||
// see if the comic matches the following format, and if so, remove everything
|
||||
// after the first number that isn't in brackets:
|
||||
// "series name #xxx - title (etc) (etc)" -> "series name #xxx (etc) (etc)
|
||||
const format2 = "".match(
|
||||
/^((?:[a-zA-Z,.-]+\s)+)(\#?(?:\d+[.0-9*])\s*(?:-))(.*((\(.*)?))$/gis,
|
||||
);
|
||||
};
|
||||
|
||||
/**
|
||||
* Tokenizes a search string
|
||||
* @function
|
||||
* @param {string} inputString - The string used to search against CV, Shortboxed, and other APIs.
|
||||
*/
|
||||
export const tokenize = (inputString: string) => {
|
||||
const doc = nlp(inputString);
|
||||
const sentence = doc.sentences().json();
|
||||
const number = doc.numbers().fractions();
|
||||
|
||||
// regexes to match constituent parts of the search string
|
||||
// and isolate the search terms
|
||||
|
||||
const chapters = inputString.replace(
|
||||
/ch(a?p?t?e?r?)(\W?)(\_?)(\#?)(\d)/gi,
|
||||
"",
|
||||
);
|
||||
const volumes = inputString.replace(
|
||||
/(\b(vo?l?u?m?e?)\.?)(\s*-|\s*_)?(\s*[0-9]+[.0-9a-z]*)/gi,
|
||||
"",
|
||||
);
|
||||
const pageCounts = inputString.replace(
|
||||
/\b[.,]?\s*\d+\s*(p|pg|pgs|pages)\b\s*/gi,
|
||||
"",
|
||||
);
|
||||
|
||||
// if the name has things like "4 of 5", remove the " of 5" part
|
||||
// also, if the name has 3-6, remove the -6 part. note that we'll
|
||||
// try to handle the word "of" in a few common languages, like french/
|
||||
// spanish (de), italian (di), german (von), dutch (van) or polish (z)
|
||||
replaceRecursive(inputString, "\\(", "\\)", () => "");
|
||||
replaceRecursive(inputString, "\\[", "\\]", () => "");
|
||||
replaceRecursive(inputString, "\\{", "\\}", () => "");
|
||||
inputString.replace(/\([^\(]*?\)/gi, "");
|
||||
inputString.replace(/\{[^\{]*?\}/gi, "");
|
||||
inputString.replace(/\[[^\[]*?\]/gi, "");
|
||||
|
||||
inputString.replace(/([^\d]+)(\s*(of|de|di|von|van|z)\s*#*\d+)/gi, "");
|
||||
|
||||
const hyphenatedIssueRange = inputString.match(/(\d)(-\d+)/gi);
|
||||
if (!isNull(hyphenatedIssueRange) && hyphenatedIssueRange.length > 2) {
|
||||
let issueNumber = hyphenatedIssueRange[0];
|
||||
}
|
||||
|
||||
if (voca.includes(inputString, "_") && !voca.includes(inputString, " ")) {
|
||||
inputString.replace(/[-_#]/gi, "");
|
||||
}
|
||||
const readingListIndicators = inputString.match(
|
||||
/^\s*\d+(\.\s+?|\s*-?\s*)/gim,
|
||||
);
|
||||
|
||||
let issueNumbers = "";
|
||||
const issues = inputString.match(/(^|[_\s#])(-?\d*\.?\d\w*)/gi);
|
||||
if (!isEmpty(issues)) {
|
||||
issueNumbers = issues[0].trim();
|
||||
}
|
||||
// const issueHashes = inputString.match(/\#\d/gi);
|
||||
const yearMatches = inputString.match(/\d{4}/gi);
|
||||
|
||||
const sentenceToProcess = sentence[0].normal.replace(/_/g, " ");
|
||||
const normalizedSentence = nlp(sentenceToProcess)
|
||||
.text("normal")
|
||||
.trim()
|
||||
.split(" ");
|
||||
|
||||
const queryObject = {
|
||||
comicbook_identifier_tokens: {
|
||||
issueNumbers,
|
||||
chapters,
|
||||
pageCounts,
|
||||
|
||||
readingListIndicators,
|
||||
volumes,
|
||||
},
|
||||
years: {
|
||||
yearMatches,
|
||||
},
|
||||
sentence_tokens: {
|
||||
detailed: sentence,
|
||||
normalized: normalizedSentence,
|
||||
},
|
||||
};
|
||||
return queryObject;
|
||||
};
|
||||
|
||||
export const extractNumerals = (inputString: string): string => {
|
||||
// Searches through the given string left-to-right, building an ordered list of
|
||||
// "issue number-like" re.match objects. For example, this method finds
|
||||
// matches substrings like: 3, #4, 5a, 6.00, 10.0b, .5, -1.0
|
||||
};
|
||||
|
||||
export const refineQuery = (inputString) => {
|
||||
const queryObj = tokenize(inputString);
|
||||
const removedYears = xor(
|
||||
queryObj.sentence_tokens.normalized,
|
||||
queryObj.years.yearMatches,
|
||||
);
|
||||
return {
|
||||
searchParams: {
|
||||
searchTerms: {
|
||||
name: queryObj.sentence_tokens.detailed[0].text,
|
||||
number: queryObj.comicbook_identifier_tokens.issueNumbers,
|
||||
},
|
||||
year: queryObj.years,
|
||||
},
|
||||
meta: {
|
||||
queryObj,
|
||||
tokenized: removedYears,
|
||||
normalized: removedYears.join(" "),
|
||||
},
|
||||
};
|
||||
};
|
||||
Reference in New Issue
Block a user