import { default as nlp } from "compromise";
import { default as dates } from "compromise-dates";
import { default as sentences } from "compromise-sentences";
import { default as numbers } from "compromise-numbers";
import _ from "lodash";

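// Register the compromise plugins before any text is parsed; the dates plugin
// is loaded here but not referenced directly in this module.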
nlp.extend(sentences);
nlp.extend(numbers);
nlp.extend(dates);

/**
 * Tokenizes a search string into comic-book identifiers, years, and sentence data.
 * @function
 * @param {Object} searchCriteriaPayload - The search criteria payload.
 * @param {string} searchCriteriaPayload.inputString - The string used to search against CV, Shortboxed, and other APIs.
 */
export const tokenize = (searchCriteriaPayload) => {
  const { inputString } = searchCriteriaPayload;
  const doc = nlp(inputString);
  const sentence = doc.sentences().json();
  const number = doc.numbers().fractions();
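  // `sentence` is compromise's per-sentence breakdown (only the first entry's
  // `normal` form is reused below); `number` keeps the fraction-like matches
  // and is returned as `issueRanges` in the query object.
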
  // regexes to match constituent parts of the search string
  // and isolate the search terms
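  // e.g. "chapter 3" / "ch #4", "vol. 2" / "v01", "32 pages" / "22 pgs"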
  const chapters = inputString.match(/ch(a?p?t?e?r?)(\W?)(\_?)(\#?)(\d)/gi);
  const volumes = inputString.match(
    /(\b(vo?l?u?m?e?)\.?)(\s*-|\s*_)?(\s*[0-9]+[.0-9a-z]*)/gi,
  );
  const pageCounts = inputString.match(
    /\b[.,]?\s*\d+\s*(p|pg|pgs|pages)\b\s*/gi,
  );

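  // bracketed annotations and numeric-range markers, e.g. "(2015)", "{digital}",
  // "[Marvel]", an "of 12" total-count marker (as in "3 of 12"), a trailing "-12",
  // or a leading "12. " reading-list index; these matches, like pageCounts above,
  // are currently not included in the returned query object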
  const parentheses = inputString.match(/\([^\(]*?\)/gi);
  const curlyBraces = inputString.match(/\{[^\{]*?\}/gi);
  const squareBrackets = inputString.match(/\[[^\[]*?\]/gi);
  const genericNumericRange = inputString.match(
    /([^\d]+)(\s*(of|de|di|von|van|z)\s*#*\d+)/gi,
  );
  const hyphenatedNumericRange = inputString.match(/([^\d])?(-\d+)/gi);
  const readingListIndicators = inputString.match(
    /^\s*\d+(\.\s+?|\s*-?\s*)/gim,
  );

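  // explicit issue and year markers, e.g. "issue 5", "#4", "2015"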
  const issues = inputString.match(/issue(\W?)(\_?)(\d+)/gi);
  const issueHashes = inputString.match(/\#\d/gi);
  const yearMatches = inputString.match(/\d{4}/gi);

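  // rebuild the first sentence with underscores treated as spaces, then
  // normalize it and split it into individual search tokens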
  const sentenceToProcess = sentence[0].normal.replace(/_/g, " ");
  const normalizedSentence = nlp(sentenceToProcess)
    .text("normal")
    .trim()
    .split(" ");

  const queryObject = {
    comicbook_identifiers: {
      issues,
      issueHashes,
      chapters,
      volumes,
      issueRanges: number,
    },
    years: {
      yearMatches,
    },
    sentences: {
      detailed: sentence,
      normalized: normalizedSentence,
    },
  };
  return queryObject;
};
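
/**
 * Builds the refined search terms by dropping four-digit years from the
 * normalized sentence tokens.
 * @function
 * @param {Object} searchCriteriaPayload - The same payload shape accepted by tokenize.
 */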
export function refineQuery(searchCriteriaPayload) {
  const queryObj = tokenize(searchCriteriaPayload);
  // _.xor keeps the symmetric difference, effectively dropping tokens that
  // are four-digit year matches
  const removedYears = _.xor(
    queryObj.sentences.normalized,
    queryObj.years.yearMatches,
  );
  return {
    tokenized: removedYears,
    normalized: removedYears.join(" "),
    meta: queryObj,
  };
}
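
// Illustrative usage (a sketch; the payload shape is taken from tokenize's
// destructuring, and the search string below is made up):
//
//   const { tokenized, normalized, meta } = refineQuery({
//     inputString: "Batman vol. 2 #45 (2016)",
//   });
//   // `tokenized` / `normalized` hold the sentence tokens with four-digit year
//   // matches xor-ed out; `meta` keeps the full tokenization for the
//   // CV / Shortboxed queries.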