From 22eb9a44228cf6eb63758c650f477c245b90bc85 Mon Sep 17 00:00:00 2001 From: Arild Matsson Date: Fri, 5 Jul 2024 10:20:55 +0200 Subject: [PATCH] refactor(ts): CQP --- CHANGELOG.md | 1 + app/scripts/components/statistics.js | 2 +- app/scripts/cqp_parser/{cqp.js => cqp.ts} | 92 +++++++++++++++-------- app/scripts/cqp_parser/cqp.types.ts | 43 +++++++++++ 4 files changed, 105 insertions(+), 33 deletions(-) rename app/scripts/cqp_parser/{cqp.js => cqp.ts} (63%) create mode 100644 app/scripts/cqp_parser/cqp.types.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 06df5d1d5..a9f8eb98e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ### Added - TypeScript typings for config/settings +- TypeScript typings for CQP queries ### Changed diff --git a/app/scripts/components/statistics.js b/app/scripts/components/statistics.js index d6bff7503..994504555 100644 --- a/app/scripts/components/statistics.js +++ b/app/scripts/components/statistics.js @@ -6,7 +6,7 @@ import settings from "@/settings" import { html } from "@/util" import { loc, locObj } from "@/i18n" import { getCqp } from "../../config/statistics_config.js" -import { expandOperators } from "@/cqp_parser/cqp.js" +import { expandOperators } from "@/cqp_parser/cqp" import "@/components/corpus-distribution-chart" angular.module("korpApp").component("statistics", { diff --git a/app/scripts/cqp_parser/cqp.js b/app/scripts/cqp_parser/cqp.ts similarity index 63% rename from app/scripts/cqp_parser/cqp.js rename to app/scripts/cqp_parser/cqp.ts index 594d48041..4315e1364 100644 --- a/app/scripts/cqp_parser/cqp.js +++ b/app/scripts/cqp_parser/cqp.ts @@ -1,16 +1,27 @@ /** @format */ import _ from "lodash" -import moment from "moment" +import moment, { type Moment } from "moment" import settings from "@/settings" -import { parse } from "./CQPParser" +import { parse as parse_ } from "./CQPParser" +import type { Condition, CqpQuery, DateRange, OperatorKorp } from "./cqp.types" +/** Parse CQP string to syntax tree. */ +// Rename to be able to add typing. +const parse: (input: string, options?: never) => CqpQuery = parse_ export { parse } -export function parseDateInterval(op, val, expanded_format) { - let out +/** + * Create CQP expression for a date interval condition. + * + * @param opKorp Operator to use if not using `expanded_format` + * @param val An array like `[fromdate, todate, fromtime, totime]` + * @param expanded_format Whether to convert to standard CQP or keep Korp-specific operators + */ +export function parseDateInterval(opKorp: OperatorKorp, val: DateRange, expanded_format?: boolean) { + let out: string val = _.invokeMap(val, "toString") if (!expanded_format) { - return `$date_interval ${op} '${val.join(",")}'` + return `$date_interval ${opKorp} '${val.join(",")}'` } const [fromdate, todate, fromtime, totime] = val @@ -25,7 +36,7 @@ export function parseDateInterval(op, val, expanded_format) { text_timeto: totime, } - op = function (field, operator, valfield) { + function op(field: string, operator: string, valfield?: string) { val = valfield ? fieldMapping[valfield] : fieldMapping[field] return `int(_.${field}) ${operator} ${val}` } @@ -62,7 +73,12 @@ export function parseDateInterval(op, val, expanded_format) { return out } -export function stringify(cqp_obj, expanded_format) { +/** + * Serialize syntax tree to CQP string. + * @param cqp_obj Syntax tree + * @param expanded_format Whether to convert to standard CQP or keep Korp-specific operators + */ +export function stringify(cqp_obj: CqpQuery, expanded_format?: boolean): string { if (expanded_format == null) { expanded_format = false } @@ -80,13 +96,13 @@ export function stringify(cqp_obj, expanded_format) { continue } - const outer_and_array = [] + const outer_and_array: string[][] = [] for (let and_array of token.and_block) { - const or_array = [] + const or_array: string[] = [] for (let { type, op, val, flags } of and_array) { var out if (expanded_format) { - ;[val, op] = { + ;[val, op] = ({ "^=": [val + ".*", "="], "_=": [`.*${val}.*`, "="], "&=": [`.*${val}`, "="], @@ -104,7 +120,7 @@ export function stringify(cqp_obj, expanded_format) { not_incontains_contains: [`.*${val}.*`, "not contains"], ends_with_contains: [`.*${val}`, "contains"], not_ends_with_contains: [`.*${val}`, "not contains"], - }[op] || [val, op] + }[op] || [val, op]) as [string | DateRange, OperatorKorp] } let flagstr = "" @@ -115,7 +131,7 @@ export function stringify(cqp_obj, expanded_format) { if (type === "word" && val === "") { out = "" } else if (settings.corpusListing.isDateInterval(type)) { - out = parseDateInterval(op, val, expanded_format) + out = parseDateInterval(op, val as DateRange, expanded_format) } else { out = `${type} ${op} \"${val}\"` } @@ -129,7 +145,7 @@ export function stringify(cqp_obj, expanded_format) { } } - let or_out = outer_and_array.map((x) => (x.length > 1 ? `(${x.join(" | ")})` : x.join(" | "))) + let or_out: string[] = outer_and_array.map((x) => (x.length > 1 ? `(${x.join(" | ")})` : x.join(" | "))) if (token.bound) { or_out = _.compact(or_out) @@ -149,46 +165,58 @@ export function stringify(cqp_obj, expanded_format) { return output.join(" ") } -export const expandOperators = (cqpstr) => stringify(parse(cqpstr), true) +export const expandOperators = (cqpstr: string) => stringify(parse(cqpstr), true) -export function getTimeInterval(obj) { - let from = [] - let to = [] +/** + * Find first and last date in any date interval conditions. + * @param obj Syntax tree + * @returns `[from, to]` as Moment objects, or `undefined` if query has no intervals + */ +export function getTimeInterval(obj: CqpQuery): [Moment, Moment] | undefined { + let froms: Moment[] = [] + let tos: Moment[] = [] for (let token of obj) { for (let or_block of token.and_block) { for (let item of or_block) { if (item.type === "date_interval") { - from.push(moment(`${item.val[0]}${item.val[2]}`, "YYYYMMDDhhmmss")) - to.push(moment(`${item.val[1]}${item.val[3]}`, "YYYYMMDDhhmmss")) + froms.push(moment(`${item.val[0]}${item.val[2]}`, "YYYYMMDDhhmmss")) + tos.push(moment(`${item.val[1]}${item.val[3]}`, "YYYYMMDDhhmmss")) } } } } - if (!from.length) { + if (!froms.length) { return } - from = _.minBy(from, (mom) => mom.toDate()) - to = _.maxBy(to, (mom) => mom.toDate()) + const from = _.minBy(froms, (m) => m.toDate()) + const to = _.maxBy(tos, (m) => m.toDate()) return [from, to] } -export function prioSort(cqpObjs) { - const getPrio = function (and_array) { - const numbers = _.map(and_array, (item) => _.indexOf(settings["cqp_prio"], item.type)) +/** + * Sort the conditions in each token according to the `cqp_prio` setting. + */ +export function prioSort(cqpObjs: CqpQuery) { + const getPrio = function (or_block: Condition[]) { + const numbers = _.map(or_block, (item) => _.indexOf(settings.cqp_prio, item.type)) return Math.min(...(numbers || [])) } for (let token of cqpObjs) { - token.and_block = _.sortBy(token.and_block, getPrio).reverse() + token.and_block = (_.sortBy(token.and_block, getPrio) as Condition[][]).reverse() } return cqpObjs } -// assume cqpObj2 to contain fewer tokens than cqpObj1 -export function mergeCqpExprs(cqpObj1, cqpObj2) { +/** + * Create intersection of two queries. + * + * The second query is assumed to contain fewer tokens than the first. + */ +export function mergeCqpExprs(cqpObj1: CqpQuery, cqpObj2: CqpQuery) { for (let i = 0; i < cqpObj2.length; i++) { const token = cqpObj2[i] for (let j = 0; j < cqpObj1.length; j++) { @@ -202,14 +230,14 @@ export function mergeCqpExprs(cqpObj1, cqpObj2) { } /** Check if a query has any wildcards (`[]`) */ -export const hasWildcard = (cqpObjs) => cqpObjs.some((token) => stringify([token]).indexOf("[]") === 0) +export const hasWildcard = (cqpObjs: CqpQuery) => cqpObjs.some((token) => stringify([token]).indexOf("[]") === 0) /** Check if a query has any tokens with repetition */ -export const hasRepetition = (cqpObjs) => cqpObjs.some((token) => token.repeat) +export const hasRepetition = (cqpObjs: CqpQuery) => cqpObjs.some((token) => token.repeat) /** Check if a query has any structure boundaries, e.g. sentence start */ -export const hasStruct = (cqpObjs) => cqpObjs.some((token) => token.struct) +export const hasStruct = (cqpObjs: CqpQuery) => cqpObjs.some((token) => token.struct) /** Determine whether a query will work with the in_order option */ -export const supportsInOrder = (cqpObjs) => +export const supportsInOrder = (cqpObjs: CqpQuery) => cqpObjs.length > 1 && !hasWildcard(cqpObjs) && !hasRepetition(cqpObjs) && !hasStruct(cqpObjs) diff --git a/app/scripts/cqp_parser/cqp.types.ts b/app/scripts/cqp_parser/cqp.types.ts new file mode 100644 index 000000000..a98bbc71d --- /dev/null +++ b/app/scripts/cqp_parser/cqp.types.ts @@ -0,0 +1,43 @@ +/** @format */ +export type CqpQuery = CqpToken[] + +export type CqpToken = { + and_block: Condition[][] + bound?: Record<"lbound" | "rbound", true> + /** `[min]` or `[min, max]` */ + repeat?: [number] | [number, number] + struct?: string + start?: number +} + +export type Condition = { + type: string + op: OperatorKorp + val: string | DateRange + flags: Record +} + +/** Should be `[fromdate, todate, fromtime, totime]` */ +export type DateRange = (string | number)[] + +export type Operator = "=" | "!=" | "contains" | "not contains" + +export type OperatorKorp = + | Operator + | "^=" + | "_=" + | "&=" + | "*=" + | "!*=" + | "rank_contains" + | "not_rank_contains" + | "highest_rank" + | "not_highest_rank" + | "regexp_contains" + | "not_regexp_contains" + | "starts_with_contains" + | "not_starts_with_contains" + | "incontains_contains" + | "not_incontains_contains" + | "ends_with_contains" + | "not_ends_with_contains"