-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathnodeBundle.js
711 lines (666 loc) · 26.2 KB
/
nodeBundle.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
'use strict';
/**
* AbbrevIso v1.2 JS lib for publication title abbreviation per ISO-4 standard.
* Copyright (C) 2023 by Marcin Wrochna. MIT License, see file: LICENSE.
* @fileoverview Utils for handling different ways of writing equivalent
* characters.
*/
/**
 * A replacement for the /\b/ regex, which wrongly matches foreign characters.
 * Matches exactly one boundary character: whitespace, hyphen-minus, en dash
 * (U+2013), em dash (U+2014), or one of the ASCII punctuation marks listed in
 * the character class below.
 * Do not add the global flag //g: this RegExp object is shared and reused with
 * `.test()`, which becomes stateful (through `lastIndex`) on a global regex.
 * Instead, use new RegExp(boundariesRegex, "g") to obtain a separate copy.
 * Avoid using \W and \w, they match a nonsense range.
 * @type {RegExp}
 */
const boundariesRegex = /[-\s\u2013\u2014_.,:;!|=+*\\/"()&#%@$?]/;
/**
 * A regex for matching a single line break, as per Unicode standards:
 * {@link http://www.unicode.org/reports/tr18/#Line_Boundaries}.
 * The two-character sequence CR LF is listed first so that it is consumed as
 * one break rather than two. The remaining alternatives are the single
 * characters LF, VT, FF, CR, NEL (U+0085), LINE SEPARATOR (U+2028) and
 * PARAGRAPH SEPARATOR (U+2029).
 * @type {RegExp}
 */
const newlineRegex = /\r\n|[\n\v\f\r\x85\u2028\u2029]/;
/**
 * Strips diacritics and transliterates a fixed set of non-ASCII letters to
 * plain `[a-zA-Z]` sequences.
 * After this function, LTWA patterns only match `[a-zA-Z\ \-.'(),]*`,
 * but this is not always true for strings outside the LTWA.
 * @param {string} s Text to fold.
 * @return {string} The folded text.
 */
function normalize(s) {
  // Letters that Unicode decomposition does not reduce to ASCII by itself,
  // so they are transliterated by hand before decomposing.
  const transliterations = [
    [/\u00DF/g, 'ss'], [/\u1E9E/g, 'SS'], // scharfes S
    [/\u0111/g, 'd'], [/\u0110/g, 'D'], // crossed D
    [/\u00F0/g, 'd'], [/\u00D0/g, 'D'], // eth
    [/\u00FE/g, 'th'], [/\u00DE/g, 'TH'], // thorn
    [/\u0127/g, 'h'], [/\u0126/g, 'H'], // H-bar
    [/\u0142/g, 'l'], [/\u0141/g, 'L'], // L with stroke
    [/\u0153/g, 'oe'], [/\u0152/g, 'Oe'], // oe ligature
    [/\u00E6/g, 'ae'], [/\u00C6/g, 'Ae'], // ae ligature
    [/\u0131/g, 'i'], // dotless i
    [/\u00F8/g, 'o'], [/\u00D8/g, 'O'], // o with stroke
    // Catalan middle dot, double prime (weirdly used for slavic langs),
    // unicode replacement character (for some mis-utf'd Turkish).
    [/[\u00B7\u02BA\uFFFD]/g, ''],
  ];
  let folded = s;
  for (const [letter, ascii] of transliterations) {
    folded = folded.replace(letter, ascii);
  }
  // Most diacritics are handled by standard Unicode normalization: NFKD
  // decomposes characters into base characters plus combining marks, and the
  // combining marks (U+0300..U+036F) are then dropped.
  const decomposed = folded.normalize('NFKD');
  return decomposed.replace(/[\u0300-\u036f]/gu, '');
}
/**
 * Normalizes more aggressively than `normalize`, always returning a string
 * made only of `[a-z\ ]` characters.
 * It is used only for bucketing patterns in prefix trees, not for actual
 * matching, so it may merge many strings just in case (e.g. it removes every
 * 'h'). Because non-letters are turned into spaces after the whitespace
 * clean-up step, the result can still contain leading, trailing or repeated
 * spaces.
 * @param {string} s Text to turn into a bucketing key.
 * @return {string} The bucketing key.
 */
function promiscuouslyNormalize(s) {
  // A global copy is needed because the shared boundariesRegex is not global.
  const everyBoundary = new RegExp(boundariesRegex, 'g');
  let key = normalize(s).toLowerCase();
  key = key.replace(everyBoundary, ' ');
  // Collapse whitespace runs, then drop one leading and one trailing space.
  key = key.replace(/\s+/gu, ' ');
  key = key.replace(/^\s/gu, '');
  key = key.replace(/\s$/gu, '');
  // Anything still outside a-z becomes a space.
  key = key.replace(/[^a-z\ ]/g, ' ');
  // Drop 'kh' and any remaining 'h' so transliteration variants share a key.
  key = key.replace(/kh/g, '');
  return key.replace(/h/g, '');
}
/**
 * Tells whether two short strings stand for the same character once
 * diacritics, ligatures and letter case are folded away.
 * Some characters may be equivalent to the empty string, e.g. the 'flown dot'.
 * Others (like ligatures 'ae') can be equivalent to a string of two characters.
 * @param {string} s
 * @param {string} t
 * @return {boolean} True when both arguments fold to the same string.
 */
function cEquiv(s, t) {
  // TODO perhaps we could use instead the following more standard collator?
  // c = new Intl.Collator('en-u', {usage:'search', sensitivity:'base'});
  // return c.compare(s,t) === 0;
  const fold = (text) => normalize(text).toLowerCase();
  return fold(s) === fold(t);
}
/**
 * Attempts to match `t` to a prefix of `s`, comparing characters with
 * `cEquiv` so that diacritics, ligatures and ignorable characters line up.
 * E.g., for `s='dæl·lete'`, `t='daell'` the output should be
 * `[['d','æ','l','·','l'] , ['d','ae','l','','l']]`.
 * Both strings are split with `Array.from`, i.e. by code point; `i` indexes
 * into `s`, `j` into `t`, and the loop runs until all of `t` is consumed.
 * @param {string} s
 * @param {string} t
 * @return {(Array<Array<string>>|boolean)} Pair of equal-length Arrays of
 *     consecutive characters in `s` and `t` that were found to be equivalent,
 *     or `false` when `t` cannot be matched to a prefix of `s`.
 */
function getCollatingMatch(s, t) {
const ss = Array.from(s);
const tt = Array.from(t);
let i = 0;
let j = 0;
const result = [[], []];
while (j < tt.length) {
if (i >= ss.length) {
// `s` is exhausted: the rest of `t` is only acceptable if each remaining
// character is equivalent to the empty string.
if (cEquiv('', tt[j])) {
result[0].push('');
result[1].push(tt[j]);
j++;
} else {
return false; // `ss` is too short to match `tt`.
}
} else if (i + 1 < ss.length && j + 1 < tt.length
&& cEquiv(ss[i]+ss[i+1], tt[j]+tt[j+1])) {
// The next two characters of each string are equivalent when taken
// together: pair up only the first character of each and advance by one.
// NOTE(review): presumably this keeps ordinary two-letter sequences from
// being consumed by the two-for-one branches below -- confirm.
result[0].push(ss[i]);
result[1].push(tt[j]);
i++;
j++;
} else if (j + 1 < tt.length
&& cEquiv(ss[i], tt[j]+tt[j+1]) && !cEquiv(tt[j+1], '')) {
// One character of `s` corresponds to two characters of `t`, and the
// second of those is not an ignorable character.
if (cEquiv('', tt[j])) {
// The first `t` character is itself ignorable: consume it alone.
result[0].push('');
result[1].push(tt[j]);
j++;
} else {
result[0].push(ss[i]);
result[1].push(tt[j]+tt[j+1]);
i++;
j += 2;
}
} else if (i + 1 < ss.length
&& cEquiv(ss[i] + ss[i+1], tt[j]) && !cEquiv(ss[i+1], '')) {
// Mirror case: two characters of `s` correspond to one character of `t`.
if (cEquiv(ss[i], '')) {
// The first `s` character is itself ignorable: consume it alone.
result[0].push(ss[i]);
result[1].push('');
i++;
} else {
result[0].push(ss[i]+ss[i+1]);
result[1].push(tt[j]);
i += 2;
j++;
}
} else if (cEquiv(ss[i], tt[j])) {
// Plain one-to-one match.
result[0].push(ss[i]);
result[1].push(tt[j]);
i++;
j++;
} else if (cEquiv(ss[i], '')) {
// The `s` character is ignorable: skip it, pairing it with ''.
result[0].push(ss[i]);
result[1].push('');
i++;
} else {
return false; // Characters don't match.
}
}
return result;
}
/**
* AbbrevIso v1.2 JS lib for publication title abbreviation per ISO-4 standard.
* Copyright (C) 2023 by Marcin Wrochna. MIT License, see file: LICENSE.
* @fileoverview Prefix trees for quickly finding patterns.
*/
/**
* Maximum size of a node in a prefix tree. Smaller is slower, but the results
* will contain fewer superfluous objects.
*/
const maxNodeSize = 5;
/**
 * A structure that allows to add objects at given string positions, and
 * retrieve all objects (together with some superfluous ones!) stored at
 * positions that are prefixes of a given string.
 *
 * Every node is a Map with these keys:
 *  - '-' : Array of `[remainingSuffix, object]` entries stored at the node;
 *  - '?' : set to `true` once the node has been split into children;
 *  - any other single character (one code point): the child node for it.
 *
 * Because '-' and '?' are used as bookkeeping keys, positions should not
 * contain them. If they do, the tree never descends through such a character:
 * the entry simply stays at the node reached so far, which only makes lookups
 * return more superfluous objects.
 */
class PrefixTree {
  /** Constructs new empty PrefixTree. */
  constructor() {
    /** @private @const {!Map} The root node.*/
    this.root_ = PrefixTree.createNode_();
  }

  /**
   * Creates an empty, not yet split node.
   * @return {!Map}
   * @private
   */
  static createNode_() {
    const node = new Map();
    node.set('-', []);
    return node;
  }

  /**
   * Tells whether `c` is one of the bookkeeping keys and therefore can never
   * name a child node.
   * @param {string} c
   * @return {boolean}
   * @private
   */
  static isReservedKey_(c) {
    return c === '-' || c === '?';
  }

  /**
   * Adds an object at a given string position.
   * @param {string} position
   * @param {*} object
   */
  add(position, object) {
    let node = this.root_;
    // Number of UTF-16 units of `position` consumed while descending. It is
    // tracked in code units (not code points) so that slicing stays correct
    // for characters outside the Basic Multilingual Plane.
    let consumed = 0;
    for (const c of position) {
      if (PrefixTree.isReservedKey_(c))
        break;
      if (node.has(c)) {
        // Go deeper into nodes as far as possible.
        node = node.get(c);
      } else if (node.has('?')) {
        // If a node has already been split, add the next character to it.
        const child = PrefixTree.createNode_();
        node.set(c, child);
        node = child;
      } else {
        break;
      }
      consumed += c.length;
    }
    const entries = node.get('-');
    entries.push([position.slice(consumed), object]);
    if (entries.length > maxNodeSize)
      this.splitNode(node);
  }

  /**
   * Helper function that splits a node Map into a Map of Maps: every entry is
   * moved into the child named by the first code point of its remaining
   * suffix. Entries with an empty suffix, or whose suffix starts with a
   * reserved key, stay at the node.
   * @param {!Map} node
   */
  splitNode(node) {
    const entriesStayingAtNode = [];
    for (const [position, object] of node.get('-')) {
      // Destructuring uses the string iterator, so `c` is a whole code point
      // (or undefined for the empty string), consistent with `add` and `get`.
      const [c] = position;
      if (c === undefined || PrefixTree.isReservedKey_(c)) {
        entriesStayingAtNode.push([position, object]);
        continue;
      }
      if (!node.has(c))
        node.set(c, PrefixTree.createNode_());
      node.get(c).get('-').push([position.slice(c.length), object]);
    }
    node.set('-', entriesStayingAtNode);
    node.set('?', true);
  }

  /**
   * Returns Array of all objects under positions that are prefixes of 'value'.
   * This returns some superfluous objects too!
   * @param {string} value
   * @return {Array<*>}
   */
  get(value) {
    let node = this.root_;
    let entries = node.get('-');
    for (const c of value) {
      // Reserved keys never lead to a child node, so stop descending there.
      if (PrefixTree.isReservedKey_(c) || !node.has(c))
        break;
      node = node.get(c);
      entries = entries.concat(node.get('-'));
    }
    return entries.map(([_position, object]) => object);
  }
}
/**
* AbbrevIso v1.2 JS lib for publication title abbreviation per ISO-4 standard.
* Copyright (C) 2023 by Marcin Wrochna. MIT License, see file: LICENSE.
* @fileoverview The library implements the method of abbreviating titles of
* publications according to the ISO-4 standard. It also provides a way to list
* matching patterns from the LTWA (List of Title Word Abbreviations).
*/
/**
 * One parsed entry (line) of the LTWA.
 * @property {string} line - The original full line from the LTWA.
 * @property {string} pattern - The pattern from the LTWA, dashes included,
 *     NFC-normalized, trimmed, and with any parenthesised comment removed.
 * @property {string} replacement - The replacement from the LTWA; the
 *     "not abbreviated" markers ('n.a.', 'n. a.', 'n.a') are stored as '–'.
 * @property {Array<String>} languages - Languages from which the word came.
 *     (as ISO-639-2 (B) codes, e.g. 'mul' for multiple, 'und' for undefined).
 *     All patterns apply to all titles regardless of language, so filtering
 *     by language should be avoided.
 * @property {boolean} startDash - Does the pattern have a starting dash?
 * @property {boolean} endDash - Does the pattern have an ending dash?
 */
class LTWAPattern {
  /**
   * @param {string} line A full tab-separated line from the LTWA CSV
   *     (pattern, replacement, comma-separated languages).
   * @throws {Error} If the line does not have exactly three fields, or if
   *     the cleaned-up pattern is shorter than three characters.
   */
  constructor(line) {
    const fields = line.split('\t');
    if (fields.length !== 3) {
      throw new Error(
          'Number of fields in LTWA line is not 3: "' + line + '"');
    }
    const [rawPattern, rawReplacement, rawLanguages] = fields;
    this.line = line;

    // Some patterns include a disambiguation comment in parentheses,
    // remove it.
    const cleanedPattern = rawPattern
        .normalize('NFC')
        .trim()
        .replace(/\(.*\)/, '')
        .trim();
    this.pattern = cleanedPattern;
    if (cleanedPattern.length < 3) {
      throw new Error('LTWA line has too short pattern: "' + line + '"');
    }

    const cleanedReplacement = rawReplacement.normalize('NFC').trim();
    const notAbbreviatedMarkers = ['n.a.', 'n. a.', 'n.a'];
    this.replacement = notAbbreviatedMarkers.includes(cleanedReplacement) ?
        '–' : cleanedReplacement;

    this.languages = rawLanguages.split(',').map((code) => code.trim());
    this.startDash = cleanedPattern.startsWith('-');
    this.endDash = cleanedPattern.endsWith('-');
  }

  /**
   * Returns a string representation for easy sorting.
   * @return {string}
   */
  toString() {
    return '[object LTWAPattern: ' + this.line + ']';
  }
}
/**
 * The main class for finding LTWA matches and ISO-4 abbreviations.
 */
class AbbrevIso {
  /**
   * @param {(string|Array<string>)} ltwa - The LTWA, tab-separated CSV format,
   *     either as one string or as an Array of lines. The first line is
   *     always treated as a header and skipped; blank lines are skipped too.
   * @param {(string|Array<string>)} shortWords - A list of short words
   *     (articles, prepositions, conjunctions) to be omitted from titles,
   *     either newline-separated or as an Array. Note that articles in a few
   *     languages are already hard-coded, as they are handled a bit
   *     differently by ISO-4 rules. 99.9% of English cases are handled by:
   *     in/to/of/on/a/an/the/into/as/for/from/with/and.
   *     When omitted (null/undefined), no extra short words are used.
   */
  constructor(ltwa, shortWords) {
    /**
     * @private {!Array<LTWAPattern>}
     * Patterns not starting with a letter (all begin with ').
     */
    this.badPatterns_ = [];
    /**
     * @private {!PrefixTree<LTWAPattern>}
     * A prefix tree of patterns beginning with a dash.
     */
    this.nonprefixPatterns_ = new PrefixTree();
    /**
     * @private {!PrefixTree<LTWAPattern>}
     * A prefix tree of patterns not beginning with a dash.
     */
    this.dictPatterns_ = new PrefixTree();
    /**
     * @private The number of patterns added.
     */
    this.size_ = 0;

    // Add all patterns from ltwa as new `LTWAPattern`s.
    const lines = Array.isArray(ltwa) ? ltwa : ltwa.split(newlineRegex);
    let isHeader = true;
    for (const line of lines) {
      if (isHeader) { // Skip header.
        isHeader = false;
        continue;
      }
      if (line.trim().length === 0) // Skip empty lines.
        continue;
      this.addPattern(new LTWAPattern(line));
    }

    // Trim all shortWords and drop the empty ones.
    let words = [];
    if (Array.isArray(shortWords))
      words = shortWords;
    else if (shortWords != null)
      words = shortWords.split(newlineRegex);
    /** @private {!Array<string>} Short words to omit from titles. */
    this.shortWords_ = words
        .map((word) => word.trim())
        .filter((word) => word.length > 0);
  }

  /** @return {number} Number of patterns added. */
  get size() {
    return this.size_;
  }

  /**
   * Registers a pattern so that `getPotentialPatterns` can find it.
   * A pattern that does not start with an ASCII letter (after removing the
   * dashes and normalizing) is remembered as a "bad" pattern, which is always
   * returned as a candidate; it is additionally stored in the prefix tree,
   * like every other pattern.
   * @param {LTWAPattern} pattern
   */
  addPattern(pattern) {
    const withoutDashes = pattern.pattern.replace(/^-/, '').replace(/-$/, '');
    const normalized = normalize(withoutDashes);
    if (!/^[A-Za-z]/u.test(normalized))
      this.badPatterns_.push(pattern);
    const key = promiscuouslyNormalize(normalized);
    const tree = pattern.startDash ?
        this.nonprefixPatterns_ : this.dictPatterns_;
    tree.add(key, pattern);
    this.size_++;
  }

  /**
   * Returns any patterns that could potentially match `s` somewhere.
   * This returns around 5 times more patterns than actually match.
   * The result is sorted by the patterns' string representation and contains
   * every pattern object at most once.
   * @param {string} s
   * @param {boolean} pretendDash - If true, pretend all patterns start
   *     and end with a dash (to find potential compound words)
   * @return {Array<LTWAPattern>}
   */
  getPotentialPatterns(s, pretendDash = false) {
    // Always add all bad patterns. Work on a copy, so that sorting below
    // never reorders the internal list.
    const candidates = this.badPatterns_.slice();
    const key = promiscuouslyNormalize(s);
    // Add dict-Patterns/nonprefix-Patterns potentially matching each position,
    // depending on whether this position starts a word or not.
    let isNewWord = true;
    for (let i = 0; i < key.length; i++) {
      if (!/[a-zA-Z]/.test(key.charAt(i))) {
        isNewWord = true;
        continue;
      }
      const rest = key.slice(i);
      if (isNewWord || pretendDash) {
        for (const pattern of this.dictPatterns_.get(rest))
          candidates.push(pattern);
      }
      for (const pattern of this.nonprefixPatterns_.get(rest))
        candidates.push(pattern);
      isNewWord = false;
    }
    // Sort (by `toString()`), then remove duplicates by identity. A Set keeps
    // the first occurrence of each object, so the sorted order is preserved.
    candidates.sort();
    return Array.from(new Set(candidates));
  }

  /**
   * Tries to match the (dash-less) pattern text `p` in `value`, starting
   * exactly at index `i`, and computes the abbreviation for that match.
   * @param {string} value The title being abbreviated.
   * @param {number} i Start index (in UTF-16 units) of the attempted match.
   * @param {string} p The pattern text, with the relevant dashes removed.
   * @param {!Array<string>} replacementChars The replacement, split into
   *     characters; empty when the word is not to be abbreviated.
   * @param {?RegExp} flectionEnding `null` when the pattern is open-ended
   *     (ending dash); otherwise the regex that must match right after the
   *     pattern: an optional short flection ending followed by a boundary
   *     or the end of the string.
   * @return {?Array} `[iend, abbr, appendix]`, or `null` if there is no match.
   * @private
   */
  static matchPatternAt_(value, i, p, replacementChars, flectionEnding) {
    const r = getCollatingMatch(value.slice(i), p);
    if (r === false)
      return null;
    // Now pattern (ignoring dashes) has a match in `value`, starting from the
    // i-th position. `matched` holds pieces of `value`, `canonical` the
    // corresponding pieces of the pattern.
    const [matched, canonical] = r;
    let abbr = '';
    let ii = 0;
    // Invariant: iend === i + total length of matched[0..ii].
    let iend = i + matched[ii].length;
    for (let j = 0; j < replacementChars.length; j++) {
      if (replacementChars[j] === '.') {
        abbr += '.';
        continue;
      }
      // Omit value characters until we get to one
      // also present in the replacement.
      while (!cEquiv(canonical[ii], replacementChars[j]) &&
             (j + 1 >= replacementChars.length ||
              !cEquiv(canonical[ii],
                  replacementChars[j] + replacementChars[j + 1]))) {
        ii++;
        iend += matched[ii].length;
      }
      // If canonical[ii] is equivalent to two characters of the replacement,
      // we have to advance j twice.
      if (!cEquiv(canonical[ii], replacementChars[j]))
        j++;
      // Now canonical[ii] is also present in the replacement, so we copy the
      // original characters to abbr and move on to the next replacement
      // character.
      abbr += matched[ii];
      ii++;
      if (ii < matched.length)
        iend += matched[ii].length;
    }
    // We omit all remaining characters of the match
    // (with no counterpart in replacement).
    for (ii++; ii < matched.length; ii++)
      iend += matched[ii].length;

    let appendix = '';
    if (flectionEnding === null) {
      // The pattern had an ending dash:
      // omit all characters until we get a boundary.
      while (iend < value.length && !boundariesRegex.test(value[iend]))
        iend++;
    } else {
      // The pattern had no ending dash: try to omit some characters due to
      // flection, and if we don't have a boundary after that, discard the
      // match.
      const ending = value.slice(iend).match(flectionEnding);
      if (!ending)
        return null;
      appendix = ending[1];
      iend += appendix.length;
    }
    // If the replacement was 'n. a.' (not abbreviated), we make it so.
    if (replacementChars.length === 0)
      abbr = value.substring(i, iend);
    return [iend, abbr, appendix];
  }

  /**
   * Returns all matches of one given LTWAPattern in `value`.
   * We only call this function for patterns from `getPotentialPatterns`, so
   * we can do more expensive stuff here. Note that some overlapping matches
   * and abbreviations that would not strictly decrease the length (with the
   * dot) are returned, but should NOT be applied.
   * @param {string} value
   * @param {LTWAPattern} pattern
   * @param {?Array<string>} [languages=['*']] - Languages to consider when
   *     matching. Default '*' means languages are disregarded (and so does
   *     `null`), otherwise we check intersection with `pattern.languages`:
   *     if empty, we return no matches (an empty array). Languages are listed
   *     as ISO-639-2 (B) codes, e.g. 'mul' for multiple, 'und' for undefined
   *     language. In the standard, all patterns apply to all titles
   *     regardless of language.
   * @param {boolean} pretendDash - If true, pretend all patterns start and
   *     end with a dash (to find potential compound words)
   * @return {Array} An Array of `[i, iend, abbr, pattern, appendix]` Arrays,
   *     where the `pattern` matches `value[i..iend-1]`, `abbr` is the
   *     computed abbreviation that should be put in place of the match; it
   *     has capitalization, diacritics etc. preserved. `pattern` is the input
   *     LTWAPattern; `appendix` is the flection ending that was accepted
   *     even though the pattern had no ending dash (like -s, -ian).
   */
  getPatternMatches(value, pattern, languages = ['*'], pretendDash = false) {
    // If a list of languages is given, check it intersects the pattern's list.
    if (languages != null &&
        !languages.includes('*') &&
        !pattern.languages.some((x) => languages.includes(x)))
      return [];

    // '–' marks words that are not abbreviated: empty replacement.
    let replacement = pattern.replacement === '–' ? '' : pattern.replacement;
    let p = pattern.pattern;
    const openStart = pattern.startDash || pretendDash;
    const openEnd = pattern.endDash || pretendDash;
    if (openStart) {
      p = p.replace(/^-/, '');
      replacement = replacement.replace(/^-/, '');
    }
    if (openEnd)
      p = p.replace(/-$/, '');
    const replacementChars = Array.from(replacement);
    // Loop-invariant, so it is built once rather than for every position.
    const flectionEnding = openEnd ? null : new RegExp(
        '^([iaesn\'’]{0,3})' + '($|' + boundariesRegex.source + ')', 'u');

    const result = [];
    let isPreviousCharBoundary = true;
    for (let i = 0; i < value.length; i++) {
      // A pattern without a starting dash may only begin right after a
      // boundary (or at the very start of `value`).
      if (openStart || isPreviousCharBoundary) {
        const match = AbbrevIso.matchPatternAt_(
            value, i, p, replacementChars, flectionEnding);
        if (match !== null) {
          const [iend, abbr, appendix] = match;
          // Report the match.
          result.push([i, iend, abbr, pattern, appendix]);
        }
      }
      isPreviousCharBoundary = boundariesRegex.test(value[i]);
    }
    return result;
  }

  /**
   * Returns all patterns matching `value`, sorted by start index of match.
   * A pattern that matches at several positions appears once per match.
   * Note this is not called by `makeAbbreviation`.
   * @param {string} value
   * @param {?Array<string>} languages - Only use patterns from these.
   *     (as ISO-639-2 (B) codes, e.g. 'mul' for multiple, 'und' for
   *     undefined).
   * @param {boolean} pretendDash - If true, pretend all patterns start
   *     and end with a dash (to find potential compound words)
   * @param {?Array<LTWAPattern>} [patterns=getPotentialPatterns(value)]
   * @return {Array<LTWAPattern>}
   */
  getMatchingPatterns(value, languages = undefined,
      pretendDash = false, patterns = undefined) {
    const candidates = patterns != null ?
        patterns : this.getPotentialPatterns(value, pretendDash);
    const normalizedValue = value.normalize('NFC').trim();
    const matches = [];
    for (const pattern of candidates) {
      const found = this.getPatternMatches(
          normalizedValue, pattern, languages, pretendDash);
      for (const match of found)
        matches.push(match);
    }
    const getBeginning = ([i, _iend, _abbr, _pattern, _appendix]) => i;
    matches.sort((a, b) => (getBeginning(a) - getBeginning(b)));
    return matches.map(([_i, _iend, _abbr, pattern, _appendix]) => pattern);
  }

  /**
   * Remove short words from s, under some boundary constraints.
   * Each word is tried as given and with its first letter capitalized; an
   * occurrence is removed only when it is preceded by `before` and followed
   * by a whitespace character (which is removed along with the word).
   * Words are matched literally: regex metacharacters in them are escaped.
   * @param {string} s
   * @param {Array<string>} shortWords
   * @param {string} before regex source for boundary to be matched before
   *     word, with one parenthesised group to keep.
   * @param {string=} after Currently ignored; kept for compatibility.
   * @return {string} `s` with the short words removed.
   */
  removeShortWords(s, shortWords, before, after) {
    // Omit articles, prepositions, and conjunctions, unless first preposition
    // in title/subtitle, parts of names, meant as initialisms, 'A' meant as
    // 'Part A', national practice... Here I omit them only when preceded by a
    // boundary, succeeded by whitespace, and lower case or CamelCase (e.g.
    // 'OR' is preserved, since it may mean 'Operations Research', but 'B-A '
    // would lose the 'A').
    const escapeForRegExp =
        (word) => word.replace(/[\\^$.*+?()[\]{}|]/g, '\\$&');
    // Also try the word with the first letter capitalized.
    const capitalized = shortWords.map(
        (word) => word.charAt(0).toUpperCase() + word.slice(1));
    let result = s;
    for (const word of shortWords.concat(capitalized)) {
      const wordRegex =
          new RegExp(before + escapeForRegExp(word) + '\\s', 'gu');
      result = result.replace(wordRegex, '$1');
    }
    return result;
  }

  /**
   * Compute an abbreviation according to all ISO-4 rules.
   * @param {string} value
   * @param {?Array<string>} languages - Only use patterns from these.
   *     (as ISO-639-2 (B) codes, e.g. 'mul' for multiple, 'und' for
   *     undefined). All patterns apply to all titles regardless of language,
   *     so this should be avoided.
   * @param {?Array<LTWAPattern>} [patterns=getPotentialPatterns(value)]
   *     A list of potential patterns (you could give all, it's just damn
   *     slow).
   * @return {string}
   */
  makeAbbreviation(value, languages = undefined, patterns = undefined) {
    let result = value;
    const candidates = patterns != null ?
        patterns : this.getPotentialPatterns(result);
    // Some basic lossless Unicode normalization.
    result = result.normalize('NFC').trim();

    // Punctuation:
    // Remove ellipsis.
    result = result.replace(/\.\.\./ug, '');
    result = result.replace(/\u2026/ug, '');
    // Remove commas.
    result = result.replace(/,/ug, '');
    // Replace periods with commas, unless part of acronyms/initialisms,
    // ordinals, or already abbreviated expressions.
    result = result.replace(/\./ug, ',');
    // Return periods in acronyms (repeat for overlaps).
    result = result.replace(/((^|[A-Z,\.&\-\\\/])\s?[A-Z]),/ug, '$1.');
    result = result.replace(/((^|[A-Z,\.&\-\\\/])\s?[A-Z]),/ug, '$1.');
    result = result.replace(/(\s[A-Z]),/ug, '$1.');
    // Return periods inside words (like Eco.mont)
    result = result.replace(/([A-Za-z]),([A-Za-z])/ug, '$1.$2');
    // Return periods in ordinals and common expressions.
    result = result.replace(/([\s\-:,&#()\\\/][0-9]{1,3}),/ug, '$1.');
    result = result.replace(
        /((^|\s)(St|Mr|Ms|Mrs|Mx|Dr|Prof|vs)),/ug, '$1.');
    result = result.replace(/^J,/ug, 'J.');
    // (Standard says commas and periods for dependent titles can be
    // preserved, but it doesn't seem to apply any such exceptions in
    // examples).
    // Omit '&' and '+' (when they stand for 'and'),
    // unless part of names like AT&T.
    result = result.replace(/([^A-Z0-9])[&+]([^A-Z0-9])/ug, '$1$2');
    // All other punctuation is preserved.

    // Omit generic terms separating dependent titles, but only if followed by
    // a single letter A-Z, roman numeral, or digit. (Otherwise it may be part
    // of a title, like "Bulletin of the Section of Logic").
    result = result.replace(new RegExp(
        '(Series|Serie|Ser|Part|Section|Sect|Sec|Série)[,.]?\\s*' +
        '([A-Z]|[0-9IVXivx]+)' +
        '(' + boundariesRegex.source + '|$)', 'gu'), '$2$3');

    // Capitalization is preserved.
    // (First letter should be capitalized, but we leave that to local
    // style, check e.g. 'tm-Technisches Messen').

    // This is the same as collation.boundariesRegex, except that we don't
    // consider +&?' as boundaries (they are part of initialisms like A&A
    // or words like Baha'i).
    const boundariesRegex$1 = /[-\s\u2013\u2014_.,:;!|=*\\/"()#%@$]/;
    const startOrBoundary = '(^|' + boundariesRegex$1.source + ')';
    // Articles, as opposed to other short words, are removed from the
    // beginning also, and are not preserved in single word titles.
    const articles = ['a', 'an', 'the', 'der', 'die', 'das', 'den', 'dem',
      'des', 'le', 'la', 'les', 'el', 'il', 'lo', 'los', 'de', 'het',
      'els', 'ses', 'es', 'gli', 'een', '\'t', '\'n'];
    result = this.removeShortWords(result, articles, startOrBoundary);
    // French articles "l'", "d'" may be followed by whatever.
    result = result.replace(new RegExp(
        '((^|' + boundariesRegex.source + '))(l|L|d|D|dell|nell)(\'|’)',
        'gu'), '$1');

    // Check if we have a single word after removing all short words: single
    // word titles are returned without applying any LTWA patterns.
    const preResult =
        this.removeShortWords(result, this.shortWords_, startOrBoundary);
    const hasInnerBoundary =
        new RegExp('.' + boundariesRegex.source + '.', 'u');
    if (!hasInnerBoundary.test(preResult))
      return result.replace(/\s+/gu, ' ').trim();

    // Now the main part: applying LTWA rules.
    // Find and apply patterns, being careful about overlaps.
    // `matches` is a list of [i, iend, abbr, pattern, appendix].
    const matches = [];
    for (const pattern of candidates) {
      for (const match of this.getPatternMatches(result, pattern, languages))
        matches.push(match);
    }
    // Sort by priority: patterns with no starting dashes first,
    // patterns with longer matches first, longer patterns first.
    // The fine details regulate whether we prefer to match 'futures' to
    // 'futur-' or 'future'.
    const getPriority = ([i, iend, _abbr, pattern, appendix]) => (
      (pattern.startDash ? 100 : 0) + (pattern.endDash ? 3 : 0) +
      appendix.length - (iend - i - appendix.length) - pattern.pattern.length
    );
    matches.sort((a, b) => (getPriority(a) - getPriority(b)));
    // Resolve overlapping patterns according to priority: going from the best
    // match to the worst, keep a match only if it does not overlap any match
    // that has already been kept.
    const overlaps = (a, b) => a[1] > b[0] && b[1] > a[0];
    const accepted = [];
    for (const match of matches) {
      if (!accepted.some((kept) => overlaps(kept, match)))
        accepted.push(match);
    }
    // Apply matches starting from the later ones, so that the indices of the
    // earlier ones stay valid.
    const getBeginning = ([i, _iend, _abbr, _pattern]) => i;
    accepted.sort((a, b) => (getBeginning(b) - getBeginning(a)));
    for (const [i, iend, abbr, _pattern] of accepted) {
      // If we'd abbreviate only one character or less (and add a dot),
      // we don't abbreviate at all.
      if (abbr.length < iend - i)
        result = result.substring(0, i) + abbr + result.slice(iend);
    }

    // Other short words are not removed from beginning.
    result = this.removeShortWords(
        result, this.shortWords_, '(' + boundariesRegex$1.source + ')');
    // Remove superfluous whitespace.
    result = result.replace(/\s+/gu, ' ').trim();
    return result;
  }
}
// Expose the two classes through CommonJS `exports`.
exports.AbbrevIso = AbbrevIso;
exports.LTWAPattern = LTWAPattern;