-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathAbbrevIso.js
459 lines (429 loc) · 18.7 KB
/
AbbrevIso.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
/**
* AbbrevIso v1.2 JS lib for publication title abbreviation per ISO-4 standard.
* Copyright (C) 2023 by Marcin Wrochna. MIT License, see file: LICENSE.
* @fileoverview The library implements the method of abbreviating titles of
* publications according to the ISO-4 standard. It also provides a way to list
* matching patterns from the LTWA (List of Title Word Abbreviations).
*/
import * as collation from './collation.js';
import PrefixTree from './PrefixTree.js';
/**
* A single pattern line from the LTWA.
* @property {string} pattern - The actual pattern from the LTWA, with dashes.
* @property {string} replacement - The replacement from the LTWA.
* @property {Array<String>} languages - Languages from which the word came.
* (as ISO-639-2 (B) codes, e.g. 'mul' for multiple, 'und' for undefined).
* In ISO-4 every pattern applies to every title regardless of language, so
* filtering patterns by these language codes should be avoided.
* @property {boolean} startDash - Does it have a starting dash?
* @property {boolean} endDash - Does it have an ending dash?
* @property {string} line - The original full line from the LTWA.
*/
export class LTWAPattern {
  /**
   * Parses one LTWA row into its pattern, replacement and language list.
   * @param {string} line A full tab-separated line from the LTWA CSV.
   * @throws {Error} If the line does not consist of exactly three
   *     tab-separated fields, or if the cleaned-up pattern has fewer than
   *     three characters.
   */
  constructor(line) {
    const fields = line.split('\t');
    if (fields.length !== 3) {
      throw new Error(`Number of fields in LTWA line is not 3: "${line}"`);
    }
    const [rawPattern, rawReplacement, rawLanguages] = fields;
    this.line = line;

    // Some patterns carry a disambiguation comment in parentheses; strip it
    // (from the first '(' to the last ')') before anything else looks at it.
    const cleaned = rawPattern
        .normalize('NFC')
        .trim()
        .replace(/\(.*\)/, '')
        .trim();
    this.pattern = cleaned;
    if (cleaned.length < 3) {
      throw new Error(`LTWA line has too short pattern: "${line}"`);
    }

    // The LTWA spells "not abbreviated" in several ways; unify them to a
    // single en dash marker.
    const abbreviation = rawReplacement.normalize('NFC').trim();
    const isNotAbbreviated = abbreviation === 'n.a.' ||
        abbreviation === 'n. a.' ||
        abbreviation === 'n.a';
    this.replacement = isNotAbbreviated ? '–' : abbreviation;

    this.languages = rawLanguages.split(',').map((code) => code.trim());
    this.startDash = cleaned.startsWith('-');
    this.endDash = cleaned.endsWith('-');
  }

  /**
   * Gives a string form built from the original LTWA line, so that arrays of
   * patterns can be ordered with the default string-based `Array#sort`.
   * @return {string}
   */
  toString() {
    return `[object LTWAPattern: ${this.line}]`;
  }
}
/**
* The main class for finding LTWA matches and ISO-4 abbreviations.
*/
export class AbbrevIso {
  /**
   * Indexes every LTWA pattern and stores the list of short words.
   * @param {(string|Array<string>)} ltwa - The LTWA, tab-separated CSV format,
   *     either as a single string or already split into lines. The first line
   *     is always treated as a header and skipped; blank lines are ignored.
   * @param {(string|Array<string>)} [shortWords=[]] - A list of short words
   *     (articles, prepositions, conjunctions) to be omitted from titles, as
   *     one newline-separated string or as an array. Note that articles in a
   *     few languages are already hard-coded, as they are handled a bit
   *     differently by ISO-4 rules. 99.9% of English cases are handled by:
   *     in/to/of/on/a/an/the/into/as/for/from/with/and.
   */
  constructor(ltwa, shortWords = []) {
    /**
     * @private {!Array<LTWAPattern>}
     * Patterns not starting with a letter (all begin with ').
     */
    this.badPatterns_ = [];
    /**
     * @private {!PrefixTree<LTWAPattern>}
     * A prefix tree of patterns beginning with a dash.
     */
    this.nonprefixPatterns_ = new PrefixTree();
    /**
     * @private {!PrefixTree<LTWAPattern>}
     * A prefix tree of patterns not beginning with a dash.
     */
    this.dictPatterns_ = new PrefixTree();
    /**
     * @private The number of patterns added.
     */
    this.size_ = 0;

    // Add all patterns from ltwa as new `LTWAPattern`s.
    const ltwaLines = Array.isArray(ltwa) ?
        ltwa : ltwa.split(collation.newlineRegex);
    let firstLine = true;
    for (const line of ltwaLines) {
      if (firstLine) { // Skip header.
        firstLine = false;
        continue;
      }
      if (line.trim().length === 0) // Skip empty lines.
        continue;
      this.addPattern(new LTWAPattern(line));
    }

    // Trim all shortWords and drop the empty ones.
    const words = Array.isArray(shortWords) ?
        shortWords : shortWords.split(collation.newlineRegex);
    /** @private {!Array<string>} Short words to omit from titles. */
    this.shortWords_ = words.map((w) => w.trim()).filter((w) => w.length > 0);
  }

  /** @return {number} Number of patterns added. */
  get size() {
    return this.size_;
  }

  /**
   * Registers one pattern in the lookup structures used by
   * `getPotentialPatterns`.
   * @param {LTWAPattern} pattern
   */
  addPattern(pattern) {
    // The lookup key is the pattern without its dashes, normalized.
    let key = pattern.pattern;
    key = key.replace(/^-/, '');
    key = key.replace(/-$/, '');
    key = collation.normalize(key);
    // Patterns that do not start with an ASCII letter are additionally kept
    // in a flat list, which `getPotentialPatterns` always returns in full.
    if (!/^[A-Za-z]/u.test(key))
      this.badPatterns_.push(pattern);
    key = collation.promiscuouslyNormalize(key);
    if (pattern.startDash)
      this.nonprefixPatterns_.add(key, pattern);
    else
      this.dictPatterns_.add(key, pattern);
    this.size_++;
  }

  /**
   * Returns any patterns that could potentially match `s` somewhere.
   * This returns around 5 times more patterns than actually match.
   * The returned array is a fresh one: callers may modify it freely.
   * @param {string} s
   * @param {boolean} pretendDash - If true, pretend all patterns start
   *     and end with a dash (to find potential compound words)
   * @return {Array<LTWAPattern>} Distinct candidate patterns, ordered by
   *     their string representation.
   */
  getPotentialPatterns(s, pretendDash = false) {
    // Always add all bad patterns. Work on a copy, so that sorting below
    // never reorders (or leaks) the internal list.
    let candidates = this.badPatterns_.slice();
    const normalized = collation.promiscuouslyNormalize(s);
    // Add dict-Patterns/nonprefix-Patterns potentially matching each position,
    // depending on whether this position starts a word or not.
    let isNewWord = true;
    for (let i = 0; i < normalized.length; i++) {
      if (!/[a-zA-Z]/.test(normalized.charAt(i))) {
        isNewWord = true;
        continue;
      }
      const rest = normalized.substring(i);
      if (isNewWord || pretendDash)
        candidates = candidates.concat(this.dictPatterns_.get(rest));
      candidates = candidates.concat(this.nonprefixPatterns_.get(rest));
      isNewWord = false;
    }
    // Remove duplicates by object identity (robust even if two distinct
    // patterns share the same string form), then order by string form.
    const unique = Array.from(new Set(candidates));
    unique.sort();
    return unique;
  }

  /**
   * Returns all matches of one given LTWAPattern in `value`.
   * We only call this function for patterns from `getPotentialPatterns`, so
   * we can do more expensive stuff here. Note that some overlapping matches and
   * abbreviations that would not strictly decrease the length (with the dot)
   * are returned, but should NOT be applied.
   * @param {string} value
   * @param {LTWAPattern} pattern
   * @param {?Array<string>} [languages=['*']] - Languages to consider when
   *     matching. Default '*' (and also `null`) means languages are
   *     disregarded, otherwise we check intersection with
   *     `pattern.languages`: if empty, we return no matches (an empty array).
   *     Languages are listed as ISO-639-2 (B) codes, e.g. 'mul' for multiple,
   *     'und' for undefined language.
   *     In the standard, all patterns apply to all titles regardless of
   *     language.
   * @param {boolean} pretendDash - If true, pretend all patterns start and end
   *     with a dash (to find potential compound words)
   * @return {Array} An Array of `[i, iend, abbr, pattern, appendix]` Arrays,
   *     where the `pattern` matches `value[i..iend-1]`, `abbr` is the
   *     computed abbreviation that should be put in place of the match; it has
   *     capitalization, diacritics etc. preserved. `pattern` is the input
   *     LTWAPattern; `appendix` is the flection ending that was accepted
   *     even though the pattern had no ending dash (like -s, -ian).
   */
  getPatternMatches(value, pattern, languages = ['*'], pretendDash = false) {
    // If a list of languages is given, check it intersects the pattern's list.
    // (`!= null` so that an explicit `null` means "no filter" too.)
    if (languages != null &&
        !languages.includes('*') &&
        !pattern.languages.some((x) => languages.includes(x)))
      return [];
    let replacement = pattern.replacement;
    if (replacement === '–')
      replacement = '';
    let p = pattern.pattern;
    if (pattern.startDash || pretendDash) {
      p = p.replace(/^-/, '');
      replacement = replacement.replace(/^-/, '');
    }
    const extendsToBoundary = pattern.endDash || pretendDash;
    if (extendsToBoundary)
      p = p.replace(/-$/, '');
    // Split into code points, so that astral characters stay in one piece.
    replacement = Array.from(replacement);
    // An empty replacement means the word is 'n. a.' (not abbreviated).
    const notAbbreviated = replacement.length === 0;
    // Accepted flection endings for patterns without an ending dash: up to
    // three of the listed characters, followed by a boundary or end of input.
    // Built once per call, since it does not depend on the match position.
    const flectionEnding = extendsToBoundary ? null : new RegExp(
        '^([iaesn\'’]{0,3})' + '($|' + collation.boundariesRegex.source + ')',
        'u');

    const result = [];
    let isPreviousCharBoundary = true;
    let i = 0;
    while (i < value.length) {
      // Patterns without a starting dash may only match at a word start.
      if (!pattern.startDash && !pretendDash && !isPreviousCharBoundary) {
        isPreviousCharBoundary = collation.boundariesRegex.test(value[i]);
        i++;
        continue;
      }
      const r = collation.getCollatingMatch(value.substring(i), p);
      if (r === false) {
        isPreviousCharBoundary = collation.boundariesRegex.test(value[i]);
        i++;
        continue;
      }
      // Now pattern (ignoring dashes) has a match in `value`,
      // starting from i-th position. As used below, `r[0]` holds the matched
      // pieces of `value` and `r[1]` the corresponding collated pieces.
      let abbr = '';
      let ii = 0;
      let iend = i + r[0][ii].length;
      let appendix = '';
      for (let j = 0; j < replacement.length; j++) {
        if (replacement[j] == '.') {
          abbr += '.';
          continue;
        }
        // Omit value characters until we get to one
        // also present in the replacement.
        while (!collation.cEquiv(r[1][ii], replacement[j]) &&
               (j + 1 >= replacement.length ||
                !collation.cEquiv(r[1][ii],
                    replacement[j] + replacement[j + 1]))) {
          ii++;
          iend += r[0][ii].length;
        }
        // If r[1][ii] is equivalent to two characters of the replacement,
        // we have to advance j twice.
        if (!collation.cEquiv(r[1][ii], replacement[j]))
          j++;
        // Now r[1][ii] is also present in the replacement,
        // so we copy it to abbr and move on to the next replacement character.
        abbr += r[0][ii];
        ii++;
        if (ii < r[0].length)
          iend += r[0][ii].length;
      }
      // We omit all remaining characters of the match
      // (with no counterpart in replacement).
      for (ii++; ii < r[0].length; ii++)
        iend += r[0][ii].length;

      if (extendsToBoundary) {
        // If the pattern had an ending dash,
        // omit all characters until we get a boundary.
        while (iend < value.length &&
               !collation.boundariesRegex.test(value[iend]))
          iend++;
      } else {
        // If the pattern had no ending dash, try to omit some characters due
        // to flection and if we don't have a boundary at iend, discard the
        // match.
        const ending = value.substring(iend).match(flectionEnding);
        if (!ending) {
          isPreviousCharBoundary = collation.boundariesRegex.test(value[i]);
          i++;
          continue;
        }
        appendix = ending[1];
        iend += appendix.length;
      }
      // If the replacement was 'n. a.' (not abbreviated), we make it so.
      if (notAbbreviated)
        abbr = value.substring(i, iend);
      // Report the match.
      result.push([i, iend, abbr, pattern, appendix]);
      i++;
      isPreviousCharBoundary = collation.boundariesRegex.test(value[i - 1]);
    }
    return result;
  }

  /**
   * Returns all patterns matching `value`, sorted by start index of match.
   * A pattern appears once per match, so it may be listed several times.
   * Note this is not called by `makeAbbreviation`.
   * @param {string} value
   * @param {?Array<string>} languages - Only use patterns from these.
   *     (as ISO-639-2 (B) codes, e.g. 'mul' for multiple, 'und' for
   *     undefined).
   * @param {boolean} pretendDash - If true, pretend all patterns start
   *     and end with a dash (to find potential compound words)
   * @param {?Array<LTWAPattern>} [patterns=getPotentialPatterns(value)]
   * @return {Array<LTWAPattern>}
   */
  getMatchingPatterns(value, languages = undefined,
      pretendDash = false, patterns = undefined) {
    if (patterns === undefined)
      patterns = this.getPotentialPatterns(value, pretendDash);
    value = value.normalize('NFC').trim();
    let matches = [];
    for (const pattern of patterns) {
      matches = matches.concat(
          this.getPatternMatches(value, pattern, languages, pretendDash)
      );
    }
    const getBeginning = ([i, _iend, _abbr, _pattern, _appendix]) => i;
    matches.sort((a, b) => (getBeginning(a) - getBeginning(b)));
    return matches.map(([_i, _iend, _abbr, pattern, _appendix]) => pattern);
  }

  /**
   * Remove short words from s, under some boundary constraints.
   * A word is removed (together with the one whitespace character after it)
   * only where it is preceded by `before` and followed by whitespace. Each
   * word is tried as given and with its first letter upper-cased.
   * @param {string} s
   * @param {Array<string>} shortWords - Literal words; they are escaped
   *     before being put into a regular expression.
   * @param {string} before regex source for boundary to be matched before word,
   *     with one parenthesised group to keep.
   * @param {*} [after] Currently unused.
   * @return {string} `s` with the short words removed.
   */
  removeShortWords(s, shortWords, before, after) {
    // Omit articles, prepositions, and conjunctions, unless first preposition
    // in title/subtitle, parts of names, meant as initialisms, 'A' meant as
    // 'Part A', national practice... Here I omit them only when preceded by a
    // boundary, succeeded by whitespace, and lower case or CamelCase (e.g. 'OR'
    // is preserved, since it may mean 'Operations Research', but 'B-A ' would
    // lose the 'A').
    // Words are data, not regex source: escape all regex syntax characters so
    // that e.g. 'c.' cannot match 'cd'.
    const escapeRegExp = (w) => w.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    // Also try the word with the first letter capitalized.
    const capitalized =
        shortWords.map((w) => w.charAt(0).toUpperCase() + w.substring(1));
    for (const word of shortWords.concat(capitalized)) {
      const wordRegex =
          new RegExp(before + escapeRegExp(word) + '\\s', 'gu');
      s = s.replace(wordRegex, '$1');
    }
    return s;
  }

  /**
   * Compute an abbreviation according to all ISO-4 rules.
   * @param {string} value
   * @param {?Array<string>} languages - Only use patterns from these.
   *     (as ISO-639-2 (B) codes, e.g. 'mul' for multiple, 'und' for
   *     undefined). All patterns apply to all titles regardless of language,
   *     so this should be avoided.
   * @param {?Array<LTWAPattern>} [patterns=getPotentialPatterns(value)]
   *     A list of potential patterns (you could give all, it's just damn
   *     slow).
   * @return {string}
   */
  makeAbbreviation(value, languages = undefined, patterns = undefined) {
    let result = value;
    if (patterns === undefined)
      patterns = this.getPotentialPatterns(result);
    // Some basic lossless Unicode normalization.
    result = result.normalize('NFC').trim();

    // Punctuation:
    // Remove ellipsis.
    result = result.replace(/\.\.\./ug, '');
    result = result.replace(/\u2026/ug, '');
    // Remove commas.
    result = result.replace(/,/ug, '');
    // Replace periods with commas, unless part of acronyms/initialisms,
    // ordinals, or already abbreviated expressions.
    result = result.replace(/\./ug, ',');
    // Return periods in acronyms (repeat for overlaps).
    result = result.replace(/((^|[A-Z,\.&\-\\\/])\s?[A-Z]),/ug, '$1.');
    result = result.replace(/((^|[A-Z,\.&\-\\\/])\s?[A-Z]),/ug, '$1.');
    result = result.replace(/(\s[A-Z]),/ug, '$1.');
    // Return periods inside words (like Eco.mont)
    result = result.replace(/([A-Za-z]),([A-Za-z])/ug, '$1.$2');
    // Return periods in ordinals and common expressions.
    result = result.replace(/([\s\-:,&#()\\\/][0-9]{1,3}),/ug, '$1.');
    result = result.replace(/((^|\s)(St|Mr|Ms|Mrs|Mx|Dr|Prof|vs)),/ug, '$1.');
    result = result.replace(/^J,/ug, 'J.');
    // (Standard says commas and periods for dependent titles can be
    // preserved, but it doesn't seem to apply any such exceptions in
    // examples).
    // Omit '&' and '+' (when they stand for 'and'),
    // unless part of names like AT&T.
    result = result.replace(/([^A-Z0-9])[&+]([^A-Z0-9])/ug, '$1$2');
    // All other punctuation is preserved.

    // Omit generic terms separating dependent titles, but only if followed by
    // a single letter A-Z, roman numeral, or digit.
    // (Otherwise it may be part of a title, like "Bulletin of the Section of
    // Logic").
    result = result.replace(new RegExp(
        '(Series|Serie|Ser|Part|Section|Sect|Sec|Série)[,.]?\\s*([A-Z]|[0-9IVXivx]+)' +
        '(' + collation.boundariesRegex.source + '|$)', 'gu'), '$2$3');

    // Capitalization is preserved.
    // (First letter should be capitalized, but we leave that to local
    // style, check e.g. 'tm-Technisches Messen').

    // This is the same as collation.boundariesRegex, except that we don't
    // consider +&?' as boundaries (they are part of initialisms like A&A
    // or words like Baha'i).
    const boundariesRegex = /[-\s\u2013\u2014_.,:;!|=*\\/"()#%@$]/;
    // Articles, as opposed to other short words, are removed from the
    // beginning also, and are not preserved in single word titles.
    const articles = ['a', 'an', 'the', 'der', 'die', 'das', 'den', 'dem',
      'des', 'le', 'la', 'les', 'el', 'il', 'lo', 'los', 'de', 'het',
      'els', 'ses', 'es', 'gli', 'een', '\'t', '\'n'];
    result = this.removeShortWords(
        result, articles, '(^|' + boundariesRegex.source + ')');
    // French articles "l'", "d'" may be followed by whatever.
    result = result.replace(new RegExp(
        '((^|' + collation.boundariesRegex.source + '))(l|L|d|D|dell|nell)(\'|’)', 'gu'), '$1');

    // Check if we have a single word after removing all short words: such a
    // title is returned without applying any LTWA pattern.
    const preResult = this.removeShortWords(
        result, this.shortWords_, '(^|' + boundariesRegex.source + ')');
    const hasInnerBoundary =
        new RegExp('.' + collation.boundariesRegex.source + '.', 'u');
    if (!(hasInnerBoundary.test(preResult)))
      return result.replace(/\s+/gu, ' ').trim();

    // Now the main part: applying LTWA rules.
    // Find and apply patterns, being careful about overlaps.
    let matches = []; // A list of [i, iend, abbr, pattern, appendix].
    for (const pattern of patterns) {
      matches = matches.concat(
          this.getPatternMatches(result, pattern, languages)
      );
    }
    // Sort by priority (lowest number wins): patterns with no starting dashes
    // first, patterns with longer matches first, longer patterns first.
    // The fine details regulate whether we prefer to match 'futures' to
    // 'futur-' or 'future'.
    const getPriority = ([i, iend, _abbr, pattern, appendix]) => (
      (pattern.startDash ? 100 : 0) + (pattern.endDash ? 3 : 0) +
      appendix.length - (iend - i - appendix.length) - pattern.pattern.length
    );
    matches.sort((a, b) => (getPriority(a) - getPriority(b)));
    // Resolve overlapping patterns according to priority.
    for (let j = 0; j < matches.length; ++j) {
      for (let k = j + 1; k < matches.length; ++k) {
        if (matches[j][1] > matches[k][0] && matches[k][1] > matches[j][0])
          matches.splice(k--, 1); // Remove the later one from matches.
      }
    }
    // Apply matches starting from the later ones, so that the indices of the
    // earlier ones stay valid.
    const getBeginning = ([i, _iend, _abbr, _pattern]) => i;
    matches.sort((a, b) => (getBeginning(b) - getBeginning(a)));
    for (const [i, iend, abbr, _pattern] of matches) {
      // If we'd abbreviate only one character or less (and add a dot),
      // we don't abbreviate at all.
      if (abbr.length < iend - i) {
        result = result.substring(0, i) + abbr + result.substring(iend);
      }
    }

    // Other short words are not removed from beginning.
    result = this.removeShortWords(
        result, this.shortWords_, '(' + boundariesRegex.source + ')');
    // Remove superfluous whitespace.
    result = result.replace(/\s+/gu, ' ').trim();
    return result;
  }
}