Skip to content

Commit

Permalink
Fix tokenization of ꝡé
Browse files Browse the repository at this point in the history
  • Loading branch information
lynn committed Aug 17, 2023
1 parent c97a5f0 commit 0b49b32
Showing 1 changed file with 21 additions and 17 deletions.
38 changes: 21 additions & 17 deletions src/tokenize.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { dictionary, underscoredWordTypes, WordType } from './dictionary';
import { Tone } from './types';

// Vyái → ꝡáı
export function clean(word: string): string {
return word
.toLowerCase()
Expand All @@ -9,6 +10,7 @@ export function clean(word: string): string {
.normalize();
}

// Vyái → ꝡaı
export function bare(word: string): string {
return clean(word)
.normalize('NFKD')
Expand All @@ -17,6 +19,18 @@ export function bare(word: string): string {
.replace(/i/gu, 'ı');
}

/**
 * Compute the form of `word` used as the dictionary lookup key and token value.
 *
 * Examples carried over from the original notes:
 *   hâo → hao
 *   dâ  → dâ
 *   vyé → ꝡë
 *
 * @param word - The raw word text.
 * @returns `clean(word)` when that exact form is a key of `dictionary`;
 *   otherwise `bare(word)`, except that the bare forms `e` and `ꝡe` are
 *   returned as `ë` and `ꝡë` respectively.
 */
export function baseForm(word: string): string {
  // An exact dictionary hit takes precedence over any further stripping.
  const cleaned = clean(word);
  if (dictionary.has(cleaned)) {
    return cleaned;
  }

  const stripped = bare(word);
  switch (stripped) {
    // These two bare forms are special-cased to their diaeresis spellings.
    case 'e':
      return 'ë';
    case 'ꝡe':
      return 'ꝡë';
    default:
      return stripped;
  }
}

/**
 * Look up the combining diacritic associated with a tone.
 *
 * The tone is used directly as an index into a five-entry table.
 *
 * @param tone - The tone whose mark is wanted; used as a numeric index.
 * @returns The table entry at that index: an empty string for indices 0 and 1,
 *   otherwise a single combining character.
 */
export function diacriticForTone(tone: Tone): string {
  const combiningMarks = [
    '', // index 0: no mark
    '', // index 1: no mark
    '\u0301', // index 2: U+0301 combining acute accent
    '\u0308', // index 3: U+0308 combining diaeresis
    '\u0302', // index 4: U+0302 combining circumflex accent
  ];
  return combiningMarks[tone];
}
Expand Down Expand Up @@ -58,18 +72,6 @@ export class ToaqTokenizer {
for (const m of [...text.matchAll(/[\p{L}\p{N}\p{Diacritic}]+-?/gu)]) {
const tokenText = m[0];
const lemmaForm = clean(tokenText);

if (lemmaForm === 'é') {
this.tokens.push({ type: 'determiner', value: '◌́', index: m.index });
this.tokens.push({
type: 'event_accessor',
value: 'ë',
index: m.index,
});
continue;
}

const bareWord = bare(tokenText);
const exactEntry = dictionary.get(lemmaForm);

if (exactEntry) {
Expand All @@ -80,17 +82,19 @@ export class ToaqTokenizer {
});
continue;
}
const bareEntry = dictionary.get(bareWord);

const base = baseForm(tokenText);
const entry = dictionary.get(base);
const wordTone = tone(tokenText);
if (bareEntry) {
if (entry) {
this.tokens.push({
type: wordTone === Tone.T2 ? 'determiner' : 'preposition',
value: wordTone === Tone.T2 ? '◌́' : '◌̂',
index: m.index,
});
this.tokens.push({
type: bareEntry.type.replace(/ /g, '_'),
value: bareWord,
type: entry.type.replace(/ /g, '_'),
value: base,
index: m.index,
});
continue;
Expand All @@ -109,7 +113,7 @@ export class ToaqTokenizer {
});
this.tokens.push({
type: 'predicate',
value: bareWord,
value: base,
index: m.index,
});
}
Expand Down

0 comments on commit 0b49b32

Please sign in to comment.