Skip to content

Commit

Permalink
Tokenize prefixes
Browse files Browse the repository at this point in the history
  • Loading branch information
lynn committed Aug 26, 2023
1 parent 538b25f commit 48ee8dd
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 59 deletions.
16 changes: 1 addition & 15 deletions src/gloss.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { Entry, dictionary } from './dictionary';
import { bare, clean, tone } from './tokenize';
import { bare, clean, splitPrefixes, tone } from './tokenize';
import { Tone } from './types';
import * as fs from 'fs';

Expand Down Expand Up @@ -101,20 +101,6 @@ function displayLength(text: string): number {
return text.normalize('NFKD').replace(/\p{Diacritic}/gu, '').length;
}

/**
 * Cut a word into its syllable-sized chunks ("raku").
 *
 * Each chunk is an optional glottal stop `'`, one non-vowel character, a run
 * of one or more vowels, and an optional coda `q` or `m`.
 *
 * @param word - The text to split.
 * @returns The matched chunks in order; text that does not fit the pattern is
 *   skipped.
 */
function splitIntoRaku(word: string): string[] {
  // The `+` quantifier must sit outside the vowel class so that a whole vowel
  // run ("eo" in "deo") stays in one chunk. A coda `m` is only taken when no
  // vowel follows; otherwise it is the onset of the next chunk ("mama").
  const rakuPattern = /'?[^aeiıou][aeiıou]+(?:q|m(?![aeiıou]))?/gu;
  return Array.from(word.matchAll(rakuPattern), match => match[0]);
}

/**
 * Separate the prefixes of a word from its root.
 *
 * Every combining dot below (U+0323) is rewritten as a hyphen, then the word
 * is split on hyphens. The final piece is the root; every earlier piece is
 * broken into raku by `splitIntoRaku`, and each raku is reported as a prefix.
 *
 * @param word - The word to analyse.
 * @returns The prefixes in order, and the root.
 */
function splitPrefixes(word: string): { prefixes: string[]; root: string } {
  // Decompose first so the dot below becomes a standalone code point that can
  // be swapped for a hyphen, then recompose the remaining marks.
  const decomposed = word.normalize('NFKD');
  const hyphenated = decomposed.replace(/\u0323/gu, '-').normalize('NFC');
  const pieces = hyphenated.split('-');

  // `split` always yields at least one element, so the last one exists.
  const root = pieces[pieces.length - 1]!;
  const prefixes = pieces.slice(0, -1).flatMap(piece => splitIntoRaku(piece));
  return { prefixes, root };
}

export class Glosser {
useEasyGlosses: boolean;
constructor(easy: boolean) {
Expand Down
118 changes: 74 additions & 44 deletions src/tokenize.ts
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,34 @@ export function tone(word: string): Tone {
}[norm[0]]!;
}

/**
 * Cut a word into its syllable-sized chunks ("raku").
 *
 * Each chunk is an optional onset consonant, a vowel with any number of
 * combining diacritics, further vowels, an optional coda (`q`, or `m` when no
 * vowel follows), and an optional trailing hyphen. Matching is
 * case-insensitive.
 *
 * @param word - The text to split. The diacritic handling only applies to
 *   decomposed input (e.g. NFKD), where marks are separate code points.
 * @returns The matched chunks in order; text that does not fit the pattern is
 *   skipped.
 */
function splitIntoRaku(word: string): string[] {
  // Onsets: digraphs are listed first for readability. `nh` must be present,
  // otherwise "nhao" is matched from its second letter and the `n` is lost.
  // `\p{Diacritic}*` (not `?`) keeps a tone mark attached when the same vowel
  // also carries a dot below, which yields two combining marks in NFKD.
  const rakuPattern =
    /(ch|sh|nh|vy?|wy?|b|c|d|f|g|h|j|k|l|m|n|p|r|s|t|ꝡ|y|z|')?[aeiıou]\p{Diacritic}*[aeiıou]*(q|m(?![aeiıou]))?-?/giu;
  return Array.from(word.matchAll(rakuPattern), match => match[0]);
}

/**
 * Separate the prefixes of a word from its root.
 *
 * The word is NFKD-decomposed and cut into raku by `splitIntoRaku`. A raku
 * ends a prefix when it ends in a hyphen or carries a combining dot below
 * (U+0323); the dot is removed and treated as a hyphen. Every raku up to and
 * including the last prefix-ending raku is returned as its own prefix, without
 * the hyphen. The remaining raku are joined to form the root.
 *
 * @param word - The word to analyse.
 * @returns The prefixes in order and the root, both NFC-normalized.
 */
export function splitPrefixes(word: string): {
  prefixes: string[];
  root: string;
} {
  const raku = splitIntoRaku(word.normalize('NFKD')).map(x => {
    if (!x.includes('\u0323')) return x;
    const undotted = x.replace(/\u0323/gu, '');
    // Do not double the hyphen when the raku has both a dot and a hyphen.
    return undotted.endsWith('-') ? undotted : undotted + '-';
  });

  // Use the last prefix boundary, not the first: in a word with several
  // hyphens, everything before the final one is a prefix. Stopping at the
  // first boundary would leave later prefixes, hyphen included, in the root.
  const prefixCount = raku.map(x => x.endsWith('-')).lastIndexOf(true) + 1;

  const prefixes = raku
    .slice(0, prefixCount)
    .map(x => x.normalize('NFC').replace(/-$/, ''));
  const root = raku
    .slice(prefixCount)
    .map(x => x.normalize('NFC'))
    .join('');
  return { prefixes, root };
}

export interface ToaqToken {
type: string;
value: string;
Expand All @@ -70,52 +98,54 @@ export class ToaqTokenizer {
this.tokens = [];
this.pos = 0;
for (const m of [...text.matchAll(/[\p{L}\p{N}\p{Diacritic}]+-?/gu)]) {
const tokenText = m[0];
const lemmaForm = clean(tokenText);
const exactEntry = dictionary.get(lemmaForm);
const { prefixes, root } = splitPrefixes(m[0]);
for (const tokenText of [...prefixes.map(p => p + '-'), root]) {
const lemmaForm = clean(tokenText);
const exactEntry = dictionary.get(lemmaForm);

if (exactEntry) {
this.tokens.push({
type: exactEntry.type.replace(/ /g, '_'),
value: tokenText,
index: m.index,
});
continue;
}
if (exactEntry) {
this.tokens.push({
type: exactEntry.type.replace(/ /g, '_'),
value: tokenText,
index: m.index,
});
continue;
}

const base = baseForm(tokenText);
const entry = dictionary.get(base);
const wordTone = tone(tokenText);
if (entry) {
this.tokens.push({
type: wordTone === Tone.T2 ? 'determiner' : 'preposition',
value: wordTone === Tone.T2 ? '◌́' : '◌̂',
index: m.index,
});
this.tokens.push({
type: entry.type.replace(/ /g, '_'),
value: base,
index: m.index,
});
continue;
}
if (wordTone === Tone.T1) {
this.tokens.push({
type: 'predicate',
value: tokenText,
index: m.index,
});
} else {
this.tokens.push({
type: wordTone === Tone.T2 ? 'determiner' : 'preposition',
value: wordTone === Tone.T2 ? '◌́' : '◌̂',
index: m.index,
});
this.tokens.push({
type: 'predicate',
value: base,
index: m.index,
});
const base = baseForm(tokenText);
const entry = dictionary.get(base);
const wordTone = tone(tokenText);
if (entry) {
this.tokens.push({
type: wordTone === Tone.T2 ? 'determiner' : 'preposition',
value: wordTone === Tone.T2 ? '◌́' : '◌̂',
index: m.index,
});
this.tokens.push({
type: entry.type.replace(/ /g, '_'),
value: base,
index: m.index,
});
continue;
}
if (wordTone === Tone.T1) {
this.tokens.push({
type: 'predicate',
value: tokenText,
index: m.index,
});
} else {
this.tokens.push({
type: wordTone === Tone.T2 ? 'determiner' : 'preposition',
value: wordTone === Tone.T2 ? '◌́' : '◌̂',
index: m.index,
});
this.tokens.push({
type: 'predicate',
value: base,
index: m.index,
});
}
}
}
}
Expand Down

0 comments on commit 48ee8dd

Please sign in to comment.