Skip to content

Commit

Permalink
Tokenize unknown words in t2/t4 correctly
Browse files: browse the repository at this point in the history
  • Loading branch information
lynn committed Aug 14, 2023
1 parent 06582de commit 63246f1
Showing 1 changed file with 19 additions and 6 deletions.
25 changes: 19 additions & 6 deletions src/tokenize.ts
Original file line number | Diff line number | Diff line change
Expand Up @@ -81,8 +81,8 @@ export class ToaqTokenizer {
continue;
}
const bareEntry = dictionary.get(bareWord);
const wordTone = tone(tokenText);
if (bareEntry) {
const wordTone = tone(tokenText);
this.tokens.push({
type: wordTone === Tone.T2 ? 'determiner' : 'preposition',
value: wordTone === Tone.T2 ? '◌́' : '◌̂',
Expand All @@ -95,11 +95,24 @@ export class ToaqTokenizer {
});
continue;
}
this.tokens.push({
type: 'predicate',
value: tokenText,
index: m.index,
});
if (wordTone === Tone.T1) {
this.tokens.push({
type: 'predicate',
value: tokenText,
index: m.index,
});
} else {
this.tokens.push({
type: wordTone === Tone.T2 ? 'determiner' : 'preposition',
value: wordTone === Tone.T2 ? '◌́' : '◌̂',
index: m.index,
});
this.tokens.push({
type: 'predicate',
value: bareWord,
index: m.index,
});
}
}
}
next(): ToaqToken | undefined {
Expand Down

0 comments on commit 63246f1

Please sign in to comment.