Skip to content

Commit

Permalink
Parser can now separate numbers from words even when they are not
Browse files Browse the repository at this point in the history
separated by a space, e.g. 4.7Ohm
  • Loading branch information
Orbiter committed Sep 2, 2023
1 parent 079eafe commit 5db97a8
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 1 deletion.
2 changes: 1 addition & 1 deletion source/net/yacy/document/SentenceReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ public synchronized void close() {
}

public static void main(String[] args) {
String s = "a b 1.5 ccc 4,7 d. so o et, qu.";
String s = "a b 1.5 ccc 4,7 d. so o et, qu. 4.7Ohm 2.54inch.";
SentenceReader sr = new SentenceReader(s);
for (StringBuilder a: sr) System.out.println(a);
sr = new SentenceReader(s);
Expand Down
14 changes: 14 additions & 0 deletions source/net/yacy/document/WordTokenizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,15 @@ private StringBuilder nextElement0() {
continue; // Continue to the next character without further checks.
}

// Transition from digit (or digit separator) to a letter. Save the number as a token and start a new token for the word.
if (wasDigitSep && Character.isLetter(c)) {
if (sb.length() > 0) {
this.s.add(sb);
sb = new StringBuilder(20);
}
wasDigitSep = false;
}

// Check if the current character is a punctuation.
// Punctuation checks are prioritized over invisibles due to simplicity and speed.
if (SentenceReader.punctuation(c)) { // punctuation check is simple/quick, do it before invisible
Expand Down Expand Up @@ -189,6 +198,11 @@ else if (SentenceReader.invisible(c)) { // ! currently punctuation again checked
// If the character is not punctuation or invisible, add it to the current token.
else {
sb = sb.append(c);
// Check for transition from number to word, e.g., "4.7Ohm"
if (i < r.length() - 1 && Character.isDigit(c) && Character.isLetter(r.charAt(i + 1))) {
this.s.add(sb);
sb = new StringBuilder(20); // Start capturing the word as a new token.
}
}
}

Expand Down

0 comments on commit 5db97a8

Please sign in to comment.