Skip to content

Commit

Permalink
enhanced the word tokenizer to recognize numbers in a proper way
Browse files Browse the repository at this point in the history
  • Loading branch information
Orbiter committed Sep 1, 2023
1 parent 88cd17e commit e3797de
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 14 deletions.
24 changes: 21 additions & 3 deletions source/net/yacy/document/SentenceReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.SortedMap;
import java.util.TreeMap;

import net.yacy.cora.order.Base64Order;
import net.yacy.kelondro.data.word.Word;

/**
* Read sentences from a given text.
Expand Down Expand Up @@ -129,6 +134,10 @@ public final static boolean punctuation(final char c) {
return c == '.' || c == '!' || c == '?';
}

/**
 * Tests whether a character is one of the separators that may appear
 * inside a written number, such as the '.' in "1.5" or the ',' in "4,7".
 * This method only classifies the single character; it does not look at
 * the neighbouring characters.
 *
 * @param c the character to test
 * @return {@code true} if {@code c} is '.' or ',', {@code false} otherwise
 */
public final static boolean digitsep(final char c) {
    switch (c) {
        case '.':
        case ',':
            return true;
        default:
            return false;
    }
}

@Override
public boolean hasNext() {
return this.buffer != null;
Expand Down Expand Up @@ -169,10 +178,19 @@ public synchronized void close() {
}

public static void main(String[] args) {
String s = "a b ccc d";
String s = "a b 1.5 ccc 4,7 d. so o et, qu.";
SentenceReader sr = new SentenceReader(s);
for (StringBuilder a: sr) {
System.out.println(a);
for (StringBuilder a: sr) System.out.println(a);
sr = new SentenceReader(s);

WordTokenizer words = new WordTokenizer(sr, null);
try {
while (words.hasMoreElements()) {
System.out.println(words.nextElement().toString());
}
} finally {
words.close();
words = null;
}
}
}
83 changes: 72 additions & 11 deletions source/net/yacy/document/WordTokenizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -83,25 +83,52 @@ public synchronized void close() {
this.buffer = null;
}

/**
* Enumeration implementation for unsieved words.
* This class provides an enumeration of words (in the form of StringBuilders) that haven't been sieved or filtered.
*/
private class unsievedWordsEnum implements Enumeration<StringBuilder> {
// returns an enumeration of StringBuilder Objects
private StringBuilder buffer = null;
// Buffer to hold the next element in the enumeration.
private StringBuilder buffer;

// Sentence reader instance to read sentences.
private SentenceReader sr;

// List to hold tokenized words from the sentence.
private List<StringBuilder> s;

// Index to traverse the tokenized words list.
private int sIndex;

/**
* Constructor initializes the enumeration with a SentenceReader.
*
* @param sr0 The SentenceReader instance.
*/
public unsievedWordsEnum(final SentenceReader sr0) {
assert sr0 != null;
this.sr = sr0;
this.s = new ArrayList<StringBuilder>();
this.sIndex = 0;

// Populate the buffer with the first word.
this.buffer = nextElement0();
}

/**
* Pre-process method of the SentenceReader.
*
* @param x The boolean value for pre-processing.
*/
public void pre(final boolean x) {
this.sr.pre(x);
}

/**
* Utility method to fetch the next unsieved word.
*
* @return The next word, or null if no more words are available.
*/
private StringBuilder nextElement0() {
StringBuilder r;
StringBuilder sb;
Expand All @@ -112,26 +139,60 @@ private StringBuilder nextElement0() {
}
while (this.s.isEmpty()) {
if (!this.sr.hasNext()) return null;
r = this.sr.next(); // read next sentence (incl. ending punctuation)

// Read the next sentence, including ending punctuation.
r = this.sr.next();
if (r == null) return null;
r = trim(r);
sb = new StringBuilder(20);

// Tokenize the sentence into words and punctuation marks.
sb = new StringBuilder(20); // Initialize StringBuilder to capture tokens (words or punctuation) from the sentence.

// A variable to track whether the previous character was a digit separator within a number.
boolean wasDigitSep = false;

// Iterate through each character in the sentence to tokenize it.
for (int i = 0; i < r.length(); i++) { // tokenize one sentence
c = r.charAt(i);

// Check if the current character is a digit separator within a number.
if (SentenceReader.digitsep(c) && i > 0 && Character.isDigit(r.charAt(i - 1)) && (i < r.length() - 1) && Character.isDigit(r.charAt(i + 1))) {
sb.append(c); // Add the digit separator to the current token.
wasDigitSep = true; // Set the flag to true.
continue; // Continue to the next character without further checks.
}

// Check if the current character is a punctuation.
// Punctuation checks are prioritized over invisibles due to simplicity and speed.
if (SentenceReader.punctuation(c)) { // punctuation check is simple/quick, do it before invisible
// If the current token (sb) has content, add it to the list of tokens.
if (sb.length() > 0 && !wasDigitSep) {
this.s.add(sb);
sb = new StringBuilder(1); // Prepare to capture the punctuation.
}
sb.append(c); // Add the punctuation to the token.
this.s.add(sb); // Add the punctuation token to the list.
sb = new StringBuilder(20); // Reset token builder for the next token.
wasDigitSep = false; // Reset the digit separator flag.
}

// Check if the current character is invisible.
// Note: this check currently overlaps with the punctuation check above, because invisible() tests for punctuation again.
else if (SentenceReader.invisible(c)) { // ! currently punctuation again checked by invisible()
// If the current token (sb) has content, add it to the list and reset the token builder.
if (sb.length() > 0) {
this.s.add(sb);
sb = new StringBuilder(1);
sb = new StringBuilder(20);
}
sb.append(c);
this.s.add(sb);
sb = new StringBuilder(20);
} else if (SentenceReader.invisible(c)) { // ! currently punctuation again checked by invisible()
if (sb.length() > 0) {this.s.add(sb); sb = new StringBuilder(20);}
} else {
wasDigitSep = false; // Reset the digit separator flag.
}
// If the character is not punctuation or invisible, add it to the current token.
else {
sb = sb.append(c);
}
}

// If there's any content left in the token builder after processing the sentence, add it to the list.
if (sb.length() > 0) {
this.s.add(sb);
sb = null;
Expand Down

0 comments on commit e3797de

Please sign in to comment.