From e3797de7de4f3446f364fa08ce00cf6fa77dc5a5 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Fri, 1 Sep 2023 20:10:08 +0200 Subject: [PATCH] enhanced the word tokenizer to recognize numbers in a proper way --- source/net/yacy/document/SentenceReader.java | 24 +++++- source/net/yacy/document/WordTokenizer.java | 83 +++++++++++++++++--- 2 files changed, 93 insertions(+), 14 deletions(-) diff --git a/source/net/yacy/document/SentenceReader.java b/source/net/yacy/document/SentenceReader.java index be8d8b5b4..e7012c68d 100644 --- a/source/net/yacy/document/SentenceReader.java +++ b/source/net/yacy/document/SentenceReader.java @@ -27,6 +27,11 @@ import java.util.ArrayList; import java.util.Iterator; import java.util.List; +import java.util.SortedMap; +import java.util.TreeMap; + +import net.yacy.cora.order.Base64Order; +import net.yacy.kelondro.data.word.Word; /** * Read sentences from a given text. @@ -129,6 +134,10 @@ public final static boolean punctuation(final char c) { return c == '.' || c == '!' || c == '?'; } + public final static boolean digitsep(final char c) { + return c == '.' || c == ','; + } + @Override public boolean hasNext() { return this.buffer != null; @@ -169,10 +178,19 @@ public synchronized void close() { } public static void main(String[] args) { - String s = "a b ccc d"; + String s = "a b 1.5 ccc 4,7 d. so o et, qu."; SentenceReader sr = new SentenceReader(s); - for (StringBuilder a: sr) { - System.out.println(a); + for (StringBuilder a: sr) System.out.println(a); + sr = new SentenceReader(s); + + WordTokenizer words = new WordTokenizer(sr, null); + try { + while (words.hasMoreElements()) { + System.out.println(words.nextElement().toString()); + } + } finally { + words.close(); + words = null; } } } diff --git a/source/net/yacy/document/WordTokenizer.java b/source/net/yacy/document/WordTokenizer.java index f2acc4a39..d5b2ea851 100644 --- a/source/net/yacy/document/WordTokenizer.java +++ b/source/net/yacy/document/WordTokenizer.java @@ -83,25 +83,52 @@ public synchronized void close() { this.buffer = null; } + /** + * Enumeration implementation for unsieved words. + * This class provides an enumeration of words (in the form of StringBuilders) that haven't been sieved or filtered. + */ private class unsievedWordsEnum implements Enumeration { - // returns an enumeration of StringBuilder Objects - private StringBuilder buffer = null; + // Buffer to hold the next element in the enumeration. + private StringBuilder buffer; + + // Sentence reader instance to read sentences. private SentenceReader sr; + + // List to hold tokenized words from the sentence. private List s; + + // Index to traverse the tokenized words list. private int sIndex; + /** + * Constructor initializes the enumeration with a SentenceReader. + * + * @param sr0 The SentenceReader instance. + */ public unsievedWordsEnum(final SentenceReader sr0) { assert sr0 != null; this.sr = sr0; this.s = new ArrayList(); this.sIndex = 0; + + // Populate the buffer with the first word. this.buffer = nextElement0(); } + /** + * Pre-process method of the SentenceReader. + * + * @param x The boolean value for pre-processing. + */ public void pre(final boolean x) { this.sr.pre(x); } + /** + * Utility method to fetch the next unsieved word. + * + * @return The next word, or null if no more words are available. + */ private StringBuilder nextElement0() { StringBuilder r; StringBuilder sb; @@ -112,26 +139,60 @@ private StringBuilder nextElement0() { } while (this.s.isEmpty()) { if (!this.sr.hasNext()) return null; - r = this.sr.next(); // read next sentence (incl. ending punctuation) + + // Read the next sentence, including ending punctuation. + r = this.sr.next(); if (r == null) return null; r = trim(r); - sb = new StringBuilder(20); + + // Tokenize the sentence into words and punctuation marks. + sb = new StringBuilder(20); // Initialize StringBuilder to capture tokens (words or punctuation) from the sentence. + + // A variable to track whether the previous character was a digit separator within a number. + boolean wasDigitSep = false; + + // Iterate through each character in the sentence to tokenize it. for (int i = 0; i < r.length(); i++) { // tokenize one sentence c = r.charAt(i); + + // Check if the current character is a digit separator within a number. + if (SentenceReader.digitsep(c) && i > 0 && Character.isDigit(r.charAt(i - 1)) && (i < r.length() - 1) && Character.isDigit(r.charAt(i + 1))) { + sb.append(c); // Add the digit separator to the current token. + wasDigitSep = true; // Set the flag to true. + continue; // Continue to the next character without further checks. + } + + // Check if the current character is a punctuation. + // Punctuation checks are prioritized over invisibles due to simplicity and speed. if (SentenceReader.punctuation(c)) { // punctuation check is simple/quick, do it before invisible + // If the current token (sb) has content, add it to the list of tokens. + if (sb.length() > 0 && !wasDigitSep) { + this.s.add(sb); + sb = new StringBuilder(1); // Prepare to capture the punctuation. + } + sb.append(c); // Add the punctuation to the token. + this.s.add(sb); // Add the punctuation token to the list. + sb = new StringBuilder(20); // Reset token builder for the next token. + wasDigitSep = false; // Reset the digit separator flag. + } + + // Check if the current character is invisible. + // Note: This check currently has overlap with punctuation check. + else if (SentenceReader.invisible(c)) { // ! currently punctuation again checked by invisible() + // If the current token (sb) has content, add it to the list and reset the token builder. if (sb.length() > 0) { this.s.add(sb); - sb = new StringBuilder(1); + sb = new StringBuilder(20); } - sb.append(c); - this.s.add(sb); - sb = new StringBuilder(20); - } else if (SentenceReader.invisible(c)) { // ! currently punctuation again checked by invisible() - if (sb.length() > 0) {this.s.add(sb); sb = new StringBuilder(20);} - } else { + wasDigitSep = false; // Reset the digit separator flag. + } + // If the character is not punctuation or invisible, add it to the current token. + else { sb = sb.append(c); } } + + // If there's any content left in the token builder after processing the sentence, add it to the list. if (sb.length() > 0) { this.s.add(sb); sb = null;