Skip to content

Commit

Permalink
feat: use java Normalizer for titleNorm #1173
Browse files Browse the repository at this point in the history
  • Loading branch information
mvanzalu committed Sep 1, 2023
1 parent a72f855 commit 571bccf
Showing 1 changed file with 8 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

import java.io.IOException;
import java.io.Serializable;
import java.text.Normalizer;
import java.util.*;

import static java.lang.System.currentTimeMillis;
Expand Down Expand Up @@ -132,7 +133,7 @@ Map<String, Object> getDocumentMap(TikaDocument document) throws IOException {
jsonDocument.put("contentLength", Long.valueOf(ofNullable(document.getMetadata().get(CONTENT_LENGTH)).orElse("-1")));
jsonDocument.put("contentEncoding", ofNullable(document.getMetadata().get(CONTENT_ENCODING)).orElse(DEFAULT_VALUE_UNKNOWN));
jsonDocument.put("title", ofNullable(getTitle(document.getMetadata())).orElse(DEFAULT_VALUE_UNKNOWN));
jsonDocument.put("titleNorm", Optional.of(getTitle(document.getMetadata()).toLowerCase()).orElse(DEFAULT_VALUE_UNKNOWN));
jsonDocument.put("titleNorm", ofNullable(normalize(getTitle(document.getMetadata()))).orElse(DEFAULT_VALUE_UNKNOWN));

String content = toString(document.getReader()).trim();
if (maxContentLength != -1 && content.length() > maxContentLength) {
Expand Down Expand Up @@ -174,6 +175,12 @@ public ElasticsearchSpewer withRefresh(WriteRequest.RefreshPolicy refreshPolicy)
return this;
}

/**
 * Produces a normalized, lower-cased form of a string.
 *
 * <p>The input is put into Unicode canonical decomposition form (NFD), which splits
 * accented characters into a base character followed by combining marks. The combining
 * marks are kept, so the result is not guaranteed to be pure ASCII. The text is then
 * lower-cased with {@link Locale#ROOT} so the outcome does not vary with the JVM's
 * default locale (for instance the Turkish dotless i).
 *
 * @param input the text to normalize; may be {@code null}
 * @return the decomposed, lower-cased text, or {@code null} when {@code input} is
 *         {@code null} so that callers can wrap the result with {@code ofNullable}
 */
public static String normalize(String input) {
    if (input == null) {
        // Normalizer.normalize rejects null with a NullPointerException; propagate the
        // absence instead so the caller's fallback value can apply.
        return null;
    }
    return Normalizer.normalize(input, Normalizer.Form.NFD).toLowerCase(Locale.ROOT);
}

/**
 * Resolves the "maxContentLength" setting as an {@code int}.
 *
 * <p>The property value is read from the given provider, falling back to the string
 * "-1" when it is absent, and parsed with {@code HumanReadableSize.parse}. The parsed
 * value is capped at {@link Integer#MAX_VALUE} before being narrowed.
 *
 * @param propertiesProvider source of the "maxContentLength" property
 * @return the parsed size, at most {@code Integer.MAX_VALUE}; -1 when the property is unset
 */
int getMaxContentLength(PropertiesProvider propertiesProvider) {
    String configuredSize = propertiesProvider.get("maxContentLength").orElse("-1");
    long parsedSize = HumanReadableSize.parse(configuredSize);
    // Cap first so that a very large configured size cannot wrap around in the int cast.
    long cappedSize = Math.min(parsedSize, Integer.MAX_VALUE);
    return (int) cappedSize;
}
Expand Down

0 comments on commit 571bccf

Please sign in to comment.