forked from apache/tika
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
TIKA-1960 -- put legacy language detection code back in so that we do…
…n't break the API without some warning. Deprecate it thoroughly.
- Loading branch information
Showing
51 changed files
with
31,740 additions
and
0 deletions.
There are no files selected for viewing
241 changes: 241 additions & 0 deletions
241
tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,241 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.tika.language; | ||
|
||
import static java.nio.charset.StandardCharsets.UTF_8; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.io.InputStreamReader; | ||
import java.util.HashMap; | ||
import java.util.Map; | ||
import java.util.Properties; | ||
import java.util.Set; | ||
|
||
/** | ||
* Identifier of the language that best matches a given content profile. | ||
* The content profile is compared to generic language profiles based on | ||
* material from various sources. | ||
* @since Apache Tika 0.5 | ||
* @see <a href="http://www.iccs.inf.ed.ac.uk/~pkoehn/publications/europarl/"> | ||
* Europarl: A Parallel Corpus for Statistical Machine Translation</a> | ||
* @see <a href="http://www.loc.gov/standards/iso639-2/php/code_list.php"> | ||
* ISO 639 Language Codes</a> | ||
* @deprecated use a concrete class of {@link org.apache.tika.language.detect.LanguageDetector} | ||
*/ | ||
@Deprecated | ||
public class LanguageIdentifier { | ||
|
||
/** | ||
* The available language profiles. | ||
*/ | ||
private static final Map<String, LanguageProfile> PROFILES = | ||
new HashMap<String, LanguageProfile>(); | ||
private static final String PROFILE_SUFFIX = ".ngp"; | ||
|
||
private static Properties props = new Properties(); | ||
private static String errors = ""; | ||
|
||
private static final String PROPERTIES_OVERRIDE_FILE = "tika.language.override.properties"; | ||
private static final String PROPERTIES_FILE = "tika.language.properties"; | ||
private static final String LANGUAGES_KEY = "languages"; | ||
private static final double CERTAINTY_LIMIT = 0.022; | ||
|
||
private final String language; | ||
|
||
private final double distance; | ||
|
||
/* | ||
* Always attempt initializing language profiles when class is loaded first time | ||
*/ | ||
static { | ||
initProfiles(); | ||
} | ||
|
||
/* | ||
* Add one language profile based on config in property file | ||
*/ | ||
private static void addProfile(String language) throws Exception { | ||
try { | ||
LanguageProfile profile = new LanguageProfile(); | ||
|
||
try (InputStream stream = | ||
LanguageIdentifier.class.getResourceAsStream( | ||
language + PROFILE_SUFFIX)) { | ||
BufferedReader reader = | ||
new BufferedReader(new InputStreamReader(stream, UTF_8)); | ||
String line = reader.readLine(); | ||
while (line != null) { | ||
if (line.length() > 0 && !line.startsWith("#")) { | ||
int space = line.indexOf(' '); | ||
profile.add( | ||
line.substring(0, space), | ||
Long.parseLong(line.substring(space + 1))); | ||
} | ||
line = reader.readLine(); | ||
} | ||
} | ||
|
||
addProfile(language, profile); | ||
} catch (Throwable t) { | ||
throw new Exception("Failed trying to load language profile for language \""+language+"\". Error: "+t.getMessage()); | ||
} | ||
} | ||
|
||
/** | ||
* Adds a single language profile | ||
* @param language an ISO 639 code representing language | ||
* @param profile the language profile | ||
*/ | ||
public static void addProfile(String language, LanguageProfile profile) { | ||
PROFILES.put(language, profile); | ||
} | ||
|
||
/** | ||
* Constructs a language identifier based on a LanguageProfile | ||
* @param profile the language profile | ||
*/ | ||
public LanguageIdentifier(LanguageProfile profile) { | ||
String minLanguage = "unknown"; | ||
double minDistance = 1.0; | ||
for (Map.Entry<String, LanguageProfile> entry : PROFILES.entrySet()) { | ||
double distance = profile.distance(entry.getValue()); | ||
if (distance < minDistance) { | ||
minDistance = distance; | ||
minLanguage = entry.getKey(); | ||
} | ||
} | ||
|
||
this.language = minLanguage; | ||
this.distance = minDistance; | ||
} | ||
|
||
/** | ||
* Constructs a language identifier based on a String of text content | ||
* @param content the text | ||
*/ | ||
public LanguageIdentifier(String content) { | ||
this(new LanguageProfile(content)); | ||
} | ||
|
||
/** | ||
* Gets the identified language | ||
* @return an ISO 639 code representing the detected language | ||
*/ | ||
public String getLanguage() { | ||
return language; | ||
} | ||
|
||
/** | ||
* Tries to judge whether the identification is certain enough | ||
* to be trusted. | ||
* WARNING: Will never return true for small amount of input texts. | ||
* @return <code>true</code> if the distance is smaller then {@value #CERTAINTY_LIMIT}, <code>false</code> otherwise | ||
*/ | ||
public boolean isReasonablyCertain() { | ||
return distance < CERTAINTY_LIMIT; | ||
} | ||
|
||
/** | ||
* Builds the language profiles. | ||
* The list of languages are fetched from a property file named "tika.language.properties" | ||
* If a file called "tika.language.override.properties" is found on classpath, this is used instead | ||
* The property file contains a key "languages" with values being comma-separated language codes | ||
*/ | ||
public static void initProfiles() { | ||
clearProfiles(); | ||
|
||
errors = ""; | ||
InputStream stream; | ||
stream = LanguageIdentifier.class.getResourceAsStream(PROPERTIES_OVERRIDE_FILE); | ||
if(stream == null) { | ||
stream = LanguageIdentifier.class.getResourceAsStream(PROPERTIES_FILE); | ||
} | ||
|
||
if(stream != null){ | ||
try { | ||
props = new Properties(); | ||
props.load(stream); | ||
} catch (IOException e) { | ||
errors += "IOException while trying to load property file. Message: " + e.getMessage() + "\n"; | ||
} | ||
} | ||
|
||
String[] languages = props.getProperty(LANGUAGES_KEY).split(","); | ||
for(String language : languages) { | ||
language = language.trim(); | ||
String name = props.getProperty("name."+language, "Unknown"); | ||
try { | ||
addProfile(language); | ||
} catch (Exception e) { | ||
errors += "Language " + language + " (" + name + ") not initialized. Message: " + e.getMessage() + "\n"; | ||
} | ||
} | ||
} | ||
|
||
/** | ||
* Initializes the language profiles from a user supplied initialized Map. | ||
* This overrides the default set of profiles initialized at startup, | ||
* and provides an alternative to configuring profiles through property file | ||
* | ||
* @param profilesMap map of language profiles | ||
*/ | ||
public static void initProfiles(Map<String, LanguageProfile> profilesMap) { | ||
clearProfiles(); | ||
for(Map.Entry<String, LanguageProfile> entry : profilesMap.entrySet()) { | ||
addProfile(entry.getKey(), entry.getValue()); | ||
} | ||
} | ||
|
||
/** | ||
* Clears the current map of language profiles | ||
*/ | ||
public static void clearProfiles() { | ||
PROFILES.clear(); | ||
} | ||
|
||
/** | ||
* Tests whether there were errors initializing language config | ||
* @return true if there are errors. Use getErrors() to retrieve. | ||
*/ | ||
public static boolean hasErrors() { | ||
return errors != ""; | ||
} | ||
|
||
/** | ||
* Returns a string of error messages related to initializing langauge profiles | ||
* @return the String containing the error messages | ||
*/ | ||
public static String getErrors() { | ||
return errors; | ||
} | ||
|
||
/** | ||
* Returns what languages are supported for language identification | ||
* @return A set of Strings being the ISO 639 language codes | ||
*/ | ||
public static Set<String> getSupportedLanguages() { | ||
return PROFILES.keySet(); | ||
} | ||
|
||
@Override | ||
public String toString() { | ||
return language + " (" + distance + ")"; | ||
} | ||
|
||
} |
Oops, something went wrong.