diff --git a/lbjava-examples/pom.xml b/lbjava-examples/pom.xml index 804e89c8..a3f485f6 100755 --- a/lbjava-examples/pom.xml +++ b/lbjava-examples/pom.xml @@ -3,7 +3,7 @@ lbjava-project edu.illinois.cs.cogcomp - 1.3.1 + 1.3.3 4.0.0 @@ -27,12 +27,12 @@ edu.illinois.cs.cogcomp LBJava - 1.3.1 + 1.3.2 edu.illinois.cs.cogcomp lbjava-maven-plugin - 1.3.1 + 1.3.2 @@ -63,7 +63,7 @@ edu.illinois.cs.cogcomp lbjava-maven-plugin - 1.3.1 + 1.3.2 ${project.basedir}/src/main/java ${project.basedir}/target/classes diff --git a/lbjava-mvn-plugin/pom.xml b/lbjava-mvn-plugin/pom.xml index 71bfa199..339b6ab8 100644 --- a/lbjava-mvn-plugin/pom.xml +++ b/lbjava-mvn-plugin/pom.xml @@ -5,7 +5,7 @@ lbjava-project edu.illinois.cs.cogcomp - 1.3.1 + 1.3.3 lbjava-maven-plugin @@ -76,7 +76,7 @@ edu.illinois.cs.cogcomp LBJava - 1.3.1 + 1.3.2 jar compile diff --git a/lbjava/pom.xml b/lbjava/pom.xml index 13a58757..b6162f54 100644 --- a/lbjava/pom.xml +++ b/lbjava/pom.xml @@ -3,7 +3,7 @@ lbjava-project edu.illinois.cs.cogcomp - 1.3.1 + 1.3.3 4.0.0 diff --git a/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/classify/DiscretePrimitiveStringFeature.java b/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/classify/DiscretePrimitiveStringFeature.java index 792d5032..0e1506ad 100644 --- a/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/classify/DiscretePrimitiveStringFeature.java +++ b/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/classify/DiscretePrimitiveStringFeature.java @@ -221,7 +221,7 @@ public Feature encode(String e) { * @return The hash code of this feature. **/ public int hashCode() { - return 31 * super.hashCode() + 17 * identifier.hashCode() + value.hashCode(); + return super.hashCode() + 17 * identifier.hashCode() + value.hashCode(); } @@ -237,8 +237,7 @@ public boolean equals(Object o) { return false; if (o instanceof DiscretePrimitiveStringFeature) { DiscretePrimitiveStringFeature f = (DiscretePrimitiveStringFeature) o; - return identifier.equals(f.identifier) && valueIndex > -1 ? valueIndex == f.valueIndex - : value.equals(f.value); + return identifier.equals(f.identifier) && value.equals(f.value); } DiscretePrimitiveFeature f = (DiscretePrimitiveFeature) o; diff --git a/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/learn/Lexicon.java b/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/learn/Lexicon.java index bb55b6a7..4f753473 100644 --- a/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/learn/Lexicon.java +++ b/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/learn/Lexicon.java @@ -22,6 +22,7 @@ import edu.illinois.cs.cogcomp.lbjava.util.ClassUtils; import edu.illinois.cs.cogcomp.lbjava.util.FVector; import edu.illinois.cs.cogcomp.lbjava.util.TableFormat; +import gnu.trove.map.hash.THashMap; /** @@ -132,7 +133,7 @@ public static Lexicon readLexicon(ExceptionlessInputStream in, boolean readCount // Member variables. /** The map of features to integer keys. */ - protected Map lexicon; + protected Map lexicon; /** The inverted map of integer keys to their features. */ protected FVector lexiconInv; /** The encoding to use for new features added to this lexicon. */ @@ -182,7 +183,7 @@ public Lexicon(String e) { /** Clears the data structures associated with this instance. */ public void clear() { - lexicon = new HashMap(); + lexicon = new THashMap(); lexiconInv = new FVector(); lexiconChildren = null; pruneCutoff = -1; @@ -709,7 +710,7 @@ public Object clone() { } if (lexicon != null) { - clone.lexicon = new HashMap(); + clone.lexicon = new THashMap(); clone.lexicon.putAll(lexicon); } clone.lexiconInv = (FVector) lexiconInv.clone(); diff --git a/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/learn/SparseNetworkLearner.java b/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/learn/SparseNetworkLearner.java index 822fc1fd..d89bd59f 100644 --- a/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/learn/SparseNetworkLearner.java +++ b/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/learn/SparseNetworkLearner.java @@ -8,9 +8,9 @@ package edu.illinois.cs.cogcomp.lbjava.learn; import java.io.PrintStream; +import java.util.ArrayList; import java.util.Collection; import java.util.Iterator; -import java.util.Map.Entry; import edu.illinois.cs.cogcomp.core.datastructures.vectors.ExceptionlessInputStream; import edu.illinois.cs.cogcomp.core.datastructures.vectors.ExceptionlessOutputStream; @@ -688,6 +688,44 @@ public void read(ExceptionlessInputStream in) { for (int i = 0; i < N; ++i) network.add(Learner.readLearner(in)); } + + /** + * This method will discard learners not associated with the provided labels. For labels that are + * not needed at runtime, this would improve performance as well as memory footprint. For example, + * imagine you have a 4 class model, PER, ORG, LOC and OTHER, but you could care less about OTHER. + * In this case, you could eliminate that label and improve the performance of the model proportionally. + *

+ * Use of this feature may cause terms previously classified by a discarded classifier to be labeled + * as one of the remaining classes. + *

+ * @param keepers A list of the only labels to keep. + */ + public void pruneUnusedLabels(ArrayList keepers) { + int N = network.size(); + for (int i = 0; i < N; ++i) { + LinearThresholdUnit ltu = (LinearThresholdUnit) network.get(i); + if (ltu == null) + continue; + + // get the label and determine if it should be pruned. + String label = labelLexicon.lookupKey(i).getStringValue(); + if (label.length() > 2) { + // Take off the B-, I-, L- or U- + label = label.substring(2); + boolean keepit = false; + for (String checkme : keepers) { + if (label.equals(checkme)) { + keepit = true; + break; + } + } + if (!keepit) + network.set(i, null); + } else { + // keep other("O"), this is like a non-label to begin with. + } + } + } /** Returns a deep clone of this learning algorithm. */ public Object clone() { diff --git a/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/parse/ArrayFileParser.java b/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/parse/ArrayFileParser.java index cbae0aa4..ea163d86 100644 --- a/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/parse/ArrayFileParser.java +++ b/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/parse/ArrayFileParser.java @@ -36,6 +36,8 @@ public class ArrayFileParser implements Parser { /** Reader for file currently being parsed. */ protected DataInputStream in; + /** the zip file must also be closed, if this is compressed file. */ + protected ZipFile zipFile=null; /** The name of the file to parse. */ protected String exampleFileName; /** A single array from which all examples can be parsed. */ @@ -190,13 +192,11 @@ public void reset() { try { if (exampleFileName != null) { if (zipped) { - ZipFile zip = new ZipFile(exampleFileName); - in = - new DataInputStream(new BufferedInputStream(zip.getInputStream(zip + zipFile = new ZipFile(exampleFileName); + in = new DataInputStream(new BufferedInputStream(zipFile.getInputStream(zipFile .getEntry(ExceptionlessInputStream.zipEntryName)))); } else - in = - new DataInputStream(new BufferedInputStream(new FileInputStream( + in = new DataInputStream(new BufferedInputStream(new FileInputStream( exampleFileName))); } else if (zipped) { ZipInputStream zip = new ZipInputStream(new ByteArrayInputStream(exampleData)); @@ -218,6 +218,9 @@ public void close() { return; try { in.close(); + if (zipFile != null) { + zipFile.close(); + } } catch (Exception e) { System.err.println("Can't close '" + exampleFileName + "':"); e.printStackTrace(); diff --git a/lbjava/src/test/java/edu/illinois/cs/cogcomp/lbjava/SparseNetworkLearningPruneTest.java b/lbjava/src/test/java/edu/illinois/cs/cogcomp/lbjava/SparseNetworkLearningPruneTest.java index e5202d51..e6e4aaf1 100644 --- a/lbjava/src/test/java/edu/illinois/cs/cogcomp/lbjava/SparseNetworkLearningPruneTest.java +++ b/lbjava/src/test/java/edu/illinois/cs/cogcomp/lbjava/SparseNetworkLearningPruneTest.java @@ -16,5 +16,4 @@ public class SparseNetworkLearningPruneTest { @Test public void test() { } - } diff --git a/pom.xml b/pom.xml index db0a0881..8fcd6b74 100644 --- a/pom.xml +++ b/pom.xml @@ -7,7 +7,7 @@ edu.illinois.cs.cogcomp lbjava-project pom - 1.3.1 + 1.3.3 lbjava