From 9ba827be973cddbed5e7981649b1ec51474f1027 Mon Sep 17 00:00:00 2001 From: crazytan Date: Sun, 5 Jun 2016 14:44:31 -0700 Subject: [PATCH] change HashSet to trove.THashSet --- .../pipe/TokenSequenceRemoveStopwords.java | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/cc/mallet/pipe/TokenSequenceRemoveStopwords.java b/src/cc/mallet/pipe/TokenSequenceRemoveStopwords.java index 2abf18c34..a3ca209db 100644 --- a/src/cc/mallet/pipe/TokenSequenceRemoveStopwords.java +++ b/src/cc/mallet/pipe/TokenSequenceRemoveStopwords.java @@ -12,7 +12,6 @@ package cc.mallet.pipe; -import java.util.HashSet; import java.util.ArrayList; import java.io.*; @@ -20,6 +19,8 @@ import cc.mallet.types.Instance; import cc.mallet.types.Token; import cc.mallet.types.TokenSequence; +import gnu.trove.THashSet; + /** * Remove tokens from the token sequence in the data field whose text is in the stopword list. @author Andrew McCallum mccallum@cs.umass.edu @@ -28,13 +29,13 @@ public class TokenSequenceRemoveStopwords extends Pipe implements Serializable { // xxx Use a gnu.trove collection instead - HashSet stoplist = null; + THashSet stoplist = null; boolean caseSensitive = true; boolean markDeletions = false; - private HashSet newDefaultStopList () + private THashSet newDefaultStopList () { - HashSet sl = new HashSet(); + THashSet sl = new THashSet<>(); for (int i = 0; i < stopwords.length; i++) sl.add (stopwords[i]); return sl; @@ -67,7 +68,7 @@ public TokenSequenceRemoveStopwords () */ public TokenSequenceRemoveStopwords(File stoplistFile, String encoding, boolean includeDefault, boolean caseSensitive, boolean markDeletions) { - if (! includeDefault) { stoplist = new HashSet(); } + if (! includeDefault) { stoplist = new THashSet<>(); } else { stoplist = newDefaultStopList(); } addStopWords (fileToStringArray(stoplistFile, encoding)); @@ -145,7 +146,7 @@ private String[] fileToStringArray (File f, String encoding) } return (String[]) wordarray.toArray(new String[]{}); } - + public Instance pipe (Instance carrier) { TokenSequence ts = (TokenSequence) carrier.getData(); @@ -167,30 +168,30 @@ public Instance pipe (Instance carrier) return carrier; } - // Serialization - + // Serialization + private static final long serialVersionUID = 1; private static final int CURRENT_SERIAL_VERSION = 2; - + private void writeObject (ObjectOutputStream out) throws IOException { out.writeInt (CURRENT_SERIAL_VERSION); out.writeBoolean(caseSensitive); out.writeBoolean(markDeletions); out.writeObject(stoplist); // New as of CURRENT_SERIAL_VERSION 2 } - + private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException { int version = in.readInt (); caseSensitive = in.readBoolean(); if (version > 0) markDeletions = in.readBoolean(); if (version > 1) { - stoplist = (HashSet) in.readObject(); + stoplist = (THashSet) in.readObject(); } } - + static final String[] stopwords = { "a", @@ -727,7 +728,7 @@ private void readObject (ObjectInputStream in) throws IOException, ClassNotFound //"concludes", //"based", //"approach" - }; + }; //stopwords for french, added by Limin Yao static final String[] stopwordsFrench = { "fut",