Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

change HashSet to trove.THashSet #67

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 13 additions & 13 deletions src/cc/mallet/pipe/TokenSequenceRemoveStopwords.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,15 @@
package cc.mallet.pipe;


import java.util.HashSet;
import java.util.ArrayList;
import java.io.*;

import cc.mallet.types.FeatureSequenceWithBigrams;
import cc.mallet.types.Instance;
import cc.mallet.types.Token;
import cc.mallet.types.TokenSequence;
import gnu.trove.THashSet;

/**
* Remove tokens from the token sequence in the data field whose text is in the stopword list.
@author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
Expand All @@ -28,13 +29,13 @@
public class TokenSequenceRemoveStopwords extends Pipe implements Serializable
{
// xxx Use a gnu.trove collection instead
// NOTE(review): stoplist is declared twice below (java.util.HashSet and gnu.trove.THashSet).
// This looks like the before/after pair of a single change -- confirm only one declaration
// survives. Once the THashSet declaration is the one kept, the "xxx" note above is satisfied
// and can be dropped.
HashSet<String> stoplist = null;
THashSet<String> stoplist = null;
// Both flags are persisted by writeObject/readObject; how they influence pipe() is not
// visible in this excerpt.
boolean caseSensitive = true;
boolean markDeletions = false;

private HashSet<String> newDefaultStopList ()
private THashSet<String> newDefaultStopList ()
{
HashSet<String> sl = new HashSet<String>();
THashSet<String> sl = new THashSet<>();
for (int i = 0; i < stopwords.length; i++)
sl.add (stopwords[i]);
return sl;
Expand Down Expand Up @@ -81,7 +82,7 @@ private static InputStream fileToInputStream(File file)

public TokenSequenceRemoveStopwords(InputStream stoplistStream, String encoding, boolean includeDefault,
boolean caseSensitive, boolean markDeletions) {
if (! includeDefault) { stoplist = new HashSet<String>(); }
if (! includeDefault) { stoplist = new THashSet<String>(); }
else { stoplist = newDefaultStopList(); }

try {
Expand Down Expand Up @@ -164,7 +165,6 @@ private String[] streamToStringArray(InputStream stream, String encoding) throws
return wordarray.toArray(new String[]{});
}


public Instance pipe (Instance carrier)
{
TokenSequence ts = (TokenSequence) carrier.getData();
Expand All @@ -186,30 +186,30 @@ public Instance pipe (Instance carrier)
return carrier;
}

// Serialization
// Serialization

// Explicit serialVersionUID for Java serialization of this class.
private static final long serialVersionUID = 1;
// Format version that writeObject emits as the first int of the stream; readObject compares
// against it to decide which of the later fields are present.
private static final int CURRENT_SERIAL_VERSION = 2;

/**
 * Custom serialization hook: writes the format version, then the two boolean flags,
 * then the stoplist object. readObject consumes the values in exactly this order, so
 * the order of the writes must not change.
 *
 * @param out stream the pipe's state is written to
 * @throws IOException if any of the underlying writes fails
 */
private void writeObject (ObjectOutputStream out) throws IOException {
out.writeInt (CURRENT_SERIAL_VERSION);
out.writeBoolean(caseSensitive);
out.writeBoolean(markDeletions);
out.writeObject(stoplist); // New as of CURRENT_SERIAL_VERSION 2
}

/**
 * Custom deserialization hook: restores the state written by writeObject.
 *
 * The stream starts with an int format version. caseSensitive is always present,
 * markDeletions only when the version is greater than 0, and the stoplist only when
 * the version is greater than 1 (for older versions stoplist is left untouched).
 *
 * The stoplist object is read exactly once. Streams produced before the switch to
 * Trove contain a java.util.HashSet under the same version number, so a stored set
 * that is not already a THashSet is copied into one instead of being cast, which
 * would otherwise fail with a ClassCastException.
 *
 * @param in stream to read the pipe's state from
 * @throws IOException if reading from the stream fails
 * @throws ClassNotFoundException if the class of the stored stoplist cannot be resolved
 */
@SuppressWarnings("unchecked") // the stoplist is only ever written as a set of String
private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {
    int version = in.readInt ();
    caseSensitive = in.readBoolean();
    if (version > 0)
        markDeletions = in.readBoolean();
    if (version > 1) {
        Object stored = in.readObject();
        if (stored == null) {
            // A null stoplist was serialized; keep it null, as a plain cast would.
            stoplist = null;
        } else if (stored instanceof THashSet) {
            stoplist = (THashSet<String>) stored;
        } else {
            // Legacy stream: the stoplist was saved as a java.util.HashSet (or another collection).
            stoplist = new THashSet<String>((java.util.Collection<String>) stored);
        }
    }
}


static final String[] stopwords =
{
"a",
Expand Down Expand Up @@ -746,7 +746,7 @@ private void readObject (ObjectInputStream in) throws IOException, ClassNotFound
//"concludes",
//"based",
//"approach"
};
};
//stopwords for french, added by Limin Yao
static final String[] stopwordsFrench = {
"fut",
Expand Down