
Commit

Changes preparing for release
- model as jar resource
- auto-detect input format
- greedy decoder by default
- standalone twokenize.sh script
- conll conversion script
- fake bill nye example
- other stuff
brendano committed Sep 19, 2012
1 parent a5c0957 commit f4179c4
Showing 16 changed files with 189 additions and 92 deletions.
57 changes: 13 additions & 44 deletions README.txt
@@ -1,43 +1,28 @@
CMU ARK Twitter Part-of-Speech Tagger v0.3-pre
CMU ARK Twitter Part-of-Speech Tagger v0.3
http://www.ark.cs.cmu.edu/TweetNLP/

Basic usage
===========

Requires Java 6. To run the tagger from unix shell:

./runTagger.sh example_tweets.txt modelfile > tagged_tweets.txt
./runTagger.sh examples/example_tweets.txt

Another example:
The tagger outputs tokens, predicted part-of-speech tags, and confidences.
For more information:

./runTagger.sh --input-format json barackobama.jsonlines.txt -output tagged_barackobama.txt
./runTagger.sh --help

The outputs should match tagged_tweets_expected.txt and barackobamaexpected.txt respectively.
We also include a script that invokes just the tokenizer:

./twokenize.sh examples/example_tweets.txt
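The tagger's default tweet-per-line output prepends the tokenization, tags, and confidences as tab-separated fields ahead of the original line (the "pretsv" format described in the usage text further down this diff). A minimal parsing sketch in Python; the sample line and its tags are invented for illustration, not actual tagger output:

```python
# Parse one line of "pretsv" tagger output, assumed layout:
#   Tokenization \t POSTags \t Confidences \t (original data...)
# The sample line below is made up for illustration.
line = "ikr smh\t! G\t0.9059 0.8343\tikr smh he asked fir yo last name"

tokens_s, tags_s, confs_s, original = line.split("\t", 3)
tokens = tokens_s.split()              # parallel space-separated lists
tags = tags_s.split()
confs = [float(c) for c in confs_s.split()]

pairs = list(zip(tokens, tags))
print(pairs)  # [('ikr', '!'), ('smh', 'G')]
```

Splitting with `split("\t", 3)` keeps any tabs in the trailing original-data field intact, which matters when metadata is passed through.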

Information
===========

Advanced usage
--------------

We include a pre-compiled .jar of the tagger so you hopefully don't need to
compile it. But if you need to recompile, do:
mvn install
NOTE: requires Maven 3.0.3+

To train and evaluate the tagger, see:
ark-tweet-nlp/src/main/java/edu/cmu/cs/lti/ark/ssl/pos/SemiSupervisedPOSTagger.java
scripts/train.sh and scripts/test.sh

Contents
--------
* runTagger.sh is the script you probably want
* lib/ dependencies
* ark-tweet-nlp/src the project code itself (all java)
Version 0.3 of the tagger is 40 times faster and more accurate than version 0.2. Please see the tech report on the website for details.

Information
-----------
This tagger is described in the following paper. Please cite it if you write a
research paper using this software.
This tagger is described in the following two papers, available at the website. Please cite them if you write a research paper using this software.

Part-of-Speech Tagging for Twitter: Annotation, Features, and Experiments
Kevin Gimpel, Nathan Schneider, Brendan O'Connor, Dipanjan Das, Daniel Mills,
@@ -46,23 +31,7 @@ research paper using this software.
Linguistics, companion volume, Portland, OR, June 2011.
http://www.ark.cs.cmu.edu/TweetNLP/gimpel+etal.acl11.pdf

The software is licensed under Apache 2.0 (see LICENSE file).

Version 0.2 of the tagger differs from version 0.1 in the following ways:

* The tokenizer has been improved and integrated with the tagger in a single Java program.

* The new tokenizer was run on the 1,827 tweets used for the annotation effort and the
annotations were adapted for tweets with differing tokenizations. The revised annotations
are contained in a companion v0.2 release of the data (twpos-data-v0.2).

* The tagging model is trained on ALL of the available annotated data in twpos-data-v0.2.
The model in v0.1 was only trained on the training set.

* The tokenizer/tagger is integrated with Twitter's text commons annotations API.

Contact
-------
Please contact Brendan O'Connor ([email protected]) and Kevin Gimpel ([email protected])
if you encounter any problems.
=======

Please contact Brendan O'Connor ([email protected]) and Kevin Gimpel ([email protected]) if you encounter any problems.
2 changes: 1 addition & 1 deletion ark-tweet-nlp/pom.xml
@@ -4,7 +4,7 @@
<groupId>edu.cmu.cs</groupId>
<artifactId>ark-tweet-nlp</artifactId>
<packaging>jar</packaging>
<version>0.3-SNAPSHOT</version>
<version>0.3</version>
<name>CMU ARK TweetNLP: Twitter POS tagger</name>
<url>http://www.ark.cs.cmu.edu/TweetNLP/</url>
<properties>
16 changes: 16 additions & 0 deletions examples/bill_nye_tho.txt
@@ -0,0 +1,16 @@
247120554400821248 2012-09-15T23:51:46 Bill_Nye_tho all out of wood facts
247120392324542464 2012-09-15T23:51:07 Bill_Nye_tho u can build a house w/ it
247119965784784896 2012-09-15T23:49:25 Bill_Nye_tho more wood facts still to come
247119210113802240 2012-09-15T23:46:25 Bill_Nye_tho its biodegradable #woodfacts
247118527113355264 2012-09-15T23:43:42 Bill_Nye_tho u could burn it #woodfacts
247117483482431488 2012-09-15T23:39:33 Bill_Nye_tho SHOUT OUT TO MY NIGGAS EATIN HUMUS
247114115762499584 2012-09-15T23:26:11 Bill_Nye_tho if u want me to give a lecture at ur school contact ur student board or w/e. or contact me and i'll just come i don't give a fuck lol
247113014011109378 2012-09-15T23:21:48 Bill_Nye_tho u ever been havin the illest dream ever n u wake up right as its gettin real good and ur like damn i wasnt done smashin Jane Goodall's shit
247089985625395202 2012-09-15T21:50:17 Bill_Nye_tho sometimes i'll freeze water then melt it then freeze it again an i just keep doing that until it stops being awesome but it never does
246819478107746304 2012-09-15T03:55:23 Bill_Nye_tho YO I CANT THINK OF ANYTHING THAT GETS ME MORE HEATED THAN ARTIFICIAL PLANTS
246815764902981632 2012-09-15T03:40:38 Bill_Nye_tho look at u lookn all cute over there girl come here a min lemme holla atcha whats ur bigges fantasy mine is to visit Triton,Neptunes 7th moon
246811004770590721 2012-09-15T03:21:43 Bill_Nye_tho its cool that chameleons can blend in with their environment but at a certain points it's like just do u homie!!!
246806113645907969 2012-09-15T03:02:17 Bill_Nye_tho @Wendys what up Wendy's on average one fully grown bovine can produce about 2400 hamburger patties. and u can fact check that shit homeboyyy
246590149234925569 2012-09-14T12:44:07 Bill_Nye_tho SHOUTS OUT TO PEOPLE WILLINGLY LIVIN IN TOWNS RIGHT NEXT TO ACTIVE VOLCANOES LIKE "NAH WE'RE GOOD"
246589808217051138 2012-09-14T12:42:46 Bill_Nye_tho a lotta people refer to this as a novelty account i dont see whats so novel about science but whatever p.s. lava can flow up to 10km perhour
246348019048542208 2012-09-13T20:41:59 Bill_Nye_tho Jane Goodall is a bad bitch
8 changes: 3 additions & 5 deletions runTagger.sh
@@ -1,7 +1,5 @@
#!/bin/bash
set -eu

# For development
# java -Xmx1g -jar $(dirname $0)/ark-tweet-nlp/target/bin/ark-tweet-nlp-0.3-SNAPSHOT.jar "$@"

# For release
java -Xmx1g -jar $(dirname $0)/ark-tweet-nlp-0.3.jar "$@"
# Run the tagger (and tokenizer).
java -Xmx500m -jar $(dirname $0)/ark-tweet-nlp-0.3.jar "$@"
23 changes: 23 additions & 0 deletions scripts/prepare_release.sh
@@ -0,0 +1,23 @@
#!/bin/bash

VERSION=0.3
DIR=ark-tweet-nlp-$VERSION

set -eux

rm -rf $DIR
mkdir $DIR

# mvn clean
# mvn package
cp ark-tweet-nlp/target/bin/ark-tweet-nlp-${VERSION}.jar $DIR

cp -r examples $DIR
cp -r scripts $DIR
rm $DIR/scripts/prepare_release.sh
rm $DIR/scripts/java.sh
cp *.sh $DIR
cp *.txt $DIR

# these don't work, need to fix
rm $DIR/examples/barackobama*
20 changes: 20 additions & 0 deletions scripts/toconll.py
@@ -0,0 +1,20 @@
#!/usr/bin/env python
# Take the pretsv format and make it CoNLL-like ("supertsv", having tweet metadata headers)
import sys,json
from datetime import datetime

for line in sys.stdin:
    parts = line.split('\t')
    tokens = parts[0].split()
    tags = parts[1].split()
    try:
        d = json.loads(parts[-1])
        print "TWEET\t{}\t{}".format(d['id'], datetime.strptime(d['created_at'], '%a %b %d %H:%M:%S +0000 %Y').strftime("%Y-%m-%dT%H:%M:%S"))
        print "TOKENS"
    except:
        pass

    for tok,tag in zip(tokens,tags):
        print "{}\t{}".format(tag,tok)
    print ""
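The script above is Python 2. The same transformation rendered as a Python 3 sketch, for reference (the function name is illustrative, and the `created_at` format follows the Twitter API timestamps the script assumes):

```python
import json
from datetime import datetime

def to_conll(line):
    """Convert one pretsv tagger-output line into CoNLL-like lines
    ("supertsv": TWEET metadata header, then tag \t token rows)."""
    out = []
    parts = line.rstrip("\n").split("\t")
    tokens = parts[0].split()
    tags = parts[1].split()
    try:
        d = json.loads(parts[-1])
        ts = datetime.strptime(d["created_at"], "%a %b %d %H:%M:%S +0000 %Y")
        out.append("TWEET\t{}\t{}".format(d["id"], ts.strftime("%Y-%m-%dT%H:%M:%S")))
        out.append("TOKENS")
    except (ValueError, KeyError):
        # Last field wasn't a JSON tweet object; skip the metadata header.
        pass
    for tok, tag in zip(tokens, tags):
        out.append("{}\t{}".format(tag, tok))
    out.append("")  # blank line separates tweets
    return out
```

Note the original script prints the tag before the token; the sketch preserves that ordering.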

78 changes: 49 additions & 29 deletions src/cmu/arktweetnlp/RunTagger.java
@@ -28,18 +28,19 @@ public class RunTagger {
Tagger tagger;

// Commandline I/O-ish options
String inputFormat = "json";
String inputFormat = "auto";
String outputFormat = "auto";
int inputField = 1;

String inputFilename;
String modelFilename;
/** Can be either filename or resource name **/
String modelFilename = "/cmu/arktweetnlp/model.20120919";

public boolean noOutput = false;
public boolean justTokenize = false;

public static enum Decoder { GREEDY, VITERBI };
public Decoder decoder = Decoder.VITERBI;
public Decoder decoder = Decoder.GREEDY;
public boolean showConfidence = true;

PrintStream outputStream;
@@ -64,6 +65,17 @@ public RunTagger() throws UnsupportedEncodingException {
// force UTF-8 here, so don't need -Dfile.encoding
this.outputStream = new PrintStream(System.out, true, "UTF-8");
}
	public void detectAndSetInputFormat(String tweetData) throws IOException {
		JsonTweetReader jsonTweetReader = new JsonTweetReader();
		if (jsonTweetReader.isJson(tweetData)) {
			System.err.println("Detected JSON input format");
			inputFormat = "json";
		} else {
			System.err.println("Detected text input format");
			inputFormat = "text";
		}
	}
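`JsonTweetReader.isJson` itself is not shown in this diff; a comparable heuristic sketched in Python (the try-to-parse approach is an assumption about how the detection might work, not the actual JsonTweetReader logic):

```python
import json

def detect_input_format(first_line):
    """Guess 'json' vs 'text' from the first tweet field, mirroring the
    auto-detection above. Hypothetical heuristic: the line counts as JSON
    only if it parses to a JSON object."""
    try:
        parsed = json.loads(first_line)
        return "json" if isinstance(parsed, dict) else "text"
    except ValueError:
        return "text"
```

As in the Java code, the check runs once, on the first input line, and fixes the format for the rest of the stream.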

public void runTagger() throws IOException, ClassNotFoundException {

tagger = new Tagger();
@@ -86,9 +98,20 @@ public void runTagger() throws IOException, ClassNotFoundException {
while ( (line = reader.readLine()) != null) {
String[] parts = line.split("\t");
String tweetData = parts[inputField-1];

if (reader.getLineNumber()==1) {
if (inputFormat.equals("auto")) {
detectAndSetInputFormat(tweetData);
}
}

String text;
if (inputFormat.equals("json")) {
text = jsonTweetReader.getText(tweetData);
text = jsonTweetReader.getText(tweetData);
if (text==null) {
System.err.println("Warning, null text (JSON parse error?), using blank string instead");
text = "";
}
} else {
text = tweetData;
}
@@ -107,7 +130,7 @@ public void runTagger() throws IOException, ClassNotFoundException {
if (outputFormat.equals("conll")) {
outputJustTagging(sentence, modelSentence);
} else {
outputPrependedTagging(sentence, modelSentence, justTokenize, tweetData);
outputPrependedTagging(sentence, modelSentence, justTokenize, line);
}
numtoks += sentence.T();
}
@@ -209,7 +232,7 @@ public void evaluateSentenceTagging(Sentence lSent, ModelSentence mSent) {

private String formatConfidence(double confidence) {
// too many decimal places wastes space
return String.format("%.3f", confidence);
return String.format("%.4f", confidence);
}

/**
@@ -329,24 +352,30 @@ public static void main(String[] args) throws IOException, ClassNotFoundExceptio
if (!tagger.justTokenize && tagger.modelFilename == null) {
usage("Need to specify model");
}

tagger.finalizeOutputFormat();

if (args.length - i > 1) usage();
if (args.length <= i) usage();
tagger.inputFilename = args[i];
tagger.finalizeOptions();

tagger.runTagger();

tagger.runTagger();
}

public void finalizeOutputFormat() {
public void finalizeOptions() throws IOException {
if (outputFormat.equals("auto")) {
if (inputFormat.equals("conll")) {
outputFormat = "conll";
} else {
outputFormat = "pretsv";
}
}
if (showConfidence && decoder==Decoder.VITERBI) {
System.err.println("Confidence output is unimplemented in Viterbi, turning it off.");
showConfidence = false;
}
if (justTokenize) {
showConfidence = false;
}
}
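The defaulting rules in finalizeOptions reduce to a small decision table, sketched here in Python (argument names and string values are illustrative):

```python
def finalize_options(input_format, output_format, decoder,
                     show_confidence, just_tokenize):
    """Mirror of the option-finalizing rules above, as a sketch."""
    # "auto" output follows the input: conll in -> conll out, else pretsv.
    if output_format == "auto":
        output_format = "conll" if input_format == "conll" else "pretsv"
    # Confidence output is unimplemented in Viterbi, so it gets turned off.
    if show_confidence and decoder == "viterbi":
        show_confidence = False
    # Tokenize-only runs produce no tags, hence no confidences either.
    if just_tokenize:
        show_confidence = False
    return output_format, show_confidence
```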

public static void usage() {
@@ -359,11 +388,10 @@ public static void usage(String extra) {
"\n runs the CMU ARK Twitter tagger on tweets from ExamplesFilename, " +
"\n writing taggings to standard output." +
"\n\nOptions:" +
"\n --model <Filename> Specify model filename." +
"\n [TODO should this default to something?]" +
"\n --model <Filename> Specify model filename. (Else use built-in.)" +
"\n --just-tokenize Only run the tokenizer; no POS tags." +
"\n --quiet Quiet: no output" +
"\n --input-format <Format> Default: json." +
"\n --input-format <Format> Default: auto" +
"\n Options: json, text, conll" +
"\n --output-format <Format> Default: automatically decide from input format." +
"\n Options: pretsv, conll" +
@@ -373,31 +401,23 @@ public static void usage(String extra) {
"\n Only for {json, text} input formats." +
"\n --word-clusters <File> Alternate word clusters file (see FeatureExtractor)" +
"\n --no-confidence Don't output confidence probabilities" +
"\n --decoder <Decoder> Change the decoding algorithm (default: greedy)" +
"\n" +
"\nThere are two types of input-output formats: " +
"\n(1) tweet-per-line, and (2) token-per-line." +
"\nTweet-per-line input formats:" +
"\n json: Every input line has a JSON object containing the tweet," +
"\n as per the Streaming API. (The 'text' field gets used.)" +
"\n as per the Streaming API. (The 'text' field is used.)" +
"\n text: Every input line has the text for one tweet." +
"\nFor both cases, we the lines in the input are actually TSV," +
"\nand the tweets (text or json) are one particular field." +
"\nWe actually assume input lines are TSV and the tweet data is one field."+
"\n(Therefore tab characters are not allowed in tweets." +
"\nTwitter's own JSON formats guarantee this;" +
"\nif you extract the text yourself, you must remove tabs and newlines.)" +
"\nThis allows metadata to be passed through." +
"\nBy default, the first field is used; change with --input-field." +
"\nTweet-per-line output format is" +
"\n pretsv: Prepend the tokenization and tagging as two new TSV fields, " +
"\n pretsv: Prepend the tokenization and tagging as new TSV fields, " +
"\n so the output includes a complete copy of the input." +
"\n (Control where the fields are inserted with --output-field.)" +
"\nBy default, two TSV fields are prepended:" +
"\n Tokenization \\t POSTags \\t (original data...)" +
"\nBy default, three TSV fields are prepended:" +
"\n Tokenization \\t POSTags \\t Confidences \\t (original data...)" +
"\nThe tokenization and tags are parallel space-separated lists." +
"\nWrite your own Java wrapper to Tagger.java for a different format." +
"\n" +
"\nThere is only one token-per-line format:" +
"\n conll: Each line is: Token \\t Tag, and blank line separating tweets." +
"\nThe 'conll' format is token-per-line, blank spaces separating tweets."+
"\n");

if (extra != null) {
4 changes: 2 additions & 2 deletions src/cmu/arktweetnlp/Train.java
@@ -75,7 +75,7 @@ public void constructLabelVocab() {
model.numLabels = model.labelVocab.size();
}

public void dumpFeatures() {
public void dumpFeatures() throws IOException {
FeatureExtractor fe = new FeatureExtractor(model, true);
fe.dumpMode = true;
for (Sentence lSent : lSentences) {
@@ -84,7 +84,7 @@ public void dumpFeatures() {
}
}

public void extractFeatures() {
public void extractFeatures() throws IOException {
System.out.println("Extracting features");
FeatureExtractor fe = new FeatureExtractor(model, true);
for (Sentence lSent : lSentences) {
2 changes: 1 addition & 1 deletion src/cmu/arktweetnlp/impl/Model.java
@@ -406,7 +406,7 @@ public void saveModelAsText(String outputFilename) throws IOException {

public static Model loadModelFromText(String filename) throws IOException {
Model model = new Model();
BufferedReader reader = BasicFileIO.openFileToReadUTF8(filename);
BufferedReader reader = BasicFileIO.openFileOrResource(filename);
String line;

ArrayList<Double> biasCoefs =
5 changes: 3 additions & 2 deletions src/cmu/arktweetnlp/impl/features/FeatureExtractor.java
@@ -1,5 +1,6 @@
package cmu.arktweetnlp.impl.features;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Logger;
@@ -22,7 +23,7 @@ public class FeatureExtractor {
private ArrayList<FeatureExtractorInterface> allFeatureExtractors;
public boolean isTrainingTime;
public boolean dumpMode = false;
public FeatureExtractor(Model model, boolean isTrainingTime){
public FeatureExtractor(Model model, boolean isTrainingTime) throws IOException{
this.model = model;
this.isTrainingTime = isTrainingTime;
assert model.labelVocab.isLocked();
@@ -139,7 +140,7 @@ public void add(int labelIndex, String featureID, double featureValue) {



private void initializeFeatureExtractors() {
private void initializeFeatureExtractors() throws IOException {
allFeatureExtractors = new ArrayList<FeatureExtractorInterface>();

allFeatureExtractors.add(new WordClusterPaths());
