diff --git a/README.txt b/README.txt
index fad9a1a..c266a15 100644
--- a/README.txt
+++ b/README.txt
@@ -1,4 +1,4 @@
-CMU ARK Twitter Part-of-Speech Tagger v0.3-pre
+CMU ARK Twitter Part-of-Speech Tagger v0.3
 http://www.ark.cs.cmu.edu/TweetNLP/

 Basic usage
@@ -6,38 +6,23 @@ Basic usage
 Requires Java 6.

 To run the tagger from unix shell:

-  ./runTagger.sh example_tweets.txt modelfile > tagged_tweets.txt
+  ./runTagger.sh examples/example_tweets.txt

-Another example:
+The tagger outputs tokens, predicted part-of-speech tags, and confidences.
+For more information:

-  ./runTagger.sh --input-format json barackobama.jsonlines.txt -output tagged_barackobama.txt
+  ./runTagger.sh --help

-The outputs should match tagged_tweets_expected.txt and barackobamaexpected.txt respectively.
+We also include a script that invokes just the tokenizer:

+  ./twokenize.sh examples/example_tweets.txt

+Information
+===========

-Advanced usage
---------------
-
-We include a pre-compiled .jar of the tagger so you hopefully don't need to
-compile it.  But if you need to recompile, do:
-  mvn install
-NOTE: requires Maven 3.0.3+
-
-To train and evalute the tagger, see:
-  ark-tweet-nlp/src/main/java/edu/cmu/cs/lti/ark/ssl/pos/SemiSupervisedPOSTagger.java
-  scripts/train.sh and scripts/test.sh
-
-Contents
---------
- * runTagger.sh is the script you probably want
- * lib/  dependencies
- * ark-tweet-nlp/src  the project code itself (all java)
+Version 0.3 of the tagger is 40 times faster and more accurate than version 0.2.
+Please see the tech report on the website for details.

-Information
------------
-This tagger is described in the following paper.  Please cite it if you write a
-research paper using this software.
+This tagger is described in the following two papers, available at the website.
+Please cite them if you write a research paper using this software.

   Part-of-Speech Tagging for Twitter: Annotation, Features, and Experiments
   Kevin Gimpel, Nathan Schneider, Brendan O'Connor, Dipanjan Das, Daniel Mills,
@@ -46,23 +31,7 @@ research paper using this software.
   Linguistics, companion volume, Portland, OR, June 2011.
   http://www.ark.cs.cmu.edu/TweetNLP/gimpel+etal.acl11.pdf

-The software is licensed under Apache 2.0 (see LICENSE file).
-
-Version 0.2 of the tagger differs from version 0.1 in the following ways:
-
-* The tokenizer has been improved and integrated with the tagger in a single Java program.
-
-* The new tokenizer was run on the 1,827 tweets used for the annotation effort and the
-annotations were adapted for tweets with differing tokenizations.  The revised annotations
-are contained in a companion v0.2 release of the data (twpos-data-v0.2).
-
-* The tagging model is trained on ALL of the available annotated data in twpos-data-v0.2.
-The model in v0.1 was only trained on the training set.
-
-* The tokenizer/tagger is integrated with Twitter's text commons annotations API.
-
 Contact
--------
-Please contact Brendan O'Connor (brenocon@cmu.edu) and Kevin Gimpel (kgimpel@cs.cmu.edu)
-if you encounter any problems.
+=======
+Please contact Brendan O'Connor (brenocon@cmu.edu) and Kevin Gimpel (kgimpel@cs.cmu.edu) if you encounter any problems.
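A quick illustration of consuming the output the new README describes (a sketch, not part of this patch; it assumes the default "pretsv" layout documented in RunTagger's usage text further down, i.e. Tokenization \t POSTags \t Confidences \t original data, where the first three fields are parallel space-separated lists; the script name read_tagged.py is illustrative):

    #!/usr/bin/env python
    # Sketch: read the tagger's pretsv output, e.g.
    #   ./runTagger.sh examples/example_tweets.txt | python read_tagged.py
    from __future__ import print_function
    import sys

    for line in sys.stdin:
        fields = line.rstrip('\n').split('\t')
        tokens, tags, confs = (f.split() for f in fields[:3])
        for tok, tag, conf in zip(tokens, tags, confs):
            print('{}\t{}\t{}'.format(tok, tag, conf))
        print()

The same loop works on output saved to a file, since the tagger simply writes these lines to standard output.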
diff --git a/ark-tweet-nlp/pom.xml b/ark-tweet-nlp/pom.xml
index e8afce9..cbe2082 100644
--- a/ark-tweet-nlp/pom.xml
+++ b/ark-tweet-nlp/pom.xml
@@ -4,7 +4,7 @@
   <groupId>edu.cmu.cs</groupId>
   <artifactId>ark-tweet-nlp</artifactId>
   <packaging>jar</packaging>
-  <version>0.3-SNAPSHOT</version>
+  <version>0.3</version>
   <name>CMU ARK TweetNLP: Twitter POS tagger</name>
   <url>http://www.ark.cs.cmu.edu/TweetNLP/</url>
diff --git a/examples/bill_nye_tho.txt b/examples/bill_nye_tho.txt
new file mode 100644
index 0000000..eae48b9
--- /dev/null
+++ b/examples/bill_nye_tho.txt
@@ -0,0 +1,16 @@
+247120554400821248 2012-09-15T23:51:46 Bill_Nye_tho all out of wood facts
+247120392324542464 2012-09-15T23:51:07 Bill_Nye_tho u can build a house w/ it
+247119965784784896 2012-09-15T23:49:25 Bill_Nye_tho more wood facts still to come
+247119210113802240 2012-09-15T23:46:25 Bill_Nye_tho its biodegradable #woodfacts
+247118527113355264 2012-09-15T23:43:42 Bill_Nye_tho u could burn it #woodfacts
+247117483482431488 2012-09-15T23:39:33 Bill_Nye_tho SHOUT OUT TO MY NIGGAS EATIN HUMUS
+247114115762499584 2012-09-15T23:26:11 Bill_Nye_tho if u want me to give a lecture at ur school contact ur student board or w/e. or contact me and i'll just come i don't give a fuck lol
+247113014011109378 2012-09-15T23:21:48 Bill_Nye_tho u ever been havin the illest dream ever n u wake up right as its gettin real good and ur like damn i wasnt done smashin Jane Goodall's shit
+247089985625395202 2012-09-15T21:50:17 Bill_Nye_tho sometimes i'll freeze water then melt it then freeze it again an i just keep doing that until it stops being awesome but it never does
+246819478107746304 2012-09-15T03:55:23 Bill_Nye_tho YO I CANT THINK OF ANYTHING THAT GETS ME MORE HEATED THAN ARTIFICIAL PLANTS
+246815764902981632 2012-09-15T03:40:38 Bill_Nye_tho look at u lookn all cute over there girl come here a min lemme holla atcha whats ur bigges fantasy mine is to visit Triton,Neptunes 7th moon
+246811004770590721 2012-09-15T03:21:43 Bill_Nye_tho its cool that chameleons can blend in with their environment but at a certain points it's like just do u homie!!!
+246806113645907969 2012-09-15T03:02:17 Bill_Nye_tho @Wendys what up Wendy's on average one fully grown bovine can produce about 2400 hamburger patties. and u can fact check that shit homeboyyy
+246590149234925569 2012-09-14T12:44:07 Bill_Nye_tho SHOUTS OUT TO PEOPLE WILLINGLY LIVIN IN TOWNS RIGHT NEXT TO ACTIVE VOLCANOES LIKE "NAH WE'RE GOOD"
+246589808217051138 2012-09-14T12:42:46 Bill_Nye_tho a lotta people refer to this as a novelty account i dont see whats so novel about science but whatever p.s. lava can flow up to 10km perhour
+246348019048542208 2012-09-13T20:41:59 Bill_Nye_tho Jane Goodall is a bad bitch
diff --git a/runTagger.sh b/runTagger.sh
index a77e6bb..82ddab6 100755
--- a/runTagger.sh
+++ b/runTagger.sh
@@ -1,7 +1,5 @@
 #!/bin/bash
+set -eu

-# For development
-# java -Xmx1g -jar $(dirname $0)/ark-tweet-nlp/target/bin/ark-tweet-nlp-0.3-SNAPSHOT.jar "$@"
-
-# For release
-java -Xmx1g -jar $(dirname $0)/ark-tweet-nlp-0.3.jar "$@"
+# Run the tagger (and tokenizer).
+java -Xmx500m -jar $(dirname $0)/ark-tweet-nlp-0.3.jar "$@"
diff --git a/scripts/prepare_release.sh b/scripts/prepare_release.sh
new file mode 100644
index 0000000..a93a59d
--- /dev/null
+++ b/scripts/prepare_release.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+VERSION=0.3
+DIR=ark-tweet-nlp-$VERSION
+
+set -eux
+
+rm -rf $DIR
+mkdir $DIR
+
+# mvn clean
+# mvn package
+cp ark-tweet-nlp/target/bin/ark-tweet-nlp-${VERSION}.jar $DIR
+
+cp -r examples $DIR
+cp -r scripts $DIR
+rm $DIR/scripts/prepare_release.sh
+rm $DIR/scripts/java.sh
+cp *.sh $DIR
+cp *.txt $DIR
+
+# these don't work, need to fix
+rm $DIR/examples/barackobama*
diff --git a/scripts/toconll.py b/scripts/toconll.py
new file mode 100644
index 0000000..80c8009
--- /dev/null
+++ b/scripts/toconll.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python
+# Take the pretsv format and make it CoNLL-like ("supertsv", having tweet metadata headers)
+import sys,json
+from datetime import datetime
+
+for line in sys.stdin:
+    parts = line.split('\t')
+    tokens = parts[0].split()
+    tags = parts[1].split()
+    try:
+        d = json.loads(parts[-1])
+        print "TWEET\t{}\t{}".format(d['id'], datetime.strptime(d['created_at'], '%a %b %d %H:%M:%S +0000 %Y').strftime("%Y-%m-%dT%H:%M:%S"))
+        print "TOKENS"
+    except:
+        pass
+
+    for tok,tag in zip(tokens,tags):
+        print "{}\t{}".format(tag,tok)
+    print ""
+
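toconll.py reads the tagger's pretsv output on standard input (for example: ./runTagger.sh --input-format json some_tweets.jsonlines | python scripts/toconll.py, where some_tweets.jsonlines is a hypothetical input file). It writes one "tag \t token" line per token, preceded by TWEET and TOKENS metadata headers when the original JSON is available, with a blank line after each tweet. A sketch of reading that CoNLL-like output back into per-tweet records (not part of this patch; the script above is the authority on the format):

    #!/usr/bin/env python
    # Sketch: regroup toconll.py output into per-tweet records.
    from __future__ import print_function
    import sys

    tweet_id, pairs = None, []
    for line in sys.stdin:
        line = line.rstrip('\n')
        if not line:                      # blank line ends a tweet
            if pairs:
                print(tweet_id, len(pairs), 'tokens')
            tweet_id, pairs = None, []
        elif line.startswith('TWEET\t'):
            tweet_id = line.split('\t')[1]
        elif line == 'TOKENS':
            continue
        else:
            tag, token = line.split('\t', 1)
            pairs.append((token, tag))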
diff --git a/src/cmu/arktweetnlp/RunTagger.java b/src/cmu/arktweetnlp/RunTagger.java
index ccfa628..b821b8f 100644
--- a/src/cmu/arktweetnlp/RunTagger.java
+++ b/src/cmu/arktweetnlp/RunTagger.java
@@ -28,18 +28,19 @@ public class RunTagger {
     Tagger tagger;

     // Commandline I/O-ish options
-    String inputFormat = "json";
+    String inputFormat = "auto";
     String outputFormat = "auto";
     int inputField = 1;

     String inputFilename;
-    String modelFilename;
+    /** Can be either filename or resource name **/
+    String modelFilename = "/cmu/arktweetnlp/model.20120919";

     public boolean noOutput = false;
     public boolean justTokenize = false;

     public static enum Decoder { GREEDY, VITERBI };
-    public Decoder decoder = Decoder.VITERBI;
+    public Decoder decoder = Decoder.GREEDY;
     public boolean showConfidence = true;

     PrintStream outputStream;
@@ -64,6 +65,17 @@ public RunTagger() throws UnsupportedEncodingException {
         // force UTF-8 here, so don't need -Dfile.encoding
         this.outputStream = new PrintStream(System.out, true, "UTF-8");
     }
+    public void detectAndSetInputFormat(String tweetData) throws IOException {
+        JsonTweetReader jsonTweetReader = new JsonTweetReader();
+        if (jsonTweetReader.isJson(tweetData)) {
+            System.err.println("Detected JSON input format");
+            inputFormat = "json";
+        } else {
+            System.err.println("Detected text input format");
+            inputFormat = "text";
+        }
+    }
+
     public void runTagger() throws IOException, ClassNotFoundException {
         tagger = new Tagger();
@@ -86,9 +98,20 @@ public void runTagger() throws IOException, ClassNotFoundException {
         while ( (line = reader.readLine()) != null) {
             String[] parts = line.split("\t");
             String tweetData = parts[inputField-1];
+
+            if (reader.getLineNumber()==1) {
+                if (inputFormat.equals("auto")) {
+                    detectAndSetInputFormat(tweetData);
+                }
+            }
+
             String text;
             if (inputFormat.equals("json")) {
-                text = jsonTweetReader.getText(tweetData);
+                text = jsonTweetReader.getText(tweetData);
+                if (text==null) {
+                    System.err.println("Warning, null text (JSON parse error?), using blank string instead");
+                    text = "";
+                }
             } else {
                 text = tweetData;
             }
@@ -107,7 +130,7 @@ public void runTagger() throws IOException, ClassNotFoundException {
             if (outputFormat.equals("conll")) {
                 outputJustTagging(sentence, modelSentence);
             } else {
-                outputPrependedTagging(sentence, modelSentence, justTokenize, tweetData);
+                outputPrependedTagging(sentence, modelSentence, justTokenize, line);
             }
             numtoks += sentence.T();
         }
@@ -209,7 +232,7 @@ public void evaluateSentenceTagging(Sentence lSent, ModelSentence mSent) {

     private String formatConfidence(double confidence) {
         // too many decimal places wastes space
-        return String.format("%.3f", confidence);
+        return String.format("%.4f", confidence);
     }

     /**
@@ -329,17 +352,16 @@ public static void main(String[] args) throws IOException, ClassNotFoundExceptio
         if (!tagger.justTokenize && tagger.modelFilename == null) {
             usage("Need to specify model");
         }
-
-        tagger.finalizeOutputFormat();

         if (args.length - i > 1) usage();
+        if (args.length <= i) usage();
         tagger.inputFilename = args[i];
+        tagger.finalizeOptions();

-        tagger.runTagger();
-
+        tagger.runTagger();
     }

-    public void finalizeOutputFormat() {
+    public void finalizeOptions() throws IOException {
         if (outputFormat.equals("auto")) {
             if (inputFormat.equals("conll")) {
                 outputFormat = "conll";
@@ -347,6 +369,13 @@ public void finalizeOutputFormat() {
                 outputFormat = "pretsv";
             }
         }
+        if (showConfidence && decoder==Decoder.VITERBI) {
+            System.err.println("Confidence output is unimplemented in Viterbi, turning it off.");
+            showConfidence = false;
+        }
+        if (justTokenize) {
+            showConfidence = false;
+        }
     }

     public static void usage() {
@@ -359,11 +388,10 @@ public static void usage(String extra) {
 "\n  runs the CMU ARK Twitter tagger on tweets from ExamplesFilename, " +
 "\n  writing taggings to standard output." +
 "\n\nOptions:" +
-"\n  --model             Specify model filename." +
-"\n                      [TODO should this default to something?]" +
+"\n  --model             Specify model filename. (Else use built-in.)" +
 "\n  --just-tokenize     Only run the tokenizer; no POS tags." +
 "\n  --quiet             Quiet: no output" +
-"\n  --input-format      Default: json." +
+"\n  --input-format      Default: auto" +
 "\n                      Options: json, text, conll" +
 "\n  --output-format     Default: automatically decide from input format." +
 "\n                      Options: pretsv, conll" +
@@ -373,31 +401,23 @@ public static void usage(String extra) {
 "\n                      Only for {json, text} input formats." +
 "\n  --word-clusters     Alternate word clusters file (see FeatureExtractor)" +
 "\n  --no-confidence     Don't output confidence probabilities" +
+"\n  --decoder           Change the decoding algorithm (default: greedy)" +
 "\n" +
-"\nThere are two types of input-output formats: " +
-"\n(1) tweet-per-line, and (2) token-per-line." +
 "\nTweet-per-line input formats:" +
 "\n   json: Every input line has a JSON object containing the tweet," +
-"\n         as per the Streaming API. (The 'text' field gets used.)" +
+"\n         as per the Streaming API. (The 'text' field is used.)" +
 "\n   text: Every input line has the text for one tweet." +
-"\nFor both cases, we the lines in the input are actually TSV," +
-"\nand the tweets (text or json) are one particular field." +
+"\nWe actually assume input lines are TSV and the tweet data is one field." +
 "\n(Therefore tab characters are not allowed in tweets." +
 "\nTwitter's own JSON formats guarantee this;" +
 "\nif you extract the text yourself, you must remove tabs and newlines.)" +
-"\nThis allows metadata to be passed through." +
-"\nBy default, the first field is used; change with --input-field." +
 "\nTweet-per-line output format is" +
-"\n   pretsv: Prepend the tokenization and tagging as two new TSV fields, " +
+"\n   pretsv: Prepend the tokenization and tagging as new TSV fields, " +
 "\n           so the output includes a complete copy of the input." +
-"\n           (Control where the fields are inserted with --output-field.)" +
-"\nBy default, two TSV fields are prepended:" +
-"\n    Tokenization \\t POSTags \\t (original data...)" +
+"\nBy default, three TSV fields are prepended:" +
+"\n    Tokenization \\t POSTags \\t Confidences \\t (original data...)" +
 "\nThe tokenization and tags are parallel space-separated lists." +
-"\nWrite your own Java wrapper to Tagger.java for a different format." +
-"\n" +
-"\nThere is only one token-per-line format:" +
-"\n   conll: Each line is: Token \\t Tag, and blank line separating tweets." +
+"\nThe 'conll' format is token-per-line, with blank lines separating tweets." +
 "\n");

     if (extra != null) {
diff --git a/src/cmu/arktweetnlp/Train.java b/src/cmu/arktweetnlp/Train.java
index 28da973..e5fc4bb 100644
--- a/src/cmu/arktweetnlp/Train.java
+++ b/src/cmu/arktweetnlp/Train.java
@@ -75,7 +75,7 @@ public void constructLabelVocab() {
         model.numLabels = model.labelVocab.size();
     }

-    public void dumpFeatures() {
+    public void dumpFeatures() throws IOException {
         FeatureExtractor fe = new FeatureExtractor(model, true);
         fe.dumpMode = true;
         for (Sentence lSent : lSentences) {
@@ -84,7 +84,7 @@ public void dumpFeatures() {
         }
     }

-    public void extractFeatures() {
+    public void extractFeatures() throws IOException {
         System.out.println("Extracting features");
         FeatureExtractor fe = new FeatureExtractor(model, true);
         for (Sentence lSent : lSentences) {
diff --git a/src/cmu/arktweetnlp/impl/Model.java b/src/cmu/arktweetnlp/impl/Model.java
index dddc54d..f2e036b 100644
--- a/src/cmu/arktweetnlp/impl/Model.java
+++ b/src/cmu/arktweetnlp/impl/Model.java
@@ -406,7 +406,7 @@ public void saveModelAsText(String outputFilename) throws IOException {

     public static Model loadModelFromText(String filename) throws IOException {
         Model model = new Model();
-        BufferedReader reader = BasicFileIO.openFileToReadUTF8(filename);
+        BufferedReader reader = BasicFileIO.openFileOrResource(filename);
         String line;

         ArrayList biasCoefs =
diff --git a/src/cmu/arktweetnlp/impl/features/FeatureExtractor.java b/src/cmu/arktweetnlp/impl/features/FeatureExtractor.java
index a3a0b97..1751801 100644
--- a/src/cmu/arktweetnlp/impl/features/FeatureExtractor.java
+++ b/src/cmu/arktweetnlp/impl/features/FeatureExtractor.java
@@ -1,5 +1,6 @@
 package cmu.arktweetnlp.impl.features;

+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.logging.Logger;
@@ -22,7 +23,7 @@ public class FeatureExtractor {
     private ArrayList allFeatureExtractors;
     public boolean isTrainingTime;
     public boolean dumpMode = false;

-    public FeatureExtractor(Model model, boolean isTrainingTime){
+    public FeatureExtractor(Model model, boolean isTrainingTime) throws IOException{
         this.model = model;
         this.isTrainingTime = isTrainingTime;
         assert model.labelVocab.isLocked();
@@ -139,7 +140,7 @@ public void add(int labelIndex, String featureID, double featureValue) {


-    private void initializeFeatureExtractors() {
+    private void initializeFeatureExtractors() throws IOException {
         allFeatureExtractors = new ArrayList();
         allFeatureExtractors.add(new WordClusterPaths());
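One practical consequence of the usage text above: for the plain "text" input format, tweet text must not contain tab or newline characters, since each tweet has to stay a single TSV field on a single input line. A minimal sketch of that cleanup step when preparing input yourself (not part of this patch; the function name is illustrative):

    # Sketch: make a string safe to use as one tweet-per-line "text" input.
    import re

    def clean_tweet_text(text):
        # Tabs and newlines would break the one-tweet-per-line TSV contract,
        # so collapse them to single spaces.
        return re.sub(r'[\t\r\n]+', ' ', text).strip()

    print(clean_tweet_text('a tweet\twith tabs\nand newlines'))
    # -> a tweet with tabs and newlines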
diff --git a/src/cmu/arktweetnlp/impl/features/TagDictionary.java b/src/cmu/arktweetnlp/impl/features/TagDictionary.java
index 0f08807..dab8963 100644
--- a/src/cmu/arktweetnlp/impl/features/TagDictionary.java
+++ b/src/cmu/arktweetnlp/impl/features/TagDictionary.java
@@ -7,17 +7,26 @@
 import cmu.arktweetnlp.util.BasicFileIO;

 public class TagDictionary {
-    public final static Map<String, ArrayList<String>> WORD_TO_POS;
+    public static Map<String, ArrayList<String>> WORD_TO_POS;
     static {
-        WORD_TO_POS = loadData();
+        WORD_TO_POS = null;
+
+        try {
+            WORD_TO_POS = loadData();
+
+
+        } catch (IOException e) {
+            // TODO Auto-generated catch block
+            e.printStackTrace();
+        }
     }
     public static TagDictionary instance() {
         return new TagDictionary();
     }
-    static Map<String, ArrayList<String>> loadData() {
+    static Map<String, ArrayList<String>> loadData() throws IOException {
         // log.info("loading POS tag dictionary...");
         Metaphone _metaphone = new Metaphone();
         _metaphone.setMaxCodeLen(100);
diff --git a/src/cmu/arktweetnlp/impl/features/WordClusterPaths.java b/src/cmu/arktweetnlp/impl/features/WordClusterPaths.java
index 537151d..9aecee4 100644
--- a/src/cmu/arktweetnlp/impl/features/WordClusterPaths.java
+++ b/src/cmu/arktweetnlp/impl/features/WordClusterPaths.java
@@ -1,6 +1,7 @@
 package cmu.arktweetnlp.impl.features;

 import java.io.BufferedReader;
+import java.io.IOException;
 import java.util.HashMap;
 import java.util.List;
 import java.util.regex.Pattern;
@@ -21,7 +22,7 @@ public class WordClusterPaths {

     public static HashMap wordToPath;

-    public WordClusterPaths() {
+    public WordClusterPaths() throws IOException {
         // log.info("Loading clusters");

         //read in paths file
diff --git a/src/cmu/arktweetnlp/impl/features/WordListFeatures.java b/src/cmu/arktweetnlp/impl/features/WordListFeatures.java
index 2b6077e..e4146d2 100644
--- a/src/cmu/arktweetnlp/impl/features/WordListFeatures.java
+++ b/src/cmu/arktweetnlp/impl/features/WordListFeatures.java
@@ -1,6 +1,7 @@
 package cmu.arktweetnlp.impl.features;

 import java.io.BufferedReader;
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;
@@ -58,7 +59,7 @@ public void addFeatures(List tokens, PositionFeaturePairs pairs) {
     public static class Listofnames implements FeatureExtractorInterface {
         String Listname="";
         HashSet members;
-        public Listofnames(String str) {
+        public Listofnames(String str) throws IOException {
             Listname=str;
             this.members = initDict(Listname);
         }
@@ -92,7 +93,7 @@ public void addFeatures(List tokens, PositionFeaturePairs pairs) {
         }
     }

-    private static HashSet initDict(String dict) {
+    private static HashSet initDict(String dict) throws IOException {
         BufferedReader bReader = BasicFileIO.getResourceReader("/cmu/arktweetnlp/" + dict);
         HashSet dictset = new HashSet();
         String line=BasicFileIO.getLine(bReader);
diff --git a/src/cmu/arktweetnlp/io/JsonTweetReader.java b/src/cmu/arktweetnlp/io/JsonTweetReader.java
index 7ef0e2c..b49f46f 100644
--- a/src/cmu/arktweetnlp/io/JsonTweetReader.java
+++ b/src/cmu/arktweetnlp/io/JsonTweetReader.java
@@ -46,5 +46,23 @@ public String getText(String tweetJson) {
         return textValue.asText();
     }

+
+    public boolean isJson(String isThisJson) {
+        JsonNode rootNode;
+
+        if (isThisJson.charAt(0) != '{')
+            return false;
+
+        try {
+            rootNode = mapper.readValue(isThisJson, JsonNode.class);
+        } catch (JsonParseException e) {
+            return false;
+        } catch (IOException e) {
+            System.err.println("WTF -- got IOException in isJson()");
+            return false;
+        }
+        return true;
+
+    }
 }
diff --git a/src/cmu/arktweetnlp/util/BasicFileIO.java b/src/cmu/arktweetnlp/util/BasicFileIO.java
index 8a9c345..6879912 100644
--- a/src/cmu/arktweetnlp/util/BasicFileIO.java
+++ b/src/cmu/arktweetnlp/util/BasicFileIO.java
@@ -224,15 +224,30 @@ public static Object readSerializedObject(InputStream iFile) {
      * e.g. http://stackoverflow.com/questions/1464291/how-to-really-read-text-file-from-classpath-in-java
      *
      * (added by Brendan 2012-08-14)
+     * @throws IOException
      */
-    public static BufferedReader getResourceReader(String resourceName) {
+    public static BufferedReader getResourceReader(String resourceName) throws IOException {
         assert resourceName.startsWith("/") : "Absolute path needed for resource";
         InputStream stream = BasicFileIO.class.getResourceAsStream(resourceName);
-        if (stream == null) throw new RuntimeException("failed to find resource " + resourceName);
+        if (stream == null) throw new IOException("failed to find resource " + resourceName);
         //read in paths file
         BufferedReader bReader = new BufferedReader(new InputStreamReader(
                 stream, Charset.forName("UTF-8")));
         return bReader;
     }
+
+    /** Try to get a file, if it doesn't exist, backoff to a resource.
+     * @throws IOException **/
+    public static BufferedReader openFileOrResource(String fileOrResource) throws IOException {
+        try {
+            if (new File(fileOrResource).exists()) {
+                return openFileToReadUTF8(fileOrResource);
+            } else {
+                return getResourceReader(fileOrResource);
+            }
+        } catch (IOException e) {
+            throw new IOException("Neither file nor resource found for: " + fileOrResource);
+        }
+    }
 }
diff --git a/twokenize.sh b/twokenize.sh
new file mode 100755
index 0000000..e737c94
--- /dev/null
+++ b/twokenize.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+# Only run the tokenizer.
+
+set -eu
+java -Xmx100m -jar $(dirname $0)/ark-tweet-nlp-0.3.jar --just-tokenize "$@"
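Finally, a sketch of driving the new twokenize.sh wrapper from a script (not part of this patch). It assumes that with --just-tokenize only the tokenization field, a space-separated token list, is prepended to each input line; finalizeOptions() above switches confidences off in that mode, and no POS tags are produced:

    #!/usr/bin/env python
    # Sketch: tokenize a file with twokenize.sh and count tokens per tweet.
    from __future__ import print_function
    import subprocess

    proc = subprocess.Popen(['./twokenize.sh', 'examples/example_tweets.txt'],
                            stdout=subprocess.PIPE)
    for line in proc.stdout:
        tokens = line.decode('utf-8').rstrip('\n').split('\t')[0].split()
        print(len(tokens), ' '.join(tokens))
    proc.wait()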