JSON input, pretsv output, and escaping troubles
brendano committed Aug 15, 2012
1 parent 8c3163a commit 05e662f
Showing 7 changed files with 129 additions and 21 deletions.
7 changes: 7 additions & 0 deletions ark-tweet-nlp/pom.xml
@@ -175,6 +175,13 @@
<version>1.1</version>
<scope>test</scope>
</dependency>

<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.0.0</version>
</dependency>

<!-- END testing dependencies -->
</dependencies>
</project>
2 changes: 2 additions & 0 deletions examples/tweets.jsonline
@@ -0,0 +1,2 @@
{"text":"RT @TheyCallMeGriff @SkinniiMini u kno where i stay u want it just knock nigga &lt;&gt; u tryna lose ur life tonight nigga?","in_reply_to_status_id":null,"truncated":false,"source":"\u003Ca href=\"http:\/\/twidroyd.com\" rel=\"nofollow\"\u003Etwidroyd\u003C\/a\u003E","favorited":false,"in_reply_to_user_id":null,"entities":{"urls":[],"hashtags":[],"user_mentions":[{"indices":[3,19],"screen_name":"TheyCallMeGriff","name":"Call me C-Griff","id_str":"116846416","id":116846416},{"indices":[20,32],"screen_name":"SkinniiMini","name":"Tiara Mallory","id_str":"70174532","id":70174532}]},"in_reply_to_screen_name":null,"created_at":"Wed Apr 06 05:40:42 +0000 2011","in_reply_to_status_id_str":null,"place":null,"id_str":"55505187816734720","contributors":null,"coordinates":null,"geo":null,"user":{"contributors_enabled":false,"profile_link_color":"fa9600","profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/1299807059\/199606_1576886593292_1569420007_31095569_8292446_n_normal.jpg","profile_sidebar_border_color":"fa9a0a","follow_request_sent":null,"statuses_count":17911,"friends_count":617,"screen_name":"SkinniiMini","profile_use_background_image":true,"description":"Name explains it all: Tiara- the crowned one. Address me as Her Highness. Secretary of Checkmate Ent. I love mi team! #CME","profile_background_color":"020500","location":"Royal castle\/VSU","profile_background_image_url":"http:\/\/a1.twimg.com\/profile_background_images\/223660852\/mail.jpg","listed_count":9,"followers_count":618,"default_profile":false,"lang":"en","time_zone":"Quito","created_at":"Sun Aug 30 18:02:58 +0000 2009","profile_text_color":"e60afa","protected":false,"profile_sidebar_fill_color":"0e0f0e","url":null,"name":"Tiara Mallory","id_str":"70174532","is_translator":false,"verified":false,"profile_background_tile":true,"id":70174532,"default_profile_image":false,"show_all_inline_media":false,"following":null,"geo_enabled":false,"notifications":null,"utc_offset":-18000,"favourites_count":32},"retweeted":false,"id":55505187816734720,"in_reply_to_user_id_str":null,"retweet_count":0}
{"in_reply_to_user_id":null,"favorited":false,"text":"Going to school early tomorrow. Ehhh gotta wake up early now. Ughhh","in_reply_to_screen_name":null,"in_reply_to_status_id_str":null,"id_str":"55440176318660608","coordinates":null,"geo":null,"contributors":null,"retweeted":false,"source":"\u003Ca href=\"http:\/\/twitter.com\/devices\" rel=\"nofollow\"\u003Etxt\u003C\/a\u003E","in_reply_to_user_id_str":null,"retweet_count":0,"entities":{"user_mentions":[],"urls":[],"hashtags":[]},"in_reply_to_status_id":null,"created_at":"Wed Apr 06 01:22:22 +0000 2011","place":null,"user":{"followers_count":1416,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/1268192445\/Picture_85_normal.jpg","listed_count":246,"time_zone":"Central Time (US & Canada)","profile_text_color":"333333","default_profile":false,"profile_sidebar_fill_color":"E6F6F9","screen_name":"BeliebinAustin","id_str":"46909648","verified":false,"profile_background_tile":true,"description":"Im a Belieber . I love Justin Bieber, Cody Simpson, Austin Mahone, and Dylan Holland. Loving is all I do. ;D","is_translator":false,"notifications":null,"location":"Michigan","default_profile_image":false,"show_all_inline_media":false,"geo_enabled":false,"favourites_count":6,"profile_link_color":"CC3366","contributors_enabled":false,"profile_sidebar_border_color":"DBE9ED","lang":"en","created_at":"Sat Jun 13 15:59:52 +0000 2009","protected":false,"follow_request_sent":null,"statuses_count":4791,"profile_use_background_image":true,"friends_count":1369,"url":null,"name":"Anai Anaya","profile_background_color":"DBE9ED","id":46909648,"following":null,"profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/208302168\/imagesCA3X9PC6.jpg","utc_offset":-21600},"truncated":false,"id":55440176318660608}
Binary file removed lib/jackson-core/jackson-core-2.1.0-SNAPSHOT.jar
10 changes: 10 additions & 0 deletions scripts/show.py
@@ -0,0 +1,10 @@
# Take the pretsv format and make it easier to read

import sys
for line in sys.stdin:
parts = line.split('\t')
tokens = parts[0].split()
tags = parts[1].split()
pairs = ["%s/%s" % (tok, tag) for tok,tag in zip(tokens,tags)]
print ' '.join(pairs)
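For illustration, here is roughly how this script renders a hypothetical pretsv line (tab-separated: the first field is the space-separated tokens, the second the space-separated tags; the tags below are made up, and <TAB> stands for a literal tab). The text is taken from the second tweet in examples/tweets.jsonline above.

input:   Going to school early tomorrow .<TAB>V P N R N ,<TAB>...
output:  Going/V to/P school/N early/R tomorrow/N ./,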

56 changes: 41 additions & 15 deletions src/cmu/arktweetnlp/RunTagger.java
@@ -1,14 +1,18 @@
package cmu.arktweetnlp;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintStream;
import java.util.HashSet;
import java.util.List;

import cmu.arktweetnlp.Twokenize.Tokenization;
import cmu.arktweetnlp.impl.ModelSentence;
import cmu.arktweetnlp.impl.Sentence;
import cmu.arktweetnlp.io.CoNLLReader;
import cmu.arktweetnlp.io.JsonTweetReader;
import cmu.arktweetnlp.util.BasicFileIO;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.StringUtils;

@@ -70,23 +74,42 @@ public void runTagger() throws IOException, ClassNotFoundException {
}
assert (inputFormat.equals("json") || inputFormat.equals("text"));

die("TODO");
for (Pair<String, String> recordAndText : iterateTweets()) {
Sentence sentence=null;
ModelSentence mSent=null;
// run tokenizer to fill out Sentence
// run tagger to get tags
outputPrependedTagging(sentence, mSent, this.justTokenize, recordAndText.first);
JsonTweetReader jsonTweetReader = new JsonTweetReader();

BufferedReader reader = BasicFileIO.openFileToReadUTF8(inputFilename);
String line;
while ( (line = reader.readLine()) != null) {
String[] parts = line.split("\t");
String tweetData = parts[inputField-1];
String text;
if (inputFormat.equals("json")) {
text = jsonTweetReader.getText(tweetData);
} else {
text = tweetData;
}

Sentence sentence = new Sentence();

// Tokenization tokenization = Twokenize.tokenizeForTaggerAndOriginal(text);
// sentence.tokens = tokenization.normalizedTokens;

sentence.tokens = Twokenize.tokenizeForTagger(text);

ModelSentence modelSentence = new ModelSentence(sentence.T());
tagger.featureExtractor.computeFeatures(sentence, modelSentence);
goDecode(modelSentence);

if (outputFormat.equals("conll")) {
outputJustTagging(sentence, modelSentence);
} else {
outputPrependedTagging(sentence, modelSentence,
this.justTokenize, tweetData);
}

}
}
/** yields (FullInputLine, TweetText) pairs. **/
private Iterable<Pair<String, String>> iterateTweets() {
// TODO
return null;
}


/** Runs the correct algorithm (TODO make config option?) **/
/** Runs the correct algorithm (make config option perhaps) **/
public void goDecode(ModelSentence mSent) {
//tagger.model.greedyDecode(mSent);
tagger.model.viterbiDecode(mSent);
@@ -234,6 +257,9 @@ public static void main(String[] args) throws IOException, ClassNotFoundExceptio
tagger.noOutput = true;
i += 1;
} else if (args[i].equals("--input-format")) {
String s = args[i+1];
if (!(s.equals("json")||s.equals("text")||s.equals("conll")))
usage("input format must be: json, text, or conll");
tagger.inputFormat = args[i+1];
i += 2;
} else if (args[i].equals("--output-format")) {
@@ -297,7 +323,7 @@ public static void usage(String extra) {
"\n(1) tweet-per-line, and (2) token-per-line." +
"\nTweet-per-line input formats:" +
"\n json: Every input line has a JSON object containing the tweet," +
"\n as per certain Twitter APIs. (The 'text' field will be tagged.)" +
"\n as per the Streaming API. (The 'text' field gets used.)" +
"\n text: Every input line has the text for one tweet." +
"\nFor both cases, we the lines in the input are actually TSV," +
"\nand the tweets (text or json) are one particular field." +
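To make the tweet-per-line formats above concrete: each input line is TSV and the tweet occupies one field, selected by the tagger's inputField setting (1-based, as in parts[inputField-1] in runTagger above). The field layout below is hypothetical; <TAB> stands for a literal tab character:

text format:  12345<TAB>Going to school early tomorrow. Ehhh gotta wake up early now. Ughhh
json format:  12345<TAB>{"text":"Going to school early tomorrow. ...", ...rest of the tweet object...}

With a single-column input, the whole line is the tweet text or JSON object, as in examples/tweets.jsonline.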
24 changes: 18 additions & 6 deletions src/cmu/arktweetnlp/Twokenize.java
@@ -294,22 +294,28 @@ public static List<String> tokenize (String text){
}


// OLD COMMENT
// Very slight normalization for AFTER tokenization.
// The tokenization regexes are written to work on non-normalized text.
// (to make byte offsets easier to compute)
// Hm: 2+ repeated character normalization here?
// No, that's more linguistic, should be further down the pipeline
public static String normalizeText(String text) {
//return text.replaceAll("&lt;", "<").replaceAll("&gt;",">").replaceAll("&amp;","&").replaceAll("&lt;", "<").replaceAll("&gt;",">").replaceAll("&amp;","&");
// text = text.replaceAll("&lt;", "<").replaceAll("&gt;",">");
text = text.replaceAll("&amp;", "&");
return StringEscapeUtils.unescapeHtml(text);
text = StringEscapeUtils.unescapeHtml(text);
return text;
}

public static List<String> tokenizeForTagger (String text) {
/**
* Note this normalizes text BEFORE calling the tokenizer. So the tokens you get back may not exactly correspond to
* substrings of the original text.
*/
public static List<String> tokenizeForTagger(String text) {
List<String> res = new ArrayList<String>();
List<String> pretokenized = tokenize(normalizeText(text));
for(String token:pretokenized){
res.add((token));
res.add(token);
}
return res;
}
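The commit message mentions escaping troubles: the first tweet in examples/tweets.jsonline contains the HTML-escaped sequence "&lt;&gt;", and normalizeText unescapes such entities before tokenization, so the tokens returned by tokenizeForTagger need not be substrings of the raw text. A minimal sketch of that behavior (hypothetical driver class, not part of this commit):

import java.util.List;

import cmu.arktweetnlp.Twokenize;

public class EscapingDemo {
    public static void main(String[] args) {
        String raw = "a &lt;b&gt; &amp; c";                      // raw tweet text with HTML entities
        System.out.println(Twokenize.normalizeText(raw));        // prints: a <b> & c
        List<String> tokens = Twokenize.tokenizeForTagger(raw);  // tokens of the unescaped string;
        System.out.println(tokens);                              // exact splits depend on the tokenizer regexes
    }
}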
@@ -320,6 +326,10 @@ public static class Tokenization {
public List<String> normalizedTokens;
}

/**
* This is subtly broken; don't use it right now. To make it work, the
* tokenizer regexes need to be redone to handle unescaped HTML.
**/
public static Tokenization tokenizeForTaggerAndOriginal(String text) {
Tokenization tokenization = new Tokenization();
tokenization.rawTokens = tokenize(text);
@@ -330,8 +340,10 @@ public static Tokenization tokenizeForTaggerAndOriginal(String text) {
return tokenization;
}

// Convenience method to produce a string representation of the
// tokenized tweet in a standard-ish format.

/**
* Returns tokenization as a single string of space-separated tokens.
*/
public static String tokenizeToString (String text){
List<String> tokenized = tokenizeForTagger(text);
if (tokenized.size()==0)
51 changes: 51 additions & 0 deletions src/cmu/arktweetnlp/io/JsonTweetReader.java
@@ -0,0 +1,51 @@
package cmu.arktweetnlp.io;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.Iterator;

import cmu.arktweetnlp.util.BasicFileIO;

import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

/**
* Extracts the text field from a raw tweet JSON object, using Jackson.
*/
public class JsonTweetReader {
ObjectMapper mapper;

public JsonTweetReader() {
mapper = new ObjectMapper();
}

/**
* Get the text from a raw Tweet JSON string.
*
* @param tweetJson one tweet's raw JSON object
* @return the tweet text, or null if there is no text field or the JSON is invalid.
*/
public String getText(String tweetJson) {
JsonNode rootNode;

// wtf, we have to allocate a new parser for every line?
try {
rootNode = mapper.readValue(tweetJson, JsonNode.class);
} catch (JsonParseException e) {
return null;
} catch (IOException e) {
return null;
}

if (! rootNode.isObject())
return null;

JsonNode textValue = rootNode.get("text");
if (textValue==null)
return null;

return textValue.asText();
}

}
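A minimal usage sketch (hypothetical driver, not part of this commit) that pulls the text field out of each line of examples/tweets.jsonline, using the BasicFileIO helper that RunTagger also uses:

import java.io.BufferedReader;
import java.io.IOException;

import cmu.arktweetnlp.io.JsonTweetReader;
import cmu.arktweetnlp.util.BasicFileIO;

public class GetTextDemo {
    public static void main(String[] args) throws IOException {
        JsonTweetReader jsonReader = new JsonTweetReader();
        BufferedReader in = BasicFileIO.openFileToReadUTF8("examples/tweets.jsonline");
        String line;
        while ((line = in.readLine()) != null) {
            String text = jsonReader.getText(line);   // null if the line isn't JSON or has no "text" field
            if (text != null)
                System.out.println(text);
        }
        in.close();
    }
}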
