JSON input, pretsv output, and escaping troubles
brendano committed Aug 15, 2012
1 parent 8c3163a commit 05e662f
Showing 7 changed files with 129 additions and 21 deletions.
7 changes: 7 additions & 0 deletions ark-tweet-nlp/pom.xml
@@ -175,6 +175,13 @@
<version>1.1</version>
<scope>test</scope>
</dependency>

<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.0.0</version>
</dependency>

<!-- END testing dependencies -->
</dependencies>
</project>
2 changes: 2 additions & 0 deletions examples/tweets.jsonline
@@ -0,0 +1,2 @@
{"text":"RT @TheyCallMeGriff @SkinniiMini u kno where i stay u want it just knock nigga &lt;&gt; u tryna lose ur life tonight nigga?","in_reply_to_status_id":null,"truncated":false,"source":"\u003Ca href=\"http:\/\/twidroyd.com\" rel=\"nofollow\"\u003Etwidroyd\u003C\/a\u003E","favorited":false,"in_reply_to_user_id":null,"entities":{"urls":[],"hashtags":[],"user_mentions":[{"indices":[3,19],"screen_name":"TheyCallMeGriff","name":"Call me C-Griff","id_str":"116846416","id":116846416},{"indices":[20,32],"screen_name":"SkinniiMini","name":"Tiara Mallory","id_str":"70174532","id":70174532}]},"in_reply_to_screen_name":null,"created_at":"Wed Apr 06 05:40:42 +0000 2011","in_reply_to_status_id_str":null,"place":null,"id_str":"55505187816734720","contributors":null,"coordinates":null,"geo":null,"user":{"contributors_enabled":false,"profile_link_color":"fa9600","profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/1299807059\/199606_1576886593292_1569420007_31095569_8292446_n_normal.jpg","profile_sidebar_border_color":"fa9a0a","follow_request_sent":null,"statuses_count":17911,"friends_count":617,"screen_name":"SkinniiMini","profile_use_background_image":true,"description":"Name explains it all: Tiara- the crowned one. Address me as Her Highness. Secretary of Checkmate Ent. I love mi team! #CME","profile_background_color":"020500","location":"Royal castle\/VSU","profile_background_image_url":"http:\/\/a1.twimg.com\/profile_background_images\/223660852\/mail.jpg","listed_count":9,"followers_count":618,"default_profile":false,"lang":"en","time_zone":"Quito","created_at":"Sun Aug 30 18:02:58 +0000 2009","profile_text_color":"e60afa","protected":false,"profile_sidebar_fill_color":"0e0f0e","url":null,"name":"Tiara Mallory","id_str":"70174532","is_translator":false,"verified":false,"profile_background_tile":true,"id":70174532,"default_profile_image":false,"show_all_inline_media":false,"following":null,"geo_enabled":false,"notifications":null,"utc_offset":-18000,"favourites_count":32},"retweeted":false,"id":55505187816734720,"in_reply_to_user_id_str":null,"retweet_count":0}
{"in_reply_to_user_id":null,"favorited":false,"text":"Going to school early tomorrow. Ehhh gotta wake up early now. Ughhh","in_reply_to_screen_name":null,"in_reply_to_status_id_str":null,"id_str":"55440176318660608","coordinates":null,"geo":null,"contributors":null,"retweeted":false,"source":"\u003Ca href=\"http:\/\/twitter.com\/devices\" rel=\"nofollow\"\u003Etxt\u003C\/a\u003E","in_reply_to_user_id_str":null,"retweet_count":0,"entities":{"user_mentions":[],"urls":[],"hashtags":[]},"in_reply_to_status_id":null,"created_at":"Wed Apr 06 01:22:22 +0000 2011","place":null,"user":{"followers_count":1416,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/1268192445\/Picture_85_normal.jpg","listed_count":246,"time_zone":"Central Time (US & Canada)","profile_text_color":"333333","default_profile":false,"profile_sidebar_fill_color":"E6F6F9","screen_name":"BeliebinAustin","id_str":"46909648","verified":false,"profile_background_tile":true,"description":"Im a Belieber . I love Justin Bieber, Cody Simpson, Austin Mahone, and Dylan Holland. Loving is all I do. ;D","is_translator":false,"notifications":null,"location":"Michigan","default_profile_image":false,"show_all_inline_media":false,"geo_enabled":false,"favourites_count":6,"profile_link_color":"CC3366","contributors_enabled":false,"profile_sidebar_border_color":"DBE9ED","lang":"en","created_at":"Sat Jun 13 15:59:52 +0000 2009","protected":false,"follow_request_sent":null,"statuses_count":4791,"profile_use_background_image":true,"friends_count":1369,"url":null,"name":"Anai Anaya","profile_background_color":"DBE9ED","id":46909648,"following":null,"profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/208302168\/imagesCA3X9PC6.jpg","utc_offset":-21600},"truncated":false,"id":55440176318660608}
Binary file removed lib/jackson-core/jackson-core-2.1.0-SNAPSHOT.jar
10 changes: 10 additions & 0 deletions scripts/show.py
@@ -0,0 +1,10 @@
# Take the pretsv format and make it easier to read

import sys
for line in sys.stdin:
parts = line.split('\t')
tokens = parts[0].split()
tags = parts[1].split()
pairs = ["%s/%s" % (tok, tag) for tok,tag in zip(tokens,tags)]
print ' '.join(pairs)
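For illustration, here is roughly how this script renders a hypothetical pretsv line (tab-separated: the first field is the space-separated tokens, the second the space-separated tags; the tags below are made up, and <TAB> stands for a literal tab). The text is taken from the second tweet in examples/tweets.jsonline above.

input:   Going to school early tomorrow .<TAB>V P N R N ,<TAB>...
output:  Going/V to/P school/N early/R tomorrow/N ./,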

56 changes: 41 additions & 15 deletions src/cmu/arktweetnlp/RunTagger.java
@@ -1,14 +1,18 @@
package cmu.arktweetnlp;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintStream;
import java.util.HashSet;
import java.util.List;

import cmu.arktweetnlp.Twokenize.Tokenization;
import cmu.arktweetnlp.impl.ModelSentence;
import cmu.arktweetnlp.impl.Sentence;
import cmu.arktweetnlp.io.CoNLLReader;
import cmu.arktweetnlp.io.JsonTweetReader;
import cmu.arktweetnlp.util.BasicFileIO;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.StringUtils;

@@ -70,23 +74,42 @@ public void runTagger() throws IOException, ClassNotFoundException {
}
assert (inputFormat.equals("json") || inputFormat.equals("text"));

die("TODO");
for (Pair<String, String> recordAndText : iterateTweets()) {
Sentence sentence=null;
ModelSentence mSent=null;
// run tokenizer to fill out Sentence
// run tagger to get tags
outputPrependedTagging(sentence, mSent, this.justTokenize, recordAndText.first);
JsonTweetReader jsonTweetReader = new JsonTweetReader();

BufferedReader reader = BasicFileIO.openFileToReadUTF8(inputFilename);
String line;
while ( (line = reader.readLine()) != null) {
String[] parts = line.split("\t");
String tweetData = parts[inputField-1];
String text;
if (inputFormat.equals("json")) {
text = jsonTweetReader.getText(tweetData);
} else {
text = tweetData;
}

Sentence sentence = new Sentence();

// Tokenization tokenization = Twokenize.tokenizeForTaggerAndOriginal(text);
// sentence.tokens = tokenization.normalizedTokens;

sentence.tokens = Twokenize.tokenizeForTagger(text);

ModelSentence modelSentence = new ModelSentence(sentence.T());
tagger.featureExtractor.computeFeatures(sentence, modelSentence);
goDecode(modelSentence);

if (outputFormat.equals("conll")) {
outputJustTagging(sentence, modelSentence);
} else {
outputPrependedTagging(sentence, modelSentence,
this.justTokenize, tweetData);
}

}
}
/** yields (FullInputLine, TweetText) pairs. **/
private Iterable<Pair<String, String>> iterateTweets() {
// TODO
return null;
}


/** Runs the correct algorithm (TODO make config option?) **/
/** Runs the correct algorithm (make config option perhaps) **/
public void goDecode(ModelSentence mSent) {
//tagger.model.greedyDecode(mSent);
tagger.model.viterbiDecode(mSent);
@@ -234,6 +257,9 @@ public static void main(String[] args) throws IOException, ClassNotFoundExceptio
tagger.noOutput = true;
i += 1;
} else if (args[i].equals("--input-format")) {
String s = args[i+1];
if (!(s.equals("json")||s.equals("text")||s.equals("conll")))
usage("input format must be: json, text, or conll");
tagger.inputFormat = args[i+1];
i += 2;
} else if (args[i].equals("--output-format")) {
@@ -297,7 +323,7 @@ public static void usage(String extra) {
"\n(1) tweet-per-line, and (2) token-per-line." +
"\nTweet-per-line input formats:" +
"\n json: Every input line has a JSON object containing the tweet," +
"\n as per certain Twitter APIs. (The 'text' field will be tagged.)" +
"\n as per the Streaming API. (The 'text' field gets used.)" +
"\n text: Every input line has the text for one tweet." +
"\nFor both cases, we the lines in the input are actually TSV," +
"\nand the tweets (text or json) are one particular field." +
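To make the tweet-per-line formats above concrete: each input line is TSV and the tweet occupies one field, selected by the tagger's inputField setting (1-based, as in parts[inputField-1] in runTagger above). The field layout below is hypothetical; <TAB> stands for a literal tab character:

text format:  12345<TAB>Going to school early tomorrow. Ehhh gotta wake up early now. Ughhh
json format:  12345<TAB>{"text":"Going to school early tomorrow. ...", ...rest of the tweet object...}

With a single-column input, the whole line is the tweet text or JSON object, as in examples/tweets.jsonline.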
24 changes: 18 additions & 6 deletions src/cmu/arktweetnlp/Twokenize.java
@@ -294,22 +294,28 @@ public static List<String> tokenize (String text){
}


// OLD COMMENT
// Very slight normalization for AFTER tokenization.
// The tokenization regexes are written to work on non-normalized text.
// (to make byte offsets easier to compute)
// Hm: 2+ repeated character normalization here?
// No, that's more linguistic, should be further down the pipeline
public static String normalizeText(String text) {
//return text.replaceAll("&lt;", "<").replaceAll("&gt;",">").replaceAll("&amp;","&").replaceAll("&lt;", "<").replaceAll("&gt;",">").replaceAll("&amp;","&");
// text = text.replaceAll("&lt;", "<").replaceAll("&gt;",">");
text = text.replaceAll("&amp;", "&");
return StringEscapeUtils.unescapeHtml(text);
text = StringEscapeUtils.unescapeHtml(text);
return text;
}

public static List<String> tokenizeForTagger (String text) {
/**
* Note this normalizes text BEFORE calling the tokenizer. So the tokens you get back may not exactly correspond to
* substrings of the original text.
*/
public static List<String> tokenizeForTagger(String text) {
List<String> res = new ArrayList<String>();
List<String> pretokenized = tokenize(normalizeText(text));
for(String token:pretokenized){
res.add((token));
res.add(token);
}
return res;
}
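The commit message mentions escaping troubles: the first tweet in examples/tweets.jsonline contains the HTML-escaped sequence "&lt;&gt;", and normalizeText unescapes such entities before tokenization, so the tokens returned by tokenizeForTagger need not be substrings of the raw text. A minimal sketch of that behavior (hypothetical driver class, not part of this commit):

import java.util.List;

import cmu.arktweetnlp.Twokenize;

public class EscapingDemo {
    public static void main(String[] args) {
        String raw = "a &lt;b&gt; &amp; c";                      // raw tweet text with HTML entities
        System.out.println(Twokenize.normalizeText(raw));        // prints: a <b> & c
        List<String> tokens = Twokenize.tokenizeForTagger(raw);  // tokens of the unescaped string;
        System.out.println(tokens);                              // exact splits depend on the tokenizer regexes
    }
}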
@@ -320,6 +326,10 @@ public static class Tokenization {
public List<String> normalizedTokens;
}

/**
* This is subtly broken; don't use it right now. To make it work, the
* tokenizer regexes need to be redone to handle unescaped HTML.
**/
public static Tokenization tokenizeForTaggerAndOriginal(String text) {
Tokenization tokenization = new Tokenization();
tokenization.rawTokens = tokenize(text);
@@ -330,8 +340,10 @@ public static Tokenization tokenizeForTaggerAndOriginal(String text) {
return tokenization;
}

// Convenience method to produce a string representation of the
// tokenized tweet in a standard-ish format.

/**
* Returns tokenization as a single string of space-separated tokens.
*/
public static String tokenizeToString (String text){
List<String> tokenized = tokenizeForTagger(text);
if (tokenized.size()==0)
51 changes: 51 additions & 0 deletions src/cmu/arktweetnlp/io/JsonTweetReader.java
@@ -0,0 +1,51 @@
package cmu.arktweetnlp.io;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.Iterator;

import cmu.arktweetnlp.util.BasicFileIO;

import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

/**
* Extracts the text field from a raw tweet JSON object, using Jackson.
*/
public class JsonTweetReader {
ObjectMapper mapper;

public JsonTweetReader() {
mapper = new ObjectMapper();
}

/**
* Get the text from a raw Tweet JSON string.
*
* @param tweetJson one tweet's raw JSON object
* @return the tweet text, or null if there is no text field or the JSON is invalid.
*/
public String getText(String tweetJson) {
JsonNode rootNode;

// wtf, we have to allocate a new parser for every line?
try {
rootNode = mapper.readValue(tweetJson, JsonNode.class);
} catch (JsonParseException e) {
return null;
} catch (IOException e) {
return null;
}

if (! rootNode.isObject())
return null;

JsonNode textValue = rootNode.get("text");
if (textValue==null)
return null;

return textValue.asText();
}

}
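A minimal usage sketch (hypothetical driver, not part of this commit) that pulls the text field out of each line of examples/tweets.jsonline, using the BasicFileIO helper that RunTagger also uses:

import java.io.BufferedReader;
import java.io.IOException;

import cmu.arktweetnlp.io.JsonTweetReader;
import cmu.arktweetnlp.util.BasicFileIO;

public class GetTextDemo {
    public static void main(String[] args) throws IOException {
        JsonTweetReader jsonReader = new JsonTweetReader();
        BufferedReader in = BasicFileIO.openFileToReadUTF8("examples/tweets.jsonline");
        String line;
        while ((line = in.readLine()) != null) {
            String text = jsonReader.getText(line);   // null if the line isn't JSON or has no "text" field
            if (text != null)
                System.out.println(text);
        }
        in.close();
    }
}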
