-
Notifications
You must be signed in to change notification settings - Fork 199
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
JSON input, pretsv output, and escaping troubles
- Loading branch information
Showing
7 changed files
with
129 additions
and
21 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
{"text":"RT @TheyCallMeGriff @SkinniiMini u kno where i stay u want it just knock nigga <> u tryna lose ur life tonight nigga?","in_reply_to_status_id":null,"truncated":false,"source":"\u003Ca href=\"http:\/\/twidroyd.com\" rel=\"nofollow\"\u003Etwidroyd\u003C\/a\u003E","favorited":false,"in_reply_to_user_id":null,"entities":{"urls":[],"hashtags":[],"user_mentions":[{"indices":[3,19],"screen_name":"TheyCallMeGriff","name":"Call me C-Griff","id_str":"116846416","id":116846416},{"indices":[20,32],"screen_name":"SkinniiMini","name":"Tiara Mallory","id_str":"70174532","id":70174532}]},"in_reply_to_screen_name":null,"created_at":"Wed Apr 06 05:40:42 +0000 2011","in_reply_to_status_id_str":null,"place":null,"id_str":"55505187816734720","contributors":null,"coordinates":null,"geo":null,"user":{"contributors_enabled":false,"profile_link_color":"fa9600","profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/1299807059\/199606_1576886593292_1569420007_31095569_8292446_n_normal.jpg","profile_sidebar_border_color":"fa9a0a","follow_request_sent":null,"statuses_count":17911,"friends_count":617,"screen_name":"SkinniiMini","profile_use_background_image":true,"description":"Name explains it all: Tiara- the crowned one. Address me as Her Highness. Secretary of Checkmate Ent. I love mi team! 
#CME","profile_background_color":"020500","location":"Royal castle\/VSU","profile_background_image_url":"http:\/\/a1.twimg.com\/profile_background_images\/223660852\/mail.jpg","listed_count":9,"followers_count":618,"default_profile":false,"lang":"en","time_zone":"Quito","created_at":"Sun Aug 30 18:02:58 +0000 2009","profile_text_color":"e60afa","protected":false,"profile_sidebar_fill_color":"0e0f0e","url":null,"name":"Tiara Mallory","id_str":"70174532","is_translator":false,"verified":false,"profile_background_tile":true,"id":70174532,"default_profile_image":false,"show_all_inline_media":false,"following":null,"geo_enabled":false,"notifications":null,"utc_offset":-18000,"favourites_count":32},"retweeted":false,"id":55505187816734720,"in_reply_to_user_id_str":null,"retweet_count":0} | ||
{"in_reply_to_user_id":null,"favorited":false,"text":"Going to school early tomorrow. Ehhh gotta wake up early now. Ughhh","in_reply_to_screen_name":null,"in_reply_to_status_id_str":null,"id_str":"55440176318660608","coordinates":null,"geo":null,"contributors":null,"retweeted":false,"source":"\u003Ca href=\"http:\/\/twitter.com\/devices\" rel=\"nofollow\"\u003Etxt\u003C\/a\u003E","in_reply_to_user_id_str":null,"retweet_count":0,"entities":{"user_mentions":[],"urls":[],"hashtags":[]},"in_reply_to_status_id":null,"created_at":"Wed Apr 06 01:22:22 +0000 2011","place":null,"user":{"followers_count":1416,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/1268192445\/Picture_85_normal.jpg","listed_count":246,"time_zone":"Central Time (US & Canada)","profile_text_color":"333333","default_profile":false,"profile_sidebar_fill_color":"E6F6F9","screen_name":"BeliebinAustin","id_str":"46909648","verified":false,"profile_background_tile":true,"description":"Im a Belieber . I love Justin Bieber, Cody Simpson, Austin Mahone, and Dylan Holland. Loving is all I do. ;D","is_translator":false,"notifications":null,"location":"Michigan","default_profile_image":false,"show_all_inline_media":false,"geo_enabled":false,"favourites_count":6,"profile_link_color":"CC3366","contributors_enabled":false,"profile_sidebar_border_color":"DBE9ED","lang":"en","created_at":"Sat Jun 13 15:59:52 +0000 2009","protected":false,"follow_request_sent":null,"statuses_count":4791,"profile_use_background_image":true,"friends_count":1369,"url":null,"name":"Anai Anaya","profile_background_color":"DBE9ED","id":46909648,"following":null,"profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/208302168\/imagesCA3X9PC6.jpg","utc_offset":-21600},"truncated":false,"id":55440176318660608} |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
# Take the pretsv format and make it easier to read.
#
# Reads lines from stdin.  The first tab-separated column is a
# whitespace-separated token list and the second column is the matching
# whitespace-separated tag list.  Each input line produces one output line of
# space-separated "token/tag" pairs on stdout.

import sys


def format_line(line):
    """Convert one pretsv line into space-separated ``token/tag`` pairs.

    Only the first two tab-separated columns are read; further columns are
    ignored.  Tokens and tags are paired positionally, and pairing stops at
    the shorter of the two lists.  A line with no tag column (for example a
    blank line) yields an empty string rather than raising IndexError, so the
    output keeps one line per input line.
    """
    parts = line.split('\t')
    tokens = parts[0].split()
    # Guard against lines that have no tab at all.
    tags = parts[1].split() if len(parts) > 1 else []
    return ' '.join("%s/%s" % (tok, tag) for tok, tag in zip(tokens, tags))


if __name__ == '__main__':
    for line in sys.stdin:
        # Single-argument call form prints identically on Python 2 and 3.
        print(format_line(line))
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
package cmu.arktweetnlp.io; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.IOException; | ||
import java.util.Iterator; | ||
|
||
import cmu.arktweetnlp.util.BasicFileIO; | ||
|
||
import com.fasterxml.jackson.core.JsonParseException; | ||
import com.fasterxml.jackson.databind.JsonNode; | ||
import com.fasterxml.jackson.databind.ObjectMapper; | ||
|
||
/** | ||
* | ||
*/ | ||
public class JsonTweetReader { | ||
ObjectMapper mapper; | ||
|
||
public JsonTweetReader() { | ||
mapper = new ObjectMapper(); | ||
} | ||
|
||
/** | ||
* Get the text from a raw Tweet JSON string. | ||
* | ||
* @param tweetJson | ||
* @return null if there is no text field, or invalid JSON. | ||
*/ | ||
public String getText(String tweetJson) { | ||
JsonNode rootNode; | ||
|
||
// wtf, we have to allocate a new parser for every line? | ||
try { | ||
rootNode = mapper.readValue(tweetJson, JsonNode.class); | ||
} catch (JsonParseException e) { | ||
return null; | ||
} catch (IOException e) { | ||
return null; | ||
} | ||
|
||
if (! rootNode.isObject()) | ||
return null; | ||
|
||
JsonNode textValue = rootNode.get("text"); | ||
if (textValue==null) | ||
return null; | ||
|
||
return textValue.asText(); | ||
} | ||
|
||
} |