From 97920c25ed80e13326eb8a6cb2208f8b1fedf4b1 Mon Sep 17 00:00:00 2001 From: Brendan O'Connor Date: Tue, 23 Oct 2012 00:01:34 -0400 Subject: [PATCH] Fixed twokenizer errors Fixed the error where ........ would tokenize to ... ... ... and the error where ~......... would hang. Hopefully fixes GitHub issue #14 (from https://github.com/tobiowo/ark-tweet-nlp/commit/471d22307e2930a7a905faa235eead7b28c69b13 but with CRLF->LF). Change analysis: on a sample of 1,355,000 tweets, this changes 30,000 of them; they all look like improvements, mostly for the ellipsis. --- src/cmu/arktweetnlp/Twokenize.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/cmu/arktweetnlp/Twokenize.java b/src/cmu/arktweetnlp/Twokenize.java index 41b61c4..4f1dcea 100644 --- a/src/cmu/arktweetnlp/Twokenize.java +++ b/src/cmu/arktweetnlp/Twokenize.java @@ -98,12 +98,13 @@ public static String OR(String... parts) { // @aliciakeys Put it in a love song :-)) // @hellocalyclops =))=))=)) Oh well - static String bfLeft = "(♥|0|o|°|v|\\$|t|x|\\.|;|\\u0CA0|@|ʘ|•|・|◕|\\^|¬|\\*)"; + static String bfLeft = "(♥|0|o|°|v|\\$|t|x|;|\\u0CA0|@|ʘ|•|・|◕|\\^|¬|\\*)"; static String bfCenter = "(?:[\\.]|[_-]+)"; static String bfRight = "\\2"; static String s3 = "(?:--['\"])"; static String s4 = "(?:<|<|>|>)[\\._-]+(?:<|<|>|>)"; - static String basicface = "(?:(?i)" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" + s4; + static String s5 = "(?:[.][_]+[.])"; + static String basicface = "(?:(?i)" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5; static String eeLeft = "[\\\\\ƪԄ\\((<>;ヽ\\-=~\\*]+"; static String eeRight= "[\\-=\\);'\\u0022<>ʃ)//ノノ丿╯σっµ~\\*]+";