Skip to content

Commit

Permalink
refactored Eastern emoticon regex.
Browse files Browse the repository at this point in the history
tested via
    static {
    	System.out.println(eastEmote.equals(eastEmote2));
    }
  • Loading branch information
brendano committed Oct 22, 2012
1 parent d0988f8 commit f281da3
Showing 1 changed file with 5 additions and 1 deletion.
6 changes: 5 additions & 1 deletion src/cmu/arktweetnlp/Twokenize.java
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,11 @@ public static String OR(String... parts) {
static String s4 = "(?:<|&lt;|>|&gt;)[\\._-]+(?:<|&lt;|>|&gt;)";
static String basicface = "(?:(?i)" +s1+s2+ ")|" +s3+ "|" + s4;

-static String eastEmote = "[＼\\\\ƪԄ\\(（<>;ヽ\\-=~\\*]+(?:"+basicface+"|[^A-Za-z0-9\\s\\(\\)\\*:=-])+[\\-=\\);'\\u0022<>ʃ）/／ノﾉ丿╯σっµ~\\*]+";
+static String eeLeft = "[＼\\\\ƪԄ\\(（<>;ヽ\\-=~\\*]+";
+static String eeRight= "[\\-=\\);'\\u0022<>ʃ）/／ノﾉ丿╯σっµ~\\*]+";
+static String eeSymbol = "[^A-Za-z0-9\\s\\(\\)\\*:=-]";
+static String eastEmote = eeLeft + "(?:"+basicface+"|" +eeSymbol+")+" + eeRight;


public static String emoticon = OR(
// Standard version :) :( :] :D :P
Expand Down

0 comments on commit f281da3

Please sign in to comment.