Modified to output sentiment of the text and words as well #39

Open · wants to merge 1 commit into master
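In outline, the patch enables the sentiment annotator in default.properties, bumps the bundled jars from CoreNLP 3.4.1 to 3.5.2, and extends parse_parser_results() in corenlp.py so that each parsed sentence carries a sentence-level 'sentiment' value and each word carries a 'Sentiment' attribute. A minimal sketch of how the new fields might be consumed, assuming parse() returns the JSON-encoded results as in the current code (the sentence and values are illustrative):

    import json
    from corenlp import StanfordCoreNLP

    nlp = StanfordCoreNLP()                    # spawns the CoreNLP shell and loads the models
    result = json.loads(nlp.parse("I love this sandwich."))

    sentence = result["sentences"][0]
    print sentence["sentiment"]                # e.g. "Positive" (sentence-level sentiment)
    for word, attrs in sentence["words"]:
        print word, attrs["Sentiment"]         # per-word sentiment class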
97 changes: 58 additions & 39 deletions corenlp.py
@@ -3,17 +3,17 @@
# corenlp - Python interface to Stanford Core NLP tools
# Copyright (c) 2014 Dustin Smith
# https://github.com/dasmith/stanford-corenlp-python
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
@@ -30,6 +30,8 @@

STATE_START, STATE_TEXT, STATE_WORDS, STATE_TREE, STATE_DEPENDENCY, STATE_COREFERENCE = 0, 1, 2, 3, 4, 5
WORD_PATTERN = re.compile('\[([^\]]+)\]')
+SENTIMENT_PATTERN = re.compile('sentiment: (.*)\)')
+SENTCLASS_PATTERN = re.compile('[^=\s]*=(Very\spositive|Neutral|Very negative|Positive|Negative)')
CR_PATTERN = re.compile(r"\((\d*),(\d)*,\[(\d*),(\d*)\]\) -> \((\d*),(\d)*,\[(\d*),(\d*)\]\), that is: \"(.*)\" -> \"(.*)\"")

# initialize logger
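For reference, a quick sketch of what the two new patterns are intended to capture, assuming the interactive shell emits header and token lines of roughly this shape (the sample strings are assumptions, not verbatim CoreNLP output):

    import re

    SENTIMENT_PATTERN = re.compile('sentiment: (.*)\)')
    SENTCLASS_PATTERN = re.compile('[^=\s]*=(Very\spositive|Neutral|Very negative|Positive|Negative)')

    # Hypothetical sentence header line:
    header = "Sentence #1 (5 tokens, sentiment: Positive)"
    print re.findall(SENTIMENT_PATTERN, header)   # ['Positive']

    # Hypothetical token attributes with a sentiment class appended:
    token = "Text=love CharacterOffsetBegin=2 CharacterOffsetEnd=6 PartOfSpeech=VBP SentimentClass=Positive"
    print re.findall(SENTCLASS_PATTERN, token)    # ['Positive']
    print re.sub(SENTCLASS_PATTERN, "", token)    # same attributes with the class stripped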
@@ -70,34 +72,49 @@ def parse_parser_results(text):
    and then returns a Python list of dictionaries, one for each parsed
    sentence.
    """
-   #print text
    results = {"sentences": []}
    state = STATE_START
-   for line in text.encode('utf-8').split("\n"):
+   lines = text.encode('utf-8').split("\n")
+   for i, line in enumerate(lines):
        line = line.strip()

+       if (i < len(lines) - 1):
+           next_line = lines[i+1].strip()
+       else:
+           next_line = None
+
        if line.startswith("Sentence #"):
-           sentence = {'words':[], 'parsetree':[], 'dependencies':[]}
+           # Get the line's sentiment
+           sentiment = re.findall(SENTIMENT_PATTERN, line)[0]
+           sentence = {'sentiment': sentiment, 'words':[], 'parsetree':[], 'dependencies':[]}
            results["sentences"].append(sentence)
            state = STATE_TEXT

        elif state == STATE_TEXT:
            sentence['text'] = line
            state = STATE_WORDS

        elif state == STATE_WORDS:
            if not line.startswith("[Text="):
                raise Exception('Parse error. Could not find "[Text=" in: %s' % line)
            for s in WORD_PATTERN.findall(line):
-               sentence['words'].append(parse_bracketed(s))
-           state = STATE_TREE
+               # First extract the sentiment class, if it exists
+               word_sentiment = re.findall(SENTCLASS_PATTERN, s)[0]
+               # Strip the SentimentClass attr, as it doesn't parse well
+               word, attrs = parse_bracketed(re.sub(SENTCLASS_PATTERN, "", s))
+               attrs['Sentiment'] = word_sentiment
+               sentence['words'].append((word, attrs))
+           # Check whether the next line also starts with "[Text="
+           if (next_line is None) or (not next_line.startswith("[Text=")):
+               state = STATE_TREE

        elif state == STATE_TREE:
            if len(line) == 0:
                state = STATE_DEPENDENCY
                sentence['parsetree'] = " ".join(sentence['parsetree'])
            else:
                sentence['parsetree'].append(line)

        elif state == STATE_DEPENDENCY:
            if len(line) == 0:
                state = STATE_COREFERENCE
@@ -106,7 +123,7 @@ def parse_parser_results(text):
                if len(split_entry) == 3:
                    rel, left, right = map(lambda x: remove_id(x), split_entry)
                    sentence['dependencies'].append(tuple([rel,left,right]))

        elif state == STATE_COREFERENCE:
            if "Coreference set" in line:
                if 'coref' not in results:
@@ -118,7 +135,7 @@ def parse_parser_results(text):
                src_i, src_pos, src_l, src_r = int(src_i)-1, int(src_pos)-1, int(src_l)-1, int(src_r)-1
                sink_i, sink_pos, sink_l, sink_r = int(sink_i)-1, int(sink_pos)-1, int(sink_l)-1, int(sink_r)-1
                coref_set.append(((src_word, src_i, src_pos, src_l, src_r), (sink_word, sink_i, sink_pos, sink_l, sink_r)))

    return results
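For orientation, a sketch of the structure parse_parser_results() now returns for a one-sentence input. The keys come from the code above; the concrete words, attributes, and sentiment values are made up:

    # Hypothetical return value; all values are illustrative only.
    results = {
        "sentences": [
            {
                "sentiment": "Positive",          # from the "Sentence #..." header line
                "text": "I love this sandwich.",
                "words": [
                    ("I", {"PartOfSpeech": "PRP", "Sentiment": "Neutral"}),
                    ("love", {"PartOfSpeech": "VBP", "Sentiment": "Positive"}),
                    # ... one (word, attrs) tuple per token
                ],
                "parsetree": "(ROOT (S ...))",
                "dependencies": [("nsubj", "love", "I"),
                                 ("dobj", "love", "sandwich")],
            }
        ]
    }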


@@ -132,40 +149,42 @@ def __init__(self, corenlp_path=None):
        Checks the location of the jar files.
        Spawns the server as a process.
        """
-       jars = ["stanford-corenlp-3.4.1.jar",
-               "stanford-corenlp-3.4.1-models.jar",
+       jars = ["stanford-corenlp-3.5.2.jar",
+               "stanford-corenlp-3.5.2-models.jar",
                "joda-time.jar",
                "xom.jar",
                "ejml-0.23.jar",
                "jollyday.jar"]

        # if CoreNLP libraries are in a different directory,
        # change the corenlp_path variable to point to them
        if not corenlp_path:
-           corenlp_path = "./stanford-corenlp-full-2014-08-27/"
+           corenlp_path = "./stanford-corenlp-full-2015-04-20/"

        java_path = "java"
        classname = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
        # include the properties file, so you can change defaults
        # but any changes in output format will break parse_parser_results()
        props = "-props default.properties"

        # add and check classpaths
        jars = [corenlp_path + jar for jar in jars]
        for jar in jars:
            if not os.path.exists(jar):
                logger.error("Error! Cannot locate %s" % jar)
                sys.exit(1)

        # spawn the server
        start_corenlp = "%s -Xmx1800m -cp %s %s %s" % (java_path, ':'.join(jars), classname, props)
        if VERBOSE:
-           print start_corenlp
+           logger.debug(start_corenlp)
        self.corenlp = pexpect.spawn(start_corenlp)

        # show progress bar while loading the models
        widgets = ['Loading Models: ', Fraction()]
        pbar = ProgressBar(widgets=widgets, maxval=5, force_update=True).start()
-       self.corenlp.expect("done.", timeout=20)   # Load pos tagger model (~5sec)
+       self.corenlp.expect("done.", timeout=200)  # Load pos tagger model (~5sec)
        pbar.update(1)
        self.corenlp.expect("done.", timeout=200)  # Load NER-all classifier (~33sec)
        pbar.update(2)
@@ -177,11 +196,11 @@ def __init__(self, corenlp_path=None):
        pbar.update(5)
        self.corenlp.expect("Entering interactive shell.")
        pbar.finish()

    def _parse(self, text):
        """
        This is the core interaction with the parser.

        It returns a Python data structure, while the parse()
        function returns a JSON object.
        """
@@ -191,11 +210,11 @@ def _parse(self, text):
                self.corenlp.read_nonblocking(4000, 0.3)
            except pexpect.TIMEOUT:
                break

        self.corenlp.sendline(text)

        # How much time should we give the parser to parse it?
        # The idea here is that you increase the timeout as a
        # function of the text's length.
        # Anything longer than 5 seconds requires that you also
        # increase timeout=5 in jsonrpc.py
@@ -207,7 +226,7 @@ def _parse(self, text):
                # Time left, read more data
                try:
                    incoming += self.corenlp.read_nonblocking(2000, 1)
                    if "\nNLP>" in incoming:
                        break
                    time.sleep(0.0001)
                except pexpect.TIMEOUT:
@@ -218,20 +237,20 @@ def _parse(self, text):
                    continue
                except pexpect.EOF:
                    break

        if VERBOSE:
            logger.debug("%s\n%s" % ('='*40, incoming))
        try:
            results = parse_parser_results(incoming)
        except Exception, e:
            if VERBOSE:
                logger.debug(traceback.format_exc())
            raise e

        return results

    def parse(self, text):
        """
        This function takes a text string, sends it to the Stanford parser,
        reads in the result, parses the results and returns a list
        with one dictionary entry for each parsed sentence, in JSON format.
@@ -253,9 +272,9 @@ def parse(self, text):
    options, args = parser.parse_args()
    server = jsonrpc.Server(jsonrpc.JsonRpc20(),
                            jsonrpc.TransportTcpIp(addr=(options.host, int(options.port))))

    nlp = StanfordCoreNLP()
    server.register_function(nlp.parse)

    logger.info('Serving on http://%s:%s' % (options.host, options.port))
    server.serve()
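Once the server is running, a client can call the registered parse method over JSON-RPC. A minimal sketch, assuming the jsonrpc module bundled with this repository and its default host and port; the sentence is illustrative:

    import json
    import jsonrpc

    # Connect to the server started above (127.0.0.1:8080 assumed).
    server = jsonrpc.ServerProxy(jsonrpc.JsonRpc20(),
                                 jsonrpc.TransportTcpIp(addr=("127.0.0.1", 8080)))

    result = json.loads(server.parse("I love this sandwich."))
    for sentence in result["sentences"]:
        # Sentence-level sentiment plus each token's sentiment class
        print sentence["sentiment"], [attrs["Sentiment"] for word, attrs in sentence["words"]]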
4 changes: 2 additions & 2 deletions default.properties
@@ -1,4 +1,4 @@
-annotators = tokenize, ssplit, pos, lemma, ner, parse, dcoref
+annotators = tokenize, ssplit, pos, lemma, ner, parse, dcoref, sentiment

# A true-casing annotator is also available (see below)
#annotators = tokenize, ssplit, pos, lemma, truecase
@@ -57,7 +57,7 @@ annotators = tokenize, ssplit, pos, lemma, ner, parse, dcoref
#clean.xmltags = .*
# A set of tags which will force the end of a sentence. HTML example:
# you would not want to end on <i>, but you would want to end on <p>.
# Once again, a regular expression.
# (Blank means there are no sentence enders.)
#clean.sentenceendingtags =
# Whether or not to allow malformed xml