From 98ed930b900d8cb237bcf50b30eebbd094b11825 Mon Sep 17 00:00:00 2001
From: sarky21
Date: Thu, 9 Jun 2016 01:15:16 -0500
Subject: [PATCH] Modified to output sentiment of the text and words

---
 corenlp.py         | 97 +++++++++++++++++++++++++++-------------------
 default.properties |  4 +-
 2 files changed, 60 insertions(+), 41 deletions(-)

diff --git a/corenlp.py b/corenlp.py
index 753e51c..38cb275 100644
--- a/corenlp.py
+++ b/corenlp.py
@@ -3,17 +3,17 @@
 # corenlp - Python interface to Stanford Core NLP tools
 # Copyright (c) 2014 Dustin Smith
 # https://github.com/dasmith/stanford-corenlp-python
-# 
+#
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License
 # as published by the Free Software Foundation; either version 2
 # of the License, or (at your option) any later version.
-# 
+#
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
-# 
+#
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
@@ -30,6 +30,8 @@
 STATE_START, STATE_TEXT, STATE_WORDS, STATE_TREE, STATE_DEPENDENCY, STATE_COREFERENCE = 0, 1, 2, 3, 4, 5
 
 WORD_PATTERN = re.compile('\[([^\]]+)\]')
+SENTIMENT_PATTERN = re.compile('sentiment: (.*)\)')
+SENTCLASS_PATTERN = re.compile('[^=\s]*=(Very\spositive|Neutral|Very negative|Positive|Negative)')
 CR_PATTERN = re.compile(r"\((\d*),(\d)*,\[(\d*),(\d*)\]\) -> \((\d*),(\d)*,\[(\d*),(\d*)\]\), that is: \"(.*)\" -> \"(.*)\"")
 
 # initialize logger
@@ -70,34 +72,49 @@ def parse_parser_results(text):
     and then returns a Python list of dictionaries, one for each parsed
     sentence.
     """
+    #print text
     results = {"sentences": []}
     state = STATE_START
-    for line in text.encode('utf-8').split("\n"):
+    lines = text.encode('utf-8').split("\n")
+    for i, line in enumerate(lines):
         line = line.strip()
-        
+        if (i < len(lines) - 1):
+            next_line = lines[i+1].strip()
+        else:
+            next_line = None
+
         if line.startswith("Sentence #"):
-            sentence = {'words':[], 'parsetree':[], 'dependencies':[]}
+            # Get the line's sentiment
+            sentiment = re.findall(SENTIMENT_PATTERN, line)[0]
+            sentence = {'sentiment': sentiment, 'words':[], 'parsetree':[], 'dependencies':[]}
             results["sentences"].append(sentence)
             state = STATE_TEXT
-        
+
         elif state == STATE_TEXT:
            sentence['text'] = line
            state = STATE_WORDS
-        
+
        elif state == STATE_WORDS:
            if not line.startswith("[Text="):
                raise Exception('Parse error. Could not find "[Text=" in: %s' % line)
            for s in WORD_PATTERN.findall(line):
-                sentence['words'].append(parse_bracketed(s))
-            state = STATE_TREE
-        
+                # First extract the sentiment class, if it exists
+                word_sentiment = re.findall(SENTCLASS_PATTERN, s)[0]
+                # Strip the SentimentClass attr, as it doesn't parse well
+                word, attrs = parse_bracketed(re.sub(SENTCLASS_PATTERN, "", s))
+                attrs['Sentiment'] = word_sentiment
+                sentence['words'].append((word, attrs))
+            # Check whether the next line also starts with "[Text="
+            if (next_line is None) or (not next_line.startswith("[Text=")):
+                state = STATE_TREE
+
        elif state == STATE_TREE:
            if len(line) == 0:
                state = STATE_DEPENDENCY
                sentence['parsetree'] = " ".join(sentence['parsetree'])
            else:
                sentence['parsetree'].append(line)
-        
+
        elif state == STATE_DEPENDENCY:
            if len(line) == 0:
                state = STATE_COREFERENCE
@@ -106,7 +123,7 @@ def parse_parser_results(text):
            if len(split_entry) == 3:
                rel, left, right = map(lambda x: remove_id(x), split_entry)
                sentence['dependencies'].append(tuple([rel,left,right]))
-        
+
        elif state == STATE_COREFERENCE:
            if "Coreference set" in line:
                if 'coref' not in results:
@@ -118,7 +135,7 @@
                src_i, src_pos, src_l, src_r = int(src_i)-1, int(src_pos)-1, int(src_l)-1, int(src_r)-1
                sink_i, sink_pos, sink_l, sink_r = int(sink_i)-1, int(sink_pos)-1, int(sink_l)-1, int(sink_r)-1
                coref_set.append(((src_word, src_i, src_pos, src_l, src_r), (sink_word, sink_i, sink_pos, sink_l, sink_r)))
-    
+
     return results
 
 
@@ -132,40 +149,42 @@ def __init__(self, corenlp_path=None):
         Checks the location of the jar files.
         Spawns the server as a process.
         """
-        jars = ["stanford-corenlp-3.4.1.jar",
-                "stanford-corenlp-3.4.1-models.jar",
+        jars = ["stanford-corenlp-3.5.2.jar",
+                "stanford-corenlp-3.5.2-models.jar",
                 "joda-time.jar",
                 "xom.jar",
+                "ejml-0.23.jar",
                 "jollyday.jar"]
-        
+
         # if CoreNLP libraries are in a different directory,
         # change the corenlp_path variable to point to them
         if not corenlp_path:
-            corenlp_path = "./stanford-corenlp-full-2014-08-27/"
-        
+            corenlp_path = "./stanford-corenlp-full-2015-04-20/"
+
         java_path = "java"
         classname = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
         # include the properties file, so you can change defaults
         # but any changes in output format will break parse_parser_results()
-        props = "-props default.properties" 
-        
+        props = "-props default.properties"
+
         # add and check classpaths
         jars = [corenlp_path + jar for jar in jars]
         for jar in jars:
             if not os.path.exists(jar):
                 logger.error("Error! Cannot locate %s" % jar)
                 sys.exit(1)
-        
+
         # spawn the server
         start_corenlp = "%s -Xmx1800m -cp %s %s %s" % (java_path, ':'.join(jars), classname, props)
+        print start_corenlp
-        if VERBOSE: 
+        if VERBOSE:
             logger.debug(start_corenlp)
         self.corenlp = pexpect.spawn(start_corenlp)
-        
+
         # show progress bar while loading the models
         widgets = ['Loading Models: ', Fraction()]
         pbar = ProgressBar(widgets=widgets, maxval=5, force_update=True).start()
-        self.corenlp.expect("done.", timeout=20) # Load pos tagger model (~5sec)
+        self.corenlp.expect("done.", timeout=200) # Load pos tagger model (~5sec)
         pbar.update(1)
         self.corenlp.expect("done.", timeout=200) # Load NER-all classifier (~33sec)
         pbar.update(2)
@@ -177,11 +196,11 @@ def __init__(self, corenlp_path=None):
         pbar.update(5)
         self.corenlp.expect("Entering interactive shell.")
         pbar.finish()
-    
+
     def _parse(self, text):
         """
         This is the core interaction with the parser.
-        
+
         It returns a Python data-structure, while the parse()
         function returns a JSON object
         """
@@ -191,11 +210,11 @@ def _parse(self, text):
             self.corenlp.read_nonblocking (4000, 0.3)
         except pexpect.TIMEOUT:
             break
-        
+
         self.corenlp.sendline(text)
-        
+
         # How much time should we give the parser to parse it?
-        # the idea here is that you increase the timeout as a 
+        # the idea here is that you increase the timeout as a
         # function of the text's length.
         # anything longer than 5 seconds requires that you also
         # increase timeout=5 in jsonrpc.py
@@ -207,7 +226,7 @@
             # Time left, read more data
             try:
                 incoming += self.corenlp.read_nonblocking(2000, 1)
-                if "\nNLP>" in incoming: 
+                if "\nNLP>" in incoming:
                     break
                 time.sleep(0.0001)
             except pexpect.TIMEOUT:
@@ -218,20 +237,20 @@
                 continue
             except pexpect.EOF:
                 break
-        
-        if VERBOSE: 
+
+        if VERBOSE:
             logger.debug("%s\n%s" % ('='*40, incoming))
         try:
             results = parse_parser_results(incoming)
         except Exception, e:
-            if VERBOSE: 
+            if VERBOSE:
                 logger.debug(traceback.format_exc())
             raise e
-        
+
         return results
-    
+
     def parse(self, text):
-        """ 
+        """
         This function takes a text string, sends it to the Stanford parser,
         reads in the result, parses the results and returns a list
         with one dictionary entry for each parsed sentence, in JSON format.
@@ -253,9 +272,9 @@
     options, args = parser.parse_args()
     server = jsonrpc.Server(jsonrpc.JsonRpc20(),
                             jsonrpc.TransportTcpIp(addr=(options.host, int(options.port))))
-    
+
     nlp = StanfordCoreNLP()
     server.register_function(nlp.parse)
-    
+
     logger.info('Serving on http://%s:%s' % (options.host, options.port))
     server.serve()
diff --git a/default.properties b/default.properties
index e069ea9..51f0598 100644
--- a/default.properties
+++ b/default.properties
@@ -1,4 +1,4 @@
-annotators = tokenize, ssplit, pos, lemma, ner, parse, dcoref
+annotators = tokenize, ssplit, pos, lemma, ner, parse, dcoref, sentiment
 
 # A true-casing annotator is also available (see below)
 #annotators = tokenize, ssplit, pos, lemma, truecase
@@ -57,7 +57,7 @@ annotators = tokenize, ssplit, pos, lemma, ner, parse, dcoref
 #clean.xmltags = .*
 # A set of tags which will force the end of a sentence.  HTML example:
 # you would not want to end on <i>, but you would want to end on <p>.

-# Once again, a regular expression. 
+# Once again, a regular expression.
 # (Blank means there are no sentence enders.)
 #clean.sentenceendingtags =
 # Whether or not to allow malformed xml
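
For reference, the sketch below shows one way to drive the patched interface end to end. It is illustrative only, not part of the patch: the sample sentence and the printed labels are hypothetical, the json.loads round-trip assumes parse() returns a JSON string as its docstring describes, and the header-line format shown in the comments is inferred from SENTIMENT_PATTERN rather than captured from a live run. It also assumes the stanford-corenlp-full-2015-04-20 distribution is unpacked in the working directory, as __init__ expects.

    import json
    from corenlp import StanfordCoreNLP

    nlp = StanfordCoreNLP()  # spawns the Java pipeline and loads the models
    result = json.loads(nlp.parse("The movie was wonderful."))

    for sentence in result["sentences"]:
        # Sentence-level label, captured by SENTIMENT_PATTERN from a header
        # line of the form: Sentence #1 (5 tokens, sentiment: Positive):
        print "sentence:", sentence["sentiment"]
        for word, attrs in sentence["words"]:
            # Token-level label, stored under the new 'Sentiment' key after
            # the sentiment-class attribute is stripped from the [Text=...]
            # block; labels range over Very negative .. Very positive
            print word, attrs.get("Sentiment")

Note that the next_line lookahead in parse_parser_results exists because a long sentence's tokens can wrap across several [Text=...] lines; the state machine now stays in STATE_WORDS until the following line no longer starts with [Text=, instead of moving to STATE_TREE after the first one.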