Modified to output sentiment of the text and words as well #39

Open · wants to merge 1 commit into master
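In outline, the patch enables the sentiment annotator in default.properties, bumps the bundled jars from CoreNLP 3.4.1 to 3.5.2, and extends parse_parser_results() in corenlp.py so that each parsed sentence carries a sentence-level 'sentiment' value and each word carries a 'Sentiment' attribute. A minimal sketch of how the new fields might be consumed, assuming parse() returns the JSON-encoded results as in the current code (the sentence and values are illustrative):

    import json
    from corenlp import StanfordCoreNLP

    nlp = StanfordCoreNLP()                    # spawns the CoreNLP shell and loads the models
    result = json.loads(nlp.parse("I love this sandwich."))

    sentence = result["sentences"][0]
    print sentence["sentiment"]                # e.g. "Positive" (sentence-level sentiment)
    for word, attrs in sentence["words"]:
        print word, attrs["Sentiment"]         # per-word sentiment class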
97 changes: 58 additions & 39 deletions corenlp.py
@@ -3,17 +3,17 @@
# corenlp - Python interface to Stanford Core NLP tools
# Copyright (c) 2014 Dustin Smith
# https://github.com/dasmith/stanford-corenlp-python
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
@@ -30,6 +30,8 @@

STATE_START, STATE_TEXT, STATE_WORDS, STATE_TREE, STATE_DEPENDENCY, STATE_COREFERENCE = 0, 1, 2, 3, 4, 5
WORD_PATTERN = re.compile('\[([^\]]+)\]')
+SENTIMENT_PATTERN = re.compile('sentiment: (.*)\)')
+SENTCLASS_PATTERN = re.compile('[^=\s]*=(Very\spositive|Neutral|Very negative|Positive|Negative)')
CR_PATTERN = re.compile(r"\((\d*),(\d)*,\[(\d*),(\d*)\]\) -> \((\d*),(\d)*,\[(\d*),(\d*)\]\), that is: \"(.*)\" -> \"(.*)\"")

# initialize logger
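For reference, a quick sketch of what the two new patterns are intended to capture, assuming the interactive shell emits header and token lines of roughly this shape (the sample strings are assumptions, not verbatim CoreNLP output):

    import re

    SENTIMENT_PATTERN = re.compile('sentiment: (.*)\)')
    SENTCLASS_PATTERN = re.compile('[^=\s]*=(Very\spositive|Neutral|Very negative|Positive|Negative)')

    # Hypothetical sentence header line:
    header = "Sentence #1 (5 tokens, sentiment: Positive)"
    print re.findall(SENTIMENT_PATTERN, header)   # ['Positive']

    # Hypothetical token attributes with a sentiment class appended:
    token = "Text=love CharacterOffsetBegin=2 CharacterOffsetEnd=6 PartOfSpeech=VBP SentimentClass=Positive"
    print re.findall(SENTCLASS_PATTERN, token)    # ['Positive']
    print re.sub(SENTCLASS_PATTERN, "", token)    # same attributes with the class stripped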
@@ -70,34 +72,49 @@ def parse_parser_results(text):
    and then returns a Python list of dictionaries, one for each parsed
    sentence.
    """
-   #print text
    results = {"sentences": []}
    state = STATE_START
-   for line in text.encode('utf-8').split("\n"):
+   lines = text.encode('utf-8').split("\n")
+   for i, line in enumerate(lines):
        line = line.strip()

+       if (i < len(lines) - 1):
+           next_line = lines[i+1].strip()
+       else:
+           next_line = None
+
        if line.startswith("Sentence #"):
-           sentence = {'words':[], 'parsetree':[], 'dependencies':[]}
+           # Get the line's sentiment
+           sentiment = re.findall(SENTIMENT_PATTERN, line)[0]
+           sentence = {'sentiment': sentiment, 'words':[], 'parsetree':[], 'dependencies':[]}
            results["sentences"].append(sentence)
            state = STATE_TEXT

        elif state == STATE_TEXT:
            sentence['text'] = line
            state = STATE_WORDS

        elif state == STATE_WORDS:
            if not line.startswith("[Text="):
                raise Exception('Parse error. Could not find "[Text=" in: %s' % line)
            for s in WORD_PATTERN.findall(line):
-               sentence['words'].append(parse_bracketed(s))
-           state = STATE_TREE
+               # First extract the sentiment class, if it exists
+               word_sentiment = re.findall(SENTCLASS_PATTERN, s)[0]
+               # Strip the SentimentClass attr, as it doesn't parse well
+               word, attrs = parse_bracketed(re.sub(SENTCLASS_PATTERN, "", s))
+               attrs['Sentiment'] = word_sentiment
+               sentence['words'].append((word, attrs))
+           # Check whether the next line also starts with "[Text="
+           if (next_line is None) or (not next_line.startswith("[Text=")):
+               state = STATE_TREE

        elif state == STATE_TREE:
            if len(line) == 0:
                state = STATE_DEPENDENCY
                sentence['parsetree'] = " ".join(sentence['parsetree'])
            else:
                sentence['parsetree'].append(line)

        elif state == STATE_DEPENDENCY:
            if len(line) == 0:
                state = STATE_COREFERENCE
@@ -106,7 +123,7 @@ def parse_parser_results(text):
                if len(split_entry) == 3:
                    rel, left, right = map(lambda x: remove_id(x), split_entry)
                    sentence['dependencies'].append(tuple([rel,left,right]))

        elif state == STATE_COREFERENCE:
            if "Coreference set" in line:
                if 'coref' not in results:
@@ -118,7 +135,7 @@ def parse_parser_results(text):
                src_i, src_pos, src_l, src_r = int(src_i)-1, int(src_pos)-1, int(src_l)-1, int(src_r)-1
                sink_i, sink_pos, sink_l, sink_r = int(sink_i)-1, int(sink_pos)-1, int(sink_l)-1, int(sink_r)-1
                coref_set.append(((src_word, src_i, src_pos, src_l, src_r), (sink_word, sink_i, sink_pos, sink_l, sink_r)))

    return results
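For orientation, a sketch of the structure parse_parser_results() now returns for a one-sentence input. The keys come from the code above; the concrete words, attributes, and sentiment values are made up:

    # Hypothetical return value; all values are illustrative only.
    results = {
        "sentences": [
            {
                "sentiment": "Positive",          # from the "Sentence #..." header line
                "text": "I love this sandwich.",
                "words": [
                    ("I", {"PartOfSpeech": "PRP", "Sentiment": "Neutral"}),
                    ("love", {"PartOfSpeech": "VBP", "Sentiment": "Positive"}),
                    # ... one (word, attrs) tuple per token
                ],
                "parsetree": "(ROOT (S ...))",
                "dependencies": [("nsubj", "love", "I"),
                                 ("dobj", "love", "sandwich")],
            }
        ]
    }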


@@ -132,40 +149,42 @@ def __init__(self, corenlp_path=None):
        Checks the location of the jar files.
        Spawns the server as a process.
        """
-       jars = ["stanford-corenlp-3.4.1.jar",
-               "stanford-corenlp-3.4.1-models.jar",
+       jars = ["stanford-corenlp-3.5.2.jar",
+               "stanford-corenlp-3.5.2-models.jar",
                "joda-time.jar",
                "xom.jar",
                "ejml-0.23.jar",
                "jollyday.jar"]

        # if CoreNLP libraries are in a different directory,
        # change the corenlp_path variable to point to them
        if not corenlp_path:
-           corenlp_path = "./stanford-corenlp-full-2014-08-27/"
+           corenlp_path = "./stanford-corenlp-full-2015-04-20/"

        java_path = "java"
        classname = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
        # include the properties file, so you can change defaults
        # but any changes in output format will break parse_parser_results()
        props = "-props default.properties"

        # add and check classpaths
        jars = [corenlp_path + jar for jar in jars]
        for jar in jars:
            if not os.path.exists(jar):
                logger.error("Error! Cannot locate %s" % jar)
                sys.exit(1)

        # spawn the server
        start_corenlp = "%s -Xmx1800m -cp %s %s %s" % (java_path, ':'.join(jars), classname, props)
        if VERBOSE:
-           print start_corenlp
+           logger.debug(start_corenlp)
        self.corenlp = pexpect.spawn(start_corenlp)

        # show progress bar while loading the models
        widgets = ['Loading Models: ', Fraction()]
        pbar = ProgressBar(widgets=widgets, maxval=5, force_update=True).start()
-       self.corenlp.expect("done.", timeout=20)   # Load pos tagger model (~5sec)
+       self.corenlp.expect("done.", timeout=200)  # Load pos tagger model (~5sec)
        pbar.update(1)
        self.corenlp.expect("done.", timeout=200)  # Load NER-all classifier (~33sec)
        pbar.update(2)
@@ -177,11 +196,11 @@ def __init__(self, corenlp_path=None):
        pbar.update(5)
        self.corenlp.expect("Entering interactive shell.")
        pbar.finish()

    def _parse(self, text):
        """
        This is the core interaction with the parser.

        It returns a Python data structure, while the parse()
        function returns a JSON object.
        """
@@ -191,11 +210,11 @@ def _parse(self, text):
                self.corenlp.read_nonblocking(4000, 0.3)
            except pexpect.TIMEOUT:
                break

        self.corenlp.sendline(text)

        # How much time should we give the parser to parse it?
        # The idea here is that you increase the timeout as a
        # function of the text's length.
        # Anything longer than 5 seconds requires that you also
        # increase timeout=5 in jsonrpc.py
@@ -207,7 +226,7 @@ def _parse(self, text):
                # Time left, read more data
                try:
                    incoming += self.corenlp.read_nonblocking(2000, 1)
                    if "\nNLP>" in incoming:
                        break
                    time.sleep(0.0001)
                except pexpect.TIMEOUT:
@@ -218,20 +237,20 @@ def _parse(self, text):
                    continue
                except pexpect.EOF:
                    break

        if VERBOSE:
            logger.debug("%s\n%s" % ('='*40, incoming))
        try:
            results = parse_parser_results(incoming)
        except Exception, e:
            if VERBOSE:
                logger.debug(traceback.format_exc())
            raise e

        return results

    def parse(self, text):
        """
        This function takes a text string, sends it to the Stanford parser,
        reads in the result, parses the results and returns a list
        with one dictionary entry for each parsed sentence, in JSON format.
@@ -253,9 +272,9 @@ def parse(self, text):
    options, args = parser.parse_args()
    server = jsonrpc.Server(jsonrpc.JsonRpc20(),
                            jsonrpc.TransportTcpIp(addr=(options.host, int(options.port))))

    nlp = StanfordCoreNLP()
    server.register_function(nlp.parse)

    logger.info('Serving on http://%s:%s' % (options.host, options.port))
    server.serve()
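Once the server is running, a client can call the registered parse method over JSON-RPC. A minimal sketch, assuming the jsonrpc module bundled with this repository and its default host and port; the sentence is illustrative:

    import json
    import jsonrpc

    # Connect to the server started above (127.0.0.1:8080 assumed).
    server = jsonrpc.ServerProxy(jsonrpc.JsonRpc20(),
                                 jsonrpc.TransportTcpIp(addr=("127.0.0.1", 8080)))

    result = json.loads(server.parse("I love this sandwich."))
    for sentence in result["sentences"]:
        # Sentence-level sentiment plus each token's sentiment class
        print sentence["sentiment"], [attrs["Sentiment"] for word, attrs in sentence["words"]]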
4 changes: 2 additions & 2 deletions default.properties
@@ -1,4 +1,4 @@
-annotators = tokenize, ssplit, pos, lemma, ner, parse, dcoref
+annotators = tokenize, ssplit, pos, lemma, ner, parse, dcoref, sentiment

# A true-casing annotator is also available (see below)
#annotators = tokenize, ssplit, pos, lemma, truecase
@@ -57,7 +57,7 @@ annotators = tokenize, ssplit, pos, lemma, ner, parse, dcoref
#clean.xmltags = .*
# A set of tags which will force the end of a sentence. HTML example:
# you would not want to end on <i>, but you would want to end on <p>.
# Once again, a regular expression.
# (Blank means there are no sentence enders.)
#clean.sentenceendingtags =
# Whether or not to allow malformed xml