fixes #5: We eliminate multiple pipelines and adjust the output of the entity call so that JSON can deserialize the results properly.

I tested on a smaller document and it was not too slow at all. The key is to start CoreNLP (which is a prerequisite for CharLSTM) with enough memory, perhaps 8g.
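For reference, a minimal sketch of bringing up CoreNLP with an 8g heap through the stanfordcorenlp wrapper used below; the install path, port, and timeout are placeholders, not values from this repo:

from stanfordcorenlp import StanfordCoreNLP

# pointing the wrapper at a local CoreNLP directory makes it launch the Java
# server itself with the requested heap; an http:// host would instead connect
# to a server that is already running
nlp = StanfordCoreNLP(r'/opt/stanford-corenlp-full-2018-10-05',
                      port=9000, memory='8g', timeout=60000)
print(nlp.annotate('CoreNLP is up.',
                   properties={'annotators': 'tokenize,ssplit', 'outputFormat': 'json'}))
nlp.close()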
Greg Werner committed Jan 17, 2019
1 parent c7c0a2d commit 44048ed
Showing 5,146 changed files with 55,226 additions and 93 deletions.
197 changes: 111 additions & 86 deletions CharLSTMSentiment.py
@@ -7,17 +7,32 @@
'''

import Config
import json
from lib_model.bidirectional_lstm import LSTM
import logging
import nltk
from nltk import Tree
from nltk.tokenize import sent_tokenize, word_tokenize
import os
from os import listdir
from os.path import isfile, join
from pycorenlp import StanfordCoreNLP
from queue import Queue
from stanfordcorenlp import StanfordCoreNLP

nltk.download('punkt')

# for testing only! For deployment, use the server created in Entry => StanfordSentiment
def getCoreNlpInstance(config_item):
# we don't need sentiment here, but the Stanford annotator pipeline requires it
props={'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,coref,sentiment',
'pipelineLanguage':'en',
'outputFormat':'json',
'parse.model':'edu/stanford/nlp/models/srparser/englishSR.ser.gz',
'sentiment.model': os.path.realpath(__file__) + '/../model/stanford/model-0000-70.74.ser.gz'
}
# we do not provide the same level of recovery as in StanfordSentiment. Please manually start your server first
return StanfordCoreNLP(config_item.STANFORD_SERVER, port=config_item.STANFORD_PORT, logging_level=logging.DEBUG, max_retries=5, memory='8g')

def convert_scale(positive):
return 2 * positive - 1

@@ -70,21 +85,16 @@ def get_subtrees(tree):
subtrees.append((noun, sentence))
return subtrees


class CharLSTMSentiment(object):

def __init__(self):
self.network = LSTM()
self.network.build()
self.server_on = False

def config(self, config):
try:
self.nlp = StanfordCoreNLP(config.STANFORD_SERVER + ':' + str(config.STANFORD_PORT))
self.server_on = True
except Exception as e:
print('Stanford server could not be found')
print(e)
def config(self, config, nlp):
self.nlp = nlp
self.server_on = True

def init_dict(self):
local_dict = {}
@@ -93,69 +103,39 @@ def init_dict(self):
local_dict[k] = None
self.entities = local_dict

def parse_sentence(self, sentence):
""" sentence --> named-entity chunked tree """
try:
output = self.nlp.annotate(sentence.decode('utf-8'), properties={'annotators': 'tokenize, ssplit, pos,'
' lemma, ner, parse',
'outputFormat': 'json'})
# print_tree(output)
return Tree.fromstring(output['sentences'][0]['parse'])
except TypeError as e:
import pdb; pdb.set_trace()

def coreference_resolution(self, sentence):
# coreference resolution
output = self.nlp.annotate(sentence, properties={'annotators': 'coref',
'outputFormat': 'json'})
tokens = word_tokenize(sentence)
coreferences = output['corefs']
entity_keys = coreferences.keys()
for k in entity_keys:
# skip non PERSON NP
if coreferences[k][0]['gender'] == 'MALE' or coreferences[k][0]['gender'] == 'FEMALE':
rep_mention, pos = get_rep_mention(coreferences[k])
for reference in coreferences[k]:
if not reference['isRepresentativeMention']:
start, end = reference['startIndex'] - 1, reference['headIndex'] - 1
if start == end:
tokens[start] = rep_mention
else:
tokens[start] = rep_mention
del tokens[start + 1: end]

sentence = ' '.join(tokens)
return sentence.encode('utf-8')
def evaluate_single_document(self, document, mode):
if mode == 'document':
document = document[0:1000]
p = self.network.predict_sentences([document])
positive = p[0][0][0]
return [convert_scale(positive)]
elif mode == 'sentence':
return self.evaluate_sentences(sent_tokenize(document))
elif mode == 'entity':
return self.get_entity_sentiment(document)
else:
return ['UNKNOWN MODE']

def parse_doc(self, document):
""" Extract relevant entities in a document """
print('Tokenizing sentences...')
sentences = sent_tokenize(document)
print('Done!')
# Context of all named entities
ne_context = []
for sentence in sentences:
# change pronouns to their respective nouns
print('Anaphora resolution for sentence: %s' % sentence)
tree = self.parse_sentence(self.coreference_resolution(sentence))
print('Done!')

# get context for each noun
print('Named Entity Clustering:')
context = get_subtrees(tree)
for n, s in context:
print('%s' % s)
ne_context.append(context)
self.contexts = flatten(ne_context)
# sentence sentiment function
def evaluate_sentences(self, sentences):
scores = []
p = self.network.predict_sentences(sentences)
for i in range(0, len(sentences)):
positive = p[0][i][0]
scores.append(convert_scale(positive))
return scores

# the following in this class all have to do with entity sentiment
# we need to make sure it is serializable to json (i.e. beware of float32)
def get_entity_sentiment(self, document):
""" Create a dict of every entities with their associated sentiment """
print('Parsing Document...')
self.parse_doc(document)
print('Done!')
print('Done Parsing Document!')
self.init_dict()
#sentences = [sentence.encode('utf-8') for _, sentence in self.contexts]
sentences = [sentence for _, sentence in self.contexts]
print('Predicting!')
predictions = self.network.predict_sentences(sentences)

for i, c in enumerate(self.contexts):
@@ -167,30 +147,71 @@ def get_entity_sentiment(self, document):
self.entities[key] = (predictions[0][i][0] - predictions[0][i][1])

for e in self.entities.keys():
# conversion for json purposes
self.entities[e] = str(self.entities[e])
print('Entity: %s -- sentiment: %s' % (e, self.entities[e]))

return self.entities

def evaluate_single_document(self, document, mode):
if mode == 'document':
document = document[0:1000]
p = self.network.predict_sentences([document])
positive = p[0][0][0]
return [convert_scale(positive)]
elif mode == 'sentence':
return self.evaluate_sentences(sent_tokenize(document))
elif mode == 'entity':
return self.get_entity_sentiment(document)
else:
return ['UNKNOWN MODE']
def parse_doc(self, document):
""" Extract relevant entities in a document """
print('Tokenizing sentences...')
# why are we mixing nlp pipelines here?
#nltk
sentences = sent_tokenize(document)
print('Done Sentence Tokenize!')
# Context of all named entities
ne_context = []
for sentence in sentences:
# change pronouns to their respective nouns
print('Anaphora resolution for sentence: %s' % sentence)
(output, modified_sentence) = self.coreference_resolution(sentence)
tree = self.parse_sentence(output, modified_sentence)
print('Done Anaphora Resolution!')

# get context for each noun
print('Named Entity Clustering:')
context = get_subtrees(tree)
for n, s in context:
print('%s' % s)
ne_context.append(context)
self.contexts = flatten(ne_context)

def coreference_resolution(self, sentence):
# coreference resolution
# corenlp
print('Starting document annotation for ' + sentence)
output_string = self.nlp.annotate(sentence)
print('Done document annotation')
output = json.loads(output_string)
coreferences = output['corefs']
entity_keys = coreferences.keys()

def evaluate_sentences(self, sentences):
scores = []
p = self.network.predict_sentences(sentences)
for i in range(0, len(sentences)):
positive = p[0][i][0]
scores.append(convert_scale(positive))
return scores
tokens = word_tokenize(sentence)

for k in entity_keys:
# skip non PERSON NP
if coreferences[k][0]['gender'] == 'MALE' or coreferences[k][0]['gender'] == 'FEMALE':
rep_mention, pos = get_rep_mention(coreferences[k])
for reference in coreferences[k]:
if not reference['isRepresentativeMention']:
start, end = reference['startIndex'] - 1, reference['headIndex'] - 1
if start == end:
tokens[start] = rep_mention
else:
tokens[start] = rep_mention
del tokens[start + 1: end]

sentence = ' '.join(tokens)
print('Ending coref function')
return (output, sentence.encode('utf-8'))

def parse_sentence(self, output, sentence):
""" sentence --> named-entity chunked tree """
try:
return Tree.fromstring(output['sentences'][0]['parse'])
except TypeError as e:
import pdb; pdb.set_trace()

side_effect = []

@@ -206,18 +227,22 @@ def fetch_files(directory):

if __name__ == '__main__':
cls = CharLSTMSentiment()
cls.config('document', Config.StagingConfig())
config_item = Config.DevelopmentConfig
cls.config(config_item, getCoreNlpInstance(config_item))
document = 'Bob talked with the great ruler John yesterday. John mentioned how horrible Tesla is. The nefarious Bob agreed.'

print('Fetching files')
filelines = fetch_files('input/train')
filelines = fetch_files('input/test')

print(len(filelines))

limit_files_to = 10

for i in range(0, len(filelines)):
if i == limit_files_to:
break
print(i)
fileline = filelines[i]
document = '\n'.join(fileline)
result = cls.evaluate_single_document(document)
result = cls.evaluate_single_document(document, 'entity')
print(result)
cls.network.close()
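The str() conversion in get_entity_sentiment above is what keeps the entity output JSON-friendly: as the comment in the class warns, the predictions come back as float32, which json.dumps rejects. A small illustration with made-up scores:

import json
import numpy as np

scores = {'John': np.float32(0.42), 'Tesla': np.float32(-0.87)}
# json.dumps(scores) raises TypeError: float32 values are not JSON serializable
serializable = {entity: str(score) for entity, score in scores.items()}
print(json.dumps(serializable))  # {"John": "0.42", "Tesla": "-0.87"}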
13 changes: 7 additions & 6 deletions Entry.py
@@ -41,11 +41,6 @@
else:
raise ValueError('Invalid environment name ' + env)

stanford_sentiment.config(sent_config)
google_sentiment.config()
aylien_sentiment.config()
char_lstm_sentiment.config(sent_config)

def init():
print('Loading Spacy Vectors')
global nlp, sa
@@ -55,6 +50,11 @@ def init():

init()

stanford_sentiment.config(sent_config)
google_sentiment.config()
aylien_sentiment.config()
char_lstm_sentiment.config(sent_config, stanford_sentiment.nlp)

@application.route("/spacy", methods = ['GET', 'POST'])
def get_spacy_sentiment():
if request.method == 'GET':
@@ -166,7 +166,8 @@ def get_char_lstm_sentiment():
mode = request.form['mode']
else:
return ('Unknown method!!!')
return json.dumps(compute_lstm_sentiment(text, mode))
result = compute_lstm_sentiment(text, mode)
return json.dumps(result)

def compute_lstm_sentiment(text, mode):
return char_lstm_sentiment.evaluate_single_document(text, mode)
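Passing stanford_sentiment.nlp into char_lstm_sentiment.config is the single-pipeline part of this change: one server connection now backs both services, and the entity path json.loads the string the wrapper returns. A rough sketch of that flow, with a placeholder host/port and an example sentence:

import json
from stanfordcorenlp import StanfordCoreNLP

# one connection shared by the sentiment and entity endpoints
nlp = StanfordCoreNLP('http://localhost', port=9000)
props = {'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,coref,sentiment',
         'pipelineLanguage': 'en', 'outputFormat': 'json'}

# stanfordcorenlp returns the server response as a string, so parse it explicitly
output = json.loads(nlp.annotate('John mentioned how horrible Tesla is.', properties=props))
print(list(output['corefs'].keys()))
nlp.close()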
3 changes: 2 additions & 1 deletion StanfordSentiment.py
@@ -24,7 +24,8 @@ def convert_scale(original):
class StanfordSentiment(object):

def __init__(self):
self.props={'annotators': 'tokenize,ssplit,pos,parse,sentiment',
# we do more than is necessary because we need coref for the CharLSTM service
self.props={'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,coref,sentiment',
'pipelineLanguage':'en',
'outputFormat':'json',
'parse.model':'edu/stanford/nlp/models/srparser/englishSR.ser.gz',
1 change: 1 addition & 0 deletions input/test/Test1.txt
@@ -0,0 +1 @@
,<br />Aug. 24, 2018<br />/PRNewswire/ -- US demand for fabricated metal products is forecast to rise 2.7% per annum through 2022, according to<br />Fabricated Metal Products:<br />United States<br />, a report recently released by Freedonia Focus Reports. Suppliers will benefit from rising domestic durable goods shipments and continued growth in the US construction sector and tariffs on imports of foreign steel and aluminum, as well as fabricated metal products. However, further gains will be limited by ongoing competition from metal castings and alternative materials such as plastics.<br />More information about the report is available at:<br />Demand for structural metals – the largest segment – is expected to see increases through 2022. Expansion in commercial building and nonbuilding construction expenditures will drive gains. In addition, rising prices due to tariff protection will help boost demand in value terms.<br />These and other key insights are featured in<br />Fabricated Metal Products:<br />United States<br />. This report forecasts to 2022 US fabricated metal products demand and shipments in nominal US dollars at the manufacturer level. Total demand is segmented by product in terms of:<br />structural metals<br /
8 changes: 8 additions & 0 deletions input/test/Test10.txt
@@ -0,0 +1,8 @@
Tweet
RR Donnelley & Sons Co (NYSE:RRD)'s share price rose 5% on Thursday . The stock traded as high as $5.06 and last traded at $5.04. Approximately 1,323,300 shares were traded during mid-day trading, an increase of 27% from the average daily volume of 1,043,013 shares. The stock had previously closed at $4.80.
RRD has been the topic of several research reports. ValuEngine downgraded RR Donnelley & Sons from a "buy" rating to a "hold" rating in a research report on Wednesday, May 2nd. Buckingham Research initiated coverage on RR Donnelley & Sons in a research report on Monday, June 25th. They issued a "neutral" rating and a $8.00 target price on the stock. Get RR Donnelley & Sons alerts:
The stock has a market cap of $350.59 million, a price-to-earnings ratio of 4.20 and a beta of 1.23. The company has a quick ratio of 1.25, a current ratio of 1.48 and a debt-to-equity ratio of -8.22. RR Donnelley & Sons (NYSE:RRD) last released its earnings results on Wednesday, August 1st. The business services provider reported ($0.09) earnings per share for the quarter, missing the consensus estimate of ($0.04) by ($0.05). The firm had revenue of $1.68 billion during the quarter, compared to analysts' expectations of $1.64 billion. RR Donnelley & Sons had a negative net margin of 1.18% and a negative return on equity of 32.28%. The business's revenue was up 3.7% compared to the same quarter last year. During the same quarter last year, the business earned ($0.06) EPS. analysts forecast that RR Donnelley & Sons Co will post 0.95 earnings per share for the current fiscal year.
The company also recently announced a quarterly dividend, which will be paid on Tuesday, September 4th. Stockholders of record on Wednesday, August 15th will be given a $0.03 dividend. The ex-dividend date is Tuesday, August 14th. This represents a $0.12 annualized dividend and a dividend yield of 2.38%. RR Donnelley & Sons's dividend payout ratio (DPR) is presently 10.00%.
Several hedge funds and other institutional investors have recently added to or reduced their stakes in RRD. Towle & Co. lifted its position in RR Donnelley & Sons by 32.1% during the second quarter. Towle & Co. now owns 5,755,856 shares of the business services provider's stock worth $33,154,000 after buying an additional 1,398,106 shares in the last quarter. BlackRock Inc. lifted its position in RR Donnelley & Sons by 13.1% during the second quarter. BlackRock Inc. now owns 10,767,877 shares of the business services provider's stock worth $62,023,000 after buying an additional 1,250,568 shares in the last quarter. Dimensional Fund Advisors LP purchased a new position in RR Donnelley & Sons during the second quarter worth approximately $4,454,000. Millennium Management LLC lifted its position in RR Donnelley & Sons by 279.8% during the first quarter. Millennium Management LLC now owns 799,078 shares of the business services provider's stock worth $6,976,000 after buying an additional 588,656 shares in the last quarter. Finally, Advisors Asset Management Inc. lifted its position in RR Donnelley & Sons by 859.8% during the second quarter. Advisors Asset Management Inc. now owns 530,891 shares of the business services provider's stock worth $245,000 after buying an additional 475,578 shares in the last quarter. Institutional investors and hedge funds own 82.81% of the company's stock.
About RR Donnelley & Sons ( NYSE:RRD )
R.R. Donnelley & Sons Company, an integrated communications company, enables organizations to create, manage, deliver, and optimize their multichannel marketing and business communications. The company operates through Variable Print, Strategic Services, and International segments. It offers commercial and digital print, direct mail, statement printing, logistics, sourcing, and digital and creative services, as well as produces and sells labels, forms, educational testing materials, inserts, and books