fixes #5: We eliminate multiple pipelines and adjust the output of the entity call so that JSON can deserialize the results properly.

I tested on a smaller document and it was not too slow at all. The key is to start CoreNLP (which is a prerequisite for CharLSTM) with enough memory, perhaps 8g.
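For reference, a minimal sketch of bringing up CoreNLP with an 8g heap through the stanfordcorenlp wrapper used below; the install path, port, and timeout are placeholders, not values from this repo:

from stanfordcorenlp import StanfordCoreNLP

# pointing the wrapper at a local CoreNLP directory makes it launch the Java
# server itself with the requested heap; an http:// host would instead connect
# to a server that is already running
nlp = StanfordCoreNLP(r'/opt/stanford-corenlp-full-2018-10-05',
                      port=9000, memory='8g', timeout=60000)
print(nlp.annotate('CoreNLP is up.',
                   properties={'annotators': 'tokenize,ssplit', 'outputFormat': 'json'}))
nlp.close()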
Greg Werner committed Jan 17, 2019
1 parent c7c0a2d commit 44048ed
Showing 5,146 changed files with 55,226 additions and 93 deletions.
197 changes: 111 additions & 86 deletions CharLSTMSentiment.py
@@ -7,17 +7,32 @@
'''

import Config
import json
from lib_model.bidirectional_lstm import LSTM
import logging
import nltk
from nltk import Tree
from nltk.tokenize import sent_tokenize, word_tokenize
import os
from os import listdir
from os.path import isfile, join
from pycorenlp import StanfordCoreNLP
from queue import Queue
from stanfordcorenlp import StanfordCoreNLP

nltk.download('punkt')

# for testing only! For deployment, use the server created in Entry => StanfordSentiment
def getCoreNlpInstance(config_item):
# we don't need sentiment here, but the Stanford annotator pipeline requires it
props={'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,coref,sentiment',
'pipelineLanguage':'en',
'outputFormat':'json',
'parse.model':'edu/stanford/nlp/models/srparser/englishSR.ser.gz',
'sentiment.model': os.path.realpath(__file__) + '/../model/stanford/model-0000-70.74.ser.gz'
}
# we do not provide the same level of recovery as in StanfordSentiment. Please manually start your server first
return StanfordCoreNLP(config_item.STANFORD_SERVER, port=config_item.STANFORD_PORT, logging_level=logging.DEBUG, max_retries=5, memory='8g')

def convert_scale(positive):
return 2 * positive - 1

@@ -70,21 +85,16 @@ def get_subtrees(tree):
subtrees.append((noun, sentence))
return subtrees


class CharLSTMSentiment(object):

def __init__(self):
self.network = LSTM()
self.network.build()
self.server_on = False

def config(self, config):
try:
self.nlp = StanfordCoreNLP(config.STANFORD_SERVER + ':' + str(config.STANFORD_PORT))
self.server_on = True
except Exception as e:
print('Stanford server could not be found')
print(e)
def config(self, config, nlp):
self.nlp = nlp
self.server_on = True

def init_dict(self):
local_dict = {}
@@ -93,69 +103,39 @@ def init_dict(self):
local_dict[k] = None
self.entities = local_dict

def parse_sentence(self, sentence):
""" sentence --> named-entity chunked tree """
try:
output = self.nlp.annotate(sentence.decode('utf-8'), properties={'annotators': 'tokenize, ssplit, pos,'
' lemma, ner, parse',
'outputFormat': 'json'})
# print_tree(output)
return Tree.fromstring(output['sentences'][0]['parse'])
except TypeError as e:
import pdb; pdb.set_trace()

def coreference_resolution(self, sentence):
# coreference resolution
output = self.nlp.annotate(sentence, properties={'annotators': 'coref',
'outputFormat': 'json'})
tokens = word_tokenize(sentence)
coreferences = output['corefs']
entity_keys = coreferences.keys()
for k in entity_keys:
# skip non PERSON NP
if coreferences[k][0]['gender'] == 'MALE' or coreferences[k][0]['gender'] == 'FEMALE':
rep_mention, pos = get_rep_mention(coreferences[k])
for reference in coreferences[k]:
if not reference['isRepresentativeMention']:
start, end = reference['startIndex'] - 1, reference['headIndex'] - 1
if start == end:
tokens[start] = rep_mention
else:
tokens[start] = rep_mention
del tokens[start + 1: end]

sentence = ' '.join(tokens)
return sentence.encode('utf-8')
def evaluate_single_document(self, document, mode):
if mode == 'document':
document = document[0:1000]
p = self.network.predict_sentences([document])
positive = p[0][0][0]
return [convert_scale(positive)]
elif mode == 'sentence':
return self.evaluate_sentences(sent_tokenize(document))
elif mode == 'entity':
return self.get_entity_sentiment(document)
else:
return ['UNKNOWN MODE']

def parse_doc(self, document):
""" Extract relevant entities in a document """
print('Tokenizing sentences...')
sentences = sent_tokenize(document)
print('Done!')
# Context of all named entities
ne_context = []
for sentence in sentences:
# change pronouns to their respective nouns
print('Anaphora resolution for sentence: %s' % sentence)
tree = self.parse_sentence(self.coreference_resolution(sentence))
print('Done!')

# get context for each noun
print('Named Entity Clustering:')
context = get_subtrees(tree)
for n, s in context:
print('%s' % s)
ne_context.append(context)
self.contexts = flatten(ne_context)
# sentence sentiment function
def evaluate_sentences(self, sentences):
scores = []
p = self.network.predict_sentences(sentences)
for i in range(0, len(sentences)):
positive = p[0][i][0]
scores.append(convert_scale(positive))
return scores

# the following in this class all have to do with entity sentiment
# we need to make sure it is serializable to json (i.e. beware of float32)
def get_entity_sentiment(self, document):
""" Create a dict of every entities with their associated sentiment """
print('Parsing Document...')
self.parse_doc(document)
print('Done!')
print('Done Parsing Document!')
self.init_dict()
#sentences = [sentence.encode('utf-8') for _, sentence in self.contexts]
sentences = [sentence for _, sentence in self.contexts]
print('Predicting!')
predictions = self.network.predict_sentences(sentences)

for i, c in enumerate(self.contexts):
@@ -167,30 +147,71 @@ def get_entity_sentiment(self, document):
self.entities[key] = (predictions[0][i][0] - predictions[0][i][1])

for e in self.entities.keys():
# conversion for json purposes
self.entities[e] = str(self.entities[e])
print('Entity: %s -- sentiment: %s' % (e, self.entities[e]))

return self.entities

def evaluate_single_document(self, document, mode):
if mode == 'document':
document = document[0:1000]
p = self.network.predict_sentences([document])
positive = p[0][0][0]
return [convert_scale(positive)]
elif mode == 'sentence':
return self.evaluate_sentences(sent_tokenize(document))
elif mode == 'entity':
return self.get_entity_sentiment(document)
else:
return ['UNKNOWN MODE']
def parse_doc(self, document):
""" Extract relevant entities in a document """
print('Tokenizing sentences...')
# why are we mixing nlp pipelines here?
#nltk
sentences = sent_tokenize(document)
print('Done Sentence Tokenize!')
# Context of all named entities
ne_context = []
for sentence in sentences:
# change pronouns to their respective nouns
print('Anaphora resolution for sentence: %s' % sentence)
(output, modified_sentence) = self.coreference_resolution(sentence)
tree = self.parse_sentence(output, modified_sentence)
print('Done Anaphora Resolution!')

# get context for each noun
print('Named Entity Clustering:')
context = get_subtrees(tree)
for n, s in context:
print('%s' % s)
ne_context.append(context)
self.contexts = flatten(ne_context)

def coreference_resolution(self, sentence):
# coreference resolution
# corenlp
print('Starting document annotation for ' + sentence)
output_string = self.nlp.annotate(sentence)
print('Done document annotation')
output = json.loads(output_string)
coreferences = output['corefs']
entity_keys = coreferences.keys()

def evaluate_sentences(self, sentences):
scores = []
p = self.network.predict_sentences(sentences)
for i in range(0, len(sentences)):
positive = p[0][i][0]
scores.append(convert_scale(positive))
return scores
tokens = word_tokenize(sentence)

for k in entity_keys:
# skip non PERSON NP
if coreferences[k][0]['gender'] == 'MALE' or coreferences[k][0]['gender'] == 'FEMALE':
rep_mention, pos = get_rep_mention(coreferences[k])
for reference in coreferences[k]:
if not reference['isRepresentativeMention']:
start, end = reference['startIndex'] - 1, reference['headIndex'] - 1
if start == end:
tokens[start] = rep_mention
else:
tokens[start] = rep_mention
del tokens[start + 1: end]

sentence = ' '.join(tokens)
print('Ending coref function')
return (output, sentence.encode('utf-8'))

def parse_sentence(self, output, sentence):
""" sentence --> named-entity chunked tree """
try:
return Tree.fromstring(output['sentences'][0]['parse'])
except TypeError as e:
import pdb; pdb.set_trace()

side_effect = []

@@ -206,18 +227,22 @@ def fetch_files(directory):

if __name__ == '__main__':
cls = CharLSTMSentiment()
cls.config('document', Config.StagingConfig())
config_item = Config.DevelopmentConfig
cls.config(config_item, getCoreNlpInstance(config_item))
document = 'Bob talked with the great ruler John yesterday. John mentioned how horrible Tesla is. The nefarious Bob agreed.'

print('Fetching files')
filelines = fetch_files('input/train')
filelines = fetch_files('input/test')

print(len(filelines))

limit_files_to = 10

for i in range(0, len(filelines)):
if i == limit_files_to:
break
print(i)
fileline = filelines[i]
document = '\n'.join(fileline)
result = cls.evaluate_single_document(document)
result = cls.evaluate_single_document(document, 'entity')
print(result)
cls.network.close()
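The str() conversion in get_entity_sentiment above is what keeps the entity output JSON-friendly: as the comment in the class warns, the predictions come back as float32, which json.dumps rejects. A small illustration with made-up scores:

import json
import numpy as np

scores = {'John': np.float32(0.42), 'Tesla': np.float32(-0.87)}
# json.dumps(scores) raises TypeError: float32 values are not JSON serializable
serializable = {entity: str(score) for entity, score in scores.items()}
print(json.dumps(serializable))  # {"John": "0.42", "Tesla": "-0.87"}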
13 changes: 7 additions & 6 deletions Entry.py
@@ -41,11 +41,6 @@
else:
raise ValueError('Invalid environment name ' + env)

stanford_sentiment.config(sent_config)
google_sentiment.config()
aylien_sentiment.config()
char_lstm_sentiment.config(sent_config)

def init():
print('Loading Spacy Vectors')
global nlp, sa
@@ -55,6 +50,11 @@ def init():

init()

stanford_sentiment.config(sent_config)
google_sentiment.config()
aylien_sentiment.config()
char_lstm_sentiment.config(sent_config, stanford_sentiment.nlp)

@application.route("/spacy", methods = ['GET', 'POST'])
def get_spacy_sentiment():
if request.method == 'GET':
@@ -166,7 +166,8 @@ def get_char_lstm_sentiment():
mode = request.form['mode']
else:
return ('Unknown method!!!')
return json.dumps(compute_lstm_sentiment(text, mode))
result = compute_lstm_sentiment(text, mode)
return json.dumps(result)

def compute_lstm_sentiment(text, mode):
return char_lstm_sentiment.evaluate_single_document(text, mode)
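Passing stanford_sentiment.nlp into char_lstm_sentiment.config is the single-pipeline part of this change: one server connection now backs both services, and the entity path json.loads the string the wrapper returns. A rough sketch of that flow, with a placeholder host/port and an example sentence:

import json
from stanfordcorenlp import StanfordCoreNLP

# one connection shared by the sentiment and entity endpoints
nlp = StanfordCoreNLP('http://localhost', port=9000)
props = {'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,coref,sentiment',
         'pipelineLanguage': 'en', 'outputFormat': 'json'}

# stanfordcorenlp returns the server response as a string, so parse it explicitly
output = json.loads(nlp.annotate('John mentioned how horrible Tesla is.', properties=props))
print(list(output['corefs'].keys()))
nlp.close()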
3 changes: 2 additions & 1 deletion StanfordSentiment.py
@@ -24,7 +24,8 @@ def convert_scale(original):
class StanfordSentiment(object):

def __init__(self):
self.props={'annotators': 'tokenize,ssplit,pos,parse,sentiment',
# we do more than is necessary because we need coref for the CharLSTM service
self.props={'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,coref,sentiment',
'pipelineLanguage':'en',
'outputFormat':'json',
'parse.model':'edu/stanford/nlp/models/srparser/englishSR.ser.gz',
1 change: 1 addition & 0 deletions input/test/Test1.txt
@@ -0,0 +1 @@
,<br />Aug. 24, 2018<br />/PRNewswire/ -- US demand for fabricated metal products is forecast to rise 2.7% per annum through 2022, according to<br />Fabricated Metal Products:<br />United States<br />, a report recently released by Freedonia Focus Reports. Suppliers will benefit from rising domestic durable goods shipments and continued growth in the US construction sector and tariffs on imports of foreign steel and aluminum, as well as fabricated metal products. However, further gains will be limited by ongoing competition from metal castings and alternative materials such as plastics.<br />More information about the report is available at:<br />Demand for structural metals – the largest segment – is expected to see increases through 2022. Expansion in commercial building and nonbuilding construction expenditures will drive gains. In addition, rising prices due to tariff protection will help boost demand in value terms.<br />These and other key insights are featured in<br />Fabricated Metal Products:<br />United States<br />. This report forecasts to 2022 US fabricated metal products demand and shipments in nominal US dollars at the manufacturer level. Total demand is segmented by product in terms of:<br />structural metals<br /
8 changes: 8 additions & 0 deletions input/test/Test10.txt
@@ -0,0 +1,8 @@
Tweet
RR Donnelley & Sons Co (NYSE:RRD)'s share price rose 5% on Thursday . The stock traded as high as $5.06 and last traded at $5.04. Approximately 1,323,300 shares were traded during mid-day trading, an increase of 27% from the average daily volume of 1,043,013 shares. The stock had previously closed at $4.80.
RRD has been the topic of several research reports. ValuEngine downgraded RR Donnelley & Sons from a "buy" rating to a "hold" rating in a research report on Wednesday, May 2nd. Buckingham Research initiated coverage on RR Donnelley & Sons in a research report on Monday, June 25th. They issued a "neutral" rating and a $8.00 target price on the stock. Get RR Donnelley & Sons alerts:
The stock has a market cap of $350.59 million, a price-to-earnings ratio of 4.20 and a beta of 1.23. The company has a quick ratio of 1.25, a current ratio of 1.48 and a debt-to-equity ratio of -8.22. RR Donnelley & Sons (NYSE:RRD) last released its earnings results on Wednesday, August 1st. The business services provider reported ($0.09) earnings per share for the quarter, missing the consensus estimate of ($0.04) by ($0.05). The firm had revenue of $1.68 billion during the quarter, compared to analysts' expectations of $1.64 billion. RR Donnelley & Sons had a negative net margin of 1.18% and a negative return on equity of 32.28%. The business's revenue was up 3.7% compared to the same quarter last year. During the same quarter last year, the business earned ($0.06) EPS. analysts forecast that RR Donnelley & Sons Co will post 0.95 earnings per share for the current fiscal year.
The company also recently announced a quarterly dividend, which will be paid on Tuesday, September 4th. Stockholders of record on Wednesday, August 15th will be given a $0.03 dividend. The ex-dividend date is Tuesday, August 14th. This represents a $0.12 annualized dividend and a dividend yield of 2.38%. RR Donnelley & Sons's dividend payout ratio (DPR) is presently 10.00%.
Several hedge funds and other institutional investors have recently added to or reduced their stakes in RRD. Towle & Co. lifted its position in RR Donnelley & Sons by 32.1% during the second quarter. Towle & Co. now owns 5,755,856 shares of the business services provider's stock worth $33,154,000 after buying an additional 1,398,106 shares in the last quarter. BlackRock Inc. lifted its position in RR Donnelley & Sons by 13.1% during the second quarter. BlackRock Inc. now owns 10,767,877 shares of the business services provider's stock worth $62,023,000 after buying an additional 1,250,568 shares in the last quarter. Dimensional Fund Advisors LP purchased a new position in RR Donnelley & Sons during the second quarter worth approximately $4,454,000. Millennium Management LLC lifted its position in RR Donnelley & Sons by 279.8% during the first quarter. Millennium Management LLC now owns 799,078 shares of the business services provider's stock worth $6,976,000 after buying an additional 588,656 shares in the last quarter. Finally, Advisors Asset Management Inc. lifted its position in RR Donnelley & Sons by 859.8% during the second quarter. Advisors Asset Management Inc. now owns 530,891 shares of the business services provider's stock worth $245,000 after buying an additional 475,578 shares in the last quarter. Institutional investors and hedge funds own 82.81% of the company's stock.
About RR Donnelley & Sons ( NYSE:RRD )
R.R. Donnelley & Sons Company, an integrated communications company, enables organizations to create, manage, deliver, and optimize their multichannel marketing and business communications. The company operates through Variable Print, Strategic Services, and International segments. It offers commercial and digital print, direct mail, statement printing, logistics, sourcing, and digital and creative services, as well as produces and sells labels, forms, educational testing materials, inserts, and books