# example.py (forked from silverasm/stanford-corenlp-python)
from nltk.tokenize import sent_tokenize
from corenlp import StanfordCoreNLP

# The directory in which the Stanford CoreNLP .jar files are located -- you
# have to download these from the Stanford NLP website.
CORE_NLP_DIR = "stanford-corenlp-dir/"
PARSER = StanfordCoreNLP(CORE_NLP_DIR)


def write_parse_products(parse):
    words = parse['words']
    word_objects = []
    text = ""
    for i, word_info in enumerate(words):
        properties = word_info[1]
        token = word_info[0].lower().strip()
        surface = word_info[0].strip()
        pos = properties['PartOfSpeech']
        # Reconstruct the original spacing between tokens from the
        # character offsets reported by CoreNLP.
        space_before = ""
        if i > 0:
            after_previous_word = int(words[i-1][1]['CharacterOffsetEnd'])
            space_before = " " * (int(properties['CharacterOffsetBegin']) -
                                  after_previous_word)
        text += space_before + surface
        # Record each word so the dependency loop below can look it up by
        # index. A plain dict is a minimal stand-in here; the structure the
        # original file used is not shown in this snippet.
        word_objects.append({'token': token, 'surface': surface, 'pos': pos})
    # Normalize PTB-style tokens back to plain characters.
    raw_sentence = (text.replace("-LRB-", "(").replace("-RRB-", ")")
                        .replace("``", "\"").replace("''", "\""))
    for dependency_info in parse['dependencies']:
        relation_name = dependency_info[0]
        gov_index = int(dependency_info[2]) - 1  # indices are 1-based
        gov = word_objects[gov_index]
        dep_index = int(dependency_info[4]) - 1
        dep = word_objects[dep_index]


in_file = "sentences.txt"
with open(in_file, 'r') as f:
    text = f.read()
# sent_tokenize needs NLTK's "punkt" model; run nltk.download('punkt') once
# if it is not installed.
sentences = sent_tokenize(text)  # Break the text into sentences.
for i, sentence in enumerate(sentences):
    try:
        parse = PARSER.raw_parse(sentence)
        if i % 50 == 0:
            print "Parsed sentence %d of %d" % (i, len(sentences))
        write_parse_products(parse['sentences'][0])
    except Exception:
        print "Error on sentence:\n\t" + sentence + "\n"
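
# For reference, a minimal sketch of the return value this script assumes
# PARSER.raw_parse produces. The shape is inferred from the indexing in
# write_parse_products above; the concrete field values are illustrative,
# not taken from the library:
#
#   {'sentences': [
#       {'words': [('The', {'PartOfSpeech': 'DT',
#                           'CharacterOffsetBegin': '0',
#                           'CharacterOffsetEnd': '3'}),
#                  ...],
#        'dependencies': [('det', 'dog', '2', 'The', '1'), ...]}
#   ]}
#
# Each dependency is read as a 5-tuple: (relation, governor word,
# governor index, dependent word, dependent index), with 1-based indices.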