-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathlemmatizer_template.py
33 lines (26 loc) · 1.04 KB
/
lemmatizer_template.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from spacy import load
# Load the spaCy pipeline that lemmatize() runs text through.
# 'language_pack' is a placeholder: replace it with the name of one of the
# models listed below.
language = load('language_pack')
# Available languages (each model must be downloaded before it can be loaded).
# Format is "Language: download command"; the name to pass to load() is the
# last word of the command, e.g. en_core_web_sm.
# English: python -m spacy download en_core_web_sm
# German: python -m spacy download de_core_news_sm
# French: python -m spacy download fr_core_news_sm
# Spanish: python -m spacy download es_core_news_sm
# Portuguese: python -m spacy download pt_core_news_sm
# Italian: python -m spacy download it_core_news_sm
# Dutch: python -m spacy download nl_core_news_sm
# Greek: python -m spacy download el_core_news_sm
def file_to_string(file, encoding='utf-8'):
    """Return the entire text content of a file as one string.

    Args:
        file: Path of the file to read (anything accepted by ``open``).
        encoding: Text encoding used to decode the file. Defaults to
            ``'utf-8'``, the value that was previously hard-coded.

    Returns:
        The decoded content of the file.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the content is not valid in ``encoding``.
    """
    # Keep the handle under its own name so the ``file`` argument is never
    # rebound to a different kind of object.
    with open(file, 'r', encoding=encoding) as handle:
        return handle.read()
def lemmatize(sent):
    """Yield one string per token of a text, in document order.

    The text is processed by the module-level spaCy pipeline ``language``.

    Args:
        sent: Text to lemmatize.

    Yields:
        The token's lemma, or the token's original text when its lemma is
        the placeholder ``'-PRON-'``.
    """
    for token in language(sent):
        # NOTE(review): '-PRON-' looks like the generic pronoun lemma emitted
        # by older spaCy models; confirm against the spaCy version in use.
        # Yield the token's text rather than the Token object itself so that
        # every item is a str; str(token) gives the same text, so callers that
        # stringify the results see no difference.
        yield token.text if token.lemma_ == '-PRON-' else token.lemma_
if __name__ == '__main__':
    # Read the input file, lemmatize its content, and collect each yielded
    # item as a string.
    sent = list(map(str, lemmatize(file_to_string('file_to_lemmatize'))))
    # Optional: uncomment the next line to print the lemmas separated by spaces.
    # print('Lemmas:', '\n', ' '.join(sent))