-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy path: parse.py
115 lines (94 loc) · 3.08 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/python3
import string
import os
import sys
import getopt
from decimal import Decimal
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from os import path
# Base directory for resolving input/output paths: the directory containing
# this script, falling back to the current working directory when __file__
# is absent (e.g. interactive/embedded execution).
# NOTE(review): this shadows the builtin dir() for the rest of the module.
dir = path.dirname(__file__) if "__file__" in locals() else os.getcwd()
def calculateCompoundRank(compound):
    """Map a sentiment compound score to an integer rank in [-4, 4].

    Scores are bucketed in 0.25-wide bands: rank 4 for scores strictly
    above 0.75, down to rank -4 for scores at or below -0.75. Exactly 0
    maps to rank 0, and band boundaries belong to the lower rank
    (e.g. 0.75 -> 3), matching the original strict-greater comparisons.

    :param compound: anything the Decimal constructor accepts
        (str, int, float, Decimal).
    :return: int rank between -4 and 4 inclusive.
    """
    decimalCompound = Decimal(compound)
    if decimalCompound == 0:
        return 0
    # All thresholds are exact binary fractions, so constructing them from
    # strings or floats is equivalent; the comparison chain below mirrors
    # the original compare(...) == 1 (i.e. strictly greater) logic.
    thresholds = (
        (Decimal("0.75"), 4),
        (Decimal("0.5"), 3),
        (Decimal("0.25"), 2),
        (Decimal("0"), 1),
        (Decimal("-0.25"), -1),
        (Decimal("-0.5"), -2),
        (Decimal("-0.75"), -3),
    )
    for threshold, rank in thresholds:
        if decimalCompound > threshold:
            return rank
    # At or below -0.75 (also any non-comparable value such as NaN).
    return -4
def createTokenizedFile(input, root=False):
    """Tokenize a Spanish text file and write the cleaned words to output/.

    Pipeline (see https://machinelearningmastery.com/clean-text-machine-learning-python/):
    tokenize, lowercase, optionally stem with Porter, strip punctuation,
    keep only alphabetic tokens, drop Spanish stopwords. The result is
    written space-separated to "<basename>.txt" (or "<basename>-root.txt"
    when root=True) inside the module-level "output" directory.

    :param input: input file path, resolved relative to the script directory.
    :param root: when True, reduce each word to its linguistic root (stem).
    :return: the path of the written output file.
    """
    inputFilename = path.join(dir, input)
    basename = path.basename(path.splitext(input)[0])
    # Stemmed output gets a distinct suffix so both variants can coexist.
    suffix = "-root.txt" if root else ".txt"
    outputFilename = path.join(dir, "output", basename + suffix)
    # with-block guarantees the handle is closed even if read() raises.
    with open(inputFilename, "r") as inputFile:
        text = inputFile.read()
    # 1. Split into tokens and lowercase them.
    tokens = [w.lower() for w in word_tokenize(text)]
    # 2. Optionally reduce each word to its linguistic root.
    if root:
        porter = PorterStemmer()
        tokens = [porter.stem(word) for word in tokens]
    # 3. Remove punctuation, then drop anything non-alphabetic.
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    words = [word for word in stripped if word.isalpha()]
    # 4. Filter Spanish stopwords (set for O(1) membership tests).
    stop_words = set(stopwords.words('spanish'))
    words = [w for w in words if w not in stop_words]
    # Fix: the original never closed the output handle (resource leak);
    # also "w" suffices — the original "w+" read capability was unused.
    with open(outputFilename, "w") as outputFile:
        outputFile.write(" ".join(words))
    return outputFilename
def printCmd():
    """Print the command-line usage string for this script."""
    sys.stdout.write('parse.py -i <input> --root' + '\n')
def main(argv):
    """Parse command-line options and run the tokenizer.

    Options:
      -h            print usage and exit
      -i / --input  input file path (required)
      -r / --root   also stem each word to its linguistic root

    :param argv: argument list, typically sys.argv[1:].
    :return: the path of the generated output file.
    """
    # Local renamed from `input` to avoid shadowing the builtin.
    inputPath = ''
    root = False
    try:
        opts, args = getopt.getopt(argv, "hi:r", [
            "input=",
            "root",
        ])
    except getopt.GetoptError:
        printCmd()
        sys.exit(2)
    if not opts:
        printCmd()
        sys.exit()
    for opt, arg in opts:
        if opt == '-h':
            printCmd()
            sys.exit()
        elif opt in ("-i", "--input"):
            inputPath = arg.strip()
        elif opt in ('-r', '--root'):
            root = True
    # Fix: the original passed an empty path to createTokenizedFile when
    # -i/--input was omitted; fail fast with the usage message instead.
    if not inputPath:
        printCmd()
        sys.exit(2)
    return createTokenizedFile(inputPath, root)
# Script entry point: forward the CLI arguments (minus the program name)
# to main() only when executed directly, not when imported.
if __name__ == "__main__":
    main(sys.argv[1:])