-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfoo.py
60 lines (52 loc) · 1.78 KB
/
foo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/usr/bin/python
import psycopg2, os
from nltk.corpus import wordnet as wn
from nltk import pos_tag as pos
from pdb import set_trace as debug
'''
This file parses each corpus by
1) Stemming words, reducing
them to their root lexemes
2) Removing stop words
3) Tagging the part of speech of each lexeme
'''
def pg_connect(f):
    """Call ``f`` with a cursor on the ``ml`` database and return its result.

    Although applied with decorator syntax in this file, this does not build
    a wrapper: ``f`` is invoked immediately, so the decorated name ends up
    bound to whatever ``f`` returned.

    Args:
        f: Callable that takes a single database cursor.

    Returns:
        The value returned by ``f(cur)``.

    Raises:
        KeyError: If the ``PG_PASSWORD`` environment variable is not set.
    """
    # Keyword arguments leave the quoting to psycopg2; a password containing
    # a quote, backslash or space would corrupt a hand-formatted DSN string.
    conn = psycopg2.connect(
        dbname='ml',
        user='mingy',
        password=os.environ['PG_PASSWORD']
    )
    try:
        # ``with conn`` only commits (or rolls back on error); it does not
        # close the connection, hence the explicit close() below.
        with conn:
            with conn.cursor() as cur:
                return f(cur)
    finally:
        conn.close()
def _count_lexemes(tsvector_text):
    """Return a dict mapping each lexeme to its number of listed positions.

    ``tsvector_text`` is expected to be space-separated
    ``'lexeme':pos,pos,...`` entries, as returned by the ``to_tsvector``
    query in ``parse_file``.
    """
    counts = {}
    for entry in tsvector_text.split(' '):
        # An empty tsvector comes back as '', which splits into [''].
        if not entry:
            continue
        # Split on the last colon so a lexeme that itself contains ':' is
        # not mistaken for the lexeme/positions separator.
        lexeme, positions = entry.rsplit(':', 1)
        counts[lexeme.strip("'")] = positions.count(',') + 1
    return counts


def parse_file(path_to_file, cur):
    """Reduce one text file to lexemes and write them to a CSV file.

    The first line of ``path_to_file`` is whitespace-normalised and passed to
    ``to_tsvector('public.english', ...)`` through ``cur``. Every lexeme of
    the result is tagged with ``pos`` and written, together with the number
    of positions it occupies, to the derived output path.

    Args:
        path_to_file: Path of the input text file.
        cur: Database cursor providing ``execute`` and ``fetchall``.

    Returns:
        None. Nothing is queried or written when the output file already
        exists or the input file is at most one byte long.
    """
    # Write from txt file in text to csv file in output.
    # NOTE: str.replace rewrites every occurrence, so a path that contains
    # 'text' or 'txt' anywhere else in its name is altered there as well.
    path_to_write = path_to_file.replace('text', 'output').replace('txt', 'csv')
    print(path_to_write)

    # Skip if file is already written to output or is empty
    if os.path.isfile(path_to_write):
        return
    if os.path.getsize(path_to_file) <= 1:
        return

    with open(path_to_file) as f:
        # Only the first line is analysed; runs of whitespace collapse to
        # single spaces.
        text = ' '.join(f.readline().split())

    # Let the driver bind the text. Formatting it into the SQL string broke
    # on any apostrophe in the input and allowed SQL injection.
    cur.execute("SELECT to_tsvector('public.english', %s)", (text,))
    rows = cur.fetchall()

    # Do some cleaning up
    # Dimensionality reduction with hypernyms?
    vector = {}
    for lexeme, frequency in _count_lexemes(rows[0][0]).items():
        # Each lexeme is tagged on its own, as a one-token sentence.
        vector[lexeme] = (pos([lexeme])[0][1], frequency)

    # The output is opened only after all processing succeeded, so a failure
    # above cannot leave a partial CSV that later runs would skip.
    with open(path_to_write, 'w') as f:
        f.write("# Lexeme, Part of Speech, Frequency\n")
        for lexeme, (tag, frequency) in vector.items():
            f.write("{0}, {1}, {2}\n".format(lexeme, tag, frequency))
@pg_connect
def main(cur):
    """Run ``parse_file`` on every file found under the ``text`` directory.

    Directories and the files inside them are visited in sorted order.

    Args:
        cur: Database cursor handed to each ``parse_file`` call.
    """
    for directory, subdirs, filenames in os.walk("text"):
        # Sorting in place also fixes the order in which os.walk descends.
        subdirs.sort()
        filenames.sort()
        for filename in filenames:
            parse_file(os.path.join(directory, filename), cur)