-
Notifications
You must be signed in to change notification settings - Fork 3
/
option-b-ml.py
69 lines (62 loc) · 2.19 KB
/
option-b-ml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import os
import subprocess

import psycopg2
# postgresql
# Connect to the "tweetreplies" database and rebuild the working table `bset`
# as a copy of `combined`, adding a `profane_text` flag that starts out FALSE
# on every row.
conn = psycopg2.connect("dbname='tweetreplies'")
cur = conn.cursor()
for setup_statement in (
    'DROP TABLE IF EXISTS bset',
    'CREATE TABLE bset AS (SELECT * FROM combined)',
    'ALTER TABLE bset ADD COLUMN profane_text BOOLEAN',
    'UPDATE bset SET profane_text = FALSE WHERE 1 = 1',
):
    cur.execute(setup_statement)
# Flag profane rows: for each phrase listed in ./profanity.txt (one per line),
# set profane_text = TRUE on every row of `bset` whose screenname has at least
# one row where the screenname, printname or body contains that phrase
# (case-insensitive substring match).
with open('./profanity.txt', 'r') as profanity_file:
    # `with` closes the file handle as soon as the list has been read.
    badphrases = profanity_file.read().split('\n')

# The phrase is bound as a query parameter instead of being concatenated into
# the SQL text, so a phrase containing a quote can neither break the statement
# nor inject SQL. The surrounding % wildcards travel inside the parameter
# value, which keeps literal % characters out of the query string.
# NOTE: % and _ inside a phrase still act as LIKE wildcards, as before.
flag_profane_sql = """
    UPDATE bset SET profane_text = TRUE WHERE screenname IN (
        SELECT screenname FROM bset
        WHERE LOWER(screenname) LIKE %(pattern)s
           OR LOWER(printname) LIKE %(pattern)s
           OR LOWER(body) LIKE %(pattern)s
    )
"""
for phrase in badphrases:
    # Skip blank lines and single-character entries.
    if len(phrase) > 1:
        cur.execute(flag_profane_sql, {'pattern': '%' + phrase.lower() + '%'})
# Rebuild bset_automl: a two-column table (text, profane_text) where the text
# is originbody and body joined by ' || ' with newlines removed. It holds every
# flagged row plus up to 15000 randomly chosen unflagged rows. UNION (rather
# than UNION ALL) also collapses duplicate rows.
cur.execute('DROP TABLE IF EXISTS bset_automl')

# Select list shared by both halves of the UNION.
paired_text_and_label = (
    "REPLACE(CONCAT(CONCAT(originbody, ' || '), body), E'\n', ''), profane_text"
)
create_automl_sql = '\n'.join([
    'CREATE TABLE bset_automl AS (',
    'SELECT ' + paired_text_and_label,
    'FROM bset',
    'WHERE profane_text',
    ')',
    'UNION (',
    'SELECT ' + paired_text_and_label,
    'FROM bset',
    'WHERE profane_text = FALSE',
    'ORDER BY RANDOM()',
    'LIMIT 15000',
    ')',
])
cur.execute(create_automl_sql)
# Rebuild bset_automl_2: same sampling as bset_automl (every flagged row plus
# up to 15000 random unflagged rows, duplicates collapsed by UNION), but the
# text column is just body with newlines removed.
cur.execute('DROP TABLE IF EXISTS bset_automl_2')

# Select list shared by both halves of the UNION.
body_text_and_label = "REPLACE(body, E'\n', ''), profane_text"
create_automl_2_sql = '\n'.join([
    'CREATE TABLE bset_automl_2 AS (',
    'SELECT ' + body_text_and_label,
    'FROM bset',
    'WHERE profane_text',
    ')',
    'UNION (',
    'SELECT ' + body_text_and_label,
    'FROM bset',
    'WHERE profane_text = FALSE',
    'ORDER BY RANDOM()',
    'LIMIT 15000',
    ')',
])
cur.execute(create_automl_2_sql)

# Persist everything done so far (bset and both automl tables).
conn.commit()
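# Manual CSV export of the two automl tables built above (run from a shell).
# `replace` is the column name PostgreSQL assigns to the unnamed REPLACE(...)
# expression; the second command drops rows whose text is blank.
# sql2csv --no-header-row --db postgres:///tweetreplies --query 'SELECT * FROM bset_automl' > all_tweets/bset_automl.csv
# sql2csv --no-header-row --db postgres:///tweetreplies --query 'SELECT * FROM bset_automl_2 WHERE LENGTH(TRIM(replace)) > 0' > all_tweets/bset_automl_2.csv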
# Rebuild bset_azure: all columns of every flagged row in `bset` plus up to
# 15000 randomly chosen unflagged rows (UNION collapses exact duplicates),
# then export the rows with a non-blank body to all_tweets/bset_azure.csv.
cur.execute('DROP TABLE IF EXISTS bset_azure')
cur.execute("""CREATE TABLE bset_azure AS (
SELECT * FROM bset
WHERE profane_text
)
UNION (
SELECT * FROM bset
WHERE profane_text = FALSE
ORDER BY RANDOM()
LIMIT 15000
)""")
conn.commit()

# All database work in this script is finished and committed; release the
# cursor and connection explicitly. sql2csv opens its own connection.
cur.close()
conn.close()

# Run sql2csv directly (no shell) and send its stdout to the CSV file.
# check=True raises CalledProcessError on a non-zero exit status, so a failed
# export is no longer silently ignored as it was with os.system.
with open('all_tweets/bset_azure.csv', 'wb') as csv_out:
    subprocess.run(
        [
            'sql2csv',
            '--db', 'postgres:///tweetreplies',
            '--query', 'SELECT * FROM bset_azure WHERE LENGTH(TRIM(body)) > 0',
        ],
        stdout=csv_out,
        check=True,
    )