TODO amadantk
analysis.py
===========
- replace the multiple all=collectioncleaned.find(subset,{"text": 1, "_id":0}) calls with a single global retrieval routine (a sketch covering this and the threshold item follows after the tf-idf example)
- add a frequency threshold for searching: a word must occur more than x times
- build in a tf-idf routine, for example based on the following:
#-*- coding: utf-8 -*-
import re
import math

import nltk
from nltk import bigrams, trigrams
from nltk.tokenize import RegexpTokenizer

# Dutch stopwords to match the example texts below.
stopwords = nltk.corpus.stopwords.words('dutch')
tokenizer = RegexpTokenizer(r"[\w’]+", flags=re.UNICODE)


def freq(word, doc):
    return doc.count(word)


def word_count(doc):
    return len(doc)


def tf(word, doc):
    return freq(word, doc) / float(word_count(doc))


def num_docs_containing(word, list_of_docs):
    count = 0
    for document in list_of_docs:
        if freq(word, document) > 0:
            count += 1
    return 1 + count


def idf(word, list_of_docs):
    return math.log(len(list_of_docs) /
                    float(num_docs_containing(word, list_of_docs)))


def tf_idf(word, doc, list_of_docs):
    return tf(word, doc) * idf(word, list_of_docs)


# Compute the frequency for each term.
vocabulary = []
docs = {}
for tip in ['Nou ik bak er niks van, maar misschien is dit wel goed zo',
            'Ik bak een biefstuk',
            'Hoi jongens']:
    tokens = tokenizer.tokenize(tip)
    bi_tokens = bigrams(tokens)
    tri_tokens = trigrams(tokens)
    tokens = [token.lower() for token in tokens if len(token) > 2]
    tokens = [token for token in tokens if token not in stopwords]
    bi_tokens = [' '.join(token).lower() for token in bi_tokens]
    bi_tokens = [token for token in bi_tokens if token not in stopwords]
    tri_tokens = [' '.join(token).lower() for token in tri_tokens]
    tri_tokens = [token for token in tri_tokens if token not in stopwords]
    final_tokens = []
    final_tokens.extend(tokens)
    final_tokens.extend(bi_tokens)
    final_tokens.extend(tri_tokens)
    docs[tip] = {'freq': {}, 'tf': {}, 'idf': {}, 'tf-idf': {}, 'tokens': []}
    for token in final_tokens:
        # The raw frequency computed for each tip
        docs[tip]['freq'][token] = freq(token, final_tokens)
        # The term frequency (normalized frequency)
        docs[tip]['tf'][token] = tf(token, final_tokens)
    docs[tip]['tokens'] = final_tokens
    vocabulary.append(final_tokens)

for doc in docs:
    for token in docs[doc]['tf']:
        # The inverse document frequency
        docs[doc]['idf'][token] = idf(token, vocabulary)
        # The tf-idf score
        docs[doc]['tf-idf'][token] = tf_idf(token, docs[doc]['tokens'], vocabulary)

# Now let's find out the most relevant words by tf-idf.
words = {}
for doc in docs:
    for token in docs[doc]['tf-idf']:
        if token not in words:
            words[token] = docs[doc]['tf-idf'][token]
        elif docs[doc]['tf-idf'][token] > words[token]:
            words[token] = docs[doc]['tf-idf'][token]
    print(doc)
    for token in docs[doc]['tf-idf']:
        print(token, docs[doc]['tf-idf'][token])

for item in sorted(words.items(), key=lambda x: x[1], reverse=True):
    print("%f <= %s" % (item[1], item[0]))
datamanger.py
=============
- add input modules other than LexisNexis, especially rsshond (a rough interface sketch follows below)
- add tools for removing articles
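One possible shape for pluggable input modules plus a removal tool, purely as a sketch; Importer, RsshondImporter and remove_articles are hypothetical names, not existing code:

# Sketch only: a common interface so LexisNexis, rsshond, etc. plug in the
# same way; all names here are hypothetical.


class Importer:
    """Common interface for input modules (LexisNexis, rsshond, ...)."""

    def fetch(self):
        """Yield article dicts with at least 'text' and 'source' keys."""
        raise NotImplementedError


class RsshondImporter(Importer):
    def __init__(self, feed_urls):
        self.feed_urls = feed_urls

    def fetch(self):
        # Placeholder: parse each feed and yield one dict per item.
        for url in self.feed_urls:
            yield {"source": url, "text": "..."}


def remove_articles(collection, subset):
    """Remove articles matching a query (e.g., duplicates) from a collection."""
    return collection.delete_many(subset).deleted_count
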
generally:
==========
- write a readme/manual
- systematic auxiliary tools (e.g., to convert LexisNexis output to UTF-8; a sketch follows below)
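A minimal sketch of such a conversion tool, assuming the LexisNexis exports arrive in a single-byte encoding like latin-1 (the actual source encoding still needs to be checked):

# Sketch only: adjust SOURCE_ENCODING once the actual LexisNexis export
# encoding is known.
import sys

SOURCE_ENCODING = 'latin-1'


def convert_to_utf8(src_path, dst_path):
    """Re-encode a text file from SOURCE_ENCODING to UTF-8."""
    with open(src_path, encoding=SOURCE_ENCODING) as src, \
            open(dst_path, 'w', encoding='utf-8') as dst:
        dst.write(src.read())


if __name__ == '__main__':
    convert_to_utf8(sys.argv[1], sys.argv[2])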