vowpalwabbit.py
#!/usr/bin/env python
import re
import collections
from collections import defaultdict

import numpy as np
from scipy.sparse import coo_matrix
from pandas import DataFrame, MultiIndex
from scikits.learn.feature_extraction.text.sparse import CountVectorizer
from scikits.learn.feature_extraction.text import WordNGramAnalyzer
from scikits.learn.base import BaseEstimator
class VowpalWabbit(BaseEstimator):
    """Handles parsing and vectorizing of Vowpal Wabbit data.

    Naturally handles both raw text and arbitrary features.
    Maintains fidelity of core concepts, including key metadata and feature namespaces.
    Allows for large numbers of sparse features.
    Provides simple support for feature pruning.
    """
    # label section: label [importance] [tag]
    label_section_regex = re.compile("([^ ]+)(?: ([^ ]+))?(?: ([^ ]+))?")
    # feature section: namespace followed by its features
    feature_section_regex = re.compile("([^ ]+)[ ]+(.*)")
    # individual feature: name[:value]
    feature_regex = re.compile("([^:]+)(?::(.*))?")
    default_analyzer = WordNGramAnalyzer(min_n=1, max_n=1)

    def __init__(self, analyzer=default_analyzer, min_support=0.001):
        self.analyzer = analyzer
        self.min_support = min_support
        self.namespaces = set()
        # per-namespace accumulated feature counts
        self.dfs = defaultdict(lambda: defaultdict(float))
        # per-namespace feature name -> column index
        self.index = defaultdict(lambda: defaultdict(int))
    def parse_file(self, filename):
        """Low-level generator that parses a Vowpal Wabbit data file,
        yielding (label, importance, tag, features) for each example.
        """
        for line in open(filename):
            sections = line.strip().split('|')
            assert len(sections) >= 2, "No delimiter | found in line %s" % line
            label, importance, tag = self.label_section_regex.match(sections[0]).groups()
            features = defaultdict(lambda: defaultdict(float))
            for feature_section in sections[1:]:
                feature_section = feature_section.strip()
                assert " " in feature_section, "No space delim found in section: %s" % feature_section
                ns, feature_group = self.feature_section_regex.match(feature_section).groups()
                # simplifying assumption: any namespace with >= 1 explicitly valued feature does not contain text
                if ":" in feature_group:
                    ns_features = feature_group.split(" ")
                    for feature in ns_features:
                        feature_match = self.feature_regex.match(feature)
                        assert feature_match, "No features found in %s" % feature_group
                        name, value = feature_match.groups()
                        value = float(value) if value is not None else 1
                        features[ns][name] += value
                # otherwise, if no values are present, treat the section as raw text
                else:
                    for term in set(self.analyzer.analyze(feature_group)):
                        features[ns][term] += 1
            yield label, importance, tag, features
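
    # Illustrative sketch of the line format parse_file expects (the namespaces
    # and values here are made up): a label section, then one or more
    # |-delimited namespace sections holding either name:value features or raw text.
    #
    #   1 0.5 example_tag|prices bid:2.50 ask:2.75|title cheap red shoes
    #
    # would yield label='1', importance='0.5', tag='example_tag', and
    # features={'prices': {'bid': 2.5, 'ask': 2.75},
    #           'title': {'cheap': 1, 'red': 1, 'shoes': 1}}.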
    def flatten(self, d, parent_key='', keys_to_exclude=None):
        """Convenience method to flatten a nested dict, joining keys with ':'
        and skipping any keys listed in keys_to_exclude."""
        if keys_to_exclude is None:
            keys_to_exclude = set()
        items = []
        for k, v in d.items():
            if k in keys_to_exclude:
                continue
            new_key = parent_key + ':' + k if parent_key else k
            if isinstance(v, collections.MutableMapping):
                items.extend(self.flatten(v, new_key, keys_to_exclude).items())
            else:
                items.append((new_key, v))
        return dict(items)
    def load_file(self, filename, ns_to_exclude_from_df=None):
        """Given a filename and, optionally, a list of namespaces to exclude
        from the DataFrame, load the file.

        Returns a DataFrame (with MultiIndex columns of namespace, feature) and
        a list of per-example namespace->feature dicts.
        """
        if ns_to_exclude_from_df is None:
            ns_to_exclude_from_df = []
        labels = []
        imps = []
        tags = []
        features_by_ns = []
        df_features = []
        for label, imp, tag, features in self.parse_file(filename):
            labels.append(int(label))
            imps.append(float(imp))
            tags.append(tag)
            features_by_ns.append(features)
            features_flat = self.flatten(features, keys_to_exclude=set(ns_to_exclude_from_df))
            df_features.append(features_flat)
        df_features = DataFrame(df_features)
        two_level_cols = map(lambda flat: tuple(flat.split(":")), list(df_features.columns.values))
        df_features.columns = MultiIndex.from_tuples(two_level_cols, names=['ns', 'features'])
        df_meta = DataFrame({'label': labels, 'imp': imps, 'tag': tags})
        meta_tuples = map(lambda f: ('meta', f), list(df_meta.columns.values))
        df_meta.columns = MultiIndex.from_tuples(meta_tuples, names=['ns', 'features'])
        df = df_meta.join(df_features)
        return df, features_by_ns
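
    # For illustration (continuing the made-up example above), the DataFrame
    # returned by load_file would have two-level columns such as
    # ('meta', 'label'), ('meta', 'imp'), ('meta', 'tag'),
    # ('prices', 'bid'), ('prices', 'ask'), ('title', 'cheap'), ...
    # with one row per input line and NaN where a feature is absent.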
    def fit(self, features_by_ns):
        """Given an array of per-example feature dicts, compute the feature
        index and aggregate feature counts.
        """
        self.namespaces = set()
        self.dfs = defaultdict(lambda: defaultdict(float))
        for features in features_by_ns:
            self.namespaces.update(features.keys())
            for ns in features.keys():
                for feature in features[ns].keys():
                    self.dfs[ns][feature] += features[ns][feature]
        # only put supported terms in the final index
        min_df = self.min_support * len(features_by_ns)
        self.index = defaultdict(lambda: defaultdict(int))
        for ns in self.dfs.keys():
            idx = 0
            for name, value in self.dfs[ns].iteritems():
                if value >= min_df:
                    self.index[ns][name] = idx
                    idx += 1

    def fit_transform(self, features_by_ns):
        """Convenience wrapper: fit the index, then transform the same data."""
        self.fit(features_by_ns)
        return self.transform(features_by_ns)
    def transform(self, features_by_ns):
        """Given an array of per-example feature dicts, return a dict of
        namespace -> sparse feature matrix.
        """
        assert len(self.index) > 0, 'No index. Call fit() first.'
        row_ids_by_ns = defaultdict(list)
        feature_ids_by_ns = defaultdict(list)
        values_by_ns = defaultdict(list)
        row_id = 0
        # build large sparse triples of row id, feature id, & value
        for features in features_by_ns:
            for ns in features.keys():
                for feature in features[ns].keys():
                    feature_idx = self.index[ns].get(feature)
                    if feature_idx is not None:
                        row_ids_by_ns[ns].append(row_id)
                        feature_ids_by_ns[ns].append(feature_idx)
                        values_by_ns[ns].append(features[ns][feature])
            row_id += 1
        matrix_by_ns = {}
        for ns in self.namespaces:
            idxs = (row_ids_by_ns[ns], feature_ids_by_ns[ns])
            num_cols = len(self.index[ns])
            matrix_by_ns[ns] = coo_matrix((values_by_ns[ns], idxs), shape=(row_id, num_cols), dtype=np.float32)
        return matrix_by_ns
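
# Minimal usage sketch (not part of the original module): the file path below
# is hypothetical, and the 'title' namespace only refers to the illustrative
# examples above.
if __name__ == '__main__':
    vw = VowpalWabbit(min_support=0.001)
    # load raw examples plus a DataFrame view, excluding the free-text
    # namespace from the (dense) DataFrame
    df, features_by_ns = vw.load_file('train.vw', ns_to_exclude_from_df=['title'])
    # build the per-namespace index and vectorize into sparse matrices
    matrix_by_ns = vw.fit_transform(features_by_ns)
    for ns, matrix in matrix_by_ns.items():
        print "%s: %s examples x %s features" % (ns, matrix.shape[0], matrix.shape[1])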