Skip to content

Commit

Permalink
parsing from csv to vw was too slow
Browse files Browse the repository at this point in the history
  • Loading branch information
Mathias M. Andersen authored and Mathias M. Andersen committed Aug 15, 2016
1 parent 73ac4a4 commit 02353bb
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 72 deletions.
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,8 @@ combined.csv
large_files/
__pycache__/
combined.libsvm
combined.vw
combined.vw
*.vw
*.csv
*.cache
*.model
94 changes: 34 additions & 60 deletions csv_to_libsvm.py
Original file line number Diff line number Diff line change
@@ -1,82 +1,56 @@

"""
Convert CSV file to libsvm format. Works only with numeric variables.
Put -1 as label index (argv[3]) if there are no labels in your file.
Expecting no headers. If present, headers can be skipped with argv[4] == 1.
"""

import csv
import pandas
from pandas import Series
import numpy as np
import sys
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer

def float_convertable(s):
    """Return *s* parsed as a float when possible, otherwise *s* unchanged."""
    try:
        return float(s)
    except ValueError:
        return s

def int_convertable(s):
    """Return True when *s* can be parsed as an integer.

    Bug fix: the original called float(s), so strings like "1.5" were
    wrongly reported as int-convertible.
    """
    try:
        int(s)
    except ValueError:
        return False
    return True

def date_convertible(s):
    """Return True when *s* parses as an ISO date, e.g. "2021-06-29".

    Bug fix: the source had a duplicated ``try:`` line (diff artifact) which
    left the outer try without a handler — a SyntaxError.
    """
    try:
        datetime.strptime(s, '%Y-%m-%d')
    except ValueError:
        return False
    return True

def bag_of_words(df):
    """Return every distinct value in *df*, flattened row-major, in order of
    first appearance."""
    flattened = df.values.ravel()
    unique_values = Series(flattened).unique()
    return list(unique_values)

def convert(input, output = "output.vw"):
    """Read *input* CSV, prefix every non-missing cell with its column name,
    and return the list of distinct prefixed values (the bag-of-words index).

    NOTE(review): this is the slow DataFrame-based variant; the csv.DictReader
    version below supersedes it for large files.
    """
    df = pandas.read_csv(input)

    # Drop the date columns; keyword axis=1 (the bare positional ``1`` is
    # deprecated / removed in modern pandas).
    df = df.drop(['date_activity', 'date_people'], axis=1)

    for column in df:
        df[column] = df[column].astype(str)

    # Prefix each value with its column name so identical values in different
    # columns map to different "words".  Vectorized per column: the original
    # per-cell get_value/set_value loop was quadratic and used accessors
    # removed in pandas 1.0.
    for column in df:
        mask = df[column] != "nan"  # astype(str) turned NaN into "nan"
        df.loc[mask, column] = column + df.loc[mask, column]

    # All distinct "words" across the whole frame (order of first appearance).
    word_index = list(Series(df.values.ravel()).unique())
    print(len(word_index))
    return word_index
def convert(input, word_index, train, output = "output.vw"):
    """Stream *input* CSV row by row and write it in Vowpal Wabbit format.

    input      -- path to a CSV file with a header row.
    word_index -- dict mapping column-name+value strings to feature indices.
    train      -- when True, read the "outcome" column and emit 1 / -1 labels;
                  when False the label field stays empty (test data).
    output     -- path of the .vw file to write.

    Each row is written as "<label> |features <idx>:1 <idx>:1 ...".
    """
    count = 0
    # ``with`` closes both files even on error; the original never closed
    # the output handle.
    with open(input) as f, open(output, 'w') as o:
        reader = csv.DictReader(f)
        for row in reader:
            count += 1
            if count % 10000 == 0:
                # progress heartbeat for large files
                print(count, "--", datetime.now())

            label = ""
            features = ""
            for (k, v) in row.items():
                if v == "":
                    continue  # missing value: contributes no feature
                if train and k == "outcome":
                    if v == "0":
                        label = "-1"
                    if v == "1":
                        label = "1"

                # Bag-of-words features; skip identifier/date/label columns.
                if ((k != "date_activity") and (k != "date_people")
                        and (k != "activity_id") and (k != "outcome")):
                    index = word_index.get(k + v)
                    # Only emit features present in the index — the original
                    # did an unconditional word_index[k+v] lookup after a
                    # no-op membership test, raising KeyError on unseen values.
                    if index is not None:
                        features += str(index) + ":1 "

            o.write(label + " " + "|features " + features + "\n")
39 changes: 38 additions & 1 deletion features.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,40 @@
from csv_to_libsvm import convert
import pandas as pd
from pandas import Series

# NOTE(review): stale call from the old one-argument pipeline. convert() now
# takes (input, word_index, train, output), so this line raises TypeError at
# import time — presumably a merge/diff artifact; confirm and remove.
convert("combined.csv","combined.vw")
def bag_of_words(train_csv, test_csv):
    """Build the word -> feature-index mapping for the VW conversion.

    Every (column name + cell value) string gets a distinct 1-based index.

    NOTE(review): the *train_csv*/*test_csv* arguments are currently unused —
    the function reads the pre-combined "combined_test_train.csv" produced by
    an earlier run that concatenated the train and test frames; confirm the
    file exists before calling.
    """
    df = pd.read_csv("combined_test_train.csv")
    print("frames combined")

    # Assign indices column by column, in order of first appearance.
    word_index = {}
    count = 1
    for column in df:
        words = list(Series(df[column].values.ravel()).unique())
        for word in words:
            # Skip missing values: both the literal string "nan" and real NaN
            # floats.  The original only compared against the string, and
            # NaN != "nan" is True, so genuine NaN cells leaked into the index.
            if word != "nan" and not pd.isna(word):
                word_index[str(column) + str(word)] = count
                count += 1
    print("words:", len(word_index))
    return word_index

# Build the shared word index once; both train and test conversion must use
# the same mapping so feature indices agree between the two .vw files.
word_index = bag_of_words("combined_train.csv", "combined_test.csv")

#Convert training data (train=True: emit 1/-1 labels from "outcome")
print("converting training")
convert("combined_train.csv",word_index,train=True,output="combined2_train.vw")

#Convert test data (train=False: label field left empty)
print("converting test")
convert("combined_test.csv",word_index,train=False,output="combined2_test.vw")
22 changes: 12 additions & 10 deletions parse.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
import pandas as pd

# Join each activity file with the people file and write the combined CSVs
# consumed by features.py.
#
# NOTE(review): the source span interleaved the previous version of this
# script (reading into ``activity`` and writing "combined.csv") with the
# current one — diff residue removed here; confirm nothing still reads
# "combined.csv".

# Train data
activity_train = pd.read_csv("act_train.csv")
people = pd.read_csv("people.csv")

# how="left": keep every activity row and attach its matching people row.
joined = pd.merge(activity_train, people, on="people_id", how="left", suffixes=('_activity', '_people'))
joined.to_csv("small_combined_train.csv", index=False)

# Test data
activity_test = pd.read_csv("act_test.csv")
people = pd.read_csv("people.csv")

joined = pd.merge(activity_test, people, on="people_id", how="left", suffixes=('_activity', '_people'))
joined.to_csv("small_combined_test.csv", index=False)

0 comments on commit 02353bb

Please sign in to comment.