Commit 02353bb
Mathias M. Andersen authored and committed Aug 15, 2016
1 parent 73ac4a4
Showing 4 changed files with 89 additions and 72 deletions.
@@ -7,4 +7,8 @@ combined.csv
large_files/
__pycache__/
combined.libsvm
combined.vw
*.vw
*.csv
*.cache
*.model
@@ -1,82 +1,56 @@
""" | ||
Convert CSV file to libsvm format. Works only with numeric variables. | ||
Put -1 as label index (argv[3]) if there are no labels in your file. | ||
Expecting no headers. If present, headers can be skipped with argv[4] == 1. | ||
""" | ||
|
||
import csv | ||
import pandas | ||
from pandas import Series | ||
import numpy as np | ||
import sys | ||
from datetime import datetime | ||
from sklearn.feature_extraction.text import CountVectorizer | ||
|
||
def float_convertable(s): | ||
try: | ||
s=float(s) | ||
except ValueError: | ||
pass | ||
return s | ||
|
||
def int_convertable(s): | ||
try: | ||
s=float(s) | ||
except ValueError: | ||
return False | ||
return True | ||
|
||
def date_convertible(s): | ||
try: #2021-06-29 | ||
try: #2021-06-29 | ||
s=datetime.strptime(s, '%Y-%m-%d') | ||
except ValueError: | ||
return False | ||
return True | ||
|
||
def bag_of_words(df): | ||
word_index = list(Series(df.values.ravel()).unique()) | ||
return word_index | ||
|
||
def convert(input, output = "output.vw"): | ||
df = pandas.read_csv(input) | ||
|
||
#remove columns | ||
df = df.drop(['date_activity', 'date_people'], 1) | ||
|
||
for column in df: | ||
df[column] = df[column].astype(str) | ||
|
||
# add column prefrix to values | ||
for column_idx, column in enumerate(df): | ||
for row_idx, row in enumerate(df.iterrows()): | ||
if not str(df.get_value(row_idx, column)) == "nan": | ||
value = str(column) + str(df.get_value(row_idx, column)) | ||
df.set_value(row_idx, column, value) | ||
|
||
#all distinct "words" | ||
word_index = bag_of_words(df) | ||
print(len(word_index)) | ||
|
||
#build vw datafile | ||
def convert(input, word_index, train, output = "output.vw"): | ||
o = open(output, 'w') | ||
for row_idx, row in enumerate(df.iterrows()): | ||
new_line = "" | ||
|
||
label = None | ||
if df.get_value(row_idx, "outcome") == "outcome1": | ||
label = 1 | ||
else: | ||
label = -1 | ||
new_line += str(label) + " " + "|features " | ||
|
||
features = "" | ||
for column_idx, column in enumerate(df): | ||
if not str(df.get_value(row_idx, column)) == "nan": | ||
value = str(df.get_value(row_idx, column)) | ||
if value in word_index: | ||
index = word_index.index(value) | ||
features += str(index) + ":1 " | ||
new_line += features | ||
new_line += "\n" | ||
o.write(new_line) | ||
count = 0 | ||
with open(input) as f: | ||
reader = csv.DictReader(f) | ||
for row in reader: | ||
# print(row) | ||
count += 1 | ||
if count % 10000 == 0: | ||
print(count, "--",datetime.now()) | ||
new_line = "" | ||
label = "" | ||
features = "" | ||
for (k,v) in row.items(): | ||
if v != "": | ||
if train and k == "outcome": | ||
if v == "0": | ||
label = "-1" | ||
if v == "1": | ||
label = "1" | ||
|
||
# make features | ||
# -- bag of words | ||
# -- skip some columns | ||
if ((k != "date_activity") and (k != "date_people") and (k != "activity_id") and (k != "outcome")): | ||
if k+v in word_index: | ||
pass | ||
index = word_index[k+v] | ||
features += str(index) + ":1 " | ||
|
||
# line finished | ||
new_line += label + " " + "|features " + features + "\n" | ||
# print(new_line) | ||
o.write(new_line) |
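
For reference, each line that the rewritten convert() above emits is a Vowpal Wabbit example of the form "<label> |features <feature_id>:1 <feature_id>:1 ...". Below is a minimal, hypothetical sketch of that mapping; the column names, values, and word_index entries are made up for illustration and are not taken from the actual data:

# Hypothetical word_index and CSV row; keys are str(column) + str(value),
# matching what bag_of_words builds and what convert() looks up via k+v.
word_index = {"char_1type 1": 1, "char_2type 3": 2, "group_1group 17304": 3}
row = {"outcome": "1", "char_1": "type 1", "group_1": "group 17304",
       "date_activity": "2022-07-27"}

label = "1" if row["outcome"] == "1" else "-1"
features = ""
for k, v in row.items():
    if k not in ("date_activity", "date_people", "activity_id", "outcome") and k + v in word_index:
        features += str(word_index[k + v]) + ":1 "

print(label + " |features " + features)
# prints (with a trailing space, as in the script): 1 |features 1:1 3:1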
@@ -1,3 +1,40 @@
from csv_to_libsvm import convert
import pandas as pd
from pandas import Series

convert("combined.csv","combined.vw")
def bag_of_words(train_csv, test_csv):
    # df_train = pd.read_csv(train_csv, dtype={'': object})
    # df_train = df_train.drop(['outcome','date_activity', 'date_people','activity_id'], 1)
    #
    # df_test = pd.read_csv(test_csv, dtype={'': object})
    # df_test = df_test.drop(['date_activity', 'date_people','activity_id'], 1)

    # combine test/train: gather all words
    #df = df_train.append(df_test, ignore_index=True)
    #df.to_csv("small_combined_test_train.csv", index=False)

    df = pd.read_csv("combined_test_train.csv")
    print("frames combined")

    # find uniques
    word_index = {}
    word_set = set()
    count = 1
    for column in df:
        words = list(Series(df[column].values.ravel()).unique())  # unique values in column
        for word in words:
            if word != "nan":
                word_index[str(column)+str(word)] = count
                count += 1
    print("words:", len(word_index))
    return word_index

word_index = bag_of_words("combined_train.csv", "combined_test.csv")

# Convert training data
print("converting training")
convert("combined_train.csv", word_index, train=True, output="combined2_train.vw")

# Convert test data
print("converting test")
convert("combined_test.csv", word_index, train=False, output="combined2_test.vw")
@@ -1,15 +1,17 @@
import pandas as pd

activity = pd.read_csv("act_train.csv")
# Train data
activity_train = pd.read_csv("act_train.csv")
people = pd.read_csv("people.csv")

"""omg en bitch"""
# how="left": take value from "left" and check in "right". left = activity.
joined = pd.merge(activity, people, on="people_id", how="left", suffixes=('_activity', '_people'))
joined.to_csv("combined.csv", index=False)

# print(people.columns, activity.columns)
# print(joined.columns)
#
# print(people.shape, activity.shape)
# print(joined.shape)
joined = pd.merge(activity_train, people, on="people_id", how="left", suffixes=('_activity', '_people'))
joined.to_csv("small_combined_train.csv", index=False)

# Test data
activity_test = pd.read_csv("act_test.csv")
people = pd.read_csv("people.csv")

# how="left": take value from "left" and check in "right". left = activity.
joined = pd.merge(activity_test, people, on="people_id", how="left", suffixes=('_activity', '_people'))
joined.to_csv("small_combined_test.csv", index=False)