diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 00000000..528f30c7 --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 00000000..184e6da5 --- /dev/null +++ b/.dvc/config @@ -0,0 +1,5 @@ +[core] + analytics = false + remote = remote_storage +['remote "remote_storage"'] + url = /home/ajkumar/hackathonDVC/dvcDataStore diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 00000000..51973055 --- /dev/null +++ b/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. Learn more at +# https://dvc.org/doc/user-guide/dvcignore diff --git a/data/prepared/.gitignore b/data/prepared/.gitignore index e69de29b..22a65dd9 100644 --- a/data/prepared/.gitignore +++ b/data/prepared/.gitignore @@ -0,0 +1,2 @@ +/train.csv +/test.csv diff --git a/data/prepared/test.csv.dvc b/data/prepared/test.csv.dvc new file mode 100644 index 00000000..d9b3d28d --- /dev/null +++ b/data/prepared/test.csv.dvc @@ -0,0 +1,4 @@ +outs: +- md5: 5151fcd60de0b43c2d18fde128ee5e09 + size: 83326 + path: test.csv diff --git a/data/prepared/train.csv.dvc b/data/prepared/train.csv.dvc new file mode 100644 index 00000000..53c085ca --- /dev/null +++ b/data/prepared/train.csv.dvc @@ -0,0 +1,4 @@ +outs: +- md5: 0004b6e0e6d489f6902d6d6db47c24fe + size: 206725 + path: train.csv diff --git a/data/raw/.gitignore b/data/raw/.gitignore index e69de29b..a5d9d98f 100644 --- a/data/raw/.gitignore +++ b/data/raw/.gitignore @@ -0,0 +1,2 @@ +/train +/val diff --git a/data/raw/train.dvc b/data/raw/train.dvc new file mode 100644 index 00000000..14162364 --- /dev/null +++ b/data/raw/train.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 7adc7abb69056f4d7afb512c78f2fce9.dir + size: 75309082 + nfiles: 9470 + path: train diff --git a/data/raw/val.dvc b/data/raw/val.dvc new file mode 100644 index 00000000..237148f4 --- /dev/null +++ b/data/raw/val.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 0ad4dcf197b452735726bf8d8777201d.dir + size: 31248080 + nfiles: 3925 + path: val diff --git a/metrics/accuracy.json b/metrics/accuracy.json new file mode 100644 index 00000000..959bf973 --- /dev/null +++ b/metrics/accuracy.json @@ -0,0 +1 @@ +{"accuracy": 0.7351077313054499} \ No newline at end of file diff --git a/model/.gitignore b/model/.gitignore index e69de29b..565a9d50 100644 --- a/model/.gitignore +++ b/model/.gitignore @@ -0,0 +1 @@ +/model.joblib diff --git a/model/model.joblib.dvc b/model/model.joblib.dvc new file mode 100644 index 00000000..fd3b52d6 --- /dev/null +++ b/model/model.joblib.dvc @@ -0,0 +1,4 @@ +outs: +- md5: 13c7384218e443fdf941f153ce53d134 + size: 241222 + path: model.joblib diff --git a/src/train.py b/src/train.py index e5feeda6..5d3c7c2c 100644 --- a/src/train.py +++ b/src/train.py @@ -37,7 +37,7 @@ def load_data(data_path): def main(repo_path): train_csv_path = repo_path / "data/prepared/train.csv" train_data, labels = load_data(train_csv_path) - sgd = SGDClassifier(max_iter=10) + sgd = SGDClassifier(max_iter=100) trained_model = sgd.fit(train_data, labels) dump(trained_model, repo_path / "model/model.joblib")