-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsteps.txt
166 lines (111 loc) · 3.56 KB
/
steps.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
echo "# DVC-ML-Demo-AIops" >> README.md
git init
git add README.md
git commit -m "first commit"
git branch -M main
git remote add origin https://github.com/sidh1603/DVC-ML-Demo-AIops.git
git push -u origin main
## steps ##
conda create -n dvc-ml python=3.7 -y
conda activate dvc-ml
git init
git remote add origin https://github.com/sidh1603/DVC-ML-Demo-AIops.git
git branch -M main
touch .gitignore
touch README.md
In the .gitignore file, add the line .idea/
After making that change, commit and push it:
git status
git add .
git commit -m "add .gitignore and README"
git push origin main
touch requirements.txt
In the requirements.txt file, add dvc, pandas, and scikit-learn (one package per line).
pip install -r requirements.txt
git status
git add requirements.txt
git commit -m "requirements added"
git push origin main
dvc init
mkdir -p src/utils
touch src/utils/__init__.py
touch src/__init__.py
touch params.yaml dvc.yaml
mkdir config
Inside the config folder, create a file named config.yaml.
git add .
git commit -m "add project structure and config"
git push origin main
In the config.yaml file, add the URL of the data source
and the paths where the downloaded data will be stored:
data_source: http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv
artifacts:
  artifacts_dir: artifacts
  raw_local_dir: raw_local_dir
  raw_local_file: data.csv
Now, in the src folder, create one file:
touch src/stage_01_load_save.py
Now, in the src/utils folder, create a file:
touch src/utils/all_utils.py
Then add the following code to all_utils.py:
import yaml
import os
def read_yaml(path_to_yaml: str) -> dict:
    """Read a YAML file and return its parsed content.

    Args:
        path_to_yaml: Path of the YAML file to load.

    Returns:
        The object produced by ``yaml.safe_load`` for the file; for a
        mapping document such as config.yaml this is a dict.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(path_to_yaml, encoding="utf-8") as yaml_file:
        content = yaml.safe_load(yaml_file)
    return content
def create_directory(dirs: list):
    """Create every directory in ``dirs``, including missing parents.

    Directories that already exist are left untouched. A message is
    printed for each path that was processed.

    Args:
        dirs: Directory paths to create. A single path passed as a string
            or ``os.PathLike`` is treated as a one-element list.
    """
    # A bare string would otherwise be iterated character by character,
    # creating one directory per character.
    if isinstance(dirs, (str, bytes, os.PathLike)):
        dirs = [dirs]
    for dir_path in dirs:
        os.makedirs(dir_path, exist_ok=True)
        print(f"directory is created at {dir_path}")
Create a setup file and add the following code to it:
touch setup.py
# Packaging script so that the project's `src` package can be installed
# (for example with `pip install -e .`) and imported from anywhere.
from setuptools import setup

# The README is reused as the long description of the package.
with open("README.md", "r", encoding="utf-8") as f:
    long_description = f.read()

setup(
    name="src",
    version="0.0.1",
    author="siddharth rana",
    description="A small package for dvc ml pipeline demo",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/sidh1603/DVC-ML-Demo-AIops.git",
    author_email="[email protected]",
    # List the subpackage explicitly; with only "src" a regular
    # (non-editable) install would not contain src.utils.
    packages=["src", "src.utils"],
    python_requires=">=3.7",
    install_requires=[
        'dvc',
        'pandas',
        'scikit-learn'
    ]
)
After creating the setup file, append the following two lines to requirements.txt:
# local packages -
-e .
pip install -r requirements.txt
Now open the stage_01_load_save.py file and add the following code:
from src.utils.all_utils import read_yaml, create_directory
import argparse
import pandas as pd
import os
def get_data(config_path):
    """Load the dataset named in the config and save it locally as CSV.

    The config is read with ``read_yaml``. Its ``data_source`` entry is
    loaded as a semicolon-separated CSV and written, comma-separated and
    without the index column, to
    ``<artifacts_dir>/<raw_local_dir>/<raw_local_file>``.

    Args:
        config_path: Path of the YAML configuration file.
    """
    config = read_yaml(config_path)

    remote_data_path = config["data_source"]
    df = pd.read_csv(remote_data_path, sep=";")

    # save dataset in the local directory
    # create path to directory: artifacts/raw_local_dir/data.csv
    artifacts_dir = config["artifacts"]['artifacts_dir']
    raw_local_dir = config["artifacts"]['raw_local_dir']
    raw_local_file = config["artifacts"]['raw_local_file']

    raw_local_dir_path = os.path.join(artifacts_dir, raw_local_dir)
    create_directory(dirs=[raw_local_dir_path])

    raw_local_file_path = os.path.join(raw_local_dir_path, raw_local_file)
    df.to_csv(raw_local_file_path, sep=",", index=False)
if __name__ == '__main__':
    # Command-line entry point: --config/-c selects the YAML config file
    # and defaults to config/config.yaml.
    args = argparse.ArgumentParser()
    args.add_argument("--config", "-c", default="config/config.yaml")
    parsed_args = args.parse_args()

    get_data(config_path=parsed_args.config)
Now run the stage_01_load_save.py script with this command:
python src/stage_01_load_save.py