-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean_data.py
71 lines (58 loc) · 1.53 KB
/
clean_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# Written by Jason Cain, Spring 2022
import os
from typing import List
import pandas as pd
# Locations of the raw data file and its accompanying description file,
# relative to the working directory.
DATA_PATH = "./data/wdbc.data"
METADATA_PATH = "./data/wdbc.names"

# Each feature appears once per statistic, statistic-major, so the column
# names are "<statistic>_<feature>" for every pairing below.
_STATISTICS = ("mean", "stderror", "worst")
_FEATURES = (
    "radius",
    "texture",
    "perimeter",
    "area",
    "smoothness",
    "compactness",
    "concavity",
    "concave_points",
    "symmetry",
    "fractal_dimension",
)

# Column names applied to the data file: two identifying columns followed by
# the 30 statistic/feature columns.
HEADERS = ["ID", "diagnosis"] + [
    f"{statistic}_{feature}"
    for statistic in _STATISTICS
    for feature in _FEATURES
]
def load_files(
    directory: str = DATA_PATH, headers: List[str] = HEADERS
) -> pd.DataFrame:
    """Load the delimited data file into a DataFrame.

    Args:
        directory: Path of the file to read. Despite the name, the value is
            passed directly to ``pd.read_csv`` as the file location.
        headers: Column names applied to the file, in order.

    Returns:
        The parsed table, indexed by the first two columns (``ID`` and
        ``diagnosis`` when the default headers are used).
    """
    # Honor the caller-supplied path rather than always reading DATA_PATH.
    return pd.read_csv(directory, names=headers, index_col=[0, 1])
def get_only_mean_values(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Return only the columns whose name contains ``mean_``.

    Args:
        dataframe: Table to select columns from; it is not modified.

    Returns:
        A DataFrame holding the matching columns in their original order.
    """
    mean_columns = dataframe.filter(regex="mean_", axis="columns")
    return mean_columns
def generate_clean_dataframe() -> pd.DataFrame:
    """Load the data file with default settings and keep only ``mean_`` columns.

    Returns:
        The result of ``get_only_mean_values`` applied to ``load_files()``.
    """
    return get_only_mean_values(load_files())
def main() -> None:
    """Print summary statistics for the cleaned data, then the data itself."""
    mean_features = generate_clean_dataframe()
    summary = mean_features.describe()
    print(summary)
    print(mean_features)
# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()