-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean_and_filter.py
36 lines (22 loc) · 983 Bytes
/
clean_and_filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import pandas as pd
from normalize_arabic_text import normalize_arabic_text as normalize
def filter_on_expense_date_after(df, date_value):
    """Return the rows of ``df`` whose expense date is strictly after ``date_value``.

    The ``date_of_expenses`` column of ``df`` is converted to datetime in
    place; values that cannot be parsed become ``NaT`` and therefore never
    satisfy the comparison.

    Args:
        df: DataFrame containing a ``date_of_expenses`` column.
        date_value: Cut-off date, in any form ``pd.to_datetime`` accepts.

    Returns:
        The subset of ``df`` with ``date_of_expenses`` later than the cut-off.
    """
    column = 'date_of_expenses'
    # Coerce first so unparseable entries turn into NaT instead of raising.
    df[column] = pd.to_datetime(df[column], errors='coerce')
    cutoff = pd.to_datetime(date_value)
    is_after_cutoff = df[column] > cutoff
    return df[is_after_cutoff]
def clean_data(df, num_cols=None, text_cols=None, date_cols=None):
    """Cast numeric/date columns and tidy text columns of ``df`` in place.

    Args:
        df: DataFrame to clean; it is modified in place.
        num_cols: Column labels passed to ``type_cast_and_fillna`` as the
            numeric columns. ``None`` (the default) means no columns.
        text_cols: Column labels that are converted to ``str`` and stripped
            of leading/trailing whitespace. ``None`` means no columns.
        date_cols: Column labels passed to ``type_cast_and_fillna`` as the
            date columns. ``None`` means no columns.

    Returns:
        The same ``df`` object, after cleaning.
    """
    # None sentinels replace the former mutable ``[]`` defaults, which are
    # shared between calls; callers that pass lists are unaffected.
    num_cols = [] if num_cols is None else num_cols
    text_cols = [] if text_cols is None else text_cols
    date_cols = [] if date_cols is None else date_cols

    print('cleaning data...')
    type_cast_and_fillna(df, num_cols, date_cols)

    # NOTE(review): astype(str) also stringifies missing values (e.g. NaN
    # becomes the text 'nan') - confirm this is intended for text columns.
    df[text_cols] = df[text_cols].astype(str)
    df[text_cols] = df[text_cols].apply(lambda column: column.str.strip())
    return df
def normalize_arabic_text(df, cols):
    """Run ``normalize`` over every cell of the selected columns, in place.

    Args:
        df: DataFrame to update; it is modified in place.
        cols: List of column labels whose cells are passed through
            ``normalize`` (imported at the top of this file).

    Returns:
        The same ``df`` object, with the selected columns replaced.
    """
    selected = df[cols]
    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map. Check the class (not the instance) so a column that
    # happens to be called "map" cannot be mistaken for the method.
    if hasattr(pd.DataFrame, 'map'):
        df[cols] = selected.map(normalize)
    else:
        df[cols] = selected.applymap(normalize)
    return df
def type_cast_and_fillna(df, num_columns, date_cols):
    """Convert date and numeric columns of ``df`` in place.

    Args:
        df: DataFrame to update; it is modified in place.
        num_columns: Column labels to convert to integers. Values that
            cannot be parsed as numbers, and missing values, become 0.
        date_cols: Column labels to convert with ``pd.to_datetime``.

    Returns:
        The same ``df`` object.
    """
    for date_name in date_cols:
        df[date_name] = pd.to_datetime(df[date_name])

    for num_name in num_columns:
        # Unparseable entries become NaN, which is then replaced by 0 so
        # the integer cast cannot fail on missing data.
        as_number = pd.to_numeric(df[num_name], errors='coerce')
        df[num_name] = as_number.fillna(0).astype(int)

    return df