Skip to content

Commit

Permalink
Update data_transformation.py
Browse files Browse the repository at this point in the history
  • Loading branch information
12ankitsingh99 authored Jan 15, 2024
1 parent 7aa49ae commit 61ae1f2
Showing 1 changed file with 124 additions and 0 deletions.
124 changes: 124 additions & 0 deletions src/components/data_transformation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import sys
from dataclasses import dataclass

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler

from src.exception import CustomException
from src.logger import logging
import os

from src.utils import save_object

@dataclass
class DataTransformationConfig:
preprocessor_obj_file_path=os.path.join('artifacts',"proprocessor.pkl")

class DataTransformation:
def __init__(self):
self.data_transformation_config=DataTransformationConfig()

def get_data_transformer_object(self):
'''
This function si responsible for data trnasformation
'''
try:
numerical_columns = ["writing_score", "reading_score"]
categorical_columns = [
"gender",
"race_ethnicity",
"parental_level_of_education",
"lunch",
"test_preparation_course",
]

num_pipeline= Pipeline(
steps=[
("imputer",SimpleImputer(strategy="median")),
("scaler",StandardScaler())

]
)

cat_pipeline=Pipeline(

steps=[
("imputer",SimpleImputer(strategy="most_frequent")),
("one_hot_encoder",OneHotEncoder()),
("scaler",StandardScaler(with_mean=False))
]

)

logging.info(f"Categorical columns: {categorical_columns}")
logging.info(f"Numerical columns: {numerical_columns}")

preprocessor=ColumnTransformer(
[
("num_pipeline",num_pipeline,numerical_columns),
("cat_pipelines",cat_pipeline,categorical_columns)

]


)

return preprocessor

except Exception as e:
raise CustomException(e,sys)

def initiate_data_transformation(self,train_path,test_path):

try:
train_df=pd.read_csv(train_path)
test_df=pd.read_csv(test_path)

logging.info("Read train and test data completed")

logging.info("Obtaining preprocessing object")

preprocessing_obj=self.get_data_transformer_object()

target_column_name="math_score"
numerical_columns = ["writing_score", "reading_score"]

input_feature_train_df=train_df.drop(columns=[target_column_name],axis=1)
target_feature_train_df=train_df[target_column_name]

input_feature_test_df=test_df.drop(columns=[target_column_name],axis=1)
target_feature_test_df=test_df[target_column_name]

logging.info(
f"Applying preprocessing object on training dataframe and testing dataframe."
)

input_feature_train_arr=preprocessing_obj.fit_transform(input_feature_train_df)
input_feature_test_arr=preprocessing_obj.transform(input_feature_test_df)

train_arr = np.c_[
input_feature_train_arr, np.array(target_feature_train_df)
]
test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]

logging.info(f"Saved preprocessing object.")

save_object(

file_path=self.data_transformation_config.preprocessor_obj_file_path,
obj=preprocessing_obj

)

return (
train_arr,
test_arr,
self.data_transformation_config.preprocessor_obj_file_path,
)
except Exception as e:
raise CustomException(e,sys)

0 comments on commit 61ae1f2

Please sign in to comment.