# Load a CSV file into a DataFrame.
# Use header=None when the columns are not labeled in your csv file, so the
# first line is read as data rather than as column names.
df = pd.read_csv("pathToFile.csv", header=None)
# Load an Excel workbook instead (this rebinds df; pick one of the two loaders).
# Use header=None when the columns are not labeled in your xlsx file.
df = pd.read_excel("pathToFile.xlsx", header=None)
# Quick inspection of a freshly loaded DataFrame.
df.head()  # first rows
df.tail()  # last rows
df.shape  # (number of rows, number of columns)
df.columns  # column labels
df['column_name'].value_counts()  # how often each distinct value occurs
df.describe()  # summary statistics
df.info()  # dtypes and non-null counts
df.isnull().sum()  # number of missing values per column
# NOTE: the heatmap below requires `import seaborn as sns`.
sns.heatmap(df.isnull())  # visualise where the missing values are
# Drop columns; axis=1 is for columns.
df.drop(['column_1', 'column_2'], axis=1, inplace=True)
# Replace missing values in a column with that column's mean.
df['column_name'] = df['column_name'].fillna(df['column_name'].mean())
# Integer-encode a column. Factorize once and keep both results: calling
# pd.factorize again after the column has been overwritten would return the
# unique integer codes instead of the original values.
codes, unique = pd.factorize(df['column_name'])
df['column_name'] = codes
# Distinct values currently present in the column.
df['column_name'].unique()
# Convert the column to float, writing the result back to the same column.
df['column_name'] = df['column_name'].astype("float")
# Use the column's values as the index. A Series is passed rather than a
# label, so the column itself stays in the frame.
df = df.set_index(df['column_name'])
# Select rows with a boolean mask.
df_bangalore = df[df['city'] == 'bangalore']
df_lucknow = df[df['city'] == 'lucknow']
# Row labels of the DataFrame.
df.index
# Convert to a NumPy array.
# NOTE: index and column labels are not part of the result, and the array has
# a single dtype -- keep the columns float/integer to get a numeric array
# (non-numeric columns typically force dtype=object).
df.to_numpy()
df.sort_values(by='colName')  # sort rows by the values of one column
df.copy()  # independent copy of the DataFrame
# Missing-data helpers (each returns a new object; df is left unchanged).
df.dropna()  # drop rows that contain missing values
df.fillna(value=10)  # replace missing values with a constant
pd.isna(df)  # boolean mask of missing values
# Means: per column by default, per row with axis=1.
df.mean()
df.mean(axis=1)
# Stack row slices on top of each other.
pd.concat([df[:2], df[3:6]])
# Join two DataFrames on a shared key column (df1 and df2 are placeholders).
pd.merge(df1, df2, on='indexColName')
# Group rows by a column and sum each group.
df.groupby('colName').sum()
# Subtract one column from every column, aligning on the row index.
df.subtract(df['col'], axis=0)
# Write the DataFrame to disk.
df.to_csv('filename.csv')
df.to_excel('filename.xlsx', sheet_name='Sheet1')
# Label encoding: will change categorical data into one column of integer data.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(df['column_name'])
# One-hot encoding: each listed column is replaced by indicator columns named
# "<column>__<value>".
df_processed = pd.get_dummies(
    df, prefix_sep="__", columns=["column_1", "column_2"]
)
# Standardise features.
# NOTE: Make sure you use fit_transform only on the train dataset and use just
# transform for the test and post-deployment datasets, so they are scaled with
# the parameters fitted on the training data.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_data = scaler.fit_transform(train_data)
test_data = scaler.transform(test_data)