diff --git a/_kaggle/_render/breast-cancer-diagnostic-classification/nb.py b/_kaggle/_render/breast-cancer-diagnostic-classification/nb.py index e9eef63..38f2819 100644 --- a/_kaggle/_render/breast-cancer-diagnostic-classification/nb.py +++ b/_kaggle/_render/breast-cancer-diagnostic-classification/nb.py @@ -24,7 +24,8 @@ # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os -for dirname, _, filenames in os.walk('/kaggle/input'): + +for dirname, _, filenames in os.walk("/kaggle/input"): for filename in filenames: print(os.path.join(dirname, filename)) @@ -35,7 +36,7 @@ import os import warnings -warnings.filterwarnings('ignore') +warnings.filterwarnings("ignore") import time as t import pandas as pd import numpy as np @@ -45,7 +46,17 @@ from sklearn.preprocessing import StandardScaler from sklearn.utils import resample from imblearn.over_sampling import SMOTE -from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score, precision_score, classification_report, roc_curve, auc, roc_auc_score +from sklearn.metrics import ( + confusion_matrix, + accuracy_score, + f1_score, + recall_score, + precision_score, + classification_report, + roc_curve, + auc, + roc_auc_score, +) from sklearn.linear_model import LogisticRegression from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier @@ -62,8 +73,7 @@ # %% -def data_load( -): #check for the availability of the dataset and change cwd if not found +def data_load(): # check for the availability of the dataset and change cwd if not found df = pd.read_csv("../input/breast-cancer-prediction/data.csv") return df @@ -73,15 +83,15 @@ def data_clean(df): def X_y_split(df): - X = df.drop(['diagnosis'], axis=1) - y = df['diagnosis'] + X = df.drop(["diagnosis"], axis=1) + y = df["diagnosis"] return X, y def data_split_scale(X, y, sampling): - #Splitting dataset into Train and Test Set + # Splitting dataset into Train and Test Set X_tr, X_test, y_tr, y_test = train_test_split(X, y, test_size=0.3) - #Feature Scaling using Standardization + # Feature Scaling using Standardization ss = StandardScaler() X_tr = ss.fit_transform(X_tr) X_test = ss.fit_transform(X_test) @@ -91,8 +101,7 @@ def data_split_scale(X, y, sampling): samp_sel = int(input("Now enter your selection for sampling strategy: \t")) samp = [sampling.upsample, sampling.downsample, sampling.smote] temp = samp[samp_sel - 1] - X_train, y_train = temp(X_train=pd.DataFrame(X_tr), - y_train=pd.DataFrame(y_tr)) + X_train, y_train = temp(X_train=pd.DataFrame(X_tr), y_train=pd.DataFrame(y_tr)) return pd.DataFrame(X_train), pd.DataFrame(X_test), y_train, y_test @@ -107,29 +116,28 @@ def data_split_scale(X, y, sampling): class sampling: def upsample(X_train, y_train): - #combine them back for resampling + # combine them back for resampling train_data = pd.concat([X_train, y_train], axis=1) # separate minority and majority classes negative = train_data[train_data.diagnosis == 0] positive = train_data[train_data.diagnosis == 1] # upsample minority - pos_upsampled = resample(positive, - replace=True, - n_samples=len(negative), - random_state=30) + pos_upsampled = resample( + positive, replace=True, n_samples=len(negative), random_state=30 + ) # combine majority and upsampled minority upsampled = pd.concat([negative, pos_upsampled]) # check new class counts - #print(upsampled.diagnosis.value_counts()) + # print(upsampled.diagnosis.value_counts()) 
print(upsampled.diagnosis.value_counts()) upsampled = upsampled.sample(frac=1) X_train = upsampled.iloc[:, 0:-2] y_train = upsampled.iloc[:, -1] - #graph barplot counts + # graph barplot counts return X_train, y_train def downsample(X_train, y_train): - #combine them back for resampling + # combine them back for resampling train_data = pd.concat([X_train, y_train], axis=1) # separate minority and majority classes negative = train_data[train_data.diagnosis == 0] @@ -139,7 +147,8 @@ def downsample(X_train, y_train): negative, replace=True, # sample with replacement n_samples=len(positive), # match number in minority class - random_state=30) # reproducible results + random_state=30, + ) # reproducible results # combine minority and downsampled majority downsampled = pd.concat([positive, neg_downsampled]) downsampled = downsampled.sample(frac=1) @@ -147,15 +156,15 @@ def downsample(X_train, y_train): y_train = downsampled.iloc[:, -1] # check new class counts print(downsampled.diagnosis.value_counts()) - #graph + # graph return X_train, y_train def smote(X_train, y_train): sm = SMOTE(random_state=30) X_train, y_train = sm.fit_resample(X_train, y_train) - y_train = pd.DataFrame(y_train, columns=['diagnosis']) + y_train = pd.DataFrame(y_train, columns=["diagnosis"]) print(y_train.diagnosis.value_counts()) - #graph + # graph return X_train, y_train @@ -207,10 +216,20 @@ def feat5(): df = data_load() # Loading Dataset into Dataframe df = data_clean(df) drop_cols = [ - 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', - 'symmetry_worst', 'fractal_dimension_worst', 'perimeter_mean', - 'perimeter_se', 'area_mean', 'area_se', 'concavity_mean', - 'concavity_se', 'concave points_mean', 'concave points_se' + "radius_worst", + "texture_worst", + "perimeter_worst", + "area_worst", + "symmetry_worst", + "fractal_dimension_worst", + "perimeter_mean", + "perimeter_se", + "area_mean", + "area_se", + "concavity_mean", + "concavity_se", + "concave points_mean", + "concave points_se", ] df_sf = df.drop(drop_cols, axis=1) X, y = X_y_split(df_sf) @@ -221,9 +240,7 @@ def feature(): "'\t The number '1' stands for 'ALL- FEATURES'. \n \t The number '2' stands for 'MEAN- FEATURES' . \n \t The number '3' stands for 'SQUARED- ERROR FEATURES'. \n \t The number '4' stands for 'WORST- FEATURES'. 
\n \t The number '5' stands for 'SELECTED- FEATURES'.'" ) selection = input("\t Enter your choice of feature selection: \t") - feat_options = [ - feat.feat1, feat.feat2, feat.feat3, feat.feat4, feat.feat5 - ] + feat_options = [feat.feat1, feat.feat2, feat.feat3, feat.feat4, feat.feat5] return feat_options[int(selection) - 1]() @@ -274,7 +291,7 @@ def rfc(dat): def knn(dat): # K-Nearest Neighbors start = t.time() - knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2) + knn = KNeighborsClassifier(n_neighbors=5, metric="minkowski", p=2) model_knn = knn.fit(dat[0], dat[2]) pred = model_knn.predict(dat[1]) pred_prob = model_knn.predict_proba(dat[1]) @@ -284,7 +301,7 @@ def knn(dat): def svc_l(dat): # Linear SVM start = t.time() - svc_l = SVC(kernel='linear', random_state=0, probability=True) + svc_l = SVC(kernel="linear", random_state=0, probability=True) model_svc_l = svc_l.fit(dat[0], dat[2]) pred = model_svc_l.predict(dat[1]) pred_prob = model_svc_l.predict_proba(dat[1]) @@ -294,7 +311,7 @@ def svc_l(dat): def svc_r(dat): # Kernel SVM start = t.time() - svc_r = SVC(kernel='rbf', random_state=0, probability=True) + svc_r = SVC(kernel="rbf", random_state=0, probability=True) model_svc_r = svc_r.fit(dat[0], dat[2]) pred = model_svc_r.predict(dat[1]) pred_prob = model_svc_r.predict_proba(dat[1]) @@ -320,8 +337,13 @@ def gnb(dat): def train_n_test(): ft = feat.feature() modelsss = [ - models.lr, models.dtc, models.rfc, models.knn, models.svc_l, - models.svc_r, models.gnb + models.lr, + models.dtc, + models.rfc, + models.knn, + models.svc_l, + models.svc_r, + models.gnb, ] print( "'\t The number '1' stands for 'LOGISTIC REGRESSION'. \n \t The number '2' stands for 'Decision Tree' . \n \t The number '3' stands for 'Random Forest Classifier'. \n \t The number '4' stands for 'KNN'. \n \t The number '5' stands for 'Liner SVM'. \n \t The number '6' stands for 'Kernal SVM'. 
\n \t The number '7' stands for 'Guassian NB'.'"
@@ -339,9 +361,13 @@ def train_n_test():
 def performance():
     out, y_test, mdl_selection = train_n_test()
     models = [
-        "Logistic Regression", "Desicion Tree Classifier",
-        "Random Forest Classifier", "KNN", "Liner SVM", "Kernal SVM",
-        "Guassian NB"
+        "Logistic Regression",
+        "Decision Tree Classifier",
+        "Random Forest Classifier",
+        "KNN",
+        "Linear SVM",
+        "Kernel SVM",
+        "Gaussian NB",
     ]
     cm_lr = confusion_matrix(y_test, out[2])
     sns.heatmap(cm_lr, annot=True, cmap="Reds")
@@ -350,50 +376,60 @@ def performance():
     rs = recall_score(y_test, out[2])
     fs = f1_score(y_test, out[2])
     ps = precision_score(y_test, out[2])
-    #Report Bar Plot
-    report = pd.DataFrame(
-        classification_report(y_test, out[2], output_dict=True))
+    # Report Bar Plot
+    report = pd.DataFrame(classification_report(y_test, out[2], output_dict=True))
     rg = report.drop(report.index[3]).drop(report.columns[2:], axis=1)
-    plt.style.use('seaborn')
-    rg.plot(kind='bar', color=["red", "salmon"])
+    plt.style.use("seaborn")
+    rg.plot(kind="bar", color=["red", "salmon"])
     plt.title("Classification Report of {}".format(models[mdl_selection - 1]))
-    plt.legend(report.columns,
-               ncol=2,
-               loc="lower center",
-               bbox_to_anchor=(0.5, -0.3))
+    plt.legend(report.columns, ncol=2, loc="lower center", bbox_to_anchor=(0.5, -0.3))
     plt.yticks(np.arange(0, 1.05, step=0.05))
     print(
-        '\n\t The accuracy score of {} with given parameters is: {}%.'.format(
-            models[mdl_selection - 1], acs * 100))
-    print('\n\t The recall score of {} with given parameters is: {}%.'.format(
-        models[mdl_selection - 1], rs * 100))
+        "\n\t The accuracy score of {} with given parameters is: {}%.".format(
+            models[mdl_selection - 1], acs * 100
+        )
+    )
+    print(
+        "\n\t The recall score of {} with given parameters is: {}%.".format(
+            models[mdl_selection - 1], rs * 100
+        )
+    )
+    print(
+        "\n\t The precision score of {} with given parameters is: {}%.".format(
+            models[mdl_selection - 1], ps * 100
+        )
+    )
     print(
-        '\n\t The precision score of {} with given parameters is: {}%.'.format(
-            models[mdl_selection - 1], ps * 100))
-    print('\n\t The F1 score of {} with given parameters is: {}%.'.format(
-        models[mdl_selection - 1], fs * 100))
+        "\n\t The F1 score of {} with given parameters is: {}%.".format(
+            models[mdl_selection - 1], fs * 100
+        )
+    )
     print(
-        '\n\t The training and testing time taken by {} with given parameters is: {} seconds.'
- .format(models[mdl_selection - 1], out[1])) + "\n\t The training and testing time taken by {} with given parameters is: {} seconds.".format( + models[mdl_selection - 1], out[1] + ) + ) prob = out[3] prob = prob[:, 1] - #ROC + # ROC false_pos, true_pos, thresh = roc_curve(y_test, prob, pos_label=1) auc_score = roc_auc_score(y_test, prob) rand_pr = [0 for i in range(len(y_test))] p_fpr, p_tpr, _ = roc_curve(y_test, rand_pr, pos_label=1) plt.figure() - plt.style.use('seaborn') - plt.plot(false_pos, - true_pos, - linestyle='--', - color='orange', - label=models[mdl_selection - 1]) - plt.plot(p_fpr, p_tpr, linestyle='--', color='green') - plt.title('ROC Curve') - plt.xlabel('False Positive Rate') - plt.ylabel('True Positive Rate') - plt.legend(loc='best') + plt.style.use("seaborn") + plt.plot( + false_pos, + true_pos, + linestyle="--", + color="orange", + label=models[mdl_selection - 1], + ) + plt.plot(p_fpr, p_tpr, linestyle="--", color="green") + plt.title("ROC Curve") + plt.xlabel("False Positive Rate") + plt.ylabel("True Positive Rate") + plt.legend(loc="best") return out[0], out[2], auc_score diff --git a/_kaggle/_render/heart-diseases-modeling/nb.py b/_kaggle/_render/heart-diseases-modeling/nb.py index 0b08888..1af910f 100644 --- a/_kaggle/_render/heart-diseases-modeling/nb.py +++ b/_kaggle/_render/heart-diseases-modeling/nb.py @@ -25,7 +25,8 @@ # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os -for dirname, _, filenames in os.walk('/kaggle/input'): + +for dirname, _, filenames in os.walk("/kaggle/input"): for filename in filenames: print(os.path.join(dirname, filename)) @@ -44,7 +45,15 @@ import matplotlib.pyplot as plt from sklearn.preprocessing import scale, StandardScaler from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score -from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, r2_score, roc_auc_score, roc_curve, classification_report +from sklearn.metrics import ( + confusion_matrix, + accuracy_score, + mean_squared_error, + r2_score, + roc_auc_score, + roc_curve, + classification_report, +) from sklearn.linear_model import LogisticRegression from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import SVC @@ -87,13 +96,19 @@ warnings.filterwarnings("ignore", category=DeprecationWarning) warnings.filterwarnings("ignore", category=FutureWarning) models = [ - LogisticRegression, KNeighborsClassifier, SVC, MLPClassifier, - DecisionTreeClassifier, RandomForestClassifier, GradientBoostingClassifier, - XGBClassifier, LGBMClassifier -] #,CatBoostClassifier -pd.set_option('display.max_columns', None) -pd.set_option('display.max_rows', 10) -pd.set_option('display.float_format', lambda x: '%.5f' % x) + LogisticRegression, + KNeighborsClassifier, + SVC, + MLPClassifier, + DecisionTreeClassifier, + RandomForestClassifier, + GradientBoostingClassifier, + XGBClassifier, + LGBMClassifier, +] # ,CatBoostClassifier +pd.set_option("display.max_columns", None) +pd.set_option("display.max_rows", 10) +pd.set_option("display.float_format", lambda x: "%.5f" % x) # %% [markdown] # ## Adding Functions @@ -102,55 +117,50 @@ # %% def degisken_tiplerine_ayirma(data, cat_th, car_th): """ - Veri:data parametresi ili fonksiyona girilen verinin değişkenlerin sınıflandırılması. 
- Parameters - ---------- - data: pandas.DataFrame - İşlem yapılacak veri seti - - cat_th:int - categoric değişken threshold değeri - - car_th:int - Cardinal değişkenler için threshold değeri - - Returns - ------- - cat_deg:list - categorik değişken listesi - num_deg:list - numeric değişken listesi - car_deg:list - categoric ama cardinal değişken listesi - - Examples - ------- - df = dataset_yukle("breast_cancer") - cat,num,car=degisken_tiplerine_ayirma(df,10,20) - Notes - ------- - cat_deg + num_deg + car_deg = toplam değişken sayısı - - """ + Veri:data parametresi ili fonksiyona girilen verinin değişkenlerin sınıflandırılması. + Parameters + ---------- + data: pandas.DataFrame + İşlem yapılacak veri seti + + cat_th:int + categoric değişken threshold değeri + + car_th:int + Cardinal değişkenler için threshold değeri + + Returns + ------- + cat_deg:list + categorik değişken listesi + num_deg:list + numeric değişken listesi + car_deg:list + categoric ama cardinal değişken listesi + + Examples + ------- + df = dataset_yukle("breast_cancer") + cat,num,car=degisken_tiplerine_ayirma(df,10,20) + Notes + ------- + cat_deg + num_deg + car_deg = toplam değişken sayısı + + """ num_but_cat = [ - i for i in data.columns - if data[i].dtypes != "O" and data[i].nunique() < cat_th + i for i in data.columns if data[i].dtypes != "O" and data[i].nunique() < cat_th ] car_deg = [ - i for i in data.columns - if data[i].dtypes == "O" and data[i].nunique() > car_th + i for i in data.columns if data[i].dtypes == "O" and data[i].nunique() > car_th ] num_deg = [ - i for i in data.columns - if data[i].dtypes != "O" and i not in num_but_cat + i for i in data.columns if data[i].dtypes != "O" and i not in num_but_cat ] - cat_deg = [ - i for i in data.columns if data[i].dtypes == "O" and i not in car_deg - ] + cat_deg = [i for i in data.columns if data[i].dtypes == "O" and i not in car_deg] cat_deg = cat_deg + num_but_cat @@ -201,16 +211,19 @@ def categoric_ozet(data, degisken, plot=False, null_control=False): """ print( - pd.DataFrame({ + pd.DataFrame( + { + degisken: data[degisken].value_counts(), + "Ratio": 100 * data[degisken].value_counts() / len(data), + } + ) + ) + tablo = pd.DataFrame( + { degisken: data[degisken].value_counts(), - "Ratio": 100 * data[degisken].value_counts() / len(data) - })) - tablo = pd.DataFrame({ - degisken: - data[degisken].value_counts(), - "Ratio": - 100 * data[degisken].value_counts() / len(data) - }) + "Ratio": 100 * data[degisken].value_counts() / len(data), + } + ) print("##########################################") if plot: sns.countplot(x=data[degisken], data=data) @@ -258,8 +271,8 @@ def threshold_degisimi(data, degisken): alt_limit, ust_limit = outlier_threshold(data, degisken) data.loc[(data[degisken] < alt_limit), degisken] = alt_limit data.loc[(data[degisken] > ust_limit), degisken] = ust_limit - #data[data[degisken]ust_limit][degisken]=ust_limit + # data[data[degisken]ust_limit][degisken]=ust_limit return data @@ -292,9 +305,7 @@ def numeric_ozet(data, degisken, plot=False, null_control=False): for i in cat_deg: tablo=categoric_ozet(df,i,True,True) """ - quantiles = [ - 0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.99 - ] + quantiles = [0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.99] print(data[degisken].describe(quantiles).T) if plot: @@ -309,16 +320,15 @@ def numeric_ozet(data, degisken, plot=False, null_control=False): def missing_values_table(dataframe, na_name=False): - na_columns = [ - col for col in dataframe.columns if 
dataframe[col].isnull().sum() > 0
-    ]
+    na_columns = [col for col in dataframe.columns if dataframe[col].isnull().sum() > 0]
 
     n_miss = dataframe[na_columns].isnull().sum().sort_values(ascending=False)
-    ratio = (dataframe[na_columns].isnull().sum() / dataframe.shape[0] *
-             100).sort_values(ascending=False)
-    missing_df = pd.concat([n_miss, np.round(ratio, 2)],
-                           axis=1,
-                           keys=['n_miss', 'ratio'])
+    ratio = (
+        dataframe[na_columns].isnull().sum() / dataframe.shape[0] * 100
+    ).sort_values(ascending=False)
+    missing_df = pd.concat(
+        [n_miss, np.round(ratio, 2)], axis=1, keys=["n_miss", "ratio"]
+    )
     print(missing_df, end="\n")
 
     if na_name:
@@ -326,9 +336,9 @@
 
 
 def one_hot_encoder(dataframe, categorical_cols, drop_first=True):
-    dataframe = pd.get_dummies(dataframe,
-                               columns=categorical_cols,
-                               drop_first=drop_first)
+    dataframe = pd.get_dummies(
+        dataframe, columns=categorical_cols, drop_first=drop_first
+    )
     return dataframe
@@ -337,10 +347,9 @@
 
     y = df[target]
 
-    X_train, X_test, y_train, y_test = train_test_split(X,
-                                                        y,
-                                                        test_size=0.15,
-                                                        random_state=42)
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.15, random_state=42
+    )
     model_fit = model().fit(X_train, y_train)
     y_pred = model_fit.predict(X_test)
     acc = accuracy_score(y_test, y_pred)
@@ -352,19 +361,23 @@ def target_analyser(dataframe, target, num_deg, cat_deg):
     for degisken in dataframe.columns:
         if degisken in cat_deg:
             print(degisken, ":", len(dataframe[degisken].value_counts()))
-            print(pd.DataFrame({
-                "COUNT":
-                dataframe[degisken].value_counts(),
-                "RATIO":
-                dataframe[degisken].value_counts() / len(dataframe),
-                "TARGET_MEAN":
-                dataframe.groupby(degisken)[target].mean()
-            }),
-                  end="\n\n\n")
+            print(
+                pd.DataFrame(
+                    {
+                        "COUNT": dataframe[degisken].value_counts(),
+                        "RATIO": dataframe[degisken].value_counts() / len(dataframe),
+                        "TARGET_MEAN": dataframe.groupby(degisken)[target].mean(),
+                    }
+                ),
+                end="\n\n\n",
+            )
         if degisken in num_deg:
-            print(pd.DataFrame(
-                {"TARGET_MEAN": dataframe.groupby(target)[degisken].mean()}),
-                  end="\n\n\n")
+            print(
+                pd.DataFrame(
+                    {"TARGET_MEAN": dataframe.groupby(target)[degisken].mean()}
+                ),
+                end="\n\n\n",
+            )
 
 
 # %% [markdown]
@@ -372,7 +385,7 @@ def target_analyser(dataframe, target, num_deg, cat_deg):
 # ![This is an image](https://www.sbbs-soc.com/wp-content/uploads/2020/09/Heart-Disease.jpg)
 
 # %%
-#loading dataset
+# loading dataset
 df = pd.read_csv("../input/heart-disease-uci/heart.csv")
 df.head()
@@ -394,12 +407,12 @@ def target_analyser(dataframe, target, num_deg, cat_deg):
 # * target: Heart disease (0 = no, 1 = yes)
 
 # %%
-#Analysis of Dataset
+# Analysis of Dataset
 dataset_ozet(df)
 cat_deg, num_deg, car_deg = degisken_tiplerine_ayirma(df, 10, 20)
 
 # %%
-#EDA of Dataset
+# EDA of Dataset
 for i in cat_deg:
     categoric_ozet(df, i, True, True)
@@ -407,62 +420,60 @@ def target_analyser(dataframe, target, num_deg, cat_deg):
     numeric_ozet(df, i, True, True)
 
 # %%
-#All columns analaysis based on target column
+# All columns analysis based on target column
 target_analyser(df, "target", num_deg, cat_deg)
 
 # %%
-#Filling missing values
+# Filling missing values
 null_cols = missing_values_table(df, True)
 for i in null_cols:
     df[i].fillna(df[i].transform("mean"), inplace=True)
-#There is no missing values
+# There are no missing values
 
 # %%
-#Outlier processing
+# Outlier processing
 for i in num_deg:
     df = threshold_degisimi(df, i)
 
 # %%
-#Data Extraction
+# Data Extraction
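# %% [markdown]
# A minimal side sketch (not from the original notebook) of the same age binning
# with `pd.cut`, assuming the numeric `age` column and the `np`/`pd` imports from
# the cells above; `right=False` keeps each bin closed on the left ([40, 50), ...),
# which matches the `>=` / `<` masks used in the next cell:

# %%
age_bins = [0, 40, 50, 60, np.inf]  # bin edges mirroring the df.loc thresholds below
age_labels = ["Young", "Middle Age", "Pre-Old", "Old"]
df["NEW_AGE_CAT"] = pd.cut(df["age"], bins=age_bins, labels=age_labels, right=False)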
df.age.describe() -df.loc[(df["age"] < 40), 'NEW_AGE_CAT'] = 'Young' -df.loc[(df["age"] >= 40) & (df["age"] < 50), 'NEW_AGE_CAT'] = 'Middle Age' -df.loc[(df["age"] >= 50) & (df["age"] < 60), 'NEW_AGE_CAT'] = 'Pre-Old' -df.loc[(df["age"] >= 60), 'NEW_AGE_CAT'] = 'Old' -df.groupby('NEW_AGE_CAT')["target"].mean() +df.loc[(df["age"] < 40), "NEW_AGE_CAT"] = "Young" +df.loc[(df["age"] >= 40) & (df["age"] < 50), "NEW_AGE_CAT"] = "Middle Age" +df.loc[(df["age"] >= 50) & (df["age"] < 60), "NEW_AGE_CAT"] = "Pre-Old" +df.loc[(df["age"] >= 60), "NEW_AGE_CAT"] = "Old" +df.groupby("NEW_AGE_CAT")["target"].mean() # %% df.trestbps.describe() -df.loc[(df["trestbps"] < 90), 'NEW_RBP_CAT'] = 'Low' -df.loc[(df["trestbps"] >= 90) & (df["trestbps"] < 120), - 'NEW_RBP_CAT'] = 'Ideal' -df.loc[(df["trestbps"] >= 120) & (df["trestbps"] < 140), - 'NEW_RBP_CAT'] = 'Pre-HIGH' -df.loc[(df["trestbps"] >= 140), 'NEW_RBP_CAT'] = 'Hypertension' -df.groupby('NEW_RBP_CAT')["target"].mean() +df.loc[(df["trestbps"] < 90), "NEW_RBP_CAT"] = "Low" +df.loc[(df["trestbps"] >= 90) & (df["trestbps"] < 120), "NEW_RBP_CAT"] = "Ideal" +df.loc[(df["trestbps"] >= 120) & (df["trestbps"] < 140), "NEW_RBP_CAT"] = "Pre-HIGH" +df.loc[(df["trestbps"] >= 140), "NEW_RBP_CAT"] = "Hypertension" +df.groupby("NEW_RBP_CAT")["target"].mean() # %% df.chol.describe() -df.loc[(df["chol"] < 200), 'NEW_CHOL_CAT'] = 'Ideal' -df.loc[(df["chol"] >= 200) & (df["chol"] < 240), 'NEW_CHOL_CAT'] = 'HIGH' -df.loc[(df["chol"] >= 240), 'NEW_CHOL_CAT'] = 'Very Risky' -df.groupby('NEW_CHOL_CAT')["target"].mean() +df.loc[(df["chol"] < 200), "NEW_CHOL_CAT"] = "Ideal" +df.loc[(df["chol"] >= 200) & (df["chol"] < 240), "NEW_CHOL_CAT"] = "HIGH" +df.loc[(df["chol"] >= 240), "NEW_CHOL_CAT"] = "Very Risky" +df.groupby("NEW_CHOL_CAT")["target"].mean() # %% -#Encoding of categoric columns +# Encoding of categoric columns cat_deg, num_deg, car_deg = degisken_tiplerine_ayirma(df, 10, 20) cat_deg = [i for i in cat_deg if i != "target"] df = one_hot_encoder(df, cat_deg) df.head() # %% -#Scaling of numeric columns +# Scaling of numeric columns scaler = StandardScaler() df[num_deg] = scaler.fit_transform(df[num_deg]) # %% -#Comparing of all models +# Comparing of all models for mod in models: model_karsilastirma(df, mod, "target") @@ -473,10 +484,9 @@ def target_analyser(dataframe, target, num_deg, cat_deg): X = df.drop(columns="target") y = df["target"] -X_train, X_test, y_train, y_test = train_test_split(X, - y, - test_size=0.15, - random_state=42) +X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.15, random_state=42 +) svm = SVC() svm_tuned = SVC(C=1, kernel="linear").fit(X_train, y_train) @@ -492,10 +502,9 @@ def target_analyser(dataframe, target, num_deg, cat_deg): X = df.drop(columns="target") y = df["target"] -X_train, X_test, y_train, y_test = train_test_split(X, - y, - test_size=0.15, - random_state=42) +X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.15, random_state=42 +) loj_model = LogisticRegression(solver="liblinear").fit(X_train, y_train) @@ -510,13 +519,13 @@ def target_analyser(dataframe, target, num_deg, cat_deg): X = df.drop(columns="target") y = df["target"] -X_train, X_test, y_train, y_test = train_test_split(X, - y, - test_size=0.15, - random_state=42) +X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.15, random_state=42 +) -lgbm_tuned = LGBMClassifier(learning_rate=0.01, max_depth=5, - n_estimators=250).fit(X_train, y_train) +lgbm_tuned = LGBMClassifier(learning_rate=0.01, max_depth=5, 
n_estimators=250).fit( + X_train, y_train +) y_pred = lgbm_tuned.predict(X_test) acc = accuracy_score(y_test, y_pred) diff --git a/_kaggle/_render/heart-failure-prediction-using-knn-h2o-ai/nb.py b/_kaggle/_render/heart-failure-prediction-using-knn-h2o-ai/nb.py index 2cad8b0..0d6b81a 100644 --- a/_kaggle/_render/heart-failure-prediction-using-knn-h2o-ai/nb.py +++ b/_kaggle/_render/heart-failure-prediction-using-knn-h2o-ai/nb.py @@ -19,15 +19,15 @@ # In this Notebook we will see how to apply KNN and how to use H2o.ai automl library for classification task. If you find this notebook usefull please Upvote! # %% id="-eFeHGM7wjXi" -#importing Libraries +# importing Libraries import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns # %% id="BeFbH5mmwjXj" outputId="1d28b870-a332-42d9-db22-6bf5979bd77b" -#importing dataset -df = pd.read_csv('../input/heart-failure-prediction/heart.csv') +# importing dataset +df = pd.read_csv("../input/heart-failure-prediction/heart.csv") df.head() # %% id="58V7HLIKwjXk" outputId="16830d86-2484-458f-bf53-30e988f99219" @@ -41,7 +41,7 @@ # %% id="vCpUngBKVac2" outputId="68162c6f-6bd5-4794-c332-e15701dabfe5" df.isnull().sum() -#There is no null values +# There is no null values # %% [markdown] id="o2qmlr1DdLX9" # ## Data Exploration @@ -51,14 +51,14 @@ # Now we can plot the distribution of data wrt dependent variable i.e HeartDisease # %% id="JsDv6UI4wjXn" outputId="6b218281-de0c-472c-b871-6595a7d069d2" -sns.pairplot(df, hue='HeartDisease') +sns.pairplot(df, hue="HeartDisease") # %% [markdown] id="ahSOpNk1zEta" # 5. Which are most useful variable in classification? Prove using correlation. # %% id="uXcHuo7pzCLZ" outputId="6c4cdfd1-7b22-4967-89e8-ffb3e9e2a152" corr = df.corr() -corr.style.background_gradient(cmap='coolwarm') +corr.style.background_gradient(cmap="coolwarm") # %% id="OG7hK1UJ0Rja" outputId="85b4bd55-da22-45b9-cce1-4cfea00ab94f" sns.set_theme(style="whitegrid") @@ -71,17 +71,17 @@ df.hist(ax=ax) # %% id="kzgu_ezi03y8" outputId="76bbae80-a830-45c2-b0e6-e6c787ecd193" -df.HeartDisease.value_counts().plot(kind='bar') +df.HeartDisease.value_counts().plot(kind="bar") plt.xlabel("Heart Diseases or Not") plt.ylabel("Count") plt.title("Heart Diseases") -#Here we can see that dataset is not much imbalanced so there is no need to balance. +# Here we can see that dataset is not much imbalanced so there is no need to balance. 
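# %% [markdown]
# A one-liner sketch (an aside, assuming the same 0/1 `HeartDisease` column) that
# backs the claim above with numbers instead of a bar chart:

# %%
class_ratio = df["HeartDisease"].value_counts(normalize=True)
print(class_ratio)  # values close to 0.5 / 0.5 mean the classes are roughly balanced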
# %% [markdown] id="-7E3IpLKdRV1" # ## Data Preprocessing # %% id="zaHcUNcbWjkZ" -cat = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'] +cat = ["Sex", "ChestPainType", "RestingECG", "ExerciseAngina", "ST_Slope"] # %% id="tVKS4fLEWBZc" from sklearn.preprocessing import LabelEncoder @@ -90,20 +90,17 @@ df[cat] = df[cat].apply(lb.fit_transform) # %% id="4OPXop0HwjXo" outputId="997a4068-3bfd-4298-8a60-584126552665" -X = df.drop('HeartDisease', axis=1) +X = df.drop("HeartDisease", axis=1) X.head() # %% id="KLJJYpttwjXo" outputId="00502435-bf1a-42e9-d631-c767be3f82bc" -y = df['HeartDisease'] +y = df["HeartDisease"] y.head() # %% id="0T5IVw1awjXp" from sklearn.model_selection import train_test_split -X_train, X_test, y_train, y_test = train_test_split(X, - y, - test_size=0.2, - random_state=0) +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # %% id="Cw17uwfRwjXp" outputId="e218a7c7-4a0e-42b3-c151-3b0b3d4e5205" X_train.shape @@ -128,7 +125,7 @@ from sklearn.neighbors import KNeighborsClassifier # %% id="ruOf41A6wjXw" outputId="d39b639a-1acf-484a-dbea-0684c9945b3b" -knn = KNeighborsClassifier(n_neighbors=5, metric='euclidean', p=2) +knn = KNeighborsClassifier(n_neighbors=5, metric="euclidean", p=2) knn.fit(X_train, y_train) # %% id="To358l1ewjXw" outputId="67755f95-06f1-427d-aa9b-49660f3c2c51" @@ -154,14 +151,14 @@ # %% id="vefokjVQ2loi" outputId="73ce3707-11c8-40c1-c6b4-b5f4b9eefb18" from sklearn.metrics import classification_report -target_names = ['Heart Diseases', 'Normal'] +target_names = ["Heart Diseases", "Normal"] print(classification_report(y_test, y_pred, target_names=target_names)) # %% [markdown] id="mIL4fznQ3A4P" # To select optimize k value we will use elbow method # %% id="OrZlJRGMwjXy" -#For selecting K value +# For selecting K value error_rate = [] # Will take some time @@ -176,22 +173,24 @@ import matplotlib.pyplot as plt plt.figure(figsize=(10, 6)) -plt.plot(range(1, 40), - error_rate, - color='red', - linestyle='dashed', - marker='o', - markerfacecolor='green', - markersize=10) -plt.title('Error Rate vs. K Value') -plt.xlabel('K') -plt.ylabel('Error Rate') +plt.plot( + range(1, 40), + error_rate, + color="red", + linestyle="dashed", + marker="o", + markerfacecolor="green", + markersize=10, +) +plt.title("Error Rate vs. 
K Value") +plt.xlabel("K") +plt.ylabel("Error Rate") # %% id="AgeM11Jv3F9X" outputId="d58693b2-1908-4924-e1a6-2b16792815d9" -#From graph we can see that optimize k value is 16,17,18 +# From graph we can see that optimize k value is 16,17,18 # Now we will train our KNN classifier with this k values -knn = KNeighborsClassifier(n_neighbors=3, metric='euclidean', p=2) +knn = KNeighborsClassifier(n_neighbors=3, metric="euclidean", p=2) knn.fit(X_train, y_train) # %% id="dCLyRoU13X9n" outputId="f7ae117c-2e18-496c-925d-a78728eefabe" @@ -211,7 +210,7 @@ # %% id="Z7NJY1xg3a2v" outputId="203352f9-cb86-433d-c406-68e139682617" from sklearn.metrics import classification_report -target_names = ['Diabetes', 'Normal'] +target_names = ["Diabetes", "Normal"] print(classification_report(y_test, y_pred, target_names=target_names)) # %% [markdown] id="IkXajhdm4Pur" @@ -241,6 +240,7 @@ # %% id="bXVffHNX4JkT" outputId="360dcee6-ea22-4555-9cfe-0e117b36fe41" import h2o + # We will be using default parameter Here with H2O init method h2o.init() @@ -250,7 +250,7 @@ # %% id="DlXM7bxrY8Fn" outputId="28a1baa7-be21-4f52-8b36-05d90ba90c7e" # Data Transform - Split train : test datasets -train, valid = hf.split_frame(ratios=[.80], seed=1234) +train, valid = hf.split_frame(ratios=[0.80], seed=1234) print("Training Dataset", train.shape) print("Validation Dataset", valid.shape) @@ -272,10 +272,9 @@ # Run AutoML for YY base models (limited to 1 hour max runtime by default) aml = H2OAutoML(max_models=12, seed=1234, balance_classes=True) -aml.train(x=featureColumns, - y=targetColumn, - training_frame=train, - validation_frame=valid) +aml.train( + x=featureColumns, y=targetColumn, training_frame=train, validation_frame=valid +) # %% id="ql70026xZfdI" outputId="95e3404d-ad3b-46fe-a432-ab34593de3bc" lb = aml.leaderboard @@ -303,24 +302,29 @@ valid_dataset = valid.as_data_frame() # Evaluate the skill of the Trained model -acc = accuracy_score(valid_dataset[targetColumn], - np.round(abs(predicted_data['predict']))) -classReport = classification_report(valid_dataset[targetColumn], - np.round(abs(predicted_data['predict']))) -confMatrix = confusion_matrix(valid_dataset[targetColumn], - np.round(abs(predicted_data['predict']))) +acc = accuracy_score( + valid_dataset[targetColumn], np.round(abs(predicted_data["predict"])) +) +classReport = classification_report( + valid_dataset[targetColumn], np.round(abs(predicted_data["predict"])) +) +confMatrix = confusion_matrix( + valid_dataset[targetColumn], np.round(abs(predicted_data["predict"])) +) print() -print('Testing Results of the trained model: ') +print("Testing Results of the trained model: ") print() -print('Accuracy : ', acc) +print("Accuracy : ", acc) print() -print('Confusion Matrix :\n', confMatrix) +print("Confusion Matrix :\n", confMatrix) print() -print('Classification Report :\n', classReport) +print("Classification Report :\n", classReport) # Confusion matrix -skplt.metrics.plot_confusion_matrix(valid_dataset[targetColumn], - np.round(abs(predicted_data['predict'])), - figsize=(7, 7)) +skplt.metrics.plot_confusion_matrix( + valid_dataset[targetColumn], + np.round(abs(predicted_data["predict"])), + figsize=(7, 7), +) plt.show() diff --git a/_kaggle/_render/hollywood-theatrical-market-synopsis-1995-to-2021/nb.py b/_kaggle/_render/hollywood-theatrical-market-synopsis-1995-to-2021/nb.py index 9ebd7ba..3601d32 100644 --- a/_kaggle/_render/hollywood-theatrical-market-synopsis-1995-to-2021/nb.py +++ b/_kaggle/_render/hollywood-theatrical-market-synopsis-1995-to-2021/nb.py @@ -24,7 
+24,8 @@ # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os -for dirname, _, filenames in os.walk('/kaggle/input'): + +for dirname, _, filenames in os.walk("/kaggle/input"): for filename in filenames: print(os.path.join(dirname, filename)) import numpy as np @@ -54,27 +55,32 @@ ) Data_Annual_Ticket = pd.read_csv( "/kaggle/input/hollywood-theatrical-market-synopsis-1995-to-2021/AnnualTicketSales.csv", - thousands=',') + thousands=",", +) # %% [markdown] # ## **Cleaning and Tiding Data** # %% -Data_Annual_Ticket["TICKETS SOLD"] = Data_Annual_Ticket[ - "TICKETS SOLD"].replace(',', '') +Data_Annual_Ticket["TICKETS SOLD"] = Data_Annual_Ticket["TICKETS SOLD"].replace(",", "") Data_Annual_Ticket["TOTAL BOX OFFICE"] = Data_Annual_Ticket[ - "TOTAL BOX OFFICE"].str.replace(',', '') + "TOTAL BOX OFFICE" +].str.replace(",", "") Data_Annual_Ticket["TOTAL BOX OFFICE"] = Data_Annual_Ticket[ - "TOTAL BOX OFFICE"].str.replace('$', '') + "TOTAL BOX OFFICE" +].str.replace("$", "") Data_Annual_Ticket["TOTAL INFLATION ADJUSTED BOX OFFICE"] = Data_Annual_Ticket[ - "TOTAL INFLATION ADJUSTED BOX OFFICE"].str.replace(',', '') + "TOTAL INFLATION ADJUSTED BOX OFFICE" +].str.replace(",", "") Data_Annual_Ticket["TOTAL INFLATION ADJUSTED BOX OFFICE"] = Data_Annual_Ticket[ - "TOTAL INFLATION ADJUSTED BOX OFFICE"].str.replace('$', '') + "TOTAL INFLATION ADJUSTED BOX OFFICE" +].str.replace("$", "") Data_Annual_Ticket["AVERAGE TICKET PRICE"] = Data_Annual_Ticket[ - "AVERAGE TICKET PRICE"].str.replace('$', '') + "AVERAGE TICKET PRICE" +].str.replace("$", "") Data_Annual_Ticket = Data_Annual_Ticket.drop(labels="Unnamed: 5", axis=1) @@ -85,26 +91,28 @@ # ## **Changing the type of Data(object to float)** # %% -Data_Annual_Ticket['TICKETS SOLD'] = Data_Annual_Ticket['TICKETS SOLD'].astype( - float) -Data_Annual_Ticket['TOTAL BOX OFFICE'] = Data_Annual_Ticket[ - 'TOTAL BOX OFFICE'].astype(float) +Data_Annual_Ticket["TICKETS SOLD"] = Data_Annual_Ticket["TICKETS SOLD"].astype(float) +Data_Annual_Ticket["TOTAL BOX OFFICE"] = Data_Annual_Ticket["TOTAL BOX OFFICE"].astype( + float +) # %% [markdown] # ## **Using bar chart to illustrate the total box office each year** # %% -px.bar(Data_Annual_Ticket, - x='YEAR', - y='TOTAL BOX OFFICE', - title='Total Box Office vs. Year') +px.bar( + Data_Annual_Ticket, + x="YEAR", + y="TOTAL BOX OFFICE", + title="Total Box Office vs. 
Year", +) # %% [markdown] # ## **Calculating the total box office if last two years were normal years (*using linear regression*)** # %% x = list(range(0, (2020 - 1995))) -y = list(Data_Annual_Ticket['TOTAL BOX OFFICE']) +y = list(Data_Annual_Ticket["TOTAL BOX OFFICE"]) y.reverse() y.pop() y.pop() @@ -112,36 +120,36 @@ x1 = list(range(0, (2022 - 1995))) y1 = [slope * x + intercept for x in x1] y1.reverse() -Data_Annual_Ticket['TOTAL BOX OFFICE WITHOUT COVID'] = y1 -Data_Annual_Ticket["Diff"] = Data_Annual_Ticket[ - 'TOTAL BOX OFFICE WITHOUT COVID'] - Data_Annual_Ticket['TOTAL BOX OFFICE'] +Data_Annual_Ticket["TOTAL BOX OFFICE WITHOUT COVID"] = y1 +Data_Annual_Ticket["Diff"] = ( + Data_Annual_Ticket["TOTAL BOX OFFICE WITHOUT COVID"] + - Data_Annual_Ticket["TOTAL BOX OFFICE"] +) # %% [markdown] # ## **Illustrate the difference between total box office with covid and without covid** # %% -px.line(Data_Annual_Ticket, - x='YEAR', - y=["TOTAL BOX OFFICE", "TOTAL BOX OFFICE WITHOUT COVID"], - labels={ - 'YEAR': "Years", - "value": "Total Sale" - }, - title='TOTAL BOX OFFICE vs TOTAL BOX OFFICE WITHOUT COVID') +px.line( + Data_Annual_Ticket, + x="YEAR", + y=["TOTAL BOX OFFICE", "TOTAL BOX OFFICE WITHOUT COVID"], + labels={"YEAR": "Years", "value": "Total Sale"}, + title="TOTAL BOX OFFICE vs TOTAL BOX OFFICE WITHOUT COVID", +) # %% [markdown] # ## **Calculate that how much does covid-19 affect on last two years** # %% -px.bar(Data_Annual_Ticket, - x='YEAR', - y="Diff", - labels={ - 'YEAR': "Year", - "Diff": "Financial Loss" - }, - title='Financial Loss (just last two years are important)', - barmode='group') +px.bar( + Data_Annual_Ticket, + x="YEAR", + y="Diff", + labels={"YEAR": "Year", "Diff": "Financial Loss"}, + title="Financial Loss (just last two years are important)", + barmode="group", +) # %% [markdown] # @@ -149,18 +157,24 @@ # %% Data_Annual_Ticket["Percentage of Financial Loss"] = ( - Data_Annual_Ticket["TOTAL BOX OFFICE WITHOUT COVID"] - - Data_Annual_Ticket["TOTAL BOX OFFICE"] -) / Data_Annual_Ticket["TOTAL BOX OFFICE WITHOUT COVID"] * 100 - -px.bar(Data_Annual_Ticket, - x='YEAR', - y="Percentage of Financial Loss", - labels={ - 'YEAR': "Year", - "Percentage of Financial Loss": "Percentage of Financial Loss %" - }, - title='Financial Loss % (just last two years are important) ') + ( + Data_Annual_Ticket["TOTAL BOX OFFICE WITHOUT COVID"] + - Data_Annual_Ticket["TOTAL BOX OFFICE"] + ) + / Data_Annual_Ticket["TOTAL BOX OFFICE WITHOUT COVID"] + * 100 +) + +px.bar( + Data_Annual_Ticket, + x="YEAR", + y="Percentage of Financial Loss", + labels={ + "YEAR": "Year", + "Percentage of Financial Loss": "Percentage of Financial Loss %", + }, + title="Financial Loss % (just last two years are important) ", +) # %% [markdown] # ## **Now Visualizing the Highest Grossers** @@ -170,16 +184,18 @@ # %% HighestGrossers["TOTAL IN 2019 DOLLARS"] = HighestGrossers[ - "TOTAL IN 2019 DOLLARS"].str.replace(',', '') + "TOTAL IN 2019 DOLLARS" +].str.replace(",", "") HighestGrossers["TOTAL IN 2019 DOLLARS"] = HighestGrossers[ - "TOTAL IN 2019 DOLLARS"].str.replace('$', '') + "TOTAL IN 2019 DOLLARS" +].str.replace("$", "") -HighestGrossers["TICKETS SOLD"] = HighestGrossers["TICKETS SOLD"].str.replace( - ',', '') +HighestGrossers["TICKETS SOLD"] = HighestGrossers["TICKETS SOLD"].str.replace(",", "") -HighestGrossers['TOTAL IN 2019 DOLLARS'] = HighestGrossers[ - 'TOTAL IN 2019 DOLLARS'].astype(float) -HighestGrossers['TICKETS SOLD'] = HighestGrossers['TICKETS SOLD'].astype(float) +HighestGrossers["TOTAL IN 2019 
DOLLARS"] = HighestGrossers[ + "TOTAL IN 2019 DOLLARS" +].astype(float) +HighestGrossers["TICKETS SOLD"] = HighestGrossers["TICKETS SOLD"].astype(float) # %% HighestGrossers.head(5) @@ -191,50 +207,66 @@ # ## **We use pie chart to illustrate the percentage of different thing** # %% -px.pie(HighestGrossers, - values='TOTAL IN 2019 DOLLARS', - names='DISTRIBUTOR', - title='Percentage of Each Distributors in Total Ticket Sale', - color_discrete_sequence=px.colors.sequential.RdBu, - height=600) +px.pie( + HighestGrossers, + values="TOTAL IN 2019 DOLLARS", + names="DISTRIBUTOR", + title="Percentage of Each Distributors in Total Ticket Sale", + color_discrete_sequence=px.colors.sequential.RdBu, + height=600, +) # %% -px.pie(HighestGrossers, - values='TOTAL IN 2019 DOLLARS', - names='MPAA RATING', - title='Percentage of Each MPAA Rating in Total Ticket Sale', - color_discrete_sequence=px.colors.sequential.RdBu, - height=600) +px.pie( + HighestGrossers, + values="TOTAL IN 2019 DOLLARS", + names="MPAA RATING", + title="Percentage of Each MPAA Rating in Total Ticket Sale", + color_discrete_sequence=px.colors.sequential.RdBu, + height=600, +) # %% [markdown] # ## **using bar chart to state the sum of total ticket sale each distributor and each genre** # %% -df_g = HighestGrossers.groupby( - by=['DISTRIBUTOR', 'GENRE'])['TICKETS SOLD'].sum().unstack().iplot( - kind='bar') +df_g = ( + HighestGrossers.groupby(by=["DISTRIBUTOR", "GENRE"])["TICKETS SOLD"] + .sum() + .unstack() + .iplot(kind="bar") +) # %% [markdown] # ## **using bar chart to state the count of total ticket sale each distributor and each genre** # %% -df_g = HighestGrossers.groupby( - by=['DISTRIBUTOR', 'GENRE'])['TICKETS SOLD'].count().unstack().iplot( - kind='bar') +df_g = ( + HighestGrossers.groupby(by=["DISTRIBUTOR", "GENRE"])["TICKETS SOLD"] + .count() + .unstack() + .iplot(kind="bar") +) # %% [markdown] # ## **doing the same thing to the MPAA rating** # # %% -df_g = HighestGrossers.groupby( - by=['DISTRIBUTOR', 'MPAA RATING'])['TICKETS SOLD'].sum().unstack().iplot( - kind='bar') +df_g = ( + HighestGrossers.groupby(by=["DISTRIBUTOR", "MPAA RATING"])["TICKETS SOLD"] + .sum() + .unstack() + .iplot(kind="bar") +) # %% -df_g = HighestGrossers.groupby( - by=['DISTRIBUTOR', 'MPAA RATING'])['TICKETS SOLD'].count().unstack().iplot( - kind='bar') +df_g = ( + HighestGrossers.groupby(by=["DISTRIBUTOR", "MPAA RATING"])["TICKETS SOLD"] + .count() + .unstack() + .iplot(kind="bar") +) # %% [markdown] # ## **now visualising the Popular Creative Types** @@ -243,74 +275,91 @@ PopularCreativeTypes.head(5) # %% -PopularCreativeTypes["TOTAL GROSS"] = PopularCreativeTypes[ - "TOTAL GROSS"].str.replace(',', '') -PopularCreativeTypes["TOTAL GROSS"] = PopularCreativeTypes[ - "TOTAL GROSS"].str.replace('$', '') +PopularCreativeTypes["TOTAL GROSS"] = PopularCreativeTypes["TOTAL GROSS"].str.replace( + ",", "" +) +PopularCreativeTypes["TOTAL GROSS"] = PopularCreativeTypes["TOTAL GROSS"].str.replace( + "$", "" +) PopularCreativeTypes["AVERAGE GROSS"] = PopularCreativeTypes[ - "AVERAGE GROSS"].str.replace(',', '') + "AVERAGE GROSS" +].str.replace(",", "") PopularCreativeTypes["AVERAGE GROSS"] = PopularCreativeTypes[ - "AVERAGE GROSS"].str.replace('$', '') + "AVERAGE GROSS" +].str.replace("$", "") -PopularCreativeTypes["MARKET SHARE"] = PopularCreativeTypes[ - "MARKET SHARE"].str.replace('%', '') +PopularCreativeTypes["MARKET SHARE"] = PopularCreativeTypes["MARKET SHARE"].str.replace( + "%", "" +) -PopularCreativeTypes["MOVIES"] = 
PopularCreativeTypes["MOVIES"].str.replace(
-    ',', '')
+PopularCreativeTypes["MOVIES"] = PopularCreativeTypes["MOVIES"].str.replace(",", "")
 
 # %%
 PopularCreativeTypes = PopularCreativeTypes.drop(index=9, axis=0)
 
 # %%
 PopularCreativeTypes["MOVIES"] = PopularCreativeTypes["MOVIES"].astype(float)
-PopularCreativeTypes["TOTAL GROSS"] = PopularCreativeTypes[
-    "TOTAL GROSS"].astype(float)
-PopularCreativeTypes["AVERAGE GROSS"] = PopularCreativeTypes[
-    "AVERAGE GROSS"].astype(float)
-PopularCreativeTypes["MARKET SHARE"] = PopularCreativeTypes[
-    "MARKET SHARE"].astype(float)
+PopularCreativeTypes["TOTAL GROSS"] = PopularCreativeTypes["TOTAL GROSS"].astype(float)
+PopularCreativeTypes["AVERAGE GROSS"] = PopularCreativeTypes["AVERAGE GROSS"].astype(
+    float
+)
+PopularCreativeTypes["MARKET SHARE"] = PopularCreativeTypes["MARKET SHARE"].astype(
+    float
+)
 
 # %%
-px.pie(PopularCreativeTypes,
-       values='TOTAL GROSS',
-       names='CREATIVE TYPES',
-       title='Percentage of Creative Types in Total Gross',
-       color_discrete_sequence=px.colors.sequential.RdBu,
-       height=600)
+px.pie(
+    PopularCreativeTypes,
+    values="TOTAL GROSS",
+    names="CREATIVE TYPES",
+    title="Percentage of Creative Types in Total Gross",
+    color_discrete_sequence=px.colors.sequential.RdBu,
+    height=600,
+)
 
 # %%
-px.bar(PopularCreativeTypes,
-       x="TOTAL GROSS",
-       y="CREATIVE TYPES",
-       title="Total Gross of Different type")
+px.bar(
+    PopularCreativeTypes,
+    x="TOTAL GROSS",
+    y="CREATIVE TYPES",
+    title="Total Gross of Different Types",
+)
 
 # %%
-px.pie(PopularCreativeTypes,
-       values='AVERAGE GROSS',
-       names='CREATIVE TYPES',
-       title='Percentage of Creative Types in Average Gross',
-       color_discrete_sequence=px.colors.sequential.RdBu,
-       height=600)
+px.pie(
+    PopularCreativeTypes,
+    values="AVERAGE GROSS",
+    names="CREATIVE TYPES",
+    title="Percentage of Creative Types in Average Gross",
+    color_discrete_sequence=px.colors.sequential.RdBu,
+    height=600,
+)
 
 # %%
-px.bar(PopularCreativeTypes,
-       x="AVERAGE GROSS",
-       y="CREATIVE TYPES",
-       title="Average Gross in Different type")
+px.bar(
+    PopularCreativeTypes,
+    x="AVERAGE GROSS",
+    y="CREATIVE TYPES",
+    title="Average Gross in Different Types",
+)
 
 # %%
-px.pie(PopularCreativeTypes,
-       values='MOVIES',
-       names='CREATIVE TYPES',
-       title='Percentage of Number of Muvies in Each Types',
-       color_discrete_sequence=px.colors.sequential.RdBu,
-       height=600)
+px.pie(
+    PopularCreativeTypes,
+    values="MOVIES",
+    names="CREATIVE TYPES",
+    title="Percentage of Number of Movies in Each Type",
+    color_discrete_sequence=px.colors.sequential.RdBu,
+    height=600,
+)
 
 # %%
-px.bar(PopularCreativeTypes,
-       x="MOVIES",
-       y="CREATIVE TYPES",
-       title="Number of Muvies in Different type")
+px.bar(
+    PopularCreativeTypes,
+    x="MOVIES",
+    y="CREATIVE TYPES",
+    title="Number of Movies in Different Types",
+)
 
 # %%
diff --git a/_kaggle/_render/named-entity-recognition-ner-with-tensorflow/nb.py b/_kaggle/_render/named-entity-recognition-ner-with-tensorflow/nb.py
index cb660e8..6353c5d 100644
--- a/_kaggle/_render/named-entity-recognition-ner-with-tensorflow/nb.py
+++ b/_kaggle/_render/named-entity-recognition-ner-with-tensorflow/nb.py
@@ -37,9 +37,9 @@
 # %%
 data_path = "../input/entity-annotated-corpus/ner_dataset.csv"
-data = pd.read_csv(data_path, encoding='unicode_escape')
+data = pd.read_csv(data_path, encoding="unicode_escape")
 
 # filling the first column that determines which sentence each word belongs to.
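# %% [markdown]
# A tiny illustration (a sketch with made-up rows, not from the dataset) of what
# the forward fill below does: in ner_dataset.csv only the first word of each
# sentence carries a `Sentence #` label, and `ffill` propagates that label to the
# rows that follow it.

# %%
import pandas as pd

demo = pd.DataFrame(
    {
        "Sentence #": ["Sentence: 1", None, None, "Sentence: 2", None],
        "Word": ["Thousands", "of", "demonstrators", "Families", "of"],
    }
)
print(demo.fillna(method="ffill"))  # each None run is filled with the label above it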
-data.fillna(method='ffill', inplace=True) +data.fillna(method="ffill", inplace=True) data.head() # %% @@ -55,17 +55,18 @@ def join_a_sentence(sentence_number, data): """ Args.: - sentence_number: sentence number we want to join and return. - + sentence_number: sentence number we want to join and return. + Returns: The joined sentence. """ sentence_number = str(sentence_number) - the_sentence_words_list = list(data[ - data['Sentence #'] == 'Sentence: {}'.format(sentence_number)]['Word']) + the_sentence_words_list = list( + data[data["Sentence #"] == "Sentence: {}".format(sentence_number)]["Word"] + ) - return ' '.join(the_sentence_words_list) + return " ".join(the_sentence_words_list) # %% @@ -80,7 +81,7 @@ def join_a_sentence(sentence_number, data): # %% # Number of unique sentences -len(np.unique(data['Sentence #'])) +len(np.unique(data["Sentence #"])) # %% print("Number of unique words in the dataset: {}".format(data.Word.nunique())) @@ -93,14 +94,14 @@ def join_a_sentence(sentence_number, data): # %% def num_words_tags(tags, data): - """This functions takes the tags we want to count and the datafram + """This functions takes the tags we want to count and the datafram and return a dict where the key is the tag and the value is the frequency of that tag""" tags_count = {} for tag in tags: - len_tag = len(data[data['Tag'] == tag]) + len_tag = len(data[data["Tag"] == tag]) tags_count[tag] = len_tag return tags_count @@ -112,9 +113,9 @@ def num_words_tags(tags, data): # %% plt.figure(figsize=(10, 6)) -plt.hist(data.Tag, log=True, label='Tags', color='olive', bins=50) -plt.xlabel('Tags', fontsize=16) -plt.ylabel('Count', fontsize=16) +plt.hist(data.Tag, log=True, label="Tags", color="olive", bins=50) +plt.xlabel("Tags", fontsize=16) +plt.ylabel("Count", fontsize=16) plt.title("Tags Frequency", fontsize=20) plt.grid(alpha=0.3) plt.legend() @@ -159,8 +160,8 @@ def num_words_tags(tags, data): ready_data.head() # %% -X = list(ready_data['Sentence']) -Y = list(ready_data['Tag']) +X = list(ready_data["Sentence"]) +Y = list(ready_data["Tag"]) # %% from ast import literal_eval @@ -222,7 +223,7 @@ def num_words_tags(tags, data): # %% # pad the sequences so that all sequences are of the same size -X_preprocessed = pad_sequences(sequences, maxlen=maxlen, padding='post') +X_preprocessed = pad_sequences(sequences, maxlen=maxlen, padding="post") # %% # first example after tokenization and padding. 
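# %% [markdown]
# A self-contained sketch of the tokenize-then-pad step used above (illustrative
# only; the notebook's `sequences` and `maxlen` come from an earlier Tokenizer
# cell that this hunk does not show): `texts_to_sequences` maps each word to an
# integer id, and `pad_sequences(..., padding="post")` right-pads with zeros.

# %%
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

texts = ["the cat sat", "the cat sat on the mat"]
tok = Tokenizer(oov_token="<OOV>")
tok.fit_on_texts(texts)
seqs = tok.texts_to_sequences(texts)
print(pad_sequences(seqs, maxlen=6, padding="post"))  # shorter row gets trailing zeros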
@@ -286,7 +287,7 @@ def preprocess_tags(tags2id, Y_ready): num_O_to_add = maxlen - len_new_tag_list # add 'O's to padd the tag lists - padded_tags = Y_place_holder + ([tags2id['O']] * num_O_to_add) + padded_tags = Y_place_holder + ([tags2id["O"]] * num_O_to_add) Y_preprocessed.append(padded_tags) return Y_preprocessed @@ -345,29 +346,34 @@ def preprocess_tags(tags2id, Y_ready): Y_preprocessed = Y_preprocessed[indices] # %% -X_train = X_preprocessed[:int(0.7 * len(X_preprocessed))] +X_train = X_preprocessed[: int(0.7 * len(X_preprocessed))] print("Number of training examples: {}".format(len(X_train))) -X_val = X_preprocessed[int(0.7 * - len(X_preprocessed)):int(0.7 * - len(X_preprocessed)) + - (int(0.15 * len(X_preprocessed)) + 1)] +X_val = X_preprocessed[ + int(0.7 * len(X_preprocessed)) : int(0.7 * len(X_preprocessed)) + + (int(0.15 * len(X_preprocessed)) + 1) +] print("Number of validation examples: {}".format(len(X_val))) -X_test = X_preprocessed[int(0.7 * len(X_preprocessed)) + - (int(0.15 * len(X_preprocessed)) + 1):] +X_test = X_preprocessed[ + int(0.7 * len(X_preprocessed)) + (int(0.15 * len(X_preprocessed)) + 1) : +] print("Number of testing examples: {}".format(len(X_test))) -Y_train = Y_preprocessed[:int(0.7 * len(X_preprocessed))] -Y_val = Y_preprocessed[int(0.7 * - len(X_preprocessed)):int(0.7 * - len(X_preprocessed)) + - (int(0.15 * len(X_preprocessed)) + 1)] -Y_test = Y_preprocessed[int(0.7 * len(X_preprocessed)) + - (int(0.15 * len(X_preprocessed)) + 1):] - -print("Total number of examples after shuffling and splitting: {}".format( - len(X_train) + len(X_val) + len(X_test))) +Y_train = Y_preprocessed[: int(0.7 * len(X_preprocessed))] +Y_val = Y_preprocessed[ + int(0.7 * len(X_preprocessed)) : int(0.7 * len(X_preprocessed)) + + (int(0.15 * len(X_preprocessed)) + 1) +] +Y_test = Y_preprocessed[ + int(0.7 * len(X_preprocessed)) + (int(0.15 * len(X_preprocessed)) + 1) : +] + +print( + "Total number of examples after shuffling and splitting: {}".format( + len(X_train) + len(X_val) + len(X_test) + ) +) # %% [markdown] # ## 5- Model Training and Evaluation @@ -404,27 +410,28 @@ def preprocess_tags(tags2id, Y_ready): max_words = 36000 num_tags = len(tags) -model = tf.keras.models.Sequential([ - tf.keras.layers.Embedding(max_words, embedding_dim, input_length=maxlen), - tf.keras.layers.Bidirectional( - tf.keras.layers.LSTM(units=100, - activation='tanh', - return_sequences=True)), - tf.keras.layers.Bidirectional( - tf.keras.layers.LSTM(units=100, - activation='tanh', - return_sequences=True)), - tf.keras.layers.TimeDistributed( - tf.keras.layers.Dense(num_tags, activation='softmax')) -]) +model = tf.keras.models.Sequential( + [ + tf.keras.layers.Embedding(max_words, embedding_dim, input_length=maxlen), + tf.keras.layers.Bidirectional( + tf.keras.layers.LSTM(units=100, activation="tanh", return_sequences=True) + ), + tf.keras.layers.Bidirectional( + tf.keras.layers.LSTM(units=100, activation="tanh", return_sequences=True) + ), + tf.keras.layers.TimeDistributed( + tf.keras.layers.Dense(num_tags, activation="softmax") + ), + ] +) # %% model.summary() # %% -model.compile(loss='sparse_categorical_crossentropy', - optimizer='adam', - metrics=['accuracy']) +model.compile( + loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"] +) # %% history = model.fit(train_dataset, validation_data=val_dataset, epochs=15) @@ -433,31 +440,31 @@ def preprocess_tags(tags2id, Y_ready): model.evaluate(test_dataset) # %% -acc = history.history['accuracy'] -val_acc = 
history.history['val_accuracy'] +acc = history.history["accuracy"] +val_acc = history.history["val_accuracy"] -loss = history.history['loss'] -val_loss = history.history['val_loss'] +loss = history.history["loss"] +val_loss = history.history["val_loss"] epochs = range(1, len(acc) + 1) fig, ax = plt.subplots(1, 2, constrained_layout=True, figsize=(6, 4), dpi=80) -ax[0].plot(epochs, acc, label="Training Accuracy", color='darkblue') -ax[0].plot(epochs, val_acc, label="Validation Accuracy", color='darkgreen') +ax[0].plot(epochs, acc, label="Training Accuracy", color="darkblue") +ax[0].plot(epochs, val_acc, label="Validation Accuracy", color="darkgreen") ax[0].grid(alpha=0.3) -ax[0].title.set_text('Training Vs Validation Accuracy') -ax[0].fill_between(epochs, acc, val_acc, color='crimson', alpha=0.3) -plt.setp(ax[0], xlabel='Epochs') -plt.setp(ax[0], ylabel='Accuracy') +ax[0].title.set_text("Training Vs Validation Accuracy") +ax[0].fill_between(epochs, acc, val_acc, color="crimson", alpha=0.3) +plt.setp(ax[0], xlabel="Epochs") +plt.setp(ax[0], ylabel="Accuracy") -ax[1].plot(epochs, loss, label="Training Loss", color='darkblue') -ax[1].plot(epochs, val_loss, label="Validation Loss", color='darkgreen') +ax[1].plot(epochs, loss, label="Training Loss", color="darkblue") +ax[1].plot(epochs, val_loss, label="Validation Loss", color="darkgreen") ax[1].grid(alpha=0.3) -ax[1].title.set_text('Training Vs Validation Loss') -ax[1].fill_between(epochs, loss, val_loss, color='crimson', alpha=0.3) -plt.setp(ax[1], xlabel='Epochs') -plt.setp(ax[1], ylabel='Loss') +ax[1].title.set_text("Training Vs Validation Loss") +ax[1].fill_between(epochs, loss, val_loss, color="crimson", alpha=0.3) +plt.setp(ax[1], xlabel="Epochs") +plt.setp(ax[1], ylabel="Loss") plt.show() @@ -465,7 +472,7 @@ def preprocess_tags(tags2id, Y_ready): # %% def make_prediction(model, preprocessed_sentence, id2word, id2tag): - #if preprocessed_sentence.shape() != (1, 110): + # if preprocessed_sentence.shape() != (1, 110): preprocessed_sentence = preprocessed_sentence.reshape((1, 110)) # return preprocessed sentence to its orginal form @@ -473,7 +480,7 @@ def make_prediction(model, preprocessed_sentence, id2word, id2tag): word_list = [] for word in list(sentence): word_list.append(id2word[word]) - orginal_sententce = ' '.join(word_list) + orginal_sententce = " ".join(word_list) len_orginal_sententce = len(word_list) @@ -493,10 +500,8 @@ def make_prediction(model, preprocessed_sentence, id2word, id2tag): # %% orginal_sententce, pred_tag_list = make_prediction( - model=model, - preprocessed_sentence=X_test[520], - id2word=id2word, - id2tag=id2tag) + model=model, preprocessed_sentence=X_test[520], id2word=id2word, id2tag=id2tag +) # %% print(orginal_sententce) diff --git a/_kaggle/_render/netflix-subscription-fee-in-different-countries/nb.py b/_kaggle/_render/netflix-subscription-fee-in-different-countries/nb.py index 19ee30e..1a02bbf 100644 --- a/_kaggle/_render/netflix-subscription-fee-in-different-countries/nb.py +++ b/_kaggle/_render/netflix-subscription-fee-in-different-countries/nb.py @@ -16,7 +16,7 @@ # # Which Countries Pay The Most and Least for Netflix in 2021? 
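# %% [markdown]
# Once the fee columns are renamed and cast to float in the cells below, the
# question in the title reduces to two one-liners (a sketch; it assumes the
# renamed `Basic_Cost_Per_Month` column defined later in this notebook):

# %%
# run after the cleaning cells below, once df carries the renamed columns
print(df.nlargest(5, "Basic_Cost_Per_Month")[["Country", "Basic_Cost_Per_Month"]])
print(df.nsmallest(5, "Basic_Cost_Per_Month")[["Country", "Basic_Cost_Per_Month"]])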

# %%
-#import library
+# import library
import numpy as np
import pandas as pd
import seaborn as sns
@@ -27,251 +27,253 @@
# # Data Extraction

# %%
-#read dataset
+# read dataset
df = pd.read_csv(
-    '../input/netflix-subscription-price-in-different-countries/Netflix subscription fee Dec-2021.csv'
+    "../input/netflix-subscription-price-in-different-countries/Netflix subscription fee Dec-2021.csv"
)

# %%
-#show head of dataset
+# show head of dataset
df.head()

# %% [markdown]
# # Exploratory Data Analysis (EDA)

# %%
-#check rows and columns of dataset
+# check rows and columns of dataset
df.shape

# %%
-#check all columns
+# check all columns
df.columns

# %%
-#rename attribute columns of dataset
+# rename attribute columns of dataset
df = df.rename(
    columns={
-        'Country_code': 'Country_Code',
-        'Total Library Size': 'Library_Size',
-        'No. of TV Shows': 'No_TV_Shows',
-        'No. of Movies': 'No_Movies',
-        'Cost Per Month - Basic ($)': 'Basic_Cost_Per_Month',
-        'Cost Per Month - Standard ($)': 'Standard_Cost_Per_Month',
-        'Cost Per Month - Premium ($)': 'Premium_Cost_Per_Month'
-    })
+        "Country_code": "Country_Code",
+        "Total Library Size": "Library_Size",
+        "No. of TV Shows": "No_TV_Shows",
+        "No. of Movies": "No_Movies",
+        "Cost Per Month - Basic ($)": "Basic_Cost_Per_Month",
+        "Cost Per Month - Standard ($)": "Standard_Cost_Per_Month",
+        "Cost Per Month - Premium ($)": "Premium_Cost_Per_Month",
+    }
+)
df.head()

# %%
-#check type of dataset
+# check type of dataset
df.dtypes

# %%
-#check missing value of dataset
+# check missing value of dataset
df.isnull().sum()

# %%
-#describe all columns
-df.describe(include='object')
+# describe all columns
+df.describe(include="object")

# %%
-#check correlation of each variable
+# check correlation of each variable
df.corr()

# %% [markdown]
# ## heatmap

# %%
-#visualize correlation of each variable using pearson correlation
-sns.heatmap(df.corr(), vmax=0.9, linewidths=0.9, cmap='YlGnBu')
-plt.title('Pearson Correlation', fontsize=15, pad=12)
+# visualize correlation of each variable using pearson correlation
+sns.heatmap(df.corr(), vmax=0.9, linewidths=0.9, cmap="YlGnBu")
+plt.title("Pearson Correlation", fontsize=15, pad=12)
plt.show()

# %%
-#check unique of country code column
-df['Country_Code'].unique()
+# check unique of country code column
+df["Country_Code"].unique()

# %%
-#check number of country code
-df['Country_Code'].value_counts()
+# check number of country code
+df["Country_Code"].value_counts()

# %%
-#check unique of country column
-df['Country'].unique()
+# check unique of country column
+df["Country"].unique()

# %%
-#check number of country
-df['Country'].value_counts()
+# check number of country
+df["Country"].value_counts()

# %% [markdown]
# ## outliers

# %%
-#visualize the outlier of each variable
+# visualize the outlier of each variable
chart = df.boxplot()
chart.set_xticklabels(chart.get_xticklabels(), rotation=45)
-plt.ylabel('Count', fontsize=12)
+plt.ylabel("Count", fontsize=12)
plt.show()

-print('Maximum of library size :', df['Library_Size'].max())
-print('Minimum of library size :', df['Library_Size'].min())
-print('Median of library size :', df['Library_Size'].median())
-print('Average of library size :', df['Library_Size'].mean())
-print('Total of library size :', df['Library_Size'].sum())
-print('\n')
-print('Maximum of number TV shows :', df['No_TV_Shows'].max())
-print('Minimum of number TV shows :', df['No_TV_Shows'].min())
-print('Median of number TV shows :', df['No_TV_Shows'].median())
-print('Average of number TV shows :', df['No_TV_Shows'].mean())
-print('Total of number TV shows :', df['No_TV_Shows'].sum())
-print('\n')
-print('Maximum of number movies :', df['No_Movies'].max())
-print('Minimum of number movies :', df['No_Movies'].min())
-print('Median of number movies :', df['No_Movies'].median())
-print('Average of number movies :', df['No_Movies'].mean())
-print('Total of number movies :', df['No_Movies'].sum())
-print('\n')
-print('Maximum of basic cost per month :', df['Basic_Cost_Per_Month'].max())
-print('Minimum of basic cost per month :', df['Basic_Cost_Per_Month'].min())
-print('Median of basic cost per month :', df['Basic_Cost_Per_Month'].median())
-print('Average of basic cost per month :', df['Basic_Cost_Per_Month'].mean())
-print('Total of basic cost per month :', df['Basic_Cost_Per_Month'].sum())
-print('\n')
-print('Maximum of standard cost per month :',
-      df['Standard_Cost_Per_Month'].max())
-print('Minimum of standard cost per month :',
-      df['Standard_Cost_Per_Month'].min())
-print('Median of standard cost per month :',
-      df['Standard_Cost_Per_Month'].median())
-print('Average of standard cost per month :',
-      df['Standard_Cost_Per_Month'].mean())
-print('Total of standard cost per month :',
-      df['Standard_Cost_Per_Month'].sum())
-print('\n')
-print('Maximum of premium cost per month :',
-      df['Premium_Cost_Per_Month'].max())
-print('Minimum of premium cost per month :',
-      df['Premium_Cost_Per_Month'].min())
-print('Median of premium cost per month :',
-      df['Premium_Cost_Per_Month'].median())
-print('Average of premium cost per month :',
-      df['Premium_Cost_Per_Month'].mean())
-print('Total of premium cost per month :', df['Premium_Cost_Per_Month'].sum())
+print("Maximum of library size :", df["Library_Size"].max())
+print("Minimum of library size :", df["Library_Size"].min())
+print("Median of library size :", df["Library_Size"].median())
+print("Average of library size :", df["Library_Size"].mean())
+print("Total of library size :", df["Library_Size"].sum())
+print("\n")
+print("Maximum of number TV shows :", df["No_TV_Shows"].max())
+print("Minimum of number TV shows :", df["No_TV_Shows"].min())
+print("Median of number TV shows :", df["No_TV_Shows"].median())
+print("Average of number TV shows :", df["No_TV_Shows"].mean())
+print("Total of number TV shows :", df["No_TV_Shows"].sum())
+print("\n")
+print("Maximum of number movies :", df["No_Movies"].max())
+print("Minimum of number movies :", df["No_Movies"].min())
+print("Median of number movies :", df["No_Movies"].median())
+print("Average of number movies :", df["No_Movies"].mean())
+print("Total of number movies :", df["No_Movies"].sum())
+print("\n")
+print("Maximum of basic cost per month :", df["Basic_Cost_Per_Month"].max())
+print("Minimum of basic cost per month :", df["Basic_Cost_Per_Month"].min())
+print("Median of basic cost per month :", df["Basic_Cost_Per_Month"].median())
+print("Average of basic cost per month :", df["Basic_Cost_Per_Month"].mean())
+print("Total of basic cost per month :", df["Basic_Cost_Per_Month"].sum())
+print("\n")
+print("Maximum of standard cost per month :", df["Standard_Cost_Per_Month"].max())
+print("Minimum of standard cost per month :", df["Standard_Cost_Per_Month"].min())
+print("Median of standard cost per month :", df["Standard_Cost_Per_Month"].median())
+print("Average of standard cost per month :", df["Standard_Cost_Per_Month"].mean())
+print("Total of standard cost per month :", df["Standard_Cost_Per_Month"].sum())
+print("\n")
+print("Maximum of premium cost per month :", df["Premium_Cost_Per_Month"].max())
+print("Minimum of premium cost per month :", df["Premium_Cost_Per_Month"].min())
+print("Median of premium cost per month :", df["Premium_Cost_Per_Month"].median())
+print("Average of premium cost per month :", df["Premium_Cost_Per_Month"].mean())
+print("Total of premium cost per month :", df["Premium_Cost_Per_Month"].sum())

# %% [markdown]
# ## analyze

# %%
-#analyze of library size under 5195 based on country and country code
-df[df['Library_Size'] < 5195.0][['Country_Code', 'Country', 'Library_Size']]
+# analyze of library size under 5195 based on country and country code
+df[df["Library_Size"] < 5195.0][["Country_Code", "Country", "Library_Size"]]

# %%
-#analyze of library size over 5195 based on country and country code
-df[df['Library_Size'] > 5195.0][['Country_Code', 'Country', 'Library_Size']]
+# analyze of library size over 5195 based on country and country code
+df[df["Library_Size"] > 5195.0][["Country_Code", "Country", "Library_Size"]]

# %%
-#analyze of number TV shows under 3512 based on country and country code
-df[df['No_TV_Shows'] < 3512.0][['Country_Code', 'Country', 'No_TV_Shows']]
+# analyze of number TV shows under 3512 based on country and country code
+df[df["No_TV_Shows"] < 3512.0][["Country_Code", "Country", "No_TV_Shows"]]

# %%
-#analyze of number TV shows over 3512 based on country and country code
-df[df['No_TV_Shows'] > 3512.0][['Country_Code', 'Country', 'No_TV_Shows']]
+# analyze of number TV shows over 3512 based on country and country code
+df[df["No_TV_Shows"] > 3512.0][["Country_Code", "Country", "No_TV_Shows"]]

# %%
-#analyze of number movies under 1841 based on country and country code
-df[df['No_Movies'] < 1841.0][['Country_Code', 'Country', 'No_Movies']]
+# analyze of number movies under 1841 based on country and country code
+df[df["No_Movies"] < 1841.0][["Country_Code", "Country", "No_Movies"]]

# %%
-#analyze of number movies over 1841 based on country and country code
-df[df['No_Movies'] > 1841.0][['Country_Code', 'Country', 'No_Movies']]
+# analyze of number movies over 1841 based on country and country code
+df[df["No_Movies"] > 1841.0][["Country_Code", "Country", "No_Movies"]]

# %%
-#analyze of basic cost per month under 8.99 based on country and country code
-df[df['Basic_Cost_Per_Month'] < 8.99][[
-    'Country_Code', 'Country', 'Basic_Cost_Per_Month'
-]]
+# analyze of basic cost per month under 8.99 based on country and country code
+df[df["Basic_Cost_Per_Month"] < 8.99][
+    ["Country_Code", "Country", "Basic_Cost_Per_Month"]
+]

# %%
-#analyze of basic cost per month over 8.99 based on country and country code
-df[df['Basic_Cost_Per_Month'] > 8.99][[
-    'Country_Code', 'Country', 'Basic_Cost_Per_Month'
-]]
+# analyze of basic cost per month over 8.99 based on country and country code
+df[df["Basic_Cost_Per_Month"] > 8.99][
+    ["Country_Code", "Country", "Basic_Cost_Per_Month"]
+]

# %%
-#analyze of standard cost per month under 11.49 based on country and country code
-df[df['Standard_Cost_Per_Month'] < 11.49][[
-    'Country_Code', 'Country', 'Standard_Cost_Per_Month'
-]]
+# analyze of standard cost per month under 11.49 based on country and country code
+df[df["Standard_Cost_Per_Month"] < 11.49][
+    ["Country_Code", "Country", "Standard_Cost_Per_Month"]
+]

# %%
-#analyze of standard cost per month over 11.49 based on country and country code
-df[df['Standard_Cost_Per_Month'] > 11.49][[
-    'Country_Code', 'Country', 'Standard_Cost_Per_Month'
-]]
+# analyze of standard cost per month over 11.49 based on country and country code
+df[df["Standard_Cost_Per_Month"] > 11.49][
+    ["Country_Code", "Country", "Standard_Cost_Per_Month"]
+]

# %%
-#analyze of premium cost per month under 14.45 based on country and country code
-df[df['Premium_Cost_Per_Month'] < 14.45][[
-    'Country_Code', 'Country', 'Premium_Cost_Per_Month'
-]]
+# analyze of premium cost per month under 14.45 based on country and country code
+df[df["Premium_Cost_Per_Month"] < 14.45][
+    ["Country_Code", "Country", "Premium_Cost_Per_Month"]
+]

# %%
-#analyze of premium cost per month over 14.45 based on country and country code
-df[df['Premium_Cost_Per_Month'] > 14.45][[
-    'Country_Code', 'Country', 'Premium_Cost_Per_Month'
-]]
+# analyze of premium cost per month over 14.45 based on country and country code
+df[df["Premium_Cost_Per_Month"] > 14.45][
+    ["Country_Code", "Country", "Premium_Cost_Per_Month"]
+]

# %% [markdown]
# ## top 20 countries

# %%
-#visualize top 20 of country based on total basic cost per month
+# visualize top 20 of country based on total basic cost per month
plt.figure(figsize=(10, 5))
-top_20_country = df['Country'][:20]
-chart = df.groupby(top_20_country)['Basic_Cost_Per_Month'].sum().sort_values(
-    ascending=False).plot(kind='bar', color='maroon')
+top_20_country = df["Country"][:20]
+chart = (
+    df.groupby(top_20_country)["Basic_Cost_Per_Month"]
+    .sum()
+    .sort_values(ascending=False)
+    .plot(kind="bar", color="maroon")
+)
chart.set_xticklabels(chart.get_xticklabels(), rotation=45)
-plt.title('Top 20 of Country based on Total Basic Cost Per Month',
-          fontsize=15,
-          pad=12)
-plt.xlabel('Country', fontsize=12)
-plt.ylabel('Total Basic Cost Per Month ($)', fontsize=12)
+plt.title("Top 20 of Country based on Total Basic Cost Per Month", fontsize=15, pad=12)
+plt.xlabel("Country", fontsize=12)
+plt.ylabel("Total Basic Cost Per Month ($)", fontsize=12)
plt.show()

# %% [markdown]
# ## top 20 standard cost

# %%
-#visualize top 20 of country based on total standard cost per month
+# visualize top 20 of country based on total standard cost per month
plt.figure(figsize=(10, 5))
-chart = df.groupby(top_20_country)['Standard_Cost_Per_Month'].sum(
-).sort_values(ascending=False).plot(kind='bar', color='lightseagreen')
+chart = (
+    df.groupby(top_20_country)["Standard_Cost_Per_Month"]
+    .sum()
+    .sort_values(ascending=False)
+    .plot(kind="bar", color="lightseagreen")
+)
chart.set_xticklabels(chart.get_xticklabels(), rotation=45)
-plt.title('Top 20 of Country based on Total Standard Cost Per Month',
-          fontsize=15,
-          pad=12)
-plt.xlabel('Country', fontsize=12)
-plt.ylabel('Total Standard Cost Per Month ($)', fontsize=12)
+plt.title(
+    "Top 20 of Country based on Total Standard Cost Per Month", fontsize=15, pad=12
+)
+plt.xlabel("Country", fontsize=12)
+plt.ylabel("Total Standard Cost Per Month ($)", fontsize=12)
plt.show()

# %% [markdown]
# ## top 20 premium cost

# %%
-#visualize top 20 of country based on total premium cost per month
+# visualize top 20 of country based on total premium cost per month
plt.figure(figsize=(10, 5))
-chart = df.groupby(top_20_country)['Premium_Cost_Per_Month'].sum().sort_values(
-    ascending=False).plot(kind='bar', color='peru')
+chart = (
+    df.groupby(top_20_country)["Premium_Cost_Per_Month"]
+    .sum()
+    .sort_values(ascending=False)
+    .plot(kind="bar", color="peru")
+)
chart.set_xticklabels(chart.get_xticklabels(), rotation=45)
-plt.title('Top 20 of Country based on Total Premium Cost Per Month',
-          fontsize=15,
-          pad=12)
-plt.xlabel('Country', fontsize=12)
-plt.ylabel('Total Premium Cost Per Month ($)', fontsize=12)
+plt.title(
+    "Top 20 of Country based on Total Premium Cost Per Month", fontsize=15, pad=12
+)
+plt.xlabel("Country", fontsize=12)
+plt.ylabel("Total Premium Cost Per Month ($)", fontsize=12)
plt.show()

# %% [markdown]
# ## report

# %%
-#profile report of dataset
+# profile report of dataset
ProfileReport(df)
diff --git a/_kaggle/_render/pneumonia-classification-with-resnet50/nb.py b/_kaggle/_render/pneumonia-classification-with-resnet50/nb.py
index a0535c6..022ab76 100644
--- a/_kaggle/_render/pneumonia-classification-with-resnet50/nb.py
+++ b/_kaggle/_render/pneumonia-classification-with-resnet50/nb.py
@@ -18,17 +18,35 @@
import numpy as np
from tensorflow.keras import layers, optimizers
from tensorflow.keras.applications.resnet50 import ResNet50
-from tensorflow.keras.layers import Input, Add, Dense, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D, AveragePooling2D, MaxPooling2D, Dropout
+from tensorflow.keras.layers import (
+    Input,
+    Add,
+    Dense,
+    Activation,
+    ZeroPadding2D,
+    BatchNormalization,
+    Flatten,
+    Conv2D,
+    AveragePooling2D,
+    MaxPooling2D,
+    Dropout,
+)
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.image import ImageDataGenerator
-from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint, LearningRateScheduler
+from tensorflow.keras.callbacks import (
+    ReduceLROnPlateau,
+    EarlyStopping,
+    ModelCheckpoint,
+    LearningRateScheduler,
+)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os
-for dirname, _, filenames in os.walk('/kaggle/input'):
+
+for dirname, _, filenames in os.walk("/kaggle/input"):
    for filename in filenames:
        print(os.path.join(dirname, filename))

@@ -37,23 +55,25 @@

# %%
# Save the training data directory in a variable
-XRay_Directory = '/kaggle/input/pneumonia-clasification/Dataset/Dataset/'
+XRay_Directory = "/kaggle/input/pneumonia-clasification/Dataset/Dataset/"

# Check the folders in the directory
os.listdir(XRay_Directory)

# %%
# Create tensor images, normalize them and create a validation set
-image_generator = ImageDataGenerator(rescale=1. / 255, validation_split=0.2)
+image_generator = ImageDataGenerator(rescale=1.0 / 255, validation_split=0.2)

# %%
# Create the training generator
-train_generator = image_generator.flow_from_directory(batch_size=40,
-                                                      directory=XRay_Directory,
-                                                      shuffle=True,
-                                                      target_size=(256, 256),
-                                                      class_mode='categorical',
-                                                      subset="training")
+train_generator = image_generator.flow_from_directory(
+    batch_size=40,
+    directory=XRay_Directory,
+    shuffle=True,
+    target_size=(256, 256),
+    class_mode="categorical",
+    subset="training",
+)

# %%
# Create the validation generator
@@ -62,8 +82,9 @@
    directory=XRay_Directory,
    shuffle=True,
    target_size=(256, 256),
-    class_mode='categorical',
-    subset="validation")
+    class_mode="categorical",
+    subset="validation",
+)

# %% [markdown]
# ## generate

@@ -76,10 +97,10 @@

# %%
# Dictionary with the categories
label_names = {
-    0: 'Covid-19',
-    1: 'Normal',
-    2: 'Viral Pneumonia',
-    3: 'Bacterial Pneumonia'
+    0: "Covid-19",
+    1: "Normal",
+    2: "Viral Pneumonia",
+    3: "Bacterial Pneumonia",
}

# %%
@@ -93,7 +114,7 @@
for i in np.arange(0, L * W):
    axes[i].imshow(train_images[i])
    axes[i].set_title(label_names[np.argmax(train_labels[i])])
-    axes[i].axis('off')
+    axes[i].axis("off")

plt.subplots_adjust(wspace=0.3)

@@ -102,9 +123,9 @@

# %%
# Using the ResNet50 imagenet weights for transfer learning
-basemodel = ResNet50(weights='imagenet',
-                     include_top=False,
-                     input_tensor=Input(shape=(256, 256, 3)))
+basemodel = ResNet50(
+    weights="imagenet", include_top=False, input_tensor=Input(shape=(256, 256, 3))
+)
basemodel.summary()

# %%
@@ -117,105 +138,109 @@
# Create the last layers for our model
headmodel = basemodel.output
headmodel = AveragePooling2D(pool_size=(4, 4))(headmodel)
-headmodel = Flatten(name='flatten')(headmodel)
+headmodel = Flatten(name="flatten")(headmodel)
headmodel = Dense(256, activation="relu")(headmodel)
headmodel = Dropout(0.3)(headmodel)
headmodel = Dense(128, activation="relu")(headmodel)
headmodel = Dropout(0.2)(headmodel)
-headmodel = Dense(4, activation='softmax')(headmodel)
+headmodel = Dense(4, activation="softmax")(headmodel)

model = Model(inputs=basemodel.input, outputs=headmodel)

# %%
-model.compile(loss='categorical_crossentropy',
-              optimizer=optimizers.RMSprop(learning_rate=0.0001,
-                                           decay=0.000001),
-              metrics=["accuracy"])
+model.compile(
+    loss="categorical_crossentropy",
+    optimizer=optimizers.RMSprop(learning_rate=0.0001, decay=0.000001),
+    metrics=["accuracy"],
+)

# %%
# Using earlystopping to avoid overfitting
-earlystopping = EarlyStopping(monitor='val_loss',
-                              mode='min',
-                              verbose=1,
-                              patience=20)
+earlystopping = EarlyStopping(monitor="val_loss", mode="min", verbose=1, patience=20)

# %%
# Saving the best model weights
-checkpoint = ModelCheckpoint(filepath="weights.hdf5",
-                             verbose=1,
-                             save_best_only=True)
+checkpoint = ModelCheckpoint(filepath="weights.hdf5", verbose=1, save_best_only=True)

# %%
-train_generator = image_generator.flow_from_directory(batch_size=4,
-                                                      directory=XRay_Directory,
-                                                      shuffle=True,
-                                                      target_size=(256, 256),
-                                                      class_mode='categorical',
-                                                      subset="training")
-val_generator = image_generator.flow_from_directory(batch_size=4,
-                                                    directory=XRay_Directory,
-                                                    shuffle=True,
-                                                    target_size=(256, 256),
-                                                    class_mode='categorical',
-                                                    subset="validation")
+train_generator = image_generator.flow_from_directory(
+    batch_size=4,
+    directory=XRay_Directory,
+    shuffle=True,
+    target_size=(256, 256),
+    class_mode="categorical",
+    subset="training",
+)
+val_generator = image_generator.flow_from_directory(
+    batch_size=4,
+    directory=XRay_Directory,
+    shuffle=True,
+    target_size=(256, 256),
+    class_mode="categorical",
+    subset="validation",
+)

# %%
-history = model.fit_generator(train_generator,
-                              steps_per_epoch=train_generator.n // 4,
-                              epochs=25,
-                              validation_data=val_generator,
-                              validation_steps=val_generator.n // 4,
-                              callbacks=[checkpoint, earlystopping])
+history = model.fit_generator(
+    train_generator,
+    steps_per_epoch=train_generator.n // 4,
+    epochs=25,
+    validation_data=val_generator,
+    validation_steps=val_generator.n // 4,
+    callbacks=[checkpoint, earlystopping],
+)

# %% [markdown]
# ## plotting

# %%
-plt.plot(history.history['accuracy'])
-plt.plot(history.history['loss'])
+plt.plot(history.history["accuracy"])
+plt.plot(history.history["loss"])

-plt.title('Model Loss and Accuracy Progress During Training')
-plt.xlabel('Epoch')
-plt.ylabel('Training Accuracy and Loss')
-plt.legend(['Training Accuracy', 'Training Loss'])
+plt.title("Model Loss and Accuracy Progress During Training")
+plt.xlabel("Epoch")
+plt.ylabel("Training Accuracy and Loss")
+plt.legend(["Training Accuracy", "Training Loss"])
plt.show()

# %%
-plt.plot(history.history['val_loss'])
-plt.title('Model Loss During Cross-Validation')
-plt.xlabel('Epoch')
-plt.ylabel('Validation Loss')
-plt.legend(['Validation Loss'])
+plt.plot(history.history["val_loss"])
+plt.title("Model Loss During Cross-Validation")
+plt.xlabel("Epoch")
+plt.ylabel("Validation Loss")
+plt.legend(["Validation Loss"])
plt.show()

# %%
-plt.plot(history.history['val_accuracy'])
-plt.title('Model Accuracy Progress During Cross-Validation')
-plt.xlabel('Epoch')
-plt.ylabel('Validation Accuracy')
-plt.legend(['Validation Accuracy'])
+plt.plot(history.history["val_accuracy"])
+plt.title("Model Accuracy Progress During Cross-Validation")
+plt.xlabel("Epoch")
+plt.ylabel("Validation Accuracy")
+plt.legend(["Validation Accuracy"])
plt.show()

# %% [markdown]
# ## testing

# %%
-test_directory = '/kaggle/input/pneumonia-clasification/Test/Test/'
+test_directory = "/kaggle/input/pneumonia-clasification/Test/Test/"

# %%
-test_gen = ImageDataGenerator(rescale=1. / 255)
+test_gen = ImageDataGenerator(rescale=1.0 / 255)

-test_generator = test_gen.flow_from_directory(batch_size=40,
-                                              directory=test_directory,
-                                              shuffle=True,
-                                              target_size=(256, 256),
-                                              class_mode='categorical')
+test_generator = test_gen.flow_from_directory(
+    batch_size=40,
+    directory=test_directory,
+    shuffle=True,
+    target_size=(256, 256),
+    class_mode="categorical",
+)

-evaluate = model.evaluate_generator(test_generator,
-                                    steps=test_generator.n // 4,
-                                    verbose=1)
+evaluate = model.evaluate_generator(
+    test_generator, steps=test_generator.n // 4, verbose=1
+)

-print('Accuracy Test : {}'.format(evaluate[1]))
+print("Accuracy Test : {}".format(evaluate[1]))

# %%
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
@@ -252,9 +277,12 @@

for i in np.arange(0, L * W):
    axes[i].imshow(image[i])
-    axes[i].set_title('Predict= {}\nReal= {}'.format(
-        str(label_names[prediction[i]]), str(label_names[original[i]])))
-    axes[i].axis('off')
+    axes[i].set_title(
+        "Predict= {}\nReal= {}".format(
+            str(label_names[prediction[i]]), str(label_names[original[i]])
+        )
+    )
+    axes[i].axis("off")

plt.subplots_adjust(wspace=1.2)

@@ -264,11 +292,11 @@
# %%
cm = confusion_matrix(np.asarray(original), np.asarray(prediction))
ax = plt.subplot()
-sns.heatmap(cm, annot=True, ax=ax, cmap='Blues')
+sns.heatmap(cm, annot=True, ax=ax, cmap="Blues")

-ax.set_xlabel('Predicted')
-ax.set_ylabel('Original')
-ax.set_title('Confusion_matrix')
+ax.set_xlabel("Predicted")
+ax.set_ylabel("Original")
+ax.set_title("Confusion_matrix")
plt.show()

# %%
diff --git a/_kaggle/_render/seven-models-comparsion-predition-score-mse/nb.py b/_kaggle/_render/seven-models-comparsion-predition-score-mse/nb.py
index 18de567..c4bb79f 100644
--- a/_kaggle/_render/seven-models-comparsion-predition-score-mse/nb.py
+++ b/_kaggle/_render/seven-models-comparsion-predition-score-mse/nb.py
@@ -17,14 +17,15 @@
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

-import numpy as np # linear algebra
-import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
+import numpy as np  # linear algebra
+import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
-for dirname, _, filenames in os.walk('/kaggle/input'):
+
+for dirname, _, filenames in os.walk("/kaggle/input"):
    for filename in filenames:
        print(os.path.join(dirname, filename))

@@ -37,8 +38,8 @@
# # **1. Read the Data**

# %%
-Train_data = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2022/train.csv')
-Test_data = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2022/test.csv')
+Train_data = pd.read_csv("/kaggle/input/tabular-playground-series-jan-2022/train.csv")
+Test_data = pd.read_csv("/kaggle/input/tabular-playground-series-jan-2022/test.csv")
train_data = Train_data.copy()
test_data = Test_data.copy()

@@ -52,30 +53,36 @@
# # * I create dummy variables for the 'country', 'store', 'product' columns.
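The helper functions in the next cells derive Month/Day/Weekend/Weekday by looping over rows; a vectorized sketch using the pandas .dt accessor does the same work in a few lines (assuming the same date column; note .dt.month yields integers, whereas the string-split helpers below keep zero-padded strings):

# Editor's sketch (alternative to the loop-based helpers below): vectorized date features.
dates = pd.to_datetime(train_data["date"])
train_data["Month"] = dates.dt.month
train_data["Day"] = dates.dt.day
train_data["Weekday"] = dates.dt.weekday  # Monday=0 ... Sunday=6
train_data["Weekend"] = (dates.dt.weekday >= 5).astype(int)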
+
# %%
-#Month
+# Month
def split_month(date):
-    return date.split('-')[1]
+    return date.split("-")[1]

-#Day
+
+# Day
def split_day(date):
-    return date.split('-')[2]
+    return date.split("-")[2]
+

-#Weekend
+# Weekend
def weekend(date):
-    import datetime
-    weekend = []
-    a = pd.to_datetime(date)
-    for i in range(len(a)):
-        if a.iloc[i].weekday() >= 5 :
-            weekend.append(1)
-        else:
-            weekend.append(0)
-    return weekend
-
-#Weekday
+    import datetime
+
+    weekend = []
+    a = pd.to_datetime(date)
+    for i in range(len(a)):
+        if a.iloc[i].weekday() >= 5:
+            weekend.append(1)
+        else:
+            weekend.append(0)
+    return weekend
+
+
+# Weekday
def weekday(date):
    import datetime
+
    weekday = []
    a = pd.to_datetime(date)
    for i in range(len(a)):
@@ -84,44 +91,47 @@ def weekday(date):

# %%
-train_data['Month'] = train_data['date'].apply(split_month)
-train_data['Day'] = train_data['date'].apply(split_day)
-train_data['Weekend'] = weekend(train_data['date'])
-train_data['Weekday'] = weekday(train_data['date'])
-train_data = train_data.drop(columns = ['row_id', 'date'])
-
-test_data['Month'] = test_data['date'].apply(split_month)
-test_data['Day'] = test_data['date'].apply(split_day)
-test_data['Weekend'] = weekend(test_data['date'])
-test_data['Weekday'] = weekday(test_data['date'])
-test_data = test_data.drop(columns = ['row_id', 'date'])
+train_data["Month"] = train_data["date"].apply(split_month)
+train_data["Day"] = train_data["date"].apply(split_day)
+train_data["Weekend"] = weekend(train_data["date"])
+train_data["Weekday"] = weekday(train_data["date"])
+train_data = train_data.drop(columns=["row_id", "date"])
+
+test_data["Month"] = test_data["date"].apply(split_month)
+test_data["Day"] = test_data["date"].apply(split_day)
+test_data["Weekend"] = weekend(test_data["date"])
+test_data["Weekday"] = weekday(test_data["date"])
+test_data = test_data.drop(columns=["row_id", "date"])

# %%
-#Dummies the 'country', 'store', 'product'
-train_data_dum = pd.get_dummies(train_data[['country', 'store', 'product']])
-test_data_dum = pd.get_dummies(test_data[['country', 'store', 'product']])
+# Dummies the 'country', 'store', 'product'
+train_data_dum = pd.get_dummies(train_data[["country", "store", "product"]])
+test_data_dum = pd.get_dummies(test_data[["country", "store", "product"]])

-train_data = pd.concat([train_data, train_data_dum],axis = 1)
-test_data = pd.concat([test_data, test_data_dum],axis = 1)
+train_data = pd.concat([train_data, train_data_dum], axis=1)
+test_data = pd.concat([test_data, test_data_dum], axis=1)

-train_data = train_data.drop(columns = ['country', 'store', 'product'])
-test_data = test_data.drop(columns = ['country', 'store', 'product'])
+train_data = train_data.drop(columns=["country", "store", "product"])
+test_data = test_data.drop(columns=["country", "store", "product"])

# %% [markdown]
# ### Define the training data and training target

# %%
-data = train_data.drop(columns = 'num_sold')
-target = train_data['num_sold']
+data = train_data.drop(columns="num_sold")
+target = train_data["num_sold"]

from sklearn.preprocessing import StandardScaler
+
Normalize = StandardScaler()

target = np.log(target)

# %%
from sklearn.model_selection import train_test_split
-x_train, x_test, y_train, y_test = train_test_split(data, target, train_size = 0.8, random_state = 5)
+
+x_train, x_test, y_train, y_test = train_test_split(
+    data, target, train_size=0.8, random_state=5
+)
x_train = Normalize.fit_transform(x_train)
x_test = Normalize.transform(x_test)

@@ -135,14 +145,15 @@ def weekday(date):

# %%
from sklearn.tree import DecisionTreeRegressor
+
sns.set()
-DTR = DecisionTreeRegressor(max_depth = 12, min_samples_leaf = 2).fit(x_train, y_train)
+DTR = DecisionTreeRegressor(max_depth=12, min_samples_leaf=2).fit(x_train, y_train)
y_pred_DTR = DTR.predict(x_test)
plt.scatter(y_test, y_pred_DTR)
-plt.plot([x for x in range(4, 10)], [x for x in range(4, 10)], color = 'r')
+plt.plot([x for x in range(4, 10)], [x for x in range(4, 10)], color="r")
plt.xlabel("Reality")
plt.ylabel("Predicted")
-plt.title('DecisionTreeRegressor')
+plt.title("DecisionTreeRegressor")
plt.show()
plt.clf()

@@ -151,14 +162,15 @@ def weekday(date):

# %%
from sklearn.ensemble import RandomForestRegressor
+
sns.set()
-RFR = RandomForestRegressor(max_depth = 15, random_state = 2).fit(x_train, y_train)
+RFR = RandomForestRegressor(max_depth=15, random_state=2).fit(x_train, y_train)
y_pred_RFR = RFR.predict(x_test)
plt.scatter(y_test, y_pred_RFR)
-plt.plot([x for x in range(4, 10)], [x for x in range(4, 10)], color = 'r')
+plt.plot([x for x in range(4, 10)], [x for x in range(4, 10)], color="r")
plt.xlabel("Reality")
plt.ylabel("Predicted")
-plt.title('RandomForestRegressor')
+plt.title("RandomForestRegressor")
plt.show()
plt.clf()

@@ -167,15 +179,22 @@ def weekday(date):

# %%
from sklearn.ensemble import GradientBoostingRegressor
+
sns.set()
-GBR = GradientBoostingRegressor(learning_rate=0.10, max_depth= 6,
-    min_samples_leaf = 5,n_estimators = 500, random_state = 40,subsample = 0.3).fit(x_train, y_train)
+GBR = GradientBoostingRegressor(
+    learning_rate=0.10,
+    max_depth=6,
+    min_samples_leaf=5,
+    n_estimators=500,
+    random_state=40,
+    subsample=0.3,
+).fit(x_train, y_train)
y_pred_GBR = GBR.predict(x_test)
plt.scatter(y_test, y_pred_GBR)
-plt.plot([x for x in range(4, 10)], [x for x in range(4, 10)], color = 'r')
+plt.plot([x for x in range(4, 10)], [x for x in range(4, 10)], color="r")
plt.xlabel("Reality")
plt.ylabel("Predicted")
-plt.title('GradientBoostingRegressor')
+plt.title("GradientBoostingRegressor")
plt.show()
plt.clf()
print(GBR.score(x_test, y_test))

@@ -185,14 +204,17 @@ def weekday(date):

# %%
from sklearn.svm import SVR
+
sns.set()
-svr_rbf = SVR(kernel = 'rbf', gamma = 0.2 , C = 0.15, degree = 2, epsilon=0.1).fit(x_train, y_train)
+svr_rbf = SVR(kernel="rbf", gamma=0.2, C=0.15, degree=2, epsilon=0.1).fit(
+    x_train, y_train
+)
y_pred_svr = svr_rbf.predict(x_test)
plt.scatter(y_test, y_pred_svr)
-plt.plot([x for x in range(4, 10)], [x for x in range(4, 10)], color = 'r')
+plt.plot([x for x in range(4, 10)], [x for x in range(4, 10)], color="r")
plt.xlabel("Reality")
plt.ylabel("Predicted")
-plt.title('SVM_RBF')
+plt.title("SVM_RBF")
plt.show()
plt.clf()

@@ -201,14 +223,15 @@ def weekday(date):

# %%
from sklearn.svm import SVR
+
sns.set()
-svr_linear = SVR(kernel = 'linear').fit(x_train, y_train)
+svr_linear = SVR(kernel="linear").fit(x_train, y_train)
y_pred_svr = svr_linear.predict(x_test)
plt.scatter(y_test, y_pred_svr)
-plt.plot([x for x in range(4, 10)], [x for x in range(4, 10)], color = 'r')
+plt.plot([x for x in range(4, 10)], [x for x in range(4, 10)], color="r")
plt.xlabel("Reality")
plt.ylabel("Predicted")
-plt.title('SVM_Linear')
+plt.title("SVM_Linear")
plt.show()
plt.clf()

@@ -218,25 +241,29 @@ def weekday(date):

# %%
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
+
data = Normalize.fit_transform(data)

model = Sequential()
-model.add(Dense(512, input_shape = (data.shape[1], ), activation = 'sigmoid'))
-model.add(Dense(64,activation = 'sigmoid'))
+model.add(Dense(512, input_shape=(data.shape[1],), activation="sigmoid"))
+model.add(Dense(64, activation="sigmoid"))
model.add(Dense(8))
model.add(Dense(1))
-model.compile(loss = 'mse', optimizer = 'adam', metrics= 'mse')
-history = model.fit(data, target, batch_size = 128, epochs = 100 , validation_split= 0.2, verbose = 1)
+model.compile(loss="mse", optimizer="adam", metrics="mse")
+history = model.fit(
+    data, target, batch_size=128, epochs=100, validation_split=0.2, verbose=1
+)

# %%
import seaborn as sns
+
sns.set()
df_history = pd.DataFrame(history.history)
-sns.lineplot(x = df_history.index, y = df_history.loss)
+sns.lineplot(x=df_history.index, y=df_history.loss)

# %%
y_pred = model.predict(data)
plt.scatter(target, y_pred)
-plt.plot([x for x in range(4, 10)], [x for x in range(4, 10)], color = 'r')
+plt.plot([x for x in range(4, 10)], [x for x in range(4, 10)], color="r")
plt.xlabel("Reality")
plt.ylabel("Predicted")
plt.show()

@@ -247,14 +274,15 @@ def weekday(date):

# %%
from sklearn.neighbors import KNeighborsRegressor
+
sns.set()
-KNN = KNeighborsRegressor(n_neighbors = 3, weights = 'distance').fit(x_train, y_train)
+KNN = KNeighborsRegressor(n_neighbors=3, weights="distance").fit(x_train, y_train)
y_pred_KNN = KNN.predict(x_test)
plt.scatter(y_test, y_pred_KNN)
-plt.plot([x for x in range(4, 10)], [x for x in range(4, 10)], color = 'r')
+plt.plot([x for x in range(4, 10)], [x for x in range(4, 10)], color="r")
plt.xlabel("Reality")
plt.ylabel("Predicted")
-plt.title('KNeighborsRegressor')
+plt.title("KNeighborsRegressor")
plt.show()
plt.clf()

@@ -264,27 +292,32 @@ def weekday(date):

# %%
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_error
+
+
def model_fit(x_train, x_test, y_train, y_test):
-    DTR = DecisionTreeRegressor(max_depth = 10,
-                                min_samples_leaf = 8).fit(x_train, y_train)
+    DTR = DecisionTreeRegressor(max_depth=10, min_samples_leaf=8).fit(x_train, y_train)

-    RFR = RandomForestRegressor(max_depth = 30).fit(x_train, y_train)
-
-    GBR = GradientBoostingRegressor(learning_rate=0.10, max_depth= 6,
-        min_samples_leaf = 5,n_estimators = 500, random_state = 40,subsample = 0.3).fit(x_train, y_train)
+    RFR = RandomForestRegressor(max_depth=30).fit(x_train, y_train)

-    svr_rbf = SVR(kernel = 'rbf',
-                  gamma = 0.2 ,
-                  C = 0.15,
-                  degree = 2,
-                  epsilon=0.1).fit(x_train, y_train)
+    GBR = GradientBoostingRegressor(
+        learning_rate=0.10,
+        max_depth=6,
+        min_samples_leaf=5,
+        n_estimators=500,
+        random_state=40,
+        subsample=0.3,
+    ).fit(x_train, y_train)

-    svr_linear = SVR(kernel = 'linear').fit(x_train, y_train)
+    svr_rbf = SVR(kernel="rbf", gamma=0.2, C=0.15, degree=2, epsilon=0.1).fit(
+        x_train, y_train
+    )

-    KNN = KNeighborsRegressor(n_neighbors = 10).fit(x_train, y_train)
+    svr_linear = SVR(kernel="linear").fit(x_train, y_train)

-    return DTR, RFR, GBR, svr_rbf, svr_linear, KNN
+    KNN = KNeighborsRegressor(n_neighbors=10).fit(x_train, y_train)
+
+    return DTR, RFR, GBR, svr_rbf, svr_linear, KNN

# %% [markdown]
@@ -292,38 +325,56 @@ def model_fit(x_train, x_test, y_train, y_test):

# %%
Model = model_fit(x_train, x_test, y_train, y_test)
-ML_model = ['DecisionTreeRegressor', 'RandomForestRegressor', 'GradientBoostingRegressor', 'svr_rbf', 'svr_linear', 'KNeighborsRegressor', 'DeepLearning']
+ML_model = [
+    "DecisionTreeRegressor",
+    "RandomForestRegressor",
+    "GradientBoostingRegressor",
+    "svr_rbf",
+    "svr_linear",
+    "KNeighborsRegressor",
+    "DeepLearning",
+]
sns.set()
from sklearn.metrics import r2_score
+
R_square_num = []
for i in range(6):
-    R_square = r2_score(y_test, Model[i].predict(x_test))
-    R_square_num.append(R_square)
+    R_square = r2_score(y_test, Model[i].predict(x_test))
+    R_square_num.append(R_square)
R_square_num.append(r2_score(y_test, model.predict(x_test)))

-plt.figure(figsize = (10, 10))
-plt.xlabel('R Square Score')
-plt.ylabel('Model Type')
-plt.title('The R Square Score Comparison')
-sns.barplot(x = R_square_num, y = ML_model)
+plt.figure(figsize=(10, 10))
+plt.xlabel("R Square Score")
+plt.ylabel("Model Type")
+plt.title("The R Square Score Comparison")
+sns.barplot(x=R_square_num, y=ML_model)

# %% [markdown]
# ## 4-2. Mean Squared Error Comparison

# %%
Model = model_fit(x_train, x_test, y_train, y_test)
-ML_model = ['DecisionTreeRegressor', 'RandomForestRegressor', 'GradientBoostingRegressor', 'svr_rbf', 'svr_linear', 'KNeighborsRegressor', 'DeepLearning']
+ML_model = [
+    "DecisionTreeRegressor",
+    "RandomForestRegressor",
+    "GradientBoostingRegressor",
+    "svr_rbf",
+    "svr_linear",
+    "KNeighborsRegressor",
+    "DeepLearning",
+]
sns.set()
from sklearn.metrics import mean_squared_error
+
mse_num = []
for i in range(6):
-    mse = mean_squared_error(y_test, Model[i].predict(x_test))
-    mse_num.append(mse)
+    mse = mean_squared_error(y_test, Model[i].predict(x_test))
+    mse_num.append(mse)
mse_num.append(mean_squared_error(y_test, model.predict(x_test)))

-plt.figure(figsize = (10, 10))
-plt.xlabel('mean_square_error')
-plt.ylabel('Model Type')
-plt.title('The mean_square_error Comparison')
-sns.barplot(x = mse_num, y = ML_model)
+plt.figure(figsize=(10, 10))
+plt.xlabel("mean_square_error")
+plt.ylabel("Model Type")
+plt.title("The mean_square_error Comparison")
+sns.barplot(x=mse_num, y=ML_model)

# %% [markdown]
# # **5. Predict the Test_data**

@@ -336,9 +387,11 @@ def model_fit(x_train, x_test, y_train, y_test):

# %% [markdown]
# # **6. Submission**

# %%
-submission = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2022/sample_submission.csv')
-submission['num_sold'] = submission_target
-submission.to_csv('submission.csv', index=False)
+submission = pd.read_csv(
+    "/kaggle/input/tabular-playground-series-jan-2022/sample_submission.csv"
+)
+submission["num_sold"] = submission_target
+submission.to_csv("submission.csv", index=False)

# %%
submission
diff --git a/examples/exploratory/nb.py b/examples/exploratory/nb.py
index 696815e..9728baa 100644
--- a/examples/exploratory/nb.py
+++ b/examples/exploratory/nb.py
@@ -22,7 +22,7 @@
from sklearn.datasets import load_iris

# %%
-df = load_iris(as_frame=True)['data']
+df = load_iris(as_frame=True)["data"]

# %% [markdown]
# ## Clean

@@ -34,7 +34,7 @@
df.shape

# %%
-df = df[df['petal length (cm)'] > 2]
+df = df[df["petal length (cm)"] > 2]

# %%
df.shape

@@ -43,4 +43,4 @@
# ## Plot

# %%
-sns.histplot(df['petal length (cm)'])
+sns.histplot(df["petal length (cm)"])
diff --git a/examples/machine-learning/nb.py b/examples/machine-learning/nb.py
index c76744d..77d228d 100644
--- a/examples/machine-learning/nb.py
+++ b/examples/machine-learning/nb.py
@@ -24,12 +24,12 @@
import matplotlib as mpl

# %%
-plt.style.use('ggplot')
-mpl.rcParams['figure.figsize'] = (12, 8)
+plt.style.use("ggplot")
+mpl.rcParams["figure.figsize"] = (12, 8)

# %%
ca_housing = datasets.fetch_california_housing(as_frame=True)
-df = ca_housing['frame']
+df = ca_housing["frame"]

# %%
df.head()

@@ -66,14 +66,13 @@
from sklearn.model_selection import train_test_split  # noqa

# %%
-X = df.drop('MedHouseVal', axis='columns')
+X = df.drop("MedHouseVal", axis="columns")
y = df.MedHouseVal

# %%
-X_train, X_test, y_train, y_test = train_test_split(X,
-                                                    y,
-                                                    test_size=0.33,
-                                                    random_state=42)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.33, random_state=42
+)

# %% [markdown]
# ## Linear regression
diff --git a/setup.py b/setup.py
index 4339ab5..a9018b6 100644
--- a/setup.py
+++ b/setup.py
@@ -7,72 +7,71 @@
from setuptools import find_packages
from setuptools import setup

-_version_re = re.compile(r'__version__\s+=\s+(.*)')
+_version_re = re.compile(r"__version__\s+=\s+(.*)")

-with open('src/soorgeon/__init__.py', 'rb') as f:
+with open("src/soorgeon/__init__.py", "rb") as f:
    VERSION = str(
-        ast.literal_eval(
-            _version_re.search(f.read().decode('utf-8')).group(1)))
+        ast.literal_eval(_version_re.search(f.read().decode("utf-8")).group(1))
+    )

REQUIRES = [
-    'ploomber-core>=0.0.4',
-    'jupytext',
-    'parso',
-    'nbformat',
-    'jinja2',
-    'pyyaml',
-    'click',
-    'isort',
+    "ploomber-core>=0.0.4",
+    "jupytext",
+    "parso",
+    "nbformat",
+    "jinja2",
+    "pyyaml",
+    "click",
+    "isort",
    # for checking code errors
-    'pyflakes',
-    'black[jupyter]>=22.6.0',
-    'papermill'
+    "pyflakes",
+    "black[jupyter]>=22.6.0",
+    "papermill",
]
DEV = [
-    'pkgmt',
-    'pytest',
-    'yapf',
-    'flake8',
-    'invoke',
-    'twine',
-    'ipython',
-    'ploomber',
+    "pkgmt",
+    "pytest",
+    "yapf",
+    "flake8",
+    "invoke",
+    "twine",
+    "ipython",
+    "ploomber",
    # to download data for running _kaggle notebooks
-    'kaggle',
+    "kaggle",
    # to fetch from github repo
-    'pygithub',
+    "pygithub",
    # to run some of the examples
-    'pandas',
-    'scikit-learn',
-    'seaborn',
-    'pkgmt',
-    'twine'
+    "pandas",
+    "scikit-learn",
+    "seaborn",
+    "pkgmt",
+    "twine",
]

-DESCRIPTION = ('Convert monolithic Jupyter notebooks'
-               ' into maintainable pipelines.')
+DESCRIPTION = "Convert monolithic Jupyter notebooks" " into maintainable pipelines."

setup(
-    name='soorgeon',
+    name="soorgeon",
    version=VERSION,
    description=DESCRIPTION,
    license=None,
-    author='Eduardo Blancas',
-    author_email='hello@ploomber.io',
-    url='https://github.com/ploomber/soorgeon',
-    packages=find_packages('src'),
-    package_dir={'': 'src'},
-    py_modules=[splitext(basename(path))[0] for path in glob('src/*.py')],
+    author="Eduardo Blancas",
+    author_email="hello@ploomber.io",
+    url="https://github.com/ploomber/soorgeon",
+    packages=find_packages("src"),
+    package_dir={"": "src"},
+    py_modules=[splitext(basename(path))[0] for path in glob("src/*.py")],
    include_package_data=True,
    package_data={"": []},
    classifiers=[],
    keywords=[],
    install_requires=REQUIRES,
    extras_require={
-        'dev': DEV,
+        "dev": DEV,
    },
    entry_points={
-        'console_scripts': ['soorgeon=soorgeon.cli:cli'],
+        "console_scripts": ["soorgeon=soorgeon.cli:cli"],
    },
)
diff --git a/src/soorgeon/__init__.py b/src/soorgeon/__init__.py
index 1e9eb2d..075f203 100644
--- a/src/soorgeon/__init__.py
+++ b/src/soorgeon/__init__.py
@@ -44,4 +44,4 @@
argument.
"""

-__version__ = '0.0.20dev'
+__version__ = "0.0.20dev"
diff --git a/src/soorgeon/_debug.py b/src/soorgeon/_debug.py
index 4ab1013..07db78e 100644
--- a/src/soorgeon/_debug.py
+++ b/src/soorgeon/_debug.py
@@ -1,8 +1,8 @@
from soorgeon import io

-if __name__ == '__main__':
+if __name__ == "__main__":
    s = """
"""
    in_, out = io.find_inputs_and_outputs(s)
-    print(f'Inputs: {in_}')
-    print(f'Outputs: {out}')
+    print(f"Inputs: {in_}")
+    print(f"Outputs: {out}")
diff --git a/src/soorgeon/_kaggle.py b/src/soorgeon/_kaggle.py
index 55d3ac9..6ad81aa 100644
--- a/src/soorgeon/_kaggle.py
+++ b/src/soorgeon/_kaggle.py
@@ -1,6 +1,7 @@
"""
CLI for downloading Kaggle notebooks for integration testing
"""
+
import zipfile
import shutil
from pathlib import PurePosixPath, Path
@@ -16,15 +17,15 @@ def download_from_competition(name, files=None):
    api.competition_download_cli(name, file_name=files)

    if not files:
-        with zipfile.ZipFile(f'{name}.zip', 'r') as file:
-            file.extractall('input')
+        with zipfile.ZipFile(f"{name}.zip", "r") as file:
+            file.extractall("input")
    else:
-        Path('input').mkdir()
-        shutil.move(files, Path('input', files))
+        Path("input").mkdir()
+        shutil.move(files, Path("input", files))


def download_from_dataset(name):
-    api.dataset_download_cli(name, unzip=True, path='input')
+    api.dataset_download_cli(name, unzip=True, path="input")


@click.group()
@@ -33,18 +34,18 @@ def cli():


@cli.command()
-@click.argument('kernel_path')
+@click.argument("kernel_path")
def notebook(kernel_path):
-    click.echo('Downloading notebook...')
+    click.echo("Downloading notebook...")
    name = PurePosixPath(kernel_path).name
    api.kernels_pull_cli(kernel=kernel_path, path=name)

-    click.echo('Converting to .py...')
-    ipynb = Path(name, f'{name}.ipynb')
-    py = Path(name, 'nb.py')
+    click.echo("Converting to .py...")
+    ipynb = Path(name, f"{name}.ipynb")
+    py = Path(name, "nb.py")
    nb = jupytext.read(ipynb)
    # TODO: remove cells that are !pip install ...
-    jupytext.write(nb, py, fmt='py:percent')
+    jupytext.write(nb, py, fmt="py:percent")
    ipynb.unlink()

@@ -52,26 +53,26 @@ def notebook(kernel_path):
# update CONTRIBUTING.md
# FIXME: add files arg
@cli.command()
-@click.argument('name')
+@click.argument("name")
def competition(name):
    download_from_competition(name=name)


@cli.command()
-@click.argument('name')
+@click.argument("name")
def dataset(name):
    download_from_dataset(name=name)


@cli.command()
-@click.argument('path', type=click.Path(exists=True))
+@click.argument("path", type=click.Path(exists=True))
def test(path):
-    nb = jupytext.read(path, fmt='py:percent')
-    click.echo('Generating test.ipynb...')
-    jupytext.write(nb, 'test.ipynb')
-    click.echo('Executing test.ipynb...')
-    pm.execute_notebook('test.ipynb', 'test.ipynb')
+    nb = jupytext.read(path, fmt="py:percent")
+    click.echo("Generating test.ipynb...")
+    jupytext.write(nb, "test.ipynb")
+    click.echo("Executing test.ipynb...")
+    pm.execute_notebook("test.ipynb", "test.ipynb")


-if __name__ == '__main__':
+if __name__ == "__main__":
    cli()
diff --git a/src/soorgeon/_pygithub.py b/src/soorgeon/_pygithub.py
index a7834bf..fa3b95d 100644
--- a/src/soorgeon/_pygithub.py
+++ b/src/soorgeon/_pygithub.py
@@ -2,6 +2,7 @@
from pathlib import Path
import shutil
from github import Github
+
# import os


@@ -13,7 +14,7 @@ def download_directory(dir):
    g = Github()
    repo = g.get_repo("ploomber/ci-notebooks")
    contents = repo.get_contents(dir)
-    Path('input').mkdir()
+    Path("input").mkdir()

    for file_content in contents:
        try:
@@ -22,8 +23,8 @@ def download_directory(dir):
            file_out = open(file_content.name, "wb")
            file_out.write(file_data)
            file_out.close()
-            if file_content.name != 'nb.py':
+            if file_content.name != "nb.py":
                path = Path(file_content.name)
-                shutil.move(path, Path('input', file_content.name))
+                shutil.move(path, Path("input", file_content.name))
        except IOError as exc:
-            print('Error processing %s: %s' % (file_content.path, exc))
+            print("Error processing %s: %s" % (file_content.path, exc))
diff --git a/src/soorgeon/definitions.py b/src/soorgeon/definitions.py
index b89ec0f..d8ee21c 100644
--- a/src/soorgeon/definitions.py
+++ b/src/soorgeon/definitions.py
@@ -8,10 +8,13 @@ def from_imports(tree):
    # build a defined-name -> import-statement-code mapping. Note that
    # the same code may appear more than once if it defines more than one name
    # e.g. from package import a, b, c
-    imports = [{
-        name.value: import_.get_code().rstrip()
-        for name in import_.get_defined_names()
-    } for import_ in tree.iter_imports()]
+    imports = [
+        {
+            name.value: import_.get_code().rstrip()
+            for name in import_.get_defined_names()
+        }
+        for import_ in tree.iter_imports()
+    ]

    if imports:
        imports = reduce(lambda x, y: {**x, **y}, imports)

@@ -29,35 +32,38 @@ def packages_used(tree):

    Returns None if fails to parse them
    """
    pkg_name = {
-        'sklearn': 'scikit-learn',
+        "sklearn": "scikit-learn",
    }

    def flatten(elements):
        return [i for sub in elements for i in sub]

    def _extract_names(node):
-        if hasattr(node, 'children'):
+        if hasattr(node, "children"):
            return extract_names(node.children[0])
        else:
            return [node.value]

    def extract_names(import_):
-        if import_.type == 'name':
+        if import_.type == "name":
            return [import_.value]
-        elif import_.type in {'dotted_name', 'dotted_as_name'}:
+        elif import_.type in {"dotted_name", "dotted_as_name"}:
            return [import_.children[0].value]

        second = import_.children[1]

-        if second.type in {'dotted_name', 'dotted_as_name'}:
+        if second.type in {"dotted_name", "dotted_as_name"}:
            return extract_names(second.children[0])
-        elif second.type == 'dotted_as_names':
+        elif second.type == "dotted_as_names":
            # import a as something, b as another
-            return flatten([
-                _extract_names(node) for i, node in enumerate(second.children)
-                if i % 2 == 0
-            ])
+            return flatten(
+                [
+                    _extract_names(node)
+                    for i, node in enumerate(second.children)
+                    if i % 2 == 0
+                ]
+            )
        else:
            return [second.value]

@@ -65,8 +71,7 @@ def extract_names(import_):

    # replace using pkg_name mapping and ignore standard lib
    pkgs_final = [
-        pkg_name.get(name, name) for name in pkgs
-        if place_module(name) == 'THIRDPARTY'
+        pkg_name.get(name, name) for name in pkgs if place_module(name) == "THIRDPARTY"
    ]

    # remove duplicates and sort
@@ -74,10 +79,7 @@ def extract_names(import_):


def from_def_and_class(tree):
-    fns = {
-        fn.name.value: fn.get_code().rstrip()
-        for fn in tree.iter_funcdefs()
-    }
+    fns = {fn.name.value: fn.get_code().rstrip() for fn in tree.iter_funcdefs()}

    classes = {
        class_.name.value: class_.get_code().rstrip()
diff --git a/src/soorgeon/detect.py b/src/soorgeon/detect.py
index cf20196..1817f6b 100644
--- a/src/soorgeon/detect.py
+++ b/src/soorgeon/detect.py
@@ -1,11 +1,12 @@
"""
Detect which kind of structure we're dealing with
"""
+
from soorgeon import get


def is_f_string(leaf):
-    return leaf.type == 'fstring_start'
+    return leaf.type == "fstring_start"


def is_funcdef(leaf):
@@ -13,21 +14,21 @@ def is_funcdef(leaf):
    Returns true if the leaf is the beginning of a function definition (def
    keyword)
    """
-    return leaf.type == 'keyword' and leaf.value == 'def'
+    return leaf.type == "keyword" and leaf.value == "def"


def is_lambda(leaf, raise_=False):
    """
    Returns true if the leaf is the beginning of a lambda definition
    """
-    return leaf.type == 'keyword' and leaf.value == 'lambda'
+    return leaf.type == "keyword" and leaf.value == "lambda"


def is_classdef(leaf):
    """
    Returns true if the leaf is the beginning of a class definition
    """
-    return leaf.type == 'keyword' and leaf.value == 'class'
+    return leaf.type == "keyword" and leaf.value == "class"


def is_for_loop(leaf):
@@ -38,10 +39,10 @@ def is_for_loop(leaf):
    parent = leaf.parent

    while parent:
-        if parent.type == 'suite':
+        if parent.type == "suite":
            has_suite_parent = True

-        if parent.type == 'for_stmt':
+        if parent.type == "for_stmt":
            return not has_suite_parent

        parent = parent.parent

@@ -54,13 +55,15 @@ def is_comprehension(leaf):
    Return true if the leaf is the beginning of a list/set/dict
    comprehension. Returns true for generators as well
    """
-    if leaf.type != 'operator' or leaf.value not in {'[', '(', '{'}:
+    if leaf.type != "operator" or leaf.value not in {"[", "(", "{"}:
        return False

    sibling = leaf.get_next_sibling()

-    return (sibling.type in {'testlist_comp', 'dictorsetmaker'}
-            and sibling.children[-1].type == 'sync_comp_for')
+    return (
+        sibling.type in {"testlist_comp", "dictorsetmaker"}
+        and sibling.children[-1].type == "sync_comp_for"
+    )


def is_context_manager(leaf):
@@ -71,10 +74,10 @@ def is_context_manager(leaf):
    parent = leaf.parent

    while parent:
-        if parent.type == 'suite':
+        if parent.type == "suite":
            has_suite_parent = True

-        if parent.type == 'with_stmt':
+        if parent.type == "with_stmt":
            return not has_suite_parent

        parent = parent.parent

@@ -88,22 +91,21 @@ def is_left_side_of_assignment(node):
    if not to_check:
        return False

-    return to_check.children[1].value == '='
+    return to_check.children[1].value == "="


# FIXME: delete
def is_inside_list_comprehension(node):
    parent = get.first_non_atom_expr_parent(node)

-    return (parent.type == 'testlist_comp'
-            and parent.children[1].type == 'sync_comp_for')
+    return parent.type == "testlist_comp" and parent.children[1].type == "sync_comp_for"


def is_inside_funcdef(leaf):
    parent = leaf.parent

    while parent:
-        if parent.type == 'funcdef':
+        if parent.type == "funcdef":
            return True

        parent = parent.parent

@@ -113,7 +115,7 @@ def is_inside_function_call(leaf):

    # ignore it if this is a function definition
-    if leaf.parent.type == 'param':
+    if leaf.parent.type == "param":
        return False

    next_sibling = leaf.get_next_sibling()
@@ -126,7 +128,7 @@ def is_inside_function_call(leaf):
    # ignore names in keyword arguments
    # e.g., some_function(x=1)
    # (x does not count since it's a keyword argument name, not a variable)
-    if next_sibling_value == '=':
+    if next_sibling_value == "=":
        return False

    # check if the node is inside parenthesis: function(df)
@@ -150,19 +152,18 @@ def is_inside_function_call(leaf):

def is_inside_parenthesis(node):
    try:
-        left = node.get_previous_sibling().value == '('
+        left = node.get_previous_sibling().value == "("
    except AttributeError:
        left = False

    try:
-        right = node.get_next_sibling().value == ')'
+        right = node.get_next_sibling().value == ")"
    except AttributeError:
        right = False

    try:
        # to prevent (1, 2, 3) being detected as a function call
-        has_name = node.get_previous_sibling().get_previous_leaf(
-        ).type == 'name'
+        has_name = node.get_previous_sibling().get_previous_leaf().type == "name"
    except AttributeError:
        has_name = False

@@ -182,7 +183,7 @@ def is_accessing_variable(leaf):
    except Exception:
        return False

-    getitem = children[0].value == '[' and children[-1].value == ']'
-    dotaccess = children[0].value == '.'
+    getitem = children[0].value == "[" and children[-1].value == "]"
+    dotaccess = children[0].value == "."
    # FIXME: adding dotaccess breaks other tests
    return getitem or dotaccess
diff --git a/src/soorgeon/exceptions.py b/src/soorgeon/exceptions.py
index 1ccda1c..2a265ef 100644
--- a/src/soorgeon/exceptions.py
+++ b/src/soorgeon/exceptions.py
@@ -6,7 +6,7 @@


def _format_message(exception):
-    if hasattr(exception, 'format_message'):
+    if hasattr(exception, "format_message"):
        return exception.format_message()
    else:
        return str(exception)
@@ -16,7 +16,7 @@ def _build_message(exception):
    msg = _format_message(exception)

    while exception.__cause__:
-        msg += f'\n{_format_message(exception.__cause__)}'
+        msg += f"\n{_format_message(exception.__cause__)}"
        exception = exception.__cause__

    return msg
@@ -37,18 +37,18 @@ def show(self, file: t.Optional[t.IO] = None) -> None:


class InputWontRunError(BaseException):
-    """Raised when there are errors that make running the input infeasible
-    """
+    """Raised when there are errors that make running the input infeasible"""
+
    pass


class InputError(BaseException):
-    """Raised when the input has issues and needs user's editing
-    """
+    """Raised when the input has issues and needs user's editing"""
+
    pass


class InputSyntaxError(InputWontRunError):
-    """Raised if the notebook has invalid syntax
-    """
+    """Raised if the notebook has invalid syntax"""
+
    pass
diff --git a/src/soorgeon/get.py b/src/soorgeon/get.py
index f2ca8d4..6212f8f 100644
--- a/src/soorgeon/get.py
+++ b/src/soorgeon/get.py
@@ -4,7 +4,7 @@ def first_expr_stmt_parent(node):
    if not parent:
        return None

-    while parent.type != 'expr_stmt':
+    while parent.type != "expr_stmt":
        parent = parent.parent

        if not parent:
@@ -18,7 +18,7 @@ def first_non_atom_expr_parent(node):

    # e.g., [x.attribute for x in range(10)]
    # x.attribute is an atom_expr
-    while parent.type == 'atom_expr':
+    while parent.type == "atom_expr":
        parent = parent.parent

    return parent
diff --git a/src/soorgeon/io.py b/src/soorgeon/io.py
index 48fa284..edc74cc 100644
--- a/src/soorgeon/io.py
+++ b/src/soorgeon/io.py
@@ -1,6 +1,7 @@
"""
Module to determine inputs and outputs from code snippets.
"""
+
from functools import reduce

import parso
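The module docstring above is terse; a minimal usage sketch, mirroring the _debug.py helper earlier in this diff (the exact sets returned are an editor's assumption for illustration, not taken from the library's docs):

# Sketch: what io.find_inputs_and_outputs computes (see _debug.py above).
from soorgeon import io

in_, out = io.find_inputs_and_outputs("z = x + y")
# Expected, roughly: in_ == {'x', 'y'}  (names the snippet consumes)
#                    out == {'z'}       (names the snippet defines)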
""" + from functools import reduce import parso @@ -35,7 +36,7 @@ def get_imports_cell_for_task(self, code_task): names = [] while leaf: - if leaf.type == 'name': + if leaf.type == "name": names.append(leaf.value) leaf = leaf.get_next_leaf() @@ -52,8 +53,9 @@ def get_imports_cell_for_task(self, code_task): # remove duplicated elements but keep order, then join if imports: - imports_to_use = ('\n'.join(list(dict.fromkeys(imports_to_use))) + - '\n\n\n').strip() or None + imports_to_use = ( + "\n".join(list(dict.fromkeys(imports_to_use))) + "\n\n\n" + ).strip() or None else: imports_to_use = None @@ -71,16 +73,16 @@ def get_local_scope(leaf): parent = leaf.parent while parent: - if parent.type == 'for_stmt': + if parent.type == "for_stmt": # call recursively for nested for loops to work - return (find_for_loop_def_and_io(parent)[0] - | get_local_scope(parent.parent)) + return find_for_loop_def_and_io(parent)[0] | get_local_scope(parent.parent) # FIXME: this wont work with nested functions - elif parent.type == 'funcdef': + elif parent.type == "funcdef": def_names = [ - c.get_defined_names() for c in parent.children[2].children - if c.type == 'param' + c.get_defined_names() + for c in parent.children[2].children + if c.type == "param" ] flatten = [name.value for sub in def_names for name in sub] @@ -100,9 +102,11 @@ def find_for_loop_def_and_io(for_stmt, local_scope=None): """ # TODO: add a only_input flag for cases where we dont care about # parsin outputs - if for_stmt.type != 'for_stmt': - raise ValueError(f'Expected a node with type "for_stmt", ' - f'got: {for_stmt} with type {for_stmt.type}') + if for_stmt.type != "for_stmt": + raise ValueError( + f'Expected a node with type "for_stmt", ' + f"got: {for_stmt} with type {for_stmt.type}" + ) local_scope = local_scope or set() @@ -115,7 +119,8 @@ def find_for_loop_def_and_io(for_stmt, local_scope=None): body_in, body_out = find_inputs_and_outputs_from_leaf( body_node.get_first_leaf(), local_scope=defined, - leaf_end=body_node.get_last_leaf()) + leaf_end=body_node.get_last_leaf(), + ) # Strictly speaking variables defined after the for keyword are also # outputs, since they're available after the loop ends (with the loop's @@ -125,15 +130,16 @@ def find_for_loop_def_and_io(for_stmt, local_scope=None): def _find_type_value_idx_in_children(type_, value, node): for idx, child in enumerate(node.children): - if (child.type, getattr(child, 'value', None)) == (type_, value): + if (child.type, getattr(child, "value", None)) == (type_, value): return idx return None def _process_context(context): - if ('keyword', 'as') in ((n.type, getattr(n, 'value', None)) - for n in context.children): + if ("keyword", "as") in ( + (n.type, getattr(n, "value", None)) for n in context.children + ): node_expression, _, node_definition = context.children defined = find_inputs(node_definition, parse_list_comprehension=False) else: @@ -147,10 +153,11 @@ def _process_context(context): def find_f_string_inputs(fstring_start, local_scope=None): - if fstring_start.type != 'fstring_start': + if fstring_start.type != "fstring_start": raise ValueError( f'Expected a node with type "fstring_start", ' - f'got: {fstring_start} with type {fstring_start.type}') + f"got: {fstring_start} with type {fstring_start.type}" + ) f_string = fstring_start.parent @@ -160,13 +167,15 @@ def find_f_string_inputs(fstring_start, local_scope=None): def find_context_manager_def_and_io(with_stmt, local_scope=None): - if with_stmt.type != 'with_stmt': - raise ValueError(f'Expected a node with type 
"with_stmt", ' - f'got: {with_stmt} with type {with_stmt.type}') + if with_stmt.type != "with_stmt": + raise ValueError( + f'Expected a node with type "with_stmt", ' + f"got: {with_stmt} with type {with_stmt.type}" + ) local_scope = local_scope or set() - idx_colon = _find_type_value_idx_in_children('operator', ':', with_stmt) + idx_colon = _find_type_value_idx_in_children("operator", ":", with_stmt) # get children that are relevant (ignore with keyword, commads, and colon # operator) @@ -184,7 +193,8 @@ def find_context_manager_def_and_io(with_stmt, local_scope=None): body_in, body_out = find_inputs_and_outputs_from_leaf( body_node.get_first_leaf(), local_scope=defined, - leaf_end=body_node.get_last_leaf()) + leaf_end=body_node.get_last_leaf(), + ) return defined, (exp | body_in) - local_scope, body_out @@ -204,9 +214,11 @@ def find_function_scope_and_io(funcdef, local_scope=None): set Variables declared in the body """ - if funcdef.type != 'funcdef': - raise ValueError(f'Expected a node with type "funcdef", ' - f'got: {funcdef} with type {funcdef.type}') + if funcdef.type != "funcdef": + raise ValueError( + f'Expected a node with type "funcdef", ' + f"got: {funcdef} with type {funcdef.type}" + ) local_scope = local_scope or set() @@ -223,26 +235,28 @@ def find_function_scope_and_io(funcdef, local_scope=None): # FIXME: test what happens if they user has a list comprehension as # argument. e.g. fn(x=[1,2,3]) - parameters = find_inputs(node_signature, - parse_list_comprehension=False, - allow_kwargs=True) + parameters = find_inputs( + node_signature, parse_list_comprehension=False, allow_kwargs=True + ) body_in, body_out = find_inputs_and_outputs_from_leaf( body_node.get_first_leaf(), local_scope=parameters, - leaf_end=body_node.get_last_leaf()) + leaf_end=body_node.get_last_leaf(), + ) if annotation_return: - body_in = body_in | find_inputs(annotation_return, - parse_list_comprehension=False) + body_in = body_in | find_inputs( + annotation_return, parse_list_comprehension=False + ) return parameters, body_in - local_scope, body_out # TODO: add unit tests def find_lambda_scope_and_inputs(lambda_, local_scope=None): - if lambda_.type != 'lambdef': - raise ValueError(f'Expected a lambdef, got {lambda_}') + if lambda_.type != "lambdef": + raise ValueError(f"Expected a lambdef, got {lambda_}") local_scope = local_scope or set() @@ -250,9 +264,9 @@ def find_lambda_scope_and_inputs(lambda_, local_scope=None): # FIXME: test what happens if they user has a list comprehension as # argument. e.g. 
fn(x=[1,2,3]) - parameters = find_inputs(node_signature, - parse_list_comprehension=False, - allow_kwargs=True) + parameters = find_inputs( + node_signature, parse_list_comprehension=False, allow_kwargs=True + ) body_in = find_inputs(body_node) @@ -260,14 +274,15 @@ def find_lambda_scope_and_inputs(lambda_, local_scope=None): def _flatten_sync_comp_for(node): - if node.type != 'sync_comp_for': - raise ValueError('Expected node type to be ' - f'"syncompfor" but got: {node.type}') + if node.type != "sync_comp_for": + raise ValueError( + "Expected node type to be " f'"syncompfor" but got: {node.type}' + ) total = [node] for child in node.children: - if child.type == 'sync_comp_for': + if child.type == "sync_comp_for": nodes = _flatten_sync_comp_for(child) total += nodes @@ -280,8 +295,7 @@ def _find_sync_comp_for_inputs_and_scope(synccompfor): parses a single node, for parsing nested ones use find_comprehension_inputs """ # these are the variables that the list comprehension declares - declared = find_inputs(synccompfor.children[1], - parse_list_comprehension=False) + declared = find_inputs(synccompfor.children[1], parse_list_comprehension=False) # parse the variables in the right expression # e,g, given: [x for x in expression(10)] @@ -289,20 +303,17 @@ def _find_sync_comp_for_inputs_and_scope(synccompfor): # the expression part should be the element at index 3, note that this # is not the same as getting the last one because if the list comprehension # has an 'if' statement, that will be the last element - inputs_right = find_inputs(synccompfor.children[3], - parse_list_comprehension=False) + inputs_right = find_inputs(synccompfor.children[3], parse_list_comprehension=False) return inputs_right, declared def find_comprehension_inputs(node): - """Find inpust for list/set/dict comprehension or generator - """ - types = {'testlist_comp', 'dictorsetmaker'} + """Find inpust for list/set/dict comprehension or generator""" + types = {"testlist_comp", "dictorsetmaker"} if node.type not in types: - raise ValueError('Expected node type be one of ' - f'{types!r}, got: {node.type}') + raise ValueError("Expected node type be one of " f"{types!r}, got: {node.type}") # list/set comprehension or generator if len(node.children) == 2: @@ -323,8 +334,9 @@ def find_comprehension_inputs(node): else: synccompfor = node.children[-1] - inputs_left = find_inputs_for_each(node.children[:-1], - parse_list_comprehension=False) + inputs_left = find_inputs_for_each( + node.children[:-1], parse_list_comprehension=False + ) inputs, declared = set(), set() @@ -336,28 +348,29 @@ def find_comprehension_inputs(node): return (inputs_left | inputs) - declared -def find_inputs_for_each(nodes, - parse_list_comprehension=True, - only_getitem_and_attribute_access=False): - """Like find_inputs, but takes a list of nodes - """ +def find_inputs_for_each( + nodes, parse_list_comprehension=True, only_getitem_and_attribute_access=False +): + """Like find_inputs, but takes a list of nodes""" inputs = set() for node in nodes: inputs_new = find_inputs( node, parse_list_comprehension=parse_list_comprehension, - only_getitem_and_attribute_access=only_getitem_and_attribute_access + only_getitem_and_attribute_access=only_getitem_and_attribute_access, ) inputs = inputs | inputs_new return inputs -def find_inputs(node, - parse_list_comprehension=True, - only_getitem_and_attribute_access=False, - allow_kwargs=False): +def find_inputs( + node, + parse_list_comprehension=True, + only_getitem_and_attribute_access=False, + allow_kwargs=False, +): """ 
Extract inputs from an expression e.g. function(x, y) returns {'function', 'x', 'y'} @@ -389,15 +402,13 @@ def find_inputs(node, else: # ignore f-string format specs {number:.2f} # and f-string conversions {object!r} - if leaf.parent.type in { - 'fstring_format_spec', 'fstring_conversion' - }: + if leaf.parent.type in {"fstring_format_spec", "fstring_conversion"}: leaf = leaf.get_next_leaf() continue # is this a kwarg? try: - key_arg = leaf.get_next_leaf().value == '=' + key_arg = leaf.get_next_leaf().value == "=" except Exception: key_arg = False @@ -406,26 +417,31 @@ def find_inputs(node, # is this an attribute? try: - is_attr = leaf.get_previous_leaf().value == '.' + is_attr = leaf.get_previous_leaf().value == "." except Exception: is_attr = False - if (leaf.type == 'name' and not key_arg and not is_attr - and leaf.value not in _BUILTIN): + if ( + leaf.type == "name" + and not key_arg + and not is_attr + and leaf.value not in _BUILTIN + ): # not allowing reads, check that this is not geitem # or that is accessing an attribute in the next leaf try: - is_getitem = leaf.get_next_leaf().value == '[' + is_getitem = leaf.get_next_leaf().value == "[" except Exception: is_getitem = False try: - is_accessing_attr = leaf.get_next_leaf().value == '.' + is_accessing_attr = leaf.get_next_leaf().value == "." except Exception: is_accessing_attr = False - if (only_getitem_and_attribute_access - and (is_getitem or is_accessing_attr)): + if only_getitem_and_attribute_access and ( + is_getitem or is_accessing_attr + ): names.append(leaf.value) elif not only_getitem_and_attribute_access: names.append(leaf.value) @@ -458,13 +474,14 @@ def find_inputs_and_outputs_from_tree(tree, local_scope=None): # NOTE: we use this in find_inputs_and_outputs and ImportParser, maybe # move the functionality to a class so we only compute it once defined_names = set(definitions.from_imports(tree)) | set( - definitions.from_def_and_class(tree)) + definitions.from_def_and_class(tree) + ) local_scope = local_scope or set() - return find_inputs_and_outputs_from_leaf(leaf, - local_scope=local_scope - | defined_names) + return find_inputs_and_outputs_from_leaf( + leaf, local_scope=local_scope | defined_names + ) # FIXME: try nested functions, and also functions inside for loops and loops @@ -504,7 +521,8 @@ def clean_up_candidates(candidates, *others): # FIXME: i think is hould also pass the current foudn inputs # to local scope - write a test to break this _, candidates_in = find_lambda_scope_and_inputs( - leaf.parent, local_scope=local_scope) + leaf.parent, local_scope=local_scope + ) inputs.extend(clean_up_candidates(candidates_in, local_variables)) # lambda's last leaf is the next one after the last in the # lambda node @@ -513,7 +531,8 @@ def clean_up_candidates(candidates, *others): # FIXME: i think is hould also pass the current foudn inputs # to local scope - write a test to break this (_, candidates_in, candidates_out) = find_for_loop_def_and_io( - leaf.parent, local_scope=local_scope) + leaf.parent, local_scope=local_scope + ) inputs.extend(clean_up_candidates(candidates_in, local_variables)) outputs = outputs | candidates_out # jump to the end of the foor loop @@ -521,9 +540,9 @@ def clean_up_candidates(candidates, *others): elif detect.is_context_manager(leaf): # FIXME: i think is hould also pass the current foudn inputs # to local scope - write a test to break this - (_, candidates_in, - candidates_out) = find_context_manager_def_and_io( - leaf.parent, local_scope=local_scope) + (_, candidates_in, candidates_out) = 
find_context_manager_def_and_io( + leaf.parent, local_scope=local_scope + ) inputs.extend(clean_up_candidates(candidates_in, local_variables)) outputs = outputs | candidates_out # jump to the end of the foor loop @@ -535,7 +554,8 @@ def clean_up_candidates(candidates, *others): # FIXME: i think is hould also pass the current foudn inputs # to local scope - write a test to break this (_, candidates_in, candidates_out) = find_function_scope_and_io( - leaf.parent, local_scope=local_scope) + leaf.parent, local_scope=local_scope + ) inputs.extend(clean_up_candidates(candidates_in, local_variables)) outputs = outputs | candidates_out # jump to the end of the function definition loop @@ -550,7 +570,7 @@ def clean_up_candidates(candidates, *others): leaf = leaf.parent.get_last_leaf() # the = operator is an indicator of [outputs] = [inputs] - elif leaf.type == 'operator' and leaf.value == '=': + elif leaf.type == "operator" and leaf.value == "=": next_s = leaf.get_next_sibling() previous = leaf.get_previous_leaf() @@ -565,8 +585,11 @@ def clean_up_candidates(candidates, *others): # only mark a variable as input if it hasn't been defined # locally - if (variable not in outputs and variable not in local_scope - and variable not in local_variables): + if ( + variable not in outputs + and variable not in local_scope + and variable not in local_variables + ): inputs.append(variable) # Process outputs @@ -580,7 +603,7 @@ def clean_up_candidates(candidates, *others): # a['x'] = 1 # a.b = 1 - if previous.parent.type != 'argument': + if previous.parent.type != "argument": prev_sibling = leaf.get_previous_sibling() @@ -596,15 +619,16 @@ def clean_up_candidates(candidates, *others): # e.g., a, b = 1, 2 (testlist_star_expr) # [a, b] = 1, 2 (atom) # (a, b) = 1, 2 (atom) - if prev_sibling.type in {'testlist_star_expr', 'atom'}: + if prev_sibling.type in {"testlist_star_expr", "atom"}: target = target | set( - name.value - for name in prev_sibling.parent.get_defined_names()) + name.value for name in prev_sibling.parent.get_defined_names() + ) # nope, only one value - elif prev_sibling.type == 'atom_expr': - target = target | (find_inputs( - prev_sibling, parse_list_comprehension=False) - - modified) + elif prev_sibling.type == "atom_expr": + target = target | ( + find_inputs(prev_sibling, parse_list_comprehension=False) + - modified + ) elif previous.value not in modified: target.add(previous.value) @@ -623,7 +647,8 @@ def clean_up_candidates(candidates, *others): inputs_candidates = find_inputs( prev_sibling, parse_list_comprehension=False, - only_getitem_and_attribute_access=True) + only_getitem_and_attribute_access=True, + ) # add to inputs if they haven't been locally defined or # modified @@ -644,17 +669,24 @@ def clean_up_candidates(candidates, *others): # go to the first conditional, and the next leaf is the function call # so then we go into this conditional - we're skipping the left part # but not the right part of = yet - elif (leaf.type == 'name' and (detect.is_inside_function_call(leaf) - or detect.is_accessing_variable(leaf) - or detect.is_inside_funcdef(leaf)) - # skip if this is to the left of an '=', because we'll check it - # when we get to that token since it'll go to the first - # conditional - and not detect.is_left_side_of_assignment(leaf) and - not detect.is_inside_list_comprehension(leaf) and - leaf.value not in outputs and leaf.value not in local_scope and - leaf.value not in _BUILTIN and leaf.value not in local_scope and - leaf.value not in local_variables): + elif ( + leaf.type == "name" 
+ and ( + detect.is_inside_function_call(leaf) + or detect.is_accessing_variable(leaf) + or detect.is_inside_funcdef(leaf) + ) + # skip if this is to the left of an '=', because we'll check it + # when we get to that token since it'll go to the first + # conditional + and not detect.is_left_side_of_assignment(leaf) + and not detect.is_inside_list_comprehension(leaf) + and leaf.value not in outputs + and leaf.value not in local_scope + and leaf.value not in _BUILTIN + and leaf.value not in local_scope + and leaf.value not in local_variables + ): inputs.extend(find_inputs(leaf)) if leaf_end and leaf == leaf_end: @@ -675,7 +707,7 @@ def _get_modified_objects(leaf, outputs, names_from_imports): # iterate over leaves and grab names since the assignment may be modifying # more than one object while current: - if current.type == 'name': + if current.type == "name": if current.value in existing: names.append(current.value) @@ -739,9 +771,11 @@ def get(self, variable, task_name): provider = providers.get(variable) if not provider: - raise KeyError(f'Error parsing inputs for section {task_name!r} ' - 'notebook: could not find an earlier section ' - f'declaring variable {variable!r}') + raise KeyError( + f"Error parsing inputs for section {task_name!r} " + "notebook: could not find an earlier section " + f"declaring variable {variable!r}" + ) return provider @@ -793,9 +827,7 @@ def find_upstream(snippets): def _find_providers(io): # variable -> snippet that defines variable mapping - providers = [ - _map_outputs(snippet_name, v[1]) for snippet_name, v in io.items() - ] + providers = [_map_outputs(snippet_name, v[1]) for snippet_name, v in io.items()] providers = dict([i for sub in providers for i in sub]) @@ -812,8 +844,7 @@ def find_io(snippets): # FIXME: find_upstream already calls this, we should only compute it once io = { - snippet_name: find_inputs_and_outputs(snippet, - local_scope=im.get(snippet_name)) + snippet_name: find_inputs_and_outputs(snippet, local_scope=im.get(snippet_name)) for snippet_name, snippet in snippets.items() } @@ -857,7 +888,7 @@ def remove_imports(code_str): to_remove = [] for leaf in _leaf_iterator(tree): - if leaf.parent.type in {'import_name', 'import_from'}: + if leaf.parent.type in {"import_name", "import_from"}: to_remove.append(leaf) for leaf in to_remove: diff --git a/src/soorgeon/magics.py b/src/soorgeon/magics.py index 056fab2..b9a801b 100644 --- a/src/soorgeon/magics.py +++ b/src/soorgeon/magics.py @@ -1,18 +1,18 @@ import copy import re -_IS_IPYTHON_CELL_MAGIC = r'^\s*%{2}[a-zA-Z]+' -_IS_IPYTHON_LINE_MAGIC = r'^\s*%{1}[a-zA-Z]+' -_IS_INLINE_SHELL = r'^\s*!{1}.+' +_IS_IPYTHON_CELL_MAGIC = r"^\s*%{2}[a-zA-Z]+" +_IS_IPYTHON_LINE_MAGIC = r"^\s*%{1}[a-zA-Z]+" +_IS_INLINE_SHELL = r"^\s*!{1}.+" -_IS_COMMENTED_LINE_MAGIC = r'^(.+) # \[magic\] (%.*)' +_IS_COMMENTED_LINE_MAGIC = r"^(.+) # \[magic\] (%.*)" -_PREFIX = '# [magic] ' +_PREFIX = "# [magic] " _PREFIX_LEN = len(_PREFIX) # these are magics that can modify the dependency structure beacuse they # may declare new variables or use existing ones as inputs -HAS_INLINE_PYTHON = {'%%capture', '%%timeit', '%%time', '%time', '%timeit'} +HAS_INLINE_PYTHON = {"%%capture", "%%timeit", "%%time", "%time", "%timeit"} def comment_magics(nb): @@ -22,8 +22,8 @@ def comment_magics(nb): nb = copy.deepcopy(nb) for cell in nb.cells: - if cell.cell_type == 'code': - cell['source'] = _comment_if_ipython_magic(cell['source']) + if cell.cell_type == "code": + cell["source"] = _comment_if_ipython_magic(cell["source"]) return nb @@ -35,32 
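For reference, a minimal usage sketch of the input/output detection above (assuming soorgeon and parso are importable; the expected set for find_inputs comes straight from its docstring, while the find_io mapping is an assumption based on the code shown):

    import parso
    from soorgeon.io import find_inputs, find_io

    # per the find_inputs docstring: function(x, y) -> {'function', 'x', 'y'}
    print(find_inputs(parso.parse("function(x, y)")))

    # find_io maps each snippet name to its (inputs, outputs); 'train'
    # should report 'x' as an input, since an earlier snippet declares it
    snippets = {"load": "x = 1", "train": "y = x + 1"}
    print(find_io(snippets))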
diff --git a/src/soorgeon/magics.py b/src/soorgeon/magics.py
index 056fab2..b9a801b 100644
--- a/src/soorgeon/magics.py
+++ b/src/soorgeon/magics.py
@@ -1,18 +1,18 @@
 import copy
 import re
 
-_IS_IPYTHON_CELL_MAGIC = r'^\s*%{2}[a-zA-Z]+'
-_IS_IPYTHON_LINE_MAGIC = r'^\s*%{1}[a-zA-Z]+'
-_IS_INLINE_SHELL = r'^\s*!{1}.+'
+_IS_IPYTHON_CELL_MAGIC = r"^\s*%{2}[a-zA-Z]+"
+_IS_IPYTHON_LINE_MAGIC = r"^\s*%{1}[a-zA-Z]+"
+_IS_INLINE_SHELL = r"^\s*!{1}.+"
 
-_IS_COMMENTED_LINE_MAGIC = r'^(.+) # \[magic\] (%.*)'
+_IS_COMMENTED_LINE_MAGIC = r"^(.+) # \[magic\] (%.*)"
 
-_PREFIX = '# [magic] '
+_PREFIX = "# [magic] "
 _PREFIX_LEN = len(_PREFIX)
 
 # these are magics that can modify the dependency structure because they
 # may declare new variables or use existing ones as inputs
-HAS_INLINE_PYTHON = {'%%capture', '%%timeit', '%%time', '%time', '%timeit'}
+HAS_INLINE_PYTHON = {"%%capture", "%%timeit", "%%time", "%time", "%timeit"}
 
 
 def comment_magics(nb):
@@ -22,8 +22,8 @@ def comment_magics(nb):
     nb = copy.deepcopy(nb)
 
     for cell in nb.cells:
-        if cell.cell_type == 'code':
-            cell['source'] = _comment_if_ipython_magic(cell['source'])
+        if cell.cell_type == "code":
+            cell["source"] = _comment_if_ipython_magic(cell["source"])
 
     return nb
 
@@ -35,32 +35,29 @@ def uncomment_magics(nb):
     nb = copy.deepcopy(nb)
 
     for cell in nb.cells:
-        if cell.cell_type == 'code':
-            cell['source'] = _uncomment_magics_cell(cell['source'])
+        if cell.cell_type == "code":
+            cell["source"] = _uncomment_magics_cell(cell["source"])
 
     return nb
 
 
 def _delete_magic(line):
-    """Returns an empty line if it starts with the # [magic] prefix
-    """
-    return '' if line.startswith(_PREFIX) else line
+    """Returns an empty line if it starts with the # [magic] prefix"""
+    return "" if line.startswith(_PREFIX) else line
 
 
 def _delete_magics_cell(source):
-    """Reverts the comments applied to magics (cell level)
-    """
+    """Reverts the comments applied to magics (cell level)"""
     if not source:
         return source
 
     lines = source.splitlines()
     lines_new = [_delete_magic(line) for line in lines]
-    return '\n'.join(lines_new).strip()
+    return "\n".join(lines_new).strip()
 
 
 def _uncomment_magic(line):
-    """Reverts the comments applied to magics (line level)
-    """
+    """Reverts the comments applied to magics (line level)"""
     if line.startswith(_PREFIX):
         return line[_PREFIX_LEN:]
 
@@ -68,23 +65,21 @@ def _uncomment_magic(line):
 
     if parts:
         code, magic = parts
-        return f'{magic} {code}'
+        return f"{magic} {code}"
     else:
         return line
 
 
 def _uncomment_magics_cell(source):
-    """Reverts the comments applied to magics (cell level)
-    """
+    """Reverts the comments applied to magics (cell level)"""
     lines = source.splitlines()
     lines_new = [_uncomment_magic(line) for line in lines]
-    return '\n'.join(lines_new)
+    return "\n".join(lines_new)
 
 
 def _comment(line):
-    """Adds the # [magic] prefix (line level)
-    """
-    return f'# [magic] {line}'
+    """Adds the # [magic] prefix (line level)"""
+    return f"# [magic] {line}"
 
 
 def _comment_ipython_line_magic(line, magic):
@@ -96,12 +91,11 @@ def _comment_ipython_line_magic(line, magic):
     Into:
     x = 1 # [magic] %timeit
     """
-    return line.replace(magic, '').strip() + f' # [magic] {magic.strip()}'
+    return line.replace(magic, "").strip() + f" # [magic] {magic.strip()}"
 
 
 def _comment_if_ipython_magic(source):
-    """Comments lines into comments if they're IPython magics (cell level)
-    """
+    """Comments lines into comments if they're IPython magics (cell level)"""
     # TODO: support for nested cell magics. e.g.,
     # %%timeit
     # %%timeit
@@ -152,7 +146,7 @@ def _comment_if_ipython_magic(source):
         else:
             lines_out.append(line)
 
-    return '\n'.join(lines_out)
+    return "\n".join(lines_out)
 
 
 # NOTE: the code in the following lines is based on Ploomber's source code.
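A quick round-trip sketch of the magic handling above (assuming nbformat is installed): comment_magics rewrites IPython magics as # [magic] comments so the cell parses as plain Python, and uncomment_magics restores them, following the _comment_ipython_line_magic docstring:

    import nbformat
    from soorgeon import magics

    nb = nbformat.v4.new_notebook()
    nb.cells = [nbformat.v4.new_code_cell("%timeit x = 1")]

    commented = magics.comment_magics(nb)
    # expected, per the docstring above: 'x = 1 # [magic] %timeit'
    print(commented.cells[0]["source"])

    restored = magics.uncomment_magics(commented)
    # the round trip should leave the original source intact
    assert restored.cells[0]["source"] == "%timeit x = 1"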
diff --git a/src/soorgeon/proto.py b/src/soorgeon/proto.py index 8900dcb..5d66e9b 100644 --- a/src/soorgeon/proto.py +++ b/src/soorgeon/proto.py @@ -1,6 +1,7 @@ """ ProtoTask handles the logic to convert a notebook section into a Ploomber task """ + from copy import deepcopy from pathlib import Path @@ -10,7 +11,8 @@ from soorgeon import io, magics -_PICKLING_TEMPLATE = Template("""\ +_PICKLING_TEMPLATE = Template( + """\ {%- for product in products -%} {%- if product.startswith('df') and df_format in ('parquet', 'csv') -%} Path(product['{{product}}']).parent.mkdir(exist_ok=True, parents=True) @@ -27,9 +29,11 @@ {%- endif %} {% endfor -%}\ -""") +""" +) -_UNPICKLING_TEMPLATE = Template("""\ +_UNPICKLING_TEMPLATE = Template( + """\ {%- for up, key in up_and_in -%} {%- if key.startswith('df') and df_format in ('parquet', 'csv') -%} {{key}} = pd.read_{{df_format}}(upstream['{{up}}']['{{key}}']) @@ -41,30 +45,30 @@ {{key}} = pickle.loads(Path(upstream['{{up}}']['{{key}}']).read_bytes()) {%- endif %} {% endfor -%}\ -""") +""" +) def _new_pickling_cell(outputs, df_format, serializer): - df_format = df_format or '' - source = _PICKLING_TEMPLATE.render(products=sorted(outputs), - df_format=df_format, - serializer=serializer).strip() + df_format = df_format or "" + source = _PICKLING_TEMPLATE.render( + products=sorted(outputs), df_format=df_format, serializer=serializer + ).strip() return nbformat.v4.new_code_cell(source=source) def _new_unpickling_cell(up_and_in, df_format, serializer): - df_format = df_format or '' - source = _UNPICKLING_TEMPLATE.render(up_and_in=sorted(up_and_in, - key=lambda t: - (t[0], t[1])), - df_format=df_format, - serializer=serializer).strip() + df_format = df_format or "" + source = _UNPICKLING_TEMPLATE.render( + up_and_in=sorted(up_and_in, key=lambda t: (t[0], t[1])), + df_format=df_format, + serializer=serializer, + ).strip() return nbformat.v4.new_code_cell(source=source) class ProtoTask: - """A group of cells that will be converted into a Ploomber task - """ + """A group of cells that will be converted into a Ploomber task""" def __init__(self, name, cells, df_format, serializer, py): self._name = name @@ -78,67 +82,64 @@ def name(self): return self._name def exposes(self): - """Return a list of variables that this prototask creates - """ + """Return a list of variables that this prototask creates""" pass def uses(self): - """Return a list of variables that this prototask uses - """ + """Return a list of variables that this prototask uses""" pass def _pickling_cell(self, io): - """Add cell that pickles the outputs - """ + """Add cell that pickles the outputs""" _, outputs = io[self.name] if outputs: - pickling = _new_pickling_cell(outputs, self._df_format, - self._serializer) - pickling.metadata['tags'] = ['soorgeon-pickle'] + pickling = _new_pickling_cell(outputs, self._df_format, self._serializer) + pickling.metadata["tags"] = ["soorgeon-pickle"] return pickling else: return None def _unpickling_cell(self, io, providers): - """Add cell that unpickles the inputs - """ + """Add cell that unpickles the inputs""" inputs, _ = io[self.name] if inputs: - up_and_in = [(providers.get(input_, self.name), input_) - for input_ in inputs] + up_and_in = [ + (providers.get(input_, self.name), input_) for input_ in inputs + ] - unpickling = _new_unpickling_cell(up_and_in, self._df_format, - self._serializer) - unpickling.metadata['tags'] = ['soorgeon-unpickle'] + unpickling = _new_unpickling_cell( + up_and_in, self._df_format, self._serializer + ) + unpickling.metadata["tags"] = 
["soorgeon-unpickle"] return unpickling else: return None def _add_parameters_cell(self, cells, upstream): - """Add parameters cell at the top - """ - source = '' + """Add parameters cell at the top""" + source = "" upstream_current = upstream[self.name] if upstream_current: - source += f'upstream = {list(upstream_current)}\n' + source += f"upstream = {list(upstream_current)}\n" else: - source += 'upstream = None\n' + source += "upstream = None\n" - source += 'product = None' + source += "product = None" parameters = nbformat.v4.new_code_cell(source=source) - parameters.metadata['tags'] = ['parameters'] + parameters.metadata["tags"] = ["parameters"] return [parameters] + cells - def _add_imports_cell(self, code_nb, add_pathlib_and_pickle, definitions, - df_format, serializer): + def _add_imports_cell( + self, code_nb, add_pathlib_and_pickle, definitions, df_format, serializer + ): # FIXME: instatiate this in the constructor so we only build it once ip = io.ImportsParser(code_nb) @@ -153,27 +154,27 @@ def _add_imports_cell(self, code_nb, add_pathlib_and_pickle, definitions, # FIXME: only add them if they're not already there if add_pathlib_and_pickle: - source = source or '' - source += '\nfrom pathlib import Path' - if serializer == 'cloudpickle': - source += '\nimport cloudpickle' - elif serializer == 'dill': - source += '\nimport dill' + source = source or "" + source += "\nfrom pathlib import Path" + if serializer == "cloudpickle": + source += "\nimport cloudpickle" + elif serializer == "dill": + source += "\nimport dill" else: - source += '\nimport pickle' + source += "\nimport pickle" # only add them if unserializing or serializing - if df_format in {'parquet', 'csv'}: - source += '\nimport pandas as pd' + if df_format in {"parquet", "csv"}: + source += "\nimport pandas as pd" if definitions: - names = ', '.join(definitions) - source = source or '' - source += f'\nfrom exported import {names}' + names = ", ".join(definitions) + source = source or "" + source += f"\nfrom exported import {names}" if source: cell = nbformat.v4.new_code_cell(source=source) - cell.metadata['tags'] = ['soorgeon-imports'] + cell.metadata["tags"] = ["soorgeon-imports"] return cell def export( @@ -203,12 +204,12 @@ def export( # remove import statements from code cells # FIXME: remove function definitions and class definitions for cell in cells: - if cell.cell_type == 'code': - cell['source'] = io.remove_imports(cell['source']) + if cell.cell_type == "code": + cell["source"] = io.remove_imports(cell["source"]) # remove empty cells and whitespace-only cells (we may have some after # removing imports) - cells = [cell for cell in cells if cell['source'].strip()] + cells = [cell for cell in cells if cell["source"].strip()] cell_unpickling = self._unpickling_cell(io_, providers) @@ -227,7 +228,8 @@ def export( add_pathlib_and_pickle=cell_pickling or cell_unpickling, definitions=definitions, df_format=self._df_format, - serializer=self._serializer) + serializer=self._serializer, + ) pre = [cell_imports] if cell_imports else [] @@ -243,13 +245,12 @@ def export( # ipynb has the kernelspec info if not self._py: nb_out.metadata.kernelspec = { - "display_name": 'Python 3', - "language": 'python', - "name": 'python3', + "display_name": "Python 3", + "language": "python", + "name": "python3", } - return jupytext.writes(nb_out, - fmt='py:percent' if self._py else 'ipynb') + return jupytext.writes(nb_out, fmt="py:percent" if self._py else "ipynb") def to_spec(self, io, product_prefix): """ @@ -264,29 +265,25 @@ def 
to_spec(self, io, product_prefix): # prefix products by name to guarantee they're unique products = { out: str( - Path(product_prefix, - _product_name(self.name, out, self._df_format))) + Path(product_prefix, _product_name(self.name, out, self._df_format)) + ) for out in outputs } # FIXME: check that there isn't an nb key already - products['nb'] = str(Path(product_prefix, f'{self.name}.ipynb')) + products["nb"] = str(Path(product_prefix, f"{self.name}.ipynb")) - ext = '.py' if self._py else '.ipynb' + ext = ".py" if self._py else ".ipynb" - return { - 'source': str(Path('tasks', self.name + ext)), - 'product': products - } + return {"source": str(Path("tasks", self.name + ext)), "product": products} def __str__(self): - """Retun the task as string (only code cells) - """ - return '\n'.join(cell['source'] for cell in self._cells - if cell.cell_type == 'code') + """Retun the task as string (only code cells)""" + return "\n".join( + cell["source"] for cell in self._cells if cell.cell_type == "code" + ) def _product_name(task, variable, df_format): - ext = ('pkl' - if not df_format or not variable.startswith('df') else df_format) - return f'{task}-{variable}.{ext}' + ext = "pkl" if not df_format or not variable.startswith("df") else df_format + return f"{task}-{variable}.{ext}" diff --git a/src/soorgeon/pyflakes.py b/src/soorgeon/pyflakes.py index b04f21c..ebb10a0 100644 --- a/src/soorgeon/pyflakes.py +++ b/src/soorgeon/pyflakes.py @@ -4,15 +4,21 @@ this functionality into soorgeon or create a ploomber-core package and move things over there """ + import warnings from io import StringIO from pyflakes import api as pyflakes_api from pyflakes.reporter import Reporter -from pyflakes.messages import (UndefinedName, UndefinedLocal, - DuplicateArgument, ReturnOutsideFunction, - YieldOutsideFunction, ContinueOutsideLoop, - BreakOutsideLoop) +from pyflakes.messages import ( + UndefinedName, + UndefinedLocal, + DuplicateArgument, + ReturnOutsideFunction, + YieldOutsideFunction, + ContinueOutsideLoop, + BreakOutsideLoop, +) from soorgeon.exceptions import InputWontRunError, InputSyntaxError @@ -29,7 +35,7 @@ def _process_messages(mesages): - return '\n'.join(str(msg) for msg in mesages) + return "\n".join(str(msg) for msg in mesages) def process_errors_and_warnings(messages): @@ -57,17 +63,15 @@ def __init__(self): def flake(self, message): self._stdout_raw.append(message) self._stdout.write(str(message)) - self._stdout.write('\n') + self._stdout.write("\n") def unexpectedError(self, *args, **kwargs): - """pyflakes calls this when ast.parse raises an unexpected error - """ + """pyflakes calls this when ast.parse raises an unexpected error""" self._unexpected = True return super().unexpectedError(*args, **kwargs) def syntaxError(self, *args, **kwargs): - """pyflakes calls this when ast.parse raises a SyntaxError - """ + """pyflakes calls this when ast.parse raises a SyntaxError""" self._syntax = True return super().syntaxError(*args, **kwargs) @@ -76,10 +80,12 @@ def _seek_zero(self): self._stderr.seek(0) def _make_error_message(self, error): - return ('Errors detected in your source code:' - f'\n{error}\n\n' - '(ensure that your notebook executes from top-to-bottom ' - 'and try again)') + return ( + "Errors detected in your source code:" + f"\n{error}\n\n" + "(ensure that your notebook executes from top-to-bottom " + "and try again)" + ) def _check(self): self._seek_zero() @@ -87,13 +93,15 @@ def _check(self): # syntax errors are stored in _stderr # 
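The naming convention implemented by _product_name is easiest to see with a couple of invocations (the task and variable names here are made up for illustration):

    # variables with a 'df' prefix honor --df-format; everything else
    # falls back to pickle
    _product_name("clean", "df", "parquet")  # -> 'clean-df.parquet'
    _product_name("clean", "model", "parquet")  # -> 'clean-model.pkl'
    _product_name("clean", "df", None)  # -> 'clean-df.pkl'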
diff --git a/src/soorgeon/pyflakes.py b/src/soorgeon/pyflakes.py
index b04f21c..ebb10a0 100644
--- a/src/soorgeon/pyflakes.py
+++ b/src/soorgeon/pyflakes.py
@@ -4,15 +4,21 @@
 this functionality into soorgeon or create a ploomber-core package and move
 things over there
 """
+
 import warnings
 from io import StringIO
 
 from pyflakes import api as pyflakes_api
 from pyflakes.reporter import Reporter
-from pyflakes.messages import (UndefinedName, UndefinedLocal,
-                               DuplicateArgument, ReturnOutsideFunction,
-                               YieldOutsideFunction, ContinueOutsideLoop,
-                               BreakOutsideLoop)
+from pyflakes.messages import (
+    UndefinedName,
+    UndefinedLocal,
+    DuplicateArgument,
+    ReturnOutsideFunction,
+    YieldOutsideFunction,
+    ContinueOutsideLoop,
+    BreakOutsideLoop,
+)
 
 from soorgeon.exceptions import InputWontRunError, InputSyntaxError
 
@@ -29,7 +35,7 @@ def _process_messages(mesages):
-    return '\n'.join(str(msg) for msg in mesages)
+    return "\n".join(str(msg) for msg in mesages)
 
 
 def process_errors_and_warnings(messages):
@@ -57,17 +63,15 @@ def __init__(self):
     def flake(self, message):
         self._stdout_raw.append(message)
         self._stdout.write(str(message))
-        self._stdout.write('\n')
+        self._stdout.write("\n")
 
     def unexpectedError(self, *args, **kwargs):
-        """pyflakes calls this when ast.parse raises an unexpected error
-        """
+        """pyflakes calls this when ast.parse raises an unexpected error"""
         self._unexpected = True
         return super().unexpectedError(*args, **kwargs)
 
     def syntaxError(self, *args, **kwargs):
-        """pyflakes calls this when ast.parse raises a SyntaxError
-        """
+        """pyflakes calls this when ast.parse raises a SyntaxError"""
         self._syntax = True
         return super().syntaxError(*args, **kwargs)
 
@@ -76,10 +80,12 @@ def _seek_zero(self):
         self._stderr.seek(0)
 
     def _make_error_message(self, error):
-        return ('Errors detected in your source code:'
-                f'\n{error}\n\n'
-                '(ensure that your notebook executes from top-to-bottom '
-                'and try again)')
+        return (
+            "Errors detected in your source code:"
+            f"\n{error}\n\n"
+            "(ensure that your notebook executes from top-to-bottom "
+            "and try again)"
+        )
 
     def _check(self):
         self._seek_zero()
@@ -87,13 +93,15 @@
         # syntax errors are stored in _stderr
         # https://github.com/PyCQA/pyflakes/blob/master/pyflakes/api.py
-        error_message = '\n'.join(self._stderr.readlines())
+        error_message = "\n".join(self._stderr.readlines())
 
         if self._syntax:
             raise InputSyntaxError(self._make_error_message(error_message))
         elif self._unexpected:
-            warnings.warn('An unexpected error happened '
-                          f'when analyzing code: {error_message.strip()!r}')
+            warnings.warn(
+                "An unexpected error happened "
+                f"when analyzing code: {error_message.strip()!r}"
+            )
         else:
             errors, warnings_ = process_errors_and_warnings(self._stdout_raw)
 
@@ -129,13 +137,12 @@ def check_notebook(nb):
         When certain pyflakes errors are detected (e.g., undefined name)
     """
     # concatenate all cell's source code in a single string
-    source_code = '\n'.join(c['source'] for c in nb.cells
-                            if c.cell_type == 'code')
+    source_code = "\n".join(c["source"] for c in nb.cells if c.cell_type == "code")
 
     # these objects are needed to capture pyflakes output
     reporter = MyReporter()
 
     # run pyflakes.api.check on the source code
-    pyflakes_api.check(source_code, filename='', reporter=reporter)
+    pyflakes_api.check(source_code, filename="", reporter=reporter)
 
     reporter._check()
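A minimal sketch of how check_notebook is used (assuming jupytext is available to build the notebook object): a notebook that references an undefined name should raise before any refactoring is attempted:

    import jupytext
    from soorgeon.pyflakes import check_notebook

    nb = jupytext.reads("y = x + 1", fmt="py:light")

    try:
        check_notebook(nb)
    except Exception as e:
        # expected, per the imports above: an InputWontRunError
        # reporting the undefined name 'x'
        print(e)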
\n" + "Learn more: https://github.com/" + "ploomber/soorgeon/blob/main/doc/guide.md\n", + fg="yellow", + ) return breaks @@ -74,18 +81,17 @@ def split_with_breaks(cells, breaks): def names_with_breaks(cells, breaks): - return [_get_h2_header(cells[break_]['source']) for break_ in breaks] + return [_get_h2_header(cells[break_]["source"]) for break_ in breaks] def _sanitize_name(name): - """Sanitize content of an H2 header to be used as a filename - """ + """Sanitize content of an H2 header to be used as a filename""" # replace all non-aplhanumeric with a dash - sanitized = re.sub('[^0-9a-zA-Z]+', '-', name.lower()) + sanitized = re.sub("[^0-9a-zA-Z]+", "-", name.lower()) # argo does not allow names to start with a digit when using dependencies if sanitized[0] in string.digits: - sanitized = 'section-' + sanitized + sanitized = "section-" + sanitized return sanitized @@ -110,7 +116,7 @@ def _get_header(md): return _get_header -_get_h1_header = _get_header_factory(r'^\s*#\s+(.+)') +_get_h1_header = _get_header_factory(r"^\s*#\s+(.+)") -_get_h2_header = _get_header_factory(r'^\s*##\s+(.+)') +_get_h2_header = _get_header_factory(r"^\s*##\s+(.+)") diff --git a/tasks.py b/tasks.py index c87aa8c..162b286 100644 --- a/tasks.py +++ b/tasks.py @@ -7,11 +7,10 @@ @task def test(c, nbs=False): - """Run unit tests + flake8 - """ - args = '' if nbs else '--ignore=tests/test_sample_notebooks.py' - c.run(f'pytest {args}', pty=True) - c.run('flake8') + """Run unit tests + flake8""" + args = "" if nbs else "--ignore=tests/test_sample_notebooks.py" + c.run(f"pytest {args}", pty=True) + c.run("flake8") @task @@ -19,61 +18,63 @@ def setup(c, version=None): """ Setup dev environment, requires conda """ - version = version or '3.9' - suffix = '' if version == '3.9' else version.replace('.', '') - env_name = f'soorgeon{suffix}' + version = version or "3.9" + suffix = "" if version == "3.9" else version.replace(".", "") + env_name = f"soorgeon{suffix}" - c.run(f'conda create --name {env_name} python={version} --yes') - c.run('eval "$(conda shell.bash hook)" ' - f'&& conda activate {env_name} ' - '&& pip install --editable .[dev]') + c.run(f"conda create --name {env_name} python={version} --yes") + c.run( + 'eval "$(conda shell.bash hook)" ' + f"&& conda activate {env_name} " + "&& pip install --editable .[dev]" + ) - print(f'Done! Activate your environment with:\nconda activate {env_name}') + print(f"Done! Activate your environment with:\nconda activate {env_name}") -@task(aliases=['v']) +@task(aliases=["v"]) def version(c): - """Create a new version of this project - """ + """Create a new version of this project""" from pkgmt import versioneer - versioneer.version(project_root='.', tag=True) + versioneer.version(project_root=".", tag=True) -@task(aliases=['r']) + +@task(aliases=["r"]) def release(c, tag, production=True): - """Upload to PyPI (prod by default): inv upload {tag} - """ + """Upload to PyPI (prod by default): inv upload {tag}""" from pkgmt import versioneer + versioneer.upload(tag, production=production) @task def install_git_hook(c, force=False): - """Installs pre-push git hook - """ - path = Path('.git/hooks/pre-push') + """Installs pre-push git hook""" + path = Path(".git/hooks/pre-push") hook_exists = path.is_file() if hook_exists: if force: path.unlink() else: - sys.exit('Error: pre-push hook already exists. ' - 'Run: "invoke install-git-hook -f" to force overwrite.') + sys.exit( + "Error: pre-push hook already exists. " + 'Run: "invoke install-git-hook -f" to force overwrite.' 
diff --git a/tasks.py b/tasks.py
index c87aa8c..162b286 100644
--- a/tasks.py
+++ b/tasks.py
@@ -7,11 +7,10 @@
 
 @task
 def test(c, nbs=False):
-    """Run unit tests + flake8
-    """
-    args = '' if nbs else '--ignore=tests/test_sample_notebooks.py'
-    c.run(f'pytest {args}', pty=True)
-    c.run('flake8')
+    """Run unit tests + flake8"""
+    args = "" if nbs else "--ignore=tests/test_sample_notebooks.py"
+    c.run(f"pytest {args}", pty=True)
+    c.run("flake8")
 
 
 @task
@@ -19,61 +18,63 @@ def setup(c, version=None):
     """
     Setup dev environment, requires conda
     """
-    version = version or '3.9'
-    suffix = '' if version == '3.9' else version.replace('.', '')
-    env_name = f'soorgeon{suffix}'
+    version = version or "3.9"
+    suffix = "" if version == "3.9" else version.replace(".", "")
+    env_name = f"soorgeon{suffix}"
 
-    c.run(f'conda create --name {env_name} python={version} --yes')
-    c.run('eval "$(conda shell.bash hook)" '
-          f'&& conda activate {env_name} '
-          '&& pip install --editable .[dev]')
+    c.run(f"conda create --name {env_name} python={version} --yes")
+    c.run(
+        'eval "$(conda shell.bash hook)" '
+        f"&& conda activate {env_name} "
+        "&& pip install --editable .[dev]"
+    )
 
-    print(f'Done! Activate your environment with:\nconda activate {env_name}')
+    print(f"Done! Activate your environment with:\nconda activate {env_name}")
 
 
-@task(aliases=['v'])
+@task(aliases=["v"])
 def version(c):
-    """Create a new version of this project
-    """
+    """Create a new version of this project"""
     from pkgmt import versioneer
 
-    versioneer.version(project_root='.', tag=True)
+    versioneer.version(project_root=".", tag=True)
 
-@task(aliases=['r'])
+
+@task(aliases=["r"])
 def release(c, tag, production=True):
-    """Upload to PyPI (prod by default): inv upload {tag}
-    """
+    """Upload to PyPI (prod by default): inv upload {tag}"""
     from pkgmt import versioneer
+
     versioneer.upload(tag, production=production)
 
 
 @task
 def install_git_hook(c, force=False):
-    """Installs pre-push git hook
-    """
-    path = Path('.git/hooks/pre-push')
+    """Installs pre-push git hook"""
+    path = Path(".git/hooks/pre-push")
     hook_exists = path.is_file()
 
     if hook_exists:
         if force:
             path.unlink()
         else:
-            sys.exit('Error: pre-push hook already exists. '
-                     'Run: "invoke install-git-hook -f" to force overwrite.')
+            sys.exit(
+                "Error: pre-push hook already exists. "
+                'Run: "invoke install-git-hook -f" to force overwrite.'
+            )
 
-    shutil.copy('.githooks/pre-push', '.git/hooks')
-    print(f'pre-push hook installed at {str(path)}')
+    shutil.copy(".githooks/pre-push", ".git/hooks")
+    print(f"pre-push hook installed at {str(path)}")
 
 
 @task
 def uninstall_git_hook(c):
-    """Uninstalls pre-push git hook
-    """
-    path = Path('.git/hooks/pre-push')
+    """Uninstalls pre-push git hook"""
+    path = Path(".git/hooks/pre-push")
     hook_exists = path.is_file()
 
     if hook_exists:
         path.unlink()
-        print(f'Deleted {str(path)}.')
+        print(f"Deleted {str(path)}.")
     else:
-        print('Hook doesn\'t exist, nothing to delete.')
+        print("Hook doesn't exist, nothing to delete.")
diff --git a/tests/assets/nb-ml.py b/tests/assets/nb-ml.py
index c76744d..77d228d 100644
--- a/tests/assets/nb-ml.py
+++ b/tests/assets/nb-ml.py
@@ -24,12 +24,12 @@
 import matplotlib as mpl
 
 # %%
-plt.style.use('ggplot')
-mpl.rcParams['figure.figsize'] = (12, 8)
+plt.style.use("ggplot")
+mpl.rcParams["figure.figsize"] = (12, 8)
 
 # %%
 ca_housing = datasets.fetch_california_housing(as_frame=True)
-df = ca_housing['frame']
+df = ca_housing["frame"]
 
 # %%
 df.head()
@@ -66,14 +66,13 @@
 from sklearn.model_selection import train_test_split  # noqa
 
 # %%
-X = df.drop('MedHouseVal', axis='columns')
+X = df.drop("MedHouseVal", axis="columns")
 y = df.MedHouseVal
 
 # %%
-X_train, X_test, y_train, y_test = train_test_split(X,
-                                                    y,
-                                                    test_size=0.33,
-                                                    random_state=42)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.33, random_state=42
+)
 
 # %% [markdown]
 # ## Linear regression
diff --git a/tests/conftest.py b/tests/conftest.py
index 4f729d3..528bd09 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -14,16 +14,16 @@ def path_to_tests():
 
 
 def path_to_assets():
-    return path_to_tests() / 'assets'
+    return path_to_tests() / "assets"
 
 
 def read_nb(name):
-    path = path_to_assets() / f'nb-{name}.py'
+    path = path_to_assets() / f"nb-{name}.py"
     return Path(path).read_text()
 
 
 def read_snippets(name):
-    ne = NotebookExporter(jupytext.reads(read_nb(name), fmt='py:percent'))
+    ne = NotebookExporter(jupytext.reads(read_nb(name), fmt="py:percent"))
     return ne._snippets
 
 
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 322d906..2502dee 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -25,36 +25,41 @@
 """
 
 
-@pytest.mark.parametrize('args, product_prefix', [
-    [['nb.py'], 'output'],
-    [['nb.py', '--product-prefix', 'another'], 'another'],
-    [['nb.py', '-p', 'another'], 'another'],
-])
+@pytest.mark.parametrize(
+    "args, product_prefix",
+    [
+        [["nb.py"], "output"],
+        [["nb.py", "--product-prefix", "another"], "another"],
+        [["nb.py", "-p", "another"], "another"],
+    ],
+)
 def test_refactor_product_prefix(tmp_empty, args, product_prefix):
-    Path('nb.py').write_text(simple)
+    Path("nb.py").write_text(simple)
 
     runner = CliRunner()
     result = runner.invoke(cli.refactor, args)
 
-    spec = DAGSpec('pipeline.yaml')
+    spec = DAGSpec("pipeline.yaml")
 
     paths = [
-        i for product in [t['product'].values() for t in spec['tasks']]
-        for i in product
+        i for product in [t["product"].values() for t in spec["tasks"]] for i in product
    ]
 
     assert result.exit_code == 0
     assert all([p.startswith(product_prefix) for p in paths])
 
 
-@pytest.mark.parametrize('input_, out_ext, args', [
-    ['nb.py', 'py', ['nb.py']],
-    ['nb.ipynb', 'ipynb', ['nb.ipynb']],
-    ['nb.py', 'ipynb', ['nb.py', '--file-format', 'ipynb']],
-    ['nb.ipynb', 'py', ['nb.ipynb', '--file-format', 'py']],
-])
+@pytest.mark.parametrize(
+    "input_, out_ext, args",
+    [
+        ["nb.py", "py", ["nb.py"]],
+        ["nb.ipynb", "ipynb", ["nb.ipynb"]],
+        ["nb.py", 
"ipynb", ["nb.py", "--file-format", "ipynb"]], + ["nb.ipynb", "py", ["nb.ipynb", "--file-format", "py"]], + ], +) def test_refactor_file_format(tmp_empty, input_, out_ext, args): - jupytext.write(jupytext.reads(simple, fmt='py:light'), input_) + jupytext.write(jupytext.reads(simple, fmt="py:light"), input_) runner = CliRunner() result = runner.invoke(cli.refactor, args) @@ -63,9 +68,9 @@ def test_refactor_file_format(tmp_empty, input_, out_ext, args): # test the output file has metadata, otherwise it may fail to execute # if missing the kernelspec info - assert jupytext.read(Path('tasks', f'cell-0.{out_ext}')).metadata - assert jupytext.read(Path('tasks', f'cell-2.{out_ext}')).metadata - assert jupytext.read(Path('tasks', f'cell-4.{out_ext}')).metadata + assert jupytext.read(Path("tasks", f"cell-0.{out_ext}")).metadata + assert jupytext.read(Path("tasks", f"cell-2.{out_ext}")).metadata + assert jupytext.read(Path("tasks", f"cell-4.{out_ext}")).metadata with_dfs = """\ @@ -92,71 +97,73 @@ def test_refactor_file_format(tmp_empty, input_, out_ext, args): """ -@pytest.mark.parametrize('args, ext, requirements', [ - [['nb.py'], 'pkl', 'ploomber>=0.14.7'], - [['nb.py', '--df-format', 'parquet'], 'parquet', - 'ploomber>=0.14.7\npyarrow'], - [['nb.py', '--df-format', 'csv'], 'csv', 'ploomber>=0.14.7'], -], - ids=[ - 'none', - 'parquet', - 'csv', - ]) -@pytest.mark.parametrize('nb, products_expected', [ +@pytest.mark.parametrize( + "args, ext, requirements", [ - simple, - [ - 'output/cell-0-x.pkl', - 'output/cell-0.ipynb', - 'output/cell-2-y.pkl', - 'output/cell-2.ipynb', - 'output/cell-4.ipynb', - ] + [["nb.py"], "pkl", "ploomber>=0.14.7"], + [["nb.py", "--df-format", "parquet"], "parquet", "ploomber>=0.14.7\npyarrow"], + [["nb.py", "--df-format", "csv"], "csv", "ploomber>=0.14.7"], ], - [ - with_dfs, - [ - 'output/first-df.{ext}', - 'output/first.ipynb', - 'output/second.ipynb', - ] + ids=[ + "none", + "parquet", + "csv", ], +) +@pytest.mark.parametrize( + "nb, products_expected", [ - mixed, [ - 'output/first-x.pkl', - 'output/first-df.{ext}', - 'output/first.ipynb', - 'output/second.ipynb', - ] + simple, + [ + "output/cell-0-x.pkl", + "output/cell-0.ipynb", + "output/cell-2-y.pkl", + "output/cell-2.ipynb", + "output/cell-4.ipynb", + ], + ], + [ + with_dfs, + [ + "output/first-df.{ext}", + "output/first.ipynb", + "output/second.ipynb", + ], + ], + [ + mixed, + [ + "output/first-x.pkl", + "output/first-df.{ext}", + "output/first.ipynb", + "output/second.ipynb", + ], + ], + ], + ids=[ + "simple", + "with-dfs", + "mixed", ], -], - ids=[ - 'simple', - 'with-dfs', - 'mixed', - ]) -def test_refactor_df_format(tmp_empty, args, ext, nb, products_expected, - requirements): - Path('nb.py').write_text(nb) +) +def test_refactor_df_format(tmp_empty, args, ext, nb, products_expected, requirements): + Path("nb.py").write_text(nb) runner = CliRunner() result = runner.invoke(cli.refactor, args) - spec = DAGSpec('pipeline.yaml') + spec = DAGSpec("pipeline.yaml") paths = [ - i for product in [t['product'].values() for t in spec['tasks']] - for i in product + i for product in [t["product"].values() for t in spec["tasks"]] for i in product ] assert result.exit_code == 0 assert set(paths) == set(p.format(ext=ext) for p in products_expected) - content = ('# Auto-generated file' - f', may need manual editing\n{requirements}\n') - assert Path('requirements.txt').read_text() == content + content = "# Auto-generated file" f", may need manual editing\n{requirements}\n" + assert Path("requirements.txt").read_text() == 
content with_dfs = """\ @@ -182,49 +189,53 @@ def test_refactor_df_format(tmp_empty, args, ext, nb, products_expected, """ -@pytest.mark.parametrize('args, requirements', [ - [['nb.py', '--serializer', 'cloudpickle'], 'cloudpickle\nploomber>=0.14.7' - ], - [['nb.py', '--serializer', 'dill'], 'dill\nploomber>=0.14.7'], -], - ids=['cloudpickle', 'dill']) -@pytest.mark.parametrize('nb, products_expected', [ +@pytest.mark.parametrize( + "args, requirements", [ - with_dfs, - [ - 'output/first-df.pkl', - 'output/first.ipynb', - 'output/second.ipynb', - ] + [["nb.py", "--serializer", "cloudpickle"], "cloudpickle\nploomber>=0.14.7"], + [["nb.py", "--serializer", "dill"], "dill\nploomber>=0.14.7"], ], + ids=["cloudpickle", "dill"], +) +@pytest.mark.parametrize( + "nb, products_expected", [ - with_lambda, [ - 'output/first-num_square.pkl', 'output/first.ipynb', - 'output/second.ipynb', 'output/second-num_square.pkl' - ] + with_dfs, + [ + "output/first-df.pkl", + "output/first.ipynb", + "output/second.ipynb", + ], + ], + [ + with_lambda, + [ + "output/first-num_square.pkl", + "output/first.ipynb", + "output/second.ipynb", + "output/second-num_square.pkl", + ], + ], ], -], - ids=['with-dfs', 'with-lambda']) -def test_refactor_serializer(tmp_empty, args, nb, products_expected, - requirements): - Path('nb.py').write_text(nb) + ids=["with-dfs", "with-lambda"], +) +def test_refactor_serializer(tmp_empty, args, nb, products_expected, requirements): + Path("nb.py").write_text(nb) runner = CliRunner() result = runner.invoke(cli.refactor, args) - spec = DAGSpec('pipeline.yaml') + spec = DAGSpec("pipeline.yaml") paths = [ - i for product in [t['product'].values() for t in spec['tasks']] - for i in product + i for product in [t["product"].values() for t in spec["tasks"]] for i in product ] assert set(paths) == set(products_expected) assert result.exit_code == 0 - content = ('# Auto-generated file' - f', may need manual editing\n{requirements}\n') - assert Path('requirements.txt').read_text() == content + content = "# Auto-generated file" f", may need manual editing\n{requirements}\n" + assert Path("requirements.txt").read_text() == content imports_pyarrow = """\ @@ -265,51 +276,57 @@ def test_refactor_serializer(tmp_empty, args, nb, products_expected, """ -@pytest.mark.parametrize('nb, requirements', [ - [imports_pyarrow, 'ploomber>=0.14.7\npyarrow'], - [imports_fastparquet, 'fastparquet\nploomber>=0.14.7'], - [imports_nothing, 'ploomber>=0.14.7\npyarrow'], -], - ids=[ - 'pyarrow', - 'fastparquet', - 'nothing', - ]) +@pytest.mark.parametrize( + "nb, requirements", + [ + [imports_pyarrow, "ploomber>=0.14.7\npyarrow"], + [imports_fastparquet, "fastparquet\nploomber>=0.14.7"], + [imports_nothing, "ploomber>=0.14.7\npyarrow"], + ], + ids=[ + "pyarrow", + "fastparquet", + "nothing", + ], +) def test_refactor_parquet_requirements(tmp_empty, nb, requirements): - Path('nb.py').write_text(nb) + Path("nb.py").write_text(nb) runner = CliRunner() - result = runner.invoke(cli.refactor, ['nb.py', '--df-format', 'parquet']) + result = runner.invoke(cli.refactor, ["nb.py", "--df-format", "parquet"]) assert result.exit_code == 0 - content = ('# Auto-generated file' - f', may need manual editing\n{requirements}\n') - assert Path('requirements.txt').read_text() == content + content = "# Auto-generated file" f", may need manual editing\n{requirements}\n" + assert Path("requirements.txt").read_text() == content -@pytest.mark.parametrize('input_, backup, file_format, source', [ - ['nb.ipynb', 'nb-backup.ipynb', [], 'nb.ipynb'], - 
['nb.py', 'nb-backup.py', [], 'nb.py'], - ['nb.ipynb', 'nb-backup.ipynb', ['--file-format', 'py'], 'nb.py'], - ['nb.py', 'nb-backup.py', ['--file-format', 'ipynb'], 'nb.ipynb'], -]) +@pytest.mark.parametrize( + "input_, backup, file_format, source", + [ + ["nb.ipynb", "nb-backup.ipynb", [], "nb.ipynb"], + ["nb.py", "nb-backup.py", [], "nb.py"], + ["nb.ipynb", "nb-backup.ipynb", ["--file-format", "py"], "nb.py"], + ["nb.py", "nb-backup.py", ["--file-format", "ipynb"], "nb.ipynb"], + ], +) def test_single_task(tmp_empty, input_, backup, file_format, source): - jupytext.write(jupytext.reads(simple, fmt='py:light'), input_) + jupytext.write(jupytext.reads(simple, fmt="py:light"), input_) runner = CliRunner() - result = runner.invoke(cli.refactor, - [input_, '--single-task'] + file_format) + result = runner.invoke(cli.refactor, [input_, "--single-task"] + file_format) assert result.exit_code == 0 - with Path('pipeline.yaml').open() as f: + with Path("pipeline.yaml").open() as f: spec = yaml.safe_load(f) assert spec == { - 'tasks': [{ - 'source': source, - 'product': 'products/nb-report.ipynb', - }] + "tasks": [ + { + "source": source, + "product": "products/nb-report.ipynb", + } + ] } # test the output file has metadata, otherwise it may fail to execute @@ -318,81 +335,89 @@ def test_single_task(tmp_empty, input_, backup, file_format, source): assert jupytext.read(Path(backup)).metadata -@pytest.mark.parametrize('code', [ - """ +@pytest.mark.parametrize( + "code", + [ + """ # ## header if something pass -""", """ +""", + """ # ## header y = x + 1 -""" -], - ids=[ - 'syntax-error', - 'undefined-name', - ]) +""", + ], + ids=[ + "syntax-error", + "undefined-name", + ], +) def test_doesnt_suggest_single_task_if_nb_cannot_run(tmp_empty, code): - Path('nb.py').write_text(code) + Path("nb.py").write_text(code) runner = CliRunner() - result = runner.invoke(cli.refactor, ['nb.py']) + result = runner.invoke(cli.refactor, ["nb.py"]) assert result.exit_code == 1 - assert 'soorgeon refactor nb.py --single-task' not in result.output + assert "soorgeon refactor nb.py --single-task" not in result.output -@pytest.mark.parametrize('code', [ - """ +@pytest.mark.parametrize( + "code", + [ + """ from math import * -""", """ +""", + """ y = 1 def x(): return y -""", """ +""", + """ x = 1 -""" -], - ids=[ - 'star-import', - 'fn-with-global-vars', - 'missing-h2-heading', - ]) +""", + ], + ids=[ + "star-import", + "fn-with-global-vars", + "missing-h2-heading", + ], +) def test_doesnt_suggest_single_task_if_nb_can_run(tmp_empty, code): - Path('nb.py').write_text(code) + Path("nb.py").write_text(code) runner = CliRunner() - result = runner.invoke(cli.refactor, ['nb.py']) + result = runner.invoke(cli.refactor, ["nb.py"]) assert result.exit_code == 1 - assert 'soorgeon refactor nb.py --single-task' in result.output + assert "soorgeon refactor nb.py --single-task" in result.output def test_suggests_single_task_if_export_crashes(tmp_empty, monkeypatch): - monkeypatch.setattr(export.NotebookExporter, 'export', - Mock(side_effect=KeyError)) + monkeypatch.setattr(export.NotebookExporter, "export", Mock(side_effect=KeyError)) - Path('nb.py').write_text(simple) + Path("nb.py").write_text(simple) runner = CliRunner() - result = runner.invoke(cli.refactor, ['nb.py']) + result = runner.invoke(cli.refactor, ["nb.py"]) assert result.exit_code == 1 - assert 'soorgeon refactor nb.py --single-task' in result.output + assert "soorgeon refactor nb.py --single-task" in result.output # adds import if needed / and doesn't add import pickle 
def test_clean_py(tmp_empty): - Path('nb.py').write_text(simple) + Path("nb.py").write_text(simple) runner = CliRunner() - runner.invoke(cli.refactor, ['nb.py']) - result = runner.invoke(cli.clean, ['tasks/cell-2.py']) + runner.invoke(cli.refactor, ["nb.py"]) + result = runner.invoke(cli.clean, ["tasks/cell-2.py"]) assert result.exit_code == 0 # black assert "Reformatted tasks/cell-2.py with black" in result.output @@ -401,12 +426,12 @@ def test_clean_py(tmp_empty): def test_clean_ipynb(tmp_empty): - nb_ = jupytext.reads(simple, fmt='py:light') - jupytext.write(nb_, 'nb.ipynb') + nb_ = jupytext.reads(simple, fmt="py:light") + jupytext.write(nb_, "nb.ipynb") runner = CliRunner() - runner.invoke(cli.refactor, ['nb.ipynb']) - result = runner.invoke(cli.clean, ['tasks/cell-2.ipynb']) + runner.invoke(cli.refactor, ["nb.ipynb"]) + result = runner.invoke(cli.clean, ["tasks/cell-2.ipynb"]) assert result.exit_code == 0 # black @@ -415,9 +440,11 @@ def test_clean_ipynb(tmp_empty): assert "Finished cleaning tasks/cell-2.ipynb" in result.output -@pytest.mark.parametrize('content, fmt', [ +@pytest.mark.parametrize( + "content, fmt", [ - """ + [ + """ ```python import soorgeon import atexit @@ -425,10 +452,10 @@ def test_clean_ipynb(tmp_empty): s = 'something' ``` """, - 'markdown', - ], - [ - """\ + "markdown", + ], + [ + """\ --- jupytext: text_representation: @@ -443,18 +470,19 @@ def test_clean_ipynb(tmp_empty): s = 'something' ``` """, - 'myst', + "myst", + ], + ], + ids=[ + "md", + "myst", ], -], - ids=[ - 'md', - 'myst', - ]) +) def test_clean_markdown(tmp_empty, content, fmt): - Path('file.md').write_text(content) + Path("file.md").write_text(content) runner = CliRunner() - result = runner.invoke(cli.clean, ['file.md']) + result = runner.invoke(cli.clean, ["file.md"]) assert result.exit_code == 0 # black @@ -462,26 +490,31 @@ def test_clean_markdown(tmp_empty, content, fmt): # end of basic_clean() assert "Finished cleaning file.md" in result.output - metadata = jupytext.formats.read_metadata( - Path('file.md').read_text(), 'md') + metadata = jupytext.formats.read_metadata(Path("file.md").read_text(), "md") if metadata: - fmt_read = metadata['jupytext']['text_representation']['format_name'] + fmt_read = metadata["jupytext"]["text_representation"]["format_name"] assert fmt_read == fmt -@pytest.mark.parametrize('name, content', [ - ['file.py', 'import math'], - ['file.md', """ +@pytest.mark.parametrize( + "name, content", + [ + ["file.py", "import math"], + [ + "file.md", + """ ```python import math ``` -"""], -], - ids=[ - 'py', - 'md', - ]) +""", + ], + ], + ids=[ + "py", + "md", + ], +) def test_lint(tmp_empty, name, content): Path(name).write_text(content) @@ -493,12 +526,12 @@ def test_lint(tmp_empty, name, content): def test_clean_no_task(tmp_empty): - nb_ = jupytext.reads(simple, fmt='py:light') - jupytext.write(nb_, 'nb.ipynb') + nb_ = jupytext.reads(simple, fmt="py:light") + jupytext.write(nb_, "nb.ipynb") runner = CliRunner() - runner.invoke(cli.refactor, ['nb.ipynb']) - result = runner.invoke(cli.clean, ['tasks/cell-9.ipynb']) + runner.invoke(cli.refactor, ["nb.ipynb"]) + result = runner.invoke(cli.clean, ["tasks/cell-9.ipynb"]) assert result.exit_code == 2 assert "Error: Invalid value for 'FILENAME'" in result.output @@ -548,33 +581,33 @@ def test_clean_no_task(tmp_empty): def test_refactor_product_should_warning_if_notebook_output_file(tmp_empty): - Path('nb.py').write_text(output_test) - args = 'nb.py' + Path("nb.py").write_text(output_test) + args = "nb.py" runner = CliRunner() 
result = runner.invoke(cli.refactor, args) assert result.exit_code == 0 - assert 'open' in result.output - assert 'to_csv' in result.output - assert 'to_parquet' in result.output - assert 'write_text' in result.output - assert 'write_bytes' in result.output + assert "open" in result.output + assert "to_csv" in result.output + assert "to_parquet" in result.output + assert "write_text" in result.output + assert "write_bytes" in result.output def test_refactor_product_should_not_warning_if_comment(tmp_empty): - Path('nb.py').write_text(output_with_comment_test) - args = 'nb.py' + Path("nb.py").write_text(output_with_comment_test) + args = "nb.py" runner = CliRunner() result = runner.invoke(cli.refactor, args) assert result.exit_code == 0 - assert 'open' not in result.output - assert 'to_csv' in result.output - assert 'to_parquet' not in result.output - assert 'write_text' in result.output - assert 'write_bytes' not in result.output + assert "open" not in result.output + assert "to_csv" in result.output + assert "to_parquet" not in result.output + assert "write_text" in result.output + assert "write_bytes" not in result.output ModuleNotFoundError_sample = """ @@ -611,14 +644,17 @@ def test_refactor_product_should_not_warning_if_comment(tmp_empty): @pytest.mark.parametrize( - 'code, output', - [[simple, "no error encountered"], - [ModuleNotFoundError_sample, "packages are missing, please install them"], - [AttributeError_sample, "might be due to changes in the libraries"], - [SyntaxError_sample, "There are syntax errors in the notebook"]]) + "code, output", + [ + [simple, "no error encountered"], + [ModuleNotFoundError_sample, "packages are missing, please install them"], + [AttributeError_sample, "might be due to changes in the libraries"], + [SyntaxError_sample, "There are syntax errors in the notebook"], + ], +) def test_test_notebook_runs(tmp_empty, code, output): - nb_ = jupytext.reads(code, fmt='py:light') - filenames = ['nb.ipynb', 'nb.py'] + nb_ = jupytext.reads(code, fmt="py:light") + filenames = ["nb.ipynb", "nb.py"] output_paths = ["nb-output.ipynb", None] for filename in filenames: for output_path in output_paths: @@ -629,7 +665,7 @@ def test_test_notebook_runs(tmp_empty, code, output): expected_output_path = output_path result = runner.invoke(cli.test, [filename, output_path]) else: - expected_output_path = 'nb-soorgeon-test.ipynb' + expected_output_path = "nb-soorgeon-test.ipynb" result = runner.invoke(cli.test, [filename]) if output == "no error encountered": assert result.exit_code == 0 diff --git a/tests/test_definitions.py b/tests/test_definitions.py index ab9c239..273530e 100644 --- a/tests/test_definitions.py +++ b/tests/test_definitions.py @@ -49,37 +49,35 @@ """ -@pytest.mark.parametrize('code, expected', [ +@pytest.mark.parametrize( + "code, expected", [ - simple_imports, { - 'np': 'import numpy as np', - 'pd': '\nimport pandas as pd' - } + [simple_imports, {"np": "import numpy as np", "pd": "\nimport pandas as pd"}], ], -]) +) def test_from_imports(code, expected): assert definitions.from_imports(parso.parse(code)) == expected @pytest.mark.parametrize( - 'code, expected', + "code, expected", [ [ simple_imports, [ - 'numpy', - 'pandas', + "numpy", + "pandas", ], ], [ mixed_imports, [ - 'another', - 'matplotlib', - 'numpy', - 'pandas', - 'scikit-learn', - ] + "another", + "matplotlib", + "numpy", + "pandas", + "scikit-learn", + ], ], [ relative_imports, @@ -87,35 +85,35 @@ def test_from_imports(code, expected): ], [ duplicated_imports, - ['scikit-learn'], + 
["scikit-learn"], ], [ comma_imports, - ['scikit-learn'], + ["scikit-learn"], ], [ two_import_as, [ - 'matplotlib', - 'numpy', + "matplotlib", + "numpy", ], ], [ two_imports, [ - 'numpy', - 'pandas', + "numpy", + "pandas", ], ], ], ids=[ - 'simple', - 'mixed', - 'relative', - 'duplicated', - 'comma', - 'two-import-as', - 'two-imports', + "simple", + "mixed", + "relative", + "duplicated", + "comma", + "two-import-as", + "two-imports", ], ) def test_packages_used(code, expected): @@ -123,30 +121,34 @@ def test_packages_used(code, expected): @pytest.mark.parametrize( - 'code, expected', - [[""" + "code, expected", + [ + [ + """ def x(): pass -""", { - 'x': '\ndef x():\n pass' - }], [""" +""", + {"x": "\ndef x():\n pass"}, + ], + [ + """ class X: pass -""", { - 'X': '\nclass X:\n pass' - }], - [ - """ +""", + {"X": "\nclass X:\n pass"}, + ], + [ + """ def x(): pass class X: pass -""", { - 'X': '\nclass X:\n pass', - 'x': '\ndef x():\n pass' - } - ]]) +""", + {"X": "\nclass X:\n pass", "x": "\ndef x():\n pass"}, + ], + ], +) def test_find_defined_names_from_def_and_class(code, expected): - out = (definitions.from_def_and_class(parso.parse(code))) + out = definitions.from_def_and_class(parso.parse(code)) assert out == expected diff --git a/tests/test_detect.py b/tests/test_detect.py index 01ef8d6..0230872 100644 --- a/tests/test_detect.py +++ b/tests/test_detect.py @@ -6,145 +6,166 @@ from soorgeon import detect -@pytest.mark.parametrize('code, expected', [ - ['x = 1', True], - ['x, y = 1, 2', True], - ['y, x = 1, 2', True], - ['(x, y) = 1, 2', True], - ['(y, x) = 1, 2', True], - ['[x, y] = 1, 2', True], - ['[y, x] = 1, 2', True], - ['(z, (y, x)) = 1, (2, 3)', True], - ['x(1)', False], - ['something(x)', False], - ['x == 1', False], -]) +@pytest.mark.parametrize( + "code, expected", + [ + ["x = 1", True], + ["x, y = 1, 2", True], + ["y, x = 1, 2", True], + ["(x, y) = 1, 2", True], + ["(y, x) = 1, 2", True], + ["[x, y] = 1, 2", True], + ["[y, x] = 1, 2", True], + ["(z, (y, x)) = 1, (2, 3)", True], + ["x(1)", False], + ["something(x)", False], + ["x == 1", False], + ], +) def test_is_left_side_of_assignment(code, expected): - node = testutils.get_first_leaf_with_value(code, 'x') + node = testutils.get_first_leaf_with_value(code, "x") assert detect.is_left_side_of_assignment(node) is expected -@pytest.mark.parametrize('code, expected', [ - ['for x in range(10):\n pass', True], - ['for x, y in range(10):\n pass', True], - ['for y, (z, x) in range(10):\n pass', True], - ['for y in range(10):\n x = y + 1', False], - ['for y in range(10):\n z = y + 1\nfunction(x)', False], -], - ids=[ - 'single', - 'tuple', - 'nested', - 'variable-in-loop-body', - 'variable-in-loop-body-2', - ]) +@pytest.mark.parametrize( + "code, expected", + [ + ["for x in range(10):\n pass", True], + ["for x, y in range(10):\n pass", True], + ["for y, (z, x) in range(10):\n pass", True], + ["for y in range(10):\n x = y + 1", False], + ["for y in range(10):\n z = y + 1\nfunction(x)", False], + ], + ids=[ + "single", + "tuple", + "nested", + "variable-in-loop-body", + "variable-in-loop-body-2", + ], +) def test_is_for_loop(code, expected): - leaf = testutils.get_first_leaf_with_value(code, 'x') + leaf = testutils.get_first_leaf_with_value(code, "x") assert detect.is_for_loop(leaf) is expected -@pytest.mark.parametrize('code, expected', [ - ['def a():\n pass', False], - ['class A:\n pass', True], - ['class A:\n def __init__(self):\n pass', True], -], - ids=[ - 'function', - 'class-empty', - 'class', - ]) +@pytest.mark.parametrize( + 
"code, expected", + [ + ["def a():\n pass", False], + ["class A:\n pass", True], + ["class A:\n def __init__(self):\n pass", True], + ], + ids=[ + "function", + "class-empty", + "class", + ], +) def test_is_classdef(code, expected): leaf = parso.parse(code).get_first_leaf() assert detect.is_classdef(leaf) is expected -@pytest.mark.parametrize('code, expected', [ - ['[1, 2, 3]', False], - ['{1, 2, 3}', False], - ['{1: 1, 2: 2, 3: 3}', False], - ['[x for x in range(10)]', True], - ['{x for x in range(10)}', True], - ['{x for x in range(10) if x > 1}', True], - ['{x: x + 1 for x in range(10)}', True], - ['{x: x + 1 for x in range(10) if x > 1 and x < 8}', True], - ['(x for x in range(10))', True], -], - ids=[ - 'list', - 'set', - 'dict', - 'simple', - 'set-comp', - 'set-comp-conditional', - 'dict-comp', - 'dict-comp-conditional', - 'generator', - ]) +@pytest.mark.parametrize( + "code, expected", + [ + ["[1, 2, 3]", False], + ["{1, 2, 3}", False], + ["{1: 1, 2: 2, 3: 3}", False], + ["[x for x in range(10)]", True], + ["{x for x in range(10)}", True], + ["{x for x in range(10) if x > 1}", True], + ["{x: x + 1 for x in range(10)}", True], + ["{x: x + 1 for x in range(10) if x > 1 and x < 8}", True], + ["(x for x in range(10))", True], + ], + ids=[ + "list", + "set", + "dict", + "simple", + "set-comp", + "set-comp-conditional", + "dict-comp", + "dict-comp-conditional", + "generator", + ], +) def test_is_comprehension(code, expected): leaf = parso.parse(code).get_first_leaf() assert detect.is_comprehension(leaf) is expected -@pytest.mark.parametrize('code, expected', [ - ['for x in range(10):\n pass', False], - ['with open(x) as f:\n pass', True], -], - ids=[ - 'not-context-manager', - 'simple', - ]) +@pytest.mark.parametrize( + "code, expected", + [ + ["for x in range(10):\n pass", False], + ["with open(x) as f:\n pass", True], + ], + ids=[ + "not-context-manager", + "simple", + ], +) def test_is_context_manager(code, expected): - leaf = testutils.get_first_leaf_with_value(code, 'x') + leaf = testutils.get_first_leaf_with_value(code, "x") assert detect.is_context_manager(leaf) is expected -@pytest.mark.parametrize('code, expected', [ - ['[x for x in range(10)]', True], - ['[x, y]', False], - ['[x.attribute for x in range(10)', True], - ['[x for x in range(10) if x > 0', True], -], - ids=[ - 'for', - 'list', - 'attribute', - 'conditional', - ]) +@pytest.mark.parametrize( + "code, expected", + [ + ["[x for x in range(10)]", True], + ["[x, y]", False], + ["[x.attribute for x in range(10)", True], + ["[x for x in range(10) if x > 0", True], + ], + ids=[ + "for", + "list", + "attribute", + "conditional", + ], +) def test_is_inside_list_comprehension(code, expected): - node = testutils.get_first_leaf_with_value(code, 'x') + node = testutils.get_first_leaf_with_value(code, "x") assert detect.is_inside_list_comprehension(node) is expected -@pytest.mark.parametrize('code, expected', [ - ['sns.histplot(df.some_column)', True], - ['histplot(df.some_column)', True], - ['sns.histplot(df)', True], - ['histplot(df)', True], - ['sns.histplot(df["key"])', True], - ['def x(df):\n pass', False], - ['def x(df=1):\n pass', False], - ['(df, df2) = 1, 2', False], - ['function({"data": df})', True], - ['function(dict(data=df))', True], - ['function({"data": (df - 1)})', True], - ['Constructor({"data": df}).do_stuff()', True], - ['Constructor({"data": (df - 1)}).do_stuff()', True], -], - ids=[ - 'arg-attribute', - 'arg-attribute-2', - 'arg', - 'arg-2', - 'arg-getitem', - 'fn-signature', - 'fn-signature-default-value', - 
'assignment', - 'arg-nested-dict', - 'arg-nested-dict-constructor', - 'arg-nested-dict-operation', - 'constructor-dict', - 'constructor-dict-operation', - ]) +@pytest.mark.parametrize( + "code, expected", + [ + ["sns.histplot(df.some_column)", True], + ["histplot(df.some_column)", True], + ["sns.histplot(df)", True], + ["histplot(df)", True], + ['sns.histplot(df["key"])', True], + ["def x(df):\n pass", False], + ["def x(df=1):\n pass", False], + ["(df, df2) = 1, 2", False], + ['function({"data": df})', True], + ["function(dict(data=df))", True], + ['function({"data": (df - 1)})', True], + ['Constructor({"data": df}).do_stuff()', True], + ['Constructor({"data": (df - 1)}).do_stuff()', True], + ], + ids=[ + "arg-attribute", + "arg-attribute-2", + "arg", + "arg-2", + "arg-getitem", + "fn-signature", + "fn-signature-default-value", + "assignment", + "arg-nested-dict", + "arg-nested-dict-constructor", + "arg-nested-dict-operation", + "constructor-dict", + "constructor-dict-operation", + ], +) def test_inside_function_call(code, expected): - leaf = testutils.get_first_leaf_with_value(code, 'df') + leaf = testutils.get_first_leaf_with_value(code, "df") assert detect.is_inside_function_call(leaf) is expected diff --git a/tests/test_export.py b/tests/test_export.py index 10d6210..1d04c3f 100644 --- a/tests/test_export.py +++ b/tests/test_export.py @@ -12,7 +12,7 @@ def _read(nb_str): - return jupytext.reads(nb_str, fmt='py:light') + return jupytext.reads(nb_str, fmt="py:light") def _find_cells_with_tags(nb, tags): @@ -23,8 +23,8 @@ def _find_cells_with_tags(nb, tags): tags_to_find = list(tags) tags_found = {} - for index, cell in enumerate(nb['cells']): - for tag in cell['metadata'].get('tags', []): + for index, cell in enumerate(nb["cells"]): + for tag in cell["metadata"].get("tags", []): if tag in tags_to_find: tags_found[tag] = dict(cell=cell, index=index) tags_to_find.remove(tag) @@ -207,59 +207,65 @@ def do(): """ -@pytest.mark.parametrize('nb_str, tasks', [ - [simple, ['cell-0', 'cell-2', 'cell-4']], - [simple_branch, ['first', 'second', 'third-a', 'third-b']], - [eda, ['load', 'clean', 'plot']], - [complex, ['one', 'two', 'three']], - [magics, ['first', 'second']], - [magics_structured, ['first', 'second', 'third']], -], - ids=[ - 'simple', - 'simple-branch', - 'eda', - 'complex', - 'magics', - 'magics-structured', - ]) +@pytest.mark.parametrize( + "nb_str, tasks", + [ + [simple, ["cell-0", "cell-2", "cell-4"]], + [simple_branch, ["first", "second", "third-a", "third-b"]], + [eda, ["load", "clean", "plot"]], + [complex, ["one", "two", "three"]], + [magics, ["first", "second"]], + [magics_structured, ["first", "second", "third"]], + ], + ids=[ + "simple", + "simple-branch", + "eda", + "complex", + "magics", + "magics-structured", + ], +) def test_from_nb(tmp_empty, nb_str, tasks): export.from_nb(_read(nb_str), py=True) - dag = DAGSpec('pipeline.yaml').to_dag() + dag = DAGSpec("pipeline.yaml").to_dag() dag.build() assert list(dag) == tasks -@pytest.mark.parametrize('py, ext', [ - [True, 'py'], - [False, 'ipynb'], -], - ids=[ - 'py', - 'ipynb', - ]) +@pytest.mark.parametrize( + "py, ext", + [ + [True, "py"], + [False, "ipynb"], + ], + ids=[ + "py", + "ipynb", + ], +) def test_from_nb_works_with_magics(tmp_empty, py, ext): export.from_nb(_read(magics), py=py) - first = jupytext.read(Path('tasks', f'first.{ext}')) - second = jupytext.read(Path('tasks', f'second.{ext}')) + first = jupytext.read(Path("tasks", f"first.{ext}")) + second = jupytext.read(Path("tasks", f"second.{ext}")) - assert 
[c['source'] for c in first.cells] == [
-        'import math',
-        'upstream = None\nproduct = None',
-        '## first',
-        '%%bash\nls',
-        '%%html\nhi',
-        '\nmath.sqrt(1)',
+    assert [c["source"] for c in first.cells] == [
+        "import math",
+        "upstream = None\nproduct = None",
+        "## first",
+        "%%bash\nls",
+        "%%html\nhi",
hi", + "\nmath.sqrt(1)", ] - assert [c['source'] for c in second.cells] == [ - 'upstream = None\nproduct = None', - '## second', - '%timeit 1 + 1', - '%cd x', + assert [c["source"] for c in second.cells] == [ + "upstream = None\nproduct = None", + "## second", + "%timeit 1 + 1", + "%cd x", "%%capture\nprint('x')", ] @@ -267,16 +273,16 @@ def test_from_nb_works_with_magics(tmp_empty, py, ext): def test_exporter_infers_structure_from_line_magics(): exporter = export.NotebookExporter(_read(magics_structured)) - assert set(exporter.get_sources()) == {'first', 'second', 'third'} + assert set(exporter.get_sources()) == {"first", "second", "third"} assert io.find_upstream(exporter._snippets) == { - 'first': [], - 'second': [], - 'third': ['second'] + "first": [], + "second": [], + "third": ["second"], } assert exporter.io == { - 'first': (set(), set()), - 'second': (set(), {'x', 'y'}), - 'third': ({'x', 'y'}, set()) + "first": (set(), set()), + "second": (set(), {"x", "y"}), + "third": ({"x", "y"}, set()), } @@ -294,86 +300,90 @@ def test_from_nb_with_star_imports(tmp_empty): with pytest.raises(exceptions.InputError) as excinfo: export.from_nb(_read(nb_str), py=True) - assert 'from math import *' in str(excinfo.value) - assert 'from pathlib import *' in str(excinfo.value) + assert "from math import *" in str(excinfo.value) + assert "from pathlib import *" in str(excinfo.value) def test_from_nb_upstream_cell_only_shows_unique_values(tmp_empty): export.from_nb(_read(complex)) - dag = DAGSpec('pipeline.yaml').to_dag() + dag = DAGSpec("pipeline.yaml").to_dag() expected = "upstream = ['one']\nproduct = None" - assert dag['two'].source._get_parameters_cell() == expected + assert dag["two"].source._get_parameters_cell() == expected def test_from_nb_with_product_prefix(tmp_empty): - export.from_nb(_read(simple), product_prefix='some-directory') + export.from_nb(_read(simple), product_prefix="some-directory") - dag = DAGSpec('pipeline.yaml').to_dag() + dag = DAGSpec("pipeline.yaml").to_dag() products = [ - i for meta in (t.product.to_json_serializable().values() - for t in dag.values()) for i in meta + i + for meta in (t.product.to_json_serializable().values() for t in dag.values()) + for i in meta ] - expected = str(Path(tmp_empty, 'some-directory')) + expected = str(Path(tmp_empty, "some-directory")) assert all([p.startswith(expected) for p in products]) -@pytest.mark.parametrize('prefix, expected', [ - ['some-directory', 'some-directory\n'], - [None, 'output\n'], -]) +@pytest.mark.parametrize( + "prefix, expected", + [ + ["some-directory", "some-directory\n"], + [None, "output\n"], + ], +) def test_from_nb_creates_gitignore(tmp_empty, prefix, expected): export.from_nb(_read(simple), product_prefix=prefix) - assert Path('.gitignore').read_text() == expected + assert Path(".gitignore").read_text() == expected def test_from_nb_appends_gitignore(tmp_empty): - path = Path('.gitignore') - path.write_text('something') + path = Path(".gitignore") + path.write_text("something") - export.from_nb(_read(simple), product_prefix='some-directory') + export.from_nb(_read(simple), product_prefix="some-directory") - assert path.read_text() == 'something\nsome-directory\n' + assert path.read_text() == "something\nsome-directory\n" def test_from_nb_doesnt_create_gitignore_if_absolute_prefix(tmp_empty): - export.from_nb(_read(simple), product_prefix='/some/absolute/dir') + export.from_nb(_read(simple), product_prefix="/some/absolute/dir") - assert not Path('.gitignore').exists() + assert not Path(".gitignore").exists() def 
test_from_nb_doesnt_append_gitignore_if_absolute_prefix(tmp_empty): - path = Path('.gitignore') - path.write_text('something') + path = Path(".gitignore") + path.write_text("something") - export.from_nb(_read(simple), product_prefix='/some/absolute/dir') + export.from_nb(_read(simple), product_prefix="/some/absolute/dir") - assert path.read_text() == 'something' + assert path.read_text() == "something" def test_spec_style(tmp_empty): export.from_nb(_read(simple)) - spec = Path('pipeline.yaml').read_text() + spec = Path("pipeline.yaml").read_text() d = yaml.safe_load(spec) # check empty space between tasks - assert '\n\n-' in spec + assert "\n\n-" in spec # check source is the first key on every task - assert all([list(spec)[0] == 'source' for spec in d['tasks']]) + assert all([list(spec)[0] == "source" for spec in d["tasks"]]) def test_from_nb_does_not_serialize_unused_products(tmp_empty): export.from_nb(_read(unused_products)) - dag = DAGSpec('pipeline.yaml').to_dag() + dag = DAGSpec("pipeline.yaml").to_dag() - assert set(k for k in dag['cell-0'].product.to_json_serializable()) == { - 'nb', - 'x', + assert set(k for k in dag["cell-0"].product.to_json_serializable()) == { + "nb", + "x", } @@ -388,25 +398,25 @@ def eda_sources(): def test_exporter_removes_imports(eda_sources): - nb = jupytext.reads(eda_sources['load'], fmt='py:percent') + nb = jupytext.reads(eda_sources["load"], fmt="py:percent") # imports should only exist in the soorgeon-imports cell - m = _find_cells_with_tags(nb, ['soorgeon-imports']) - nb.cells.pop(m['soorgeon-imports']['index']) - tree = parso.parse(jupytext.writes(nb, fmt='py:percent')) + m = _find_cells_with_tags(nb, ["soorgeon-imports"]) + nb.cells.pop(m["soorgeon-imports"]["index"]) + tree = parso.parse(jupytext.writes(nb, fmt="py:percent")) assert not list(tree.iter_imports()) def test_exporter_does_not_add_unpickling_if_no_upstream(eda_sources): - nb = jupytext.reads(eda_sources['load'], fmt='py:percent') - assert not _find_cells_with_tags(nb, ['soorgeon-unpickle']) + nb = jupytext.reads(eda_sources["load"], fmt="py:percent") + assert not _find_cells_with_tags(nb, ["soorgeon-unpickle"]) # FIXME: another test but when we have outputs but they're not used def test_exporter_does_not_add_pickling_if_no_outputs(eda_sources): - nb = jupytext.reads(eda_sources['plot'], fmt='py:percent') - assert not _find_cells_with_tags(nb, ['soorgeon-pickle']) + nb = jupytext.reads(eda_sources["plot"], fmt="py:percent") + assert not _find_cells_with_tags(nb, ["soorgeon-pickle"]) with_definitions = """# ## load @@ -433,8 +443,9 @@ def plot(x): """ with_definitions_expected = ( - 'def load(x):\n return x\n\ndef plot(x):\n return x\n\n' - 'class Cleaner:\n pass') + "def load(x):\n return x\n\ndef plot(x):\n return x\n\n" + "class Cleaner:\n pass" +) definition_with_import = """ # ## load @@ -449,51 +460,59 @@ def plot(x): df = load() """ -definition_with_import_expected = ('import matplotlib.pyplot as plt' - '\n\n\ndef plot(x):\n plt.plot()') +definition_with_import_expected = ( + "import matplotlib.pyplot as plt" "\n\n\ndef plot(x):\n plt.plot()" +) -@pytest.mark.parametrize('code, expected', [ - [with_definitions, with_definitions_expected], - [definition_with_import, definition_with_import_expected], -], - ids=[ - 'with_definitions', - 'definition_with_import', - ]) +@pytest.mark.parametrize( + "code, expected", + [ + [with_definitions, with_definitions_expected], + [definition_with_import, definition_with_import_expected], + ], + ids=[ + "with_definitions", + 
"definition_with_import", + ], +) def test_export_definitions(tmp_empty, code, expected): exporter = export.NotebookExporter(_read(code)) exporter.export_definitions() - assert Path('exported.py').read_text() == expected + assert Path("exported.py").read_text() == expected -@pytest.mark.parametrize('code, expected', [ - [with_definitions, 'ploomber>=0.14.7\n'], - [definition_with_import, 'load\nmatplotlib\nploomber>=0.14.7\n'], -], - ids=[ - 'with_definitions', - 'definition_with_import', - ]) +@pytest.mark.parametrize( + "code, expected", + [ + [with_definitions, "ploomber>=0.14.7\n"], + [definition_with_import, "load\nmatplotlib\nploomber>=0.14.7\n"], + ], + ids=[ + "with_definitions", + "definition_with_import", + ], +) def test_export_requirements(tmp_empty, code, expected): exporter = export.NotebookExporter(_read(code)) exporter.export_requirements() - expected = ('# Auto-generated file, may need manual ' - f'editing\n{expected}') - assert Path('requirements.txt').read_text() == expected + expected = "# Auto-generated file, may need manual " f"editing\n{expected}" + assert Path("requirements.txt").read_text() == expected def test_export_requirements_doesnt_overwrite(tmp_empty): - reqs = Path('requirements.txt') - reqs.write_text('soorgeon\n') + reqs = Path("requirements.txt") + reqs.write_text("soorgeon\n") exporter = export.NotebookExporter(_read(definition_with_import)) exporter.export_requirements() - expected = ('soorgeon\n# Auto-generated file, may need manual ' - 'editing\nload\nmatplotlib\nploomber>=0.14.7\n') + expected = ( + "soorgeon\n# Auto-generated file, may need manual " + "editing\nload\nmatplotlib\nploomber>=0.14.7\n" + ) assert reqs.read_text() == expected @@ -501,7 +520,7 @@ def test_does_not_create_exported_py_if_no_definitions(tmp_empty): exporter = export.NotebookExporter(_read(simple)) exporter.export_definitions() - assert not Path('exported.py').exists() + assert not Path("exported.py").exists() def test_get_sources_includes_import_from_exported_definitions(tmp_empty): @@ -509,10 +528,10 @@ def test_get_sources_includes_import_from_exported_definitions(tmp_empty): sources = exporter.get_sources() - import_ = 'from exported import load, plot, Cleaner' - assert import_ in sources['load'] - assert import_ in sources['clean'] - assert import_ in sources['plot'] + import_ = "from exported import load, plot, Cleaner" + assert import_ in sources["load"] + assert import_ in sources["clean"] + assert import_ in sources["plot"] for_loop_with_output_in_body = """# ## section @@ -547,13 +566,14 @@ def test_raise_an_error_if_function_uses_global_variables(): """ -@pytest.mark.parametrize('code, expected', [ - [list_comp, { - 'first': (set(), {'x'}) - }], -]) +@pytest.mark.parametrize( + "code, expected", + [ + [list_comp, {"first": (set(), {"x"})}], + ], +) def test_get_raw_io(code, expected): - nb = jupytext.reads(code, fmt='py:light') + nb = jupytext.reads(code, fmt="py:light") exporter = export.NotebookExporter(nb) assert exporter._get_raw_io() == expected @@ -565,7 +585,7 @@ def test_exporter_init_with_syntax_error(): if """ - nb = jupytext.reads(code, fmt='py:light') + nb = jupytext.reads(code, fmt="py:light") with pytest.raises(exceptions.InputSyntaxError): export.NotebookExporter(nb) @@ -577,13 +597,14 @@ def test_exporter_init_with_undefined_name_error(): y = x + 1 """ - nb = jupytext.reads(code, fmt='py:light') + nb = jupytext.reads(code, fmt="py:light") with pytest.raises(exceptions.InputWontRunError) as excinfo: export.NotebookExporter(nb) - expected = ('(ensure 
that your notebook executes from ' - 'top-to-bottom and try again)') + expected = ( + "(ensure that your notebook executes from " "top-to-bottom and try again)" + ) assert expected in str(excinfo.value) @@ -595,11 +616,11 @@ def test_get_code(tmp_empty): print('hello') """ - nb_ = jupytext.reads(code, fmt='py:light') - jupytext.write(nb_, 'nb.ipynb') - pm.execute_notebook('nb.ipynb', 'nb.ipynb', kernel_name='python3') + nb_ = jupytext.reads(code, fmt="py:light") + jupytext.write(nb_, "nb.ipynb") + pm.execute_notebook("nb.ipynb", "nb.ipynb", kernel_name="python3") - nb = jupytext.read('nb.ipynb') + nb = jupytext.read("nb.ipynb") exporter = export.NotebookExporter(nb) assert exporter._get_code() == "print('hello')" @@ -617,12 +638,12 @@ def test_get_sources_add_import_if_needed(): y = something.another() """ - nb = jupytext.reads(code, fmt='py:light') + nb = jupytext.reads(code, fmt="py:light") exporter = export.NotebookExporter(nb) sources = exporter.get_sources() - assert 'import something' in sources['first'] - assert 'import something' in sources['second'] + assert "import something" in sources["first"] + assert "import something" in sources["second"] def test_get_task_specs(): @@ -637,24 +658,19 @@ def test_get_task_specs(): y = x + something.another() """ - nb = jupytext.reads(code, fmt='py:light') + nb = jupytext.reads(code, fmt="py:light") exporter = export.NotebookExporter(nb, py=True) - specs = exporter.get_task_specs(product_prefix='output') + specs = exporter.get_task_specs(product_prefix="output") assert specs == { - 'first': { - 'source': 'tasks/first.py', - 'product': { - 'x': 'output/first-x.pkl', - 'nb': 'output/first.ipynb' - } + "first": { + "source": "tasks/first.py", + "product": {"x": "output/first-x.pkl", "nb": "output/first.ipynb"}, + }, + "second": { + "source": "tasks/second.py", + "product": {"nb": "output/second.ipynb"}, }, - 'second': { - 'source': 'tasks/second.py', - 'product': { - 'nb': 'output/second.ipynb' - } - } } @@ -722,16 +738,19 @@ class c: """ -@pytest.mark.parametrize('df_format, pickling, unpickling', [ - [None, none_pickling, none_unpickling], - ['parquet', parquet_pickling, parquet_unpickling], - ['csv', csv_pickling, csv_unpickling], -], - ids=[ - 'none', - 'parquet', - 'csv', - ]) +@pytest.mark.parametrize( + "df_format, pickling, unpickling", + [ + [None, none_pickling, none_unpickling], + ["parquet", parquet_pickling, parquet_unpickling], + ["csv", csv_pickling, csv_unpickling], + ], + ids=[ + "none", + "parquet", + "csv", + ], +) def test_prototask_un_pickling_cells(df_format, pickling, unpickling): code = """\ # ## first @@ -748,12 +767,11 @@ def test_prototask_un_pickling_cells(df_format, pickling, unpickling): exporter = export.NotebookExporter(_read(code), df_format=df_format) one, two = exporter._proto_tasks - assert one._pickling_cell(exporter.io)['source'] == pickling + assert one._pickling_cell(exporter.io)["source"] == pickling assert two._pickling_cell(exporter.io) is None assert one._unpickling_cell(exporter.io, exporter.providers) is None - assert two._unpickling_cell(exporter.io, - exporter.providers)['source'] == unpickling + assert two._unpickling_cell(exporter.io, exporter.providers)["source"] == unpickling cloudpickle_pickling = """\ @@ -776,12 +794,14 @@ def test_prototask_un_pickling_cells(df_format, pickling, unpickling): @pytest.mark.parametrize( - 'serializer, pickling, unpickling', - [['cloudpickle', cloudpickle_pickling, cloudpickle_unpickling], - ['dill', dill_pickling, dill_unpickling]], - ids=['cloudpickle', 
'dill']) -def test_prototask_un_pickling_cells_with_serializer(serializer, pickling, - unpickling): + "serializer, pickling, unpickling", + [ + ["cloudpickle", cloudpickle_pickling, cloudpickle_unpickling], + ["dill", dill_pickling, dill_unpickling], + ], + ids=["cloudpickle", "dill"], +) +def test_prototask_un_pickling_cells_with_serializer(serializer, pickling, unpickling): code = """\ # ## first @@ -797,62 +817,63 @@ def test_prototask_un_pickling_cells_with_serializer(serializer, pickling, exporter = export.NotebookExporter(_read(code), serializer=serializer) one, two = exporter._proto_tasks - assert one._pickling_cell(exporter.io)['source'] == pickling - assert two._pickling_cell(exporter.io)['source'] == pickling + assert one._pickling_cell(exporter.io)["source"] == pickling + assert two._pickling_cell(exporter.io)["source"] == pickling assert one._unpickling_cell(exporter.io, exporter.providers) is None - assert two._unpickling_cell(exporter.io, - exporter.providers)['source'] == unpickling + assert two._unpickling_cell(exporter.io, exporter.providers)["source"] == unpickling def test_validates_df_format(): with pytest.raises(ValueError) as excinfo: - export.NotebookExporter(_read(''), df_format='something') + export.NotebookExporter(_read(""), df_format="something") - assert 'df_format must be one of ' in str(excinfo.value) + assert "df_format must be one of " in str(excinfo.value) def test_validates_serializer(): with pytest.raises(ValueError) as excinfo: - export.NotebookExporter(_read(''), serializer='something') + export.NotebookExporter(_read(""), serializer="something") - assert 'serializer must be one of ' in str(excinfo.value) + assert "serializer must be one of " in str(excinfo.value) def test_creates_readme(tmp_empty): exporter = export.NotebookExporter(_read(simple)) exporter.export() - assert Path('README.md').read_text() == resources.read_text( - assets, 'README.md') + assert Path("README.md").read_text() == resources.read_text(assets, "README.md") def test_appends_to_readme(tmp_empty): - Path('README.md').write_text('# Some stuff') + Path("README.md").write_text("# Some stuff") exporter = export.NotebookExporter(_read(simple)) exporter.export() - expected = '# Some stuff\n' + resources.read_text(assets, 'README.md') - assert Path('README.md').read_text() == expected - - -@pytest.mark.parametrize('code, expect', [ - ("f = open('text.txt')", False), - ("f = open('read.txt' , 'r')", False), - ("f = open('txt', 'r')", False), - ("f = open('text.txt', 'rb') ", False), - ("f = open('text.txt' , 'ab')", True), - ("with open('text.txt', 'w')", True), - ("with open('txt' , 'w+')", True), - ("''' with open('txt' , 'w+') '''", False), - ("f = Path().write_text()", True), - ("f = path().write_bytes()", True), - ("df.to_csv()", True), - ("df.to_parquet()", True), - ("write_text = 6", False), - ("header = 'call to_csv function'", False), - ("# Path.write_text('txt')", False) -]) + expected = "# Some stuff\n" + resources.read_text(assets, "README.md") + assert Path("README.md").read_text() == expected + + +@pytest.mark.parametrize( + "code, expect", + [ + ("f = open('text.txt')", False), + ("f = open('read.txt' , 'r')", False), + ("f = open('txt', 'r')", False), + ("f = open('text.txt', 'rb') ", False), + ("f = open('text.txt' , 'ab')", True), + ("with open('text.txt', 'w')", True), + ("with open('txt' , 'w+')", True), + ("''' with open('txt' , 'w+') '''", False), + ("f = Path().write_text()", True), + ("f = path().write_bytes()", True), + ("df.to_csv()", True), + 
("df.to_parquet()", True), + ("write_text = 6", False), + ("header = 'call to_csv function'", False), + ("# Path.write_text('txt')", False), + ], +) def test_find_output_file_events(code, expect): actual = export._find_output_file_events(code) assert actual == expect diff --git a/tests/test_io.py b/tests/test_io.py index 3702b49..5ba8943 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -13,7 +13,7 @@ def get_first_sibling_after_assignment(code, index): current = 0 while leaf: - if leaf.value == '=': + if leaf.value == "=": if index == current: break @@ -24,30 +24,35 @@ def get_first_sibling_after_assignment(code, index): return leaf.get_next_sibling() -@pytest.mark.parametrize('code, expected_len', [ - ['[x for x in range(10)]', 1], - ['[i for row in matrix for i in row]', 2], - ['[i for matrix in tensor for row in matrix for i in row]', 3], -], - ids=[ - 'simple', - 'nested', - 'nested-nested', - ]) +@pytest.mark.parametrize( + "code, expected_len", + [ + ["[x for x in range(10)]", 1], + ["[i for row in matrix for i in row]", 2], + ["[i for matrix in tensor for row in matrix for i in row]", 3], + ], + ids=[ + "simple", + "nested", + "nested-nested", + ], +) def test_flatten_sync_comp_for(code, expected_len): synccompfor = parso.parse(code).children[0].children[1].children[1] assert len(io._flatten_sync_comp_for(synccompfor)) == expected_len -@pytest.mark.parametrize('code, in_expected, declared_expected', [ - ['[x for x in range(10)]', set(), {'x'}], -], - ids=[ - 'simple', - ]) -def test_find_sync_comp_for_inputs_and_scope(code, in_expected, - declared_expected): +@pytest.mark.parametrize( + "code, in_expected, declared_expected", + [ + ["[x for x in range(10)]", set(), {"x"}], + ], + ids=[ + "simple", + ], +) +def test_find_sync_comp_for_inputs_and_scope(code, in_expected, declared_expected): synccompfor = parso.parse(code).children[0].children[1].children[1] in_, declared = io._find_sync_comp_for_inputs_and_scope(synccompfor) @@ -56,17 +61,19 @@ def test_find_sync_comp_for_inputs_and_scope(code, in_expected, assert declared == declared_expected -@pytest.mark.parametrize('snippets, local_scope, expected', [ - [ - 'import pandas as pd\n df = pd.read_csv("data.csv")', {'pd'}, - (set(), {'df'}) - ], +@pytest.mark.parametrize( + "snippets, local_scope, expected", [ - 'import pandas as pd\nimport do_stuff\n' - 'df = do_stuff(pd.read_csv("data.csv"))', {'pd'}, (set(), {'df'}) + ['import pandas as pd\n df = pd.read_csv("data.csv")', {"pd"}, (set(), {"df"})], + [ + "import pandas as pd\nimport do_stuff\n" + 'df = do_stuff(pd.read_csv("data.csv"))', + {"pd"}, + (set(), {"df"}), + ], ], -], - ids=['simple', 'inside-function']) + ids=["simple", "inside-function"], +) def test_find_inputs_and_outputs_local_scope(snippets, local_scope, expected): assert io.find_inputs_and_outputs(snippets, local_scope) == expected @@ -88,114 +95,124 @@ def test_find_inputs_and_outputs_local_scope(snippets, local_scope, expected): # exploratory data analysis example eda = { - 'load': "import load_data, plot\ndf = load_data()", + "load": "import load_data, plot\ndf = load_data()", # note that we re-define df - 'clean': "df = df[df.some_columns > 2]", - 'plot': "plot(df)" + "clean": "df = df[df.some_columns > 2]", + "plot": "plot(df)", } # imports on its own section imports = { - 'imports': 'import pandas as pd', - 'load': 'df = pd.read_csv("data.csv")', + "imports": "import pandas as pd", + "load": 'df = pd.read_csv("data.csv")', } def test_providermapping(): m = io.ProviderMapping(io.find_io(eda)) - assert 
m._providers_for_task('load') == {} - assert m._providers_for_task('clean') == {'df': 'load'} - assert m._providers_for_task('plot') == {'df': 'clean'} - assert m.get('df', 'clean') == 'load' + assert m._providers_for_task("load") == {} + assert m._providers_for_task("clean") == {"df": "load"} + assert m._providers_for_task("plot") == {"df": "clean"} + assert m.get("df", "clean") == "load" def test_providermapping_error(): m = io.ProviderMapping(io.find_io(eda)) with pytest.raises(KeyError) as excinfo: - m.get('unknown_variable', 'clean') + m.get("unknown_variable", "clean") - expected = ('"Error parsing inputs for section \'clean\' notebook: ' - 'could not find an earlier section declaring ' - 'variable \'unknown_variable\'"') + expected = ( + "\"Error parsing inputs for section 'clean' notebook: " + "could not find an earlier section declaring " + "variable 'unknown_variable'\"" + ) assert expected == str(excinfo.value) -@pytest.mark.parametrize('snippets, expected', [ - [{ - 'first': first, - 'second': second, - 'third': third - }, { - 'first': [], - 'second': ['first'], - 'third': ['second'] - }], - [eda, { - 'load': [], - 'clean': ['load'], - 'plot': ['clean'] - }], +@pytest.mark.parametrize( + "snippets, expected", [ - read_snippets('ml'), { - 'load': [], - 'clean': ['load'], - 'train-test-split': ['clean'], - 'linear-regression': ['train-test-split'], - 'random-forest-regressor': ['train-test-split'] - } + [ + {"first": first, "second": second, "third": third}, + {"first": [], "second": ["first"], "third": ["second"]}, + ], + [eda, {"load": [], "clean": ["load"], "plot": ["clean"]}], + [ + read_snippets("ml"), + { + "load": [], + "clean": ["load"], + "train-test-split": ["clean"], + "linear-regression": ["train-test-split"], + "random-forest-regressor": ["train-test-split"], + }, + ], ], -]) +) def test_find_upstream(snippets, expected): assert io.find_upstream(snippets) == expected -@pytest.mark.parametrize('snippets, expected', [ +@pytest.mark.parametrize( + "snippets, expected", [ - eda, { - 'load': (set(), {'df'}), - 'clean': ({'df'}, {'df'}), - 'plot': ({'df'}, set()) - } + [ + eda, + { + "load": (set(), {"df"}), + "clean": ({"df"}, {"df"}), + "plot": ({"df"}, set()), + }, + ], + [ + read_snippets("ml"), + { + "load": (set(), {"df", "ca_housing"}), + "clean": ({"df"}, {"df"}), + "train-test-split": ( + {"df"}, + {"y", "X", "X_train", "X_test", "y_train", "y_test"}, + ), + "linear-regression": ( + {"y_test", "X_test", "y_train", "X_train"}, + {"lr", "y_pred"}, + ), + "random-forest-regressor": ( + {"y_test", "X_test", "y_train", "X_train"}, + {"rf", "y_pred"}, + ), + }, + ], + [imports, {"imports": (set(), set()), "load": (set(), {"df"})}], ], - [ - read_snippets('ml'), - { - 'load': (set(), {'df', 'ca_housing'}), - 'clean': ({'df'}, {'df'}), - 'train-test-split': - ({'df'}, {'y', 'X', 'X_train', 'X_test', 'y_train', 'y_test'}), - 'linear-regression': - ({'y_test', 'X_test', 'y_train', 'X_train'}, {'lr', 'y_pred'}), - 'random-forest-regressor': - ({'y_test', 'X_test', 'y_train', 'X_train'}, {'rf', 'y_pred'}) - }, + ids=[ + "eda", + "ml", + "imports", ], - [imports, { - 'imports': (set(), set()), - 'load': (set(), {'df'}) - }], -], - ids=[ - 'eda', - 'ml', - 'imports', - ]) +) def test_find_io(snippets, expected): assert io.find_io(snippets) == expected -@pytest.mark.parametrize('io_, expected', [ - [{ - 'one': ({'a'}, {'b', 'c'}), - 'two': ({'b'}, set()), - }, { - 'one': ({'a'}, {'b'}), - 'two': ({'b'}, set()), - }], -]) +@pytest.mark.parametrize( + "io_, expected", + [ + [ + { 
+ "one": ({"a"}, {"b", "c"}), + "two": ({"b"}, set()), + }, + { + "one": ({"a"}, {"b"}), + "two": ({"b"}, set()), + }, + ], + ], +) def test_prune_io(io_, expected): assert io.prune_io(io_) == expected @@ -212,92 +229,97 @@ def test_prune_io(io_, expected): """ -@pytest.mark.parametrize('code_nb, code_task, expected', [ - [ - exploratory, - "df = load_iris(as_frame=True)['data']", - "from sklearn.datasets import load_iris", - ], - [ - exploratory, - "df = df[df['petal length (cm)'] > 2]", - None, - ], +@pytest.mark.parametrize( + "code_nb, code_task, expected", [ - exploratory, - "sns.histplot(df['petal length (cm)'])", - "import seaborn as sns", + [ + exploratory, + "df = load_iris(as_frame=True)['data']", + "from sklearn.datasets import load_iris", + ], + [ + exploratory, + "df = df[df['petal length (cm)'] > 2]", + None, + ], + [ + exploratory, + "sns.histplot(df['petal length (cm)'])", + "import seaborn as sns", + ], ], -]) +) def test_importsparser(code_nb, code_task, expected): ip = io.ImportsParser(code_nb) assert ip.get_imports_cell_for_task(code_task) == expected -@pytest.mark.parametrize('code, expected', [ - ['import pandas as pd\nimport numpy as np', '\n'], - ['import math', ''], - ['import pandas as pd\n1+1', '\n1+1'], - ['import math\n1+1', '\n1+1'], -]) +@pytest.mark.parametrize( + "code, expected", + [ + ["import pandas as pd\nimport numpy as np", "\n"], + ["import math", ""], + ["import pandas as pd\n1+1", "\n1+1"], + ["import math\n1+1", "\n1+1"], + ], +) def test_remove_imports(code, expected): assert io.remove_imports(code) == expected -@pytest.mark.parametrize('snippets, names, a, b', [ - [{ - 'a': 'import pandas', - 'b': 'import numpy as np' - }, { - 'a': {'pandas'}, - 'b': {'np'} - }, - set(), {'pandas'}], - [{ - 'a': 'def x():\n pass', - 'b': 'def y():\n pass', - }, { - 'a': {'x'}, - 'b': {'y'} - }, - set(), {'x'}], -]) +@pytest.mark.parametrize( + "snippets, names, a, b", + [ + [ + {"a": "import pandas", "b": "import numpy as np"}, + {"a": {"pandas"}, "b": {"np"}}, + set(), + {"pandas"}, + ], + [ + { + "a": "def x():\n pass", + "b": "def y():\n pass", + }, + {"a": {"x"}, "b": {"y"}}, + set(), + {"x"}, + ], + ], +) def test_definitions_mapping(snippets, names, a, b): im = io.DefinitionsMapping(snippets) assert im._names == names - assert im.get('a') == a - assert im.get('b') == b - - -@pytest.mark.parametrize('code, def_expected, in_expected, out_expected', [ - ['for x in range(10):\n pass', {'x'}, - set(), set()], - ['for x, y in range(10):\n pass', {'x', 'y'}, - set(), set()], - ['for x, (y, z) in range(10):\n pass', {'x', 'y', 'z'}, - set(), - set()], - ['for x in range(10):\n pass\n\nj = i', {'x'}, - set(), set()], + assert im.get("a") == a + assert im.get("b") == b + + +@pytest.mark.parametrize( + "code, def_expected, in_expected, out_expected", [ - 'for i, a_range in enumerate(range(x)):\n pass', {'i', 'a_range'}, - {'x'}, - set() + ["for x in range(10):\n pass", {"x"}, set(), set()], + ["for x, y in range(10):\n pass", {"x", "y"}, set(), set()], + ["for x, (y, z) in range(10):\n pass", {"x", "y", "z"}, set(), set()], + ["for x in range(10):\n pass\n\nj = i", {"x"}, set(), set()], + [ + "for i, a_range in enumerate(range(x)):\n pass", + {"i", "a_range"}, + {"x"}, + set(), + ], + ["for i in range(10):\n print(i + 10)", {"i"}, set(), set()], ], - ['for i in range(10):\n print(i + 10)', {'i'}, - set(), set()], -], - ids=[ - 'one', - 'two', - 'nested', - 'code-outside-for-loop', - 'nested-calls', - 'uses-local-sope-in-body', - ]) -def 
test_find_for_loop_def_and_io(code, def_expected, in_expected, - out_expected): + ids=[ + "one", + "two", + "nested", + "code-outside-for-loop", + "nested-calls", + "uses-local-sope-in-body", + ], +) +def test_find_for_loop_def_and_io(code, def_expected, in_expected, out_expected): tree = parso.parse(code) # TODO: test with non-empty local_scope parameter def_, in_, out = io.find_for_loop_def_and_io(tree.children[0]) @@ -306,40 +328,38 @@ def test_find_for_loop_def_and_io(code, def_expected, in_expected, assert out == out_expected -@pytest.mark.parametrize('code, def_expected, in_expected, out_expected', [ - ['with open("file") as f:\n pass', {'f'}, - set(), set()], - ['with open("file"):\n pass', - set(), set(), set()], - [ - 'with open("file") as f, open("another") as g:\n pass', {'f', 'g'}, - set(), - set() - ], - ['with open("file") as f:\n x = f.read()', {'f'}, - set(), {'x'}], - ['with open("file") as f:\n x, y = f.read()', {'f'}, - set(), {'x', 'y'}], +@pytest.mark.parametrize( + "code, def_expected, in_expected, out_expected", [ - 'with open(some_path) as f:\n x = f.read()', {'f'}, {'some_path'}, - {'x'} + ['with open("file") as f:\n pass', {"f"}, set(), set()], + ['with open("file"):\n pass', set(), set(), set()], + [ + 'with open("file") as f, open("another") as g:\n pass', + {"f", "g"}, + set(), + set(), + ], + ['with open("file") as f:\n x = f.read()', {"f"}, set(), {"x"}], + ['with open("file") as f:\n x, y = f.read()', {"f"}, set(), {"x", "y"}], + ["with open(some_path) as f:\n x = f.read()", {"f"}, {"some_path"}, {"x"}], + [ + "with open(some_path, another_path) as (f, ff):\n x = f.read()", + {"f", "ff"}, + {"some_path", "another_path"}, + {"x"}, + ], ], - [ - 'with open(some_path, another_path) as (f, ff):\n x = f.read()', - {'f', 'ff'}, {'some_path', 'another_path'}, {'x'} + ids=[ + "one", + "no-alias", + "two", + "output-one", + "output-many", + "input-one", + "input-many", ], -], - ids=[ - 'one', - 'no-alias', - 'two', - 'output-one', - 'output-many', - 'input-one', - 'input-many', - ]) -def test_find_context_manager_def_and_io(code, def_expected, in_expected, - out_expected): +) +def test_find_context_manager_def_and_io(code, def_expected, in_expected, out_expected): tree = parso.parse(code) # TODO: test with non-empty local_scope parameter def_, in_, out = io.find_context_manager_def_and_io(tree.children[0]) @@ -348,215 +368,230 @@ def test_find_context_manager_def_and_io(code, def_expected, in_expected, assert out == out_expected -@pytest.mark.parametrize('code, def_expected, in_expected, out_expected', [ - ['def fn(x):\n pass', {'x'}, - set(), set()], - ['def fn(x, y):\n pass', {'x', 'y'}, - set(), set()], - ['def fn(x, y):\n something = z + x + y', {'x', 'y'}, {'z'}, - set()], - ['def fn(x, y):\n z(x, y)', {'x', 'y'}, {'z'}, - set()], - ['def fn(x, y):\n z.do(x, y)', {'x', 'y'}, {'z'}, - set()], - ['def fn(x, y):\n z[x]', {'x', 'y'}, {'z'}, - set()], - ['def fn(x, y):\n z + x + y', {'x', 'y'}, {'z'}, - set()], - ['def fn(x, y):\n z', {'x', 'y'}, {'z'}, - set()], - ['def fn() -> Mapping[str, int]:\n pass', - set(), {'Mapping'}, - set()], - [ - 'def fn(x: int, y: Mapping[str, int]):\n z + x + y', - {'Mapping', 'x', 'y'}, {'z'}, - set() - ], +@pytest.mark.parametrize( + "code, def_expected, in_expected, out_expected", [ - 'def fn(a=1):\n pass', - {'a'}, - set(), - set(), + ["def fn(x):\n pass", {"x"}, set(), set()], + ["def fn(x, y):\n pass", {"x", "y"}, set(), set()], + ["def fn(x, y):\n something = z + x + y", {"x", "y"}, {"z"}, set()], + ["def fn(x, y):\n z(x, y)", 
{"x", "y"}, {"z"}, set()], + ["def fn(x, y):\n z.do(x, y)", {"x", "y"}, {"z"}, set()], + ["def fn(x, y):\n z[x]", {"x", "y"}, {"z"}, set()], + ["def fn(x, y):\n z + x + y", {"x", "y"}, {"z"}, set()], + ["def fn(x, y):\n z", {"x", "y"}, {"z"}, set()], + ["def fn() -> Mapping[str, int]:\n pass", set(), {"Mapping"}, set()], + [ + "def fn(x: int, y: Mapping[str, int]):\n z + x + y", + {"Mapping", "x", "y"}, + {"z"}, + set(), + ], + [ + "def fn(a=1):\n pass", + {"a"}, + set(), + set(), + ], + [ + "def fn(a: str=1):\n pass", + {"a"}, + set(), + set(), + ], ], - [ - 'def fn(a: str=1):\n pass', - {'a'}, - set(), - set(), + ids=[ + "arg-one", + "arg-two", + "uses-outer-scope", + "uses-outer-scope-callable", + "uses-outer-scope-attribute", + "uses-outer-scope-getitem", + "uses-outer-scope-no-assignment", + "uses-outer-scope-reference", + "annotation-return", + "annotation-args", + "kwargs", + "annotation-kwargs", ], -], - ids=[ - 'arg-one', - 'arg-two', - 'uses-outer-scope', - 'uses-outer-scope-callable', - 'uses-outer-scope-attribute', - 'uses-outer-scope-getitem', - 'uses-outer-scope-no-assignment', - 'uses-outer-scope-reference', - 'annotation-return', - 'annotation-args', - 'kwargs', - 'annotation-kwargs', - ]) -def test_find_function_scope_and_io(code, def_expected, in_expected, - out_expected): +) +def test_find_function_scope_and_io(code, def_expected, in_expected, out_expected): tree = parso.parse(code) # TODO: test with non-empty local_scope parameter - def_, in_, out = (io.find_function_scope_and_io(tree.children[0])) + def_, in_, out = io.find_function_scope_and_io(tree.children[0]) assert def_ == def_expected assert in_ == in_expected assert out == out_expected @pytest.mark.parametrize( - 'code, expected', [['name(x, y)', {'name', 'x', 'y'}], - ['name(a=x, b=y)', {'name', 'x', 'y'}], - ['name(x, b=y)', {'name', 'x', 'y'}], - ['name({"x": x}, b=y)', {'name', 'x', 'y'}], - ['name(x, b={"y": y})', {'name', 'x', 'y'}], - ['name([x, y])', {'name', 'x', 'y'}], - ['name["a"]', {'name'}], ['name.atribute', {'name'}]], + "code, expected", + [ + ["name(x, y)", {"name", "x", "y"}], + ["name(a=x, b=y)", {"name", "x", "y"}], + ["name(x, b=y)", {"name", "x", "y"}], + ['name({"x": x}, b=y)', {"name", "x", "y"}], + ['name(x, b={"y": y})', {"name", "x", "y"}], + ["name([x, y])", {"name", "x", "y"}], + ['name["a"]', {"name"}], + ["name.atribute", {"name"}], + ], ids=[ - 'simple', - 'keywords', - 'mixed', - 'arg-dict', - 'keyarg-dict', - 'arg-list', - 'getitem', - 'attribute', - ]) + "simple", + "keywords", + "mixed", + "arg-dict", + "keyarg-dict", + "arg-list", + "getitem", + "attribute", + ], +) def test_find_inputs(code, expected): - atom_exp = testutils.get_first_leaf_with_value(code, 'name').parent + atom_exp = testutils.get_first_leaf_with_value(code, "name").parent assert io.find_inputs(atom_exp) == expected -@pytest.mark.parametrize('code, expected', [ - ['name["a"]', {'name'}], - ['name.atribute', {'name'}], - ['name', set()], -], - ids=[ - 'getitem', - 'attribute', - 'name', - ]) +@pytest.mark.parametrize( + "code, expected", + [ + ['name["a"]', {"name"}], + ["name.atribute", {"name"}], + ["name", set()], + ], + ids=[ + "getitem", + "attribute", + "name", + ], +) def test_find_inputs_only_getitem_and_attribute_access(code, expected): - atom_exp = testutils.get_first_leaf_with_value(code, 'name').parent + atom_exp = testutils.get_first_leaf_with_value(code, "name").parent out = io.find_inputs(atom_exp, only_getitem_and_attribute_access=True) assert out == expected -@pytest.mark.parametrize('code, 
expected', [ - ['[x for x in df["some_key"]]', {'df'}], - ['[x for x in df["some_key"]["another_key"]]', {'df'}], -], - ids=[ - 'getitem', - 'getitem-nested', - ]) +@pytest.mark.parametrize( + "code, expected", + [ + ['[x for x in df["some_key"]]', {"df"}], + ['[x for x in df["some_key"]["another_key"]]', {"df"}], + ], + ids=[ + "getitem", + "getitem-nested", + ], +) def test_find_inputs_only_getitem_and_attribute_access_list_comprehension( - code, expected): - out = io.find_inputs(parso.parse(code), - only_getitem_and_attribute_access=True) + code, expected +): + out = io.find_inputs(parso.parse(code), only_getitem_and_attribute_access=True) assert out == expected -@pytest.mark.parametrize('code, expected, index', [ - ['x = df.something', {'df'}, 0], - ['x = df.something.another', {'df'}, 0], - ['x = df.something()', {'df'}, 0], - ['x = df["column"]', {'df'}, 0], - ['x = df[another]', {'df', 'another'}, 0], - ['x = df[function(another)]', {'df', 'function', 'another'}, 0], - ['df = load()\nx = df + another', {'load'}, 0], - ['x = y + z', {'y', 'z'}, 0], - ['x = a + b + c + d', {'a', 'b', 'c', 'd'}, 0], - ['x = [y for y in range(10)]', set(), 0], - ['x = np.std([y for y in range(10)])', {'np'}, 0], -], - ids=[ - 'attribute', - 'attribute-nested', - 'method', - 'getitem-literal', - 'getitem-variable', - 'getitem-nested', - 'multiline', - 'expression', - 'expression-long', - 'list-comprehension', - 'list-comprehension-as-arg', - ]) +@pytest.mark.parametrize( + "code, expected, index", + [ + ["x = df.something", {"df"}, 0], + ["x = df.something.another", {"df"}, 0], + ["x = df.something()", {"df"}, 0], + ['x = df["column"]', {"df"}, 0], + ["x = df[another]", {"df", "another"}, 0], + ["x = df[function(another)]", {"df", "function", "another"}, 0], + ["df = load()\nx = df + another", {"load"}, 0], + ["x = y + z", {"y", "z"}, 0], + ["x = a + b + c + d", {"a", "b", "c", "d"}, 0], + ["x = [y for y in range(10)]", set(), 0], + ["x = np.std([y for y in range(10)])", {"np"}, 0], + ], + ids=[ + "attribute", + "attribute-nested", + "method", + "getitem-literal", + "getitem-variable", + "getitem-nested", + "multiline", + "expression", + "expression-long", + "list-comprehension", + "list-comprehension-as-arg", + ], +) def test_find_inputs_with_atom_expr(code, expected, index): atom_exp = get_first_sibling_after_assignment(code, index=index) assert io.find_inputs(atom_exp) == expected -@pytest.mark.parametrize('code, expected', [ - ['[x for x in range(10)]', set()], - ['[f"{x}" for x in range(10)]', set()], - ['(x for x in range(10))', set()], - ['[function(x) for x in range(10)]', {'function'}], - ['[(x, y) for x, y in something(10)]', {'something'}], - ['[x.attribute for x in range(10)]', - set()], - ['[x for x in obj if x > 0]', {'obj'}], - ['[i for row in matrix for i in row]', {'matrix'}], - ['[i for matrix in tensor for row in matrix for i in row]', {'tensor'}], -], - ids=[ - 'left-expression', - 'f-string', - 'generator', - 'both-expressions', - 'many-variables', - 'attributes', - 'conditional', - 'nested', - 'nested-double', - ]) +@pytest.mark.parametrize( + "code, expected", + [ + ["[x for x in range(10)]", set()], + ['[f"{x}" for x in range(10)]', set()], + ["(x for x in range(10))", set()], + ["[function(x) for x in range(10)]", {"function"}], + ["[(x, y) for x, y in something(10)]", {"something"}], + ["[x.attribute for x in range(10)]", set()], + ["[x for x in obj if x > 0]", {"obj"}], + ["[i for row in matrix for i in row]", {"matrix"}], + ["[i for matrix in tensor for row in matrix for i in 
row]", {"tensor"}], + ], + ids=[ + "left-expression", + "f-string", + "generator", + "both-expressions", + "many-variables", + "attributes", + "conditional", + "nested", + "nested-double", + ], +) def test_find_list_comprehension_inputs(code, expected): tree = parso.parse(code) list_comp = tree.children[0].children[1] assert io.find_comprehension_inputs(list_comp) == expected -@pytest.mark.parametrize('code, expected', [ - ['for i in range(10):\n y = x + i', {'i'}], - ['for i, j in something():\n y = x + i', {'i', 'j'}], - ['def function(i):\n y = x + i', {'i'}], - ['def function(i, j):\n y = x + i + j', {'i', 'j'}], -], - ids=[ - 'for', - 'for-many', - 'def', - 'def-many', - ]) +@pytest.mark.parametrize( + "code, expected", + [ + ["for i in range(10):\n y = x + i", {"i"}], + ["for i, j in something():\n y = x + i", {"i", "j"}], + ["def function(i):\n y = x + i", {"i"}], + ["def function(i, j):\n y = x + i + j", {"i", "j"}], + ], + ids=[ + "for", + "for-many", + "def", + "def-many", + ], +) def test_get_local_scope(code, expected): - node = testutils.get_first_leaf_with_value(code, 'x') + node = testutils.get_first_leaf_with_value(code, "x") assert io.get_local_scope(node) == expected -@pytest.mark.parametrize('code, expected', [ - ['a = 1', set()], - ['a, b = 1, 2', set()], - ['i = 1', {'i'}], - ['a, i = 1, 2', {'i'}], - ['i, b = 1, 2', {'i'}], - ['(a, i) = 1, 2', {'i'}], - ['(i, b) = 1, 2', {'i'}], - ['[a, i] = 1, 2', {'i'}], - ['[i, b] = 1, 2', {'i'}], - ['[i["key"], b] = 1, 2', {'i'}], - ['[i.attribute, b] = 1, 2', {'i'}], - ['[i[key], b] = 1, 2', {'i'}], - ['(i, (j, a)) = 1, (2, 3)', {'i', 'j'}], - ['(i, (j, (k, a))) = 1, (2, (3, 4))', {'i', 'j', 'k'}], -]) +@pytest.mark.parametrize( + "code, expected", + [ + ["a = 1", set()], + ["a, b = 1, 2", set()], + ["i = 1", {"i"}], + ["a, i = 1, 2", {"i"}], + ["i, b = 1, 2", {"i"}], + ["(a, i) = 1, 2", {"i"}], + ["(i, b) = 1, 2", {"i"}], + ["[a, i] = 1, 2", {"i"}], + ["[i, b] = 1, 2", {"i"}], + ['[i["key"], b] = 1, 2', {"i"}], + ["[i.attribute, b] = 1, 2", {"i"}], + ["[i[key], b] = 1, 2", {"i"}], + ["(i, (j, a)) = 1, (2, 3)", {"i", "j"}], + ["(i, (j, (k, a))) = 1, (2, (3, 4))", {"i", "j", "k"}], + ], +) def test_get_modified_objects(code, expected): - leaf = testutils.get_first_leaf_with_value(code, '=') - assert (io._get_modified_objects(leaf, {'i', 'j', 'k'}, set()) == expected) + leaf = testutils.get_first_leaf_with_value(code, "=") + assert io._get_modified_objects(leaf, {"i", "j", "k"}, set()) == expected diff --git a/tests/test_io_find_inputs_and_outputs.py b/tests/test_io_find_inputs_and_outputs.py index 8acfa0b..7468a8a 100644 --- a/tests/test_io_find_inputs_and_outputs.py +++ b/tests/test_io_find_inputs_and_outputs.py @@ -436,212 +436,184 @@ def some_method(self, a, b=0): @pytest.mark.parametrize( - 'code_str, inputs, outputs', [ - [only_outputs, set(), {'x', 'y'}], - [simple, {'x', 'y'}, {'z'}], - [local_inputs, set(), {'x', 'y', 'z'}], - [imports, set(), {'z'}], - [imported_function, set(), {'df'}], - [input_in_function_call, {'df'}, set()], - [input_key_in_function_call, {'df'}, - set()], - [input_key_in_function_call_many, {'df', 'df_another'}, - set()], - [input_key_in_function_call_with_dot_access, {'df'}, - set()], - [modify_existing_obj_getitem, - set(), {'mapping'}], - [modify_imported_obj_getitem, - set(), set()], - [built_in, set(), {'mapping'}], - [built_in_as_arg, set(), {'something'}], - [input_existing_object, set(), {'X'}], - [define_multiple_outputs, - set(), {'a', 'b', 'c'}], - 
[define_multiple_outputs_square_brackets, - set(), {'a', 'b', 'c'}], - [define_multiple_outputs_parenthesis, - set(), {'a', 'b', 'c'}], - [define_multiple_outputs_inside_function, - set(), set()], + "code_str, inputs, outputs", + [ + [only_outputs, set(), {"x", "y"}], + [simple, {"x", "y"}, {"z"}], + [local_inputs, set(), {"x", "y", "z"}], + [imports, set(), {"z"}], + [imported_function, set(), {"df"}], + [input_in_function_call, {"df"}, set()], + [input_key_in_function_call, {"df"}, set()], + [input_key_in_function_call_many, {"df", "df_another"}, set()], + [input_key_in_function_call_with_dot_access, {"df"}, set()], + [modify_existing_obj_getitem, set(), {"mapping"}], + [modify_imported_obj_getitem, set(), set()], + [built_in, set(), {"mapping"}], + [built_in_as_arg, set(), {"something"}], + [input_existing_object, set(), {"X"}], + [define_multiple_outputs, set(), {"a", "b", "c"}], + [define_multiple_outputs_square_brackets, set(), {"a", "b", "c"}], + [define_multiple_outputs_parenthesis, set(), {"a", "b", "c"}], + [define_multiple_outputs_inside_function, set(), set()], [ define_multiple_replace_existing, set(), - {'b', 'c'}, + {"b", "c"}, ], - [local_function, set(), {'y'}], - [local_function_with_args, set(), {'y'}], + [local_function, set(), {"y"}], + [local_function_with_args, set(), {"y"}], [ local_function_with_args_and_body, set(), - {'y'}, + {"y"}, ], [ local_function_with_kwargs, set(), - {'y'}, + {"y"}, ], - [local_class, set(), {'y'}], - [for_loop, {'z'}, {'y'}], - [for_loop_many, set(), {'y'}], - [for_loop_names_with_parenthesis, - set(), {'x'}], + [local_class, set(), {"y"}], + [for_loop, {"z"}, {"y"}], + [for_loop_many, set(), {"y"}], + [for_loop_names_with_parenthesis, set(), {"x"}], [for_loop_nested, set(), set()], - [for_loop_nested_dependent, set(), - set()], + [for_loop_nested_dependent, set(), set()], [for_loop_name_reference, set(), set()], - [for_loop_with_input, {'some_input' - }, set()], - [for_loop_with_local_input, - set(), {'some_variable'}], - [for_loop_with_input_attribute, - {'some_input'}, set()], - [for_loop_with_input_nested_attribute, - {'some_input' - }, set()], - [for_loop_with_input_and_getitem, - {'some_input' - }, set()], - [ - for_loop_with_input_and_getitem_input, {'some_input', 'some_key'}, - set() - ], - [for_loop_with_input_and_nested_getitem, - {'some_input' - }, set()], - [for_loop_with_nested_input, - {'some_input'}, set()], - [getitem_input, {'df'}, set()], - [method_access_input, {'df'}, set()], - [overriding_name, {'x', 'y'}, {'x', 'y'}], - [list_comprehension, {'x'}, set()], - [list_comprehension_attributes, - {'x'}, set()], - [list_comprehension_with_conditional, {'df'}, {'selected', 'targets'}], + [for_loop_with_input, {"some_input"}, set()], + [for_loop_with_local_input, set(), {"some_variable"}], + [for_loop_with_input_attribute, {"some_input"}, set()], + [for_loop_with_input_nested_attribute, {"some_input"}, set()], + [for_loop_with_input_and_getitem, {"some_input"}, set()], + [for_loop_with_input_and_getitem_input, {"some_input", "some_key"}, set()], + [for_loop_with_input_and_nested_getitem, {"some_input"}, set()], + [for_loop_with_nested_input, {"some_input"}, set()], + [getitem_input, {"df"}, set()], + [method_access_input, {"df"}, set()], + [overriding_name, {"x", "y"}, {"x", "y"}], + [list_comprehension, {"x"}, set()], + [list_comprehension_attributes, {"x"}, set()], + [list_comprehension_with_conditional, {"df"}, {"selected", "targets"}], [ list_comprehension_with_conditional_and_local_variable, - set(), {'df', 'features'} + 
set(), + {"df", "features"}, ], - [list_comprehension_with_f_string, - set(), set()], - [list_comprehension_with_f_string_assignment, - set(), {'y'}], - [list_comprehension_nested, {'reduced_cats'}, {'out'}], + [list_comprehension_with_f_string, set(), set()], + [list_comprehension_with_f_string_assignment, set(), {"y"}], + [list_comprehension_nested, {"reduced_cats"}, {"out"}], [ list_comprehension_nested_another, set(), - {'out'}, + {"out"}, ], [ list_comprehension_nested_more, set(), - {'out'}, + {"out"}, ], [ list_comprehension_with_left_input, - {'y'}, + {"y"}, set(), ], - [set_comprehension, {'numbers'}, {'output'}], - [dict_comprehension, {'numbers', 'y'}, {'output'}], - [dict_comprehension_zip, {'y'}, {'output'}], - [function_with_global_variable, - {'b'}, set()], - [mutating_input, {'df'}, {'df'}], - [mutating_input_implicit, {'df'}, {'df'}], - [function_mutating_local_object, - set(), set()], - [nested_function_arg, {'y'}, set()], - [nested_function_kwarg, {'y'}, set()], - [context_manager, set(), {'x'}], - [ - f_string, {'some_variable', 'a_number', 'an_object', 'another'}, - set() - ], + [set_comprehension, {"numbers"}, {"output"}], + [dict_comprehension, {"numbers", "y"}, {"output"}], + [dict_comprehension_zip, {"y"}, {"output"}], + [function_with_global_variable, {"b"}, set()], + [mutating_input, {"df"}, {"df"}], + [mutating_input_implicit, {"df"}, {"df"}], + [function_mutating_local_object, set(), set()], + [nested_function_arg, {"y"}, set()], + [nested_function_kwarg, {"y"}, set()], + [context_manager, set(), {"x"}], + [f_string, {"some_variable", "a_number", "an_object", "another"}, set()], [ f_string_assignment, - {'some_variable', 'a_number', 'an_object', 'another'}, {'s'} + {"some_variable", "a_number", "an_object", "another"}, + {"s"}, ], - [class_, set(), {'some_object'}], + [class_, set(), {"some_object"}], [lambda_, set(), set()], - [lambda_with_input, {'y'}, set()], + [lambda_with_input, {"y"}, set()], [lambda_as_arg, set(), set()], - [lambda_assignment, set(), {'out'}], - [lambda_with_input_assignment, {'y'}, {'out'}], - [lambda_as_arg_assignment, set(), {'out'}], + [lambda_assignment, set(), {"out"}], + [lambda_with_input_assignment, {"y"}, {"out"}], + [lambda_as_arg_assignment, set(), {"out"}], ], ids=[ - 'only_outputs', - 'simple', - 'local_inputs', - 'imports', - 'imported_function', - 'input_in_function_call', - 'input_key_in_function_call', - 'input_key_in_function_call_many', - 'input_key_in_function_call_with_dot_access', - 'modify_existing_getitem', - 'modify_imported_getitem', - 'built_in', - 'built_in_as_arg', - 'input_existing_object', - 'define_multiple_outputs', - 'define_multiple_outputs_square_brackets', - 'define_multiple_outputs_parenthesis', - 'define_multiple_outputs_inside_function', - 'define_multiple_replace_existing', - 'local_function', - 'local_function_with_args', - 'local_function_with_args_and_body', - 'local_function_with_kwargs', - 'local_class', - 'for_loop', - 'for_loop_many', - 'for_loop_names_with_parenthesis', - 'for_loop_nested', - 'for_loop_nested_dependent', - 'for_loop_name_reference', - 'for_loop_with_input', - 'for_loop_with_local_input', - 'for_loop_with_input_attribute', - 'for_loop_with_input_nested_attribute', - 'for_loop_with_input_and_getitem', - 'for_loop_with_input_and_getitem_input', - 'for_loop_with_input_and_nested_getitem', - 'for_loop_with_nested_input', - 'getitem_input', - 'method_access_input', - 'overriding_name', - 'list_comprehension', - 'list_comprehension_attributes', - 
- 'list_comprehension_with_conditional_and_local_variable',
- 'list_comprehension_with_f_string',
- 'list_comprehension_with_f_string_assignment',
- 'list_comprehension_nested',
- 'list_comprehension_nested_another',
- 'list_comprehension_nested_more',
- 'list_comprehension_with_left_input',
- 'set_comprehension',
- 'dict_comprehension',
- 'dict_comprehension_zip',
- 'function_with_global_variable',
- 'mutating_input',
- 'mutating_input_implicit',
- 'function_mutating_local_object',
- 'nested_function_arg',
- 'nested_function_kwarg',
- 'context_manager',
- 'f_string',
- 'f_string_assignment',
- 'class_',
- 'lambda_',
- 'lambda_with_input',
- 'lambda_as_arg',
- 'lambda_assignment',
- 'lambda_with_input_assignment',
- 'lambda_as_arg_assignment',
- ])
+ "only_outputs",
+ "simple",
+ "local_inputs",
+ "imports",
+ "imported_function",
+ "input_in_function_call",
+ "input_key_in_function_call",
+ "input_key_in_function_call_many",
+ "input_key_in_function_call_with_dot_access",
+ "modify_existing_getitem",
+ "modify_imported_getitem",
+ "built_in",
+ "built_in_as_arg",
+ "input_existing_object",
+ "define_multiple_outputs",
+ "define_multiple_outputs_square_brackets",
+ "define_multiple_outputs_parenthesis",
+ "define_multiple_outputs_inside_function",
+ "define_multiple_replace_existing",
+ "local_function",
+ "local_function_with_args",
+ "local_function_with_args_and_body",
+ "local_function_with_kwargs",
+ "local_class",
+ "for_loop",
+ "for_loop_many",
+ "for_loop_names_with_parenthesis",
+ "for_loop_nested",
+ "for_loop_nested_dependent",
+ "for_loop_name_reference",
+ "for_loop_with_input",
+ "for_loop_with_local_input",
+ "for_loop_with_input_attribute",
+ "for_loop_with_input_nested_attribute",
+ "for_loop_with_input_and_getitem",
+ "for_loop_with_input_and_getitem_input",
+ "for_loop_with_input_and_nested_getitem",
+ "for_loop_with_nested_input",
+ "getitem_input",
+ "method_access_input",
+ "overriding_name",
+ "list_comprehension",
+ "list_comprehension_attributes",
+ "list_comprehension_with_conditional",
+ "list_comprehension_with_conditional_and_local_variable",
+ "list_comprehension_with_f_string",
+ "list_comprehension_with_f_string_assignment",
+ "list_comprehension_nested",
+ "list_comprehension_nested_another",
+ "list_comprehension_nested_more",
+ "list_comprehension_with_left_input",
+ "set_comprehension",
+ "dict_comprehension",
+ "dict_comprehension_zip",
+ "function_with_global_variable",
+ "mutating_input",
+ "mutating_input_implicit",
+ "function_mutating_local_object",
+ "nested_function_arg",
+ "nested_function_kwarg",
+ "context_manager",
+ "f_string",
+ "f_string_assignment",
+ "class_",
+ "lambda_",
+ "lambda_with_input",
+ "lambda_as_arg",
+ "lambda_assignment",
+ "lambda_with_input_assignment",
+ "lambda_as_arg_assignment",
+ ],
+)
def test_find_inputs_and_outputs(code_str, inputs, outputs):
 in_, out = io.find_inputs_and_outputs(code_str)
diff --git a/tests/test_magics.py b/tests/test_magics.py
index e8c10e8..ec2d297 100644
--- a/tests/test_magics.py
+++ b/tests/test_magics.py
@@ -27,105 +27,116 @@
"""
-@pytest.mark.parametrize('source, expected', [
- ['%%html\na\nb', '# [magic] %%html\n# [magic] a\n# [magic] b'],
- ['%%capture\na\nb', '# [magic] %%capture\na\nb'],
- ['%%timeit\na\nb', '# [magic] %%timeit\na\nb'],
- ['%%time\na\nb', '# [magic] %%time\na\nb'],
- ['%time 1\n2\n%time 3', '# [magic] %time 1\n2\n# [magic] %time 3'],
-],
- ids=[
- 'another-language',
- 'inline-python',
- 'inline-python-2',
- 'inline-python-3',
- 'line-magics',
- ])
+@pytest.mark.parametrize(
+ "source, expected",
+ [
+ ["%%html\na\nb", "# [magic] %%html\n# [magic] a\n# [magic] b"],
+ ["%%capture\na\nb", "# [magic] %%capture\na\nb"],
+ ["%%timeit\na\nb", "# [magic] %%timeit\na\nb"],
+ ["%%time\na\nb", "# [magic] %%time\na\nb"],
+ ["%time 1\n2\n%time 3", "# [magic] %time 1\n2\n# [magic] %time 3"],
+ ],
+ ids=[
+ "another-language",
+ "inline-python",
+ "inline-python-2",
+ "inline-python-3",
+ "line-magics",
+ ],
+)
def test_comment_if_ipython_magic(source, expected):
 assert magics._comment_if_ipython_magic(source) == expected
def test_comment_magics():
- nb = jupytext.reads(source, fmt='py:light')
+ nb = jupytext.reads(source, fmt="py:light")
 nb_new = magics.comment_magics(nb)
- assert [c['source'] for c in nb_new.cells] == [
- '## first',
- '# [magic] %%bash\n# [magic] ls',
- '# [magic] %%html\n# [magic] <br>hi',
- '## second',
- '# [magic] %timeit 1 + 1',
- '# [magic] %cd x',
+ assert [c["source"] for c in nb_new.cells] == [
+ "## first",
+ "# [magic] %%bash\n# [magic] ls",
+ "# [magic] %%html\n# [magic] <br>hi",
+ "## second",
+ "# [magic] %timeit 1 + 1",
+ "# [magic] %cd x",
 "# [magic] %%capture\nprint('x')",
- '# [magic] ! echo hello',
+ "# [magic] ! echo hello",
 ]
-@pytest.mark.parametrize('line, expected', [
- ['# [magic] %%timeit something()', '%%timeit something()'],
- ['# [magic] %timeit something()', '%timeit something()'],
- ['# [magic] %some_magic another()', '%some_magic another()'],
-])
+@pytest.mark.parametrize(
+ "line, expected",
+ [
+ ["# [magic] %%timeit something()", "%%timeit something()"],
+ ["# [magic] %timeit something()", "%timeit something()"],
+ ["# [magic] %some_magic another()", "%some_magic another()"],
+ ],
+)
def test_uncomment_magic(line, expected):
 assert magics._uncomment_magic(line) == expected
-@pytest.mark.parametrize('line, expected', [
- ['# [magic] %%timeit something()', False],
- ['something() # [magic] %timeit', ('something()', '%timeit')],
- ['another() # [magic] %time', ('another()', '%time')],
-])
+@pytest.mark.parametrize(
+ "line, expected",
+ [
+ ["# [magic] %%timeit something()", False],
+ ["something() # [magic] %timeit", ("something()", "%timeit")],
+ ["another() # [magic] %time", ("another()", "%time")],
+ ],
+)
def test_is_commented_line_magic(line, expected):
 assert magics._is_commented_line_magic(line) == expected
def test_uncomment_magics_cell():
- nb = jupytext.reads(source, fmt='py:light')
+ nb = jupytext.reads(source, fmt="py:light")
 nb_new = magics.comment_magics(nb)
- assert [magics._uncomment_magics_cell(c['source'])
- for c in nb_new.cells] == [
- '## first',
- '%%bash\nls',
- '%%html\n<br>hi',
- '## second',
- '%timeit 1 + 1',
- '%cd x',
- "%%capture\nprint('x')",
- '! echo hello',
- ]
+ assert [magics._uncomment_magics_cell(c["source"]) for c in nb_new.cells] == [
+ "## first",
+ "%%bash\nls",
+ "%%html\n<br>hi",
+ "## second",
+ "%timeit 1 + 1",
+ "%cd x",
+ "%%capture\nprint('x')",
+ "! echo hello",
+ ]
def test_uncomment_magics():
- nb = jupytext.reads(source, fmt='py:light')
+ nb = jupytext.reads(source, fmt="py:light")
 nb_new = magics.comment_magics(nb)
 nb_out = magics.uncomment_magics(nb_new)
- assert [c['source'] for c in nb_out.cells] == [
- '## first',
- '%%bash\nls',
- '%%html\n<br>hi',
- '## second',
- '%timeit 1 + 1',
- '%cd x',
+ assert [c["source"] for c in nb_out.cells] == [
+ "## first",
+ "%%bash\nls",
+ "%%html\n<br>hi",
+ "## second",
+ "%timeit 1 + 1",
+ "%cd x",
 "%%capture\nprint('x')",
- '! echo hello',
+ "! echo hello",
 ]
-@pytest.mark.parametrize('line, expected', [
- ['%timeit x = 1', 'x = 1 # [magic] %timeit'],
- ['%time x = 1', 'x = 1 # [magic] %time'],
- [' %time x = 1', 'x = 1 # [magic] %time'],
-],
- ids=[
- 'time',
- 'timeit',
- 'leading-whitespace',
- ])
+@pytest.mark.parametrize(
+ "line, expected",
+ [
+ ["%timeit x = 1", "x = 1 # [magic] %timeit"],
+ ["%time x = 1", "x = 1 # [magic] %time"],
+ [" %time x = 1", "x = 1 # [magic] %time"],
+ ],
+ ids=[
+ "time",
+ "timeit",
+ "leading-whitespace",
+ ],
+)
def test_comment_ipython_line_magic(line, expected):
 magic = magics._is_ipython_line_magic(line)
 assert magics._comment_ipython_line_magic(line, magic) == expected
diff --git a/tests/test_overriding_var.py b/tests/test_overriding_var.py
index 0b0c1d1..0433402 100644
--- a/tests/test_overriding_var.py
+++ b/tests/test_overriding_var.py
@@ -53,8 +53,8 @@
def test_overriding_same_cell(nb):
 export.from_nb(_read(nb))
- dag = DAGSpec('pipeline.yaml').to_dag().render()
- assert set(dag['loading-df'].upstream) == set()
+ dag = DAGSpec("pipeline.yaml").to_dag().render()
+ assert set(dag["loading-df"].upstream) == set()
 assert set(dag["reload-df-and-load-df-2"].upstream) == set()
@@ -62,8 +62,8 @@ def test_overriding_same_cell(nb):
def test_overriding_diff_cell(nb):
 export.from_nb(_read(nb))
- dag = DAGSpec('pipeline.yaml').to_dag().render()
+ dag = DAGSpec("pipeline.yaml").to_dag().render()
- assert set(dag['load-df'].upstream) == set()
- assert set(dag['load-df-again'].upstream) == set()
- assert set(dag['load-df-2'].upstream) == {'load-df-again'}
+ assert set(dag["load-df"].upstream) == set()
+ assert set(dag["load-df-again"].upstream) == set()
+ assert set(dag["load-df-2"].upstream) == {"load-df-again"}
diff --git a/tests/test_proto.py b/tests/test_proto.py
index 642388f..df8b0fe 100644
--- a/tests/test_proto.py
+++ b/tests/test_proto.py
@@ -6,35 +6,42 @@
# TODO: do we need roundtrip conversion? we'll only use this for static analysis
# so i think we're fine
-mixed_expected = '1 + 1 # Cell 1\n2 + 2 # Cell 3'
+mixed_expected = "1 + 1 # Cell 1\n2 + 2 # Cell 3"
-@pytest.mark.parametrize('code, expected', [
- [mixed, mixed_expected],
-])
+@pytest.mark.parametrize(
+ "code, expected",
+ [
+ [mixed, mixed_expected],
+ ],
+)
def test_prototask_str(code, expected):
- assert str(
- proto.ProtoTask('name',
- _read(code).cells,
- df_format=None,
- serializer=None,
- py=True)) == expected
+ assert (
+ str(
+ proto.ProtoTask(
+ "name", _read(code).cells, df_format=None, serializer=None, py=True
+ )
+ )
+ == expected
+ )
-@pytest.mark.parametrize('cells_idx, expected', [
- [(0, 3), 'from sklearn.datasets import load_iris'],
-])
+@pytest.mark.parametrize(
+ "cells_idx, expected",
+ [
+ [(0, 3), "from sklearn.datasets import load_iris"],
+ ],
+)
def test_prototask_add_imports_cell(cells_idx, expected):
- cells = jupytext.reads(exploratory,
- fmt='py:light').cells[cells_idx[0]:cells_idx[1]]
- pt = proto.ProtoTask('task',
- cells,
- df_format=None,
- serializer=None,
- py=True)
- cell = pt._add_imports_cell(exploratory,
- add_pathlib_and_pickle=False,
- definitions=None,
- df_format=None,
- serializer=None)
- assert cell['source'] == expected
+ cells = jupytext.reads(exploratory, fmt="py:light").cells[
+ cells_idx[0] : cells_idx[1]
+ ]
+ pt = proto.ProtoTask("task", cells, df_format=None, serializer=None, py=True)
+ cell = pt._add_imports_cell(
+ exploratory,
+ add_pathlib_and_pickle=False,
+ definitions=None,
+ df_format=None,
+ serializer=None,
+ )
+ assert cell["source"] == expected
diff --git a/tests/test_render_notebooks.py b/tests/test_render_notebooks.py
index ca7bc31..03161a1 100644
--- a/tests/test_render_notebooks.py
+++ b/tests/test_render_notebooks.py
@@ -1,6 +1,7 @@
"""
Get some notebooks from kaggle, refactor and render DAG
"""
+
from glob import glob
from pathlib import Path
@@ -10,8 +11,8 @@
from soorgeon import export
-_kaggle = Path(PATH_TO_TESTS, '..', '_kaggle', '_render')
-path_to_nbs = glob(str(Path(_kaggle, '*', '*.py')))
+_kaggle = Path(PATH_TO_TESTS, "..", "_kaggle", "_render")
+path_to_nbs = glob(str(Path(_kaggle, "*", "*.py")))
def get_name(path):
@@ -21,7 +22,7 @@ def get_name(path):
 names = [get_name(nb) for nb in path_to_nbs]
-@pytest.mark.parametrize('path', path_to_nbs, ids=names)
+@pytest.mark.parametrize("path", path_to_nbs, ids=names)
def test_notebooks(tmp_empty, path):
 export.from_path(path, py=True)
- DAGSpec('pipeline.yaml').to_dag().render()
+ DAGSpec("pipeline.yaml").to_dag().render()
diff --git a/tests/test_sample_notebooks.py b/tests/test_sample_notebooks.py
index 24590df..149794c 100644
--- a/tests/test_sample_notebooks.py
+++ b/tests/test_sample_notebooks.py
@@ -9,19 +9,21 @@
from soorgeon._pygithub import download_directory
-dir_names = ['titanic-logistic-regression-with-python',
- 'customer-segmentation-clustering',
- 'intro-to-time-series-forecasting',
- 'feature-selection-and-data-visualization',
- 'linear-regression-house-price-prediction',
- 'look-at-this-note-feature-engineering-is-easy']
+dir_names = [
+ "titanic-logistic-regression-with-python",
+ "customer-segmentation-clustering",
+ "intro-to-time-series-forecasting",
+ "feature-selection-and-data-visualization",
+ "linear-regression-house-price-prediction",
+ "look-at-this-note-feature-engineering-is-easy",
+]
-@pytest.mark.parametrize('dir', dir_names, ids=list(dir_names))
+@pytest.mark.parametrize("dir", dir_names, ids=list(dir_names))
def test_notebooks(tmp_empty, dir):
download_directory(dir)
 path = os.getcwd()
- export.from_path(Path(path, 'nb.py'), py=True)
+ export.from_path(Path(path, "nb.py"), py=True)
- dag = DAGSpec('pipeline.yaml').to_dag()
+ dag = DAGSpec("pipeline.yaml").to_dag()
 dag.build()
diff --git a/tests/test_split.py b/tests/test_split.py
index fe8ad01..fc252f7 100644
--- a/tests/test_split.py
+++ b/tests/test_split.py
@@ -86,22 +86,26 @@
# # Cell 4
"""
-no_h2_but_h1_headers_error = 'Only H1 headings are found. ' \
- 'At this time, only H2 headings are supported'
+no_h2_but_h1_headers_error = (
+ "Only H1 headings are found. " "At this time, only H2 headings are supported"
+)
-no_h1_and_h2_headers_error = 'Expected notebook to have at least one'
+no_h1_and_h2_headers_error = "Expected notebook to have at least one"
-only_one_h2_header_warning = 'Warning: refactoring successful ' \
- 'but only one H2 heading detected,'
+only_one_h2_header_warning = (
+ "Warning: refactoring successful " "but only one H2 heading detected,"
+)
# case where cell only has H2 and H2 + more stuff
# edge case: H1, then H2 with no code in between, we should ignore that break
-@pytest.mark.parametrize('md, expected_msg', [
- [only_one_h2, only_one_h2_header_warning],
- [only_one_h2_diff, only_one_h2_header_warning],
-
-])
+@pytest.mark.parametrize(
+ "md, expected_msg",
+ [
+ [only_one_h2, only_one_h2_header_warning],
+ [only_one_h2_diff, only_one_h2_header_warning],
+ ],
+)
def test_find_breaks_warnings(md, expected_msg, tmp_empty, capsys):
 nb = _read(md)
 split.find_breaks(nb)
@@ -109,12 +113,15 @@ def test_find_breaks_warnings(md, expected_msg, tmp_empty, capsys):
 assert expected_msg in captured.out
-@pytest.mark.parametrize('md, expected_msg', [
- [no_h2_but_h1_headers, no_h2_but_h1_headers_error],
- [no_markdown_but_json, no_h1_and_h2_headers_error],
- [no_markdown_but_plain_text, no_h1_and_h2_headers_error],
- [no_h1_and_h2_headers, no_h1_and_h2_headers_error],
-])
+@pytest.mark.parametrize(
+ "md, expected_msg",
+ [
+ [no_h2_but_h1_headers, no_h2_but_h1_headers_error],
+ [no_markdown_but_json, no_h1_and_h2_headers_error],
+ [no_markdown_but_plain_text, no_h1_and_h2_headers_error],
+ [no_h1_and_h2_headers, no_h1_and_h2_headers_error],
+ ],
+)
def test_find_breaks_errors(md, expected_msg, tmp_empty, capsys):
 nb = _read(md)
@@ -124,53 +131,68 @@ def test_find_breaks_errors(md, expected_msg, tmp_empty, capsys):
 assert expected_msg in str(excinfo.value)
-@pytest.mark.parametrize('md, expected', [
- ['## Header', 'header'],
- ['# H1\n## H2', 'h2'],
- [' ## H2', 'h2'],
- [' ### H3', None],
- ['something', None],
- ['## Something\nignore me', 'something'],
-])
+@pytest.mark.parametrize(
+ "md, expected",
+ [
+ ["## Header", "header"],
+ ["# H1\n## H2", "h2"],
+ [" ## H2", "h2"],
+ [" ### H3", None],
+ ["something", None],
+ ["## Something\nignore me", "something"],
+ ],
+)
def test_get_h2_header(md, expected):
 assert split._get_h2_header(md) == expected
-@pytest.mark.parametrize('md, expected', [
- ['# Header', 'header'],
- ['# H1\n## H2', 'h1'],
- [' \t # H1', 'h1'],
- [' ## H2', None],
- ['something', None],
- ['# Something\nignore me', 'something'],
-])
+@pytest.mark.parametrize(
+ "md, expected",
+ [
+ ["# Header", "header"],
+ ["# H1\n## H2", "h1"],
+ [" \t # H1", "h1"],
+ [" ## H2", None],
+ ["something", None],
+ ["# Something\nignore me", "something"],
+ ],
+)
def test_get_h1_header(md, expected):
 assert split._get_h1_header(md) == expected
-@pytest.mark.parametrize('nb_str, expected', [
- [mixed, [2, 4]],
- [long_md, [0, 2]],
- [h1_next_to_h2, [0, 2]],
- [exploratory, [0, 3, 5]], -]) +@pytest.mark.parametrize( + "nb_str, expected", + [ + [mixed, [2, 4]], + [long_md, [0, 2]], + [h1_next_to_h2, [0, 2]], + [exploratory, [0, 3, 5]], + ], +) def test_find_breaks(tmp_empty, nb_str, expected): assert split.find_breaks(_read(nb_str)) == expected -@pytest.mark.parametrize('cells, breaks, expected', [ - [[1, 2, 3, 4], [1], [[1, 2, 3, 4]]], - [[1, 2, 3, 4], [1, 2], [[1, 2], [3, 4]]], -]) +@pytest.mark.parametrize( + "cells, breaks, expected", + [ + [[1, 2, 3, 4], [1], [[1, 2, 3, 4]]], + [[1, 2, 3, 4], [1, 2], [[1, 2], [3, 4]]], + ], +) def test_split_with_breaks(cells, breaks, expected): assert split.split_with_breaks(cells, breaks) == expected -@pytest.mark.parametrize('nb_str, expected', [ - [all_h2, ['cell-0', 'cell-2', 'cell-4']], - [long_md, ['h2', 'another']], - [exploratory, ['load', 'clean', 'plot']], -]) +@pytest.mark.parametrize( + "nb_str, expected", + [ + [all_h2, ["cell-0", "cell-2", "cell-4"]], + [long_md, ["h2", "another"]], + [exploratory, ["load", "clean", "plot"]], + ], +) def test_names_with_breaks(tmp_empty, nb_str, expected): nb = _read(nb_str) @@ -184,16 +206,19 @@ def test_names_with_breaks(tmp_empty, nb_str, expected): # FIXME: ensure _add_imports_cell removes comments -@pytest.mark.parametrize('name, expected', [ - ['task', 'task'], - ['a task', 'a-task'], - ['a ta/sk', 'a-ta-sk'], - ['some_task', 'some-task'], - ['this & that', 'this-that'], - ['some-task', 'some-task'], - ['`some_function()`', '-some-function-'], - ['some.task', 'some-task'], - ['1.1 some stuff', 'section-1-1-some-stuff'], -]) +@pytest.mark.parametrize( + "name, expected", + [ + ["task", "task"], + ["a task", "a-task"], + ["a ta/sk", "a-ta-sk"], + ["some_task", "some-task"], + ["this & that", "this-that"], + ["some-task", "some-task"], + ["`some_function()`", "-some-function-"], + ["some.task", "some-task"], + ["1.1 some stuff", "section-1-1-some-stuff"], + ], +) def test_sanitize_name(name, expected): assert split._sanitize_name(name) == expected diff --git a/tests/testutils.py b/tests/testutils.py index ac84ed7..f5e5c3b 100644 --- a/tests/testutils.py +++ b/tests/testutils.py @@ -11,11 +11,11 @@ def get_first_leaf_with_value(code, value): leaf = leaf.get_next_leaf() - raise ValueError(f'could not find leaf with value {value}') + raise ValueError(f"could not find leaf with value {value}") def _read(nb_str): - return jupytext.reads(nb_str, fmt='py:light') + return jupytext.reads(nb_str, fmt="py:light") exploratory = """# # Exploratory data analysis