
Commit

delete telemetry (#93)
* remove telemetry

* formatting

* lint

* update

* test

* deletes integration tests
edublancas authored Sep 18, 2024
1 parent e1633df commit db21413
Showing 48 changed files with 3,110 additions and 2,790 deletions.
38 changes: 1 addition & 37 deletions .github/workflows/ci.yaml
@@ -12,9 +12,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
# python 3.6 throws an error:
# https://stackoverflow.com/q/69174965/709975
python-version: [3.7, 3.8, 3.9]
python-version: [3.9, "3.10"]

steps:
- uses: actions/checkout@v2
@@ -35,37 +33,3 @@ jobs:
- name: Unit tests
run: |
pytest --ignore=tests/test_sample_notebooks.py
integration-tests:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.9]
notebook:
- titanic-logistic-regression-with-python
- customer-segmentation-clustering
- intro-to-time-series-forecasting
- feature-selection-and-data-visualization
- linear-regression-house-price-prediction
- look-at-this-note-feature-engineering-is-easy
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Lint with flake8
run: |
pip install flake8
flake8
- name: Install dependencies
run: |
pip install ".[dev]"
- name: Install integration tests dependencies
run: |
pip install -r _kaggle/requirements.lock.txt
- name: Integration tests
env:
PLOOMBER_STATS_ENABLED: false
run: |
pytest "tests/test_sample_notebooks.py::test_notebooks[${{ matrix.notebook }}]"
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,8 @@

## 0.0.20dev

* Removes telemetry

## 0.0.19 (2022-08-30)
* Changes telemetry key

4 changes: 0 additions & 4 deletions README.md
@@ -161,7 +161,3 @@ Whatever your skillset is, you can contribute to our mission. So whether you're

[Click here to know how you can contribute to Ploomber.](https://github.com/ploomber/contributing/blob/main/README.md)


## Telemetry

We collect anonymous statistics to understand and improve usage. For details, [see here](https://docs.ploomber.io/en/latest/community/user-stats.html)
174 changes: 105 additions & 69 deletions _kaggle/_render/breast-cancer-diagnostic-classification/nb.py
@@ -24,7 +24,8 @@
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):

for dirname, _, filenames in os.walk("/kaggle/input"):
for filename in filenames:
print(os.path.join(dirname, filename))

@@ -35,7 +36,7 @@
import os
import warnings

warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore")
import time as t
import pandas as pd
import numpy as np
@@ -45,7 +46,17 @@
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score, precision_score, classification_report, roc_curve, auc, roc_auc_score
from sklearn.metrics import (
confusion_matrix,
accuracy_score,
f1_score,
recall_score,
precision_score,
classification_report,
roc_curve,
auc,
roc_auc_score,
)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
@@ -62,8 +73,7 @@


# %%
def data_load(
): #check for the availability of the dataset and change cwd if not found
def data_load(): # check for the availability of the dataset and change cwd if not found
df = pd.read_csv("../input/breast-cancer-prediction/data.csv")
return df

@@ -73,15 +83,15 @@ def data_clean(df):


def X_y_split(df):
X = df.drop(['diagnosis'], axis=1)
y = df['diagnosis']
X = df.drop(["diagnosis"], axis=1)
y = df["diagnosis"]
return X, y


def data_split_scale(X, y, sampling):
#Splitting dataset into Train and Test Set
# Splitting dataset into Train and Test Set
X_tr, X_test, y_tr, y_test = train_test_split(X, y, test_size=0.3)
#Feature Scaling using Standardization
# Feature Scaling using Standardization
ss = StandardScaler()
X_tr = ss.fit_transform(X_tr)
X_test = ss.fit_transform(X_test)
@@ -91,8 +101,7 @@ def data_split_scale(X, y, sampling):
samp_sel = int(input("Now enter your selection for sampling strategy: \t"))
samp = [sampling.upsample, sampling.downsample, sampling.smote]
temp = samp[samp_sel - 1]
X_train, y_train = temp(X_train=pd.DataFrame(X_tr),
y_train=pd.DataFrame(y_tr))
X_train, y_train = temp(X_train=pd.DataFrame(X_tr), y_train=pd.DataFrame(y_tr))
return pd.DataFrame(X_train), pd.DataFrame(X_test), y_train, y_test


@@ -107,29 +116,28 @@ def data_split_scale(X, y, sampling):
class sampling:

def upsample(X_train, y_train):
#combine them back for resampling
# combine them back for resampling
train_data = pd.concat([X_train, y_train], axis=1)
# separate minority and majority classes
negative = train_data[train_data.diagnosis == 0]
positive = train_data[train_data.diagnosis == 1]
# upsample minority
pos_upsampled = resample(positive,
replace=True,
n_samples=len(negative),
random_state=30)
pos_upsampled = resample(
positive, replace=True, n_samples=len(negative), random_state=30
)
# combine majority and upsampled minority
upsampled = pd.concat([negative, pos_upsampled])
# check new class counts
#print(upsampled.diagnosis.value_counts())
# print(upsampled.diagnosis.value_counts())
print(upsampled.diagnosis.value_counts())
upsampled = upsampled.sample(frac=1)
X_train = upsampled.iloc[:, 0:-2]
y_train = upsampled.iloc[:, -1]
#graph barplot counts
# graph barplot counts
return X_train, y_train

def downsample(X_train, y_train):
#combine them back for resampling
# combine them back for resampling
train_data = pd.concat([X_train, y_train], axis=1)
# separate minority and majority classes
negative = train_data[train_data.diagnosis == 0]
@@ -139,23 +147,24 @@ def downsample(X_train, y_train):
negative,
replace=True, # sample with replacement
n_samples=len(positive), # match number in minority class
random_state=30) # reproducible results
random_state=30,
) # reproducible results
# combine minority and downsampled majority
downsampled = pd.concat([positive, neg_downsampled])
downsampled = downsampled.sample(frac=1)
X_train = downsampled.iloc[:, 0:-2]
y_train = downsampled.iloc[:, -1]
# check new class counts
print(downsampled.diagnosis.value_counts())
#graph
# graph
return X_train, y_train

def smote(X_train, y_train):
sm = SMOTE(random_state=30)
X_train, y_train = sm.fit_resample(X_train, y_train)
y_train = pd.DataFrame(y_train, columns=['diagnosis'])
y_train = pd.DataFrame(y_train, columns=["diagnosis"])
print(y_train.diagnosis.value_counts())
#graph
# graph
return X_train, y_train
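
As an aside, the three balancing strategies implemented above (upsampling, downsampling, and SMOTE) can be exercised in isolation. The sketch below is illustrative only and is not part of the commit; the toy DataFrame and its columns are made up to stand in for the notebook's breast-cancer data.

    import pandas as pd
    from sklearn.utils import resample
    from imblearn.over_sampling import SMOTE

    # Toy data: 8 negative and 2 positive rows (hypothetical, for illustration only).
    train_data = pd.DataFrame({"radius_mean": range(10), "diagnosis": [0] * 8 + [1] * 2})
    negative = train_data[train_data.diagnosis == 0]
    positive = train_data[train_data.diagnosis == 1]

    # Upsample the minority class with replacement until it matches the majority class.
    pos_upsampled = resample(positive, replace=True, n_samples=len(negative), random_state=30)
    upsampled = pd.concat([negative, pos_upsampled]).sample(frac=1)
    print(upsampled.diagnosis.value_counts())

    # SMOTE synthesizes new minority samples instead of repeating existing rows.
    X = train_data.drop(columns="diagnosis")
    y = train_data["diagnosis"]
    X_res, y_res = SMOTE(random_state=30, k_neighbors=1).fit_resample(X, y)
    print(pd.Series(y_res).value_counts())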


@@ -207,10 +216,20 @@ def feat5():
df = data_load() # Loading Dataset into Dataframe
df = data_clean(df)
drop_cols = [
'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
'symmetry_worst', 'fractal_dimension_worst', 'perimeter_mean',
'perimeter_se', 'area_mean', 'area_se', 'concavity_mean',
'concavity_se', 'concave points_mean', 'concave points_se'
"radius_worst",
"texture_worst",
"perimeter_worst",
"area_worst",
"symmetry_worst",
"fractal_dimension_worst",
"perimeter_mean",
"perimeter_se",
"area_mean",
"area_se",
"concavity_mean",
"concavity_se",
"concave points_mean",
"concave points_se",
]
df_sf = df.drop(drop_cols, axis=1)
X, y = X_y_split(df_sf)
@@ -221,9 +240,7 @@ def feature():
"'\t The number '1' stands for 'ALL- FEATURES'. \n \t The number '2' stands for 'MEAN- FEATURES' . \n \t The number '3' stands for 'SQUARED- ERROR FEATURES'. \n \t The number '4' stands for 'WORST- FEATURES'. \n \t The number '5' stands for 'SELECTED- FEATURES'.'"
)
selection = input("\t Enter your choice of feature selection: \t")
feat_options = [
feat.feat1, feat.feat2, feat.feat3, feat.feat4, feat.feat5
]
feat_options = [feat.feat1, feat.feat2, feat.feat3, feat.feat4, feat.feat5]
return feat_options[int(selection) - 1]()


@@ -274,7 +291,7 @@ def rfc(dat):
def knn(dat):
# K-Nearest Neighbors
start = t.time()
knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
knn = KNeighborsClassifier(n_neighbors=5, metric="minkowski", p=2)
model_knn = knn.fit(dat[0], dat[2])
pred = model_knn.predict(dat[1])
pred_prob = model_knn.predict_proba(dat[1])
@@ -284,7 +301,7 @@ def knn(dat):
def svc_l(dat):
# Linear SVM
start = t.time()
svc_l = SVC(kernel='linear', random_state=0, probability=True)
svc_l = SVC(kernel="linear", random_state=0, probability=True)
model_svc_l = svc_l.fit(dat[0], dat[2])
pred = model_svc_l.predict(dat[1])
pred_prob = model_svc_l.predict_proba(dat[1])
@@ -294,7 +311,7 @@ def svc_l(dat):
def svc_r(dat):
# Kernel SVM
start = t.time()
svc_r = SVC(kernel='rbf', random_state=0, probability=True)
svc_r = SVC(kernel="rbf", random_state=0, probability=True)
model_svc_r = svc_r.fit(dat[0], dat[2])
pred = model_svc_r.predict(dat[1])
pred_prob = model_svc_r.predict_proba(dat[1])
@@ -320,8 +337,13 @@ def gnb(dat):
def train_n_test():
ft = feat.feature()
modelsss = [
models.lr, models.dtc, models.rfc, models.knn, models.svc_l,
models.svc_r, models.gnb
models.lr,
models.dtc,
models.rfc,
models.knn,
models.svc_l,
models.svc_r,
models.gnb,
]
print(
"'\t The number '1' stands for 'LOGISTIC REGRESSION'. \n \t The number '2' stands for 'Decision Tree' . \n \t The number '3' stands for 'Random Forest Classifier'. \n \t The number '4' stands for 'KNN'. \n \t The number '5' stands for 'Liner SVM'. \n \t The number '6' stands for 'Kernal SVM'. \n \t The number '7' stands for 'Guassian NB'.'"
@@ -339,9 +361,13 @@ def train_n_test():
def performance():
out, y_test, mdl_selection = train_n_test()
models = [
"Logistic Regression", "Desicion Tree Classifier",
"Random Forest Classifier", "KNN", "Liner SVM", "Kernal SVM",
"Guassian NB"
"Logistic Regression",
"Desicion Tree Classifier",
"Random Forest Classifier",
"KNN",
"Liner SVM",
"Kernal SVM",
"Guassian NB",
]
cm_lr = confusion_matrix(y_test, out[2])
sns.heatmap(cm_lr, annot=True, cmap="Reds")
@@ -350,50 +376,60 @@ def performance():
rs = recall_score(y_test, out[2])
fs = f1_score(y_test, out[2])
ps = precision_score(y_test, out[2])
#Report Bar Plot
report = pd.DataFrame(
classification_report(y_test, out[2], output_dict=True))
# Report Bar Plot
report = pd.DataFrame(classification_report(y_test, out[2], output_dict=True))
rg = report.drop(report.index[3]).drop(report.columns[2:], axis=1)
plt.style.use('seaborn')
rg.plot(kind='bar', color=["red", "salmon"])
plt.style.use("seaborn")
rg.plot(kind="bar", color=["red", "salmon"])
plt.title("Classification Report of {}".format(models[mdl_selection - 1]))
plt.legend(report.columns,
ncol=2,
loc="lower center",
bbox_to_anchor=(0.5, -0.3))
plt.legend(report.columns, ncol=2, loc="lower center", bbox_to_anchor=(0.5, -0.3))
plt.yticks(np.arange(0, 1.05, step=0.05))
print(
'\n\t The accuracy score of {} with given parameters is: {}%.'.format(
models[mdl_selection - 1], acs * 100))
print('\n\t The recall score of {} with given parameters is: {}%.'.format(
models[mdl_selection - 1], rs * 100))
"\n\t The accuracy score of {} with given parameters is: {}%.".format(
models[mdl_selection - 1], acs * 100
)
)
print(
"\n\t The recall score of {} with given parameters is: {}%.".format(
models[mdl_selection - 1], rs * 100
)
)
print(
"\n\t The precision score of {} with given parameters is: {}%.".format(
models[mdl_selection - 1], ps * 100
)
)
print(
'\n\t The precision score of {} with given parameters is: {}%.'.format(
models[mdl_selection - 1], ps * 100))
print('\n\t The F1 score of {} with given parameters is: {}%.'.format(
models[mdl_selection - 1], fs * 100))
"\n\t The F1 score of {} with given parameters is: {}%.".format(
models[mdl_selection - 1], fs * 100
)
)
print(
'\n\t The training and testing time taken by {} with given parameters is: {} seconds.'
.format(models[mdl_selection - 1], out[1]))
"\n\t The training and testing time taken by {} with given parameters is: {} seconds.".format(
models[mdl_selection - 1], out[1]
)
)
prob = out[3]
prob = prob[:, 1]
#ROC
# ROC
false_pos, true_pos, thresh = roc_curve(y_test, prob, pos_label=1)
auc_score = roc_auc_score(y_test, prob)
rand_pr = [0 for i in range(len(y_test))]
p_fpr, p_tpr, _ = roc_curve(y_test, rand_pr, pos_label=1)
plt.figure()
plt.style.use('seaborn')
plt.plot(false_pos,
true_pos,
linestyle='--',
color='orange',
label=models[mdl_selection - 1])
plt.plot(p_fpr, p_tpr, linestyle='--', color='green')
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='best')
plt.style.use("seaborn")
plt.plot(
false_pos,
true_pos,
linestyle="--",
color="orange",
label=models[mdl_selection - 1],
)
plt.plot(p_fpr, p_tpr, linestyle="--", color="green")
plt.title("ROC Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="best")

return out[0], out[2], auc_score
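
For reference, here is a minimal, self-contained sketch of the ROC/AUC calls used in the function above; the labels and predicted probabilities are invented for illustration and do not come from this repository.

    import numpy as np
    from sklearn.metrics import roc_curve, roc_auc_score

    # Hypothetical ground-truth labels and positive-class probabilities.
    y_test = np.array([0, 0, 1, 1, 0, 1])
    prob = np.array([0.10, 0.40, 0.35, 0.80, 0.20, 0.90])

    false_pos, true_pos, thresh = roc_curve(y_test, prob, pos_label=1)  # ROC points
    auc_score = roc_auc_score(y_test, prob)  # area under the ROC curve
    print(f"AUC: {auc_score:.3f}")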

(Diffs for the remaining 44 changed files are not shown here.)

