From b8b97e32891b4be89a1adc1b316b8ba7142f951f Mon Sep 17 00:00:00 2001 From: Bob Date: Sat, 4 Aug 2018 14:56:08 -0400 Subject: [PATCH] change as_matrix to values --- ab_testing/client.py | 4 ++-- ann_logistic_extra/process.py | 2 +- cnn_class2/class_activation_maps.py | 7 ++----- cnn_class2/fashion.py | 2 +- cnn_class2/fashion2.py | 2 +- hmm_class/hmmd.py | 4 ++++ linear_regression_class/systolic.py | 2 +- nlp_class/nb.py | 2 +- nlp_class/spam2.py | 5 +++-- nlp_class3/bilstm_mnist.py | 2 +- supervised_class/bayes.py | 7 +++++++ supervised_class/util.py | 2 +- supervised_class2/rf_classification.py | 4 ++-- supervised_class2/rf_regression.py | 8 ++++---- unsupervised_class/kmeans_mnist.py | 4 ++-- unsupervised_class2/util.py | 2 +- 16 files changed, 34 insertions(+), 25 deletions(-) diff --git a/ab_testing/client.py b/ab_testing/client.py index 8cedc4d9..1b130447 100644 --- a/ab_testing/client.py +++ b/ab_testing/client.py @@ -16,8 +16,8 @@ df = pd.read_csv('advertisement_clicks.csv') a = df[df['advertisement_id'] == 'A'] b = df[df['advertisement_id'] == 'B'] -a = a['action'].as_matrix() -b = b['action'].as_matrix() +a = a['action'].values +b = b['action'].values print("a.mean:", a.mean()) print("b.mean:", b.mean()) diff --git a/ann_logistic_extra/process.py b/ann_logistic_extra/process.py index 568ba107..785755b7 100644 --- a/ann_logistic_extra/process.py +++ b/ann_logistic_extra/process.py @@ -21,7 +21,7 @@ def get_data(): # df.head() # easier to work with numpy array - data = df.as_matrix() + data = df.values # shuffle it np.random.shuffle(data) diff --git a/cnn_class2/class_activation_maps.py b/cnn_class2/class_activation_maps.py index 951a4fc5..e85ab504 100644 --- a/cnn_class2/class_activation_maps.py +++ b/cnn_class2/class_activation_maps.py @@ -6,14 +6,10 @@ # Note: you may need to update your version of future # sudo pip install -U future -from keras.layers import Input, Lambda, Dense, Flatten from keras.models import Model from keras.applications.resnet50 import ResNet50, preprocess_input, decode_predictions -# from keras.applications.inception_v3 import InceptionV3, preprocess_input from keras.preprocessing import image -from keras.preprocessing.image import ImageDataGenerator -from sklearn.metrics import confusion_matrix import numpy as np import scipy as sp import matplotlib.pyplot as plt @@ -24,7 +20,7 @@ -# useful for getting number of files +# get the image files image_files = glob('../large_files/256_ObjectCategories/*/*.jp*g') image_files += glob('../large_files/101_ObjectCategories/*/*.jp*g') @@ -72,6 +68,7 @@ cam = fmaps.dot(w) # upsample to 224 x 224 + # 7 x 32 = 224 cam = sp.ndimage.zoom(cam, (32, 32), order=1) plt.subplot(1,2,1) diff --git a/cnn_class2/fashion.py b/cnn_class2/fashion.py index db845f8a..858be20e 100644 --- a/cnn_class2/fashion.py +++ b/cnn_class2/fashion.py @@ -26,7 +26,7 @@ def y2indicator(Y): # get the data # https://www.kaggle.com/zalando-research/fashionmnist data = pd.read_csv('../large_files/fashionmnist/fashion-mnist_train.csv') -data = data.as_matrix() +data = data.values np.random.shuffle(data) X = data[:, 1:].reshape(-1, 28, 28, 1) / 255.0 diff --git a/cnn_class2/fashion2.py b/cnn_class2/fashion2.py index 33231b66..4d2d22d6 100644 --- a/cnn_class2/fashion2.py +++ b/cnn_class2/fashion2.py @@ -26,7 +26,7 @@ def y2indicator(Y): # get the data # https://www.kaggle.com/zalando-research/fashionmnist data = pd.read_csv('../large_files/fashionmnist/fashion-mnist_train.csv') -data = data.as_matrix() +data = data.values np.random.shuffle(data) X = data[:, 1:].reshape(-1, 28, 28, 1) / 255.0 diff --git a/hmm_class/hmmd.py b/hmm_class/hmmd.py index 2da74796..d50d5832 100644 --- a/hmm_class/hmmd.py +++ b/hmm_class/hmmd.py @@ -10,6 +10,7 @@ import numpy as np import matplotlib.pyplot as plt +from datetime import datetime def random_normalized(d1, d2): @@ -22,6 +23,7 @@ def __init__(self, M): self.M = M # number of hidden states def fit(self, X, max_iter=30): + t0 = datetime.now() np.random.seed(123) # train the HMM model using the Baum-Welch algorithm # a specific instance of the expectation-maximization algorithm @@ -136,6 +138,8 @@ def fit(self, X, max_iter=30): print("B:", self.B) print("pi:", self.pi) + print("Fit duration:", (datetime.now() - t0)) + plt.plot(costs) plt.show() diff --git a/linear_regression_class/systolic.py b/linear_regression_class/systolic.py index fc71ee99..b7451837 100644 --- a/linear_regression_class/systolic.py +++ b/linear_regression_class/systolic.py @@ -20,7 +20,7 @@ import pandas as pd df = pd.read_excel('mlr02.xls') -X = df.as_matrix() +X = df.values # using age to predict systolic blood pressure plt.scatter(X[:,1], X[:,0]) diff --git a/nlp_class/nb.py b/nlp_class/nb.py index 2b16fc08..5d575915 100644 --- a/nlp_class/nb.py +++ b/nlp_class/nb.py @@ -18,7 +18,7 @@ # it will work for other types of "counts", like tf-idf, so it should # also work for our "word proportions" -data = pd.read_csv('spambase.data').as_matrix() # use pandas for convenience +data = pd.read_csv('spambase.data').values # use pandas for convenience np.random.shuffle(data) # shuffle each row in-place, but preserve the row X = data[:,:48] diff --git a/nlp_class/spam2.py b/nlp_class/spam2.py index 37d787cf..b5e069cc 100644 --- a/nlp_class/spam2.py +++ b/nlp_class/spam2.py @@ -14,6 +14,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer from sklearn.model_selection import train_test_split from sklearn.naive_bayes import MultinomialNB +from sklearn.svm import SVC from wordcloud import WordCloud @@ -32,7 +33,7 @@ # create binary labels df['b_labels'] = df['labels'].map({'ham': 0, 'spam': 1}) -Y = df['b_labels'].as_matrix() +Y = df['b_labels'].values # try multiple ways of calculating features # tfidf = TfidfVectorizer(decode_error='ignore') @@ -49,7 +50,7 @@ model.fit(Xtrain, Ytrain) print("train score:", model.score(Xtrain, Ytrain)) print("test score:", model.score(Xtest, Ytest)) - +exit() # visualize the data diff --git a/nlp_class3/bilstm_mnist.py b/nlp_class3/bilstm_mnist.py index 9a79b546..4002b2ae 100644 --- a/nlp_class3/bilstm_mnist.py +++ b/nlp_class3/bilstm_mnist.py @@ -28,7 +28,7 @@ def get_mnist(limit=None): print("Reading in and transforming data...") df = pd.read_csv('../large_files/train.csv') - data = df.as_matrix() + data = df.values np.random.shuffle(data) X = data[:, 1:].reshape(-1, 28, 28) / 255.0 # data is from 0..255 Y = data[:, 0] diff --git a/supervised_class/bayes.py b/supervised_class/bayes.py index ce139051..ab3979cf 100644 --- a/supervised_class/bayes.py +++ b/supervised_class/bayes.py @@ -9,6 +9,7 @@ import numpy as np +import matplotlib.pyplot as plt from util import get_data from datetime import datetime from scipy.stats import norm @@ -60,3 +61,9 @@ def predict(self, X): t0 = datetime.now() print("Test accuracy:", model.score(Xtest, Ytest)) print("Time to compute test accuracy:", (datetime.now() - t0), "Test size:", len(Ytest)) + + # plot the mean of each class + for c, g in iteritems(model.gaussians): + plt.imshow(g['mean'].reshape(28, 28)) + plt.title(c) + plt.show() diff --git a/supervised_class/util.py b/supervised_class/util.py index cf041dc4..d733eaad 100644 --- a/supervised_class/util.py +++ b/supervised_class/util.py @@ -12,7 +12,7 @@ def get_data(limit=None): print("Reading in and transforming data...") df = pd.read_csv('../large_files/train.csv') - data = df.as_matrix() + data = df.values np.random.shuffle(data) X = data[:, 1:] / 255.0 # data is from 0..255 Y = data[:, 0] diff --git a/supervised_class2/rf_classification.py b/supervised_class2/rf_classification.py index 1991c7c3..6466f1b4 100644 --- a/supervised_class2/rf_classification.py +++ b/supervised_class2/rf_classification.py @@ -55,7 +55,7 @@ def transform(self, df): X = np.zeros((N, self.D)) i = 0 for col, scaler in iteritems(self.scalers): - X[:,i] = scaler.transform(df[col].as_matrix().reshape(-1, 1)).flatten() + X[:,i] = scaler.transform(df[col].values.reshape(-1, 1)).flatten() i += 1 for col, encoder in iteritems(self.labelEncoders): @@ -98,7 +98,7 @@ def get_data(): transformer = DataTransformer() X = transformer.fit_transform(df) - Y = df[0].as_matrix() + Y = df[0].values return X, Y diff --git a/supervised_class2/rf_regression.py b/supervised_class2/rf_regression.py index 37ffb012..2b219a34 100644 --- a/supervised_class2/rf_regression.py +++ b/supervised_class2/rf_regression.py @@ -44,7 +44,7 @@ def fit(self, df): self.scalers = {} for col in NUMERICAL_COLS: scaler = StandardScaler() - scaler.fit(df[col].as_matrix().reshape(-1, 1)) + scaler.fit(df[col].values.reshape(-1, 1)) self.scalers[col] = scaler def transform(self, df): @@ -53,7 +53,7 @@ def transform(self, df): X = np.zeros((N, D)) i = 0 for col, scaler in iteritems(self.scalers): - X[:,i] = scaler.transform(df[col].as_matrix().reshape(-1, 1)).flatten() + X[:,i] = scaler.transform(df[col].values.reshape(-1, 1)).flatten() i += 1 for col in NO_TRANSFORM: X[:,i] = df[col] @@ -96,9 +96,9 @@ def get_data(): df_test = df.loc[test_idx] Xtrain = transformer.fit_transform(df_train) - Ytrain = np.log(df_train['medv'].as_matrix()) + Ytrain = np.log(df_train['medv'].values) Xtest = transformer.transform(df_test) - Ytest = np.log(df_test['medv'].as_matrix()) + Ytest = np.log(df_test['medv'].values) return Xtrain, Ytrain, Xtest, Ytest diff --git a/unsupervised_class/kmeans_mnist.py b/unsupervised_class/kmeans_mnist.py index 3fa0293e..fd7ca76e 100644 --- a/unsupervised_class/kmeans_mnist.py +++ b/unsupervised_class/kmeans_mnist.py @@ -16,13 +16,13 @@ import numpy as np import pandas as pd import matplotlib.pyplot as plt -from .kmeans import plot_k_means, get_simple_data +from kmeans import plot_k_means, get_simple_data from datetime import datetime def get_data(limit=None): print("Reading in and transforming data...") df = pd.read_csv('../large_files/train.csv') - data = df.as_matrix() + data = df.values np.random.shuffle(data) X = data[:, 1:] / 255.0 # data is from 0..255 Y = data[:, 0] diff --git a/unsupervised_class2/util.py b/unsupervised_class2/util.py index 3b4f2e4c..24ed683e 100644 --- a/unsupervised_class2/util.py +++ b/unsupervised_class2/util.py @@ -23,7 +23,7 @@ def getKaggleMNIST(): # column 0 is labels # column 1-785 is data, with values 0 .. 255 # total size of CSV: (42000, 1, 28, 28) - train = pd.read_csv('../large_files/train.csv').as_matrix().astype(np.float32) + train = pd.read_csv('../large_files/train.csv').values.astype(np.float32) train = shuffle(train) Xtrain = train[:-1000,1:] / 255