deep learning with keras - binary classification
Dataset: x and y coordinates with labels 0 or 1 representing the two colors.
import seaborn as sns
sns.pairplot(circles, hue="target")
topology:
1. input layer with two inputs, one for x and one for y
2. one hidden layer with four neurons
3. one output neuron
sigmoid function: sigmoid(z) = 1 / (1 + e**-z)
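A quick numpy check of the sigmoid formula (a minimal sketch; the sample inputs are made up):
import numpy as np

def sigmoid(z):
    # squashes any real-valued input into (0, 1)
    return 1 / (1 + np.exp(-z))

print(sigmoid(0))                           # 0.5, the decision boundary
print(sigmoid(np.array([-4.0, 0.0, 4.0])))  # ~[0.018, 0.5, 0.982]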
>>>
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.utils import plot_model
import matplotlib.pyplot as plt

model = Sequential()
model.add(Dense(4, input_shape=(2,), activation='sigmoid'))
# model.add(Dense(50, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # sigmoid output for binary classification
model.compile(optimizer=Adam(0.01), loss='binary_crossentropy')
# model.compile(optimizer=Adam(0.01), loss='mae')
model.summary()
plot_model(model, to_file='model.png')
img = plt.imread('model.png')
plt.imshow(img)
plt.show()
model.compile(optimizer='sgd', loss='binary_crossentropy')
model.fit(coordinates, labels, epochs=20)  # fit(), not train()
preds = model.predict(coordinates)
>>>
banknotes dataset columns: variance, skewness, kurtosis, entropy, class
# Import seaborn
import seaborn as sns
import matplotlib.pyplot as plt

print(banknotes.keys())
# Use pairplot and set the hue to be our class
sns.pairplot(banknotes, hue='class')
# Show the plot
plt.show()
# Describe the data
print('Dataset stats: \n', banknotes.describe())
# Count the number of observations of each class
print('Observations per class: \n', banknotes['class'].value_counts())
>> multi-class classification
darts dataset: xCoord, yCoord -> competitor
topology: 2 inputs, then dense layers of 128, 64, and 32 neurons, then 4 outputs
softmax output example (the four probabilities sum to 1):
.6 Michael
.1 Susan
.2 Kate
.1 Steve
model.add(Dense(4, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy')
The log loss (categorical crossentropy) decreases as the model becomes more accurate at predicting the true class.
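A small numeric sketch of that claim, using the softmax output above (hypothetical values; assumes the true competitor is Michael, one-hot encoded first):
import numpy as np

y_true = np.array([1, 0, 0, 0])            # one-hot: the true competitor is Michael
y_pred = np.array([0.6, 0.1, 0.2, 0.1])    # softmax output from above

# categorical crossentropy = -sum(y_true * log(y_pred))
print(-np.sum(y_true * np.log(y_pred)))    # -log(0.6) ~= 0.51

# a more confident correct prediction yields a lower loss
y_better = np.array([0.9, 0.03, 0.04, 0.03])
print(-np.sum(y_true * np.log(y_better)))  # -log(0.9) ~= 0.11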
>>>>> to_categorical
>> 2 inputs, [128, 64, 32] hidden layers, 4 outputs
import pandas as pd
from keras.utils import to_categorical

df = pd.read_csv('data.csv')
df.response = pd.Categorical(df.response)
df.response = df.response.cat.codes
# turn the response variable into a one-hot encoded response matrix
y = to_categorical(df.response)
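For intuition, a tiny sketch of what to_categorical returns (made-up codes):
from keras.utils import to_categorical

print(to_categorical([0, 1, 2, 1]))
# [[1. 0. 0.]
#  [0. 1. 0.]
#  [0. 0. 1.]
#  [0. 1. 0.]]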
>>>
# Import to_categorical from keras utils module
from keras.utils import to_categorical
# Instantiate a sequential model
model = Sequential()
# Add 3 dense layers of 128, 64 and 32 neurons each
model.add(Dense(128, input_shape=(2,), activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
# Add a dense layer with as many neurons as competitors
model.add(Dense(4, activation="softmax"))
# Compile your model using categorical_crossentropy loss
model.compile(loss="categorical_crossentropy",
              optimizer='adam',
              metrics=['accuracy'])
model.summary()
# Transform into a categorical variable
darts.competitor = pd.Categorical(darts.competitor)
# Assign a number to each category (label encoding)
darts.competitor = darts.competitor.cat.codes
# Print the label encoded competitors
print('Label encoded competitors: \n',darts.competitor.head())
# Transform into a categorical variable
darts.competitor = pd.Categorical(darts.competitor)
# Use to_categorical on your labels
coordinates = darts.drop(['competitor'], axis=1)
competitors = to_categorical(darts.competitor)
# Now print the to_categorical() result
print('One-hot encoded competitors: \n',competitors)
# Train your model on the training data for 200 epochs
model.fit(coord_train,competitors_train,epochs=200)
# Evaluate your model accuracy on the test data
accuracy = model.evaluate(coord_test, competitors_test)[1]
# Print accuracy
print('Accuracy:', accuracy)
# Predict on coords_small_test
preds = model.predict(coords_small_test)
# Print preds vs true values
print("{:45} | {}".format('Raw Model Predictions','True labels'))
for i, pred in enumerate(preds):
    print("{} | {}".format(pred, competitors_small_test[i]))
# Extract the indexes of the highest probable predictions
preds = [np.argmax(pred) for pred in preds]
# Print preds vs true values
print("{:10} | {}".format('Rounded Model Predictions', 'True labels'))
for i, pred in enumerate(preds):
    print("{:25} | {}".format(pred, competitors_small_test[i]))
>>> multi-label
model = Sequential()
model.add(Dense(2, input_shape=(1,)))
model.add(Dense(3, activation='sigmoid'))
# each output will be between 0 and 1
model.compile(optimizer='adam', loss='binary_crossentropy')
model.fit(X_train, y_train, epochs=100, validation_split=0.2)
# one-versus-rest classification
# example: sensor measurements predict which parcels of land to water
Multi-label classification problems differ from multi-class problems in that each observation can be labeled with zero or more classes, so the classes are not mutually exclusive.
To account for this, we use an output layer with as many neurons as classes, but unlike in multi-class problems each output neuron gets a sigmoid activation function, so every output neuron independently produces a number between 0 and 1.
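To make "zero or more classes" concrete, a sketch of what multi-label targets look like (made-up rows, 3 classes):
import numpy as np

# each row is one observation; a 1 in column j means class j applies
y = np.array([[1, 0, 1],   # classes 0 and 2 both apply
              [0, 0, 0],   # no class applies
              [0, 1, 0]])  # only class 1 applies
# unlike one-hot multi-class labels, rows need not sum to 1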
>>
# Instantiate a Sequential model
model = Sequential()
# Add a hidden layer of 64 neurons and an input of 20 features
model.add(Dense(64, input_shape=(20,), activation='relu'))
# Add an output layer of 3 neurons with sigmoid activation
model.add(Dense(3, activation='sigmoid'))
# Compile your model with adam and binary crossentropy loss
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()
# Train for 100 epochs using a validation split of 0.2
model.fit(sensors_train, parcels_train, epochs = 100, validation_split = 0.2)
# Predict on sensors_test and round up the predictions
preds = model.predict(sensors_test)
preds_rounded = np.round(preds)
# Print rounded preds
print('Rounded Predictions: \n', preds_rounded)
# Evaluate your model's accuracy on the test data
accuracy = model.evaluate(sensors_test, parcels_test)[1]
# Print accuracy
print('Accuracy:', accuracy)
>>callbacks
1) EarlyStopping
2) ModelCheckpoint
3) History
print(history.history['loss'])
print(history.history['acc'])
print(history.history['val_loss'])
print(history.history['val_acc'])
History, ModelCheckpoint, and EarlyStopping are all passed to fit() through its callbacks argument.
# print(history.history['loss'])
plt.figure()
plt.plot(history.history['loss'])
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()
>>>>> Early stopping
# useful because we don't know in advance how many epochs are required to complete training
from keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=5)
# patience is the number of epochs to wait for an improvement before training is stopped
model.fit(X_train, y_train, epochs=100,
          validation_data=(X_test, y_test),
          callbacks=[early_stopping])
>>> model checkpoint
from keras.callbacks import ModelCheckpoint

# allows us to save the model to a file as training runs; save_best_only keeps only the best version
model_save = ModelCheckpoint('best_model.hdf5', save_best_only=True)
model.fit(X_train, y_train, epochs=100,
          validation_data=(X_test, y_test),
          callbacks=[model_save])
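As a follow-up, the checkpointed model can be loaded back for inference (a sketch; 'best_model.hdf5' is the file saved above):
from keras.models import load_model

# restore the best weights found during training
best_model = load_model('best_model.hdf5')
preds = best_model.predict(X_test)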
>>>
# Train your model and save its history
history = model.fit(X_train, y_train, epochs=50,
                    validation_data=(X_test, y_test))
# Plot train vs test loss during training
plot_loss(history.history['loss'], history.history['val_loss'])
# Plot train vs test accuracy during training
plot_accuracy(history.history['acc'], history.history['val_acc'])
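plot_loss and plot_accuracy are helpers provided by the exercise environment; a minimal sketch of what plot_loss might look like (assumed implementation, not the course's exact code):
import matplotlib.pyplot as plt

def plot_loss(loss, val_loss):
    # diverging train/validation curves suggest overfitting
    plt.figure()
    plt.plot(loss, label='train')
    plt.plot(val_loss, label='validation')
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend()
    plt.show()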
# Import the early stopping callback
from keras.callbacks import EarlyStopping
# Define a callback to monitor val_acc
monitor_val_acc = EarlyStopping(monitor='val_acc',
                                patience=5)
# Train your model using the early stopping callback
model.fit(X_train, y_train,
          epochs=1000, validation_data=(X_test, y_test),
          callbacks=[monitor_val_acc])
>>>
# Import the EarlyStopping and ModelCheckpoint callbacks
from keras.callbacks import EarlyStopping,ModelCheckpoint
# Early stop on validation accuracy
monitor_val_acc = EarlyStopping(monitor ='val_acc', patience=3)
# Save the best model as best_banknote_model.hdf5
modelCheckpoint = ModelCheckpoint('best_banknote_model.hdf5', save_best_only = True)
# Fit your model for a stupid amount of epochs
history = model.fit(X_train, y_train,
                    epochs=10000000,
                    callbacks=[monitor_val_acc, modelCheckpoint],
                    validation_data=(X_test, y_test))
>>> Learning Curves
1) loss learning curves decrease as epochs go by
2) accuracy learning curves increase as epochs go by
**** model overfitting can be identified when the training curves and the validation curves diverge
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping

initial_weights = model.get_weights()
train_accs = []
test_accs = []
for train_size in train_sizes:
    # sample a fraction of the training data
    X_train_frac, _, y_train_frac, _ = train_test_split(
        X_train, y_train, train_size=train_size)
    # reset the model to its initial (untrained) weights
    model.set_weights(initial_weights)
    model.fit(X_train_frac, y_train_frac, epochs=100, verbose=0,
              callbacks=[EarlyStopping(monitor='loss', patience=1)])
    train_accs.append(model.evaluate(X_train_frac, y_train_frac, verbose=0)[1])
    # evaluate on the complete test set
    test_accs.append(model.evaluate(X_test, y_test, verbose=0)[1])
>>>
# Instantiate a Sequential model
model = Sequential()
# Input and hidden layer with input_shape, 16 neurons, and relu
model.add(Dense(16, input_shape = (64,), activation = 'relu'))
# Output layer with 10 neurons (one per digit) and softmax
model.add(Dense(10, activation='softmax'))
# Compile your model
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
# Test if your model works and can process input data
print(model.predict(X_train))
# Train your model for 60 epochs, using X_test and y_test as validation data
history = model.fit(X_train, y_train, epochs=60, validation_data=(X_test, y_test), verbose=0)
# Extract from the history object loss and val_loss to plot the learning curve
plot_loss(history.history['loss'],history.history['val_loss'])
for size in training_sizes:
    # Get a fraction of training data (we only care about the training data)
    X_train_frac, y_train_frac = X_train[:size], y_train[:size]
    # Reset the model to the initial weights and train it on the new data fraction
    model.set_weights(initial_weights)
    model.fit(X_train_frac, y_train_frac, epochs=50, callbacks=[early_stop])
    # Evaluate and store the train fraction and the complete test set results
    train_accs.append(model.evaluate(X_train_frac, y_train_frac)[1])
    test_accs.append(model.evaluate(X_test, y_test)[1])
# Plot train vs test accuracies
plot_results(train_accs, test_accs)
>>>> Activation Functions
a = sum of inputs * weights + bias
a is passed into an activation function, producing the neuron's output y
1. sigmoid varies between 0 and 1
2. tanh varies between -1 and 1
3. relu varies between 0 and infinity
4. leaky relu varies between a small negative value and infinity (see the numpy sketch below)
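A minimal numpy sketch of the weighted sum and the four activations above (illustrative values; the 0.3 leaky slope is an assumption matching Keras' LeakyReLU default):
import numpy as np

sigmoid    = lambda a: 1 / (1 + np.exp(-a))         # (0, 1)
tanh       = np.tanh                                # (-1, 1)
relu       = lambda a: np.maximum(0.0, a)           # [0, inf)
leaky_relu = lambda a: np.where(a > 0, a, 0.3 * a)  # small negative slope below 0

# a = sum of inputs * weights + bias
x, w, b = np.array([0.5, -1.0]), np.array([0.8, 0.2]), 0.1
a = np.sum(x * w) + b   # 0.3
for f in (sigmoid, tanh, relu, leaky_relu):
    print(f(a))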
np.random.seed(1)

def get_model(act_function):
    model = Sequential()
    model.add(Dense(4, input_shape=(2,), activation=act_function))
    model.add(Dense(1, activation='sigmoid'))
    return model

activations = ['relu', 'sigmoid', 'tanh', 'leaky_relu']
activation_results = {}
for funct in activations:
    model = get_model(act_function=funct)
    history = model.fit(X_train, y_train,
                        validation_data=(X_test, y_test),
                        epochs=100, verbose=False)
    activation_results[funct] = history

val_loss_per_funct = {k: v.history['val_loss'] for k, v in activation_results.items()}
val_loss_curves = pd.DataFrame(val_loss_per_funct)
val_loss_curves.plot(title='Loss per Activation function')
>>>>
# Activation functions to try
activations = ['relu', 'leaky_relu', 'sigmoid', 'tanh']
# Loop over the activation functions
activation_results = {}
for act in activations:
    # Get a new model with the current activation
    model = get_model(act)
    # Fit the model
    history = model.fit(X_train, y_train,
                        validation_data=(X_test, y_test),
                        epochs=20, verbose=0)
    activation_results[act] = history

# Create a dataframe from val_loss_per_function
val_loss = {k: v.history['val_loss'] for k, v in activation_results.items()}
# Call plot on the dataframe
val_loss_per_function = pd.DataFrame(val_loss)
val_loss_per_function.plot()
plt.show()
# Create a dataframe from val_acc_per_function
val_acc = {k: v.history['val_acc'] for k, v in activation_results.items()}
# Call plot on the dataframe
val_acc_per_function = pd.DataFrame(val_acc)
val_acc_per_function.plot()
plt.show()
>>>> batch size and batch normalization
1. mini-batch advantages
a. networks train faster (more weight updates in the same amount of time)
b. less RAM required, so you can train on huge datasets
c. noise can help networks reach a lower error by escaping local minima
2. mini-batch disadvantages
a. more iterations need to be run
b. the batch size needs to be tuned, we need to find a good batch size
keras uses a default batch size of 32
The smaller the batch size, the more weight updates per epoch, but at the cost of a more unstable gradient descent, especially if the batch size is too small and the batch is no longer representative of the whole training set.
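A sketch of how batch size is passed to fit (variable names reused from the snippets above):
model.fit(X_train, y_train, epochs=5)                           # default batch_size=32
model.fit(X_train, y_train, epochs=5, batch_size=8)             # smaller batches: more, noisier updates per epoch
model.fit(X_train, y_train, epochs=5, batch_size=len(X_train))  # full batch: one smooth but costly update per epoch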
standardization: (data - mean) / standard deviation
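A quick illustration of that formula (made-up data):
import numpy as np

data = np.array([1.0, 2.0, 3.0, 4.0])
standardized = (data - data.mean()) / data.std()
print(standardized.mean(), standardized.std())  # ~0.0, ~1.0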
batch normalization makes sure that, regardless of how the outputs of earlier layers change during training, the inputs to the next layer stay normalized
batch normalization advantages
1. improves gradient flow
2. allows higher learning rates
3. reduces dependence on weight initializations
4. acts as an unintended form of regularization
5. limits internal covariate shift
from keras.layers import BatchNormalization

model = Sequential()
model.add(Dense(3, input_shape=(2,), activation='relu'))
model.add(BatchNormalization())
model.add(Dense(1, activation='sigmoid'))
>>
model = get_model()
# Fit your model for 5 epochs with a batch size equal to the whole training set
model.fit(X_train, y_train, epochs=5, batch_size=len(X_train))
print("\n The accuracy when using the whole training set as a batch was: ",
      model.evaluate(X_test, y_test)[1])
>>>
# Import batch normalization from keras layers
from keras.layers import BatchNormalization
# Build your deep network
batchnorm_model = Sequential()
batchnorm_model.add(Dense(50, input_shape=(64,), activation='relu', kernel_initializer='normal'))
batchnorm_model.add(BatchNormalization())
batchnorm_model.add(Dense(50, activation='relu', kernel_initializer='normal'))
batchnorm_model.add(BatchNormalization())
batchnorm_model.add(Dense(50, activation='relu', kernel_initializer='normal'))
batchnorm_model.add(BatchNormalization())
batchnorm_model.add(Dense(10, activation='softmax', kernel_initializer='normal'))
# Compile your model with sgd
batchnorm_model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])
# Train your standard model, storing its history
history1 = standard_model.fit(X_train, y_train, validation_data=(X_test,y_test), epochs=10, verbose=0)
# Train the batch normalized model you recently built, store its history
history2 = batchnorm_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, verbose=0)
# Call compare_histories_acc passing in both model histories
compare_histories_acc(history1, history2)