Merge branch 'develop'
daniprec committed May 14, 2020
2 parents f92e181 + be5311c commit e3f96bd
Showing 8 changed files with 751 additions and 16 deletions.
138 changes: 138 additions & 0 deletions better_nilm/model/export.py
@@ -0,0 +1,138 @@
import os
import pickle

from keras.models import model_from_json


def store_model_json(model, path_model, path_weights=None):
"""
Serializes a model into a json file.
Also serializes its weights as a h5 file.
Parameters
----------
model : keras.models.Sequential
path_model : str
Path to where the json is created, including the filename and json
termination.
path_weights : str, default=None
Path to where the h5 is created, including the filename and h5
termination. If None is provided, weights are stored in the same
route as the model, using the same name.
"""
    if not path_model.endswith(".json"):
        raise ValueError("path_model must end in a json file. Current "
                         f"path:\n{path_model}")

    if path_weights is None:
        # maxsplit=1 strips only the final extension, keeping any other
        # dots in the path intact
        path_weights = path_model.rsplit(".", 1)[0] + ".h5"
    elif not path_weights.endswith(".h5"):
        raise ValueError("path_weights must end in an h5 file. Current "
                         f"path:\n{path_weights}")
# serialize model to JSON
model_json = model.to_json()
with open(path_model, "w") as json_file:
json_file.write(model_json)
# serialize weights to HDF5
model.save_weights(path_weights)
print(f"Saved model to disk. Path:\n{path_model}")


def load_model_json(path_model, path_weights=None):
"""
Parameters
----------
path_model : str
Path to where the serialized model is stored, in json format.
path_weights : str, default=None
Path to where the model weights are stored, in h5 format.
If None is provided, assumes the h5 file is located in the same route
as the model and with the same name.
Returns
-------
model : keras.models.Sequential
"""
    if not path_model.endswith(".json"):
        raise ValueError("path_model must end in a json file. Current "
                         f"path:\n{path_model}")

    if path_weights is None:
        path_weights = path_model.rsplit(".", 1)[0] + ".h5"
    elif not path_weights.endswith(".h5"):
        raise ValueError("path_weights must end in an h5 file. Current "
                         f"path:\n{path_weights}")

if not os.path.isfile(path_model):
raise FileNotFoundError(f"path_model does not lead to an existing "
f"file:\n{path_model}")

if not os.path.isfile(path_weights):
raise FileNotFoundError(f"path_weights does not lead to an existing "
f"file:\n{path_weights}")

# load json and create model
    with open(path_model, "r") as json_file:
        model_json = json_file.read()
model = model_from_json(model_json)
# load weights into new model
model.load_weights(path_weights)
print("Loaded model from disk")
return model


def store_dict_pkl(dic, path_dic):
"""
Stores a dictionary into a pkl file.
Parameters
----------
dic : dict
Dictionary to store.
path_dic : str
Path to where the pkl is created, including the filename and pkl
termination.
"""

    if not path_dic.endswith(".pkl"):
        raise ValueError("path_dic must end in a pkl file. Current "
                         f"path:\n{path_dic}")

    with open(path_dic, "wb") as a_file:
        pickle.dump(dic, a_file)


def load_dict_pkl(path_dic):
"""
Loads a dictionary from a pkl file.
Parameters
----------
path_dic : str
Path to where the dictionary is stored, in pkl format.
Returns
-------
dic : dict
Dictionary.
"""
    if not path_dic.endswith(".pkl"):
        raise ValueError("path_dic must end in a pkl file. Current "
                         f"path:\n{path_dic}")

    if not os.path.isfile(path_dic):
        raise FileNotFoundError(f"path_dic does not lead to an existing "
                                f"file:\n{path_dic}")

    with open(path_dic, "rb") as a_file:
        dic = pickle.load(a_file)

    return dic
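
For reference, a minimal round-trip sketch of the new export helpers. The trained model and the outputs/ paths below are illustrative assumptions, not part of the commit:

from better_nilm.model.export import (store_model_json, load_model_json,
                                      store_dict_pkl, load_dict_pkl)

# model is assumed to be an already trained keras model
store_model_json(model, "outputs/gru.json")  # weights go to outputs/gru.h5
model = load_model_json("outputs/gru.json")

# Plain dictionaries (e.g. appliance thresholds) round-trip through pickle
store_dict_pkl({"fridge": 10.5}, "outputs/thresholds.pkl")
thresholds = load_dict_pkl("outputs/thresholds.pkl")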
46 changes: 38 additions & 8 deletions better_nilm/model/gru.py
@@ -4,27 +4,44 @@
from keras.layers import Conv1D
from keras.layers import GRU
from keras.layers import Bidirectional
from keras.layers import Lambda
from keras.layers import Activation
from keras.activations import sigmoid

from keras.optimizers import Adam


-def create_gru_model(series_len, num_appliances,
-                     regression_weight=1, classification_weight=1):
+def create_gru_model(series_len, num_appliances, thresholds,
+                     regression_weight=1, classification_weight=1,
+                     learning_rate=0.001):
"""
Creates a Gated Recurrent Unit model.
Based on OdysseasKr GRU model:
https://github.com/OdysseasKr/neural-disaggregator/blob/master/GRU
Parameters
----------
series_len : int
num_appliances : int
thresholds : numpy.array
shape = (num_appliances, )
Load threshold for each appliance
regression_weight : float, default=1
Weight for the regression loss (MSE)
classification_weight : float, default=1
Weight for the classification loss (BCE)
learning_rate : float, default=0.001
Starting learning rate for the Adam optimizer.
Returns
-------
model : keras.models.Sequential
"""
    assert len(thresholds) == num_appliances, "Number of thresholds must " \
                                              "equal the number of appliances"

# ARCHITECTURE

# Input layer (batch, series_len, 1)
inputs = Input(shape=(series_len, 1))
@@ -41,22 +58,35 @@ def create_gru_model(series_len, num_appliances,
gru2 = Bidirectional(GRU(128, return_sequences=True, stateful=False),
merge_mode='concat')(gru1)

# Dense layer
dense = Dense(64, activation='relu')(gru2)

+    # Regression output
    # Fully Connected Layers (batch, series_len, num_appliances)
    regression = Dense(num_appliances, activation='relu',
-                       name='regression')(gru2)
+                       name='regression')(dense)

+    # Classification output
+    subtract = Lambda(lambda x: x - thresholds)(regression)
    # Fully Connected Layers (batch, series_len, num_appliances)
-    classification = Dense(num_appliances, activation="sigmoid",
-                           name="classification")(gru2)
+    classification = Activation(sigmoid, name='classification')(subtract)

# TRAINING

# Weights
# We scale the weights because BCE grows bigger than MSE
class_w = classification_weight * .003
reg_w = regression_weight * .997

# Optimizer
opt = Adam(learning_rate=learning_rate)

model = Model(inputs=inputs,
outputs=[regression, classification])
    model.compile(loss={"regression": "mean_squared_error",
                        "classification": "binary_crossentropy"},
-                  loss_weights={"regression": regression_weight,
-                                "classification": classification_weight},
-                  optimizer='adam')
+                  loss_weights={"regression": reg_w,
+                                "classification": class_w},
+                  optimizer=opt)

return model
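
A quick construction sketch for the updated signature; the window length and watt thresholds below are made-up values, not taken from the commit:

import numpy as np
from better_nilm.model.gru import create_gru_model

thresholds = np.array([10.5, 1200.0])  # hypothetical fridge / kettle watts
model = create_gru_model(series_len=510, num_appliances=2,
                         thresholds=thresholds,
                         regression_weight=1, classification_weight=1,
                         learning_rate=0.001)
model.summary()
# Two heads share the dense trunk: 'regression' predicts the load, and
# 'classification' applies a sigmoid to (regression - thresholds), tying
# the ON probability to the predicted load crossing each threshold.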
8 changes: 3 additions & 5 deletions better_nilm/model/preprocessing.py
@@ -219,10 +219,8 @@ def _get_cluster_centroids(ser):
std = np.zeros((num_meters, 2))

for idx in range(num_meters):
-        # Take one meter record, and sort it in ascending order
-        # to ensure the first values correspond to OFF state
+        # Take one meter record
        meter = ser[:, :, idx].flatten()
-        meter = np.sort(meter)
        meter = meter.reshape((len(meter), -1))
kmeans = KMeans(n_clusters=2).fit(meter)

@@ -297,8 +295,8 @@ def binarize(ser, thresholds):

# Iterate through all the appliances
for idx in range(num_app):
-        mask_on = ser[:, :, idx] >= thresholds[idx]
-        ser_bin[mask_on] = 1
+        mask_on = ser[:, :, idx] > thresholds[idx]
+        ser_bin[:, :, idx] = mask_on.astype(int)

ser_bin = ser_bin.astype(int)
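
The behavioural change in binarize is easiest to see on a toy array (all values below are made up). The old code assigned through a 2-D mask, ser_bin[mask_on] = 1, which marked every appliance channel at the masked positions; the new code thresholds each channel independently, and ties (value equal to the threshold) now map to OFF:

import numpy as np

ser = np.array([[[5.0, 900.0],
                 [15.0, 1200.0]]])  # (num_series=1, series_len=2, num_app=2)
thresholds = [10.0, 1200.0]

ser_bin = np.zeros(ser.shape, dtype=int)
for idx in range(ser.shape[2]):
    # ON (1) only when strictly above the appliance threshold
    ser_bin[:, :, idx] = (ser[:, :, idx] > thresholds[idx]).astype(int)

print(ser_bin[0])  # [[0 0]
                   #  [1 0]] -- 1200.0 is not strictly above 1200.0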

141 changes: 141 additions & 0 deletions better_nilm/model/scores.py
@@ -0,0 +1,141 @@
import numpy as np

from sklearn.metrics import mean_squared_error

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score


def _assert_shape(y_pred, y_real, appliances):
if not y_pred.shape == y_real.shape:
raise ValueError("Array shape mismatch.\n"
f"y_pred shape: {y_pred.shape}\n"
f"y_real_shape: {y_real.shape}")

if y_pred.shape[2] != len(appliances):
raise ValueError("Number of appliances mismatch.\n"
f"Appliances in y_pred array: {y_pred.shape[2]}\n"
f"Appliances in appliances list: {len(appliances)}")


def regression_score_dict(y_pred, y_real, appliances):
"""
Returns a dictionary with some regression scores, for each appliance.
- MSE, Mean Square Error
- RMSE, Root Mean Squared Error
Parameters
----------
y_pred : numpy.array
shape = (num_series, series_len, num_appliances)
- num_series : Amount of time series.
- series_len : Length of each time series.
- num_appliances : Meters contained in the array.
y_real : numpy.array
shape = (num_series, series_len, num_appliances)
appliances : list
len = num_appliances
Must be sorted following the order of both y_pred and y_real
Returns
-------
scores : dict
'appliance': {'metric': value}
"""
_assert_shape(y_pred, y_real, appliances)

if np.mean(y_real) <= 1:
print("Warning!\nThe predicted values appear to be normalized.\n"
"It is recommended to use the de-normalized values\n"
"when computing the regression errors")

# Initialize dict
scores = {}

for idx, app in enumerate(appliances):
app_pred = y_pred[:, :, idx].flatten()
app_real = y_real[:, :, idx].flatten()

# MSE and RMSE
app_mse = mean_squared_error(app_real, app_pred)
app_rmse = np.sqrt(app_mse)

scores[app] = {"mse": round(app_mse, 2),
"rmse": round(app_rmse, 2)}

return scores


def classification_scores_dict(y_pred, y_real, appliances, threshold=.5):
"""
Returns a dictionary with some regression scores, for each appliance.
- Accuracy
- F1-Score
- Precision
- Recall
Parameters
----------
y_pred : numpy.array
shape = (num_series, series_len, num_appliances)
- num_series : Amount of time series.
- series_len : Length of each time series.
- num_appliances : Meters contained in the array.
y_real : numpy.array
shape = (num_series, series_len, num_appliances)
appliances : list
len = num_appliances
Must be sorted following the order of both y_pred and y_real
threshold : float, default=0.5
Minimum value (form 0 to 1) at which we consider the appliance to be ON
Returns
-------
scores : dict
'appliance': {'metric': value}
"""

_assert_shape(y_pred, y_real, appliances)

    if (y_pred.max() > 1 or y_real.max() > 1
            or y_pred.min() < 0 or y_real.min() < 0):
        raise ValueError("Classification values must be between 0 and 1.")

# Binarize the arrays
bin_pred = np.zeros(y_pred.shape)
bin_pred[y_pred >= threshold] = 1
bin_pred = bin_pred.astype(int)

bin_real = np.zeros(y_real.shape)
bin_real[y_real >= threshold] = 1
bin_real = bin_real.astype(int)

# Initialize dict
scores = {}

for idx, app in enumerate(appliances):
app_pred = bin_pred[:, :, idx].flatten()
app_real = bin_real[:, :, idx].flatten()

        # Accuracy
        app_accuracy = accuracy_score(app_real, app_pred)

# F1-Score
app_f1 = f1_score(app_real, app_pred)

# Precision
app_precision = precision_score(app_real, app_pred)

# Recall
app_recall = recall_score(app_real, app_pred)

scores[app] = {"accuracy": round(app_accuracy, 4),
"f1": round(app_f1, 4),
"precision": round(app_precision, 4),
"recall": round(app_recall, 4)}

return scores
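
A toy scoring sketch; the appliance names and array contents are made up:

import numpy as np
from better_nilm.model.scores import (regression_score_dict,
                                      classification_scores_dict)

rng = np.random.default_rng(0)
appliances = ["fridge", "kettle"]

# (num_series, series_len, num_appliances) watt arrays for the regression scores
y_real = rng.uniform(0, 2000, size=(4, 60, 2))
y_pred = y_real + rng.normal(0, 50, size=y_real.shape)
print(regression_score_dict(y_pred, y_real, appliances))

# Values in [0, 1] for the classification scores
p_real = rng.integers(0, 2, size=(4, 60, 2)).astype(float)
p_pred = rng.uniform(0, 1, size=(4, 60, 2))
print(classification_scores_dict(p_pred, p_real, appliances, threshold=0.5))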