jet-engine-audio-classification.py

{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.10","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"# %% [markdown] {\"execution\":{\"iopub.status.busy\":\"2023-09-08T21:47:18.374779Z\",\"iopub.execute_input\":\"2023-09-08T21:47:18.375149Z\",\"iopub.status.idle\":\"2023-09-08T21:47:51.604581Z\",\"shell.execute_reply.started\":\"2023-09-08T21:47:18.375118Z\",\"shell.execute_reply\":\"2023-09-08T21:47:51.603287Z\"}}\n# ## Install required packages\n\n# %% [code] {\"jupyter\":{\"outputs_hidden\":false},\"execution\":{\"iopub.status.busy\":\"2023-09-09T01:00:40.521376Z\",\"iopub.execute_input\":\"2023-09-09T01:00:40.521901Z\",\"iopub.status.idle\":\"2023-09-09T01:01:13.301184Z\",\"shell.execute_reply.started\":\"2023-09-09T01:00:40.521858Z\",\"shell.execute_reply\":\"2023-09-09T01:01:13.299849Z\"}}\n!pip install -q tensorflow_io\n!pip install -q seaborn\n!pip install -q imbalanced-learn\n\n# %% [markdown]\n# ## Importing libraries\n\n# %% [code] {\"jupyter\":{\"outputs_hidden\":false},\"execution\":{\"iopub.status.busy\":\"2023-09-09T01:01:13.304983Z\",\"iopub.execute_input\":\"2023-09-09T01:01:13.305392Z\",\"iopub.status.idle\":\"2023-09-09T01:01:13.315449Z\",\"shell.execute_reply.started\":\"2023-09-09T01:01:13.305360Z\",\"shell.execute_reply\":\"2023-09-09T01:01:13.314478Z\"}}\n# Import necessary libraries\nimport os\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport tensorflow as tf\nimport tensorflow_io as tfio\nimport keras_tuner\nimport librosa\nimport librosa.display\nimport IPython.display as ipd\nfrom glob import glob\nfrom tqdm import tqdm\nimport seaborn as sns\nfrom itertools import cycle\nfrom sklearn.model_selection import train_test_split\nfrom 
sklearn.metrics import accuracy_score, recall_score\nfrom imblearn.under_sampling import ClusterCentroids\nfrom tensorflow.keras.models import Sequential\nfrom tensorflow.keras import layers\nfrom datetime import datetime\nfrom keras_tuner.tuners import RandomSearch\nfrom keras_tuner import HyperParameters\nfrom keras_tuner.engine.hyperparameters import Int, Choice\nfrom sklearn.metrics import accuracy_score, recall_score\n\n# %% [code] {\"jupyter\":{\"outputs_hidden\":false},\"execution\":{\"iopub.status.busy\":\"2023-09-09T01:01:13.316834Z\",\"iopub.execute_input\":\"2023-09-09T01:01:13.317227Z\",\"iopub.status.idle\":\"2023-09-09T01:01:13.343321Z\",\"shell.execute_reply.started\":\"2023-09-09T01:01:13.317165Z\",\"shell.execute_reply\":\"2023-09-09T01:01:13.342406Z\"}}\n# Set Kaggle Kernel to non-interactive mode and collect file paths\nfile_paths = []\n\nos.environ['KAGGLE_KERNEL_RUN_INTERACTIVE'] = '0'\nfor dirname, _, filenames in os.walk('/kaggle/input'):    \n    for filename in filenames:\n        file_paths.append(os.path.join(dirname, filename))\n\n# %% [code] {\"jupyter\":{\"outputs_hidden\":false},\"execution\":{\"iopub.status.busy\":\"2023-09-09T01:01:13.344954Z\",\"iopub.execute_input\":\"2023-09-09T01:01:13.345259Z\",\"iopub.status.idle\":\"2023-09-09T01:01:13.351195Z\",\"shell.execute_reply.started\":\"2023-09-09T01:01:13.345229Z\",\"shell.execute_reply\":\"2023-09-09T01:01:13.350285Z\"}}\n# check version of TensorFlow\nprint (tf.__version__)\nprint (tfio.__version__)\n\n# %% [code] {\"jupyter\":{\"outputs_hidden\":false},\"execution\":{\"iopub.status.busy\":\"2023-09-09T01:01:13.354722Z\",\"iopub.execute_input\":\"2023-09-09T01:01:13.355142Z\",\"iopub.status.idle\":\"2023-09-09T01:01:13.362496Z\",\"shell.execute_reply.started\":\"2023-09-09T01:01:13.355116Z\",\"shell.execute_reply\":\"2023-09-09T01:01:13.361524Z\"}}\n# Set up visualization parameters\nsns.set_theme(style=\"white\", palette=None)\ncolor_pal = 
plt.rcParams[\"axes.prop_cycle\"].by_key()[\"color\"]\ncolor_cycle = cycle(plt.rcParams[\"axes.prop_cycle\"].by_key()[\"color\"])\n\n# %% [code] {\"jupyter\":{\"outputs_hidden\":false},\"execution\":{\"iopub.status.busy\":\"2023-09-09T01:01:13.364222Z\",\"iopub.execute_input\":\"2023-09-09T01:01:13.364607Z\",\"iopub.status.idle\":\"2023-09-09T01:01:13.388062Z\",\"shell.execute_reply.started\":\"2023-09-09T01:01:13.364561Z\",\"shell.execute_reply\":\"2023-09-09T01:01:13.387119Z\"}}\nseed = 22\ntf.random.set_seed(seed)\n\n# %% [markdown]\n# ## Importing Data\n# \n# The following steps are performed below:\n# \n# Files are read from path\n# Each file is labeled as either True(1) or False(0)\n# Data is converted to a dataframe with 2 columns 'file_path' and 'labels'.\n\n# %% [code] {\"jupyter\":{\"outputs_hidden\":false},\"execution\":{\"iopub.status.busy\":\"2023-09-09T01:01:13.391705Z\",\"iopub.execute_input\":\"2023-09-09T01:01:13.392316Z\",\"iopub.status.idle\":\"2023-09-09T01:01:13.399046Z\",\"shell.execute_reply.started\":\"2023-09-09T01:01:13.392278Z\",\"shell.execute_reply\":\"2023-09-09T01:01:13.398082Z\"}}\n# Function to label each audio file as True (1) or False (0) based on its filename\ndef get_label(audio_list):\n    \"\"\"\n    Function to label each audio file as True (1) or False (0) based on filename.\n\n    Args:\n    audio_list (list): List of audio file paths.\n\n    Returns:\n    list: List of tuples containing the audio file path and its corresponding label.\n    \"\"\"  \n    \n    temp = []\n\n    # loop over the entire audio list and add the aircraft type and label, then save it in another list of tuples for df later\n    for i in range(len(audio_list)):\n        if audio_list[i][-5] == 'T':\n          label = 1\n        else:\n          label = 0\n        t = (audio_list[i] ,  label)\n        temp.append(t)   \n\n    return temp\n\n# %% [code] 
{\"jupyter\":{\"outputs_hidden\":false},\"execution\":{\"iopub.status.busy\":\"2023-09-09T01:01:13.400534Z\",\"iopub.execute_input\":\"2023-09-09T01:01:13.400878Z\",\"iopub.status.idle\":\"2023-09-09T01:01:13.419658Z\",\"shell.execute_reply.started\":\"2023-09-09T01:01:13.400849Z\",\"shell.execute_reply\":\"2023-09-09T01:01:13.418605Z\"}}\n# Load and preprocess audio data\ndata = sorted(glob('/kaggle/input/audio-data/Trains_/*.wav', recursive = True))\n\n# Label audio data as True (1) or False (0)\ndata = get_label(data)\n\n# Create a DataFrame to store file paths and labels\ndf = pd.DataFrame(data, columns = ['file_path', 'labels'])\n\nprint(df.head())\n\n# %% [code] {\"jupyter\":{\"outputs_hidden\":false},\"execution\":{\"iopub.status.busy\":\"2023-09-09T01:01:13.421061Z\",\"iopub.execute_input\":\"2023-09-09T01:01:13.421808Z\",\"iopub.status.idle\":\"2023-09-09T01:01:20.414328Z\",\"shell.execute_reply.started\":\"2023-09-09T01:01:13.421775Z\",\"shell.execute_reply\":\"2023-09-09T01:01:20.413316Z\"}}\n# Extract file paths and number of files\naudio_path = df['file_path']\nnum_files = len(audio_path)\n\n# Initialise arrays to store audio data and sample rates\naudio_data = np.empty(num_files, dtype=object)\nsamp_rate = np.empty(num_files, dtype=object)\n\n# Load audio files and sample rates\nfor i, path in enumerate(audio_path):\n    audio_, sr = librosa.load(path)\n    audio_data[i] = audio_\n    samp_rate[i] = sr\n\n# %% [markdown]\n# ## Preprocessing the Data\n\n# %% [code] {\"jupyter\":{\"outputs_hidden\":false},\"execution\":{\"iopub.status.busy\":\"2023-09-09T01:01:20.415800Z\",\"iopub.execute_input\":\"2023-09-09T01:01:20.416150Z\",\"iopub.status.idle\":\"2023-09-09T01:01:22.157303Z\",\"shell.execute_reply.started\":\"2023-09-09T01:01:20.416113Z\",\"shell.execute_reply\":\"2023-09-09T01:01:22.156295Z\"}}\n# Resampling audio data to a common target sample rate\ntarget_sr =  44100\naudio_data_resampled=[]\n\nfor audio_dt, s_rate in 
zip(audio_data,samp_rate):\n    audio_data_resmpld = librosa.resample(audio_dt,orig_sr=s_rate,target_sr=target_sr)\n    audio_data_normalised = librosa.util.normalize(audio_data_resmpld)\n    audio_data_resampled.append(audio_data_normalised)\n\n# %% [markdown]\n# ## Extracting MFCC features from the audio data\n\n# %% [code] {\"jupyter\":{\"outputs_hidden\":false},\"execution\":{\"iopub.status.busy\":\"2023-09-09T01:01:22.158779Z\",\"iopub.execute_input\":\"2023-09-09T01:01:22.159152Z\",\"iopub.status.idle\":\"2023-09-09T01:01:39.253161Z\",\"shell.execute_reply.started\":\"2023-09-09T01:01:22.159118Z\",\"shell.execute_reply\":\"2023-09-09T01:01:39.250257Z\"}}\nmfcc_features_arr = []\nfor audio_data_res in tqdm(audio_data_resampled):\n    mfcc_features = librosa.feature.mfcc(y=audio_data_res, n_mfcc=40)\n    scaled_mfcc = np.mean(mfcc_features.T, axis=0)\n    mfcc_features_arr.append(scaled_mfcc)\n\n# %% [markdown]\n# ## Data Visualisation of audio features\n\n# %% [code] {\"jupyter\":{\"outputs_hidden\":false},\"execution\":{\"iopub.status.busy\":\"2023-09-09T01:01:39.254692Z\",\"iopub.execute_input\":\"2023-09-09T01:01:39.255037Z\",\"iopub.status.idle\":\"2023-09-09T01:01:39.806362Z\",\"shell.execute_reply.started\":\"2023-09-09T01:01:39.255004Z\",\"shell.execute_reply\":\"2023-09-09T01:01:39.805484Z\"}}\nmfcc_features_arr = np.array(mfcc_features_arr)\nplt.figure(figsize=(10, 6))\nlibrosa.display.specshow(mfcc_features_arr, x_axis='time')\nplt.colorbar()\nplt.title('MFCC')\nplt.ylabel('MFCC Coefficients')\nplt.tight_layout()\nplt.show()\n\n# %% [markdown]\n# **Stack the per-file MFCC vectors into a 2D feature array**\n\n# %% [code] {\"jupyter\":{\"source_hidden\":true},\"execution\":{\"iopub.status.busy\":\"2023-09-09T01:01:39.807850Z\",\"iopub.execute_input\":\"2023-09-09T01:01:39.808885Z\",\"iopub.status.idle\":\"2023-09-09T01:01:39.817536Z\",\"shell.execute_reply.started\":\"2023-09-09T01:01:39.808847Z\",\"shell.execute_reply\":\"2023-09-09T01:01:39.816615Z\"}}\n# Stack the 2D MFCC arrays 
vertically\nstacked_mfcc_features = np.vstack(mfcc_features_arr)\n\n# Assign the stacked MFCC features to the DataFrame\ndf['features'] = list(stacked_mfcc_features)\n\n\nlabels = df['labels']\nfeatures = df['features']\n\nX= np.array(features.tolist())\ny= np.array(labels.tolist())\n\n# %% [markdown]\n# ## Splitting the Data into Train and Test\n\n# %% [code] {\"jupyter\":{\"outputs_hidden\":false},\"execution\":{\"iopub.status.busy\":\"2023-09-09T01:01:39.822608Z\",\"iopub.execute_input\":\"2023-09-09T01:01:39.823362Z\",\"iopub.status.idle\":\"2023-09-09T01:01:39.830564Z\",\"shell.execute_reply.started\":\"2023-09-09T01:01:39.823327Z\",\"shell.execute_reply\":\"2023-09-09T01:01:39.829605Z\"}}\n# Splitting the Data 80% Train and 20% Test\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=labels, random_state=seed)\n\n# %% [code] {\"jupyter\":{\"outputs_hidden\":false},\"execution\":{\"iopub.status.busy\":\"2023-09-09T01:01:39.831972Z\",\"iopub.execute_input\":\"2023-09-09T01:01:39.832411Z\",\"iopub.status.idle\":\"2023-09-09T01:01:39.841486Z\",\"shell.execute_reply.started\":\"2023-09-09T01:01:39.832358Z\",\"shell.execute_reply\":\"2023-09-09T01:01:39.840497Z\"}}\n# Further split the training data into training and validation sets\nX_train_new,X_val,y_train_new,y_val = train_test_split(X_train,y_train,test_size=0.2,stratify=y_train, random_state=seed)\n\n# %% [markdown]\n# ## Handling Imbalanced Data - Undersampling\n\n# %% [code] {\"jupyter\":{\"outputs_hidden\":false},\"execution\":{\"iopub.status.busy\":\"2023-09-09T01:01:39.843524Z\",\"iopub.execute_input\":\"2023-09-09T01:01:39.843845Z\",\"iopub.status.idle\":\"2023-09-09T01:01:40.110145Z\",\"shell.execute_reply.started\":\"2023-09-09T01:01:39.843819Z\",\"shell.execute_reply\":\"2023-09-09T01:01:40.109400Z\"}}\ncluster_centroids = ClusterCentroids(random_state=seed)\nX_train_new_under_sampled, y_train_new_under_sampled = cluster_centroids.fit_resample(X_train_new, 
y_train_new)\n\n# %% [markdown]\n# ## Model Development\n\n# %% [code] {\"jupyter\":{\"outputs_hidden\":false},\"execution\":{\"iopub.status.busy\":\"2023-09-09T01:01:40.113647Z\",\"iopub.execute_input\":\"2023-09-09T01:01:40.115420Z\",\"iopub.status.idle\":\"2023-09-09T01:01:40.126479Z\",\"shell.execute_reply.started\":\"2023-09-09T01:01:40.115389Z\",\"shell.execute_reply\":\"2023-09-09T01:01:40.125596Z\"}}\n# Model development\nhp = HyperParameters()\ndef build_model(hp):\n    # Build a sequential model with tunable hyperparameters\n    model = Sequential()\n    model.add(layers.Dense(units=hp.Int('units_1', min_value=100, max_value=300, step=10),\n                           activation='relu', input_shape=(40,)))\n    model.add(layers.Dropout(rate=hp.Float('dropout_1', min_value=0.3, max_value=0.6, step=0.1)))\n    model.add(layers.Dense(units=hp.Int('units_2', min_value=100, max_value=300, step=10),\n                           activation='relu'))\n    model.add(layers.Dropout(rate=hp.Float('dropout_2', min_value=0.3, max_value=0.6, step=0.1)))\n    model.add(layers.Dense(units=hp.Int('units_3', min_value=50, max_value=200, step=10),\n                           activation='relu'))\n    model.add(layers.Dropout(rate=hp.Float('dropout_3', min_value=0.3, max_value=0.6, step=0.1)))\n    model.add(layers.Dense(1, activation='sigmoid'))\n\n    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=tf.keras.metrics.Recall()\n)\n    return model\n\n# %% [markdown]\n# ### Hyperparameter Tuning\n\n# %% [code] {\"jupyter\":{\"outputs_hidden\":false},\"execution\":{\"iopub.status.busy\":\"2023-09-09T01:01:40.127814Z\",\"iopub.execute_input\":\"2023-09-09T01:01:40.128852Z\",\"iopub.status.idle\":\"2023-09-09T01:01:40.151988Z\",\"shell.execute_reply.started\":\"2023-09-09T01:01:40.128819Z\",\"shell.execute_reply\":\"2023-09-09T01:01:40.151144Z\"}}\n# Hyperparameter tuning\ntuner = RandomSearch(\n    build_model,\n    
objective=keras_tuner.Objective('val_recall',direction='max'),\n    max_trials=10,\n    executions_per_trial=1,\n    directory='my_dir',\n    project_name='Signal Processing',    \n    seed=seed\n)\n\n# %% [code] {\"jupyter\":{\"outputs_hidden\":false},\"execution\":{\"iopub.status.busy\":\"2023-09-09T01:01:40.153310Z\",\"iopub.execute_input\":\"2023-09-09T01:01:40.153956Z\",\"iopub.status.idle\":\"2023-09-09T01:01:40.160585Z\",\"shell.execute_reply.started\":\"2023-09-09T01:01:40.153924Z\",\"shell.execute_reply\":\"2023-09-09T01:01:40.159631Z\"}}\n# Perform hyperparameter search using undersampled data\ntuner.search(X_train_new_under_sampled, y_train_new_under_sampled, epochs=hp.Int('num_epochs', min_value=50, max_value=200, step=10),\n             batch_size=hp.Choice('batch_size', values=[2, 8, 16, 32, 64, 128]),\n             validation_data=(X_val, y_val))\n\n# %% [code] {\"jupyter\":{\"outputs_hidden\":false},\"execution\":{\"iopub.status.busy\":\"2023-09-09T01:01:40.161907Z\",\"iopub.execute_input\":\"2023-09-09T01:01:40.162457Z\",\"iopub.status.idle\":\"2023-09-09T01:01:40.764019Z\",\"shell.execute_reply.started\":\"2023-09-09T01:01:40.162419Z\",\"shell.execute_reply\":\"2023-09-09T01:01:40.763035Z\"}}\n# Get the best model from the tuner\nbest_model = tuner.get_best_models(1)[0]\n\n# %% [code] {\"jupyter\":{\"outputs_hidden\":false},\"execution\":{\"iopub.status.busy\":\"2023-09-09T01:01:40.765419Z\",\"iopub.execute_input\":\"2023-09-09T01:01:40.765802Z\",\"iopub.status.idle\":\"2023-09-09T01:01:40.798214Z\",\"shell.execute_reply.started\":\"2023-09-09T01:01:40.765768Z\",\"shell.execute_reply\":\"2023-09-09T01:01:40.797377Z\"}}\n# Display a summary of the best model\nbest_model.summary()\n\n# %% [code] 
{\"jupyter\":{\"outputs_hidden\":false},\"execution\":{\"iopub.status.busy\":\"2023-09-09T01:01:40.799333Z\",\"iopub.execute_input\":\"2023-09-09T01:01:40.800042Z\",\"iopub.status.idle\":\"2023-09-09T01:01:42.844700Z\",\"shell.execute_reply.started\":\"2023-09-09T01:01:40.800004Z\",\"shell.execute_reply\":\"2023-09-09T01:01:42.843683Z\"}}\n# Compile the best model\nbest_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=tf.keras.metrics.Recall())\n\n# Set up early stopping to prevent overfitting\ncallback = tf.keras.callbacks.EarlyStopping(\n    monitor='loss',\n    patience=50,\n    restore_best_weights=True    \n)\n\n# %% [markdown]\n# ### Fitting the Tuned Model\n\n# %% [code]\n# Train the best model\nstart_time =  datetime.now() \nhistory = best_model.fit(X_train_new_under_sampled, y_train_new_under_sampled,\n                         validation_data=(X_val, y_val), verbose=0, callbacks=[callback])\n\n# %% [code] {\"jupyter\":{\"outputs_hidden\":false},\"execution\":{\"iopub.status.busy\":\"2023-09-09T01:01:42.846266Z\",\"iopub.execute_input\":\"2023-09-09T01:01:42.846642Z\",\"iopub.status.idle\":\"2023-09-09T01:01:42.853590Z\",\"shell.execute_reply.started\":\"2023-09-09T01:01:42.846607Z\",\"shell.execute_reply\":\"2023-09-09T01:01:42.852638Z\"}}\n# Measure the training duration\nduration = datetime.now() - start_time\nprint(f'Training completed in time: {duration}')\n\n# %% [code] {\"jupyter\":{\"outputs_hidden\":false},\"execution\":{\"iopub.status.busy\":\"2023-09-09T01:01:42.854951Z\",\"iopub.execute_input\":\"2023-09-09T01:01:42.855977Z\",\"iopub.status.idle\":\"2023-09-09T01:01:42.866442Z\",\"shell.execute_reply.started\":\"2023-09-09T01:01:42.855944Z\",\"shell.execute_reply\":\"2023-09-09T01:01:42.865531Z\"}}\n# Prepare test data and evaluate the model on the test set\ny_test = np.array(y_test)\ny_test = y_test.reshape(-1,1)\ny_test.shape\n\n# %% [markdown]\n# ## Predicting Raw Test Data\n\n# %% [code] 
{\"jupyter\":{\"outputs_hidden\":false},\"execution\":{\"iopub.status.busy\":\"2023-09-09T01:01:42.867856Z\",\"iopub.execute_input\":\"2023-09-09T01:01:42.868363Z\",\"iopub.status.idle\":\"2023-09-09T01:01:43.389264Z\",\"shell.execute_reply.started\":\"2023-09-09T01:01:42.868329Z\",\"shell.execute_reply\":\"2023-09-09T01:01:43.388256Z\"}}\n# Evaluate model on unseen test set\ny_pred = best_model.predict(X_test)\n\n# Predict classes based on a threshold\nthreshold = 0.5  \n\ny_pred_classes = np.where(y_pred > threshold, 1, 0)\n\n# Calculate the best validation recall achieved during training (the tracked metric is Recall, not accuracy)\nbest_val_accuracy = max(history.history['val_recall_1'])\nprint(\"Best validation accuracy:\", best_val_accuracy)\n\n\n# Evaluate the best model on the test set; element [1] of the result is the compiled Recall metric, not accuracy\ntest_accuracy= best_model.evaluate(X_test,y_test,verbose=0)\nprint(f'Test Accuracy:{test_accuracy[1]}')\n\n# %% [markdown]\n# ## Converting the model to TFLite format\n\n# %% [code] {\"jupyter\":{\"outputs_hidden\":false},\"execution\":{\"iopub.status.busy\":\"2023-09-09T01:01:43.390908Z\",\"iopub.execute_input\":\"2023-09-09T01:01:43.391275Z\",\"iopub.status.idle\":\"2023-09-09T01:01:45.242929Z\",\"shell.execute_reply.started\":\"2023-09-09T01:01:43.391242Z\",\"shell.execute_reply\":\"2023-09-09T01:01:45.241899Z\"}}\n# Convert the best model to TFLite format to enable deployment on mobile devices\nconverter = tf.lite.TFLiteConverter.from_keras_model(best_model)\ntflite_model = converter.convert()\n\n# Save the TFLite model to a file\nwith open('model.tflite', 'wb') as f:\n    f.write(tflite_model)","metadata":{"_uuid":"9930ff6a-7d89-45f8-b2c8-05acfca9df7b","_cell_guid":"a65e2798-2ead-451f-bc31-4b157531a284","collapsed":false,"jupyter":{"outputs_hidden":false},"trusted":true},"execution_count":null,"outputs":[]}]}