Streaming ww dev #168

Merged
merged 183 commits from streaming_ww_dev into master
Jan 20, 2025
Changes from 1 commit

Commits
b16b8d9
moved training_torch to experimental and added a README
jeremy-syn Oct 9, 2023
31dd4be
starting to move code into here
jeremy-syn Jan 22, 2024
0626c97
some updates to streaming wakeword
jeremy-syn Jan 27, 2024
03c9d8b
updated streaming wakeword model to be the actual candidate DS-TCN mo…
jeremy-syn Jan 27, 2024
725cb69
set default features to 40-D LFBEs
jeremy-syn Jan 27, 2024
8e533a3
changed num_classes to 3 in train.py
jeremy-syn Jan 29, 2024
9c15b56
demo notebook (in progress) added
jeremy-syn Jan 29, 2024
5475c7b
demo notebook runs through small training run
jeremy-syn Jan 29, 2024
c7138c4
added count_labels and is_batched()
jeremy-syn Jan 29, 2024
b089ce0
demo now adds some silence waveforms (which then have noise added) to…
jeremy-syn Jan 29, 2024
6c8f06d
updated get_dataset (mostly copied from demo.ipynb) and removed use_s…
jeremy-syn Jan 29, 2024
e7922f7
fixed default model architecture flag
jeremy-syn Jan 29, 2024
5a9a235
fixed some issues with building model
jeremy-syn Jan 29, 2024
7193979
added from_logits argument to model.compile
jeremy-syn Jan 31, 2024
5e74320
cleanup changes to get_dataset and demo notebook
jeremy-syn Feb 2, 2024
4e02106
catching up on edits
jeremy-syn Feb 2, 2024
5d9932c
keras_model does not need tf datasets module
jeremy-syn Feb 2, 2024
4edc86e
Merge branch 'streaming_ww_dev' of github.com:mlcommons/tiny into str…
jeremy-syn Feb 2, 2024
b72d850
cleaning up demo notebook
jeremy-syn Feb 5, 2024
fc7a4d8
cleaning up demo notebook
jeremy-syn Feb 5, 2024
e9de194
made path to speech commands dataset easier to config per location/us…
jeremy-syn Feb 5, 2024
73e3d63
beginning of code to test long waveform in python
jeremy-syn Feb 5, 2024
2d69501
some updates
jeremy-syn Feb 7, 2024
6916250
added option to read in model config file
jeremy-syn Feb 7, 2024
e91a3ed
set validation set to incorporate background noise. also fixed issue…
jeremy-syn Feb 7, 2024
eb4d120
moved code to add silent (or white noise) frames to dataset into its …
jeremy-syn Feb 7, 2024
29da207
fixed argument error
jeremy-syn Feb 7, 2024
c6f23f1
added post-training quantization
jeremy-syn Feb 10, 2024
f478259
several changes in order to use QAT and evaluate on long waveforms:
jeremy-syn Feb 11, 2024
b46d797
notebook updated to work with last commits on get_data, keras_model
jeremy-syn Feb 11, 2024
c6185a7
changed default LR schedule to reduce_on_plateau so it scales better …
jeremy-syn Feb 11, 2024
9dd2f1b
some more edits to get QAT working
jeremy-syn Feb 11, 2024
19470b0
changed labels to one-hot to work with precision/recall metrics. Also…
jeremy-syn Feb 13, 2024
772e6cc
added notebook to develop tflite model for feature extraction
jeremy-syn Feb 13, 2024
a6ac856
removed some prints from get_dataset. added an evaluation to train
jeremy-syn Feb 13, 2024
7302a8d
adjusted reduce lr on plateau settings
jeremy-syn Feb 13, 2024
20abafc
Merge branch 'streaming_ww_dev' of github.com:mlcommons/tiny into str…
jeremy-syn Feb 13, 2024
8ef587c
fixed plotting error
jeremy-syn Feb 13, 2024
deec7c9
working on different options to run the feature extractor on MCU
jeremy-syn Feb 14, 2024
468a344
small changes to notebook
jeremy-syn Feb 17, 2024
2964369
removed old commented-out code that loaded pre-built dataset
jeremy-syn Feb 17, 2024
ee12fe4
tflite_feature_extractor.ipynb very much a work in progress
jeremy-syn Feb 17, 2024
ac40098
added setup instructions and a to the streaming wakeword benchmark (…
AlexMontgomerie Mar 6, 2024
380f117
Merge branch 'master' of gh-syn:mlcommons/tiny
jeremy-syn Apr 15, 2024
9798093
cache datasets after spectrogram computation to avoid recomputing the…
jeremy-syn May 21, 2024
c584f23
fixed data_dir default to point to speech_commands_v0.02
jeremy-syn May 21, 2024
12d727f
fixed data_dir default to point to speech_commands_v0.02
jeremy-syn May 21, 2024
0e122cf
added BooleanOptionalAction to correctly parse boolean Flags
jeremy-syn May 21, 2024
7bfefc4
fixed parsing of bool args (use_qat, run_test_set) to work with pytho…
jeremy-syn May 21, 2024
49330a9
changed so parse_command raises exception on unrecognized flags
jeremy-syn May 26, 2024
6dc9314
changed so parse_command raises exception on unrecognized flags
jeremy-syn May 26, 2024
252cafc
added foreground scaling args foreground_volume_min, _max to train on…
jeremy-syn May 28, 2024
2d80a04
set is_training true for ds_val so it gets noise added
jeremy-syn Jun 3, 2024
7270a9a
Merge branch 'streaming_ww_dev' of github.com:mlcommons/tiny into str…
jeremy-syn Jun 3, 2024
5cf7657
edits to str ww model
jeremy-syn Jun 3, 2024
eca617c
edits to data set building
jeremy-syn Jun 3, 2024
0e7ac0f
saved training history along with plot
jeremy-syn Jun 3, 2024
8bedf74
Merge branch 'streaming_ww_dev' of gh-syn:mlcommons/tiny into streami…
jeremy-syn Jun 3, 2024
7ac2852
removed average pooling, increased initial feature stride
jeremy-syn Jun 3, 2024
01470d7
Merge branch 'streaming_ww_dev' of github.com:mlcommons/tiny into str…
jeremy-syn Jun 3, 2024
3b0fb65
Fixed bug where np.random is only evaluated at graph creation, so all…
jeremy-syn Jun 4, 2024
3f3dabf
fixed several places where np.random was used in a tf graph, resultin…
jeremy-syn Jun 5, 2024
4fd29c1
widened filters in 2nd,3rd layers to 128
jeremy-syn Jun 5, 2024
6f8a272
changed back from 32 LFBEs to 40
jeremy-syn Jun 5, 2024
ac85b48
Merge branch 'streaming_ww_dev' of github.com:mlcommons/tiny into str…
jeremy-syn Jun 5, 2024
d7c607c
minor cleanup -- whitespace, removing old commented out lines, etc.
jeremy-syn Jun 6, 2024
870ac09
fixed error - val set was using target words from training set
jeremy-syn Jun 6, 2024
8e9a0ee
minor cleanup -- whitespace, removing old commented out lines, etc.
jeremy-syn Jun 6, 2024
264eae7
Merge branch 'streaming_ww_dev' of github.com:mlcommons/tiny into str…
jeremy-syn Jun 6, 2024
6a3b15e
changed ordering in data prep, now shuffle before batching
jeremy-syn Jun 6, 2024
76413de
adding current version of trained and quantized streaming ww model
jeremy-syn Jun 6, 2024
dec0abb
minor edits/cleanup
jeremy-syn Jun 7, 2024
b11d123
changed Flags.num_train_samples to num_samples_training. same for tes…
jeremy-syn Jun 7, 2024
b1b1204
added 1st pass at get_data_config(), refactoring dataset build
jeremy-syn Jun 8, 2024
d313d65
refactored dataset building. train.py runs now, have not tested perfo…
jeremy-syn Jun 9, 2024
c4c654b
setup_example is work in progress, just capturing progress
jeremy-syn Jun 10, 2024
a6028cf
train.py runs but gives random-level validation accuracy. demo noteb…
jeremy-syn Jun 10, 2024
d6e23bb
flag parsing used 'train' instead of 'training' and therefore was not…
jeremy-syn Jun 14, 2024
de7a198
updated demo to match changes in data
jeremy-syn Jun 15, 2024
af713d8
minor updates
jeremy-syn Jun 22, 2024
b50d8fc
dumps options as json into plot_dir
jeremy-syn Jun 22, 2024
7168f50
fixed demo to work with new get_data code. moved take after shuffle …
jeremy-syn Jun 22, 2024
859b3b3
moved softmax to inside the model; adjusted loss function accordingly
jeremy-syn Jun 24, 2024
36cd5b8
moved softmax calculation into the model
jeremy-syn Jun 29, 2024
ea3eb2e
working on true pos/false pos computation
jeremy-syn Jun 29, 2024
c7d2f1c
resolved merge
jeremy-syn Jun 29, 2024
35d4394
fixed error, post-wakeword extension was being added twice
jeremy-syn Jun 30, 2024
9184f66
fixing notebook counting of true/false positives
jeremy-syn Jul 1, 2024
8e671d0
removed commented-out code; added zero2nan()
jeremy-syn Jul 1, 2024
a1b5673
added multiple background noise paths, can split long bg files into s…
jeremy-syn Jul 1, 2024
37ca39d
change QAT initial LR to Flags.lr, LR is too small after float pre-tr…
jeremy-syn Jul 1, 2024
7f14c47
Merge branch 'streaming_ww_dev' of github.com:mlcommons/tiny into str…
jeremy-syn Jul 1, 2024
811db67
fixed cmd line arg processing to accomodate multile bg noise paths
jeremy-syn Jul 1, 2024
53bb590
removed commented out code from demo notebook
jeremy-syn Jul 2, 2024
e1d75f9
convert only-target dataset to numpy array and back so cardinality() …
jeremy-syn Jul 7, 2024
3eb0329
refactored num_silent, num_repeats in to fraction_silent and fraction…
jeremy-syn Jul 9, 2024
ec0d024
fixed cmd line arg processing to accomodate multile bg noise paths
jeremy-syn Jul 9, 2024
5ecfeec
fixing code for smaller datasets
jeremy-syn Jul 16, 2024
6a2d715
catching up on demo edits
jeremy-syn Jul 16, 2024
a2a5f8b
added code to run quantized model on long waveform
jeremy-syn Jul 16, 2024
45d0b72
working on long wav file creation; added poisson process to place wak…
jeremy-syn Jul 16, 2024
609f901
updated long wave creation, need to move it to a separate file soon. …
jeremy-syn Jul 16, 2024
d059f4a
increased number of background files from 50 to 100
jeremy-syn Jul 16, 2024
9d11002
added code to illustrate false detects/rejects
jeremy-syn Jul 18, 2024
95dda3e
updating background noise creation to avoid train/val duplicates
jeremy-syn Jul 18, 2024
3473dbb
added exclude_background_files.txt
jeremy-syn Jul 18, 2024
5114246
put code to build the long test wav into its own (two) files
jeremy-syn Jul 19, 2024
ccec9f4
added eval_long_wav.py to test fpr, fnr on a long wav
jeremy-syn Jul 20, 2024
6910b1c
made build_long_wav work with the musan_path from streaming_config.json
jeremy-syn Jul 20, 2024
04b0608
made build_long_wav work with the musan_path from streaming_config.json
jeremy-syn Jul 20, 2024
c599598
fixed a typo
jeremy-syn Jul 20, 2024
0dced76
fixed issue with path construction in long wav spec
jeremy-syn Jul 20, 2024
223f315
added l2 reg to conv layers
jeremy-syn Jul 21, 2024
76de629
added L2 reg to conv layers
jeremy-syn Jul 21, 2024
2c77d4e
removed some old commented-out code
jeremy-syn Jul 21, 2024
1d1a112
eval_long_wav can now test either h5 models or tflite models
jeremy-syn Jul 21, 2024
c6332f2
Merge branch 'streaming_ww_dev' of gh-syn:mlcommons/tiny into streami…
jeremy-syn Jul 21, 2024
7724e98
added script to create indices into the val set for calibration
jeremy-syn Jul 21, 2024
05cf6d3
code to create calibration set is working
jeremy-syn Jul 22, 2024
7cbcd9f
fixed quantize.py to work with extracted npz calibration set
jeremy-syn Jul 22, 2024
df1d3e8
adjusted volume of foreground and background for testing
jeremy-syn Jul 22, 2024
d341993
added code to save spectrogram in build_long_wav.py
jeremy-syn Jul 22, 2024
e30cbd3
demo notebook should now work with current code
jeremy-syn Jul 23, 2024
0fba129
separated augmentation (built by get_augment_wavs_func()) and feature…
jeremy-syn Jul 25, 2024
40cbecf
made l2 reg parameter a commmand line flag
jeremy-syn Jul 25, 2024
6c6c330
fixed eval_long_wav to work with feature extractor changes
jeremy-syn Jul 25, 2024
f480533
added validation set measurments to eval_long_wav.py
jeremy-syn Jul 25, 2024
8cfc991
moved eval_long_wav to evaluate.py
jeremy-syn Jul 25, 2024
7801dd2
added threshold=0.95 to precision/recall metrics to match evaluate.py
jeremy-syn Jul 25, 2024
23e4434
added a list of 'bad' marvin wav files. modified build_long_wav_spec …
jeremy-syn Jul 26, 2024
2672489
edited comment on saved_model_path to reflect evaluate.py
jeremy-syn Jul 26, 2024
d0f30f8
added bad_marvin_files.txt
jeremy-syn Jul 26, 2024
6d8cb29
fixed error in number of unknown samples for reduced runs
jeremy-syn Jul 27, 2024
ba80cc1
renamed build_long_wav_spec.py -> build_long_wav_def.py to avoid ambi…
jeremy-syn Jul 27, 2024
d626e61
renamed features back to audio to allow easy skipping of feature extr…
jeremy-syn Jul 28, 2024
8eb1f6c
minor edits
jeremy-syn Jul 29, 2024
5960e11
removed debug print statement
jeremy-syn Jul 29, 2024
b0c424a
fixed code for tflite models
jeremy-syn Jul 29, 2024
0ba2b18
adjusted some default training params
jeremy-syn Jul 29, 2024
547569b
catching notebook up to other code
jeremy-syn Jul 29, 2024
6341ee8
clearing out some debug prints
jeremy-syn Jul 29, 2024
5a16bb9
added trained model
jeremy-syn Jul 29, 2024
fad10d4
Merge branch 'streaming_ww_dev' of github.com:mlcommons/tiny into str…
jeremy-syn Jul 29, 2024
0c0c89f
moved label_count out of model_settings into a flag
jeremy-syn Jul 29, 2024
a348974
Merge branch 'streaming_ww_dev' of gh-syn:mlcommons/tiny into streami…
jeremy-syn Jul 29, 2024
a707678
minor edits
jeremy-syn Jul 29, 2024
6836643
added random timeshift
jeremy-syn Aug 4, 2024
143dd8e
added a couple more bad marvins to exclude
jeremy-syn Aug 4, 2024
2461453
added random timeshifting to augmentation function
jeremy-syn Aug 4, 2024
9c89eb7
added flag to enforce a minimum SNR
jeremy-syn Aug 5, 2024
c91b792
centralized data paths in streaming_config.json (no command line argu…
jeremy-syn Aug 5, 2024
c02b502
removed some obsolete cmd line args and modified get_dataset to respe…
jeremy-syn Aug 5, 2024
0a88769
fixed evaulate.py to work with changes on speech_commands_path
jeremy-syn Aug 5, 2024
f0cd836
changed evaluate and quantize to use model_init_path, so by default t…
jeremy-syn Aug 5, 2024
61153a4
adjusted trainign params
jeremy-syn Aug 5, 2024
42d51a0
adjusted trainign params
jeremy-syn Aug 5, 2024
9e9e2ad
updated long wav info
jeremy-syn Aug 5, 2024
2258d55
updated README
jeremy-syn Aug 5, 2024
15a2d89
updated reference model
jeremy-syn Aug 5, 2024
b49eb7d
removed some info messages
jeremy-syn Aug 6, 2024
9c655d2
add line to create plot_dir if it does not exist
jeremy-syn Aug 9, 2024
b9cd142
reduced noise level in long wav
jeremy-syn Aug 9, 2024
54de3e9
refactored command line argument parsing
jeremy-syn Aug 11, 2024
c354ac6
refactored command line argument parsing
jeremy-syn Aug 12, 2024
7e69344
Merge branch 'streaming_ww_dev' of github.com:mlcommons/tiny into str…
jeremy-syn Aug 12, 2024
478f6ae
fixed some errors in README
jeremy-syn Aug 12, 2024
7d97168
fixed quantize to use saved_model_path instead of model_init_path
jeremy-syn Aug 12, 2024
bd96237
added calibration_samples.npz
jeremy-syn Aug 12, 2024
2bc1175
fixing argument processing for evaluate.py to work with either tflite…
jeremy-syn Aug 12, 2024
8e7b718
fixed typo in evaluate.py
jeremy-syn Aug 12, 2024
8c63415
fixed typo in evaluate
jeremy-syn Aug 12, 2024
7608316
fixing merge
jeremy-syn Aug 12, 2024
30308c9
updated tflite model
jeremy-syn Aug 12, 2024
1fea3a0
fixed issue with plot_dir
jeremy-syn Aug 12, 2024
0c73cf8
Merge branch 'streaming_ww_dev' of gh-syn:mlcommons/tiny into streami…
jeremy-syn Aug 12, 2024
fb4e6a2
ignoring trained models other than reference model
jeremy-syn Aug 12, 2024
68f53b8
updated readme
jeremy-syn Aug 12, 2024
02953c9
updated demo notebook
jeremy-syn Aug 12, 2024
c280e5d
added note about the demo notebook to readme
jeremy-syn Aug 12, 2024
900e126
merged gitignore from master branch
jeremy-syn Aug 12, 2024
f9ec337
Merge branch 'master' into streaming_ww_dev
Peter-Chang Sep 9, 2024
5cfabb6
merged work from runner_dev_jeremy
jeremy-syn Jan 19, 2025
50104d0
Merge branch 'master' into streaming_ww_dev
jeremy-syn Jan 19, 2025
updated demo notebook
jeremy-syn committed Aug 12, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
commit 02953c91573892abc45e2068a4475cfd805cd398
189 changes: 56 additions & 133 deletions benchmark/training/streaming_wakeword/demo.ipynb
@@ -65,7 +65,7 @@
"# unrecognized argument error\n",
"sys.argv = sys.argv[0:1] \n",
"\n",
"Flags = util.parse_command()"
"Flags = util.parse_command(\"train\")"
]
},
{
@@ -79,6 +79,7 @@
"\n",
"if notebook_mode == \"inference\": \n",
" load_pretrained_model = True\n",
" Flags.num_samples_training = 2000 # we don't need the full set for inference\n",
" save_model = False\n",
"elif notebook_mode == \"short_training\":\n",
" ## Set these for an extra short test just to validate that the code runs\n",
@@ -97,8 +98,7 @@
" pass\n",
"\n",
"# 'trained_models/str_ww_model.h5' is the default save path for train.py\n",
"# pretrained_model_path = 'trained_models/str_ww_ref_model.h5' # path to load from if load_pretrained_model is True\n",
"pretrained_model_path = 'trained_models/str_ww_model.h5' # path to load from if load_pretrained_model is True\n",
"pretrained_model_path = 'trained_models/str_ww_ref_model.h5' # path to load from if load_pretrained_model is True\n",
"\n",
"samp_freq = Flags.sample_rate"
]
@@ -184,26 +184,25 @@
"metadata": {},
"outputs": [],
"source": [
"# max_target_examples = 3\n",
"# target_count = 0\n",
"max_target_examples = 3\n",
"target_count = 0\n",
"\n",
"# plt.Figure(figsize=(10,4))\n",
"# for dat in ds_train.unbatch():\n",
"# # label_string = dat[1].numpy().decode('utf8')\n",
"# if np.argmax(dat[1]) == 0:\n",
"# target_count += 1\n",
"# ax = plt.subplot(max_target_examples, 1, target_count)\n",
"# # display.display(display.Audio(dat[0].numpy(), rate=16000))\n",
"plt.Figure(figsize=(10,4))\n",
"for dat in ds_train.unbatch():\n",
" if np.argmax(dat[1]) == 0:\n",
" target_count += 1\n",
" ax = plt.subplot(max_target_examples, 1, target_count)\n",
" # display.display(display.Audio(dat[0].numpy(), rate=16000))\n",
"\n",
"# log_spec = dat[0].numpy().squeeze()\n",
"# height = log_spec.shape[0]\n",
"# width = log_spec.shape[1]\n",
"# X = np.linspace(0, 1.0, num=width, dtype=float)\n",
"# Y = range(height)\n",
"# ax.pcolormesh(X, Y, np.squeeze(log_spec))\n",
"# if target_count >= max_target_examples:\n",
"# break\n",
"# plt.tight_layout()"
" log_spec = dat[0].numpy().squeeze()\n",
" height = log_spec.shape[0]\n",
" width = log_spec.shape[1]\n",
" X = np.linspace(0, 1.0, num=width, dtype=float)\n",
" Y = range(height)\n",
" ax.pcolormesh(X, Y, np.squeeze(log_spec))\n",
" if target_count >= max_target_examples:\n",
" break\n",
"plt.tight_layout()"
]
},
{
@@ -214,7 +213,7 @@
"outputs": [],
"source": [
"## look at the label breakdown in the training set\n",
"# print(get_dataset.count_labels(ds_train))\n",
"print(get_dataset.count_labels(ds_train))\n",
"\n"
]
},
@@ -347,8 +346,6 @@
"metadata": {},
"outputs": [],
"source": [
"label_list = ['marvin', 'silent', 'other']\n",
"\n",
"build_and_plot_confusion_matrix(model, ds_train)"
]
},
@@ -363,38 +360,11 @@
{
"cell_type": "code",
"execution_count": null,
"id": "c0a3d5e5-27fc-4059-be36-6132fe073155",
"id": "45495640-7e98-474e-910f-6070f157ad60",
"metadata": {},
"outputs": [],
"source": [
"num_calibration_steps = 5\n",
"tfl_file_name = \"strm_ww_int8.tflite\"\n",
"\n",
"# converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_path)\n",
"converter = tf.lite.TFLiteConverter.from_keras_model(model)\n",
"if True: \n",
" # If we omit this block, we'll get a floating-point TFLite model,\n",
" # with this block, the weights and activations should be quantized to 8b integers, \n",
" converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
"\n",
" ds_calibration = ds_val.unbatch().batch(1).take(num_calibration_steps)\n",
" def representative_dataset_gen():\n",
" for next_spec, label in ds_calibration:\n",
" yield [next_spec] \n",
" \n",
" converter.representative_dataset = representative_dataset_gen\n",
" converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] # use this one\n",
" # converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS]\n",
"\n",
" converter.inference_input_type = tf.int8 # or tf.uint8; should match dat_q in eval_quantized_model.py\n",
" converter.inference_output_type = tf.int8 # or tf.uint8\n",
"\n",
"tflite_quant_model = converter.convert()\n",
"\n",
"with open(tfl_file_name, \"wb\") as fpo:\n",
" fpo.write(tflite_quant_model)\n",
"print(f\"Wrote to {tfl_file_name}\")\n",
"!ls -l $tfl_file_name"
"!python quantize.py --saved_model_path=trained_models/str_ww_ref_model.h5\n"
]
},
{
@@ -433,7 +403,6 @@
"spec, label = next(ds_val.unbatch().batch(1).take(1).as_numpy_iterator())\n",
"\n",
"spec_q = np.array(spec/input_scale + input_zero_point, dtype=np.int8)\n",
"print(f\"min = {np.min(spec_q)}, max = {np.max(spec_q)}\")\n",
"\n",
"interpreter.set_tensor(input_details[0]['index'], spec_q)\n",
"interpreter.invoke()\n",
@@ -491,7 +460,7 @@
"id": "7100ecee-9ed6-42b4-984f-8b75ad642d11",
"metadata": {},
"source": [
"As of 10 Feb 2024, the quantized accuracy on the training set is 83% and 83% on the validation set."
"As of 12 Aug 2024, the quantized accuracy on the validation set is 95.5%. Now we can plot the confusion matrix of the quantized model."
]
},
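The diff above feeds the quantized interpreter an int8 input computed as `spec/input_scale + input_zero_point`. That affine int8 mapping can be checked in isolation; the sketch below is a minimal, hedged illustration (the `scale`/`zero_point` values are made up for the example, and the explicit rounding and clipping are assumptions — the notebook relies on the cast instead; real parameters come from the interpreter's input details).

```python
import numpy as np

def quantize_int8(x, scale, zero_point):
    """Affine int8 quantization: q = round(x / scale) + zero_point,
    clipped to the int8 range [-128, 127]."""
    q = np.round(x / scale) + zero_point
    return np.clip(q, -128, 127).astype(np.int8)

def dequantize_int8(q, scale, zero_point):
    """Inverse mapping back to float: x ≈ (q - zero_point) * scale."""
    return (q.astype(np.float32) - zero_point) * scale

# Illustrative parameters only; in the notebook they would come from
# interpreter.get_input_details()[0]['quantization'].
scale, zero_point = 0.5, -10
x = np.array([-3.0, 0.0, 2.5, 40.0], dtype=np.float32)
q = quantize_int8(x, scale, zero_point)
x_hat = dequantize_int8(q, scale, zero_point)
print(q)      # [-16 -10  -5  70]
print(x_hat)  # [-3.   0.   2.5 40. ]
```

Values that are exact multiples of `scale` round-trip exactly; anything else incurs at most half a quantization step of error.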
{
@@ -501,6 +470,7 @@
"metadata": {},
"outputs": [],
"source": [
"label_list = ['marvin', 'silent', 'other']\n",
"confusion_mtx = tf.math.confusion_matrix(labels, predictions)\n",
"plt.figure(figsize=(6, 6))\n",
"sns.heatmap(confusion_mtx, xticklabels=label_list, yticklabels=label_list, \n",
@@ -516,7 +486,9 @@
"id": "6b2750b5-4aa6-4613-bbc9-ddca78fe6cdd",
"metadata": {},
"source": [
"## Run Model on Long Waveform"
"## Run Model on Long Waveform\n",
"\n",
"The use case this benchmark is meant to model is one of detecting a \"wakeword\" (similar to \"Hey Siri\", \"Alexa\", or \"OK Google\") in a continuous stream of sound, including background noise. So to mimic that use case, we will run the model on a longer waveform that includes several instances of the wakeword (\"Marvin\") and some background noise."
]
},
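The detection logic the notebook applies to the long waveform (counting a detection when the wakeword softmax output exceeds `det_thresh = 0.95`) can be sketched on its own with numpy. This is a minimal illustration, not the benchmark's scoring code: the per-frame probabilities are synthetic, and counting rising edges rather than individual frames is an assumption.

```python
import numpy as np

def count_detections(probs, ww_index=0, det_thresh=0.95):
    """Count detection events in per-frame class probabilities.
    An event starts at each frame where the wakeword probability
    crosses from below to above det_thresh (rising edge)."""
    above = probs[:, ww_index] > det_thresh
    # Rising edge: above now, but not in the previous frame.
    rising = above & ~np.concatenate(([False], above[:-1]))
    return int(rising.sum())

# Synthetic softmax outputs for 3 classes: [marvin, silent, other]
probs = np.array([
    [0.10, 0.10, 0.80],
    [0.97, 0.01, 0.02],  # first detection starts here
    [0.98, 0.01, 0.01],  # still above threshold: same event
    [0.20, 0.10, 0.70],
    [0.96, 0.02, 0.02],  # second detection
])
print(count_detections(probs))  # 2
```

Edge counting avoids reporting one detection per frame while the wakeword is being spoken, which would inflate the false-positive count on multi-frame activations.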
{
@@ -552,14 +524,11 @@
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "857bba03-be91-4f66-b161-539a15b911e9",
"cell_type": "markdown",
"id": "e0e9c14e-48b6-4c1c-97c4-ee24016f64d9",
"metadata": {},
"outputs": [],
"source": [
"ww = model.get_weights()\n",
"ww_tv = model_tv.get_weights()\n"
"For the keras model, we can build an alternate version of the model that accepts inputs of arbitrary length."
]
},
{
@@ -569,8 +538,9 @@
"metadata": {},
"outputs": [],
"source": [
"pretrained_model_uses_qat = hasattr(model.layers[1], \"quantizer\")\n",
"Flags.variable_length=True\n",
"model_tv = models.get_model(args=Flags, use_qat=Flags.use_qat)\n",
"model_tv = models.get_model(args=Flags, use_qat=pretrained_model_uses_qat)\n",
"Flags.variable_length=False\n",
"# transfer weights from trained model into variable-length model\n",
"model_tv.set_weights(model.get_weights())"
@@ -581,7 +551,9 @@
"id": "d31d4877-e789-49ab-ac73-92065a747446",
"metadata": {},
"source": [
"## Run Streaming Test on Long Waveform"
"## Run Streaming Test on Long Waveform\n",
"\n",
"A pre-constructed test wav is included in the repo (`long_wav.wav`) along with a json file that indicates the beginning and end of every instance of the wakeword, `long_wav_ww_windows.json`."
]
},
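Given the ground-truth windows from `long_wav_ww_windows.json`, detections can be scored against them. A minimal sketch of one plausible scoring rule (the exact rule used by `evaluate.py` may differ): a window is a true positive if any detection lands inside it, a detection outside every window is a false positive, and an unhit window is a false negative.

```python
import numpy as np

def score_detections(det_times, ww_windows):
    """Score detection timestamps (seconds) against ground-truth
    wakeword windows given as (start, end) pairs in seconds."""
    hit = np.zeros(len(ww_windows), dtype=bool)
    false_pos = 0
    for t in det_times:
        inside = [i for i, (s, e) in enumerate(ww_windows) if s <= t <= e]
        if inside:
            hit[inside] = True   # multiple detections in one window count once
        else:
            false_pos += 1
    true_pos = int(hit.sum())
    false_neg = len(ww_windows) - true_pos
    return true_pos, false_pos, false_neg

# Synthetic example: three wakeword windows, four detections.
windows = [(1.0, 2.0), (5.0, 6.0), (9.0, 10.0)]
dets = [1.5, 5.2, 5.8, 12.0]  # two windows hit (one twice), one spurious
print(score_detections(dets, windows))  # (2, 1, 1)
```

From these counts, the false-positive rate follows from the waveform duration and the false-negative rate from the number of windows.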
{
@@ -619,11 +591,6 @@
"## build a feature extractor that can operate on longer waveforms.\n",
"## this one can operate on waveforms up to len(long_wav)\n",
"data_config_long = get_dataset.get_data_config(Flags, 'training')\n",
"data_config_long['foreground_volume_max'] = data_config_long['foreground_volume_min'] = 1.0 # scale to [-1.0,1.0]\n",
"data_config_long['background_frequency'] = 0.0 # do not add background noise or time-shift the input\n",
"data_config_long['time_shift_ms'] = 0.0\n",
"data_config_long['desired_samples'] = len(long_wav)\n",
"data_config_long['num_samples'] = -1\n",
"\n",
"with open(\"data_config_nb.json\", \"w\") as fpo:\n",
" json.dump(data_config_long, fpo, indent=4)\n",
@@ -652,24 +619,14 @@
"print(f\"Does spectrogram loaded from file match the one we created?: {specgrams_match}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4a46d67e-418c-45a9-b529-52087dc17be5",
"metadata": {},
"outputs": [],
"source": [
"# plt.plot(long_spec.reshape(-1), long_spec_from_file.reshape(-1), '.')\n",
"# plt.grid(True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "97becd93-f5c3-4998-af50-f43bd6fd5f38",
"metadata": {},
"outputs": [],
"source": [
"# We'll count a detection when the softmax output for the wakeword exceeds the detection threshold det_thresh\n",
"det_thresh = 0.95\n",
"\n",
"yy = model_tv(np.expand_dims(long_spec, 0))[0].numpy()\n",
@@ -688,6 +645,14 @@
"plt.grid(True)"
]
},
{
"cell_type": "markdown",
"id": "dcbb6fe0-1c4e-4321-bb62-0b5ff05d2efa",
"metadata": {},
"source": [
"Take a look at some of the false positives here, and then in the next cell, some of the false negatives."
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -699,7 +664,7 @@
"for i in range(num_fp_clips_to_show):\n",
" fp_start = np.nonzero(ww_false_detects)[0][i] # sample number where the false pos starts\n",
" print(f\"False positive at {fp_start/samp_freq:3.2f}s (sample {fp_start})\")\n",
" fp_clip = slice(fp_start-32000,fp_start+32000) # add 2s before and after\n",
" fp_clip = slice(fp_start-16000,fp_start+16000) # add 2s before and after\n",
" display.display(display.Audio(long_wav[fp_clip], rate=16000))\n"
]
},
@@ -754,6 +719,14 @@
"\n"
]
},
{
"cell_type": "markdown",
"id": "50c9adcc-7a75-4297-8e29-755f494cd0a4",
"metadata": {},
"source": [
"Now we can take a closer look at one of the errors, showing the waveform plot, listening to the audio, and showing the spectrogram, along with the model outputs. "
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -770,56 +743,6 @@
"examine_clip(wav_clip, model_tv, feature_extractor_long)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c80ccf31-8240-4c23-accd-e9eee62265c2",
"metadata": {},
"outputs": [],
"source": [
"t_start = 258.85 - 2.0\n",
"t_stop = 258.85 + 2.0\n",
"i_start_wav = int(t_start*Flags.sample_rate)\n",
"i_stop_wav = int(t_stop*Flags.sample_rate)\n",
"i_start_spec = int(t_start/(Flags.window_stride_ms/1000))\n",
"i_stop_spec = int(t_stop/(Flags.window_stride_ms/1000))\n",
"\n",
"wav_slice = slice(i_start_wav, i_stop_wav)\n",
"spec_slice = slice(i_start_spec, i_stop_spec)\n",
"\n",
"t_spec= np.arange(long_spec.shape[0])*(Flags.window_stride_ms/1000)\n",
"\n",
"ww_detected = np.repeat(ww_detected_spec_scale, Flags.window_stride_ms*Flags.sample_rate/1000)\n",
"extra_zeros = np.zeros(len(long_wav)-len(ww_detected))\n",
"print(f\"added {len(extra_zeros)} extra zeros\")\n",
"ww_detected = np.concatenate((extra_zeros, ww_detected), axis=0)\n",
"\n",
"plt.figure(figsize=(8, 6))\n",
"\n",
"plt.subplot(3,1,1)\n",
"# plt.imshow(np.squeeze(long_spec).T, origin=\"lower\", aspect='auto')\n",
"plt.pcolormesh(t_spec[spec_slice], np.arange(long_spec.shape[-1]), long_spec[spec_slice].squeeze().T)\n",
"\n",
"plt.subplot(3,1,2)\n",
"plt.plot(t[wav_slice], long_wav[wav_slice], \n",
" t[wav_slice], ww_present[wav_slice],\n",
" t[wav_slice], 1.1*ww_detected[wav_slice])\n",
"plt.xlim([t_start, t_stop])\n",
"plt.grid(True)\n",
"plt.legend(['Waveform', 'Wakeword Present', 'Wakeword Detected'], loc='lower right', fontsize=8)\n",
"\n",
"# The model output yy loses some length because of valid-padded convolutions. \n",
"# Add that length back to time-align input and output\n",
"yy_ext = np.concatenate((np.zeros((len(long_spec)-len(yy), yy.shape[1])), yy))\n",
"plt.subplot(3,1,3)\n",
"plt.plot(t_spec[spec_slice], yy_ext[spec_slice])\n",
"plt.plot(t_spec[spec_slice], det_thresh*np.ones(t_spec[spec_slice].shape), 'k-', linewidth=0.5)\n",
"plt.legend(label_list+[\"Threshold\"], loc='lower right', fontsize=8);\n",
"plt.xlim([t_start, t_stop])\n",
"plt.tight_layout()\n",
"# display.display(display.Audio(long_wav, rate=16000))"
]
},
{
"cell_type": "markdown",
"id": "19666bc5-4b43-41ef-b93d-050ff7f87dd0",
@@ -865,7 +788,7 @@
"metadata": {},
"outputs": [],
"source": [
"det_thresh = 0.85\n",
"det_thresh = 0.95\n",
"## shows detection when wakeword activation is strongest output\n",
"# ww_detected_spec_scale = (np.argmax(yy, axis=1)==0) # detections on the time scale of spectrograms\n",
"\n",
@@ -904,7 +827,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "0ff4c8cc-80ce-4ecb-b30f-2f545ce4f14c",
"id": "5fffca80-705b-46ff-b259-12624d10be62",
"metadata": {},
"outputs": [],
"source": []