Patch audio summary to support multiple channels and save faster (#575)

* Faster saving using soundfile, support multiple channels
* Support multiple channels, change shape from (C, L) to (L, C)
* Fix test and docs

Co-authored-by: Tzu-Wei Huang <[email protected]>
lanpa · Jun 29, 2020 · f4a6c73 · f4a6c73
1 parent eb7e8d2
commit f4a6c73
Show file tree

Hide file tree

Showing 3 changed files with 22 additions and 21 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -56,6 +56,7 @@ install:
   - pip install tb-nightly
   - pip install crc32c
   - pip install protobuf==3.8.0
+  - pip install SoundFile
   - conda install ffmpeg
   - conda list
   - python -c "import imageio; imageio.plugins.ffmpeg.download()"

diff --git a/tensorboardX/summary.py b/tensorboardX/summary.py
@@ -392,32 +392,31 @@ def make_video(tensor, fps):
 
 
 def audio(tag, tensor, sample_rate=44100):
+    """
+    Args:
+      tensor: A 2-D float Tensor of shape `[frames, channels]` where `channels` is 1 or 2.
+        The values should be between [-1, 1]. A 1-D tensor is also accepted.
+    """
+    import io
+    import soundfile
     tensor = make_np(tensor)
-    tensor = tensor.squeeze()
     if abs(tensor).max() > 1:
         print('warning: audio amplitude out of range, auto clipped.')
         tensor = tensor.clip(-1, 1)
-    assert(tensor.ndim == 1), 'input tensor should be 1 dimensional.'
+    if tensor.ndim == 1:  # old API, which expects single channel audio
+        tensor = np.expand_dims(tensor, axis=1)
+
+    assert(tensor.ndim == 2), 'Input tensor should be 2 dimensional.'
+    length_frames, num_channels = tensor.shape
+    assert num_channels == 1 or num_channels == 2, 'The second dimension should be 1 or 2.'
+
+    with io.BytesIO() as fio:
+        soundfile.write(fio, tensor, samplerate=sample_rate, format='wav')
+        audio_string = fio.getvalue()
 
-    tensor_list = [int(32767.0 * x) for x in tensor]
-    import io
-    import wave
-    import struct
-    fio = io.BytesIO()
-    Wave_write = wave.open(fio, 'wb')
-    Wave_write.setnchannels(1)
-    Wave_write.setsampwidth(2)
-    Wave_write.setframerate(sample_rate)
-    tensor_enc = b''
-    tensor_enc += struct.pack("<" + "h" * len(tensor_list), *tensor_list)
-
-    Wave_write.writeframes(tensor_enc)
-    Wave_write.close()
-    audio_string = fio.getvalue()
-    fio.close()
     audio = Summary.Audio(sample_rate=sample_rate,
-                          num_channels=1,
-                          length_frames=len(tensor_list),
+                          num_channels=num_channels,
+                          length_frames=length_frames,
                           encoded_audio_string=audio_string,
                           content_type='audio/wav')
     return Summary(value=[Summary.Value(tag=tag, audio=audio)])

diff --git a/tensorboardX/writer.py b/tensorboardX/writer.py
@@ -747,7 +747,8 @@ def add_audio(self, tag, snd_tensor, global_step=None, sample_rate=44100, wallti
             sample_rate (int): sample rate in Hz
             walltime (float): Optional override default walltime (time.time()) of event
         Shape:
-            snd_tensor: :math:`(1, L)`. The values should lie between [-1, 1].
+            snd_tensor: :math:`(L, C)`, where `L` is the number of audio frames and `C` is
+              the number of channels (1 for mono, 2 for stereo). The values should lie between [-1, 1].
         """
         if self._check_caffe2_blob(snd_tensor):
             snd_tensor = workspace.FetchBlob(snd_tensor)