Commit

Add zipformer recipe for audio tagging (#1421)
marcoyang1998 authored Apr 9, 2024
1 parent f2e36ec commit 1732daf
Showing 22 changed files with 3,836 additions and 1 deletion.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -26,7 +26,7 @@ repos:
# E121,E123,E126,E226,E24,E704,W503,W504

  - repo: https://github.com/pycqa/isort
-   rev: 5.10.1
+   rev: 5.12.0
    hooks:
      - id: isort
        args: ["--profile=black"]
12 changes: 12 additions & 0 deletions egs/audioset/AT/README.md
@@ -0,0 +1,12 @@
# Introduction

This is an audio tagging recipe for [AudioSet](https://research.google.com/audioset/#/). It aims to predict the sound events of an audio clip.

[./RESULTS.md](./RESULTS.md) contains the latest results.


# Zipformer

| Encoder | Feature type |
| --------| -------------|
| Zipformer | Frame level fbank|
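
The frame-level fbank features are extracted with lhotse. Below is a rough, self-contained sketch of what "frame level fbank" means here; the wav path is a placeholder, and the 80-bin setting mirrors the recipe's manifest script:

```python
from lhotse import Fbank, FbankConfig, Recording

# Placeholder path; any wav file works for this illustration.
recording = Recording.from_file("example.wav")
extractor = Fbank(FbankConfig(num_mel_bins=80))

# Returns a (num_frames, 80) array: one 80-dim log-mel vector per ~10 ms frame.
feats = extractor.extract(
    samples=recording.load_audio(), sampling_rate=recording.sampling_rate
)
print(feats.shape)
```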
44 changes: 44 additions & 0 deletions egs/audioset/AT/RESULTS.md
@@ -0,0 +1,44 @@
## Results

### zipformer
See <https://github.com/k2-fsa/icefall/pull/1421> for more details

[zipformer](./zipformer)

You can find a pretrained model, training logs, decoding logs, and decoding results at:
<https://huggingface.co/marcoyang/icefall-audio-tagging-audioset-zipformer-2024-03-12#/>

The model achieves the following mean average precision (mAP) on AudioSet:

| Model | mAP |
| ------ | ------- |
| Zipformer-AT | 45.1 |
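
Here mAP is the macro-average of per-class average precision over the 527 event classes. A minimal sketch of the metric itself (not the recipe's evaluation code), assuming scikit-learn and random placeholder data:

```python
import numpy as np
from sklearn.metrics import average_precision_score

num_clips, num_classes = 1000, 527
y_true = np.random.randint(0, 2, size=(num_clips, num_classes))  # multi-hot labels
y_score = np.random.rand(num_clips, num_classes)                 # per-class scores

# Average precision per class, then the mean over classes (macro mAP).
per_class_ap = average_precision_score(y_true, y_score, average=None)
print(f"mAP: {np.mean(per_class_ap):.3f}")
```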

The training command is:

```bash
export CUDA_VISIBLE_DEVICES="4,5,6,7"
subset=full

python zipformer/train.py \
--world-size 4 \
--num-epochs 50 \
--exp-dir zipformer/exp_at_as_${subset} \
--start-epoch 1 \
--use-fp16 1 \
--num-events 527 \
--audioset-subset $subset \
--max-duration 1000 \
--enable-musan True \
--master-port 13455
```

The evaluation command is:

```bash
python zipformer/evaluate.py \
--epoch 32 \
--avg 8 \
--exp-dir zipformer/exp_at_as_full \
--max-duration 500
```
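
Audio tagging is a multi-label task: the model produces one logit per event class (527 here), and per-class probabilities are typically obtained with a sigmoid. The snippet below is a hypothetical sketch of turning logits into tags; the `logits` tensor and `label_names` list are placeholders, not the recipe's actual API:

```python
import torch

num_events = 527
logits = torch.randn(1, num_events)  # placeholder for a model's output on one clip
label_names = [f"class_{i}" for i in range(num_events)]  # placeholder class names

probs = torch.sigmoid(logits)  # independent per-class probabilities
top = torch.topk(probs[0], k=5)
for p, idx in zip(top.values.tolist(), top.indices.tolist()):
    print(f"{label_names[idx]}: {p:.2f}")
```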
1 change: 1 addition & 0 deletions egs/audioset/AT/local/compute_fbank_musan.py
177 changes: 177 additions & 0 deletions egs/audioset/AT/local/generate_audioset_manifest.py
@@ -0,0 +1,177 @@
#!/usr/bin/env python3
# Copyright 2023 Xiaomi Corp. (authors: Xiaoyu Yang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This file generates the manifest and computes the fbank features for the
AudioSet dataset. The generated manifests and features are stored in data/fbank.
"""

import argparse
import csv
import glob
import logging
import os
from typing import Dict

import torch
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
from lhotse.audio import Recording
from lhotse.cut import MonoCut
from lhotse.supervision import SupervisionSegment

from icefall.utils import get_executor

torch.set_num_threads(1)
torch.set_num_interop_threads(1)


def get_ID_mapping(csv_file):
    # Map each AudioSet class id ("mid", e.g. /m/09x0r) to its index in
    # class_labels_indices.csv.
    mapping = {}
    with open(csv_file, "r") as fin:
        reader = csv.reader(fin, delimiter=",")
        for i, row in enumerate(reader):
            if i == 0:
                continue
            mapping[row[1]] = row[0]
    return mapping
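
# For illustration only (hypothetical rows), class_labels_indices.csv looks like:
#   index,mid,display_name
#   0,/m/09x0r,"Speech"
#   1,/m/05zppz,"Male speech, man speaking"
# so get_ID_mapping would return {"/m/09x0r": "0", "/m/05zppz": "1"}.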


def parse_csv(csv_file: str, id_mapping: Dict):
    # The csv file is expected to be the official AudioSet segments csv, which
    # has a 3-line header followed by rows like this (illustrative values):
    # ------------------------------------------------------
    # # YTID, start_seconds, end_seconds, positive_labels
    # --xxxxxxxxxx, 30.000, 40.000, "/m/09x0r,/m/05zppz"
    # --yyyyyyyyyy, 50.000, 60.000, "/m/04rlf"
    # ------------------------------------------------------
    # It returns a dict mapping each YTID to its labels encoded as
    # semicolon-separated class indices, e.g. "0;451".

    def name2id(names):
        ids = [id_mapping[name] for name in names.split(",")]
        return ";".join(ids)

    mapping = {}
    with open(csv_file, "r") as fin:
        reader = csv.reader(fin, delimiter=" ")
        for i, row in enumerate(reader):
            if i <= 2:
                continue
            key = row[0].replace(",", "")
            mapping[key] = name2id(row[-1])
    return mapping


def get_parser():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument("--dataset-dir", type=str, default="downloads/audioset")

    parser.add_argument(
        "--split",
        type=str,
        default="balanced",
        choices=["balanced", "unbalanced", "eval"],
    )

    parser.add_argument(
        "--feat-output-dir",
        type=str,
        default="data/fbank",
    )

    return parser


def main():
    parser = get_parser()
    args = parser.parse_args()

    dataset_dir = args.dataset_dir
    split = args.split
    feat_output_dir = args.feat_output_dir

    num_jobs = min(15, os.cpu_count())
    num_mel_bins = 80

    if split in ["balanced", "unbalanced"]:
        csv_file = f"{dataset_dir}/{split}_train_segments.csv"
    elif split == "eval":
        csv_file = f"{dataset_dir}/eval_segments.csv"
    else:
        raise ValueError()

    class_indices_csv = f"{dataset_dir}/class_labels_indices.csv"
    id_mapping = get_ID_mapping(class_indices_csv)
    labels = parse_csv(csv_file, id_mapping)

    audio_files = glob.glob(f"{dataset_dir}/{split}/*.wav")

    new_cuts = []
    for i, audio in enumerate(audio_files):
        cut_id = audio.split("/")[-1].split("_")[0]
        recording = Recording.from_file(audio, cut_id)
        cut = MonoCut(
            id=cut_id,
            start=0.0,
            duration=recording.duration,
            channel=0,
            recording=recording,
        )
        supervision = SupervisionSegment(
            id=cut_id,
            recording_id=cut.recording.id,
            start=0.0,
            channel=0,
            duration=cut.duration,
        )
        try:
            supervision.audio_event = labels[cut_id]
        except KeyError:
            logging.info(f"No labels found for {cut_id}.")
            continue
        cut.supervisions = [supervision]
        new_cuts.append(cut)

        if i % 100 == 0 and i:
            logging.info(f"Processed {i} cuts until now.")

    cuts = CutSet.from_cuts(new_cuts)

    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))

    logging.info(f"Computing fbank features for {split}")
    with get_executor() as ex:
        cuts = cuts.compute_and_store_features(
            extractor=extractor,
            storage_path=f"{feat_output_dir}/{split}_feats",
            num_jobs=num_jobs if ex is None else 80,
            executor=ex,
            storage_type=LilcomChunkyWriter,
        )

    manifest_output_dir = feat_output_dir + "/" + f"cuts_audioset_{split}.jsonl.gz"

    logging.info(f"Storing the manifest to {manifest_output_dir}")
    cuts.to_jsonl(manifest_output_dir)
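
    # A quick sanity check for the generated manifest (a sketch, assuming lhotse
    # is importable in the same environment):
    #   from lhotse import load_manifest
    #   cuts = load_manifest(manifest_output_dir)
    #   print(next(iter(cuts)).supervisions[0].audio_event)  # e.g. "0;137"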


if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"

    logging.basicConfig(format=formatter, level=logging.INFO)
    main()
104 changes: 104 additions & 0 deletions egs/audioset/AT/prepare.sh
@@ -0,0 +1,104 @@
#!/usr/bin/env bash

# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

set -eou pipefail

# run stage -1 to stage 4 by default
stage=-1
stop_stage=4

dl_dir=$PWD/download

# We assume that you have downloaded AudioSet and placed it under
# $dl_dir/audioset; the folder structure should look like this:
# - $dl_dir/audioset
#     - balanced
#     - eval
#     - unbalanced
# If you haven't downloaded AudioSet, please refer to
# https://github.com/RicherMans/SAT/blob/main/datasets/audioset/1_download_audioset.sh.

. shared/parse_options.sh || exit 1

# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data

log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

log "Running prepare.sh"

log "dl_dir: $dl_dir"

if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
  log "Stage -1: Download the necessary csv files"
  if [ ! -e $dl_dir/audioset/.csv.done ]; then
    wget --continue "http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/class_labels_indices.csv" -O "${dl_dir}/audioset/class_labels_indices.csv"
    wget --continue http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/balanced_train_segments.csv -O "${dl_dir}/audioset/balanced_train_segments.csv"
    wget --continue http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/eval_segments.csv -O "${dl_dir}/audioset/eval_segments.csv"
    touch $dl_dir/audioset/.csv.done
  fi
fi

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "Stage 0: Construct the audioset manifest and compute the fbank features for balanced set"
  fbank_dir=data/fbank
  if [ ! -e $fbank_dir/.balanced.done ]; then
    python local/generate_audioset_manifest.py \
      --dataset-dir $dl_dir/audioset \
      --split balanced \
      --feat-output-dir $fbank_dir
    touch $fbank_dir/.balanced.done
  fi
fi

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Construct the audioset manifest and compute the fbank features for unbalanced set"
  fbank_dir=data/fbank
  if [ ! -e $fbank_dir/.unbalanced.done ]; then
    python local/generate_audioset_manifest.py \
      --dataset-dir $dl_dir/audioset \
      --split unbalanced \
      --feat-output-dir $fbank_dir
    touch $fbank_dir/.unbalanced.done
  fi
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Construct the audioset manifest and compute the fbank features for eval set"
  fbank_dir=data/fbank
  if [ ! -e $fbank_dir/.eval.done ]; then
    python local/generate_audioset_manifest.py \
      --dataset-dir $dl_dir/audioset \
      --split eval \
      --feat-output-dir $fbank_dir
    touch $fbank_dir/.eval.done
  fi
fi

if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "Stage 3: Prepare musan manifest"
  # We assume that you have downloaded the musan corpus
  # to $dl_dir/musan
  mkdir -p data/manifests
  if [ ! -e data/manifests/.musan.done ]; then
    lhotse prepare musan $dl_dir/musan data/manifests
    touch data/manifests/.musan.done
  fi
fi

if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Compute fbank for musan"
  mkdir -p data/fbank
  if [ ! -e data/fbank/.musan.done ]; then
    ./local/compute_fbank_musan.py
    touch data/fbank/.musan.done
  fi
fi
1 change: 1 addition & 0 deletions egs/audioset/AT/shared