diff --git a/ultravox/data/datasets.py b/ultravox/data/datasets.py index 89f131f2..256f78b9 100644 --- a/ultravox/data/datasets.py +++ b/ultravox/data/datasets.py @@ -41,11 +41,11 @@ # from https://arxiv.org/pdf/2402.08846 "Transcribe speech to text: <|audio|>", # from GPT-4 - "Capture every word from <|audio|> verbatim", - "Convert speech to text from <|audio|>", - "Listen and transcribe the complete text from <|audio|>", - "Record in writing what is spoken in <|audio|>", - "Transcribe the spoken words from <|audio|> with exact wording and punctuation", + "Capture every word from the audio verbatim\n<|audio|>", + "Convert speech to text from audio\n<|audio|>", + "Listen and transcribe the complete text from audio\n<|audio|>", + "Record in writing what is spoken in audio\n<|audio|>", + "Transcribe the spoken words from audio with exact wording and punctuation\n<|audio|>", ] ANSWER_PROMPTS = [ # from Gazelle diff --git a/ultravox/training/configs/release_config.yaml b/ultravox/training/configs/release_config.yaml index 973656a7..36b7a5f6 100644 --- a/ultravox/training/configs/release_config.yaml +++ b/ultravox/training/configs/release_config.yaml @@ -1,9 +1,9 @@ # SLM with ultravox & llama3.1, trained wtih knowledge distillation. 
-exp_name: "ultravox-v0_3" +exp_name: "ultravox-v0_4" # Make sure to accept the license agreement on huggingface hub text_model: "meta-llama/Meta-Llama-3.1-8B-Instruct" -audio_model: "openai/whisper-small" +audio_model: "openai/whisper-medium" loss_config: @@ -14,10 +14,11 @@ loss_config: val_sets: ["anyinstruct", "soda", "peoplespeech"] batch_size: 24 -max_steps: 7200 # x8x24 = 1,382,400 samples +max_steps: 14400 # x8x24 = 2,764,800 samples -data_sets: [] +data_sets: ["anyinstruct"] data_dicts: +# continuation - path: "fixie-ai/librispeech_asr" name: "clean" splits: @@ -35,6 +36,14 @@ data_dicts: assistant_template: "{{ continuation }}" transcript_template: "{{ text }}" weight: 1 + - path: "fixie-ai/peoples_speech" + name: "clean" + splits: + - "train" # 1_501_271 samples + user_template: "Continue the following text using less than 50 words:\n\n<|audio|>" + assistant_template: "{{ continuation }}" + transcript_template: "{{ text_proc.format_asr_text(text) }}" + weight: 8 - path: "fixie-ai/common_voice_17_0" name: "en" splits: @@ -43,3 +52,165 @@ data_dicts: assistant_template: "{{ continuation }}" transcript_template: "{{ text_proc.format_asr_text(sentence) }}" weight: 8 + - path: "fixie-ai/common_voice_17_0" + name: "ar" + splits: + - "train" # 28_369 samples + user_template: "Continue the following text using less than 50 words:\n\n<|audio|>" + assistant_template: "{{ continuation }}" + transcript_template: "{{ sentence }}" + weight: 0.2 + - path: "fixie-ai/common_voice_17_0" + name: "de" + splits: + - "train" # 589_100 samples + user_template: "Continue the following text using less than 50 words:\n\n<|audio|>" + assistant_template: "{{ continuation }}" + transcript_template: "{{ sentence }}" + weight: 4 + - path: "fixie-ai/common_voice_17_0" + name: "es" + splits: + - "train" # 336_846 samples + user_template: "Continue the following text using less than 50 words:\n\n<|audio|>" + assistant_template: "{{ continuation }}" + transcript_template: "{{ sentence }}" + 
weight: 3 + - path: "fixie-ai/common_voice_17_0" + name: "fr" + splits: + - "train" # 558_054 samples + user_template: "Continue the following text using less than 50 words:\n\n<|audio|>" + assistant_template: "{{ continuation }}" + transcript_template: "{{ sentence }}" + weight: 4 + - path: "fixie-ai/common_voice_17_0" + name: "it" + splits: + - "train" # 169_771 samples + user_template: "Continue the following text using less than 50 words:\n\n<|audio|>" + assistant_template: "{{ continuation }}" + transcript_template: "{{ sentence }}" + weight: 1.2 + - path: "fixie-ai/common_voice_17_0" + name: "ja" + splits: + - "train" # 10_039 samples + user_template: "Continue the following text using less than 50 words:\n\n<|audio|>" + assistant_template: "{{ continuation }}" + transcript_template: "{{ sentence }}" + weight: 0.1 + - path: "fixie-ai/common_voice_17_0" + name: "pt" + splits: + - "train" # 21_968 samples + user_template: "Continue the following text using less than 50 words:\n\n<|audio|>" + assistant_template: "{{ continuation }}" + transcript_template: "{{ sentence }}" + weight: 0.2 + - path: "fixie-ai/common_voice_17_0" + name: "ru" + splits: + - "train" # 26_377 samples + user_template: "Continue the following text using less than 50 words:\n\n<|audio|>" + assistant_template: "{{ continuation }}" + transcript_template: "{{ sentence }}" + weight: 0.2 +# ASR task + - path: "fixie-ai/librispeech_asr" + name: "clean" + splits: + - "train.100" # 28_539 samples + - "train.360" # 104_014 samples + user_template: "{{ dataset._get_transcribe_prompt() }}" + assistant_template: "{{ text }}" + transcript_template: "{{ text }}" + weight: 0.1 + - path: "fixie-ai/librispeech_asr" + name: "other" + splits: + - "train.500" # 148_688 samples + user_template: "{{ dataset._get_transcribe_prompt() }}" + assistant_template: "{{ text }}" + transcript_template: "{{ text }}" + weight: 0.1 + - path: "fixie-ai/peoples_speech" + name: "clean" + splits: + - "train" # 1_501_271 samples 
+ user_template: "{{ dataset._get_transcribe_prompt() }}" + assistant_template: "{{ text_proc.format_asr_text(text) }}" + transcript_template: "{{ text_proc.format_asr_text(text) }}" + weight: 0.8 + - path: "fixie-ai/common_voice_17_0" + name: "en" + splits: + - "train" # 1_101_170 samples + user_template: "{{ dataset._get_transcribe_prompt() }}" + assistant_template: "{{ text_proc.format_asr_text(sentence) }}" + transcript_template: "{{ text_proc.format_asr_text(sentence) }}" + weight: 0.8 + - path: "fixie-ai/common_voice_17_0" + name: "ar" + splits: + - "train" # 28_369 samples + user_template: "{{ dataset._get_transcribe_prompt() }}" + assistant_template: "{{ text_proc.format_asr_text(sentence) }}" + transcript_template: "{{ sentence }}" + weight: 0.02 + - path: "fixie-ai/common_voice_17_0" + name: "de" + splits: + - "train" # 589_100 samples + user_template: "{{ dataset._get_transcribe_prompt() }}" + assistant_template: "{{ text_proc.format_asr_text(sentence) }}" + transcript_template: "{{ sentence }}" + weight: 0.4 + - path: "fixie-ai/common_voice_17_0" + name: "es" + splits: + - "train" # 336_846 samples + user_template: "{{ dataset._get_transcribe_prompt() }}" + assistant_template: "{{ text_proc.format_asr_text(sentence) }}" + transcript_template: "{{ sentence }}" + weight: 0.3 + - path: "fixie-ai/common_voice_17_0" + name: "fr" + splits: + - "train" # 558_054 samples + user_template: "{{ dataset._get_transcribe_prompt() }}" + assistant_template: "{{ text_proc.format_asr_text(sentence) }}" + transcript_template: "{{ sentence }}" + weight: 0.4 + - path: "fixie-ai/common_voice_17_0" + name: "it" + splits: + - "train" # 169_771 samples + user_template: "{{ dataset._get_transcribe_prompt() }}" + assistant_template: "{{ text_proc.format_asr_text(sentence) }}" + transcript_template: "{{ sentence }}" + weight: 0.12 + - path: "fixie-ai/common_voice_17_0" + name: "ja" + splits: + - "train" # 10_039 samples + user_template: "{{ dataset._get_transcribe_prompt() }}" + 
assistant_template: "{{ text_proc.format_asr_text(sentence) }}" + transcript_template: "{{ sentence }}" + weight: 0.01 + - path: "fixie-ai/common_voice_17_0" + name: "pt" + splits: + - "train" # 21_968 samples + user_template: "{{ dataset._get_transcribe_prompt() }}" + assistant_template: "{{ text_proc.format_asr_text(sentence) }}" + transcript_template: "{{ sentence }}" + weight: 0.02 + - path: "fixie-ai/common_voice_17_0" + name: "ru" + splits: + - "train" # 26_377 samples + user_template: "{{ dataset._get_transcribe_prompt() }}" + assistant_template: "{{ text_proc.format_asr_text(sentence) }}" + transcript_template: "{{ sentence }}" + weight: 0.02