Skip to content

Commit

Permalink
Merge branch 'main' into maanug/resiliency-tests
Browse files: browse the repository at this point in the history
  • Loading branch information
maanug-nv authored Dec 2, 2024
2 parents f63a338 + f17c418 commit bf35284
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 8 deletions.
11 changes: 11 additions & 0 deletions examples/audio/process_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ def main(cfg: ProcessConfig) -> ProcessConfig:
raise RuntimeError('Model does not have a sampler')

if cfg.audio_dir is not None:
input_dir = cfg.audio_dir
filepaths = list(glob.glob(os.path.join(cfg.audio_dir, f"**/*.{cfg.audio_type}"), recursive=True))
else:
# get filenames from manifest
Expand All @@ -193,6 +194,15 @@ def main(cfg: ProcessConfig) -> ProcessConfig:
audio_file = manifest_dir / audio_file
filepaths.append(str(audio_file.absolute()))

# common path for all files
common_path = os.path.commonpath(filepaths)
if Path(common_path).is_relative_to(manifest_dir):
# if all paths are relative to the manifest, use manifest dir as input dir
input_dir = manifest_dir
else:
# use the parent of the common path as input dir
input_dir = Path(common_path).parent

if cfg.max_utts is not None:
# Limit the number of utterances to process
filepaths = filepaths[: cfg.max_utts]
Expand Down Expand Up @@ -238,6 +248,7 @@ def autocast():
batch_size=cfg.batch_size,
num_workers=cfg.num_workers,
input_channel_selector=cfg.input_channel_selector,
input_dir=input_dir,
)

logging.info(f"Finished processing {len(filepaths)} files!")
Expand Down
14 changes: 12 additions & 2 deletions nemo/collections/audio/models/audio_to_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,7 @@ def process(
batch_size: int = 1,
num_workers: Optional[int] = None,
input_channel_selector: Optional[ChannelSelectorType] = None,
input_dir: Optional[str] = None,
) -> List[str]:
"""
Takes paths to audio files and returns a list of paths to processed
Expand All @@ -344,6 +345,7 @@ def process(
num_workers: Number of workers for the dataloader
input_channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio.
If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`.
input_dir: Optional, directory that contains the input files. If provided, the output directory will mirror the input directory structure.
Returns:
Paths to processed audio signals.
Expand Down Expand Up @@ -413,9 +415,17 @@ def process(

for example_idx in range(processed_batch.size(0)):
# This assumes the data loader is not shuffling files
file_name = os.path.basename(paths2audio_files[file_idx])
if input_dir is not None:
# Make sure the output has the same directory structure as the input
filepath_relative = os.path.relpath(paths2audio_files[file_idx], start=input_dir)
else:
# Input dir is not provided, save files in the output directory
filepath_relative = os.path.basename(paths2audio_files[file_idx])
# Prepare output file
output_file = os.path.join(output_dir, f'processed_{file_name}')
output_file = os.path.join(output_dir, filepath_relative)
# Create output dir if necessary
if not os.path.isdir(os.path.dirname(output_file)):
os.makedirs(os.path.dirname(output_file))
# Crop the output signal to the actual length
output_signal = processed_batch[example_idx, :, : input_length[example_idx]].cpu().numpy()
# Write audio
Expand Down
2 changes: 1 addition & 1 deletion nemo/collections/llm/gpt/data/packed_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def prepare_packed_sequence_data(
sequences, histogram = create_hist(dataset, max_seq_length)

assignments = create_packing_strategy(histogram, packed_sequence_size, packing_algorithm)
output_data = fill_packing_strategy(assignments, sequences, packed_sequence_size)
output_data = fill_packing_strategy(assignments, sequences, packed_sequence_size, tokenizer.eos_id)

# save output data
np.save(output_path, output_data)
Expand Down
11 changes: 9 additions & 2 deletions nemo/utils/sequence_packing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def create_packing_strategy(


def fill_packing_strategy(
assignments: List[List[int]], sequences: Dict[int, List[Dict]], pack_size: int
assignments: List[List[int]], sequences: Dict[int, List[Dict]], pack_size: int, pad_id: int
) -> List[Dict]:
"""
Fills the packing strategy with actual sequence data based on assignments and sequence information.
Expand All @@ -192,6 +192,7 @@ def fill_packing_strategy(
sequences: A dictionary where keys are sequence lengths and values are lists of corresponding sequences
from the dataset (output of 'create_hist').
pack_size: The maximum capacity of each bin.
pad_id: The tokenizer's padding token.
Returns:
output_data: A list of dictionaries, where each dictionary represents a packed sequence with its input IDs,
Expand All @@ -205,7 +206,13 @@ def fill_packing_strategy(
input_ids = np.array([x['input_ids'] for x in per_seq_data])[perm].tolist()
try:
loss_mask = np.array(
[[idx >= x['answer_start_idx'] for idx in range(len(x['input_ids']))] for x in per_seq_data]
[
[
idx >= x['answer_start_idx'] and x['input_ids'][idx] != pad_id
for idx in range(len(x['input_ids']))
]
for x in per_seq_data
]
)[perm].tolist()
except KeyError:
loss_mask = None
Expand Down
7 changes: 4 additions & 3 deletions scripts/nlp_language_modeling/prepare_packed_ft_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ def tokenize_dataset(cfg: 'DictConfig'):

max_seq_length = dataset.max_seq_length
pad_id = dataset.tokenizer.eos_id
tokenizer = dataset.tokenizer
pad_seq_length_to_mult = dataset.pad_seq_length_to_mult
dataset = np.array([dataset[i] for i in range(len(dataset))])
if cp_size > 1:
Expand Down Expand Up @@ -162,7 +163,7 @@ def pre_pad_dataset(data, max_seq_length, max_length_to_pad, pad_id):
for data in dataset:
max_length_to_pad = min(max_seq_length, ceil_to_nearest(len(data['input_ids']), pad_seq_length_to_mult))
pre_pad_dataset(data, max_seq_length, max_length_to_pad, pad_id)
return dataset
return dataset, tokenizer


@dataclass
Expand All @@ -187,11 +188,11 @@ def from_config(self, cfg: 'DictConfig'):
)
def main(cfg: 'DictConfig') -> None:
args = PackingArgs().from_config(cfg)
dataset = tokenize_dataset(cfg)
dataset, tokenizer = tokenize_dataset(cfg)
sequences, histogram = create_hist(dataset, cfg.model.data.train_ds.max_seq_length)
for pack_size in args.pack_sizes:
assignments = create_packing_strategy(histogram, pack_size, args.packing_algorithm)
output_data = fill_packing_strategy(assignments, sequences, pack_size)
output_data = fill_packing_strategy(assignments, sequences, pack_size, tokenizer.eos_id)

# save output data
os.makedirs(args.output_dir, exist_ok=True)
Expand Down

0 comments on commit bf35284

Please sign in to comment.