Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a CLI interface for conversion between file formats #103

Merged
merged 16 commits into from
Mar 24, 2025
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next commit
Add cli file format converter
thequilo committed Mar 20, 2025
commit 7fdbe28c8942cc479be6da60bda77fbe8362d1a6
2 changes: 1 addition & 1 deletion meeteval/io/__init__.py
Original file line number Diff line number Diff line change
@@ -5,7 +5,7 @@
from . import uem
from . import seglst
from . import keyed_text
from .load_wrapper import load
from .smart import load, dump
from .stm import STM, STMLine
from .rttm import RTTM, RTTMLine
from .uem import UEM, UEMLine
132 changes: 132 additions & 0 deletions meeteval/io/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
import os
import sys
from stat import S_ISFIFO
from pathlib import Path
import argparse
import meeteval

# formats = {
# 'ctm': meeteval.io.ctm.CTM,
# 'stm': meeteval.io.stm.STM,
# 'seglst': meeteval.io.seglst.SegLST,
# 'rttm': meeteval.io.rttm.RTTM,
# }

# d = meeteval.io.load('').to_seglst()
# for s in d:
# s['words'] = '<NA>'
# meeteval.io.STM.new(d).dump('.stm')


def is_piping_in():
    """
    Tell whether the standard input of this process is fed by a pipe.

    Returns:
        True if file descriptor 0 refers to a FIFO, False otherwise.
    """
    stdin_mode = os.fstat(0).st_mode
    return S_ISFIFO(stdin_mode)


def convert(input_files, output_file, input_format, output_format, **kwargs):
    """
    Load all input files, merge them and dump the result in another format.

    Args:
        input_files: Inputs handed one by one to `meeteval.io.load`. Entries
            that are `str` or `Path` additionally provide the placeholder
            `filestem` (the file name without its suffix).
        output_file: Destination handed to `meeteval.io.dump`.
        input_format: Format name used for loading every input.
        output_format: Format name used for dumping the merged data.
        **kwargs: Additional fields written into every segment. String
            values are expanded with `str.format`, using the fields of the
            segment and the `filestem` placeholder.
    """
    converted = []

    for source in input_files:
        seglst = meeteval.io.load(source, format=input_format).to_seglst()

        # Only inputs given as a path have a file stem.
        placeholders = {}
        if isinstance(source, (str, Path)):
            placeholders['filestem'] = Path(source).stem

        if kwargs:
            for segment in seglst:
                for key, value in kwargs.items():
                    # Fields are assigned in order, so a later template can
                    # refer to a field that was set by an earlier one.
                    segment[key] = (
                        value.format(**segment, **placeholders)
                        if isinstance(value, str) else value
                    )

        converted.append(seglst)

    merged = meeteval.io.SegLST.merge(*converted)
    meeteval.io.dump(merged, output_file, output_format)


def cli():
    """
    Command-line entry point for converting between file formats.

    Registers one subcommand ``<reader>2<writer>`` for every combination of
    input format (ctm, stm, seglst, rttm) and output format (stm, seglst,
    rttm), parses the command line and calls `convert`.

    When data is piped into the command, stdin is used as the only input
    and passing input files is a usage error. An output file of "-" writes
    to stdout.

    Raises:
        FileExistsError: If the output file already exists and neither
            --force nor -f was given.
    """
    from meeteval.wer.__main__ import SmartFormatter

    piping_in = is_piping_in()

    parser = argparse.ArgumentParser(
        formatter_class=SmartFormatter,
    )
    commands = parser.add_subparsers(
        title='Subcommands',
    )

    for reader in ['ctm', 'stm', 'seglst', 'rttm']:
        for writer in ['stm', 'seglst', 'rttm']:  # 'ctm' is not supported as output format
            command_parser = commands.add_parser(
                f'{reader}2{writer}',
                help=f'Converts from {reader.upper()} to {writer.upper()}.',
                formatter_class=SmartFormatter,
                add_help=False,
            )
            # Help is registered manually as --help only, because -f is
            # used for --force below.
            command_parser.add_argument(
                '--help',
                help='show this help message and exit',
                action='help',
                default=argparse.SUPPRESS,
            )
            command_parser.add_argument(
                '--force', '-f',
                help='Overwrite the output file if it exists.',
                action='store_true',
                default=False,
            )
            # When piping into the command, we do not want to allow
            # additional input files
            command_parser.add_argument(
                'input_files',
                nargs='+' if not piping_in else '*',
                help='The input files.',
            )
            command_parser.add_argument(
                'output_file', type=str,
                help='The output file. "-" means stdout.'
            )
            command_parser.set_defaults(
                input_format=reader,
                output_format=writer,
                parser=command_parser,
            )

            # Special cases
            if reader == 'ctm':
                command_parser.add_argument(
                    '--speaker',
                    type=str,
                    help='The speaker name to use for the CTM. Defaults to the file stem.',
                    required=piping_in,
                    default=None if piping_in else '{filestem}',
                )
            if reader == 'rttm' and writer in ('stm', 'seglst'):
                command_parser.add_argument(
                    '--words',
                    default='<NA>',
                )

    args = dict(vars(parser.parse_args()))

    # Without a subcommand none of the sub-parser defaults are set. Report a
    # usage error instead of failing with a KeyError further down.
    if 'parser' not in args:
        parser.error('A subcommand is required.')

    command_parser = args.pop('parser')
    # --force only controls the overwrite check. It must not be forwarded to
    # convert(), which would otherwise write it as a field into every segment.
    force = args.pop('force')

    # "-" stands for stdout, so there is no file that could be overwritten.
    write_to_stdout = args['output_file'] == '-'
    if not force and not write_to_stdout:
        if os.path.exists(args['output_file']):
            raise FileExistsError(
                f'Output file "{args["output_file"]}" already exists. '
                'Use --force / -f to overwrite.'
            )

    if piping_in:
        if args['input_files']:
            import subprocess
            command_parser.error(f'Input files ({subprocess.list2cmdline(args["input_files"])}) are not allowed when piping into the command.')
        args['input_files'] = [sys.stdin]
    if write_to_stdout:
        args['output_file'] = sys.stdout
    convert(**args)
22 changes: 4 additions & 18 deletions meeteval/io/base.py
Original file line number Diff line number Diff line change
@@ -10,6 +10,7 @@
from dataclasses import dataclass
from itertools import groupby
import decimal
from meeteval.io.smart import _open

if typing.TYPE_CHECKING:
from typing import Self
@@ -353,31 +354,16 @@ def filenames(self):
def to_seglst(self) -> 'SegLST':
    """Convert every line into a SegLST segment and wrap them in a `SegLST`."""
    from meeteval.io.seglst import SegLST
    segments = [line.to_seglst_segment() for line in self.lines]
    return SegLST(segments)

def map(self, fn):
    """Apply `fn` to every line and return a new object of the same class."""
    mapped_lines = [fn(entry) for entry in self.lines]
    return self.__class__(mapped_lines)

@classmethod
def new(cls, s, **defaults) -> 'Self':
    """
    Build an instance of this class from anything `asseglst` accepts.

    Args:
        s: Object that is converted to segments with `asseglst`.
        **defaults: Fallback values for fields. A value that is present in
            a segment takes precedence over its default.
    """
    from meeteval.io.seglst import asseglst
    lines = []
    for segment in asseglst(s):
        fields = {**defaults, **segment}
        lines.append(cls.line_cls.from_dict(fields))
    return cls(lines)


def _open(f, mode='r'):
    """
    Return a context manager that yields a text file object for `f`.

    Args:
        f: An already opened text stream (`io.TextIOBase`), an
            ``http://`` / ``https://`` URL given as `str`, or a file system
            path (`str` or `os.PathLike`).
        mode: Mode passed to `open` for file system paths. It is ignored
            for streams and URLs.

    Returns:
        A context manager yielding the text stream. A stream passed in by
        the caller is left open on exit; files and URL responses opened
        here are closed on exit.

    Raises:
        FileNotFoundError: If the URL cannot be opened.
        TypeError: If `f` has an unsupported type.
    """
    if isinstance(f, io.TextIOBase):
        # The stream belongs to the caller (e.g. stdin): do not close it.
        return contextlib.nullcontext(f)
    if isinstance(f, str) and f.startswith(('http://', 'https://')):
        # Web request. Checking for the full scheme keeps local files whose
        # name merely starts with "http" on the regular file branch below.
        import urllib.request, urllib.error
        try:
            resource = urllib.request.urlopen(f)
        except urllib.error.URLError as e:
            raise FileNotFoundError(f) from e
        # https://stackoverflow.com/a/19156107/5766934
        # The wrapper is a context manager itself; leaving the `with` block
        # closes it together with the underlying response.
        return io.TextIOWrapper(
            resource, resource.headers.get_content_charset())
    if isinstance(f, (str, os.PathLike)):
        return open(f, mode)
    raise TypeError(type(f), f)


class _VerboseKeyError(KeyError):
# origin: paderbox.utils.mapping.DispatchError
def __str__(self):
8 changes: 5 additions & 3 deletions meeteval/io/seglst.py
Original file line number Diff line number Diff line change
@@ -68,12 +68,14 @@ def load(
) -> 'Self':
from meeteval.io.base import _open
files = file if isinstance(file, (tuple, list)) else [file]

parsed = []
for f in files:
with _open(f, 'r') as fd:
parsed.append(cls.parse(fd.read(), parse_float=parse_float))

try:
parsed.append(cls.parse(fd.read(), parse_float=parse_float))
except ValueError as e:
# Catches simplejson's JSONDecodeError and our own ValueErrors
raise ValueError(f'Unknown JSON format: {f}. Only SegLST format is supported.') from e
return cls.merge(*parsed)

@classmethod
102 changes: 77 additions & 25 deletions meeteval/io/load_wrapper.py → meeteval/io/smart.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,48 @@
import os
import decimal
import io
from pathlib import Path

__all__ = ['load']

__all__ = ['load', 'dump']


def _get_format(format, path):
    """
    Look up the `meeteval.io` class that belongs to a format name.

    Args:
        format: Format name, one of 'stm', 'rttm', 'uem', 'ctm', 'seglst',
            'keyed_text' or 'json'.
        path: Only used in the error message.

    Raises:
        ValueError: If `format` is not a known format name.
    """
    import meeteval
    classes_by_name = {
        'stm': meeteval.io.STM,
        'rttm': meeteval.io.RTTM,
        'uem': meeteval.io.UEM,
        'ctm': meeteval.io.CTM,
        'seglst': meeteval.io.SegLST,
        'keyed_text': meeteval.io.KeyedText,
        # JSON files are handled by the SegLST class as well.
        'json': meeteval.io.SegLST,
    }

    if format not in classes_by_name:
        raise ValueError(f'Unknown file type: {path}')

    return classes_by_name[format]


def _open(f, mode='r'):
    """
    Return a context manager that yields a text file object for `f`.

    Args:
        f: An already opened text stream (`io.TextIOBase`), an
            ``http://`` / ``https://`` URL given as `str`, or a file system
            path (`str` or `os.PathLike`).
        mode: Mode passed to `open` for file system paths. It is ignored
            for streams and URLs.

    Returns:
        A context manager yielding the text stream. A stream passed in by
        the caller is left open on exit; files and URL responses opened
        here are closed on exit.

    Raises:
        FileNotFoundError: If the URL cannot be opened.
        TypeError: If `f` has an unsupported type.
    """
    import contextlib
    if isinstance(f, io.TextIOBase):
        # The stream belongs to the caller (e.g. stdin): do not close it.
        return contextlib.nullcontext(f)
    if isinstance(f, str) and f.startswith(('http://', 'https://')):
        # Web request. Checking for the full scheme keeps local files whose
        # name merely starts with "http" on the regular file branch below.
        import urllib.request, urllib.error
        try:
            resource = urllib.request.urlopen(f)
        except urllib.error.URLError as e:
            raise FileNotFoundError(f) from e
        # https://stackoverflow.com/a/19156107/5766934
        # The wrapper is a context manager itself; leaving the `with` block
        # closes it together with the underlying response.
        return io.TextIOWrapper(
            resource, resource.headers.get_content_charset())
    if isinstance(f, (str, os.PathLike)):
        return open(f, mode)
    raise TypeError(type(f), f)


def _guess_format(path: 'Path | io.TextIOBase'):
@@ -63,7 +103,7 @@ def load(path: 'Path | list[Path]', parse_float=decimal.Decimal, format: 'str |
- 'rttm': NIST RTTM format
- 'uem': NIST UEM format
- 'ctm': NIST CTM format
- 'seglst': Chime7 JSON format (SegLST)
- 'seglst': SegLST (Chime7 JSON format)
- 'keyed_text': Kaldi KeyedText format
Args:
@@ -84,33 +124,45 @@ def load(path: 'Path | list[Path]', parse_float=decimal.Decimal, format: 'str |
raise ValueError(
f'All files must have the same format, but found {types} for {path}.'
)

import meeteval

if isinstance(loaded[0], meeteval.io.CTM):
return meeteval.io.CTMGroup({p.stem: l for p, l in zip(path, loaded)})

return loaded[0].__class__.merge(*loaded)

import meeteval
if format in (None, 'none', 'auto'):
format = _guess_format(Path(path))

loader = _get_format(format, path)

return loader.load(path, parse_float=parse_float)


def dump(obj, path, format: 'str | None'=None):
"""
Dump a `meeteval.io` object to a file.
The file format is guessed from the suffix of `path` by default. The format to use can be specified by the user by
supplying `format`. This is especially useful when the target does not have a (correct) suffix, e.g., when writing
to STDOUT.
Available options are:
- 'stm': NIST STM format
- 'rttm': NIST RTTM format
- 'uem': NIST UEM format
- 'ctm': NIST CTM format
- 'seglst': SegLST (Chime7 JSON format)
- 'keyed_text': Kaldi KeyedText format
Args:
obj: Object to dump. It is converted with the `new` classmethod of the class selected for `format`.
path: Destination that is passed on to the `dump` method of the converted object.
format: Name of the output format. `None`, 'none' and 'auto' guess the format from `path`.
"""

if format in (None, 'none', 'auto'):
format = _guess_format(Path(path))

# NOTE(review): the if/elif chain below and `return load_fn(...)` use `load_fn` and `parse_float`, which are not
# parameters of `dump`. They look like removed lines of the previous `load` implementation that the diff view
# interleaved here - confirm against the committed file.
if format == 'stm':
load_fn = meeteval.io.STM.load
elif format == 'rttm':
load_fn = meeteval.io.RTTM.load
elif format == 'uem':
load_fn = meeteval.io.UEM.load
elif format == 'ctm':
load_fn = meeteval.io.CTMGroup.load
elif format == 'seglst':
load_fn = meeteval.io.SegLST.load
elif format == 'keyed_text':
load_fn = meeteval.io.KeyedText.load
elif format == 'json':
# Guess the type from the file content. Only support Chime7 JSON / SegLST format.
try:
return meeteval.io.SegLST.load(path, parse_float=parse_float)
except ValueError as e:
# Catches simplejson's JSONDecodeError and our own ValueErrors
raise ValueError(f'Unknown JSON format: {path}. Only SegLST format is supported.') from e
else:
raise ValueError(f'Unknown file type: {path}')
dumper = _get_format(format, path)

return load_fn(path, parse_float=parse_float)
return dumper.new(obj).dump(path)
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -138,6 +138,7 @@
'meeteval-wer=meeteval.wer.__main__:cli',
'meeteval-der=meeteval.der.__main__:cli',
'meeteval-viz=meeteval.viz.__main__:cli',
'meeteval-io=meeteval.io.__main__:cli',
]
},
include_dirs=[numpy.get_include()],
109 changes: 109 additions & 0 deletions tests/test_io_converters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import itertools
from pathlib import Path
import pytest
import subprocess
import meeteval

example_files = (Path(__file__).parent.parent / 'example_files').absolute()

test_files = {
'ctm': 'hyp1.ctm',
'stm': 'hyp.stm',
'rttm': 'hyp.rttm',
'seglst': 'hyp.seglst.json',
}

def run(cmd):
    """
    Execute `cmd` with bash inside the example files folder.

    Returns:
        The `subprocess.CompletedProcess` if the command exits with code 0.

    Raises:
        Exception: If the return code is not 0. The message contains the
            command, the return code, stdout and stderr.
    """
    completed = subprocess.run(
        cmd,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=False,
        universal_newlines=True,
        cwd=example_files,
        executable='bash',  # echo "<(cat hyp.stm)" requires bash not sh.
    )

    if completed.returncode != 0:
        if isinstance(cmd, str):
            command_line = cmd
        else:
            import shlex
            command_line = shlex.join(cmd)
        raise Exception(
            f'$ {command_line}'
            f'\n\nreturncode: {completed.returncode}'
            f'\n\nstdout:\n{completed.stdout}'
            f'\n\nstderr:\n{completed.stderr}'
        )

    return completed


@pytest.fixture
def output_folder(tmp_path):
    """
    Provide a fresh, empty and absolute folder for converted files.

    The folder is created below pytest's per-test `tmp_path` instead of the
    current working directory. Files left behind by an aborted earlier run
    can therefore never trigger the "already exists" check of the converter,
    and pytest takes care of removing the folder.
    """
    folder = tmp_path / 'converted'
    folder.mkdir()
    return folder


@pytest.mark.parametrize(
    'from_format, to_format',
    [
        (reader, writer)
        for reader in ['ctm', 'stm', 'rttm', 'seglst']
        for writer in ['stm', 'rttm', 'seglst']
    ]
)
def test_converter(from_format, to_format, output_folder):
    """Every reader/writer combination produces an output file."""
    source = example_files / test_files[from_format]
    target = output_folder / test_files[to_format]
    run(f'meeteval-io {from_format}2{to_format} {source} {target}')
    assert target.exists()


def test_merge_ctm_filename(output_folder):
    """Merging several CTM files uses the file stems as speaker names."""
    run(f'meeteval-io ctm2stm {example_files / "hyp1.ctm"} {example_files / "hyp2.ctm"} {output_folder / "hyp.stm"}')
    assert (output_folder / "hyp.stm").exists()
    # The comparison has to be asserted, otherwise its result is discarded
    # and the speaker names are never checked.
    assert meeteval.io.load(output_folder / "hyp.stm").to_seglst().unique('speaker') == {'hyp1', 'hyp2'}


def test_merge_ctm_speaker_arg(output_folder):
    """--speaker overrides the speaker name for all merged CTM files."""
    run(f'meeteval-io ctm2stm --speaker spk-A {example_files / "hyp1.ctm"} {example_files / "hyp2.ctm"} {output_folder / "hyp.stm"}')
    assert (output_folder / "hyp.stm").exists()
    # The comparison has to be asserted, otherwise its result is discarded
    # and the speaker name is never checked.
    assert meeteval.io.load(output_folder / "hyp.stm").to_seglst().unique('speaker') == {'spk-A'}

def test_piping(output_folder):
    """stm2rttm accepts piped input and "-" (stdout) as output file."""
    source = example_files / "hyp.stm"
    target = output_folder / "hyp.rttm"
    commands = [
        f'cat {source} | meeteval-io stm2rttm {target}',
        f'cat {source} | meeteval-io stm2rttm -',
        f'cat {source} | meeteval-io stm2rttm - | echo',
        f'cat {source} | meeteval-io stm2rttm - > {target}',
        # Process substitution: the input is a path argument, not stdin.
        f'meeteval-io stm2rttm <(cat {source}) -',
    ]
    for command in commands:
        run(command)

def test_convert_correct(output_folder):
    """Converted files agree with the reference files of the examples."""
    stm_reference = example_files / "hyp.stm"
    rttm_reference = example_files / "hyp.rttm"
    stm_output = output_folder / "hyp.stm"
    rttm_output = output_folder / "hyp.rttm"

    run(f'meeteval-io stm2rttm {stm_reference} {rttm_output}')
    assert rttm_output.read_text() == rttm_reference.read_text()

    # TODO: dump timestamps as str or int / float?
    # run(f'meeteval-io stm2seglst {example_files / "hyp.stm"} {output_folder / "hyp.seglst.json"}')
    # assert (output_folder / "hyp.seglst.json").read_text() == (example_files / "hyp.seglst.json").read_text()

    # RTTM carries no words, so the transcripts are compared against '<NA>'.
    run(f'meeteval-io rttm2stm {rttm_reference} {stm_output}')
    expected = meeteval.io.load(stm_reference).map(lambda l: l.replace(transcript='<NA>'))
    assert meeteval.io.load(stm_output) == expected

    # The output file exists from the previous step, hence -f.
    run(f'meeteval-io seglst2stm {example_files / "hyp.seglst.json"} {stm_output} -f')
    assert meeteval.io.load(stm_output) == meeteval.io.load(stm_reference)

def test_convert_file_exists(output_folder):
    """An existing output file is only overwritten with --force / -f."""
    source = example_files / "hyp.stm"
    target = output_folder / "hyp.rttm"

    # First run creates the output file.
    run(f'meeteval-io stm2rttm {source} {target}')

    # Second run without the flag must refuse to overwrite it.
    with pytest.raises(Exception, match='.*Output file .* already exists.* Use --force / -f to overwrite.'):
        run(f'meeteval-io stm2rttm {source} {target}')

    for flag in ('--force', '-f'):
        run(f'meeteval-io stm2rttm {flag} {source} {target}')

def test_ctm_piping():
    """Piping a CTM file into ctm2stm requires an explicit --speaker."""
    ctm_file = example_files / "hyp1.ctm"

    run(f'cat {ctm_file} | meeteval-io ctm2stm --speaker spk-A - > /dev/null')

    with pytest.raises(Exception, match='.*the following arguments are required: --speaker.*'):
        run(f'cat {ctm_file} | meeteval-io ctm2stm - > /dev/null')

    # With process substitution the input is a path argument, so no
    # --speaker is needed.
    run(f'meeteval-io ctm2stm <(cat {ctm_file}) - > /dev/null')