Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

merge: polish #1591

Merged
merged 6 commits into from
Aug 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions augur/io/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def read_metadata(metadata_file, delimiters=DEFAULT_DELIMITERS, columns=None, id
>>> read_metadata("tests/functional/filter/data/metadata.tsv", id_columns=("Virus name",))
Traceback (most recent call last):
...
Exception: None of the possible id columns (('Virus name',)) were found in the metadata's columns ('strain', 'virus', 'accession', 'date', 'region', 'country', 'division', 'city', 'db', 'segment', 'authors', 'url', 'title', 'journal', 'paper_url')
Exception: None of the possible id columns ('Virus name') were found in the metadata's columns ('strain', 'virus', 'accession', 'date', 'region', 'country', 'division', 'city', 'db', 'segment', 'authors', 'url', 'title', 'journal', 'paper_url')

We also allow iterating through metadata in fixed chunk sizes.

Expand Down Expand Up @@ -110,7 +110,7 @@ def read_metadata(metadata_file, delimiters=DEFAULT_DELIMITERS, columns=None, id

# If we couldn't find a valid index column in the metadata, alert the user.
if not id_columns_present:
raise Exception(f"None of the possible id columns ({id_columns!r}) were found in the metadata's columns {tuple(chunk.columns)!r}")
raise Exception(f"None of the possible id columns ({', '.join(map(repr, id_columns))}) were found in the metadata's columns ({', '.join(map(repr, chunk.columns))})")
else:
index_col = id_columns_present[0]

Expand Down Expand Up @@ -599,19 +599,19 @@ def __init__(self, path: str, delimiters: Sequence[str], id_columns: Sequence[st
raise AugurError(f"{self.path}: Expected a header row but it is empty.")

# Infer the ID column.
self.id_column = self._find_first(id_columns)
self.id_column = self._find_id_column(id_columns)

def open(self, **kwargs):
"""Open the file with auto-compression/decompression."""
return open_file(self.path, newline='', **kwargs)

def _find_first(self, columns: Sequence[str]):
def _find_id_column(self, columns: Sequence[str]):
"""Return the first column in `columns` that is present in the metadata.
"""
for column in columns:
if column in self.columns:
return column
raise AugurError(f"{self.path}: None of ({columns!r}) are in the columns {tuple(self.columns)!r}.")
raise AugurError(f"{self.path}: None of the possible id columns ({', '.join(map(repr, columns))}) were found in the metadata's columns ({', '.join(map(repr, self.columns))}).")

def rows(self, strict: bool = True):
"""Yield rows in a dictionary format. Empty lines are ignored.
Expand Down
73 changes: 65 additions & 8 deletions augur/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@
you want to use a version different from what's on PATH), set the SQLITE3
environment variable to path of the desired sqlite3 executable.
"""
import gettext
import os
import re
import subprocess
import sys
from functools import reduce
Expand All @@ -43,7 +45,7 @@
from textwrap import dedent
from typing import Iterable, Tuple, TypeVar

from augur.argparse_ import ExtendOverwriteDefault
from augur.argparse_ import ExtendOverwriteDefault, SKIP_AUTO_DEFAULT_IN_HELP
from augur.errors import AugurError
from augur.io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, Metadata
from augur.io.print import print_err, print_debug
Expand All @@ -53,6 +55,12 @@
T = TypeVar('T')


# Use ngettext() without a message catalog for its singular/plural handling so
# we can make proper error messages: with no catalog (and no fallback) it
# returns its first argument when the count is 1 and its second argument
# otherwise. gettext() (no "n") is conventionally aliased as "_", so alias
# ngettext() as "_n".
_n = gettext.NullTranslations().ngettext


class NamedMetadata(Metadata):
name: str
"""User-provided descriptive name for this metadata file."""
Expand All @@ -73,14 +81,14 @@ def register_parser(parent_subparsers):
parser = parent_subparsers.add_parser("merge", help=first_line(__doc__))

input_group = parser.add_argument_group("inputs", "options related to input")
input_group.add_argument("--metadata", nargs="+", action="extend", required=True, metavar="NAME=FILE", help="metadata files with assigned names")
input_group.add_argument("--metadata", nargs="+", action="extend", required=True, metavar="NAME=FILE", help="Required. Metadata table names and file paths. Names are arbitrary monikers used solely for referring to the associated input file in other arguments and in output column names. Paths must be to seekable files, not unseekable streams. Compressed files are supported." + SKIP_AUTO_DEFAULT_IN_HELP)

input_group.add_argument("--metadata-id-columns", default=DEFAULT_ID_COLUMNS, nargs="+", action=ExtendOverwriteDefault, metavar="COLUMN", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
input_group.add_argument("--metadata-delimiters", default=DEFAULT_DELIMITERS, nargs="+", action=ExtendOverwriteDefault, metavar="CHARACTER", help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.")
input_group.add_argument("--metadata-id-columns", default=DEFAULT_ID_COLUMNS, nargs="+", action=ExtendOverwriteDefault, metavar="COLUMN", help=f"Possible metadata column names containing identifiers, considered in the order given. Columns will be considered for all metadata tables. Only one ID column will be inferred for each table. (default: {' '.join(map(shquote_humanized, DEFAULT_ID_COLUMNS))})" + SKIP_AUTO_DEFAULT_IN_HELP)
input_group.add_argument("--metadata-delimiters", default=DEFAULT_DELIMITERS, nargs="+", action=ExtendOverwriteDefault, metavar="CHARACTER", help=f"Possible field delimiters to use for reading metadata tables, considered in the order given. Delimiters will be considered for all metadata tables. Only one delimiter will be inferred for each table. (default: {' '.join(map(shquote_humanized, DEFAULT_DELIMITERS))})" + SKIP_AUTO_DEFAULT_IN_HELP)

output_group = parser.add_argument_group("outputs", "options related to output")
output_group.add_argument('--output-metadata', required=True, metavar="FILE", help="merged metadata as TSV")
output_group.add_argument('--quiet', action="store_true", default=False, help="suppress informational messages on stderr")
output_group.add_argument('--output-metadata', required=True, metavar="FILE", help="Required. Merged metadata as TSV. Compressed files are supported." + SKIP_AUTO_DEFAULT_IN_HELP)
output_group.add_argument('--quiet', action="store_true", default=False, help="Suppress informational and warning messages normally written to stderr. (default: disabled)" + SKIP_AUTO_DEFAULT_IN_HELP)

return parser

Expand All @@ -96,7 +104,7 @@ def run(args):
raise AugurError(dedent(f"""\
All metadata inputs must be assigned a name, e.g. with NAME=FILE.

The following inputs were missing a name:
The following {_n("input was", "inputs were", len(unnamed))} missing a name:

{indented_list(unnamed, ' ' + ' ')}
"""))
Expand All @@ -109,7 +117,7 @@ def run(args):
raise AugurError(dedent(f"""\
Metadata input names must be unique.

The following names were used more than once:
The following {_n("name was", "names were", len(duplicate_names))} used more than once:

{indented_list(duplicate_names, ' ' + ' ')}
"""))
Expand Down Expand Up @@ -315,3 +323,52 @@ def count_unique(xs: Iterable[T]) -> Iterable[Tuple[T, int]]:

def indented_list(xs, prefix):
    """
    Join the strings in `xs` into one string, one item per line.

    Items are separated by a newline followed by `prefix`, so every item
    after the first starts with `prefix`. The first item is not prefixed;
    the caller supplies its indentation.
    """
    separator = "\n" + prefix
    return separator.join(xs)


def shquote_humanized(x):
    r"""
    Shell-quote `x` in a form that is easy for people to read.

    Control characters that people normally write as C-style escapes (bell,
    backspace, form feed, newline, carriage return, tab, vertical tab) are
    emitted using the ANSI-C quoting supported by shells (specifically, Bash),
    e.g. ``$'\t'``, rather than as quoted literal characters. Everything
    between those characters is passed through ``shquote``.

    <https://www.gnu.org/software/bash/manual/bash.html#ANSI_002dC-Quoting>

    >>> shquote_humanized("abc")
    'abc'

    >>> shquote_humanized("\t")
    "$'\\t'"

    >>> shquote_humanized("abc def")
    "'abc def'"

    >>> shquote_humanized("abc\tdef")
    "abc$'\\t'def"
    """
    c_escapes = {
        '\a': r'\a',
        '\b': r'\b',
        '\f': r'\f',
        '\n': r'\n',
        '\r': r'\r',
        '\t': r'\t',
        '\v': r'\v',
    }

    # A capturing character class, so re.split() keeps each matched control
    # character as its own element between the surrounding text runs.
    splitter = '([' + ''.join(c_escapes.values()) + '])'

    pieces = []
    for part in re.split(splitter, x):
        if part in c_escapes:
            pieces.append("$'" + c_escapes[part] + "'")
        elif part:
            pieces.append(shquote(part))
        # Empty parts (which split leaves around captured separators) are
        # skipped here so that we don't emit a quoted empty string for each.

    # If nothing at all was produced, the input itself was empty, so quote one
    # empty string for the whole result.
    return ''.join(pieces) or shquote('')
2 changes: 1 addition & 1 deletion tests/functional/export_v2/cram/metadata-id-columns.t
Original file line number Diff line number Diff line change
Expand Up @@ -41,5 +41,5 @@ This should fail with a helpful error message.
> --auspice-config "$TESTDIR/../data/auspice_config1.json" \
> --maintainers "Nextstrain Team" \
> --output dataset.json > /dev/null
ERROR: None of the possible id columns (('strain', 'name')) were found in the metadata's columns ('invalid_id', 'div', 'mutation_length')
ERROR: None of the possible id columns ('strain', 'name') were found in the metadata's columns ('invalid_id', 'div', 'mutation_length')
[1]
11 changes: 10 additions & 1 deletion tests/functional/merge/cram/merge.t
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ Metadata names must be unique.
> --output-metadata -
ERROR: Metadata input names must be unique.

The following names were used more than once:
The following name was used more than once:

'data'

Expand All @@ -197,6 +197,15 @@ Duplicates.
ERROR: sqlite3 invocation failed
[2]

No id column found.

$ ${AUGUR} merge \
> --metadata X=x-id-column.tsv Y=y.tsv \
> --metadata-id-columns strain \
> --output-metadata /dev/null
ERROR: x-id-column.tsv: None of the possible id columns ('strain') were found in the metadata's columns ('id', 'a', 'b', 'c').
[2]

SQLITE3 env var can be used to override `sqlite3` location (and failure is
handled).

Expand Down
Loading