Skip to content

Commit

Permalink
feature(summary_counts): Translates R code to Python
Browse files Browse the repository at this point in the history
Closes #110

- Updates `isoslam/default_config.yaml` and `processing` argparse to be consistent.
- In turn the `utils.update_config()` function has been corrected with additional tests to correctly update nested
  dictionaries from the configuration.
- Logging is set up and `loguru` used with formatted output and level set from configuration or command line options.
- Translates the R code that counted and summarised the transcripts to Python code (using Pandas) along with tests.
- Necessary updates to `pyproject.toml`.

The output from IsoSLAM can now be summarised using...

```bash
isoslam --output-dir output summary-counts --file-pattern "tests/**/*.tsv"
```

Rather than calling `Rscript` to do the work.

This will serve as a template for translating the other R code.
  • Loading branch information
ns-rse committed Dec 13, 2024
1 parent 5eeee2a commit f0520ad
Show file tree
Hide file tree
Showing 14 changed files with 2,169 additions and 45 deletions.
17 changes: 16 additions & 1 deletion isoslam/default_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,20 @@ bam_file: data/bam/.bam
gtf_file: data/gtf/
bed_file: data/bed/
vcf_file: data/vcf/
summary: false
summary_counts:
file_pattern: "**/*.tsv"
separator: "\t"
groupby:
- Transcript_id
- Chr
- Strand
- Start
- End
- Assignment
- Conversions
- filename
output:
outfile: summary_counts.tsv
sep: "\t"
index: false
plot: false
99 changes: 94 additions & 5 deletions isoslam/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,19 @@

import argparse
import gzip
from collections.abc import Callable
from collections.abc import Callable, Generator
from datetime import datetime
from importlib import resources
from pathlib import Path
from typing import Any, TextIO

import pandas as pd
import pysam

# from cgatcore import iotools
from loguru import logger
from ruamel.yaml import YAML, YAMLError

from isoslam import utils

CONFIG_DOCUMENTATION_REFERENCE = """# For more information on configuration and how to use it see:
# https://sudlab.github.io/IsoSLAM\n"""

Expand Down Expand Up @@ -134,6 +135,29 @@ def write_yaml(
logger.error(exception)


def load_and_update_config(args: argparse.Namespace | None) -> dict[str, Any]:
    """
    Load a configuration file to dictionary and update entries with user supplied arguments.

    If ``args`` does not contain any value for ``args.config_file`` the default configuration
    (``isoslam/default_config.yaml``) is loaded, otherwise the user specified configuration is loaded.
    Once the configuration is loaded any user specified options update the dictionary.

    Parameters
    ----------
    args : argparse.Namespace
        Arguments supplied by user.

    Returns
    -------
    dict[str, Any]
        Dictionary of configuration options updated with user specified options.
    """
    # NOTE(review): the annotation allows ``args`` to be None but ``vars(None)`` raises TypeError —
    # confirm callers always pass a Namespace before tightening the signature.
    config = read_yaml() if vars(args)["config_file"] is None else read_yaml(vars(args)["config_file"])
    return utils.update_config(config, vars(args))  # type: ignore[arg-type]


def create_config(args: argparse.Namespace | None = None) -> None:
"""
Write the default configuration file to disk.
Expand Down Expand Up @@ -218,11 +242,11 @@ def _get_loader(file_ext: str = "bam") -> Callable: # type: ignore[type-arg]
"""
if file_ext == ".bam":
return _load_bam
if file_ext == ".bed" or file_ext == ".bed.gz":
if file_ext in (".bed", ".bed.gz"):
return _load_bed
if file_ext == ".gtf":
return _load_gtf
if file_ext == ".vcf" or file_ext == ".vcf.gz":
if file_ext in (".vcf", ".vcf.gz"):
return _load_vcf
raise ValueError(file_ext)

Expand Down Expand Up @@ -319,3 +343,68 @@ def _load_vcf(vcf_file: str | Path) -> pysam.libcbcf.VariantFile:
return pysam.VariantFile(vcf_file)
except FileNotFoundError as e:
raise e


def _find_files(pattern: str = "**/*.tsv") -> Generator: # type: ignore[type-arg]
"""
Find files that match the given pattern.
Parameters
----------
pattern : str
Pattern (regular expression) of files to search for.
Returns
-------
Generator[_P, None, None]
A generator of files found that match the given pattern.
"""
pwd = Path.cwd()
return pwd.rglob(pattern)


def load_files(pattern: str = "**/*.tsv", sep: str = "\t") -> dict[str, pd.DataFrame]:
    """
    Read the set of files matching ``pattern`` into a dictionary of Pandas DataFrames.

    Parameters
    ----------
    pattern : str
        Glob pattern of file names to search for.
    sep : str
        Separator/delimiter used in files.

    Returns
    -------
    dict[str, pd.DataFrame]
        Dictionary mapping each file's stem (name without suffix) to the DataFrame read from it.
    """
    # NOTE: files in different directories that share a stem silently overwrite one another here.
    return {x.stem: pd.read_csv(x, sep=sep) for x in _find_files(pattern)}


def data_frame_to_file(
    data: pd.DataFrame,
    output_dir: str | Path = "./output/",
    outfile: str = "summary_counts.csv",
    sep: str = "\t",
    **kwargs: Any,
) -> None:
    """
    Write a Pandas DataFrame to disk.

    Parameters
    ----------
    data : pd.DataFrame
        Pandas DataFrame to write to disk.
    output_dir : str | Path
        Location to write the output to, default is ``./output``.
    outfile : str
        Filename to write data to.
    sep : str
        Separator to use in output file.
    **kwargs
        Keyword arguments passed on to ``pandas.DataFrame.to_csv()``.
    """
    outdir_file = Path(output_dir) / outfile
    # Ensure the target directory exists so ``to_csv()`` does not fail on a fresh run.
    outdir_file.parent.mkdir(parents=True, exist_ok=True)
    data.to_csv(outdir_file, sep=sep, **kwargs)
    logger.debug(f"File written to : {outdir_file}")
29 changes: 21 additions & 8 deletions isoslam/logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,24 @@

from loguru import logger

logger.remove()
logger.add(sys.stderr)
logger.add(
sys.stderr,
colorize=True,
format="{time:HH:mm:ss} | <level>{level}</level> |<magenta>{file}</magenta>:<magenta>{module}</magenta>:"
"<magenta>{function}</magenta>:<magenta>{line}</magenta> | <level>{message}</level>",
)

def setup(level: str = "INFO") -> None:
    """
    Loguru setup with the required logging level and format.

    Parameters
    ----------
    level : str
        Log level, default is "INFO", other options "WARNING", "DEBUG" etc.
    """
    # Remove the default handler (and any previously configured sinks) so repeated calls do not
    # accumulate handlers, then install a single formatted sink at the requested level. The
    # previous version also added a bare ``logger.add(sys.stderr)`` sink, which duplicated every
    # message and ignored ``level``.
    logger.remove()
    logger.add(
        sys.stderr,
        colorize=True,
        level=level.upper(),
        format="<green>{time:HH:mm:ss}</green> "
        "| <level>{level}</level> | "
        "<magenta>{file}</magenta>:<magenta>{module}</magenta>:<magenta>{function}</magenta>:<magenta>{line}</magenta>"
        " | <level>{message}</level>",
    )
77 changes: 76 additions & 1 deletion isoslam/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
from pathlib import Path
from typing import Any

from isoslam import __version__, io
from loguru import logger

from isoslam import __version__, io, logging, summary


def create_parser() -> arg.ArgumentParser:
Expand Down Expand Up @@ -37,6 +39,14 @@ def create_parser() -> arg.ArgumentParser:
required=False,
help="Path to a YAML configuration file.",
)
parser.add_argument(
"-b",
"--base-dir",
dest="base_dir",
type=Path,
required=False,
help="Base directory to run isoslam on.",
)
parser.add_argument(
"-o",
"--output-dir",
Expand Down Expand Up @@ -114,6 +124,38 @@ def create_parser() -> arg.ArgumentParser:
)
create_config_parser.set_defaults(func=io.create_config)

# Summarise counts sub-parser
summary_counts_parser = subparsers.add_parser(
"summary-counts",
description="Summarise the counts.",
help="Summarise the counts.",
)
summary_counts_parser.add_argument(
"--file-pattern",
dest="file_pattern",
type=str,
required=False,
default="*_summarized.tsv",
help="Regular expression for summarized files to process.",
)
summary_counts_parser.add_argument(
"--outfile",
dest="outfile",
type=Path,
required=False,
default="summary_counts.tsv",
help="Output filename to save results to, will be nested under 'output_dir'.",
)
summary_counts_parser.add_argument(
"--separator",
dest="sep",
type=str,
required=False,
default="\t",
help="Field separator to use in output file, default is '\t' but other values (e.g. ',' are allowed).",
)
summary_counts_parser.set_defaults(func=summarise_counts)

# Additional parsers for future functionality
# summarize_counts_parser = subparsers.add_parser(
# "summarize",
Expand Down Expand Up @@ -148,6 +190,39 @@ def process(args: arg.Namespace | None) -> None: # pylint: disable=unused-argum
return


def summarise_counts(args: arg.Namespace | None) -> None:
    """
    Take a set of output files and summarise the number of conversions.

    Counts are made within file, chromosome, transcript, start, end, assignment and whether there is one or more
    conversion observed.

    Parameters
    ----------
    args : arg.Namespace | None
        Arguments function was invoked with.

    Returns
    -------
    None
        Function does not return anything.
    """
    # Merge the configuration file (default or user supplied) with any command line flags.
    config = io.load_and_update_config(args)
    logger.remove()
    cli_level = vars(args)["log_level"]
    logging.setup(level=cli_level if cli_level is not None else config["log_level"])
    counts_options = config["summary_counts"]
    # ``output`` holds the keyword arguments for writing results; remove it so the remaining
    # options can be passed straight through to ``summary.summary_counts()``.
    write_options = counts_options.pop("output")
    write_options["output_dir"] = config["output_dir"]
    counts = summary.summary_counts(**counts_options)
    counts.sort_values(by=["Chr", "Transcript_id", "Start"], inplace=True)
    io.data_frame_to_file(counts, **write_options)
    logger.info(f"Summary counts file written to : {write_options['output_dir']}/{write_options['outfile']}")


def entry_point(manually_provided_args: list[Any] | None = None, testing: bool = False) -> None | arg.Namespace:
"""
Entry point for all IsoSLAM programs.
Expand Down
61 changes: 61 additions & 0 deletions isoslam/summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""Functions for summarising output."""

import pandas as pd

from isoslam import io


def append_files(pattern: str = "**/*.tsv", separator: str = "\t") -> pd.DataFrame:
    """
    Load every file matching ``pattern`` and concatenate them into a single Pandas DataFrame.

    Parameters
    ----------
    pattern : str
        File name pattern to search for.
    separator : str
        Separator/delimiter used in files.

    Returns
    -------
    pd.DataFrame
        A single DataFrame of all files found, each row tagged with its source file stem in ``filename``.
    """
    loaded = io.load_files(pattern, separator)
    # Record the originating file in a ``filename`` column before stacking the frames.
    return pd.concat(frame.assign(filename=name) for name, frame in loaded.items())


def summary_counts(
    file_pattern: str = "**/*.tsv",
    separator: str = "\t",
    groupby: list[str] | None = None,
    dropna: bool = True,
) -> pd.DataFrame:
    """
    Count the number of assigned read pairs.

    Groups the data by the supplied variables, plus a derived ``one_or_more_conversion`` flag
    (``Conversions >= 1``), and counts the rows within each group.

    Parameters
    ----------
    file_pattern : str
        File name pattern to search for.
    separator : str
        Separator/delimiter used in files.
    groupby : list[str]
        List of variables to group the counts by.
    dropna : bool
        Whether to drop rows with ``NA`` values.

    Returns
    -------
    pd.DataFrame
        Counts of rows per unique combination of the grouping variables.
    """
    if groupby is None:
        groupby = ["Transcript_id", "Chr", "Strand", "Start", "End", "Assignment", "Conversions", "filename"]
    _data = append_files(file_pattern, separator)
    _data["one_or_more_conversion"] = _data["Conversions"] >= 1
    # Build a new list rather than ``groupby.append(...)`` so a caller-supplied list is not
    # mutated (repeated calls would otherwise accumulate the derived column in it).
    subset = [*groupby, "one_or_more_conversion"]
    return _data.value_counts(subset=subset, dropna=dropna).reset_index()
Loading

0 comments on commit f0520ad

Please sign in to comment.