Skip to content

Commit

Permalink
feature(summary_counts): Translates R code to Python
Browse files Browse the repository at this point in the history
Closes #110

- Updates `isoslam/default_config.yaml` and `processing` argparse to be consistent.
- In turn the `utils.update_config()` function has been corrected with additional tests to correctly update nested
  dictionaries from the configuration.
- Logging is set up and `loguru` used with formatted output and level set from configuration or command line options.
- Translates the R code that counted and summarised the transcripts to Python code (using Pandas) along with tests.
- Necessary updates to `pyproject.toml`.

The output from IsoSLAM can now be summarised using...

```bash
isoslam --output-dir output summary-counts --file-pattern "tests/**/*.tsv"
```

Rather than calling `Rscript` to do the work.

This will serve as a template for translating the other R code.
  • Loading branch information
ns-rse committed Dec 13, 2024
1 parent 5eeee2a commit f0520ad
Show file tree
Hide file tree
Showing 14 changed files with 2,169 additions and 45 deletions.
17 changes: 16 additions & 1 deletion isoslam/default_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,20 @@ bam_file: data/bam/.bam
gtf_file: data/gtf/
bed_file: data/bed/
vcf_file: data/vcf/
summary: false
summary_counts:
file_pattern: "**/*.tsv"
separator: "\t"
groupby:
- Transcript_id
- Chr
- Strand
- Start
- End
- Assignment
- Conversions
- filename
output:
outfile: summary_counts.tsv
sep: "\t"
index: false
plot: false
99 changes: 94 additions & 5 deletions isoslam/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,19 @@

import argparse
import gzip
from collections.abc import Callable
from collections.abc import Callable, Generator
from datetime import datetime
from importlib import resources
from pathlib import Path
from typing import Any, TextIO

import pandas as pd
import pysam

# from cgatcore import iotools
from loguru import logger
from ruamel.yaml import YAML, YAMLError

from isoslam import utils

CONFIG_DOCUMENTATION_REFERENCE = """# For more information on configuration and how to use it see:
# https://sudlab.github.io/IsoSLAM\n"""

Expand Down Expand Up @@ -134,6 +135,29 @@ def write_yaml(
logger.error(exception)


def load_and_update_config(args: argparse.Namespace | None) -> dict[str, Any]:
    """
    Load a configuration file to dictionary and update entries with user supplied arguments.

    If ``args`` does not contain any value for ``args.config_file`` the default configuration
    (``isoslam/default_config.yaml``) is loaded, otherwise the user specified configuration is loaded.
    Once the configuration is loaded any user specified options update the dictionary.

    Parameters
    ----------
    args : argparse.Namespace
        Arguments supplied by user.

    Returns
    -------
    dict[str, Any]
        Dictionary of configuration options updated with user specified options.
    """
    # NOTE(review): the annotation allows ``args`` to be None but ``vars(None)`` raises TypeError —
    # confirm callers always pass a Namespace before tightening the signature.
    config = read_yaml() if vars(args)["config_file"] is None else read_yaml(vars(args)["config_file"])
    return utils.update_config(config, vars(args))  # type: ignore[arg-type]


def create_config(args: argparse.Namespace | None = None) -> None:
"""
Write the default configuration file to disk.
Expand Down Expand Up @@ -218,11 +242,11 @@ def _get_loader(file_ext: str = "bam") -> Callable: # type: ignore[type-arg]
"""
if file_ext == ".bam":
return _load_bam
if file_ext == ".bed" or file_ext == ".bed.gz":
if file_ext in (".bed", ".bed.gz"):
return _load_bed
if file_ext == ".gtf":
return _load_gtf
if file_ext == ".vcf" or file_ext == ".vcf.gz":
if file_ext in (".vcf", ".vcf.gz"):
return _load_vcf
raise ValueError(file_ext)

Expand Down Expand Up @@ -319,3 +343,68 @@ def _load_vcf(vcf_file: str | Path) -> pysam.libcbcf.VariantFile:
return pysam.VariantFile(vcf_file)
except FileNotFoundError as e:
raise e


def _find_files(pattern: str = "**/*.tsv") -> Generator: # type: ignore[type-arg]
"""
Find files that match the given pattern.
Parameters
----------
pattern : str
Pattern (regular expression) of files to search for.
Returns
-------
Generator[_P, None, None]
A generator of files found that match the given pattern.
"""
pwd = Path.cwd()
return pwd.rglob(pattern)


def load_files(pattern: str = "**/*.tsv", sep: str = "\t") -> dict[str, pd.DataFrame]:
    """
    Read the set of files matching ``pattern`` into a dictionary of Pandas DataFrames.

    Parameters
    ----------
    pattern : str
        Glob pattern of file names to search for.
    sep : str
        Separator/delimiter used in files.

    Returns
    -------
    dict[str, pd.DataFrame]
        Dictionary mapping each file's stem (name without suffix) to the DataFrame read from it.
    """
    # NOTE: files in different directories that share a stem silently overwrite one another here.
    return {x.stem: pd.read_csv(x, sep=sep) for x in _find_files(pattern)}


def data_frame_to_file(
    data: pd.DataFrame,
    output_dir: str | Path = "./output/",
    outfile: str = "summary_counts.csv",
    sep: str = "\t",
    **kwargs: Any,
) -> None:
    """
    Write a Pandas DataFrame to disk.

    Parameters
    ----------
    data : pd.DataFrame
        Pandas DataFrame to write to disk.
    output_dir : str | Path
        Location to write the output to, default is ``./output``.
    outfile : str
        Filename to write data to.
    sep : str
        Separator to use in output file.
    **kwargs
        Keyword arguments passed on to ``pandas.DataFrame.to_csv()``.
    """
    outdir_file = Path(output_dir) / outfile
    # Ensure the target directory exists so ``to_csv()`` does not fail on a fresh run.
    outdir_file.parent.mkdir(parents=True, exist_ok=True)
    data.to_csv(outdir_file, sep=sep, **kwargs)
    logger.debug(f"File written to : {outdir_file}")
29 changes: 21 additions & 8 deletions isoslam/logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,24 @@

from loguru import logger

logger.remove()
logger.add(sys.stderr)
logger.add(
sys.stderr,
colorize=True,
format="{time:HH:mm:ss} | <level>{level}</level> |<magenta>{file}</magenta>:<magenta>{module}</magenta>:"
"<magenta>{function}</magenta>:<magenta>{line}</magenta> | <level>{message}</level>",
)

def setup(level: str = "INFO") -> None:
    """
    Loguru setup with the required logging level and format.

    Parameters
    ----------
    level : str
        Log level, default is "INFO", other options "WARNING", "DEBUG" etc.
    """
    # Remove the default handler (and any previously configured sinks) so repeated calls do not
    # accumulate handlers, then install a single formatted sink at the requested level. The
    # previous version also added a bare ``logger.add(sys.stderr)`` sink, which duplicated every
    # message and ignored ``level``.
    logger.remove()
    logger.add(
        sys.stderr,
        colorize=True,
        level=level.upper(),
        format="<green>{time:HH:mm:ss}</green> "
        "| <level>{level}</level> | "
        "<magenta>{file}</magenta>:<magenta>{module}</magenta>:<magenta>{function}</magenta>:<magenta>{line}</magenta>"
        " | <level>{message}</level>",
    )
77 changes: 76 additions & 1 deletion isoslam/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
from pathlib import Path
from typing import Any

from isoslam import __version__, io
from loguru import logger

from isoslam import __version__, io, logging, summary


def create_parser() -> arg.ArgumentParser:
Expand Down Expand Up @@ -37,6 +39,14 @@ def create_parser() -> arg.ArgumentParser:
required=False,
help="Path to a YAML configuration file.",
)
parser.add_argument(
"-b",
"--base-dir",
dest="base_dir",
type=Path,
required=False,
help="Base directory to run isoslam on.",
)
parser.add_argument(
"-o",
"--output-dir",
Expand Down Expand Up @@ -114,6 +124,38 @@ def create_parser() -> arg.ArgumentParser:
)
create_config_parser.set_defaults(func=io.create_config)

# Summarise counts sub-parser
summary_counts_parser = subparsers.add_parser(
"summary-counts",
description="Summarise the counts.",
help="Summarise the counts.",
)
summary_counts_parser.add_argument(
"--file-pattern",
dest="file_pattern",
type=str,
required=False,
default="*_summarized.tsv",
help="Regular expression for summarized files to process.",
)
summary_counts_parser.add_argument(
"--outfile",
dest="outfile",
type=Path,
required=False,
default="summary_counts.tsv",
help="Output filename to save results to, will be nested under 'output_dir'.",
)
summary_counts_parser.add_argument(
"--separator",
dest="sep",
type=str,
required=False,
default="\t",
help="Field separator to use in output file, default is '\t' but other values (e.g. ',' are allowed).",
)
summary_counts_parser.set_defaults(func=summarise_counts)

# Additional parsers for future functionality
# summarize_counts_parser = subparsers.add_parser(
# "summarize",
Expand Down Expand Up @@ -148,6 +190,39 @@ def process(args: arg.Namespace | None) -> None: # pylint: disable=unused-argum
return


def summarise_counts(args: arg.Namespace | None) -> None:
    """
    Take a set of output files and summarise the number of conversions.

    Counts are made within file, chromosome, transcript, start, end, assignment and whether there is one or more
    conversion observed.

    Parameters
    ----------
    args : arg.Namespace | None
        Arguments function was invoked with.

    Returns
    -------
    None
        Function does not return anything.
    """
    # Merge the configuration file (default or user supplied) with any command line flags.
    config = io.load_and_update_config(args)
    logger.remove()
    cli_level = vars(args)["log_level"]
    logging.setup(level=cli_level if cli_level is not None else config["log_level"])
    counts_options = config["summary_counts"]
    # ``output`` holds the keyword arguments for writing results; remove it so the remaining
    # options can be passed straight through to ``summary.summary_counts()``.
    write_options = counts_options.pop("output")
    write_options["output_dir"] = config["output_dir"]
    counts = summary.summary_counts(**counts_options)
    counts.sort_values(by=["Chr", "Transcript_id", "Start"], inplace=True)
    io.data_frame_to_file(counts, **write_options)
    logger.info(f"Summary counts file written to : {write_options['output_dir']}/{write_options['outfile']}")


def entry_point(manually_provided_args: list[Any] | None = None, testing: bool = False) -> None | arg.Namespace:
"""
Entry point for all IsoSLAM programs.
Expand Down
61 changes: 61 additions & 0 deletions isoslam/summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""Functions for summarising output."""

import pandas as pd

from isoslam import io


def append_files(pattern: str = "**/*.tsv", separator: str = "\t") -> pd.DataFrame:
    """
    Load every file matching ``pattern`` and concatenate them into a single Pandas DataFrame.

    Parameters
    ----------
    pattern : str
        File name pattern to search for.
    separator : str
        Separator/delimiter used in files.

    Returns
    -------
    pd.DataFrame
        A single DataFrame of all files found, each row tagged with its source file stem in ``filename``.
    """
    loaded = io.load_files(pattern, separator)
    # Record the originating file in a ``filename`` column before stacking the frames.
    return pd.concat(frame.assign(filename=name) for name, frame in loaded.items())


def summary_counts(
    file_pattern: str = "**/*.tsv",
    separator: str = "\t",
    groupby: list[str] | None = None,
    dropna: bool = True,
) -> pd.DataFrame:
    """
    Count the number of assigned read pairs.

    Groups the data by the supplied variables, plus a derived ``one_or_more_conversion`` flag
    (``Conversions >= 1``), and counts the rows within each group.

    Parameters
    ----------
    file_pattern : str
        File name pattern to search for.
    separator : str
        Separator/delimiter used in files.
    groupby : list[str]
        List of variables to group the counts by.
    dropna : bool
        Whether to drop rows with ``NA`` values.

    Returns
    -------
    pd.DataFrame
        Counts of rows per unique combination of the grouping variables.
    """
    if groupby is None:
        groupby = ["Transcript_id", "Chr", "Strand", "Start", "End", "Assignment", "Conversions", "filename"]
    _data = append_files(file_pattern, separator)
    _data["one_or_more_conversion"] = _data["Conversions"] >= 1
    # Build a new list rather than ``groupby.append(...)`` so a caller-supplied list is not
    # mutated (repeated calls would otherwise accumulate the derived column in it).
    subset = [*groupby, "one_or_more_conversion"]
    return _data.value_counts(subset=subset, dropna=dropna).reset_index()
Loading

0 comments on commit f0520ad

Please sign in to comment.