Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature(summary_counts): Translates R code to Python #121

Merged
merged 2 commits into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion isoslam/default_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,20 @@ bam_file: data/bam/.bam
gtf_file: data/gtf/
bed_file: data/bed/
vcf_file: data/vcf/
summary: false
summary_counts:
file_pattern: "**/*.tsv"
separator: "\t"
groupby:
- Transcript_id
- Chr
- Strand
- Start
- End
- Assignment
- Conversions
- filename
output:
outfile: summary_counts.tsv
sep: "\t"
index: false
plot: false
107 changes: 99 additions & 8 deletions isoslam/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,19 @@

import argparse
import gzip
from collections.abc import Callable
from collections.abc import Callable, Generator
from datetime import datetime
from importlib import resources
from pathlib import Path
from typing import Any, TextIO

import pandas as pd
import pysam

# from cgatcore import iotools
from loguru import logger
from ruamel.yaml import YAML, YAMLError

from isoslam import utils

CONFIG_DOCUMENTATION_REFERENCE = """# For more information on configuration and how to use it see:
# https://sudlab.github.io/IsoSLAM\n"""

Expand Down Expand Up @@ -49,7 +50,7 @@ def _str_to_path(path: str | Path) -> Path:
return Path().cwd() if path == "./" else Path(path).expanduser()


def _path_to_str(config: dict) -> dict: # type: ignore[type-arg]
def _path_to_str(config: dict[str, Any]) -> dict[str, Any]:
"""
Recursively traverse a dictionary and convert any Path() objects to strings for writing to YAML.

Expand All @@ -71,7 +72,7 @@ def _path_to_str(config: dict) -> dict: # type: ignore[type-arg]
return config


def read_yaml(filename: str | Path) -> dict | None: # type: ignore[type-arg]
def read_yaml(filename: str | Path | None = None) -> dict[str, Any] | None:
"""
Read a YAML file.

Expand All @@ -85,7 +86,9 @@ def read_yaml(filename: str | Path) -> dict | None: # type: ignore[type-arg]
Dict
Dictionary of the file.
"""
with Path(filename).open(encoding="utf-8") as f:
if filename is None:
filename = resources.files(__package__) / "default_config.yaml" # type: ignore[assignment]
with Path(filename).open(encoding="utf-8") as f: # type: ignore[arg-type]
try:
yaml_file = YAML(typ="safe")
return yaml_file.load(f) # type: ignore[no-any-return]
Expand Down Expand Up @@ -132,6 +135,29 @@ def write_yaml(
logger.error(exception)


def load_and_update_config(args: argparse.Namespace | None) -> dict[str, Any]:
    """
    Load a configuration file to dictionary and update entries with user supplied arguments.

    If ``args`` does not contain any value for ``args.config_file`` the default configuration
    (``isoslam/default_config.yaml``) is loaded, otherwise the user specified configuration is loaded.

    Once the configuration is loaded any user specified options update the dictionary.

    Parameters
    ----------
    args : argparse.Namespace | None
        Arguments supplied by user. If ``None`` the default configuration is returned unmodified.

    Returns
    -------
    dict[str, Any]
        Dictionary of configuration options updated with user specified options.
    """
    # Guard against ``args is None`` (permitted by the signature); ``vars(None)`` raises TypeError.
    if args is None:
        return read_yaml()  # type: ignore[return-value]
    options = vars(args)
    # ``read_yaml(None)`` falls back to the packaged default configuration.
    config = read_yaml(options.get("config_file"))
    return utils.update_config(config, options)  # type: ignore[arg-type]


def create_config(args: argparse.Namespace | None = None) -> None:
"""
Write the default configuration file to disk.
Expand Down Expand Up @@ -216,11 +242,11 @@ def _get_loader(file_ext: str = "bam") -> Callable: # type: ignore[type-arg]
"""
if file_ext == ".bam":
return _load_bam
if file_ext == ".bed" or file_ext == ".bed.gz":
if file_ext in (".bed", ".bed.gz"):
return _load_bed
if file_ext == ".gtf":
return _load_gtf
if file_ext == ".vcf" or file_ext == ".vcf.gz":
if file_ext in (".vcf", ".vcf.gz"):
return _load_vcf
raise ValueError(file_ext)

Expand Down Expand Up @@ -317,3 +343,68 @@ def _load_vcf(vcf_file: str | Path) -> pysam.libcbcf.VariantFile:
return pysam.VariantFile(vcf_file)
except FileNotFoundError as e:
raise e


def _find_files(pattern: str = "**/*.tsv") -> Generator: # type: ignore[type-arg]
"""
Find files that match the given pattern.

Parameters
----------
pattern : str
Pattern (regular expression) of files to search for.

Returns
-------
Generator[_P, None, None]
A generator of files found that match the given pattern.
"""
pwd = Path.cwd()
return pwd.rglob(pattern)


def load_files(pattern: str = "**/*.tsv", sep: str = "\t") -> dict[str, pd.DataFrame]:
    """
    Read a set of files into a dictionary of Pandas DataFrames.

    Parameters
    ----------
    pattern : str
        File name (glob) pattern to search for.
    sep : str
        Separator/delimiter used in files.

    Returns
    -------
    dict[str, pd.DataFrame]
        Dictionary mapping each found file's stem to a Pandas DataFrame of its contents.
    """
    # Key on the file stem so the source of each DataFrame is retained for later use.
    return {found.stem: pd.read_csv(found, sep=sep) for found in _find_files(pattern)}


def data_frame_to_file(
data: pd.DataFrame,
output_dir: str | Path = "./output/",
outfile: str = "summary_counts.csv",
sep: str = "\t",
**kwargs: dict[Any, Any],
) -> None:
"""
Write a Pandas DataFrame to disk.

Parameters
----------
data : pd.DataFrame
Pandas DataFrame to write to disk.
output_dir : str | Path
Location to write the output to, default is ''./output''.capitalize.
outfile : str
Filename to write data to.
sep : str
Separator to use in output file.
**kwargs
Dictionary of keyword arguments to pass to ''pandas.DataFrame.to_csv()''.
"""
outdir_file = Path(output_dir) / f"{outfile}"
data.to_csv(outdir_file, sep=sep, **kwargs)
logger.debug(f"File written to : {outdir_file}")
29 changes: 21 additions & 8 deletions isoslam/logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,24 @@

from loguru import logger

logger.remove()
logger.add(sys.stderr)
logger.add(
sys.stderr,
colorize=True,
format="{time:HH:mm:ss} | <level>{level}</level> |<magenta>{file}</magenta>:<magenta>{module}</magenta>:"
"<magenta>{function}</magenta>:<magenta>{line}</magenta> | <level>{message}</level>",
)

def setup(level: str = "INFO") -> None:
    """
    Loguru setup with the required logging level and format.

    Parameters
    ----------
    level : str
        Log level, default is "INFO", other options "WARNING", "DEBUG" etc.
    """
    logger.remove()
    # Register a single configured sink. Adding a bare ``logger.add(sys.stderr)`` as well
    # (as the previous module-level code did) would emit every message twice, once per sink.
    logger.add(
        sys.stderr,
        colorize=True,
        level=level.upper(),
        format="<green>{time:HH:mm:ss}</green> "
        "| <level>{level}</level> | "
        "<magenta>{file}</magenta>:<magenta>{module}</magenta>:<magenta>{function}</magenta>:<magenta>{line}</magenta>"
        " | <level>{message}</level>",
    )
77 changes: 76 additions & 1 deletion isoslam/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
from pathlib import Path
from typing import Any

from isoslam import __version__, io
from loguru import logger

from isoslam import __version__, io, logging, summary


def create_parser() -> arg.ArgumentParser:
Expand Down Expand Up @@ -37,6 +39,14 @@ def create_parser() -> arg.ArgumentParser:
required=False,
help="Path to a YAML configuration file.",
)
parser.add_argument(
"-b",
"--base-dir",
dest="base_dir",
type=Path,
required=False,
help="Base directory to run isoslam on.",
)
parser.add_argument(
"-o",
"--output-dir",
Expand Down Expand Up @@ -114,6 +124,38 @@ def create_parser() -> arg.ArgumentParser:
)
create_config_parser.set_defaults(func=io.create_config)

# Summarise counts sub-parser
summary_counts_parser = subparsers.add_parser(
"summary-counts",
description="Summarise the counts.",
help="Summarise the counts.",
)
summary_counts_parser.add_argument(
"--file-pattern",
dest="file_pattern",
type=str,
required=False,
default="*_summarized.tsv",
help="Regular expression for summarized files to process.",
)
summary_counts_parser.add_argument(
"--outfile",
dest="outfile",
type=Path,
required=False,
default="summary_counts.tsv",
help="Output filename to save results to, will be nested under 'output_dir'.",
)
summary_counts_parser.add_argument(
"--separator",
dest="sep",
type=str,
required=False,
default="\t",
help="Field separator to use in output file, default is '\t' but other values (e.g. ',' are allowed).",
)
summary_counts_parser.set_defaults(func=summarise_counts)

# Additional parsers for future functionality
# summarize_counts_parser = subparsers.add_parser(
# "summarize",
Expand Down Expand Up @@ -148,6 +190,39 @@ def process(args: arg.Namespace | None) -> None: # pylint: disable=unused-argum
return


def summarise_counts(args: arg.Namespace | None) -> None:
    """
    Take a set of output files and summarise the number of conversions.

    Counts are made within file, chromosome, transcript, start, end, assignment and whether there
    is one or more conversion observed.

    Parameters
    ----------
    args : arg.Namespace | None
        Arguments function was invoked with.

    Returns
    -------
    None
        Function does not return anything.
    """
    # Merge the default/user configuration file with any command line flags.
    config = io.load_and_update_config(args)
    logger.remove()
    cli_options = vars(args)
    # Command line log level takes precedence over the configuration file.
    log_level = cli_options["log_level"] if cli_options["log_level"] is not None else config["log_level"]
    logging.setup(level=log_level)
    counts_config = config["summary_counts"]
    # Separate the output settings from the options passed to the summariser.
    output_config = counts_config.pop("output")
    output_config["output_dir"] = config["output_dir"]
    counts = summary.summary_counts(**counts_config)
    counts = counts.sort_values(by=["Chr", "Transcript_id", "Start"])
    io.data_frame_to_file(counts, **output_config)
    logger.info(f"Summary counts file written to : {output_config['output_dir']}/{output_config['outfile']}")


def entry_point(manually_provided_args: list[Any] | None = None, testing: bool = False) -> None | arg.Namespace:
"""
Entry point for all IsoSLAM programs.
Expand Down
61 changes: 61 additions & 0 deletions isoslam/summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""Functions for summarising output."""

import pandas as pd

from isoslam import io


def append_files(pattern: str = "**/*.tsv", separator: str = "\t") -> pd.DataFrame:
    """
    Append a set of files into a single Pandas DataFrame.

    Parameters
    ----------
    pattern : str
        File name (glob) pattern to search for.
    separator : str
        Separator/delimiter used in files.

    Returns
    -------
    pd.DataFrame
        A single Pandas DataFrame of all files found, with a ``filename`` column recording the
        source file (stem) of each row.

    Raises
    ------
    ValueError
        If no files matching ``pattern`` are found.
    """
    _data = io.load_files(pattern, separator)
    if not _data:
        # pd.concat([]) raises a cryptic "No objects to concatenate"; fail early with a clear message.
        raise ValueError(f"No files found matching pattern : {pattern}")
    # Record the originating file of each row before concatenating.
    all_data = [data.assign(filename=key) for key, data in _data.items()]
    return pd.concat(all_data)


def summary_counts(
    file_pattern: str = "**/*.tsv",
    separator: str = "\t",
    groupby: list[str] | None = None,
    dropna: bool = True,
) -> pd.DataFrame:
    """
    Count the number of assigned read pairs.

    Groups the data by the supplied variables (plus ``one_or_more_conversion``) and counts the
    number of rows in each group.

    Parameters
    ----------
    file_pattern : str
        File name (glob) pattern to search for.
    separator : str
        Separator/delimiter used in files.
    groupby : list[str] | None
        List of variables to group the counts by; a default set is used when ``None``.
    dropna : bool
        Whether to drop rows with ``NA`` values.

    Returns
    -------
    pd.DataFrame
        Counts of read pairs grouped by the requested variables.
    """
    if groupby is None:
        groupby = ["Transcript_id", "Chr", "Strand", "Start", "End", "Assignment", "Conversions", "filename"]
    _data = append_files(file_pattern, separator)
    _data["one_or_more_conversion"] = _data["Conversions"] >= 1
    # Copy rather than append in place so a caller-supplied ``groupby`` list is not mutated.
    group_vars = [*groupby, "one_or_more_conversion"]
    return _data.value_counts(subset=group_vars, dropna=dropna).reset_index()
Loading
Loading