Skip to content

Commit

Permalink
Merge branch 'main' into refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
katerinakazantseva committed Sep 24, 2024
2 parents c9012f2 + f8d8ec7 commit c1b4095
Show file tree
Hide file tree
Showing 11 changed files with 264 additions and 133 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@
__pycache__/
strainy/.DS_Store
.DS_Store
build/
dist/
strainy.egg-info/
67 changes: 67 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
FROM ubuntu:22.04
MAINTAINER Mikhail Kolmogorov, [email protected]

# update and install dependencies
RUN apt-get update && \
DEBIAN_FRONTEND="noninteractive" apt-get -y install tzdata && \
apt-get -y install cmake git make gcc g++ autoconf bzip2 lzma-dev zlib1g-dev tabix libbz2-dev && \
apt-get -y install libcurl4-openssl-dev libpthread-stubs0-dev liblzma-dev libhdf5-dev && \
apt-get -y install python3-pip python3-virtualenv virtualenv python3-dev && \
apt-get -y install wget libz-dev libncurses5-dev libgsl-dev && \
apt-get -y install graphviz libgraphviz-dev pkg-config && \
apt-get clean && \
apt-get purge && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

RUN ln -s /usr/bin/python3 /usr/bin/python

RUN python3 --version && \
python3 -m pip install --upgrade pip

ARG MM_VER=2.28
RUN wget https://github.com/lh3/minimap2/releases/download/v$MM_VER/minimap2-$MM_VER.tar.bz2 && \
tar xvf minimap2-$MM_VER.tar.bz2 && \
rm minimap2-$MM_VER.tar.bz2 && \
cd minimap2-$MM_VER && \
make && \
cp minimap2 /usr/bin/

### samtools
# 1.15
WORKDIR /opt/samtools
RUN wget https://github.com/samtools/samtools/releases/download/1.15/samtools-1.15.tar.bz2 && \
tar xvf samtools-1.15.tar.bz2 && \
rm -r /opt/samtools/samtools-1.15.tar.bz2 && \
cd samtools-1.15/ && \
autoheader && \
autoconf -Wno-header && \
./configure && \
make && \
cp samtools /usr/bin/samtools

### bcftools
WORKDIR /opt/bcftools
RUN wget https://github.com/samtools/bcftools/releases/download/1.15/bcftools-1.15.tar.bz2 && \
tar xvf bcftools-1.15.tar.bz2 && \
cd bcftools-1.15/ && \
autoheader && \
autoconf -Wno-header && \
./configure --enable-libgsl && \
make && \
cp bcftools /usr/bin/bcftools

#build and install Flye
WORKDIR /opt/flye
RUN git clone https://github.com/fenderglass/Flye && \
cd Flye && \
python setup.py install

#Install strainy & dependencies
WORKDIR /opt/strainy
RUN git clone -b docker https://github.com/katerinakazantseva/strainy && \
cd strainy && \
python3 -m pip install -r requirements.txt && \
python3 setup.py install


ENV PYTHONUNBUFFERED "1"
12 changes: 11 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ Third, Strainy enables assembly-based analysis, which is useful in the absence o
## Contents
* [Installation](#installation)
* [Quick Usage](#quick-usage)
* [Docker container](#docker-container)
* [Strainy input](#strainy-input)
* [Preparing de novo metagenomic assemblies](#preparing-de-novo-metagenomic-assemblies)
* [Parameters description](#parameters-description)
Expand Down Expand Up @@ -48,7 +49,7 @@ Find details [**here**](https://github.com/conda/conda/issues/11216)

## Quick usage

After successful installation, you should be able to run:
After repository cloning/installation, you should be able to run:

```
conda activate strainy
Expand All @@ -65,6 +66,15 @@ See below for the details on how to interpret strainy output.

Also see a more detailed [tutorial](#strainy-tutorial) for the example of how to use Strainy.

## Docker container

Alternatively, you can use a Docker container (using the example provided in `test_set` Strainy directory):

```
ST_DIR=`pwd`
docker run -v $ST_DIR:$ST_DIR -u `id -u`:`id -g` mkolmogo/strainy:1.0 strainy --gfa $ST_DIR/test_set/toy.gfa --fastq $ST_DIR/test_set/toy.fastq.gz -o $ST_DIR/out_strainy --threads 8 --mode hifi
```

## Strainy input

Strainy supports PacBio HiFi, Nanopore R9 (Guppy5+) and R10 sequencing.
Expand Down
9 changes: 5 additions & 4 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
networkx>=2.6,<3.4
numpy>=1.24,<1.27
pandas>=1.3,<3
pysam>=0.20,<0.23
scipy>=1.8,<1.13
biopython
gfapy
karateclub
matplotlib
networkx
numpy
pandas
pygraphviz
pysam
edlib
31 changes: 31 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import os
import sys
import subprocess
import shutil

try:
import setuptools
except ImportError:
sys.exit("setuptools package not found. "
"Please use 'pip install setuptools' first")

from setuptools import setup

# Make sure we're running from the setup.py directory.
script_dir = os.path.dirname(os.path.realpath(__file__))
if script_dir != os.getcwd():
os.chdir(script_dir)

from strainy.__version__ import __version__


setup(name='strainy',
version=__version__,
description='Metagenomic strain phasing and assmebly using long reads',
url='https://github.com/katerinakazantseva/strainy',
author='Ekaterina Kazantseva & Ataberk Donmez',
author_email = '[email protected]',
license='CC BY-NC-SA 4.0',
packages=['strainy', 'strainy/clustering', 'strainy/gfa_operations', 'strainy/reports', 'strainy/simplification'],
entry_points={'console_scripts': ['strainy = strainy.main:main']},
)
136 changes: 10 additions & 126 deletions strainy.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,12 @@
#!/usr/bin/env python
#!/usr/bin/env python3

"""
This script sets up environment paths
and invokes Strainy without installation.
"""

import sys
import os
import platform
import re
import subprocess
import argparse
import logging
import shutil
import cProfile
import gfapy
import sys


#Setting executable paths and additional import dirs
Expand All @@ -23,126 +20,13 @@
os.environ["PATH"] = bin_absolute + os.pathsep + os.environ["PATH"]
###


from strainy.phase import phase_main
from strainy.transform import transform_main
from strainy.params import StRainyArgs, init_global_args_storage
from strainy.logging import set_thread_logging
from strainy.preprocessing import preprocess_cmd_args


logger = logging.getLogger()

def get_processor_name():
if platform.system() == "Windows":
return platform.processor()
elif platform.system() == "Darwin":
os.environ['PATH'] = os.environ['PATH'] + os.pathsep + '/usr/sbin'
command ="sysctl -n machdep.cpu.brand_string"
return subprocess.check_output(command).strip()
elif platform.system() == "Linux":
command = "cat /proc/cpuinfo"
all_info = subprocess.check_output(command, shell=True).decode().strip()
for line in all_info.split("\n"):
if "model name" in line:
return re.sub( ".*model name.*:", "", line,1)
return ""


def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
requiredNamed = parser.add_argument_group('Required named arguments')
requiredNamed.add_argument("-o", "--output", help="output directory",required=True)
requiredNamed.add_argument("-g", "--gfa", help="input gfa to uncollapse",required=True)
requiredNamed.add_argument("-m", "--mode", help="type of reads", choices=["hifi", "nano"],
required=True)
requiredNamed.add_argument("-q", "--fastq",help="fastq file with reads to phase / assemble",
required=True)
parser.add_argument("-s", "--stage", help="stage to run: either phase, transform or e2e (phase + transform)",
choices=["phase", "transform", "e2e"], default="e2e")
parser.add_argument("--snp", help="path to vcf file with SNP calls to use", default=None)
parser.add_argument("-t", "--threads", help="number of threads to use", type=int, default=4)
parser.add_argument("-f", "--fasta", required=False, help=argparse.SUPPRESS)
parser.add_argument("-b", "--bam", help="path to indexed alignment in bam format",required=False)
parser.add_argument("--link-simplify", required=False, action="store_true", default=False,
dest="link_simplify",help="Enable agressive graph simplification")
parser.add_argument("--debug", required=False, action="store_true", default=False,
help="Generate extra output for debugging")
parser.add_argument("--unitig-split-length",
help="The length (in kb) which the unitigs that are longer will be split, set 0 to disable",
required=False,
type=int,
default=50)
parser.add_argument("--only-split",help="Do not run stRainy, only split long gfa unitigs", default='False',
required=False)
parser.add_argument("-d","--cluster-divergence",help="cluster divergence", type=float, default=0, required=False)
parser.add_argument("-a","--allele-frequency",help="Set allele frequency for internal caller only (pileup)",
type=float, default=0.2, required=False)
parser.add_argument("--min-unitig-length",
help="The length (in kb) which the unitigs that are shorter will not be phased",
required=False,
type=float,
default=1)
parser.add_argument("--min-unitig-coverage",
help="The minimum coverage threshold for phasing unitigs, unitigs with lower coverage will not be phased",
required=False,
type=int,
default=20)
parser.add_argument("--max-unitig-coverage",
help="The maximum coverage threshold for phasing unitigs, unitigs with higher coverage will not be phased",
required=False,
type=int,
default=500)

args = parser.parse_args()
args.strainy_root = strainy_root
#setting up global arguments storage
input_graph = gfapy.Gfa.from_file(args.gfa)
args.graph_edges = input_graph.segment_names
args.edges_to_phase = []
init_global_args_storage(args)
BIN_TOOLS = ["samtools", "bcftools", "minimap2", StRainyArgs().flye]
for tool in BIN_TOOLS:
if not shutil.which(tool):
print("{} not installed".format(tool), file=sys.stderr)
return 1

os.makedirs(StRainyArgs().output, exist_ok=True)
os.makedirs(StRainyArgs().output_intermediate, exist_ok=True)
if os.path.isdir(StRainyArgs().log_phase):
shutil.rmtree(StRainyArgs().log_phase)
os.makedirs(StRainyArgs().log_phase, exist_ok=True)
set_thread_logging(StRainyArgs().log_phase, "phase_root", None)
#Strainy entry point
import strainy.main
sys.exit(strainy.main.main())

preprocess_cmd_args(args)

if StRainyArgs().debug:
print(f'Using processor(s): {get_processor_name()}')

# set one more time for the modified args
init_global_args_storage(args)
if args.only_split=='True':
sys.exit()
elif args.stage == "phase":
sys.exit(phase_main(args))
elif args.stage == "transform":
sys.exit(transform_main(args))
elif args.stage == "e2e":
pr_phase = cProfile.Profile()
pr_phase.enable()
phase_main(args)
logger.info("Phase stage completed, starting transform now...")
pr_phase.disable()
pr_phase.dump_stats(f'{StRainyArgs().output_intermediate}/phase.prof')

pr_transform = cProfile.Profile()
pr_transform.enable()
transform_main(args)
logger.info("Transform stage completed, exiting...")
pr_transform.disable()
pr_transform.dump_stats(f'{StRainyArgs().output_intermediate}/transform.prof')


if __name__ == "__main__":
main()

1 change: 1 addition & 0 deletions strainy/__version__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__version__ = "1.1"
Empty file.
Loading

0 comments on commit c1b4095

Please sign in to comment.