Update assembly resources #23

Merged · 19 commits · Jul 18, 2024
34 changes: 31 additions & 3 deletions .github/workflows/main.yaml
@@ -10,21 +10,49 @@ on:
branches_ignore: []

jobs:
Dry_Run_and_Lint:
Dry_Run_and_Lint_DNA&RNA:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: docker://snakemake/snakemake:latest
- name: Dry Run with test data
- name: Dry Run
run: |
docker run -v $PWD:/opt2 snakemake/snakemake:latest \
/opt2/metamorph run --input /opt2/.tests/test_cohort.txt \
--output /opt2/output --mode local --dry-run
- name: View the pipeline config file
- name: View the pipeline config file [DNA & RNA modes]
run: |
echo "Generated config file for pipeline...." && cat $PWD/output/config.json
- name: Lint Workflow
continue-on-error: true
run: |
docker run -v $PWD:/opt2 snakemake/snakemake:latest snakemake --lint -s /opt2/output/workflow/Snakefile -d /opt2/output || \
echo 'There may have been a few warnings or errors. Please read through the log to determine if its harmless.'
Dry_Run_and_Lint_DNAonly1:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: docker://snakemake/snakemake:latest
- name: Dry Run [DNA only mode]
run: |
docker run -v $PWD:/opt2 snakemake/snakemake:latest \
/opt2/metamorph run --input /opt2/.tests/test_cohort_DNAonly.txt \
--output /opt2/output --mode local --dry-run
- name: View the pipeline config file [DNA only mode]
run: |
echo "Generated config file for pipeline...." && cat $PWD/output/config.json
- name: Lint Workflow
continue-on-error: true
run: |
docker run -v $PWD:/opt2 snakemake/snakemake:latest snakemake --lint -s /opt2/output/workflow/Snakefile -d /opt2/output || \
echo 'There may have been a few warnings or errors. Please read through the log to determine if its harmless.'
Dry_Run_DNA2:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: docker://snakemake/snakemake:latest
- name: Dry Run [DNA only mode w RNA column]
run: |
docker run -v $PWD:/opt2 snakemake/snakemake:latest \
/opt2/metamorph run --input /opt2/.tests/test_cohort_DNAonly_RNAcolexists.txt \
--output /opt2/output --mode local --dry-run
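The three jobs above differ only in which test sample sheet they feed to the pipeline. A minimal sketch for reproducing the DNA-only dry run locally, assuming Docker is available and the command is issued from the repository root (the flags simply mirror the workflow step above):

    import os
    import subprocess

    # Mirrors the "Dry Run [DNA only mode]" step: mount the repo at /opt2 and
    # ask metamorph for a local-mode dry run against the DNA-only test sheet.
    cmd = [
        "docker", "run", "-v", f"{os.getcwd()}:/opt2",
        "snakemake/snakemake:latest",
        "/opt2/metamorph", "run",
        "--input", "/opt2/.tests/test_cohort_DNAonly.txt",
        "--output", "/opt2/output",
        "--mode", "local",
        "--dry-run",
    ]
    subprocess.run(cmd, check=True)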
5 changes: 5 additions & 0 deletions .tests/test_cohort_DNAonly.txt
@@ -0,0 +1,5 @@
DNA
/opt2/.tests/WT_S1_R1.fastq.gz
/opt2/.tests/WT_S1_R2.fastq.gz
/opt2/.tests/WT_S2_R1.fastq.gz
/opt2/.tests/WT_S2_R2.fastq.gz
5 changes: 5 additions & 0 deletions .tests/test_cohort_DNAonly_RNAcolexists.txt
@@ -0,0 +1,5 @@
DNA RNA
/opt2/.tests/WT_S1_R1.fastq.gz
/opt2/.tests/WT_S1_R2.fastq.gz
/opt2/.tests/WT_S2_R1.fastq.gz
/opt2/.tests/WT_S2_R2.fastq.gz
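Both new test sheets are tab-delimited with a mandatory DNA column; the second adds an RNA header with no values. A minimal sketch of how such a sheet can be read, loosely mirroring the DictReader-based parsing added to valid_input() in src/run.py later in this diff:

    import csv

    # Read the DNA-only fixture added above; the delimiter is a tab because
    # the sheet uses a .txt/.tsv layout.
    with open(".tests/test_cohort_DNAonly.txt") as fh:
        rows = list(csv.DictReader(fh, delimiter="\t"))

    # RNA counts as present only when the column exists and holds a value,
    # so test_cohort_DNAonly_RNAcolexists.txt is still treated as DNA-only.
    rna_included = any(row.get("RNA") not in ("", None, "None") for row in rows)
    print(len(rows), rna_included)   # 4 False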
15 changes: 8 additions & 7 deletions config/cluster.json
@@ -21,8 +21,8 @@
"gres": "lscratch:400"
},
"metawrap_genome_assembly": {
"threads": 128,
"mem": "128G",
"threads": 56,
"mem": "240G",
"partition": "norm",
"time": "10-00:00:00"
},
@@ -36,7 +36,8 @@
"threads": 32,
"mem": "64G",
"partition": "norm",
"time": "2-00:00:00"
"time": "2-00:00:00",
"gres": "lscratch:600"
},
"derep_bins": {
"threads": 32,
@@ -52,8 +53,8 @@
"time": "2-00:00:00"
},
"bbtools_index_map": {
"threads": 48,
"mem": "64G",
"threads": 28,
"mem": "220G",
"partition": "norm",
"time": "2-00:00:00"
},
@@ -92,10 +93,10 @@
},
"rna_humann_classify": {
"threads": 32,
"mem": "64G",
"mem": "128G",
"partition": "norm",
"time": "10-00:00:00",
"gres": "lscratch:600"
"gres": "lscratch:800"
},
"map_to_rna_to_mag": {
"threads": 32,
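The entries above only raise the resources requested per rule; how they reach the scheduler is outside this diff. A hedged sketch of how one updated entry could be expanded into an sbatch command (the submission wrapper shown here is illustrative, not the pipeline's actual one):

    import json

    with open("config/cluster.json") as fh:
        cluster = json.load(fh)

    # metawrap_genome_assembly now asks for 56 threads and 240G of memory.
    rule = cluster["metawrap_genome_assembly"]
    sbatch = (
        f"sbatch --cpus-per-task={rule['threads']} --mem={rule['mem']} "
        f"--partition={rule['partition']} --time={rule['time']}"
    )
    print(sbatch)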
4 changes: 2 additions & 2 deletions config/images.json
@@ -1,8 +1,8 @@
{
"images": {
"metawrap": "docker://rroutsong/metamorph_metawrap:0.0.9"
"metawrap": "docker://rroutsong/metamorph_metawrap:0.0.11"
},
"containers": {
"metawrap": "/data/OpenOmics/SIFs/metamorph_metawrap_0.0.9.sif"
"metawrap": "/data/OpenOmics/SIFs/metamorph_metawrap_0.0.11.sif"
}
}
14 changes: 7 additions & 7 deletions config/resources.json
@@ -53,37 +53,37 @@
}, {
"name": "CAT_db",
"to": "/data2/CAT",
"from": "/vf/users/OpenOmics/references/metamorph/CAT_pack_prepare/cat_db/db",
"from": "/data/OpenOmics/references/metamorph/CAT_pack_prepare/db",
"mode": "rw"
}, {
"name": "CAT_tax",
"to": "/data2/CAT_tax",
"from": "/vf/users/OpenOmics/references/metamorph/CAT_pack_prepare/cat_db/db",
"from": "/data/OpenOmics/references/metamorph/CAT_pack_prepare/tax",
"mode": "rw"
}, {
"name": "humann3_protein_db",
"to": "/data2/uniref",
"from": "/vf/users/OpenOmics/references/metamorph/uniref",
"from": "/data/OpenOmics/references/metamorph/uniref",
"mode": "rw"
}, {
"name": "humann3_nuc_db",
"to": "/data2/chocophlan",
"from": "/vf/users/OpenOmics/references/metamorph/chocophlan",
"from": "/data/OpenOmics/references/metamorph/chocophlan",
"mode": "rw"
}, {
"name": "humann3_config",
"to": "/opt/conda/envs/bb3/lib/python3.7/site-packages/humann",
"from": "/vf/users/OpenOmics/references/metamorph/humann",
"from": "/data/OpenOmics/references/metamorph/humann",
"mode": "rw"
}, {
"name": "utility_mapping",
"to": "/data2/um",
"from": "/vf/users/OpenOmics/references/metamorph/utility_mapping",
"from": "/data/OpenOmics/references/metamorph/utility_mapping",
"mode": "rw"
}, {
"name": "metaphlan_db",
"to": "/data2/metaphlan",
"from": "/vf/users/OpenOmics/references/metamorph/metaphlan",
"from": "/data/OpenOmics/references/metamorph/metaphlan",
"mode": "rw"
}
]
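Each entry above describes a host path (`from`), a container path (`to`), and a mount mode. Assuming these entries sit under a `databases` key, a minimal sketch of how they become container bind specs, mirroring the updated bind() logic in src/run.py where only the host side is resolved:

    import json
    import os

    with open("config/resources.json") as fh:
        resources = json.load(fh)

    # Build "host:container:mode" strings; the container path is kept verbatim.
    binds = [
        f"{os.path.realpath(db['from'])}:{db['to']}:{db['mode']}"
        for db in resources.get("databases", [])
    ]
    print(binds[:2])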
2 changes: 1 addition & 1 deletion docker/metawrap/Dockerfile
@@ -18,7 +18,7 @@ RUN mamba install -y -n bb3 -c biobakery humann
RUN mkdir /install; cd /install; wget https://carlowood.github.io/which/which-2.21.tar.gz; tar xvf which-2.21.tar.gz
RUN cd /install/which-2.21; ./configure; make && make install
RUN rm /usr/bin/which; ln -s /usr/local/bin/which /usr/bin/which
RUN cd /home; git clone https://github.com/dutilh/CAT.git; chmod +x CAT_pack/CAT_pack; ln -s /home/CAT/CAT_pack/CAT_pack /usr/bin/CAT; chmod +x /usr/bin/CAT
RUN cd /home; git clone https://github.com/rroutsong/CAT_pack.git; ln -s /home/CAT_pack/CAT_pack/CAT_pack /usr/bin/CAT
RUN cd /install; wget https://github.com/hyattpd/Prodigal/archive/refs/tags/v2.6.3.tar.gz; tar xvf v2.6.3.tar.gz
RUN cd /install/Prodigal-2.6.3; make install INSTALLDIR=/usr/bin
RUN cd /install; wget https://github.com/bbuchfink/diamond/archive/refs/tags/v2.1.9.tar.gz; tar xvf v2.1.9.tar.gz
16 changes: 10 additions & 6 deletions metamorph
@@ -57,7 +57,8 @@ from src.utils import (
exists,
fatal,
check_cache,
require
require,
permissions
)


@@ -114,19 +115,21 @@ def run(sub_args):

# Step 1. Parse the input sample sheet into downstream
# data structures consistent with the argparse parser
validated, sample_map = valid_input(sub_args.input)
validated, rna_included = valid_input(sub_args.input)

delattr(sub_args, 'input')
setattr(sub_args, 'input', [row['DNA'] for row in validated])
setattr(sub_args, 'rna', [row['RNA'] for row in validated])
if rna_included:
setattr(sub_args, 'rna', [row['RNA'] for row in validated])

# Step 2. Initialize working directory,
# copy over required resources to run
# the pipeline
git_repo = __home__

fastq_inputs = [sub_args.input]
if sub_args.rna:

if getattr(sub_args, 'rna', False):
fastq_inputs.append(sub_args.rna)

input_files = init(
@@ -142,7 +145,8 @@ def run(sub_args):
config = setup(sub_args,
ifiles = input_files,
repo_path = git_repo,
output_path = sub_args.output
output_path = sub_args.output,
rna=rna_included
)

# Step 4. Resolve docker/singularity bind
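Because run() now sets the `rna` attribute only when the sheet includes RNA data, downstream code probes it with getattr() instead of touching sub_args.rna directly. A small illustration of that pattern with a hand-built namespace (the attribute names match the code above; the file names are made up):

    from argparse import Namespace

    # DNA-only run: no `rna` attribute is ever set on the namespace.
    sub_args = Namespace(input=["WT_S1_R1.fastq.gz", "WT_S2_R1.fastq.gz"])

    fastq_inputs = [sub_args.input]
    if getattr(sub_args, "rna", False):      # False when the attribute is missing
        fastq_inputs.append(sub_args.rna)

    print(len(fastq_inputs))                 # 1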
70 changes: 42 additions & 28 deletions src/run.py
@@ -67,7 +67,7 @@ def init(repo_path, output_path, links=[], required=['workflow', 'resources', 'c
try:
os.mkdir(os.path.join(output_path, 'rna'))
except FileExistsError:
pass
pass
inputs['rna'] = sym_safe(input_data = links[1], target = os.path.join(output_path, 'rna'))

return inputs
@@ -112,7 +112,7 @@ def sym_safe(input_data, target):
if not exists(renamed):
# Create a symlink if it does not already exist
# Follow source symlinks to resolve any binding issues
os.symlink(os.path.abspath(os.path.realpath(file)), renamed)
os.symlink(os.path.abspath(file), renamed)

return input_fastqs

@@ -182,7 +182,7 @@ def get_sid(filepath):
return


def setup(sub_args, ifiles, repo_path, output_path):
def setup(sub_args, ifiles, repo_path, output_path, rna=True):
"""Setup the pipeline for execution and creates config file from templates
@param sub_args <parser.parse_args() object>:
Parsed arguments for run sub-command
@@ -238,13 +238,16 @@ def setup(sub_args, ifiles, repo_path, output_path):
config['options'][opt] = v

# RNA -> DNA mapping
sample_map = {}
dna_files, rna_files = ifiles['dna'], ifiles['rna']
for i in range(len(dna_files)):
r_sid = get_sid(rna_files[i])
d_sid = get_sid(dna_files[i])
sample_map[r_sid] = d_sid
config['sample_map'] = sample_map
if rna:
sample_map = {}
dna_files, rna_files = ifiles['dna'], ifiles['rna']
for i in range(len(dna_files)):
r_sid = get_sid(rna_files[i])
d_sid = get_sid(dna_files[i])
sample_map[r_sid] = d_sid
config['sample_map'] = sample_map
else:
config['rna'] = False

return config
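The sample_map above pairs DNA and RNA FASTQs by position and keys the map by RNA sample id. A hedged illustration with made-up paths; get_sid() here is only a stand-in for the real helper in src/run.py:

    import os
    import re

    def get_sid(path):
        # Stand-in for src/run.py::get_sid: strip directory, read tag, extension.
        return re.sub(r"(_R[12])?\.fastq\.gz$", "", os.path.basename(path))

    dna_files = ["/data/dna/WT_S1_R1.fastq.gz", "/data/dna/WT_S2_R1.fastq.gz"]
    rna_files = ["/data/rna/WT_S1_rna_R1.fastq.gz", "/data/rna/WT_S2_rna_R1.fastq.gz"]

    sample_map = {get_sid(r): get_sid(d) for d, r in zip(dna_files, rna_files)}
    print(sample_map)   # {'WT_S1_rna': 'WT_S1', 'WT_S2_rna': 'WT_S2'}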

@@ -339,7 +342,7 @@ def bind(sub_args, config):

if 'databases' in config:
dbs = config.pop('databases')
bindpaths.extend([resolve(mount['from'])+':'+resolve(mount['to'])+':'+mount['mode'] for mount in dbs])
bindpaths.extend([resolve(mount['from'])+':'+mount['to']+':'+mount['mode'] for mount in dbs])

if 'options' in config and 'input' in config['options']:
inrents = list(set([os.path.abspath(os.path.dirname(p)) for p in config['options']['input'] if os.path.exists(os.path.dirname(p)) and os.path.isdir(os.path.dirname(p))]))
@@ -351,7 +354,11 @@

if 'options' in config and 'rna' in config['options']:
rnarents = list(set([os.path.abspath(os.path.dirname(p)) for p in config['options']['rna'] if os.path.exists(os.path.dirname(p)) and os.path.isdir(os.path.dirname(p))]))
bindpaths.extend(rnarents)
common_parent = longest_common_parent_path(rnarents)
if common_parent:
bindpaths.extend([common_parent])
else:
bindpaths.extend(rnarents)

if 'options' in config and 'output' in config['options']:
if os.path.exists(config['options']['output']) and os.path.isdir(config['options']['output']):
@@ -360,8 +367,8 @@
if 'tmp_dir' in config:
bindpaths.append(config['tmp_dir'])

rawdata_bind_paths = [os.path.abspath(p) for p in config['project']['datapath'].split(',')]
working_directory = os.path.realpath(config['project']['workpath'])
# rawdata_bind_paths = [os.path.abspath(p) for p in config['project']['datapath'].split(',')]
# working_directory = os.path.realpath(config['project']['workpath'])

return list(set(bindpaths))
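longest_common_parent_path() is called above but its definition falls outside the hunks shown here. A hypothetical sketch of what such a helper might look like (the name matches the call above; the behaviour is an assumption, not the pipeline's actual implementation):

    import os

    def longest_common_parent_path(paths):
        """Return the deepest directory shared by every path, or None."""
        if not paths:
            return None
        common = os.path.commonpath([os.path.abspath(p) for p in paths])
        # Binding "/" into the container is not useful, so treat the filesystem
        # root as "no common parent" and fall back to per-directory binds.
        return None if common in ("", os.sep) else common

    # e.g. ['/data/run1/rna', '/data/run2/rna'] -> '/data'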

@@ -486,7 +493,7 @@ def add_rawdata_information(sub_args, config, ifiles):

# Finds the set of rawdata directories to bind
config['project']['datapath'] = ','.join(get_rawdata_bind_paths(input_files = sub_args.input))
if sub_args.rna:
if getattr(sub_args, 'rna', False):
config["project"]["rna_datapath"] = ','.join(get_rawdata_bind_paths(input_files = sub_args.rna))

# Add each sample's basename
@@ -613,7 +620,7 @@ def get_rawdata_bind_paths(input_files):
bindpaths = []
for file in input_files:
# Get directory of input file
rawdata_src_path = os.path.dirname(os.path.abspath(os.path.realpath(file)))
rawdata_src_path = os.path.dirname(os.path.abspath(file))
if rawdata_src_path not in bindpaths:
bindpaths.append(rawdata_src_path)

@@ -675,26 +682,33 @@ def valid_input(sheet):
raise argparse.ArgumentTypeError(f"Path `{sheet}` exists, but cannot read path due to permissions!")

# check format to make sure it's correct
sheet = open(sheet, 'r')
dialect = Sniffer().sniff(sheet.read(), [',', "\t"])
sheet.seek(0)
rdr = DictReader(sheet, delimiter=dialect.delimiter)
if sheet.endswith('.tsv') or sheet.endswith('.txt'):
delim = '\t'
elif sheet.endswith('.csv'):
delim = ','

rdr = DictReader(open(sheet, 'r'), delimiter=delim)

if 'DNA' not in rdr.fieldnames:
raise argparse.ArgumentTypeError("Sample sheet does not contain `DNA` column")
if 'RNA' not in rdr.fieldnames:
raise argparse.ArgumentTypeError("Sample sheet does not contain `RNA` column")
data = [row for row in rdr]
print("-- Running in DNA only mode --")
else:
print("-- Running in paired DNA & RNA mode --")

this_map = {}
data = [row for row in rdr]
RNA_included = False
for row in data:
row['RNA'] = os.path.abspath(row['RNA'])
row['DNA'] = os.path.abspath(row['DNA'])
if not os.path.exists(row['RNA']):
raise argparse.ArgumentTypeError(f"Sample sheet path `{row['RNA']}` does not exist")
if not os.path.exists(row['DNA']):
raise argparse.ArgumentTypeError(f"Sample sheet path `{row['DNA']}` does not exist")

return data, this_map
if 'RNA' in row and not row['RNA'] in ('', None, 'None'):
RNA_included = True
row['RNA'] = os.path.abspath(row['RNA'])
if not os.path.exists(row['RNA']):
raise argparse.ArgumentTypeError(f"Sample sheet path `{row['RNA']}` does not exist")

return data, RNA_included


try:
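A hedged usage sketch of the updated valid_input() contract: it now returns the parsed rows plus an rna_included flag rather than a sample map. The import path and the /opt2 fixture paths are assumptions; the calls only succeed where those FASTQ paths exist (e.g. inside the pipeline container):

    from src.run import valid_input

    # No RNA column at all -> DNA-only mode.
    rows, rna_included = valid_input(".tests/test_cohort_DNAonly.txt")
    assert rna_included is False

    # RNA column present but empty -> still DNA-only mode.
    rows, rna_included = valid_input(".tests/test_cohort_DNAonly_RNAcolexists.txt")
    assert rna_included is False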
2 changes: 1 addition & 1 deletion src/run.sh
@@ -240,7 +240,7 @@ snakemake \\
--jobs 500 \\
--keep-remote \\
--stats "$3/logfiles/runtime_statistics.json" \\
--restart-times 0 \\
--restart-times 1 \\
--keep-incomplete \\
--local-cores "14" 2>&1
# Create summary report