# config.yaml~nobamutil

##########################################################################################
#                      __  __    _    ____   _    ____ _   _ _____                       #
#                     |  \/  |  / \  |  _ \ / \  / ___| | | | ____|                      #
#                     | |\/| | / _ \ | |_) / _ \| |   | |_| |  _|                        #
#                     | |  | |/ ___ \|  __/ ___ \ |___|  _  | |___                       #
#                     |_|  |_/_/   \_\_| /_/   \_\____|_| |_|_____|                      #
#                                                                                        #
##########################################################################################


##########################################################################################
# Samples 
##########################################################################################
# Samples file used for this run. Point it at your own file to map other samples.
sample_file: 'config/samples_sarakina.tsv'

# Some statistics (like depth of coverage) can be computed for BAM files that were obtained
# by other means (e.g., downloaded from public repositories). Note that in that case, mapache
# assumes that the data were mapped to the first reference genome listed in the config file.
## by default ("") the stats are computed on the final bam files from the mapping pipeline. 
## To use on other pre-computed bam files,
## one can pass a list of bam files either in an external file (in this case the parameter 
## 'external_sample' should contain the file name) in the format:
##   SM    Bam      Genome 
##   ind1  bam1.bam  hg19
##   ind2  bam2.bam  hg19
##   ...
## or directly in the config file in the yaml format as:
##   external_sample:
##      hg19:
##        ind1:  path_to_bam1.bam
##        ind2:  path_to_bam2.bam
##        ...

## if the delimiter used to read the sample file has to be adapted
#delim: "\s+"

##########################################################################################
# OUTPUT FOLDER 
##########################################################################################
# Folder in which all outputs are stored. The default folder name is "results";
# this configuration overrides it.
result_dir: 'results_sarakina'
#workdir: 


##########################################################################################
# Reference genome
##########################################################################################
# List the genome(s) to which each sample will be mapped.
# Include meaningful names for each genome. Final BAM(s) file will be named after it 
# (e.g., ind1.hg19.bam)
genome:
    # <name used in the output file names>: <path to the FASTA file>
    hs37d5: '/home/ref_genomes/hs37d5.fa'

##########################################################################################
# Mapping workflow
##########################################################################################
## Genome/FASTA indexing
# Runs once per reference genome.
indexing:
    # Parameter strings for each tool involved in indexing (all left empty here).
    bwa_params: ''
    bowtie2_params: ''
    samtools_params: ''
    picard_params: ''
    # Resources for the indexing job.
    threads: 1
    mem: 16
    time: 2

#-----------------------------------------------------------------------------------------
## FASTQ level
# To consider when assigning runtime and memory:
# These steps will be run once per FASTQ file specified in the "samples" file.

## Subsampling (optional; disabled here)
subsampling:
    run: false
    # NOTE(review): presumably the number of reads to keep per FASTQ file - confirm.
    number: 1000
    params: '-s1'

# Adapter removal (optional)
cleaning:
    # Options: adapterremoval (default), fastp, False
    run: 'adapterremoval'
    # Named AdapterRemoval parameter sets; each key maps to one full option string.
    # NOTE(review): presumably a set is selected per FASTQ entry through the samples
    # file - confirm against the workflow.
    params_adapterremoval:
        DS_index8_8: '--minlength 30 --trimns --trimqualities  --adapter1 AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNNNATCTCGTATGCCGTCTTCTGCTTG --adapter2 AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTNNNNNNNNGTGTAGATCTCGGTGGTCGCCGTATCATT --collapse --minalignmentlength 11'
        DS_index8_7: '--minlength 30 --trimns --trimqualities  --adapter1 AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNNNATCTCGTATGCCGTCTTCTGCTTG --adapter2 AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTNNNNNNNGTGTAGATCTCGGTGGTCGCCGTATCATT --collapse --minalignmentlength 11'
        DS_index6_8: '--minlength 30 --trimns --trimqualities  --adapter1 AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTTG --adapter2 AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTNNNNNNNNGTGTAGATCTCGGTGGTCGCCGTATCATT --collapse --minalignmentlength 11'
        DS_index6_7: '--minlength 30 --trimns --trimqualities  --adapter1 AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTTG --adapter2 AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTNNNNNNNGTGTAGATCTCGGTGGTCGCCGTATCATT --collapse --minalignmentlength 11'
        DS_index8: '--minlength 30 --trimns --trimqualities  --adapter1 AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNNNATCTCGTATGCCGTCTTCTGCTTG'
    collapse_opt: 'only_collapse'
    params_fastp: ''
    threads: 24
    mem: 8  # in GB
    time: 10

# Mapping (compulsory)
mapping:
    # Options: bwa_aln, bwa_mem, bowtie2
    mapper: 'bwa_aln'
    # Parameter strings, one per mapper / sub-command.
    bwa_aln_params: '-l 1024 -n 0.01 -o 2'
    bwa_samse_params: '-n 3'
    bwa_sampe_params: ''
    bwa_mem_params: ''
    bowtie2_params: ''
    pl: 'ILLUMINA'
    # Resources for the mapping job.
    threads: 32
    mem: 32
    time: 36

## samtools_sort
# Resources for the sorting job. Listed under "FASTQ level", i.e. among the steps
# that are run once per FASTQ file.
sorting:
    threads: 4
    mem: 16
    time: 10


## samtools_filter
filtering:
    run: true
    # NOTE(review): presumably the minimum mapping quality a read needs to be kept - confirm.
    mapq: 30
    # Save low-quality and unmapped reads to an extra BAM file?
    save_low_qual: false
    # Resources for the filtering job.
    threads: 4
    mem: 16
    time: 2

#-----------------------------------------------------------------------------------------
## library level

# Resources for the merging step. The same settings apply to both
# merging mapped files per library and merging them per sample.
merging:
    threads: 4
    mem: 16
    time: 2

## Duplicate removal (optional)
remove_duplicates:
    # Options: False (default), markduplicates, dedup
    run: 'markduplicates'
    # Parameter strings for each of the two tools.
    params_markduplicates: '--REMOVE_DUPLICATES true'
    params_dedup: '-m'
    threads: 4  # 1 for dedup
    mem: 32
    time: 24


# mapDamage2 rescaling (optional, not run by default)
# Enable this to rescale base qualities in the mapped BAMs with mapDamage2.
# To compute only damage statistics (read length, deamination rates)
# without rescaling qualities, use the top-level "damage" entry instead.
damage_rescale:
    run: false
    params: ''
    threads: 1  # hardcoded at 1
    mem: 8
    time: 2
# bamutil (disabled in this configuration)
# NOTE(review): 'params' maps a label to a numeric string; presumably the number of
# bases handled per library type (UDG / non-UDG) - confirm against the workflow.
bamutil:
    run: false
    params:
        UDG_2: '2'
        nonUDG_6: '6'
        nonUDG_10: '10'
        default: ''
    threads: 1  # hardcoded at 1
    mem: 8

#-----------------------------------------------------------------------------------------
## sample level

# Realignment around indels with GATK
realign:
    run: true
    # Resources for the realignment job.
    threads: 24
    mem: 32
    time: 8
# samtools_calmd (optional; enabled here)
compute_md:
    run: true
    # Resources for the calmd job.
    threads: 4
    mem: 16
    time: 2


##########################################################################################
## Statistics for final bam files
##########################################################################################
# This section corresponds to the statistics reported by mapache.
# By default, it is run together with the mapping pipeline for all the combinations of
# input FASTQ files and genomes.
stats:
    # Default: statistics are computed for the samples listed in the samples file.
    # To compute them on BAMs downloaded or mapped with other tools, see the
    # 'external_sample' instructions in the "Samples" section of this file.
    fastqc_time: 4

    # Output qualimap and multiqc reports?
    qualimap: true
    qualimap_mem: 8
    multiqc: true

    # Settings specific to the plots in the html report
    plots:
        # x_axis: which value is used as the variable on the x-axis, e.g.
        #   sample vs DoC (bars colored by sample name)
        #   genome vs DoC (bars colored by genome name)
        # Recommended:
        #   "sample" if n_samples > n_genomes
        #   "genome" if n_genomes > n_samples
        # With "auto" (default), mapache follows this recommendation.
        x_axis: 'auto'  # either "auto", "sample" or "genome"
        # Number of panels per row if multiple samples and genomes are used
        n_col: 1
        # Width and height of plots, in inches
        width: 18
        height: 12
        # Rectangles drawn for the thresholds of the sex assignment;
        # adjust according to the thresholds/labels defined for sex inference
        sex_ribbons: 'c("XX"="#7e3075", "XY"="#003e83")'
        color: '#f2ad78'


##########################################################################################  
## Analyses
##########################################################################################
damage:
    ## damage statistics (read length, deamination rates)
    run: 'bamdamage'        # Options: ("False", "bamdamage", "mapDamage")
    # This step can take a lot of time. We recommend to run it on a small subset of the mapped
    # reads in order to get an estimate of the damage. The value of bamdamage_fraction can be
    # an integer (number of reads) or a float (fraction of mapped reads).
    bamdamage_fraction: 10000
    # With the parameters below, the estimation is done with the first 200 bp of each read
    # (--rlength 200) and the plots include the first and last 30 bp (--plot_length 30).
    bamdamage_params: ' --rlength 200 --plot_length 30'

depth:
    hs37d5:
        run: true
        # Mapache outputs the depth of coverage (DoC) for each of the genomes.
        # List here the chromosomes whose DoC should also be reported in the main
        # stats table. Their names have to match their IDs in the FASTA file
        # specified under "genome".
        chromosomes: ['X', 'Y', 'MT']
        
        
sex_inference:
    hs37d5:
        run: true
        # Autosome names, given as a list. A Python expression in single quotes (')
        # whose outcome is a Python list can be pasted instead, for example:
        #     '[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]'
        #     '[f"chr{x}" for x in range(1,23)]'  # chr1,chr2,chr3,...,chr22.
        #     '[x for x in range(1,23)]'          # 1,2,3,...,22.
        autosomes: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
        sex_chr: 'X'
        # Possible options: XY, ZW, XO, ZO
        system: 'XY'
        signif: 0.9
        # Thresholds used to assign molecular sex, in R syntax.
        # For humans, we expect a ratio of DoC(X)/DoC(genome) = 1 for females and
        # DoC(X)/DoC(genome) = 0.5 for males. Sex is assigned by considering the
        # confidence intervals around the estimate of that ratio, which can be a bit
        # lower or higher than 1 or 0.5.
        # This option can be commented out: the sex script comes with default values
        # for the different sex systems.
        thresholds: 'list( "XX"=c(0.8, 1.2), "XY"=c(0, 0.6), "consistent with XX but not XY"=c(0.6, 1.2), "consistent with XY but not XX"=c(0, 0.8) )'
    

imputation:
    hs37d5:
        run: false
        ## The imputation is done using the first named reference genome 'genome/{name}/fasta'
        gp_filter: [0.8]

        ## path_map and path_panel may specify a single file OR may contain the variable
        ## '{chr}' to specify it per chromosome
        path_map: "/work/FAC/FBM/DBC/amalaspi/popgen/shared_ressources/genetic_maps_b37/chr{chr}.b37.gmap.gz"
        path_panel: "/work/FAC/FBM/DBC/amalaspi/popgen/shared_ressources/1000Genomes/ALL.chr{chr}.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.vcf.gz"

        # Imputation will be run on the chromosomes present in the panel.
        # The version of the panel and chromosome names should match those of
        # the reference genome to which individual samples were mapped.
        # The value is a Python list expression inside a string: elements must be
        # comma-separated and non-numeric names quoted, otherwise it is not a valid list.
        # NOTE(review): confirm that map and panel files matching the '{chr}' patterns
        # above exist for X, Y and MT before enabling imputation.
        chromosomes: '[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, "X", "Y", "MT"]'

        #chromosomes: [20,21]

        # NOTE(review): these keys are spelled 'glimse' while the envmodules entry is
        # 'glimpse'; confirm which spelling the workflow reads before renaming anything.
        glimse_chunk_params: ""
        glimse_phase_params: ""


        # Input BAM files can be listed individually under the "paths" keyword.
        # Alternatively, you can pass a plain text file (bam_list)
        # with the paths in it.
        #bam_list: None
        #paths:
        #    ind1: results/03_sample/03_final_sample/01_bam/ind1.hg19.bam
        #    ind2: results/03_sample/03_final_sample/01_bam/ind2.hg19.bam

        # Resources per imputation step (threads / memory / time).
        bcftools_mpileup_threads: 1
        bcftools_mpileup_mem: 4
        bcftools_mpileup_time: 6

        split_genome_mem: 2
        split_genome_time: 4

        extract_positions_mem: 4
        extract_positions_time: 6

        impute_phase_threads: 1
        impute_phase_mem: 2
        impute_phase_time: 2

        ligate_chunks_mem: 8


##########################################################################################
# Job resubmission
##########################################################################################
# On an HPC system, a job might be cancelled/unfinished due to insufficient resources
# (time or memory) allocated to it.
# Global variables that define how resources are incremented after a job failure:
# 0: no change between failures
# 1: doubling after each failure
# NOTE(review): 0.5 presumably adds half of the initial request after each failure -
# confirm against the workflow.
memory_increment_ratio: 0.5
runtime_increment_ratio: 0.5


##########################################################################################
# Software
##########################################################################################
software:
    # NOTE(review): presumably the names under which picard and GATK3 are invoked
    # by the workflow - confirm before changing.
    picard_jar: 'picard'
    gatk3_jar: 'GenomeAnalysisTK'

# By default, mapache uses the software installed with the conda environment. Thus, no need
# to modify this section. 

# If you want to use a different version of a software (for instance, one that can
# be loaded with module load module_name on your server), you can add the 
# name and version of it in this section.
# For this to take effect, you need to invoke the pipeline with the option --use-envmodules
envmodules:
    # One entry per tool: the module name(s) and version to load for it.
    # Only takes effect when the pipeline is invoked with --use-envmodules.
    # Tools with an empty string have no module listed.
    samtools: 'gcc samtools/1.12'
    bowtie2: 'gcc bowtie2/2.4.2'
    bwa: 'gcc bwa/0.7.17'
    picard: 'gcc picard/2.24.0'
    gatk3: 'gcc gatk/3.8-1'
    fastqc: 'gcc fastqc/0.11.9'
    r: 'gcc r/4.0.4'
    adapterremoval: 'gcc adapterremoval/2.3.2'
    fastp: ''
    mapdamage: ''
    bedtools: 'gcc bedtools2/2.29.2'
    dedup: ''
    seqtk: ''
    qualimap: ''
    multiqc: ''
    glimpse: ''