# config.yaml

####################################
# GLOBAL PARAMETERS                #
####################################

global_parameters:
  # Random seed.
  random_seed: 123
  # Chromosome to use: "all" or a number from 1 to 22.
  chromosome: 1
  # Superpopulation to use: "none" or a specific superpopulation
  # (AFR, AMR, EAS, EUR, CSA, MID). Quoted so the sentinel is unmistakably
  # the string "none" rather than a null value.
  superpopulation: 'none'
  # Amount of memory available (in MB) for memory-intensive commands.
  memory: 8000
  # Batch size for writing plink output during genotype generation.
  batchsize: 10000


####################################
# FILEPATHS                        #
####################################

# - the chromosome number can be given as a wildcard by specifying {chromosome} in the filepath
# - the superpopulation can be given as a wildcard by specifying {superpopulation} in the filepath
filepaths:
  # Output directory and filename prefix.
  general:
    output_dir: 'data/outputs/test'
    output_prefix: 'test_chr-{chromosome}'
  # Genotype-related files. Entries containing {chromosome} are per-chromosome.
  genotype:
    vcf_input_raw: 'data/inputs/raw/1KG+HGDP/1KG+HGDP.chr{chromosome}.hapmap.final.vcf.gz'
    vcf_input_processed: 'data/inputs/processed/1KG+HGDP/1KG+HGDP.chr{chromosome}.hapmap.final.recode.vcf'
    vcf_metadata: 'data/inputs/processed/1KG+HGDP/1KG+HGDP.chr{chromosome}.metadata'
    # NOTE(review): popfile_raw points into the "processed" directory, whereas
    # the other raw inputs in this section live under data/inputs/raw.
    # Confirm that these two popfile paths are the intended ones.
    popfile_raw: 'data/inputs/processed/1KG+HGDP/merged_pop_adjusted.tsv'
    popfile_processed: 'data/inputs/processed/1KG+HGDP/merged_pop.tsv'
    variant_list: 'data/inputs/processed/1KG+HGDP/hapmap_variant_list_chr{chromosome}.txt'
    remove_list: 'data/inputs/processed/1KG+HGDP/remove.txt'
    rsid_list: 'data/inputs/processed/1KG+HGDP/rsid_map_list_chr{chromosome}.txt'
    genetic_mapfile: 'data/inputs/raw/1KG+HGDP/genetic_maps/chr{chromosome}.interpolated_genetic_map'
    genetic_distfile: 'data/inputs/processed/1KG+HGDP/1KG+HGDP.chr{chromosome}.hapmap.distfile'
    mutation_mapfile: 'data/inputs/raw/1KG+HGDP/mutation_maps/atlas.chr{chromosome}.csv'
    mutation_agefile: 'data/inputs/processed/1KG+HGDP/1KG+HGDP.chr{chromosome}.hapmap.agefile'
    hap1_matrix: 'data/inputs/processed/1KG+HGDP/1KG+HGDP.chr{chromosome}.hapmap.h1'
    hap2_matrix: 'data/inputs/processed/1KG+HGDP/1KG+HGDP.chr{chromosome}.hapmap.h2'
  # Phenotype-related files.
  phenotype:
    causal_list: 'data/inputs/processed/1KG+HGDP/Test.CausalList'
    reference: 'data/inputs/processed/1KG+HGDP/Africa.Annot'
    # Can be set to a value if using pre-simulated genetics input.
    plink_override: 'none'
  # External tools (presumably command names or paths to the executables;
  # confirm against the calling code).
  software:
    plink: 'plink'
    plink2: 'plink2'
    king: 'king'
    vcftools: 'vcftools'
    mapthin: 'mapthin'
    phenoalg: 'phenoalg'


####################################
# GENOTYPE DATA                    #
####################################

genotype_data:
  samples:
    # Setting this to true will ignore the custom population groups.
    use_default: true
    # Add your custom population groups below if using use_default=false.
    # NOTE(review): the values under "populations" sum to 100 within each
    # group, so they look like percentages rather than counts; confirm.
    custom:
      - id: EUR_pop
        nsamples: 100
        populations:
          - EUR: 100
      - id: AFR_pop
        nsamples: 200
        populations:
          - AFR: 100
      - id: admix_pop
        nsamples: 100
        populations:
          - EUR: 50
          - AFR: 50
    default:
      # Used by the algorithm if use_default=true; otherwise the custom
      # population groups are used.
      nsamples: 1000
  # Recombination rate, keyed by superpopulation.
  rho:
    AFR: 0.77
    AMR: 0.80
    EAS: 0.58
    EUR: 0.68
    CSA: 0.73
    MID: 0.65
  # Effective population size, keyed by superpopulation.
  Ne:
    AFR: 11900
    AMR: 10400
    EAS: 11700
    EUR: 11700
    CSA: 11500
    MID: 8100


####################################
# PHENOTYPE DATA                   #
####################################

phenotype_data:
  # Key names (including the "Propotion*" spelling) are kept exactly as they
  # are, since they are part of the configuration interface.
  nPopulation: 6
  nTrait: 1
  a: -0.4
  b: -1
  c: 0.5
  nComponent: 1
  # Comma-separated lists are quoted so they are always read as plain strings.
  # The six-entry lists match nPopulation: 6 (presumably one value per
  # population; confirm against the consumer).
  PropotionGeno: '0.1,0.1,0.1,0.1,0.1,0.1'
  PropotionCovar: '0,0,0,0,0,0'
  Prevalence: '0.5,0.5,0.5,0.5,0.5,0.5'
  TraitCorr: 1
  # NOTE(review): presumably a flattened nPopulation x nPopulation matrix;
  # confirm.
  PopulationCorr: '0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0'
  # NOTE(review): three weights are listed while nComponent is 1; confirm
  # this is intended.
  CompWeight: '1,5,10'
  Causality:
    # If true the algorithm will use the causal_list filepath.
    UseCausalList: false
    # Only required if UseCausalList is false.
    Polygenicity: 0.005
    # Only required if UseCausalList is false.
    Pleiotropy: 1


####################################
# EVALUATION                       #
####################################

# Evaluation metrics: set a flag to true if you want the script to calculate
# that metric.
evaluation:
  metrics:
    aats: true  # nearest neighbour adversarial accuracy
    kinship: true # relatedness, including kinship density and IBS plots
    ld_corr: true # linkage disequilibrium (LD) correlation matrix
    ld_decay: true # linkage disequilibrium (LD) decay plot (and distance)
    maf: true # minor allele frequency divergences
    pca: true # principal components analysis
    gwas: false # GWAS, manhattan plot and qqplot


####################################
# OPTIMISATION                     #
####################################

# Note that the optimisation code uses a single superpopulation and ignores
# custom population structures.
optimisation:
  # prior distributions - specify lower/upper bounds for uniform priors
  priors:
    # rho is labelled as the recombination rate in genotype_data
    rho:
      uniform_lower: 0
      uniform_upper: 3
    # Ne is labelled as the effective population size in genotype_data
    Ne:
      uniform_lower: 0
      uniform_upper: 50000
  # inference type - simulation-based rejection ABC or emulation-based rejection ABC
  # (as configured here, simulation-based has run: true and emulation-based has run: false)
  simulation_rejection_ABC:
    run: true
    n_particles: 500
    threshold: 0.15
    max_iter: 500
    write_progress: true
  # same settings as the simulation-based variant, plus n_design_points
  emulation_rejection_ABC:
    run: false
    n_particles: 500
    threshold: 0.15
    n_design_points: 50
    max_iter: 500
    write_progress: true
  # choice of summary statistic/s
  # (both names also appear as metrics in the evaluation section)
  summary_statistics:
    ld_decay: true
    kinship: true