Merge pull request #5 from alihamraoui/gh-page

UPDATE: update workflow parameters
GenomiqueENS · Jul 12, 2024 · 6a47cdd · 6a47cdd
2 parents aa83d9a + e3cc72f
commit 6a47cdd
Show file tree

Hide file tree

Showing 7 changed files with 105 additions and 1,455 deletions.
diff --git a/README.md b/README.md
@@ -57,7 +57,7 @@ Configuration for error model:
 | `error_model`      | Custom error model file (optional)                            | `null`                                        |
 | `qscore_model`     | Custom Q-score model file (optional)                          | `null`                                        |
 | `build_model`      | Build your own error/Q-score model                            | `false`                                       |
-| `model_fastq`      | reference real read (.fastq) to train error model   (optional)      | `false`                                       |
+| `fastq_model`      | Real reference reads (.fastq) used to train the error model (optional) | `false`                                       |
 | `ref_genome`       | reference genome .fasta file (optional)                       | `false`                                       |
 
 ### Additional Parameters
@@ -78,17 +78,7 @@ Configuration for running the workflow:
 | `container`       | Docker container for the workflow  | `'hamraouii/wf-SLSim'`    |
 | `docker.runOptions` | Docker run options to use       | `'-u $(id -u):$(id -g)'`  |
 
-## Execution
-
-To run the workflow with basic example:
-
-You can simulate a realistic disribution of per barcodes UMI counts by providing in addition to the filtered counts matrix a .csv file of BC counts.  AsaruSim will add a random transcrips count to fit the real distribution.
-```bash
-nextflow run main.nf --matrix test_data/matrix.csv \
-                     --bc_counts test_data/test_bc.csv \
-                     --transcriptome test_data/transcriptome.fa \
-
-```
+## Usage
 User can choose among 4 ways to simulate template reads.
 - use a real count matrix
 - estimate the parameters from a real count matrix to simulate a synthetic count matrix
@@ -97,10 +87,72 @@ User can choose among 4 ways to simulate template reads.
 
 We use the SPARSIM tool to simulate count matrices. For more information about synthetic count matrices, please read the [SPARSIM](https://gitlab.com/sysbiobig/sparsim/-/blob/master/vignettes/sparsim.Rmd?ref_type=heads#Sec_Input_parameter_estimated_from_data) documentation.
 
-#### use a real count matrix
+### EXAMPLES 
+##### Sample data
+A demonstration dataset for trying out this workflow is available on Zenodo, DOI: [10.5281/zenodo.12731408](https://zenodo.org/records/12731409). This dataset is a subsample from a Nanopore run of the [10X 5k human PBMCs](https://www.10xgenomics.com/datasets/5k-human-pbmcs-3-v3-1-chromium-controller-3-1-standard).
+
+The human GRCh38 [reference transcriptome](https://ftp.ensembl.org/pub/release-112/fasta/homo_sapiens/cdna/), [GTF annotation](https://ftp.ensembl.org/pub/release-112/gtf/homo_sapiens/) and [FASTA reference genome](https://ftp.ensembl.org/pub/release-112/fasta/homo_sapiens/dna/) can be downloaded from Ensembl.
+
+
+##### BASIC WORKFLOW
+
+```bash
+ nextflow run main.nf --matrix dataset/sub_pbmc_matrice.csv \
+                      --transcriptome dataset/Homo_sapiens.GRCh38.cdna.all.fa \
+                      --features gene_name \
+                      --gtf dataset/genes.gtf
+```
+
+##### WITH PCR AMPLIFICATION
+
+```bash
+ nextflow run main.nf --matrix dataset/sub_pbmc_matrice.csv \
+                      --transcriptome dataset/Homo_sapiens.GRCh38.cdna.all.fa \
+                      --features gene_name \
+                      --gtf dataset/GRCh38-2020-A-genes.gtf \
+                      --pcr_cycles 2 \
+                      --pcr_dup_rate 0.7 \
+                      --pcr_error_rate 0.00003
+```
+
+##### WITH SIMULATED CELL TYPE COUNTS
+
+```bash
+ nextflow run main.nf --matrix dataset/sub_pbmc_matrice.csv \
+                      --transcriptome dataset/Homo_sapiens.GRCh38.cdna.all.fa \
+                      --features gene_name \
+                      --gtf dataset/GRCh38-2020-A-genes.gtf \
+                      --sim_celltypes true \
+                      --cell_types_annotation dataset/sub_pbmc_cell_type.csv
+```
+
+##### WITH PERSONALIZED ERROR MODEL
+
+```bash
+nextflow run main.nf --matrix dataset/sub_pbmc_matrice.csv \
+                     --transcriptome dataset/Homo_sapiens.GRCh38.cdna.all.fa \
+                     --features gene_name \
+                     --gtf dataset/GRCh38-2020-A-genes.gtf \
+                     --build_model true \
+                     --fastq_model dataset/sub_pbmc_reads.fq \
+                     --ref_genome dataset/GRCh38-2020-A-genome.fa 
+```
+
+##### COMPLETE WORKFLOW
+
 ```bash
-nextflow run main.nf --matrix test_data/matrix.csv \
-                     --transcriptome test_data/transcriptome.fa
+ nextflow run main.nf --matrix dataset/sub_pbmc_matrice.csv \
+                      --transcriptome dataset/Homo_sapiens.GRCh38.cdna.all.fa \
+                      --features gene_name \
+                      --gtf dataset/GRCh38-2020-A-genes.gtf \
+                      --sim_celltypes true \
+                      --cell_types_annotation dataset/sub_pbmc_cell_type.csv \
+                      --build_model true \
+                      --fastq_model dataset/sub_pbmc_reads.fq \
+                      --ref_genome dataset/GRCh38-2020-A-genome.fa \
+                      --pcr_cycles 2 \
+                      --pcr_dup_rate 0.7 \
+                      --pcr_error_rate 0.00003
 ```
 
 ## Results

diff --git a/main.nf b/main.nf
@@ -14,13 +14,13 @@ log.info """\
     error model                   : ${params.error_model}
     Qscore model                  : ${params.qscore_model}
     build erro model              : ${params.build_model}
-    FASTQ model                   : ${params.model_fastq}
+    FASTQ model                   : ${params.fastq_model}
     reference genome              : ${params.ref_genome}
     UMI duplication               : ${params.umi_duplication}
-    PCR amplification cycles      : ${params.cycles}
-    PCR duplication rate          : ${params.dup_rate}
-    PCR error rate                : ${params.error_rate}
-    Total number of PCR reads     : ${params.totalNamber}
+    PCR amplification cycles      : ${params.pcr_cycles}
+    PCR duplication rate          : ${params.pcr_dup_rate}
+    PCR error rate                : ${params.pcr_error_rate}
+    Total number of PCR reads     : ${params.pcr_total_reads}
     outdir                        : ${params.outdir}
     """
     .stripIndent()
@@ -70,7 +70,7 @@ workflow {
                                                     channel.from("0.37,0.0,824.94")
 
     if (params.build_model) {
-        fastq_ch = Channel.fromPath(params.model_fastq, checkIfExists: true)
+        fastq_ch = Channel.fromPath(params.fastq_model, checkIfExists: true)
         genome_ch = Channel.fromPath(params.ref_genome, checkIfExists: true)
         sub_fastq_ch = SUBSAMPLE(fastq_ch)
         paf_ch = ALIGNMENT(sub_fastq_ch, genome_ch)
@@ -89,7 +89,7 @@ workflow {
         template_ch = TEMPLATE_MAKER(matrix_ch, transcriptome_ch, barcodes_ch, gtf_ch, length_dist_ch)
     }
 
-    if (params.cycles > 0) {
+    if (params.pcr_cycles > 0) {
         template_ch = PCR_SIMULATOR(template_ch)
     }
 

diff --git a/modules/PCR.nf b/modules/PCR.nf
@@ -8,13 +8,13 @@ process PCR_SIMULATOR {
     path "amplified_reads.fa" 
 
     script:
-        def totalNamber = params.totalNamber == false? "" : "--totalNamber $params.totalNamber"
+        def totalNamber = params.pcr_total_reads == null? "" : "--totalNamber $params.pcr_total_reads"
 
     """
     python3.11 $projectDir/bin/PCR.py -f ${fasta} \
-    --cycles $params.cycles \
-    --dup $params.dup_rate \
-    --error $params.error_rate \
+    --cycles $params.pcr_cycles \
+    --dup $params.pcr_dup_rate \
+    --error $params.pcr_error_rate \
     --thread $params.threads \
     $totalNamber \
     --seed 2024 \

diff --git a/nextflow.config b/nextflow.config
@@ -1,4 +1,4 @@
-params.outdir = "paper/test"
+params.outdir = "tmp/test"
 params.projetctName = "test"
 
 
@@ -7,35 +7,45 @@ params.projetctName = "test"
  */
 
 // BASIC INPUTS.
-params.matrix = "$projectDir/paper/pbmc.mtx_sub.csv"
-params.transcriptome = "$projectDir/../SLSim/data/cdna/Homo_sapiens.GRCh38.cdna.all.fa"
+params.matrix = null
+params.transcriptome = null
 
 // FEATURES : Provide a GTF file if the matrix row names do not correspond to "transcript_id".
-params.features = "gene_name"
-params.full_length = false
-params.length_dist = "0.37,0.0,824.94"
-params.gtf = "$projectDir/../references/refdata-gex-GRCh38-2020-A/genes/genes.gtf"
+params.features = "transcript_id"
+params.full_length = true
+params.gtf = null 
+params.umi_duplication = 4
 
 // SIMULATE CELL BARCODE COUNTS : produce a realistic knee plot.
 params.bc_counts = null
 
 // SIMULATE CELL TYPES.
 params.sim_celltypes = false
-params.cell_types_annotation = "$projectDir/paper/sub_cell_type.csv"
+params.cell_types_annotation = null
+
+
+/***************************
+ * PCR AMPLIFICATION PARAMETERS
+ */
 
+params.pcr_cycles = 0
+params.pcr_total_reads = null
+params.pcr_error_rate = 0.00003
+params.pcr_dup_rate = 0.7
 
 /*************
  * ERROR MODEL
  */
 
-// USE PRE-TRAINED MODEL
-params.trained_model = "nanopore2023"
-params.badread_identity = "96,2,98"
+// BUILD YOUR PERSONAL MODEL
+params.build_model = false
+params.fastq_model = null
+params.ref_genome = null
 
-// BUILD YOUR OWN MODEL
-params.build_model = true
-params.model_fastq = "$projectDir/paper/SC3pv3_GEX_Human_PBMC_ONT_1M.fastq"
-params.ref_genome = "$projectDir/../references/refdata-gex-GRCh38-2020-A/fasta/genome.fa"
+// USE PRE-TRAINED MODEL
+params.trained_model = null
+params.badread_identity = "97.0,4.0,100"
+params.length_dist = "0.37,0.0,824.94"
 
 // IMPORT MODEL (FILES)
 params.error_model = null
@@ -46,18 +56,16 @@ params.qscore_model = null
  * BASIC CONFIG
  */
 
-// PCR CYCLES.
-params.amp = 5
 params.ADPTER_SEQ = 'ACTAAAGGCCATTACGGCCTACACGACGCTCTTCCGATCT'
 params.TSO_SEQ = 'TGTACTCTGCGTTGATACCACTGCTT'
-params.dT_LENGTH = 20
+params.dT_LENGTH = 19
 
 
 /****************
  * RUN PARAMETERS
  */
 
-params.threads = 15
+params.threads = 10
 docker.enabled = true 
 process.container = 'hamraouii/wf-slsim:1.1'
 docker.runOptions = '-u $(id -u):$(id -g)'