nf-core · LeonHornich · Sep 2, 2024 · Sep 3, 2024 · Sep 4, 2024 · Sep 4, 2024
diff --git a/modules/local/gpt_prepare_query.nf b/modules/local/gpt_prepare_query.nf
@@ -0,0 +1,19 @@
+// Writes the text query for GPT: the question followed by the top-ranked gene IDs of one results table.
+process GPT_PREPARE_QUERY {
+    tag "$source"
+    label 'process_single' // NOTE(review): assumes conf/base.config defines this label, as nf-core pipelines usually do
+
+    input:
+    path data      // tab-separated results table with a header row; gene IDs are read from the first column
+    val source     // tool name, used to build the output file name
+    val index      // 0-based index of the column the genes are ranked by
+    val count      // number of top genes written to the query
+    val mode       // "low" ranks ascending, "high" ranks descending
+    val question   // question written as the first line of the query
+
+    output:
+    path "gpt_${source}_query.txt", emit: query
+
+    script:
+    template 'generateGptQuery.py'
+}
diff --git a/nextflow.config b/nextflow.config
@@ -40,6 +40,17 @@ params {
     overrepresented            = false
     umi_clustering             = false
     skip_clonality             = false
+    gpt_interpretation         = null // Comma-separated list of drugz, mle, bagel2 and/or rra; null skips nf-gpt
+
+    // nf-gpt parameters: number of top-ranked genes sent per tool and the question sent with them
+    gpt_drugz_gene_amount      = 400
+    gpt_drugz_question         = "Which of the following genes have historically shown records of enhancing drug activitys upon CRISPR knockout? For each positive gene write a paragraph explaining the findings in detail."
+    gpt_mle_gene_amount        = 400
+    gpt_mle_question           = "Which of the following genes have historically shown significance in the context of a CRISPR knockout? For each positive gene write a paragraph explaining the findings in detail."
+    gpt_bagel2_gene_amount     = 400
+    gpt_bagel2_question        = "Which of the following genes have historically shown significance in the context of a CRISPR knockout? For each positive gene write a paragraph explaining the findings in detail."
+    gpt_rra_gene_amount        = 400
+    gpt_rra_question           = "Which of the following genes have historically shown significance in the context of a CRISPR knockout? For each positive gene write a paragraph explaining the findings in detail."
 
     // UMI parameters
     umi_bin_size               = 1
@@ -89,6 +100,18 @@ params {
     validate_params            = true
 }
 
+// Settings read by the nf-gpt plugin
+gpt {
+    // API key for the GPT service; must be supplied by the user (none is configured here)
+    apiKey      = null
+    // Models listed as available in nf-gpt 0.4.0: gpt-4o, gpt-4o-mini, gpt-4-turbo, gpt-4 and gpt-3.5-turbo
+    model       = "gpt-4o-mini"
+    // Maximum number of tokens; the allowed value depends on the chosen model: https://platform.openai.com/docs/models
+    maxTokens   = 15000
+    // Sampling temperature passed to the model
+    temperature = 0.7
+}
+
 // Load base.config by default for all pipelines
 includeConfig 'conf/base.config'
 
@@ -206,6 +229,12 @@ profiles {
 // Load nf-core custom profiles from different Institutions
 includeConfig !System.getenv('NXF_OFFLINE') && params.custom_config_base ? "${params.custom_config_base}/nfcore_custom.config" : "/dev/null"
 
+
+// Nextflow plugins
+plugins {
+    id 'nf-gpt@0.4.0' // Provides gptPromptForText, which the screening workflow includes from 'plugin/nf-gpt'
+}
+
 // Load nf-core/crisprseq custom profiles from different institutions.
 includeConfig !System.getenv('NXF_OFFLINE') && params.custom_config_base ? "${params.custom_config_base}/pipeline/crisprseq.config" : "/dev/null"
 
@@ -218,6 +247,7 @@ podman.registry       = 'quay.io'
 singularity.registry  = 'quay.io'
 charliecloud.registry = 'quay.io'
 
+
 // Load igenomes.config if required
 includeConfig !params.igenomes_ignore ? 'conf/igenomes.config' : 'conf/igenomes_ignored.config'
 

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -256,6 +256,10 @@
                     "description": "Essential genes to remove from the drugZ modules",
                     "pattern": "\\\\S+"
                 },
+                "gpt_interpretation": {
+                    "type": "string",
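+                    "description": "Comma-separated list of analyses whose results are interpreted with the nf-gpt plugin (drugz, mle, bagel2, rra). Leave unset to skip GPT interpretation."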
+                },
                 "hitselection": {
                     "type": "boolean",
                     "description": "Specify to run the Hitselection algorithm"
@@ -448,6 +452,54 @@
                     "hidden": true
                 }
             }
+        },
+        "nf_gpt_parameters": {
+            "title": "nf_gpt_parameters",
+            "type": "object",
+            "description": "Parameters controlling the queries sent through the nf-gpt plugin.",
+            "default": "",
+            "properties": {
+                "gpt_drugz_gene_amount": {
+                    "type": "integer",
+                    "default": 400,
+                    "description": "Number of top genes to be selected from drugZ."
+                },
+                "gpt_drugz_question": {
+                    "type": "string",
+                    "default": "Which of the following genes have historically shown records of enhancing drug activitys upon CRISPR knockout? For each positive gene write a paragraph explaining the findings in detail.",
+                    "description": "Question sent to GPT together with the selected drugZ genes."
+                },
+                "gpt_mle_gene_amount": {
+                    "type": "integer",
+                    "default": 400,
+                    "description": "Number of top genes to be selected from MAGeCK mle."
+                },
+                "gpt_mle_question": {
+                    "type": "string",
+                    "default": "Which of the following genes have historically shown significance in the context of a CRISPR knockout? For each positive gene write a paragraph explaining the findings in detail.",
+                    "description": "Question sent to GPT together with the selected MAGeCK mle genes."
+                },
+                "gpt_bagel2_gene_amount": {
+                    "type": "integer",
+                    "default": 400,
+                    "description": "Number of top genes to be selected from bagel2."
+                },
+                "gpt_bagel2_question": {
+                    "type": "string",
+                    "default": "Which of the following genes have historically shown significance in the context of a CRISPR knockout? For each positive gene write a paragraph explaining the findings in detail.",
+                    "description": "Question sent to GPT together with the selected bagel2 genes."
+                },
+                "gpt_rra_gene_amount": {
+                    "type": "integer",
+                    "default": 400,
+                    "description": "Number of top genes to be selected from MAGeCK rra."
+                },
+                "gpt_rra_question": {
+                    "type": "string",
+                    "default": "Which of the following genes have historically shown significance in the context of a CRISPR knockout? For each positive gene write a paragraph explaining the findings in detail.",
+                    "description": "Question sent to GPT together with the selected MAGeCK rra genes."
+                }
+            }
         }
     },
     "allOf": [
@@ -478,6 +529,9 @@
         {
             "$ref": "#/$defs/institutional_config_options"
         },
+        {
+            "$ref": "#/$defs/nf_gpt_parameters"
+        },
         {
             "$ref": "#/$defs/generic_options"
         }

diff --git a/templates/generateGptQuery.py b/templates/generateGptQuery.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
+"""Build the GPT query file for one analysis tool.
+
+Reads a tab-separated table with a header row, ranks the genes (first column)
+by the value in the requested column, and writes the question followed by the
+top gene IDs, one per line, to gpt_<source>_query.txt.
+"""
+
+import math
+import sys
+
+# Process inputs, substituted by Nextflow before the script runs
+data_path = "${data}"
+source = "${source}"
+target_index = int("${index}")
+num_genes = int("${count}")
+mode = "${mode}"
+# NOTE(review): a double quote inside the question would break this literal
+question = "${question}"
+
+# Fail fast on an unknown mode instead of hitting an undefined variable later
+if mode not in ("low", "high"):
+    sys.exit("Error: Please provide either 'low' or 'high' as mode.")
+
+scored_genes = []
+skipped_rows = 0
+with open(data_path, "r") as file:
+    # The header is only used to check that the requested column exists
+    header = file.readline().strip().split("\t")
+    if target_index >= len(header) or target_index < 0:
+        sys.exit(f"Error: The specified column index {target_index} is out of range!")
+
+    for line in file:
+        row = line.strip().split("\t")
+        try:
+            value = float(row[target_index])
+        except (IndexError, ValueError):
+            # Blank line, short row or non-numeric value: cannot be ranked
+            skipped_rows += 1
+            continue
+        if math.isnan(value):
+            # NaN has no defined sort position, so leave the gene out
+            skipped_rows += 1
+            continue
+        scored_genes.append((row[0], value))  # first column is assumed to be the gene ID
+
+if skipped_rows:
+    print(f"Warning: skipped {skipped_rows} row(s) without a usable value in column {target_index}.", file=sys.stderr)
+
+# "low" ranks ascending, "high" descending; ties keep their file order
+scored_genes.sort(key=lambda entry: entry[1], reverse=(mode == "high"))
+top_gene_ids = [gene_id for gene_id, _ in scored_genes[:num_genes]]
+
+# Question first, then one gene ID per line
+with open(f"gpt_{source}_query.txt", "w") as query_file:
+    print(question, file=query_file)
+    for gene_id in top_gene_ids:
+        print(gene_id, file=query_file)
diff --git a/workflows/crisprseq_screening.nf b/workflows/crisprseq_screening.nf
@@ -18,8 +18,13 @@ include { MAGECK_FLUTEMLE                              } from '../modules/local/
 include { MAGECK_FLUTEMLE as MAGECK_FLUTEMLE_CONTRASTS } from '../modules/local/mageck/flutemle'
 include { MAGECK_FLUTEMLE as MAGECK_FLUTEMLE_DAY0      } from '../modules/local/mageck/flutemle'
 include { VENNDIAGRAM                                  } from '../modules/local/venndiagram'
+include { GPT_PREPARE_QUERY as GPT_PREPARE_BAGEL2_QUERY} from '../modules/local/gpt_prepare_query'
+include { GPT_PREPARE_QUERY as GPT_PREPARE_DRUGZ_QUERY } from '../modules/local/gpt_prepare_query'
+include { GPT_PREPARE_QUERY as GPT_PREPARE_MLE_QUERY   } from '../modules/local/gpt_prepare_query'
+include { GPT_PREPARE_QUERY as GPT_PREPARE_RRA_QUERY   } from '../modules/local/gpt_prepare_query'
 include { VENNDIAGRAM as VENNDIAGRAM_DRUGZ             } from '../modules/local/venndiagram'
 
+
 // nf-core modules
 include { FASTQC                                       } from '../modules/nf-core/fastqc/main'
 include { CUTADAPT as CUTADAPT_THREE_PRIME             } from '../modules/nf-core/cutadapt/main'
@@ -37,10 +42,12 @@ include { BOWTIE2_ALIGN                                } from '../modules/nf-cor
 // Local subworkflows
 include { INITIALISATION_CHANNEL_CREATION_SCREENING    } from '../subworkflows/local/utils_nfcore_crisprseq_pipeline'
 // Functions
+include { gptPromptForText                             } from 'plugin/nf-gpt'
 include { paramsSummaryMap       } from 'plugin/nf-schema'
 include { paramsSummaryMultiqc   } from '../subworkflows/nf-core/utils_nfcore_pipeline'
 include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline'
 include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_crisprseq_pipeline'
+
 include { validateParametersScreening                  } from '../subworkflows/local/utils_nfcore_crisprseq_pipeline'
 include { DRUGZ                                        } from '../modules/local/drugz'
 
@@ -360,6 +372,110 @@ workflow CRISPRSEQ_SCREENING {
     }
 
     //
+    // Calling of nf-gpt plugin on drugZ, MAGeCK mle, bagel2 or MAGeCK rra results
+    //
+
+    // Run only when 'drugz' is listed in --gpt_interpretation; entries are trimmed so "drugz, mle" also works
+    if(params.gpt_interpretation && params.gpt_interpretation.toString().tokenize(',').collect { tool -> tool.trim() }.contains('drugz')) {
+        if(params.drugz) {
+            // Build the query from the drugZ per-gene results: rank by column index 5 (0-based),
+            // lowest values first, and keep the configured number of genes
+            GPT_PREPARE_DRUGZ_QUERY(
+                DRUGZ.out.per_gene_results.map { meta, genes -> genes },
+                "drugZ",
+                5,
+                params.gpt_drugz_gene_amount,
+                "low",
+                params.gpt_drugz_question
+            )
+
+            // Send the query text to GPT and store the answer under <outdir>/gpt.
+            // NOTE(review): only the first collected query is sent; confirm this is intended when several result files exist
+            GPT_PREPARE_DRUGZ_QUERY.out.query
+                .map { query_file -> query_file.text }
+                .collect()
+                .flatMap { queries -> gptPromptForText(queries[0]) }
+                .collectFile( name: "${params.outdir}/gpt/gpt_drugz_output.txt", newLine: true, sort: false )
+        } else {
+            error "You specified DrugZ for gpt interpretation, but DrugZ is not running."
+        }
+    }
+    // Run only when 'mle' is listed in --gpt_interpretation; entries are trimmed so "drugz, mle" also works
+    if(params.gpt_interpretation && params.gpt_interpretation.toString().tokenize(',').collect { tool -> tool.trim() }.contains('mle')) {
+        if(params.mle) {
+            // Build the query from the MAGeCK MLE gene summary: rank by column index 2 (0-based),
+            // highest values first, and keep the configured number of genes
+            GPT_PREPARE_MLE_QUERY(
+                MAGECK_MLE.out.gene_summary.map { meta, genes -> genes },
+                "mle",
+                2,
+                params.gpt_mle_gene_amount,
+                "high",
+                params.gpt_mle_question
+            )
+
+            // Send the query text to GPT and store the answer under <outdir>/gpt.
+            // NOTE(review): only the first collected query is sent; confirm this is intended when several result files exist
+            GPT_PREPARE_MLE_QUERY.out.query
+                .map { query_file -> query_file.text }
+                .collect()
+                .flatMap { queries -> gptPromptForText(queries[0]) }
+                .collectFile( name: "${params.outdir}/gpt/gpt_mle_output.txt", newLine: true, sort: false )
+        } else {
+            error "You specified MAGeCK MLE for gpt interpretation, but MAGeCK MLE is not running."
+        }
+    }
+    // Run only when 'bagel2' is listed in --gpt_interpretation; entries are trimmed so "mle, bagel2" also works
+    if(params.gpt_interpretation && params.gpt_interpretation.toString().tokenize(',').collect { tool -> tool.trim() }.contains('bagel2')) {
+        if(params.bagel2) {
+            // Build the query from the BAGEL2 BF output: rank by column index 1 (0-based),
+            // highest values first, and keep the configured number of genes
+            GPT_PREPARE_BAGEL2_QUERY(
+                BAGEL2_BF.out.bf.map { meta, genes -> genes },
+                "bagel2",
+                1,
+                params.gpt_bagel2_gene_amount,
+                "high",
+                params.gpt_bagel2_question
+            )
+
+            // Send the query text to GPT and store the answer under <outdir>/gpt.
+            // NOTE(review): only the first collected query is sent; confirm this is intended when several result files exist
+            GPT_PREPARE_BAGEL2_QUERY.out.query
+                .map { query_file -> query_file.text }
+                .collect()
+                .flatMap { queries -> gptPromptForText(queries[0]) }
+                .collectFile( name: "${params.outdir}/gpt/gpt_bagel2_output.txt", newLine: true, sort: false )
+        } else {
+            error "You specified BAGEL2 for gpt interpretation, but BAGEL2 is not running."
+        }
+    }
+    // Run only when 'rra' is listed in --gpt_interpretation; entries are trimmed so "mle, rra" also works
+    if(params.gpt_interpretation && params.gpt_interpretation.toString().tokenize(',').collect { tool -> tool.trim() }.contains('rra')) {
+        if(params.rra) {
+            // Build the query from the MAGeCK test (RRA) gene summary: rank by column index 5 (0-based),
+            // lowest values first, and keep the configured number of genes
+            GPT_PREPARE_RRA_QUERY(
+                MAGECK_TEST.out.gene_summary.map { meta, genes -> genes },
+                "rra",
+                5,
+                params.gpt_rra_gene_amount,
+                "low",
+                params.gpt_rra_question
+            )
+
+            // Send the query text to GPT and store the answer under <outdir>/gpt.
+            // NOTE(review): only the first collected query is sent; confirm this is intended when several result files exist
+            GPT_PREPARE_RRA_QUERY.out.query
+                .map { query_file -> query_file.text }
+                .collect()
+                .flatMap { queries -> gptPromptForText(queries[0]) }
+                .collectFile( name: "${params.outdir}/gpt/gpt_rra_output.txt", newLine: true, sort: false )
+        } else {
+            error "You specified MAGeCK RRA for gpt interpretation, but MAGeCK RRA is not running."
+        }
+    }
+
     // Venn diagrams
     //