Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implementation of nf-gpt into crisprseq pipeline. #193

Open
wants to merge 22 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
1c830ed
added nf-gpt plugin to nextflow.config
Sep 2, 2024
96074ea
introduced new boolean parameter gpt_interpretation that controls nf-…
Sep 3, 2024
c34bcc9
changed min nxf version to 24.03.0-edge
Sep 4, 2024
e30179b
added nf-gpt parameters to nextflow.config
Sep 4, 2024
6211a12
gpt plugin can now be parsed with individual data from drugZ, bagel2 …
Sep 5, 2024
fbc2de6
gpt questions can now be specified in custom config.
Sep 8, 2024
da32fa1
Data parser refactoring. Now 1 parser can handle data from different …
Sep 9, 2024
2d20407
merge gpt changes to dev branch
Sep 11, 2024
a936cfb
adjusted some formating
Sep 11, 2024
78c2c74
Added rra support. Also, gpt interpretation now only works when modul…
Sep 11, 2024
615618f
grouped nf-gpt related parameters in nextflow_schema.json.
Sep 12, 2024
295045c
added more checks before nf-gpt calling
Sep 17, 2024
e8c2b56
Merge branch 'dev' into dev
LaurenceKuhl Sep 18, 2024
05c4d56
Merge branch 'dev' into dev
LaurenceKuhl Sep 19, 2024
0aee7db
Parsing process now selects column by index not by name. Adjusted max…
Sep 29, 2024
cb0442b
Merge branch 'dev' of github.com:LeonHornich/crisprseq into dev
Sep 29, 2024
f4ce2ec
Updated schema json. Ran code formater.
Sep 29, 2024
eecb6f4
updated default gpt question for each module, removing request for re…
Oct 5, 2024
71b745e
updated schema file with new default values.
Oct 5, 2024
c55bec1
Merge branch 'dev' into dev
LaurenceKuhl Oct 18, 2024
e37c4d1
Merge branch 'dev' into dev
LaurenceKuhl Nov 11, 2024
4e2b551
Update nextflow.config
LaurenceKuhl Nov 11, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions modules/local/gpt_prepare_query.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Builds the plain-text query that is later sent to the nf-gpt plugin:
// the question on the first line, followed by one top-ranked gene ID per line.
// The actual work is done by the Python template 'generateGptQuery.py'.
// NOTE(review): no conda/container/label directives are declared here, so the
// task relies on a Python interpreter being available in the execution
// environment - confirm this is intended.
process GPT_PREPARE_QUERY {
input:
// Tab-separated results table with a header row; gene ID in the first column
path data
// Label of the analysis the data comes from; only used in the output file name
val source
// Zero-based index of the column holding the value to rank genes by
val index
// Number of top-ranked genes to write into the query
val count
// Ranking direction: "low" (smallest values first) or "high" (largest values first)
val mode
// Question written as the first line of the query file
val question

output:
path "gpt_${source}_query.txt", emit: query

script:
template 'generateGptQuery.py'
}
30 changes: 30 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,17 @@ params {
overrepresented = false
umi_clustering = false
skip_clonality = false
gpt_interpretation = null

// Nf-gpt parameters
gpt_drugz_gene_amount = 400
gpt_drugz_question = "Which of the following genes have historically shown records of enhancing drug activitys upon CRISPR knockout? For each positive gene write a paragraph explaining the findings in detail."
gpt_mle_gene_amount = 400
gpt_mle_question = "Which of the following genes have historically shown significance in the context of a CRISPR knockout? For each positive gene write a paragraph explaining the findings in detail."
gpt_bagel2_gene_amount = 400
gpt_bagel2_question = "Which of the following genes have historically shown significance in the context of a CRISPR knockout? For each positive gene write a paragraph explaining the findings in detail."
gpt_rra_gene_amount = 400
gpt_rra_question = "Which of the following genes have historically shown significance in the context of a CRISPR knockout? For each positive gene write a paragraph explaining the findings in detail."

// UMI parameters
umi_bin_size = 1
Expand Down Expand Up @@ -89,6 +100,18 @@ params {
validate_params = true
}

// nf-gpt plugin settings
gpt {
// API key used by the nf-gpt plugin. It is null by default, so the user must
// supply a working key before enabling gpt interpretation.
// NOTE(review): avoid committing a real key to a shared config; presumably it
// can be provided through a private config or environment instead - confirm
// against the nf-gpt documentation.
apiKey = null
// Model to query. Models available in nf-gpt 0.4.0: gpt-4o, gpt-4o-mini, gpt-4-turbo, gpt-4 and gpt-3.5-turbo
model = "gpt-4o-mini"
// Maximum number of tokens. The permitted value depends on the selected model,
// see https://platform.openai.com/docs/models - lower it when switching models.
maxTokens = 15000
// Sampling temperature
temperature = 0.7
}

// Load base.config by default for all pipelines
includeConfig 'conf/base.config'

Expand Down Expand Up @@ -206,6 +229,12 @@ profiles {
// Load nf-core custom profiles from different Institutions
includeConfig !System.getenv('NXF_OFFLINE') && params.custom_config_base ? "${params.custom_config_base}/nfcore_custom.config" : "/dev/null"


// Nextflow plugins
// Plugins are pinned to an exact version ('<name>@<version>') for reproducibility.
plugins {
id 'nf-gpt@0.4.0' // Allows access to nf-gpt functionality (gptPromptForText)
}

// Load nf-core/crisprseq custom profiles from different institutions.
includeConfig !System.getenv('NXF_OFFLINE') && params.custom_config_base ? "${params.custom_config_base}/pipeline/crisprseq.config" : "/dev/null"

Expand All @@ -218,6 +247,7 @@ podman.registry = 'quay.io'
singularity.registry = 'quay.io'
charliecloud.registry = 'quay.io'


// Load igenomes.config if required
includeConfig !params.igenomes_ignore ? 'conf/igenomes.config' : 'conf/igenomes_ignored.config'

Expand Down
58 changes: 58 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,10 @@
"description": "Essential genes to remove from the drugZ modules",
"pattern": "\\\\S+"
},
"gpt_interpretation": {
"type": "string",
"description": "Determines whether or not to run nf-gpt plugin"
},
"hitselection": {
"type": "boolean",
"description": "Specify to run the Hitselection algorithm"
Expand Down Expand Up @@ -448,6 +452,53 @@
"hidden": true
}
}
},
"nf_gpt_parameters": {
"title": "nf_gpt_parameters",
"type": "object",
"description": "Contains parameters to control nf-gpt plugin calling.",
"default": "",
"properties": {
"gpt_drugz_gene_amount": {
"type": "integer",
"default": 400,
"description": "Number of top genes to be selected from drugZ."
},
"gpt_drugz_question": {
"type": "string",
"default": "Which of the following genes have historically shown records of enhancing drug activitys upon CRISPR knockout? For each positive gene write a paragraph explaining the findings in detail.",
"description": "Question parsed with drugZ data to gpt."
},
"gpt_mle_gene_amount": {
"type": "integer",
"default": 400,
"description": "Number of top genes to be selected from mle."
},
"gpt_mle_question": {
"type": "string",
"default": "Which of the following genes have historically shown significance in the context of a CRISPR knockout? For each positive gene write a paragraph explaining the findings in detail.",
"description": "Question parsed with MAGeCK mle data to gpt."
},
"gpt_bagel2_gene_amount": {
"type": "integer",
"default": 400,
"description": "Number of top genes to be selected from bagel2."
},
"gpt_bagel2_question": {
"type": "string",
"default": "Which of the following genes have historically shown significance in the context of a CRISPR knockout? For each positive gene write a paragraph explaining the findings in detail."
},
"gpt_rra_gene_amount": {
"type": "integer",
"default": 400,
"description": "Number of top genes to be selected from rra."
},
"gpt_rra_question": {
"type": "string",
"default": "Which of the following genes have historically shown significance in the context of a CRISPR knockout? For each positive gene write a paragraph explaining the findings in detail.",
"description": "Question parsed with rra data to gpt."
}
}
}
},
"allOf": [
Expand Down Expand Up @@ -478,6 +529,13 @@
{
"$ref": "#/$defs/institutional_config_options"
},
{
"$ref": "#/$defs/nf_gpt_parameters"
},
{
"$ref": "#/$defs/generic_options"
}
Expand Down
47 changes: 47 additions & 0 deletions templates/generateGptQuery.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/env python

# Nextflow template script: builds the text query that is sent to nf-gpt.
#
# It reads a tab-separated results table (header row first, gene ID in the
# first column), ranks the genes by the value found in one column, and writes
# the question followed by the top-ranked gene IDs (one per line) to
# "gpt_<source>_query.txt" in the task work directory.
#
# NOTE: this file is rendered by Nextflow before Python sees it, so dollar
# signs and backslash escapes are interpreted by the template engine first.
# That is why line breaks are written as triple-quoted strings below.

import sys

# Define process input variables (filled in by Nextflow)
data_path = "${data}"
source = "${source}"
target_index = int("${index}")
num_genes = int("${count}")
mode = "${mode}"
question = "${question}"

# Fail fast on an invalid mode instead of crashing later on an undefined variable
if mode not in ("low", "high"):
    sys.exit("Error: Please provide either 'low' or 'high' as mode.")

# List of (gene_id, value) tuples collected from the data file
data = []

# Open data file
with open(data_path, "r") as file:
    # The header row is only needed to know how many columns are available
    header = file.readline().strip().split("\t")

    # Ensure target_index is within the bounds of available columns;
    # abort with a non-zero exit status so the Nextflow task is marked as failed
    if target_index < 0 or target_index >= len(header):
        sys.exit(f"Error: The specified column index {target_index} is out of range!")

    # Prepare data file's rows
    for line in file:
        stripped = line.strip()
        if not stripped:
            # Skip blank lines (e.g. a trailing empty line at the end of the file)
            continue
        row = stripped.split("\t")
        gene_id = row[0]  # Assume the first column is the gene ID
        value = float(row[target_index])  # Extract value using the provided index
        data.append((gene_id, value))

# Sort the data based on provided mode: "low" puts the smallest values first,
# "high" the largest. The sort is stable, so ties keep their file order.
sorted_data = sorted(data, key=lambda entry: entry[1], reverse=(mode == "high"))

# Extract num_genes many top genes
top_gene_ids = [gene_id for gene_id, _ in sorted_data[:num_genes]]

# Write everything into an output file: question first, then one gene ID per line
with open(f"gpt_{source}_query.txt", "w") as query_file:
    query_file.write(question + """\n""")
    for gene_id in top_gene_ids:
        query_file.write(gene_id + """\n""")
116 changes: 116 additions & 0 deletions workflows/crisprseq_screening.nf
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,13 @@ include { MAGECK_FLUTEMLE } from '../modules/local/
include { MAGECK_FLUTEMLE as MAGECK_FLUTEMLE_CONTRASTS } from '../modules/local/mageck/flutemle'
include { MAGECK_FLUTEMLE as MAGECK_FLUTEMLE_DAY0 } from '../modules/local/mageck/flutemle'
include { VENNDIAGRAM } from '../modules/local/venndiagram'
include { GPT_PREPARE_QUERY as GPT_PREPARE_BAGEL2_QUERY} from '../modules/local/gpt_prepare_query'
include { GPT_PREPARE_QUERY as GPT_PREPARE_DRUGZ_QUERY } from '../modules/local/gpt_prepare_query'
include { GPT_PREPARE_QUERY as GPT_PREPARE_MLE_QUERY } from '../modules/local/gpt_prepare_query'
include { GPT_PREPARE_QUERY as GPT_PREPARE_RRA_QUERY } from '../modules/local/gpt_prepare_query'
include { VENNDIAGRAM as VENNDIAGRAM_DRUGZ } from '../modules/local/venndiagram'


// nf-core modules
include { FASTQC } from '../modules/nf-core/fastqc/main'
include { CUTADAPT as CUTADAPT_THREE_PRIME } from '../modules/nf-core/cutadapt/main'
Expand All @@ -37,10 +42,17 @@ include { BOWTIE2_ALIGN } from '../modules/nf-cor
// Local subworkflows
include { INITIALISATION_CHANNEL_CREATION_SCREENING } from '../subworkflows/local/utils_nfcore_crisprseq_pipeline'
// Functions

// Each function is included exactly once. paramsSummaryMap is taken from
// nf-schema, matching the '$defs'-style nextflow_schema.json used by the pipeline.
include { gptPromptForText } from 'plugin/nf-gpt'
include { paramsSummaryMap } from 'plugin/nf-schema'
include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline'
include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline'
include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_crisprseq_pipeline'

include { validateParametersScreening } from '../subworkflows/local/utils_nfcore_crisprseq_pipeline'
include { DRUGZ } from '../modules/local/drugz'

Expand Down Expand Up @@ -360,6 +372,110 @@ workflow CRISPRSEQ_SCREENING {
}

//

// Calling of nf-gpt plugin on drugZ, MAGeCK mle, bagel2 or MAGeCK rra
//
if(params.gpt_interpretation && params.gpt_interpretation.split(',').contains('drugz')) {
    // gpt interpretation of drugZ results is only possible when drugZ itself runs
    if(!params.drugz) {
        error "You specified DrugZ for gpt interpretation, but DrugZ is not running."
    }

    // Drop the meta map, keep only the per-gene results file
    def drugz_gene_table = DRUGZ.out.per_gene_results.map { meta, genes -> genes }

    GPT_PREPARE_DRUGZ_QUERY(
        drugz_gene_table,
        "drugZ",                        // source label, used in the query file name
        5,                              // zero-based index of the column to rank by
        params.gpt_drugz_gene_amount,   // number of top genes to include
        "low",                          // smallest values are ranked first
        params.gpt_drugz_question
    )

    // Read the query text, send the first collected query to nf-gpt and store the answer
    GPT_PREPARE_DRUGZ_QUERY.out.query
        .map { query_file -> query_file.text }
        .collect()
        .flatMap { queries -> gptPromptForText(queries[0]) }
        .collectFile( name: "${params.outdir}/gpt/gpt_drugz_output.txt", newLine: true, sort: false )
}
if(params.gpt_interpretation && params.gpt_interpretation.split(',').contains('mle')) {
    // gpt interpretation of MAGeCK MLE results is only possible when MLE itself runs
    if(!params.mle) {
        error "You specified MAGeCK MLE for gpt interpretation, but MAGeCK MLE is not running."
    }

    // Drop the meta map, keep only the gene summary file
    def mle_gene_table = MAGECK_MLE.out.gene_summary.map { meta, genes -> genes }

    GPT_PREPARE_MLE_QUERY(
        mle_gene_table,
        "mle",                        // source label, used in the query file name
        2,                            // zero-based index of the column to rank by
        params.gpt_mle_gene_amount,   // number of top genes to include
        "high",                       // largest values are ranked first
        params.gpt_mle_question
    )

    // Read the query text, send the first collected query to nf-gpt and store the answer
    GPT_PREPARE_MLE_QUERY.out.query
        .map { query_file -> query_file.text }
        .collect()
        .flatMap { queries -> gptPromptForText(queries[0]) }
        .collectFile( name: "${params.outdir}/gpt/gpt_mle_output.txt", newLine: true, sort: false )
}
if(params.gpt_interpretation && params.gpt_interpretation.split(',').contains('bagel2')) {
    // gpt interpretation of BAGEL2 results is only possible when BAGEL2 itself runs
    if(!params.bagel2) {
        error "You specified BAGEL2 for gpt interpretation, but BAGEL2 is not running."
    }

    // Drop the meta map, keep only the Bayes factor file
    def bagel2_gene_table = BAGEL2_BF.out.bf.map { meta, genes -> genes }

    GPT_PREPARE_BAGEL2_QUERY(
        bagel2_gene_table,
        "bagel2",                        // source label, used in the query file name
        1,                               // zero-based index of the column to rank by
        params.gpt_bagel2_gene_amount,   // number of top genes to include
        "high",                          // largest values are ranked first
        params.gpt_bagel2_question
    )

    // Read the query text, send the first collected query to nf-gpt and store the answer
    GPT_PREPARE_BAGEL2_QUERY.out.query
        .map { query_file -> query_file.text }
        .collect()
        .flatMap { queries -> gptPromptForText(queries[0]) }
        .collectFile( name: "${params.outdir}/gpt/gpt_bagel2_output.txt", newLine: true, sort: false )
}
if(params.gpt_interpretation && params.gpt_interpretation.split(',').contains('rra')) {
    // gpt interpretation of MAGeCK RRA results is only possible when RRA itself runs
    if(!params.rra) {
        error "You specified MAGeCK RRA for gpt interpretation, but MAGeCK RRA is not running."
    }

    // Drop the meta map, keep only the gene summary file
    def rra_gene_table = MAGECK_TEST.out.gene_summary.map { meta, genes -> genes }

    GPT_PREPARE_RRA_QUERY(
        rra_gene_table,
        "rra",                        // source label, used in the query file name
        5,                            // zero-based index of the column to rank by
        params.gpt_rra_gene_amount,   // number of top genes to include
        "low",                        // smallest values are ranked first
        params.gpt_rra_question
    )

    // Read the query text, send the first collected query to nf-gpt and store the answer
    GPT_PREPARE_RRA_QUERY.out.query
        .map { query_file -> query_file.text }
        .collect()
        .flatMap { queries -> gptPromptForText(queries[0]) }
        .collectFile( name: "${params.outdir}/gpt/gpt_rra_output.txt", newLine: true, sort: false )
}

// Venn diagrams
//

Expand Down
Loading