From 126cc01513838711a72df00254a3dd0dd66e312f Mon Sep 17 00:00:00 2001 From: Feng Yu Date: Fri, 12 Jan 2024 16:28:19 -0800 Subject: [PATCH 01/15] Added bash script for processing CoDNaS database --- .../1BZF_A.hclusterRMSD.txt | 188 ++++++++++++++++++ .../CoDNaS_FOXS_RAW_single_conf/README | 35 ++++ .../CoDNaS_FOXS_RAW_single_conf/SAXS_to_pr.py | 38 ++++ .../batch_download_seq_new.sh | 93 +++++++++ .../CoDNaS_FOXS_RAW_single_conf/full_clean.sh | 78 ++++++++ 5 files changed, 432 insertions(+) create mode 100644 Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/1BZF_A.hclusterRMSD.txt create mode 100644 Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/README create mode 100644 Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/SAXS_to_pr.py create mode 100755 Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/batch_download_seq_new.sh create mode 100644 Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/full_clean.sh diff --git a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/1BZF_A.hclusterRMSD.txt b/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/1BZF_A.hclusterRMSD.txt new file mode 100644 index 0000000..9e12eed --- /dev/null +++ b/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/1BZF_A.hclusterRMSD.txt @@ -0,0 +1,188 @@ +PDB cluster +1BZF-1_A 1 +1BZF-2_A 1 +1BZF-3_A 1 +1BZF-4_A 1 +1BZF-5_A 1 +1BZF-6_A 1 +1BZF-7_A 1 +1BZF-8_A 1 +1BZF-9_A 1 +1BZF-10_A 1 +1BZF-11_A 1 +1BZF-12_A 1 +1BZF-13_A 1 +1BZF-14_A 1 +1BZF-15_A 1 +1BZF-16_A 1 +1BZF-17_A 1 +1BZF-18_A 1 +1BZF-19_A 1 +1BZF-20_A 1 +1BZF-21_A 1 +1BZF-22_A 1 +1DIU-1_A 2 +1DIU-2_A 2 +1DIU-3_A 2 +1DIU-4_A 2 +1DIU-5_A 2 +1DIU-6_A 2 +1DIU-7_A 2 +1DIU-8_A 2 +1DIU-9_A 2 +1DIU-10_A 2 +1DIU-11_A 2 +1DIU-12_A 2 +1DIU-13_A 2 +1DIU-14_A 2 +1DIU-15_A 2 +1DIU-16_A 2 +1DIU-17_A 2 +1DIU-18_A 2 +1DIS_A 2 +3DFR_A 2 +1LUD-1_A 3 +1LUD-2_A 3 +1LUD-3_A 3 +1LUD-4_A 3 +1LUD-5_A 3 +1LUD-6_A 3 +1LUD-7_A 3 +1LUD-8_A 3 +1LUD-9_A 3 +1LUD-10_A 3 +1LUD-11_A 3 +1LUD-12_A 3 +1LUD-13_A 3 +1LUD-14_A 3 +1LUD-15_A 3 +1LUD-16_A 3 +1LUD-17_A 3 +1LUD-18_A 3 +1LUD-19_A 3 +1LUD-20_A 3 +1LUD-21_A 3 +1LUD-22_A 3 +1LUD-23_A 3 +1LUD-24_A 3 +2L28-1_A 4 +2L28-8_A 4 +2L28-16_A 4 +2L28-2_A 5 +2L28-10_A 5 +2L28-13_A 5 +2L28-3_A 6 +2L28-4_A 7 +2L28-5_A 8 +2L28-24_A 8 +2L28-6_A 9 +2L28-18_A 9 +2L28-7_A 10 +2L28-12_A 10 +2L28-9_A 11 +2L28-11_A 12 +2L28-20_A 12 +2L28-14_A 13 +2L28-15_A 14 +2L28-17_A 15 +2L28-19_A 16 +2L28-21_A 17 +2L28-22_A 18 +2L28-23_A 19 +2L28-25_A 20 +1AO8-1_A 21 +1AO8-2_A 21 +1AO8-3_A 21 +1AO8-4_A 21 +1AO8-5_A 21 +1AO8-6_A 21 +1AO8-7_A 21 +1AO8-8_A 21 +1AO8-9_A 21 +1AO8-10_A 21 +1AO8-11_A 21 +1AO8-12_A 21 +1AO8-13_A 21 +1AO8-14_A 21 +1AO8-15_A 21 +1AO8-16_A 21 +1AO8-17_A 21 +1AO8-18_A 21 +1AO8-19_A 21 +1AO8-20_A 21 +1AO8-21_A 21 +2HQP-1_A 22 +2HQP-2_A 22 +2HQP-3_A 22 +2HQP-8_A 22 +2HQP-11_A 22 +2HQP-13_A 22 +2HQP-16_A 22 +2HQP-18_A 22 +2HQP-19_A 22 +2HQP-21_A 22 +2HQP-25_A 22 +2HQP-4_A 23 +2HQP-7_A 23 +2HQP-9_A 23 +2HQP-14_A 23 +2HQP-17_A 23 +2HQP-22_A 23 +2HQP-5_A 24 +2HQP-10_A 24 +2HQP-15_A 24 +2HQP-23_A 24 +2HQP-6_A 25 +2HQP-20_A 25 +2HQP-12_A 26 +2HQP-24_A 26 +2LF1-1_A 27 +2LF1-2_A 27 +2LF1-3_A 27 +2LF1-4_A 27 +2LF1-5_A 27 +2LF1-6_A 27 +2LF1-7_A 27 +2LF1-8_A 27 +2LF1-9_A 27 +2LF1-10_A 27 +2LF1-11_A 27 +2LF1-12_A 27 +2LF1-13_A 27 +2LF1-14_A 27 +2LF1-15_A 27 +2LF1-16_A 27 +2LF1-17_A 27 +2LF1-18_A 27 +2LF1-19_A 27 +2LF1-20_A 27 +2LF1-21_A 27 +2LF1-22_A 27 +2LF1-23_A 27 +2LF1-24_A 27 +2LF1-25_A 27 +2HM9-1_A 28 +2HM9-2_A 28 +2HM9-3_A 28 +2HM9-4_A 28 +2HM9-5_A 28 +2HM9-6_A 28 +2HM9-7_A 28 +2HM9-8_A 28 +2HM9-9_A 28 +2HM9-10_A 28 +2HM9-11_A 28 +2HM9-12_A 28 +2HM9-13_A 28 +2HM9-14_A 28 +2HM9-15_A 28 
+2HM9-16_A 28 +2HM9-17_A 28 +2HM9-18_A 28 +2HM9-19_A 28 +2HM9-20_A 28 +2HM9-21_A 28 +2HM9-22_A 28 +2HM9-23_A 28 +2HM9-24_A 28 +2HM9-25_A 28 diff --git a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/README b/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/README new file mode 100644 index 0000000..62341cf --- /dev/null +++ b/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/README @@ -0,0 +1,35 @@ +CoDNaS Auto Processing + +Description: +CoDNaS is a database comparing diverse conformation of the identical proteins. +It provides a good interface for pairwise comparison but lack of useful APIs especially for downloading files. + +The best approach for now is to manually download a tsv file containing the PDB ID of all conformers for a single protein. +Thus, this script is used to automatically download the PDB from RCSB based on the list and then covert them to SAXS curve and P(r) curve with FOXS and RAW APIs. + +Problems: +1.FOXS will add the hydration layer to the PDB and then calculated the SAXS curve. This may eliminate some difference between conformations +2.Each PDB may contain multiple conformers which is similar to each other. I am trying to extract them and generate individual SAXS curve which is still under testing. + +Files: +1BZF_A.hclusterRMSD.txt - raw data downloaded from CoDNaS database +full_clean.sh - main script +batch_download_seq_new.sh - modified RCSB script to download PDB and sequence files +SAXS_to_pr.py - convert the PDB file to SAXS and P(r) + +Dependency: +Anaconda - +Request python library - +RAW API - +FOXS - + +Usage: +bash full_clean.sh +-f : the input file containing a list of pdb files downloaded from CoDNaS +-o : the output dir + +Example +mkdir test +bash full_clean.sh -f ./1BZF_A.hclusterRMSD.txt -o ./test + +The script will create 4 folders ./test/pdb ./test/sequence ./test/saxs_q (raw SAXS) ./test/saxs_r (P(r) in csv) \ No newline at end of file diff --git a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/SAXS_to_pr.py b/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/SAXS_to_pr.py new file mode 100644 index 0000000..9fdc5ac --- /dev/null +++ b/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/SAXS_to_pr.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +""" +Author: Feng Yu. 
LBNL + +Convert the SAXS file to P(r) curves using thge RAW API + +First version created on 01/04/2024 +""" + +import bioxtasraw.RAWAPI as raw +import argparse +import pandas as pd +import os + +parser = argparse.ArgumentParser( + prog = 'SAXS2PR', + description = 'Convert the SAXS file to P(r) curves using thge RAW API' ) +parser.add_argument('-f', '--filename', required=True) +parser.add_argument('-o', '--output', default='./') + +args = parser.parse_args() + +profiles_name=args.filename + +# Load SAXS file (.dat format) +profiles = raw.load_profiles([profiles_name]) + +gi_profile=profiles[0] + +# Convert SAXS file with Inverse Fourier Transform +gi_bift = raw.bift(gi_profile)[0] + +# Save the radius and P(r) to csv file +output_pd=pd.DataFrame({'r':gi_bift.r, 'P(r)':gi_bift.p},columns=['r','P(r)']) + +output_loc=args.output +output_pd.to_csv(os.path.join(args.output,os.path.basename(profiles_name).split('.')[0]+".csv"),index=False) + diff --git a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/batch_download_seq_new.sh b/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/batch_download_seq_new.sh new file mode 100755 index 0000000..fa5688c --- /dev/null +++ b/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/batch_download_seq_new.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +# Script to download files from RCSB http file download services. +# Use the -h switch to get help on usage. + +if ! command -v curl &> /dev/null +then + echo "'curl' could not be found. You need to install 'curl' for this script to work." + exit 1 +fi + +PROGNAME=$0 +BASE_URL="https://files.rcsb.org/download" +BASE_URL_SEQ="https://www.rcsb.org/fasta/entry" + +usage() { + cat << EOF >&2 +Usage: $PROGNAME -f [-o ] [-c] [-p] + modified from RCSB script + -f : the input file containing a comma-separated list of PDB ids + -o : the output dir, default: current dir + -c : download a cif.gz file for each PDB id and store in the pdb subfolder + -p : download a pdb.gz file for each PDB id (not available for large structures) and store in the pdb subfolder + -q : download a .fasta file for each PDB id (sequence) and sotre in the sequence subfolder +EOF + exit 1 +} + +download() { + url="$BASE_URL/$1" + out=$2/$1 + echo "Downloading $url to $out" + curl -s -f $url -o $out || echo "Failed to download $url" +} + +download_seq() { + url="$BASE_URL_SEQ/$1" + out="$2/$1.fasta" + echo "Downloading $url to $out" + curl -s -f $url -o $out || echo "Failed to download $url" +} + +listfile="" +outdir="." 
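+# Defaults for the command-line options; the getopts loop below overrides them:
+#   -f  comma-separated list of PDB ids (required)    -o  output directory
+#   -c  download cif.gz    -p  download pdb.gz    -q  download the fasta sequence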
+cif=false +pdb=false +seq=false +while getopts f:o:cpq op +do + case $op in + (f) listfile=$OPTARG;; + (o) outdir=$OPTARG;; + (c) cif=true;; + (p) pdb=true;; + (q) seq=true;; + (*) usage + esac +done +shift "$((OPTIND - 1))" + +if [ "$listfile" == "" ] +then + echo "Parameter -f must be provided" + exit 1 +fi +contents=$(cat $listfile) + +# see https://stackoverflow.com/questions/918886/how-do-i-split-a-string-on-a-delimiter-in-bash#tab-top +IFS=',' read -ra tokens <<< "$contents" + +for token in "${tokens[@]}" +do + if [ "$cif" == true ] + then + download ${token}.cif.gz ${outdir}/pdb + fi + if [ "$pdb" == true ] + then + download ${token}.pdb.gz ${outdir}/pdb + fi + if [ "$seq" == true ] + then + download_seq ${token} ${outdir}/sequence + fi +done + + + + + + + + diff --git a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/full_clean.sh b/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/full_clean.sh new file mode 100644 index 0000000..7f96ffe --- /dev/null +++ b/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/full_clean.sh @@ -0,0 +1,78 @@ +#!/bin/bash + +PROGNAME=$0 + +usage() { + cat << EOF >&2 +Usage: $PROGNAME -f -o + +-f : the input file containing a list of pdb files downloaded from CoDNaS +-o : the output dir +EOF + exit 1 +} + +filename="" +outdir="" + +while getopts f:o: options +do + case $options in + (f) filename=$OPTARG;; + (o) outdir=$OPTARG;; + (*) usage + esac +done + +if [ "$filename" == "" ] +then + echo "Parameter -f must be provided" + exit 1 +fi + +if [ "$outdir" == "" ] +then + echo "Parameter -o must be provided" + exit 1 +else + echo $outdir + awk 'NR>1 { print substr($1,1,4)}' ${filename} | sort -u |tr '\n' ' ' | sed '$s/ $/\n/'| tr ' ' ',' > ${outdir}/clean_pdb.output +fi + +pdb_loc=${outdir}/pdb +seq_loc=${outdir}/sequence +saxs_loc=${outdir}/saxs_q +pr_loc=${outdir}/saxs_r + +mkdir ${pdb_loc} +echo "created PDB storage folder at ${pdb_loc}" +mkdir ${seq_loc} +echo "created sequence storage folder at ${seq_loc}" + +./batch_download_seq_new.sh -f ${outdir}/clean_pdb.output -o ${outdir} -p -q + +mkdir ${saxs_loc} +echo "created SAXS measurement storage folder at ${saxs_loc}" +mkdir ${pr_loc} +echo "created P(r) storage folder at ${pr_loc}" + +full_pdb_list=$(cat ${outdir}/clean_pdb.output) + +IFS=',' read -ra pdbs <<< "$full_pdb_list" + +for pdb in ${pdbs[@]} +do + if [ ! 
-f ${pdb_loc}/${pdb}.pdb.gz ] + then + echo no pdb file for $pdb + else + gunzip -c ${pdb_loc}/${pdb}.pdb.gz > ${pdb_loc}/${pdb}.pdb + rm ${pdb_loc}/${pdb}.pdb.gz + foxs ${pdb_loc}/${pdb}.pdb + mv ${pdb_loc}/${pdb}.pdb.dat ${saxs_loc}/${pdb}.dat + python SAXS_to_pr.py -f ${saxs_loc}/${pdb}.dat -o ${pr_loc} + fi +done + +rm ${outdir}/clean_pdb.output + From 4b5c00c39f6b4535d0d7bdacea890b4ba95f9ae4 Mon Sep 17 00:00:00 2001 From: Feng Yu Date: Thu, 25 Jan 2024 10:38:41 -0800 Subject: [PATCH 02/15] Added Slurm script for processing PDB70 dataset on NERSC The url of the PDB70 database https://zenodo.org/records/7953087 --- .DS_Store | Bin 0 -> 6148 bytes PDB70_NERSC/PDBtoSeq.py | 31 ++++++++++++++ PDB70_NERSC/fixpdb_pr_parallel_v2.bash | 57 +++++++++++++++++++++++++ PDB70_NERSC/parallel_PDB70.slurm | 12 ++++++ 4 files changed, 100 insertions(+) create mode 100644 .DS_Store create mode 100644 PDB70_NERSC/PDBtoSeq.py create mode 100644 PDB70_NERSC/fixpdb_pr_parallel_v2.bash create mode 100644 PDB70_NERSC/parallel_PDB70.slurm diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..554bcac2fac44d5bd6bc7cec4b94df7348451c54 GIT binary patch literal 6148 zcmeHKO-sW-5PjPQjCiSd@!(;9fzXrJSfXBqUc9Mkr4?IKOg##@dG_ot5PygGs|4Td zE^R_o>P3po!0g-1M|SrmWHJCu?|#??Isjs-U~7}Z7b1SqhO~l3oJQ_38Fl-Ed^&4H zs$pFjkY^WRA0u32UVna1H&x#>Pt$CYXSBsR$Iw4&FCj++J5$Z<;Sp@w?30`EV-dQ&9$# z0cBua8DPy8iMKrJs0=6r%D{pF`94^xU=pzO=sq1BTnj*qXm-N6^b(R|1114WkMvNC zPbKZlAT19b-4{<0(W|LFVv zzn-L5%78NPuNW{<+E05tQmCzs!%3~R)C;PJ#FZWwDV(@c%vdSKyHqE%TbU4(fTc%T QDEcGdXwX3!_)`Wx0SdQRl>h($ literal 0 HcmV?d00001 diff --git a/PDB70_NERSC/PDBtoSeq.py b/PDB70_NERSC/PDBtoSeq.py new file mode 100644 index 0000000..f1ffe5e --- /dev/null +++ b/PDB70_NERSC/PDBtoSeq.py @@ -0,0 +1,31 @@ +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord +import argparse +import os + +def extract_seq(pdb_input,output_path): + pdb_name=pdb_input.split(".")[0] + counter=1 + for record in SeqIO.parse(pdb_input,"pdb-atom"): + if counter > 1: + raise ValueError("More than 1 Chain is in the file {}".format(pdb_input)) + else: + new_seq_record = SeqRecord(record.seq, id=pdb_name, description='') + SeqIO.write(new_seq_record, output_path ,"fasta") + counter+=1 + +def main(): + parser = argparse.ArgumentParser( + prog = 'PDBtoSeq', + description = '''Extract Sequence from PDB using the BioPython API. 
+ The output will be stored at the current directory as a fasta file.''' ) + + parser.add_argument('-f', '--filename', required=True) + parser.add_argument('-o', '--output', default='./') + + args = parser.parse_args() + + extract_seq(args.filename,args.output) + +if __name__ == '__main__': + main() diff --git a/PDB70_NERSC/fixpdb_pr_parallel_v2.bash b/PDB70_NERSC/fixpdb_pr_parallel_v2.bash new file mode 100644 index 0000000..9d6437c --- /dev/null +++ b/PDB70_NERSC/fixpdb_pr_parallel_v2.bash @@ -0,0 +1,57 @@ +#!/bin/bash + +PROGNAME=$0 + +usage() { + cat << EOF >&2 +Usage: $PROGNAME -f -o + +-f : the input file containing a list of pdb files (can be obtained using ls > file_name.txt) +-o : the output dir +EOF + exit 1 +} + +file="" +outdir="" + +while getopts f:o: options +do + case $options in + (f) file=$OPTARG;; + (o) outdir=$OPTARG;; + (*) usage + esac +done + +if [ "$file" == "" ] +then + echo "Parameter -f must be provided" + exit 1 +fi + +if [ "$outdir" == "" ] +then + echo "Parameter -o must be provided" + exit 1 +fi + +pdb_loc=${outdir}/pdb +seq_loc=${outdir}/sequence +pr_loc=${outdir}/saxs_r + +#mkdir -p ${pdb_loc} +#echo "created PDB storage folder at ${pdb_loc}" +#mkdir -p ${seq_loc} +#echo "created sequence storage folder at ${seq_loc}" +#mkdir -p ${pr_loc} +#echo "created saxs P(r) storage folder at ${pr_loc}" + +if [ -f "$file" ]; then + echo ${file} + filename=$(basename "$file") + echo ${filename} + pdbfixer ${file} --output "${pdb_loc}/fixed_${filename}" --keep-heterogens=none --replace-nonstandard --add-residues --verbose + calc-pr "${pdb_loc}/fixed_${filename}" -o "${pr_loc}/${filename}.pr.csv" + python PDBtoSeq.py -f "${pdb_loc}/fixed_${filename}" -o "${seq_loc}/${filename}.fasta" +fi diff --git a/PDB70_NERSC/parallel_PDB70.slurm b/PDB70_NERSC/parallel_PDB70.slurm new file mode 100644 index 0000000..2890bce --- /dev/null +++ b/PDB70_NERSC/parallel_PDB70.slurm @@ -0,0 +1,12 @@ +#!/bin/bash +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=4 +#SBATCH --time=00:05:00 +#SBATCH --constraint=cpu +#SBATCH --qos=shared +#SBATCH --account=m3513 + +module load python +conda activate /pscratch/sd/l/lemonboy/PDB_prep +parallel -j 4 bash fixpdb_pr_parallel.bash -f {} -o . 
::: *.pdb From 8b073d685f94623251b127f75a2212a37d3c213b Mon Sep 17 00:00:00 2001 From: Feng Yu Date: Thu, 25 Jan 2024 10:41:43 -0800 Subject: [PATCH 03/15] Delete .DS_Store --- .DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 554bcac2fac44d5bd6bc7cec4b94df7348451c54..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKO-sW-5PjPQjCiSd@!(;9fzXrJSfXBqUc9Mkr4?IKOg##@dG_ot5PygGs|4Td zE^R_o>P3po!0g-1M|SrmWHJCu?|#??Isjs-U~7}Z7b1SqhO~l3oJQ_38Fl-Ed^&4H zs$pFjkY^WRA0u32UVna1H&x#>Pt$CYXSBsR$Iw4&FCj++J5$Z<;Sp@w?30`EV-dQ&9$# z0cBua8DPy8iMKrJs0=6r%D{pF`94^xU=pzO=sq1BTnj*qXm-N6^b(R|1114WkMvNC zPbKZlAT19b-4{<0(W|LFVv zzn-L5%78NPuNW{<+E05tQmCzs!%3~R)C;PJ#FZWwDV(@c%vdSKyHqE%TbU4(fTc%T QDEcGdXwX3!_)`Wx0SdQRl>h($ From 0fb09933d765469b774016056173a1d6abf12642 Mon Sep 17 00:00:00 2001 From: Feng Yu Date: Thu, 25 Jan 2024 13:35:46 -0800 Subject: [PATCH 04/15] Update PDB70_NERSC/parallel_PDB70.slurm Co-authored-by: Andrew Tritt --- PDB70_NERSC/parallel_PDB70.slurm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PDB70_NERSC/parallel_PDB70.slurm b/PDB70_NERSC/parallel_PDB70.slurm index 2890bce..f788014 100644 --- a/PDB70_NERSC/parallel_PDB70.slurm +++ b/PDB70_NERSC/parallel_PDB70.slurm @@ -9,4 +9,4 @@ module load python conda activate /pscratch/sd/l/lemonboy/PDB_prep -parallel -j 4 bash fixpdb_pr_parallel.bash -f {} -o . ::: *.pdb +ls *.pdb | parallel -j 4 bash fixpdb_pr_parallel.bash -f {} -o . ::: From 099284e66b39c882c1cd5ba97fe8a9b081168305 Mon Sep 17 00:00:00 2001 From: Feng Yu Date: Thu, 25 Jan 2024 15:31:03 -0800 Subject: [PATCH 05/15] Added a scripts folder to store individual analyze scripts --- .../Extract_Pr_CoDNaS}/1BZF_A.hclusterRMSD.txt | 0 .../Extract_Pr_CoDNaS/CoD_calc_main.sh | 11 ++++++++++- .../Extract_Pr_CoDNaS}/README | 2 +- .../Extract_Pr_CoDNaS}/SAXS_to_pr.py | 4 ++-- .../Extract_Pr_CoDNaS/batch_download_seq.sh | 2 +- {PDB70_NERSC => scripts/PDB70_NERSC}/PDBtoSeq.py | 0 .../PDB70_NERSC}/fixpdb_pr_parallel_v2.bash | 0 .../PDB70_NERSC}/parallel_PDB70.slurm | 3 ++- 8 files changed, 16 insertions(+), 6 deletions(-) rename {Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf => scripts/Extract_Pr_CoDNaS}/1BZF_A.hclusterRMSD.txt (100%) rename Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/full_clean.sh => scripts/Extract_Pr_CoDNaS/CoD_calc_main.sh (73%) rename {Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf => scripts/Extract_Pr_CoDNaS}/README (93%) rename {Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf => scripts/Extract_Pr_CoDNaS}/SAXS_to_pr.py (92%) rename Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/batch_download_seq_new.sh => scripts/Extract_Pr_CoDNaS/batch_download_seq.sh (98%) rename {PDB70_NERSC => scripts/PDB70_NERSC}/PDBtoSeq.py (100%) rename {PDB70_NERSC => scripts/PDB70_NERSC}/fixpdb_pr_parallel_v2.bash (100%) rename {PDB70_NERSC => scripts/PDB70_NERSC}/parallel_PDB70.slurm (92%) diff --git a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/1BZF_A.hclusterRMSD.txt b/scripts/Extract_Pr_CoDNaS/1BZF_A.hclusterRMSD.txt similarity index 100% rename from Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/1BZF_A.hclusterRMSD.txt rename to scripts/Extract_Pr_CoDNaS/1BZF_A.hclusterRMSD.txt diff --git a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/full_clean.sh b/scripts/Extract_Pr_CoDNaS/CoD_calc_main.sh similarity index 73% rename from Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/full_clean.sh rename to 
scripts/Extract_Pr_CoDNaS/CoD_calc_main.sh index 7f96ffe..3c657b2 100644 --- a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/full_clean.sh +++ b/scripts/Extract_Pr_CoDNaS/CoD_calc_main.sh @@ -1,4 +1,13 @@ #!/bin/bash +# Author: Feng Yu fyu2@lbl.gov 2024 + +# CoDNaS is a protein conformational database based of entire proteins as derived from PDB. +# For each represented protein, the database contains the redundant collection of all corresponding different structures. + +# This script is used to calculate SAXS curve for PDBs based on lists of PDBs downloading from the CoDNaS server. +# This script will generate SAXS curve, P(r) curve , extract sequences and download the PDBs for each PDB entry. + + PROGNAME=$0 @@ -49,7 +58,7 @@ echo "created PDB storage folder at ${pdb_loc}" mkdir ${seq_loc} echo "created sequence storage folder at ${seq_loc}" -./batch_download_seq_new.sh -f ${outdir}/clean_pdb.output -o ${outdir} -p -q +./batch_download_seq.sh -f ${outdir}/clean_pdb.output -o ${outdir} -p -q mkdir ${saxs_loc} echo "created SAXS measurement storage folder at ${saxs_loc}" diff --git a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/README b/scripts/Extract_Pr_CoDNaS/README similarity index 93% rename from Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/README rename to scripts/Extract_Pr_CoDNaS/README index 62341cf..10d9fe2 100644 --- a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/README +++ b/scripts/Extract_Pr_CoDNaS/README @@ -5,7 +5,7 @@ CoDNaS is a database comparing diverse conformation of the identical proteins. It provides a good interface for pairwise comparison but lack of useful APIs especially for downloading files. The best approach for now is to manually download a tsv file containing the PDB ID of all conformers for a single protein. -Thus, this script is used to automatically download the PDB from RCSB based on the list and then covert them to SAXS curve and P(r) curve with FOXS and RAW APIs. +Thus, this script is used to automatically download the PDB from RCSB based on the list and then convert them to SAXS curve and P(r) curve with FOXS and RAW APIs. Problems: 1.FOXS will add the hydration layer to the PDB and then calculated the SAXS curve. This may eliminate some difference between conformations diff --git a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/SAXS_to_pr.py b/scripts/Extract_Pr_CoDNaS/SAXS_to_pr.py similarity index 92% rename from Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/SAXS_to_pr.py rename to scripts/Extract_Pr_CoDNaS/SAXS_to_pr.py index 9fdc5ac..ac3c775 100644 --- a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/SAXS_to_pr.py +++ b/scripts/Extract_Pr_CoDNaS/SAXS_to_pr.py @@ -2,7 +2,7 @@ """ Author: Feng Yu. 
LBNL -Convert the SAXS file to P(r) curves using thge RAW API +Convert the SAXS file to P(r) curves using the RAW API First version created on 01/04/2024 """ @@ -14,7 +14,7 @@ parser = argparse.ArgumentParser( prog = 'SAXS2PR', - description = 'Convert the SAXS file to P(r) curves using thge RAW API' ) + description = 'Convert the SAXS file to P(r) curves using the RAW API' ) parser.add_argument('-f', '--filename', required=True) parser.add_argument('-o', '--output', default='./') diff --git a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/batch_download_seq_new.sh b/scripts/Extract_Pr_CoDNaS/batch_download_seq.sh similarity index 98% rename from Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/batch_download_seq_new.sh rename to scripts/Extract_Pr_CoDNaS/batch_download_seq.sh index fa5688c..27f454d 100755 --- a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/batch_download_seq_new.sh +++ b/scripts/Extract_Pr_CoDNaS/batch_download_seq.sh @@ -21,7 +21,7 @@ Usage: $PROGNAME -f [-o ] [-c] [-p] -o : the output dir, default: current dir -c : download a cif.gz file for each PDB id and store in the pdb subfolder -p : download a pdb.gz file for each PDB id (not available for large structures) and store in the pdb subfolder - -q : download a .fasta file for each PDB id (sequence) and sotre in the sequence subfolder + -q : download a .fasta file for each PDB id (sequence) and store in the sequence subfolder EOF exit 1 } diff --git a/PDB70_NERSC/PDBtoSeq.py b/scripts/PDB70_NERSC/PDBtoSeq.py similarity index 100% rename from PDB70_NERSC/PDBtoSeq.py rename to scripts/PDB70_NERSC/PDBtoSeq.py diff --git a/PDB70_NERSC/fixpdb_pr_parallel_v2.bash b/scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash similarity index 100% rename from PDB70_NERSC/fixpdb_pr_parallel_v2.bash rename to scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash diff --git a/PDB70_NERSC/parallel_PDB70.slurm b/scripts/PDB70_NERSC/parallel_PDB70.slurm similarity index 92% rename from PDB70_NERSC/parallel_PDB70.slurm rename to scripts/PDB70_NERSC/parallel_PDB70.slurm index f788014..d4561a4 100644 --- a/PDB70_NERSC/parallel_PDB70.slurm +++ b/scripts/PDB70_NERSC/parallel_PDB70.slurm @@ -8,5 +8,6 @@ #SBATCH --account=m3513 module load python +module load parallel conda activate /pscratch/sd/l/lemonboy/PDB_prep -ls *.pdb | parallel -j 4 bash fixpdb_pr_parallel.bash -f {} -o . ::: +ls *.pdb | parallel -j 4 bash fixpdb_pr_parallel.bash -f {} -o . 
::: \ No newline at end of file From 858e407021fa5a9d46905c47d25fd1f799d9750c Mon Sep 17 00:00:00 2001 From: Feng Yu Date: Thu, 25 Jan 2024 15:33:47 -0800 Subject: [PATCH 06/15] Added a scripts folder to store individual analyze scripts --- .../Extract_Pr_CoDNaS}/1BZF_A.hclusterRMSD.txt | 0 .../Extract_Pr_CoDNaS/CoD_calc_main.sh | 11 ++++++++++- .../Extract_Pr_CoDNaS}/README | 2 +- .../Extract_Pr_CoDNaS}/SAXS_to_pr.py | 4 ++-- .../Extract_Pr_CoDNaS/batch_download_seq.sh | 2 +- {PDB70_NERSC => scripts/PDB70_NERSC}/PDBtoSeq.py | 2 +- .../PDB70_NERSC}/fixpdb_pr_parallel_v2.bash | 0 .../PDB70_NERSC}/parallel_PDB70.slurm | 3 ++- 8 files changed, 17 insertions(+), 7 deletions(-) rename {Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf => scripts/Extract_Pr_CoDNaS}/1BZF_A.hclusterRMSD.txt (100%) rename Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/full_clean.sh => scripts/Extract_Pr_CoDNaS/CoD_calc_main.sh (73%) rename {Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf => scripts/Extract_Pr_CoDNaS}/README (93%) rename {Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf => scripts/Extract_Pr_CoDNaS}/SAXS_to_pr.py (92%) rename Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/batch_download_seq_new.sh => scripts/Extract_Pr_CoDNaS/batch_download_seq.sh (98%) rename {PDB70_NERSC => scripts/PDB70_NERSC}/PDBtoSeq.py (94%) rename {PDB70_NERSC => scripts/PDB70_NERSC}/fixpdb_pr_parallel_v2.bash (100%) rename {PDB70_NERSC => scripts/PDB70_NERSC}/parallel_PDB70.slurm (92%) diff --git a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/1BZF_A.hclusterRMSD.txt b/scripts/Extract_Pr_CoDNaS/1BZF_A.hclusterRMSD.txt similarity index 100% rename from Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/1BZF_A.hclusterRMSD.txt rename to scripts/Extract_Pr_CoDNaS/1BZF_A.hclusterRMSD.txt diff --git a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/full_clean.sh b/scripts/Extract_Pr_CoDNaS/CoD_calc_main.sh similarity index 73% rename from Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/full_clean.sh rename to scripts/Extract_Pr_CoDNaS/CoD_calc_main.sh index 7f96ffe..3c657b2 100644 --- a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/full_clean.sh +++ b/scripts/Extract_Pr_CoDNaS/CoD_calc_main.sh @@ -1,4 +1,13 @@ #!/bin/bash +# Author: Feng Yu fyu2@lbl.gov 2024 + +# CoDNaS is a protein conformational database based of entire proteins as derived from PDB. +# For each represented protein, the database contains the redundant collection of all corresponding different structures. + +# This script is used to calculate SAXS curve for PDBs based on lists of PDBs downloading from the CoDNaS server. +# This script will generate SAXS curve, P(r) curve , extract sequences and download the PDBs for each PDB entry. + + PROGNAME=$0 @@ -49,7 +58,7 @@ echo "created PDB storage folder at ${pdb_loc}" mkdir ${seq_loc} echo "created sequence storage folder at ${seq_loc}" -./batch_download_seq_new.sh -f ${outdir}/clean_pdb.output -o ${outdir} -p -q +./batch_download_seq.sh -f ${outdir}/clean_pdb.output -o ${outdir} -p -q mkdir ${saxs_loc} echo "created SAXS measurement storage folder at ${saxs_loc}" diff --git a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/README b/scripts/Extract_Pr_CoDNaS/README similarity index 93% rename from Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/README rename to scripts/Extract_Pr_CoDNaS/README index 62341cf..10d9fe2 100644 --- a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/README +++ b/scripts/Extract_Pr_CoDNaS/README @@ -5,7 +5,7 @@ CoDNaS is a database comparing diverse conformation of the identical proteins. 
It provides a good interface for pairwise comparison but lack of useful APIs especially for downloading files. The best approach for now is to manually download a tsv file containing the PDB ID of all conformers for a single protein. -Thus, this script is used to automatically download the PDB from RCSB based on the list and then covert them to SAXS curve and P(r) curve with FOXS and RAW APIs. +Thus, this script is used to automatically download the PDB from RCSB based on the list and then convert them to SAXS curve and P(r) curve with FOXS and RAW APIs. Problems: 1.FOXS will add the hydration layer to the PDB and then calculated the SAXS curve. This may eliminate some difference between conformations diff --git a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/SAXS_to_pr.py b/scripts/Extract_Pr_CoDNaS/SAXS_to_pr.py similarity index 92% rename from Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/SAXS_to_pr.py rename to scripts/Extract_Pr_CoDNaS/SAXS_to_pr.py index 9fdc5ac..ac3c775 100644 --- a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/SAXS_to_pr.py +++ b/scripts/Extract_Pr_CoDNaS/SAXS_to_pr.py @@ -2,7 +2,7 @@ """ Author: Feng Yu. LBNL -Convert the SAXS file to P(r) curves using thge RAW API +Convert the SAXS file to P(r) curves using the RAW API First version created on 01/04/2024 """ @@ -14,7 +14,7 @@ parser = argparse.ArgumentParser( prog = 'SAXS2PR', - description = 'Convert the SAXS file to P(r) curves using thge RAW API' ) + description = 'Convert the SAXS file to P(r) curves using the RAW API' ) parser.add_argument('-f', '--filename', required=True) parser.add_argument('-o', '--output', default='./') diff --git a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/batch_download_seq_new.sh b/scripts/Extract_Pr_CoDNaS/batch_download_seq.sh similarity index 98% rename from Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/batch_download_seq_new.sh rename to scripts/Extract_Pr_CoDNaS/batch_download_seq.sh index fa5688c..27f454d 100755 --- a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/batch_download_seq_new.sh +++ b/scripts/Extract_Pr_CoDNaS/batch_download_seq.sh @@ -21,7 +21,7 @@ Usage: $PROGNAME -f [-o ] [-c] [-p] -o : the output dir, default: current dir -c : download a cif.gz file for each PDB id and store in the pdb subfolder -p : download a pdb.gz file for each PDB id (not available for large structures) and store in the pdb subfolder - -q : download a .fasta file for each PDB id (sequence) and sotre in the sequence subfolder + -q : download a .fasta file for each PDB id (sequence) and store in the sequence subfolder EOF exit 1 } diff --git a/PDB70_NERSC/PDBtoSeq.py b/scripts/PDB70_NERSC/PDBtoSeq.py similarity index 94% rename from PDB70_NERSC/PDBtoSeq.py rename to scripts/PDB70_NERSC/PDBtoSeq.py index f1ffe5e..ae22cae 100644 --- a/PDB70_NERSC/PDBtoSeq.py +++ b/scripts/PDB70_NERSC/PDBtoSeq.py @@ -4,7 +4,7 @@ import os def extract_seq(pdb_input,output_path): - pdb_name=pdb_input.split(".")[0] + pdb_name=os.path.basename(pdb_input).split(".")[0] counter=1 for record in SeqIO.parse(pdb_input,"pdb-atom"): if counter > 1: diff --git a/PDB70_NERSC/fixpdb_pr_parallel_v2.bash b/scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash similarity index 100% rename from PDB70_NERSC/fixpdb_pr_parallel_v2.bash rename to scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash diff --git a/PDB70_NERSC/parallel_PDB70.slurm b/scripts/PDB70_NERSC/parallel_PDB70.slurm similarity index 92% rename from PDB70_NERSC/parallel_PDB70.slurm rename to scripts/PDB70_NERSC/parallel_PDB70.slurm index f788014..d4561a4 100644 --- 
a/PDB70_NERSC/parallel_PDB70.slurm +++ b/scripts/PDB70_NERSC/parallel_PDB70.slurm @@ -8,5 +8,6 @@ #SBATCH --account=m3513 module load python +module load parallel conda activate /pscratch/sd/l/lemonboy/PDB_prep -ls *.pdb | parallel -j 4 bash fixpdb_pr_parallel.bash -f {} -o . ::: +ls *.pdb | parallel -j 4 bash fixpdb_pr_parallel.bash -f {} -o . ::: \ No newline at end of file From fe1132c04da723fbedb135bd75f897caeb381b5d Mon Sep 17 00:00:00 2001 From: Feng Yu Date: Thu, 25 Jan 2024 17:31:13 -0800 Subject: [PATCH 07/15] Added PDBtoSeq as a function to utils.py and a command to commands.py --- pyproject.toml | 1 + scripts/PDB70_NERSC/PDBtoSeq.py | 4 ++-- src/metfish/commands.py | 14 ++++++++++++++ src/metfish/utils.py | 24 ++++++++++++++++++++++++ 4 files changed, 41 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 126a97e..868745a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,7 @@ dynamic = ["version"] [project.scripts] calc-pr = "metfish.commands:get_Pr_cli" +extract-seq = "metfish.commands:extract_seq_cli" [tool.ruff] # Exclude a variety of commonly ignored directories. diff --git a/scripts/PDB70_NERSC/PDBtoSeq.py b/scripts/PDB70_NERSC/PDBtoSeq.py index ae22cae..a076e81 100644 --- a/scripts/PDB70_NERSC/PDBtoSeq.py +++ b/scripts/PDB70_NERSC/PDBtoSeq.py @@ -20,8 +20,8 @@ def main(): description = '''Extract Sequence from PDB using the BioPython API. The output will be stored at the current directory as a fasta file.''' ) - parser.add_argument('-f', '--filename', required=True) - parser.add_argument('-o', '--output', default='./') + parser.add_argument('-f', '--filename', required=True, help='input pdb file name') + parser.add_argument('-o', '--output', required=True, help='output fasta file name') args = parser.parse_args() diff --git a/src/metfish/commands.py b/src/metfish/commands.py index 80dde5f..d026a44 100644 --- a/src/metfish/commands.py +++ b/src/metfish/commands.py @@ -5,6 +5,7 @@ import pandas as pd from .utils import get_Pr +from .utils import extract_seq def get_Pr_cli(argv=None): @@ -40,3 +41,16 @@ def get_Pr_cli(argv=None): step=args.step) pd.DataFrame({"r": r, "P(r)": p}).to_csv(out, index=False) + +def extract_seq_cli(argv=None): + + parser = argparse.ArgumentParser( + description = '''Extract Sequence from PDB using the BioPython API. + The output will be stored at the current directory as a fasta file.''' ) + + parser.add_argument('-f', '--filename', required=True, help="input pdb file") + parser.add_argument('-o', '--output', required=True, help="output fasta file path + name") + + args = parser.parse_args() + + extract_seq(args.filename,args.output) diff --git a/src/metfish/utils.py b/src/metfish/utils.py index 10202b2..7cd9c60 100644 --- a/src/metfish/utils.py +++ b/src/metfish/utils.py @@ -6,6 +6,8 @@ import numpy as np from periodictable import elements from scipy.spatial.distance import pdist, squareform +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord n_elec_df = {el.symbol: el.number for el in elements} @@ -69,3 +71,25 @@ def get_Pr(structure, structure_id="", dmax=None, step=0.5): p = np.concatenate(([0], hist / hist.sum())) return r, p + +def extract_seq(pdb_input,output_path): + """ + Args: + pdb_input : The path to the PDB to extract sequence. + The PDB must only contain a single chain. + pdbfixer is a good tool to prepare PDB for this function + output_path : The path to store the output fasta file. + Example: /location/to/store/PDB.fasta + Returns: + There is no return for this function. 
The sequence will be written + as a fasta file in the give location. + """ + pdb_name=os.path.basename(pdb_input).split(".")[0] + counter=1 + for record in SeqIO.parse(pdb_input,"pdb-atom"): + if counter > 1: + raise ValueError("More than 1 Chain is in the file {}".format(pdb_input)) + else: + new_seq_record = SeqRecord(record.seq, id=pdb_name, description='') + SeqIO.write(new_seq_record, output_path ,"fasta") + counter+=1 \ No newline at end of file From 70a1138c05d9b39df12d42a5ee9e372cd045133a Mon Sep 17 00:00:00 2001 From: Feng Yu Date: Wed, 31 Jan 2024 13:57:38 -0800 Subject: [PATCH 08/15] Update scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash Co-authored-by: Andrew Tritt --- scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash b/scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash index 9d6437c..dde400f 100644 --- a/scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash +++ b/scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash @@ -54,4 +54,5 @@ if [ -f "$file" ]; then pdbfixer ${file} --output "${pdb_loc}/fixed_${filename}" --keep-heterogens=none --replace-nonstandard --add-residues --verbose calc-pr "${pdb_loc}/fixed_${filename}" -o "${pr_loc}/${filename}.pr.csv" python PDBtoSeq.py -f "${pdb_loc}/fixed_${filename}" -o "${seq_loc}/${filename}.fasta" + echo ${file} >> complete.txt fi From a8607f1c009036b939b6953e76b0fa5203fed17e Mon Sep 17 00:00:00 2001 From: Feng Yu Date: Wed, 31 Jan 2024 13:58:13 -0800 Subject: [PATCH 09/15] Update scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash Co-authored-by: Andrew Tritt --- scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash b/scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash index dde400f..64c3df5 100644 --- a/scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash +++ b/scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash @@ -48,6 +48,10 @@ pr_loc=${outdir}/saxs_r #echo "created saxs P(r) storage folder at ${pr_loc}" if [ -f "$file" ]; then + done=`grep -c ${file} complete.txt` + if [ "$done" -ne "0" ] ; then + return + fi echo ${file} filename=$(basename "$file") echo ${filename} From b8195642c73faacc9d34df274e70965d0999e8ec Mon Sep 17 00:00:00 2001 From: Feng Yu Date: Thu, 22 Feb 2024 17:06:31 -0800 Subject: [PATCH 10/15] add .DS_store to gitignore --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index c1fc984..c9d5296 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,4 @@ __pycache__ *.pyc *.egg-info/ *.swp - +.DS_Store From e7f600365874efee86594c62fb6b5da8ba3ee066 Mon Sep 17 00:00:00 2001 From: Feng Yu Date: Fri, 1 Mar 2024 14:55:27 -0800 Subject: [PATCH 11/15] Retrive full sequence from RCSB to fix the conformation of PDBs. 
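
get_full_seq.py retrieves the reference entry for a PDB chain from RCSB (pdb
format first, falling back to mmCIF), reads the full SEQRES sequence for that
chain, and pairwise-aligns the ATOM-derived sequence of the local file against
it with Bio.Align.PairwiseAligner. Reference residues that extend beyond the
resolved N- or C-terminus are trimmed, and the result is written as
fixed_<name>.fasta for pdbfixer to use when rebuilding the missing residues.
The input file name is expected to follow the <PDBcode>_<chain>.pdb
convention; for example (illustrative file and folder names):

    python get_full_seq.py -f 1ABC_A.pdb -o ./ref_seq
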
--- scripts/PDB70_NERSC/get_full_seq.py | 104 ++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 scripts/PDB70_NERSC/get_full_seq.py diff --git a/scripts/PDB70_NERSC/get_full_seq.py b/scripts/PDB70_NERSC/get_full_seq.py new file mode 100644 index 0000000..c1dfbcc --- /dev/null +++ b/scripts/PDB70_NERSC/get_full_seq.py @@ -0,0 +1,104 @@ +# %% +import Bio.Align as Align +import Bio.SeqIO as SeqIO +from Bio.SeqRecord import SeqRecord +from Bio.PDB import PDBList +import argparse +import os + +# %% +def extract_ref_seq(input_pdb_path, ref_pdb_name, chain_ID): + pdbl = PDBList() + seqres_type="pdb-seqres" + # Retrive PDB reference (with full sequence) from RCSB + # Will not rewrite PDB if already downloaded + # Will download obsolete PBDS + ref_file_name=pdbl.retrieve_pdb_file(ref_pdb_name, file_format='pdb', obsolete=True , pdir=".") + if not os.path.exists(ref_file_name): + ref_file_name=pdbl.retrieve_pdb_file(ref_pdb_name, file_format='mmCif', pdir=".") + seqres_type="cif-seqres" + if not os.path.exists(ref_file_name): + raise ValueError("PDB doesn't exist in the database") + for record in SeqIO.parse(ref_file_name, seqres_type): + # Get the corresponding chain from the Reference PDB and extract sequence + if record.annotations["chain"] == chain_ID: + break + ref_seq=record.seq + print(ref_seq,seqres_type) + #os.remove(ref_file_name) + + counter=1 + # To prevent 2 chains in the "to be fixed" PDB file + for record2 in SeqIO.parse(input_pdb_path,"pdb-atom"): + if counter > 1: + raise ValueError("More than 1 Chain is in the file {}".format(input_pdb_path)) + else: + sequence_to_be_fix=record2.seq + counter+=1 + return sequence_to_be_fix, ref_seq + +# %% +# Align the ref sequence and the sequence to be fix +def align_fix(sequence_to_be_fix, ref_seq): + + aligner = Align.PairwiseAligner() + alignments = aligner.align(ref_seq, sequence_to_be_fix) + # Ignore the N-terminal or C-terminal loop of the ref seq + counter=0 + for i in alignments[0][1]: + if i == '-': + counter+=1 + continue + else: + break + start_ind=counter + + counter2=0 + for j in alignments[0][1][::-1]: + if j == '-': + counter2+=1 + continue + else: + break + + end_ind=len(ref_seq)-counter2 + + return(ref_seq[start_ind:end_ind]) + +# %% +# function to save the sequence to fast +def save_fasta(sequence, input_name ,output_path): + + new_seq_record = SeqRecord(sequence, id=input_name, description='') + SeqIO.write(new_seq_record, os.path.join(output_path,'fixed_' + input_name + '.fasta') ,"fasta") +# Get pdb name to save files +def extract_pdb_name(input_pdb_path): + + base_name_without_extension = os.path.splitext(os.path.basename(input_pdb_path))[0] + ref_pdb_name=base_name_without_extension.split('_')[0].lower() + chain_ID=base_name_without_extension.split('_')[1] + + return base_name_without_extension,ref_pdb_name, chain_ID +# the "entire workflow" +def extract_fixed_seq(input_pdb_path,output_path): + base_name_without_extension, ref_pdb_name, chain_ID = extract_pdb_name(input_pdb_path) + sequence_to_be_fix, ref_seq = extract_ref_seq(input_pdb_path, ref_pdb_name, chain_ID) + sequence = align_fix(sequence_to_be_fix, ref_seq) + save_fasta(sequence, base_name_without_extension ,output_path) +# Command line interface +def main(): + parser = argparse.ArgumentParser( + prog = 'FullSEQRES', + description = '''Extract the fill sequence of PDBs with missing loops + using the BioPython and BioPandas. 
+ The output will be stored at the output directory as a fasta file.''' ) + + parser.add_argument('-f', '--filename', required=True) + parser.add_argument('-o', '--output', default='./') + + args = parser.parse_args() + extract_fixed_seq(args.filename, args.output) + #print("The full sequence of {} is {}".format(args.filename,full_seq)) + +if __name__ == '__main__': + main() From d88c6c3e6cfd72291f9709236471f65d9432b8e6 Mon Sep 17 00:00:00 2001 From: Feng Yu Date: Fri, 1 Mar 2024 14:57:04 -0800 Subject: [PATCH 12/15] replace the old bash script with new script to fix the conformations. --- ...rallel_v2.bash => fixpdb_pr_parallel.bash} | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) rename scripts/PDB70_NERSC/{fixpdb_pr_parallel_v2.bash => fixpdb_pr_parallel.bash} (68%) mode change 100644 => 100755 diff --git a/scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash b/scripts/PDB70_NERSC/fixpdb_pr_parallel.bash old mode 100644 new mode 100755 similarity index 68% rename from scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash rename to scripts/PDB70_NERSC/fixpdb_pr_parallel.bash index 64c3df5..9c77b40 --- a/scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash +++ b/scripts/PDB70_NERSC/fixpdb_pr_parallel.bash @@ -39,6 +39,7 @@ fi pdb_loc=${outdir}/pdb seq_loc=${outdir}/sequence pr_loc=${outdir}/saxs_r +ref_seq_loc=${outdir}/ref_seq #mkdir -p ${pdb_loc} #echo "created PDB storage folder at ${pdb_loc}" @@ -48,14 +49,22 @@ pr_loc=${outdir}/saxs_r #echo "created saxs P(r) storage folder at ${pr_loc}" if [ -f "$file" ]; then - done=`grep -c ${file} complete.txt` - if [ "$done" -ne "0" ] ; then - return - fi + done=`grep -c ${file} complete.txt` + echo $done + if [ "$done" -ne "0" ] ; then + exit + fi echo ${file} + filename=$(basename "$file") + prefix="${filename%.*}" + IFS='_' read -r PDB_code chain_ID <<< "$prefix" echo ${filename} - pdbfixer ${file} --output "${pdb_loc}/fixed_${filename}" --keep-heterogens=none --replace-nonstandard --add-residues --verbose + + python get_full_seq.py -f ${file} -o ${ref_seq_loc} + pdbfixer ${file} --output "${pdb_loc}/fixed_${filename}" \ + --keep-heterogens=none --replace-nonstandard --add-residues --add-atoms=all --verbose \ + --sequence "${ref_seq_loc}/fixed_${prefix}.fasta" --chain_id "${chain_ID}" calc-pr "${pdb_loc}/fixed_${filename}" -o "${pr_loc}/${filename}.pr.csv" python PDBtoSeq.py -f "${pdb_loc}/fixed_${filename}" -o "${seq_loc}/${filename}.fasta" echo ${file} >> complete.txt From 74dfc699a3d7a20617ed387c1115695571f54fc6 Mon Sep 17 00:00:00 2001 From: Feng Yu Date: Wed, 6 Mar 2024 11:07:30 -0800 Subject: [PATCH 13/15] Change the scripts to the actual scripts running on NERSC --- scripts/PDB70_NERSC/PDBtoSeq.py | 6 +++--- scripts/PDB70_NERSC/fixpdb_pr_parallel.bash | 16 ++++++++++++---- scripts/PDB70_NERSC/get_full_seq.py | 8 ++++---- scripts/PDB70_NERSC/parallel_PDB70.slurm | 8 ++++---- 4 files changed, 23 insertions(+), 15 deletions(-) mode change 100644 => 100755 scripts/PDB70_NERSC/PDBtoSeq.py diff --git a/scripts/PDB70_NERSC/PDBtoSeq.py b/scripts/PDB70_NERSC/PDBtoSeq.py old mode 100644 new mode 100755 index a076e81..f1ffe5e --- a/scripts/PDB70_NERSC/PDBtoSeq.py +++ b/scripts/PDB70_NERSC/PDBtoSeq.py @@ -4,7 +4,7 @@ import os def extract_seq(pdb_input,output_path): - pdb_name=os.path.basename(pdb_input).split(".")[0] + pdb_name=pdb_input.split(".")[0] counter=1 for record in SeqIO.parse(pdb_input,"pdb-atom"): if counter > 1: @@ -20,8 +20,8 @@ def main(): description = '''Extract Sequence from PDB using the BioPython API. 
The output will be stored at the current directory as a fasta file.''' ) - parser.add_argument('-f', '--filename', required=True, help='input pdb file name') - parser.add_argument('-o', '--output', required=True, help='output fasta file name') + parser.add_argument('-f', '--filename', required=True) + parser.add_argument('-o', '--output', default='./') args = parser.parse_args() diff --git a/scripts/PDB70_NERSC/fixpdb_pr_parallel.bash b/scripts/PDB70_NERSC/fixpdb_pr_parallel.bash index 9c77b40..f5485c0 100755 --- a/scripts/PDB70_NERSC/fixpdb_pr_parallel.bash +++ b/scripts/PDB70_NERSC/fixpdb_pr_parallel.bash @@ -6,7 +6,7 @@ usage() { cat << EOF >&2 Usage: $PROGNAME -f -o --f : the input file containing a list of pdb files (can be obtained using ls > file_name.txt) +-f : the input PDB file path -o : the output dir EOF exit 1 @@ -49,9 +49,9 @@ ref_seq_loc=${outdir}/ref_seq #echo "created saxs P(r) storage folder at ${pr_loc}" if [ -f "$file" ]; then - done=`grep -c ${file} complete.txt` + done=`grep -c ${file} ${outdir}/complete.txt` echo $done - if [ "$done" -ne "0" ] ; then + if [ "$done" -ne 0 ] ; then exit fi echo ${file} @@ -62,10 +62,18 @@ if [ -f "$file" ]; then echo ${filename} python get_full_seq.py -f ${file} -o ${ref_seq_loc} + if [ ! -f "${ref_seq_loc}/fixed_${prefix}.fasta" ]; then + echo "BASH_ref_seq_Error: File ${ref_seq_loc}/fixed_${prefix}.fasta does not exist." + exit + fi pdbfixer ${file} --output "${pdb_loc}/fixed_${filename}" \ --keep-heterogens=none --replace-nonstandard --add-residues --add-atoms=all --verbose \ --sequence "${ref_seq_loc}/fixed_${prefix}.fasta" --chain_id "${chain_ID}" + if [ ! -f "${pdb_loc}/fixed_${filename}" ]; then + echo "BASH_fixer_Error: File ${pdb_loc}/fixed_${filename} does not exist." + exit + fi calc-pr "${pdb_loc}/fixed_${filename}" -o "${pr_loc}/${filename}.pr.csv" python PDBtoSeq.py -f "${pdb_loc}/fixed_${filename}" -o "${seq_loc}/${filename}.fasta" - echo ${file} >> complete.txt + echo ${file} >> ${outdir}/complete.txt fi diff --git a/scripts/PDB70_NERSC/get_full_seq.py b/scripts/PDB70_NERSC/get_full_seq.py index c1dfbcc..f2730a7 100644 --- a/scripts/PDB70_NERSC/get_full_seq.py +++ b/scripts/PDB70_NERSC/get_full_seq.py @@ -10,12 +10,12 @@ def extract_ref_seq(input_pdb_path, ref_pdb_name, chain_ID): pdbl = PDBList() seqres_type="pdb-seqres" - # Retrive PDB reference (with full sequence) from RCSB + # Retrieve PDB reference (with full sequence) from RCSB # Will not rewrite PDB if already downloaded - # Will download obsolete PBDS - ref_file_name=pdbl.retrieve_pdb_file(ref_pdb_name, file_format='pdb', obsolete=True , pdir=".") + # Will download obsolete PDBS + ref_file_name=pdbl.retrieve_pdb_file(ref_pdb_name, file_format='pdb', pdir="./tmp") if not os.path.exists(ref_file_name): - ref_file_name=pdbl.retrieve_pdb_file(ref_pdb_name, file_format='mmCif', pdir=".") + ref_file_name=pdbl.retrieve_pdb_file(ref_pdb_name, file_format='mmCif', pdir="./tmp") seqres_type="cif-seqres" if not os.path.exists(ref_file_name): raise ValueError("PDB doesn't exist in the database") diff --git a/scripts/PDB70_NERSC/parallel_PDB70.slurm b/scripts/PDB70_NERSC/parallel_PDB70.slurm index d4561a4..e476e1b 100644 --- a/scripts/PDB70_NERSC/parallel_PDB70.slurm +++ b/scripts/PDB70_NERSC/parallel_PDB70.slurm @@ -1,13 +1,13 @@ #!/bin/bash #SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=4 -#SBATCH --time=00:05:00 +#SBATCH --cpus-per-task=32 +#SBATCH --time=24:00:00 #SBATCH --constraint=cpu #SBATCH --qos=shared #SBATCH --account=m3513 module load 
python
 module load parallel
-conda activate /pscratch/sd/l/lemonboy/PDB_prep
-ls *.pdb | parallel -j 4 bash fixpdb_pr_parallel.bash -f {} -o . :::
\ No newline at end of file
+conda activate /pscratch/sd/l/lemonboy/PDB_fixer_test
+find ./ -maxdepth 1 -name '*.pdb' -type f | parallel -j 32 bash fixpdb_pr_parallel.bash -f {} -o ../result

From 5169f540301d32c79ecddc1cb34bc9042d2b795f Mon Sep 17 00:00:00 2001
From: Feng Yu
Date: Wed, 6 Mar 2024 11:55:58 -0800
Subject: [PATCH 14/15] Added README file for NERSC scripts.

---
 scripts/PDB70_NERSC/README | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 scripts/PDB70_NERSC/README

diff --git a/scripts/PDB70_NERSC/README b/scripts/PDB70_NERSC/README
new file mode 100644
index 0000000..720a617
--- /dev/null
+++ b/scripts/PDB70_NERSC/README
@@ -0,0 +1,33 @@
+# PDB Loop Fixer / SAXS Profile Script
+
+## Overview
+This script fixes missing loops in Protein Data Bank (PDB) files and generates SAXS data for them using GNU Parallel on NERSC. It calculates the SAXS P(r) curve for single-chain, single-conformation PDBs as a reference for training the metfish deep learning network.
+
+## Features
+- Corrects loop regions in PDB files.
+- Uses GNU Parallel for multi-process execution on supercomputers.
+- Generates output logs for tracking changes and errors.
+- Generates a standalone fasta file for each PDB after the loop is fixed.
+- Calculates the SAXS P(r) curve for the fixed PDBs.
+
+## Prerequisites
+These are the prerequisites tested on NERSC. Generally speaking, this script does not have strict restrictions on the Python and library versions.
+If you installed openmm, pdbfixer and metfish with conda/pip, generally speaking, you do not need to install Numpy and other dependencies independently.
+
+- Python 3.12.1
+- openmm 8.1.1
+- pdbfixer 1.9.0
+- metfish 0.0.0
+- GNU Parallel
+- Input PDB file(s) containing !!single chain!! protein structures. (in a single folder for parallel computing input)
+
+## Installation
+1. Clone or download the repository.
+(Optional but recommended) Create a new conda environment for the installation.
+2. Install the metfish package and the other prerequisites.
+3. Copy the scripts in the PDB70_NERSC folder to the target folder and edit the Slurm script to set the input and output folders and the job queue settings to the user's preferences.
+
+## Usage
+For a single PDB file, see the usage of fixpdb_pr_parallel.bash.
+
+For multiple PDB files, see parallel_PDB70.slurm.
\ No newline at end of file

From 6ab54cc8a6d7d3cf5571b52d120418485cde0d86 Mon Sep 17 00:00:00 2001
From: Feng Yu
Date: Wed, 6 Mar 2024 14:39:53 -0800
Subject: [PATCH 15/15] Added the github url for the modified pdbfixer

---
 scripts/PDB70_NERSC/README | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/PDB70_NERSC/README b/scripts/PDB70_NERSC/README
index 720a617..a494385 100644
--- a/scripts/PDB70_NERSC/README
+++ b/scripts/PDB70_NERSC/README
@@ -16,7 +16,7 @@ If you installed openmm, pdbfixer and metfish with conda/pip, generally speaking
 
 - Python 3.12.1
 - openmm 8.1.1
-- pdbfixer 1.9.0
+- pdbfixer 1.9.0 (modified) (https://github.com/smallfishabc/pdbfixer.git)
 - metfish 0.0.0
 - GNU Parallel
 - Input PDB file(s) containing !!single chain!! protein structures. (in a single folder for parallel computing input)
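
A minimal end-to-end sketch of the PDB70 workflow above, with illustrative file
and folder names, assuming the conda environment with pdbfixer, metfish
(calc-pr) and GNU Parallel is active. The output subfolders and the
complete.txt progress file are created by hand here because the corresponding
mkdir calls in fixpdb_pr_parallel.bash are commented out:

    # output layout expected by fixpdb_pr_parallel.bash
    mkdir -p result/pdb result/sequence result/saxs_r result/ref_seq
    touch result/complete.txt    # progress log the script greps and appends to

    # single PDB file (name must follow <PDBcode>_<chain>.pdb)
    bash fixpdb_pr_parallel.bash -f ./1ABC_A.pdb -o ./result

    # whole folder of PDBs on NERSC via Slurm + GNU Parallel
    # (parallel_PDB70.slurm writes its results to ../result relative to the PDB folder)
    sbatch parallel_PDB70.slurm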