lbl-cbg · smallfishabc · Mar 13, 2024 · Jan 13, 2024 · Jan 25, 2024 · Jan 25, 2024
diff --git a/pyproject.toml b/pyproject.toml
@@ -37,6 +37,7 @@ dynamic = ["version"]
 
 [project.scripts]
 calc-pr = "metfish.commands:get_Pr_cli"
+extract-seq = "metfish.commands:extract_seq_cli"
 
 [tool.ruff]
 # Exclude a variety of commonly ignored directories.

diff --git a/scripts/Extract_Pr_CoDNaS/1BZF_A.hclusterRMSD.txt b/scripts/Extract_Pr_CoDNaS/1BZF_A.hclusterRMSD.txt
@@ -0,0 +1,188 @@
+PDB	cluster
+1BZF-1_A	1
+1BZF-2_A	1
+1BZF-3_A	1
+1BZF-4_A	1
+1BZF-5_A	1
+1BZF-6_A	1
+1BZF-7_A	1
+1BZF-8_A	1
+1BZF-9_A	1
+1BZF-10_A	1
+1BZF-11_A	1
+1BZF-12_A	1
+1BZF-13_A	1
+1BZF-14_A	1
+1BZF-15_A	1
+1BZF-16_A	1
+1BZF-17_A	1
+1BZF-18_A	1
+1BZF-19_A	1
+1BZF-20_A	1
+1BZF-21_A	1
+1BZF-22_A	1
+1DIU-1_A	2
+1DIU-2_A	2
+1DIU-3_A	2
+1DIU-4_A	2
+1DIU-5_A	2
+1DIU-6_A	2
+1DIU-7_A	2
+1DIU-8_A	2
+1DIU-9_A	2
+1DIU-10_A	2
+1DIU-11_A	2
+1DIU-12_A	2
+1DIU-13_A	2
+1DIU-14_A	2
+1DIU-15_A	2
+1DIU-16_A	2
+1DIU-17_A	2
+1DIU-18_A	2
+1DIS_A	2
+3DFR_A	2
+1LUD-1_A	3
+1LUD-2_A	3
+1LUD-3_A	3
+1LUD-4_A	3
+1LUD-5_A	3
+1LUD-6_A	3
+1LUD-7_A	3
+1LUD-8_A	3
+1LUD-9_A	3
+1LUD-10_A	3
+1LUD-11_A	3
+1LUD-12_A	3
+1LUD-13_A	3
+1LUD-14_A	3
+1LUD-15_A	3
+1LUD-16_A	3
+1LUD-17_A	3
+1LUD-18_A	3
+1LUD-19_A	3
+1LUD-20_A	3
+1LUD-21_A	3
+1LUD-22_A	3
+1LUD-23_A	3
+1LUD-24_A	3
+2L28-1_A	4
+2L28-8_A	4
+2L28-16_A	4
+2L28-2_A	5
+2L28-10_A	5
+2L28-13_A	5
+2L28-3_A	6
+2L28-4_A	7
+2L28-5_A	8
+2L28-24_A	8
+2L28-6_A	9
+2L28-18_A	9
+2L28-7_A	10
+2L28-12_A	10
+2L28-9_A	11
+2L28-11_A	12
+2L28-20_A	12
+2L28-14_A	13
+2L28-15_A	14
+2L28-17_A	15
+2L28-19_A	16
+2L28-21_A	17
+2L28-22_A	18
+2L28-23_A	19
+2L28-25_A	20
+1AO8-1_A	21
+1AO8-2_A	21
+1AO8-3_A	21
+1AO8-4_A	21
+1AO8-5_A	21
+1AO8-6_A	21
+1AO8-7_A	21
+1AO8-8_A	21
+1AO8-9_A	21
+1AO8-10_A	21
+1AO8-11_A	21
+1AO8-12_A	21
+1AO8-13_A	21
+1AO8-14_A	21
+1AO8-15_A	21
+1AO8-16_A	21
+1AO8-17_A	21
+1AO8-18_A	21
+1AO8-19_A	21
+1AO8-20_A	21
+1AO8-21_A	21
+2HQP-1_A	22
+2HQP-2_A	22
+2HQP-3_A	22
+2HQP-8_A	22
+2HQP-11_A	22
+2HQP-13_A	22
+2HQP-16_A	22
+2HQP-18_A	22
+2HQP-19_A	22
+2HQP-21_A	22
+2HQP-25_A	22
+2HQP-4_A	23
+2HQP-7_A	23
+2HQP-9_A	23
+2HQP-14_A	23
+2HQP-17_A	23
+2HQP-22_A	23
+2HQP-5_A	24
+2HQP-10_A	24
+2HQP-15_A	24
+2HQP-23_A	24
+2HQP-6_A	25
+2HQP-20_A	25
+2HQP-12_A	26
+2HQP-24_A	26
+2LF1-1_A	27
+2LF1-2_A	27
+2LF1-3_A	27
+2LF1-4_A	27
+2LF1-5_A	27
+2LF1-6_A	27
+2LF1-7_A	27
+2LF1-8_A	27
+2LF1-9_A	27
+2LF1-10_A	27
+2LF1-11_A	27
+2LF1-12_A	27
+2LF1-13_A	27
+2LF1-14_A	27
+2LF1-15_A	27
+2LF1-16_A	27
+2LF1-17_A	27
+2LF1-18_A	27
+2LF1-19_A	27
+2LF1-20_A	27
+2LF1-21_A	27
+2LF1-22_A	27
+2LF1-23_A	27
+2LF1-24_A	27
+2LF1-25_A	27
+2HM9-1_A	28
+2HM9-2_A	28
+2HM9-3_A	28
+2HM9-4_A	28
+2HM9-5_A	28
+2HM9-6_A	28
+2HM9-7_A	28
+2HM9-8_A	28
+2HM9-9_A	28
+2HM9-10_A	28
+2HM9-11_A	28
+2HM9-12_A	28
+2HM9-13_A	28
+2HM9-14_A	28
+2HM9-15_A	28
+2HM9-16_A	28
+2HM9-17_A	28
+2HM9-18_A	28
+2HM9-19_A	28
+2HM9-20_A	28
+2HM9-21_A	28
+2HM9-22_A	28
+2HM9-23_A	28
+2HM9-24_A	28
+2HM9-25_A	28
diff --git a/scripts/Extract_Pr_CoDNaS/CoD_calc_main.sh b/scripts/Extract_Pr_CoDNaS/CoD_calc_main.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+# Author: Feng Yu [email protected] 2024
+
+# CoDNaS is a protein conformational database built from entire proteins as derived from the PDB.
+# For each represented protein, the database contains the redundant collection of all corresponding different structures.
+
+# This script calculates SAXS curves for the PDB entries listed in a file downloaded from the CoDNaS server.
+# For each PDB entry it downloads the PDB file and sequence, then generates the SAXS curve and the P(r) curve.
+
+PROGNAME=$0
+# batch_download_seq.sh and SAXS_to_pr.py are resolved next to this file so the script can be started from any directory.
+script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+
+# Print the command-line help to stderr and exit with status 1.
+usage() {
+  cat << EOF >&2
+Usage: $PROGNAME -f <file> -o <dir>
+
+-f <file>: the input file containing a list of pdb files downloaded from CoDNaS
+-o  <dir>: the output dir
+EOF
+ exit 1
+}
+
+filename=""
+outdir=""
+
+while getopts f:o: options
+do
+  case $options in
+    (f) filename=$OPTARG;;
+    (o) outdir=$OPTARG;;
+    (*) usage;;
+  esac
+done
+
+if [ -z "$filename" ]
+then
+  echo "Parameter -f must be provided" >&2
+  exit 1
+fi
+if [ -z "$outdir" ]
+then
+  echo "Parameter -o must be provided" >&2
+  exit 1
+fi
+if [ ! -r "$filename" ]
+then
+  echo "Input file $filename does not exist or is not readable" >&2
+  exit 1
+fi
+echo "$outdir"
+
+pdb_loc=${outdir}/pdb
+seq_loc=${outdir}/sequence
+saxs_loc=${outdir}/saxs_q
+pr_loc=${outdir}/saxs_r
+pdb_list=${outdir}/clean_pdb.output
+# -p creates the output dir when missing and keeps re-runs from failing on existing folders.
+mkdir -p "$pdb_loc" "$seq_loc" || exit 1
+echo "created PDB storage folder at ${pdb_loc}"
+echo "created sequence storage folder at ${seq_loc}"
+
+# Unique 4-character PDB IDs from column 1 (header and blank lines skipped), joined with commas on one line.
+awk 'NR>1 && NF { print substr($1,1,4) }' "$filename" | sort -u | paste -sd, - > "$pdb_list"
+"${script_dir}/batch_download_seq.sh" -f "$pdb_list" -o "$outdir" -p -q
+
+mkdir -p "$saxs_loc" "$pr_loc" || exit 1
+echo "created SAXS measurement storage folder at ${saxs_loc}"
+echo "created P(r) storage folder at ${pr_loc}"
+
+IFS=',' read -ra pdbs < "$pdb_list"
+for pdb in "${pdbs[@]}"
+do
+ if [ ! -f "${pdb_loc}/${pdb}.pdb.gz" ]
+ then
+   echo "no pdb file for $pdb"
+ else
+   # Each step runs only if the previous one succeeded; foxs writes <name>.pdb.dat next to its input.
+   gunzip -c "${pdb_loc}/${pdb}.pdb.gz" > "${pdb_loc}/${pdb}.pdb" && rm "${pdb_loc}/${pdb}.pdb.gz" &&
+     foxs "${pdb_loc}/${pdb}.pdb" && mv "${pdb_loc}/${pdb}.pdb.dat" "${saxs_loc}/${pdb}.dat" &&
+     python "${script_dir}/SAXS_to_pr.py" -f "${saxs_loc}/${pdb}.dat" -o "${pr_loc}"
+ fi
+done
+
+rm "$pdb_list"
+
diff --git a/scripts/Extract_Pr_CoDNaS/README b/scripts/Extract_Pr_CoDNaS/README
@@ -0,0 +1,35 @@
+CoDNaS Auto Processing
+
+Description:
+CoDNaS is a database that compares the diverse conformations of identical proteins.
+It provides a good interface for pairwise comparison but lacks useful APIs, especially for downloading files.
+
+The best approach for now is to manually download a tsv file containing the PDB IDs of all conformers for a single protein.
+This script then automatically downloads the PDB files from RCSB based on that list and converts them to SAXS curves and P(r) curves with the FOXS and RAW APIs.
+
+Problems:
+1. FOXS adds a hydration layer to the PDB structure and then calculates the SAXS curve. This may eliminate some of the differences between conformations.
+2. Each PDB may contain multiple conformers that are similar to each other. Extracting them to generate an individual SAXS curve for each is still under testing.
+
+Files:
+1BZF_A.hclusterRMSD.txt - example raw data downloaded from the CoDNaS database
+CoD_calc_main.sh - main script
+batch_download_seq.sh - modified RCSB script to download PDB and sequence files
+SAXS_to_pr.py - converts a SAXS profile (.dat) to a P(r) curve (.csv)
+
+Dependency: 
+Anaconda -
+Request python library -
+RAW API -
+FOXS -
+
+Usage:
+bash CoD_calc_main.sh
+-f <file>: the input file containing a list of pdb files downloaded from CoDNaS
+-o  <dir>: the output dir
+
+Example
+mkdir test
+bash CoD_calc_main.sh -f ./1BZF_A.hclusterRMSD.txt -o ./test
+
+The script will create 4 folders: ./test/pdb, ./test/sequence, ./test/saxs_q (raw SAXS) and ./test/saxs_r (P(r) in csv).
diff --git a/scripts/Extract_Pr_CoDNaS/SAXS_to_pr.py b/scripts/Extract_Pr_CoDNaS/SAXS_to_pr.py
@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+"""
+Author: Feng Yu. LBNL
+
+Convert the SAXS file to P(r) curves using the RAW API
+
+First version created on 01/04/2024
+"""
+
+import bioxtasraw.RAWAPI as raw
+import argparse
+import pandas as pd
+import os
+
+parser = argparse.ArgumentParser(
+    prog = 'SAXS2PR',
+    description = 'Convert the SAXS file to P(r) curves using the RAW API' )
+parser.add_argument('-f', '--filename', required=True)
+parser.add_argument('-o', '--output', default='./')
+
+args = parser.parse_args()
+
+profiles_name=args.filename
+
+# Load SAXS file (.dat format)
+profiles = raw.load_profiles([profiles_name])
+
+gi_profile=profiles[0]
+
+# Convert SAXS file with Inverse Fourier Transform 
+gi_bift = raw.bift(gi_profile)[0]
+
+# Save the radius and P(r) to csv file
+output_pd=pd.DataFrame({'r':gi_bift.r, 'P(r)':gi_bift.p},columns=['r','P(r)'])
+
+output_loc=args.output
+output_pd.to_csv(os.path.join(args.output,os.path.basename(profiles_name).split('.')[0]+".csv"),index=False)
+