From 126cc01513838711a72df00254a3dd0dd66e312f Mon Sep 17 00:00:00 2001 From: Feng Yu Date: Fri, 12 Jan 2024 16:28:19 -0800 Subject: [PATCH 01/15] Added bash script for processing CoDNaS database --- .../1BZF_A.hclusterRMSD.txt | 188 ++++++++++++++++++ .../CoDNaS_FOXS_RAW_single_conf/README | 35 ++++ .../CoDNaS_FOXS_RAW_single_conf/SAXS_to_pr.py | 38 ++++ .../batch_download_seq_new.sh | 93 +++++++++ .../CoDNaS_FOXS_RAW_single_conf/full_clean.sh | 78 ++++++++ 5 files changed, 432 insertions(+) create mode 100644 Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/1BZF_A.hclusterRMSD.txt create mode 100644 Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/README create mode 100644 Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/SAXS_to_pr.py create mode 100755 Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/batch_download_seq_new.sh create mode 100644 Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/full_clean.sh diff --git a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/1BZF_A.hclusterRMSD.txt b/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/1BZF_A.hclusterRMSD.txt new file mode 100644 index 0000000..9e12eed --- /dev/null +++ b/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/1BZF_A.hclusterRMSD.txt @@ -0,0 +1,188 @@ +PDB cluster +1BZF-1_A 1 +1BZF-2_A 1 +1BZF-3_A 1 +1BZF-4_A 1 +1BZF-5_A 1 +1BZF-6_A 1 +1BZF-7_A 1 +1BZF-8_A 1 +1BZF-9_A 1 +1BZF-10_A 1 +1BZF-11_A 1 +1BZF-12_A 1 +1BZF-13_A 1 +1BZF-14_A 1 +1BZF-15_A 1 +1BZF-16_A 1 +1BZF-17_A 1 +1BZF-18_A 1 +1BZF-19_A 1 +1BZF-20_A 1 +1BZF-21_A 1 +1BZF-22_A 1 +1DIU-1_A 2 +1DIU-2_A 2 +1DIU-3_A 2 +1DIU-4_A 2 +1DIU-5_A 2 +1DIU-6_A 2 +1DIU-7_A 2 +1DIU-8_A 2 +1DIU-9_A 2 +1DIU-10_A 2 +1DIU-11_A 2 +1DIU-12_A 2 +1DIU-13_A 2 +1DIU-14_A 2 +1DIU-15_A 2 +1DIU-16_A 2 +1DIU-17_A 2 +1DIU-18_A 2 +1DIS_A 2 +3DFR_A 2 +1LUD-1_A 3 +1LUD-2_A 3 +1LUD-3_A 3 +1LUD-4_A 3 +1LUD-5_A 3 +1LUD-6_A 3 +1LUD-7_A 3 +1LUD-8_A 3 +1LUD-9_A 3 +1LUD-10_A 3 +1LUD-11_A 3 +1LUD-12_A 3 +1LUD-13_A 3 +1LUD-14_A 3 +1LUD-15_A 3 +1LUD-16_A 3 +1LUD-17_A 3 +1LUD-18_A 3 +1LUD-19_A 3 +1LUD-20_A 3 +1LUD-21_A 3 +1LUD-22_A 3 +1LUD-23_A 3 +1LUD-24_A 3 +2L28-1_A 4 +2L28-8_A 4 +2L28-16_A 4 +2L28-2_A 5 +2L28-10_A 5 +2L28-13_A 5 +2L28-3_A 6 +2L28-4_A 7 +2L28-5_A 8 +2L28-24_A 8 +2L28-6_A 9 +2L28-18_A 9 +2L28-7_A 10 +2L28-12_A 10 +2L28-9_A 11 +2L28-11_A 12 +2L28-20_A 12 +2L28-14_A 13 +2L28-15_A 14 +2L28-17_A 15 +2L28-19_A 16 +2L28-21_A 17 +2L28-22_A 18 +2L28-23_A 19 +2L28-25_A 20 +1AO8-1_A 21 +1AO8-2_A 21 +1AO8-3_A 21 +1AO8-4_A 21 +1AO8-5_A 21 +1AO8-6_A 21 +1AO8-7_A 21 +1AO8-8_A 21 +1AO8-9_A 21 +1AO8-10_A 21 +1AO8-11_A 21 +1AO8-12_A 21 +1AO8-13_A 21 +1AO8-14_A 21 +1AO8-15_A 21 +1AO8-16_A 21 +1AO8-17_A 21 +1AO8-18_A 21 +1AO8-19_A 21 +1AO8-20_A 21 +1AO8-21_A 21 +2HQP-1_A 22 +2HQP-2_A 22 +2HQP-3_A 22 +2HQP-8_A 22 +2HQP-11_A 22 +2HQP-13_A 22 +2HQP-16_A 22 +2HQP-18_A 22 +2HQP-19_A 22 +2HQP-21_A 22 +2HQP-25_A 22 +2HQP-4_A 23 +2HQP-7_A 23 +2HQP-9_A 23 +2HQP-14_A 23 +2HQP-17_A 23 +2HQP-22_A 23 +2HQP-5_A 24 +2HQP-10_A 24 +2HQP-15_A 24 +2HQP-23_A 24 +2HQP-6_A 25 +2HQP-20_A 25 +2HQP-12_A 26 +2HQP-24_A 26 +2LF1-1_A 27 +2LF1-2_A 27 +2LF1-3_A 27 +2LF1-4_A 27 +2LF1-5_A 27 +2LF1-6_A 27 +2LF1-7_A 27 +2LF1-8_A 27 +2LF1-9_A 27 +2LF1-10_A 27 +2LF1-11_A 27 +2LF1-12_A 27 +2LF1-13_A 27 +2LF1-14_A 27 +2LF1-15_A 27 +2LF1-16_A 27 +2LF1-17_A 27 +2LF1-18_A 27 +2LF1-19_A 27 +2LF1-20_A 27 +2LF1-21_A 27 +2LF1-22_A 27 +2LF1-23_A 27 +2LF1-24_A 27 +2LF1-25_A 27 +2HM9-1_A 28 +2HM9-2_A 28 +2HM9-3_A 28 +2HM9-4_A 28 +2HM9-5_A 28 +2HM9-6_A 28 +2HM9-7_A 28 +2HM9-8_A 28 +2HM9-9_A 28 +2HM9-10_A 28 +2HM9-11_A 28 +2HM9-12_A 28 +2HM9-13_A 28 +2HM9-14_A 28 +2HM9-15_A 28 
+2HM9-16_A 28 +2HM9-17_A 28 +2HM9-18_A 28 +2HM9-19_A 28 +2HM9-20_A 28 +2HM9-21_A 28 +2HM9-22_A 28 +2HM9-23_A 28 +2HM9-24_A 28 +2HM9-25_A 28 diff --git a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/README b/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/README new file mode 100644 index 0000000..62341cf --- /dev/null +++ b/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/README @@ -0,0 +1,35 @@ +CoDNaS Auto Processing + +Description: +CoDNaS is a database comparing diverse conformation of the identical proteins. +It provides a good interface for pairwise comparison but lack of useful APIs especially for downloading files. + +The best approach for now is to manually download a tsv file containing the PDB ID of all conformers for a single protein. +Thus, this script is used to automatically download the PDB from RCSB based on the list and then covert them to SAXS curve and P(r) curve with FOXS and RAW APIs. + +Problems: +1.FOXS will add the hydration layer to the PDB and then calculated the SAXS curve. This may eliminate some difference between conformations +2.Each PDB may contain multiple conformers which is similar to each other. I am trying to extract them and generate individual SAXS curve which is still under testing. + +Files: +1BZF_A.hclusterRMSD.txt - raw data downloaded from CoDNaS database +full_clean.sh - main script +batch_download_seq_new.sh - modified RCSB script to download PDB and sequence files +SAXS_to_pr.py - convert the PDB file to SAXS and P(r) + +Dependency: +Anaconda - +Request python library - +RAW API - +FOXS - + +Usage: +bash full_clean.sh +-f : the input file containing a list of pdb files downloaded from CoDNaS +-o : the output dir + +Example +mkdir test +bash full_clean.sh -f ./1BZF_A.hclusterRMSD.txt -o ./test + +The script will create 4 folders ./test/pdb ./test/sequence ./test/saxs_q (raw SAXS) ./test/saxs_r (P(r) in csv) \ No newline at end of file diff --git a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/SAXS_to_pr.py b/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/SAXS_to_pr.py new file mode 100644 index 0000000..9fdc5ac --- /dev/null +++ b/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/SAXS_to_pr.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +""" +Author: Feng Yu. 
LBNL + +Convert the SAXS file to P(r) curves using thge RAW API + +First version created on 01/04/2024 +""" + +import bioxtasraw.RAWAPI as raw +import argparse +import pandas as pd +import os + +parser = argparse.ArgumentParser( + prog = 'SAXS2PR', + description = 'Convert the SAXS file to P(r) curves using thge RAW API' ) +parser.add_argument('-f', '--filename', required=True) +parser.add_argument('-o', '--output', default='./') + +args = parser.parse_args() + +profiles_name=args.filename + +# Load SAXS file (.dat format) +profiles = raw.load_profiles([profiles_name]) + +gi_profile=profiles[0] + +# Convert SAXS file with Inverse Fourier Transform +gi_bift = raw.bift(gi_profile)[0] + +# Save the radius and P(r) to csv file +output_pd=pd.DataFrame({'r':gi_bift.r, 'P(r)':gi_bift.p},columns=['r','P(r)']) + +output_loc=args.output +output_pd.to_csv(os.path.join(args.output,os.path.basename(profiles_name).split('.')[0]+".csv"),index=False) + diff --git a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/batch_download_seq_new.sh b/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/batch_download_seq_new.sh new file mode 100755 index 0000000..fa5688c --- /dev/null +++ b/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/batch_download_seq_new.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +# Script to download files from RCSB http file download services. +# Use the -h switch to get help on usage. + +if ! command -v curl &> /dev/null +then + echo "'curl' could not be found. You need to install 'curl' for this script to work." + exit 1 +fi + +PROGNAME=$0 +BASE_URL="https://files.rcsb.org/download" +BASE_URL_SEQ="https://www.rcsb.org/fasta/entry" + +usage() { + cat << EOF >&2 +Usage: $PROGNAME -f [-o ] [-c] [-p] + modified from RCSB script + -f : the input file containing a comma-separated list of PDB ids + -o : the output dir, default: current dir + -c : download a cif.gz file for each PDB id and store in the pdb subfolder + -p : download a pdb.gz file for each PDB id (not available for large structures) and store in the pdb subfolder + -q : download a .fasta file for each PDB id (sequence) and sotre in the sequence subfolder +EOF + exit 1 +} + +download() { + url="$BASE_URL/$1" + out=$2/$1 + echo "Downloading $url to $out" + curl -s -f $url -o $out || echo "Failed to download $url" +} + +download_seq() { + url="$BASE_URL_SEQ/$1" + out="$2/$1.fasta" + echo "Downloading $url to $out" + curl -s -f $url -o $out || echo "Failed to download $url" +} + +listfile="" +outdir="." 
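+# Defaults for the command-line options; the getopts loop below overrides them:
+#   -f  comma-separated list of PDB ids (required)    -o  output directory
+#   -c  download cif.gz    -p  download pdb.gz    -q  download the fasta sequence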
+cif=false +pdb=false +seq=false +while getopts f:o:cpq op +do + case $op in + (f) listfile=$OPTARG;; + (o) outdir=$OPTARG;; + (c) cif=true;; + (p) pdb=true;; + (q) seq=true;; + (*) usage + esac +done +shift "$((OPTIND - 1))" + +if [ "$listfile" == "" ] +then + echo "Parameter -f must be provided" + exit 1 +fi +contents=$(cat $listfile) + +# see https://stackoverflow.com/questions/918886/how-do-i-split-a-string-on-a-delimiter-in-bash#tab-top +IFS=',' read -ra tokens <<< "$contents" + +for token in "${tokens[@]}" +do + if [ "$cif" == true ] + then + download ${token}.cif.gz ${outdir}/pdb + fi + if [ "$pdb" == true ] + then + download ${token}.pdb.gz ${outdir}/pdb + fi + if [ "$seq" == true ] + then + download_seq ${token} ${outdir}/sequence + fi +done + + + + + + + + diff --git a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/full_clean.sh b/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/full_clean.sh new file mode 100644 index 0000000..7f96ffe --- /dev/null +++ b/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/full_clean.sh @@ -0,0 +1,78 @@ +#!/bin/bash + +PROGNAME=$0 + +usage() { + cat << EOF >&2 +Usage: $PROGNAME -f -o + +-f : the input file containing a list of pdb files downloaded from CoDNaS +-o : the output dir +EOF + exit 1 +} + +filename="" +outdir="" + +while getopts f:o: options +do + case $options in + (f) filename=$OPTARG;; + (o) outdir=$OPTARG;; + (*) usage + esac +done + +if [ "$filename" == "" ] +then + echo "Parameter -f must be provided" + exit 1 +fi + +if [ "$outdir" == "" ] +then + echo "Parameter -o must be provided" + exit 1 +else + echo $outdir + awk 'NR>1 { print substr($1,1,4)}' ${filename} | sort -u |tr '\n' ' ' | sed '$s/ $/\n/'| tr ' ' ',' > ${outdir}/clean_pdb.output +fi + +pdb_loc=${outdir}/pdb +seq_loc=${outdir}/sequence +saxs_loc=${outdir}/saxs_q +pr_loc=${outdir}/saxs_r + +mkdir ${pdb_loc} +echo "created PDB storage folder at ${pdb_loc}" +mkdir ${seq_loc} +echo "created sequence storage folder at ${seq_loc}" + +./batch_download_seq_new.sh -f ${outdir}/clean_pdb.output -o ${outdir} -p -q + +mkdir ${saxs_loc} +echo "created SAXS measurement storage folder at ${saxs_loc}" +mkdir ${pr_loc} +echo "created P(r) storage folder at ${pr_loc}" + +full_pdb_list=$(cat ${outdir}/clean_pdb.output) + +IFS=',' read -ra pdbs <<< "$full_pdb_list" + +for pdb in ${pdbs[@]} +do + if [ ! 
-f ${pdb_loc}/${pdb}.pdb.gz ] + then + echo no pdb file for $pdb + else + gunzip -c ${pdb_loc}/${pdb}.pdb.gz > ${pdb_loc}/${pdb}.pdb + rm ${pdb_loc}/${pdb}.pdb.gz + foxs ${pdb_loc}/${pdb}.pdb + mv ${pdb_loc}/${pdb}.pdb.dat ${saxs_loc}/${pdb}.dat + python SAXS_to_pr.py -f ${saxs_loc}/${pdb}.dat -o ${pr_loc} + fi +done + +rm ${outdir}/clean_pdb.output + From 4b5c00c39f6b4535d0d7bdacea890b4ba95f9ae4 Mon Sep 17 00:00:00 2001 From: Feng Yu Date: Thu, 25 Jan 2024 10:38:41 -0800 Subject: [PATCH 02/15] Added Slurm script for processing PDB70 dataset on NERSC The url of the PDB70 database https://zenodo.org/records/7953087 --- .DS_Store | Bin 0 -> 6148 bytes PDB70_NERSC/PDBtoSeq.py | 31 ++++++++++++++ PDB70_NERSC/fixpdb_pr_parallel_v2.bash | 57 +++++++++++++++++++++++++ PDB70_NERSC/parallel_PDB70.slurm | 12 ++++++ 4 files changed, 100 insertions(+) create mode 100644 .DS_Store create mode 100644 PDB70_NERSC/PDBtoSeq.py create mode 100644 PDB70_NERSC/fixpdb_pr_parallel_v2.bash create mode 100644 PDB70_NERSC/parallel_PDB70.slurm diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..554bcac2fac44d5bd6bc7cec4b94df7348451c54 GIT binary patch literal 6148 zcmeHKO-sW-5PjPQjCiSd@!(;9fzXrJSfXBqUc9Mkr4?IKOg##@dG_ot5PygGs|4Td zE^R_o>P3po!0g-1M|SrmWHJCu?|#??Isjs-U~7}Z7b1SqhO~l3oJQ_38Fl-Ed^&4H zs$pFjkY^WRA0u32UVna1H&x#>Pt$CYXSBsR$Iw4&FCj++J5$Z<;Sp@w?30`EV-dQ&9$# z0cBua8DPy8iMKrJs0=6r%D{pF`94^xU=pzO=sq1BTnj*qXm-N6^b(R|1114WkMvNC zPbKZlAT19b-4{<0(W|LFVv zzn-L5%78NPuNW{<+E05tQmCzs!%3~R)C;PJ#FZWwDV(@c%vdSKyHqE%TbU4(fTc%T QDEcGdXwX3!_)`Wx0SdQRl>h($ literal 0 HcmV?d00001 diff --git a/PDB70_NERSC/PDBtoSeq.py b/PDB70_NERSC/PDBtoSeq.py new file mode 100644 index 0000000..f1ffe5e --- /dev/null +++ b/PDB70_NERSC/PDBtoSeq.py @@ -0,0 +1,31 @@ +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord +import argparse +import os + +def extract_seq(pdb_input,output_path): + pdb_name=pdb_input.split(".")[0] + counter=1 + for record in SeqIO.parse(pdb_input,"pdb-atom"): + if counter > 1: + raise ValueError("More than 1 Chain is in the file {}".format(pdb_input)) + else: + new_seq_record = SeqRecord(record.seq, id=pdb_name, description='') + SeqIO.write(new_seq_record, output_path ,"fasta") + counter+=1 + +def main(): + parser = argparse.ArgumentParser( + prog = 'PDBtoSeq', + description = '''Extract Sequence from PDB using the BioPython API. 
+ The output will be stored at the current directory as a fasta file.''' ) + + parser.add_argument('-f', '--filename', required=True) + parser.add_argument('-o', '--output', default='./') + + args = parser.parse_args() + + extract_seq(args.filename,args.output) + +if __name__ == '__main__': + main() diff --git a/PDB70_NERSC/fixpdb_pr_parallel_v2.bash b/PDB70_NERSC/fixpdb_pr_parallel_v2.bash new file mode 100644 index 0000000..9d6437c --- /dev/null +++ b/PDB70_NERSC/fixpdb_pr_parallel_v2.bash @@ -0,0 +1,57 @@ +#!/bin/bash + +PROGNAME=$0 + +usage() { + cat << EOF >&2 +Usage: $PROGNAME -f -o + +-f : the input file containing a list of pdb files (can be obtained using ls > file_name.txt) +-o : the output dir +EOF + exit 1 +} + +file="" +outdir="" + +while getopts f:o: options +do + case $options in + (f) file=$OPTARG;; + (o) outdir=$OPTARG;; + (*) usage + esac +done + +if [ "$file" == "" ] +then + echo "Parameter -f must be provided" + exit 1 +fi + +if [ "$outdir" == "" ] +then + echo "Parameter -o must be provided" + exit 1 +fi + +pdb_loc=${outdir}/pdb +seq_loc=${outdir}/sequence +pr_loc=${outdir}/saxs_r + +#mkdir -p ${pdb_loc} +#echo "created PDB storage folder at ${pdb_loc}" +#mkdir -p ${seq_loc} +#echo "created sequence storage folder at ${seq_loc}" +#mkdir -p ${pr_loc} +#echo "created saxs P(r) storage folder at ${pr_loc}" + +if [ -f "$file" ]; then + echo ${file} + filename=$(basename "$file") + echo ${filename} + pdbfixer ${file} --output "${pdb_loc}/fixed_${filename}" --keep-heterogens=none --replace-nonstandard --add-residues --verbose + calc-pr "${pdb_loc}/fixed_${filename}" -o "${pr_loc}/${filename}.pr.csv" + python PDBtoSeq.py -f "${pdb_loc}/fixed_${filename}" -o "${seq_loc}/${filename}.fasta" +fi diff --git a/PDB70_NERSC/parallel_PDB70.slurm b/PDB70_NERSC/parallel_PDB70.slurm new file mode 100644 index 0000000..2890bce --- /dev/null +++ b/PDB70_NERSC/parallel_PDB70.slurm @@ -0,0 +1,12 @@ +#!/bin/bash +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=4 +#SBATCH --time=00:05:00 +#SBATCH --constraint=cpu +#SBATCH --qos=shared +#SBATCH --account=m3513 + +module load python +conda activate /pscratch/sd/l/lemonboy/PDB_prep +parallel -j 4 bash fixpdb_pr_parallel.bash -f {} -o . 
::: *.pdb From 8b073d685f94623251b127f75a2212a37d3c213b Mon Sep 17 00:00:00 2001 From: Feng Yu Date: Thu, 25 Jan 2024 10:41:43 -0800 Subject: [PATCH 03/15] Delete .DS_Store --- .DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 554bcac2fac44d5bd6bc7cec4b94df7348451c54..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKO-sW-5PjPQjCiSd@!(;9fzXrJSfXBqUc9Mkr4?IKOg##@dG_ot5PygGs|4Td zE^R_o>P3po!0g-1M|SrmWHJCu?|#??Isjs-U~7}Z7b1SqhO~l3oJQ_38Fl-Ed^&4H zs$pFjkY^WRA0u32UVna1H&x#>Pt$CYXSBsR$Iw4&FCj++J5$Z<;Sp@w?30`EV-dQ&9$# z0cBua8DPy8iMKrJs0=6r%D{pF`94^xU=pzO=sq1BTnj*qXm-N6^b(R|1114WkMvNC zPbKZlAT19b-4{<0(W|LFVv zzn-L5%78NPuNW{<+E05tQmCzs!%3~R)C;PJ#FZWwDV(@c%vdSKyHqE%TbU4(fTc%T QDEcGdXwX3!_)`Wx0SdQRl>h($ From 0fb09933d765469b774016056173a1d6abf12642 Mon Sep 17 00:00:00 2001 From: Feng Yu Date: Thu, 25 Jan 2024 13:35:46 -0800 Subject: [PATCH 04/15] Update PDB70_NERSC/parallel_PDB70.slurm Co-authored-by: Andrew Tritt --- PDB70_NERSC/parallel_PDB70.slurm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PDB70_NERSC/parallel_PDB70.slurm b/PDB70_NERSC/parallel_PDB70.slurm index 2890bce..f788014 100644 --- a/PDB70_NERSC/parallel_PDB70.slurm +++ b/PDB70_NERSC/parallel_PDB70.slurm @@ -9,4 +9,4 @@ module load python conda activate /pscratch/sd/l/lemonboy/PDB_prep -parallel -j 4 bash fixpdb_pr_parallel.bash -f {} -o . ::: *.pdb +ls *.pdb | parallel -j 4 bash fixpdb_pr_parallel.bash -f {} -o . ::: From 099284e66b39c882c1cd5ba97fe8a9b081168305 Mon Sep 17 00:00:00 2001 From: Feng Yu Date: Thu, 25 Jan 2024 15:31:03 -0800 Subject: [PATCH 05/15] Added a scripts folder to store individual analyze scripts --- .../Extract_Pr_CoDNaS}/1BZF_A.hclusterRMSD.txt | 0 .../Extract_Pr_CoDNaS/CoD_calc_main.sh | 11 ++++++++++- .../Extract_Pr_CoDNaS}/README | 2 +- .../Extract_Pr_CoDNaS}/SAXS_to_pr.py | 4 ++-- .../Extract_Pr_CoDNaS/batch_download_seq.sh | 2 +- {PDB70_NERSC => scripts/PDB70_NERSC}/PDBtoSeq.py | 0 .../PDB70_NERSC}/fixpdb_pr_parallel_v2.bash | 0 .../PDB70_NERSC}/parallel_PDB70.slurm | 3 ++- 8 files changed, 16 insertions(+), 6 deletions(-) rename {Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf => scripts/Extract_Pr_CoDNaS}/1BZF_A.hclusterRMSD.txt (100%) rename Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/full_clean.sh => scripts/Extract_Pr_CoDNaS/CoD_calc_main.sh (73%) rename {Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf => scripts/Extract_Pr_CoDNaS}/README (93%) rename {Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf => scripts/Extract_Pr_CoDNaS}/SAXS_to_pr.py (92%) rename Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/batch_download_seq_new.sh => scripts/Extract_Pr_CoDNaS/batch_download_seq.sh (98%) rename {PDB70_NERSC => scripts/PDB70_NERSC}/PDBtoSeq.py (100%) rename {PDB70_NERSC => scripts/PDB70_NERSC}/fixpdb_pr_parallel_v2.bash (100%) rename {PDB70_NERSC => scripts/PDB70_NERSC}/parallel_PDB70.slurm (92%) diff --git a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/1BZF_A.hclusterRMSD.txt b/scripts/Extract_Pr_CoDNaS/1BZF_A.hclusterRMSD.txt similarity index 100% rename from Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/1BZF_A.hclusterRMSD.txt rename to scripts/Extract_Pr_CoDNaS/1BZF_A.hclusterRMSD.txt diff --git a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/full_clean.sh b/scripts/Extract_Pr_CoDNaS/CoD_calc_main.sh similarity index 73% rename from Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/full_clean.sh rename to 
scripts/Extract_Pr_CoDNaS/CoD_calc_main.sh index 7f96ffe..3c657b2 100644 --- a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/full_clean.sh +++ b/scripts/Extract_Pr_CoDNaS/CoD_calc_main.sh @@ -1,4 +1,13 @@ #!/bin/bash +# Author: Feng Yu fyu2@lbl.gov 2024 + +# CoDNaS is a protein conformational database based of entire proteins as derived from PDB. +# For each represented protein, the database contains the redundant collection of all corresponding different structures. + +# This script is used to calculate SAXS curve for PDBs based on lists of PDBs downloading from the CoDNaS server. +# This script will generate SAXS curve, P(r) curve , extract sequences and download the PDBs for each PDB entry. + + PROGNAME=$0 @@ -49,7 +58,7 @@ echo "created PDB storage folder at ${pdb_loc}" mkdir ${seq_loc} echo "created sequence storage folder at ${seq_loc}" -./batch_download_seq_new.sh -f ${outdir}/clean_pdb.output -o ${outdir} -p -q +./batch_download_seq.sh -f ${outdir}/clean_pdb.output -o ${outdir} -p -q mkdir ${saxs_loc} echo "created SAXS measurement storage folder at ${saxs_loc}" diff --git a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/README b/scripts/Extract_Pr_CoDNaS/README similarity index 93% rename from Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/README rename to scripts/Extract_Pr_CoDNaS/README index 62341cf..10d9fe2 100644 --- a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/README +++ b/scripts/Extract_Pr_CoDNaS/README @@ -5,7 +5,7 @@ CoDNaS is a database comparing diverse conformation of the identical proteins. It provides a good interface for pairwise comparison but lack of useful APIs especially for downloading files. The best approach for now is to manually download a tsv file containing the PDB ID of all conformers for a single protein. -Thus, this script is used to automatically download the PDB from RCSB based on the list and then covert them to SAXS curve and P(r) curve with FOXS and RAW APIs. +Thus, this script is used to automatically download the PDB from RCSB based on the list and then convert them to SAXS curve and P(r) curve with FOXS and RAW APIs. Problems: 1.FOXS will add the hydration layer to the PDB and then calculated the SAXS curve. This may eliminate some difference between conformations diff --git a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/SAXS_to_pr.py b/scripts/Extract_Pr_CoDNaS/SAXS_to_pr.py similarity index 92% rename from Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/SAXS_to_pr.py rename to scripts/Extract_Pr_CoDNaS/SAXS_to_pr.py index 9fdc5ac..ac3c775 100644 --- a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/SAXS_to_pr.py +++ b/scripts/Extract_Pr_CoDNaS/SAXS_to_pr.py @@ -2,7 +2,7 @@ """ Author: Feng Yu. 
LBNL -Convert the SAXS file to P(r) curves using thge RAW API +Convert the SAXS file to P(r) curves using the RAW API First version created on 01/04/2024 """ @@ -14,7 +14,7 @@ parser = argparse.ArgumentParser( prog = 'SAXS2PR', - description = 'Convert the SAXS file to P(r) curves using thge RAW API' ) + description = 'Convert the SAXS file to P(r) curves using the RAW API' ) parser.add_argument('-f', '--filename', required=True) parser.add_argument('-o', '--output', default='./') diff --git a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/batch_download_seq_new.sh b/scripts/Extract_Pr_CoDNaS/batch_download_seq.sh similarity index 98% rename from Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/batch_download_seq_new.sh rename to scripts/Extract_Pr_CoDNaS/batch_download_seq.sh index fa5688c..27f454d 100755 --- a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/batch_download_seq_new.sh +++ b/scripts/Extract_Pr_CoDNaS/batch_download_seq.sh @@ -21,7 +21,7 @@ Usage: $PROGNAME -f [-o ] [-c] [-p] -o : the output dir, default: current dir -c : download a cif.gz file for each PDB id and store in the pdb subfolder -p : download a pdb.gz file for each PDB id (not available for large structures) and store in the pdb subfolder - -q : download a .fasta file for each PDB id (sequence) and sotre in the sequence subfolder + -q : download a .fasta file for each PDB id (sequence) and store in the sequence subfolder EOF exit 1 } diff --git a/PDB70_NERSC/PDBtoSeq.py b/scripts/PDB70_NERSC/PDBtoSeq.py similarity index 100% rename from PDB70_NERSC/PDBtoSeq.py rename to scripts/PDB70_NERSC/PDBtoSeq.py diff --git a/PDB70_NERSC/fixpdb_pr_parallel_v2.bash b/scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash similarity index 100% rename from PDB70_NERSC/fixpdb_pr_parallel_v2.bash rename to scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash diff --git a/PDB70_NERSC/parallel_PDB70.slurm b/scripts/PDB70_NERSC/parallel_PDB70.slurm similarity index 92% rename from PDB70_NERSC/parallel_PDB70.slurm rename to scripts/PDB70_NERSC/parallel_PDB70.slurm index f788014..d4561a4 100644 --- a/PDB70_NERSC/parallel_PDB70.slurm +++ b/scripts/PDB70_NERSC/parallel_PDB70.slurm @@ -8,5 +8,6 @@ #SBATCH --account=m3513 module load python +module load parallel conda activate /pscratch/sd/l/lemonboy/PDB_prep -ls *.pdb | parallel -j 4 bash fixpdb_pr_parallel.bash -f {} -o . ::: +ls *.pdb | parallel -j 4 bash fixpdb_pr_parallel.bash -f {} -o . 
::: \ No newline at end of file From 858e407021fa5a9d46905c47d25fd1f799d9750c Mon Sep 17 00:00:00 2001 From: Feng Yu Date: Thu, 25 Jan 2024 15:33:47 -0800 Subject: [PATCH 06/15] Added a scripts folder to store individual analyze scripts --- .../Extract_Pr_CoDNaS}/1BZF_A.hclusterRMSD.txt | 0 .../Extract_Pr_CoDNaS/CoD_calc_main.sh | 11 ++++++++++- .../Extract_Pr_CoDNaS}/README | 2 +- .../Extract_Pr_CoDNaS}/SAXS_to_pr.py | 4 ++-- .../Extract_Pr_CoDNaS/batch_download_seq.sh | 2 +- {PDB70_NERSC => scripts/PDB70_NERSC}/PDBtoSeq.py | 2 +- .../PDB70_NERSC}/fixpdb_pr_parallel_v2.bash | 0 .../PDB70_NERSC}/parallel_PDB70.slurm | 3 ++- 8 files changed, 17 insertions(+), 7 deletions(-) rename {Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf => scripts/Extract_Pr_CoDNaS}/1BZF_A.hclusterRMSD.txt (100%) rename Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/full_clean.sh => scripts/Extract_Pr_CoDNaS/CoD_calc_main.sh (73%) rename {Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf => scripts/Extract_Pr_CoDNaS}/README (93%) rename {Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf => scripts/Extract_Pr_CoDNaS}/SAXS_to_pr.py (92%) rename Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/batch_download_seq_new.sh => scripts/Extract_Pr_CoDNaS/batch_download_seq.sh (98%) rename {PDB70_NERSC => scripts/PDB70_NERSC}/PDBtoSeq.py (94%) rename {PDB70_NERSC => scripts/PDB70_NERSC}/fixpdb_pr_parallel_v2.bash (100%) rename {PDB70_NERSC => scripts/PDB70_NERSC}/parallel_PDB70.slurm (92%) diff --git a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/1BZF_A.hclusterRMSD.txt b/scripts/Extract_Pr_CoDNaS/1BZF_A.hclusterRMSD.txt similarity index 100% rename from Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/1BZF_A.hclusterRMSD.txt rename to scripts/Extract_Pr_CoDNaS/1BZF_A.hclusterRMSD.txt diff --git a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/full_clean.sh b/scripts/Extract_Pr_CoDNaS/CoD_calc_main.sh similarity index 73% rename from Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/full_clean.sh rename to scripts/Extract_Pr_CoDNaS/CoD_calc_main.sh index 7f96ffe..3c657b2 100644 --- a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/full_clean.sh +++ b/scripts/Extract_Pr_CoDNaS/CoD_calc_main.sh @@ -1,4 +1,13 @@ #!/bin/bash +# Author: Feng Yu fyu2@lbl.gov 2024 + +# CoDNaS is a protein conformational database based of entire proteins as derived from PDB. +# For each represented protein, the database contains the redundant collection of all corresponding different structures. + +# This script is used to calculate SAXS curve for PDBs based on lists of PDBs downloading from the CoDNaS server. +# This script will generate SAXS curve, P(r) curve , extract sequences and download the PDBs for each PDB entry. + + PROGNAME=$0 @@ -49,7 +58,7 @@ echo "created PDB storage folder at ${pdb_loc}" mkdir ${seq_loc} echo "created sequence storage folder at ${seq_loc}" -./batch_download_seq_new.sh -f ${outdir}/clean_pdb.output -o ${outdir} -p -q +./batch_download_seq.sh -f ${outdir}/clean_pdb.output -o ${outdir} -p -q mkdir ${saxs_loc} echo "created SAXS measurement storage folder at ${saxs_loc}" diff --git a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/README b/scripts/Extract_Pr_CoDNaS/README similarity index 93% rename from Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/README rename to scripts/Extract_Pr_CoDNaS/README index 62341cf..10d9fe2 100644 --- a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/README +++ b/scripts/Extract_Pr_CoDNaS/README @@ -5,7 +5,7 @@ CoDNaS is a database comparing diverse conformation of the identical proteins. 
It provides a good interface for pairwise comparison but lack of useful APIs especially for downloading files. The best approach for now is to manually download a tsv file containing the PDB ID of all conformers for a single protein. -Thus, this script is used to automatically download the PDB from RCSB based on the list and then covert them to SAXS curve and P(r) curve with FOXS and RAW APIs. +Thus, this script is used to automatically download the PDB from RCSB based on the list and then convert them to SAXS curve and P(r) curve with FOXS and RAW APIs. Problems: 1.FOXS will add the hydration layer to the PDB and then calculated the SAXS curve. This may eliminate some difference between conformations diff --git a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/SAXS_to_pr.py b/scripts/Extract_Pr_CoDNaS/SAXS_to_pr.py similarity index 92% rename from Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/SAXS_to_pr.py rename to scripts/Extract_Pr_CoDNaS/SAXS_to_pr.py index 9fdc5ac..ac3c775 100644 --- a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/SAXS_to_pr.py +++ b/scripts/Extract_Pr_CoDNaS/SAXS_to_pr.py @@ -2,7 +2,7 @@ """ Author: Feng Yu. LBNL -Convert the SAXS file to P(r) curves using thge RAW API +Convert the SAXS file to P(r) curves using the RAW API First version created on 01/04/2024 """ @@ -14,7 +14,7 @@ parser = argparse.ArgumentParser( prog = 'SAXS2PR', - description = 'Convert the SAXS file to P(r) curves using thge RAW API' ) + description = 'Convert the SAXS file to P(r) curves using the RAW API' ) parser.add_argument('-f', '--filename', required=True) parser.add_argument('-o', '--output', default='./') diff --git a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/batch_download_seq_new.sh b/scripts/Extract_Pr_CoDNaS/batch_download_seq.sh similarity index 98% rename from Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/batch_download_seq_new.sh rename to scripts/Extract_Pr_CoDNaS/batch_download_seq.sh index fa5688c..27f454d 100755 --- a/Feng_bash_scripts/CoDNaS_FOXS_RAW_single_conf/batch_download_seq_new.sh +++ b/scripts/Extract_Pr_CoDNaS/batch_download_seq.sh @@ -21,7 +21,7 @@ Usage: $PROGNAME -f [-o ] [-c] [-p] -o : the output dir, default: current dir -c : download a cif.gz file for each PDB id and store in the pdb subfolder -p : download a pdb.gz file for each PDB id (not available for large structures) and store in the pdb subfolder - -q : download a .fasta file for each PDB id (sequence) and sotre in the sequence subfolder + -q : download a .fasta file for each PDB id (sequence) and store in the sequence subfolder EOF exit 1 } diff --git a/PDB70_NERSC/PDBtoSeq.py b/scripts/PDB70_NERSC/PDBtoSeq.py similarity index 94% rename from PDB70_NERSC/PDBtoSeq.py rename to scripts/PDB70_NERSC/PDBtoSeq.py index f1ffe5e..ae22cae 100644 --- a/PDB70_NERSC/PDBtoSeq.py +++ b/scripts/PDB70_NERSC/PDBtoSeq.py @@ -4,7 +4,7 @@ import os def extract_seq(pdb_input,output_path): - pdb_name=pdb_input.split(".")[0] + pdb_name=os.path.basename(pdb_input).split(".")[0] counter=1 for record in SeqIO.parse(pdb_input,"pdb-atom"): if counter > 1: diff --git a/PDB70_NERSC/fixpdb_pr_parallel_v2.bash b/scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash similarity index 100% rename from PDB70_NERSC/fixpdb_pr_parallel_v2.bash rename to scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash diff --git a/PDB70_NERSC/parallel_PDB70.slurm b/scripts/PDB70_NERSC/parallel_PDB70.slurm similarity index 92% rename from PDB70_NERSC/parallel_PDB70.slurm rename to scripts/PDB70_NERSC/parallel_PDB70.slurm index f788014..d4561a4 100644 --- 
a/PDB70_NERSC/parallel_PDB70.slurm +++ b/scripts/PDB70_NERSC/parallel_PDB70.slurm @@ -8,5 +8,6 @@ #SBATCH --account=m3513 module load python +module load parallel conda activate /pscratch/sd/l/lemonboy/PDB_prep -ls *.pdb | parallel -j 4 bash fixpdb_pr_parallel.bash -f {} -o . ::: +ls *.pdb | parallel -j 4 bash fixpdb_pr_parallel.bash -f {} -o . ::: \ No newline at end of file From fe1132c04da723fbedb135bd75f897caeb381b5d Mon Sep 17 00:00:00 2001 From: Feng Yu Date: Thu, 25 Jan 2024 17:31:13 -0800 Subject: [PATCH 07/15] Added PDBtoSeq as a function to utils.py and a command to commands.py --- pyproject.toml | 1 + scripts/PDB70_NERSC/PDBtoSeq.py | 4 ++-- src/metfish/commands.py | 14 ++++++++++++++ src/metfish/utils.py | 24 ++++++++++++++++++++++++ 4 files changed, 41 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 126a97e..868745a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,7 @@ dynamic = ["version"] [project.scripts] calc-pr = "metfish.commands:get_Pr_cli" +extract-seq = "metfish.commands:extract_seq_cli" [tool.ruff] # Exclude a variety of commonly ignored directories. diff --git a/scripts/PDB70_NERSC/PDBtoSeq.py b/scripts/PDB70_NERSC/PDBtoSeq.py index ae22cae..a076e81 100644 --- a/scripts/PDB70_NERSC/PDBtoSeq.py +++ b/scripts/PDB70_NERSC/PDBtoSeq.py @@ -20,8 +20,8 @@ def main(): description = '''Extract Sequence from PDB using the BioPython API. The output will be stored at the current directory as a fasta file.''' ) - parser.add_argument('-f', '--filename', required=True) - parser.add_argument('-o', '--output', default='./') + parser.add_argument('-f', '--filename', required=True, help='input pdb file name') + parser.add_argument('-o', '--output', required=True, help='output fasta file name') args = parser.parse_args() diff --git a/src/metfish/commands.py b/src/metfish/commands.py index 80dde5f..d026a44 100644 --- a/src/metfish/commands.py +++ b/src/metfish/commands.py @@ -5,6 +5,7 @@ import pandas as pd from .utils import get_Pr +from .utils import extract_seq def get_Pr_cli(argv=None): @@ -40,3 +41,16 @@ def get_Pr_cli(argv=None): step=args.step) pd.DataFrame({"r": r, "P(r)": p}).to_csv(out, index=False) + +def extract_seq_cli(argv=None): + + parser = argparse.ArgumentParser( + description = '''Extract Sequence from PDB using the BioPython API. + The output will be stored at the current directory as a fasta file.''' ) + + parser.add_argument('-f', '--filename', required=True, help="input pdb file") + parser.add_argument('-o', '--output', required=True, help="output fasta file path + name") + + args = parser.parse_args() + + extract_seq(args.filename,args.output) diff --git a/src/metfish/utils.py b/src/metfish/utils.py index 10202b2..7cd9c60 100644 --- a/src/metfish/utils.py +++ b/src/metfish/utils.py @@ -6,6 +6,8 @@ import numpy as np from periodictable import elements from scipy.spatial.distance import pdist, squareform +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord n_elec_df = {el.symbol: el.number for el in elements} @@ -69,3 +71,25 @@ def get_Pr(structure, structure_id="", dmax=None, step=0.5): p = np.concatenate(([0], hist / hist.sum())) return r, p + +def extract_seq(pdb_input,output_path): + """ + Args: + pdb_input : The path to the PDB to extract sequence. + The PDB must only contain a single chain. + pdbfixer is a good tool to prepare PDB for this function + output_path : The path to store the output fasta file. + Example: /location/to/store/PDB.fasta + Returns: + There is no return for this function. 
The sequence will be written + as a fasta file in the give location. + """ + pdb_name=os.path.basename(pdb_input).split(".")[0] + counter=1 + for record in SeqIO.parse(pdb_input,"pdb-atom"): + if counter > 1: + raise ValueError("More than 1 Chain is in the file {}".format(pdb_input)) + else: + new_seq_record = SeqRecord(record.seq, id=pdb_name, description='') + SeqIO.write(new_seq_record, output_path ,"fasta") + counter+=1 \ No newline at end of file From 70a1138c05d9b39df12d42a5ee9e372cd045133a Mon Sep 17 00:00:00 2001 From: Feng Yu Date: Wed, 31 Jan 2024 13:57:38 -0800 Subject: [PATCH 08/15] Update scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash Co-authored-by: Andrew Tritt --- scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash b/scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash index 9d6437c..dde400f 100644 --- a/scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash +++ b/scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash @@ -54,4 +54,5 @@ if [ -f "$file" ]; then pdbfixer ${file} --output "${pdb_loc}/fixed_${filename}" --keep-heterogens=none --replace-nonstandard --add-residues --verbose calc-pr "${pdb_loc}/fixed_${filename}" -o "${pr_loc}/${filename}.pr.csv" python PDBtoSeq.py -f "${pdb_loc}/fixed_${filename}" -o "${seq_loc}/${filename}.fasta" + echo ${file} >> complete.txt fi From a8607f1c009036b939b6953e76b0fa5203fed17e Mon Sep 17 00:00:00 2001 From: Feng Yu Date: Wed, 31 Jan 2024 13:58:13 -0800 Subject: [PATCH 09/15] Update scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash Co-authored-by: Andrew Tritt --- scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash b/scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash index dde400f..64c3df5 100644 --- a/scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash +++ b/scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash @@ -48,6 +48,10 @@ pr_loc=${outdir}/saxs_r #echo "created saxs P(r) storage folder at ${pr_loc}" if [ -f "$file" ]; then + done=`grep -c ${file} complete.txt` + if [ "$done" -ne "0" ] ; then + return + fi echo ${file} filename=$(basename "$file") echo ${filename} From b8195642c73faacc9d34df274e70965d0999e8ec Mon Sep 17 00:00:00 2001 From: Feng Yu Date: Thu, 22 Feb 2024 17:06:31 -0800 Subject: [PATCH 10/15] add .DS_store to gitignore --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index c1fc984..c9d5296 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,4 @@ __pycache__ *.pyc *.egg-info/ *.swp - +.DS_Store From e7f600365874efee86594c62fb6b5da8ba3ee066 Mon Sep 17 00:00:00 2001 From: Feng Yu Date: Fri, 1 Mar 2024 14:55:27 -0800 Subject: [PATCH 11/15] Retrive full sequence from RCSB to fix the conformation of PDBs. 
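
get_full_seq.py retrieves the reference entry for a PDB chain from RCSB (pdb
format first, falling back to mmCIF), reads the full SEQRES sequence for that
chain, and pairwise-aligns the ATOM-derived sequence of the local file against
it with Bio.Align.PairwiseAligner. Reference residues that extend beyond the
resolved N- or C-terminus are trimmed, and the result is written as
fixed_<name>.fasta for pdbfixer to use when rebuilding the missing residues.
The input file name is expected to follow the <PDBcode>_<chain>.pdb
convention; for example (illustrative file and folder names):

    python get_full_seq.py -f 1ABC_A.pdb -o ./ref_seq
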
--- scripts/PDB70_NERSC/get_full_seq.py | 104 ++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 scripts/PDB70_NERSC/get_full_seq.py diff --git a/scripts/PDB70_NERSC/get_full_seq.py b/scripts/PDB70_NERSC/get_full_seq.py new file mode 100644 index 0000000..c1dfbcc --- /dev/null +++ b/scripts/PDB70_NERSC/get_full_seq.py @@ -0,0 +1,104 @@ +# %% +import Bio.Align as Align +import Bio.SeqIO as SeqIO +from Bio.SeqRecord import SeqRecord +from Bio.PDB import PDBList +import argparse +import os + +# %% +def extract_ref_seq(input_pdb_path, ref_pdb_name, chain_ID): + pdbl = PDBList() + seqres_type="pdb-seqres" + # Retrive PDB reference (with full sequence) from RCSB + # Will not rewrite PDB if already downloaded + # Will download obsolete PBDS + ref_file_name=pdbl.retrieve_pdb_file(ref_pdb_name, file_format='pdb', obsolete=True , pdir=".") + if not os.path.exists(ref_file_name): + ref_file_name=pdbl.retrieve_pdb_file(ref_pdb_name, file_format='mmCif', pdir=".") + seqres_type="cif-seqres" + if not os.path.exists(ref_file_name): + raise ValueError("PDB doesn't exist in the database") + for record in SeqIO.parse(ref_file_name, seqres_type): + # Get the corresponding chain from the Reference PDB and extract sequence + if record.annotations["chain"] == chain_ID: + break + ref_seq=record.seq + print(ref_seq,seqres_type) + #os.remove(ref_file_name) + + counter=1 + # To prevent 2 chains in the "to be fixed" PDB file + for record2 in SeqIO.parse(input_pdb_path,"pdb-atom"): + if counter > 1: + raise ValueError("More than 1 Chain is in the file {}".format(input_pdb_path)) + else: + sequence_to_be_fix=record2.seq + counter+=1 + return sequence_to_be_fix, ref_seq + +# %% +# Align the ref sequence and the sequence to be fix +def align_fix(sequence_to_be_fix, ref_seq): + + aligner = Align.PairwiseAligner() + alignments = aligner.align(ref_seq, sequence_to_be_fix) + # Ignore the N-terminal or C-terminal loop of the ref seq + counter=0 + for i in alignments[0][1]: + if i == '-': + counter+=1 + continue + else: + break + start_ind=counter + + counter2=0 + for j in alignments[0][1][::-1]: + if j == '-': + counter2+=1 + continue + else: + break + + end_ind=len(ref_seq)-counter2 + + return(ref_seq[start_ind:end_ind]) + +# %% +# function to save the sequence to fast +def save_fasta(sequence, input_name ,output_path): + + new_seq_record = SeqRecord(sequence, id=input_name, description='') + SeqIO.write(new_seq_record, os.path.join(output_path,'fixed_' + input_name + '.fasta') ,"fasta") +# Get pdb name to save files +def extract_pdb_name(input_pdb_path): + + base_name_without_extension = os.path.splitext(os.path.basename(input_pdb_path))[0] + ref_pdb_name=base_name_without_extension.split('_')[0].lower() + chain_ID=base_name_without_extension.split('_')[1] + + return base_name_without_extension,ref_pdb_name, chain_ID +# the "entire workflow" +def extract_fixed_seq(input_pdb_path,output_path): + base_name_without_extension, ref_pdb_name, chain_ID = extract_pdb_name(input_pdb_path) + sequence_to_be_fix, ref_seq = extract_ref_seq(input_pdb_path, ref_pdb_name, chain_ID) + sequence = align_fix(sequence_to_be_fix, ref_seq) + save_fasta(sequence, base_name_without_extension ,output_path) +# Command line interface +def main(): + parser = argparse.ArgumentParser( + prog = 'FullSEQRES', + description = '''Extract the fill sequence of PDBs with missing loops + using the BioPython and BioPandas. 
+ The output will be stored at the output directory as a fasta file.''' ) + + parser.add_argument('-f', '--filename', required=True) + parser.add_argument('-o', '--output', default='./') + + args = parser.parse_args() + extract_fixed_seq(args.filename, args.output) + #print("The full sequence of {} is {}".format(args.filename,full_seq)) + +if __name__ == '__main__': + main() From d88c6c3e6cfd72291f9709236471f65d9432b8e6 Mon Sep 17 00:00:00 2001 From: Feng Yu Date: Fri, 1 Mar 2024 14:57:04 -0800 Subject: [PATCH 12/15] replace the old bash script with new script to fix the conformations. --- ...rallel_v2.bash => fixpdb_pr_parallel.bash} | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) rename scripts/PDB70_NERSC/{fixpdb_pr_parallel_v2.bash => fixpdb_pr_parallel.bash} (68%) mode change 100644 => 100755 diff --git a/scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash b/scripts/PDB70_NERSC/fixpdb_pr_parallel.bash old mode 100644 new mode 100755 similarity index 68% rename from scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash rename to scripts/PDB70_NERSC/fixpdb_pr_parallel.bash index 64c3df5..9c77b40 --- a/scripts/PDB70_NERSC/fixpdb_pr_parallel_v2.bash +++ b/scripts/PDB70_NERSC/fixpdb_pr_parallel.bash @@ -39,6 +39,7 @@ fi pdb_loc=${outdir}/pdb seq_loc=${outdir}/sequence pr_loc=${outdir}/saxs_r +ref_seq_loc=${outdir}/ref_seq #mkdir -p ${pdb_loc} #echo "created PDB storage folder at ${pdb_loc}" @@ -48,14 +49,22 @@ pr_loc=${outdir}/saxs_r #echo "created saxs P(r) storage folder at ${pr_loc}" if [ -f "$file" ]; then - done=`grep -c ${file} complete.txt` - if [ "$done" -ne "0" ] ; then - return - fi + done=`grep -c ${file} complete.txt` + echo $done + if [ "$done" -ne "0" ] ; then + exit + fi echo ${file} + filename=$(basename "$file") + prefix="${filename%.*}" + IFS='_' read -r PDB_code chain_ID <<< "$prefix" echo ${filename} - pdbfixer ${file} --output "${pdb_loc}/fixed_${filename}" --keep-heterogens=none --replace-nonstandard --add-residues --verbose + + python get_full_seq.py -f ${file} -o ${ref_seq_loc} + pdbfixer ${file} --output "${pdb_loc}/fixed_${filename}" \ + --keep-heterogens=none --replace-nonstandard --add-residues --add-atoms=all --verbose \ + --sequence "${ref_seq_loc}/fixed_${prefix}.fasta" --chain_id "${chain_ID}" calc-pr "${pdb_loc}/fixed_${filename}" -o "${pr_loc}/${filename}.pr.csv" python PDBtoSeq.py -f "${pdb_loc}/fixed_${filename}" -o "${seq_loc}/${filename}.fasta" echo ${file} >> complete.txt From 74dfc699a3d7a20617ed387c1115695571f54fc6 Mon Sep 17 00:00:00 2001 From: Feng Yu Date: Wed, 6 Mar 2024 11:07:30 -0800 Subject: [PATCH 13/15] Change the scripts to the actual scripts running on NERSC --- scripts/PDB70_NERSC/PDBtoSeq.py | 6 +++--- scripts/PDB70_NERSC/fixpdb_pr_parallel.bash | 16 ++++++++++++---- scripts/PDB70_NERSC/get_full_seq.py | 8 ++++---- scripts/PDB70_NERSC/parallel_PDB70.slurm | 8 ++++---- 4 files changed, 23 insertions(+), 15 deletions(-) mode change 100644 => 100755 scripts/PDB70_NERSC/PDBtoSeq.py diff --git a/scripts/PDB70_NERSC/PDBtoSeq.py b/scripts/PDB70_NERSC/PDBtoSeq.py old mode 100644 new mode 100755 index a076e81..f1ffe5e --- a/scripts/PDB70_NERSC/PDBtoSeq.py +++ b/scripts/PDB70_NERSC/PDBtoSeq.py @@ -4,7 +4,7 @@ import os def extract_seq(pdb_input,output_path): - pdb_name=os.path.basename(pdb_input).split(".")[0] + pdb_name=pdb_input.split(".")[0] counter=1 for record in SeqIO.parse(pdb_input,"pdb-atom"): if counter > 1: @@ -20,8 +20,8 @@ def main(): description = '''Extract Sequence from PDB using the BioPython API. 
The output will be stored at the current directory as a fasta file.''' ) - parser.add_argument('-f', '--filename', required=True, help='input pdb file name') - parser.add_argument('-o', '--output', required=True, help='output fasta file name') + parser.add_argument('-f', '--filename', required=True) + parser.add_argument('-o', '--output', default='./') args = parser.parse_args() diff --git a/scripts/PDB70_NERSC/fixpdb_pr_parallel.bash b/scripts/PDB70_NERSC/fixpdb_pr_parallel.bash index 9c77b40..f5485c0 100755 --- a/scripts/PDB70_NERSC/fixpdb_pr_parallel.bash +++ b/scripts/PDB70_NERSC/fixpdb_pr_parallel.bash @@ -6,7 +6,7 @@ usage() { cat << EOF >&2 Usage: $PROGNAME -f -o --f : the input file containing a list of pdb files (can be obtained using ls > file_name.txt) +-f : the input PDB file path -o : the output dir EOF exit 1 @@ -49,9 +49,9 @@ ref_seq_loc=${outdir}/ref_seq #echo "created saxs P(r) storage folder at ${pr_loc}" if [ -f "$file" ]; then - done=`grep -c ${file} complete.txt` + done=`grep -c ${file} ${outdir}/complete.txt` echo $done - if [ "$done" -ne "0" ] ; then + if [ "$done" -ne 0 ] ; then exit fi echo ${file} @@ -62,10 +62,18 @@ if [ -f "$file" ]; then echo ${filename} python get_full_seq.py -f ${file} -o ${ref_seq_loc} + if [ ! -f "${ref_seq_loc}/fixed_${prefix}.fasta" ]; then + echo "BASH_ref_seq_Error: File ${ref_seq_loc}/fixed_${prefix}.fasta does not exist." + exit + fi pdbfixer ${file} --output "${pdb_loc}/fixed_${filename}" \ --keep-heterogens=none --replace-nonstandard --add-residues --add-atoms=all --verbose \ --sequence "${ref_seq_loc}/fixed_${prefix}.fasta" --chain_id "${chain_ID}" + if [ ! -f "${pdb_loc}/fixed_${filename}" ]; then + echo "BASH_fixer_Error: File ${pdb_loc}/fixed_${filename} does not exist." + exit + fi calc-pr "${pdb_loc}/fixed_${filename}" -o "${pr_loc}/${filename}.pr.csv" python PDBtoSeq.py -f "${pdb_loc}/fixed_${filename}" -o "${seq_loc}/${filename}.fasta" - echo ${file} >> complete.txt + echo ${file} >> ${outdir}/complete.txt fi diff --git a/scripts/PDB70_NERSC/get_full_seq.py b/scripts/PDB70_NERSC/get_full_seq.py index c1dfbcc..f2730a7 100644 --- a/scripts/PDB70_NERSC/get_full_seq.py +++ b/scripts/PDB70_NERSC/get_full_seq.py @@ -10,12 +10,12 @@ def extract_ref_seq(input_pdb_path, ref_pdb_name, chain_ID): pdbl = PDBList() seqres_type="pdb-seqres" - # Retrive PDB reference (with full sequence) from RCSB + # Retrieve PDB reference (with full sequence) from RCSB # Will not rewrite PDB if already downloaded - # Will download obsolete PBDS - ref_file_name=pdbl.retrieve_pdb_file(ref_pdb_name, file_format='pdb', obsolete=True , pdir=".") + # Will download obsolete PDBS + ref_file_name=pdbl.retrieve_pdb_file(ref_pdb_name, file_format='pdb', pdir="./tmp") if not os.path.exists(ref_file_name): - ref_file_name=pdbl.retrieve_pdb_file(ref_pdb_name, file_format='mmCif', pdir=".") + ref_file_name=pdbl.retrieve_pdb_file(ref_pdb_name, file_format='mmCif', pdir="./tmp") seqres_type="cif-seqres" if not os.path.exists(ref_file_name): raise ValueError("PDB doesn't exist in the database") diff --git a/scripts/PDB70_NERSC/parallel_PDB70.slurm b/scripts/PDB70_NERSC/parallel_PDB70.slurm index d4561a4..e476e1b 100644 --- a/scripts/PDB70_NERSC/parallel_PDB70.slurm +++ b/scripts/PDB70_NERSC/parallel_PDB70.slurm @@ -1,13 +1,13 @@ #!/bin/bash #SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=4 -#SBATCH --time=00:05:00 +#SBATCH --cpus-per-task=32 +#SBATCH --time=24:00:00 #SBATCH --constraint=cpu #SBATCH --qos=shared #SBATCH --account=m3513 module load 
python
 module load parallel
-conda activate /pscratch/sd/l/lemonboy/PDB_prep
-ls *.pdb | parallel -j 4 bash fixpdb_pr_parallel.bash -f {} -o . :::
\ No newline at end of file
+conda activate /pscratch/sd/l/lemonboy/PDB_fixer_test
+find ./ -maxdepth 1 -name '*.pdb' -type f | parallel -j 32 bash fixpdb_pr_parallel.bash -f {} -o ../result

From 5169f540301d32c79ecddc1cb34bc9042d2b795f Mon Sep 17 00:00:00 2001
From: Feng Yu
Date: Wed, 6 Mar 2024 11:55:58 -0800
Subject: [PATCH 14/15] Added README file for NERSC scripts.

---
 scripts/PDB70_NERSC/README | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 scripts/PDB70_NERSC/README

diff --git a/scripts/PDB70_NERSC/README b/scripts/PDB70_NERSC/README
new file mode 100644
index 0000000..720a617
--- /dev/null
+++ b/scripts/PDB70_NERSC/README
@@ -0,0 +1,33 @@
+# PDB Loop Fixer / SAXS Profile Script
+
+## Overview
+This script fixes missing loops in Protein Data Bank (PDB) files and generates SAXS data for them using GNU Parallel on NERSC. It calculates the SAXS P(r) curve for single-chain, single-conformation PDBs as a reference for training the metfish deep learning network.
+
+## Features
+- Corrects loop regions in PDB files.
+- Uses GNU Parallel for multi-process execution on supercomputers.
+- Generates output logs for tracking changes and errors.
+- Generates a standalone fasta file for each PDB after the loop is fixed.
+- Calculates the SAXS P(r) curve for the fixed PDBs.
+
+## Prerequisites
+These are the prerequisites tested on NERSC. Generally speaking, this script does not have strict restrictions on the Python and library versions.
+If you installed openmm, pdbfixer and metfish with conda/pip, generally speaking, you do not need to install Numpy and other dependencies independently.
+
+- Python 3.12.1
+- openmm 8.1.1
+- pdbfixer 1.9.0
+- metfish 0.0.0
+- GNU Parallel
+- Input PDB file(s) containing !!single chain!! protein structures. (in a single folder for parallel computing input)
+
+## Installation
+1. Clone or download the repository.
+(Optional but recommended) Create a new conda environment for the installation.
+2. Install the metfish package and the other prerequisites.
+3. Copy the scripts in the PDB70_NERSC folder to the target folder and edit the Slurm script to set the input and output folders and the job queue settings to the user's preferences.
+
+## Usage
+For a single PDB file, see the usage of fixpdb_pr_parallel.bash.
+
+For multiple PDB files, see parallel_PDB70.slurm.
\ No newline at end of file

From 6ab54cc8a6d7d3cf5571b52d120418485cde0d86 Mon Sep 17 00:00:00 2001
From: Feng Yu
Date: Wed, 6 Mar 2024 14:39:53 -0800
Subject: [PATCH 15/15] Added the github url for the modified pdbfixer

---
 scripts/PDB70_NERSC/README | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/PDB70_NERSC/README b/scripts/PDB70_NERSC/README
index 720a617..a494385 100644
--- a/scripts/PDB70_NERSC/README
+++ b/scripts/PDB70_NERSC/README
@@ -16,7 +16,7 @@ If you installed openmm, pdbfixer and metfish with conda/pip, generally speaking
 
 - Python 3.12.1
 - openmm 8.1.1
-- pdbfixer 1.9.0
+- pdbfixer 1.9.0 (modified) (https://github.com/smallfishabc/pdbfixer.git)
 - metfish 0.0.0
 - GNU Parallel
 - Input PDB file(s) containing !!single chain!! protein structures. (in a single folder for parallel computing input)
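
A minimal end-to-end sketch of the PDB70 workflow above, with illustrative file
and folder names, assuming the conda environment with pdbfixer, metfish
(calc-pr) and GNU Parallel is active. The output subfolders and the
complete.txt progress file are created by hand here because the corresponding
mkdir calls in fixpdb_pr_parallel.bash are commented out:

    # output layout expected by fixpdb_pr_parallel.bash
    mkdir -p result/pdb result/sequence result/saxs_r result/ref_seq
    touch result/complete.txt    # progress log the script greps and appends to

    # single PDB file (name must follow <PDBcode>_<chain>.pdb)
    bash fixpdb_pr_parallel.bash -f ./1ABC_A.pdb -o ./result

    # whole folder of PDBs on NERSC via Slurm + GNU Parallel
    # (parallel_PDB70.slurm writes its results to ../result relative to the PDB folder)
    sbatch parallel_PDB70.slurm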