Skip to content

Commit

Permalink
Merge pull request #11 from LeeBergstrand/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
LeeBergstrand authored Jun 9, 2018
2 parents f6b9a10 + a32f4c1 commit ee70ff9
Show file tree
Hide file tree
Showing 16 changed files with 386 additions and 355 deletions.
204 changes: 105 additions & 99 deletions BackBLAST.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#!/usr/bin/env python
#!/usr/bin/env python

# ----------------------------------------------------------------------------------------
# Created by: Lee Bergstrand
# Description: A Biopython program that takes a list of query proteins and uses local BLASTp to search
# for highly similar proteins within a local blast database (usually a local db of a target
Expand All @@ -16,79 +18,82 @@
# Example: BackBLAST.py queryGeneList.faa AL123456.3.faa AUUJ00000000.faa
# ----------------------------------------------------------------------------------------
# ===========================================================================================================

# Imports & Setup:
import sys
import csv
import subprocess
import tempfile
import sys
from multiprocessing import cpu_count

from Bio import SeqIO

from Graph import Graph
from multiprocessing import cpu_count

processors = cpu_count() # Gets number of processor cores for BLAST.


# ===========================================================================================================
# Functions:

# 1: Checks if in proper number of arguments are passed gives instructions on proper use.
def argsCheck(argsCount):
if len(sys.argv) < argsCount:
print("Orthologous Gene Finder")
print("By Lee Bergstrand\n")
print("Please refer to source code for documentation\n")
print("Usage: " + sys.argv[0] + " <queryGeneList.faa> <queryBLASTDB.faa> <subjectBLASTDB.faa> \n")
print("Examples:" + sys.argv[0] + " queryGeneList.faa AL123456.3.faa AUUJ00000000.faa")
sys.exit(1) # Aborts program. (exit(1) indicates that an error occurred)
if len(sys.argv) < argsCount:
print("Orthologous Gene Finder")
print("By Lee Bergstrand\n")
print("Please refer to source code for documentation\n")
print("Usage: " + sys.argv[0] + " <queryGeneList.faa> <queryBLASTDB.faa> <subjectBLASTDB.faa> \n")
print("Examples:" + sys.argv[0] + " queryGeneList.faa AL123456.3.faa AUUJ00000000.faa")
sys.exit(1) # Aborts program. (exit(1) indicates that an error occurred)


# -------------------------------------------------------------------------------------------------
# 2: Runs BLAST, can either be sent a FASTA formatted string or a file ...
def runBLAST(query, BLASTDBFile):
# Runs BLASTp and saves the output to a string.
# BLASTp is set to output a csv which can be parsed by Pythons CSV module.
BLASTOut = subprocess.check_output(
["blastp", "-db", BLASTDBFile, "-query", query, "-evalue", "1e-25", "-num_threads", str(processors), "-outfmt",
"10 qseqid sseqid pident evalue qcovhsp bitscore"])
return BLASTOut
# Runs BLASTp and saves the output to a string.
# BLASTp is set to output a csv which can be parsed by Pythons CSV module.
BLASTOut = subprocess.check_output(
["blastp", "-db", BLASTDBFile, "-query", query, "-evalue", "1e-25", "-num_threads", str(processors), "-outfmt",
"10 qseqid sseqid pident evalue qcovhsp bitscore"])
return BLASTOut


# -------------------------------------------------------------------------------------------------
# 3: Filters HSPs by Percent Identity...
def filterBLASTCSV(BLASTOut):
minIdent = 25
minIdent = 25

BLASTCSVOut = BLASTOut.splitlines(True) # Converts raw BLAST csv output into list of csv rows.
BLASTreader = csv.reader(BLASTCSVOut) # Reads BLAST csv rows as a csv.
BLASTCSVOut = BLASTOut.splitlines(True) # Converts raw BLAST csv output into list of csv rows.
BLASTreader = csv.reader(BLASTCSVOut) # Reads BLAST csv rows as a csv.

BLASTCSVOutFiltred = [] # Note should simply delete unwanted HSPs from current list rather than making new list.
# Rather than making a new one.
for HSP in BLASTreader:
if HSP[2] >= minIdent: # Filters by minimum identity.
# Converts each HSP parameter that should be a number to a number.
HSP[2] = float(HSP[2])
HSP[3] = float(HSP[3])
HSP[4] = float(HSP[4])
HSP[5] = float(HSP[5])
BLASTCSVOutFiltred.append(HSP) # Appends to output array.
BLASTCSVOutFiltred = [] # Note should simply delete unwanted HSPs from current list rather than making new list.
# Rather than making a new one.
for HSP in BLASTreader:
if HSP[2] >= minIdent: # Filters by minimum identity.
# Converts each HSP parameter that should be a number to a number.
HSP[2] = float(HSP[2])
HSP[3] = float(HSP[3])
HSP[4] = float(HSP[4])
HSP[5] = float(HSP[5])
BLASTCSVOutFiltred.append(HSP) # Appends to output array.

return BLASTCSVOutFiltred
return BLASTCSVOutFiltred


# -------------------------------------------------------------------------------------------------
# 5: Creates a python dictionary (hash table) that contains the the FASTA for each protein in the proteome.
def createProteomeHash(ProteomeFile):
ProteomeHash = dict()
try:
handle = open(ProteomeFile, "rU")
proteome = SeqIO.parse(handle, "fasta")
for record in proteome:
ProteomeHash.update({record.id: record.format("fasta")})
handle.close()
except IOError:
print("Failed to open " + ProteomeFile)
sys.exit(1)
ProteomeHash = dict()
try:
handle = open(ProteomeFile, "rU")
proteome = SeqIO.parse(handle, "fasta")
for record in proteome:
ProteomeHash.update({record.id: record.format("fasta")})
handle.close()
except IOError:
print("Failed to open " + ProteomeFile)
sys.exit(1)

return ProteomeHash
return ProteomeHash


# ===========================================================================================================
Expand All @@ -104,11 +109,11 @@ def createProteomeHash(ProteomeFile):

# File extension checks
if not queryFile.endswith(".faa"):
print("[Warning] " + queryFile + " may not be a amino acid FASTA file!")
print("[Warning] " + queryFile + " may not be a amino acid FASTA file!")
if not queryBLASTDBFile.endswith(".faa"):
print("[Warning] " + queryBLASTDBFile + " may not be a amino acid FASTA file!")
print("[Warning] " + queryBLASTDBFile + " may not be a amino acid FASTA file!")
if not subjectBLASTDBFile.endswith(".faa"):
print("[Warning] " + subjectBLASTDBFile + " may not be a amino acid FASTA file!")
print("[Warning] " + subjectBLASTDBFile + " may not be a amino acid FASTA file!")

OutFile = subjectBLASTDBFile.rstrip(".faa") + ".csv"

Expand All @@ -119,41 +124,41 @@ def createProteomeHash(ProteomeFile):
BLASTForward = filterBLASTCSV(BLASTForward) # Filters BLAST results by percent identity.

if len(BLASTForward) == 0:
print(">> No Forward hits in subject proteome were found.")
# Writes empty file for easier data processing.
try:
writeFile = open(OutFile, "w")
writer = csv.writer(writeFile)
except IOError:
print(">> Failed to create " + outFile)
sys.exit(1)
print(">> Exiting.\n\n")
sys.exit(0) # Aborts program. (exit(0) indicates that no error occurred)
print(">> No Forward hits in subject proteome were found.")
# Writes empty file for easier data processing.
try:
writeFile = open(OutFile, "w")
writer = csv.writer(writeFile)
except IOError:
print(">> Failed to create " + OutFile)
sys.exit(1)
print(">> Exiting.\n\n")
sys.exit(0) # Aborts program. (exit(0) indicates that no error occurred)

SubjectProteomeHash = createProteomeHash(
subjectBLASTDBFile) # Creates python dictionary confining every protein in the subject Proteome.
subjectBLASTDBFile) # Creates python dictionary confining every protein in the subject Proteome.
BackBlastQueryFASTAs = []

print(">> Creating Back-Blasting Query from found subject proteins...")
# For each top Hit...

for hit in BLASTForward:
subjectProtein = hit[1]
queryProtein = hit[0]
subjectProteinFASTA = SubjectProteomeHash.get(subjectProtein) # Extracts subjectProtein from python dictionary.
subjectProteinFASTA.strip()
BackBlastQueryFASTAs.append(subjectProteinFASTA) # Adds current subject to overall protein list.
subjectProtein = hit[1]
queryProtein = hit[0]
subjectProteinFASTA = SubjectProteomeHash.get(subjectProtein) # Extracts subjectProtein from python dictionary.
subjectProteinFASTA.strip()
BackBlastQueryFASTAs.append(subjectProteinFASTA) # Adds current subject to overall protein list.

CompleteBackBlastQuery = "".join(BackBlastQueryFASTAs)

# Attempt to write a temporary FASTA file for the reverse BLAST to use.
try:
writeFile = open("tempQuery.faa", "w")
writeFile.write(CompleteBackBlastQuery)
writeFile.close()
writeFile = open("tempQuery.faa", "w")
writeFile.write(CompleteBackBlastQuery)
writeFile.close()
except IOError:
print("Failed to create tempQuery.faa")
sys.exit(1)
print("Failed to create tempQuery.faa")
sys.exit(1)

print(">> Blasting backwards from subject genome to query genome.")
# Run backwards BLAST towards query proteome.
Expand All @@ -162,47 +167,48 @@ def createProteomeHash(ProteomeFile):

print(">> Creating Graph...")
for hit in BLASTForward:
BLASTGraph.addEdge(hit[0], hit[1], hit[5])
BLASTGraph.addEdge(hit[0], hit[1], hit[5])
for hit in BLASTBackward:
BLASTGraph.addEdge(hit[0], hit[1], hit[5])
BLASTGraph.addEdge(hit[0], hit[1], hit[5])

BackBlastOutput = list(BLASTForward)

print(">> Checking if forward hit subjects have better reciprocal hits than query.")
for hit in BLASTForward:
queryProtein = BLASTGraph.getVertex(hit[0])
subjectProtein = BLASTGraph.getVertex(hit[1])

topBackHitScore = 0
# Find the top score of the best reciprocal BLAST hit.
for backHit in subjectProtein.getConnections():
backHitScore = subjectProtein.getWeight(
backHit) # The edge weight between the subject and its reciprocal BLAST hit is the BLAST score.
if backHitScore >= topBackHitScore:
topBackHitScore = backHitScore

# Check if the query is the best reciprocal BLAST hit for the subject.
deleteHit = False
if queryProtein in subjectProtein.getConnections():
BackHitToQueryScore = subjectProtein.getWeight(
queryProtein) # The edge weight between the subject and the query is the reciprocal BLAST score.
if BackHitToQueryScore < topBackHitScore:
deleteHit = True # If the query is not the best reciprocal BLAST hit simply delete it from the BackBlastOutput.
else:
deleteHit = True # If the query is not a reciprocal BLAST hit simply delete it from the BackBlastOutput.

if deleteHit:
del BackBlastOutput[BackBlastOutput.index(hit)] # Delete the forward BLAST hit from BackBlastOutput.
queryProtein = BLASTGraph.getVertex(hit[0])
subjectProtein = BLASTGraph.getVertex(hit[1])

topBackHitScore = 0
# Find the top score of the best reciprocal BLAST hit.
for backHit in subjectProtein.getConnections():
backHitScore = subjectProtein.getWeight(
backHit) # The edge weight between the subject and its reciprocal BLAST hit is the BLAST score.
if backHitScore >= topBackHitScore:
topBackHitScore = backHitScore

# Check if the query is the best reciprocal BLAST hit for the subject.
deleteHit = False
if queryProtein in subjectProtein.getConnections():
BackHitToQueryScore = subjectProtein.getWeight(
queryProtein) # The edge weight between the subject and the query is the reciprocal BLAST score.
if BackHitToQueryScore < topBackHitScore:
# If the query is not the best reciprocal BLAST hit simply delete it from the BackBlastOutput.
deleteHit = True
else:
deleteHit = True # If the query is not a reciprocal BLAST hit simply delete it from the BackBlastOutput.

if deleteHit:
del BackBlastOutput[BackBlastOutput.index(hit)] # Delete the forward BLAST hit from BackBlastOutput.

# Attempts to write reciprocal BLAST output to file.
try:
writeFile = open(OutFile, "w")
writer = csv.writer(writeFile)
print(">> Output file created.")
print(">> Writing Data...")
for row in BackBlastOutput:
writer.writerow(row)
writeFile = open(OutFile, "w")
writer = csv.writer(writeFile)
print(">> Output file created.")
print(">> Writing Data...")
for row in BackBlastOutput:
writer.writerow(row)
except IOError:
print(">> Failed to create " + outFile)
sys.exit(1)
print(">> Failed to create " + OutFile)
sys.exit(1)
print(">> Done\n")
95 changes: 49 additions & 46 deletions Graph.py
Original file line number Diff line number Diff line change
@@ -1,58 +1,61 @@
#!/usr/bin/env python
#!/usr/bin/env python

# ----------------------------------------------------------------------------------------
# From: http://interactivepython.org/courselib/static/pythonds/Graphs/graphintro.html#graphs
# Description: Lite Weight Script For Creating Graph Objects.
# Description: Light Weight Script For Creating Graph Objects.
# ----------------------------------------------------------------------------------------


class Vertex:
def __init__(self, key):
self.id = key
self.connectedTo = {}
def __init__(self, key):
self.id = key
self.connectedTo = {}

def addNeighbor(self, nbr, weight=0):
self.connectedTo[nbr] = weight
def addNeighbor(self, nbr, weight=0):
self.connectedTo[nbr] = weight

def __str__(self):
return str(self.id) + ' connectedTo: ' + str([x.id for x in self.connectedTo])
def __str__(self):
return str(self.id) + ' connectedTo: ' + str([x.id for x in self.connectedTo])

def getConnections(self):
return self.connectedTo.keys()
def getConnections(self):
return self.connectedTo.keys()

def getId(self):
return self.id
def getId(self):
return self.id

def getWeight(self, nbr):
return self.connectedTo[nbr]
def getWeight(self, nbr):
return self.connectedTo[nbr]


class Graph:
def __init__(self):
self.vertList = {}
self.numVertices = 0

def addVertex(self, key):
self.numVertices = self.numVertices + 1
newVertex = Vertex(key)
self.vertList[key] = newVertex
return newVertex

def getVertex(self, n):
if n in self.vertList:
return self.vertList[n]
else:
return None

def __contains__(self, n):
return n in self.vertList

def addEdge(self, f, t, cost=0):
if f not in self.vertList:
nv = self.addVertex(f)
if t not in self.vertList:
nv = self.addVertex(t)
self.vertList[f].addNeighbor(self.vertList[t], cost)

def getVertices(self):
return self.vertList.keys()

def __iter__(self):
return iter(self.vertList.values())
def __init__(self):
self.vertList = {}
self.numVertices = 0

def addVertex(self, key):
self.numVertices = self.numVertices + 1
newVertex = Vertex(key)
self.vertList[key] = newVertex
return newVertex

def getVertex(self, n):
if n in self.vertList:
return self.vertList[n]
else:
return None

def __contains__(self, n):
return n in self.vertList

def addEdge(self, f, t, cost=0):
if f not in self.vertList:
self.addVertex(f)
if t not in self.vertList:
self.addVertex(t)
self.vertList[f].addNeighbor(self.vertList[t], cost)

def getVertices(self):
return self.vertList.keys()

def __iter__(self):
return iter(self.vertList.values())
Loading

0 comments on commit ee70ff9

Please sign in to comment.