Skip to content

Commit

Permalink
fix broken OMA tree err
Browse files Browse the repository at this point in the history
  • Loading branch information
cactuskid committed Aug 13, 2024
1 parent 929bbb7 commit 5ca6437
Show file tree
Hide file tree
Showing 5 changed files with 158 additions and 35 deletions.
2 changes: 1 addition & 1 deletion src/HogProf.egg-info/PKG-INFO
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: HogProf
Version: 0.0.8
Version: 0.0.9
Summary: Phylogenetic Profiling with OMA and minhashing
Author-email: Dave Moi <[email protected]>
License: MIT License
Expand Down
92 changes: 70 additions & 22 deletions src/HogProf/lshbuilder.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ class LSHBuilder:
with a list of taxonomic codes for all the species in your db
"""

def __init__(self,h5_oma=None,fileglob = None, taxa=None,masterTree=None, saving_name=None , numperm = 256, treeweights= None , taxfilter = None, taxmask= None , lossonly = False, duplonly = False, verbose = False , use_taxcodes = False , datetime = datetime.now()):
def __init__(self,h5_oma=None,fileglob = None, taxa=None,masterTree=None, saving_name=None , numperm = 256, treeweights= None , taxfilter = None, taxmask= None , lossonly = False, duplonly = False, verbose = False , use_taxcodes = False , datetime = datetime.now() , reformat_names = False):

"""
Initializes the LSHBuilder class with the specified parameters and sets up the necessary objects.
Expand All @@ -67,11 +67,14 @@ def __init__(self,h5_oma=None,fileglob = None, taxa=None,masterTree=None, saving
self.db_obj = None
self.oma_id_obj = None

self.reformat_names = reformat_names
self.tax_filter = taxfilter
self.tax_mask = taxmask
self.verbose = verbose
self.datetime = datetime
self.use_phyloxml = False
self.fileglob = fileglob
self.idmapper = None
self.date_string = "{:%B_%d_%Y_%H_%M}".format(datetime.now())
if saving_name:
self.saving_name= saving_name
Expand All @@ -82,6 +85,7 @@ def __init__(self,h5_oma=None,fileglob = None, taxa=None,masterTree=None, saving
os.mkdir(path=self.saving_path)
else:
raise Exception( 'please specify an output location' )
self.errorfile = self.saving_path + 'errors.txt'

if masterTree is None:
if h5_oma:
Expand All @@ -102,19 +106,37 @@ def __init__(self,h5_oma=None,fileglob = None, taxa=None,masterTree=None, saving
trees = [t for t in project.get_phylogeny()]
self.tree_ete3 = [ n for n in trees[0] ][0]
print( self.tree_ete3 )

self.use_phyloxml = True
print('using phyloxml')
print( self.tree_ete3 )
self.tree_string = masterTree
else:

try:
self.tree_ete3 = ete3.Tree(masterTree, format=1 , quoted_node_names= True)
print( self.tree_ete3 )
except:
self.tree_ete3 = ete3.Tree(masterTree, format=0)

with open(masterTree) as treein:
self.tree_string = treein.read()
#self.tree_string = self.tree_ete3.write(format=0)

else:
raise Exception( 'please specify a tree' )

if self.reformat_names:
self.tree_ete3, self.idmapper = pyhamutils.tree2numerical(self.tree_ete3)
with open( self.saving_path + 'reformatted_tree.nw', 'w') as treeout:
treeout.write(self.tree_ete3.write(format=0 ))
with open( self.saving_path + 'idmapper.pkl', 'wb') as idout:
idout.write( pickle.dumps(self.idmapper))
print('reformatted tree')
print( self.tree_ete3 )
self.tree_string = self.tree_ete3.write(format=1)
#remap taxfilter and taxmask
if taxfilter:
self.tax_filter = [ self.idmapper[tax] for tax in taxfilter ]
if taxmask:
self.tax_mask = self.idmapper[taxmask]

self.swap2taxcode = use_taxcodes
self.taxaIndex, self.reverse = files_utils.generate_taxa_index(self.tree_ete3 , self.tax_filter, self.tax_mask)
with open( self.saving_path + 'taxaIndex.pkl', 'wb') as taxout:
Expand All @@ -137,14 +159,21 @@ def __init__(self,h5_oma=None,fileglob = None, taxa=None,masterTree=None, saving
print('swapping ids')
else:
print('not swapping ids')



if self.h5OMA:

self.HAM_PIPELINE = functools.partial( pyhamutils.get_ham_treemap_from_row, tree=self.tree_string , swap_ids=self.swap2taxcode )
print( 'configuring pyham functions')
print( 'swap ids', self.swap2taxcode)
print( 'reformat names', self.reformat_names)
print( 'use phyloxml', self.use_phyloxml)
print( 'use taxcodes', self.swap2taxcode)

if self.h5OMA:
self.HAM_PIPELINE = functools.partial( pyhamutils.get_ham_treemap_from_row, tree=self.tree_string , swap_ids=self.swap2taxcode , reformat_names = self.reformat_names ,
orthoXML_as_string = True , use_phyloxml = self.use_phyloxml , orthomapper = self.idmapper )
else:
self.HAM_PIPELINE = functools.partial( pyhamutils.get_ham_treemap_from_row, tree=self.tree_string , swap_ids=self.swap2taxcode , orthoXML_as_string = False )
self.HAM_PIPELINE = functools.partial( pyhamutils.get_ham_treemap_from_row, tree=self.tree_string , swap_ids=self.swap2taxcode ,
orthoXML_as_string = False , reformat_names = self.reformat_names , use_phyloxml = self.use_phyloxml , orthomapper = self.idmapper )


self.HASH_PIPELINE = functools.partial( hashutils.row2hash , taxaIndex=self.taxaIndex, treeweights=self.treeweights, wmg=wmg , lossonly = lossonly, duplonly = duplonly)
if self.h5OMA:

Expand Down Expand Up @@ -236,9 +265,18 @@ def worker(self, i, q, retq, matq, l):
while True:
df = q.get()
if df is not None :
df['tree'] = df[['Fam', 'ortho']].apply(self.HAM_PIPELINE, axis=1)
df[['hash','rows']] = df[['Fam', 'tree']].apply(self.HASH_PIPELINE, axis=1)
retq.put(df[['Fam', 'hash']])

try:
df['tree'] = df[['Fam', 'ortho']].apply(self.HAM_PIPELINE, axis=1)
df[['hash','rows']] = df[['Fam', 'tree']].apply(self.HASH_PIPELINE, axis=1)
retq.put(df[['Fam', 'hash']])
except Exception as e:
print('error in worker' + str(i))
print(e)
with open(self.errorfile, 'a') as errorfile:
errorfile.write(str(e))
errorfile.write(str(df))

#matq.put(df[['Fam', 'rows']])
else:
if self.verbose == True:
Expand Down Expand Up @@ -287,8 +325,9 @@ def saver(self, i, q, retq, matq, l ):
h5hashes[taxstr][fam, :] = hashes[fam].hashvalues.ravel()
count += 1
if t.time() - save_start > 200:
print( t.time() - global_time )
print( 'saving at :' , t.time() - global_time )
forest.index()
print( 'testing forest' )
print(forest.query( hashes[fam] , k = 10 ) )
h5flush()
save_start = t.time()
Expand Down Expand Up @@ -412,13 +451,14 @@ def main():
parser.add_argument('--OrthoGlob', help='a glob expression for orthoxml files ' , type = str)
parser.add_argument('--tarfile', help='use tarfile with orthoxml data ' , type = str)
parser.add_argument('--nperm', help='number of hash functions to use when constructing profiles' , type = int)
parser.add_argument('--mastertree', help='master taxonomic tree. should use ncbi taxonomic id numbers as leaf names' , type = str)
parser.add_argument('--mastertree', help='master taxonomic tree. nodes should correspond to orthoxml' , type = str)

parser.add_argument('--nthreads', help='nthreads for multiprocessing' , type = int)
parser.add_argument('--outfolder', help='folder for storing hash, db and tree objects' , type = str)
parser.add_argument('--lossonly', help='only compile loss events' , type = bool)
parser.add_argument('--duplonly', help='only compile duplication events' , type = bool)
parser.add_argument('--taxcodes', help='use taxid info in HOGs' , type = str)
parser.add_argument('--verbose', help='print verbose output' , type = bool)
parser.add_argument('--reformat_names', help='try to correct broken species trees by replacing all names with numbers.' , type = bool)
dbdict = {
'all': { 'taxfilter': None , 'taxmask': None },
'plants': { 'taxfilter': None , 'taxmask': 33090 },
Expand All @@ -439,10 +479,10 @@ def main():

if 'OrthoGlob' in args:
if args['OrthoGlob']:
orthoglob = glob.glob(args['OrthoGlob']+ '*')
orthoglob = glob.glob(args['OrthoGlob'])
else:
orthoglob = None

if 'outpath' in args:
dbname = args['outpath']
else:
Expand All @@ -467,6 +507,9 @@ def main():
else:
raise Exception(' please specify input data ')




if args['lossonly']:
lossonly = args['lossonly']
else:
Expand All @@ -488,7 +531,10 @@ def main():
else:
verbose = False


if args['reformat_names']:
reformat_names = True
else:
reformat_names = False

threads = 4
if args['nthreads']:
Expand All @@ -514,11 +560,13 @@ def main():
if omafile:
with open_file( omafile , mode="r") as h5_oma:
lsh_builder = LSHBuilder(h5_oma = h5_oma, fileglob=orthoglob ,saving_name=dbname , numperm = nperm ,
treeweights= weights , taxfilter = taxfilter, taxmask=taxmask , masterTree =mastertree , lossonly = lossonly , duplonly = duplonly , use_taxcodes = taxcodes , verbose=verbose)
treeweights= weights , taxfilter = taxfilter, taxmask=taxmask , masterTree =mastertree ,
lossonly = lossonly , duplonly = duplonly , use_taxcodes = taxcodes , reformat_names=reformat_names, verbose=verbose )
lsh_builder.run_pipeline(threads)
else:
lsh_builder = LSHBuilder(h5_oma = None, fileglob=orthoglob ,saving_name=dbname , numperm = nperm ,
treeweights= weights , taxfilter = taxfilter, taxmask=taxmask , masterTree =mastertree , lossonly = lossonly , duplonly = duplonly , use_taxcodes = taxcodes , verbose=verbose)
treeweights= weights , taxfilter = taxfilter, taxmask=taxmask ,
masterTree =mastertree , lossonly = lossonly , duplonly = duplonly , use_taxcodes = taxcodes , reformat_names=reformat_names, verbose=verbose)
lsh_builder.run_pipeline(threads)
print(time.time() - start)
print('DONE')
Expand Down
2 changes: 2 additions & 0 deletions src/HogProf/utils/hashutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ def hash_tree(tp , taxaIndex , treeweights , wmg , lossonly = False , duplonly =
:return weighted_hash: a weighted minhash of a HOG
"""
if not tp:
return None, None

hog_matrix_weighted = np.zeros((1, 3*len(taxaIndex)))
hog_matrix_binary = np.zeros((1, 3*len(taxaIndex)))
Expand Down
96 changes: 84 additions & 12 deletions src/HogProf/utils/pyhamutils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pyham
import xml.etree.cElementTree as ET
import ete3
import pickle
import traceback

Expand Down Expand Up @@ -34,23 +35,94 @@ def switch_name_ncbi_id(orthoxml , mapdict = None ):
orthoxml = ET.tostring(root, encoding='unicode', method='xml')
return orthoxml

def get_ham_treemap_from_row(row, tree , level = None , swap_ids = True , orthoXML_as_string = True ):
def reformat_treenames( tree , mapdict = None ):
#tree is an ete3 tree instance
#replace ( ) - / . and spaces with underscores

#iterate over all nodes
for node in tree.traverse():
if mapdict:
node.name = mapdict[node.name]
else:
node.name = node.name.replace('(', '').replace(')', '').replace('-', '_').replace('/', '_')
return tree

def reformat_names_orthoxml(orthoxml , mapdict = None ):
#replace ( ) - / . and spaces with underscores
root = ET.fromstring(orthoxml)
for child in root:
if 'species' in child.tag:
child.attrib['name'] = child.attrib['name'].replace('(', '').replace(')', '').replace('-', '_').replace('/', '_')
elif mapdict:
child.attrib['name'] = mapdict[child.attrib['name']]
orthoxml = ET.tostring(root, encoding='unicode', method='xml')
return orthoxml

def create_nodemapping(tree):
#create a mapping from node name to node
nodemapping = {}
for i,node in enumerate(tree.traverse()):
nodemapping[node.name] = str(i)

#assert number of nodes is equal to number of unique names
assert len([n for n in tree.traverse()]) == len(set(nodemapping.values()))

return nodemapping

def tree2numerical(tree):
mapper = create_nodemapping(tree)
for i,node in enumerate(tree.traverse()):
node.name = mapper[node.name]
return tree , mapper

def orthoxml2numerical(orthoxml , mapper):
root = ET.fromstring(orthoxml)
for child in root:
if 'name' in child.attrib:
child.attrib['name'] = mapper[child.attrib['name']]
orthoxml = ET.tostring(root, encoding='unicode', method='xml')
return orthoxml

def get_ham_treemap_from_row(row, tree , level = None , swap_ids = True , orthoXML_as_string = True , use_phyloxml = False , use_internal_name = True ,reformat_names= False, orthomapper = None ):
fam, orthoxml = row
format = 'newick_string'
if use_phyloxml:
format = 'phyloxml'
if orthoxml:
if swap_ids == True and orthoXML_as_string == True:
orthoxml = switch_name_ncbi_id(orthoxml)
quoted = False
elif reformat_names == True and orthoXML_as_string == True:
orthoxml = orthoxml2numerical(orthoxml , orthomapper)
quoted = False
else:
quoted = True
try:
if swap_ids == True and orthoXML_as_string == True:
orthoxml = switch_name_ncbi_id(orthoxml)
quoted = False
else:
quoted = True
ham_obj = pyham.Ham(tree, orthoxml, type_hog_file="orthoxml" , tree_format = 'newick_string' ,use_internal_name=True, orthoXML_as_string=orthoXML_as_string )

ham_obj = pyham.Ham(tree, orthoxml, type_hog_file="orthoxml" , tree_format = format , use_internal_name=use_internal_name, orthoXML_as_string=orthoXML_as_string )
tp = ham_obj.create_tree_profile(hog=ham_obj.get_list_top_level_hogs()[0])
return tp.treemap
except:
print('error' , traceback.format_exc())
return None

except Exception as e:
# Capture the exception and format the traceback
full_error_message = str(e)
if 'TypeError: species name ' in full_error_message and 'maps to an ancestral name, not a leaf' in full_error_message:
print('error' , full_error_message)
#species name from bullshit error
#TypeError: species name '3515' maps to an ancestral name, not a leaf of the taxono
species = full_error_message.split('species name ')[1].split(' ')[0].replace('\'','')
print( 'trim tree'+species)
tree = ete3.Tree(tree , format = 1)
#select all nodes with name = species
nodes = tree.search_nodes(name = species)
#get the first node
node = nodes[0]
for c in node.children:
#delete all children
c.delete()
#rerun with trimmed tree
ham_obj = pyham.Ham(tree.write(format=1), orthoxml, type_hog_file="orthoxml" , tree_format = format , use_internal_name=use_internal_name, orthoXML_as_string=orthoXML_as_string )
else:
print('error' , full_error_message)
return None

def yield_families(h5file, start_fam):
"""
Expand Down
1 change: 1 addition & 0 deletions test/reformatted_tree.nw

Large diffs are not rendered by default.

0 comments on commit 5ca6437

Please sign in to comment.