Skip to content

Commit

Permalink
fixing index err
Browse files Browse the repository at this point in the history
  • Loading branch information
cactuskid committed Aug 30, 2024
1 parent 7a649ee commit f8e87ae
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 28 deletions.
25 changes: 8 additions & 17 deletions src/HogProf/lshbuilder.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import xml.etree.cElementTree as ET
from ete3 import Phyloxml


import traceback
from datasketch import MinHashLSHForest , WeightedMinHashGenerator
from datetime import datetime
import h5py
Expand Down Expand Up @@ -147,7 +147,6 @@ def __init__(self,h5_oma=None,fileglob = None, taxa=None,masterTree=None, saving
else:
#load machine learning weights
self.treeweights = treeweights
print(self.treeweights)
wmg = WeightedMinHashGenerator(3*len(self.taxaIndex), sample_size = numperm , seed=1)
with open( self.saving_path + 'wmg.pkl', 'wb') as wmgout:
wmgout.write( pickle.dumps(wmg))
Expand Down Expand Up @@ -259,21 +258,13 @@ def worker(self, i, q, retq, matq, l):
while True:
df = q.get()
if df is not None :
try:
df['tree'] = df[['Fam', 'ortho']].apply(self.HAM_PIPELINE, axis=1)
df[['hash','rows']] = df[['Fam', 'tree']].apply(self.HASH_PIPELINE, axis=1)
if self.fileglob:
retq.put(df[['Fam', 'hash', 'ortho']])
else:
retq.put(df[['Fam', 'hash']])
except Exception as e:
print('error in worker' + str(i))
print(e)
with open(self.errorfile, 'a') as errorfile:
errorfile.write(str(e))
errorfile.write(str(df))

#matq.put(df[['Fam', 'rows']])
df['tree'] = df[['Fam', 'ortho']].apply(self.HAM_PIPELINE, axis=1)
df[['hash','rows']] = df[['Fam', 'tree']].apply(self.HASH_PIPELINE, axis=1)
if self.fileglob:
retq.put(df[['Fam', 'hash', 'ortho']])
else:
retq.put(df[['Fam', 'hash']])

else:
if self.verbose == True:
print('Worker done' + str(i))
Expand Down
32 changes: 21 additions & 11 deletions src/HogProf/utils/hashutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@


def generate_treeweights( mastertree, taxaIndex , taxfilter, taxmask ):
#weighing function for tax level, masking levels etc. sets all weights to 1
#weighting function for tax level, masking levels etc.; sets all weights to 1 if they are in taxmask or filter
#custom weights can also be used here
"""
Generate the weights of each taxonomic level to be applied during the
construction of weighted minhashes
Expand All @@ -23,7 +24,7 @@ def generate_treeweights( mastertree, taxaIndex , taxfilter, taxmask ):
#get max of taxa index
taxmax = max(taxaIndex.values())+1
weights = np.zeros((3*taxmax,1))
print(len(taxaIndex))
print('making tree weights w n taxa = ':len(taxaIndex))
newtree = mastertree
for event in weights:
for n in newtree.traverse():
Expand Down Expand Up @@ -57,22 +58,31 @@ def hash_tree(tp , taxaIndex , treeweights , wmg , lossonly = False , duplonly =
taxaIndex_max = max(taxaIndex.values())+1
hog_matrix_weighted = np.zeros((1, 3*taxaIndex_max))
hog_matrix_binary = np.zeros((1, 3*taxaIndex_max))

if tp:
losses = [ taxaIndex[n.name] for n in tp.traverse() if n.lost and n.name in taxaIndex ]
dupl = [ taxaIndex[n.name] for n in tp.traverse() if n.dupl and n.name in taxaIndex ]
presence = [ taxaIndex[n.name] for n in tp.traverse() if n.nbr_genes > 0 and n.name in taxaIndex ]
indices = dict(zip (['presence', 'loss', 'dup'],[presence,losses,dupl] ) )
for i,event in enumerate(indices):
if len(indices[event])>0:
taxindex = np.asarray(indices[event])
hogindex = np.asarray(indices[event])+i*taxaIndex_max
hog_matrix_weighted[:,hogindex] = treeweights[hogindex , : ].ravel()
if lossonly == True and event == 'loss':
hog_matrix_weighted[:,hogindex] = 1
if duplonly == True and event == 'dup':
hog_matrix_weighted[:,hogindex] = 1
if lossonly == False and duplonly == False:
hog_matrix_binary[:,hogindex] = 1
try:
hogindex = np.asarray(indices[event])+i*taxaIndex_max

hog_matrix_weighted[:,hogindex] = treeweights[hogindex , : ].ravel()

if lossonly == True and event == 'loss':
hog_matrix_weighted[:,hogindex] = 1
if duplonly == True and event == 'dup':
hog_matrix_weighted[:,hogindex] = 1
if lossonly == False and duplonly == False:
hog_matrix_binary[:,hogindex] = 1
except:
print( 'error in hash_tree')
print( 'event', event)
print( 'indices', indices[event])
print( 'hogindex', hogindex)

else:
#throwaway vector...
hog_matrix_weighted[0,0] = 1
Expand Down

0 comments on commit f8e87ae

Please sign in to comment.