diff --git a/src/HogProf/lshbuilder.py b/src/HogProf/lshbuilder.py index 181ba7e..5cdb5e2 100755 --- a/src/HogProf/lshbuilder.py +++ b/src/HogProf/lshbuilder.py @@ -10,7 +10,7 @@ import xml.etree.cElementTree as ET from ete3 import Phyloxml - +import traceback from datasketch import MinHashLSHForest , WeightedMinHashGenerator from datetime import datetime import h5py @@ -147,7 +147,6 @@ def __init__(self,h5_oma=None,fileglob = None, taxa=None,masterTree=None, saving else: #load machine learning weights self.treeweights = treeweights - print(self.treeweights) wmg = WeightedMinHashGenerator(3*len(self.taxaIndex), sample_size = numperm , seed=1) with open( self.saving_path + 'wmg.pkl', 'wb') as wmgout: wmgout.write( pickle.dumps(wmg)) @@ -259,21 +258,13 @@ def worker(self, i, q, retq, matq, l): while True: df = q.get() if df is not None : - try: - df['tree'] = df[['Fam', 'ortho']].apply(self.HAM_PIPELINE, axis=1) - df[['hash','rows']] = df[['Fam', 'tree']].apply(self.HASH_PIPELINE, axis=1) - if self.fileglob: - retq.put(df[['Fam', 'hash', 'ortho']]) - else: - retq.put(df[['Fam', 'hash']]) - except Exception as e: - print('error in worker' + str(i)) - print(e) - with open(self.errorfile, 'a') as errorfile: - errorfile.write(str(e)) - errorfile.write(str(df)) - - #matq.put(df[['Fam', 'rows']]) + df['tree'] = df[['Fam', 'ortho']].apply(self.HAM_PIPELINE, axis=1) + df[['hash','rows']] = df[['Fam', 'tree']].apply(self.HASH_PIPELINE, axis=1) + if self.fileglob: + retq.put(df[['Fam', 'hash', 'ortho']]) + else: + retq.put(df[['Fam', 'hash']]) + else: if self.verbose == True: print('Worker done' + str(i)) diff --git a/src/HogProf/utils/hashutils.py b/src/HogProf/utils/hashutils.py index c061976..ab6f7a9 100755 --- a/src/HogProf/utils/hashutils.py +++ b/src/HogProf/utils/hashutils.py @@ -10,7 +10,8 @@ def generate_treeweights( mastertree, taxaIndex , taxfilter, taxmask ): - #weighing function for tax level, masking levels etc. 
sets all weights to 1 + #weighing function for tax level, masking levels etc. sets all weights to 1 if they are in taxmask or filter + #custom weights can also be used here """ Generate the weights of each taxonomic level to be applied during the constructin of weighted minhashes @@ -23,7 +24,7 @@ def generate_treeweights( mastertree, taxaIndex , taxfilter, taxmask ): #get max of taxa index taxmax = max(taxaIndex.values())+1 weights = np.zeros((3*taxmax,1)) - print(len(taxaIndex)) + print('making tree weights w n taxa = ', len(taxaIndex)) newtree = mastertree for event in weights: for n in newtree.traverse(): @@ -57,6 +58,7 @@ def hash_tree(tp , taxaIndex , treeweights , wmg , lossonly = False , duplonly = taxaIndex_max = max(taxaIndex.values())+1 hog_matrix_weighted = np.zeros((1, 3*taxaIndex_max)) hog_matrix_binary = np.zeros((1, 3*taxaIndex_max)) + if tp: losses = [ taxaIndex[n.name] for n in tp.traverse() if n.lost and n.name in taxaIndex ] dupl = [ taxaIndex[n.name] for n in tp.traverse() if n.dupl and n.name in taxaIndex ] @@ -64,15 +66,23 @@ def hash_tree(tp , taxaIndex , treeweights , wmg , lossonly = False , duplonly = indices = dict(zip (['presence', 'loss', 'dup'],[presence,losses,dupl] ) ) for i,event in enumerate(indices): if len(indices[event])>0: - taxindex = np.asarray(indices[event]) - hogindex = np.asarray(indices[event])+i*taxaIndex_max - hog_matrix_weighted[:,hogindex] = treeweights[hogindex , : ].ravel() - if lossonly == True and event == 'loss': - hog_matrix_weighted[:,hogindex] = 1 - if duplonly == True and event == 'dup': - hog_matrix_weighted[:,hogindex] = 1 - if lossonly == False and duplonly == False: - hog_matrix_binary[:,hogindex] = 1 + try: + hogindex = np.asarray(indices[event])+i*taxaIndex_max + + hog_matrix_weighted[:,hogindex] = treeweights[hogindex , : ].ravel() + + if lossonly == True and event == 'loss': + hog_matrix_weighted[:,hogindex] = 1 + if duplonly == True and event == 'dup': + hog_matrix_weighted[:,hogindex] = 1 + if 
lossonly == False and duplonly == False: + hog_matrix_binary[:,hogindex] = 1 + except Exception: + print( 'error in hash_tree') + print( 'event', event) + print( 'indices', indices[event]) + print( 'hogindex', hogindex) + else: #throwaway vector... hog_matrix_weighted[0,0] = 1