diff --git a/src/HogProf/lshbuilder.py b/src/HogProf/lshbuilder.py index 3faafe0..a6c9824 100755 --- a/src/HogProf/lshbuilder.py +++ b/src/HogProf/lshbuilder.py @@ -161,10 +161,10 @@ def __init__(self,h5_oma=None,fileglob = None, taxa=None,masterTree=None, saving if self.h5OMA: self.HAM_PIPELINE = functools.partial( pyhamutils.get_ham_treemap_from_row, tree=self.tree_string , swap_ids=self.swap2taxcode , reformat_names = self.reformat_names , - orthoXML_as_string = True , use_phyloxml = self.use_phyloxml , orthomapper = self.idmapper ) + orthoXML_as_string = True , use_phyloxml = self.use_phyloxml , orthomapper = self.idmapper , levels = None ) else: self.HAM_PIPELINE = functools.partial( pyhamutils.get_ham_treemap_from_row, tree=self.tree_string , swap_ids=self.swap2taxcode , - orthoXML_as_string = False , reformat_names = self.reformat_names , use_phyloxml = self.use_phyloxml , orthomapper = self.idmapper ) + orthoXML_as_string = False , reformat_names = self.reformat_names , use_phyloxml = self.use_phyloxml , orthomapper = self.idmapper , levels = None ) self.HASH_PIPELINE = functools.partial( hashutils.row2hash , taxaIndex=self.taxaIndex, treeweights=self.treeweights, wmg=wmg , lossonly = lossonly, duplonly = duplonly) if self.h5OMA: @@ -254,6 +254,10 @@ def worker(self, i, q, retq, matq, l): df = q.get() if df is not None : df['tree'] = df[['Fam', 'ortho']].apply(self.HAM_PIPELINE, axis=1) + #add a dictionary of results with subhogs { fam_sub1: { 'tree':tp , 'Fam':fam } , fam_sub2: { 'tree':tp , 'Fam':fam } , ... } + #returned_df = pd.DataFrame.from_dict(df['tree'].to_dict(), orient='index') + #merge with pandas on right e.g. df.merge( returned_df , on = 'Fam' , how = 'right' ) + df[['hash','rows']] = df[['Fam', 'tree']].apply(self.HASH_PIPELINE, axis=1) if self.fileglob: retq.put(df[['Fam', 'hash', 'ortho']]) diff --git a/src/HogProf/utils/hashutils.py b/src/HogProf/utils/hashutils.py index 38c24ba..94b405d 100755 --- a/src/HogProf/utils/hashutils.py +++ b/src/HogProf/utils/hashutils.py @@ -133,6 +133,7 @@ def row2hash(row , taxaIndex , treeweights , wmg , lossonly = False , duplonly = #convert a dataframe row to a weighted minhash fam, treemap = row.tolist() hog_matrix,weighted_hash = hash_tree(treemap , taxaIndex , treeweights , wmg , lossonly = lossonly , duplonly = duplonly) + return pd.Series([weighted_hash,hog_matrix], index=['hash','rows']) def fam2hash_hdf5(fam, hdf5, dataset = None, nsamples = 128 ): diff --git a/src/HogProf/utils/pyhamutils.py b/src/HogProf/utils/pyhamutils.py index e52521e..f0c903d 100755 --- a/src/HogProf/utils/pyhamutils.py +++ b/src/HogProf/utils/pyhamutils.py @@ -84,7 +84,7 @@ def orthoxml2numerical(orthoxml , mapper): orthoxml = ET.tostring(root, encoding='unicode', method='xml') return orthoxml -def get_ham_treemap_from_row(row, tree , level = None , swap_ids = True , orthoXML_as_string = True , use_phyloxml = False , use_internal_name = True ,reformat_names= False, orthomapper = None ): +def get_ham_treemap_from_row(row, tree , levels = None , swap_ids = True , orthoXML_as_string = True , use_phyloxml = False , use_internal_name = True ,reformat_names= False, orthomapper = None ): fam, orthoxml = row format = 'newick_string' if use_phyloxml: @@ -99,8 +99,10 @@ def get_ham_treemap_from_row(row, tree , level = None , swap_ids = True , orthoX else: quoted = True try: + # return multiple treemaps corresponding to slices at different levels ham_obj = pyham.Ham(tree, orthoxml, type_hog_file="orthoxml" , tree_format = format , use_internal_name=use_internal_name, orthoXML_as_string=orthoXML_as_string ) - tp = ham_obj.create_tree_profile(hog=ham_obj.get_list_top_level_hogs()[0]) + tp = ham_obj.create_tree_profile(hog=ham_obj.get_list_top_level_hogs()[0]) + #check for losses / events and n leaves return tp.treemap except Exception as e: # Capture the exception and format the traceback