From 5fc7039dfe34e646a16906208cd27b296d0ed332 Mon Sep 17 00:00:00 2001
From: VGPReys
Date: Wed, 30 Oct 2024 15:12:44 +0100
Subject: [PATCH 1/5] removing mpi.pkl files at cleaning steps

---
 src/haddock/gear/clean_steps.py | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/src/haddock/gear/clean_steps.py b/src/haddock/gear/clean_steps.py
index a73915d6f..5448c8abf 100644
--- a/src/haddock/gear/clean_steps.py
+++ b/src/haddock/gear/clean_steps.py
@@ -59,16 +59,25 @@ def clean_output(path: FilePath, ncores: int = 1) -> None:
     # `unpack_compressed_and_archived_files` so that the
     # uncompressing routines when restarting the run work.
 
+    # Files to delete (all occurrences)
+    files_to_delete_all = (
+        "mpi.pkl",
+        )
+    for extension in files_to_delete_all:
+        for file_ in glob_folder(path, extension):
+            Path(file_).unlink()
+
     # Files to delete
     # deletes all except the first one
-    files_to_delete = [
+    # (keeping one for debugging purposes)
+    files_to_delete = (
         ".inp",
         ".inp.gz",
         ".out",
         ".out.gz",
         ".job",
         ".err",
-        ]
+        )
 
     for extension in files_to_delete:
         flist = glob_folder(path, extension)
@@ -76,11 +85,11 @@ def clean_output(path: FilePath, ncores: int = 1) -> None:
             Path(file_).unlink()
 
     # files to archive (all files in single .gz)
-    files_to_archive = [
+    files_to_archive = (
         ".seed",
         ".seed.gz",
         ".con",
-        ]
+        )
 
     archive_ready = partial(_archive_and_remove_files, path=path)
     _ncores = min(ncores, len(files_to_archive))
@@ -90,13 +99,13 @@ def clean_output(path: FilePath, ncores: int = 1) -> None:
         pass
 
     # files to compress in .gz
-    files_to_compress = [
+    files_to_compress = (
         ".inp",
         ".out",
         ".pdb",
         ".psf",
         ".cnserr",
-        ]
+        )
 
     for ftc in files_to_compress:
         found = compress_files_ext(path, ftc, ncores=ncores)
@@ -111,9 +120,11 @@ def _archive_and_remove_files(fta: str, path: FilePath) -> None:
 
 
 # eventually this function can be moved to `libs.libio` in case of future need.
-def unpack_compressed_and_archived_files(folders: Iterable[FilePathT],
-                                         ncores: int = 1,
-                                         dec_all: bool = False) -> None:
+def unpack_compressed_and_archived_files(
+        folders: Iterable[FilePathT],
+        ncores: int = 1,
+        dec_all: bool = False,
+        ) -> None:
     """
     Unpack compressed and archived files in folders.
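
Patch 1, in short: `mpi.pkl` checkpoint files are now deleted unconditionally, while the CNS text files (`.inp`, `.out`, `.job`, `.err`) keep their first match for debugging. A minimal, self-contained sketch of that policy, assuming a recursive, sorted `pathlib` glob is an acceptable stand-in for haddock's `glob_folder` helper (whose matching and ordering may differ):

    from pathlib import Path

    def clean_step_folder(path: Path) -> None:
        # mpi.pkl checkpoints are removed unconditionally
        for pkl in path.glob("**/*mpi.pkl"):
            pkl.unlink()
        # CNS text files: keep the first match of each extension
        # for debugging, delete the rest
        for ext in (".inp", ".inp.gz", ".out", ".out.gz", ".job", ".err"):
            for extra in sorted(path.glob(f"**/*{ext}"))[1:]:
                extra.unlink()

Keeping a single input/output pair preserves a debuggable record of what CNS was asked to do, without the cost of storing every generated file.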
From 2c06f17e3ff33a70bbdea3e7a6f5788805370919 Mon Sep 17 00:00:00 2001 From: VGPReys Date: Wed, 30 Oct 2024 15:19:43 +0100 Subject: [PATCH 2/5] no more bioexcel survey --- src/haddock/gear/greetings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/haddock/gear/greetings.py b/src/haddock/gear/greetings.py index 3e92691c7..353456e96 100644 --- a/src/haddock/gear/greetings.py +++ b/src/haddock/gear/greetings.py @@ -31,7 +31,7 @@ feedback_urls = { "GitHub issues": "https://github.com/haddocking/haddock3/issues", "BioExcel feedback": "https://www.bonvinlab.org/feedback", - "BioExcel survey": "https://bioexcel.eu/bioexcel-survey-2024/", + # "BioExcel survey": "https://bioexcel.eu/bioexcel-survey-2024/", } From c1a429b7b9bcd018a2304411a6770f8cc7af746f Mon Sep 17 00:00:00 2001 From: VGPReys Date: Thu, 31 Oct 2024 16:00:27 +0100 Subject: [PATCH 3/5] local structures in analysis & archiving haddock3 run --- src/haddock/clis/cli.py | 20 +- src/haddock/clis/cli_analyse.py | 367 +++++++++++++++++++---------- src/haddock/core/optional.yaml | 10 +- src/haddock/gear/postprocessing.py | 46 ++++ src/haddock/libs/libplots.py | 43 +++- src/haddock/libs/libworkflow.py | 19 +- 6 files changed, 357 insertions(+), 148 deletions(-) create mode 100644 src/haddock/gear/postprocessing.py diff --git a/src/haddock/clis/cli.py b/src/haddock/clis/cli.py index a3681430c..65e4b78e7 100755 --- a/src/haddock/clis/cli.py +++ b/src/haddock/clis/cli.py @@ -75,7 +75,7 @@ def cli(ap: ArgumentParser, main: Callable[..., None]) -> None: def maincli() -> None: """Execute main client.""" - cli(ap, main) + cli(_ap(), main) def main( @@ -117,6 +117,7 @@ def main( get_initial_greeting, gen_feedback_messages, ) + from haddock.gear.postprocessing import archive_run from haddock.gear.prepare_run import setup_run from haddock.libs.libio import working_directory from haddock.libs.liblog import ( @@ -181,10 +182,7 @@ def main( restart_step = restart WorkflowManager_ = WorkflowManager - with ( - working_directory(_run_dir), - log_error_and_exit(), - ): + with (working_directory(_run_dir), log_error_and_exit()): workflow = WorkflowManager_( workflow_params=params, start=restart_step, @@ -193,10 +191,20 @@ def main( # Main loop of execution workflow.run() + + # Run post-processing steps if other_params["postprocess"]: - workflow.postprocess() + workflow.postprocess(self_contained=other_params["gen_archive"]) + # Clean outputs workflow.clean() + # Generate archive of the run + if other_params["gen_archive"]: + _run_archive, _analysis_archive = archive_run(_run_dir) + log.info(f"Run archive created: {_run_archive}!") + if _analysis_archive: + log.info(f"Run analysis archive created: {_analysis_archive}") + # Finish end = time() elapsed = convert_seconds_to_min_sec(end - start) diff --git a/src/haddock/clis/cli_analyse.py b/src/haddock/clis/cli_analyse.py index 65ca24507..f0ef7e519 100644 --- a/src/haddock/clis/cli_analyse.py +++ b/src/haddock/clis/cli_analyse.py @@ -20,6 +20,7 @@ Where, ``-m 1 3`` means that the analysis will be performed on ``1_rigidbody`` and ``3_flexref``. """ + import argparse import os import shutil @@ -66,59 +67,6 @@ INTER_STR = INTERACTIVE_RE_SUFFIX # suffix of interactive analysis folders -def get_cluster_ranking( - capri_clt_filename: FilePath, - top_cluster: int, - ) -> ClRank: - """ - Get capri cluster ranking. 
-
-    Parameters
-    ----------
-    capri_clt_filename : str or Path
-        capri cluster filename
-    top_cluster : int
-        Number of clusters to be considered
-
-    Returns
-    -------
-    cl_ranking : dict
-        {cluster_id : cluster_rank} dictionary
-    """
-    cl_ranking: ClRank = {}
-    dfcl = read_capri_table(capri_clt_filename)
-    for n in range(min(top_cluster, dfcl.shape[0])):
-        cl_ranking[dfcl["cluster_id"].iloc[n]] = dfcl["caprieval_rank"].iloc[n]
-    return cl_ranking
-
-
-def update_paths(
-    capri_ss_filename: FilePath, toch: str = "../", toadd: str = "../../"
-) -> None:
-    """
-    Update paths in capri_ss_filename.
-
-    Parameters
-    ----------
-    capri_ss_filename : str or Path
-        capri ss filename
-    toch : str
-        string to be replaced
-    toadd : str
-        string to be added
-    """
-    new_lines: list[str] = []
-    with open(capri_ss_filename, "r") as rfile:
-        for ln in rfile:
-            new_ln = ln.replace(toch, toadd)
-            new_lines.append(new_ln)
-
-    with open(capri_ss_filename, "w") as wfile:
-        for ln in new_lines:
-            wfile.write(ln)
-    return
-
-
 # Command line interface parser
 ap = argparse.ArgumentParser(
     prog="haddock3-analyse",
@@ -205,6 +153,14 @@ def update_paths(
     default=1,
 )
 
+ap.add_argument(
+    "--self-contained",
+    help="Copy the top-ranked models into the analysis folder, making it self-contained.",
+    required=False,
+    default=False,
+    action="store_true",
+)
+
 
 ap.add_argument(
     "-p",
@@ -242,6 +198,59 @@ def maincli() -> None:
     cli(_ap(), main)
 
 
+def get_cluster_ranking(
+    capri_clt_filename: FilePath,
+    top_cluster: int,
+    ) -> ClRank:
+    """
+    Get capri cluster ranking.
+
+    Parameters
+    ----------
+    capri_clt_filename : str or Path
+        capri cluster filename
+    top_cluster : int
+        Number of clusters to be considered
+
+    Returns
+    -------
+    cl_ranking : dict
+        {cluster_id : cluster_rank} dictionary
+    """
+    cl_ranking: ClRank = {}
+    dfcl = read_capri_table(capri_clt_filename)
+    for n in range(min(top_cluster, dfcl.shape[0])):
+        cl_ranking[dfcl["cluster_id"].iloc[n]] = dfcl["caprieval_rank"].iloc[n]
+    return cl_ranking
+
+
+def update_paths(
+    capri_ss_filename: FilePath, toch: str = "../", toadd: str = "../../"
+) -> None:
+    """
+    Update paths in capri_ss_filename.
+
+    Parameters
+    ----------
+    capri_ss_filename : str or Path
+        capri ss filename
+    toch : str
+        string to be replaced
+    toadd : str
+        string to be added
+    """
+    new_lines: list[str] = []
+    with open(capri_ss_filename, "r") as rfile:
+        for ln in rfile:
+            new_ln = ln.replace(toch, toadd)
+            new_lines.append(new_ln)
+
+    with open(capri_ss_filename, "w") as wfile:
+        for ln in new_lines:
+            wfile.write(ln)
+    return
+
+
 def run_capri_analysis(
     step: str,
     run_dir: FilePath,
@@ -356,67 +365,128 @@ def update_paths_in_capri_dict(
     return new_capri_dict
 
 
-def zip_top_ranked(
+def get_top_ranked_mapping(
     capri_filename: FilePath,
     cluster_ranking: ClRank,
-    summary_name: FilePath,
-    ) -> None:
-    """
-    Zip the top ranked structures.
+ clustered_topX: int = 4, + unclustered_topX: int = 10, + ) -> dict[Path, str]: + # Set mapping of generated files + top_ranked_mapping: dict[Path, str] = {} - Parameters - ---------- - cluster_ranking : dict - {cluster_id : cluster_rank} dictionary - ss_file : str or Path - capri ss filename - - Returns - ------- - output_zipfile : str or Path - path to the zipped file - """ + # Read table capri_df = read_capri_table(capri_filename, comment="#") + # Group by clusters gb_cluster = capri_df.groupby("cluster_id") + + # Loop over clusters for cl_id, cl_df in gb_cluster: + # Filter only top clusters if cl_id in cluster_ranking.keys(): + # If clustered structure if cl_id != "-": - structs = cl_df.loc[cl_df["model-cluster_ranking"] <= 4][["model", "model-cluster_ranking"]] + # Retrieve only top 4 models per cluster + structs = cl_df.loc[cl_df["model-cluster_ranking"] <= clustered_topX][["model", "model-cluster_ranking"]] # noqa : E501 + # If un-clustered structures else: - structs = cl_df.loc[cl_df["caprieval_rank"] <= 10][["model", "caprieval_rank"]] + # Retrieve top 10 + structs = cl_df.loc[cl_df["caprieval_rank"] <= unclustered_topX][["model", "caprieval_rank"]] # noqa : E501 + # Rename columns to access them using same keywords structs.columns = ["model", "rank"] # iterate over the structures for _, row in structs.iterrows(): - struct = Path(row["model"]) - struct_gz = Path(f"{struct}.gz") + # Point rank rank = row["rank"] # set target name if cl_id != "-": - target_name = f"cluster_{cluster_ranking[cl_id]}_model_{rank}.pdb" + # Give it its cluster name + target_name = ( + f"cluster_{cluster_ranking[cl_id]}" + f"_model_{rank}.pdb" + ) else: + # Give it its rank name target_name = f"model_{rank}.pdb" + + # Generate structure path + struct = Path(row["model"]) + struct_gz = Path(f"{struct}.gz") # copy the structure if Path(struct).exists(): - shutil.copy(struct, Path(target_name)) + top_ranked_mapping[struct] = target_name elif struct_gz.exists(): - shutil.copy(struct_gz, ".") - # unpack the file - _unpack_gz(Path(".", struct_gz.name)) - shutil.move(struct.name, Path(target_name)) + top_ranked_mapping[struct_gz] = target_name else: log.warning(f"structure {struct} not found") + return top_ranked_mapping - # now make the archive and delete the pdb files - archive_files_ext(".", "pdb") - for file in Path(".").glob("*.pdb"): - file.unlink() - # move archive to summary - expected_archive = Path(".", "pdb.tgz") - if expected_archive.exists(): - shutil.move("pdb.tgz", summary_name) - log.info(f"Summary archive {summary_name} created!") +def zip_top_ranked( + top_ranked_mapping: dict[Path, str], + summary_name: str, + gen_archive: bool, + ) -> Optional[Path]: + """ + Zip the top ranked structures. + + Parameters + ---------- + capri_filename : str or Path + capri ss filename + cluster_ranking : dict + {cluster_id : cluster_rank} dictionary + summary_name: str + Base name of the archive to be generated + gen_archive: bool + Should the archive be generated? + clustered_topX: int + Number of models to access per cluster. Default is 4. + unclustered_topX: int + Number of models to access when no clusters. Default is 10. + + Return + ------ + output_fname : Optional[Path] + Path to the generated output. Can be a .tgz archive or a directory. 
+ """ + for ori_fpath, new_name in top_ranked_mapping.items(): + # If already compressed + if ori_fpath.suffix == ".gz": + copied_fpath = shutil.copy(ori_fpath, ".") + # unpack the file + _unpack_gz(copied_fpath.name) + # Rename it + shutil.move(copied_fpath.name.replace(".gz", ""), new_name) + else: + shutil.copy(ori_fpath, new_name) + + # Compress pdb files + if gen_archive: + archive_was_created = archive_files_ext(".", "pdb") + # Delete the pdb files + for file_ in top_ranked_mapping.values(): + file_.unlink() + output_fname = Path(f"{summary_name}.tgz") + if archive_was_created: + # move archive to summary + shutil.move("pdb.tgz", output_fname) + log.info(f"Top structures summary archive {output_fname} created!") + return + else: + log.warning(f"Summary archive {output_fname} not created!") + return None + # Generate a directory holding all the structures else: - log.warning(f"Summary archive {summary_name} not created!") + output_fname = Path(summary_name) + output_fname.mkdir(parents=True, exist_ok=True) + for ori_fpath, new_name in top_ranked_mapping.items(): + # Create new path + next_filepath = Path(output_fname, str(new_name)) + # Hold it in mapping dict + top_ranked_mapping[ori_fpath] = str(next_filepath) + # Displace file + shutil.move(new_name, top_ranked_mapping[ori_fpath]) + log.info(f"Top structures copied into {output_fname}!") + return output_fname def analyse_step( @@ -431,6 +501,9 @@ def analyse_step( offline: bool = False, mode: str = "local", ncores: int = 4, + self_contained: bool = False, + clustered_topX: int = 4, + unclustered_topX: int = 10, ) -> None: """ Analyse a step. @@ -454,13 +527,31 @@ def analyse_step( Produce images in the selected format. scale : int scale for images. + is_cleaned: bool + is the directory going to be cleaned? + offline: bool + Should plots js functions be self-contained? + mode: str + mode of execution + ncores: int + number of cores to use + self_contained : bool + Should the analysis directory contain the models? + clustered_topX: int + Number of models to access per cluster. Default is 4. + unclustered_topX: int + Number of models to access when no clusters. Default is 10. 
""" log.info(f"Analysing step {step}") - + # Create directory target_path.mkdir(parents=True, exist_ok=False) + # Build caprieval output file names/paths + ss_filename = Path("capri_ss.tsv") + clt_filename = Path("capri_clt.tsv") step_name = step.split("_")[1] - ss_fname = Path(run_dir, f"{step}/capri_ss.tsv") - clt_fname = Path(run_dir, f"{step}/capri_clt.tsv") + ss_fname = Path(run_dir, f"{step}/{ss_filename}") + clt_fname = Path(run_dir, f"{step}/{clt_filename}") + # Search for caprieval output files if step_name != "caprieval": if ss_fname.exists() and clt_fname.exists(): log.info(f"step {step} has caprieval data, files are available") @@ -472,44 +563,62 @@ def analyse_step( log.info(f"step {step} is caprieval, files should be already available") run_capri = False - if run_capri == False: + # If caprieval data available, just copy them + if not run_capri: shutil.copy(ss_fname, target_path) shutil.copy(clt_fname, target_path) # Go to directory where to write all the analysis figures / report os.chdir(target_path) # if the step is not caprieval, caprieval must be run - if run_capri == True: + if run_capri: run_capri_analysis(step, run_dir, capri_dict, is_cleaned, mode, ncores) log.info("CAPRI files identified") # plotting - ss_file = Path("capri_ss.tsv") - clt_file = Path("capri_clt.tsv") - if clt_file.exists(): - cluster_ranking = get_cluster_ranking(clt_file, top_cluster) + if clt_filename.exists(): + cluster_ranking = get_cluster_ranking(clt_filename, top_cluster) else: - raise Exception(f"clustering file {clt_file} does not exist") - if ss_file.exists(): + raise Exception(f"clustering file {clt_filename} does not exist") + if ss_filename.exists(): + # Generate file mapping for top ranked structures + top_ranked_mapping = get_top_ranked_mapping( + ss_filename, + cluster_ranking, + clustered_topX=clustered_topX, + unclustered_topX=unclustered_topX, + ) + # provide a zipped archive of the top ranked structures + zip_top_ranked( + top_ranked_mapping, + "summary", + not self_contained, + ) log.info("Plotting results..") scatters = scatter_plot_handler( - ss_file, + ss_filename, cluster_ranking, format, scale, offline=offline, ) boxes = box_plot_handler( - ss_file, + ss_filename, cluster_ranking, format, scale, offline=offline, ) - tables = clt_table_handler(clt_file, ss_file, is_cleaned) + tables = clt_table_handler( + clt_filename, + ss_filename, + is_cleaned, + topX_clusters=top_cluster, + clustered_topX=clustered_topX, + unclustered_topX=unclustered_topX, + top_ranked_mapping=top_ranked_mapping if self_contained else None, + ) report_generator(boxes, scatters, tables, step, ".", offline) - # provide a zipped archive of the top ranked structures - zip_top_ranked(ss_file, cluster_ranking, Path("summary.tgz")) def validate_format(_format: Optional[ImgFormat]) -> Optional[ImgFormat]: @@ -570,6 +679,7 @@ def main( offline: bool = False, mode: Optional[str] = None, ncores: Optional[int] = None, + self_contained: bool = False, **kwargs: Any, ) -> None: """ @@ -579,33 +689,26 @@ def main( ---------- run_dir : str or Path Path to the original run directory. - modules : list of ints List of the integer prefix of the modules to copy. - top_cluster : int Number of clusters to be considered. - format : str Produce images in the selected format. - scale : int scale for images. - inter: bool analyse only steps labelled as 'interactive' - is_cleaned: bool is the directory going to be cleaned? - offline: bool Should plots js functions be self-contained? 
-
     mode: str
         mode of execution
-
     ncores: int
         number of cores to use
+    self_contained : bool
+        Should the analysis directory contain the models?
     """
     log.level = 20
     log.info(
@@ -646,7 +749,7 @@ def main(
     bad_folder_paths: list[Path] = []
     for step in sel_steps:
         subfolder_name = f"{step}_analysis"
-        target_path = Path(Path("./"), subfolder_name)
+        target_path = Path(subfolder_name)
 
         # check if subfolder is already present
         dest_path = Path(ANA_FOLDER, subfolder_name)
@@ -662,7 +765,6 @@ def main(
             shutil.rmtree(dest_path)
 
         # run the analysis
-        error = False
         try:
             analyse_step(
                 step,
@@ -676,14 +778,14 @@ def main(
                 offline=offline,
                 mode=mode,
                 ncores=ncores,
-                )
+                self_contained=self_contained,
+                )
         except Exception as e:
-            error = True
             log.warning(
                 f"Could not execute the analysis for step {step}. "
                 f"The following error occurred: {e}"
             )
-        if error:
             bad_folder_paths.append(target_path)
         else:
             good_folder_paths.append(target_path)
@@ -694,8 +796,23 @@ def main(
     # moving files into analysis folder
     if good_folder_paths != []:
         log.info("moving files to analysis folder")
+        urls: list[str] = []
         for directory in good_folder_paths:
             shutil.move(directory, outdir)
+            url = f"- [{Path(directory, 'report.html')}](http://0.0.0.0:8000/{Path(directory, 'report.html')}) "  # noqa : E501
+            urls.append(url)
+
+        # Adding instructions on how to set up the server
+        readme_fpath = Path(outdir, "README.md")
+        readme_fpath.write_text(
+            f"# Usage{os.linesep}{os.linesep}"
+            "To view structures or download the structure files, "
+            f"in a terminal run the command:{os.linesep}```bash{os.linesep}"
+            f"python -m http.server --directory .{os.linesep}```{os.linesep}"
+            f"And open the following links in a web browser:{os.linesep}"
+            f"{os.linesep.join(urls)}{os.linesep}"
+        )
+        assert readme_fpath.exists()
 
     if bad_folder_paths != []:
         log.info("cancelling unsuccessful analysis folders")
@@ -714,12 +831,12 @@ def main(
     log.info(f"View the results in {report_file}")
     info_msg = (
         "To view structures or download the structure files, "
-        f"in a terminal run the command "
-        f"`python -m http.server --directory {rundir_cwd}`. "
-        "By default, http server runs on `http://0.0.0.0:8000/`. "
-        f"Open the link http://0.0.0.0:8000/{report_file} "
+        f"in a terminal run the command: {os.linesep}"
+        f">python -m http.server --directory {rundir_cwd}{os.linesep}"
+        f"And open the link http://0.0.0.0:8000/{report_file} "
        "in a web browser."
-    )
+        )
     log.info(info_msg)
     os.chdir(ori_cwd)
     return
diff --git a/src/haddock/core/optional.yaml b/src/haddock/core/optional.yaml
index 3e23e7f33..a07a3d9c1 100644
--- a/src/haddock/core/optional.yaml
+++ b/src/haddock/core/optional.yaml
@@ -21,4 +21,12 @@ postprocess:
     used to plot the results of a HADDOCK3 workflow. If this option is set, this command
     is automatically executed at the end of the workflow (on the caprieval folders).
   explevel: easy
-
+gen_archive:
+  default: false
+  type: boolean
+  title: Generate an archive of the run and of the analysis.
+  short: If true, executes haddock3-analyse in self_contained mode and generates
+    archives of the run directory and of the analysis in two separate tgz files.
+  long: If true, executes haddock3-analyse in self_contained mode and generates
+    archives of the run directory and of the analysis in two separate tgz files.
+  explevel: easy
diff --git a/src/haddock/gear/postprocessing.py b/src/haddock/gear/postprocessing.py
new file mode 100644
index 000000000..349dcbb6b
--- /dev/null
+++ b/src/haddock/gear/postprocessing.py
@@ -0,0 +1,46 @@
+"""Tools for post-processing haddock3 runs."""
+
+import os
+import shutil
+import tarfile
+
+from haddock import log
+from haddock.clis.cli_analyse import ANA_FOLDER
+from haddock.core.typing import Optional
+
+
+def archive_run(run_dir: str, delete: bool = True) -> tuple[str, Optional[str]]:
+    """Create an archive of the haddock3 run directory and analysis.
+
+    Parameters
+    ----------
+    run_dir : str
+        Path to the run directory
+    delete : bool, optional
+        Should the un-archived directory be deleted? By default True.
+
+    Returns
+    -------
+    tuple[str, Optional[str]]
+        run_archive_fname : str
+            Path to the run archive
+        analysis_archive_fname : Optional[str]
+            Path to the run analysis archive
+    """
+    log.info("Creating an archive of the run")
+    # Start by archiving the run_directory
+    run_archive_fname = f"{run_dir}.tgz"
+    with tarfile.open(run_archive_fname, "w:gz") as tar:
+        tar.add(run_dir, arcname=os.path.basename(run_dir))
+
+    # Archive the analysis directory
+    analysis_archive_fname = None
+    if os.path.exists(f"{run_dir}/{ANA_FOLDER}"):
+        analysis_archive_fname = f"{run_dir}_{ANA_FOLDER}.tgz"
+        with tarfile.open(analysis_archive_fname, "w:gz") as tar:
+            tar.add(f"{run_dir}/{ANA_FOLDER}", arcname=f"{run_dir}_{ANA_FOLDER}")
+
+    if delete:
+        shutil.rmtree(run_dir)
+
+    return run_archive_fname, analysis_archive_fname
\ No newline at end of file
diff --git a/src/haddock/libs/libplots.py b/src/haddock/libs/libplots.py
index 0747b2192..b85f2bae7 100644
--- a/src/haddock/libs/libplots.py
+++ b/src/haddock/libs/libplots.py
@@ -877,7 +877,15 @@ def create_other_cluster(
     return clusters_df, structs_df
 
 
-def clt_table_handler(clt_file, ss_file, is_cleaned=False):
+def clt_table_handler(
+    clt_file: FilePath,
+    ss_file: FilePath,
+    is_cleaned: bool = False,
+    topX_clusters: int = 10,
+    clustered_topX: int = 4,
+    unclustered_topX: int = 10,
+    top_ranked_mapping: Optional[dict[Path, str]] = None,
+    ) -> pd.DataFrame:
     """
     Create a dataframe including data for tables.
 
@@ -906,23 +914,34 @@
     clusters_df = clusters_df.round(2)
     structs_df = structs_df.round(2)
 
-    # if the run will be cleaned, the structures are going to be gzipped
-    if is_cleaned:
-        # substitute the values in the df by adding .gz at the end
-        structs_df['model'] = structs_df['model'].replace(
-            to_replace=r"(\.pdb)$", value=r".pdb.gz", regex=True,
-        )
+    if not top_ranked_mapping:
+        # if the run will be cleaned, the structures are going to be gzipped
+        if is_cleaned:
+            # substitute the values in the df by adding .gz at the end
+            structs_df['model'] = structs_df['model'].replace(
+                to_replace=r"(\.pdb)$", value=r".pdb.gz", regex=True,
+            )
 
     # ss_file is in NN_caprieval/ while report is in
     # analysis/NN_caprieval_analysis/
     # need to correct model paths by prepending ../
-    structs_df['model'] = structs_df['model'].apply(lambda x: f"../{x}")
+    def correct_relative_paths(
+        path: str,
+        top_ranked_mapping: Optional[dict[Path, str]],
+    ) -> str:
+        try:
+            new_path = top_ranked_mapping[Path(path)]
+        except (KeyError, TypeError):
+            # not in the mapping, or no mapping given at all
+            new_path = f"../{path}"
+        return new_path
+    structs_df['model'] = structs_df['model'].apply(
+        lambda x: correct_relative_paths(x, top_ranked_mapping)
+    )
 
     is_unclustered = clusters_df["cluster_rank"].unique().tolist() == ["-"]
     # If unclustered, we only want to show the top 10 structures in a table.
     if is_unclustered:
-        max_unstructured_structures = 10
-        structs_df = structs_df[:max_unstructured_structures]
+        structs_df = structs_df[:unclustered_topX]
         cols2keep = ['caprieval_rank', 'model'] + list(AXIS_NAMES.keys())
         structs_df = structs_df[cols2keep]
         # model has ../../01_rigidbody/rigidbody_62.pdb.gz
@@ -933,11 +952,11 @@
     clusters_df, structs_df = create_other_cluster(
         clusters_df,
         structs_df,
-        max_clusters=11,
+        max_clusters=topX_clusters + 1,
     )
 
     clusters_df = clean_capri_table(clusters_df)
-    structs_df = find_best_struct(structs_df, max_best_structs=4)
+    structs_df = find_best_struct(structs_df, max_best_structs=clustered_topX)
     df_merged = pd.merge(clusters_df, structs_df, on="cluster_id")
     return df_merged
diff --git a/src/haddock/libs/libworkflow.py b/src/haddock/libs/libworkflow.py
index ca2c45ccb..e789b79f3 100644
--- a/src/haddock/libs/libworkflow.py
+++ b/src/haddock/libs/libworkflow.py
@@ -59,7 +59,7 @@ def clean(self, terminated: Optional[int] = None) -> None:
         for step in self.recipe.steps[:terminated]:
             step.clean()
 
-    def postprocess(self) -> None:
+    def postprocess(self, self_contained: bool = False) -> None:
         """Postprocess the workflow."""
         # is the workflow going to be cleaned?
         is_cleaned = self.recipe.steps[0].config['clean']
@@ -69,14 +69,25 @@ def postprocess(self) -> None:
         mode = self.recipe.steps[0].config['mode']
         # ncores
         ncores = self.recipe.steps[0].config['ncores']
-
+
         capri_steps: list[int] = []
         for step in self.recipe.steps:
             if step.module_name == "caprieval":
                 capri_steps.append(step.order)  # type: ignore
 
         # call cli_analyse (no need for capri_dicts, it's all precalculated)
-        cli_analyse("./", capri_steps, top_cluster=10, format=None, scale=None,
-                    inter=False, is_cleaned=is_cleaned, offline=offline, mode=mode, ncores=ncores)
+        cli_analyse(
+            "./",
+            capri_steps,
+            top_cluster=10,
+            format=None,
+            scale=None,
+            inter=False,
+            is_cleaned=is_cleaned,
+            offline=offline,
+            mode=mode,
+            ncores=ncores,
+            self_contained=self_contained,
+            )
         # call cli_traceback. If it fails, it's not a big deal
         try:
             cli_traceback("./", offline=offline)

From 3ea287d3a1ed9cf03ce5e28ca64eff9fc06a5a67 Mon Sep 17 00:00:00 2001
From: VGPReys
Date: Fri, 1 Nov 2024 16:23:12 +0100
Subject: [PATCH 4/5] update test

---
 src/haddock/clis/cli_analyse.py |  2 +-
 tests/test_cli_analyse.py       | 26 ++++++++++++++++++++------
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/src/haddock/clis/cli_analyse.py b/src/haddock/clis/cli_analyse.py
index f0ef7e519..9baee2ca5 100644
--- a/src/haddock/clis/cli_analyse.py
+++ b/src/haddock/clis/cli_analyse.py
@@ -464,7 +464,7 @@ def zip_top_ranked(
         archive_was_created = archive_files_ext(".", "pdb")
         # Delete the pdb files
         for file_ in top_ranked_mapping.values():
-            file_.unlink()
+            Path(file_).unlink()
         output_fname = Path(f"{summary_name}.tgz")
         if archive_was_created:
             # move archive to summary
diff --git a/tests/test_cli_analyse.py b/tests/test_cli_analyse.py
index bf5d4325c..224ec3d9f 100644
--- a/tests/test_cli_analyse.py
+++ b/tests/test_cli_analyse.py
@@ -8,6 +8,7 @@
 
 from haddock.clis.cli_analyse import (
     get_cluster_ranking,
+    get_top_ranked_mapping,
     main,
     update_capri_dict,
     zip_top_ranked,
@@ -51,11 +52,13 @@ def test_update_capri_dict(default_capri):
 def test_get_cluster_ranking(example_capri_clt):
     """Test get_cluster_ranking."""
     obs_cl_ranking = get_cluster_ranking(example_capri_clt, 5)
-    exp_cl_ranking = {16: 1,
-                      1: 2,
-                      13: 3,
-                      4: 4,
-                      5: 5}
+    exp_cl_ranking = {
+        16: 1,
+        1: 2,
+        13: 3,
+        4: 4,
+        5: 5,
+        }
     assert exp_cl_ranking == obs_cl_ranking
 
 
@@ -109,9 +112,20 @@ def test_zip_top_ranked(example_capri_ss, monkeypatch):
     monkeypatch.chdir(rigid_dir_analysis)
 
     exp_cl_ranking = {1: 2}
-    zip_top_ranked(example_capri_ss, exp_cl_ranking, "summary.tgz")
+    top_ranked_mapping = get_top_ranked_mapping(
+        example_capri_ss,
+        exp_cl_ranking,
+        )
+    # Archived version
+    zip_top_ranked(top_ranked_mapping, "summary", True)
     assert os.path.isfile("summary.tgz") is True
 
+    # Non-archived version
+    zip_top_ranked(top_ranked_mapping, "notarchived", False)
+    assert not os.path.isfile("notarchived.tgz")
+    assert os.path.isdir("notarchived")
+    assert len(list(Path("notarchived").glob("*.pdb"))) > 0
+
 
 def test_main_offline(example_capri_ss, example_capri_clt, tmp_path):
     """Test cli_analyse main in offline mode."""

From 5570d4414816567cde9fd83c1fc8f50b230a624c Mon Sep 17 00:00:00 2001
From: Victor Reys <132575181+VGPReys@users.noreply.github.com>
Date: Fri, 6 Dec 2024 09:06:44 +0100
Subject: [PATCH 5/5] Update src/haddock/gear/greetings.py

---
 src/haddock/gear/greetings.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/haddock/gear/greetings.py b/src/haddock/gear/greetings.py
index 9565c5bd2..e34869b93 100644
--- a/src/haddock/gear/greetings.py
+++ b/src/haddock/gear/greetings.py
@@ -30,7 +30,7 @@
 # Do not hesitate to update / comment one of these
 feedback_urls = {
     "GitHub issues": "https://github.com/haddocking/haddock3/issues",
-    "BioExcel feedback": "https://www.bonvinlab.org/feedback"
+    "BioExcel feedback": "https://www.bonvinlab.org/feedback",
     # "BioExcel survey": "https://bioexcel.eu/bioexcel-survey-2024/",
     "BioExcel forum": "https://ask.bioexcel.eu/c/haddock/6",
 }
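
Taken together, patches 3-5 let a finished run be archived either through the new `gen_archive` option in the workflow configuration or programmatically. A minimal sketch against the `archive_run` API introduced above (the run directory name `run1` is illustrative; the analysis archive is only produced when the run contains the analysis subfolder named by `ANA_FOLDER`):

    from haddock.gear.postprocessing import archive_run

    # Archive a finished run; delete=True (the default) would remove
    # the unarchived run directory after the .tgz files are written.
    run_tgz, analysis_tgz = archive_run("run1", delete=False)

    print(run_tgz)  # "run1.tgz"
    if analysis_tgz is not None:
        # produced only when "run1/<ANA_FOLDER>" exists
        print(analysis_tgz)  # "run1_<ANA_FOLDER>.tgz"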