From 22cba75c47416f07befd648952493ea938ae5b01 Mon Sep 17 00:00:00 2001
From: Melissa DeLucchi <113376043+delucchi-cmu@users.noreply.github.com>
Date: Wed, 12 Jun 2024 15:07:58 -0400
Subject: [PATCH] Enable std tqdm bar, refactor for re-use. (#330)

* Enable std tqdm bar, refactor for re-use.

* Test output of progress printing.

* WTF isort.

* Document new progress bar option.
---
 docs/catalogs/arguments.rst                   |  6 ++
 src/hipscat_import/catalog/resume_plan.py     |  5 +-
 src/hipscat_import/catalog/run_import.py      | 10 +--
 src/hipscat_import/index/run_index.py         | 10 ++-
 .../margin_cache/margin_cache.py              |  3 +-
 .../margin_cache/margin_cache_resume_plan.py  |  6 +-
 src/hipscat_import/pipeline_resume_plan.py    | 88 +++++++++++++++----
 src/hipscat_import/runtime_arguments.py       |  6 +-
 src/hipscat_import/soap/resume_plan.py        |  6 +-
 src/hipscat_import/soap/run_soap.py           |  6 +-
 .../test_pipeline_resume_plan.py              | 32 +++++--
 11 files changed, 124 insertions(+), 54 deletions(-)

diff --git a/docs/catalogs/arguments.rst b/docs/catalogs/arguments.rst
index 86e378f8..b7c7e0a9 100644
--- a/docs/catalogs/arguments.rst
+++ b/docs/catalogs/arguments.rst
@@ -275,6 +275,12 @@ reporting to look like the following:
     Reducing : 100%|██████████| 10895/10895 [7:46:07<00:00,  2.57s/it]
     Finishing: 100%|██████████| 6/6 [08:03<00:00, 80.65s/it]
 
+``tqdm`` will try to make a guess about the type of output to provide: plain
+text as for a command line, or a pretty ipywidget. If it tries to use a pretty
+widget but your execution environment can't support the widget, you can 
+force the pipeline to use a simple progress bar with the ``simple_progress_bar``
+argument.
+
 For very long-running pipelines (e.g. multi-TB inputs), you can get an 
 email notification when the pipeline completes using the 
 ``completion_email_address`` argument. This will send a brief email, 
diff --git a/src/hipscat_import/catalog/resume_plan.py b/src/hipscat_import/catalog/resume_plan.py
index 30750d12..5ff33f2e 100644
--- a/src/hipscat_import/catalog/resume_plan.py
+++ b/src/hipscat_import/catalog/resume_plan.py
@@ -11,7 +11,6 @@
 from hipscat import pixel_math
 from hipscat.io import FilePointer, file_io
 from hipscat.pixel_math.healpix_pixel import HealpixPixel
-from tqdm.auto import tqdm
 
 from hipscat_import.catalog.sparse_histogram import SparseHistogram
 from hipscat_import.pipeline_resume_plan import PipelineResumePlan
@@ -44,9 +43,7 @@ def __post_init__(self):
 
     def gather_plan(self):
         """Initialize the plan."""
-        with tqdm(
-            total=5, desc=self.get_formatted_stage_name("Planning"), disable=not self.progress_bar
-        ) as step_progress:
+        with self.print_progress(total=5, stage_name="Planning") as step_progress:
             ## Make sure it's safe to use existing resume state.
             super().safe_to_resume()
             step_progress.update(1)
diff --git a/src/hipscat_import/catalog/run_import.py b/src/hipscat_import/catalog/run_import.py
index 7081670a..1e4054cc 100644
--- a/src/hipscat_import/catalog/run_import.py
+++ b/src/hipscat_import/catalog/run_import.py
@@ -11,11 +11,9 @@
 from hipscat.catalog import PartitionInfo
 from hipscat.io import paths
 from hipscat.io.parquet_metadata import write_parquet_metadata
-from tqdm.auto import tqdm
 
 import hipscat_import.catalog.map_reduce as mr
 from hipscat_import.catalog.arguments import ImportArguments
-from hipscat_import.pipeline_resume_plan import PipelineResumePlan
 
 
 def run(args, client):
@@ -47,9 +45,7 @@ def run(args, client):
             )
         args.resume_plan.wait_for_mapping(futures)
 
-    with tqdm(
-        total=2, desc=PipelineResumePlan.get_formatted_stage_name("Binning"), disable=not args.progress_bar
-    ) as step_progress:
+    with args.resume_plan.print_progress(total=2, stage_name="Binning") as step_progress:
         raw_histogram = args.resume_plan.read_histogram(args.mapping_healpix_order)
         step_progress.update(1)
         alignment_file = args.resume_plan.get_alignment_file(
@@ -115,9 +111,7 @@ def run(args, client):
             args.resume_plan.wait_for_reducing(futures)
 
     # All done - write out the metadata
-    with tqdm(
-        total=5, desc=PipelineResumePlan.get_formatted_stage_name("Finishing"), disable=not args.progress_bar
-    ) as step_progress:
+    with args.resume_plan.print_progress(total=5, stage_name="Finishing") as step_progress:
         catalog_info = args.to_catalog_info(int(raw_histogram.sum()))
         io.write_provenance_info(
             catalog_base_dir=args.catalog_path,
diff --git a/src/hipscat_import/index/run_index.py b/src/hipscat_import/index/run_index.py
index c4279623..bbf870d1 100644
--- a/src/hipscat_import/index/run_index.py
+++ b/src/hipscat_import/index/run_index.py
@@ -1,11 +1,10 @@
 """Create columnar index of hipscat table using dask for parallelization"""
 
 from hipscat.io import file_io, parquet_metadata, write_metadata
-from tqdm.auto import tqdm
 
 import hipscat_import.index.map_reduce as mr
 from hipscat_import.index.arguments import IndexArguments
-from hipscat_import.pipeline_resume_plan import PipelineResumePlan
+from hipscat_import.pipeline_resume_plan import print_progress
 
 
 def run(args, client):
@@ -17,8 +16,11 @@ def run(args, client):
     rows_written = mr.create_index(args, client)
 
     # All done - write out the metadata
-    with tqdm(
-        total=4, desc=PipelineResumePlan.get_formatted_stage_name("Finishing"), disable=not args.progress_bar
+    with print_progress(
+        total=4,
+        stage_name="Finishing",
+        use_progress_bar=args.progress_bar,
+        simple_progress_bar=args.simple_progress_bar,
     ) as step_progress:
         index_catalog_info = args.to_catalog_info(int(rows_written))
         write_metadata.write_provenance_info(
diff --git a/src/hipscat_import/margin_cache/margin_cache.py b/src/hipscat_import/margin_cache/margin_cache.py
index 217fcc56..8611393f 100644
--- a/src/hipscat_import/margin_cache/margin_cache.py
+++ b/src/hipscat_import/margin_cache/margin_cache.py
@@ -1,6 +1,5 @@
 from hipscat.catalog import PartitionInfo
 from hipscat.io import file_io, parquet_metadata, paths, write_metadata
-from tqdm.auto import tqdm
 
 import hipscat_import.margin_cache.margin_cache_map_reduce as mcmr
 from hipscat_import.margin_cache.margin_cache_resume_plan import MarginCachePlan
@@ -59,7 +58,7 @@ def generate_margin_cache(args, client):
             )
         resume_plan.wait_for_reducing(futures)
 
-    with tqdm(total=4, desc="Finishing", disable=not args.progress_bar) as step_progress:
+    with resume_plan.print_progress(total=4, stage_name="Finishing") as step_progress:
         parquet_metadata.write_parquet_metadata(
             args.catalog_path, storage_options=args.output_storage_options
         )
diff --git a/src/hipscat_import/margin_cache/margin_cache_resume_plan.py b/src/hipscat_import/margin_cache/margin_cache_resume_plan.py
index 72aec80d..9a6274df 100644
--- a/src/hipscat_import/margin_cache/margin_cache_resume_plan.py
+++ b/src/hipscat_import/margin_cache/margin_cache_resume_plan.py
@@ -9,7 +9,6 @@
 from hipscat import pixel_math
 from hipscat.io import file_io
 from hipscat.pixel_math.healpix_pixel import HealpixPixel
-from tqdm.auto import tqdm
 
 from hipscat_import.margin_cache.margin_cache_arguments import MarginCacheArguments
 from hipscat_import.pipeline_resume_plan import PipelineResumePlan
@@ -33,6 +32,7 @@ def __init__(self, args: MarginCacheArguments):
         super().__init__(
             resume=args.resume,
             progress_bar=args.progress_bar,
+            simple_progress_bar=args.simple_progress_bar,
             tmp_path=args.tmp_path,
             delete_resume_log_files=args.delete_resume_log_files,
         )
@@ -40,9 +40,7 @@ def __init__(self, args: MarginCacheArguments):
 
     def _gather_plan(self, args):
         """Initialize the plan."""
-        with tqdm(
-            total=3, desc=self.get_formatted_stage_name("Planning"), disable=not self.progress_bar
-        ) as step_progress:
+        with self.print_progress(total=3, stage_name="Planning") as step_progress:
             ## Make sure it's safe to use existing resume state.
             super().safe_to_resume()
             mapping_done = self.is_mapping_done()
diff --git a/src/hipscat_import/pipeline_resume_plan.py b/src/hipscat_import/pipeline_resume_plan.py
index 7e859e91..84a1e603 100644
--- a/src/hipscat_import/pipeline_resume_plan.py
+++ b/src/hipscat_import/pipeline_resume_plan.py
@@ -10,7 +10,8 @@
 from dask.distributed import print as dask_print
 from hipscat.io import FilePointer, file_io
 from hipscat.pixel_math.healpix_pixel import HealpixPixel
-from tqdm.auto import tqdm
+from tqdm.auto import tqdm as auto_tqdm
+from tqdm.std import tqdm as std_tqdm
 
 
 @dataclass
@@ -25,6 +26,10 @@ class PipelineResumePlan:
     progress_bar: bool = True
     """if true, a tqdm progress bar will be displayed for user
     feedback of planning progress"""
+    simple_progress_bar: bool = False
+    """if displaying a progress bar, use a text-only simple progress
+    bar instead of widget. this can be useful in some environments when running
+    in a notebook where ipywidgets cannot be used (see `progress_bar` argument)"""
     delete_resume_log_files: bool = True
     """should we delete task-level done files once each stage is complete?
     if False, we will keep all sub-histograms from the mapping stage, and all
@@ -131,13 +136,7 @@ def wait_for_futures(self, futures, stage_name, fail_fast=False):
             RuntimeError: if any future returns an error status.
         """
         some_error = False
-        formatted_stage_name = self.get_formatted_stage_name(stage_name)
-        for future in tqdm(
-            as_completed(futures),
-            desc=formatted_stage_name,
-            total=len(futures),
-            disable=(not self.progress_bar),
-        ):
+        for future in self.print_progress(as_completed(futures), stage_name=stage_name, total=len(futures)):
             if future.status == "error":
                 some_error = True
                 if fail_fast:
@@ -146,18 +145,26 @@ def wait_for_futures(self, futures, stage_name, fail_fast=False):
         if some_error:
             raise RuntimeError(f"Some {stage_name} stages failed. See logs for details.")
 
-    @staticmethod
-    def get_formatted_stage_name(stage_name) -> str:
-        """Create a stage name of consistent minimum length. Ensures that the tqdm
-        progress bars can line up nicely when multiple stages must run.
+    def print_progress(self, iterable=None, total=None, stage_name=None):
+        """Create a progress bar that will provide user with task feedback.
+
+        This is a thin wrapper around the static ``print_progress`` method that uses
+        member variables for the caller's convenience.
 
         Args:
-            stage_name (str): name of the stage (e.g. mapping, reducing)
+            iterable (iterable): Optional. provides iterations to progress updates.
+            total (int): Optional. Expected iterations.
+            stage_name (str): name of the stage (e.g. mapping, reducing). this will
+                be further formatted with ``get_formatted_stage_name``, so the caller
+                doesn't need to worry about that.
         """
-        if stage_name is None or len(stage_name) == 0:
-            stage_name = "progress"
-
-        return f"{stage_name.capitalize(): <10}"
+        return print_progress(
+            iterable=iterable,
+            total=total,
+            stage_name=stage_name,
+            use_progress_bar=self.progress_bar,
+            simple_progress_bar=self.simple_progress_bar,
+        )
 
     def check_original_input_paths(self, input_paths):
         """Validate that we're operating on the same file set as the original pipeline,
@@ -230,3 +237,50 @@ def print_task_failure(custom_message, exception):
     except Exception:  # pylint: disable=broad-exception-caught
         pass
     dask_print(exception)
+
+
+def get_formatted_stage_name(stage_name) -> str:
+    """Create a stage name of consistent minimum length. Ensures that the tqdm
+    progress bars can line up nicely when multiple stages must run.
+
+    Args:
+        stage_name (str): name of the stage (e.g. mapping, reducing)
+    """
+    if stage_name is None or len(stage_name) == 0:
+        stage_name = "progress"
+
+    return f"{stage_name.capitalize(): <10}"
+
+
+def print_progress(
+    iterable=None, total=None, stage_name=None, use_progress_bar=True, simple_progress_bar=False
+):
+    """Create a progress bar that will provide user with task feedback.
+
+    Args:
+        iterable (iterable): Optional. provides iterations to progress updates.
+        total (int): Optional. Expected iterations.
+        stage_name (str): name of the stage (e.g. mapping, reducing). this will
+            be further formatted with ``get_formatted_stage_name``, so the caller
+            doesn't need to worry about that.
+        use_progress_bar (bool): should we display any progress. typically False
+            when no stdout is expected.
+        simple_progress_bar (bool): if displaying a progress bar, use a text-only
+            simple progress bar instead of widget. this can be useful when running
+            in a particular notebook where ipywidgets cannot be used
+            (only used when ``use_progress_bar`` is True)
+    """
+    if simple_progress_bar:
+        return std_tqdm(
+            iterable,
+            desc=get_formatted_stage_name(stage_name),
+            total=total,
+            disable=not use_progress_bar,
+        )
+
+    return auto_tqdm(
+        iterable,
+        desc=get_formatted_stage_name(stage_name),
+        total=total,
+        disable=not use_progress_bar,
+    )
diff --git a/src/hipscat_import/runtime_arguments.py b/src/hipscat_import/runtime_arguments.py
index 4f4d7126..9ce5e8ad 100644
--- a/src/hipscat_import/runtime_arguments.py
+++ b/src/hipscat_import/runtime_arguments.py
@@ -32,8 +32,12 @@ class RuntimeArguments:
     the pipeline where we left off. If False, we start the import from scratch,
     overwriting any content of the output directory."""
     progress_bar: bool = True
-    """if true, a tqdm progress bar will be displayed for user
+    """if true, a progress bar will be displayed for user
     feedback of map reduce progress"""
+    simple_progress_bar: bool = False
+    """if displaying a progress bar, use a text-only simple progress
+    bar instead of widget. this can be useful in some environments when running
+    in a notebook where ipywidgets cannot be used (see `progress_bar` argument)"""
     dask_tmp: str = ""
     """directory for dask worker space. this should be local to
     the execution of the pipeline, for speed of reads and writes"""
diff --git a/src/hipscat_import/soap/resume_plan.py b/src/hipscat_import/soap/resume_plan.py
index be1a6f86..3afd8626 100644
--- a/src/hipscat_import/soap/resume_plan.py
+++ b/src/hipscat_import/soap/resume_plan.py
@@ -11,7 +11,6 @@
 from hipscat.io import file_io
 from hipscat.pixel_math.healpix_pixel import HealpixPixel
 from hipscat.pixel_tree import PixelAlignment, align_trees
-from tqdm.auto import tqdm
 
 from hipscat_import.pipeline_resume_plan import PipelineResumePlan
 from hipscat_import.soap.arguments import SoapArguments
@@ -39,6 +38,7 @@ def __init__(self, args: SoapArguments):
         super().__init__(
             resume=args.resume,
             progress_bar=args.progress_bar,
+            simple_progress_bar=args.simple_progress_bar,
             tmp_path=args.tmp_path,
             delete_resume_log_files=args.delete_resume_log_files,
         )
@@ -46,9 +46,7 @@ def __init__(self, args: SoapArguments):
 
     def gather_plan(self, args):
         """Initialize the plan."""
-        with tqdm(
-            total=3, desc=self.get_formatted_stage_name("Planning"), disable=not self.progress_bar
-        ) as step_progress:
+        with self.print_progress(total=3, stage_name="Planning") as step_progress:
             ## Make sure it's safe to use existing resume state.
             super().safe_to_resume()
             step_progress.update(1)
diff --git a/src/hipscat_import/soap/run_soap.py b/src/hipscat_import/soap/run_soap.py
index dafafae4..34d50b8a 100644
--- a/src/hipscat_import/soap/run_soap.py
+++ b/src/hipscat_import/soap/run_soap.py
@@ -5,9 +5,7 @@
 
 from hipscat.catalog.association_catalog.partition_join_info import PartitionJoinInfo
 from hipscat.io import parquet_metadata, paths, write_metadata
-from tqdm.auto import tqdm
 
-from hipscat_import.pipeline_resume_plan import PipelineResumePlan
 from hipscat_import.soap.arguments import SoapArguments
 from hipscat_import.soap.map_reduce import combine_partial_results, count_joins, reduce_joins
 from hipscat_import.soap.resume_plan import SoapPlan
@@ -50,9 +48,7 @@ def run(args, client):
         resume_plan.wait_for_reducing(futures)
 
     # All done - write out the metadata
-    with tqdm(
-        total=4, desc=PipelineResumePlan.get_formatted_stage_name("Finishing"), disable=not args.progress_bar
-    ) as step_progress:
+    with resume_plan.print_progress(total=4, stage_name="Finishing") as step_progress:
         if args.write_leaf_files:
             parquet_metadata.write_parquet_metadata(
                 args.catalog_path,
diff --git a/tests/hipscat_import/test_pipeline_resume_plan.py b/tests/hipscat_import/test_pipeline_resume_plan.py
index c5e0b77c..7334d6a2 100644
--- a/tests/hipscat_import/test_pipeline_resume_plan.py
+++ b/tests/hipscat_import/test_pipeline_resume_plan.py
@@ -5,7 +5,7 @@
 import numpy.testing as npt
 import pytest
 
-from hipscat_import.pipeline_resume_plan import PipelineResumePlan
+from hipscat_import.pipeline_resume_plan import PipelineResumePlan, get_formatted_stage_name
 
 
 def test_done_key(tmp_path):
@@ -118,6 +118,28 @@ def error_on_even(argument):
         plan.wait_for_futures(futures, "test")
 
 
+@pytest.mark.dask
+def test_wait_for_futures_progress(tmp_path, dask_client, capsys):
+    """Test that we can wait around for futures to complete.
+
+    Additionally test that relevant parts of the traceback are printed to stdout."""
+    plan = PipelineResumePlan(tmp_path=tmp_path, progress_bar=True, simple_progress_bar=True, resume=False)
+
+    def error_on_even(argument):
+        """Silly little method used to test futures that fail under predictable conditions"""
+        if argument % 2 == 0:
+            raise RuntimeError("we are at odds with evens")
+
+    ## Everything is fine if we're all odd, but use a silly name so it's
+    ## clear that the stage name is present, and well-formatted.
+    futures = [dask_client.submit(error_on_even, 1)]
+    plan.wait_for_futures(futures, "teeeest")
+
+    captured = capsys.readouterr()
+    assert "Teeeest" in captured.err
+    assert "100%" in captured.err
+
+
 @pytest.mark.dask
 def test_wait_for_futures_fail_fast(tmp_path, dask_client):
     """Test that we can wait around for futures to complete.
@@ -137,16 +159,16 @@ def error_on_even(argument):
 
 def test_formatted_stage_name():
     """Test that we make pretty stage names for presenting in progress bars"""
-    formatted = PipelineResumePlan.get_formatted_stage_name(None)
+    formatted = get_formatted_stage_name(None)
     assert formatted == "Progress  "
 
-    formatted = PipelineResumePlan.get_formatted_stage_name("")
+    formatted = get_formatted_stage_name("")
     assert formatted == "Progress  "
 
-    formatted = PipelineResumePlan.get_formatted_stage_name("stage")
+    formatted = get_formatted_stage_name("stage")
     assert formatted == "Stage     "
 
-    formatted = PipelineResumePlan.get_formatted_stage_name("very long stage name")
+    formatted = get_formatted_stage_name("very long stage name")
     assert formatted == "Very long stage name"