Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow specifying different bins for visualization and computation. #190

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "dolma"
version = "1.0.12"
version = "1.0.13"
description = "Data filters"
license = { text = "Apache-2.0" }
readme = "README.md"
Expand Down
20 changes: 15 additions & 5 deletions python/dolma/cli/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,18 @@
from dolma.core.paths import glob_path


@dataclass
class BinsConfig:
    """Histogram bin settings for the analyzer.

    Separates the (fine-grained) bin count used while computing
    histograms from the (coarse) bin count used when rendering them.
    """

    # Fine-grained bin count applied during histogram computation.
    compute: int = field(default=1_000, help="Number of bins to use to compute the histograms.")
    # Coarse bin count applied when the histograms are displayed.
    visualization: int = field(default=10, help="Number of bins to use when visualizing the histograms.")


@dataclass
class AnalyzerConfig:
attributes: List[str] = field(
Expand All @@ -22,10 +34,7 @@ class AnalyzerConfig:
"If not provided, the report will be printed to stdout."
),
)
bins: int = field(
default=1_000,
help="Number of bins to use for the histograms.",
)
bins: BinsConfig = field(default=BinsConfig(), help="Configuration for the bins to use for the histograms.")
processes: int = field(
default=1,
help="Number of parallel processes to use.",
Expand Down Expand Up @@ -80,7 +89,8 @@ def run(cls, parsed_config: AnalyzerConfig):
metadata_path=work_dirs.input,
debug=parsed_config.debug,
seed=parsed_config.seed,
num_bins=parsed_config.bins,
compute_bins=parsed_config.bins.compute,
visualize_bins=parsed_config.bins.visualization,
num_processes=parsed_config.processes,
name_regex=parsed_config.regex,
show_total=parsed_config.total,
Expand Down
29 changes: 18 additions & 11 deletions python/dolma/cli/mixer.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,10 @@ class MixerConfig:
default=False,
help="If true, only print the configuration and exit without running the mixer.",
)
skip_checks: bool = field(
default=False,
help="If true, skip checks on paths (e.g. validation, globbing). Useful in case many paths are being evaluated.",
)


class MixerCli(BaseCli):
Expand Down Expand Up @@ -141,19 +145,22 @@ def run(cls, parsed_config: MixerConfig):
if "span_replacement" not in stream_config_dict and "filter" not in stream_config_dict:
raise DolmaConfigError("Either `filter` or `span_replacement` must be specified")

# perform some path validation to make sure we don't call the mixer with invalid config
total_matching_documents = 0
for document in stream_config.documents:
if not parsed_config.skip_checks:
# perform some path validation to make sure we don't call the mixer with invalid config
total_matching_documents = 0
for document in stream_config.documents:

current_matching_documents = sum(1 for _ in glob_path(document))
if current_matching_documents == 0:
# only raise a warning if no documents are found for a single path
logger.warning("No documents found for path %s", document)
total_matching_documents += current_matching_documents
current_matching_documents = sum(1 for _ in glob_path(document))
if current_matching_documents == 0:
# only raise a warning if no documents are found for a single path
logger.warning("No documents found for path %s", document)
total_matching_documents += current_matching_documents

if total_matching_documents == 0:
# but raise an error if no documents are found for all paths
raise DolmaConfigError(f"No documents found for the paths for {stream_config.name} config.")
if total_matching_documents == 0:
# but raise an error if no documents are found for all paths
raise DolmaConfigError(
f"No documents found for the paths for {stream_config.name} config."
)

# populate the stream config dict
stream_config_dict["name"] = stream_config.name
Expand Down
12 changes: 7 additions & 5 deletions python/dolma/core/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,8 @@ def create_and_run_analyzer(
report: Optional[str] = None,
debug: bool = False,
seed: int = 0,
num_bins: int = 1000,
compute_bins: int = 1_000,
visualize_bins: int = 10,
num_processes: int = 1,
name_regex: Optional[str] = None,
show_total: bool = False,
Expand All @@ -300,7 +301,8 @@ def create_and_run_analyzer(
report (Optional[str], optional): Path to the report directory. Defaults to None.
debug (bool, optional): Enable debug mode. Defaults to False.
seed (int, optional): Seed value for randomization. Defaults to 0.
num_bins (int, optional): Number of bins for analysis. Defaults to 1000.
compute_bins (int, optional): Number of bins for analysis. Defaults to 1_000.
visualize_bins (int, optional): Number of bins for visualization. Defaults to 10.
num_processes (int, optional): Number of processes to use for analysis. Defaults to 1.
name_regex (Optional[str], optional): Regular expression for filtering attribute names. Defaults to None.
show_total (bool, optional): Show total summary. Defaults to False.
Expand Down Expand Up @@ -328,8 +330,8 @@ def create_and_run_analyzer(
retries_on_error=0,
num_processes=num_processes,
)
analyzer(num_bins=num_bins, name_regex=name_regex)
analyzer(num_bins=compute_bins, name_regex=name_regex)

summaries = aggregate_summaries(summaries_path=summaries_path, num_bins=num_bins)
visualize_summaries(summaries=summaries, show_total=show_total)
summaries = aggregate_summaries(summaries_path=summaries_path, num_bins=compute_bins)
visualize_summaries(summaries=summaries, show_total=show_total, num_viz_bins=visualize_bins)
write_output(summaries=summaries, report=report)
Loading