opendp · mccalluc · Jan 16, 2025 · Jan 13, 2025 · Jan 13, 2025 · Jan 13, 2025
diff --git a/README-PYPI.md b/README-PYPI.md
@@ -10,11 +10,27 @@ Output options include:
 ## Usage
 
 ```
-usage: dp-wizard [-h] [--csv CSV_PATH] [--contrib CONTRIB] [--demo]
+usage: dp-wizard [-h] [--public_csv CSV] [--private_csv CSV] [--contrib CONTRIB] [--demo]
+
+DP Wizard makes it easier to get started with Differential Privacy.
 
 options:
   -h, --help         show this help message and exit
-  --csv CSV_PATH     Path to CSV containing private data
+  --public_csv CSV   Path to public CSV
+  --private_csv CSV  Path to private CSV
   --contrib CONTRIB  How many rows can an individual contribute?
   --demo             Use generated fake CSV for a quick demo
+
+Use "--public_csv" if you have a public data set, and are curious how
+DP can be applied: The preview visualizations will use your public data.
+
+Use "--private_csv" if you only have a private data set, and want to
+make a release from it: The preview visualizations will only use
+simulated data, and apart from the headers, the private CSV is not
+read until the release.
+
+Use "--public_csv" and "--private_csv" together if you have two CSVs
+with the same structure. Perhaps the public CSV is older and no longer
+sensitive. Preview visualizations will be made with the public data,
+but the release will be made with private data.
 ```
diff --git a/README.md b/README.md
@@ -13,13 +13,29 @@ Building on what we've learned from [DP Creator](https://github.com/opendp/dpcre
 ## Usage
 
 ```
-usage: dp-wizard [-h] [--csv CSV_PATH] [--contrib CONTRIB] [--demo]
+usage: dp-wizard [-h] [--public_csv CSV] [--private_csv CSV] [--contrib CONTRIB] [--demo]
+
+DP Wizard makes it easier to get started with Differential Privacy.
 
 options:
   -h, --help         show this help message and exit
-  --csv CSV_PATH     Path to CSV containing private data
+  --public_csv CSV   Path to public CSV
+  --private_csv CSV  Path to private CSV
   --contrib CONTRIB  How many rows can an individual contribute?
   --demo             Use generated fake CSV for a quick demo
+
+Use "--public_csv" if you have a public data set, and are curious how
+DP can be applied: The preview visualizations will use your public data.
+
+Use "--private_csv" if you only have a private data set, and want to
+make a release from it: The preview visualizations will only use
+simulated data, and apart from the headers, the private CSV is not
+read until the release.
+
+Use "--public_csv" and "--private_csv" together if you have two CSVs
+with the same structure. Perhaps the public CSV is older and no longer
+sensitive. Preview visualizations will be made with the public data,
+but the release will be made with private data.
 ```
 
 

diff --git a/dp_wizard/app/__init__.py b/dp_wizard/app/__init__.py
@@ -28,8 +28,11 @@ def ctrl_c_reminder():  # pragma: no cover
 
 def make_server_from_cli_info(cli_info: CLIInfo):
     def server(input: Inputs, output: Outputs, session: Session):  # pragma: no cover
-        cli_csv_path = cli_info.csv_path
-        csv_path = reactive.value("" if cli_csv_path is None else cli_csv_path)
+        public_csv_path = reactive.value(  # noqa: F841 # TODO
+            cli_info.public_csv_path or ""
+        )
+        private_csv_path = reactive.value(cli_info.private_csv_path or "")
+
         contributions = reactive.value(cli_info.contributions)
 
         lower_bounds = reactive.value({})
@@ -43,15 +46,17 @@ def server(input: Inputs, output: Outputs, session: Session):  # pragma: no cove
             output,
             session,
             is_demo=cli_info.is_demo,
-            csv_path=csv_path,
+            public_csv_path=public_csv_path,
+            private_csv_path=private_csv_path,
             contributions=contributions,
         )
         analysis_panel.analysis_server(
             input,
             output,
             session,
             is_demo=cli_info.is_demo,
-            csv_path=csv_path,
+            public_csv_path=public_csv_path,
+            private_csv_path=private_csv_path,
             contributions=contributions,
             lower_bounds=lower_bounds,
             upper_bounds=upper_bounds,
@@ -63,7 +68,8 @@ def server(input: Inputs, output: Outputs, session: Session):  # pragma: no cove
             input,
             output,
             session,
-            csv_path=csv_path,
+            public_csv_path=public_csv_path,
+            private_csv_path=private_csv_path,
             contributions=contributions,
             lower_bounds=lower_bounds,
             upper_bounds=upper_bounds,

diff --git a/dp_wizard/app/analysis_panel.py b/dp_wizard/app/analysis_panel.py
@@ -1,11 +1,16 @@
 from math import pow
 from typing import Iterable, Any
+from pathlib import Path
 
 from shiny import ui, reactive, render, req, Inputs, Outputs, Session
 
 from dp_wizard.app.components.inputs import log_slider
 from dp_wizard.app.components.column_module import column_ui, column_server
-from dp_wizard.utils.csv_helper import read_csv_ids_labels, read_csv_ids_names
+from dp_wizard.utils.csv_helper import (
+    read_csv_ids_labels,
+    read_csv_ids_names,
+    get_csv_row_count,
+)
 from dp_wizard.app.components.outputs import output_code_sample, demo_tooltip
 from dp_wizard.utils.code_generators import make_privacy_loss_block
 
@@ -42,24 +47,7 @@ def analysis_ui():
             ),
             ui.card(
                 ui.card_header("Simulation"),
-                ui.markdown(
-                    """
-                    This simulation will assume a normal distribution
-                    between the specified lower and upper bounds.
-                    Until you make a release, your CSV will not be
-                    read except to determine the columns.
-
-                    What is the approximate number of rows in the dataset?
-                    This number is only used for the simulation
-                    and not the final calculation.
-                    """
-                ),
-                ui.input_select(
-                    "row_count",
-                    "Estimated Rows",
-                    choices=["100", "1000", "10000"],
-                    selected="100",
-                ),
+                ui.output_ui("simulation_card_ui"),
             ),
         ),
         ui.output_ui("columns_ui"),
@@ -82,7 +70,8 @@ def analysis_server(
     input: Inputs,
     output: Outputs,
     session: Session,
-    csv_path: reactive.Value[str],
+    public_csv_path: reactive.Value[str],
+    private_csv_path: reactive.Value[str],
     contributions: reactive.Value[int],
     is_demo: bool,
     lower_bounds: reactive.Value[dict[str, float]],
@@ -124,13 +113,59 @@ def columns_checkbox_group_tooltip_ui():
             """,
         )
 
+    @render.ui
+    def simulation_card_ui():
+        if public_csv_path():
+            row_count = get_csv_row_count(Path(public_csv_path()))
+            return [
+                ui.markdown(
+                    f"""
+                    Because you've provided a public CSV,
+                    it *will be read* to generate previews.
+
+                    The confidence interval depends on the number of rows.
+                    Your public CSV has {row_count} rows,
+                    but if you believe the private CSV will be
+                    much larger or smaller, please update.
+                    """
+                ),
+                ui.input_select(
+                    "row_count",
+                    "Estimated Rows",
+                    choices=[row_count, "100", "1000", "10000"],
+                    selected=row_count,
+                ),
+            ]
+        else:
+            return [
+                ui.markdown(
+                    """
+                    This simulation will assume a normal distribution
+                    between the specified lower and upper bounds.
+                    Until you make a release, your CSV will not be
+                    read except to determine the columns.
+
+                    What is the approximate number of rows in the dataset?
+                    This number is only used for the simulation
+                    and not the final calculation.
+                    """
+                ),
+                ui.input_select(
+                    "row_count",
+                    "Estimated Rows",
+                    choices=["100", "1000", "10000"],
+                    selected="100",
+                ),
+            ]
+
     @render.ui
     def columns_ui():
         column_ids = input.columns_checkbox_group()
         column_ids_to_names = csv_ids_names_calc()
         for column_id in column_ids:
             column_server(
                 column_id,
+                public_csv_path=public_csv_path(),
                 name=column_ids_to_names[column_id],
                 contributions=contributions(),
                 epsilon=epsilon(),
@@ -146,11 +181,13 @@ def columns_ui():
 
     @reactive.calc
     def csv_ids_names_calc():
-        return read_csv_ids_names(req(csv_path()))
+        # The previous tab validated that if both public and private are given,
+        # the columns match, so it shouldn't matter which is read.
+        return read_csv_ids_names(Path(req(public_csv_path() or private_csv_path())))
 
     @reactive.calc
     def csv_ids_labels_calc():
-        return read_csv_ids_labels(req(csv_path()))
+        return read_csv_ids_labels(Path(req(public_csv_path() or private_csv_path())))
 
     @render.ui
     def epsilon_tooltip_ui():

diff --git a/dp_wizard/app/components/column_module.py b/dp_wizard/app/components/column_module.py
@@ -3,12 +3,14 @@
 from htmltools.tags import details, summary
 from shiny import ui, render, module, reactive, Inputs, Outputs, Session
 from shiny.types import SilentException
+import polars as pl
 
 from dp_wizard.utils.dp_helper import make_accuracy_histogram
 from dp_wizard.utils.shared import plot_histogram
 from dp_wizard.utils.code_generators import make_column_config_block
 from dp_wizard.app.components.outputs import output_code_sample, demo_tooltip, hide_if
 from dp_wizard.utils.dp_helper import confidence
+from dp_wizard.utils.mock_data import mock_data, ColumnDef
 
 
 default_weight = "2"
@@ -56,6 +58,7 @@ def column_server(
     input: Inputs,
     output: Outputs,
     session: Session,
+    public_csv_path: str,
     name: str,
     contributions: int,
     epsilon: float,
@@ -107,7 +110,20 @@ def accuracy_histogram():
             # This function is triggered when column is removed;
             # Exit early to avoid divide-by-zero.
             raise SilentException("weights_sum == 0")
+
+        # Mock data only depends on lower and upper bounds, so it could be cached,
+        # but I'd guess this is dominated by the DP operations,
+        # so not worth optimizing.
+        # Use the real public data when a public CSV was given; otherwise simulate.
+        if public_csv_path:
+            lf = pl.scan_csv(public_csv_path)
+        else:
+            lf = pl.LazyFrame(
+                mock_data({name: ColumnDef(lower_x, upper_x)}, row_count=row_count)
+            )
         return make_accuracy_histogram(
+            lf=lf,
+            column_name=name,
             row_count=row_count,
             lower=lower_x,
             upper=upper_x,
@@ -210,9 +226,11 @@ def data_frame():
     def histogram_preview_plot():
         accuracy, histogram = accuracy_histogram()
         s = "s" if contributions > 1 else ""
-        title = (
-            f"Simulated {name}: normal distribution, "
-            f"{contributions} contribution{s} / invidual"
+        title = ", ".join(
+            [
+                name if public_csv_path else f"Simulated {name}: normal distribution",
+                f"{contributions} contribution{s} / individual",
+            ]
         )
         return plot_histogram(
             histogram,

diff --git a/dp_wizard/app/components/outputs.py b/dp_wizard/app/components/outputs.py
@@ -22,3 +22,7 @@ def demo_tooltip(is_demo: bool, text: str):  # pragma: no cover
 def hide_if(condition: bool, el):  # pragma: no cover
     display = "none" if condition else "block"
     return ui.div(el, style=f"display: {display};")
+
+
+def info_box(content):  # pragma: no cover
+    return ui.div(content, class_="alert alert-info", role="alert")