diff --git a/README-PYPI.md b/README-PYPI.md index 975e12d..1c16f1f 100644 --- a/README-PYPI.md +++ b/README-PYPI.md @@ -10,11 +10,27 @@ Output options include: ## Usage ``` -usage: dp-wizard [-h] [--csv CSV_PATH] [--contrib CONTRIB] [--demo] +usage: dp-wizard [-h] [--public_csv CSV] [--private_csv CSV] [--contrib CONTRIB] [--demo] + +DP Wizard makes it easier to get started with Differential Privacy. options: -h, --help show this help message and exit - --csv CSV_PATH Path to CSV containing private data + --public_csv CSV Path to public CSV + --private_csv CSV Path to private CSV --contrib CONTRIB How many rows can an individual contribute? --demo Use generated fake CSV for a quick demo + +Use "--public_csv" if you have a public data set, and are curious how +DP can be applied: The preview visualizations will use your public data. + +Use "--private_csv" if you only have a private data set, and want to +make a release from it: The preview visualizations will only use +simulated data, and apart from the headers, the private CSV is not +read until the release. + +Use "--public_csv" and "--private_csv" together if you have two CSVs +with the same structure. Perhaps the public CSV is older and no longer +sensitive. Preview visualizations will be made with the public data, +but the release will be made with private data. ``` diff --git a/README.md b/README.md index 5006e4c..dcb750f 100644 --- a/README.md +++ b/README.md @@ -13,13 +13,29 @@ Building on what we've learned from [DP Creator](https://github.com/opendp/dpcre ## Usage ``` -usage: dp-wizard [-h] [--csv CSV_PATH] [--contrib CONTRIB] [--demo] +usage: dp-wizard [-h] [--public_csv CSV] [--private_csv CSV] [--contrib CONTRIB] [--demo] + +DP Wizard makes it easier to get started with Differential Privacy. 
options: -h, --help show this help message and exit - --csv CSV_PATH Path to CSV containing private data + --public_csv CSV Path to public CSV + --private_csv CSV Path to private CSV --contrib CONTRIB How many rows can an individual contribute? --demo Use generated fake CSV for a quick demo + +Use "--public_csv" if you have a public data set, and are curious how +DP can be applied: The preview visualizations will use your public data. + +Use "--private_csv" if you only have a private data set, and want to +make a release from it: The preview visualizations will only use +simulated data, and apart from the headers, the private CSV is not +read until the release. + +Use "--public_csv" and "--private_csv" together if you have two CSVs +with the same structure. Perhaps the public CSV is older and no longer +sensitive. Preview visualizations will be made with the public data, +but the release will be made with private data. ``` diff --git a/dp_wizard/app/__init__.py b/dp_wizard/app/__init__.py index 43e78ff..5310475 100644 --- a/dp_wizard/app/__init__.py +++ b/dp_wizard/app/__init__.py @@ -28,8 +28,11 @@ def ctrl_c_reminder(): # pragma: no cover def make_server_from_cli_info(cli_info: CLIInfo): def server(input: Inputs, output: Outputs, session: Session): # pragma: no cover - cli_csv_path = cli_info.csv_path - csv_path = reactive.value("" if cli_csv_path is None else cli_csv_path) + public_csv_path = reactive.value( # noqa: F841 # TODO + cli_info.public_csv_path or "" + ) + private_csv_path = reactive.value(cli_info.private_csv_path or "") + contributions = reactive.value(cli_info.contributions) lower_bounds = reactive.value({}) @@ -43,7 +46,8 @@ def server(input: Inputs, output: Outputs, session: Session): # pragma: no cove output, session, is_demo=cli_info.is_demo, - csv_path=csv_path, + public_csv_path=public_csv_path, + private_csv_path=private_csv_path, contributions=contributions, ) analysis_panel.analysis_server( @@ -51,7 +55,8 @@ def server(input: Inputs, 
output: Outputs, session: Session): # pragma: no cove output, session, is_demo=cli_info.is_demo, - csv_path=csv_path, + public_csv_path=public_csv_path, + private_csv_path=private_csv_path, contributions=contributions, lower_bounds=lower_bounds, upper_bounds=upper_bounds, @@ -63,7 +68,8 @@ def server(input: Inputs, output: Outputs, session: Session): # pragma: no cove input, output, session, - csv_path=csv_path, + public_csv_path=public_csv_path, + private_csv_path=private_csv_path, contributions=contributions, lower_bounds=lower_bounds, upper_bounds=upper_bounds, diff --git a/dp_wizard/app/analysis_panel.py b/dp_wizard/app/analysis_panel.py index 3e7da3a..85aff06 100644 --- a/dp_wizard/app/analysis_panel.py +++ b/dp_wizard/app/analysis_panel.py @@ -1,11 +1,16 @@ from math import pow from typing import Iterable, Any +from pathlib import Path from shiny import ui, reactive, render, req, Inputs, Outputs, Session from dp_wizard.app.components.inputs import log_slider from dp_wizard.app.components.column_module import column_ui, column_server -from dp_wizard.utils.csv_helper import read_csv_ids_labels, read_csv_ids_names +from dp_wizard.utils.csv_helper import ( + read_csv_ids_labels, + read_csv_ids_names, + get_csv_row_count, +) from dp_wizard.app.components.outputs import output_code_sample, demo_tooltip from dp_wizard.utils.code_generators import make_privacy_loss_block @@ -42,24 +47,7 @@ def analysis_ui(): ), ui.card( ui.card_header("Simulation"), - ui.markdown( - """ - This simulation will assume a normal distribution - between the specified lower and upper bounds. - Until you make a release, your CSV will not be - read except to determine the columns. - - What is the approximate number of rows in the dataset? - This number is only used for the simulation - and not the final calculation. 
- """ - ), - ui.input_select( - "row_count", - "Estimated Rows", - choices=["100", "1000", "10000"], - selected="100", - ), + ui.output_ui("simulation_card_ui"), ), ), ui.output_ui("columns_ui"), @@ -82,7 +70,8 @@ def analysis_server( input: Inputs, output: Outputs, session: Session, - csv_path: reactive.Value[str], + public_csv_path: reactive.Value[str], + private_csv_path: reactive.Value[str], contributions: reactive.Value[int], is_demo: bool, lower_bounds: reactive.Value[dict[str, float]], @@ -124,6 +113,51 @@ def columns_checkbox_group_tooltip_ui(): """, ) + @render.ui + def simulation_card_ui(): + if public_csv_path(): + row_count = get_csv_row_count(Path(public_csv_path())) + return [ + ui.markdown( + f""" + Because you've provided a public CSV, + it *will be read* to generate previews. + + The confidence interval depends on the number of rows. + Your public CSV has {row_count} rows, + but if you believe the private CSV will be + much larger or smaller, please update. + """ + ), + ui.input_select( + "row_count", + "Estimated Rows", + choices=[row_count, "100", "1000", "10000"], + selected=row_count, + ), + ] + else: + return [ + ui.markdown( + """ + This simulation will assume a normal distribution + between the specified lower and upper bounds. + Until you make a release, your CSV will not be + read except to determine the columns. + + What is the approximate number of rows in the dataset? + This number is only used for the simulation + and not the final calculation. 
+ """ + ), + ui.input_select( + "row_count", + "Estimated Rows", + choices=["100", "1000", "10000"], + selected="100", + ), + ] + @render.ui def columns_ui(): column_ids = input.columns_checkbox_group() @@ -131,6 +165,7 @@ def columns_ui(): for column_id in column_ids: column_server( column_id, + public_csv_path=public_csv_path(), name=column_ids_to_names[column_id], contributions=contributions(), epsilon=epsilon(), @@ -146,11 +181,13 @@ def columns_ui(): @reactive.calc def csv_ids_names_calc(): - return read_csv_ids_names(req(csv_path())) + # The previous tab validated that if both public and private are given, + # the columns match, so it shouldn't matter which is read. + return read_csv_ids_names(Path(req(public_csv_path() or private_csv_path()))) @reactive.calc def csv_ids_labels_calc(): - return read_csv_ids_labels(req(csv_path())) + return read_csv_ids_labels(Path(req(public_csv_path() or private_csv_path()))) @render.ui def epsilon_tooltip_ui(): diff --git a/dp_wizard/app/components/column_module.py b/dp_wizard/app/components/column_module.py index 938edc8..b99d33b 100644 --- a/dp_wizard/app/components/column_module.py +++ b/dp_wizard/app/components/column_module.py @@ -3,12 +3,14 @@ from htmltools.tags import details, summary from shiny import ui, render, module, reactive, Inputs, Outputs, Session from shiny.types import SilentException +import polars as pl from dp_wizard.utils.dp_helper import make_accuracy_histogram from dp_wizard.utils.shared import plot_histogram from dp_wizard.utils.code_generators import make_column_config_block from dp_wizard.app.components.outputs import output_code_sample, demo_tooltip, hide_if from dp_wizard.utils.dp_helper import confidence +from dp_wizard.utils.mock_data import mock_data, ColumnDef default_weight = "2" @@ -56,6 +58,7 @@ def column_server( input: Inputs, output: Outputs, session: Session, + public_csv_path: str, name: str, contributions: int, epsilon: float, @@ -107,7 +110,20 @@ def accuracy_histogram(): # This 
function is triggered when column is removed; # Exit early to avoid divide-by-zero. raise SilentException("weights_sum == 0") + + # Mock data only depends on lower and upper bounds, so it could be cached, + # but I'd guess this is dominated by the DP operations, + # so not worth optimizing. + # Use the real public data if we have it; otherwise fall back to mock data. + if public_csv_path: + lf = pl.scan_csv(public_csv_path) + else: + lf = pl.LazyFrame( + mock_data({name: ColumnDef(lower_x, upper_x)}, row_count=row_count) + ) return make_accuracy_histogram( + lf=lf, + column_name=name, row_count=row_count, lower=lower_x, upper=upper_x, @@ -210,9 +226,11 @@ def data_frame(): def histogram_preview_plot(): accuracy, histogram = accuracy_histogram() s = "s" if contributions > 1 else "" - title = ( - f"Simulated {name}: normal distribution, " - f"{contributions} contribution{s} / invidual" + title = ", ".join( + [ + name if public_csv_path else f"Simulated {name}: normal distribution", + f"{contributions} contribution{s} / invidual", + ] ) return plot_histogram( histogram, diff --git a/dp_wizard/app/components/outputs.py b/dp_wizard/app/components/outputs.py index cb0f4b3..bf26475 100644 --- a/dp_wizard/app/components/outputs.py +++ b/dp_wizard/app/components/outputs.py @@ -22,3 +22,7 @@ def demo_tooltip(is_demo: bool, text: str): # pragma: no cover def hide_if(condition: bool, el): # pragma: no cover display = "none" if condition else "block" return ui.div(el, style=f"display: {display};") + + +def info_box(content): # pragma: no cover + return ui.div(content, class_="alert alert-info", role="alert") diff --git a/dp_wizard/app/dataset_panel.py b/dp_wizard/app/dataset_panel.py index cdfc9ae..0e8c0cf 100644 --- a/dp_wizard/app/dataset_panel.py +++ b/dp_wizard/app/dataset_panel.py @@ -1,38 +1,80 @@ from pathlib import Path +from typing import Optional from shiny import ui, reactive, render, Inputs, Outputs, Session -from dp_wizard.utils.argparse_helpers import get_cli_info -from 
dp_wizard.app.components.outputs import output_code_sample, demo_tooltip +from dp_wizard.utils.argparse_helpers import ( + get_cli_info, + PUBLIC_TEXT, + PRIVATE_TEXT, + PUBLIC_PRIVATE_TEXT, +) +from dp_wizard.utils.csv_helper import get_csv_names_mismatch +from dp_wizard.app.components.outputs import ( + output_code_sample, + demo_tooltip, + hide_if, + info_box, +) from dp_wizard.utils.code_generators import make_privacy_unit_block def dataset_ui(): cli_info = get_cli_info() - csv_placeholder = "" if cli_info.csv_path is None else Path(cli_info.csv_path).name + public_csv_placeholder = ( + "" if cli_info.public_csv_path is None else Path(cli_info.public_csv_path).name + ) + private_csv_placeholder = ( + "" + if cli_info.private_csv_path is None + else Path(cli_info.private_csv_path).name + ) return ui.nav_panel( "Select Dataset", - # Doesn't seem to be possible to preset the actual value, - # but the placeholder string is a good substitute. - ui.input_file( - "csv_path", - ["Choose CSV file", ui.output_ui("choose_csv_demo_tooltip_ui")], - accept=[".csv"], - placeholder=csv_placeholder, - ), - ui.markdown( - "How many rows of the CSV can one individual contribute to? " - 'This is the "unit of privacy" which will be protected.' + ui.card( + ui.card_header("Input CSVs"), + ui.markdown( + f""" +Choose **Public CSV** {PUBLIC_TEXT} + +Choose **Private CSV** {PRIVATE_TEXT} + +Choose both **Public CSV** and **Private CSV** {PUBLIC_PRIVATE_TEXT}""" + ), + ui.row( + # Doesn't seem to be possible to preset the actual value, + # but the placeholder string is a good substitute. 
+ ui.input_file( + "public_csv_path", + ["Choose Public CSV", ui.output_ui("choose_csv_demo_tooltip_ui")], + accept=[".csv"], + placeholder=public_csv_placeholder, + ), + ui.input_file( + "private_csv_path", + "Choose Private CSV", + accept=[".csv"], + placeholder=private_csv_placeholder, + ), + ), + ui.output_ui("csv_column_match_ui"), ), - ui.input_numeric( - "contributions", - ["Contributions", ui.output_ui("contributions_demo_tooltip_ui")], - cli_info.contributions, - min=1, + ui.card( + ui.card_header("Unit of privacy"), + ui.markdown( + "How many rows of the CSV can one individual contribute to? " + 'This is the "unit of privacy" which will be protected.' + ), + ui.input_numeric( + "contributions", + ["Contributions", ui.output_ui("contributions_demo_tooltip_ui")], + cli_info.contributions, + min=1, + ), + ui.output_ui("python_tooltip_ui"), + output_code_sample("Unit of Privacy", "unit_of_privacy_python"), ), - ui.output_ui("python_tooltip_ui"), - output_code_sample("Unit of Privacy", "unit_of_privacy_python"), ui.output_ui("define_analysis_button_ui"), value="dataset_panel", ) @@ -42,14 +84,49 @@ def dataset_server( input: Inputs, output: Outputs, session: Session, - csv_path: reactive.Value[str], + public_csv_path: reactive.Value[str], + private_csv_path: reactive.Value[str], contributions: reactive.Value[int], is_demo: bool, ): # pragma: no cover @reactive.effect - @reactive.event(input.csv_path) - def _on_csv_path_change(): - csv_path.set(input.csv_path()[0]["datapath"]) + @reactive.event(input.public_csv_path) + def _on_public_csv_path_change(): + public_csv_path.set(input.public_csv_path()[0]["datapath"]) + + @reactive.effect + @reactive.event(input.private_csv_path) + def _on_private_csv_path_change(): + private_csv_path.set(input.private_csv_path()[0]["datapath"]) + + @reactive.calc + def csv_column_mismatch_calc() -> Optional[tuple[set, set]]: + public = public_csv_path() + private = private_csv_path() + if public and private: + just_public, 
just_private = get_csv_names_mismatch( + Path(public), Path(private) + ) + if just_public or just_private: + return just_public, just_private + + @render.ui + def csv_column_match_ui(): + mismatch = csv_column_mismatch_calc() + messages = [] + if mismatch: + just_public, just_private = mismatch + if just_public: + messages.append( + "- Only the public CSV contains: " + + ", ".join(f"`{name}`" for name in just_public) + ) + if just_private: + messages.append( + "- Only the private CSV contains: " + + ", ".join(f"`{name}`" for name in just_private) + ) + return hide_if(not messages, info_box(ui.markdown("\n".join(messages)))) @reactive.effect @reactive.event(input.contributions) @@ -58,10 +135,16 @@ def _on_contributions_change(): @reactive.calc def button_enabled(): - contributions_is_set = input.contributions() is not None + public_csv_path_is_set = ( + input.public_csv_path() is not None and len(input.public_csv_path()) > 0 + ) + private_csv_path_is_set = ( + input.private_csv_path() is not None and len(input.private_csv_path()) > 0 + ) csv_path_is_set = ( - input.csv_path() is not None and len(input.csv_path()) > 0 - ) or is_demo + public_csv_path_is_set or private_csv_path_is_set or is_demo + ) and not csv_column_mismatch_calc() + contributions_is_set = input.contributions() is not None return contributions_is_set and csv_path_is_set @render.ui diff --git a/dp_wizard/app/results_panel.py b/dp_wizard/app/results_panel.py index 992f6ec..eca63ce 100644 --- a/dp_wizard/app/results_panel.py +++ b/dp_wizard/app/results_panel.py @@ -54,7 +54,8 @@ def results_server( input: Inputs, output: Outputs, session: Session, - csv_path: reactive.Value[str], + public_csv_path: reactive.Value[str], + private_csv_path: reactive.Value[str], contributions: reactive.Value[int], lower_bounds: reactive.Value[dict[str, float]], upper_bounds: reactive.Value[dict[str, float]], @@ -77,7 +78,8 @@ def analysis_plan() -> AnalysisPlan: for col in weights().keys() } return AnalysisPlan( - 
csv_path=csv_path(), + # Prefer private CSV, if available: + csv_path=private_csv_path() or public_csv_path(), contributions=contributions(), epsilon=epsilon(), columns=columns, diff --git a/dp_wizard/utils/argparse_helpers.py b/dp_wizard/utils/argparse_helpers.py index 149b05b..5da58d8 100644 --- a/dp_wizard/utils/argparse_helpers.py +++ b/dp_wizard/utils/argparse_helpers.py @@ -1,28 +1,58 @@ from sys import argv from pathlib import Path -from argparse import ArgumentParser, ArgumentTypeError +import argparse import csv import random -from warnings import warn from typing import NamedTuple, Optional def _existing_csv_type(arg: str) -> Path: path = Path(arg) if not path.exists(): - raise ArgumentTypeError(f"No such file: {arg}") + raise argparse.ArgumentTypeError(f"No such file: {arg}") if path.suffix != ".csv": - raise ArgumentTypeError(f'Must have ".csv" extension: {arg}') + raise argparse.ArgumentTypeError(f'Must have ".csv" extension: {arg}') return path +PUBLIC_TEXT = """if you have a public data set, and are curious how +DP can be applied: The preview visualizations will use your public data.""" +PRIVATE_TEXT = """if you only have a private data set, and want to +make a release from it: The preview visualizations will only use +simulated data, and apart from the headers, the private CSV is not +read until the release.""" +PUBLIC_PRIVATE_TEXT = """if you have two CSVs +with the same structure. Perhaps the public CSV is older and no longer +sensitive. 
Preview visualizations will be made with the public data, +but the release will be made with private data.""" + + def _get_arg_parser(): - parser = ArgumentParser(description=__doc__) + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description="DP Wizard makes it easier to get started with " + "Differential Privacy.", + epilog=f""" +Use "--public_csv" {PUBLIC_TEXT} + +Use "--private_csv" {PRIVATE_TEXT} + +Use "--public_csv" and "--private_csv" together {PUBLIC_PRIVATE_TEXT} +""", + ) parser.add_argument( - "--csv", - dest="csv_path", + "--public_csv", + dest="public_csv_path", + metavar="CSV", type=_existing_csv_type, - help="Path to CSV containing private data", + help="Path to public CSV", + ) + parser.add_argument( + "--private_csv", + dest="private_csv_path", + metavar="CSV", + type=_existing_csv_type, + help="Path to private CSV", ) parser.add_argument( "--contrib", @@ -41,17 +71,30 @@ def _get_arg_parser(): def _get_args(): """ >>> _get_args() - Namespace(csv_path=None, contributions=1, demo=False) + Namespace(public_csv_path=None, private_csv_path=None, contributions=1, demo=False) """ arg_parser = _get_arg_parser() if "pytest" in argv[0] or ("shiny" in argv[0] and "run" == argv[1]): # We are running a test, # and ARGV is polluted, so override: - return arg_parser.parse_args([]) + args = arg_parser.parse_args([]) else: # Normal parsing: - return arg_parser.parse_args() # pragma: no cover + args = arg_parser.parse_args() # pragma: no cover + + if args.demo: # pragma: no cover + other_args = {arg for arg in dir(args) if not arg.startswith("_")} - { + "demo", + "contributions", + } + set_args = [k for k in other_args if getattr(args, k) is not None] + if set_args: + arg_parser.error( + "When --demo is set, other arguments should be skipped: " + + ", ".join(set_args) + ) + return args def _clip(n: float, lower: float, upper: float) -> float: @@ -67,30 +110,15 @@ def _clip(n: float, lower: float, upper: float) -> 
float: class CLIInfo(NamedTuple): - csv_path: Optional[str] + public_csv_path: Optional[str] + private_csv_path: Optional[str] contributions: int is_demo: bool -def _get_demo_csv_contrib() -> CLIInfo: - """ - >>> csv_path, contributions, is_demo = _get_demo_csv_contrib() - >>> with open(csv_path, newline="") as csv_handle: - ... reader = csv.DictReader(csv_handle) - ... reader.fieldnames - ... rows = list(reader) - ... rows[0] - ... rows[-1] - ['student_id', 'class_year', 'hw_number', 'grade'] - {'student_id': '1', 'class_year': '2', 'hw_number': '1', 'grade': '73'} - {'student_id': '100', 'class_year': '1', 'hw_number': '10', 'grade': '78'} - """ +def _make_fake_data(path: Path, contributions): random.seed(0) # So the mock data will be stable across runs. - - csv_path = Path(__file__).parent.parent / "tmp" / "demo.csv" - contributions = 10 - - with csv_path.open("w", newline="") as demo_handle: + with path.open("w", newline="") as demo_handle: fields = ["student_id", "class_year", "hw_number", "grade"] writer = csv.DictWriter(demo_handle, fieldnames=fields) writer.writeheader() @@ -109,15 +137,39 @@ def _get_demo_csv_contrib() -> CLIInfo: } ) - return CLIInfo(csv_path=str(csv_path), contributions=contributions, is_demo=True) + +def _get_demo_cli_info() -> CLIInfo: + """ + >>> cli_info = _get_demo_cli_info() + >>> with open(cli_info.private_csv_path, newline="") as csv_handle: + ... reader = csv.DictReader(csv_handle) + ... reader.fieldnames + ... rows = list(reader) + ... rows[0] + ... 
rows[-1] + ['student_id', 'class_year', 'hw_number', 'grade'] + {'student_id': '1', 'class_year': '2', 'hw_number': '1', 'grade': '73'} + {'student_id': '100', 'class_year': '1', 'hw_number': '10', 'grade': '78'} + """ + private_csv_path = Path(__file__).parent.parent / "tmp" / "demo.csv" + contributions = 10 + _make_fake_data(private_csv_path, contributions) + + return CLIInfo( + public_csv_path=None, + private_csv_path=str(private_csv_path), + contributions=contributions, + is_demo=True, + ) -def get_cli_info(): # pragma: no cover +def get_cli_info() -> CLIInfo: # pragma: no cover args = _get_args() if args.demo: - if args.csv_path is not None: - warn('"--demo" overrides "--csv" and "--contrib"') - return _get_demo_csv_contrib() + return _get_demo_cli_info() return CLIInfo( - csv_path=args.csv_path, contributions=args.contributions, is_demo=False + public_csv_path=args.public_csv_path, + private_csv_path=args.private_csv_path, + contributions=args.contributions, + is_demo=False, ) diff --git a/dp_wizard/utils/converters.py b/dp_wizard/utils/converters.py index d53d08b..ddfd43c 100644 --- a/dp_wizard/utils/converters.py +++ b/dp_wizard/utils/converters.py @@ -14,6 +14,10 @@ def convert_py_to_nb(python_str: str, execute: bool = False): temp_dir_path = Path(temp_dir) py_path = temp_dir_path / "input.py" py_path.write_text(python_str) + + # DEBUG: NOTE(review): leftover debugging aid that writes python_str to a fixed /tmp path; remove before merging. + Path("/tmp/script.py").write_text(python_str) + argv = ( [ "jupytext", diff --git a/dp_wizard/utils/csv_helper.py b/dp_wizard/utils/csv_helper.py index edc822a..2055062 100644 --- a/dp_wizard/utils/csv_helper.py +++ b/dp_wizard/utils/csv_helper.py @@ -8,9 +8,10 @@ import re import polars as pl +from pathlib import Path -def read_csv_names(csv_path: str): +def read_csv_names(csv_path: Path): # Polars is overkill, but it is more robust against # variations in encoding than Python stdlib csv. 
# However, it could be slow: @@ -21,14 +22,27 @@ def read_csv_names(csv_path: str): return lf.collect_schema().names() -def read_csv_ids_labels(csv_path: str): +def get_csv_names_mismatch(public_csv_path: Path, private_csv_path: Path): + public_names = set(read_csv_names(public_csv_path)) + private_names = set(read_csv_names(private_csv_path)) + extra_public = public_names - private_names + extra_private = private_names - public_names + return (extra_public, extra_private) + + +def get_csv_row_count(csv_path: Path): + lf = pl.scan_csv(csv_path) + return lf.select(pl.len()).collect().item() + + +def read_csv_ids_labels(csv_path: Path): return { name_to_id(name): f"{i+1}: {name or '[blank]'}" for i, name in enumerate(read_csv_names(csv_path)) } -def read_csv_ids_names(csv_path: str): +def read_csv_ids_names(csv_path: Path): return {name_to_id(name): name for name in read_csv_names(csv_path)} diff --git a/dp_wizard/utils/dp_helper.py b/dp_wizard/utils/dp_helper.py index 90a3786..99a0cc3 100644 --- a/dp_wizard/utils/dp_helper.py +++ b/dp_wizard/utils/dp_helper.py @@ -1,7 +1,6 @@ import polars as pl import opendp.prelude as dp -from dp_wizard.utils.mock_data import mock_data, ColumnDef from dp_wizard.utils.shared import make_cut_points dp.enable_features("contrib") @@ -11,6 +10,8 @@ def make_accuracy_histogram( + lf: pl.LazyFrame, + column_name: str, row_count: int, lower: float, upper: float, @@ -19,8 +20,16 @@ def make_accuracy_histogram( weighted_epsilon: float, ) -> tuple[float, pl.DataFrame]: """ - Creates fake data between lower and upper, and then returns a DP histogram from it. + Given a LazyFrame and a column name, calculate a DP histogram. + + >>> from dp_wizard.utils.mock_data import mock_data, ColumnDef + >>> lower, upper = 0, 10 + >>> row_count = 100 + >>> column_name = "value" + >>> df = mock_data({column_name: ColumnDef(lower, upper)}, row_count=row_count) >>> accuracy, histogram = make_accuracy_histogram( + ... lf=pl.LazyFrame(df), + ... 
column_name=column_name, ... row_count=100, ... lower=0, upper=10, ... bin_count=5, @@ -43,20 +52,16 @@ def make_accuracy_histogram( │ (8, 10] ┆ ... │ └─────────┴─────┘ """ - # Mock data only depends on lower and upper bounds, so it could be cached, - # but I'd guess this is dominated by the DP operations, - # so not worth optimizing. - df = mock_data({"value": ColumnDef(lower, upper)}, row_count=row_count) - - # TODO: When this is stable, merge it to templates, so we can be + # TODO: https://github.com/opendp/dp-wizard/issues/219 + # When this is stable, merge it to templates, so we can be # sure that we're using the same code in the preview that we # use in the generated notebook. cut_points = make_cut_points(lower, upper, bin_count) context = dp.Context.compositor( - data=pl.LazyFrame(df).with_columns( + data=lf.with_columns( # The cut() method returns a Polars categorical type. # Cast to string to get the human-readable label. - pl.col("value") + pl.col(column_name) .cut(cut_points) .alias("bin") .cast(pl.String), diff --git a/tests/fixtures/default_app.py b/tests/fixtures/default_app.py index 4ed8b49..a881c1d 100644 --- a/tests/fixtures/default_app.py +++ b/tests/fixtures/default_app.py @@ -1,6 +1,5 @@ from shiny import App - from dp_wizard.app import app_ui, make_server_from_cli_info from dp_wizard.utils.argparse_helpers import CLIInfo @@ -8,8 +7,9 @@ app_ui, make_server_from_cli_info( CLIInfo( - csv_path=None, - contributions=None, + public_csv_path=None, + private_csv_path=None, + contributions=1, is_demo=False, ) ), diff --git a/tests/fixtures/demo_app.py b/tests/fixtures/demo_app.py index e1ef64a..747fd89 100644 --- a/tests/fixtures/demo_app.py +++ b/tests/fixtures/demo_app.py @@ -8,8 +8,9 @@ app_ui, make_server_from_cli_info( CLIInfo( - csv_path=None, - contributions=None, + public_csv_path=None, + private_csv_path=None, + contributions=1, is_demo=True, ) ), diff --git a/tests/test_app.py b/tests/test_app.py index 283072c..e53c811 100644 --- 
a/tests/test_app.py +++ b/tests/test_app.py @@ -18,7 +18,6 @@ default_app = create_app_fixture(Path(__file__).parent / "fixtures/default_app.py") tooltip = "#choose_csv_demo_tooltip_ui svg" for_the_demo = "For the demo, we'll imagine" -simulation = "This simulation will assume a normal distribution" # TODO: Why is incomplete coverage reported here? @@ -30,6 +29,10 @@ def test_demo_app(page: Page, demo_app: ShinyAppProc): # pragma: no cover page.locator(tooltip).hover() expect(page.get_by_text(for_the_demo)).to_be_visible() + # -- Define analysis -- + page.get_by_role("button", name="Define analysis").click() + expect(page.get_by_text("This simulation will assume")).to_be_visible() + def test_default_app(page: Page, default_app: ShinyAppProc): # pragma: no cover pick_dataset_text = "How many rows of the CSV" @@ -63,7 +66,7 @@ def expect_no_error(): # Now upload: csv_path = Path(__file__).parent / "fixtures" / "fake.csv" - page.get_by_label("Choose CSV file").set_input_files(csv_path.resolve()) + page.get_by_label("Choose Public CSV").set_input_files(csv_path.resolve()) expect_no_error() # -- Define analysis -- @@ -89,6 +92,8 @@ def expect_no_error(): expect_visible("Epsilon: 0.316") page.locator(".irs-bar").click() expect_visible("Epsilon: 0.158") + # Simulation + expect_visible("Because you've provided a public CSV") # Button disabled until column selected: download_results_button = page.get_by_role("button", name="Download results") @@ -100,7 +105,6 @@ def expect_no_error(): # Set column details: page.get_by_label("grade").check() - expect_visible(simulation) expect_not_visible("Weight") # Check that default is set correctly: assert page.get_by_label("Upper").input_value() == "10" @@ -109,14 +113,12 @@ def expect_no_error(): page.get_by_label("Upper").fill(new_value) # Uncheck the column: page.get_by_label("grade").uncheck() - expect_visible(simulation) # Recheck the column: page.get_by_label("grade").check() - expect_visible(simulation) assert 
page.get_by_label("Upper").input_value() == new_value expect_visible("The 95% confidence interval is ±794") page.get_by_text("Data Table").click() - expect_visible("(0, 2]") + expect_visible(f"({new_value}, inf]") # Because values are well above the bins. # Add a second column: # page.get_by_label("blank").check() diff --git a/tests/utils/test_argparse_helpers.py b/tests/utils/test_argparse_helpers.py index bff0c13..7877a78 100644 --- a/tests/utils/test_argparse_helpers.py +++ b/tests/utils/test_argparse_helpers.py @@ -1,5 +1,6 @@ from pathlib import Path from argparse import ArgumentTypeError +import re import pytest @@ -9,26 +10,50 @@ fixtures_path = Path(__file__).parent.parent / "fixtures" +def extract_block(md): + ''' + >>> fake_md = """ + ... header + ... ``` + ... block + ... ``` + ... footer + ... """ + >>> extract_block(fake_md) + 'block' + + >>> extract_block('sorry') + Traceback (most recent call last): + ... + Exception: no match for block + ''' + match = re.search(r"```\n(.*?)\n```", md, flags=re.DOTALL) + if match: + return match.group(1) + raise Exception("no match for block") + + def test_help(): help = ( - _get_arg_parser() - .format_help() + re.sub( + r"\]\s+\[", + "] [", # line wrapping of params varies. + _get_arg_parser().format_help(), + ) # argparse doesn't actually know the name of the script # and inserts the name of the running program instead. 
- .replace("__main__.py", "dp-wizard") - .replace("pytest", "dp-wizard") + .replace("__main__.py", "dp-wizard").replace("pytest", "dp-wizard") # Text is different under Python 3.9: .replace("optional arguments:", "options:") - ) - print(help) + ).strip() root_path = Path(__file__).parent.parent.parent readme_md = (root_path / "README.md").read_text() - assert help in readme_md + assert help == extract_block(readme_md) readme_pypi_md = (root_path / "README-PYPI.md").read_text() - assert help in readme_pypi_md + assert help == extract_block(readme_pypi_md) def test_arg_validation_no_file(): @@ -38,9 +63,9 @@ def test_arg_validation_no_file(): def test_arg_validation_not_csv(): with pytest.raises(ArgumentTypeError, match='Must have ".csv" extension:'): - _existing_csv_type(fixtures_path / "fake.ipynb") + _existing_csv_type(str(fixtures_path / "fake.ipynb")) def test_arg_validation_works(): - path = _existing_csv_type(fixtures_path / "fake.csv") + path = _existing_csv_type(str(fixtures_path / "fake.csv")) assert path.name == "fake.csv" diff --git a/tests/utils/test_csv_helper.py b/tests/utils/test_csv_helper.py index 76ac569..727c37b 100644 --- a/tests/utils/test_csv_helper.py +++ b/tests/utils/test_csv_helper.py @@ -4,7 +4,32 @@ import tempfile import pytest -from dp_wizard.utils.csv_helper import read_csv_ids_labels, read_csv_ids_names +from pathlib import Path + +from dp_wizard.utils.csv_helper import ( + read_csv_ids_labels, + read_csv_ids_names, + get_csv_names_mismatch, + get_csv_row_count, +) + + +def test_get_csv_names_mismatch(): + with tempfile.TemporaryDirectory() as tmp: + a_path = Path(tmp) / "a.csv" + a_path.write_text("a,b,c") + b_path = Path(tmp) / "b.csv" + b_path.write_text("b,c,d") + just_a, just_b = get_csv_names_mismatch(a_path, b_path) + assert just_a == {"a"} + assert just_b == {"d"} + + +def test_get_csv_row_count(): + with tempfile.TemporaryDirectory() as tmp: + path = Path(tmp) / "a.csv" + path.write_text("a,b,c\n1,2,3") + assert 
get_csv_row_count(path) == 1 # We will not reference the encoding when reading: