Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Explain confidence interval #169

Merged
merged 3 commits into from
Nov 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 10 additions & 11 deletions dp_wizard/app/analysis_panel.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from dp_wizard.app.components.inputs import log_slider
from dp_wizard.app.components.column_module import column_ui, column_server
from dp_wizard.utils.csv_helper import read_csv_ids_labels, read_csv_ids_names
from dp_wizard.utils.dp_helper import confidence
from dp_wizard.app.components.outputs import output_code_sample, demo_tooltip
from dp_wizard.utils.code_generators import make_privacy_loss_block
from dp_wizard.app.components.column_module import col_widths
Expand Down Expand Up @@ -113,6 +114,14 @@ def columns_ui():
weights=weights,
is_demo=is_demo,
)
confidence_percent = f"{int(confidence * 100)}%"
note_md = f"""
This simulation assumes a normal distribution between the specified
lower and upper bounds. Your CSV has not been read except to
determine the columns.

The confidence interval is {confidence_percent}.
"""
return [
[
[
Expand All @@ -125,17 +134,7 @@ def columns_ui():
(
ui.layout_columns(
[],
[
ui.markdown(
"""
This simulation assumes a normal
distribution between the specified
lower and upper bounds. Your data
file has not been read except to
determine the columns.
"""
)
],
[ui.markdown(note_md)],
col_widths=col_widths, # type: ignore
)
if column_ids
Expand Down
4 changes: 2 additions & 2 deletions dp_wizard/app/components/column_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from shiny import ui, render, module, reactive, Inputs, Outputs, Session

from dp_wizard.utils.dp_helper import make_confidence_accuracy_histogram
from dp_wizard.utils.dp_helper import make_accuracy_histogram
from dp_wizard.utils.shared import plot_histogram
from dp_wizard.utils.code_generators import make_column_config_block
from dp_wizard.app.components.outputs import output_code_sample, demo_tooltip
Expand Down Expand Up @@ -155,7 +155,7 @@ def column_plot():
# This function is triggered when column is removed;
# Exit early to avoid divide-by-zero.
return None
_confidence, accuracy, histogram = make_confidence_accuracy_histogram(
accuracy, histogram = make_accuracy_histogram(
lower=lower_x,
upper=upper_x,
bin_count=bin_count,
Expand Down
7 changes: 6 additions & 1 deletion dp_wizard/utils/code_generators/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import re
from dp_wizard.utils.csv_helper import name_to_identifier
from dp_wizard.utils.code_generators._template import Template
from dp_wizard.utils.dp_helper import confidence


class AnalysisPlanColumn(NamedTuple):
Expand Down Expand Up @@ -77,7 +78,11 @@ def _make_columns(self, columns: dict[str, AnalysisPlanColumn]):
)

def _make_queries(self, column_names: Iterable[str]):
    """
    Return the queries section of the generated code as a single string.

    The section opens with a ``confidence = <value>`` assignment carrying a
    trailing comment that states the confidence as a percentage, then a
    blank line, then the result of ``_make_query`` for each entry in
    ``column_names``, joined by newlines.
    """
    # Format with ":g" instead of int(): int() truncates, so a float
    # product such as 0.57 * 100 (56.999...) would be reported as 56%.
    # ":g" rounds to six significant digits and drops a trailing ".0",
    # so the current value 0.95 still renders as "95%".
    confidence_percent = f"{confidence * 100:g}%"
    confidence_note = (
        "The actual value is within the shown range "
        f"with {confidence_percent} confidence."
    )
    queries = "\n".join(_make_query(column_name) for column_name in column_names)
    return f"confidence = {confidence} # {confidence_note}\n\n" + queries

Expand Down
14 changes: 7 additions & 7 deletions dp_wizard/utils/dp_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,20 @@
dp.enable_features("contrib")


def make_confidence_accuracy_histogram(
confidence = 0.95


def make_accuracy_histogram(
lower: float,
upper: float,
bin_count: int,
contributions: int,
weighted_epsilon: float,
) -> tuple[float, float, Any]:
) -> tuple[float, Any]:
"""
Creates fake data between lower and upper, and then returns a DP histogram from it.
>>> confidence, accuracy, histogram = make_confidence_accuracy_histogram(
>>> accuracy, histogram = make_accuracy_histogram(
... lower=0, upper=10, bin_count=5, contributions=1, weighted_epsilon=1)
>>> confidence
0.95
>>> accuracy
3.37...
>>> histogram
Expand Down Expand Up @@ -74,9 +75,8 @@ def make_confidence_accuracy_histogram(
)
query = context.query().group_by("bin").agg(pl.len().dp.noise()) # type: ignore

confidence = 0.95
accuracy = query.summarize(alpha=1 - confidence)["accuracy"].item() # type: ignore
# The sort is alphabetical. df_to_columns needs to be used
# downstream to parse interval and sort by numeric value.
histogram = query.release().collect().sort("bin")
return (confidence, accuracy, histogram)
return (accuracy, histogram)
Loading