Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle public and private CSVs #218

Merged
merged 29 commits into from
Jan 16, 2025
Merged
Changes from 1 commit
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
afc07fc
handle public and private CSV in CLI... but nothing downstream
mccalluc Jan 13, 2025
f4ec7a3
coverage
mccalluc Jan 13, 2025
492c4d6
More readable CLI help
mccalluc Jan 13, 2025
5858129
fake data into separate function
mccalluc Jan 13, 2025
2b95650
warn -> error
mccalluc Jan 13, 2025
25083dd
warn -> error
mccalluc Jan 13, 2025
c76ac79
add private and public component params
mccalluc Jan 13, 2025
9a2ecc3
add explanation in UI
mccalluc Jan 13, 2025
7f6cc46
add cards to organize first tab
mccalluc Jan 13, 2025
b900e05
stub where the warning message will go
mccalluc Jan 14, 2025
c398c9a
warning message about column mismatch
mccalluc Jan 14, 2025
64c01a3
better formating on list
mccalluc Jan 14, 2025
327395d
linting
mccalluc Jan 14, 2025
9c2439d
make the "Define analysis" button conditional
mccalluc Jan 14, 2025
3a0de4e
fix label in end-to-end
mccalluc Jan 14, 2025
c280b0b
reformat for readability
mccalluc Jan 14, 2025
43ef859
match -> mismatch
mccalluc Jan 14, 2025
5bbcc91
read either public or private
mccalluc Jan 14, 2025
9712374
move out content of simulation card
mccalluc Jan 14, 2025
0893d72
Different simulation card if public CSV
mccalluc Jan 14, 2025
630d5a6
fix renaming bugs
mccalluc Jan 14, 2025
cb394fb
add test to fix coverage; use "Optional"
mccalluc Jan 14, 2025
3f4b11b
factor mock data generation out of make_accuracy_histogram
mccalluc Jan 14, 2025
2cbc826
public and private previews
mccalluc Jan 14, 2025
38160f5
start testing conditional display for public vs private
mccalluc Jan 15, 2025
48c1239
nb reads public or private
mccalluc Jan 15, 2025
6f25df7
also make plot title conditional
mccalluc Jan 15, 2025
a4ca970
factor out shared descriptions
mccalluc Jan 15, 2025
df1a17d
missing f on f-string
mccalluc Jan 15, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
warning message about column mismatch
mccalluc committed Jan 14, 2025

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
commit c398c9ac7249f2e8cf2447576566e24c001cab14
35 changes: 30 additions & 5 deletions dp_wizard/app/dataset_panel.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from pathlib import Path
import csv

from shiny import ui, reactive, render, Inputs, Outputs, Session

from dp_wizard.utils.argparse_helpers import get_cli_info
from dp_wizard.utils.csv_helper import csv_names_mismatch
from dp_wizard.app.components.outputs import output_code_sample, demo_tooltip
from dp_wizard.utils.code_generators import make_privacy_unit_block

@@ -93,12 +95,30 @@ def _on_public_csv_path_change():
def _on_private_csv_path_change():
private_csv_path.set(input.private_csv_path()[0]["datapath"])

@render.ui
def csv_column_match_ui():
@reactive.calc
def csv_column_match_calc() -> tuple[set, set] | None:
public = public_csv_path()
private = private_csv_path()
if public and private:
return f"TODO: read files and check columns: public: {public_csv_path()}, private: {private_csv_path()}"
just_public, just_private = csv_names_mismatch(Path(public), Path(private))
if just_public or just_private:
return just_public, just_private

@render.ui
def csv_column_match_ui():
    """Render a markdown warning listing columns present in only one CSV.

    Reads the mismatch from csv_column_match_calc(). Returns None, so that
    nothing is rendered, when no mismatch is reported.
    """
    mismatch = csv_column_match_calc()
    if not mismatch:
        return None
    just_public, just_private = mismatch
    messages = []
    # The names arrive as sets, which iterate in arbitrary order; sorting
    # keeps the warning text stable across re-renders and sessions.
    if just_public:
        messages.append(
            f"- Only the public CSV contains: {', '.join(sorted(just_public))}."
        )
    if just_private:
        messages.append(
            f"- Only the private CSV contains: {', '.join(sorted(just_private))}."
        )
    return ui.markdown("\n".join(messages))

@reactive.effect
@reactive.event(input.contributions)
@@ -109,8 +129,13 @@ def _on_contributions_change():
def button_enabled():
contributions_is_set = input.contributions() is not None
csv_path_is_set = (
input.csv_path() is not None and len(input.csv_path()) > 0
) or is_demo
(input.public_csv_path() is not None and len(input.public_csv_path()) > 0)
or (
input.private_csv_path() is not None
and len(input.private_csv_path()) > 0
)
or is_demo
)
return contributions_is_set and csv_path_is_set

@render.ui
16 changes: 13 additions & 3 deletions dp_wizard/utils/csv_helper.py
Original file line number Diff line number Diff line change
@@ -8,9 +8,11 @@

import re
import polars as pl
from tempfile import tempdir
from pathlib import Path


def read_csv_names(csv_path: str):
def read_csv_names(csv_path: Path):
# Polars is overkill, but it is more robust against
# variations in encoding than Python stdlib csv.
# However, it could be slow:
@@ -21,14 +23,22 @@ def read_csv_names(csv_path: str):
return lf.collect_schema().names()


def read_csv_ids_labels(csv_path: str):
def csv_names_mismatch(public_csv_path: Path, private_csv_path: Path):
    """Compare the column names of a public and a private CSV.

    Returns a 2-tuple of sets: the names found only in the public CSV,
    followed by the names found only in the private CSV.
    """
    # Read the public file first, then the private one.
    public_columns, private_columns = (
        set(read_csv_names(path)) for path in (public_csv_path, private_csv_path)
    )
    return (
        public_columns.difference(private_columns),
        private_columns.difference(public_columns),
    )


def read_csv_ids_labels(csv_path: Path):
return {
name_to_id(name): f"{i+1}: {name or '[blank]'}"
for i, name in enumerate(read_csv_names(csv_path))
}


def read_csv_ids_names(csv_path: str):
def read_csv_ids_names(csv_path: Path):
return {name_to_id(name): name for name in read_csv_names(csv_path)}


19 changes: 18 additions & 1 deletion tests/utils/test_csv_helper.py
Original file line number Diff line number Diff line change
@@ -4,7 +4,24 @@
import tempfile
import pytest

from dp_wizard.utils.csv_helper import read_csv_ids_labels, read_csv_ids_names
from pathlib import Path

from dp_wizard.utils.csv_helper import (
read_csv_ids_labels,
read_csv_ids_names,
csv_names_mismatch,
)


def test_csv_names_mismatch():
    """Header names unique to each CSV are reported on the matching side."""
    with tempfile.TemporaryDirectory() as tmp:
        tmp_dir = Path(tmp)
        # Headers overlap on "b" and "c"; "a" and "d" are each unique to one file.
        first_csv = tmp_dir / "a.csv"
        first_csv.write_text("a,b,c")
        second_csv = tmp_dir / "b.csv"
        second_csv.write_text("b,c,d")
        only_first, only_second = csv_names_mismatch(first_csv, second_csv)
        assert only_first == {"a"}
        assert only_second == {"d"}


# We will not reference the encoding when reading: