From 015b5ddcfe0d5f87795f4a5bdee8e46f3a5ade30 Mon Sep 17 00:00:00 2001 From: Richard Preen Date: Tue, 22 Oct 2024 12:34:50 +0100 Subject: [PATCH] add support for pandas version 2 (#237) * changes to support pandas>2 * update version number; allow pandas v2; try testing 3.13 * update notes/ci for Python313 * update lint runner to use pip install --- .github/workflows/lint.yml | 8 ++--- .github/workflows/tests.yml | 2 +- ...ests_python312.yml => tests_python313.yml} | 4 +-- CHANGELOG.md | 5 ++++ CITATION.cff | 7 +++-- README.md | 29 ++++++++----------- acro/acro_tables.py | 3 ++ acro/version.py | 2 +- docs/source/conf.py | 2 +- requirements.txt | 7 ----- setup.cfg | 11 +++---- test/test_initial.py | 7 ++++- 12 files changed, 45 insertions(+), 42 deletions(-) rename .github/workflows/{tests_python312.yml => tests_python313.yml} (94%) delete mode 100644 requirements.txt diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index af4e3c5..1ce028c 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -15,11 +15,11 @@ jobs: - name: Checkout uses: actions/checkout@v4 - - name: Install dependencies + - name: Install run: | - python -m pip install --upgrade pip - pip install pylint pytest pytest-cov - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + pip install --upgrade pip + pip install .[test] + pip install pylint - name: pylint run: | diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 3609248..4141a99 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -18,7 +18,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest, macos-latest, windows-latest] - python-version: ['3.8', '3.9', '3.10', '3.11'] + python-version: ['3.9', '3.10', '3.11', '3.12'] steps: - name: Checkout diff --git a/.github/workflows/tests_python312.yml b/.github/workflows/tests_python313.yml similarity index 94% rename from .github/workflows/tests_python312.yml rename to .github/workflows/tests_python313.yml index c877b24..9b49fad 100644 --- a/.github/workflows/tests_python312.yml +++ b/.github/workflows/tests_python313.yml @@ -1,5 +1,5 @@ --- -name: Test Python312 +name: Test Python313 on: workflow_dispatch @@ -11,7 +11,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest, macos-latest, windows-latest] - python-version: ['3.12'] + python-version: ['3.13'] steps: - name: Checkout diff --git a/CHANGELOG.md b/CHANGELOG.md index 930ee1a..9ebe20e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog +## Version 0.4.7 (Oct 22, 2024) + +Changes: +* Add support for Pandas version 2 ([#237](https://github.com/AI-SDC/ACRO/pull/237)) + ## Version 0.4.6 (Jun 25, 2024) Changes: diff --git a/CITATION.cff b/CITATION.cff index 80887c5..01d3956 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,8 +1,8 @@ cff-version: 1.2.0 title: ACRO -version: 0.4.6 -doi: 10.5281/zenodo.12535291 -date-released: 2024-06-25 +version: 0.4.7 +doi: +date-released: 2024-10-22 license: MIT repository-code: https://github.com/AI-SDC/ACRO languages: @@ -13,6 +13,7 @@ keywords: - privacy - privacy tools - statistical disclosure control + - statistical software authors: - family-names: Preen given-names: Richard John diff --git a/README.md b/README.md index 567d9d3..743cba9 100644 --- a/README.md +++ b/README.md @@ -8,22 +8,16 @@ This repository holds the Python ACRO package. An R wrapper package is available: [ACRO-R](https://github.com/AI-SDC/ACRO-R). -ACRO (Automatic Checking of Research Outputs) is an open source -tool for automating the statistical disclosure control (SDC) of research -outputs. ACRO assists researchers and output checkers by distinguishing between -research output that is safe to publish, output that requires further analysis, -and output that cannot be published because of substantial disclosure risk. - -It does this by providing a light-weight 'skin' that sits over well-known -analysis tools, in a variety of languages researchers might use. This adds -functionality to: - -* identify potentially disclosive outputs against a range of commonly used - disclosure tests; +A GUI for viewing and approving outputs is also available: [SACRO-Viewer](https://github.com/AI-SDC/SACRO-Viewer) + +ACRO (Automatic Checking of Research Outputs) is an open source tool for automating the [statistical disclosure control](https://en.wikipedia.org/wiki/Statistical_disclosure_control) (SDC) of research outputs. ACRO assists researchers and output checkers by distinguishing between research output that is safe to publish, output that requires further analysis, and output that cannot be published because of a substantial risk of disclosing private data. + +It does this by providing a lightweight 'skin' that sits over well-known analysis tools, in a variety of languages researchers might use. This adds functionality to: + +* identify potentially disclosive outputs against a range of commonly used disclosure tests; * suppress outputs where required; * report reasons for suppression; -* produce simple summary documents TRE staff can use to streamline their - workflow. +* produce simple summary documents TRE staff can use to streamline their workflow. ![ACRO workflow and architecture schematic](docs/schematic.png) @@ -37,15 +31,16 @@ If installed in this way, the example [notebooks](notebooks) and the [data](data $ pip install acro ``` -#### Notes for Python 3.12 +#### Notes for Python 3.13 -ACRO currently depends on an older version of Pandas (~1.5.0) for which no pre-compiled wheels are available within pip for Python 3.12. Therefore, in this scenario, Pandas must be built from source. This requires the installation of a C++ compiler before pip installing acro. +ACRO currently depends on numpy version 1.x.x for which no pre-compiled wheels are available within pip for Python 3.13. Therefore, in this scenario, numpy must be built from source. This requires the installation of a C++ compiler before pip installing acro. -For Windows, [Microsoft Visual Studio](https://visualstudio.microsoft.com/downloads/) and the [C++ build tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/) will likely need to be installed first. +For Windows, the [Microsoft Visual Studio C++ build tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/) will likely need to be installed first. ### Examples See the example notebooks for: + * [Python charities dataset](notebooks/test.ipynb) * [Python nursery dataset](notebooks/test-nursery.ipynb) * [R charities dataset](https://ai-sdc.github.io/ACRO/_static/test.nb.html) diff --git a/acro/acro_tables.py b/acro/acro_tables.py index be8da02..c243f00 100644 --- a/acro/acro_tables.py +++ b/acro/acro_tables.py @@ -1512,6 +1512,9 @@ def crosstab_with_totals( # pylint: disable=too-many-arguments,too-many-locals normalize=normalize, ) + if table.empty: + raise ValueError("empty table") + table, _ = delete_empty_rows_columns(table) masks = create_crosstab_masks( index_new, diff --git a/acro/version.py b/acro/version.py index 694c981..b53ab89 100644 --- a/acro/version.py +++ b/acro/version.py @@ -1,3 +1,3 @@ """ACRO version number.""" -__version__ = "0.4.6" +__version__ = "0.4.7" diff --git a/docs/source/conf.py b/docs/source/conf.py index 77bb166..ea72598 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -12,7 +12,7 @@ # -- Project information ----------------------------------------------------- project = "ACRO" -copyright = "2023, ACRO Project Team" +copyright = "2024, ACRO Project Team" author = "ACRO Project Team" release = __version__ diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 808ed20..0000000 --- a/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -lxml==4.9.1 -matplotlib==3.7.2 -numpy==1.23.1 -openpyxl==3.0.10 -pandas==1.5.0 -PyYAML==6.0 -statsmodels==0.13.2 diff --git a/setup.cfg b/setup.cfg index 5953584..d272bd7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = acro -version = 0.4.6 +version = 0.4.7 description = ACRO: Tools for the Automatic Checking of Research Outputs long_description = file: README.md long_description_content_type = text/markdown @@ -10,16 +10,16 @@ maintainer_email = james.smith@uwe.ac.uk license = MIT license_files = LICENSE.md classifiers = - Development Status :: 3 - Alpha + Development Status :: 4 - Beta Intended Audience :: Developers Intended Audience :: Science/Research License :: OSI Approved :: MIT License Natural Language :: English - Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 Programming Language :: Python :: 3.11 Programming Language :: Python :: 3.12 + Programming Language :: Python :: 3.13 Topic :: Scientific/Engineering Topic :: Scientific/Engineering :: Information Analysis Operating System :: OS Independent @@ -29,6 +29,7 @@ keywords = privacy privacy-tools statistical-disclosure-control + statistical-software project_urls = Changelog = https://github.com/AI-SDC/ACRO/CHANGELOG.md Documentation = https://github.com/AI-SDC/ACRO/wiki @@ -36,7 +37,7 @@ project_urls = Discussions = https://github.com/AI-SDC/ACRO/discussions [options] -python_requires = >=3.8 +python_requires = >=3.9 zip_safe = False include_package_data = True packages = find: @@ -45,7 +46,7 @@ install_requires = matplotlib numpy<2.0.0 openpyxl - pandas~=1.5.0 + pandas>=1.5.0,<2.3 PyYAML statsmodels diff --git a/test/test_initial.py b/test/test_initial.py index 7cbc2be..c8da439 100644 --- a/test/test_initial.py +++ b/test/test_initial.py @@ -379,7 +379,12 @@ def test_finalise_json(data, acro): assert orig.summary == read.summary assert orig.comments == read.comments assert orig.timestamp == read.timestamp - assert (orig.output[0].reset_index()).equals(read.output[0]) + # check SDC outcome DataFrame + orig_df = orig.output[0].reset_index() + read_df = read.output[0] + pd.testing.assert_frame_equal( + orig_df, read_df, check_names=False, check_dtype=False + ) # test reading JSON with open(os.path.normpath(f"{PATH}/results.json"), encoding="utf-8") as file: json_data = json.load(file)