From d193835301f8ef41ea455ff1a135e0c409623324 Mon Sep 17 00:00:00 2001
From: ds <63077097+dsmedia@users.noreply.github.com>
Date: Tue, 23 Jul 2024 07:15:42 -0400
Subject: [PATCH] Add source for budget.json and validation script

Partially addresses #15
---
 SOURCES.md                         |  3 +++
 scripts/validate_budget_dataset.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+)
 create mode 100644 scripts/validate_budget_dataset.py

diff --git a/SOURCES.md b/SOURCES.md
index bad6739..d83d3f9 100644
--- a/SOURCES.md
+++ b/SOURCES.md
@@ -26,6 +26,9 @@ http://wildlife.faa.gov
 
 ## `budget.json`
 
+Source: Office of Management and Budget (U.S.)
+[Budget FY 2016 - Receipts](https://www.govinfo.gov/app/details/BUDGET-2016-DB/BUDGET-2016-DB-3)
+
 ## `budgets.json`
 
 ## `burtin.json`
diff --git a/scripts/validate_budget_dataset.py b/scripts/validate_budget_dataset.py
new file mode 100644
index 0000000..8fbee5f
--- /dev/null
+++ b/scripts/validate_budget_dataset.py
@@ -0,0 +1,46 @@
+"""Check that the vega-datasets budget.json matches its upstream source."""
+
+import sys
+
+import pandas as pd
+
+
+def validate_budget_data():
+    """Validate budget.json against the original published workbook.
+
+    Validates that the budget data in the vega-datasets repository
+    matches the original source from the U.S. Government Publishing Office.
+    Both tables are downloaded on every call, so network access is required;
+    pandas also needs the optional ``xlrd`` engine to read the ``.xls`` file.
+
+    Returns:
+        True if the two DataFrames are identical, False otherwise.
+    """
+    # Landing page: https://www.govinfo.gov/app/details/BUDGET-2016-DB/context
+    source_url = "https://www.govinfo.gov/content/pkg/BUDGET-2016-DB/xls/BUDGET-2016-DB-3.xls"
+    # Pinned to the 2015-10-15 commit so the comparison is reproducible.
+    vega_url = "https://raw.githubusercontent.com/vega/vega-datasets/05fcb7c07b1d76206856e75129fc1e79dc61735c/data/budget.json"
+
+    source_df = pd.read_excel(source_url)
+    vega_df = pd.read_json(vega_url)
+
+    # Format the all-digit columns and 'TQ' with thousands separators before
+    # comparing. Labels go through str() because read_excel yields non-string
+    # labels when the header cells are numeric; 'TQ' is only used if present.
+    amount_cols = [
+        col for col in source_df.columns if str(col).isdigit() or col == "TQ"
+    ]
+    source_df[amount_cols] = source_df[amount_cols].apply(
+        lambda col: col.map(lambda x: f"{x:,}" if pd.notnull(x) else x)
+    )
+
+    try:
+        pd.testing.assert_frame_equal(source_df, vega_df)
+    except AssertionError as e:
+        print("The DataFrames are not identical. \nDifferences:", e)
+        return False
+    print("The DataFrames are identical.")
+    return True
+
+
+if __name__ == "__main__":
+    # Exit non-zero on a mismatch so CI and shell callers can detect it.
+    sys.exit(0 if validate_budget_data() else 1)