diff --git a/.github/workflows/haupt.yml b/.github/workflows/haupt.yml index 540c0a1..86e8c31 100644 --- a/.github/workflows/haupt.yml +++ b/.github/workflows/haupt.yml @@ -13,9 +13,10 @@ defaults: env: SKIP_HYRISE: false + SKIP_HYRISE_MASTER: false SKIP_MONETDB: false SKIP_DUCKDB: false - SCALE_FACTOR: 0.5 + SCALE_FACTOR: 1 CMAKE_GENERATOR: Ninja jobs: @@ -45,10 +46,11 @@ jobs: # Add repository for older python versions. We use 3.11 as there are several issues with 3.12 (e.g., removed distutils and pip problems). sudo add-apt-repository ppa:deadsnakes/ppa --yes # We don't use Hyrise's install_dependencies script as it includes much more than needed for this small setup here. - sudo apt-get install -y ninja-build libboost-all-dev postgresql-server-dev-16 libtbb-dev libreadline-dev libsqlite3-dev systemtap-sdt-dev lld numactl python3.11-full python3.11-venv + sudo apt-get install -y -qq ninja-build libboost-all-dev postgresql-server-dev-16 libtbb-dev libreadline-dev libsqlite3-dev systemtap-sdt-dev numactl python3.11-full python3.11-venv clang-17 lld-17 + sudo update-alternatives --install /usr/bin/ld.lld ld.lld /usr/bin/ld.lld-17 90 python3.11 -m venv ~/venv source ~/venv/bin/activate - python -m pip install -r python/requirements.txt --quiet + python -m pip install -r python/requirements.txt --quiet - name: Determine core and client counts for database comparison id: core_client_counts @@ -70,10 +72,10 @@ jobs: run: | mkdir -p encoding_plugin/rel pushd encoding_plugin/rel > /dev/null - # Erase all encoding types. Hurts performance but allows us to compile in release mode with GitHub runners. - # Further, we use the relaxed mode as there are several issues with newer compiler (fixed in Hyrise's master, - # but not in the project's code). - cmake -DCMAKE_BUILD_TYPE=Release -DHYRISE_RELAXED_BUILD=ON -DERASE_SEGMENT_TYPES=Dictionary,LZ4,RunLength,FSST,FrameOfReference,Unencoded,FixedStringDictionary .. 
+ # We use the relaxed mode as there are several issues with newer compiler (fixed in Hyrise's master, + # but not in the project's code). On top of relaxed mode (i.e., not all warnings are errors), we + # disable issues with deprecated declarations (atomic shared_ptr's). + cmake -DCMAKE_C_COMPILER=clang-17 -DCMAKE_CXX_COMPILER=clang++-17 -DCMAKE_BUILD_TYPE=Release -DCMAKE_UNITY_BUILD=ON -DHYRISE_RELAXED_BUILD=ON -DCMAKE_CXX_FLAGS="-Wno-deprecated-declarations" .. cmake --build . --target hyriseServer WorkloadStatisticsPlugin WorkloadHandlerPlugin CommandExecutorPlugin DataCharacteristicsPlugin popd > /dev/null @@ -87,7 +89,7 @@ jobs: source ~/venv/bin/activate pushd python > /dev/null - python3 runner.py --hyrise_server_path=../encoding_plugin/rel/ --base_benchmark_runs=1 --single_benchmark=TPC-H --execute=calibration --scale_factor ${{ env.SCALE_FACTOR }} --random_encoding_configs_count=3 + python3 runner.py --hyrise_server_path=../encoding_plugin/rel/ --base_benchmark_runs=1 --single_benchmark=TPC-H --execute=calibration --scale_factor ${{ env.SCALE_FACTOR }} --random_encoding_configs_count=3 popd > /dev/null - name: Run calibration - learn runtime and size models @@ -189,6 +191,14 @@ jobs: repository: electrum/tpch-dbgen path: ./tpch-dbgen + - uses: actions/checkout@master + if: env.SKIP_HYRISE_MASTER == 'false' + with: + token: ${{ secrets.PAT }} + repository: hyrise/hyrise + path: ./hyrise_master + submodules: recursive + - name: Determine client and core counts for database comparison id: core_client_counts run: | @@ -213,7 +223,8 @@ jobs: - name: Install dependencies run: | sudo apt-get update -y -qq - sudo apt-get install -y -qq ninja-build libsqlite3-dev postgresql-server-dev-16 numactl bison python3-venv + sudo apt-get install -y -qq ninja-build libsqlite3-dev postgresql-server-dev-16 numactl bison python3-venv libboost-all-dev libtbb-dev libreadline-dev clang-17 lld-17 + sudo update-alternatives --install /usr/bin/ld.lld ld.lld /usr/bin/ld.lld-17 90 
python3 -m venv ~/venv source ~/venv/bin/activate pip3 install -r python/requirements.txt # Not using --quiet to log the installed DuckDB version. @@ -259,7 +270,7 @@ jobs: chmod 644 *.tbl mkdir -p sf${{ env.SCALE_FACTOR }} - mv *.tbl sf${{ env.SCALE_FACTOR }} + mv *.tbl sf${{ env.SCALE_FACTOR }} popd mv tpch-dbgen ~ @@ -273,9 +284,30 @@ jobs: python3 db_comparison_runner.py duckdb --cores=${{ env.CORE_COUNT }} --scale_factor=${{ env.SCALE_FACTOR }} --determine_size_only popd + - name: Build Hyrise (master) release server + if: env.SKIP_HYRISE_MASTER == 'false' + run: | + mkdir -p hyrise_master/rel + pushd hyrise_master/rel > /dev/null + cmake -DCMAKE_C_COMPILER=clang-17 -DCMAKE_CXX_COMPILER=clang++-17 -DCMAKE_BUILD_TYPE=Release -DCMAKE_UNITY_BUILD=ON -DHYRISE_RELAXED_BUILD=ON .. + cmake --build . --target hyriseServer + popd > /dev/null + + - name: Benchmark Hyrise (master, database comparison) + if: env.SKIP_HYRISE_MASTER == 'false' + run: | + pushd python + source ~/venv/bin/activate + python3 db_comparison_runner.py hyrise --hyrise_server_path=../hyrise_master/rel/ --cores=${{ env.CORE_COUNT }} --scale_factor=${{ env.SCALE_FACTOR }} --clients=${{ env.CLIENT_COUNT }} --time=${{ env.COMPARISON_RUNTIME }} + python3 db_comparison_runner.py hyrise --hyrise_server_path=../hyrise_master/rel/ --cores=${{ env.CORE_COUNT }} --scale_factor=${{ env.SCALE_FACTOR }} --determine_size_only + + mv db_comparison_results/database_comparison__TPC-H__hyrise.csv db_comparison_results/database_comparison__TPC-H__hyrise_master.csv + mv db_comparison_results/size_hyrise.csv db_comparison_results/size_hyrise_master.csv + popd + - name: Upload benchmark results uses: actions/upload-artifact@master - if: env.SKIP_DUCKDB == 'false' || env.SKIP_MONETDB == 'false' + if: env.SKIP_DUCKDB == 'false' || env.SKIP_MONETDB == 'false' || env.SKIP_HYRISE_MASTER == 'false' with: name: comparison_results path: | @@ -334,4 +366,4 @@ jobs: with: name: database_comparison path: | - db_comparison.pdf + 
db_comparison*.pdf diff --git a/R/plot.R b/R/plot.R index f10504f..d6acb26 100644 --- a/R/plot.R +++ b/R/plot.R @@ -7,8 +7,8 @@ scale_factor = Sys.getenv("SCALE_FACTOR") run_name = Sys.getenv("CALIBRATION_RUN") # currently not used. hyrise_core_count = Sys.getenv("HYRISE_CORE_COUNT") hyrise_client_count = Sys.getenv("HYRISE_CLIENT_COUNT") -comparison_core_count = Sys.getenv("HYRISE_CORE_COUNT") -comparison_client_count = Sys.getenv("HYRISE_CLIENT_COUNT") +comparison_core_count = Sys.getenv("COMPARISON_CORE_COUNT") +comparison_client_count = Sys.getenv("COMPARISON_CLIENT_COUNT") results_dir = paste0("results_to_plot") @@ -38,13 +38,17 @@ hyrise_lp$is_geom_line <- TRUE monet_runtimes <- read.csv(paste0(results_dir, "/database_comparison__TPC-H__monetdb.csv")) hyrise_runtimes <- read.csv(paste0(results_dir, "/database_comparison__TPC-H__hyrise.csv")) duckdb_runtimes <- read.csv(paste0(results_dir, "/database_comparison__TPC-H__duckdb.csv")) +hyrise_master_runtimes <- read.csv(paste0(results_dir, "/database_comparison__TPC-H__hyrise_master.csv")) +hyrise_master_runtimes$DATABASE_SYSTEM = "hyrise_master" -monet_size <- read.csv(paste0(results_dir, "/size_monetdb__SF", scale_factor, ".csv")) -hyrise_size <- read.csv(paste0(results_dir, "/size_hyrise__SF", scale_factor, ".csv")) -duckdb_size <- read.csv(paste0(results_dir, "/size_duckdb__SF", scale_factor, ".csv")) +monet_size <- read.csv(paste0(results_dir, "/size_monetdb.csv")) +hyrise_size <- read.csv(paste0(results_dir, "/size_hyrise.csv")) +duckdb_size <- read.csv(paste0(results_dir, "/size_duckdb.csv")) +hyrise_master_size <- read.csv(paste0(results_dir, "/size_hyrise_master.csv")) +hyrise_master_size$DATABASE_SYSTEM = "hyrise_master" -runtimes <- rbind(monet_runtimes, hyrise_runtimes, duckdb_runtimes) -sizes <- rbind(monet_size, hyrise_size, duckdb_size) +runtimes <- rbind(monet_runtimes, hyrise_runtimes, duckdb_runtimes, hyrise_master_runtimes) +sizes <- rbind(monet_size, hyrise_size, duckdb_size, 
hyrise_master_size) runtimes_q_agg <- runtimes %>% group_by(DATABASE_SYSTEM, ITEM_NAME) %>% summarize(median_runtime = mean(RUNTIME_MS), .groups="keep") runtimes_db_agg <- runtimes_q_agg %>% group_by(DATABASE_SYSTEM) %>% summarize(cumulative_runtime = sum(median_runtime), .groups="keep") @@ -66,6 +70,7 @@ joined <- rbind(joined, first_lp) joined$DATABASE_SYSTEM[which(joined$DATABASE_SYSTEM == "duckdb")] <- "DuckDB" joined$DATABASE_SYSTEM[which(joined$DATABASE_SYSTEM == "monetdb")] <- "MonetDB" joined$DATABASE_SYSTEM[which(joined$DATABASE_SYSTEM == "hyrise")] <- "Default Hyrise" +joined$DATABASE_SYSTEM[which(joined$DATABASE_SYSTEM == "hyrise_master")] <- "Hyrise Master" max_size <- max(joined$size_gb) max_throughput <- max(joined$runs_per_hour) @@ -96,4 +101,4 @@ g <- ggplot(joined, aes(x=size_gb, y=runs_per_hour, group=DATABASE_SYSTEM, fill= force = 0.5, ) -ggsave("db_comparison.pdf", g, width=7, height=5) +ggsave(paste0("db_comparison__", strftime(as.POSIXlt(Sys.time(), "UTC") , "%Y-%m-%d"),".pdf"), g, width=7, height=5) diff --git a/README.md b/README.md index fbceb22..fda554f 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,13 @@ This repository contains source code and artifacts for the paper [**Robust and Budget-Constrained Encoding Configurations for In-Memory Database Systems**](https://www.vldb.org/pvldb/vol15/p780-boissier.pdf) (VLDB 2022). +Exemplary results of a pipeline run, comparing against MonetDB and DuckDB. + + +Once a month, we automatically execute the encoding system presented in the paper as a GitHub action and compare the performance against MonetDB and DuckDB[^1]. +You can see the most recent results plotted in the `database_comparison.zip` artifact of the [most recent run of the main branch](https://github.com/hyrise/encoding_selection/actions?query=branch%3Amain) (more information below). + + In case you have any questions, please contact [Martin Boissier](https://hpi.de/plattner/people/phd-students/martin-boissier.html). 
@@ -40,15 +47,15 @@ The whole encoding selection pipeline runs within GitHub actions to ease reprodu The `hyrise_full_pipeline` job in the main workflow file [haupt.yml](https://github.com/hyrise/encoding_selection/blob/main/.github/workflows/haupt.yml#L20) lists all steps required from gathering calibration data, learning models, selecting configurations, to evaluating them. Due to GitHub restrictions, the pipeline creates only a tiny data set (scale factor of 0.5). -For each run, we compare Hyrise against MonetDB and DuckDB[^1]. -The results are plotted and stored in the artifacts of each run[^2]. +For each run, we compare Hyrise against MonetDB and DuckDB[^2]. +The results are plotted and stored in the artifacts of each run[^3]. Download `database_comparison(.zip)` of the last succesful run for a plot of the TPC-H benchmark runs. The code (both the plugins as well as the Python scripts) are extracted from a larger project. Please excuse the often convoluted and bloated code. -Flowchart of the GitHub runner workflow[^3]: +Flowchart of the GitHub runner workflow: ```mermaid flowchart LR; Start --> setuph["Setup Hyrise Pipeline
(git, apt, pip, ...)"]; @@ -65,7 +72,9 @@ flowchart LR; runduckdb --> plot; ``` -[^1]: Please view the results with a huge grain of salt, especially the DuckDB results. +[^1]: Please note that the comparison is executed on GitHub action runners and is thus only meant to show reproducibility. We do not aim to establish a performance order with these action runs. + +[^2]: Please view the results with a huge grain of salt, especially the DuckDB results. We are huge fans of DuckDB and thus wanted to include it. But the current benchmark script is probably an unfair comparison, as DuckDB's aim is more on single-user performance (i.e., data scientists/smartists). Hyrise's focus on concurrent OLTP/OLAP users. @@ -74,8 +83,6 @@ Further, we cannot rule out that Python's GIL causes unexpected performance degr We have talked to the DuckDB maintainers and decided to exclude DuckDB measurements from the paper for this reason. In case you can help us to make a fair comparison, feel free to post a pull request. -[^2]: The plots are meant to show the reproducibility of the results, not to establish a fair comparison. +[^3]: The plots are meant to show the reproducibility of the results, not to establish a fair comparison. To conduct a "fairer" comparison (cf. footnote on DuckDB), the pipeline needs to be run on a dedicated machine. We have seen workflow runtimes on GitHub varying from 3h to over 6h (which is than canceled by GitHub) for the same setup. - -[^3]: Yes, I just wanted to integrate the flowchart for the sake of integrating a flowchart in Markdown. It isn't that interesting. 
diff --git a/python/db_comparison_runner.py b/python/db_comparison_runner.py index 76759b6..ca95458 100644 --- a/python/db_comparison_runner.py +++ b/python/db_comparison_runner.py @@ -64,7 +64,7 @@ hyrise_server_path = Path(args.hyrise_server_path).expanduser().resolve() assert (hyrise_server_path / "hyriseServer").exists(), "Please pass valid --hyrise_server_path" -monetdb_scale_factor_string = str(args.scale_factor).replace(".", "_") +monetdb_scale_factor_string = str(args.scale_factor).replace(".", "_") if float(int(args.scale_factor)) != args.scale_factor else str(int(args.scale_factor)) duckdb_scale_factor_string = int(args.scale_factor) if args.scale_factor >= 1.0 else args.scale_factor assert (args.single_query_id is None or (args.single_query_id > 0 and args.single_query_id < 23)), "Unexpected query id" @@ -190,7 +190,7 @@ def get_aggregated_table_size(): rows_fetched += len(rows) print("{:,} rows.".format(rows_fetched), flush=True) - with open("db_comparison_results/size_{}__SF{}.csv".format(args.dbms, args.scale_factor), "w") as size_file: + with open("db_comparison_results/size_{}.csv".format(args.dbms), "w") as size_file: size_file.write("DATABASE_SYSTEM,SCALE_FACTOR,SIZE_IN_BYTES\n") cumulative_size = 0 if args.dbms == "monetdb": @@ -297,7 +297,7 @@ def loop(thread_id, queries, query_id, start_time, successful_runs, timeout, is_ time_left = start_time + timeout - time.time() if time_left < 0: break - print('\rBenchmarking {}... {:.0f} seconds left'.format(query_name, time_left), end="") + print('\rBenchmarking {}... 
{:.0f} seconds left'.format(query_name, time_left), end="", flush=True) time.sleep(min(10, time_left)) while True: diff --git a/python/encoding_configuration_selector.py b/python/encoding_configuration_selector.py index 83470de..854dc2c 100644 --- a/python/encoding_configuration_selector.py +++ b/python/encoding_configuration_selector.py @@ -250,7 +250,7 @@ def run_compression_selection_comparison(short_name, calibration_run, robustness dictionary_size = workload['all_dictionary_size'] dictionary_runtime = workload['all_dictionary_runtime'] - results_with_static_dictionary = results.append({"MODEL": "Static", "BUDGET": dictionary_size, "SIZE_IN_BYTES": dictionary_size, "CUMULATIVE_RUNTIME_MS": dictionary_runtime}, ignore_index=True) + results_with_static_dictionary = pd.concat([results, pd.DataFrame({"MODEL": ["Static"], "BUDGET": [dictionary_size], "SIZE_IN_BYTES": [dictionary_size], "CUMULATIVE_RUNTIME_MS": [dictionary_runtime]})], ignore_index=True) # create static dictonary configuration dict_configuration = np.zeros((workload['table_count'], workload['max_row_clusters'], workload['max_column_count']), dtype=np.int32) diff --git a/python/helpers/encoding_selection_helpers.py b/python/helpers/encoding_selection_helpers.py index 19f96bc..6b9df10 100644 --- a/python/helpers/encoding_selection_helpers.py +++ b/python/helpers/encoding_selection_helpers.py @@ -189,7 +189,7 @@ def eval_and_append(_title, _result, _metric, _sum_dict, _df_plotting, plot=True previous_results_path = Path(plot_file_name).parent / "model_evaluation.csv" if previous_results_path.exists(): df_plotting_previous = pd.read_csv(previous_results_path) - df_plotting = df_plotting_previous.append(df_plotting, ignore_index=True) + df_plotting = pd.concat([df_plotting_previous, df_plotting], ignore_index=True) df_plotting.to_csv(previous_results_path, index=False) diff --git a/python/requirements.txt b/python/requirements.txt index 74ae72d..7c8a9d3 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -3,7 +3,7 @@ joblib 
== 1.4.2 matplotlib == 3.9.0 multiprocess == 0.70.16 numpy == 1.26.4 -pandas == 1.5.3 # Not using 2.x as we widely use DataFrame.append(). Change to pd.concat would be easy though. +pandas == 2.2.2 pymonetdb == 1.8.1 psycopg2-binary == 2.9.9 PuLP == 2.8.0 diff --git a/python/selection/column_runtime_change_prediction_helper.py b/python/selection/column_runtime_change_prediction_helper.py index 3150312..c7c412a 100644 --- a/python/selection/column_runtime_change_prediction_helper.py +++ b/python/selection/column_runtime_change_prediction_helper.py @@ -105,7 +105,7 @@ def collect_unified_operator_runtimes_per_column(operator_name, operator_data_fo candidates_left.columns = ["TABLE_NAME", "COLUMN_NAME", "DATA_TYPE"] candidates_right = df[["RIGHT_TABLE_NAME", "RIGHT_COLUMN_NAME", "DATA_TYPE_RIGHT"]] candidates_right.columns = ["TABLE_NAME", "COLUMN_NAME", "DATA_TYPE"] - candidates = candidates_left.append(candidates_right) + candidates = pd.concat([candidates_left, candidates_right]) else: candidates = df[['TABLE_NAME', 'COLUMN_NAME', 'DATA_TYPE']] @@ -266,10 +266,8 @@ def predict_join_baselines(df, runtime_models_folder, model = 'heteroscedastic', f"(predicted for baseline: {baseline_execution_time:,.2f} ms, actual: {actual_execution_time:,.2f} ms).") encoding_selection_helpers.adapt_negative_predictions(predictable, "prediction") - predictable["prediction_error"] = predictable.prediction - predictable.execution_time_ms - - results = results.append(predictable[['QUERY_HASH', 'OPERATOR_HASH', 'JOIN_MODEL_TYPE', 'materialize_side', 'prediction']]) + results = pd.concat([results, predictable[['QUERY_HASH', 'OPERATOR_HASH', 'JOIN_MODEL_TYPE', 'materialize_side', 'prediction']]]) return results @@ -296,7 +294,7 @@ def collect_join_runtimes_per_column(operator_data_folder, runtime_models_folder candidates_left.columns = ["TABLE_NAME", "COLUMN_NAME", "DATA_TYPE"] candidates_right = df[["RIGHT_TABLE_NAME", "RIGHT_COLUMN_NAME", "DATA_TYPE_RIGHT"]] candidates_right.columns = 
["TABLE_NAME", "COLUMN_NAME", "DATA_TYPE"] - candidates = candidates_left.append(candidates_right) + candidates = pd.concat([candidates_left, candidates_right]) # Drop publicates, but keep na() as they mark materialized columns candidates = candidates.drop_duplicates() @@ -374,7 +372,7 @@ def collect_join_runtimes_per_column(operator_data_folder, runtime_models_folder 'VECTOR_COMPRESSION_TYPE', 'JOIN_MODEL_TYPE', 'materialize_side'], dropna=False).agg({'adapted_prediction': 'min'}).reset_index() - encoding_change_predictions = encoding_change_predictions.append(encoding_change_predictions_extract_grouped) + encoding_change_predictions = pd.concat([encoding_change_predictions, encoding_change_predictions_extract_grouped]) if len(encoding_change_predictions) > 0: encoding_change_predictions.VECTOR_COMPRESSION_TYPE = encoding_change_predictions.VECTOR_COMPRESSION_TYPE.replace("", np.nan) diff --git a/python/selection/workload_loading.py b/python/selection/workload_loading.py index a36986d..3a42c44 100644 --- a/python/selection/workload_loading.py +++ b/python/selection/workload_loading.py @@ -376,19 +376,19 @@ def parse_file_based_workload(workload_folder, runtime_models_folder, size_model __table_scans.groupby(['QUERY_HASH', 'OPERATOR_HASH'], dropna=False).prediction.max().sum() table_scans_grouped = __table_scans.groupby(['TABLE_NAME', 'COLUMN_NAME', 'ENCODING_TYPE', 'VECTOR_COMPRESSION_TYPE'], dropna=False).agg({'change': 'sum'}).reset_index() - results_grouped = results_grouped.append(table_scans_grouped, ignore_index=True) + results_grouped = pd.concat([results_grouped, table_scans_grouped], ignore_index=True) if load_aggregations: if 'TABLE_NAME' not in __aggregates.columns: add_columns_for_grouping(__aggregates) - results_appended = results_appended.append(__aggregates, sort=False, ignore_index=True) + results_appended = pd.concat([results_appended, __aggregates], ignore_index=True) assert __aggregates.groupby(['QUERY_HASH', 'OPERATOR_HASH'], dropna=False).prediction.min().sum() == \ 
__aggregates.groupby(['QUERY_HASH', 'OPERATOR_HASH'], dropna=False).prediction.max().sum() aggregates_grouped = __aggregates.groupby(['TABLE_NAME', 'COLUMN_NAME', 'ENCODING_TYPE', 'VECTOR_COMPRESSION_TYPE'], dropna=False).agg({'change': 'sum'}).reset_index() - results_grouped = results_grouped.append(aggregates_grouped, ignore_index=True) + results_grouped = pd.concat([results_grouped, aggregates_grouped], ignore_index=True) if load_joins: - results_appended = results_appended.append(__joins, sort=False, ignore_index=True) + results_appended = pd.concat([results_appended, __joins], ignore_index=True) # check that we have only a single prediction value per operator assert __joins.groupby(['QUERY_HASH', 'OPERATOR_HASH', 'JOIN_MODEL_TYPE', 'materialize_side']).prediction.min().sum() == \ @@ -397,19 +397,19 @@ def parse_file_based_workload(workload_folder, runtime_models_folder, size_model # we could also group by join_model_type here, but that shouldn't matter here any longer as we are not # interested in the differences between stages joins_grouped = __joins.groupby(['TABLE_NAME', 'COLUMN_NAME', 'ENCODING_TYPE', 'VECTOR_COMPRESSION_TYPE']).agg({'change': 'sum'}).reset_index() - results_grouped = results_grouped.append(joins_grouped, ignore_index=True) + results_grouped = pd.concat([results_grouped, joins_grouped], ignore_index=True) if load_projections and __projections is not None: if 'TABLE_NAME' not in __projections.columns: add_columns_for_grouping(__projections) - results_appended = results_appended.append(__projections, sort=False, ignore_index=True) + results_appended = pd.concat([results_appended, __projections], ignore_index=True) # check that we have only a single prediction value per operator assert __projections.groupby(['QUERY_HASH', 'OPERATOR_HASH']).prediction.min().sum() == \ __projections.groupby(['QUERY_HASH', 'OPERATOR_HASH']).prediction.max().sum() projections_grouped = __projections.groupby(['TABLE_NAME', 'COLUMN_NAME', 'ENCODING_TYPE', 'VECTOR_COMPRESSION_TYPE'], dropna=False).agg({'change': 'sum'}).reset_index() - 
results_grouped = results_grouped.append(projections_grouped, ignore_index=True) + results_grouped = pd.concat([results_grouped, projections_grouped], ignore_index=True) assert len(__projections.query('ENCODING_TYPE == "Dictionary" and VECTOR_COMPRESSION_TYPE == "FixedSize2ByteAligned" and abs(change) > 0.0')) == 0