Add result of performance evaluation of loading datasets from GCS to BigQuery (#464)

* Add result of Performance evaluation of loading datasets from GCS with Astro Python SDK 0.9.2 into BigQuery

* Add benchmark details with respect to resources used

* Add benchmark result on n2-standard-4 GCP

* Add benchmark result for baseline using GCSToBigQueryOperator and `bq load` command
sunank200 committed Jun 20, 2022
1 parent 5ed1187 commit 0a04a4d
Showing 3 changed files with 208 additions and 31 deletions.
116 changes: 116 additions & 0 deletions tests/benchmark/dags/benchmark_gcs_to_big_query.py
@@ -0,0 +1,116 @@
"""
This DAG is to benchmark GCSToBigQueryOperator for various dataset
"""
import os
from datetime import datetime, timedelta

from airflow import models
from airflow.operators import bash_operator
from airflow.providers.google.cloud.operators.bigquery import (
    BigQueryDeleteDatasetOperator,
)
from airflow.providers.google.cloud.transfers.gcs_to_bigquery import (
    GCSToBigQueryOperator,
)

DATASET_NAME = os.environ.get("GCP_DATASET_NAME", "gcs_to_bq_benchmarking_dataset")
TABLE_NAME = os.environ.get("GCP_TABLE_NAME", "gcs_to_bq_table")
GCP_CONN_ID = os.getenv("GCP_CONN_ID", "google_cloud_default")
EXECUTION_TIMEOUT_STR = os.getenv("EXECUTION_TIMEOUT_STR", default="4")
RETRIES_STR = os.getenv("DEFAULT_TASK_RETRIES", default="2")
DEFAULT_RETRY_DELAY_SECONDS_STR = os.getenv("DEFAULT_RETRY_DELAY_SECONDS", default="60")
EXECUTION_TIMEOUT = int(EXECUTION_TIMEOUT_STR)

default_args = {
    "execution_timeout": timedelta(hours=EXECUTION_TIMEOUT),
    "retries": int(RETRIES_STR),
    "retry_delay": timedelta(seconds=int(DEFAULT_RETRY_DELAY_SECONDS_STR)),
}

dag = models.DAG(
    dag_id="benchmark_gcs_to_bigquery_operator",
    schedule_interval=None,
    start_date=datetime(2022, 1, 1),
    catchup=False,
    default_args=default_args,
    tags=["benchmark", "dag_authoring"],
)
create_test_dataset = bash_operator.BashOperator(
    task_id="create_test_dataset",
    bash_command="bq mk --force=true %s" % DATASET_NAME,
    dag=dag,
)

load_ten_kb = GCSToBigQueryOperator(
    task_id="load_ten_kb",
    bucket="astro-sdk",
    source_objects=["benchmark/trimmed/covid_overview/covid_overview_10kb.parquet"],
    destination_project_dataset_table=f"{DATASET_NAME}.{TABLE_NAME}",
    schema_fields=None,
    source_format="PARQUET",
    write_disposition="WRITE_TRUNCATE",
    dag=dag,
)
load_hundred_kb = GCSToBigQueryOperator(
    task_id="load_hundred_kb",
    bucket="astro-sdk",
    source_objects=["benchmark/trimmed/tate_britain/artist_data_100kb.csv"],
    destination_project_dataset_table=f"{DATASET_NAME}.{TABLE_NAME}",
    schema_fields=None,
    source_format="CSV",
    write_disposition="WRITE_TRUNCATE",
    dag=dag,
)
load_ten_mb = GCSToBigQueryOperator(
    task_id="load_ten_mb",
    bucket="astro-sdk",
    source_objects=["benchmark/trimmed/imdb/title_ratings_10mb.csv"],
    destination_project_dataset_table=f"{DATASET_NAME}.{TABLE_NAME}",
    schema_fields=None,
    source_format="CSV",
    write_disposition="WRITE_TRUNCATE",
    dag=dag,
)

load_one_gb = GCSToBigQueryOperator(
    task_id="load_one_gb",
    bucket="astro-sdk",
    source_objects=["benchmark/trimmed/stackoverflow/stackoverflow_posts_1g.ndjson"],
    destination_project_dataset_table=f"{DATASET_NAME}.{TABLE_NAME}",
    schema_fields=None,
    source_format="NEWLINE_DELIMITED_JSON",
    write_disposition="WRITE_TRUNCATE",
    dag=dag,
)

load_five_gb = GCSToBigQueryOperator(
    task_id="load_five_gb",
    bucket="astro-sdk",
    # Build the 20 zero-padded PyPI shard object names (suffixes 00 through 19).
    source_objects=[
        f"benchmark/trimmed/pypi/pypi-downloads-2021-03-28-0000000000{i:02d}.ndjson"
        for i in range(20)
    ],
    destination_project_dataset_table=f"{DATASET_NAME}.{TABLE_NAME}",
    schema_fields=None,
    source_format="NEWLINE_DELIMITED_JSON",
    write_disposition="WRITE_TRUNCATE",
    dag=dag,
)

delete_test_dataset = BigQueryDeleteDatasetOperator(
    task_id="delete_airflow_test_dataset",
    dataset_id=DATASET_NAME,
    delete_contents=True,
    dag=dag,
)
63 changes: 32 additions & 31 deletions tests/benchmark/download_datasets.sh
@@ -1,37 +1,38 @@
#!/usr/bin/env bash

#set -x
#set -v
set -e

tate_artist_path=/tmp/artist_data.csv
imdb_title_ratings_path=/tmp/title_ratings.csv
github_timeline_path=/tmp/github_timeline.csv
gcs_github_timeline_dir=gs://$GCS_BUCKET/github_timeline
covid_overview_path=/tmp/covid_overview.csv

echo $'\nDownloading the Tate Gallery artist dataset to' $tate_artist_path...
curl https://raw.githubusercontent.com/tategallery/collection/master/artist_data.csv --output $tate_artist_path

echo $'\nDownloading and extracting the IMDB title.ratings dataset to' $imdb_title_ratings_path...
curl https://datasets.imdbws.com/title.ratings.tsv.gz --output /tmp/title_ratings.tsv.gz
gzip -d /tmp/title_ratings.tsv.gz -f
tr '\t' ',' < /tmp/title_ratings.tsv > $imdb_title_ratings_path
rm /tmp/title_ratings.tsv


echo $'\nDownloading the UK COVID overview dataset to' $covid_overview_path...
curl 'https://coronavirus.data.gov.uk/api/v2/data?areaType=overview&metric=covidOccupiedMVBeds&metric=cumCasesByPublishDate&metric=newOnsDeathsByRegistrationDate&metric=hospitalCases&format=csv' --output /tmp/covid_overview.csv

# The following dataset assumes the user has:
# 1. a Google Cloud Platform account
# 2. the GCP SDK

echo $'\nDownloading the Github timeline dataset to' $github_timeline_path...
if [ ! -n "$(gsutil ls $gcs_github_timeline_dir)" ]; then
bq extract \
--destination_format CSV \
bigquery-public-data:samples.github_timeline \
$gcs_github_timeline_dir/github_timeline_*.csv
fi
gsutil cp $gcs_github_timeline_dir/github_timeline_000000000007.csv /tmp/github_timeline.csv
ten_kb=/tmp/covid_overview_10kb.parquet
gcs_ten_kb=gs://astro-sdk/benchmark/trimmed/covid_overview/covid_overview_10kb.parquet
echo $'\nDownloading the 10 kb covid_overview dataset to' $ten_kb...
gsutil cp $gcs_ten_kb $ten_kb

hundred_kb=/tmp/artist_data_100kb.csv
gcs_hundred_kb=gs://astro-sdk/benchmark/trimmed/tate_britain/artist_data_100kb.csv
echo $'\nDownloading the 100 kb artist_data dataset to' $hundred_kb...
gsutil cp $gcs_hundred_kb $hundred_kb

ten_mb=/tmp/title_ratings_10mb.csv
gcs_ten_mb=gs://astro-sdk/benchmark/trimmed/imdb/title_ratings_10mb.csv
echo $'\nDownloading the 10 mb imdb dataset to' $ten_mb...
gsutil cp $gcs_ten_mb $ten_mb

one_gb=/tmp/stackoverflow_posts_1g.ndjson
gcs_one_gb=gs://astro-sdk/benchmark/trimmed/stackoverflow/stackoverflow_posts_1g.ndjson
echo $'\nDownloading the 1 Gb stackoverflow dataset to' $one_gb...
gsutil cp $gcs_one_gb $one_gb

five_gb=/tmp/pypi/
gcs_five_gb=gs://astro-sdk/benchmark/trimmed/pypi/
mkdir -p $five_gb
echo $'\nDownloading the 5 Gb pypi dataset to' $five_gb...
gsutil -m cp -r $gcs_five_gb $five_gb

ten_gb=/tmp/github-archive/
gcs_ten_gb=gs://astro-sdk/benchmark/trimmed/github/github-archive/
mkdir -p $ten_gb
echo $'\nDownloading the 10 Gb github archive dataset to' $ten_gb...
gsutil -m cp -r $gcs_ten_gb $ten_gb
60 changes: 60 additions & 0 deletions tests/benchmark/results.md
@@ -0,0 +1,60 @@
# Benchmark Results

## Dataset
Details about the datasets used can be found in [datasets.md](datasets.md).

## Performance evaluation of loading datasets from GCS with Astro Python SDK 0.9.2 into BigQuery
The configuration used for this benchmark can be found in [config.json](config.json).

### Database: bigquery
The benchmark ran with a chunk size of 1,000,000 and the following VM details:

Machine type: e2-medium
- VM image: Debian GNU/Linux 11 (bullseye)
- CPU: 2 vCPUs
- Memory: 4 GB

| database | dataset | total_time | memory_rss | cpu_time_user | cpu_time_system | memory_pss | memory_shared |
|:-----------|:-----------|:-------------|:-------------|:----------------|:------------------|:-------------|:----------------|
| bigquery | five_gb | 13.06min | 50.92MB | 1.43min | 9.06s | 61.54MB | 12.24MB |
| bigquery | hundred_kb | 9.88s | 21.89MB | 540.0ms | 50.0ms | 16.96MB | 12.31MB |
| bigquery | one_gb | 2.34min | 27.98MB | 16.99s | 1.82s | 28.93MB | 10.83MB |
| bigquery | ten_gb | 25.83min | 37.03MB | 2.7min | 17.68s | 75.59MB | 11.09MB |
| bigquery | ten_kb | 7.58s | 37.27MB | 570.0ms | 60.0ms | 29.67MB | 15.59MB |
| bigquery | ten_mb | 11.8s | 34.79MB | 1.22s | 280.0ms | 35.92MB | 11.27MB |

Machine type: n2-standard-4
- VM image: Debian GNU/Linux 11 (bullseye)
- CPU: 4 vCPUs
- Memory: 16 GB

| database | dataset | total_time | memory_rss | cpu_time_user | cpu_time_system | memory_pss | memory_shared |
|:-----------|:-----------|:-------------|:-------------|:----------------|:------------------|:-------------|:----------------|
| bigquery | five_gb | 14.17min | 52.93MB | 1.41min | 6.94s | 64.24MB | 11.52MB |
| bigquery | hundred_kb | 8.68s | 20.54MB | 3.63s | 250.0ms | 13.8MB | 10.03MB |
| bigquery | one_gb | 2.43min | 26.75MB | 15.04s | 1.5s | 27.28MB | 11.55MB |
| bigquery | ten_gb | 29.22min | 43.85MB | 2.68min | 13.29s | 82.42MB | 11.23MB |
| bigquery | ten_kb | 9.57s | 30.13MB | 3.69s | 220.0ms | 24.97MB | 15.76MB |
| bigquery | ten_mb | 34.96s | 34.5MB | 3.9s | 410.0ms | 35.58MB | 11.55MB |
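
Either machine spec above can be reproduced on GCP with `gcloud`; a minimal sketch for the n2-standard-4 case is shown below (the instance name and zone are placeholders, not taken from the benchmark setup):

```bash
# Hypothetical provisioning command for a Debian 11 n2-standard-4 runner.
# Instance name and zone are placeholders; adjust to your project defaults.
gcloud compute instances create benchmark-runner \
    --machine-type=n2-standard-4 \
    --image-family=debian-11 \
    --image-project=debian-cloud \
    --zone=us-central1-a
```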


#### Baseline using `bq load`

| Dataset                                     | Size   | Duration (h:mm:ss) |
|---------------------------------------------|--------|--------------------|
| covid_overview/covid_overview_10kb.csv      | 10 KB  | 0:00:02            |
| tate_britain/artist_data_100kb.csv          | 100 KB | 0:00:02            |
| imdb/title_ratings_10mb.csv                 | 10 MB  | 0:00:05            |
| stackoverflow/stackoverflow_posts_1g.ndjson | 1 GB   | 0:00:50            |
| trimmed/pypi/*                              | 5 GB   | 0:00:41            |
| github/github-archive/*                     | 10 GB  | 0:01:09            |
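
The exact `bq load` invocations are not part of this commit; a representative sketch of how one of these timings could be reproduced, reusing the dataset, table, and bucket paths from the DAG above (`--autodetect` and `--replace` are assumptions, the baseline may have supplied an explicit schema instead), is:

```bash
# Create the target dataset, mirroring the DAG's create_test_dataset task.
bq mk --force=true gcs_to_bq_benchmarking_dataset

# Time a single load of the 10 MB IMDB file; flags here are assumptions,
# not confirmed by this commit.
time bq load \
    --source_format=CSV \
    --autodetect \
    --replace \
    gcs_to_bq_benchmarking_dataset.gcs_to_bq_table \
    gs://astro-sdk/benchmark/trimmed/imdb/title_ratings_10mb.csv
```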


#### Baseline using `GCSToBigQueryOperator` with [benchmark_gcs_to_big_query.py](tests/benchmark/dags/benchmark_gcs_to_big_query.py)

| Dataset                                     | Size   | Duration (seconds) |
|---------------------------------------------|--------|--------------------|
| covid_overview/covid_overview_10kb.csv      | 10 KB  | 5.129522           |
| tate_britain/artist_data_100kb.csv          | 100 KB | 3.319834           |
| imdb/title_ratings_10mb.csv                 | 10 MB  | 5.558414           |
| stackoverflow/stackoverflow_posts_1g.ndjson | 1 GB   | 85.409014          |
| trimmed/pypi/*                              | 5 GB   | 48.973093          |
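
These durations correspond to individual task runs of the DAG above; one way to reproduce a single measurement locally, assuming an Airflow environment with the Google provider installed and a `google_cloud_default` connection configured, is the Airflow CLI (start and end timestamps appear in the task log):

```bash
# Run one benchmark task outside the scheduler; dag_id and task_id come from
# benchmark_gcs_to_big_query.py.
airflow tasks test benchmark_gcs_to_bigquery_operator load_ten_mb 2022-01-01

# Or execute the whole DAG once as a local debug run.
airflow dags test benchmark_gcs_to_bigquery_operator 2022-01-01
```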
