Add result of performance evaluation of loading datasets from GCS to BigQuery (#464)

* Add result of Performance evaluation of loading datasets from GCS with Astro Python SDK 0.9.2 into BigQuery

* Add benchmark details with respect to resources used

* Add benchmark result on n2-standard-4 GCP

* Add benchmark result for baseline using GCSToBigQueryOperator and `bq load` command
sunank200 committed Jun 20, 2022
1 parent 5ed1187 commit 0a04a4d
Showing 3 changed files with 208 additions and 31 deletions.
116 changes: 116 additions & 0 deletions tests/benchmark/dags/benchmark_gcs_to_big_query.py
@@ -0,0 +1,116 @@
"""
This DAG is to benchmark GCSToBigQueryOperator for various dataset
"""
import os
from datetime import datetime, timedelta

from airflow import models
from airflow.operators import bash_operator
from airflow.providers.google.cloud.operators.bigquery import (
    BigQueryDeleteDatasetOperator,
)
from airflow.providers.google.cloud.transfers.gcs_to_bigquery import (
    GCSToBigQueryOperator,
)

DATASET_NAME = os.environ.get("GCP_DATASET_NAME", "gcs_to_bq_benchmarking_dataset")
TABLE_NAME = os.environ.get("GCP_TABLE_NAME", "gcs_to_bq_table")
GCP_CONN_ID = os.getenv("GCP_CONN_ID", "google_cloud_default")
EXECUTION_TIMEOUT_STR = os.getenv("EXECUTION_TIMEOUT_STR", default="4")
RETRIES_STR = os.getenv("DEFAULT_TASK_RETRIES", default="2")
DEFAULT_RETRY_DELAY_SECONDS_STR = os.getenv("DEFAULT_RETRY_DELAY_SECONDS", default="60")
EXECUTION_TIMEOUT = int(EXECUTION_TIMEOUT_STR)

default_args = {
    "execution_timeout": timedelta(hours=EXECUTION_TIMEOUT),
    "retries": int(RETRIES_STR),
    "retry_delay": timedelta(seconds=int(DEFAULT_RETRY_DELAY_SECONDS_STR)),
}

dag = models.DAG(
    dag_id="benchmark_gcs_to_bigquery_operator",
    schedule_interval=None,
    start_date=datetime(2022, 1, 1),
    catchup=False,
    default_args=default_args,
    tags=["benchmark", "dag_authoring"],
)
create_test_dataset = bash_operator.BashOperator(
    task_id="create_test_dataset",
    bash_command="bq mk --force=true %s" % DATASET_NAME,
    dag=dag,
)

load_ten_kb = GCSToBigQueryOperator(
    task_id="load_ten_kb",
    bucket="astro-sdk",
    source_objects=["benchmark/trimmed/covid_overview/covid_overview_10kb.parquet"],
    destination_project_dataset_table=f"{DATASET_NAME}.{TABLE_NAME}",
    schema_fields=None,
    source_format="PARQUET",
    write_disposition="WRITE_TRUNCATE",
    dag=dag,
)
load_hundred_kb = GCSToBigQueryOperator(
    task_id="load_hundred_kb",
    bucket="astro-sdk",
    source_objects=["benchmark/trimmed/tate_britain/artist_data_100kb.csv"],
    destination_project_dataset_table=f"{DATASET_NAME}.{TABLE_NAME}",
    schema_fields=None,
    source_format="CSV",
    write_disposition="WRITE_TRUNCATE",
    dag=dag,
)
load_ten_mb = GCSToBigQueryOperator(
    task_id="load_ten_mb",
    bucket="astro-sdk",
    source_objects=["benchmark/trimmed/imdb/title_ratings_10mb.csv"],
    destination_project_dataset_table=f"{DATASET_NAME}.{TABLE_NAME}",
    schema_fields=None,
    source_format="CSV",
    write_disposition="WRITE_TRUNCATE",
    dag=dag,
)

load_one_gb = GCSToBigQueryOperator(
    task_id="load_one_gb",
    bucket="astro-sdk",
    source_objects=["benchmark/trimmed/stackoverflow/stackoverflow_posts_1g.ndjson"],
    destination_project_dataset_table=f"{DATASET_NAME}.{TABLE_NAME}",
    schema_fields=None,
    source_format="NEWLINE_DELIMITED_JSON",
    write_disposition="WRITE_TRUNCATE",
    dag=dag,
)

load_five_gb = GCSToBigQueryOperator(
    task_id="load_five_gb",
    bucket="astro-sdk",
    # Build the 20 zero-padded PyPI shard object names (suffixes 00 through 19).
    source_objects=[
        f"benchmark/trimmed/pypi/pypi-downloads-2021-03-28-0000000000{i:02d}.ndjson"
        for i in range(20)
    ],
    destination_project_dataset_table=f"{DATASET_NAME}.{TABLE_NAME}",
    schema_fields=None,
    source_format="NEWLINE_DELIMITED_JSON",
    write_disposition="WRITE_TRUNCATE",
    dag=dag,
)

delete_test_dataset = BigQueryDeleteDatasetOperator(
    task_id="delete_airflow_test_dataset",
    dataset_id=DATASET_NAME,
    delete_contents=True,
    dag=dag,
)
63 changes: 32 additions & 31 deletions tests/benchmark/download_datasets.sh
@@ -1,37 +1,38 @@
#!/usr/bin/env bash

#set -x
#set -v
set -e

tate_artist_path=/tmp/artist_data.csv
imdb_title_ratings_path=/tmp/title_ratings.csv
github_timeline_path=/tmp/github_timeline.csv
gcs_github_timeline_dir=gs://$GCS_BUCKET/github_timeline
covid_overview_path=/tmp/covid_overview.csv

echo $'\nDownloading the Tate Gallery artist dataset to' $tate_artist_path...
curl https://raw.githubusercontent.com/tategallery/collection/master/artist_data.csv --output $tate_artist_path

echo $'\nDownloading and extracting the IMDB title.ratings dataset to' $imdb_title_ratings_path...
curl https://datasets.imdbws.com/title.ratings.tsv.gz --output /tmp/title_ratings.tsv.gz
gzip -d /tmp/title_ratings.tsv.gz -f
tr '\t' ',' < /tmp/title_ratings.tsv > $imdb_title_ratings_path
rm /tmp/title_ratings.tsv


echo $'\nDownloading the UK COVID overview dataset to' $covid_overview_path...
curl 'https://coronavirus.data.gov.uk/api/v2/data?areaType=overview&metric=covidOccupiedMVBeds&metric=cumCasesByPublishDate&metric=newOnsDeathsByRegistrationDate&metric=hospitalCases&format=csv' --output /tmp/covid_overview.csv

# The following dataset assumes the user has:
# 1. a Google Cloud Platform account
# 2. the GCP SDK

echo $'\nDownloading the Github timeline dataset to' $github_timeline_path...
if [ ! -n "$(gsutil ls $gcs_github_timeline_dir)" ]; then
bq extract \
--destination_format CSV \
bigquery-public-data:samples.github_timeline \
$gcs_github_timeline_dir/github_timeline_*.csv
fi
gsutil cp $gcs_github_timeline_dir/github_timeline_000000000007.csv /tmp/github_timeline.csv
ten_kb=/tmp/covid_overview_10kb.parquet
gcs_ten_kb=gs://astro-sdk/benchmark/trimmed/covid_overview/covid_overview_10kb.parquet
echo $'\nDownloading the 10 kb covid_overview dataset to' $ten_kb...
gsutil cp $gcs_ten_kb $ten_kb

hundred_kb=/tmp/artist_data_100kb.csv
gcs_hundred_kb=gs://astro-sdk/benchmark/trimmed/tate_britain/artist_data_100kb.csv
echo $'\nDownloading the 100 kb artist_data dataset to' $hundred_kb...
gsutil cp $gcs_hundred_kb $hundred_kb

ten_mb=/tmp/title_ratings_10mb.csv
gcs_ten_mb=gs://astro-sdk/benchmark/trimmed/imdb/title_ratings_10mb.csv
echo $'\nDownloading the 10 mb imdb dataset to' $ten_mb...
gsutil cp $gcs_ten_mb $ten_mb

one_gb=/tmp/stackoverflow_posts_1g.ndjson
gcs_one_gb=gs://astro-sdk/benchmark/trimmed/stackoverflow/stackoverflow_posts_1g.ndjson
echo $'\nDownloading the 1 Gb stackoverflow dataset to' $one_gb...
gsutil cp $gcs_one_gb $one_gb

five_gb=/tmp/pypi/
gcs_five_gb=gs://astro-sdk/benchmark/trimmed/pypi/
mkdir -p $five_gb
echo $'\nDownloading the 5 Gb pypi dataset to' $five_gb...
gsutil -m cp -r $gcs_five_gb $five_gb

ten_gb=/tmp/github-archive/
gcs_ten_gb=gs://astro-sdk/benchmark/trimmed/github/github-archive/
mkdir -p $ten_gb
echo $'\nDownloading the 10 Gb github archive dataset to' $ten_gb...
gsutil -m cp -r $gcs_ten_gb $ten_gb
60 changes: 60 additions & 0 deletions tests/benchmark/results.md
@@ -0,0 +1,60 @@
# Benchmark Results

## Dataset
Details about the datasets used can be found in [datasets.md](datasets.md).

## Performance evaluation of loading datasets from GCS with Astro Python SDK 0.9.2 into BigQuery
The configuration used for this benchmark can be found in [config.json](config.json).

### Database: bigquery
The benchmark ran with a chunk size of 1,000,000 and the following VM details:

Machine type: e2-medium
- VM image: Debian GNU/Linux 11 (bullseye)
- CPU: 2 vCPUs
- Memory: 4 GB

| database | dataset | total_time | memory_rss | cpu_time_user | cpu_time_system | memory_pss | memory_shared |
|:-----------|:-----------|:-------------|:-------------|:----------------|:------------------|:-------------|:----------------|
| bigquery | five_gb | 13.06min | 50.92MB | 1.43min | 9.06s | 61.54MB | 12.24MB |
| bigquery | hundred_kb | 9.88s | 21.89MB | 540.0ms | 50.0ms | 16.96MB | 12.31MB |
| bigquery | one_gb | 2.34min | 27.98MB | 16.99s | 1.82s | 28.93MB | 10.83MB |
| bigquery | ten_gb | 25.83min | 37.03MB | 2.7min | 17.68s | 75.59MB | 11.09MB |
| bigquery | ten_kb | 7.58s | 37.27MB | 570.0ms | 60.0ms | 29.67MB | 15.59MB |
| bigquery | ten_mb | 11.8s | 34.79MB | 1.22s | 280.0ms | 35.92MB | 11.27MB |

Machine type: n2-standard-4
- VM image: Debian GNU/Linux 11 (bullseye)
- CPU: 4 vCPUs
- Memory: 16 GB

| database | dataset | total_time | memory_rss | cpu_time_user | cpu_time_system | memory_pss | memory_shared |
|:-----------|:-----------|:-------------|:-------------|:----------------|:------------------|:-------------|:----------------|
| bigquery | five_gb | 14.17min | 52.93MB | 1.41min | 6.94s | 64.24MB | 11.52MB |
| bigquery | hundred_kb | 8.68s | 20.54MB | 3.63s | 250.0ms | 13.8MB | 10.03MB |
| bigquery | one_gb | 2.43min | 26.75MB | 15.04s | 1.5s | 27.28MB | 11.55MB |
| bigquery | ten_gb | 29.22min | 43.85MB | 2.68min | 13.29s | 82.42MB | 11.23MB |
| bigquery | ten_kb | 9.57s | 30.13MB | 3.69s | 220.0ms | 24.97MB | 15.76MB |
| bigquery | ten_mb | 34.96s | 34.5MB | 3.9s | 410.0ms | 35.58MB | 11.55MB |
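
Either machine spec above can be reproduced on GCP with `gcloud`; a minimal sketch for the n2-standard-4 case is shown below (the instance name and zone are placeholders, not taken from the benchmark setup):

```bash
# Hypothetical provisioning command for a Debian 11 n2-standard-4 runner.
# Instance name and zone are placeholders; adjust to your project defaults.
gcloud compute instances create benchmark-runner \
    --machine-type=n2-standard-4 \
    --image-family=debian-11 \
    --image-project=debian-cloud \
    --zone=us-central1-a
```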


#### Baseline using `bq load`

| Dataset                                     | Size   | Duration (h:mm:ss) |
|---------------------------------------------|--------|--------------------|
| covid_overview/covid_overview_10kb.csv      | 10 KB  | 0:00:02            |
| tate_britain/artist_data_100kb.csv          | 100 KB | 0:00:02            |
| imdb/title_ratings_10mb.csv                 | 10 MB  | 0:00:05            |
| stackoverflow/stackoverflow_posts_1g.ndjson | 1 GB   | 0:00:50            |
| trimmed/pypi/*                              | 5 GB   | 0:00:41            |
| github/github-archive/*                     | 10 GB  | 0:01:09            |
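
The exact `bq load` invocations are not part of this commit; a representative sketch of how one of these timings could be reproduced, reusing the dataset, table, and bucket paths from the DAG above (`--autodetect` and `--replace` are assumptions, the baseline may have supplied an explicit schema instead), is:

```bash
# Create the target dataset, mirroring the DAG's create_test_dataset task.
bq mk --force=true gcs_to_bq_benchmarking_dataset

# Time a single load of the 10 MB IMDB file; flags here are assumptions,
# not confirmed by this commit.
time bq load \
    --source_format=CSV \
    --autodetect \
    --replace \
    gcs_to_bq_benchmarking_dataset.gcs_to_bq_table \
    gs://astro-sdk/benchmark/trimmed/imdb/title_ratings_10mb.csv
```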


#### Baseline using `GCSToBigQueryOperator` with [benchmark_gcs_to_big_query.py](tests/benchmark/dags/benchmark_gcs_to_big_query.py)

| Dataset                                     | Size   | Duration (seconds) |
|---------------------------------------------|--------|--------------------|
| covid_overview/covid_overview_10kb.csv      | 10 KB  | 5.129522           |
| tate_britain/artist_data_100kb.csv          | 100 KB | 3.319834           |
| imdb/title_ratings_10mb.csv                 | 10 MB  | 5.558414           |
| stackoverflow/stackoverflow_posts_1g.ndjson | 1 GB   | 85.409014          |
| trimmed/pypi/*                              | 5 GB   | 48.973093          |
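
These durations correspond to individual task runs of the DAG above; one way to reproduce a single measurement locally, assuming an Airflow environment with the Google provider installed and a `google_cloud_default` connection configured, is the Airflow CLI (start and end timestamps appear in the task log):

```bash
# Run one benchmark task outside the scheduler; dag_id and task_id come from
# benchmark_gcs_to_big_query.py.
airflow tasks test benchmark_gcs_to_bigquery_operator load_ten_mb 2022-01-01

# Or execute the whole DAG once as a local debug run.
airflow dags test benchmark_gcs_to_bigquery_operator 2022-01-01
```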
