diff --git a/.github/workflows/build-images.yaml b/.github/workflows/build-images.yaml
index 2aeb1944..a81e4441 100644
--- a/.github/workflows/build-images.yaml
+++ b/.github/workflows/build-images.yaml
@@ -229,6 +229,33 @@ jobs:
       - id: set_is_parent_modified
         run: echo "is_parent_modified=${MODIFIED}" >> $GITHUB_OUTPUT
 
+  postgresql:
+    runs-on: ubuntu-latest
+    needs: base-os
+    env:
+      DH_REPO: "cloudsuite/${{ github.job }}"
+    outputs:
+      is_parent_modified: ${{ steps.set_is_parent_modified.outputs.is_parent_modified }}
+    strategy:
+      matrix:
+        tag: ["15"]
+        platform: ["linux/amd64,linux/arm64"]
+    steps:
+      - name: checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+      - if: ${{ needs.base-os.outputs.is_parent_modified == 'true' }}
+        run: echo "IS_PARENT_MODIFIED=true" >> $GITHUB_ENV
+      - name: build and push
+        run: "./.github/scripts/build-images.sh"
+        env:
+          IMG_TAG: "${{ matrix.tag }}"
+          DF_PATH: "./commons/${{ github.job }}/${{ matrix.tag }}"
+          DBX_PLATFORM: ${{ matrix.platform }}
+      - id: set_is_parent_modified
+        run: echo "is_parent_modified=${MODIFIED}" >> $GITHUB_OUTPUT
+
   data-analytics:
     runs-on: ubuntu-latest
     needs: hadoop
@@ -301,6 +328,29 @@ jobs:
         DF_PATH: "./benchmarks/${{ github.job }}/${{ matrix.tag }}"
         DBX_PLATFORM: ${{ matrix.platform }}
 
+  data-serving-relational:
+    runs-on: ubuntu-latest
+    needs: postgresql
+    env:
+      DH_REPO: "cloudsuite/${{ github.job }}"
+    strategy:
+      matrix:
+        tag: ["server", "client"]
+        platform: ["linux/amd64,linux/arm64"]
+    steps:
+      - name: checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+      - if: ${{ needs.postgresql.outputs.is_parent_modified == 'true' }}
+        run: echo "IS_PARENT_MODIFIED=true" >> $GITHUB_ENV
+      - name: build and push
+        run: "./.github/scripts/build-images.sh"
+        env:
+          IMG_TAG: "${{ matrix.tag }}"
+          DF_PATH: "./benchmarks/${{ github.job }}/${{ matrix.tag }}"
+          DBX_PLATFORM: ${{ matrix.platform }}
+
   graph-analytics:
     runs-on: ubuntu-latest
     needs: spark
diff --git a/.wordlist.txt b/.wordlist.txt
index 067b70e4..1cf1dae0 100644
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -92,6 +92,7 @@ mysqlserverdocker
 nginx
 NoSQL
 Nutch
+OLTP
 OPERATIONCOUNT
 os
 PageRank
@@ -118,10 +119,14 @@ solr
 solr's
 SQL
 src
+stddev
 sudo
 sys
+sysbench
 taskset
 TCP
+TPC
+tpcc
 threadcount
 txt
 UI
diff --git a/benchmarks/data-serving-relational/client/Dockerfile b/benchmarks/data-serving-relational/client/Dockerfile
new file mode 100644
index 00000000..5822d619
--- /dev/null
+++ b/benchmarks/data-serving-relational/client/Dockerfile
@@ -0,0 +1,23 @@
+FROM cloudsuite/base-os:ubuntu
+
+ENV DEBIAN_FRONTEND noninteractive
+
+# 1. Install the necessary software (sysbench)
+RUN apt update && apt install git sysbench python3 -y
+
+# 2. Clone sysbench-tpcc's repo and install its scripts
+RUN git clone https://github.com/Percona-Lab/sysbench-tpcc && cp sysbench-tpcc/*.lua /usr/share/sysbench/
+
+# 3. Copy the entrypoint and the template load files
+COPY ./docker-entrypoint.py /root
+COPY ./template/tpcc.py /root/template/tpcc.py
+COPY ./template/oltp-rw.py /root/template/oltp-rw.py
+COPY ./template/database.conf /root/template/database.conf
+ENV DATABASE_CONF_FILE /root/template/database.conf
+
+RUN chmod +x /root/docker-entrypoint.py
+RUN chmod +x /root/template/tpcc.py
+RUN chmod +x /root/template/oltp-rw.py
+
+ENTRYPOINT ["/root/docker-entrypoint.py"]
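Editor's note: as a sanity check on the client image, one could verify that sysbench runs and that the sysbench-tpcc scripts landed in `/usr/share/sysbench/`, which is where sysbench resolves script names from. A hypothetical snippet (not part of the image), runnable inside the container:

```python
#!/usr/bin/env python3
# Hypothetical smoke test for the client image: check that sysbench is
# installed and that the sysbench-tpcc lua scripts were copied next to the
# bundled workloads. Illustration only; not shipped with the image.
import os.path
import subprocess

# `sysbench --version` exits non-zero (raising CalledProcessError via
# check=True) if the binary is missing or broken.
out = subprocess.run(["sysbench", "--version"],
                     capture_output=True, text=True, check=True)
print(out.stdout.strip())

# The Dockerfile copies sysbench-tpcc's *.lua into /usr/share/sysbench/.
assert os.path.isfile("/usr/share/sysbench/tpcc.lua"), "tpcc.lua not installed"
print("client image looks sane")
```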
diff --git a/benchmarks/data-serving-relational/client/docker-entrypoint.py b/benchmarks/data-serving-relational/client/docker-entrypoint.py
new file mode 100755
index 00000000..360a919e
--- /dev/null
+++ b/benchmarks/data-serving-relational/client/docker-entrypoint.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+
+import os
+import os.path as path
+import shutil
+import sys
+import subprocess
+import argparse
+
+args = sys.argv[1:]
+parser = argparse.ArgumentParser()
+parser.add_argument("--tpcc", help="Run the TPC-C benchmark", action='store_true')
+parser.add_argument("--oltp-rw", help="Run the sysbench OLTP read/write workload", action='store_true')
+parser.add_argument("--server-ip", help="IP of the server to load")
+
+args_parsed, unknown = parser.parse_known_args()
+
+if not args_parsed.server_ip:
+    print("Please pass the server IP as an argument with --server-ip=")
+    sys.exit()
+
+print("args: " + str(args))
+if not args_parsed.tpcc and not args_parsed.oltp_rw:
+    print("Specify either --tpcc or --oltp-rw")
+    sys.exit()
+
+def get_dict(lines):
+    config_dict = {}
+    for line in lines:
+        is_enabled = True
+        if "=" in line:
+            if line.startswith("#"):
+                is_enabled = False
+                line = line[1:]  # Remove `#`
+
+            key, value = line.split("=", 1)
+            key = key.strip()
+            value = value.strip()
+            config_dict[key] = (value, is_enabled)
+
+    return config_dict
+
+def save_dict(config_dict, lines):
+    # Reconstruct the updated configuration
+    new_lines = []
+    for line in lines:
+        if "=" in line:
+            if line.startswith("#"):
+                line = line[1:]
+            key, _ = line.split("=", 1)
+            key = key.strip()
+            if config_dict[key][1]:
+                new_lines.append(f"{key}={config_dict[key][0]}")
+            else:
+                new_lines.append(f"#{key}={config_dict[key][0]}")
+        else:
+            new_lines.append(line.rstrip("\n"))
+
+    new_config = "\n".join(new_lines)
+    return new_config
+
+DATABASE_CONF_FILE = os.environ["DATABASE_CONF_FILE"]
+
+# Back up the original file once
+if not path.exists(f"{DATABASE_CONF_FILE}.bak"):
+    shutil.copy(f"{DATABASE_CONF_FILE}", f"{DATABASE_CONF_FILE}.bak")
+
+with open(f"{DATABASE_CONF_FILE}", "r") as f:
+    lines = f.readlines()
+    config_dict = get_dict(lines)
+
+    # Update the desired key with the new value
+    config_dict["pgsql-host"] = (args_parsed.server_ip, True)
+
+    file_txt = save_dict(config_dict, lines)
+    # Write it back
+    with open(f"{DATABASE_CONF_FILE}", "w") as f:
+        f.write(file_txt)
+
+if args_parsed.tpcc:
+    subprocess.call(['/root/template/tpcc.py'] + args)
+else:
+    subprocess.call(['/root/template/oltp-rw.py'] + args)
\ No newline at end of file
diff --git a/benchmarks/data-serving-relational/client/template/database.conf b/benchmarks/data-serving-relational/client/template/database.conf
new file mode 100644
index 00000000..e4b63e4d
--- /dev/null
+++ b/benchmarks/data-serving-relational/client/template/database.conf
@@ -0,0 +1,6 @@
+db-driver=pgsql
+pgsql-host=128.178.116.117
+pgsql-port=5432
+pgsql-user=cloudsuite
+pgsql-password=cloudsuite
+pgsql-db=sbtest
\ No newline at end of file
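Editor's note: the `get_dict`/`save_dict` pair implements a small comment-aware `key=value` rewriter: keys that are commented out are still parsed but remembered as disabled, so a rewrite preserves which settings are active. A standalone sketch of the same round trip (the sample keys come from `database.conf`; the IP is a placeholder):

```python
# Standalone sketch of the comment-aware key=value round trip used by the
# entrypoints. Commented keys ("#key=value") are parsed but kept disabled.
sample = """db-driver=pgsql
#pgsql-host=127.0.0.1
pgsql-port=5432"""

config = {}
for line in sample.splitlines():
    if "=" not in line:
        continue
    enabled = not line.startswith("#")
    key, value = line.lstrip("#").split("=", 1)
    config[key.strip()] = (value.strip(), enabled)

# Update a key and re-enable it, exactly like the entrypoint does for
# pgsql-host when --server-ip is passed. 10.0.0.42 is a placeholder.
config["pgsql-host"] = ("10.0.0.42", True)

print("\n".join(("" if enabled else "#") + f"{key}={value}"
                for key, (value, enabled) in config.items()))
# db-driver=pgsql
# pgsql-host=10.0.0.42
# pgsql-port=5432
```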
diff --git a/benchmarks/data-serving-relational/client/template/oltp-rw.py b/benchmarks/data-serving-relational/client/template/oltp-rw.py
new file mode 100755
index 00000000..053b55e7
--- /dev/null
+++ b/benchmarks/data-serving-relational/client/template/oltp-rw.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import argparse
+
+# According to the sysbench code, the table structure is the following:
+# - id: 4B (primary key)
+# - key: 4B
+# - c: 120B
+# - pad: 60B
+# As a result, each row takes 188B.
+# You can increase the dataset size by adding more tables (--tables)
+# or more records per table (--record-count).
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--run", help="Run the benchmark; must be warmed up first with --warmup", action='store_true')
+parser.add_argument("--warmup", help="Warm up the benchmark; it can then be run with --run", action='store_true')
+parser.add_argument("--threads", "-t", help="Number of threads for the client", default=8, type=int)
+parser.add_argument("--report-interval", "-ri", help="Report interval for metrics in seconds", default=10, type=int)
+parser.add_argument("--record-count", "-c", help="Record count per table. Each record is 188B", default=1000000, type=int)
+parser.add_argument("--tables", "-n", help="Number of tables, each with `--record-count` rows", default=50, type=int)
+parser.add_argument("--rate", "-r", help="The expected load (transactions / sec)", type=int)
+parser.add_argument("--time", "-s", help="Length of the benchmark in seconds", default=360, type=int)
+
+args_parsed, unknown = parser.parse_known_args()
+
+if not args_parsed.warmup and not args_parsed.run:
+    print("Pass at least the --run or the --warmup argument")
+    sys.exit()
+
+# Warmup (prepare), then run, either unthrottled or at a fixed rate
+if args_parsed.warmup:
+    os.system(f"sysbench oltp_read_write --config-file=/root/template/database.conf --threads={args_parsed.threads} --time={args_parsed.time} --report-interval={args_parsed.report_interval} prepare --table_size={args_parsed.record_count} --tables={args_parsed.tables}")
+elif not args_parsed.rate:
+    os.system(f"sysbench oltp_read_write --config-file=/root/template/database.conf --threads={args_parsed.threads} --time={args_parsed.time} --report-interval={args_parsed.report_interval} run --table_size={args_parsed.record_count} --tables={args_parsed.tables}")
+else:
+    os.system(f"sysbench oltp_read_write --config-file=/root/template/database.conf --threads={args_parsed.threads} --time={args_parsed.time} --report-interval={args_parsed.report_interval} run --table_size={args_parsed.record_count} --tables={args_parsed.tables} --rate={args_parsed.rate}")
\ No newline at end of file
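Editor's note: following the row-size estimate in the header comment (188 B per row), the raw data footprint scales as `tables × record_count × 188 B`. A back-of-the-envelope helper (illustrative only; it ignores indexes, fill factor, and page overhead, so treat it as a lower bound):

```python
# Rough dataset-size estimate for the OLTP read/write workload, based on
# the ~188 B/row figure from the script's header comment.
ROW_BYTES = 188

def dataset_gib(tables, records_per_table):
    return tables * records_per_table * ROW_BYTES / 2**30

# With the script's defaults (50 tables x 1,000,000 rows):
print(f"{dataset_gib(50, 1_000_000):.2f} GiB")  # ~8.75 GiB of raw rows
```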
dataset", default=10, type=int) +parser.add_argument("--rate", "-r", help="The expected load (transaction / sec)", type=int) + +args_parsed, unknown = parser.parse_known_args() + +# Warmup +if not args_parsed.warmup and not args_parsed.run: + print("Need to pass at least --run or --warmup argument") + exit() + + + +if args_parsed.warmup: + os.system(f"sysbench tpcc --config-file=/root/template/database.conf --threads={args_parsed.threads} prepare --scale={args_parsed.scale}") +elif not args_parsed.rate: + os.system(f"sysbench tpcc --config-file=/root/template/database.conf --threads={args_parsed.threads} --time={args_parsed.time} --report-interval={args_parsed.report_interval} run --scale={args_parsed.scale}") +else: + os.system(f"sysbench tpcc --config-file=/root/template/database.conf --threads={args_parsed.threads} --time={args_parsed.time} --report-interval={args_parsed.report_interval} run --scale={args_parsed.scale} --rate={args_parsed.rate}") diff --git a/benchmarks/data-serving-relational/server/Dockerfile b/benchmarks/data-serving-relational/server/Dockerfile new file mode 100644 index 00000000..5273de01 --- /dev/null +++ b/benchmarks/data-serving-relational/server/Dockerfile @@ -0,0 +1,14 @@ +FROM cloudsuite/postgresql:15 + +# Install sudo for user switching +RUN apt update && apt install sudo python3 -y + +# Make the database access public +RUN echo 'host\tall\tcloudsuite\t0.0.0.0/0\tscram-sha-256' >> /etc/postgresql/15/main/pg_hba.conf + +# Copy the entrypoint +COPY ./docker-entrypoint.py /root + +RUN chmod +x /root/docker-entrypoint.py + +ENTRYPOINT ["/root/docker-entrypoint.py"] diff --git a/benchmarks/data-serving-relational/server/docker-entrypoint.py b/benchmarks/data-serving-relational/server/docker-entrypoint.py new file mode 100644 index 00000000..73422e83 --- /dev/null +++ b/benchmarks/data-serving-relational/server/docker-entrypoint.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 + +import socket +def get_ip(): + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + s.settimeout(0) + try: + # doesn't even have to be reachable + s.connect(('8.8.8.8', 1)) + IP = s.getsockname()[0] + except Exception: + IP = '127.0.0.1' + finally: + s.close() + return IP + +import argparse + +parser = argparse.ArgumentParser() +# If no value provided, the script tries to find the primary IP address by itself. 
diff --git a/benchmarks/data-serving-relational/server/docker-entrypoint.py b/benchmarks/data-serving-relational/server/docker-entrypoint.py
new file mode 100644
index 00000000..73422e83
--- /dev/null
+++ b/benchmarks/data-serving-relational/server/docker-entrypoint.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+
+import os
+import os.path as path
+import shutil
+import socket
+import argparse
+
+def get_ip():
+    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+    s.settimeout(0)
+    try:
+        # doesn't even have to be reachable
+        s.connect(('8.8.8.8', 1))
+        IP = s.getsockname()[0]
+    except Exception:
+        IP = '127.0.0.1'
+    finally:
+        s.close()
+    return IP
+
+parser = argparse.ArgumentParser()
+# If no value is provided, the script could find the primary IP address by itself via get_ip().
+parser.add_argument("--listen-addresses", "-a", help="The listening IP address of PostgreSQL.", default="'*'")
+parser.add_argument("--number", "-n", type=int, help="Unused; placeholder for a future argument.", default=0)
+
+args, unknown = parser.parse_known_args()
+
+def get_dict(lines):
+    config_dict = {}
+    for line in lines:
+        is_enabled = True
+        if "=" in line:
+            if line.startswith("#"):
+                is_enabled = False
+                line = line[1:]  # Remove `#`
+
+            key, value = line.split("=", 1)
+            key = key.strip()
+            value = value.strip()
+            config_dict[key] = (value, is_enabled)
+
+    return config_dict
+
+def save_dict(config_dict, lines):
+    # Reconstruct the updated configuration
+    new_lines = []
+    for line in lines:
+        if "=" in line:
+            if line.startswith("#"):
+                line = line[1:]
+            key, _ = line.split("=", 1)
+            key = key.strip()
+            if config_dict[key][1]:
+                new_lines.append(f"{key} = {config_dict[key][0]}")
+            else:
+                new_lines.append(f"#{key} = {config_dict[key][0]}")
+        else:
+            new_lines.append(line.rstrip("\n"))
+
+    new_config = "\n".join(new_lines)
+    return new_config
+
+POSTGRE_HOMEDIR = os.environ["POSTGRE_HOME"]
+
+# Back up the original file once
+if not path.exists(f"{POSTGRE_HOMEDIR}/postgresql.conf.bak"):
+    shutil.copy(f"{POSTGRE_HOMEDIR}/postgresql.conf", f"{POSTGRE_HOMEDIR}/postgresql.conf.bak")
+
+with open(f"{POSTGRE_HOMEDIR}/postgresql.conf", "r") as f:
+    lines = f.readlines()
+    config_dict = get_dict(lines)
+
+    # Update the desired key with the new value; equivalent to:
+    # sed -i "s/#listen_addresses = 'localhost'/listen_addresses = '*'/g" /etc/postgresql/15/main/postgresql.conf
+    config_dict["listen_addresses"] = (args.listen_addresses, True)
+
+    file_txt = save_dict(config_dict, lines)
+    # Write it back
+    with open(f"{POSTGRE_HOMEDIR}/postgresql.conf", "w") as f:
+        f.write(file_txt)
+
+os.system("service postgresql start")
+os.system("sudo -u postgres psql -c \"CREATE USER cloudsuite WITH PASSWORD 'cloudsuite';\"")  # Create the user `cloudsuite`
+os.system("sudo -u postgres psql -c \"CREATE DATABASE sbtest;\"")  # Create a database named sbtest
+os.system("sudo -u postgres psql -c \"GRANT ALL PRIVILEGES ON DATABASE sbtest TO cloudsuite;\"")  # Grant the user full access to the database
+os.system("sudo -u postgres psql sbtest -c \"GRANT ALL ON SCHEMA public TO cloudsuite;\"")
+os.system("sudo -u postgres psql")  # Keep the container attached to a psql prompt as user postgres
diff --git a/commons/postgresql/15/Dockerfile b/commons/postgresql/15/Dockerfile
new file mode 100644
index 00000000..8623382d
--- /dev/null
+++ b/commons/postgresql/15/Dockerfile
@@ -0,0 +1,12 @@
+FROM cloudsuite/base-os:ubuntu
+
+ENV DEBIAN_FRONTEND noninteractive
+
+RUN apt update && apt install -y wget gnupg lsb-release && \
+    echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list && \
+    wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - && \
+    apt update && \
+    apt -y install postgresql-15
+
+ENV POSTGRE_HOME /etc/postgresql/15/main
\ No newline at end of file
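Editor's note: once the entrypoint has rewritten `listen_addresses` and started the service, reachability can be checked from another host before launching the client. A hypothetical TCP-level probe (it does not authenticate; port 5432 matches `database.conf`):

```python
#!/usr/bin/env python3
# Hypothetical connectivity probe: checks that PostgreSQL is reachable on
# its default port after the server entrypoint sets listen_addresses = '*'.
import socket
import sys

def can_reach(host, port=5432, timeout=3.0):
    try:
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False

if __name__ == "__main__":
    host = sys.argv[1] if len(sys.argv) > 1 else "127.0.0.1"
    print(f"{host}:5432 reachable: {can_reach(host)}")
```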
diff --git a/docs/benchmarks/data-serving-relational.md b/docs/benchmarks/data-serving-relational.md
new file mode 100644
index 00000000..4e13e3d9
--- /dev/null
+++ b/docs/benchmarks/data-serving-relational.md
@@ -0,0 +1,107 @@
+# Data Serving (PostgreSQL)
+
+The Data Serving benchmark (based on PostgreSQL 15) uses [sysbench][sysbench] and [sysbench-tpcc][sysbench-tpcc] as the load generator. Although sysbench-tpcc [differs in several aspects][difference-tpcc] from the standard TPC-C specification, it still reflects the most important properties of the TPC-C workload. One of the most widely used sysbench workloads (`oltp_read_write`) is also wrapped for measurement.
+
+### Dockerfiles
+Supported tags and their contents:
+- `server` contains PostgreSQL 15; by default, it ends by opening a `psql` prompt as the `postgres` user.
+- `client` contains sysbench, sysbench-tpcc, and template load-generation scripts to run the workloads.
+
+### Server Container
+
+Start the server container with the following command:
+
+```bash
+$ docker run --name postgresql-server -it --net host cloudsuite/data-serving-relational:server
+```
+
+It creates a database user `cloudsuite` (whose password is also `cloudsuite`) and a database `sbtest`, and grants the user all privileges on that database. The user is allowed to access the database remotely.
+
+### Client Container
+
+The client supports two benchmarks: TPC-C and the standard sysbench OLTP read/write workload. Both require you to point to the destination server with `--server-ip=`. To run the warmup phase, pass the `--warmup` argument; for the actual measurements, pass `--run`.
+
+Depending on which benchmark you want to launch, pick between `--tpcc` and `--oltp-rw`, as follows:
+
+```bash
+$ docker run --name sysbench-client -it --net host cloudsuite/data-serving-relational:client --warmup <--tpcc | --oltp-rw> --server-ip=127.0.0.1
+```
+
+To run the benchmark afterwards, use the following command:
+
+```bash
+$ docker run --name sysbench-client -it --net host cloudsuite/data-serving-relational:client --run <--tpcc | --oltp-rw> --server-ip=127.0.0.1
+```
+
+#### TPC-C
+
+The TPC-C benchmark accepts the following arguments:
+- `--threads=N` spawns `N` threads for the load generator; the default is 8 threads.
+- `--report-interval=s` reports the intermediate statistics every `s` seconds; the default is 10 seconds.
+- `--time=s` sets the length of the benchmark to `s` seconds; the default is 360 seconds.
+- `--scale=N` sets the scale `N` of the database (the number of TPC-C warehouses); the default is 10.
+- `--rate=N` sets the expected load in transactions per second; by default it is omitted, which pushes the server to the maximum throughput it can sustain.
+
+#### Sysbench OLTP Read/Write Workload
+
+The sysbench OLTP read/write workload accepts the following parameters:
+- `--threads=N` spawns `N` threads for the load generator; the default is 8 threads.
+- `--report-interval=s` reports the intermediate statistics every `s` seconds; the default is 10 seconds.
+- `--time=s` sets the length of the benchmark to `s` seconds; the default is 360 seconds.
+- `--record-count=N` sets the number of records per table (each record is 188 B); the default is 1,000,000.
+- `--tables=N` sets the number of tables, each with `--record-count` rows; the default is 50.
+- `--rate=N` sets the expected load in transactions per second; by default it is omitted, which pushes the server to the maximum throughput it can sustain.
+
+For example, the following command runs the OLTP read/write workload with 16 client threads at a target rate of 500 transactions per second:
+
+```bash
+$ docker run --name sysbench-client -it --net host cloudsuite/data-serving-relational:client --run --oltp-rw --server-ip=127.0.0.1 --threads=16 --rate=500
+```
+
+### Inspecting the Client Container
+
+You can enter the container with the following command:
+
+```bash
+$ docker run --name sysbench-client -it --net host --entrypoint bash cloudsuite/data-serving-relational:client
+```
+
+- `/root/template/database.conf` defines the connection parameters (driver, host, port, user, password, database) for PostgreSQL. You can modify the IP address and the port to match the configuration of your server container.
+- More options can be added by referring to `sysbench --help` and the help of each workload.
+
+### Results
+
+Afterwards, the script reports the statistics, including the query mix, the transaction throughput, and the latency (average and 95th percentile):
+
+```
+SQL statistics:
+    queries performed:
+        read:                            3422483
+        write:                           3551947
+        other:                           563730
+        total:                           7538160
+    transactions:                        252068 (700.15 per sec.)
+    queries:                             7538160 (20938.04 per sec.)
+    ignored errors:                      30904  (85.84 per sec.)
+    reconnects:                          0      (0.00 per sec.)
+
+General statistics:
+    total time:                          360.0202s
+    total number of events:              252068
+
+Latency (ms):
+         min:                                    0.16
+         avg:                                   11.42
+         max:                                 1965.81
+         95th percentile:                       24.83
+         sum:                              2879436.25
+
+Threads fairness:
+    events (avg/stddev):           31508.5000/70.33
+    execution time (avg/stddev):   359.9295/0.01
+```
+
+
+[sysbench]: https://github.com/akopytov/sysbench
+[sysbench-tpcc]: https://github.com/Percona-Lab/sysbench-tpcc
+[difference-tpcc]: https://www.percona.com/blog/tpcc-like-workload-sysbench-1-0/
\ No newline at end of file
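Editor's note: to post-process reports like the one above, the throughput and tail-latency figures are easy to scrape. A hedged parser sketch (the regexes and the `sysbench.log` filename assume exactly the output format shown above):

```python
# Hedged parser for the sysbench summary shown above: extracts transaction
# throughput and 95th-percentile latency from a captured report.
import re

def parse_summary(report):
    tps = re.search(r"transactions:\s+\d+\s+\(([\d.]+) per sec\.\)", report)
    p95 = re.search(r"95th percentile:\s+([\d.]+)", report)
    return {
        "tps": float(tps.group(1)) if tps else None,
        "p95_ms": float(p95.group(1)) if p95 else None,
    }

# Hypothetical usage, e.g. after `docker logs sysbench-client > sysbench.log`:
with open("sysbench.log") as f:
    print(parse_summary(f.read()))  # {'tps': 700.15, 'p95_ms': 24.83}
```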