
Commit

Merge pull request #1 from sendbird/develop
Code migration
jjh-kim authored May 8, 2024
2 parents 2425788 + 0329251 commit 88ab350
Showing 67 changed files with 5,771 additions and 0 deletions.
24 changes: 24 additions & 0 deletions .github/workflows/linters.yml
@@ -0,0 +1,24 @@
name: Linters

on:
pull_request:
push:
branches:
- master

jobs:
flake8_py3:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: 3.11
- run: python -m pip install flake8
- name: flake8
uses: liskin/gh-problem-matcher-wrap@v1
with:
linters: flake8
run: flake8
49 changes: 49 additions & 0 deletions .github/workflows/tests.yml
@@ -0,0 +1,49 @@
name: Tests

on:
push:
branches:
- main
pull_request:
workflow_dispatch:

jobs:
test:
runs-on: ubuntu-latest
services:
mysql:
image: "mysql:8.0.34"
ports:
- "3306:3306"
env:
MYSQL_ALLOW_EMPTY_PASSWORD: 1
MYSQL_ROOT_HOST: "%"
MYSQL_DATABASE: "sbosc"
options: --health-cmd="mysqladmin ping" --health-interval=10s --health-timeout=5s --health-retries=3
redis:
image: "bitnami/redis:7.0.4"
ports:
- "6379:6379"
env:
ALLOW_EMPTY_PASSWORD: "yes"

steps:
- uses: actions/checkout@v2
- name: Copy custom MySQL configuration file
run: |
docker cp ./tests/configs/my.cnf $(docker ps -aqf "name=mysql"):/etc/mysql/conf.d/my.cnf
docker kill $(docker ps -aqf "name=mysql")
docker start $(docker ps -aqf "name=mysql")
- name: Set up Python 3.11
uses: actions/setup-python@v2
with:
python-version: 3.11
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install pytest sqlalchemy
- name: Run tests
run: |
export PYTHONPATH="$(pwd)/src"
python -m pytest -s
7 changes: 7 additions & 0 deletions .gitignore
@@ -0,0 +1,7 @@
__pycache__/

# pycharm
.idea

# Visual Studio Code
.vscode
40 changes: 40 additions & 0 deletions Dockerfile
@@ -0,0 +1,40 @@
FROM ubuntu:20.04

ENV DEBIAN_FRONTEND noninteractive

# apt update
RUN apt-get update && \
apt-get -y upgrade && \
apt-get install -y software-properties-common && \
add-apt-repository -y ppa:deadsnakes/ppa && \
apt-get update

# install python
RUN apt-get install -y python3.11 python3.11-dev python3.11-distutils build-essential

# install mysql, postgres clients
RUN apt-get install -y libmysqlclient-dev mysql-client

# install utilities
RUN apt-get install -y curl

# Set working directory
WORKDIR /opt/sbosc

# Make python 3.11 the default
# Register the version in alternatives
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1

# Set python 3 as the default python
RUN update-alternatives --set python /usr/bin/python3.11

# Install pip and requirements.txt
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python

# Install requirements
COPY requirements.txt ./
RUN pip install -r requirements.txt

# Copy repository
COPY src ./
ENV PYTHONPATH=/opt/sbosc
180 changes: 180 additions & 0 deletions README.md
@@ -0,0 +1,180 @@
# SB-OSC

**Sendbird's online schema migration for Aurora MySQL**

SB-OSC is an online schema change tool for Aurora MySQL databases, designed to dramatically improve performance on large
tables by leveraging multithreading in all stages of the schema migration process.

It also provides seamless pausing and resuming of tasks to handle the extended operation times of large-table schema
changes, along with a built-in monitoring system that dynamically controls its heavy DML load based on Aurora's
performance metrics.

SB-OSC is designed to overcome the limitations that existing migration tools face with large-scale tables,
significantly reducing the operational overhead associated with managing large tables.

## Takeaways

SB-OSC has unique features that differentiate it from existing schema migration tools such as `pt-osc` and `gh-ost`.

### Multithreading

SB-OSC is designed to leverage multithreading in all stages of the schema migration process: bulk import (initial table
copy), binlog event processing, and DML event application.

For binlog event processing, SB-OSC processes binlog files in parallel, which enables it to handle large tables with
heavy write loads.
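
As a rough illustration, parallel binlog processing can be sketched with a thread pool. The helper names below are hypothetical, not SB-OSC's actual API:

```python
from concurrent.futures import ThreadPoolExecutor

def parse_binlog_file(file_name: str) -> list[dict]:
    # Hypothetical helper: in SB-OSC this stage reads one binlog file
    # and collects the DML events it contains.
    return [{"file": file_name, "event": "INSERT"}]

def apply_events(events: list[dict]) -> None:
    # Hypothetical helper: replays parsed events against the destination table.
    print(f"applied {len(events)} events from {events[0]['file']}")

binlog_files = ["mysql-bin-changelog.000001", "mysql-bin-changelog.000002"]

# Each binlog file is parsed by its own worker thread, which is what lets
# event processing keep up with heavy write loads.
with ThreadPoolExecutor(max_workers=4) as executor:
    for events in executor.map(parse_binlog_file, binlog_files):
        apply_events(events)
```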

### Resumable

SB-OSC is resumable at any stage of the schema migration process. It saves the current state of each stage to the database
and Redis, allowing users to pause and resume the process at any time, as long as binlog retention is sufficient.
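
A minimal sketch of the checkpointing idea, assuming Redis as the state store (the key and field names here are hypothetical):

```python
import redis

r = redis.Redis(host="localhost", port=6379)

# Hypothetical keys: persist the current stage and last copied PK so a
# restarted worker can resume instead of starting over.
def save_checkpoint(stage: str, last_pk: int) -> None:
    r.hset("sbosc:checkpoint", mapping={"stage": stage, "last_pk": last_pk})

def load_checkpoint() -> dict:
    return {k.decode(): v.decode() for k, v in r.hgetall("sbosc:checkpoint").items()}

save_checkpoint("bulk_import", 1_250_000)
print(load_checkpoint())  # {'stage': 'bulk_import', 'last_pk': '1250000'}
```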

### Operation Class

SB-OSC supports operation classes that can override the main queries used in the schema migration process. This feature
allows users to customize queries for specific tables, supporting use cases such as data retention, table redesign, and more.

It also provides an operation class that enables replication across different Aurora clusters, which can be used in various
scenarios such as cross-region replication, cross-account replication, clone cluster replication, etc.

[Guide for operation class](doc/operation-class.md)
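
As a rough illustration of the override pattern (the class and method names below are hypothetical; the actual interface is described in the guide above):

```python
# Hypothetical base class and method names, shown only to illustrate
# the override pattern.
class BaseOperation:
    def select_batch_query(self, start_pk: int, end_pk: int) -> str:
        return f"SELECT * FROM source_table WHERE id BETWEEN {start_pk} AND {end_pk}"

class RetentionOperation(BaseOperation):
    """Copies only rows newer than a 90-day retention window."""
    def select_batch_query(self, start_pk: int, end_pk: int) -> str:
        return (
            f"SELECT * FROM source_table "
            f"WHERE id BETWEEN {start_pk} AND {end_pk} "
            f"AND created_at >= NOW() - INTERVAL 90 DAY"
        )
```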

### Data Validation

SB-OSC provides strong data validation features to ensure data consistency between the source and destination tables. It
validates both the bulk import and DML event application stages, and attempts to recover from any inconsistencies.
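
A minimal sketch of what range-based validation can look like, assuming a DB-API cursor and placeholder table names (a real implementation might compare checksums rather than counts):

```python
def count_rows(cursor, table: str, start_pk: int, end_pk: int) -> int:
    # Counts rows in a PK range on one side of the migration.
    cursor.execute(
        f"SELECT COUNT(*) FROM {table} WHERE id BETWEEN %s AND %s",
        (start_pk, end_pk),
    )
    return cursor.fetchone()[0]

def validate_range(cursor, start_pk: int, end_pk: int) -> bool:
    # Table names are placeholders; mismatched ranges would be re-copied.
    return (
        count_rows(cursor, "source_table", start_pk, end_pk)
        == count_rows(cursor, "destination_table", start_pk, end_pk)
    )
```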

### Index Creation Strategy

SB-OSC allows users to create indexes after the bulk import stage, which can significantly reduce the time required for
the initial table copy. This feature is especially useful for large tables with many indexes.
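
A sketch of the strategy, with placeholder index DDL (not SB-OSC's actual statements):

```python
# Placeholder DDL: the destination table is created without secondary
# indexes, and these are added only after the bulk import finishes.
DEFERRED_INDEXES = [
    "ALTER TABLE destination_table ADD INDEX ix_created_at (created_at)",
    "ALTER TABLE destination_table ADD INDEX ix_user_id (user_id)",
]

def create_indexes_after_import(cursor) -> None:
    for ddl in DEFERRED_INDEXES:
        cursor.execute(ddl)  # each index is built once, on the fully loaded table
```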

### Monitoring

SB-OSC has a built-in monitoring system that dynamically controls its heavy DML load based on Aurora's performance
metrics. This feature makes SB-OSC more reliable in production environments, since it automatically adjusts its DML
load when production traffic increases.
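
A rough sketch of metric-driven throttling using CloudWatch via `boto3`; the threshold and adjustment policy below are illustrative, not SB-OSC's actual logic:

```python
from datetime import datetime, timedelta, timezone

import boto3

cloudwatch = boto3.client("cloudwatch")

def current_cpu(cluster_id: str) -> float:
    # Reads the cluster's average CPU utilization over the last five minutes.
    now = datetime.now(timezone.utc)
    stats = cloudwatch.get_metric_statistics(
        Namespace="AWS/RDS",
        MetricName="CPUUtilization",
        Dimensions=[{"Name": "DBClusterIdentifier", "Value": cluster_id}],
        StartTime=now - timedelta(minutes=5),
        EndTime=now,
        Period=60,
        Statistics=["Average"],
    )
    points = stats["Datapoints"]
    if not points:
        return 0.0
    return max(points, key=lambda p: p["Timestamp"])["Average"]

def adjust_workers(workers: int, cpu: float, limit: float = 70.0) -> int:
    # Back off by one worker when production traffic pushes CPU above the limit.
    return max(1, workers - 1) if cpu > limit else workers
```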

## Requirements

SB-OSC is designed to work with Aurora MySQL databases and runs on EKS.

It requires the following resources to run:

- Aurora MySQL database (v2, v3)
- EKS cluster
- AWS SecretsManager secret
- IAM role

SB-OSC requires `binlog_format` to be set to `ROW`. It is also recommended to set `binlog-ignore-db` to `sbosc` to prevent
SB-OSC from processing its own binlog events.

- `binlog_format` set to `ROW`
- `binlog-ignore-db` set to `sbosc` (Recommended)

Detailed requirements and setup instructions can be found in the [usage guide](doc/usage.md).
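
For example, the binlog format can be verified with a quick check; this sketch assumes `PyMySQL` and uses placeholder connection details:

```python
import pymysql

# Placeholder endpoint and credentials.
conn = pymysql.connect(
    host="my-cluster.cluster-xxxx.us-east-1.rds.amazonaws.com",
    user="admin",
    password="...",
)
with conn.cursor() as cursor:
    cursor.execute("SELECT @@binlog_format")
    binlog_format = cursor.fetchone()[0]
    assert binlog_format == "ROW", f"binlog_format is {binlog_format}, expected ROW"
```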

## Performance

SB-OSC shows high performance in both binlog event processing and bulk import. The following are the specs of the tables
used for performance testing:

| Table Alias | Avg Row Length (Bytes) | Write IOPS (IOPS/m) |
|:-----------:|-----------------------:|--------------------:|
| A | 57 | 149 |
| B | 912 | 502 |
| C | 61 | 3.38 K |
| D | 647 | 17.9 K |
| E | 1042 | 24.4 K |
| F | 86 | 151 K |
| G | 1211 | 60.7 K |

**Avg Row Length**: `avg_row_length` from `information_schema.TABLES`
**Write IOPS**: Average per-minute increase of `count_write` from `performance_schema.table_io_waits_summary_by_table`.

All tables were in the same Aurora MySQL v3 cluster.
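
For reference, both statistics can be sampled along these lines (a sketch assuming a MySQL DB-API cursor, not necessarily the exact queries used):

```python
def sample_table_stats(cursor, schema: str, table: str) -> tuple[int, int]:
    # avg_row_length comes from information_schema; count_write from
    # performance_schema (sample it twice, one minute apart, and take the
    # difference to get Write IOPS per minute).
    cursor.execute(
        "SELECT AVG_ROW_LENGTH FROM information_schema.TABLES "
        "WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s",
        (schema, table),
    )
    avg_row_length = cursor.fetchone()[0]
    cursor.execute(
        "SELECT COUNT_WRITE FROM performance_schema.table_io_waits_summary_by_table "
        "WHERE OBJECT_SCHEMA = %s AND OBJECT_NAME = %s",
        (schema, table),
    )
    count_write = cursor.fetchone()[0]
    return avg_row_length, count_write
```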

### Binlog Event Processing

The following table shows the read throughput of binlog event processing, in bytes read per minute. Comparing the read
throughput to the total binlog creation rate of the cluster shows whether SB-OSC can keep up with DML events.

**Total Binlog Creation Rate**: 144 (MB/m)

| Table Alias | A | B | C | D | E | F | G |
|:----------------------:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
| Read Throughput (MB/m) | 513 | 589 | 591 | 402 | 466 | 361 | 305 |

The results show that SB-OSC can keep up with DML events even on tables with a very high write load.

### Bulk Import

To provide general insight into bulk import performance, the test was conducted on table `A` with no secondary indexes
and no additional traffic.

Actual bulk import performance can vary depending on the number of secondary indexes, the number of rows, column types,
production traffic, etc.

Following are the results of bulk import performance based on instance sizes:

| Instance Type | Insert Rate (rows/s) | Network Throughput (Bytes/s) | Storage Throughput (Bytes/s) | CPU Utilization (%) |
|:-------------:|---------------------:|-----------------------------:|-----------------------------:|--------------------:|
| r6g.2xlarge | 42.3 K | 27.2 K | 457 M | 55.0 |
| r6g.4xlarge | 94.0 K | 45.9 K | 900 M | 51.9 |
| r6g.8xlarge | 158 K | 72.2 K | 1.39 G | 44.6 |

Insert rate, network throughput, and storage throughput are the average values calculated from CloudWatch metrics.

### Comparison with gh-ost

We compared the total migration time of SB-OSC and gh-ost under the following conditions:

- Table `C` with ~200M rows
- Aurora MySQL v3 cluster, r6g.8xlarge instance
- 2 secondary indexes
- `batch_size` (`chunk-size` for gh-ost): 50000
- (gh-ost) `--allow-on-master`

**w/o traffic**

| Tool | Total Migration Time | CPU Utilization (%) |
|:------:|---------------------:|--------------------:|
| SB-OSC | 22m | 60.6 |
| gh-ost | 1h 52m | 19.7 |

**w/ traffic**

Traffic was generated only to table `C` during the migration. (~1.0K inserts/s, ~0.33K updates/s, ~0.33K deletes/s)

| Tool | Total Migration Time | CPU Utilization (%) |
|:------:|---------------------:|--------------------:|
| SB-OSC | 27m | 62.7 |
| gh-ost | 1d+ | 27.4 |

For gh-ost, we interrupted the migration at 50% (~12h) since the ETA kept increasing.

## Limitations

- **Necessity of Integer Primary Keys**
  SB-OSC performs multithreading based on integer primary keys (PKs) during the bulk import phase (see the chunking
  sketch after this list). Because batch processing and other operations are designed around integer PKs, SB-OSC cannot
  be used with tables that do not have integer PKs.


- **Updates on Primary Key**
SB-OSC replicates records from the original table based on the PK for applying DML events. Therefore, if updates occur
on the table's PK, it can be challenging to guarantee data integrity.


- **Binlog Resolution**
  SB-OSC is limited by the fact that binlog resolution is in seconds. While this doesn't significantly impact most
  scenarios due to SB-OSC's design, it can affect timestamp-based logic when an excessive number of events occur within
  a single second.


- **Reduced Efficiency for Small Tables**
  For small tables, the initial table creation, chunk creation, and the multi-stage process of SB-OSC can act as
  overhead, potentially slowing down the overall migration. Applying SB-OSC to small tables may therefore not be as
  effective.
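
To make the first limitation concrete, the following is a minimal sketch of integer-PK chunking, the mechanism the bulk import phase parallelizes over; it is illustrative, not SB-OSC's actual implementation:

```python
def split_into_chunks(min_pk: int, max_pk: int, chunk_count: int) -> list[tuple[int, int]]:
    # Splits the integer PK space into contiguous ranges that workers
    # can copy in parallel; this is why integer PKs are required.
    chunk_size = (max_pk - min_pk + 1) // chunk_count or 1
    chunks = []
    start = min_pk
    while start <= max_pk:
        end = min(start + chunk_size - 1, max_pk)
        chunks.append((start, end))
        start = end + 1
    return chunks

print(split_into_chunks(1, 100, 4))  # [(1, 25), (26, 50), (51, 75), (76, 100)]
```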
12 changes: 12 additions & 0 deletions catalog-info.yaml
@@ -0,0 +1,12 @@
apiVersion: backstage.io/v1alpha1
kind: Component
metadata:
name: sb-osc
description: Application for online schema change
annotations:
github.com/project-slug: sendbird/sb-osc
spec:
type: service
lifecycle: production
owner: team-data-infrastructure
system: sendbird-internal-tools
8 changes: 8 additions & 0 deletions charts/Chart.yaml
@@ -0,0 +1,8 @@
apiVersion: v2
name: sb-osc
version: 0.0.1

dependencies:
- name: external-secrets
version: "0.8.2"
repository: "https://charts.external-secrets.io/"
49 changes: 49 additions & 0 deletions charts/templates/externalsecret.yaml
@@ -0,0 +1,49 @@
apiVersion: 'external-secrets.io/v1beta1'
kind: SecretStore
metadata:
name: sb-osc-secret
spec:
provider:
aws:
service: SecretsManager
region: {{ .Values.awsRegion }}
auth:
jwt:
serviceAccountRef:
name: sb-osc-external-secrets
namespace: {{ .Release.Namespace }}
---
apiVersion: 'external-secrets.io/v1beta1'
kind: ExternalSecret
metadata:
name: sb-osc-secret
spec:
secretStoreRef:
name: sb-osc-secret
kind: SecretStore
target:
name: sb-osc-secret
template:
engineVersion: v2
data:
secret.json: |
{{ printf `{
{{- $first := true }}
{{- range $k, $v := . }}
{{- if $first }}
{{- $first = false }}
{{- else }}
{{- "," -}}
{{- end }}
"{{ $k }}": "{{ $v }}"
{{- end }}
}` }}
redis.conf: {{ printf `|
requirepass {{ .redis_password }}
appendonly yes
save ""
`}}
dataFrom:
- extract:
key: {{ .Values.secretName }}
refreshInterval: 10m