From e51aba9c98f485e9230bf0acb3e628705251f0ab Mon Sep 17 00:00:00 2001 From: Jimmy Kim Date: Mon, 1 Apr 2024 18:16:47 +0900 Subject: [PATCH 01/22] src --- .gitignore | 7 + charts/Chart.yaml | 0 charts/values.yaml | 0 requirements.txt | 10 + src/config/__init__.py | 8 + src/config/config.py | 143 ++++++++++ src/config/env.py | 33 +++ src/config/secret.py | 38 +++ src/modules/__init__.py | 0 src/modules/aws.py | 54 ++++ src/modules/db.py | 120 ++++++++ src/modules/logger.py | 51 ++++ src/modules/redis/__init__.py | 3 + src/modules/redis/connect.py | 22 ++ src/modules/redis/data_types.py | 153 ++++++++++ src/modules/redis/interface.py | 108 +++++++ src/modules/redis/schema.py | 47 +++ src/modules/slack.py | 169 +++++++++++ src/sbosc/__init__.py | 0 src/sbosc/component.py | 78 +++++ src/sbosc/const.py | 30 ++ src/sbosc/controller/__init__.py | 3 + src/sbosc/controller/controller.py | 376 ++++++++++++++++++++++++ src/sbosc/controller/initializer.py | 247 ++++++++++++++++ src/sbosc/controller/main.py | 5 + src/sbosc/controller/validator.py | 378 +++++++++++++++++++++++++ src/sbosc/eventhandler/__init__.py | 3 + src/sbosc/eventhandler/eventhandler.py | 347 +++++++++++++++++++++++ src/sbosc/eventhandler/eventloader.py | 212 ++++++++++++++ src/sbosc/eventhandler/main.py | 5 + src/sbosc/exceptions.py | 3 + src/sbosc/monitor/__init__.py | 3 + src/sbosc/monitor/main.py | 5 + src/sbosc/monitor/monitor.py | 344 ++++++++++++++++++++++ src/sbosc/operations/__init__.py | 0 src/sbosc/operations/base.py | 281 ++++++++++++++++++ src/sbosc/operations/cases/__init__.py | 0 src/sbosc/operations/operation.py | 101 +++++++ src/sbosc/operations/utils.py | 10 + src/sbosc/worker/__init__.py | 3 + src/sbosc/worker/main.py | 5 + src/sbosc/worker/manager.py | 87 ++++++ src/sbosc/worker/worker.py | 250 ++++++++++++++++ src/utils.py | 24 ++ 44 files changed, 3766 insertions(+) create mode 100644 .gitignore create mode 100644 charts/Chart.yaml create mode 100644 charts/values.yaml create mode 100644 requirements.txt create mode 100644 src/config/__init__.py create mode 100644 src/config/config.py create mode 100644 src/config/env.py create mode 100644 src/config/secret.py create mode 100644 src/modules/__init__.py create mode 100644 src/modules/aws.py create mode 100644 src/modules/db.py create mode 100644 src/modules/logger.py create mode 100644 src/modules/redis/__init__.py create mode 100644 src/modules/redis/connect.py create mode 100644 src/modules/redis/data_types.py create mode 100644 src/modules/redis/interface.py create mode 100644 src/modules/redis/schema.py create mode 100644 src/modules/slack.py create mode 100644 src/sbosc/__init__.py create mode 100644 src/sbosc/component.py create mode 100644 src/sbosc/const.py create mode 100644 src/sbosc/controller/__init__.py create mode 100644 src/sbosc/controller/controller.py create mode 100644 src/sbosc/controller/initializer.py create mode 100644 src/sbosc/controller/main.py create mode 100644 src/sbosc/controller/validator.py create mode 100644 src/sbosc/eventhandler/__init__.py create mode 100644 src/sbosc/eventhandler/eventhandler.py create mode 100644 src/sbosc/eventhandler/eventloader.py create mode 100644 src/sbosc/eventhandler/main.py create mode 100644 src/sbosc/exceptions.py create mode 100644 src/sbosc/monitor/__init__.py create mode 100644 src/sbosc/monitor/main.py create mode 100644 src/sbosc/monitor/monitor.py create mode 100644 src/sbosc/operations/__init__.py create mode 100644 src/sbosc/operations/base.py create mode 100644 
src/sbosc/operations/cases/__init__.py create mode 100644 src/sbosc/operations/operation.py create mode 100644 src/sbosc/operations/utils.py create mode 100644 src/sbosc/worker/__init__.py create mode 100644 src/sbosc/worker/main.py create mode 100644 src/sbosc/worker/manager.py create mode 100644 src/sbosc/worker/worker.py create mode 100644 src/utils.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c370bfa --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +__pycache__/ + +# pycharm +.idea + +# Visual Studio Code +.vscode diff --git a/charts/Chart.yaml b/charts/Chart.yaml new file mode 100644 index 0000000..e69de29 diff --git a/charts/values.yaml b/charts/values.yaml new file mode 100644 index 0000000..e69de29 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..cb1b99e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,10 @@ +boto3==1.26.72 +botocore==1.29.165 +mysqlclient==2.2.4 +mysql-replication==0.31 +redis==5.0.3 +PyYAML==6.0.1 +requests==2.31.0 +pandas==2.1.4 +prometheus-client==0.16.0 +dnspython==2.5.0 diff --git a/src/config/__init__.py b/src/config/__init__.py new file mode 100644 index 0000000..91fec36 --- /dev/null +++ b/src/config/__init__.py @@ -0,0 +1,8 @@ +from config.config import Config +from config.secret import Secret +from config.env import Env + +config = Config() # override by setting env CONFIG_FILE +secret = Secret() # override by setting env SECRET_FILE +env = Env() # override with environment variables + diff --git a/src/config/config.py b/src/config/config.py new file mode 100644 index 0000000..620b868 --- /dev/null +++ b/src/config/config.py @@ -0,0 +1,143 @@ +import os +from dataclasses import dataclass +from importlib import import_module +from pkgutil import walk_packages + +import yaml +import dns.resolver + +from config import Env +from sbosc.operations.base import BaseOperation, CrossClusterBaseOperation + + +def get_operation_class(class_name): + package = import_module('sbosc.operations') + + for _, name, is_pkg in walk_packages(package.__path__, package.__name__ + '.'): + if not is_pkg: + module = import_module(name) + if hasattr(module, class_name): + return getattr(module, class_name) + + raise ImportError(f"Operation class {class_name} not found") + + +def get_cluster_id(endpoint, cluster_id=None) -> str: + """ + Get RDS cluster identifier from endpoint or cname record if cluster_id is not provided + :param endpoint: RDS cluster endpoint or CNAME record that targets RDS cluster endpoint + :param cluster_id: Optional RDS cluster identifier. If provided, it will be returned + :return: Resolved RDS cluster identifier + """ + if cluster_id is not None: + return cluster_id + elif 'rds.amazonaws.com' in endpoint: + return endpoint.split('.')[0] + else: + try: + answers = dns.resolver.resolve(endpoint, 'CNAME') + if len(answers) == 1 and 'rds.amazonaws.com' in answers[0].target: + return answers[0].target.split('.')[0] + else: + raise Exception( + f"Can't get cluster_id from endpoint. Endpoint {endpoint} targets multiple cname records") + except dns.resolver.NoAnswer: + raise Exception(f"Can't get cluster_id from endpoint. 
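A quick sketch of the resolution rules get_cluster_id applies, written against dnspython directly; the endpoint names are illustrative and the CNAME branch needs network access to a real record:

    # Minimal sketch of the CNAME-based resolution used by get_cluster_id (names illustrative).
    import dns.resolver

    def resolve_rds_cluster_id(endpoint: str) -> str:
        if 'rds.amazonaws.com' in endpoint:
            return endpoint.split('.')[0]          # first DNS label is the cluster identifier
        answers = dns.resolver.resolve(endpoint, 'CNAME')
        target = str(answers[0].target)            # rdata target is a dns.name.Name, so stringify it
        if len(answers) == 1 and 'rds.amazonaws.com' in target:
            return target.split('.')[0]
        raise RuntimeError(f"{endpoint} does not resolve to a single RDS endpoint")

    print(resolve_rds_cluster_id(
        "my-cluster.cluster-abc123.ap-northeast-2.rds.amazonaws.com"))  # -> my-cluster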
Endpoint {endpoint} doesn't have cname record") + + +@dataclass +class IndexConfig: + name: str + columns: str + unique: bool = False + + +class Config: + # Migration plan + SOURCE_WRITER_ENDPOINT = '' + SOURCE_READER_ENDPOINT = '' + SOURCE_CLUSTER_ID = None # optional + SOURCE_DB = 'sbosc' + SOURCE_TABLE = '' + DESTINATION_WRITER_ENDPOINT = None + DESTINATION_READER_ENDPOINT = None + DESTINATION_CLUSTER_ID = None # optional + DESTINATION_DB = 'sbosc' + DESTINATION_TABLE = '' + MIN_CHUNK_SIZE = 100000 + MAX_CHUNK_COUNT = 200 + AUTO_SWAP = False + PREFERRED_WINDOW = '00:00-23:59' + SKIP_BULK_IMPORT = False + OPERATION_CLASS = BaseOperation + INDEXES = [] + INDEX_CREATED_PER_QUERY = 4 + + # Worker config + MIN_BATCH_SIZE = 100 + BATCH_SIZE_STEP_SIZE = 100 + MAX_BATCH_SIZE = 10000 + MIN_THREAD_COUNT = 4 + THREAD_COUNT_STEP_SIZE = 4 + MAX_THREAD_COUNT = 64 + COMMIT_INTERVAL = 0.01 + OPTIMAL_VALUE_USE_LIMIT = 10 + USE_BATCH_SIZE_MULTIPLIER = False + + # EventHandler config + EVENT_HANDLER_THREAD_COUNT = 4 + EVENT_HANDLER_THREAD_TIMEOUT = 300 + INIT_BINLOG_FILE: str = None + INIT_BINLOG_POSITION: int = None + + # Threshold + CPU_SOFT_THRESHOLD = 70 + CPU_HARD_THRESHOLD = 90 + LATENCY_SOFT_THRESHOLD = 20 # milliseconds + LATENCY_HARD_THRESHOLD = 50 # milliseconds + + # Validation + BULK_IMPORT_VALIDATION_BATCH_SIZE = 100000 + APPLY_DML_EVENTS_VALIDATION_BATCH_SIZE = 100000 + VALIDATION_THREAD_COUNT = 4 + FULL_DML_EVENT_VALIDATION_INTERVAL = 1 # hours + + # DML event loading + PK_SET_MAX_SIZE = 1000000 + EVENT_BATCH_DURATION = 3600 + + def __init__(self): + env = Env() + if os.path.exists(env.CONFIG_FILE): + with open(env.CONFIG_FILE, 'r') as f: + config = yaml.safe_load(f) + for key, value in config.items(): + setattr(self, key.upper(), value) + + if type(self.OPERATION_CLASS) == str: + self.OPERATION_CLASS = get_operation_class(self.OPERATION_CLASS) + + if self.DESTINATION_WRITER_ENDPOINT is None: + self.DESTINATION_WRITER_ENDPOINT = self.SOURCE_WRITER_ENDPOINT + if self.DESTINATION_READER_ENDPOINT is None: + self.DESTINATION_READER_ENDPOINT = self.SOURCE_READER_ENDPOINT + if self.DESTINATION_DB is None: + self.DESTINATION_DB = self.SOURCE_DB + if self.SOURCE_WRITER_ENDPOINT != self.DESTINATION_WRITER_ENDPOINT: + self.AUTO_SWAP = False + if self.OPERATION_CLASS == BaseOperation: + self.OPERATION_CLASS = CrossClusterBaseOperation + if self.SOURCE_DB != self.DESTINATION_DB: + self.AUTO_SWAP = False + + self.SOURCE_CLUSTER_ID = get_cluster_id(self.SOURCE_WRITER_ENDPOINT, self.SOURCE_CLUSTER_ID) + self.DESTINATION_CLUSTER_ID = get_cluster_id(self.DESTINATION_WRITER_ENDPOINT, self.DESTINATION_CLUSTER_ID) + + if self.INDEXES: + self.INDEXES = [IndexConfig(**index) for index in self.INDEXES] + + if self.INIT_BINLOG_FILE is not None and self.INIT_BINLOG_POSITION is None: + raise ValueError('INIT_BINLOG_POSITION is required when INIT_BINLOG_FILE is set') + + if self.SKIP_BULK_IMPORT and self.INIT_BINLOG_FILE is None: + raise ValueError('INIT_BINLOG_FILE is required when SKIP_BULK_IMPORT is True') diff --git a/src/config/env.py b/src/config/env.py new file mode 100644 index 0000000..b5ee5d4 --- /dev/null +++ b/src/config/env.py @@ -0,0 +1,33 @@ +import os + +from utils import from_string + + +class Env: + """ + Defines the environment variables used in SB-OSC. + List of environment variables are defined in the class annotations. + Default values set in the annotations are used if the environment variable is not set. 
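For reference, a minimal sketch of the config.yaml shape Config.__init__ accepts; keys are matched case-insensitively because each one is upper-cased before setattr, and every value below is illustrative:

    # Sketch of how a config.yaml maps onto Config attributes (values illustrative).
    import yaml

    raw = yaml.safe_load("""
    source_writer_endpoint: writer.my-cluster.example.internal
    source_reader_endpoint: reader.my-cluster.example.internal
    source_db: myapp
    source_table: big_table
    destination_table: big_table_new
    min_chunk_size: 200000
    max_chunk_count: 100
    auto_swap: false
    indexes:                      # each entry becomes an IndexConfig(name, columns, unique)
      - name: idx_user_id
        columns: user_id
      - name: uq_external_id
        columns: external_id
        unique: true
    """)

    # Config.__init__ effectively does: for key, value in config.items(): setattr(self, key.upper(), value)
    for key, value in raw.items():
        print(f"{key.upper()} = {value!r}")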
+ """ + AWS_REGION: str = 'ap-northeast-2' + POD_NAME: str = 'local' # POD_NAME = 'local' will determine whether it's running in a local environment or not. + CONFIG_FILE: str = '/opt/sb-osc/config.yaml' + SECRET_FILE: str = '/opt/sb-osc/secrets.json' + + def __init__(self, **envs): + """ + Sets the environment variables defined in the class annotations. + If environment variables are not set, default values are used. + :param envs: Environment variables to override. + """ + for env in self.__annotations__: + default_value = getattr(self, env) if hasattr(self, env) else None + override_value = envs.get(env) + if override_value is None: + # convert string to its original type + setattr(self, env, from_string(os.getenv(env, str(default_value)))) + else: + setattr(self, env, override_value) + + def __repr__(self): + return str(self.__dict__) diff --git a/src/config/secret.py b/src/config/secret.py new file mode 100644 index 0000000..d5d5f14 --- /dev/null +++ b/src/config/secret.py @@ -0,0 +1,38 @@ +import json +import os + +from config import Env +from utils import from_string + + +class Secret: + """ + Loads secrets from secret file mounted by ExternalSecrets as attributes. + Default values are used for local environment. + """ + USERNAME: str = 'root' + PASSWORD: str = '' + PORT: int = 3306 + REDIS_HOST: str = '' + REDIS_PASSWORD: str = '' + SLACK_CHANNEL: str = '' + SLACK_TOKEN: str = '' + + def __init__(self, **secrets): + env = Env() + if os.path.exists(env.SECRET_FILE): + with open(env.SECRET_FILE, 'r') as f: + secrets = json.load(f) + if secrets: + for key, value in secrets.items(): + setattr(self, key.upper(), from_string(value)) + + if secrets: + for key, value in secrets.items(): + setattr(self, key.upper(), from_string(value)) + + def __repr__(self): + repr_dict = self.__annotations__.copy() + for key in repr_dict: + repr_dict[key] = '********' + return str(repr_dict) diff --git a/src/modules/__init__.py b/src/modules/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/modules/aws.py b/src/modules/aws.py new file mode 100644 index 0000000..8483a6f --- /dev/null +++ b/src/modules/aws.py @@ -0,0 +1,54 @@ +from datetime import datetime +from typing import Union, Dict, List + +import boto3 + +from config import env + + +class CloudWatch: + def __init__(self): + self.client = boto3.client('cloudwatch', region_name=env.AWS_REGION) + + def get_rds_instance_metrics( + self, + metric_name, + instance_id: str, + start_time: Union[datetime, str], + end_time: Union[datetime, str], + statistics, + unit, + ) -> Dict: + """ + :param metric_name: name of the metric + :param instance_id: instance id + :param start_time: start datetime you want to get (ex: '2021-09-10 03:57:45.117255') + :param end_time: end datetime you want to get (ex: '2021-09-10 04:27:45.117255') + :param statistics: 'SampleCount'|'Average'|'Sum'|'Minimum'|'Maximum' + :param unit: unit of data + """ + return self.client.get_metric_statistics( + Namespace='AWS/RDS', + MetricName=metric_name, + Dimensions=[ + { + 'Name': 'DBInstanceIdentifier', + 'Value': instance_id + }, + ], + StartTime=str(start_time), + EndTime=str(end_time), + Period=60, # seconds + Statistics=[statistics], + Unit=unit + ) + + def get_instance_cpu_usages( + self, + instance_id: str, + start_time: Union[datetime, str], + end_time: Union[datetime, str], + statistics + ) -> List[Dict]: + return self.get_rds_instance_metrics( + 'CPUUtilization', instance_id, start_time, end_time, statistics, 'Percent')['Datapoints'] diff --git 
a/src/modules/db.py b/src/modules/db.py new file mode 100644 index 0000000..3c1c29c --- /dev/null +++ b/src/modules/db.py @@ -0,0 +1,120 @@ +from contextlib import contextmanager, ExitStack +from queue import Queue +from typing import Literal + +import MySQLdb +from MySQLdb import OperationalError + +from config import config, secret + + +class Database: + def __init__(self): + self.connections = { + 'source': { + 'writer': Connection(config.SOURCE_WRITER_ENDPOINT), + 'reader': Connection(config.SOURCE_READER_ENDPOINT) + }, + 'dest': { + 'writer': Connection(config.DESTINATION_WRITER_ENDPOINT), + 'reader': Connection(config.DESTINATION_READER_ENDPOINT) + } + } + + def __del__(self): + self.close() + + def cursor( + self, cursorclass=None, + host: Literal['source', 'dest'] = 'source', + role: Literal['writer', 'reader'] = 'writer' + ): + return self.connections[host][role].cursor(cursorclass) + + @staticmethod + def get_reader_connection_pool(maxsize: int, host: Literal['source', 'dest'] = 'source'): + endpoint = config.SOURCE_READER_ENDPOINT if host == 'source' else config.DESTINATION_READER_ENDPOINT + return ConnectionPool(endpoint, maxsize) + + def get_instance_id(self, host: Literal['source', 'dest'] = 'source', role: Literal['writer', 'reader'] = 'writer'): + with self.cursor(host=host, role=role) as cursor: + cursor.execute("SELECT @@aurora_server_id;") + return cursor.fetchone()[0] + + def close(self): + for host in self.connections: + for role in self.connections[host]: + self.connections[host][role].close() + + +class Connection: + def __init__(self, endpoint: str): + self.endpoint = endpoint + self._conn = None + + def connect(self): + return MySQLdb.connect( + host=self.endpoint, + user=secret.USERNAME, + password=secret.PASSWORD, + port=secret.PORT, + ) + + def cursor(self, cursorclass=None): + if not self._conn: + self._conn = self.connect() + try: + self._conn.ping() + except OperationalError: + self._conn = self.connect() + cursor: cursorclass = self._conn.cursor(cursorclass) + return cursor + + def close(self): + if self._conn: + self._conn.close() + + +class ConnectionPool: + def __init__(self, endpoint, maxsize=30, prefill=False): + self.endpoint = endpoint + self.free_connections = Queue(maxsize=maxsize) + self.size = 0 + self.maxsize = maxsize + if prefill: + for _ in range(maxsize): + self.free_connections.put(Connection(self.endpoint)) + self.size = maxsize + + @contextmanager + def get_connection(self): + if self.free_connections.qsize() > 0: + conn = self.free_connections.get() + else: + if self.size >= self.maxsize: + raise Exception("Connection pool full") + conn = Connection(self.endpoint) + self.size += 1 + + yield conn + + try: + conn.ping() + except OperationalError: + conn = Connection(self.endpoint) + if self.free_connections.full(): + raise Exception("Connection pool full") + else: + self.free_connections.put(conn) + + @contextmanager + def get_connections(self, count): + with ExitStack() as stack: + connections = [stack.enter_context(self.get_connection()) for _ in range(count)] + yield connections + + def close(self): + while not self.free_connections.empty(): + conn = self.free_connections.get() + conn.close() + self.size -= 1 diff --git a/src/modules/logger.py b/src/modules/logger.py new file mode 100644 index 0000000..dfee8ac --- /dev/null +++ b/src/modules/logger.py @@ -0,0 +1,51 @@ +import logging +import sys +from datetime import datetime + +from utils import to_string + + +class LogfmtFormatter(logging.Formatter): + """ + 
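A usage sketch for the Database wrapper above, assuming the configured endpoints are reachable Aurora instances with the credentials from Secret:

    # Usage sketch (assumes config/secrets point at reachable Aurora MySQL endpoints).
    from modules.db import Database

    db = Database()

    # Ad-hoc query on the source writer; cursors are MySQLdb cursors and work as context managers.
    with db.cursor(host='source', role='writer') as cursor:
        cursor.execute("SELECT NOW()")
        print(cursor.fetchone()[0])

    # Aurora instance id of the destination reader (uses @@aurora_server_id).
    print(db.get_instance_id(host='dest', role='reader'))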
https://docs.python.org/3/library/logging.html#logrecord-attributes + """ + + def __init__(self, extra=None, *args, **kwargs, ): + super().__init__(*args, **kwargs) + self.extra = extra + + def format(self, record) -> str: + attributes = { + 'asctime': datetime.fromtimestamp(record.created).isoformat(' '), + 'levelname': record.levelname, + 'msg': record.msg.replace('"', '\\"').replace('\n', '\\n'), + } + if self.extra: + attributes.update(self.extra) + for attribute in attributes: + if attribute not in ['levelname']: + attributes[attribute] = f'"{attributes[attribute]}"' + logfmt = [ + f'{key}={to_string(value)}' + for key, value in attributes.items() + if key is not None + ] + return ' '.join(logfmt) + + +def get_logger(default_tags=None): + logger = logging.getLogger(__name__) + logger.setLevel(logging.INFO) + + if logger.hasHandlers() and default_tags: + for handler in logger.handlers: + handler.setFormatter(LogfmtFormatter(extra=default_tags)) + + if not logger.hasHandlers(): + handler = logging.StreamHandler(stream=sys.stdout) + formatter = LogfmtFormatter(extra=default_tags) + handler.setFormatter(formatter) + logger.addHandler(handler) + + return logger + diff --git a/src/modules/redis/__init__.py b/src/modules/redis/__init__.py new file mode 100644 index 0000000..84c8867 --- /dev/null +++ b/src/modules/redis/__init__.py @@ -0,0 +1,3 @@ +from modules.redis.interface import RedisData + +RedisData = RedisData diff --git a/src/modules/redis/connect.py b/src/modules/redis/connect.py new file mode 100644 index 0000000..fbc66be --- /dev/null +++ b/src/modules/redis/connect.py @@ -0,0 +1,22 @@ +from typing import Literal + +import redis +from redis.backoff import ExponentialBackoff +from redis.retry import Retry + +from config import secret + + +def get_redis_connection(decode_responses: Literal[True, False] = True): + conn_args = { + 'port': 6379, + 'decode_responses': decode_responses, + 'retry': Retry(ExponentialBackoff(10, 1), 3), + 'retry_on_error': [ConnectionError, TimeoutError, ConnectionRefusedError], + } + conn = redis.Redis( + host=secret.REDIS_HOST, + password=secret.REDIS_PASSWORD, + **conn_args + ) + return conn diff --git a/src/modules/redis/data_types.py b/src/modules/redis/data_types.py new file mode 100644 index 0000000..9451d41 --- /dev/null +++ b/src/modules/redis/data_types.py @@ -0,0 +1,153 @@ +from typing import Literal, Collection + +import redis + +from utils import from_string, to_string +from modules.redis.connect import get_redis_connection + + +class Hash: + def __init__(self, name, conn: redis.Redis = None): + self.name = name + self.conn = conn or get_redis_connection() + + data = self.conn.hgetall(self.name) + for key in self.__annotations__: + super().__setattr__(key, None) + for key, value in data.items(): + super().__setattr__(key, from_string(value)) + + def __setattr__(self, key, value): + if hasattr(self, 'conn'): + # only set values to redis after self.conn init + self.conn.hset(self.name, key, to_string(value)) + super().__setattr__(key, value) + + def set(self, data): + for key, value in data.items(): + setattr(self, key, value) + + @property + def data(self): + return { + key: value for key, value in self.__dict__.items() + if key in self.__annotations__ + } + + def delete(self): + self.conn.delete(self.name) + + +class Set: + def __init__(self, name, conn: redis.Redis = None): + self.name = name + self.conn = conn or get_redis_connection() + + def __len__(self): + return self.conn.scard(self.name) + + def __iter__(self): + return 
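A usage sketch for the Hash wrapper, assuming a reachable Redis configured in Secret; JobInfo is a hypothetical subclass in the style of the schema classes defined later in this patch:

    # Fields are declared as class annotations; attribute writes are mirrored to a Redis hash
    # with HSET, and reads come from the values loaded by __init__ via HGETALL.
    from modules.redis.data_types import Hash

    class JobInfo(Hash):                 # hypothetical subclass for illustration
        status: str
        retries: int

    job = JobInfo('sb-osc:example:job_info')
    job.set({'status': 'in_progress', 'retries': 0})
    job.retries = 1                      # HSET sb-osc:example:job_info retries 1
    print(job.data)                      # -> {'status': 'in_progress', 'retries': 1}

    fresh = JobInfo('sb-osc:example:job_info')
    print(fresh.retries)                 # values round-trip through to_string/from_string
    job.delete()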
self.conn.sscan_iter(self.name) + + def __contains__(self, value): + return self.conn.sismember(self.name, value) + + def add(self, *values): + if not values: + return + self.conn.sadd(self.name, *values) + + def remove(self, *values): + if not values: + return + self.conn.srem(self.name, *values) + + def get(self, count=None): + return self.conn.spop(self.name, count=count) + + def getall(self): + return self.conn.smembers(self.name) + + def delete(self): + self.conn.delete(self.name) + + +class SortedSet: + def __init__(self, name, conn: redis.Redis = None): + self.name = name + self.conn = conn or get_redis_connection() + + def __len__(self): + return self.conn.zcard(self.name) + + def __contains__(self, value): + return self.conn.zscore(self.name, value) is not None + + def add(self, values: Collection): + if not values: + return + if not isinstance(values, dict): + values = {value: value for value in values} + self.conn.zadd(self.name, values) + + def remove(self, values): + if not values: + return + self.conn.zrem(self.name, *values) + + def get(self, count=None, minmax: Literal['min', 'max'] = 'min'): + if minmax == 'min': + items = self.conn.zpopmin(self.name, count) + elif minmax == 'max': + items = self.conn.zpopmax(self.name, count) + else: + raise ValueError(f'Invalid minmax: {minmax}') + return [item[0] for item in items] + + def delete(self): + self.conn.delete(self.name) + + +class List: + def __init__(self, name, conn: redis.Redis = None): + self.name = name + self.conn = conn or get_redis_connection() + + def __len__(self): + return self.conn.llen(self.name) + + def __getitem__(self, index): + return self.conn.lindex(self.name, index) + + def __setitem__(self, index, value): + self.conn.lset(self.name, index, value) + + def __iter__(self): + for i in range(len(self)): + yield self[i] + + def append(self, value): + self.conn.rpush(self.name, value) + + def delete(self): + self.conn.delete(self.name) + + +class Queue(List): + def push(self, *values): + if not values: + return + self.conn.rpush(self.name, *values) + + def pop(self, count=None): + return self.conn.lpop(self.name, count=count) + + +class Stack(List): + def push(self, *values): + if not values: + return + self.conn.rpush(self.name, *values) + + def pop(self, count=None): + return self.conn.rpop(self.name, count=count) diff --git a/src/modules/redis/interface.py b/src/modules/redis/interface.py new file mode 100644 index 0000000..c7fddd8 --- /dev/null +++ b/src/modules/redis/interface.py @@ -0,0 +1,108 @@ +from datetime import datetime +from typing import Self + +from config import config +from modules.redis.schema import RedisKey, Metadata, WorkerConfig, WorkerMetric, ChunkInfo +from modules.redis.connect import get_redis_connection +from modules.redis.data_types import Set, Stack, SortedSet + + +# functools.lru_cache is not used because it does not support expiration based on time +class ExpiringData: + def __init__(self, data, expire_time=60): + self.data = data + self.expire_time = expire_time + self.fetched_at = datetime.now() + + @property + def is_expired(self): + return (datetime.now() - self.fetched_at).seconds > self.expire_time + + +class RedisData: + def __init__(self, migration_id, use_cached_property=True): + self.conn = get_redis_connection() + self.cluster_id = config.SOURCE_CLUSTER_ID + self.migration_id = migration_id + self.use_cached_property = use_cached_property + + self.updated_pk_set: SortedSet = SortedSet(self.get_key(RedisKey.UPDATED_PK_SET), self.conn) + self.removed_pk_set: SortedSet = 
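A short sketch of the Queue and Stack semantics, assuming a reachable Redis: both push with RPUSH, but Queue pops from the left (FIFO) while Stack pops from the right (LIFO), which is what the bulk-import chunk stack relies on:

    from modules.redis.data_types import Queue, Stack

    queue = Queue('sb-osc:example:queue')
    queue.push('a', 'b', 'c')
    print(queue.pop())      # -> 'a'  (FIFO)

    stack = Stack('sb-osc:example:stack')
    stack.push('a', 'b', 'c')
    print(stack.pop())      # -> 'c'  (LIFO)

    queue.delete()
    stack.delete()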
SortedSet(self.get_key(RedisKey.REMOVED_PK_SET), self.conn) + self.chunk_stack: Stack = Stack(self.get_key(RedisKey.CHUNK_STACK), self.conn) + self.chunk_set: Set = Set(self.get_key(RedisKey.CHUNK_SET), self.conn) + + @staticmethod + def cached_property(expire_time=60): + def decorator(func): + @property + def wrapper(self: Self): + if not self.use_cached_property: + return func(self) + cached_data = getattr(self, f'_{func.__name__}', None) + if cached_data is None or cached_data.is_expired: + cached_data = ExpiringData(func(self), expire_time) + setattr(self, f'_{func.__name__}', cached_data) + return cached_data.data + + return wrapper + + return decorator + + def get_key(self, fstring, *args): + return fstring.format(self.cluster_id, self.migration_id, *args) + + @property + def current_stage(self): + return self.conn.get(self.get_key(RedisKey.CURRENT_STAGE)) + + def set_current_stage(self, stage): + self.conn.set(self.get_key(RedisKey.CURRENT_STAGE), stage) + + @property + def last_catchup_timestamp(self): + last_catchup_timestamp = self.conn.get(self.get_key(RedisKey.LAST_CATCHUP_TIMESTAMP)) + last_catchup_timestamp = float(last_catchup_timestamp) if last_catchup_timestamp else 1 + return last_catchup_timestamp + + def set_last_catchup_timestamp(self, status): + self.conn.set(self.get_key(RedisKey.LAST_CATCHUP_TIMESTAMP), status) + + @cached_property(60) + def metadata(self) -> Metadata: + return Metadata(self.get_key(RedisKey.METADATA)) + + @property + def worker_config(self) -> WorkerConfig: + return WorkerConfig(self.get_key(RedisKey.WORKER_CONFIG)) + + @property + def worker_metric(self) -> WorkerMetric: + return WorkerMetric(self.get_key(RedisKey.WORKER_METRIC, env.POD_NAME)) + + def get_all_worker_metrics(self): + worker_metric_keys = self.conn.keys(self.get_key(RedisKey.WORKER_METRIC, '*')) + return [WorkerMetric(key) for key in worker_metric_keys] + + def push_chunk(self, *chunk_ids): + self.chunk_set.add(*chunk_ids) + self.chunk_stack.push(*chunk_ids) + + def get_chunk_info(self, chunk_id) -> ChunkInfo: + return ChunkInfo(self.get_key(RedisKey.CHUNK_INFO, chunk_id)) + + def remove_all_chunks(self): + chunk_ids = self.chunk_set.getall() + for chunk_id in chunk_ids: + self.conn.delete(self.get_key(RedisKey.CHUNK_INFO, chunk_id)) + self.chunk_set.delete() + self.chunk_stack.delete() + + @property + def old_source_table(self): + return self.conn.get(self.get_key(RedisKey.OLD_SOURCE_TABLE)) + + def set_old_source_table(self, table_name): + if table_name is None: + self.conn.delete(self.get_key(RedisKey.OLD_SOURCE_TABLE)) + else: + self.conn.set(self.get_key(RedisKey.OLD_SOURCE_TABLE), table_name) diff --git a/src/modules/redis/schema.py b/src/modules/redis/schema.py new file mode 100644 index 0000000..e8dac62 --- /dev/null +++ b/src/modules/redis/schema.py @@ -0,0 +1,47 @@ +from datetime import datetime + +from modules.redis.data_types import Hash + + +class ChunkInfo(Hash): + start_pk: int + end_pk: int + status: str + last_pk_inserted: int + + +class WorkerConfig(Hash): + batch_size: int + thread_count: int + commit_interval: int + revision: int + + +# Metrics managed by WorkerManager +class WorkerMetric(Hash): + average_insert_rate: float + + +class Metadata(Hash): + source_db: str + source_table: str + destination_db: str + destination_table: str + source_columns: str + max_id: int + start_datetime: datetime + + +class RedisKey: + METADATA = 'sb-osc:{}:{}:metadata' + CURRENT_STAGE = 'sb-osc:{}:{}:current_stage' + LAST_CATCHUP_TIMESTAMP = 'sb-osc:{}:{}:last_catchup_timestamp' + 
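A usage sketch for RedisData, assuming a reachable Redis and an already-initialized migration with id 1 (the id and PK values are illustrative):

    from modules.redis import RedisData

    redis_data = RedisData(migration_id=1)

    meta = redis_data.metadata                      # Hash-backed; cached for 60 s per instance
    print(meta.source_table, meta.max_id)

    redis_data.updated_pk_set.add([10, 11, 12])     # SortedSet: scores default to the values
    print(len(redis_data.updated_pk_set))           # -> 3
    batch = redis_data.updated_pk_set.get(count=2, minmax='min')   # pops the lowest PKs first
    print(batch)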
UPDATED_PK_SET = 'sb-osc:{}:{}:updated_pk_set' + REMOVED_PK_SET = 'sb-osc:{}:{}:removed_pk_set' + WORKER_CONFIG = 'sb-osc:{}:{}:worker_config' + WORKER_METRIC = 'sb-osc:{}:{}:worker_metrics:{}' + CHUNK_STACK = 'sb-osc:{}:{}:bulk_import:chunk_stack' + CHUNK_SET = 'sb-osc:{}:{}:bulk_import:chunk_set' + CHUNK_INFO = 'sb-osc:{}:{}:bulk_import:chunk_info:{}' + OLD_SOURCE_TABLE = 'sb-osc:{}:{}:old_source_table' + diff --git a/src/modules/slack.py b/src/modules/slack.py new file mode 100644 index 0000000..f87a1f8 --- /dev/null +++ b/src/modules/slack.py @@ -0,0 +1,169 @@ +import json +from dataclasses import dataclass +from typing import Literal, TypedDict, List, Union, Optional +import requests + +from config import secret +from modules.logger import get_logger + +AttachmentColorType = Literal['info', 'good', 'warning', 'danger'] +AttachmentFieldType = TypedDict( + 'AttachmentFieldType', { + 'title': str, # optional + 'value': str, + 'short': bool + }, + total=False, +) +AttachmentType = TypedDict( + 'AttachmentType', { + 'title': Optional[str], # optional + 'text': str, # optional + 'fallback': str, + 'pretext': str, + 'fields': List[AttachmentFieldType], # optional + 'color': AttachmentColorType + }, + total=False, +) + +logger = get_logger() + + +@dataclass +class SlackMessage: + channel: str + ts: str + text: str + user: str + + +class SlackClient: + def __init__(self, title, identifier): + self.token = secret.SLACK_TOKEN + self.channel = secret.SLACK_CHANNEL + + # default message configs + self.title = title + self.identifier = identifier + self.color = 'info' + + # attachment colors + self.colors = { + 'info': '#0394fc', + 'good': 'good', + 'warning': 'warning', + 'danger': 'danger' + } + + @staticmethod + def _handle_response(response): + if response.status_code == 200: + result = response.json() + is_success = result.get('ok') + + if not is_success: + error = result.get('error') + raise Exception(f'Fail to send slack message, error: {error}') + else: + return result + else: + raise Exception(f'Fail to send slack message, error: {response.text}') + + def _post_message(self, attachments: str): + """ + :param attachments: 'attachment' payload of Slack post message api + """ + slack_request_url = 'https://slack.com/api/chat.postMessage' + headers = {'content-type': 'application/x-www-form-urlencoded'} + data = [ + ('token', self.token), + ('channel', self.channel), + ('as_user', 'true'), + ('attachments', attachments), + ] + response = requests.post(slack_request_url, data=data, headers=headers) + self._handle_response(response) + + def create_attachments( + self, + fields: List[AttachmentFieldType], + color: AttachmentColorType = 'danger', + fallback=None, + pretext=None, + ): + """ + :param pretext: Text before attachment + :param fields: Attachment fields + :param color: Color of attachments + :param fallback: Summary of message. Will be used in notifications etc. 
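For reference, a sketch of the key names these templates produce once RedisData.get_key fills in the cluster id, migration id and any extra arguments (ids illustrative):

    CHUNK_INFO = 'sb-osc:{}:{}:bulk_import:chunk_info:{}'
    WORKER_METRIC = 'sb-osc:{}:{}:worker_metrics:{}'

    cluster_id, migration_id = 'my-cluster', 1
    print(CHUNK_INFO.format(cluster_id, migration_id, '1-7'))
    # -> sb-osc:my-cluster:1:bulk_import:chunk_info:1-7
    print(WORKER_METRIC.format(cluster_id, migration_id, '*'))   # pattern used with KEYS to list all workers
    # -> sb-osc:my-cluster:1:worker_metrics:*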
+ :return: + """ + attachment: AttachmentType = {'color': self.colors[color]} + if len(fields) > 1: + attachment['fields'] = fields + else: + message = fields[0] + attachment['title'] = message.get('title') + attachment['text'] = message['value'] + if fallback: + attachment['fallback'] = fallback + attachment['pretext'] = fallback + if pretext is not None: + attachment['pretext'] = pretext + return json.dumps([attachment]) + + @staticmethod + def _create_fields(message, subtitle): + if not message: + raise ValueError('messages is empty') + # set fields + else: + if isinstance(message[0], dict): + fields = message + elif isinstance(message, str): + fields = [{'title': subtitle, 'value': message}] if subtitle else [{'value': message}] + else: + fields = [{'value': message} for message in message] + return fields + + def send_message( + self, + message: Union[str, List[str], List[AttachmentFieldType]], + title: str = None, + subtitle: Optional[str] = None, + summary: Optional[str] = None, + color: AttachmentColorType = None, + ): + """ + :param message: Message to send. Can be either a string, a list of strings or a list of attachment fields + If message is a string, it can be used with title to create an attachment field. + If a list of strings is provided, each string will be used as a value of an attachment field. + If a list of attachment fields is provided, it will be used as is + :param title: Title of the message. Will be used in pretext & fallback + :param subtitle: Title of the message. Will be used in attachment field if message is a string + :param summary: Extra summary of the message. Will be used in fallback + :param color: Color of the message + """ + fields = self._create_fields(message, subtitle) + summary = subtitle or summary + + # set pretext + header = f'*{title or self.title} ({self.identifier})*' + + # set fallback + fallback = header + if summary: + fallback = f'{header} - {summary}' + + attachments = self.create_attachments( + fields=fields, + color=color or self.color, + fallback=fallback, + pretext=header, + ) + + logger.info(f'Slack message: {attachments}') + + if self.token and self.channel: + self._post_message(attachments) diff --git a/src/sbosc/__init__.py b/src/sbosc/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/sbosc/component.py b/src/sbosc/component.py new file mode 100644 index 0000000..48a93ee --- /dev/null +++ b/src/sbosc/component.py @@ -0,0 +1,78 @@ +import signal +import time + +import MySQLdb +from MySQLdb.cursors import Cursor + +from config import config +from modules.db import Database +from modules.redis import RedisData +from modules.logger import get_logger + + +class SBOSCComponent: + def __init__(self): + self.db: Database = Database() + + self.migration_id = self.get_migration_id() + if self.migration_id is None: + print("Migration ID not found. Exiting.") + time.sleep(60) + return + self.redis_data: RedisData = RedisData(self.migration_id) + if self.redis_data.current_stage is None: + print("Current stage not set. 
Exiting.") + time.sleep(60) + return + self.logger = get_logger({ + "dbclusteridentifier": config.SOURCE_CLUSTER_ID, + "migration_id": self.migration_id + }) + + self.stop_flag = False + + def set_stop_flag(self): + self.stop_flag = True + + def is_preferred_window(self): + pass + + def get_migration_id(self): + if not config.SOURCE_TABLE or not config.DESTINATION_TABLE: + raise Exception("Migration not configured") + + with self.db.cursor() as cursor: + cursor: Cursor + try: + cursor.execute(''' + SELECT id FROM sbosc.migration_plan + WHERE source_cluster_id = %s AND source_db = %s AND source_table = %s + AND destination_cluster_id = %s AND destination_db = %s AND destination_table = %s + ''', ( + config.SOURCE_CLUSTER_ID, + config.SOURCE_DB, + config.SOURCE_TABLE, + config.DESTINATION_CLUSTER_ID, + config.DESTINATION_DB, + config.DESTINATION_TABLE + )) + return cursor.fetchone()[0] if cursor.rowcount > 0 else None + + except MySQLdb.ProgrammingError as e: + if e.args[0] == 1146: # database or table doesn't exist + print(f"Table not found. {e.args[1]}") + return None + raise e + + +def start_component(component_class): + component = component_class() + + def signal_handler(sig, frame): + component.set_stop_flag() + + # Handle SIGINT and SIGTERM + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTERM, signal_handler) + + component.start() diff --git a/src/sbosc/const.py b/src/sbosc/const.py new file mode 100644 index 0000000..727850f --- /dev/null +++ b/src/sbosc/const.py @@ -0,0 +1,30 @@ +class Stage: + START_EVENT_HANDLER = '01_start_event_handler' + BULK_IMPORT_CHUNK_CREATION = '02_bulk_import_chunk_creation' + BULK_IMPORT = '03_bulk_import' + BULK_IMPORT_VALIDATION = '04_bulk_import_validation' + BULK_IMPORT_VALIDATION_FAILED = '04_1_bulk_import_validation_failed' + APPLY_DML_EVENTS = '05_apply_dml_events' + APPLY_DML_EVENTS_PRE_VALIDATION = '06_apply_dml_events_pre_validation' + ADD_INDEX = '06_1_add_index' + APPLY_DML_EVENTS_VALIDATION = '07_apply_dml_events_validation' + SWAP_TABLES = '08_swap_tables' + SWAP_TABLES_FAILED = '08_1_swap_tables_failed' + DONE = '09_done' + + +class ChunkStatus: + NOT_STARTED = 'not_started' + IN_PROGRESS = 'in_progress' + DUPLICATE_KEY = 'duplicate_key' + DONE = 'done' + + +class WorkerStatus: + IDLE = 'idle' + BUSY = 'busy' + + +class UnmatchType: + NOT_UPDATED = 'not_updated' + NOT_REMOVED = 'not_removed' diff --git a/src/sbosc/controller/__init__.py b/src/sbosc/controller/__init__.py new file mode 100644 index 0000000..1b0844d --- /dev/null +++ b/src/sbosc/controller/__init__.py @@ -0,0 +1,3 @@ +from sbosc.controller.controller import Controller + +Controller = Controller diff --git a/src/sbosc/controller/controller.py b/src/sbosc/controller/controller.py new file mode 100644 index 0000000..9391a10 --- /dev/null +++ b/src/sbosc/controller/controller.py @@ -0,0 +1,376 @@ +import time +import concurrent.futures + +from datetime import datetime + +from MySQLdb.cursors import Cursor + +from config import config +from modules.slack import SlackClient +from sbosc.component import SBOSCComponent +from sbosc.const import Stage, ChunkStatus +from sbosc.controller.initializer import Initializer +from sbosc.controller.validator import DataValidator +from sbosc.exceptions import StopFlagSet + + +class Controller(SBOSCComponent): + def __init__(self): + self.initializer = Initializer() + super().__init__() + self.slack = SlackClient('SB-OSC Controller', f'{config.SOURCE_CLUSTER_ID}, {self.migration_id}') + self.validator: DataValidator = 
DataValidator(self) + + self.interval = 60 + + def get_migration_id(self): + migration_id = super().get_migration_id() + if migration_id is None: + migration_id = self.initializer.init_migration() + return migration_id + + def set_stop_flag(self): + self.logger.info("Stopping controller master...") + self.stop_flag = True + self.validator.set_stop_flag() + + def start(self): + self.logger.info("Starting controller master") + + stage_actions = { + Stage.BULK_IMPORT_CHUNK_CREATION: self.create_bulk_import_chunks, + Stage.BULK_IMPORT_VALIDATION: self.validate_bulk_import, + Stage.APPLY_DML_EVENTS: self.apply_dml_events, + Stage.ADD_INDEX: self.add_index, + Stage.APPLY_DML_EVENTS_VALIDATION: self.apply_dml_events_validation, + Stage.SWAP_TABLES: self.swap_tables, + } + + while not self.stop_flag: + current_stage = self.redis_data.current_stage + self.logger.info(f"Current stage: {current_stage}") + action = stage_actions.get(current_stage) + if action: + action() + + # TODO: Add Redis data validation if needed + time.sleep(self.interval) + + # Close db connection + self.logger.info("Controller master stopped") + + def create_bulk_import_chunks(self): + # Remove old chunks + self.redis_data.remove_all_chunks() + + metadata = self.redis_data.metadata + max_id = metadata.max_id + + # chunk_count is determined by min_chunk_size and max_chunk_count + # Each chunk will have min_chunk_size rows and the number of chunks should not exceed max_chunk_count + min_chunk_size = config.MIN_CHUNK_SIZE + max_chunk_count = config.MAX_CHUNK_COUNT # Number of chunks means max number of worker threads + chunk_count = min(max_id // min_chunk_size, max_chunk_count) + chunk_size = max_id // chunk_count + + # Create chunks + # Each chunk will have a range of primary key values [start_pk, end_pk] + chunks = [] + for i in range(chunk_count): + start_pk = i * chunk_size + 1 + end_pk = (i + 1) * chunk_size + if i == chunk_count - 1: + end_pk = max_id + + chunk_id = f"{self.migration_id}-{i}" + chunk_info = self.redis_data.get_chunk_info(chunk_id) + chunk_info.set({ + 'start_pk': start_pk, + 'end_pk': end_pk, + 'status': ChunkStatus.NOT_STARTED, + }) + chunks.append((self.migration_id, chunk_id, start_pk, end_pk)) + self.redis_data.push_chunk(chunk_id) + + self.logger.info("Bulk import chunks created") + + # Set initial worker config + self.redis_data.worker_config.set({ + 'batch_size': config.MIN_BATCH_SIZE, + 'thread_count': config.MIN_THREAD_COUNT, + 'commit_interval': config.COMMIT_INTERVAL, + 'revision': 0, + }) + + # Save chunk info to database + with self.db.cursor() as cursor: + cursor: Cursor + cursor.executemany(''' + INSERT INTO sbosc.chunk_info (migration_id, chunk_id, start_pk, end_pk, created_at) + VALUES (%s, %s, %s, %s, NOW()) + ''', chunks) + + self.redis_data.set_current_stage(Stage.BULK_IMPORT) + self.slack.send_message( + subtitle="Bulk import started", + message=f"Max id: {max_id}\n" + f"Chunk count: {chunk_count}\n" + f"Chunk size: {chunk_size}\n" + f"Batch size: {config.MIN_BATCH_SIZE}\n" + f"Thread count: {config.MIN_THREAD_COUNT}\n" + f"Commit interval: {config.COMMIT_INTERVAL}" + ) + + def validate_bulk_import(self): + # Check if all chunks are completed + incomplete_chunks = [] + + # Restore missing chunk from database + self.logger.info("Restoring missing chunks from database") + with self.db.cursor() as cursor: + cursor: Cursor + cursor.execute(''' + SELECT chunk_id FROM sbosc.chunk_info + WHERE migration_id = %s + ''', [self.migration_id]) + for chunk_id, in cursor.fetchall(): + if chunk_id 
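A worked example of the chunk sizing logic above, with illustrative numbers:

    MIN_CHUNK_SIZE = 100_000
    MAX_CHUNK_COUNT = 200

    def plan_chunks(max_id: int):
        chunk_count = min(max_id // MIN_CHUNK_SIZE, MAX_CHUNK_COUNT)
        chunk_size = max_id // chunk_count
        ranges = []
        for i in range(chunk_count):
            start_pk = i * chunk_size + 1
            end_pk = max_id if i == chunk_count - 1 else (i + 1) * chunk_size
            ranges.append((start_pk, end_pk))
        return ranges

    ranges = plan_chunks(50_000_000)     # 50M rows -> capped at 200 chunks of ~250k rows each
    print(len(ranges), ranges[0], ranges[-1])
    # -> 200 (1, 250000) (49750001, 50000000)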
not in self.redis_data.chunk_set: + incomplete_chunks.append(chunk_id) + + # Check if all chunks are completed + self.logger.info("Checking if all chunks are completed") + for chunk_id in self.redis_data.chunk_set: + chunk_info = self.redis_data.get_chunk_info(chunk_id) + if chunk_info.status != ChunkStatus.DONE or chunk_info.last_pk_inserted != chunk_info.end_pk: + incomplete_chunks.append(chunk_id) + + if len(incomplete_chunks) > 0: + self.logger.warning(f"Found incomplete chunks: {incomplete_chunks}") + self.redis_data.push_chunk(*incomplete_chunks) + self.redis_data.set_current_stage(Stage.BULK_IMPORT) + else: + # Analyze destination table + metadata = self.redis_data.metadata + with self.db.cursor(host='dest') as cursor: + cursor: Cursor + cursor.execute(f"ANALYZE TABLE {metadata.destination_db}.{metadata.destination_table}") + self.logger.info("Finished ANALYZE TABLE on destination table") + while not self.is_preferred_window(): + if self.stop_flag: + return + self.logger.info("Waiting for least busy hour") + time.sleep(300) + self.slack.send_message("Start validating bulk import") + try: + is_valid = self.validator.bulk_import_validation() + if not is_valid: + self.redis_data.set_current_stage(Stage.BULK_IMPORT_VALIDATION_FAILED) + self.slack.send_message(message="Bulk import validation failed", color="danger") + else: + self.redis_data.set_current_stage(Stage.APPLY_DML_EVENTS) + self.slack.send_message(message="Bulk import validation succeeded", color="good") + except StopFlagSet: + return + + def apply_dml_events(self): + self.logger.info("Resetting worker config to minimum values") + revision = self.redis_data.worker_config.revision or 0 # If revision is None, set to 0 + self.redis_data.worker_config.set({ + 'batch_size': config.MIN_BATCH_SIZE, + 'thread_count': config.MIN_THREAD_COUNT, + 'commit_interval': config.COMMIT_INTERVAL, + 'revision': revision + 1 + }) + + def apply_dml_events_validation(self): + self.interval = 10 + + try: + is_valid = self.validator.apply_dml_events_validation() + + if is_valid: + # Analyze table + with self.db.cursor(host='dest') as cursor: + cursor: Cursor + metadata = self.redis_data.metadata + cursor.execute(f"ANALYZE TABLE {metadata.destination_db}.{metadata.destination_table}") + self.logger.info("Finished ANALYZE TABLE on destination table") + + if not self.is_preferred_window(): + self.logger.info("Waiting for least busy hour") + time.sleep(300) + return + if not config.AUTO_SWAP: + self.logger.info("Auto swap is disabled") + time.sleep(60) + return + + is_valid = self.validator.full_dml_event_validation() + if is_valid is not None: # Validation did not skip + return + + self.redis_data.set_current_stage(Stage.SWAP_TABLES) + self.interval = 1 + except StopFlagSet: + return + + def add_index(self): + self.logger.info("Start creating indexes") + metadata = self.redis_data.metadata + + finished_all_creation = False + while not self.stop_flag: + finished_creation = False + with self.db.cursor() as cursor: + cursor: Cursor + + index_info = None + cursor.execute(''' + SELECT index_name FROM sbosc.index_creation_status + WHERE migration_id = %s AND ended_at IS NULL AND started_at IS NOT NULL + ''', (self.migration_id,)) + + if cursor.rowcount > 0: + index_names = [row[0] for row in cursor.fetchall()] + self.slack.send_message( + subtitle="Found unfinished index creation", message=f"Indexes: {index_names}", color="warning") + + while True: + if self.stop_flag: + return + cursor.execute(f''' + SELECT DISTINCT database_name, table_name, index_name FROM 
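is_preferred_window is left as a stub on SBOSCComponent in this patch; the following is only a hypothetical sketch of how a check against PREFERRED_WINDOW ('HH:MM-HH:MM') could look, not the project's actual implementation:

    # Hypothetical sketch only: compares the current local time against a 'HH:MM-HH:MM' window.
    from datetime import datetime

    def is_preferred_window(window: str = '00:00-23:59') -> bool:
        start_s, end_s = window.split('-')
        now = datetime.now().strftime('%H:%M')
        if start_s <= end_s:
            return start_s <= now <= end_s
        return now >= start_s or now <= end_s   # window wraps around midnight

    print(is_preferred_window('00:00-23:59'))   # -> True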
mysql.innodb_index_stats + WHERE database_name = %s AND table_name = %s + AND index_name IN ({','.join(['%s'] * len(index_names))}) + ''', [metadata.destination_db, metadata.destination_table] + index_names) + if cursor.rowcount == len(index_names): + finished_creation = True + break + self.logger.info("Waiting for index creation to finish") + time.sleep(60) + + else: + cursor.execute(f''' + SELECT index_name, index_columns, is_unique FROM sbosc.index_creation_status + WHERE migration_id = %s AND ended_at IS NULL LIMIT {config.INDEX_CREATED_PER_QUERY} + ''', (self.migration_id,)) + + if cursor.rowcount == 0: + finished_all_creation = True + break + + index_info = cursor.fetchall() + index_names = [index_name for index_name, *_ in index_info] + + if index_info and not finished_creation: + self.logger.info(f"Creating indexes {index_names}") + self.slack.send_message(subtitle="Start creating indexes", message=f"Indexes: {index_names}") + + # update ended_at + started_at = datetime.now() + with self.db.cursor() as cursor: + cursor: Cursor + cursor.executemany(''' + UPDATE sbosc.index_creation_status SET started_at = %s + WHERE migration_id = %s AND index_name = %s + ''', [(started_at, self.migration_id, index_name) for index_name in index_names]) + + # add index + with self.db.cursor(host='dest') as cursor: + cursor: Cursor + cursor.execute(f''' + ALTER TABLE {metadata.destination_db}.{metadata.destination_table} + {', '.join([ + f"ADD{' UNIQUE' if is_unique else ''} INDEX {index_name} ({index_columns})" + for index_name, index_columns, is_unique in index_info + ])} + ''') + + finished_creation = True + + if finished_creation: + # update ended_at + ended_at = datetime.now() + with self.db.cursor() as cursor: + cursor: Cursor + cursor.executemany(''' + UPDATE sbosc.index_creation_status SET ended_at = %s + WHERE migration_id = %s AND index_name = %s + ''', [(ended_at, self.migration_id, index_name) for index_name in index_names]) + + self.logger.info(f"Finished creating index {index_names}") + self.slack.send_message( + subtitle="Finished creating indexes", message=f"Indexes: {index_names}", color="good") + + if finished_all_creation: + self.redis_data.set_current_stage(Stage.APPLY_DML_EVENTS) + + def swap_tables(self): + updated_pk_set = self.redis_data.updated_pk_set + removed_pk_set = self.redis_data.removed_pk_set + + # Check if all updates are applied + if len(updated_pk_set) > 0 or len(removed_pk_set) > 0 or \ + time.time() - self.redis_data.last_catchup_timestamp > 1: + self.redis_data.set_current_stage(Stage.APPLY_DML_EVENTS_VALIDATION) + return + + self.slack.send_message("Start swapping tables") + # Swap tables + with self.db.cursor() as cursor: + metadata = self.redis_data.metadata + cursor.execute(''' + SELECT 1 FROM information_schema.TABLES + WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s + ''', (metadata.source_db, metadata.source_table)) + destination_table_valid = False + if cursor.rowcount > 0: + source_table = f"{metadata.source_db}.{metadata.source_table}" + destination_table = f"{metadata.destination_db}.{metadata.destination_table}" + self.redis_data.set_old_source_table( + f"_{metadata.source_table}_old_{datetime.now().strftime('%Y%m%d')}") + old_source_table = f"{metadata.source_db}.{self.redis_data.old_source_table}" + cursor.execute(f"RENAME TABLE {source_table} TO {old_source_table}") + after_rename_table_timestamp = time.time() + cursor.execute(f"SELECT MAX(id) FROM {old_source_table}") + final_max_id = cursor.fetchone()[0] + + with 
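A sketch of the ALTER TABLE statement the batch above generates for INDEX_CREATED_PER_QUERY index definitions (table and index names illustrative, in the IndexConfig shape from config.py):

    index_info = [
        ('idx_user_id', 'user_id', False),
        ('uq_external_id', 'external_id', True),
    ]
    statement = (
        "ALTER TABLE myapp.big_table_new "
        + ', '.join(
            f"ADD{' UNIQUE' if is_unique else ''} INDEX {name} ({columns})"
            for name, columns, is_unique in index_info
        )
    )
    print(statement)
    # ALTER TABLE myapp.big_table_new ADD INDEX idx_user_id (user_id), ADD UNIQUE INDEX uq_external_id (external_id)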
self.validator.migration_operation.override_source_table(self.redis_data.old_source_table): + retry_interval = 0.3 + for _ in range(10): + time.sleep(retry_interval) + if len(updated_pk_set) == 0 and len(removed_pk_set) == 0 and \ + self.redis_data.last_catchup_timestamp > after_rename_table_timestamp: + with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor: + validation_thread = executor.submit(self.validator.apply_dml_events_validation) + try: + destination_table_valid = validation_thread.result(timeout=3) + if destination_table_valid: + break + else: + self.logger.warning( + f"Final validation failed. Retrying in {retry_interval} seconds") + except concurrent.futures.TimeoutError: + destination_table_valid = False + self.logger.warning("Final validation timed out. Downtime may be required.") + else: + self.logger.warning(f"Found unapplied DML event. Retrying in {retry_interval} seconds") + + # Source table does not exist or destination table is not valid + if not destination_table_valid: + self.logger.critical("Failed to validate destination table. Restoring source table name") + cursor.execute(f"RENAME TABLE {old_source_table} TO {source_table}") + self.redis_data.set_old_source_table(None) + self.redis_data.set_current_stage(Stage.SWAP_TABLES_FAILED) + self.slack.send_message("Failed to swap tables", color="danger") + return + else: + cursor.execute(f"RENAME TABLE {destination_table} TO {source_table}") + self.redis_data.set_current_stage(Stage.DONE) + cursor.execute(''' + UPDATE sbosc.migration_plan SET ended_at = FROM_UNIXTIME(%s), final_max_id = %s WHERE id = %s + ''', (after_rename_table_timestamp, final_max_id, self.migration_id)) + self.logger.info("Tables swapped") + self.slack.send_message("Tables swapped", color="good") + + self.interval = 60 diff --git a/src/sbosc/controller/initializer.py b/src/sbosc/controller/initializer.py new file mode 100644 index 0000000..477f989 --- /dev/null +++ b/src/sbosc/controller/initializer.py @@ -0,0 +1,247 @@ +from datetime import datetime + +from MySQLdb.cursors import Cursor + +from config import config +from modules.slack import SlackClient +from sbosc.const import Stage +from modules.db import Database +from modules.redis import RedisData +from modules.logger import get_logger + +REQUIRED_TABLES = [ + # Controller + "migration_plan", "chunk_info", + "apply_dml_events_status", "index_creation_status", + "apply_dml_events_validation_status", "full_dml_event_validation_status", "unmatched_rows", + # EventHandler + "event_handler_status" +] + + +class Initializer: + def __init__(self): + self.db = Database() + self.logger = get_logger({"dbclusteridentifier": config.SOURCE_CLUSTER_ID}) + + def check_database_setup(self): + with self.db.cursor() as cursor: + cursor: Cursor + cursor.execute("SELECT 1 FROM information_schema.SCHEMATA WHERE SCHEMA_NAME = 'sbosc'") + if cursor.rowcount == 0: + self.logger.info("SB-OSC database not found") + return False + cursor.execute(''' + SELECT 1 FROM information_schema.TABLES + WHERE TABLE_SCHEMA = 'sbosc' AND TABLE_NAME IN (%s) + ''' % ','.join(['%s'] * len(REQUIRED_TABLES)), REQUIRED_TABLES) + if cursor.rowcount != len(REQUIRED_TABLES): + self.logger.info("Required tables not found") + return False + return True + + def setup_database(self): + with self.db.cursor() as cursor: + cursor: Cursor + cursor.execute("CREATE DATABASE IF NOT EXISTS sbosc;") + self.logger.info("Database created") + + # Controller tables + cursor.execute("USE sbosc;") + cursor.execute(''' + CREATE TABLE IF NOT EXISTS 
migration_plan ( + id int PRIMARY KEY AUTO_INCREMENT, + source_cluster_id varchar(128), + source_db varchar(128), + source_table varchar(128), + destination_cluster_id varchar(128), + destination_db varchar(128), + destination_table varchar(128), + detail text, + created_at datetime, + final_max_id bigint, + ended_at datetime + ) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin; + ''') + self.logger.info("Migration plan table created") + + cursor.execute(''' + CREATE TABLE IF NOT EXISTS chunk_info ( + id int PRIMARY KEY AUTO_INCREMENT, + migration_id int, + chunk_id varchar(128), + start_pk bigint, + end_pk bigint, + created_at datetime, + KEY `idx_chunk_info_migration_id` (`migration_id`) + ) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin; + ''') + self.logger.info("Chunk info table created") + + cursor.execute(''' + CREATE TABLE IF NOT EXISTS apply_dml_events_status ( + id int PRIMARY KEY AUTO_INCREMENT, + migration_id int, + last_loaded_timestamp bigint, + created_at datetime, + KEY `idx_apply_dml_events_status_migration_id` (`migration_id`) + ) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin; + ''') + self.logger.info("Apply DML event status table created") + + cursor.execute(''' + CREATE TABLE IF NOT EXISTS index_creation_status ( + id int PRIMARY KEY AUTO_INCREMENT, + migration_id int, + index_name varchar(128), + index_columns varchar(128), + is_unique bool, + started_at datetime, + ended_at datetime, + created_at datetime, + KEY `idx_index_creation_status_migration_id_ended_at` (`migration_id`, `ended_at`) + ) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin; + ''') + self.logger.info("Index creation status table created") + + cursor.execute(''' + CREATE TABLE IF NOT EXISTS apply_dml_events_validation_status ( + id int PRIMARY KEY AUTO_INCREMENT, + migration_id int, + last_validated_timestamp bigint, + is_valid bool, + created_at datetime, + KEY `idx_apply_dml_events_validation_status_migration_id` (`migration_id`) + ) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin; + ''') + self.logger.info("Apply DML event validation status table created") + + cursor.execute(''' + CREATE TABLE IF NOT EXISTS full_dml_event_validation_status ( + id int PRIMARY KEY AUTO_INCREMENT, + migration_id int, + last_validated_timestamp bigint, + is_valid bool, + created_at datetime, + KEY `idx_full_dml_event_validation_status_migration_id` (`migration_id`) + ) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin; + ''') + self.logger.info("Full DML event validation status table created") + + cursor.execute(''' + CREATE TABLE IF NOT EXISTS unmatched_rows ( + id int PRIMARY KEY AUTO_INCREMENT, + source_pk bigint, + migration_id int, + unmatch_type varchar(128), + KEY `idx_unmatched_rows_migration_id` (`migration_id`) + ) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin; + ''') + self.logger.info("Unmatched rows table created") + + # EventHandler tables + cursor.execute(''' + CREATE TABLE IF NOT EXISTS event_handler_status ( + id int PRIMARY KEY AUTO_INCREMENT, + migration_id int, + log_file varchar(128), + log_pos bigint, + last_event_timestamp bigint, + created_at datetime, + KEY `idx_event_handler_status_migration_id` (`migration_id`) + ) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin; + ''') + self.logger.info("Event handler status table created") + + def fetch_metadata(self, migration_id): + redis_data = RedisData(migration_id) + metadata = redis_data.metadata + + # Config data + metadata.set({ + "source_db": config.SOURCE_DB, + "source_table": config.SOURCE_TABLE, + "destination_db": config.DESTINATION_DB, + "destination_table": 
config.DESTINATION_TABLE, + }) + self.logger.info("Saved migration plan data to Redis") + + with self.db.cursor() as cursor: + # Column schema + cursor.execute(''' + SELECT GROUP_CONCAT('`', COLUMN_NAME, '`') FROM information_schema.COLUMNS + WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s + ''', (metadata.source_db, metadata.source_table)) + metadata.source_columns = cursor.fetchone()[0] + self.logger.info("Saved source column schema to Redis") + + # Get max id + cursor.execute("SELECT MAX(id) FROM %s.%s" % (metadata.source_db, metadata.source_table)) + max_id = cursor.fetchone()[0] + metadata.max_id = max_id + self.logger.info("Saved total rows to Redis") + + metadata.start_datetime = datetime.now() + redis_data.set_current_stage(Stage.START_EVENT_HANDLER) + + def init_migration(self): + if not self.check_database_setup(): + self.setup_database() + + with self.db.cursor() as cursor: + # Insert migration plan + cursor: Cursor + cursor.execute(''' + INSERT INTO sbosc.migration_plan + (source_cluster_id, source_db, source_table, + destination_cluster_id, destination_db, destination_table, created_at) + VALUES (%s, %s, %s, %s, %s, %s, NOW()) + ''', ( + config.SOURCE_CLUSTER_ID, + config.SOURCE_DB, + config.SOURCE_TABLE, + config.cluster_id, + config.DESTINATION_DB, + config.DESTINATION_TABLE + )) + self.logger.info("Migration plan created") + migration_id = cursor.lastrowid + + # Insert index creation status + for index in config.INDEXES: + cursor.execute(''' + INSERT INTO sbosc.index_creation_status + (migration_id, index_name, index_columns, is_unique, created_at) + VALUES (%s, %s, %s, %s, NOW()) + ''', ( + migration_id, + index.name, + index.columns, + index.unique + )) + + # DML log tables + dml_log_tables = [f'{table}_{migration_id}' for table in ['inserted_pk', 'updated_pk', 'deleted_pk']] + for table in dml_log_tables: + cursor.execute(f''' + CREATE TABLE IF NOT EXISTS sbosc.{table} ( + source_pk bigint PRIMARY KEY, + event_timestamp bigint, + KEY `idx_{table}_event_timestamp` (`event_timestamp`) + ) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin; + ''') + self.logger.info("DML log tables created") + + # Fetch metadata + self.fetch_metadata(migration_id) + + slack = SlackClient("SB-OSC Controller", f'{config.SOURCE_CLUSTER_ID}, {migration_id}') + slack.send_message( + subtitle=f"Finished initializing migration. 
Migration ID: {migration_id}", + message=f"Source DB: {config.SOURCE_DB}\n" + f"Source table: {config.SOURCE_TABLE}\n" + f"Destination table: {config.DESTINATION_TABLE}", + color="good" + ) + + return migration_id diff --git a/src/sbosc/controller/main.py b/src/sbosc/controller/main.py new file mode 100644 index 0000000..7b2820e --- /dev/null +++ b/src/sbosc/controller/main.py @@ -0,0 +1,5 @@ +from sbosc.component import start_component +from sbosc.controller import Controller + +if __name__ == '__main__': + start_component(Controller) diff --git a/src/sbosc/controller/validator.py b/src/sbosc/controller/validator.py new file mode 100644 index 0000000..0950dc7 --- /dev/null +++ b/src/sbosc/controller/validator.py @@ -0,0 +1,378 @@ +import concurrent.futures +import time +from datetime import datetime, timedelta +from queue import Queue, Empty + +import MySQLdb +from MySQLdb.cursors import Cursor + +from typing import TYPE_CHECKING + +from modules.db import Database +from sbosc.exceptions import StopFlagSet +from sbosc.operations.operation import MigrationOperation + +if TYPE_CHECKING: + from sbosc.controller import Controller +from config import config +from modules.redis import RedisData +from sbosc.const import UnmatchType + + +class DataValidator: + def __init__(self, controller: 'Controller'): + self.migration_id = controller.migration_id + self.bulk_import_batch_size = config.BULK_IMPORT_VALIDATION_BATCH_SIZE + self.apply_dml_events_batch_size = config.APPLY_DML_EVENTS_VALIDATION_BATCH_SIZE + self.full_dml_event_validation_interval = config.FULL_DML_EVENT_VALIDATION_INTERVAL + self.thread_count = config.VALIDATION_THREAD_COUNT + self.db = Database() + self.redis_data = RedisData(self.migration_id) + self.migration_operation: MigrationOperation = config.OPERATION_CLASS(self.migration_id) + self.logger = controller.logger + + self.source_conn_pool = self.db.get_reader_connection_pool(self.thread_count) + self.dest_conn_pool = self.db.get_reader_connection_pool(self.thread_count, host='dest') + + self.stop_flag = False + + def set_stop_flag(self): + self.stop_flag = True + + def handle_operational_error(self, e, range_queue, start_range, end_range): + if e.args[0] == 2013: + self.logger.warning("Query timeout. Retry with smaller batch size") + range_queue.put((start_range, start_range + (end_range - start_range) // 2)) + range_queue.put((start_range + (end_range - start_range) // 2 + 1, end_range)) + time.sleep(0.1) + else: + self.logger.error(f"Error occurred during validation. 
Error: {e}") + range_queue.put((start_range, end_range)) + time.sleep(3) + + def validate_bulk_import_batch(self, range_queue: Queue, failed_pks): + with self.source_conn_pool.get_connection() as source_conn, self.dest_conn_pool.get_connection() as dest_conn: + while not range_queue.empty(): + if len(failed_pks) > 0: + return False + + if self.stop_flag: + raise StopFlagSet() + + with source_conn.cursor() as source_cursor, dest_conn.cursor() as dest_cursor: + try: + batch_start_pk, batch_end_pk = range_queue.get_nowait() + not_imported_pks = self.migration_operation.get_not_imported_pks( + source_cursor, dest_cursor, batch_start_pk, batch_end_pk) + if not_imported_pks: + failed_pks.extend(not_imported_pks) + return False + except MySQLdb.OperationalError as e: + self.handle_operational_error(e, range_queue, batch_start_pk, batch_end_pk) + source_conn.ping(True) + dest_conn.ping(True) + continue + except Empty: + self.logger.warning("Range queue is empty") + continue + self.logger.info(f"Validation succeeded for range {batch_start_pk} - {batch_end_pk}") + return True + + def bulk_import_validation(self): + self.logger.info("Start bulk import validation") + metadata = self.redis_data.metadata + range_queue = Queue() + start_pk = 0 + while start_pk < metadata.max_id: + range_queue.put((start_pk, min(start_pk + self.bulk_import_batch_size, metadata.max_id))) + start_pk += self.bulk_import_batch_size + 1 + failed_pks = [] + + with concurrent.futures.ThreadPoolExecutor(max_workers=self.thread_count) as executor: + threads = [] + for _ in range(self.thread_count): + threads.append(executor.submit(self.validate_bulk_import_batch, range_queue, failed_pks)) + is_valid = all([thread.result() for thread in threads]) + if not is_valid: + self.logger.critical(f"Failed to validate bulk import. 
Failed pks: {failed_pks}") + else: + self.logger.info("Bulk import validation succeeded") + return is_valid + + def get_timestamp_range(self): + start_timestamp = None + end_timestamp = None + with self.db.cursor() as cursor: + cursor: Cursor + + # Get last validated event timestamp + cursor.execute(f''' + SELECT last_validated_timestamp FROM sbosc.apply_dml_events_validation_status + WHERE migration_id = {self.migration_id} ORDER BY id DESC LIMIT 1 + ''') + if cursor.rowcount > 0: + start_timestamp = cursor.fetchone()[0] + else: + cursor.execute(f''' + SELECT MIN(event_timestamps.min_ts) FROM ( + SELECT MIN(event_timestamp) AS min_ts FROM sbosc.inserted_pk_{self.migration_id} UNION + SELECT MIN(event_timestamp) AS min_ts FROM sbosc.updated_pk_{self.migration_id} UNION + SELECT MIN(event_timestamp) AS min_ts FROM sbosc.deleted_pk_{self.migration_id} + ) AS event_timestamps; + ''') + if cursor.rowcount > 0: + start_timestamp = cursor.fetchone()[0] + + cursor.execute(f''' + SELECT last_event_timestamp FROM sbosc.event_handler_status + WHERE migration_id = {self.migration_id} ORDER BY id DESC LIMIT 1 + ''') + if cursor.rowcount > 0: + end_timestamp = cursor.fetchone()[0] + return start_timestamp, end_timestamp + + def execute_apply_dml_events_validation_query( + self, source_cursor, dest_cursor, table, start_timestamp, end_timestamp, unmatched_pks): + metadata = self.redis_data.metadata + if table == 'inserted_pk': + not_inserted_pks = self.migration_operation.get_not_inserted_pks( + source_cursor, dest_cursor, start_timestamp, end_timestamp) + if not_inserted_pks: + self.logger.warning(f"Found {len(not_inserted_pks)} unmatched inserted pks") + unmatched_pks.extend([(pk, UnmatchType.NOT_UPDATED) for pk in not_inserted_pks]) + elif table == 'updated_pk': + not_updated_pks = self.migration_operation.get_not_updated_pks( + source_cursor, dest_cursor, start_timestamp, end_timestamp) + if not_updated_pks: + self.logger.warning(f"Found {len(not_updated_pks)} unmatched updated pks") + unmatched_pks.extend([(pk, UnmatchType.NOT_UPDATED) for pk in not_updated_pks]) + elif table == 'deleted_pk': + source_cursor.execute(f''' + SELECT source_pk FROM sbosc.deleted_pk_{self.migration_id} + WHERE event_timestamp BETWEEN {start_timestamp} AND {end_timestamp} + ''') + if source_cursor.rowcount > 0: + target_pks = ','.join([str(row[0]) for row in source_cursor.fetchall()]) + dest_cursor.execute(f''' + SELECT id FROM {metadata.destination_db}.{metadata.destination_table} WHERE id IN ({target_pks}) + ''') + deleted_pks = set([row[0] for row in dest_cursor.fetchall()]) + if dest_cursor.rowcount > 0: + # Check if deleted pks are reinserted + source_cursor.execute(f''' + SELECT id FROM {metadata.source_db}.{metadata.source_table} WHERE id IN ({target_pks}) + ''') + reinserted_pks = set([row[0] for row in source_cursor.fetchall()]) + if reinserted_pks: + deleted_pks = deleted_pks - reinserted_pks + self.logger.warning(f"Found {len(reinserted_pks)} reinserted pks") + self.logger.warning(f"Found {len(deleted_pks)} unmatched deleted pks") + unmatched_pks.extend([(pk, UnmatchType.NOT_REMOVED) for pk in deleted_pks]) + + def validate_apply_dml_events_batch(self, table, range_queue: Queue, unmatched_pks): + with self.source_conn_pool.get_connection() as source_conn, self.dest_conn_pool.get_connection() as dest_conn: + while not range_queue.empty(): + if self.stop_flag: + raise StopFlagSet() + + try: + batch_start_timestamp, batch_end_timestamp = range_queue.get_nowait() + except Empty: + self.logger.warning("Range queue 
is empty") + continue + + with source_conn.cursor() as source_cursor, dest_conn.cursor() as dest_cursor: + source_cursor: Cursor + dest_cursor: Cursor + source_cursor.execute("SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;") + dest_cursor.execute("SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;") + + source_cursor.execute(f''' + SELECT COUNT(1) FROM sbosc.{table}_{self.migration_id} + WHERE event_timestamp BETWEEN {batch_start_timestamp} AND {batch_end_timestamp} + ''') + event_count = source_cursor.fetchone()[0] + if event_count > self.apply_dml_events_batch_size: + range_queue.put(( + batch_start_timestamp, + batch_start_timestamp + (batch_end_timestamp - batch_start_timestamp) // 2 + )) + range_queue.put(( + batch_start_timestamp + (batch_end_timestamp - batch_start_timestamp) // 2 + 1, + batch_end_timestamp + )) + continue + + else: + try: + self.execute_apply_dml_events_validation_query( + source_cursor, dest_cursor, table, + batch_start_timestamp, batch_end_timestamp, unmatched_pks + ) + except MySQLdb.OperationalError as e: + self.handle_operational_error(e, range_queue, batch_start_timestamp, batch_end_timestamp) + source_conn.ping(True) + dest_conn.ping(True) + continue + + def validate_unmatched_pks(self): + self.logger.info("Validating unmatched pks") + with self.db.cursor() as cursor: + cursor: Cursor + cursor.execute(f''' + SELECT source_pk, unmatch_type FROM sbosc.unmatched_rows + WHERE migration_id = {self.migration_id} LIMIT {self.apply_dml_events_batch_size} + ''') + if cursor.rowcount > 0: + not_updated_pks = set() + not_removed_pks = set() + for pk, unmatch_type in cursor.fetchall(): + if unmatch_type == UnmatchType.NOT_UPDATED: + not_updated_pks.add(pk) + elif unmatch_type == UnmatchType.NOT_REMOVED: + not_removed_pks.add(pk) + if len(not_updated_pks) > 0: + matched_pks = self.migration_operation.get_rematched_updated_pks(self.db, not_updated_pks) + if matched_pks: + not_updated_pks = not_updated_pks - matched_pks + matched_pks_str = ','.join([str(pk) for pk in matched_pks]) + cursor.execute(f''' + DELETE FROM sbosc.unmatched_rows WHERE source_pk IN ({matched_pks_str}) + AND unmatch_type = '{UnmatchType.NOT_UPDATED}' + ''') + if len(not_removed_pks) > 0: + matched_pks = self.migration_operation.get_rematched_removed_pks(self.db, not_removed_pks) + if matched_pks: + not_removed_pks = not_removed_pks - matched_pks + matched_pks_str = ','.join([str(pk) for pk in matched_pks]) + cursor.execute(f''' + DELETE FROM sbosc.unmatched_rows WHERE source_pk IN ({matched_pks_str}) + AND unmatch_type = '{UnmatchType.NOT_REMOVED}' + ''') + self.redis_data.updated_pk_set.add(not_updated_pks - not_removed_pks) + self.redis_data.updated_pk_set.remove(not_removed_pks) + self.redis_data.removed_pk_set.add(not_removed_pks) + + def validate_apply_dml_events(self, start_timestamp, end_timestamp): + unmatched_pks = [] + with self.db.cursor() as cursor: + cursor: Cursor + + if start_timestamp <= end_timestamp: + self.logger.info(f"Start validating DML events from {start_timestamp} to {end_timestamp}") + for table in ['inserted_pk', 'updated_pk', 'deleted_pk']: + cursor.execute(f''' + ANALYZE TABLE sbosc.{table}_{self.migration_id} + ''') + cursor.execute(f''' + SELECT TABLE_ROWS FROM information_schema.TABLES + WHERE TABLE_SCHEMA = 'sbosc' AND TABLE_NAME = '{table}_{self.migration_id}' + ''') + table_rows = cursor.fetchone()[0] + + if table_rows > 0: + range_queue = Queue() + batch_start_timestamp = start_timestamp + while batch_start_timestamp < end_timestamp: + 
batch_duration = \ + (end_timestamp - start_timestamp) * self.apply_dml_events_batch_size // table_rows + batch_end_timestamp = min(batch_start_timestamp + batch_duration, end_timestamp) + range_queue.put((batch_start_timestamp, batch_end_timestamp)) + batch_start_timestamp = batch_end_timestamp + 1 + + with concurrent.futures.ThreadPoolExecutor(max_workers=self.thread_count) as executor: + threads = [] + for _ in range(self.thread_count): + threads.append(executor.submit( + self.validate_apply_dml_events_batch, table, range_queue, unmatched_pks)) + for thread in threads: + thread.result() + + cursor.executemany(f''' + INSERT IGNORE INTO sbosc.unmatched_rows (source_pk, migration_id, unmatch_type) + VALUES (%s, {self.migration_id}, %s) + ''', unmatched_pks) + self.validate_unmatched_pks() + cursor.execute(f"SELECT COUNT(1) FROM sbosc.unmatched_rows WHERE migration_id = {self.migration_id}") + unmatched_rows = cursor.fetchone()[0] + + # Even though this validation reads from the DML event tables, the condition below can still be satisfied + # because all events are also pushed to Redis during the validation stage. + return unmatched_rows == 0 and not self.stop_flag + + def apply_dml_events_validation(self): + self.logger.info("Start apply DML events validation") + + start_timestamp, end_timestamp = self.get_timestamp_range() + if start_timestamp is None: + self.logger.warning("No events found. Skipping apply DML events validation") + return True + elif end_timestamp is None: + self.logger.warning("Failed to get valid end_timestamp") + return False + + is_valid = self.validate_apply_dml_events(start_timestamp, end_timestamp) + + with self.db.cursor() as cursor: + cursor: Cursor + cursor.execute(f''' + INSERT INTO sbosc.apply_dml_events_validation_status + (migration_id, last_validated_timestamp, is_valid, created_at) + VALUES ({self.migration_id}, {end_timestamp}, {is_valid}, NOW()) + ''') + + return is_valid + + def full_dml_event_validation(self): + """ + :return: True if validation succeeded, False if validation failed, None if validation is skipped + """ + self.logger.info("Start full DML event validation") + + with self.db.cursor(role='reader') as cursor: + cursor: Cursor + cursor.execute(f''' + SELECT created_at FROM sbosc.full_dml_event_validation_status + WHERE migration_id = {self.migration_id} ORDER BY id DESC LIMIT 1 + ''') + + if cursor.rowcount > 0: + last_validation_time = cursor.fetchone()[0] + if datetime.now() - last_validation_time < timedelta(hours=self.full_dml_event_validation_interval): + self.logger.info( + f"Last validation was done less than {self.full_dml_event_validation_interval} hour(s) ago. " + "Skipping full DML event validation") + return + + cursor.execute(f''' + SELECT MIN(event_timestamps.min_ts) FROM ( + SELECT MIN(event_timestamp) AS min_ts FROM sbosc.inserted_pk_{self.migration_id} UNION + SELECT MIN(event_timestamp) AS min_ts FROM sbosc.updated_pk_{self.migration_id} UNION + SELECT MIN(event_timestamp) AS min_ts FROM sbosc.deleted_pk_{self.migration_id} + ) AS event_timestamps; + ''') + if cursor.rowcount > 0: + start_timestamp = cursor.fetchone()[0] + if start_timestamp is None: + self.logger.warning("No events found. 
Skipping full DML event validation") + return + + cursor.execute(f''' + SELECT last_event_timestamp FROM sbosc.event_handler_status + WHERE migration_id = {self.migration_id} ORDER BY id DESC LIMIT 1 + ''') + if cursor.rowcount > 0: + end_timestamp = cursor.fetchone()[0] + if end_timestamp is None: + self.logger.warning("Failed to get valid end_timestamp") + return + + is_valid = self.validate_apply_dml_events(start_timestamp, end_timestamp) + + with self.db.cursor() as cursor: + cursor.execute(f''' + INSERT INTO sbosc.full_dml_event_validation_status + (migration_id, last_validated_timestamp, is_valid, created_at) + VALUES ({self.migration_id}, {end_timestamp}, {is_valid}, NOW()) + ''') + + return is_valid diff --git a/src/sbosc/eventhandler/__init__.py b/src/sbosc/eventhandler/__init__.py new file mode 100644 index 0000000..0a64624 --- /dev/null +++ b/src/sbosc/eventhandler/__init__.py @@ -0,0 +1,3 @@ +from sbosc.eventhandler.eventhandler import EventHandler + +EventHandler = EventHandler diff --git a/src/sbosc/eventhandler/eventhandler.py b/src/sbosc/eventhandler/eventhandler.py new file mode 100644 index 0000000..1bb4bc7 --- /dev/null +++ b/src/sbosc/eventhandler/eventhandler.py @@ -0,0 +1,347 @@ +import concurrent.futures +import time +from queue import Queue, Empty +from threading import Thread + +from MySQLdb.cursors import Cursor, DictCursor +from pymysqlreplication import BinLogStreamReader +from pymysqlreplication.row_event import DeleteRowsEvent, UpdateRowsEvent, WriteRowsEvent, RowsEvent + +from config import config, secret +from modules.slack import SlackClient +from sbosc.component import SBOSCComponent +from sbosc.const import Stage +from sbosc.eventhandler.eventloader import EventLoader + + +class EventStore: + def __init__(self): + self.handled_events = 0 + self.holding_events = 0 + self.last_event_timestamp = 1 + self.insert_event_timestamp = {} + self.update_event_timestamp = {} + self.delete_event_timestamp = {} + + @staticmethod + def parse_dml_event(event: RowsEvent): + # UpdateRowsEvent has 'before_values' and 'after_values' attributes, while others have 'values' + affected_pks = [list(row.values())[0][event.primary_key] for row in event.rows] + timestamp = event.timestamp + return affected_pks, timestamp + + def add_event(self, event: RowsEvent): + event_timestamp_dict = None + + affected_pks, timestamp = self.parse_dml_event(event) + if isinstance(event, WriteRowsEvent): + event_timestamp_dict = self.insert_event_timestamp + elif isinstance(event, UpdateRowsEvent): + event_timestamp_dict = self.update_event_timestamp + elif isinstance(event, DeleteRowsEvent): + event_timestamp_dict = self.delete_event_timestamp + + if event_timestamp_dict is not None: + for pk in affected_pks: + event_timestamp_dict[pk] = timestamp + + if self.last_event_timestamp < timestamp: + self.last_event_timestamp = timestamp + + self.handled_events += 1 + self.holding_events += 1 + + def merge(self, event_store): + self.insert_event_timestamp.update(event_store.insert_event_timestamp) + self.update_event_timestamp.update(event_store.update_event_timestamp) + self.delete_event_timestamp.update(event_store.delete_event_timestamp) + self.last_event_timestamp = max(self.last_event_timestamp, event_store.last_event_timestamp) + self.handled_events += event_store.handled_events + self.holding_events += event_store.holding_events + + def clear(self): + self.holding_events = 0 + self.insert_event_timestamp = {} + self.update_event_timestamp = {} + self.delete_event_timestamp = {} + + +class 
EventHandler(SBOSCComponent): + def __init__(self): + super().__init__() + self.thread_count = config.EVENT_HANDLER_THREAD_COUNT + self.thread_timeout = config.EVENT_HANDLER_THREAD_TIMEOUT + self.slack = SlackClient('SB-OSC EventHandler', f'{config.SOURCE_CLUSTER_ID}, {self.migration_id}') + + # EventLoader + self.event_loader = EventLoader(self) + self.event_loader_thread = None + + # BinlogStreamReader + self.connection_settings = { + 'host': config.writer_endpoint, + 'port': secret.PORT, + 'user': secret.USERNAME, + 'passwd': secret.PASSWORD, + } + + self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=self.thread_count) + + self.log_file = None + self.log_pos = None + self.live_mode = False + + self.event_store = EventStore() + self.last_saved_timestamp = 1 + self.handled_binlog_files = set() + + def set_stop_flag(self): + self.logger.info('Stopping event handler...') + self.stop_flag = True + self.event_loader.set_stop_flag() + + def init_event_handler(self): + with self.db.cursor(DictCursor) as cursor: + cursor: DictCursor + cursor.execute(f''' + SELECT log_file, log_pos, last_event_timestamp, created_at FROM sbosc.event_handler_status + WHERE migration_id = {self.migration_id} ORDER BY id DESC LIMIT 1 + ''') + + if cursor.rowcount > 0: + status = cursor.fetchone() + self.log_file = status['log_file'] + self.log_pos = status['log_pos'] + self.event_store.last_event_timestamp = status['last_event_timestamp'] + self.last_saved_timestamp = status['created_at'].timestamp() + self.logger.info(f'Resuming from binlog position: {self.log_file} {self.log_pos}') + else: + if config.INIT_BINLOG_FILE: + self.log_file = config.INIT_BINLOG_FILE + self.log_pos = config.INIT_BINLOG_POSITION + self.last_saved_timestamp = int(time.time()) + else: + cursor.execute("SHOW MASTER STATUS") + status = cursor.fetchone() + self.log_file = status['File'] + self.log_pos = status['Position'] + self.last_saved_timestamp = self.event_store.last_event_timestamp = int(time.time()) + self.save_current_binlog_position() + self.slack.send_message( + subtitle="EventHandler started binlog stream", + message=f"Log file: {self.log_file}\n" + f"Log position: {self.log_pos}" + ) + + if self.redis_data.current_stage == Stage.START_EVENT_HANDLER: + if config.SKIP_BULK_IMPORT: + self.redis_data.set_current_stage(Stage.APPLY_DML_EVENTS) + else: + self.redis_data.set_current_stage(Stage.BULK_IMPORT_CHUNK_CREATION) + + def create_binlog_stream(self, log_file, log_pos, thread_id=0) -> BinLogStreamReader: + metadata = self.redis_data.metadata + return BinLogStreamReader( + connection_settings=self.connection_settings, + server_id=int(self.migration_id) * 100 + thread_id, + resume_stream=True, + only_events=[DeleteRowsEvent, UpdateRowsEvent, WriteRowsEvent], + only_schemas=[metadata.source_db], + only_tables=[metadata.source_table], + log_file=log_file, + log_pos=log_pos + ) + + def save_current_binlog_position(self): + with self.db.cursor() as cursor: + cursor: Cursor + cursor.execute(''' + INSERT INTO sbosc.event_handler_status + (migration_id, log_file, log_pos, last_event_timestamp, created_at) + VALUES (%s, %s, %s, %s, NOW()) + ''', (self.migration_id, self.log_file, self.log_pos, self.event_store.last_event_timestamp)) + self.logger.info(f'Saved binlog position: {self.log_file} {self.log_pos}') + + def save_events_to_db(self): + with self.db.cursor() as cursor: + cursor: Cursor + for table_name, events in [ + (f'inserted_pk_{self.migration_id}', self.event_store.insert_event_timestamp.items()), + 
(f'updated_pk_{self.migration_id}', self.event_store.update_event_timestamp.items()), + (f'deleted_pk_{self.migration_id}', self.event_store.delete_event_timestamp.items()) + ]: + cursor.executemany(f''' + INSERT INTO sbosc.{table_name} (source_pk, event_timestamp) + VALUES (%s, %s) ON DUPLICATE KEY UPDATE event_timestamp = VALUES(event_timestamp) + ''', list(events)) + self.event_store.clear() + self.logger.info('Saved events to db') + + def save(self): + self.save_events_to_db() + self.save_current_binlog_position() + self.logger.info(f'Handled binlog files: {self.handled_binlog_files}') + self.handled_binlog_files.clear() + self.last_saved_timestamp = time.time() + + def start(self): + self.logger.info('Starting event handler') + while not self.stop_flag: + current_stage = self.redis_data.current_stage + if Stage.DONE > current_stage >= Stage.START_EVENT_HANDLER: + if self.log_file is None or self.log_pos is None: + self.logger.info('Initializing event handler') + self.init_event_handler() + else: + if current_stage == Stage.APPLY_DML_EVENTS: + self.apply_dml_events() + elif current_stage == Stage.APPLY_DML_EVENTS_PRE_VALIDATION: + self.apply_dml_events_pre_validation() + elif current_stage >= Stage.APPLY_DML_EVENTS_VALIDATION: + self.live_mode = True + self.follow_event_stream() + time.sleep(0.1) + if self.redis_data.current_stage == Stage.START_EVENT_HANDLER: + self.redis_data.set_current_stage(Stage.BULK_IMPORT_CHUNK_CREATION) + else: + time.sleep(60) + + # Save events and binlog position before exiting + self.save() + self.logger.info('Saved events and binlog position before exiting.') + + def are_indexes_created(self): + with self.db.cursor() as cursor: + cursor: Cursor + cursor.execute(f''' + SELECT COUNT(1) FROM sbosc.index_creation_status + WHERE migration_id = {self.migration_id} AND ended_at IS NULL + ''') + return cursor.fetchone()[0] == 0 + + def start_event_loader(self): + if self.event_loader_thread is None or not self.event_loader_thread.is_alive(): + if self.event_loader_thread is not None: + self.logger.warning('Event loader thread is dead, restarting...') + self.event_loader = EventLoader(self) + + self.event_loader_thread = Thread(target=self.event_loader.start) + self.event_loader_thread.start() + time.sleep(60) + + def apply_dml_events(self): + self.start_event_loader() + if len(self.redis_data.updated_pk_set) == 0 and len(self.redis_data.removed_pk_set) == 0 and \ + self.event_store.last_event_timestamp - self.event_loader.last_loaded_timestamp < 60: + self.redis_data.set_current_stage(Stage.APPLY_DML_EVENTS_PRE_VALIDATION) + + def apply_dml_events_pre_validation(self): + self.start_event_loader() + self.save() + with self.db.cursor() as cursor: + cursor: Cursor + cursor.execute(f"SELECT COUNT(1) FROM sbosc.inserted_pk_{self.migration_id}") + inserted_count = cursor.fetchone()[0] + cursor.execute(f"SELECT COUNT(1) FROM sbosc.updated_pk_{self.migration_id}") + updated_count = cursor.fetchone()[0] + cursor.execute(f"SELECT COUNT(1) FROM sbosc.deleted_pk_{self.migration_id}") + deleted_count = cursor.fetchone()[0] + if inserted_count + updated_count + deleted_count > 0: + while self.event_store.last_event_timestamp != self.event_loader.last_loaded_timestamp: + if self.stop_flag: + return + time.sleep(60) + self.event_loader.set_stop_flag() + if self.are_indexes_created(): + self.live_mode = True + self.redis_data.set_current_stage(Stage.APPLY_DML_EVENTS_VALIDATION) + else: + self.redis_data.set_current_stage(Stage.ADD_INDEX) + + def parse_binlog_batch(self, thread_id, 
batch_queue: Queue, done_batch: list): + event_store = EventStore() + while batch_queue.qsize() > 0 and not self.stop_flag: + try: + binlog_file, start_pos = batch_queue.get_nowait() + except Empty: + self.logger.warning('Binlog batch queue is empty') + continue + stream = self.create_binlog_stream(binlog_file, start_pos, thread_id) + for event in stream: + event_store.add_event(event) + if stream.log_file != binlog_file: + break + + done_batch.append((stream.log_file, stream.log_pos)) + stream.close() + return event_store + + def follow_event_stream(self): + file_queue = Queue() + + # Create binlog batch queue + with self.db.cursor(DictCursor) as cursor: + cursor: DictCursor + last_binlog_check_timestamp = time.time() + cursor.execute("SHOW MASTER STATUS") + live_log_file = cursor.fetchone()['File'] + live_index_number = live_log_file.split('.')[-1] + base_name, index_number = self.log_file.split('.') + binlog_files = [ + '{}.{:06d}'.format(base_name, i) + for i in range(int(index_number), int(live_index_number) + 1) + ] + for log_file in binlog_files[:self.thread_count]: + start_pos = self.log_pos if log_file == self.log_file else 4 + file_queue.put((log_file, start_pos)) + + # Parse binlog batches + threads = [] + done_files = [] + queued_files = file_queue.qsize() + event_store = EventStore() + result_event_stores = [] + + for i in range(self.thread_count): + threads.append(self.executor.submit(self.parse_binlog_batch, i, file_queue, done_files)) + done, not_done = concurrent.futures.wait(threads, timeout=self.thread_timeout) + if len(not_done) > 0: + self.set_stop_flag() + raise Exception('Binlog batch parsing timed out') + for thread in threads: + result_event_stores.append(thread.result()) + + if len(done_files) == queued_files: + self.log_file, self.log_pos = max(done_files) + self.handled_binlog_files = self.handled_binlog_files | set([binlog_file for binlog_file, _ in done_files]) + + # Merge event stores + for result_event_store in sorted(result_event_stores, key=lambda x: x.last_event_timestamp): + event_store.merge(result_event_store) + + if event_store.handled_events > 0: + if self.live_mode: + updated_pks = \ + set(event_store.insert_event_timestamp.keys()) | set(event_store.update_event_timestamp.keys()) + removed_pks = set(event_store.delete_event_timestamp.keys()) + updated_pks = updated_pks - removed_pks + self.redis_data.updated_pk_set.add(updated_pks - removed_pks) + self.redis_data.updated_pk_set.remove(removed_pks) + self.redis_data.removed_pk_set.add(removed_pks) + + self.event_store.merge(event_store) + self.logger.info(f'Handled {self.event_store.handled_events} events.') + if self.event_store.holding_events >= 1000 or self.redis_data.current_stage == Stage.SWAP_TABLES: + # Save events and binlog position + self.save() + + elif time.time() - self.last_saved_timestamp > 600: + self.save() + + if len(binlog_files) == 1: + self.redis_data.set_last_catchup_timestamp(last_binlog_check_timestamp) + + elif self.stop_flag: + self.logger.info('Binlog parsing stopped') + else: + self.logger.error('Binlog parsing failed') diff --git a/src/sbosc/eventhandler/eventloader.py b/src/sbosc/eventhandler/eventloader.py new file mode 100644 index 0000000..01b4581 --- /dev/null +++ b/src/sbosc/eventhandler/eventloader.py @@ -0,0 +1,212 @@ +import time + +from MySQLdb.cursors import Cursor + +from typing import TYPE_CHECKING + +from modules.db import Database +from modules.redis import RedisData + +if TYPE_CHECKING: + from sbosc.eventhandler.eventhandler import EventHandler +from 
config import config + + +class EventLoader: + def __init__(self, event_handler: 'EventHandler'): + self.migration_id = event_handler.migration_id + self.db = Database() + self.redis_data = RedisData(self.migration_id) + self.logger = event_handler.logger + + self.last_loaded_timestamp = 1 + self.batch_duration = config.EVENT_BATCH_DURATION + + self.stop_flag = False + + def set_stop_flag(self): + self.logger.info("Stopping event loader") + self.stop_flag = True + + def start(self): + try: + while not self.stop_flag: + self.load_events_from_db() + time.sleep(self.interval) + except Exception as e: + self.logger.error(f"Error in event loader: {e}") + raise e + + @property + def event_set_length(self): + return len(self.redis_data.updated_pk_set) + len(self.redis_data.removed_pk_set) + + @property + def interval(self): + event_set_length = self.event_set_length + if event_set_length > 100000: + return 60 + elif event_set_length > 10000: + return 10 + else: + return 1 + + def get_start_timestamp(self): + with self.db.cursor() as cursor: + cursor: Cursor + + # Get last loaded event timestamp + cursor.execute(f''' + SELECT last_loaded_timestamp FROM sbosc.apply_dml_events_status + WHERE migration_id = {self.migration_id} ORDER BY id DESC LIMIT 1 + ''') + if cursor.rowcount > 0: + start_timestamp = cursor.fetchone()[0] + else: + cursor.execute(f''' + SELECT MIN(event_timestamps.min_ts) FROM ( + SELECT MIN(event_timestamp) AS min_ts FROM sbosc.inserted_pk_{self.migration_id} UNION + SELECT MIN(event_timestamp) AS min_ts FROM sbosc.updated_pk_{self.migration_id} UNION + SELECT MIN(event_timestamp) AS min_ts FROM sbosc.deleted_pk_{self.migration_id} + ) AS event_timestamps; + ''') + if cursor.rowcount > 0: + start_timestamp = cursor.fetchone()[0] or 0 + else: + start_timestamp = 0 + return start_timestamp + + def get_max_timestamp(self): + with self.db.cursor() as cursor: + cursor: Cursor + + cursor.execute(f''' + SELECT MAX(event_timestamps.max_ts) FROM ( + SELECT MAX(event_timestamp) AS max_ts FROM sbosc.inserted_pk_{self.migration_id} UNION + SELECT MAX(event_timestamp) AS max_ts FROM sbosc.updated_pk_{self.migration_id} UNION + SELECT MAX(event_timestamp) AS max_ts FROM sbosc.deleted_pk_{self.migration_id} + ) AS event_timestamps; + ''') + if cursor.rowcount > 0: + max_timestamp = cursor.fetchone()[0] or 0 + else: + max_timestamp = 0 + return max_timestamp + + def get_end_timestamp(self, start_timestamp): + found_end_timestamp = False + with self.db.cursor() as cursor: + cursor: Cursor + + while not found_end_timestamp and self.batch_duration > 0 and not self.stop_flag: + cursor.execute(f''' + SELECT COUNT(1) FROM sbosc.inserted_pk_{self.migration_id} + WHERE event_timestamp BETWEEN {start_timestamp} AND {start_timestamp + self.batch_duration} + ''') + inserted_count = cursor.fetchone()[0] + cursor.execute(f''' + SELECT COUNT(1) FROM sbosc.updated_pk_{self.migration_id} + WHERE event_timestamp BETWEEN {start_timestamp} AND {start_timestamp + self.batch_duration} + ''') + updated_count = cursor.fetchone()[0] + cursor.execute(f''' + SELECT COUNT(1) FROM sbosc.deleted_pk_{self.migration_id} + WHERE event_timestamp BETWEEN {start_timestamp} AND {start_timestamp + self.batch_duration} + ''') + deleted_count = cursor.fetchone()[0] + + max_event_count = max(inserted_count, updated_count, deleted_count) + if max_event_count > config.PK_SET_MAX_SIZE: + self.batch_duration //= 2 + self.logger.warning(f"Batch is too large, reducing duration to {self.batch_duration} seconds") + elif max_event_count == 0: + 
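+ # No events fell inside the current window: widen it (doubled below) and probe again. The branch above + # halves batch_duration whenever a window would exceed PK_SET_MAX_SIZE, so the loop settles on a window + # whose events fit into the Redis PK sets.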
self.batch_duration *= 2 + self.logger.warning( + f"No events found in timestamp range. Batch duration increased to {self.batch_duration} seconds") + else: + found_end_timestamp = True + + return start_timestamp + self.batch_duration + + def get_pk_batch(self, start_timestamp, end_timestamp) -> (set, set, int): + updated_pks = set() + removed_pks = set() + + self.logger.info( + f"Loading events from database. Start timestamp: {start_timestamp}, end timestamp: {end_timestamp}") + with self.db.cursor() as cursor: + cursor: Cursor + + # Updated pks + cursor.execute(f''' + SELECT updated_pks.source_pk, updated_pks.event_timestamp FROM ( + SELECT source_pk, event_timestamp FROM sbosc.inserted_pk_{self.migration_id} + WHERE event_timestamp BETWEEN {start_timestamp} AND {end_timestamp} + UNION + SELECT source_pk, event_timestamp FROM sbosc.updated_pk_{self.migration_id} + WHERE event_timestamp BETWEEN {start_timestamp} AND {end_timestamp} + ) AS updated_pks + ''') + max_timestamp = start_timestamp + for source_pk, event_timestamp in cursor.fetchall(): + if event_timestamp > max_timestamp: + max_timestamp = event_timestamp + updated_pks.add(source_pk) + + # Removed pks + cursor.execute(f''' + SELECT source_pk, event_timestamp FROM sbosc.deleted_pk_{self.migration_id} + WHERE event_timestamp BETWEEN {start_timestamp} AND {end_timestamp} + ''') + for source_pk, event_timestamp in cursor.fetchall(): + if event_timestamp > max_timestamp: + max_timestamp = event_timestamp + removed_pks.add(source_pk) + + return updated_pks, removed_pks, max_timestamp + + def load_events_from_db(self): + updated_pk_set = self.redis_data.updated_pk_set + removed_pk_set = self.redis_data.removed_pk_set + if self.event_set_length < config.PK_SET_MAX_SIZE: + # Load events from database + + start_timestamp = self.get_start_timestamp() + max_timestamp = self.get_max_timestamp() + if start_timestamp == 0 or start_timestamp > max_timestamp: + self.logger.info("No events to load") + if self.last_loaded_timestamp == 1: + # Set last loaded timestamp to the initial event handler timestamp. + # Updating it here lets the eventhandler move on to the next stage, + # and prevents it from moving on too early, before any events have been loaded. + with self.db.cursor(role='reader') as cursor: + cursor: Cursor + cursor.execute(''' + SELECT last_event_timestamp FROM sbosc.event_handler_status + WHERE migration_id = %s ORDER BY id LIMIT 1 + ''', (self.migration_id,)) + if cursor.rowcount > 0: + self.last_loaded_timestamp = cursor.fetchone()[0] + + time.sleep(10) + return + + next_timestamp = self.get_end_timestamp(start_timestamp) + if start_timestamp == next_timestamp: + self.logger.warning("Batch duration is 0. Please increase PK_SET_MAX_SIZE") + time.sleep(10) + return + + updated_pks, removed_pks, max_timestamp = self.get_pk_batch(start_timestamp, next_timestamp) + updated_pk_set.add(updated_pks - removed_pks) + updated_pk_set.remove(removed_pks) + removed_pk_set.add(removed_pks) + # Save last loaded event timestamp + with self.db.cursor() as cursor: + cursor: Cursor + cursor.execute(''' + INSERT INTO sbosc.apply_dml_events_status (migration_id, last_loaded_timestamp, created_at) + VALUES (%s, %s, NOW()) + ''', (self.migration_id, max_timestamp)) + self.last_loaded_timestamp = max_timestamp + self.logger.info(f"Loaded events from database. 
Last loaded timestamp: {self.last_loaded_timestamp}") diff --git a/src/sbosc/eventhandler/main.py b/src/sbosc/eventhandler/main.py new file mode 100644 index 0000000..c3cc631 --- /dev/null +++ b/src/sbosc/eventhandler/main.py @@ -0,0 +1,5 @@ +from sbosc.component import start_component +from sbosc.eventhandler import EventHandler + +if __name__ == '__main__': + start_component(EventHandler) diff --git a/src/sbosc/exceptions.py b/src/sbosc/exceptions.py new file mode 100644 index 0000000..1f7168b --- /dev/null +++ b/src/sbosc/exceptions.py @@ -0,0 +1,3 @@ +class StopFlagSet(Exception): + """Raised when a stop flag is set.""" + pass diff --git a/src/sbosc/monitor/__init__.py b/src/sbosc/monitor/__init__.py new file mode 100644 index 0000000..74c9c5f --- /dev/null +++ b/src/sbosc/monitor/__init__.py @@ -0,0 +1,3 @@ +from sbosc.monitor.monitor import MetricMonitor + +MetricMonitor = MetricMonitor diff --git a/src/sbosc/monitor/main.py b/src/sbosc/monitor/main.py new file mode 100644 index 0000000..043127f --- /dev/null +++ b/src/sbosc/monitor/main.py @@ -0,0 +1,5 @@ +from sbosc.component import start_component +from sbosc.monitor import MetricMonitor + +if __name__ == '__main__': + start_component(MetricMonitor) diff --git a/src/sbosc/monitor/monitor.py b/src/sbosc/monitor/monitor.py new file mode 100644 index 0000000..5616846 --- /dev/null +++ b/src/sbosc/monitor/monitor.py @@ -0,0 +1,344 @@ +import time +from datetime import datetime, timedelta +from typing import Tuple + +from MySQLdb.cursors import DictCursor, Cursor +from prometheus_client import Gauge, Counter, CollectorRegistry, start_http_server + +from config import config +from sbosc.component import SBOSCComponent +from sbosc.const import Stage + +from config import env +from modules.aws import CloudWatch + + +class PrometheusMetricSender: + GAUGE = "gauge" + COUNTER = "counter" + + METRIC_CLASSES = { + GAUGE: Gauge, + COUNTER: Counter + } + + def __init__(self, metrics=None, label_keys=None): + self.registry = CollectorRegistry() + + self.metric_list = metrics + self.label_keys = label_keys + self.labels = {} + + self._metrics = self._create_metrics() + + def _create_metrics(self): + return { + metric_name: self.METRIC_CLASSES.get(metric_type)( + metric_name, description, self.label_keys, registry=self.registry) + for metric_name, (description, metric_type) in self.metric_list.items() + } + + def set_labels(self, labels): + self.labels = labels + + def submit(self, metric_name, value, labels=None): + self._metrics[metric_name].labels(**{**self.labels, **(labels or {})}).set(value) + + def start(self): + start_http_server(9156, registry=self.registry) + + def reset(self): + for metric in self._metrics.values(): + self.registry.unregister(metric) + self._metrics = self._create_metrics() + + +class MetricMonitor(SBOSCComponent): + def __init__(self): + super().__init__() + self.metric_sender = PrometheusMetricSender(metrics={ + "sb_osc_worker_batch_size": ("Batch size of worker", PrometheusMetricSender.GAUGE), + "sb_osc_worker_thread_count": ("Thread count of worker", PrometheusMetricSender.GAUGE), + "sb_osc_worker_commit_interval": ("Commit interval of worker", PrometheusMetricSender.GAUGE), + "sb_osc_last_event_timestamp": ("Timestamp of last event read from binlog", PrometheusMetricSender.GAUGE), + "sb_osc_last_loaded_timestamp": + ("Timestamp of last loaded event by EventLoader", PrometheusMetricSender.GAUGE), + "sb_osc_last_catchup_timestamp": + ("Last timestamp to reach the end of binlog", PrometheusMetricSender.GAUGE), + 
"sb_osc_remaining_binlog_size": ("Remaining binlog size", PrometheusMetricSender.GAUGE), + "sb_osc_average_insert_rate": ("Number of rows inserted per second", PrometheusMetricSender.GAUGE), + "sb_osc_bulk_import_progress": ("Progress of bulk import", PrometheusMetricSender.GAUGE), + "sb_osc_updated_pk_set_length": ("Length of updated PK set", PrometheusMetricSender.GAUGE), + "sb_osc_removed_pk_set_length": ("Length of removed PK set", PrometheusMetricSender.GAUGE), + "sb_osc_unmatched_rows": ("Number of unmatched row count", PrometheusMetricSender.GAUGE), + }, label_keys=["dbclusteridentifier", "sbregion", "migration_id"]) + self.metric_sender.set_labels(labels={ + "dbclusteridentifier": config.SOURCE_CLUSTER_ID, + "migration_id": self.migration_id + }) + + # Worker config + self.previous_batch_size = self.redis_data.worker_config.batch_size + self.previous_thread_count = self.redis_data.worker_config.thread_count + self.optimal_batch_size = None + self.optimal_thread_count = None + self.optimal_value_use_count = 0 + + # Worker metric + self.previous_insert_rate = 0 + + # CloudWatch + self.cw = CloudWatch() + + def start(self): + self.logger.info("Metric monitor started") + self.metric_sender.start() + while not self.stop_flag: + current_stage = self.redis_data.current_stage + if current_stage < Stage.DONE: + if current_stage == Stage.BULK_IMPORT or Stage.APPLY_DML_EVENTS <= current_stage: + self.update_worker_config() + self.check_migration_status() + + time.sleep(60) + + self.logger.info("Metric monitor stopped") + + def get_writer_cpu(self, dest_writer_id): + datapoints = self.cw.get_instance_cpu_usages( + instance_id=dest_writer_id, + start_time=datetime.utcnow() - timedelta(minutes=3), + end_time=datetime.utcnow(), + statistics="Maximum" + ) + if datapoints: + writer_cpu = sum([datapoint['Maximum'] for datapoint in datapoints]) / len(datapoints) + else: + with self.db.cursor(DictCursor, host='dest') as cursor: + cursor: DictCursor + cursor.execute(''' + SELECT CPU AS writer_cpu FROM information_schema.REPLICA_HOST_STATUS + WHERE SESSION_ID = 'MASTER_SESSION_ID' + ''') + writer_cpu = cursor.fetchone()['writer_cpu'] + return writer_cpu + + def get_write_latency(self, dest_writer_id): + current_datetime = datetime.utcnow() + datapoints = self.cw.get_rds_instance_metrics( + instance_id=dest_writer_id, + start_time=current_datetime - timedelta(minutes=5), + end_time=current_datetime, + metric_name="WriteLatency", + statistics="Average", + unit="Seconds", + )['Datapoints'] + if datapoints: + return sum([datapoint['Average'] * 1000 for datapoint in datapoints]) / len(datapoints) + else: + self.logger.warning("No datapoints for WriteLatency") + return 0 + + def get_optimal_worker_config(self, current_batch_size, current_thread_count, average_insert_rate) \ + -> Tuple[int, int]: + # Return optimal values if they are already calculated + if self.optimal_batch_size is not None and self.optimal_thread_count is not None: + next_batch_size = self.optimal_batch_size + next_thread_count = min( + current_thread_count + config.THREAD_COUNT_STEP_SIZE, + self.optimal_thread_count + ) + + # Reset optimal values if they are used for a while + self.optimal_value_use_count += 1 + if self.optimal_value_use_count >= config.OPTIMAL_VALUE_USE_LIMIT: + next_batch_size = current_batch_size + config.BATCH_SIZE_STEP_SIZE + self.optimal_batch_size = None + self.optimal_thread_count = None + self.optimal_value_use_count = 0 + else: + if self.previous_insert_rate == 0: + next_batch_size = current_batch_size + 
config.BATCH_SIZE_STEP_SIZE + next_thread_count = current_thread_count + + elif current_batch_size > self.previous_batch_size: + # Increase batch size until insert rate stops increasing + if average_insert_rate > self.previous_insert_rate: + next_batch_size = current_batch_size + config.BATCH_SIZE_STEP_SIZE + next_thread_count = current_thread_count + else: + next_batch_size = self.previous_batch_size + next_thread_count = current_thread_count + config.THREAD_COUNT_STEP_SIZE + elif current_thread_count > self.previous_thread_count or \ + current_thread_count == config.MAX_THREAD_COUNT: + # Increase thread count until insert rate stops increasing + if average_insert_rate > self.previous_insert_rate: + next_batch_size = current_batch_size + config.BATCH_SIZE_STEP_SIZE + next_thread_count = current_thread_count + else: + self.optimal_batch_size = current_batch_size + self.optimal_thread_count = self.previous_thread_count + next_batch_size = current_batch_size + next_thread_count = self.previous_thread_count + elif current_batch_size == config.MAX_BATCH_SIZE: + next_batch_size = current_batch_size + next_thread_count = current_thread_count + config.THREAD_COUNT_STEP_SIZE + + else: + next_batch_size = current_batch_size + next_thread_count = current_thread_count + + # Save current values + self.previous_insert_rate = average_insert_rate + self.previous_batch_size = current_batch_size + self.previous_thread_count = current_thread_count + + next_batch_size = min(next_batch_size, config.MAX_BATCH_SIZE) + next_thread_count = min(next_thread_count, config.MAX_THREAD_COUNT) + + return next_batch_size, next_thread_count + + def submit_worker_metrics( + self, current_batch_size, current_thread_count, commit_interval, average_insert_rate): + self.metric_sender.submit('sb_osc_worker_batch_size', current_batch_size) + self.metric_sender.submit('sb_osc_worker_thread_count', current_thread_count) + self.metric_sender.submit('sb_osc_worker_commit_interval', commit_interval) + self.metric_sender.submit('sb_osc_average_insert_rate', average_insert_rate) + + def update_worker_config(self): + # Get writer metrics + dest_writer_id = self.db.get_instance_id(host='dest', role='writer') + writer_cpu = self.get_writer_cpu(dest_writer_id) + write_latency = self.get_write_latency(dest_writer_id) + self.logger.info(f"Writer CPU: {writer_cpu}%, Write Latency: {write_latency}ms") + + # Calculate average insert rate + worker_metrics = self.redis_data.get_all_worker_metrics() + if worker_metrics: + average_insert_rate = sum([m.average_insert_rate for m in worker_metrics]) / len(worker_metrics) + else: + average_insert_rate = 0 + + # Get current worker config + worker_config = self.redis_data.worker_config + next_batch_size = current_batch_size = worker_config.batch_size + next_thread_count = current_thread_count = worker_config.thread_count + commit_interval = worker_config.commit_interval + + if writer_cpu > config.CPU_HARD_THRESHOLD: + self.logger.warning("CPU exceeded hard threshold. Setting thread count to 0.") + next_batch_size = current_batch_size + next_thread_count = 0 + + elif writer_cpu > config.CPU_SOFT_THRESHOLD: + self.logger.warning("CPU exceeded soft threshold. Start decreasing thread count.") + next_batch_size = current_batch_size + next_thread_count = max(config.MIN_THREAD_COUNT, current_thread_count // 2) + + elif write_latency > config.LATENCY_HARD_THRESHOLD: + self.logger.warning("Latency exceeded hard threshold. 
Setting thread count to 0.") + next_batch_size = config.MIN_BATCH_SIZE + next_thread_count = 0 + + elif write_latency > config.LATENCY_SOFT_THRESHOLD: + self.logger.warning("Latency exceeded soft threshold. Start decreasing batch size.") + next_batch_size = max(config.MIN_BATCH_SIZE, current_batch_size // 2) + next_thread_count = current_thread_count + + elif writer_cpu < config.CPU_SOFT_THRESHOLD and \ + write_latency < config.LATENCY_SOFT_THRESHOLD and current_thread_count == 0: + self.logger.info("Writer became stable. Restoring thread count to MIN_THREAD_COUNT.") + next_batch_size = current_batch_size + next_thread_count = config.MIN_THREAD_COUNT + + elif average_insert_rate > 0 and self.redis_data.current_stage == Stage.BULK_IMPORT: + next_batch_size, next_thread_count = self.get_optimal_worker_config( + current_batch_size, current_thread_count, average_insert_rate * current_thread_count) + + self.submit_worker_metrics( + current_batch_size, current_thread_count, commit_interval, average_insert_rate) + self.logger.info("Submitted worker metrics") + + if next_batch_size != current_batch_size or next_thread_count != current_thread_count: + self.redis_data.worker_config.set({ + 'batch_size': next_batch_size, + 'thread_count': next_thread_count, + 'commit_interval': commit_interval, + 'revision': worker_config.revision + 1 + }) + + # Unlike batch_size, thread_count requires some time to take effect + if next_thread_count != current_thread_count: + time.sleep(60) + + def submit_event_handler_timestamps(self): + # last_event_timestamp, last_loaded_timestamp, last_catchup_timestamp + with self.db.cursor() as cursor: + cursor: Cursor + cursor.execute(''' + SELECT MIN(last_event_timestamp), MAX(last_event_timestamp) + FROM sbosc.event_handler_status WHERE migration_id = %s AND last_event_timestamp > 1 + ''' % self.migration_id) + start_timestamp, last_timestamp = cursor.fetchone() + if start_timestamp: + last_event_timestamp = last_timestamp - start_timestamp + self.metric_sender.submit('sb_osc_last_event_timestamp', last_event_timestamp) + + cursor.execute(''' + SELECT last_loaded_timestamp FROM sbosc.apply_dml_events_status + WHERE migration_id = %s ORDER BY id DESC LIMIT 1 + ''' % self.migration_id) + if cursor.rowcount > 0: + last_loaded_timestamp = cursor.fetchone()[0] - start_timestamp + self.metric_sender.submit('sb_osc_last_loaded_timestamp', last_loaded_timestamp) + + last_catchup_timestamp = self.redis_data.last_catchup_timestamp - start_timestamp + if last_catchup_timestamp > 0: + self.metric_sender.submit('sb_osc_last_catchup_timestamp', last_catchup_timestamp) + + def check_migration_status(self): + # bulk_import_progress + inserted_rows = 0 + for chunk_id in self.redis_data.chunk_set: + chunk_info = self.redis_data.get_chunk_info(chunk_id) + last_pk_inserted = chunk_info.last_pk_inserted + + # last_pk_inserted is initialized as batch_start_pk - 1 + if last_pk_inserted and last_pk_inserted >= chunk_info.start_pk: + inserted_rows += last_pk_inserted - chunk_info.start_pk + + if self.redis_data.metadata.max_id: + bulk_import_progress = inserted_rows / self.redis_data.metadata.max_id * 100 + self.metric_sender.submit('sb_osc_bulk_import_progress', bulk_import_progress) + + self.submit_event_handler_timestamps() + + # remaining_binlog_size + remaining_binlog_size = 0 + if time.time() - self.redis_data.last_catchup_timestamp > 2: + with self.db.cursor() as cursor: + cursor.execute(''' + SELECT log_file, log_pos FROM sbosc.event_handler_status + WHERE migration_id = %s ORDER BY id DESC 
LIMIT 1 + ''' % self.migration_id) + if cursor.rowcount > 0: + log_file, log_pos = cursor.fetchone() + remaining_binlog_size = 0 + cursor.execute("SHOW BINARY LOGS") + for log_name, file_size in cursor.fetchall(): + if log_name >= log_file: + remaining_binlog_size += file_size + remaining_binlog_size -= log_pos + self.metric_sender.submit('sb_osc_remaining_binlog_size', remaining_binlog_size) + + # updated_pk_set, removed_pk_set + updated_pk_set_len = len(self.redis_data.updated_pk_set) + removed_pk_set_len = len(self.redis_data.removed_pk_set) + self.metric_sender.submit('sb_osc_updated_pk_set_length', updated_pk_set_len) + self.metric_sender.submit('sb_osc_removed_pk_set_length', removed_pk_set_len) + + # unmatched_pks + with self.db.cursor() as cursor: + cursor.execute("SELECT COUNT(1) FROM sbosc.unmatched_rows WHERE migration_id = %s" % self.migration_id) + unmatched_pks = cursor.fetchone()[0] + self.metric_sender.submit('sb_osc_unmatched_rows', unmatched_pks) diff --git a/src/sbosc/operations/__init__.py b/src/sbosc/operations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/sbosc/operations/base.py b/src/sbosc/operations/base.py new file mode 100644 index 0000000..805fa0b --- /dev/null +++ b/src/sbosc/operations/base.py @@ -0,0 +1,281 @@ +import pandas as pd +from MySQLdb.cursors import Cursor + +from sbosc.operations.operation import MigrationOperation +import sbosc.operations.utils as operation_utils + + +class BaseOperation(MigrationOperation): + def _insert_batch_query(self, start_pk, end_pk): + return f''' + INSERT INTO {self.source_db}.{self.destination_table}({self.source_columns}) + SELECT {self.source_columns} + FROM {self.source_db}.{self.source_table} + WHERE id BETWEEN {start_pk} AND {end_pk} + ''' + + def insert_batch(self, db, start_pk, end_pk, upsert=False, limit=None): + query = self._insert_batch_query(start_pk, end_pk) + if limit: + query = operation_utils.apply_limit(query, limit) + if upsert: + query = operation_utils.insert_to_upsert(query, self.source_column_list) + with db.cursor() as cursor: + cursor.execute(query) + return cursor + + def apply_update(self, db, updated_pks): + with db.cursor() as cursor: + updated_pks_str = ",".join([str(pk) for pk in updated_pks]) + query = f''' + INSERT INTO {self.source_db}.{self.destination_table}({self.source_columns}) + SELECT {self.source_columns} + FROM {self.source_db}.{self.source_table} + WHERE id IN ({updated_pks_str}) + ''' + query = operation_utils.insert_to_upsert(query, self.source_column_list) + cursor.execute(query) + return cursor + + def _get_not_imported_pks_query(self, start_pk, end_pk): + return f''' + SELECT source.id FROM {self.source_db}.{self.source_table} AS source + LEFT JOIN {self.source_db}.{self.destination_table} AS dest ON source.id = dest.id + WHERE source.id BETWEEN {start_pk} AND {end_pk} + AND dest.id IS NULL + ''' + + def get_not_imported_pks(self, source_cursor, dest_cursor, start_pk, end_pk): + not_imported_pks = [] + query = self._get_not_imported_pks_query(start_pk, end_pk) + source_cursor.execute(query) + if source_cursor.rowcount > 0: + not_imported_pks = [row[0] for row in source_cursor.fetchall()] + return not_imported_pks + + def get_not_inserted_pks(self, source_cursor, dest_cursor, start_timestamp, end_timestamp): + not_inserted_pks = [] + event_pks = self._get_event_pks(source_cursor, 'insert', start_timestamp, end_timestamp) + if event_pks: + source_cursor.execute(f''' + SELECT source.id FROM {self.source_db}.{self.source_table} AS source + LEFT JOIN 
{self.source_db}.{self.destination_table} AS dest ON source.id = dest.id + WHERE source.id IN ({event_pks}) + AND dest.id IS NULL + ''') + not_inserted_pks = [row[0] for row in source_cursor.fetchall()] + return not_inserted_pks + + def get_not_updated_pks(self, source_cursor, dest_cursor, start_timestamp, end_timestamp): + not_updated_pks = [] + event_pks = self._get_event_pks(source_cursor, 'update', start_timestamp, end_timestamp) + if event_pks: + source_cursor.execute(f''' + SELECT combined.id + FROM ( + SELECT {self.source_columns}, 'source' AS table_type + FROM {self.source_db}.{self.source_table} + WHERE id IN ({event_pks}) + UNION ALL + SELECT {self.source_columns}, 'destination' AS table_type + FROM {self.source_db}.{self.destination_table} + WHERE id IN ({event_pks}) + ) AS combined + GROUP BY {self.source_columns} + HAVING COUNT(1) = 1 AND SUM(table_type = 'source') = 1 + ''') + not_updated_pks = [row[0] for row in source_cursor.fetchall()] + return not_updated_pks + + def get_rematched_updated_pks(self, db, not_updated_pks): + not_updated_pks_str = ','.join([str(pk) for pk in not_updated_pks]) + with db.cursor() as cursor: + cursor: Cursor + cursor.execute(f''' + SELECT combined.id FROM ( + SELECT {self.source_columns} FROM {self.source_db}.{self.source_table} + WHERE id IN ({not_updated_pks_str}) UNION ALL + SELECT {self.source_columns} FROM {self.source_db}.{self.destination_table} + WHERE id IN ({not_updated_pks_str}) + ) AS combined GROUP BY {self.source_columns} HAVING COUNT(*) = 2 + ''') + rematched_pks = set([row[0] for row in cursor.fetchall()]) + # add deleted pks + cursor.execute(f''' + SELECT id FROM {self.source_db}.{self.source_table} WHERE id IN ({not_updated_pks_str}) + ''') + remaining_pks = set([row[0] for row in cursor.fetchall()]) + deleted_pks = not_updated_pks - remaining_pks + return rematched_pks | deleted_pks + + def get_rematched_removed_pks(self, db, not_removed_pks): + not_removed_pks_str = ','.join([str(pk) for pk in not_removed_pks]) + with db.cursor() as cursor: + cursor: Cursor + cursor.execute(f''' + SELECT source_pk FROM sbosc.unmatched_rows WHERE source_pk NOT IN ( + SELECT id FROM {self.source_db}.{self.destination_table} + WHERE id IN ({not_removed_pks_str}) + ) AND source_pk IN ({not_removed_pks_str}) + ''') + rematched_pks = set([row[0] for row in cursor.fetchall()]) + # add reinserted pks + cursor.execute(f''' + SELECT id FROM {self.source_db}.{self.source_table} WHERE id IN ({not_removed_pks_str}) + ''') + reinserted_pks = set([row[0] for row in cursor.fetchall()]) + return rematched_pks | reinserted_pks + + +class CrossClusterBaseOperation(MigrationOperation): + def _select_batch_query(self, start_pk, end_pk): + return f''' + SELECT {self.source_columns} + FROM {self.source_db}.{self.source_table} + WHERE id BETWEEN {start_pk} AND {end_pk} + ''' + + def insert_batch(self, db, start_pk, end_pk, upsert=False, limit=None): + select_batch_query = self._select_batch_query(start_pk, end_pk) + if limit: + select_batch_query = operation_utils.apply_limit(select_batch_query, limit) + with db.cursor(host='source', role='reader') as cursor: + cursor.execute(select_batch_query) + rows = cursor.fetchall() + if rows: + insert_batch_query = f''' + INSERT INTO {self.destination_db}.{self.destination_table}({self.source_columns}) + VALUES ({','.join(['%s'] * len(self.source_column_list))}) + ''' + if upsert: + insert_batch_query = operation_utils.insert_to_upsert(insert_batch_query, self.source_column_list) + with db.cursor(host='dest', role='writer') 
as cursor: + cursor.executemany(insert_batch_query, rows) + return cursor + else: + return cursor + + def apply_update(self, db, updated_pks): + with db.cursor(host='source', role='reader') as cursor: + updated_pks_str = ",".join([str(pk) for pk in updated_pks]) + cursor: Cursor + cursor.execute(f''' + SELECT {self.source_columns} FROM {self.source_db}.{self.source_table} + WHERE id IN ({updated_pks_str}) + ''') + rows = cursor.fetchall() + if rows: + with db.cursor(host='dest', role='writer') as cursor: + query = f''' + INSERT INTO {self.destination_db}.{self.destination_table}({self.source_columns}) + VALUES ({','.join(['%s'] * len(self.source_column_list))}) + ''' + query = operation_utils.insert_to_upsert(query, self.source_column_list) + cursor.executemany(query, rows) + return cursor + else: + return cursor + + def get_not_imported_pks(self, source_cursor, dest_cursor, start_pk, end_pk): + source_cursor.execute(f''' + SELECT id FROM {self.source_db}.{self.source_table} + WHERE id BETWEEN {start_pk} AND {end_pk} + ''') + source_pks = [row[0] for row in source_cursor.fetchall()] + dest_cursor.execute(f''' + SELECT id FROM {self.destination_db}.{self.destination_table} + WHERE id BETWEEN {start_pk} AND {end_pk} + ''') + dest_pks = [row[0] for row in dest_cursor.fetchall()] + return list(set(source_pks) - set(dest_pks)) + + def get_not_inserted_pks(self, source_cursor, dest_cursor, start_timestamp, end_timestamp): + not_inserted_pks = [] + event_pks = self._get_event_pks(source_cursor, 'insert', start_timestamp, end_timestamp) + if event_pks: + source_cursor.execute(f"SELECT id FROM {self.source_db}.{self.source_table} WHERE id IN ({event_pks})") + source_pks = [row[0] for row in source_cursor.fetchall()] + dest_cursor.execute( + f"SELECT id FROM {self.destination_db}.{self.destination_table} WHERE id IN ({event_pks})") + dest_pks = [row[0] for row in dest_cursor.fetchall()] + not_inserted_pks = list(set(source_pks) - set(dest_pks)) + return not_inserted_pks + + def get_not_updated_pks(self, source_cursor, dest_cursor, start_timestamp, end_timestamp): + not_updated_pks = [] + event_pks = self._get_event_pks(source_cursor, 'update', start_timestamp, end_timestamp) + if event_pks: + source_cursor.execute(f''' + SELECT {self.source_columns} FROM {self.source_db}.{self.source_table} + WHERE id IN ({event_pks}) + ''') + source_df = pd.DataFrame(source_cursor.fetchall(), columns=[c[0] for c in source_cursor.description]) + dest_cursor.execute(f''' + SELECT {self.source_columns} FROM {self.destination_db}.{self.destination_table} + WHERE id IN ({event_pks}) + ''') + dest_df = pd.DataFrame(dest_cursor.fetchall(), columns=[c[0] for c in dest_cursor.description]) + + dest_df = dest_df[source_df.columns] + source_df.set_index('id', inplace=True) + dest_df.set_index('id', inplace=True) + common_index = dest_df.index.intersection(source_df.index) + source_df = source_df.loc[common_index] + dest_df = dest_df.loc[common_index] + + comparison_df = source_df.compare(dest_df).dropna(how='all') + not_updated_pks = comparison_df.index.unique().tolist() + return not_updated_pks + + def get_rematched_updated_pks(self, db, not_updated_pks): + not_updated_pks_str = ','.join([str(pk) for pk in not_updated_pks]) + with db.cursor(host='source', role='reader') as cursor: + cursor: Cursor + cursor.execute(f''' + SELECT {self.source_columns} FROM {self.source_db}.{self.source_table} + WHERE id IN ({not_updated_pks_str}) + ''') + source_df = pd.DataFrame(cursor.fetchall(), columns=[c[0] for c in cursor.description]) + 
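+ # The same PKs are then fetched from the destination and compared row by row with a pandas inner merge + # on every column; rows that now match on both sides are treated as rematched, and PKs that have since + # been deleted from the source are also returned as rematched.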
with db.cursor(host='dest', role='reader') as cursor: + cursor: Cursor + cursor.execute(f''' + SELECT {self.source_columns} FROM {self.destination_db}.{self.destination_table} + WHERE id IN ({not_updated_pks_str}) + ''') + dest_df = pd.DataFrame(cursor.fetchall(), columns=[c[0] for c in cursor.description]) + + dest_df = dest_df.astype(source_df.dtypes.to_dict()) + merged_df = source_df.merge(dest_df, how='inner', on=source_df.columns.tolist(), indicator=True) + rematched_pks = set(merged_df[merged_df['_merge'] == 'both']['id'].tolist()) + # add deleted pks + with db.cursor(host='source', role='reader') as cursor: + cursor.execute(f''' + SELECT id FROM {self.source_db}.{self.source_table} WHERE id IN ({not_updated_pks_str}) + ''') + remaining_pks = set([row[0] for row in cursor.fetchall()]) + deleted_pks = not_updated_pks - remaining_pks + return rematched_pks | deleted_pks + + def get_rematched_removed_pks(self, db, not_removed_pks): + not_removed_pks_str = ','.join([str(pk) for pk in not_removed_pks]) + with db.cursor(host='dest', role='reader') as cursor: + cursor: Cursor + cursor.execute( + f"SELECT id FROM {self.destination_db}.{self.destination_table} WHERE id IN ({not_removed_pks_str})") + still_not_removed_pks_str = ','.join([str(row[0]) for row in cursor.fetchall()]) + with db.cursor(host='source', role='reader') as cursor: + cursor: Cursor + query = f''' + SELECT source_pk FROM sbosc.unmatched_rows + WHERE source_pk IN ({not_removed_pks_str}) + ''' + if still_not_removed_pks_str: + query += f" AND source_pk NOT IN ({still_not_removed_pks_str})" + cursor.execute(query) + rematched_pks = set([row[0] for row in cursor.fetchall()]) + # add reinserted pks + cursor.execute(f''' + SELECT id FROM {self.source_db}.{self.source_table} WHERE id IN ({not_removed_pks_str}) + ''') + reinserted_pks = set([row[0] for row in cursor.fetchall()]) + return rematched_pks | reinserted_pks diff --git a/src/sbosc/operations/cases/__init__.py b/src/sbosc/operations/cases/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/sbosc/operations/operation.py b/src/sbosc/operations/operation.py new file mode 100644 index 0000000..1717440 --- /dev/null +++ b/src/sbosc/operations/operation.py @@ -0,0 +1,101 @@ +from abc import abstractmethod +from contextlib import contextmanager +from typing import Literal + +from MySQLdb.cursors import Cursor + +from modules.db import Database +from modules.redis import RedisData + + +class MigrationOperation: + """Abstract class for migration operations.""" + + def __init__(self, migration_id): + self.migration_id = migration_id + self.redis_data = RedisData(migration_id) + + metadata = self.redis_data.metadata + self.source_db = metadata.source_db + self.source_table = metadata.source_table + self.destination_db = metadata.destination_db + self.destination_table = metadata.destination_table + self.source_columns: str = metadata.source_columns + self.source_column_list: list = metadata.source_columns.split(',') + self.start_datetime = metadata.start_datetime + + @abstractmethod + def insert_batch(self, db: Database, start_pk: int, end_pk: int, upsert=False, limit=None) -> Cursor: + """ + Executes a query to insert a batch of records from the source table into the destination table. + Used by the worker to insert a batch of records into the destination table. + """ + pass + + @abstractmethod + def apply_update(self, db: Database, updated_pks: list) -> Cursor: + """ + Executes a query to apply DML update (insert) events to the destination table. 
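+        The concrete operations in this repository apply these events as upserts
+        (INSERT ... ON DUPLICATE KEY UPDATE), so a row that already exists in the
+        destination is updated in place rather than rejected.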
+ """ + pass + + @abstractmethod + def get_not_imported_pks(self, source_cursor: Cursor, dest_cursor: Cursor, start_pk: int, end_pk: int) -> list: + """ + Returns a list of primary keys that have not been imported into the destination table. + Used in BULK_IMPORT_VALIDATION stage to validate that all records have been imported. + """ + pass + + def _get_event_pks( + self, cursor: Cursor, event_type: Literal['insert', 'update'], start_timestamp, end_timestamp): + table_names = { + 'insert': f'inserted_pk_{self.migration_id}', + 'update': f'updated_pk_{self.migration_id}' + } + cursor.execute(f''' + SELECT source_pk FROM sbosc.{table_names[event_type]} + WHERE event_timestamp BETWEEN {start_timestamp} AND {end_timestamp} + ''') + return ','.join([str(row[0]) for row in cursor.fetchall()]) + + @abstractmethod + def get_not_inserted_pks(self, source_cursor: Cursor, dest_cursor: Cursor, start_timestamp, end_timestamp): + """ + Returns a list of primary keys that have not been inserted into the destination table. + Used in APPLY_DML_EVENTS_VALIDATION stage to validate that all inserts have been applied. + """ + pass + + @abstractmethod + def get_not_updated_pks(self, source_cursor: Cursor, dest_cursor: Cursor, start_timestamp, end_timestamp): + """ + Returns a list of primary keys that have not been updated in the destination table. + Used in APPLY_DML_EVENTS_VALIDATION stage to validate that all updates have been applied. + """ + pass + + @abstractmethod + def get_rematched_updated_pks(self, db: Database, not_updated_pks: set) -> set: + """ + Returns a list of primary keys that have been updated in the destination table after first unmatch. + Used in APPLY_DML_EVENTS_VALIDATION stage to validate unmatched pks. + """ + pass + + @abstractmethod + def get_rematched_removed_pks(self, db: Database, not_removed_pks: set) -> set: + """ + Returns a list of primary keys that have been updated in the destination table after first unmatch. + Used in APPLY_DML_EVENTS_VALIDATION stage to validate unmatched pks. 
+ """ + pass + + @contextmanager + def override_source_table(self, table_name: str): + """Context manager to override the source table name.""" + source_table = self.source_table + if table_name: + self.source_table = table_name + yield + self.source_table = source_table diff --git a/src/sbosc/operations/utils.py b/src/sbosc/operations/utils.py new file mode 100644 index 0000000..1577e17 --- /dev/null +++ b/src/sbosc/operations/utils.py @@ -0,0 +1,10 @@ + +def insert_to_upsert(query: str, source_columns: list) -> str: + query = query.replace("INSERT INTO", "INSERT IGNORE INTO") + update_clause = ",".join([f'{column} = VALUES({column})' for column in source_columns]) + query += f" ON DUPLICATE KEY UPDATE {update_clause}" + return query + + +def apply_limit(query: str, limit: int) -> str: + return query + f" ORDER BY id LIMIT {limit}" diff --git a/src/sbosc/worker/__init__.py b/src/sbosc/worker/__init__.py new file mode 100644 index 0000000..301d17e --- /dev/null +++ b/src/sbosc/worker/__init__.py @@ -0,0 +1,3 @@ +from sbosc.worker.worker import Worker + +Worker = Worker diff --git a/src/sbosc/worker/main.py b/src/sbosc/worker/main.py new file mode 100644 index 0000000..85d650e --- /dev/null +++ b/src/sbosc/worker/main.py @@ -0,0 +1,5 @@ +from sbosc.component import start_component +from sbosc.worker.manager import WorkerManager + +if __name__ == '__main__': + start_component(WorkerManager) diff --git a/src/sbosc/worker/manager.py b/src/sbosc/worker/manager.py new file mode 100644 index 0000000..a4bb994 --- /dev/null +++ b/src/sbosc/worker/manager.py @@ -0,0 +1,87 @@ +import concurrent.futures +import time + +from config import config +from sbosc.component import SBOSCComponent +from sbosc.const import Stage, WorkerStatus +from sbosc.worker import Worker + + +class WorkerManager(SBOSCComponent): + def __init__(self): + super().__init__() + self.desired_thread_count = 0 + self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=config.MAX_THREAD_COUNT) + self.worker_threads = [] + self.created_threads = 0 + + @property + def thread_count(self): + return len(self.worker_threads) + + def set_stop_flag(self): + self.logger.info("Stopping worker manager...") + self.stop_flag = True + + def start(self): + self.logger.info("Worker manager started") + while not self.stop_flag: + self.check_worker_status() + self.calculate_metrics() + + self.desired_thread_count = self.redis_data.worker_config.thread_count or 0 + self.logger.info( + f"Current thread count: {self.thread_count}, " + f"desired thread count: {self.desired_thread_count}" + ) + if self.thread_count < self.desired_thread_count: + self.add_threads() + elif self.desired_thread_count < self.thread_count: + self.remove_threads() + time.sleep(60) + + # Stop all workers + self.logger.info("Stopping workers...") + self.desired_thread_count = 0 + self.remove_threads() + self.redis_data.worker_metric.delete() + self.logger.info("All workers stopped") + self.logger.info("Worker manager stopped") + + def add_threads(self): + self.logger.info(f"Adding {self.desired_thread_count - self.thread_count} threads") + while self.thread_count < self.desired_thread_count: + self.created_threads += 1 + worker = Worker(f'worker_{self.created_threads}', config, self) + self.worker_threads.append((worker, self.executor.submit(worker.start))) + + def remove_threads(self): + self.logger.info(f"Removing {self.thread_count - self.desired_thread_count} threads") + removed_threads = [] + while self.thread_count > self.desired_thread_count: + worker, thread = 
self.worker_threads.pop() + worker.set_stop_flag() + removed_threads.append(thread) + + # Wait for threads to stop + concurrent.futures.wait(removed_threads, timeout=120) + + def check_worker_status(self): + self.worker_threads = [ + (worker, thread) for worker, thread in self.worker_threads if not thread.done() + ] # Remove finished threads. This will trigger add_threads() on the next loop. + busy_worker_count = len([ + worker for worker, thread in self.worker_threads if worker.status == WorkerStatus.BUSY + ]) + self.logger.info(f"Busy worker count: {busy_worker_count}") + if busy_worker_count == 0 and self.redis_data.current_stage == Stage.BULK_IMPORT \ + and len(self.redis_data.chunk_stack) == 0: + self.redis_data.set_current_stage(Stage.BULK_IMPORT_VALIDATION) + + def calculate_metrics(self): + datapoints = [] + for worker, _ in self.worker_threads: + datapoints += list(worker.datapoints) + if datapoints: + average_insert_rate = sum(datapoints) / len(datapoints) + self.redis_data.worker_metric.average_insert_rate = average_insert_rate diff --git a/src/sbosc/worker/worker.py b/src/sbosc/worker/worker.py new file mode 100644 index 0000000..23a3ef2 --- /dev/null +++ b/src/sbosc/worker/worker.py @@ -0,0 +1,250 @@ +import time +from collections import deque +from typing import Callable, Self + +import MySQLdb +from MySQLdb.cursors import Cursor + +from typing import TYPE_CHECKING + +from config import config +from modules.db import Database +from modules.redis import RedisData +from modules.redis.schema import ChunkInfo +from sbosc.const import WorkerStatus, Stage, ChunkStatus +from sbosc.operations.operation import MigrationOperation + +if TYPE_CHECKING: + from sbosc.worker.manager import WorkerManager + + +class WorkerConfig: + def __init__(self, redis_data: RedisData, datapoints: deque): + self.redis_data = redis_data + self.datapoints = datapoints + self.worker_config_revision = -1 # default revision is 0 + self.batch_size_multiplier = 1 + + # config + self.raw_batch_size = None + self.batch_size = None + self.commit_interval = None + + def update(self): + worker_config = self.redis_data.worker_config + if worker_config.revision > self.worker_config_revision: + self.raw_batch_size = worker_config.batch_size + self.commit_interval = worker_config.commit_interval + self.worker_config_revision = worker_config.revision + self.datapoints.clear() + self.batch_size = int(self.raw_batch_size * self.batch_size_multiplier) + + def update_batch_size_multiplier(self, rowcount): + if rowcount == 0: + self.batch_size_multiplier += 1 + elif self.raw_batch_size == rowcount: + self.batch_size_multiplier = \ + max(1.0, self.batch_size_multiplier * 0.9) + else: + self.batch_size_multiplier = self.batch_size / rowcount + + +class Worker: + def __init__(self, worker_id, manager: 'WorkerManager'): + self.migration_id = manager.migration_id + self.db = Database() + self.redis_data = RedisData(self.migration_id) + self.logger = manager.logger + self.migration_operation: MigrationOperation = config.OPERATION_CLASS(self.migration_id) + self.use_batch_size_multiplier = config.USE_BATCH_SIZE_MULTIPLIER + self.worker_id = worker_id + self.stop_flag = False + self.status = WorkerStatus.IDLE + self.datapoints = deque(maxlen=100) + self.worker_config = WorkerConfig(self.redis_data, self.datapoints) + self.interval = 60 + + self.old_source_table = None + + def set_stop_flag(self): + self.stop_flag = True + + def start(self): + self.logger.info(f"Worker {self.worker_id} started") + while not self.stop_flag: + 
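+            # Each pass checks the current migration stage and adjusts the polling interval:
+            # 1s during bulk import and DML apply, 0.1s while tables are being swapped,
+            # and 60s when there is nothing for this worker to do.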
self.status = WorkerStatus.IDLE + if self.redis_data.current_stage == Stage.BULK_IMPORT: + self.bulk_import() + self.interval = 1 + # For apply DML events, it keeps running until the end of the process + elif Stage.APPLY_DML_EVENTS <= self.redis_data.current_stage < Stage.DONE: + if self.redis_data.current_stage == Stage.SWAP_TABLES: + self.interval = 0.1 + self.old_source_table = self.redis_data.old_source_table + else: + self.interval = 1 + self.apply_dml_events() + else: + self.interval = 60 + + time.sleep(self.interval) + + self.logger.info(f"Worker {self.worker_id} stopped") + + def get_start_pk(self, chunk_info: ChunkInfo): + if chunk_info.status == ChunkStatus.DONE: + return None + elif chunk_info.status == ChunkStatus.NOT_STARTED: + chunk_info.last_pk_inserted = chunk_info.start_pk - 1 + return chunk_info.start_pk + elif chunk_info.status == ChunkStatus.IN_PROGRESS: + return chunk_info.last_pk_inserted + 1 + elif chunk_info.status == ChunkStatus.DUPLICATE_KEY: + max_pk = self.get_max_pk(chunk_info.start_pk, chunk_info.end_pk) + return max_pk + 1 + + def bulk_import(self): + chunk_id = self.redis_data.chunk_stack.pop() + if chunk_id is None: + return + + try: + self.logger.info(f"Worker {self.worker_id} started processing chunk {chunk_id}") + self.status = WorkerStatus.BUSY + + chunk_info = self.redis_data.get_chunk_info(chunk_id) + start_pk = self.get_start_pk(chunk_info) + if start_pk is None: + return + + end_pk = chunk_info.end_pk + chunk_info.status = ChunkStatus.IN_PROGRESS + + self.worker_config.update() + batch_start_pk = start_pk + batch_end_pk = min(batch_start_pk + self.worker_config.batch_size - 1, end_pk) + + while batch_start_pk <= end_pk and not self.stop_flag: + try: + cursor = self.insert_batch(batch_start_pk, batch_end_pk) + except MySQLdb.IntegrityError as e: + # Retry on duplicate key error + self.logger.error(f"Integrity error: {e}. Retrying with upsert query.") + self.insert_batch(batch_start_pk, batch_end_pk, upsert=True) + # Set chunk status to DUPLICATE_KEY + chunk_info.status = ChunkStatus.DUPLICATE_KEY + break + + # update batch size multiplier + if self.use_batch_size_multiplier: + self.worker_config.update_batch_size_multiplier(cursor.rowcount) + + # update last pk inserted + if cursor.rowcount == self.worker_config.raw_batch_size: + print('last row id', cursor.lastrowid) + last_pk_inserted = cursor.lastrowid + else: + last_pk_inserted = batch_end_pk + self.logger.info( + f"Worker {self.worker_id} finished processing batch " + f"{batch_start_pk} - {last_pk_inserted}. 
" + f"Batch size multiplier: {self.worker_config.batch_size_multiplier}" + ) + chunk_info.last_pk_inserted = last_pk_inserted + + # Get next batch + self.worker_config.update() + batch_start_pk = last_pk_inserted + 1 + batch_end_pk = min(batch_start_pk + self.worker_config.batch_size - 1, end_pk) + + except Exception as e: + self.logger.error(e) + + # This part will be executed when the worker is stopped or the chunk is done or exception is raised + chunk_info = self.redis_data.get_chunk_info(chunk_id) + if chunk_info.last_pk_inserted == chunk_info.end_pk: + chunk_info.status = ChunkStatus.DONE + self.logger.info(f"Worker {self.worker_id} finished processing chunk {chunk_id}") + else: + self.redis_data.push_chunk(chunk_id) + self.logger.warning( + f"Worker {self.worker_id} stopped processing chunk {chunk_id} at {chunk_info.last_pk_inserted}") + self.worker_config.batch_size_multiplier = 1 + + def apply_dml_events(self): + try: + updated_pk_set = self.redis_data.updated_pk_set + removed_pk_set = self.redis_data.removed_pk_set + while (len(updated_pk_set) > 0 or len(removed_pk_set) > 0) and not self.stop_flag: + self.status = WorkerStatus.BUSY + + self.worker_config.update() + batch_size = self.worker_config.batch_size + if len(updated_pk_set) > len(removed_pk_set): + updated_pks = updated_pk_set.get(batch_size) + if updated_pks: + self.logger.info( + f"Worker {self.worker_id} started processing {len(updated_pks)} updated events. " + f"Updated pks: {updated_pks}" + ) + self.apply_update(updated_pks) + self.logger.info( + f"Worker {self.worker_id} finished processing {len(updated_pks)} updated events") + else: + removed_pks = removed_pk_set.get(batch_size) + if removed_pks: + self.logger.info( + f"Worker {self.worker_id} started processing {len(removed_pks)} removed events. 
" + f"Removed pks: {removed_pks}") + self.apply_delete(removed_pks) + self.logger.info( + f"Worker {self.worker_id} finished processing {len(removed_pks)} removed events") + + except Exception as e: + self.logger.error(e) + + def get_max_pk(self, start_pk, end_pk): + metadata = self.redis_data.metadata + with self.db.cursor(host='dest') as cursor: + cursor: Cursor + cursor.execute(f''' + SELECT MAX(id) FROM {metadata.destination_db}.{metadata.destination_table} + WHERE id BETWEEN {start_pk} AND {end_pk} + ''') + return cursor.fetchone()[0] + + @staticmethod + def calculate_metrics(func: Callable[..., Cursor]): + def wrapper(self: Self, *args, **kwargs): + t1 = time.time() + cursor = func(self, *args, **kwargs) + time.sleep(self.worker_config.commit_interval) + t2 = time.time() + + insert_rate = cursor.rowcount / (t2 - t1) # rows per second + self.datapoints.append(insert_rate) + return cursor + + return wrapper + + @calculate_metrics + def insert_batch(self, batch_start_pk, batch_end_pk, upsert=False): + limit = self.worker_config.raw_batch_size if self.use_batch_size_multiplier else None + return self.migration_operation.insert_batch(self.db, batch_start_pk, batch_end_pk, upsert, limit) + + @calculate_metrics + def apply_update(self, updated_pks): + with self.migration_operation.override_source_table(self.old_source_table): + return self.migration_operation.apply_update(self.db, updated_pks) + + @calculate_metrics + def apply_delete(self, removed_pks): + with self.db.cursor(host="dest") as cursor: + metadata = self.redis_data.metadata + removed_pks_str = ",".join([str(pk) for pk in removed_pks]) + query = f""" + DELETE FROM {metadata.destination_db}.{metadata.destination_table} + WHERE id IN ({removed_pks_str}) + """ + cursor.execute(query) + return cursor diff --git a/src/utils.py b/src/utils.py new file mode 100644 index 0000000..d1697cc --- /dev/null +++ b/src/utils.py @@ -0,0 +1,24 @@ +import re +from datetime import datetime + + +def from_string(value: str): + if value.isdigit(): + return int(value) + elif value.isdecimal() or value.replace('.', '', 1).isdigit(): + return float(value) + elif re.match(r'\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2}', value): + return datetime.strptime(value, "%Y-%m-%d %H:%M:%S") + elif value == 'True': + return True + elif value == 'False': + return False + else: + return value + + +def to_string(value): + if isinstance(value, datetime): + return value.isoformat(' ', 'seconds') + else: + return str(value) From 7547f7322ab2ae71be6c01f86e21b4334adbc918 Mon Sep 17 00:00:00 2001 From: Jimmy Kim Date: Tue, 2 Apr 2024 16:21:04 +0900 Subject: [PATCH 02/22] tests --- setup.cfg | 17 +++++++++++ tests/configs/config.yaml | 47 +++++++++++++++++++++++++++++ tests/configs/secret.json | 9 ++++++ tests/conftest.py | 63 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 136 insertions(+) create mode 100644 setup.cfg create mode 100644 tests/configs/config.yaml create mode 100644 tests/configs/secret.json create mode 100644 tests/conftest.py diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..9563c91 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,17 @@ +[flake8] +ignore = + # E402 module level import not at top of file + E402 + # W504 Line break occurred after a binary operator + W504 + # W605 invalid escape sequence + W605 +exclude = + # No need to traverse our git directory + .git, + # There's no value in checking cache directories + __pycache__, + # tests + tests +max-line-length = 120 +max-complexity = 15 diff --git a/tests/configs/config.yaml 
b/tests/configs/config.yaml new file mode 100644 index 0000000..c14efa4 --- /dev/null +++ b/tests/configs/config.yaml @@ -0,0 +1,47 @@ +source_writer_endpoint: "127.0.0.1" +source_reader_endpoint: "127.0.0.1" +source_cluster_id: "test" +source_db: "sbosc" +source_table: "source_table" +destination_cluster_id: "test" +destination_db: "sbosc" +destination_table: "destination_table" +min_chunk_size: 100000 +max_chunk_count: 200 +auto_swap: False +preferred_window: "00:00-23:59" +skip_bulk_import: False +operation_class: BaseOperation +indexes: [] +index_created_per_query: 4 + +# Worker config +min_batch_size: 100 +batch_size_step_size: 100 +max_batch_size: 10000 +min_thread_count: 4 +thread_count_step_size: 4 +max_thread_count: 64 +commit_interval: 0.01 +optimal_value_use_limit: 10 +use_batch_size_multiplier: False + +# EventHandler config +event_handler_thread_count: 4 +event_handler_thread_timeout: 300 + +# Threshold +cpu_soft_threshold: 70 +cpu_hard_threshold: 90 +latency_soft_threshold: 20 +latency_hard_threshold: 50 + +# Validation +bulk_import_validation_batch_size: 1000000 +apply_dml_events_validation_batch_size: 10000 +validation_thread_count: 4 +full_dml_event_validation_interval: 1 + +# DML event loading +pk_set_max_size: 1000000 +event_batch_duration: 3600 diff --git a/tests/configs/secret.json b/tests/configs/secret.json new file mode 100644 index 0000000..df7b37d --- /dev/null +++ b/tests/configs/secret.json @@ -0,0 +1,9 @@ +{ + "username": "root", + "password": "", + "port": 3306, + "redis_host": "127.0.0.1", + "redis_password": "", + "slack_channel": "", + "slack_token": "" +} diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..ef603d8 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,63 @@ +import os +import time +from unittest.mock import patch + +import MySQLdb +import pytest +from sqlalchemy import create_engine + +# current dir +PATH = os.getcwd() + +ENVS = { + 'AWS_REGION': 'ap-northeast-2', + 'POD_NAME': 'local', + 'CONFIG_FILE': f'{PATH}/configs/config.yaml', + 'SECRET_FILE': f'{PATH}/configs/secrets.json', +} +os.environ.update(ENVS) + + +@pytest.fixture(scope='session') +def secret(): + from config import secret + return secret + + +@pytest.fixture(scope='session') +def config(): + from config import config + return config + + +@pytest.fixture +def request_id(request): + return request.node.callspec.id + + +@pytest.fixture(scope='session') +def cursor(config, secret): + connection = MySQLdb.connect( + host=config.SOURCE_WRITER_ENDPOINT, + port=secret.PORT, + user=secret.USERNAME, + password=secret.PASSWORD, + autocommit=True + ) + with connection.cursor() as cursor: + yield cursor + + +@pytest.fixture +def sqlalchemy_engine(config, secret): + return create_engine(f'mysql+mysqldb://{secret.USERNAME}:@{config.SOURCE_WRITER_ENDPOINT}:{secret.PORT}/sbosc') + + +@pytest.fixture(autouse=True) +def time_sleep_mock(): + """ + Mock time.sleep to speed up the test (1ms = 1s) + """ + sleep = time.sleep + with patch('time.sleep', side_effect=lambda duration: sleep(duration / 1000)): + yield From 654c21b4b06ba091f219b9f39e50da97dc58d088 Mon Sep 17 00:00:00 2001 From: Jimmy Kim Date: Tue, 2 Apr 2024 17:02:56 +0900 Subject: [PATCH 03/22] conftest --- Dockerfile | 40 +++++++++++++++++++++++++++++ docker-compose.yml | 25 ++++++++++++++++++ requirements.txt | 4 ++- src/config/config.py | 42 ++++++++++++++++--------------- src/config/secret.py | 2 +- src/sbosc/controller/validator.py | 2 +- src/sbosc/worker/worker.py | 2 +- tests/conftest.py | 33 
++++++++++++++++++++++++ 8 files changed, 126 insertions(+), 24 deletions(-) create mode 100644 Dockerfile create mode 100644 docker-compose.yml diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..cb11e5a --- /dev/null +++ b/Dockerfile @@ -0,0 +1,40 @@ +FROM ubuntu:20.04 + +ENV DEBIAN_FRONTEND noninteractive + +# apt update +RUN apt-get update && \ + apt -y upgrade && \ + apt-get install -y software-properties-common && \ + add-apt-repository -y ppa:deadsnakes/ppa && \ + apt-get update + +# install python +RUN apt-get install -y python3.11 python3.11-dev python3.11-distutils build-essential + +# install mysql, postgres clients +RUN apt-get install -y libmysqlclient-dev mysql-client + +# install utilities +RUN apt-get install -y curl + +# Set working directory +WORKDIR /opt/sbosc + +# Make python 3.11 the default +# Register the version in alternatives +RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 + +# Set python 3 as the default python +RUN update-alternatives --set python /usr/bin/python3.11 + +# Install pip and requirements.txt +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python + +# Install requirements +COPY requirements.txt ./ +RUN pip install -r requirements.txt + +# Copy repository +COPY src ./ +ENV PYTHONPATH=/opt/sbosc diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..e785a26 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,25 @@ +version: '3.5' + +services: + mysql: + image: "mysql:8.0.34" + container_name: mysql + ports: + - "3306:3306" + volumes: + - mysql-data:/var/lib/mysql + command: + [ 'mysqld', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci', '--server-id=1', '--log-bin=mysqld-bin'] + environment: + MYSQL_ALLOW_EMPTY_PASSWORD: 1 + MYSQL_ROOT_HOST: "%" + MYSQL_DATABASE: "sbosc" + + redis: + image: redis:7.0.4 + container_name: redis + ports: + - "6379:6379" + +volumes: + mysql-data: diff --git a/requirements.txt b/requirements.txt index cb1b99e..2d89944 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ boto3==1.26.72 botocore==1.29.165 -mysqlclient==2.2.4 +mysqlclient==2.1.1 mysql-replication==0.31 redis==5.0.3 PyYAML==6.0.1 @@ -8,3 +8,5 @@ requests==2.31.0 pandas==2.1.4 prometheus-client==0.16.0 dnspython==2.5.0 + +setuptools>=65.5.1 # not directly required diff --git a/src/config/config.py b/src/config/config.py index 620b868..061b94f 100644 --- a/src/config/config.py +++ b/src/config/config.py @@ -6,20 +6,7 @@ import yaml import dns.resolver -from config import Env -from sbosc.operations.base import BaseOperation, CrossClusterBaseOperation - - -def get_operation_class(class_name): - package = import_module('sbosc.operations') - - for _, name, is_pkg in walk_packages(package.__path__, package.__name__ + '.'): - if not is_pkg: - module = import_module(name) - if hasattr(module, class_name): - return getattr(module, class_name) - - raise ImportError(f"Operation class {class_name} not found") +from config.env import Env def get_cluster_id(endpoint, cluster_id=None) -> str: @@ -69,7 +56,7 @@ class Config: AUTO_SWAP = False PREFERRED_WINDOW = '00:00-23:59' SKIP_BULK_IMPORT = False - OPERATION_CLASS = BaseOperation + OPERATION_CLASS = 'BaseOperation' INDEXES = [] INDEX_CREATED_PER_QUERY = 4 @@ -106,6 +93,22 @@ class Config: PK_SET_MAX_SIZE = 1000000 EVENT_BATCH_DURATION = 3600 + @property + def operation_class(self): + if self._operation_class is not None: + return self._operation_class + + package = 
import_module('sbosc.operations') + + for _, name, is_pkg in walk_packages(package.__path__, package.__name__ + '.'): + if not is_pkg: + module = import_module(name) + if hasattr(module, self.OPERATION_CLASS): + self._operation_class = getattr(module, self.OPERATION_CLASS) + return self._operation_class + + raise ImportError(f"Operation class {self.OPERATION_CLASS} not found") + def __init__(self): env = Env() if os.path.exists(env.CONFIG_FILE): @@ -114,9 +117,6 @@ def __init__(self): for key, value in config.items(): setattr(self, key.upper(), value) - if type(self.OPERATION_CLASS) == str: - self.OPERATION_CLASS = get_operation_class(self.OPERATION_CLASS) - if self.DESTINATION_WRITER_ENDPOINT is None: self.DESTINATION_WRITER_ENDPOINT = self.SOURCE_WRITER_ENDPOINT if self.DESTINATION_READER_ENDPOINT is None: @@ -125,8 +125,8 @@ def __init__(self): self.DESTINATION_DB = self.SOURCE_DB if self.SOURCE_WRITER_ENDPOINT != self.DESTINATION_WRITER_ENDPOINT: self.AUTO_SWAP = False - if self.OPERATION_CLASS == BaseOperation: - self.OPERATION_CLASS = CrossClusterBaseOperation + if self.OPERATION_CLASS == 'BaseOperation': + self.OPERATION_CLASS = 'CrossClusterBaseOperation' if self.SOURCE_DB != self.DESTINATION_DB: self.AUTO_SWAP = False @@ -141,3 +141,5 @@ def __init__(self): if self.SKIP_BULK_IMPORT and self.INIT_BINLOG_FILE is None: raise ValueError('INIT_BINLOG_FILE is required when SKIP_BULK_IMPORT is True') + + self._operation_class = None # Will be set by property operation_class diff --git a/src/config/secret.py b/src/config/secret.py index d5d5f14..23f0546 100644 --- a/src/config/secret.py +++ b/src/config/secret.py @@ -1,7 +1,7 @@ import json import os -from config import Env +from config.env import Env from utils import from_string diff --git a/src/sbosc/controller/validator.py b/src/sbosc/controller/validator.py index 0950dc7..17b9c52 100644 --- a/src/sbosc/controller/validator.py +++ b/src/sbosc/controller/validator.py @@ -28,7 +28,7 @@ def __init__(self, controller: 'Controller'): self.thread_count = config.VALIDATION_THREAD_COUNT self.db = Database() self.redis_data = RedisData(self.migration_id) - self.migration_operation: MigrationOperation = config.OPERATION_CLASS(self.migration_id) + self.migration_operation: MigrationOperation = config.operation_class(self.migration_id) self.logger = controller.logger self.source_conn_pool = self.db.get_reader_connection_pool(self.thread_count) diff --git a/src/sbosc/worker/worker.py b/src/sbosc/worker/worker.py index 23a3ef2..cc27c03 100644 --- a/src/sbosc/worker/worker.py +++ b/src/sbosc/worker/worker.py @@ -55,7 +55,7 @@ def __init__(self, worker_id, manager: 'WorkerManager'): self.db = Database() self.redis_data = RedisData(self.migration_id) self.logger = manager.logger - self.migration_operation: MigrationOperation = config.OPERATION_CLASS(self.migration_id) + self.migration_operation: MigrationOperation = config.operation_class(self.migration_id) self.use_batch_size_multiplier = config.USE_BATCH_SIZE_MULTIPLIER self.worker_id = worker_id self.stop_flag = False diff --git a/tests/conftest.py b/tests/conftest.py index ef603d8..c95773a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -61,3 +61,36 @@ def time_sleep_mock(): sleep = time.sleep with patch('time.sleep', side_effect=lambda duration: sleep(duration / 1000)): yield + + +@pytest.fixture(scope='session') +def redis_data(): + from modules.redis import RedisData + return RedisData(1, False) + + +@pytest.fixture(autouse=True, scope='session') +def init_migration(config, cursor, 
redis_data): + from sbosc.const import Stage + from sbosc.controller.initializer import Initializer + + cursor.execute(f''' + SELECT table_name FROM information_schema.tables + WHERE table_schema = '{config.SOURCE_DB}' + ''') + for table, in cursor.fetchall(): + cursor.execute(f'DROP TABLE {table}') + + for table in [config.SOURCE_TABLE, config.DESTINATION_TABLE]: + cursor.execute(f"CREATE TABLE {table} (id int)") + migration_id = Initializer().init_migration() + + # Validate Initializer.init_migration + assert migration_id == 1 + assert redis_data.current_stage == Stage.START_EVENT_HANDLER + assert redis_data.metadata.source_db == config.SOURCE_DB + assert redis_data.metadata.source_table == config.SOURCE_TABLE + assert redis_data.metadata.destination_db == config.DESTINATION_DB + assert redis_data.metadata.destination_table == config.DESTINATION_TABLE + assert redis_data.metadata.source_columns == '`id`' + assert redis_data.metadata.start_datetime is not None From 5be9ce0a9f68f37914aed8a01f0ac820b9ccd145 Mon Sep 17 00:00:00 2001 From: Jimmy Kim Date: Tue, 2 Apr 2024 18:54:43 +0900 Subject: [PATCH 04/22] test_worker --- docker-compose.yml | 5 - requirements.txt | 2 +- src/modules/db.py | 1 + src/modules/redis/interface.py | 2 +- src/sbosc/controller/initializer.py | 2 +- src/sbosc/worker/manager.py | 2 +- src/sbosc/worker/worker.py | 1 - tests/conftest.py | 13 +- tests/test_worker.py | 288 ++++++++++++++++++++++++++++ 9 files changed, 305 insertions(+), 11 deletions(-) create mode 100644 tests/test_worker.py diff --git a/docker-compose.yml b/docker-compose.yml index e785a26..f4af349 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -6,8 +6,6 @@ services: container_name: mysql ports: - "3306:3306" - volumes: - - mysql-data:/var/lib/mysql command: [ 'mysqld', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci', '--server-id=1', '--log-bin=mysqld-bin'] environment: @@ -20,6 +18,3 @@ services: container_name: redis ports: - "6379:6379" - -volumes: - mysql-data: diff --git a/requirements.txt b/requirements.txt index 2d89944..66fbc89 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ mysql-replication==0.31 redis==5.0.3 PyYAML==6.0.1 requests==2.31.0 -pandas==2.1.4 +pandas==2.2.1 prometheus-client==0.16.0 dnspython==2.5.0 diff --git a/src/modules/db.py b/src/modules/db.py index 3c1c29c..4d27c74 100644 --- a/src/modules/db.py +++ b/src/modules/db.py @@ -58,6 +58,7 @@ def connect(self): user=secret.USERNAME, password=secret.PASSWORD, port=secret.PORT, + autocommit=True ) def cursor(self, cursorclass=None): diff --git a/src/modules/redis/interface.py b/src/modules/redis/interface.py index c7fddd8..d1fa43c 100644 --- a/src/modules/redis/interface.py +++ b/src/modules/redis/interface.py @@ -1,7 +1,7 @@ from datetime import datetime from typing import Self -from config import config +from config import config, env from modules.redis.schema import RedisKey, Metadata, WorkerConfig, WorkerMetric, ChunkInfo from modules.redis.connect import get_redis_connection from modules.redis.data_types import Set, Stack, SortedSet diff --git a/src/sbosc/controller/initializer.py b/src/sbosc/controller/initializer.py index 477f989..aba5d37 100644 --- a/src/sbosc/controller/initializer.py +++ b/src/sbosc/controller/initializer.py @@ -200,7 +200,7 @@ def init_migration(self): config.SOURCE_CLUSTER_ID, config.SOURCE_DB, config.SOURCE_TABLE, - config.cluster_id, + config.DESTINATION_CLUSTER_ID, config.DESTINATION_DB, config.DESTINATION_TABLE )) diff --git 
a/src/sbosc/worker/manager.py b/src/sbosc/worker/manager.py index a4bb994..3f03a01 100644 --- a/src/sbosc/worker/manager.py +++ b/src/sbosc/worker/manager.py @@ -52,7 +52,7 @@ def add_threads(self): self.logger.info(f"Adding {self.desired_thread_count - self.thread_count} threads") while self.thread_count < self.desired_thread_count: self.created_threads += 1 - worker = Worker(f'worker_{self.created_threads}', config, self) + worker = Worker(f'worker_{self.created_threads}', self) self.worker_threads.append((worker, self.executor.submit(worker.start))) def remove_threads(self): diff --git a/src/sbosc/worker/worker.py b/src/sbosc/worker/worker.py index cc27c03..c50474a 100644 --- a/src/sbosc/worker/worker.py +++ b/src/sbosc/worker/worker.py @@ -141,7 +141,6 @@ def bulk_import(self): # update last pk inserted if cursor.rowcount == self.worker_config.raw_batch_size: - print('last row id', cursor.lastrowid) last_pk_inserted = cursor.lastrowid else: last_pk_inserted = batch_end_pk diff --git a/tests/conftest.py b/tests/conftest.py index c95773a..7108bd1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,8 +6,10 @@ import pytest from sqlalchemy import create_engine +from modules.logger import get_logger + # current dir -PATH = os.getcwd() +PATH = os.path.dirname(os.path.abspath(__file__)) ENVS = { 'AWS_REGION': 'ap-northeast-2', @@ -17,6 +19,9 @@ } os.environ.update(ENVS) +# Set up logger +get_logger() + @pytest.fixture(scope='session') def secret(): @@ -42,6 +47,7 @@ def cursor(config, secret): port=secret.PORT, user=secret.USERNAME, password=secret.PASSWORD, + db=config.SOURCE_DB, autocommit=True ) with connection.cursor() as cursor: @@ -94,3 +100,8 @@ def init_migration(config, cursor, redis_data): assert redis_data.metadata.destination_table == config.DESTINATION_TABLE assert redis_data.metadata.source_columns == '`id`' assert redis_data.metadata.start_datetime is not None + + +@pytest.fixture(scope='module', params=['BaseOperation', 'CrossClusterBaseOperation']) +def override_operation_class(request): + config.OPERATION_CLASS = request.param diff --git a/tests/test_worker.py b/tests/test_worker.py new file mode 100644 index 0000000..f1be5b2 --- /dev/null +++ b/tests/test_worker.py @@ -0,0 +1,288 @@ +import random +import time +from threading import Thread + +import pytest + +from config import config +from sbosc.const import Stage, WorkerStatus, ChunkStatus +from sbosc.worker.manager import WorkerManager + +import numpy as np +import pandas as pd + +############ +# Fixtures # +############ +SOURCE_COLUMNS = ['A', 'B', 'C'] +TEST_TABLE_VALUES = ['a', 'b', 'c'] +TABLE_SIZE = 10000 + + +@pytest.fixture +def setup_table(sqlalchemy_engine, cursor, request): + param = request.param if hasattr(request, 'param') else "" + + if "sparse" in param: + source_df = pd.DataFrame({ + 'id': np.random.choice(range(1, TABLE_SIZE * 10 + 1), size=TABLE_SIZE, replace=False), + 'A': np.random.choice(TEST_TABLE_VALUES, size=TABLE_SIZE), + 'B': np.random.choice(TEST_TABLE_VALUES, size=TABLE_SIZE), + 'C': np.random.choice(TEST_TABLE_VALUES, size=TABLE_SIZE), + }) + else: + source_df = pd.DataFrame(np.random.choice(TEST_TABLE_VALUES, size=(TABLE_SIZE, 3)), columns=SOURCE_COLUMNS) + + if "duplicate_key" in param: + dest_df = source_df.head(TABLE_SIZE // 20) + elif "with_data" in param: + dest_df = source_df + else: + dest_df = pd.DataFrame(columns=SOURCE_COLUMNS) + + source_df.to_sql(config.SOURCE_TABLE, sqlalchemy_engine, if_exists='replace', index=False) + dest_df.to_sql(config.DESTINATION_TABLE, 
sqlalchemy_engine, if_exists='replace', index=False) + + if "sparse" in param: + cursor.execute(f'alter table {config.SOURCE_TABLE} modify column id int primary KEY AUTO_INCREMENT;') + else: + cursor.execute(f'alter table {config.SOURCE_TABLE} add column id int primary KEY AUTO_INCREMENT;') + cursor.execute(f'alter table {config.DESTINATION_TABLE} add column id int primary KEY AUTO_INCREMENT;') + + +@pytest.fixture(autouse=True) +def init_redis(redis_data): + redis_data.worker_config.thread_count = 0 + redis_data.worker_config.batch_size = 100 + redis_data.worker_config.commit_interval = 100 + redis_data.worker_config.revision = 1 + redis_data.set_current_stage(Stage.BULK_IMPORT_CHUNK_CREATION) + redis_data.metadata.source_columns = '`id`,`' + '`,`'.join(SOURCE_COLUMNS) + '`' + redis_data.remove_all_chunks() + + +@pytest.fixture +def worker_manager(): + worker_manager = WorkerManager() + worker_manager_thread = Thread(target=worker_manager.start) + worker_manager_thread.start() + yield worker_manager + worker_manager.set_stop_flag() + + while worker_manager_thread.is_alive(): + time.sleep(1000) + + +######## +# Test # +######## +def test_setup_table(cursor, setup_table): + cursor.execute(f"SELECT * FROM {config.SOURCE_TABLE} LIMIT 1") + result = cursor.fetchall() + assert result[0][0] in TEST_TABLE_VALUES + + +def test_add_remove_thread(worker_manager, redis_data): + # init condition check + assert worker_manager.desired_thread_count == 0 + + # add thread + redis_data.worker_config.thread_count = 3 + time.sleep(100) + assert worker_manager.desired_thread_count == 3 + assert worker_manager.thread_count == 3 + assert worker_manager.created_threads == 3 + + # remove thread + redis_data.worker_config.thread_count = 1 + time.sleep(100) + assert worker_manager.desired_thread_count == 1 + assert worker_manager.thread_count == 1 + assert worker_manager.created_threads == 3 + + +def test_check_worker_status(worker_manager, redis_data): + # create one thread + redis_data.worker_config.thread_count = 1 + + time.sleep(100) + + # set the condition to pass the if statement in check_worker_status + + # set the worker status to BUSY + assert worker_manager.thread_count == 1 + assert worker_manager.created_threads == 1 + worker_manager.worker_threads[0][0].status = WorkerStatus.BUSY + + time.sleep(100) + redis_data.remove_all_chunks() + assert len(redis_data.chunk_stack) == 0 + + # set the current stage to BULK_IMPORT + redis_data.set_current_stage(Stage.BULK_IMPORT) + assert redis_data.current_stage == Stage.BULK_IMPORT + + # check the current stage + time.sleep(100) + assert redis_data.current_stage == Stage.BULK_IMPORT_VALIDATION + + # check the current stage when if statement is not passed in check_worker_status + time.sleep(100) + assert redis_data.current_stage == Stage.BULK_IMPORT_VALIDATION + + +@pytest.mark.parametrize( + 'setup_table', + ["standard", "duplicate_key", "sparse"], + indirect=True +) +def test_bulk_import(setup_table, cursor, worker_manager, request_id, redis_data, override_operation_class): + redis_data.worker_config.thread_count = 0 + + # Create chunks + chunk_count = 10 + chunk_size = TABLE_SIZE // chunk_count + if "sparse" in request_id: + chunk_size *= 10 + config.USE_BATCH_SIZE_MULTIPLIER = True + for chunk_id in range(chunk_count): + redis_data.push_chunk(chunk_id) + chunk_info = redis_data.get_chunk_info(chunk_id) + chunk_info.set({ + 'start_pk': chunk_size * chunk_id + 1, + 'end_pk': chunk_size * (chunk_id + 1), + 'status': ChunkStatus.NOT_STARTED + }) + + # Check tables + 
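+    # (the source table holds TABLE_SIZE rows; the destination starts empty except in the
+    # duplicate_key variant, which pre-seeds the first TABLE_SIZE // 20 rows)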
cursor.execute(f"SELECT COUNT(1) FROM {config.SOURCE_TABLE}") + assert cursor.fetchone()[0] == TABLE_SIZE + + cursor.execute(f"SELECT MAX(id) FROM {config.DESTINATION_TABLE}") + if "duplicate_key" in request_id: + assert cursor.fetchone()[0] == TABLE_SIZE // 20 + else: + assert cursor.fetchone()[0] is None + time.sleep(100) + + # Create threads + assert worker_manager.thread_count == 0 + assert len(redis_data.chunk_stack) == 10 + + desired_thread_count = 5 + redis_data.worker_config.thread_count = desired_thread_count + redis_data.worker_config.batch_size = 300 + + for _ in range(10): + if worker_manager.desired_thread_count == desired_thread_count and \ + worker_manager.thread_count == desired_thread_count: + break + time.sleep(100) + assert worker_manager.desired_thread_count == desired_thread_count + + # Check worker status + time.sleep(100) + for i in range(worker_manager.thread_count): + worker = worker_manager.worker_threads[i][0] + assert worker.status == WorkerStatus.IDLE + # Check configs + assert worker.use_batch_size_multiplier == config.USE_BATCH_SIZE_MULTIPLIER + assert isinstance(worker.migration_operation, config.operation_class) + + # Set current stage to BULK_IMPORT + t1 = time.time() + redis_data.set_current_stage(Stage.BULK_IMPORT) + assert redis_data.current_stage == Stage.BULK_IMPORT + while redis_data.current_stage == Stage.BULK_IMPORT: + time.sleep(100) + t2 = time.time() + print(f"Execution time: {t2 - t1}") + + cursor.execute(f"SELECT COUNT(1) FROM {config.DESTINATION_TABLE}") + assert cursor.fetchone()[0] == TABLE_SIZE + + # test worker metrics + avg_insert_rate = redis_data.worker_metric.average_insert_rate + datapoints = [] + for worker, _ in worker_manager.worker_threads: + datapoints += worker.datapoints + calculated_avg_insert_rate = sum(datapoints) / len(datapoints) + print(f"Average insert rate: {avg_insert_rate}, calculated average insert rate: {calculated_avg_insert_rate}") + assert abs(avg_insert_rate / calculated_avg_insert_rate - 1) < 0.01 + + +@pytest.mark.parametrize('setup_table', ["with_data"], indirect=True) +def test_apply_dml_events(setup_table, worker_manager, cursor, redis_data, override_operation_class): + # set worker config + redis_data.worker_config.thread_count = 5 + + time.sleep(100) + + updated_rows = 1000 + removed_rows = 1000 + removed_pks = random.sample(range(1, TABLE_SIZE + 1), removed_rows) + updated_pks = set(random.sample(range(1, TABLE_SIZE + 1), updated_rows)) - set(removed_pks) + + redis_data.updated_pk_set.delete() + redis_data.removed_pk_set.delete() + + redis_data.updated_pk_set.add(updated_pks) + redis_data.removed_pk_set.add(removed_pks) + + # update rows + modified_column = SOURCE_COLUMNS[0] + cursor.execute(f''' + UPDATE {config.SOURCE_TABLE} SET {modified_column} = 'x' WHERE id IN ({','.join(str(pk) for pk in updated_pks)}) + ''') + redis_data.set_current_stage(Stage.APPLY_DML_EVENTS) + + while len(redis_data.updated_pk_set) > 0 or len(redis_data.removed_pk_set) > 0: + time.sleep(100) + time.sleep(100) + + # check updated rows + cursor.execute(f''' + SELECT count(*) FROM {config.DESTINATION_TABLE} WHERE {modified_column} = 'x' + ''') + assert cursor.fetchone()[0] == len(updated_pks) + + # check removed rows + cursor.execute(f''' + SELECT count(*) FROM {config.DESTINATION_TABLE} WHERE id IN ({','.join(str(pk) for pk in removed_pks)}) + ''') + assert cursor.fetchone()[0] == 0 + + # check sets + assert len(redis_data.updated_pk_set) == 0 + assert len(redis_data.removed_pk_set) == 0 + + 
+@pytest.mark.parametrize('setup_table', ["with_data"], indirect=True) +def test_swap_tables(setup_table, worker_manager, cursor, redis_data, request_id): + redis_data.worker_config.thread_count = 1 + + def insert_and_check(source_table): + cursor.execute(f"INSERT INTO {source_table} (A, B, C) VALUES (1, 2, 3)") + last_inserted_id = cursor.lastrowid + redis_data.updated_pk_set.add([last_inserted_id]) + for _ in range(10): + cursor.execute(f"SELECT * FROM {config.DESTINATION_TABLE} WHERE id = {last_inserted_id}") + if cursor.rowcount > 0: + break + time.sleep(100) + assert cursor.rowcount == 1 + + # swap table stage + old_source_table = f"_{config.SOURCE_TABLE}_old" + cursor.execute(f"RENAME TABLE {config.SOURCE_TABLE} TO {old_source_table}") + redis_data.set_old_source_table(old_source_table) + redis_data.set_current_stage(Stage.SWAP_TABLES) + + insert_and_check(old_source_table) + + cursor.execute(f"RENAME TABLE {old_source_table} TO {config.SOURCE_TABLE}") + redis_data.set_old_source_table(None) + + time.sleep(100) + + insert_and_check(config.SOURCE_TABLE) From 4162543472e97c2beaf6e107e58ba464880ae4df Mon Sep 17 00:00:00 2001 From: Jimmy Kim Date: Tue, 2 Apr 2024 22:19:19 +0900 Subject: [PATCH 05/22] test_eventhandler, test_controller --- src/modules/db.py | 5 + src/sbosc/eventhandler/eventhandler.py | 2 +- src/sbosc/operations/base.py | 14 +- tests/configs/config.yaml | 16 +- tests/test_controller.py | 336 +++++++++++++++++++++++++ tests/test_eventhandler.py | 173 +++++++++++++ tests/test_worker.py | 26 +- 7 files changed, 543 insertions(+), 29 deletions(-) create mode 100644 tests/test_controller.py create mode 100644 tests/test_eventhandler.py diff --git a/src/modules/db.py b/src/modules/db.py index 4d27c74..491f5e3 100644 --- a/src/modules/db.py +++ b/src/modules/db.py @@ -71,6 +71,11 @@ def cursor(self, cursorclass=None): cursor: cursorclass = self._conn.cursor(cursorclass) return cursor + def ping(self): + if not self._conn: + self._conn = self.connect() + self._conn.ping() + def close(self): if self._conn: self._conn.close() diff --git a/src/sbosc/eventhandler/eventhandler.py b/src/sbosc/eventhandler/eventhandler.py index 1bb4bc7..49c0eb5 100644 --- a/src/sbosc/eventhandler/eventhandler.py +++ b/src/sbosc/eventhandler/eventhandler.py @@ -79,7 +79,7 @@ def __init__(self): # BinlogStreamReader self.connection_settings = { - 'host': config.writer_endpoint, + 'host': config.SOURCE_WRITER_ENDPOINT, 'port': secret.PORT, 'user': secret.USERNAME, 'passwd': secret.PASSWORD, diff --git a/src/sbosc/operations/base.py b/src/sbosc/operations/base.py index 805fa0b..c77ca3b 100644 --- a/src/sbosc/operations/base.py +++ b/src/sbosc/operations/base.py @@ -8,7 +8,7 @@ class BaseOperation(MigrationOperation): def _insert_batch_query(self, start_pk, end_pk): return f''' - INSERT INTO {self.source_db}.{self.destination_table}({self.source_columns}) + INSERT INTO {self.destination_db}.{self.destination_table}({self.source_columns}) SELECT {self.source_columns} FROM {self.source_db}.{self.source_table} WHERE id BETWEEN {start_pk} AND {end_pk} @@ -28,7 +28,7 @@ def apply_update(self, db, updated_pks): with db.cursor() as cursor: updated_pks_str = ",".join([str(pk) for pk in updated_pks]) query = f''' - INSERT INTO {self.source_db}.{self.destination_table}({self.source_columns}) + INSERT INTO {self.destination_db}.{self.destination_table}({self.source_columns}) SELECT {self.source_columns} FROM {self.source_db}.{self.source_table} WHERE id IN ({updated_pks_str}) @@ -40,7 +40,7 @@ def apply_update(self, 
db, updated_pks): def _get_not_imported_pks_query(self, start_pk, end_pk): return f''' SELECT source.id FROM {self.source_db}.{self.source_table} AS source - LEFT JOIN {self.source_db}.{self.destination_table} AS dest ON source.id = dest.id + LEFT JOIN {self.destination_db}.{self.destination_table} AS dest ON source.id = dest.id WHERE source.id BETWEEN {start_pk} AND {end_pk} AND dest.id IS NULL ''' @@ -59,7 +59,7 @@ def get_not_inserted_pks(self, source_cursor, dest_cursor, start_timestamp, end_ if event_pks: source_cursor.execute(f''' SELECT source.id FROM {self.source_db}.{self.source_table} AS source - LEFT JOIN {self.source_db}.{self.destination_table} AS dest ON source.id = dest.id + LEFT JOIN {self.destination_db}.{self.destination_table} AS dest ON source.id = dest.id WHERE source.id IN ({event_pks}) AND dest.id IS NULL ''') @@ -78,7 +78,7 @@ def get_not_updated_pks(self, source_cursor, dest_cursor, start_timestamp, end_t WHERE id IN ({event_pks}) UNION ALL SELECT {self.source_columns}, 'destination' AS table_type - FROM {self.source_db}.{self.destination_table} + FROM {self.destination_db}.{self.destination_table} WHERE id IN ({event_pks}) ) AS combined GROUP BY {self.source_columns} @@ -95,7 +95,7 @@ def get_rematched_updated_pks(self, db, not_updated_pks): SELECT combined.id FROM ( SELECT {self.source_columns} FROM {self.source_db}.{self.source_table} WHERE id IN ({not_updated_pks_str}) UNION ALL - SELECT {self.source_columns} FROM {self.source_db}.{self.destination_table} + SELECT {self.source_columns} FROM {self.destination_db}.{self.destination_table} WHERE id IN ({not_updated_pks_str}) ) AS combined GROUP BY {self.source_columns} HAVING COUNT(*) = 2 ''') @@ -114,7 +114,7 @@ def get_rematched_removed_pks(self, db, not_removed_pks): cursor: Cursor cursor.execute(f''' SELECT source_pk FROM sbosc.unmatched_rows WHERE source_pk NOT IN ( - SELECT id FROM {self.source_db}.{self.destination_table} + SELECT id FROM {self.destination_db}.{self.destination_table} WHERE id IN ({not_removed_pks_str}) ) AND source_pk IN ({not_removed_pks_str}) ''') diff --git a/tests/configs/config.yaml b/tests/configs/config.yaml index c14efa4..c33771e 100644 --- a/tests/configs/config.yaml +++ b/tests/configs/config.yaml @@ -6,8 +6,8 @@ source_table: "source_table" destination_cluster_id: "test" destination_db: "sbosc" destination_table: "destination_table" -min_chunk_size: 100000 -max_chunk_count: 200 +min_chunk_size: 1000 +max_chunk_count: 10 auto_swap: False preferred_window: "00:00-23:59" skip_bulk_import: False @@ -21,7 +21,7 @@ batch_size_step_size: 100 max_batch_size: 10000 min_thread_count: 4 thread_count_step_size: 4 -max_thread_count: 64 +max_thread_count: 8 commit_interval: 0.01 optimal_value_use_limit: 10 use_batch_size_multiplier: False @@ -37,11 +37,11 @@ latency_soft_threshold: 20 latency_hard_threshold: 50 # Validation -bulk_import_validation_batch_size: 1000000 -apply_dml_events_validation_batch_size: 10000 -validation_thread_count: 4 +bulk_import_validation_batch_size: 1000 +apply_dml_events_validation_batch_size: 1000 +validation_thread_count: 10 full_dml_event_validation_interval: 1 # DML event loading -pk_set_max_size: 1000000 -event_batch_duration: 3600 +pk_set_max_size: 10000 +event_batch_duration: 1 diff --git a/tests/test_controller.py b/tests/test_controller.py new file mode 100644 index 0000000..3410b38 --- /dev/null +++ b/tests/test_controller.py @@ -0,0 +1,336 @@ +import random +import threading +import time +from datetime import datetime +from threading import Thread + 
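For readers following the base.py hunks above: BaseOperation builds a single INSERT ... SELECT between the source and destination tables, and operations/utils.py can rewrite that statement into an upsert, used, for example, when a bulk-import batch hits a duplicate key. A rough usage sketch; the table and column names are placeholders, and the import path assumes the repository's src layout is on PYTHONPATH:

    from sbosc.operations import utils as operation_utils

    query = '''
        INSERT INTO sbosc.destination_table (`id`, `A`, `B`, `C`)
        SELECT `id`, `A`, `B`, `C` FROM sbosc.source_table
        WHERE id BETWEEN 1 AND 1000
    '''
    # insert_to_upsert swaps in INSERT IGNORE and appends an ON DUPLICATE KEY UPDATE
    # clause with one `col = VALUES(col)` entry per listed column.
    upsert_query = operation_utils.insert_to_upsert(query, ['`id`', '`A`', '`B`', '`C`'])
    print(upsert_query)
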
+import numpy as np +import pandas as pd +import pytest + +from config import config +from config.config import IndexConfig +from sbosc.const import Stage, ChunkStatus +from sbosc.controller import Controller +from modules.redis import RedisData + +TABLE_SIZE = 10000 + + +############ +# Fixtures # +############ + +@pytest.fixture +def setup_table(sqlalchemy_engine, cursor, request): + param = request.param if hasattr(request, 'param') else '' + cursor.execute(f"DROP TABLE IF EXISTS {config.SOURCE_TABLE}") + if param == 'with_data': + df = pd.DataFrame(np.random.choice(['a', 'b', 'c'], size=(TABLE_SIZE, 3)), columns=['A', 'B', 'C']) + df['id'] = range(1, 1 + len(df)) + df.to_sql(config.SOURCE_TABLE, sqlalchemy_engine, if_exists='replace', index=False) + df.to_sql(config.DESTINATION_TABLE, sqlalchemy_engine, if_exists='replace', index=False) + cursor.execute(f"ALTER TABLE {config.SOURCE_TABLE} MODIFY COLUMN id int AUTO_INCREMENT PRIMARY KEY") + cursor.execute(f"ALTER TABLE {config.DESTINATION_TABLE} MODIFY COLUMN id int AUTO_INCREMENT PRIMARY KEY") + else: + cursor.execute(f''' + CREATE TABLE {config.SOURCE_TABLE} ( + id INT PRIMARY KEY AUTO_INCREMENT, + A CHAR(1), B CHAR(1), C CHAR(1) + ) + ''') + cursor.execute(f"INSERT INTO {config.SOURCE_TABLE} (id, A, B, C) VALUES ({TABLE_SIZE}, 'a', 'b', 'c')") + + +@pytest.fixture +def controller(request, redis_data): + param = request.param if hasattr(request, 'param') else '' + controller = Controller() + if param == 'object': + yield controller + else: + controller_thread = Thread(target=controller.start) + controller_thread.start() + yield controller, controller_thread + controller.set_stop_flag() + + while controller_thread.is_alive(): + time.sleep(1000) + + +######## +# Test # +######## +def test_chunk_creation(controller, setup_table, redis_data): + controller, controller_thread = controller + controller.initializer.fetch_metadata(1) + redis_data.set_current_stage(Stage.BULK_IMPORT_CHUNK_CREATION) + while redis_data.current_stage != Stage.BULK_IMPORT: + assert controller_thread.is_alive() + time.sleep(100) + if redis_data.current_stage == Stage.START_EVENT_HANDLER: + redis_data.set_current_stage(Stage.BULK_IMPORT_CHUNK_CREATION) + + # test chunk creation + assert redis_data.chunk_set.getall() == set([f'{redis_data.migration_id}-{i}' for i in range(10)]) + assert len(redis_data.chunk_stack) == 10 + + # test metadata + assert redis_data.metadata.source_table == config.SOURCE_TABLE + assert redis_data.metadata.destination_table == config.DESTINATION_TABLE + assert set(redis_data.metadata.source_columns.split(',')) == {'`id`', '`A`', '`B`', '`C`'} + + # test chunk validation + done_chunk_count = 2 + done_but_invalid_chunk_count = 2 + while len(redis_data.chunk_stack) > 0: + chunk_id = redis_data.chunk_stack.pop() + chunk_info = redis_data.get_chunk_info(chunk_id) + if done_chunk_count > 0: + chunk_info.status = ChunkStatus.DONE + chunk_info.last_pk_inserted = chunk_info.end_pk + done_chunk_count -= 1 + elif done_but_invalid_chunk_count > 0: + chunk_info.status = ChunkStatus.DONE + done_but_invalid_chunk_count -= 1 + redis_data.set_current_stage(Stage.BULK_IMPORT_VALIDATION) + + while redis_data.current_stage != Stage.BULK_IMPORT: + time.sleep(100) + + assert redis_data.current_stage == Stage.BULK_IMPORT + assert len(redis_data.chunk_stack) == 8 + + +@pytest.mark.parametrize('setup_table', ['with_data'], indirect=True) +@pytest.mark.parametrize('controller', ['object'], indirect=True) +def test_bulk_import_validation(controller: Controller, 
setup_table, cursor, override_operation_class): + assert controller.validator.bulk_import_validation() + delete_pks = random.sample(range(1, TABLE_SIZE), 10) + cursor.execute(f''' + DELETE FROM {config.SOURCE_TABLE} + WHERE id IN ({','.join([str(i) for i in delete_pks[:5]])}) + ''') + assert controller.validator.bulk_import_validation() + controller.validator.stop_flag = False + cursor.execute(f''' + DELETE FROM {config.DESTINATION_TABLE} + WHERE id IN ({','.join([str(i) for i in delete_pks])}) + ''') + assert not controller.validator.bulk_import_validation() + + +@pytest.mark.parametrize('setup_table', ['with_data'], indirect=True) +@pytest.mark.parametrize('controller', ['object'], indirect=True) +@pytest.mark.parametrize('case', ['normal', 'on_restart']) +def test_add_index(controller: Controller, setup_table, cursor, case): + cursor.execute(f''' + ALTER TABLE {config.DESTINATION_TABLE} + MODIFY COLUMN A VARCHAR(128), MODIFY COLUMN B VARCHAR(128), MODIFY COLUMN C VARCHAR(128) + ''') + + config.INDEXES = [ + IndexConfig('idx_1', 'A'), + IndexConfig('idx_2', 'B'), + IndexConfig('idx_3', 'C'), + IndexConfig('idx_4', 'A,B'), + IndexConfig('idx_5', 'A,C'), + IndexConfig('idx_6', 'B,C') + ] + + cursor.execute("TRUNCATE TABLE index_creation_status") + cursor.executemany(f''' + INSERT INTO index_creation_status + (migration_id, index_name, index_columns, is_unique) + VALUES (%s, %s, %s, %s) + ''', [(1, index.name, index.columns, index.unique) for index in config.INDEXES]) + + if case == 'on_restart': + cursor.execute("UPDATE index_creation_status SET started_at = NOW() WHERE id = 1") + + def delayed_add_index(): + time.sleep(100) + cursor.execute(f''' + ALTER TABLE {config.DESTINATION_TABLE} ADD INDEX idx_1 (A) + ''') + + threading.Thread(target=delayed_add_index).start() + controller.add_index() + else: + controller.add_index() + + cursor.execute(''' + SELECT DISTINCT index_name FROM mysql.innodb_index_stats + WHERE table_name = %s AND index_name not like 'PRIMARY' + ''', (config.DESTINATION_TABLE,)) + created_indexes = set([row[0] for row in cursor.fetchall()]) + expected_indexes = set([index.name for index in config.INDEXES]) + assert created_indexes == expected_indexes + + +@pytest.mark.parametrize('setup_table', ['with_data'], indirect=True) +@pytest.mark.parametrize('controller', ['object'], indirect=True) +def test_apply_dml_events_validation(controller: Controller, setup_table, cursor, override_operation_class): + controller.initializer.fetch_metadata(1) + + timestamp_range = (1, 100) + insert_events = [(random.randint(1, TABLE_SIZE), random.randint(*timestamp_range)) for _ in range(500)] + update_events = [(random.randint(1, TABLE_SIZE), random.randint(*timestamp_range)) for _ in range(500)] + delete_events = [(random.randint(1, TABLE_SIZE), random.randint(*timestamp_range)) for _ in range(500)] + cursor.executemany(''' + INSERT IGNORE INTO inserted_pk_1 (source_pk, event_timestamp) VALUES (%s, %s) + ''', insert_events) + cursor.executemany(''' + INSERT IGNORE INTO updated_pk_1 (source_pk, event_timestamp) VALUES (%s, %s) + ''', update_events) + cursor.executemany(''' + INSERT IGNORE INTO deleted_pk_1 (source_pk, event_timestamp) VALUES (%s, %s) + ''', delete_events) + cursor.execute("TRUNCATE TABLE event_handler_status") + cursor.execute("TRUNCATE TABLE apply_dml_events_status") + + # Event handler status doesn't have any row + assert not controller.validator.apply_dml_events_validation() + + # Insert row to event handler status and validate + cursor.execute(f''' + INSERT INTO 
event_handler_status (migration_id, log_file, log_pos, last_event_timestamp, created_at) + VALUES (1, 'mysql-bin.000001', 4, {timestamp_range[1]}, NOW()) + ''') + controller.validator.apply_dml_events_validation() + + # Check if the validation is correct + cursor.execute(''' + SELECT COUNT(1) FROM unmatched_rows WHERE migration_id = 1 AND unmatch_type = 'not_removed' + ''') + not_removed_event_count = cursor.fetchone()[0] + + # Not deleted from source table + # Since rows are not deleted from source table. All pks in deleted_pk will be counted as reinserted pks. + assert not_removed_event_count == 0 + + # Delete rows from source table and validate + cursor.execute(f''' + DELETE FROM apply_dml_events_validation_status WHERE migration_id = 1 + ''') + cursor.execute(f''' + DELETE FROM {config.SOURCE_TABLE} WHERE id IN (SELECT source_pk FROM deleted_pk_1) + ''') + controller.validator.apply_dml_events_validation() + + # Check if the validation is correct + cursor.execute(''' + SELECT COUNT(1) FROM unmatched_rows WHERE migration_id = 1 AND unmatch_type = 'not_removed' + ''') + not_removed_event_count = cursor.fetchone()[0] + cursor.execute("SELECT COUNT(1) FROM deleted_pk_1") + deleted_pk_count = cursor.fetchone()[0] + assert not_removed_event_count == deleted_pk_count + + # Delete rows from destination table and validate + cursor.execute(f''' + DELETE FROM {config.SOURCE_TABLE} + WHERE id IN ({','.join([str(i) for i in [i[0] for i in delete_events]])}) + ''') + cursor.execute(f''' + DELETE FROM {config.DESTINATION_TABLE} + WHERE id IN ({','.join([str(i) for i in [i[0] for i in delete_events]])}) + ''') + assert controller.validator.apply_dml_events_validation() + cursor.execute("SELECT COUNT(1) FROM unmatched_rows") + assert cursor.fetchone()[0] == 0 + + # Add new insert, update event + new_timestamp_range = (101, 200) + new_insert_events = [ + (random.randint(TABLE_SIZE, TABLE_SIZE * 2), random.randint(*new_timestamp_range)) for _ in range(500)] + new_update_events = [(random.randint(1, TABLE_SIZE), random.randint(*new_timestamp_range)) for _ in range(500)] + + cursor.executemany(''' + INSERT IGNORE INTO inserted_pk_1 (source_pk, event_timestamp) VALUES (%s, %s) + ''', new_insert_events) + cursor.executemany(''' + INSERT IGNORE INTO updated_pk_1 (source_pk, event_timestamp) VALUES (%s, %s) + ''', new_update_events) + + cursor.executemany(f''' + INSERT IGNORE INTO {config.SOURCE_TABLE} (id, A, B, C) VALUES (%s, %s, %s, %s) + ''', [(i[0], 'a', 'b', 'c') for i in new_insert_events]) + cursor.execute(f''' + UPDATE {config.SOURCE_TABLE} SET A = 'x' WHERE id IN ({','.join([str(i) for i in [i[0] for i in new_update_events]])}) + ''') + + cursor.execute(f''' + INSERT INTO event_handler_status (migration_id, log_file, log_pos, last_event_timestamp, created_at) + VALUES (1, 'mysql-bin.000001', 4, {new_timestamp_range[1]}, NOW()) + ''') + + assert not controller.validator.apply_dml_events_validation() + + # Apply changes to destination table + cursor.executemany(f''' + INSERT IGNORE INTO {config.DESTINATION_TABLE} (id, A, B, C) VALUES (%s, %s, %s, %s) + ''', [(i[0], 'a', 'b', 'c') for i in new_insert_events]) + cursor.execute(f''' + UPDATE {config.DESTINATION_TABLE} SET A = 'x' WHERE id IN ({','.join([str(i) for i in [i[0] for i in new_update_events]])}) + ''') + assert controller.validator.apply_dml_events_validation() + cursor.execute("SELECT COUNT(1) FROM unmatched_rows") + assert cursor.fetchone()[0] == 0 + + # Test full validation + assert controller.validator.full_dml_event_validation() + + 
cursor.execute("TRUNCATE TABLE event_handler_status") + cursor.execute("TRUNCATE TABLE inserted_pk_1") + cursor.execute("TRUNCATE TABLE updated_pk_1") + cursor.execute("TRUNCATE TABLE deleted_pk_1") + cursor.execute("TRUNCATE TABLE unmatched_rows") + cursor.execute("TRUNCATE TABLE apply_dml_events_validation_status") + cursor.execute("TRUNCATE TABLE full_dml_event_validation_status") + + +@pytest.mark.parametrize('setup_table', ['with_data'], indirect=True) +@pytest.mark.parametrize('controller', ['object'], indirect=True) +def test_swap_tables(controller: Controller, setup_table, cursor, redis_data: RedisData): + redis_data.updated_pk_set.delete() + redis_data.removed_pk_set.delete() + assert redis_data.last_catchup_timestamp < time.time() + + # Auto swap is disabled + redis_data.set_current_stage(Stage.SWAP_TABLES) + controller.swap_tables() + assert redis_data.current_stage == Stage.APPLY_DML_EVENTS_VALIDATION + + # Auto swap is enabled, but updated_pk_set is not empty + config.AUTO_SWAP = True + redis_data.updated_pk_set.add([1]) + redis_data.set_current_stage(Stage.SWAP_TABLES) + controller.swap_tables() + assert redis_data.current_stage == Stage.APPLY_DML_EVENTS_VALIDATION + + # last_catchup_timestamp hasn't been updated after rename + redis_data.set_last_catchup_timestamp(time.time()) + redis_data.updated_pk_set.get(1) + redis_data.set_current_stage(Stage.SWAP_TABLES) + controller.swap_tables() + assert redis_data.current_stage == Stage.SWAP_TABLES_FAILED + cursor.execute(f''' + SELECT COUNT(1) FROM information_schema.TABLES + WHERE TABLE_NAME in ('{config.SOURCE_TABLE}','{config.DESTINATION_TABLE}') + ''') + assert cursor.fetchone()[0] == 2 + + # Correct condition + redis_data.set_last_catchup_timestamp(time.time() + 1) + redis_data.set_current_stage(Stage.SWAP_TABLES) + controller.swap_tables() + assert redis_data.current_stage == Stage.DONE + cursor.execute(f''' + SELECT COUNT(1) FROM information_schema.TABLES + WHERE TABLE_NAME in ('{config.SOURCE_TABLE}','{config.DESTINATION_TABLE}') + ''') + assert cursor.fetchone()[0] == 1 + + # Clean up + cursor.execute(f"DROP TABLE _{config.SOURCE_TABLE}_old_{datetime.now().strftime('%Y%m%d')}") diff --git a/tests/test_eventhandler.py b/tests/test_eventhandler.py new file mode 100644 index 0000000..efd3661 --- /dev/null +++ b/tests/test_eventhandler.py @@ -0,0 +1,173 @@ +import random +import time +from threading import Thread + +import pytest + +from config import config +from modules.redis import RedisData +from sbosc.const import Stage +from sbosc.eventhandler import EventHandler + + +############ +# Fixtures # +############ +@pytest.fixture(autouse=True) +def ignore_deprecation_warning(): + # pymysqlreplication is using deprecated distutils.version.LooseVersion + import warnings + warnings.filterwarnings("ignore", category=DeprecationWarning) + + +@pytest.fixture +def setup_table(cursor): + # Truncate table + cursor.execute("TRUNCATE TABLE event_handler_status") + cursor.execute("TRUNCATE TABLE apply_dml_events_status") + # Source table + cursor.execute(f"DROP TABLE IF EXISTS {config.SOURCE_TABLE}") + cursor.execute(f''' + CREATE TABLE {config.SOURCE_TABLE} ( + id INT PRIMARY KEY AUTO_INCREMENT, + A CHAR(10), B CHAR(10), C CHAR(10) + ) + ''') + print(f"Created table {config.SOURCE_TABLE}") + + +@pytest.fixture(autouse=True) +def init_redis(redis_data): + redis_data.set_current_stage(Stage.APPLY_DML_EVENTS_VALIDATION) + redis_data.metadata.source_db = 'sbosc' + redis_data.metadata.source_table = config.SOURCE_TABLE + 
redis_data.updated_pk_set.delete() + redis_data.removed_pk_set.delete() + + +@pytest.fixture +def event_handler(setup_table): + event_handler = EventHandler() + event_handler_thread = Thread(target=event_handler.start) + event_handler_thread.start() + yield event_handler + event_handler.set_stop_flag() + + while event_handler_thread.is_alive(): + time.sleep(1000) + + +######## +# Test # +######## +def test_event_handler(event_handler, cursor, redis_data: RedisData): + time.sleep(100) + assert redis_data.current_stage == Stage.APPLY_DML_EVENTS_VALIDATION + assert event_handler.live_mode + time.sleep(100) + + # test insert event + cursor.executemany( + f'INSERT INTO {config.SOURCE_TABLE} (A, B, C) VALUES (%s, %s, %s)', + [('a', 'b', 'c'), ('d', 'e', 'f')] + ) + time.sleep(100) + + assert len(redis_data.updated_pk_set) == 2 + assert set(redis_data.updated_pk_set.get(2)) == {'1', '2'} + + # test update event on same pk + cursor.executemany( + f'UPDATE {config.SOURCE_TABLE} SET A=%s, B=%s, C=%s WHERE id=%s', + [('a', 'b', 'c', 1), ('d', 'e', 'f', 1)] + ) + time.sleep(100) + assert len(redis_data.updated_pk_set) == 1 + assert set(redis_data.updated_pk_set.get(1)) == {'1'} + + # test delete event + cursor.execute(f'DELETE FROM {config.SOURCE_TABLE} WHERE id=1') + time.sleep(100) + assert len(redis_data.updated_pk_set) == 0 + assert len(redis_data.removed_pk_set) == 1 + assert set(redis_data.removed_pk_set.get(1)) == {'1'} + + +def test_event_handler_save_to_database(event_handler, cursor, redis_data): + time.sleep(100) + + total_events = 1000 + redis_data.metadata.max_id = total_events + insert_events = total_events // 2 + update_events = (total_events - insert_events) // 2 + delete_events = total_events - insert_events - update_events + + # Remove previous data + for table in ['inserted_pk_1', 'updated_pk_1', 'deleted_pk_1']: + cursor.execute(f'TRUNCATE TABLE {table}') + redis_data.updated_pk_set.delete() + redis_data.removed_pk_set.delete() + + # Set current stage to START_EVENT_HANDLER + event_handler.live_mode = False + redis_data.set_current_stage(Stage.START_EVENT_HANDLER) + time.sleep(100) + + # Insert events + for _ in range(insert_events): + cursor.execute(f'INSERT INTO {config.SOURCE_TABLE} (A, B, C) VALUES (%s, %s, %s)', ('a', 'b', 'c')) + after_insert = time.time() + while redis_data.last_catchup_timestamp < after_insert: + print("Waiting for INSERT events to be processed...") + time.sleep(100) + + # Update events + for i in range(update_events): + target_id = random.choice(range(1, insert_events + 1)) + # Update events are only created when data is changed + cursor.execute( + f'UPDATE {config.SOURCE_TABLE} SET A=%s, B=%s, C=%s WHERE id=%s', + (f'a{i}', f'b{i}', f'c{i}', target_id) + ) + after_update = time.time() + while redis_data.last_catchup_timestamp < after_update: + print("Waiting for UPDATE events to be processed...") + time.sleep(100) + + # Delete events + deleted_ids = random.sample(range(1, insert_events + 1), delete_events) + for target_id in deleted_ids: + cursor.execute(f'DELETE FROM {config.SOURCE_TABLE} WHERE id=%s', (target_id,)) + after_delete = time.time() + while redis_data.last_catchup_timestamp < after_delete: + print("Waiting for DELETE events to be processed...") + time.sleep(100) + + # Check if the events have been saved to the database + cursor.execute(f'SELECT COUNT(*) FROM inserted_pk_1') + assert cursor.fetchone()[0] == insert_events + cursor.execute(f'SELECT COUNT(*) FROM {config.SOURCE_TABLE} WHERE A != %s', ('a',)) + updated_source_rows = 
cursor.fetchone()[0] + cursor.execute( + f'SELECT COUNT(*) FROM updated_pk_1 WHERE source_pk NOT IN (SELECT source_pk FROM deleted_pk_1)') + assert cursor.fetchone()[0] == updated_source_rows + cursor.execute(f'SELECT COUNT(*) FROM deleted_pk_1') + assert cursor.fetchone()[0] == delete_events + + # Set current stage to APPLY_DML_EVENTS + redis_data.set_current_stage(Stage.APPLY_DML_EVENTS) + while event_handler.event_loader.last_loaded_timestamp != event_handler.event_store.last_event_timestamp: + time.sleep(100) + + # Check if the primary keys in updated_pk_set have been saved to the database + assert len(redis_data.updated_pk_set) == insert_events - delete_events + + # Check if the primary keys in removed_pk_set have been saved to the database + assert len(redis_data.removed_pk_set) == delete_events + + while redis_data.current_stage != Stage.APPLY_DML_EVENTS_VALIDATION: + # Event timestamp is in seconds, so same events can be loaded multiple times + redis_data.updated_pk_set.delete() + redis_data.removed_pk_set.delete() + time.sleep(100) + assert redis_data.current_stage == Stage.APPLY_DML_EVENTS_VALIDATION diff --git a/tests/test_worker.py b/tests/test_worker.py index f1be5b2..a0a4497 100644 --- a/tests/test_worker.py +++ b/tests/test_worker.py @@ -82,22 +82,28 @@ def test_setup_table(cursor, setup_table): assert result[0][0] in TEST_TABLE_VALUES +def check_thread_count(_worker_manager, desired_thread_count): + for _ in range(10): + if _worker_manager.desired_thread_count == desired_thread_count and \ + _worker_manager.thread_count == desired_thread_count: + break + time.sleep(100) + assert _worker_manager.thread_count == desired_thread_count + assert _worker_manager.desired_thread_count == desired_thread_count + + def test_add_remove_thread(worker_manager, redis_data): # init condition check assert worker_manager.desired_thread_count == 0 # add thread redis_data.worker_config.thread_count = 3 - time.sleep(100) - assert worker_manager.desired_thread_count == 3 - assert worker_manager.thread_count == 3 + check_thread_count(worker_manager, 3) assert worker_manager.created_threads == 3 # remove thread redis_data.worker_config.thread_count = 1 - time.sleep(100) - assert worker_manager.desired_thread_count == 1 - assert worker_manager.thread_count == 1 + check_thread_count(worker_manager, 1) assert worker_manager.created_threads == 3 @@ -172,13 +178,7 @@ def test_bulk_import(setup_table, cursor, worker_manager, request_id, redis_data desired_thread_count = 5 redis_data.worker_config.thread_count = desired_thread_count redis_data.worker_config.batch_size = 300 - - for _ in range(10): - if worker_manager.desired_thread_count == desired_thread_count and \ - worker_manager.thread_count == desired_thread_count: - break - time.sleep(100) - assert worker_manager.desired_thread_count == desired_thread_count + check_thread_count(worker_manager, 5) # Check worker status time.sleep(100) From 2424e9d2f32719e2040d3aa2f6ee2d2258cc973a Mon Sep 17 00:00:00 2001 From: Jimmy Kim Date: Tue, 2 Apr 2024 22:22:53 +0900 Subject: [PATCH 06/22] github action --- .github/workflows/conf.d/my.cnf | 5 ++++ .github/workflows/linters.yml | 24 +++++++++++++++++ .github/workflows/tests.yml | 48 +++++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+) create mode 100644 .github/workflows/conf.d/my.cnf create mode 100644 .github/workflows/linters.yml create mode 100644 .github/workflows/tests.yml diff --git a/.github/workflows/conf.d/my.cnf b/.github/workflows/conf.d/my.cnf new file mode 100644 index 
0000000..d04cd05 --- /dev/null +++ b/.github/workflows/conf.d/my.cnf @@ -0,0 +1,5 @@ +[mysqld] +character_set_server=utf8mb4 +collation_server=utf8mb4_unicode_ci +server_id=1 +log_bin=mysqld-bin diff --git a/.github/workflows/linters.yml b/.github/workflows/linters.yml new file mode 100644 index 0000000..a34a2b8 --- /dev/null +++ b/.github/workflows/linters.yml @@ -0,0 +1,24 @@ +name: Linters + +on: + pull_request: + push: + branches: + - master + +jobs: + flake8_py3: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: 3.11 + - run: python -m pip install flake8 + - name: flake8 + uses: liskin/gh-problem-matcher-wrap@v1 + with: + linters: flake8 + run: flake8 diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..5e6e9b2 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,48 @@ +name: Tests + +on: + push: + branches: + - main + pull_request: + workflow_dispatch: + +jobs: + test: + runs-on: ubuntu-latest + services: + mysql: + image: "mysql:8.0.34" + ports: + - "3306:3306" + env: + MYSQL_ALLOW_EMPTY_PASSWORD: 1 + MYSQL_ROOT_HOST: "%" + MYSQL_DATABASE: "sbosc" + options: --health-cmd="mysqladmin ping" --health-interval=10s --health-timeout=5s --health-retries=3 + redis: + image: "bitnami/redis:7.0.4" + ports: + - "6379:6379" + env: + ALLOW_EMPTY_PASSWORD: "yes" + + steps: + - uses: actions/checkout@v2 + - name: Copy custom MySQL configuration file + run: | + docker cp ./.github/workflows/conf.d/my.cnf $(docker ps -aqf "name=mysql"):/etc/mysql/conf.d/my.cnf + docker kill $(docker ps -aqf "name=mysql") + docker start $(docker ps -aqf "name=mysql" -a) + - name: Set up Python 3.11 + uses: actions/setup-python@v2 + with: + python-version: 3.11 + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install pytest sqlalchemy + - name: Run tests + run: | + python -m pytest -s From c3ab40e636018661de899e63b51538eee3287cf6 Mon Sep 17 00:00:00 2001 From: Jimmy Kim Date: Tue, 2 Apr 2024 22:25:41 +0900 Subject: [PATCH 07/22] fix actions --- .github/workflows/tests.yml | 1 + src/config/env.py | 2 +- src/sbosc/monitor/monitor.py | 1 - tests/configs/secret.json | 2 +- tests/conftest.py | 2 +- 5 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 5e6e9b2..05232b9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -45,4 +45,5 @@ jobs: pip install pytest sqlalchemy - name: Run tests run: | + export PYTHONPATH="$(pwd)/src" python -m pytest -s diff --git a/src/config/env.py b/src/config/env.py index b5ee5d4..d8268bc 100644 --- a/src/config/env.py +++ b/src/config/env.py @@ -12,7 +12,7 @@ class Env: AWS_REGION: str = 'ap-northeast-2' POD_NAME: str = 'local' # POD_NAME = 'local' will determine whether it's running in a local environment or not. 
CONFIG_FILE: str = '/opt/sb-osc/config.yaml' - SECRET_FILE: str = '/opt/sb-osc/secrets.json' + SECRET_FILE: str = '/opt/sb-osc/secret.json' def __init__(self, **envs): """ diff --git a/src/sbosc/monitor/monitor.py b/src/sbosc/monitor/monitor.py index 5616846..16767a7 100644 --- a/src/sbosc/monitor/monitor.py +++ b/src/sbosc/monitor/monitor.py @@ -9,7 +9,6 @@ from sbosc.component import SBOSCComponent from sbosc.const import Stage -from config import env from modules.aws import CloudWatch diff --git a/tests/configs/secret.json b/tests/configs/secret.json index df7b37d..bf3aceb 100644 --- a/tests/configs/secret.json +++ b/tests/configs/secret.json @@ -1,7 +1,7 @@ { "username": "root", "password": "", - "port": 3306, + "port": "3306", "redis_host": "127.0.0.1", "redis_password": "", "slack_channel": "", diff --git a/tests/conftest.py b/tests/conftest.py index 7108bd1..126a2bf 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -15,7 +15,7 @@ 'AWS_REGION': 'ap-northeast-2', 'POD_NAME': 'local', 'CONFIG_FILE': f'{PATH}/configs/config.yaml', - 'SECRET_FILE': f'{PATH}/configs/secrets.json', + 'SECRET_FILE': f'{PATH}/configs/secret.json', } os.environ.update(ENVS) From 4f156e4dca7d0a15873064e8b440f117e9f24fd7 Mon Sep 17 00:00:00 2001 From: Jimmy Kim Date: Thu, 4 Apr 2024 01:46:50 +0900 Subject: [PATCH 08/22] README --- README.md | 141 ++++++++++++++++++++++++++++++++++++ src/config/__init__.py | 1 - src/modules/logger.py | 1 - src/modules/redis/schema.py | 1 - 4 files changed, 141 insertions(+), 3 deletions(-) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..9cbc7e4 --- /dev/null +++ b/README.md @@ -0,0 +1,141 @@ +# SB-OSC + +**Sendbird's online schema migration for Aurora MySQL** + +SB-OSC is an online schema change tool for Aurora MySQL databases, designed to dramatically improve performance on large +tables by leveraging multithreading in all stages of the schema migration process. + +It also provides seamless pausing and resuming of tasks to adeptly handle extended operation times of large table schema +changes, along with a built-in monitoring system to dynamically control its heavy DML load based on Aurora's performance +metrics. + +SB-OSC is designed to overcome the limitations that traditional migration tools face with large-scale tables, +significantly reducing the operational overhead associated with managing large tables. + +## Takeaways + +SB-OSC has its own unique features that differentiate it from traditional schema migration tools such +as `pt-osc` and `gh-ost`. + +### Multithreading + +SB-OSC is designed to leverage multithreading in all stages of the schema migration process: bulk import (initial table +copy), binlog event processing, and DML event application. + +For binlog event processing, SB-OSC processes binlog files in parallel, which enables it to handle large tables with +heavy write loads. + +### Resumable +SB-OSC is resumable at any stage of the schema migration process. It saves the current state of each stage to the database and Redis, allowing users to pause and resume the process at any time, as long as binlog retention is sufficient. + +### Operation Class + +SB-OSC supports operation classes that can override the main queries used in the schema migration process. This feature +allows users to customize queries for specific use cases such as data retention, table redesign, and more.
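+
+For illustration, here is a minimal sketch of an operation class that filters rows during the copy. The class name and
+the `ts` retention column are hypothetical, not part of SB-OSC itself:
+
+```python
+from sbosc.operations.base import BaseOperation
+
+
+class RetentionOperation(BaseOperation):
+    # Copy only the last 30 days of rows, assuming the source table has a `ts` timestamp column.
+    def _insert_batch_query(self, start_pk, end_pk):
+        return f"""
+            INSERT INTO {self.source_db}.{self.destination_table}({self.source_columns})
+            SELECT {self.source_columns}
+            FROM {self.source_db}.{self.source_table} AS source
+            WHERE source.id BETWEEN {start_pk} AND {end_pk}
+            AND source.ts > DATE_SUB(NOW(), INTERVAL 30 DAY)
+        """
+```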
+ +Also, it provides an operation class that allows replication across different Aurora clusters, which can be used in various +scenarios such as cross-region replication, cross-account replication, clone cluster replication, etc. + +### Data Validation + +SB-OSC provides strong data validation features to ensure data consistency between the source and destination tables. It +validates both the bulk import and DML event application stages, and attempts to recover from any inconsistencies. + +### Index Creation Strategy + +SB-OSC allows users to create indexes after the bulk import stage, which can significantly reduce the time required for +the initial table copy. This feature is especially useful for large tables with many indexes. + +### Monitoring + +SB-OSC has a built-in monitoring system that dynamically controls its heavy DML load based on Aurora's performance +metrics. This feature makes SB-OSC more reliable in production environments, since it will automatically adjust its DML +load when production traffic increases. + +## Requirements + +SB-OSC is designed to work with Aurora MySQL database, and it's an EKS-based tool. + +It requires the following components to run: + +- Aurora MySQL database (v2, v3) +- EKS cluster +- ExternalSecrets (AWS Secrets Manager) +- IAM role +- `binlog_format` set to `ROW` +- `binlog-ignore-db` set to `sbosc` (Recommended) + +## Performance + +SB-OSC shows high performance on both binlog event processing and bulk import. Following are specs of tables used for +performance testing: + +| Table Alias | Avg Row Length (Bytes) | Write IOPS (IOPS/m) | |:-----------:|-----------------------:|--------------------:| | A | 57 | 149 | | B | 912 | 502 | | C | 61 | 3.38 K | | D | 647 | 17.9 K | | E | 1042 | 24.4 K | | F | 86 | 151 K | | G | 1211 | 60.7 K | + +**Avg Row Length**: `avg_row_length` from `information_schema.TABLES` +**Write IOPS**: Average increase of `count_star` from `performance_schema.table_io_waits_summary_by_table` per +minute. + +All tables were in the same Aurora MySQL v3 cluster. + +### Binlog Event Processing + +Following is the read throughput of binlog event processing in bytes read per minute. By comparing the read throughput to the total +binlog creation rate of the cluster, we can see whether SB-OSC can keep up with DML events or not. + +**Total Binlog Creation Rate**: 144 (MB/m) + +| Table Alias | A | B | C | D | E | F | G | |:----------------------:|:---:|:---:|:---:|:---:|:---:|:---:|:---:| | Read Throughput (MB/m) | 513 | 589 | 591 | 402 | 466 | 361 | 305 | + +The results show that SB-OSC can keep up with DML events on tables with very high write loads. + +### Bulk Import + +To provide a general insight into bulk import performance, the test was conducted on table `A` with no secondary indexes and no additional traffic. + +The actual performance of bulk import can vary depending on the number of secondary indexes, the number of rows, column types, +production traffic, etc.
+ +Following are the results of bulk import performance based on instance sizes: + +| Instance Type | Insert Rate (rows/s) | Network Throughput (Bytes/s) | Storage Throughput (Bytes/s) | CPU Utilization (%) | +|:-------------:|---------------------:|-----------------------------:|-----------------------------:|--------------------:| +| r6g.2xlarge | 42.3 K | 27.2 K | 457 M | 55.0 | +| r6g.4xlarge | 94.0 K | 45.9 K | 900 M | 51.9 | +| r6g.8xlarge | 158 K | 72.2 K | 1.39 G | 44.6 | + +Insert rate, network throughput, and storage throughput are the average values calculated from CloudWatch metrics. + +## Limitations + +- **Necessity of Integer Primary Keys** + SB-OSC performs multithreading based on integer primary keys (PKs) during the bulk import phase. This approach, + designed around batch processing and other operations utilizing integer PKs, means SB-OSC cannot be used with tables + that do not have integer PKs. + + +- **Updates on Primary Key** + SB-OSC replicates records from the original table based on the PK for applying DML events. Therefore, if updates occur + on the table's PK, it can be challenging to guarantee data integrity. + + +- **Binlog Resolution** + SB-OSC is limited by the fact that binlog's resolution is in seconds. While this doesn't significantly impact most + scenarios due to SB-OSC's design, it can affect the logic based on timestamps when excessive events occur within a + second. + + +- **Reduced Efficiency for Small Tables** + For small tables, the initial table creation, chunk creation, and the multi-stage process of SB-OSC can act as + overhead, potentially slowing down the overall speed. Therefore, applying SB-OSC to small tables may not be as + effective. diff --git a/src/config/__init__.py b/src/config/__init__.py index 91fec36..f833ee4 100644 --- a/src/config/__init__.py +++ b/src/config/__init__.py @@ -5,4 +5,3 @@ config = Config() # override by setting env CONFIG_FILE secret = Secret() # override by setting env SECRET_FILE env = Env() # override with environment variables - diff --git a/src/modules/logger.py b/src/modules/logger.py index dfee8ac..7a2b0eb 100644 --- a/src/modules/logger.py +++ b/src/modules/logger.py @@ -48,4 +48,3 @@ def get_logger(default_tags=None): logger.addHandler(handler) return logger - diff --git a/src/modules/redis/schema.py b/src/modules/redis/schema.py index e8dac62..c5fb30d 100644 --- a/src/modules/redis/schema.py +++ b/src/modules/redis/schema.py @@ -44,4 +44,3 @@ class RedisKey: CHUNK_SET = 'sb-osc:{}:{}:bulk_import:chunk_set' CHUNK_INFO = 'sb-osc:{}:{}:bulk_import:chunk_info:{}' OLD_SOURCE_TABLE = 'sb-osc:{}:{}:old_source_table' - From 6f4d70e76620d7c2063942e13acfd727956ba3b5 Mon Sep 17 00:00:00 2001 From: Jimmy Kim Date: Fri, 5 Apr 2024 19:04:18 +0900 Subject: [PATCH 09/22] chart --- charts/Chart.yaml | 3 + charts/templates/externalsecret.yaml | 49 ++++++++++++++ charts/templates/redis.yaml | 88 +++++++++++++++++++++++++ charts/templates/sb-osc.yaml | 97 ++++++++++++++++++++++++++++ charts/templates/serviceaccount.yaml | 14 ++++ charts/values.yaml | 79 ++++++++++++++++++++++ src/sbosc/component.py | 7 +- src/sbosc/monitor/monitor.py | 2 +- 8 files changed, 337 insertions(+), 2 deletions(-) create mode 100644 charts/templates/externalsecret.yaml create mode 100644 charts/templates/redis.yaml create mode 100644 charts/templates/sb-osc.yaml create mode 100644 charts/templates/serviceaccount.yaml diff --git a/charts/Chart.yaml b/charts/Chart.yaml index e69de29..d334c37 100644 --- a/charts/Chart.yaml +++ b/charts/Chart.yaml @@ 
-0,0 +1,3 @@ +apiVersion: v2 +name: sb-osc +version: 0.0.1 diff --git a/charts/templates/externalsecret.yaml b/charts/templates/externalsecret.yaml new file mode 100644 index 0000000..8d65f8a --- /dev/null +++ b/charts/templates/externalsecret.yaml @@ -0,0 +1,49 @@ +apiVersion: 'external-secrets.io/v1beta1' +kind: SecretStore +metadata: + name: sb-osc-secret +spec: + provider: + aws: + service: SecretsManager + region: {{ .Values.awsRegion }} + auth: + jwt: + serviceAccountRef: + name: sb-osc-external-secrets + namespace: {{ .Release.Namespace }} +--- +apiVersion: 'external-secrets.io/v1beta1' +kind: ExternalSecret +metadata: + name: sb-osc-secret +spec: + secretStoreRef: + name: sb-osc-secret + kind: SecretStore + target: + name: sb-osc-secret + template: + engineVersion: v2 + data: + secret.json: | + {{ printf `{ + {{- $first := true }} + {{- range $k, $v := . }} + {{- if $first }} + {{- $first = false }} + {{- else }} + {{- "," -}} + {{- end }} + "{{ $k }}": "{{ $v }}" + {{- end }} + }` }} + redis.conf: {{ printf `| + requirepass {{ .redis_password }} + appendonly yes + save "" + `}} + dataFrom: + - extract: + key: {{ .Values.secretName }} + refreshInterval: 10m diff --git a/charts/templates/redis.yaml b/charts/templates/redis.yaml new file mode 100644 index 0000000..3edfafa --- /dev/null +++ b/charts/templates/redis.yaml @@ -0,0 +1,88 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: sb-osc-redis + labels: + app: redis +spec: + replicas: 1 + revisionHistoryLimit: 3 + selector: + matchLabels: + app: redis + serviceName: redis + template: + metadata: + labels: + app: redis + spec: + containers: + - name: sb-osc-redis + image: redis:7.0.2 + command: + - redis-server + - /usr/local/etc/redis/redis.conf + ports: + - containerPort: 6379 + resources: + requests: + cpu: "500m" + {{- if hasKey .Values.redis "memory" }} + memory: {{ .Values.redis.memory }} + {{- else }} + memory: "1000Mi" + {{- end }} + volumeMounts: + - mountPath: /data + name: redis-data + - mountPath: /usr/local/etc/redis/redis.conf + name: redis-secret + subPath: redis.conf + volumes: + - name: redis-data + persistentVolumeClaim: + claimName: redis-pvc + - name: redis-config + configMap: + name: redis-config + - name: redis-secret + secret: + secretName: sb-osc-secret + +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: redis-pvc +spec: + accessModes: + - ReadWriteOnce + storageClassName: redis-sc + resources: + requests: + storage: 5Gi + +--- +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: redis-sc +provisioner: ebs.csi.aws.com +volumeBindingMode: WaitForFirstConsumer +parameters: + type: gp3 + +--- +apiVersion: v1 +kind: Service +metadata: + name: redis + labels: + app: redis +spec: + ports: + - port: 6379 + targetPort: 6379 + selector: + app: redis + diff --git a/charts/templates/sb-osc.yaml b/charts/templates/sb-osc.yaml new file mode 100644 index 0000000..2b27b5d --- /dev/null +++ b/charts/templates/sb-osc.yaml @@ -0,0 +1,97 @@ +{{- $enabled := .Values.sbosc.enabled }} +{{- $defaultImage := .Values.sbosc.default.image }} +{{- $defualtWorkers := .Values.sbosc.default.workers }} +{{- $defaultResources := deepCopy .Values.sbosc.default.resources }} +{{- $defaultConfig := deepCopy .Values.sbosc.default.config }} + +{{- range .Values.sbosc.instances }} +{{- $instanceEnabled := ternary .enabled $enabled (hasKey . "enabled")}} +{{- if (and $instanceEnabled $enabled) }} +{{- $config := deepCopy $defaultConfig | merge .config }} +{{- $resources := (hasKey . 
"resources") | ternary .resources dict }} +{{- $resources = deepCopy $defaultResources | merge $resources }} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: sb-osc-config-{{ .name }} + labels: + app: sb-osc + name: {{ .name }} +data: + config.yaml: | + {{- $config | toYaml | nindent 4}} + +{{- $name := .name }} +{{- $image := .image | default $defaultImage }} +{{- $workers := ternary .workers $defualtWorkers (hasKey . "workers") }} +{{- $configVersion := $config | toYaml | sha256sum }} +{{- range $component := list "controller" "eventhandler" "monitor" "worker" }} +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: sb-osc-{{ $component }}-{{ $name }} + labels: + app: sb-osc + name: {{ $name }} +spec: + serviceName: sb-osc-{{ $component }} + replicas: {{ ternary $workers 1 (eq $component "worker") }} + revisionHistoryLimit: 3 + selector: + matchLabels: + app: sb-osc + component: {{ $component }} + name: {{ $name }} + template: + metadata: + labels: + app: sb-osc + component: {{ $component }} + name: {{ $name }} + annotations: + config-version: {{ $configVersion }} + spec: + serviceAccountName: sb-osc-access + containers: + - name: {{ $component }} + image: {{ $image }} + imagePullPolicy: Always + command: + - "python" + - "-m" + - "sbosc.{{ $component }}.main" + volumeMounts: + - name: sb-osc-secret + mountPath: {{ $.Values.secretFile }} + subPath: secret.json + readOnly: true + - name: sb-osc-config + mountPath: {{ $.Values.configFile }} + subPath: config.yaml + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: AWS_REGION + value: {{ $.Values.awsRegion }} + - name: CONFIG_FILE + value: {{ $.Values.configFile }} + - name: SECRET_FILE + value: {{ $.Values.secretFile }} + resources: + {{- index $resources $component | toYaml | nindent 10 }} + volumes: + - name: sb-osc-config + configMap: + name: sb-osc-config-{{ $name }} + - name: sb-osc-secret + secret: + secretName: sb-osc-secret + + terminationGracePeriodSeconds: 300 +{{- end }} +{{- end }} +{{- end }} diff --git a/charts/templates/serviceaccount.yaml b/charts/templates/serviceaccount.yaml new file mode 100644 index 0000000..528ceb7 --- /dev/null +++ b/charts/templates/serviceaccount.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: "sb-osc-access" + annotations: + eks.amazonaws.com/role-arn: {{ .Values.sboscRoleArn }} + +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: "sb-osc-external-secrets" + annotations: + eks.amazonaws.com/role-arn: {{ .Values.externalsecretsRoleArn }} diff --git a/charts/values.yaml b/charts/values.yaml index e69de29..8a72866 100644 --- a/charts/values.yaml +++ b/charts/values.yaml @@ -0,0 +1,79 @@ +awsRegion: 'ap-northeast-2' + +# ExternalSecret +secretName: sb-osc + +# ServiceAccount +sboscRoleArn: "" +externalsecretsRoleArn: "" + +# Reids +redis: + memory: 1000Mi + +# Envs +configFile: "/opt/sb-osc/config.yaml" +secretFile: "/opt/sb-osc/secret.json" + +# SB-OSC Config +sbosc: + enabled: false + default: + image: "" + workers: 1 + resources: + controller: + requests: + cpu: 500m + memory: 500Mi + eventhandler: + requests: + cpu: 2 + memory: 500Mi + worker: + requests: + cpu: 500m + memory: 500Mi + monitor: + requests: + cpu: 50m + memory: 100Mi + + config: + auto_swap: false + bypass_least_busy_hour: false + + source_writer_endpoint: "" + + min_chunk_size: 100000 + max_chunk_count: 200 + + min_batch_size: 200 + max_batch_size: 1000 + batch_size_step_size: 200 + + min_thread_count: 1 + max_thread_count: 8 + 
thread_count_step_size: 1 + + commit_interval: 1 + use_batch_size_multiplier: true + + # Monitor + optimal_value_use_limit: 3 + cpu_soft_threshold: 40 + cpu_hard_threshold: 60 + latency_soft_threshold: 30 + latency_hard_threshold: 50 + + # EventHandler + event_handler_thread_count: 4 + event_handler_thread_timeout: 300 + +# instances: +# - name: +# config: +# source_db: +# source_table: +# destination_db: +# destination_table: diff --git a/src/sbosc/component.py b/src/sbosc/component.py index 48a93ee..5090a5d 100644 --- a/src/sbosc/component.py +++ b/src/sbosc/component.py @@ -58,8 +58,13 @@ def get_migration_id(self): )) return cursor.fetchone()[0] if cursor.rowcount > 0 else None + except MySQLdb.OperationalError as e: + if e.args[0] == 1049: + print(f"Database not found. {e.args[1]}") + return None + except MySQLdb.ProgrammingError as e: - if e.args[0] == 1146: # database or table doesn't exist + if e.args[0] == 1146: print(f"Table not found. {e.args[1]}") return None raise e diff --git a/src/sbosc/monitor/monitor.py b/src/sbosc/monitor/monitor.py index 16767a7..821fb12 100644 --- a/src/sbosc/monitor/monitor.py +++ b/src/sbosc/monitor/monitor.py @@ -70,7 +70,7 @@ def __init__(self): "sb_osc_updated_pk_set_length": ("Length of updated PK set", PrometheusMetricSender.GAUGE), "sb_osc_removed_pk_set_length": ("Length of removed PK set", PrometheusMetricSender.GAUGE), "sb_osc_unmatched_rows": ("Number of unmatched row count", PrometheusMetricSender.GAUGE), - }, label_keys=["dbclusteridentifier", "sbregion", "migration_id"]) + }, label_keys=["dbclusteridentifier", "migration_id"]) self.metric_sender.set_labels(labels={ "dbclusteridentifier": config.SOURCE_CLUSTER_ID, "migration_id": self.migration_id From 8b5d05120efe30624ec9812f89d18777fc8e7990 Mon Sep 17 00:00:00 2001 From: Jimmy Kim Date: Fri, 5 Apr 2024 19:14:26 +0900 Subject: [PATCH 10/22] PREFERRED_WINDOW --- src/sbosc/component.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/sbosc/component.py b/src/sbosc/component.py index 5090a5d..20b0ab6 100644 --- a/src/sbosc/component.py +++ b/src/sbosc/component.py @@ -1,5 +1,6 @@ import signal import time +from datetime import datetime import MySQLdb from MySQLdb.cursors import Cursor @@ -34,8 +35,13 @@ def __init__(self): def set_stop_flag(self): self.stop_flag = True - def is_preferred_window(self): - pass + @staticmethod + def is_preferred_window(): + current_time = datetime.now().time() + start_time_str, end_time_str = config.PREFERRED_WINDOW.split('-') + start_time = datetime.strptime(start_time_str, '%H:%M').time() + end_time = datetime.strptime(end_time_str, '%H:%M').time() + return start_time <= current_time <= end_time def get_migration_id(self): if not config.SOURCE_TABLE or not config.DESTINATION_TABLE: From 49dc81d31e5667a7a9306d25770138ac1482ed82 Mon Sep 17 00:00:00 2001 From: Jimmy Kim Date: Sat, 6 Apr 2024 21:00:49 +0900 Subject: [PATCH 11/22] fix bugs --- src/sbosc/controller/controller.py | 16 ++++++---------- src/sbosc/controller/initializer.py | 15 ++++++++++++--- src/sbosc/monitor/monitor.py | 2 +- tests/test_controller.py | 6 +++--- 4 files changed, 22 insertions(+), 17 deletions(-) diff --git a/src/sbosc/controller/controller.py b/src/sbosc/controller/controller.py index 9391a10..a09fe01 100644 --- a/src/sbosc/controller/controller.py +++ b/src/sbosc/controller/controller.py @@ -40,7 +40,6 @@ def start(self): stage_actions = { Stage.BULK_IMPORT_CHUNK_CREATION: self.create_bulk_import_chunks, Stage.BULK_IMPORT_VALIDATION: 
self.validate_bulk_import, - Stage.APPLY_DML_EVENTS: self.apply_dml_events, Stage.ADD_INDEX: self.add_index, Stage.APPLY_DML_EVENTS_VALIDATION: self.apply_dml_events_validation, Stage.SWAP_TABLES: self.swap_tables, @@ -172,15 +171,6 @@ def validate_bulk_import(self): except StopFlagSet: return - def apply_dml_events(self): - self.logger.info("Resetting worker config to minimum values") - revision = self.redis_data.worker_config.revision or 0 # If revision is None, set to 0 - self.redis_data.worker_config.set({ - 'batch_size': config.MIN_BATCH_SIZE, - 'thread_count': config.MIN_THREAD_COUNT, - 'commit_interval': config.COMMIT_INTERVAL, - 'revision': revision + 1 - }) def apply_dml_events_validation(self): self.interval = 10 @@ -303,6 +293,12 @@ def add_index(self): subtitle="Finished creating indexes", message=f"Indexes: {index_names}", color="good") if finished_all_creation: + self.logger.info("Resetting worker config") + self.redis_data.worker_config.set({ + 'batch_size': config.MIN_BATCH_SIZE, + 'thread_count': 0, + 'revision': self.redis_data.worker_config.revision + 1 + }) self.redis_data.set_current_stage(Stage.APPLY_DML_EVENTS) def swap_tables(self): diff --git a/src/sbosc/controller/initializer.py b/src/sbosc/controller/initializer.py index aba5d37..d04966a 100644 --- a/src/sbosc/controller/initializer.py +++ b/src/sbosc/controller/initializer.py @@ -153,8 +153,7 @@ def setup_database(self): ''') self.logger.info("Event handler status table created") - def fetch_metadata(self, migration_id): - redis_data = RedisData(migration_id) + def fetch_metadata(self, redis_data): metadata = redis_data.metadata # Config data @@ -232,8 +231,18 @@ def init_migration(self): ''') self.logger.info("DML log tables created") + redis_data = RedisData(migration_id) + # Fetch metadata - self.fetch_metadata(migration_id) + self.fetch_metadata(redis_data) + + # Set initial worker config + redis_data.worker_config.set({ + 'batch_size': config.MIN_BATCH_SIZE, + 'thread_count': config.MIN_THREAD_COUNT, + 'commit_interval': config.COMMIT_INTERVAL, + 'revision': 0, + }) slack = SlackClient("SB-OSC Controller", f'{config.SOURCE_CLUSTER_ID}, {migration_id}') slack.send_message( diff --git a/src/sbosc/monitor/monitor.py b/src/sbosc/monitor/monitor.py index 821fb12..d076b87 100644 --- a/src/sbosc/monitor/monitor.py +++ b/src/sbosc/monitor/monitor.py @@ -324,7 +324,7 @@ def check_migration_status(self): log_file, log_pos = cursor.fetchone() remaining_binlog_size = 0 cursor.execute("SHOW BINARY LOGS") - for log_name, file_size in cursor.fetchall(): + for log_name, file_size, *_ in cursor.fetchall(): # *_ if for MySQL 5.7 and 8.0 compatibility if log_name >= log_file: remaining_binlog_size += file_size remaining_binlog_size -= log_pos diff --git a/tests/test_controller.py b/tests/test_controller.py index 3410b38..ed92c36 100644 --- a/tests/test_controller.py +++ b/tests/test_controller.py @@ -63,7 +63,7 @@ def controller(request, redis_data): ######## def test_chunk_creation(controller, setup_table, redis_data): controller, controller_thread = controller - controller.initializer.fetch_metadata(1) + controller.initializer.fetch_metadata(redis_data) redis_data.set_current_stage(Stage.BULK_IMPORT_CHUNK_CREATION) while redis_data.current_stage != Stage.BULK_IMPORT: assert controller_thread.is_alive() @@ -170,8 +170,8 @@ def delayed_add_index(): @pytest.mark.parametrize('setup_table', ['with_data'], indirect=True) @pytest.mark.parametrize('controller', ['object'], indirect=True) -def 
test_apply_dml_events_validation(controller: Controller, setup_table, cursor, override_operation_class): - controller.initializer.fetch_metadata(1) +def test_apply_dml_events_validation(controller: Controller, setup_table, redis_data, cursor, override_operation_class): + controller.initializer.fetch_metadata(redis_data) timestamp_range = (1, 100) insert_events = [(random.randint(1, TABLE_SIZE), random.randint(*timestamp_range)) for _ in range(500)] From e046c478f36817834b8016234caa3f7327e50446 Mon Sep 17 00:00:00 2001 From: Jimmy Kim Date: Tue, 9 Apr 2024 14:03:20 +0900 Subject: [PATCH 12/22] usage.md --- README.md | 8 +++-- charts/Chart.yaml | 5 +++ doc/usage.md | 77 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 88 insertions(+), 2 deletions(-) create mode 100644 doc/usage.md diff --git a/README.md b/README.md index 9cbc7e4..fea77dd 100644 --- a/README.md +++ b/README.md @@ -56,15 +56,19 @@ SB-OSC is designed to work with Aurora MySQL database, and it's an EKS-based tool. -It requires the following components to run: +It requires the following resources to run: - Aurora MySQL database (v2, v3) - EKS cluster -- ExternalSecrets (AWS Secrets Manager) +- AWS SecretsManager secret - IAM role + +SB-OSC accepts `ROW` for binlog format. It is recommended to set `binlog-ignore-db` to `sbosc` to prevent SB-OSC from processing its own binlog events. - `binlog_format` set to `ROW` - `binlog-ignore-db` set to `sbosc` (Recommended) + +Detailed requirements and setup instructions can be found in the [usage guide](doc/usage.md). + ## Performance SB-OSC shows high performance on both binlog event processing and bulk import. Following are specs of tables used for diff --git a/charts/Chart.yaml b/charts/Chart.yaml index d334c37..dbb6099 100644 --- a/charts/Chart.yaml +++ b/charts/Chart.yaml @@ -1,3 +1,8 @@ apiVersion: v2 name: sb-osc version: 0.0.1 + +dependencies: + - name: external-secrets + version: "0.8.2" + repository: "https://charts.external-secrets.io/" diff --git a/doc/usage.md b/doc/usage.md new file mode 100644 index 0000000..c633639 --- /dev/null +++ b/doc/usage.md @@ -0,0 +1,77 @@ +# Usage + +## 1. Create AWS Resources + +### IAM Role + +Two IAM roles are required: one for `ExternalSecrets` to access the SecretsManager secret and another for the `monitor` to access CloudWatch metrics. Each role will be attached to a separate service account. + + +Create IAM roles with the following policies: + +**sb-osc-external-role** +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "secretsmanager:GetSecretValue", + "secretsmanager:DescribeSecret", + "secretsmanager:ListSecretVersionIds" + ], + "Resource": "arn:aws:secretsmanager:REGION:ACCOUNT_ID:secret:SECRET_NAME" + } + ] +} +``` + +**sb-osc-role** +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "cloudwatch:GetMetricStatistics" + ], + "Resource": "*" + } + ] +} +``` + +### SecretsManager Secret +SB-OSC uses ExternalSecrets with SecretsManager for credentials. The following keys should be defined. + +- `username`: Database username +- `password`: Database password +- `port`: Database port +- `redis_host`: Redis endpoint (k8s Service name) +- `redis_password`: Redis password +- `slack_channel`: Slack channel ID (Optional) +- `slack_token`: Slack app token (Optional) + +You can find these keys in [secret.py](../src/config/secret.py). + +## 2. Create Destination Table +SB-OSC does not create the destination table on its own. The table, with the desired schema changes already applied, must be created manually before starting the migration.
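+
+A minimal sketch of preparing the destination table (the table names and the added column are hypothetical placeholders):
+
+```sql
+-- Copy the source table's schema, then apply the desired change to the copy before starting SB-OSC.
+CREATE TABLE destination_table LIKE source_table;
+ALTER TABLE destination_table ADD COLUMN new_column INT NOT NULL DEFAULT 0;
+```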
+ +## 3. Enable Binlog +SB-OSC requires binlog to be enabled on the source database. Please set `binlog_format` to `ROW`. + +### Other Parameters +- Setting `binlog-ignore-db` to `sbosc` is recommended to prevent SB-OSC from processing its own binlog events. +- Set `range_optimizer_max_mem_size` to `0` or a large value to prevent bad query plans on queries with large `IN` clauses (especially on Aurora v3). + +## 4. Run SB-OSC +When all of the above steps are completed, you can start the migration process by installing the [helm chart](../charts). + +```bash +helm install sb-osc charts -n sb-osc --create-namespace + +# or +helm upgrade -i sb-osc charts -n sb-osc +``` From e0b97227ca744cf2adccd5a2a30c3a4af2371449 Mon Sep 17 00:00:00 2001 From: Jimmy Kim Date: Tue, 9 Apr 2024 17:41:35 +0900 Subject: [PATCH 13/22] add-index, opearation-class --- doc/add-index.md | 11 ++++++++ doc/operation-class.md | 60 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 doc/add-index.md create mode 100644 doc/operation-class.md diff --git a/doc/add-index.md b/doc/add-index.md new file mode 100644 index 0000000..e89ee90 --- /dev/null +++ b/doc/add-index.md @@ -0,0 +1,11 @@ +## Add Index +Secondary indexes on large tables degrade insert performance significantly. SB-OSC handles this problem by allowing users to create indexes after the bulk import stage. Although the `ALTER TABLE ... ADD INDEX` command after the initial data copy still takes a long time, it is much faster than copying the table with those indexes. + +There are a few things to consider when creating indexes after the bulk import stage. + +### FreeLocalStorage +Before the `ALTER TABLE ... ADD INDEX` command finishes, the index is temporarily created in the local storage of the Aurora MySQL instance. The amount of FreeLocalStorage should be greater than the total size of the indexes being created together. If FreeLocalStorage is not enough, the index creation will fail when FreeLocalStorage reaches 0. + +### Free Memory (Enhanced Monitoring) +Upon creating an index, the Free Memory as reported by Enhanced Monitoring will decrease. This decrease continues rapidly until it reaches a certain value. However, Aurora has the capability to immediately reclaim memory from Freeable Memory (as observed in CloudWatch), so this should not pose a significant issue. Nonetheless, it is important to monitor and ensure that neither Free Memory nor Freeable Memory reaches zero. + diff --git a/doc/operation-class.md b/doc/operation-class.md new file mode 100644 index 0000000..5a5506b --- /dev/null +++ b/doc/operation-class.md @@ -0,0 +1,60 @@ +# Operation Class +Operation class is a feature that allows users to customize queries for specific use cases such as data retention, table redesign, and more. + +SB-OSC provides two default operation classes. `BaseOperation` is the default operation class that is used for normal schema migration. It copies all columns and records from the source table to the destination table. + +`CrossClusterBaseOperation` is an operation class that allows replication across different Aurora clusters. Instead of `INSERT INTO ... SELECT ...`, it selects from the source cluster and inserts into the destination cluster using two separate connections. This can be used in various scenarios such as cross-region replication, cross-account replication, clone cluster replication, etc.
+ +You can create your own operation class by inheriting `BaseOperation` and overriding its methods. If you pass the operation class name to the `operation_class` parameter in the migration configuration, SB-OSC will detect any operation class defined under the `src/sbosc/operations` directory and use it for the migration process. + +## Example + +### BaseOperation +```python +from sbosc.operations.base import BaseOperation + + +class MessageRetentionOperation(BaseOperation): + def _insert_batch_query(self, start_pk, end_pk): + return f""" + INSERT INTO {self.source_db}.{self.destination_table}({self.source_columns}) + SELECT {self.source_columns} + FROM {self.source_db}.{self.source_table} AS source + WHERE source.id BETWEEN {start_pk} AND {end_pk} + AND source.ts > DATE_SUB(NOW(), INTERVAL 30 DAY) + """ + + def _get_not_imported_pks_query(self, start_pk, end_pk): + return f''' + SELECT source.id FROM {self.source_db}.{self.source_table} AS source + LEFT JOIN {self.source_db}.{self.destination_table} AS dest ON source.id = dest.id + WHERE source.id BETWEEN {start_pk} AND {end_pk} + AND source.ts > DATE_SUB(NOW(), INTERVAL 30 DAY) + AND dest.id IS NULL + ''' +``` + +### CrossClusterBaseOperation +```python +from sbosc.operations.base import CrossClusterBaseOperation + + +class CrossClusterMessageRetentionOperation(CrossClusterBaseOperation): + def _select_batch_query(self, start_pk, end_pk): + return f''' + SELECT {self.source_columns} FROM {self.source_db}.{self.source_table} + WHERE id BETWEEN {start_pk} AND {end_pk} + AND ts > DATE_SUB(NOW(), INTERVAL 30 DAY) + ''' + + def get_not_imported_pks(self, source_cursor, dest_cursor, start_pk, end_pk): + source_cursor.execute(f''' + SELECT id FROM {self.source_db}.{self.source_table} + WHERE id BETWEEN {start_pk} AND {end_pk} + AND ts > DATE_SUB(NOW(), INTERVAL 30 DAY) + ''') + source_pks = [row[0] for row in source_cursor.fetchall()] + dest_cursor.execute(f''' + SELECT id FROM {self.destination_db}.{self.destination_table} + WHERE id BETWEEN {start_pk} AND {end_pk} + AND ts > DATE_SUB(NOW(), INTERVAL 30 DAY) + ''') + dest_pks = [row[0] for row in dest_cursor.fetchall()] + return list(set(source_pks) - set(dest_pks)) +``` From 59625f497e1d354d86626bc08d2b9da0f4f62bff Mon Sep 17 00:00:00 2001 From: Jimmy Kim Date: Tue, 9 Apr 2024 18:20:26 +0900 Subject: [PATCH 14/22] config.md --- charts/values.yaml | 63 ++++++++++++++++++++++++++++------------------ doc/add-index.md | 2 +- doc/config.md | 26 +++++++++++++++++++ 3 files changed, 66 insertions(+), 25 deletions(-) create mode 100644 doc/config.md diff --git a/charts/values.yaml b/charts/values.yaml index 8a72866..a8d11cb 100644 --- a/charts/values.yaml +++ b/charts/values.yaml @@ -1,25 +1,27 @@ +# See github.com/sendbird/sb-osc/blob/main/doc/config.md for more information + awsRegion: 'ap-northeast-2' # ExternalSecret -secretName: sb-osc +secretName: "" # SecretsManager secret name # ServiceAccount -sboscRoleArn: "" -externalsecretsRoleArn: "" +sboscRoleArn: "" # SB-OSC ServiceAccount role ARN +externalsecretsRoleArn: "" # ExternalSecrets ServiceAccount role ARN # Reids redis: memory: 1000Mi # Envs -configFile: "/opt/sb-osc/config.yaml" -secretFile: "/opt/sb-osc/secret.json" +configFile: "/opt/sb-osc/config.yaml" # SB-OSC config mount path +secretFile: "/opt/sb-osc/secret.json" # SB-OSC secret mount path # SB-OSC Config sbosc: - enabled: false + enabled: false # Enable SB-OSC default: - image: "" + image: "" # SB-OSC image workers: 1 resources: controller: @@ -40,14 +42,18 @@ sbosc: config: -auto_swap: false -bypass_least_busy_hour: false
+ auto_swap: false # Whether to swap tables automatically + preferred_window: "00:00-23:59" # Preferred window for swapping tables & bulk import validation source_writer_endpoint: "" + source_reader_endpoint: "" + destination_writer_endpoint: "" + destination_reader_endpoint: "" min_chunk_size: 100000 max_chunk_count: 200 + # Worker min_batch_size: 200 max_batch_size: 1000 batch_size_step_size: 200 @@ -60,20 +66,29 @@ sbosc: use_batch_size_multiplier: true # Monitor - optimal_value_use_limit: 3 - cpu_soft_threshold: 40 - cpu_hard_threshold: 60 - latency_soft_threshold: 30 - latency_hard_threshold: 50 + cpu_soft_threshold: 40 # Soft threshold for CPU usage. If the CPU usage exceeds this value, the thread count will be halved. + cpu_hard_threshold: 60 # Hard threshold for CPU usage. If the CPU usage exceeds this value, the thread count will be decreased to 0. + latency_soft_threshold: 30 # Soft threshold for WriteLatency. If the latency exceeds this value, the batch size will be halved. + latency_hard_threshold: 50 # Hard threshold for WriteLatency. If the latency exceeds this value, the batch size will be decreased to 0. # EventHandler - event_handler_thread_count: 4 - event_handler_thread_timeout: 300 - -# instances: -# - name: -# config: -# source_db: -# source_table: -# destination_db: -# destination_table: + event_handler_thread_count: 4 # Number of threads for EventHandler. Max number of binlog files to read at once. (Max 4 recommended) + event_handler_thread_timeout: 300 # Timeout for EventHandler threads. If a thread does not finish within this time, it raises an exception and restarts the EventHandler. + + # EventLoader + pk_set_max_size: 1000000 # Max number of DML PKs to load from DB at once. No more than 2 * pk_set_max_size will be kept in Redis. This is used for memory optimization. + event_batch_duration: 3600 # Timestamp range of DML events to load from DB at once (seconds). + + # Validator + bulk_import_validation_batch_size: 1000000 # Batch size for bulk import validation + apply_dml_events_validation_batch_size: 3000 # Batch size for DML event validation + full_dml_event_validation_interval: 1 # Interval for full DML event validation (hours) + validation_thread_count: 4 # Number of threads to use for the validation process + + instances: + - name: + config: + source_db: + source_table: + destination_db: + destination_table: diff --git a/doc/add-index.md b/doc/add-index.md index e89ee90..e60a20f 100644 --- a/doc/add-index.md +++ b/doc/add-index.md @@ -7,5 +7,5 @@ There are a few things to consider when creating indexes after the bulk import s Before the `ALTER TABLE ... ADD INDEX` command finishes, the index is temporarily created in the local storage of the Aurora MySQL instance. The amount of FreeLocalStorage should be greater than the total size of the indexes being created together. If FreeLocalStorage is not enough, the index creation will fail when FreeLocalStorage reaches 0. ### Free Memory (Enhanced Monitoring) -Upon creating an index, the Free Memory as reported by Enhanced Monitoring will decrease.
This decrease continues rapidly until it reaches a certain value. However, Aurora has the capability to immediately reclaim memory from FreeableMemory (as observed in CloudWatch), so this should not pose a significant issue. Nonetheless, it is important to monitor and ensure that neither Free Memory nor Freeable Memory reaches zero. diff --git a/doc/config.md b/doc/config.md new file mode 100644 index 0000000..900b27e --- /dev/null +++ b/doc/config.md @@ -0,0 +1,26 @@ +# Config + +## Chunk +### max_chunk_count & min_chunk_size +SB-OSC calculates the number of chunks to create based on following formula +```python +chunk_count = min(max_id // min_chunk_size, max_chunk_count) +``` + +## Worker +### batch_size, thread_count, commit_interval +These parameters control insert throughput of SB-OSC. `batch_size` and `thread_count` are managed similarly. They have min, max values and step size. + +`batch_size` will be set to `min_batch_size` initially and will be increased by `batch_size_step` until it reaches `max_batch_size`. + +`thread_count` will be set to `min_thread_count` initially and will be increased by `thread_count_step` until it reaches `max_thread_count`. + +`commit_interval` is time to sleep after each commit. + +### use_batch_size_multiplier +`batch_size_multiplier` is used to increase insert rate on sparse or badly distributed table. If this parameter is set to `True`, SB-OSC will multiply `batch_size` by `batch_size_multiplier` after each successful insert. + +`batch_size_multiplier` is calculated by dividing `batch_size` by actual affected rows of the last insert + +`LIMIT batch_size` is applied to the next query to prevent from inserting too many rows at once. + From 8f3d7eb7070ade82349f62ebca3f4cdfefd4aa06 Mon Sep 17 00:00:00 2001 From: Jimmy Kim Date: Tue, 9 Apr 2024 18:37:48 +0900 Subject: [PATCH 15/22] add test_monitor --- tests/test_monitor.py | 153 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) create mode 100644 tests/test_monitor.py diff --git a/tests/test_monitor.py b/tests/test_monitor.py new file mode 100644 index 0000000..fd15bd0 --- /dev/null +++ b/tests/test_monitor.py @@ -0,0 +1,153 @@ +import time + +import pytest + +from config import config +from modules.db import Database +from sbosc.const import Stage +from sbosc.monitor import MetricMonitor + + +class DatabaseTest(Database): + def get_instance_id(self, host='source', role='writer'): + return 'test' + + +class MetricMonitorTest(MetricMonitor): + def __init__(self): + super().__init__() + self.db = DatabaseTest() + + def get_writer_cpu(self, writer_id): + return 30 + + def get_write_latency(self, writer_id): + return 5 + + +############ +# Fixtures # +############ +@pytest.fixture +def monitor(redis_data): + redis_data.worker_config.batch_size = config.MIN_BATCH_SIZE + redis_data.worker_config.thread_count = config.MIN_THREAD_COUNT + redis_data.worker_config.commit_interval = 0.01 + redis_data.worker_config.revision = 0 + return MetricMonitorTest() + + +######## +# Test # +######## +def get_metric_names(monitor): + return set([ + metric.name for metric in + monitor.metric_sender.registry.collect() + if metric.samples + ]) + + +def test_update_worker_config(monitor, redis_data): + redis_data.set_current_stage(Stage.BULK_IMPORT) + + batch_size = config.MIN_BATCH_SIZE + thread_count = config.MIN_THREAD_COUNT + + # first update + redis_data.worker_metric.average_insert_rate = 100 + monitor.update_worker_config() + assert redis_data.worker_config.batch_size == batch_size + 
config.BATCH_SIZE_STEP_SIZE + assert redis_data.worker_config.thread_count == thread_count + assert monitor.previous_batch_size == batch_size + batch_size += config.BATCH_SIZE_STEP_SIZE + + metric_set = get_metric_names(monitor) + assert metric_set == { + 'sb_osc_worker_batch_size', + 'sb_osc_worker_thread_count', + 'sb_osc_worker_commit_interval', + 'sb_osc_average_insert_rate' + } + + # insert rate increased with new batch_size + redis_data.worker_metric.average_insert_rate = 200 + monitor.update_worker_config() + assert redis_data.worker_config.batch_size == batch_size + config.BATCH_SIZE_STEP_SIZE + assert redis_data.worker_config.thread_count == thread_count + assert monitor.previous_batch_size == batch_size + + # insert rate not increased with new batch_size + redis_data.worker_metric.average_insert_rate = 200 + monitor.update_worker_config() + assert redis_data.worker_config.batch_size == batch_size + assert redis_data.worker_config.thread_count == thread_count + config.THREAD_COUNT_STEP_SIZE + thread_count += config.THREAD_COUNT_STEP_SIZE + + # insert rate increased with new thread count + redis_data.worker_metric.average_insert_rate = 300 + monitor.update_worker_config() + assert redis_data.worker_config.batch_size == batch_size + config.BATCH_SIZE_STEP_SIZE + assert redis_data.worker_config.thread_count == thread_count + + # insert rate not increased with new batch_size, max thread count reached + redis_data.worker_metric.average_insert_rate = 200 + monitor.update_worker_config() + assert redis_data.worker_config.batch_size == batch_size + assert redis_data.worker_config.thread_count == thread_count + + # update triggered with max thread count, but insert rate not increased + redis_data.worker_metric.average_insert_rate = 200 + monitor.update_worker_config() + assert monitor.optimal_batch_size == batch_size + assert monitor.optimal_thread_count == thread_count + + for _ in range(config.OPTIMAL_VALUE_USE_LIMIT): + monitor.update_worker_config() + + assert monitor.optimal_batch_size is None + assert redis_data.worker_config.batch_size == batch_size + config.BATCH_SIZE_STEP_SIZE + assert redis_data.worker_config.thread_count == thread_count + + +def test_check_migration_status(monitor, cursor, redis_data): + cursor.execute("TRUNCATE TABLE sbosc.event_handler_status") + cursor.execute("TRUNCATE TABLE sbosc.apply_dml_events_status") + monitor.redis_data.metadata.max_id = 0 + monitor.check_migration_status() + metric_set = get_metric_names(monitor) + expected_metrics = { + 'sb_osc_updated_pk_set_length', + 'sb_osc_removed_pk_set_length', + 'sb_osc_unmatched_rows', + 'sb_osc_remaining_binlog_size' + } + assert metric_set == expected_metrics + + monitor.redis_data.metadata.max_id = 100 + monitor.check_migration_status() + metric_set = get_metric_names(monitor) + expected_metrics.add('sb_osc_bulk_import_progress') + assert metric_set == expected_metrics + + cursor.execute(f''' + INSERT INTO sbosc.event_handler_status (migration_id, log_file, log_pos, last_event_timestamp, created_at) + VALUES (1, 'mysql-bin.000001', 4, 2, NOW()) + ''') + redis_data.set_last_catchup_timestamp(time.time()) + monitor.check_migration_status() + metric_set = get_metric_names(monitor) + expected_metrics = expected_metrics | { + 'sb_osc_last_catchup_timestamp', + 'sb_osc_last_event_timestamp', + } + assert metric_set == expected_metrics + + cursor.execute(f''' + INSERT INTO sbosc.apply_dml_events_status (migration_id, last_loaded_timestamp, created_at) + VALUES (1, NOW(), NOW()) + ''') + 
monitor.check_migration_status() + metric_set = get_metric_names(monitor) + expected_metrics.add('sb_osc_last_loaded_timestamp') + assert metric_set == expected_metrics From 9b2c62d44558b6381d77bc092e06e14094ea11e5 Mon Sep 17 00:00:00 2001 From: Jimmy Kim Date: Wed, 17 Apr 2024 17:52:43 +0900 Subject: [PATCH 16/22] change my.cnf path --- .github/workflows/tests.yml | 2 +- docker-compose.yml | 4 ++-- {.github/workflows/conf.d => tests/configs}/my.cnf | 0 3 files changed, 3 insertions(+), 3 deletions(-) rename {.github/workflows/conf.d => tests/configs}/my.cnf (100%) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 05232b9..df228d1 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -31,7 +31,7 @@ jobs: - uses: actions/checkout@v2 - name: Copy custom MySQL configuration file run: | - docker cp ./.github/workflows/conf.d/my.cnf $(docker ps -aqf "name=mysql"):/etc/mysql/conf.d/my.cnf + docker cp ./tests/configs/my.cnf $(docker ps -aqf "name=mysql"):/etc/mysql/conf.d/my.cnf docker kill $(docker ps -aqf "name=mysql") docker start $(docker ps -aqf "name=mysql" -a) - name: Set up Python 3.11 diff --git a/docker-compose.yml b/docker-compose.yml index f4af349..8acc8ab 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -6,8 +6,8 @@ services: container_name: mysql ports: - "3306:3306" - command: - [ 'mysqld', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci', '--server-id=1', '--log-bin=mysqld-bin'] + volumes: + - ./tests/configs/my.cnf:/etc/mysql/conf.d/my.cnf environment: MYSQL_ALLOW_EMPTY_PASSWORD: 1 MYSQL_ROOT_HOST: "%" diff --git a/.github/workflows/conf.d/my.cnf b/tests/configs/my.cnf similarity index 100% rename from .github/workflows/conf.d/my.cnf rename to tests/configs/my.cnf From e68842b32a03a549381df018c47f56cb159159c8 Mon Sep 17 00:00:00 2001 From: Jimmy Kim Date: Thu, 25 Apr 2024 15:56:30 +0900 Subject: [PATCH 17/22] chnage log text --- src/sbosc/controller/controller.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/sbosc/controller/controller.py b/src/sbosc/controller/controller.py index a09fe01..788dd24 100644 --- a/src/sbosc/controller/controller.py +++ b/src/sbosc/controller/controller.py @@ -157,7 +157,7 @@ def validate_bulk_import(self): while not self.is_preferred_window(): if self.stop_flag: return - self.logger.info("Waiting for least busy hour") + self.logger.info("Waiting for preferred window") time.sleep(300) self.slack.send_message("Start validating bulk import") try: @@ -171,7 +171,6 @@ def validate_bulk_import(self): except StopFlagSet: return - def apply_dml_events_validation(self): self.interval = 10 @@ -187,7 +186,7 @@ def apply_dml_events_validation(self): self.logger.info("Finished ANALYZE TABLE on destination table") if not self.is_preferred_window(): - self.logger.info("Waiting for least busy hour") + self.logger.info("Waiting for preferred window") time.sleep(300) return if not config.AUTO_SWAP: From bc128c35e338362031f5ceeaf7057d177ffe9c45 Mon Sep 17 00:00:00 2001 From: Jimmy Kim Date: Thu, 25 Apr 2024 16:54:39 +0900 Subject: [PATCH 18/22] allow auto swap when source_db != dest_db --- src/config/config.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/config/config.py b/src/config/config.py index 061b94f..c9cd366 100644 --- a/src/config/config.py +++ b/src/config/config.py @@ -127,8 +127,6 @@ def __init__(self): self.AUTO_SWAP = False if self.OPERATION_CLASS == 'BaseOperation': self.OPERATION_CLASS = 'CrossClusterBaseOperation' - if 
self.SOURCE_DB != self.DESTINATION_DB: - self.AUTO_SWAP = False self.SOURCE_CLUSTER_ID = get_cluster_id(self.SOURCE_WRITER_ENDPOINT, self.SOURCE_CLUSTER_ID) self.DESTINATION_CLUSTER_ID = get_cluster_id(self.DESTINATION_WRITER_ENDPOINT, self.DESTINATION_CLUSTER_ID) From f279bc616dcf4e794a837a550d6dee66ac2ce15f Mon Sep 17 00:00:00 2001 From: Jimmy Kim Date: Thu, 25 Apr 2024 17:16:18 +0900 Subject: [PATCH 19/22] change config naming --- charts/values.yaml | 15 ++++++++------- src/config/config.py | 16 +++++++++------- src/sbosc/controller/controller.py | 8 ++++---- src/sbosc/controller/initializer.py | 2 +- src/sbosc/controller/validator.py | 2 +- src/sbosc/eventhandler/eventhandler.py | 4 ++-- src/sbosc/eventhandler/eventloader.py | 2 +- src/sbosc/monitor/monitor.py | 6 +++--- 8 files changed, 29 insertions(+), 26 deletions(-) diff --git a/charts/values.yaml b/charts/values.yaml index a8d11cb..f198cbd 100644 --- a/charts/values.yaml +++ b/charts/values.yaml @@ -62,27 +62,28 @@ sbosc: max_thread_count: 8 thread_count_step_size: 1 - commit_interval: 1 + commit_interval_in_seconds: 1 use_batch_size_multiplier: true # Monitor cpu_soft_threshold: 40 # Soft threshold for CPU usage. If the CPU usage exceeds this value, thread count will be decreased into half. cpu_hard_threshold: 60 # Hard threshold for CPU usage. If the CPU usage exceeds this value, thread count will be decreased to 0. - latency_soft_threshold: 30 # Soft threshold for WriteLatency. If the latency exceeds this value, batch size will be decreased into half. - latency_hard_threshold: 50 # Hard threshold for WriteLatency. If the latency exceeds this value, batch size will be decreased to 0. + write_latency_soft_threshold: 30 # Soft threshold for WriteLatency. If the latency exceeds this value, batch size will be decreased into half. + write_latency_hard_threshold: 50 # Hard threshold for WriteLatency. If the latency exceeds this value, batch size will be decreased to 0. # EventHandler - event_handler_thread_count: 4 # Number of threads for EventHandler. Max number of binlog files to read at once. (Max 4 recommended) - event_handler_thread_timeout: 300 # Timeout for EventHandler thread. If the thread is not finished within this time, it raises exception and restarts EventHandler. + eventhandler_thread_count: 4 # Number of threads for EventHandler. Max number of binlog files to read at once. (Max 4 recommended) + eventhandler_thread_timeout_in_seconds: 300 # Timeout for EventHandler thread. If the thread is not finished within this time, it raises exception and restarts EventHandler. # EventLoader pk_set_max_size: 1000000 # Max number of DML PKs to load from DB at once. No more than 2 * pk_set_max_size will be kept in Redis. This is used for memory optimization. - event_batch_duration: 3600 # Timestamp range of DML events to load from DB at once (seconds). + event_batch_duration_in_seconds: 3600 # Timestamp range of DML events to load from DB at once (seconds). 
# Validator bulk_import_validation_batch_size: 1000000 # Batch size for bulk import validation apply_dml_events_validation_batch_size: 3000 # Batch size for DML event validation - full_dml_event_validation_interval: 1 # Interval for full DML event validation (hours) + apply_dml_events_validation_interval_in_seconds: 10 # Interval for DML event validation (seconds) + full_dml_event_validation_interval_in_hours: 1 # Interval for full DML event validation (hours) validation_thread_count: 4 # Number of threads to use for validation process instances: diff --git a/src/config/config.py b/src/config/config.py index c9cd366..198168d 100644 --- a/src/config/config.py +++ b/src/config/config.py @@ -54,6 +54,7 @@ class Config: MIN_CHUNK_SIZE = 100000 MAX_CHUNK_COUNT = 200 AUTO_SWAP = False + WAIT_INTERVAL_UNTIL_AUTO_SWAP_IN_SECONDS = 60 PREFERRED_WINDOW = '00:00-23:59' SKIP_BULK_IMPORT = False OPERATION_CLASS = 'BaseOperation' @@ -67,31 +68,32 @@ class Config: MIN_THREAD_COUNT = 4 THREAD_COUNT_STEP_SIZE = 4 MAX_THREAD_COUNT = 64 - COMMIT_INTERVAL = 0.01 + COMMIT_INTERVAL_IN_SECONDS = 0.01 OPTIMAL_VALUE_USE_LIMIT = 10 USE_BATCH_SIZE_MULTIPLIER = False # EventHandler config - EVENT_HANDLER_THREAD_COUNT = 4 - EVENT_HANDLER_THREAD_TIMEOUT = 300 + EVENTHANDLER_THREAD_COUNT = 4 + EVENTHANDLER_THREAD_TIMEOUT_IN_SECONDS = 300 INIT_BINLOG_FILE: str = None INIT_BINLOG_POSITION: int = None # Threshold CPU_SOFT_THRESHOLD = 70 CPU_HARD_THRESHOLD = 90 - LATENCY_SOFT_THRESHOLD = 20 # milliseconds - LATENCY_HARD_THRESHOLD = 50 # milliseconds + WRITE_LATENCY_SOFT_THRESHOLD = 20 # milliseconds + WRITE_LATENCY_HARD_THRESHOLD = 50 # milliseconds # Validation BULK_IMPORT_VALIDATION_BATCH_SIZE = 100000 APPLY_DML_EVENTS_VALIDATION_BATCH_SIZE = 100000 VALIDATION_THREAD_COUNT = 4 - FULL_DML_EVENT_VALIDATION_INTERVAL = 1 # hours + APPLY_DML_EVENTS_VALIDATION_INTERVAL_IN_SECONDS = 10 + FULL_DML_EVENT_VALIDATION_INTERVAL_IN_HOURS = 1 # DML event loading PK_SET_MAX_SIZE = 1000000 - EVENT_BATCH_DURATION = 3600 + EVENT_BATCH_DURATION_IN_SECONDS = 3600 @property def operation_class(self): diff --git a/src/sbosc/controller/controller.py b/src/sbosc/controller/controller.py index 788dd24..25c9d7a 100644 --- a/src/sbosc/controller/controller.py +++ b/src/sbosc/controller/controller.py @@ -97,7 +97,7 @@ def create_bulk_import_chunks(self): self.redis_data.worker_config.set({ 'batch_size': config.MIN_BATCH_SIZE, 'thread_count': config.MIN_THREAD_COUNT, - 'commit_interval': config.COMMIT_INTERVAL, + 'commit_interval': config.COMMIT_INTERVAL_IN_SECONDS, 'revision': 0, }) @@ -117,7 +117,7 @@ def create_bulk_import_chunks(self): f"Chunk size: {chunk_size}\n" f"Batch size: {config.MIN_BATCH_SIZE}\n" f"Thread count: {config.MIN_THREAD_COUNT}\n" - f"Commit interval: {config.COMMIT_INTERVAL}" + f"Commit interval: {config.COMMIT_INTERVAL_IN_SECONDS}" ) def validate_bulk_import(self): @@ -172,7 +172,7 @@ def validate_bulk_import(self): return def apply_dml_events_validation(self): - self.interval = 10 + self.interval = config.APPLY_DML_EVENTS_VALIDATION_INTERVAL_IN_SECONDS try: is_valid = self.validator.apply_dml_events_validation() @@ -191,7 +191,7 @@ def apply_dml_events_validation(self): return if not config.AUTO_SWAP: self.logger.info("Auto swap is disabled") - time.sleep(60) + time.sleep(config.WAIT_INTERVAL_UNTIL_AUTO_SWAP_IN_SECONDS) return is_valid = self.validator.full_dml_event_validation() diff --git a/src/sbosc/controller/initializer.py b/src/sbosc/controller/initializer.py index d04966a..84179f2 100644 --- 
a/src/sbosc/controller/initializer.py +++ b/src/sbosc/controller/initializer.py @@ -240,7 +240,7 @@ def init_migration(self): redis_data.worker_config.set({ 'batch_size': config.MIN_BATCH_SIZE, 'thread_count': config.MIN_THREAD_COUNT, - 'commit_interval': config.COMMIT_INTERVAL, + 'commit_interval': config.COMMIT_INTERVAL_IN_SECONDS, 'revision': 0, }) diff --git a/src/sbosc/controller/validator.py b/src/sbosc/controller/validator.py index 17b9c52..2881481 100644 --- a/src/sbosc/controller/validator.py +++ b/src/sbosc/controller/validator.py @@ -24,7 +24,7 @@ def __init__(self, controller: 'Controller'): self.migration_id = controller.migration_id self.bulk_import_batch_size = config.BULK_IMPORT_VALIDATION_BATCH_SIZE self.apply_dml_events_batch_size = config.APPLY_DML_EVENTS_VALIDATION_BATCH_SIZE - self.full_dml_event_validation_interval = config.FULL_DML_EVENT_VALIDATION_INTERVAL + self.full_dml_event_validation_interval = config.FULL_DML_EVENT_VALIDATION_INTERVAL_IN_HOURS self.thread_count = config.VALIDATION_THREAD_COUNT self.db = Database() self.redis_data = RedisData(self.migration_id) diff --git a/src/sbosc/eventhandler/eventhandler.py b/src/sbosc/eventhandler/eventhandler.py index 49c0eb5..2030555 100644 --- a/src/sbosc/eventhandler/eventhandler.py +++ b/src/sbosc/eventhandler/eventhandler.py @@ -69,8 +69,8 @@ def clear(self): class EventHandler(SBOSCComponent): def __init__(self): super().__init__() - self.thread_count = config.EVENT_HANDLER_THREAD_COUNT - self.thread_timeout = config.EVENT_HANDLER_THREAD_TIMEOUT + self.thread_count = config.EVENTHANDLER_THREAD_COUNT + self.thread_timeout = config.EVENTHANDLER_THREAD_TIMEOUT_IN_SECONDS self.slack = SlackClient('SB-OSC EventHandler', f'{config.SOURCE_CLUSTER_ID}, {self.migration_id}') # EventLoader diff --git a/src/sbosc/eventhandler/eventloader.py b/src/sbosc/eventhandler/eventloader.py index 01b4581..ae53139 100644 --- a/src/sbosc/eventhandler/eventloader.py +++ b/src/sbosc/eventhandler/eventloader.py @@ -20,7 +20,7 @@ def __init__(self, event_handler: 'EventHandler'): self.logger = event_handler.logger self.last_loaded_timestamp = 1 - self.batch_duration = config.EVENT_BATCH_DURATION + self.batch_duration = config.EVENT_BATCH_DURATION_IN_SECONDS self.stop_flag = False diff --git a/src/sbosc/monitor/monitor.py b/src/sbosc/monitor/monitor.py index d076b87..d0e577d 100644 --- a/src/sbosc/monitor/monitor.py +++ b/src/sbosc/monitor/monitor.py @@ -234,18 +234,18 @@ def update_worker_config(self): next_batch_size = current_batch_size next_thread_count = max(config.MIN_THREAD_COUNT, current_thread_count // 2) - elif write_latency > config.LATENCY_HARD_THRESHOLD: + elif write_latency > config.WRITE_LATENCY_HARD_THRESHOLD: self.logger.warning("Latency exceeded hard threshold. Setting thread count to 0.") next_batch_size = config.MIN_BATCH_SIZE next_thread_count = 0 - elif write_latency > config.LATENCY_SOFT_THRESHOLD: + elif write_latency > config.WRITE_LATENCY_SOFT_THRESHOLD: self.logger.warning("Latency exceeded soft threshold. Start decreasing batch size.") next_batch_size = max(config.MIN_BATCH_SIZE, current_batch_size // 2) next_thread_count = current_thread_count elif writer_cpu < config.CPU_SOFT_THRESHOLD and \ - write_latency < config.LATENCY_SOFT_THRESHOLD and current_thread_count == 0: + write_latency < config.WRITE_LATENCY_SOFT_THRESHOLD and current_thread_count == 0: self.logger.info("Writer became stable. 
Restoring thread count to MIN_THREAD_COUNT.") next_batch_size = current_batch_size next_thread_count = config.MIN_THREAD_COUNT From aaf420adccbe36c1452186925ecd4817239b28e5 Mon Sep 17 00:00:00 2001 From: Jimmy Kim Date: Fri, 26 Apr 2024 11:06:44 +0900 Subject: [PATCH 20/22] add comparison with gh-ost --- README.md | 42 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index fea77dd..5f70651 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,9 @@ For binlog event processing, SB-OSC processes binlog files in parallel, which en heavy write loads. ### Resumable -SB-OSC is resumable at any stage of the schema migration process. It saves the current state of each stage to database and Redis, allowing users to pause and resume the process at any time, as log as binlog retention is sufficient. + +SB-OSC is resumable at any stage of the schema migration process. It saves the current state of each stage to the database +and Redis, allowing users to pause and resume the process at any time, as long as binlog retention is sufficient. ### Operation Class @@ -63,7 +65,9 @@ It requires the following resources to run: - AWS SecretsManager secret - IAM role -SB-OSC accepts `ROW` for binlog format. It is recommended to set `binlog-ignore-db` to `sbosc` to prevent SB-OSC from processing its own binlog events. +SB-OSC accepts `ROW` for binlog format. It is recommended to set `binlog-ignore-db` to `sbosc` to prevent SB-OSC from +processing its own binlog events. + - `binlog_format` set to `ROW` - `binlog-ignore-db` set to `sbosc` (Recommended) @@ -105,9 +109,11 @@ Result shows that SB-OSC can catch up DML events on tables with very high write ### Bulk Import -To provide general insight on bulk import performance, the test was conducted on table `A` with no secondary indexes, and no additional traffic. +To provide general insight on bulk import performance, the test was conducted on table `A` with no secondary indexes, +and no additional traffic. -Actual performance of bulk import can vary depending on the number of secondary indexes, the number of rows, column types, +Actual performance of bulk import can vary depending on the number of secondary indexes, the number of rows, column +types, production traffic, etc. Following are the results of bulk import performance based on instance sizes: @@ -120,6 +126,34 @@ Following are the results of bulk import performance based on instance sizes: Insert rate, network throughput, and storage throughput are the average values calculated from CloudWatch metrics. +### Comparison with gh-ost + +We've compared the total migration time of SB-OSC and gh-ost under the following conditions: + +- Table `C` with ~200M rows +- Aurora MySQL v3 cluster, r6g.8xlarge instance +- 2 secondary indexes +- `batch_size` (`chunk-size` for gh-ost): 50000 +- (gh-ost) `--allow-on-master` + +**w/o traffic** + +| Tool | Total Migration Time | CPU Utilization (%) | |:------:|---------------------:|--------------------:| | SB-OSC | 22m | 60.6 | | gh-ost | 1h 52m | 19.7 | + +**w/ traffic** + +Traffic was generated only to table `C` during the migration. (~1.0K inserts/s, ~0.33K updates/s, ~0.33K deletes/s) + +| Tool | Total Migration Time | CPU Utilization (%) | |:------:|---------------------:|--------------------:| | SB-OSC | 27m | 62.7 | | gh-ost | 1d+ | 27.4 | + +For gh-ost, we interrupted the migration at 50% (~12h) since ETA kept increasing. 
+ ## Limitations - **Necessity of Integer Primary Keys** From 69c8d8e734cb72e7ee8eed5011534dd93b75703c Mon Sep 17 00:00:00 2001 From: Jimmy Kim Date: Fri, 26 Apr 2024 15:30:35 +0900 Subject: [PATCH 21/22] README fix --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 5f70651..94c48a7 100644 --- a/README.md +++ b/README.md @@ -9,13 +9,12 @@ It also provides seamless pausing and resuming of tasks to adeptly handle extend changes, along with a built-in monitoring system to dynamically control its heavy DML load based on Aurora's performance metrics. -SB-OSC is designed to overcome the limitations that traditional migration tools face with large-scale tables, +SB-OSC is designed to overcome the limitations that existing migration tools face with large-scale tables, significantly reducing the operational overhead associated with managing large tables. ## Takeaways -SB-OSC has its own unique features that differentiate it from traditional schema migration tools such -as `pt-osc` and `gh-ost`. +SB-OSC has its own unique features that differentiate it from existing schema migration tools such as `pt-osc` and `gh-ost`. ### Multithreading @@ -38,6 +37,8 @@ allows users to customize queries for specific tables such as data retention, ta Also, it provides operation class that allows replication cross different Aurora clusters which can be used in various scenarios such as cross-region replication, cross-account replication, clone cluster replication, etc. +[Guide for operation class](doc/operation-class.md) + ### Data Validation SB-OSC provides strong data validation features to ensure data consistency between the source and destination tables. It @@ -89,7 +90,7 @@ performance testing: | G | 1211 | 60.7 K | **Avg Row Length**: `avg_row_length` from `information_schema.TABLES` -**Write IOPS**: Average increase of `count_star` from `performance_schema.table_io_waits_summary_by_table` per +**Write IOPS**: Average increase of `count_write` from `performance_schema.table_io_waits_summary_by_table` per minute. All tables were in the same Aurora MySQL v3 cluster From 0329251408c9c820b83d4fc585433ac05a811d61 Mon Sep 17 00:00:00 2001 From: Jimmy Kim Date: Wed, 8 May 2024 22:03:56 +0900 Subject: [PATCH 22/22] add catalog-info.yaml --- catalog-info.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 catalog-info.yaml diff --git a/catalog-info.yaml b/catalog-info.yaml new file mode 100644 index 0000000..4977dac --- /dev/null +++ b/catalog-info.yaml @@ -0,0 +1,12 @@ +apiVersion: backstage.io/v1alpha1 +kind: Component +metadata: + name: sb-osc + description: Application for online schema change + annotations: + github.com/project-slug: sendbird/sb-osc +spec: + type: service + lifecycle: production + owner: team-data-infrastructure + system: sendbird-internal-tools
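Editor's note on the worker tuning covered by the patches above: the following is a minimal, self-contained sketch of the stepping behavior that `test_update_worker_config` (tests/test_monitor.py, PATCH 15) asserts. It is not the actual `MetricMonitor.update_worker_config` implementation; `WorkerConfig`, `TunerState`, and the numeric limits are illustrative stand-ins, and it omits the optimal-value reuse (`OPTIMAL_VALUE_USE_LIMIT`) as well as the CPU and WriteLatency throttling shown in the monitor.py diff.

```python
from dataclasses import dataclass
from typing import Optional

# Illustrative limits; the real values come from config
# (MIN/MAX_BATCH_SIZE, BATCH_SIZE_STEP_SIZE, MIN/MAX_THREAD_COUNT, ...).
BATCH_SIZE_STEP, MAX_BATCH_SIZE = 200, 1000
THREAD_COUNT_STEP, MAX_THREAD_COUNT = 1, 2


@dataclass
class WorkerConfig:  # stand-in for redis_data.worker_config
    batch_size: int = 200
    thread_count: int = 1


@dataclass
class TunerState:  # stand-in for the monitor's bookkeeping
    previous_batch_size: int = 200
    previous_rate: Optional[float] = None


def update_worker_config(cfg: WorkerConfig, state: TunerState, insert_rate: float) -> None:
    """Grow batch_size while the average insert rate keeps improving; when it
    stalls, fall back to the last good batch_size and try more threads instead."""
    improved = state.previous_rate is None or insert_rate > state.previous_rate
    if improved and cfg.batch_size < MAX_BATCH_SIZE:
        state.previous_batch_size = cfg.batch_size
        cfg.batch_size = min(cfg.batch_size + BATCH_SIZE_STEP, MAX_BATCH_SIZE)
    elif not improved:
        cfg.batch_size = state.previous_batch_size  # revert to the last good batch size
        if cfg.thread_count < MAX_THREAD_COUNT:
            cfg.thread_count += THREAD_COUNT_STEP   # probe a higher thread count instead
    state.previous_rate = insert_rate


if __name__ == "__main__":
    cfg, state = WorkerConfig(), TunerState()
    for rate in (100, 200, 200, 300, 200):  # insert rates used in test_update_worker_config
        update_worker_config(cfg, state, rate)
        print(rate, cfg)
```

Feeding it the insert rates used in the test reproduces the sequence the assertions check: batch size grows while the rate improves, then falls back to the last good value while a higher thread count is probed.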