diff --git a/LearnClickHouseWithMark/NullTableEngine/README.md b/LearnClickHouseWithMark/NullTableEngine/README.md
new file mode 100644
index 0000000..86c8631
--- /dev/null
+++ b/LearnClickHouseWithMark/NullTableEngine/README.md
@@ -0,0 +1,147 @@
+# Null Table Engine
+
+Video: https://www.youtube.com/watch?v=vaY5LQ7a_Dk
+
+The Null table engine discards every row inserted into it, but materialized views attached to the table still receive each inserted block. That makes it a convenient staging point for transforming raw logs into structured tables without storing the raw data itself.
+
+Install ClickHouse:
+
+```bash
+curl https://clickhouse.com/ | sh
+```
+
+Launch ClickHouse:
+
+```bash
+./clickhouse local -m
+```
+
+Generate data:
+
+```bash
+pip install faker jsonlines
+```
+
+```bash
+python datagen.py > logs.json
+```
+
+Describe the logs file:
+
+```sql
+DESCRIBE 'logs.json'
+SETTINGS describe_compact_output=1,
+         schema_inference_make_columns_nullable=0;
+```
+
+Create the `logs` table:
+
+```sql
+CREATE TABLE logs (
+    timestamp DateTime64(3),
+    service String,
+    logLevel String,
+    `X-Correlation-ID` String,
+    message String
+)
+ENGINE = Null;
+```
+
+Create the `searches` table:
+
+```sql
+CREATE TABLE searches (
+    timestamp DateTime64(3),
+    userId String,
+    location String,
+    checkin Date,
+    checkout Date,
+    guests Int
+)
+ORDER BY timestamp;
+```
+
+Create the `bookings` table:
+
+```sql
+CREATE TABLE bookings (
+    timestamp DateTime64(3),
+    userId String,
+    roomType LowCardinality(String),
+    price UInt16,
+    checkin Date,
+    checkout Date
+)
+ORDER BY timestamp;
+```
+
+Create the materialized views:
+
+```sql
+CREATE MATERIALIZED VIEW searches_mv TO searches AS
+WITH searchLogs AS (
+    FROM logs
+    SELECT timestamp,
+           extractAllGroups(
+               assumeNotNull(message),
+               'User (.*) searching available hotels with criteria: (.*)\.'
+           )[1] AS groups,
+           groups[1] AS userId,
+           JSONExtract(groups[2], 'Map(String, Variant(String, Int))') AS search
+    WHERE service = 'Search'
+)
+FROM searchLogs
+SELECT timestamp,
+       userId,
+       search['location'] AS location,
+       search['checkin'] AS checkin,
+       search['checkout'] AS checkout,
+       search['guests'] AS guests;
+```
+
+```sql
+CREATE MATERIALIZED VIEW bookings_mv TO bookings AS
+WITH bookingLogs AS (
+    FROM logs
+    SELECT timestamp,
+           extractAllGroups(
+               assumeNotNull(message),
+               'User (.*) selected a hotel room with details: (.*)\.'
+           )[1] AS groups,
+           groups[1] AS userId,
+           JSONExtract(groups[2], 'Map(String, Variant(String, Int))') AS booking
+    WHERE service = 'Booking'
+)
+FROM bookingLogs
+SELECT timestamp,
+       userId,
+       booking['roomType'] AS roomType,
+       booking['price'] AS price,
+       booking['checkin'] AS checkin,
+       booking['checkout'] AS checkout;
+```
+
+Insert data into the `logs` table:
+
+```sql
+INSERT INTO logs
+SELECT * FROM 'logs.json';
+```
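+
+As a quick sanity check (this step is not in the original walkthrough), you can confirm the Null engine behaviour after the insert: the `logs` table itself keeps nothing, while the attached materialized views have populated `searches` and `bookings`:
+
+```sql
+SELECT
+    (SELECT count() FROM logs)     AS log_rows,   -- always 0: the Null engine discards inserted rows
+    (SELECT count() FROM searches) AS search_rows,
+    (SELECT count() FROM bookings) AS booking_rows;
+```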
+
+Queries:
+
+```sql
+WITH userCount AS (
+    SELECT userId, count(*) AS numberOfSearches
+    FROM searches
+    GROUP BY userId
+)
+SELECT numberOfSearches, count() AS count
+FROM userCount
+GROUP BY numberOfSearches
+ORDER BY count DESC
+LIMIT 10;
+```
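+
+Another query that is not in the original walkthrough: since the data generator drops a share of users between searching and booking, the two tables can be used to approximate the search-to-booking conversion rate:
+
+```sql
+SELECT
+    (SELECT uniqExact(userId) FROM searches) AS searchers,
+    (SELECT uniqExact(userId) FROM bookings) AS bookers,
+    round(bookers / searchers, 3) AS conversionRate;
+```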
+
+```sql
+SELECT roomType, count(), avg(price)
+FROM bookings
+GROUP BY ALL;
+```
\ No newline at end of file
diff --git a/LearnClickHouseWithMark/NullTableEngine/datagen.py b/LearnClickHouseWithMark/NullTableEngine/datagen.py
new file mode 100644
index 0000000..397191f
--- /dev/null
+++ b/LearnClickHouseWithMark/NullTableEngine/datagen.py
@@ -0,0 +1,99 @@
+import sys
+import json
+import jsonlines
+import random
+from datetime import datetime, timedelta
+from faker import Faker
+
+fake = Faker()
+
+# One structured log record, shared by every service.
+def generate_log(timestamp, service, logLevel, correlation_id, message):
+    return {
+        "timestamp": timestamp.isoformat(),
+        "service": service,
+        "logLevel": logLevel,
+        "X-Correlation-ID": correlation_id,
+        "message": message
+    }
+
+# "Search" service log line with the search criteria embedded as JSON in the message.
+def generate_search_log(user_id, correlation_id, timestamp):
+    location = fake.city()
+    guests = random.randint(1, 4)
+    checkin = fake.date_between(start_date="today", end_date="+30d")
+    checkout = fake.date_between(start_date=checkin, end_date=checkin + timedelta(days=10))
+    message = f"User {user_id} searching available hotels with criteria: {{\"location\":\"{location}\", \"checkin\":\"{checkin}\", \"checkout\":\"{checkout}\", \"guests\":{guests}}}."
+    return generate_log(timestamp, "Search", "INFO", correlation_id, message)
+
+# "Booking" service log line; room types are weighted towards Standard rooms.
+def generate_booking_log(user_id, correlation_id, timestamp):
+    room_types = ["Standard", "Deluxe", "Suite"]
+    room_type = random.choices(room_types, weights=[6, 3, 1], k=1)[0]
+
+    if room_type == "Standard":
+        low, high = (100, 200)
+    elif room_type == "Deluxe":
+        low, high = (150, 400)
+    else:
+        low, high = (300, 1000)
+
+    price = random.randint(low, high)
+
+    checkin = fake.date_between(start_date="+30d", end_date="+60d")
+    checkout = fake.date_between(start_date=checkin, end_date=checkin + timedelta(days=10))
+    message = f"User {user_id} selected a hotel room with details: {{\"roomType\":\"{room_type}\", \"price\":{price}, \"checkin\":\"{checkin}\", \"checkout\":\"{checkout}\"}}."
+    return generate_log(timestamp, "Booking", "INFO", correlation_id, message)
+
+# "Payment" service log line; failures are logged at ERROR level.
+def generate_payment_log(user_id, correlation_id, timestamp, success=True):
+    payment_methods = ["Credit Card", "PayPal", "Bank Transfer"]
+    payment_method = random.choice(payment_methods)
+    amount = random.randint(100, 1000)
+    if success:
+        message = f"Processing payment for user ID {user_id}, amount: {amount} USD, payment method: {payment_method}."
+        logLevel = "INFO"
+    else:
+        message = f"Payment failed for user ID {user_id}, amount: {amount} USD, reason: Insufficient funds."
+        logLevel = "ERROR"
+    return generate_log(timestamp, "Payment", logLevel, correlation_id, message)
+
+# Simulates a search -> booking -> payment journey per user, with random drop-off at each step.
+def generate_journey_logs(num_users):
+    start_time = datetime.now()
+    logs = []
+
+    for _ in range(1, num_users + 1):
+        user_id = fake.uuid4().split("-")[0]
+        correlation_id = fake.uuid4()
+        timestamp = start_time + timedelta(seconds=random.randint(0, num_users * 10))
+
+        # User starts with a search
+        for _ in range(1, random.randint(1, 20)):
+            logs.append(generate_search_log(user_id, correlation_id, timestamp))
+
+        # Randomly decide if user drops out after search
+        if random.random() < 0.2:
+            continue
+
+        timestamp += timedelta(seconds=5)
+        # User proceeds to booking
+        logs.append(generate_booking_log(user_id, correlation_id, timestamp))
+
+        # Randomly decide if user drops out after booking
+        if random.random() < 0.1:
+            continue
+
+        timestamp += timedelta(seconds=5)
+        # User proceeds to payment
+        payment_success = random.random() >= 0.1  # 10% chance of payment failure
+        logs.append(generate_payment_log(user_id, correlation_id, timestamp, success=payment_success))
+
+        if not payment_success:
+            continue
+
+    return logs
+
+if __name__ == "__main__":
+    num_users = 100000  # Number of users to simulate
+    logs = generate_journey_logs(num_users)
+
+    # Print logs as JSON
+    with jsonlines.Writer(sys.stdout) as out:
+        for log in logs:
+            out.write(log)