Skip to content

Commit a2d75b9

Browse files
committed
null table engine
1 parent 24f53ac commit a2d75b9

File tree

2 files changed

+246
-0
lines changed

2 files changed

+246
-0
lines changed
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
# Null Table Engine
2+
3+
Video: https://www.youtube.com/watch?v=vaY5LQ7a_Dk
4+
5+
Install ClickHouse
6+
7+
```bash
8+
curl https://clickhouse.com/ | sh
9+
```
10+
11+
Launch ClickHouse
12+
13+
```bash
14+
./clickhouse local -m
15+
```
16+
17+
Generate data:
18+
19+
```bash
20+
pip install faker jsonlines
21+
```
22+
23+
```bash
24+
python datagen.py > logs.json
25+
```
26+
27+
Describe the structure of the generated `logs.json` file:
28+
29+
```sql
30+
DESCRIBE 'logs.json'
31+
SETTINGS describe_compact_output=1,
32+
schema_inference_make_columns_nullable=0;
33+
```
34+
35+
36+
Create `logs` table:
37+
38+
```sql
39+
CREATE TABLE logs (
40+
timestamp DateTime64(3),
41+
service String,
42+
logLevel String,
43+
`X-Correlation-ID` String,
44+
message String
45+
)
46+
ENGINE=Null;
47+
```
48+
49+
Create `searches` table:
50+
51+
```sql
52+
CREATE TABLE searches (
53+
timestamp DateTime(3),
54+
userId String,
55+
location String,
56+
checkin Date,
57+
checkout Date,
58+
guests Int
59+
)
60+
ORDER BY timestamp;
61+
```
62+
63+
Create `bookings` table:
64+
65+
```sql
66+
CREATE TABLE bookings (
67+
timestamp DateTime(3),
68+
userId String,
69+
roomType LowCardinality(String),
70+
price UInt16,
71+
checkin Date,
72+
checkout Date
73+
)
74+
ORDER BY timestamp;
75+
```
76+
77+
Create materialized views that parse rows inserted into `logs` and write the results to `searches` and `bookings`:
78+
79+
```sql
80+
CREATE MATERIALIZED VIEW searches_mv TO searches AS
81+
WITH searchLogs AS (
82+
FROM logs
83+
SELECT timestamp, extractAllGroups(
84+
assumeNotNull(message),
85+
'User (.*) searching available hotels with criteria: (.*)\.'
86+
)[1] AS groups,
87+
groups[1] AS userId,
88+
JSONExtract(groups[2], 'Map(String, Variant(String, Int))') as search
89+
WHERE service = 'Search'
90+
)
91+
FROM searchLogs
92+
SELECT timestamp,
93+
userId,
94+
search['location'] AS location,
95+
search['checkin'] AS checkin,
96+
search['checkout'] AS checkout,
97+
search['guests'] AS guests;
98+
```
99+
100+
```sql
101+
CREATE MATERIALIZED VIEW bookings_mv TO bookings AS
102+
WITH bookingLogs AS (
103+
FROM logs
104+
SELECT timestamp, extractAllGroups(
105+
assumeNotNull(message),
106+
'User (.*) selected a hotel room with details: (.*)\.'
107+
)[1] AS groups,
108+
groups[1] AS userId,
109+
JSONExtract(groups[2], 'Map(String, Variant(String, Int))') as booking
110+
WHERE service = 'Booking'
111+
)
112+
FROM bookingLogs
113+
SELECT timestamp,
114+
userId,
115+
booking['roomType'] AS roomType,
116+
booking['price'] AS price,
117+
booking['checkin'] AS checkin,
118+
booking['checkout'] AS checkout;
119+
```
120+
121+
Insert data into the `logs` table (the `Null` engine does not store the rows, but the materialized views still process them):
122+
123+
```sql
124+
INSERT INTO logs
125+
SELECT * FROM 'logs.json'
126+
```
127+
128+
Queries:
129+
130+
```sql
131+
WITH userCount AS (
132+
SELECT userId, count(*) AS numberOfSearches
133+
FROM searches
134+
GROUP BY userId
135+
)
136+
SELECT numberOfSearches, count() AS count
137+
FROM userCount
138+
GROUP BY numberOfSearches
139+
ORDER BY count DESC
140+
LIMIT 10;
141+
```
142+
143+
```sql
144+
SELECT roomType, count(), avg(price)
145+
FROM bookings
146+
GROUP BY ALL;
147+
```
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import sys
2+
import json
3+
import jsonlines
4+
import random
5+
from datetime import datetime, timedelta
6+
from faker import Faker
7+
8+
fake = Faker()
9+
10+
def generate_log(timestamp, service, logLevel, correlation_id, message):
    """Assemble a single structured log record as a plain dictionary.

    Args:
        timestamp: Object exposing ``isoformat()``; its ISO string is stored.
        service: Name of the service that emitted the record.
        logLevel: Severity label stored under ``logLevel``.
        correlation_id: Value stored under the ``X-Correlation-ID`` key.
        message: Free-text log message.

    Returns:
        A dict with the keys ``timestamp``, ``service``, ``logLevel``,
        ``X-Correlation-ID`` and ``message``, in that insertion order.
    """
    field_names = ("timestamp", "service", "logLevel", "X-Correlation-ID", "message")
    field_values = (timestamp.isoformat(), service, logLevel, correlation_id, message)
    # zip keeps the two tuples aligned, so key order matches the tuple order.
    return dict(zip(field_names, field_values))
18+
19+
def generate_search_log(user_id, correlation_id, timestamp):
    """Build an INFO log record for a hotel search by ``user_id``.

    The search criteria are random: a city, 1-4 guests, a check-in date
    between today and 30 days from now, and a check-out date between the
    check-in date and 10 days after it.

    Args:
        user_id: Identifier interpolated into the message text.
        correlation_id: Value stored under ``X-Correlation-ID``.
        timestamp: Passed through to ``generate_log``.

    Returns:
        The dict produced by ``generate_log`` for the ``Search`` service.
    """
    location = fake.city()
    guests = random.randint(1, 4)
    checkin = fake.date_between(start_date="today", end_date="+30d")
    checkout = fake.date_between(start_date=checkin, end_date=checkin + timedelta(days=10))

    # Serialize with json.dumps so that quotes or backslashes in the city
    # name are escaped. The separators reproduce the original hand-written
    # layout ('"key":value, "key":value'), and ensure_ascii=False keeps
    # non-ASCII city names as-is.
    criteria = json.dumps(
        {
            "location": location,
            "checkin": str(checkin),
            "checkout": str(checkout),
            "guests": guests,
        },
        separators=(", ", ":"),
        ensure_ascii=False,
    )
    message = f"User {user_id} searching available hotels with criteria: {criteria}."
    return generate_log(timestamp, "Search", "INFO", correlation_id, message)
26+
27+
def generate_booking_log(user_id, correlation_id, timestamp):
    """Build an INFO log record for a room selection by ``user_id``.

    The room type is drawn from Standard/Deluxe/Suite with weights 6:3:1,
    and the price is drawn from a range that depends on the room type. The
    check-in date lies 30-60 days from now, and the check-out date lies
    between the check-in date and 10 days after it.

    Args:
        user_id: Identifier interpolated into the message text.
        correlation_id: Value stored under ``X-Correlation-ID``.
        timestamp: Passed through to ``generate_log``.

    Returns:
        The dict produced by ``generate_log`` for the ``Booking`` service.
    """
    # Inclusive (low, high) price bounds per room type; dict order also
    # fixes the order the weights below apply to.
    price_ranges = {
        "Standard": (100, 200),
        "Deluxe": (150, 400),
        "Suite": (300, 1000),
    }
    room_type = random.choices(list(price_ranges), weights=[6, 3, 1], k=1)[0]
    low, high = price_ranges[room_type]
    price = random.randint(low, high)

    checkin = fake.date_between(start_date="+30d", end_date="+60d")
    checkout = fake.date_between(start_date=checkin, end_date=checkin + timedelta(days=10))

    # json.dumps escapes the string values; the separators reproduce the
    # original hand-written layout ('"key":value, "key":value').
    details = json.dumps(
        {
            "roomType": room_type,
            "price": price,
            "checkin": str(checkin),
            "checkout": str(checkout),
        },
        separators=(", ", ":"),
        ensure_ascii=False,
    )
    message = f"User {user_id} selected a hotel room with details: {details}."
    return generate_log(timestamp, "Booking", "INFO", correlation_id, message)
44+
45+
def generate_payment_log(user_id, correlation_id, timestamp, success=True):
    """Build a log record for a payment attempt by ``user_id``.

    A payment method and an amount between 100 and 1000 are drawn at
    random. A successful attempt yields an INFO record; a failed attempt
    yields an ERROR record with an "Insufficient funds" reason.

    Args:
        user_id: Identifier interpolated into the message text.
        correlation_id: Value stored under ``X-Correlation-ID``.
        timestamp: Passed through to ``generate_log``.
        success: Whether the payment attempt succeeded.

    Returns:
        The dict produced by ``generate_log`` for the ``Payment`` service.
    """
    # The method is drawn even for failures, matching the order of random draws.
    chosen_method = random.choice(["Credit Card", "PayPal", "Bank Transfer"])
    amount = random.randint(100, 1000)

    if not success:
        failure_text = f"Payment failed for user ID {user_id}, amount: {amount} USD, reason: Insufficient funds."
        return generate_log(timestamp, "Payment", "ERROR", correlation_id, failure_text)

    success_text = f"Processing payment for user ID {user_id}, amount: {amount} USD, payment method: {chosen_method}."
    return generate_log(timestamp, "Payment", "INFO", correlation_id, success_text)
56+
57+
def generate_journey_logs(num_users):
    """Simulate search -> booking -> payment journeys for ``num_users`` users.

    Each user gets a short user id, a correlation id and a start timestamp
    offset from "now" by a random 0..``num_users * 10`` seconds. Every user
    performs 1-20 searches; 20% then drop out. The rest book a room five
    seconds later; 10% of those drop out. The remainder attempt payment
    another five seconds later, which fails 10% of the time.

    Args:
        num_users: Number of users to simulate.

    Returns:
        A list of log dicts in generation order (not sorted by timestamp).
    """
    start_time = datetime.now()
    logs = []

    for _ in range(num_users):
        user_id = fake.uuid4().split("-")[0]
        correlation_id = fake.uuid4()
        timestamp = start_time + timedelta(seconds=random.randint(0, num_users * 10))

        # User starts with at least one search. randint is inclusive on both
        # ends, so range(randint(1, 20)) yields 1-20 searches; the previous
        # range(1, randint(1, 20)) could yield zero.
        search_count = random.randint(1, 20)
        logs.extend(
            generate_search_log(user_id, correlation_id, timestamp)
            for _search in range(search_count)
        )

        # Randomly decide if user drops out after search
        if random.random() < 0.2:
            continue

        # User proceeds to booking
        timestamp += timedelta(seconds=5)
        logs.append(generate_booking_log(user_id, correlation_id, timestamp))

        # Randomly decide if user drops out after booking
        if random.random() < 0.1:
            continue

        # User proceeds to payment
        timestamp += timedelta(seconds=5)
        payment_success = random.random() >= 0.1  # 10% chance of payment failure
        logs.append(generate_payment_log(user_id, correlation_id, timestamp, success=payment_success))

    return logs
91+
92+
if __name__ == "__main__":
    # Number of users to simulate
    num_users = 100_000
    logs = generate_journey_logs(num_users)

    # Emit each record to stdout through the jsonlines writer.
    with jsonlines.Writer(sys.stdout) as writer:
        for entry in logs:
            writer.write(entry)

0 commit comments

Comments
 (0)