Skip to content

Commit

Permalink
update readme
Browse files Browse the repository at this point in the history
  • Loading branch information
ruslandoga committed Jul 3, 2024
1 parent 97572b5 commit 68b9dee
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 173 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## Unreleased

- raise on invalid UInt8 and Int8 when encoding RowBinary https://github.com/plausible/ch/pull/180
- - move rowbinary from `params` to `statement`

## 0.2.6 (2024-05-30)

Expand Down
224 changes: 58 additions & 166 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,6 @@ defaults = [
{:ok, %Ch.Result{rows: [[0], [1], [2]]}} =
Ch.query(pid, "SELECT * FROM system.numbers LIMIT 3")

{:ok, %Ch.Result{rows: [[0], [1], [2]]}} =
Ch.query(pid, "SELECT * FROM system.numbers LIMIT {$0:UInt8}", [3])

{:ok, %Ch.Result{rows: [[0], [1], [2]]}} =
Ch.query(pid, "SELECT * FROM system.numbers LIMIT {limit:UInt8}", %{"limit" => 3})
```
Expand All @@ -75,46 +72,59 @@ Ch.query!(pid, "CREATE TABLE IF NOT EXISTS ch_demo(id UInt64) ENGINE Null")
%Ch.Result{num_rows: 2} =
Ch.query!(pid, "INSERT INTO ch_demo(id) VALUES (0), (1)")

%Ch.Result{num_rows: 2} =
Ch.query!(pid, "INSERT INTO ch_demo(id) VALUES ({$0:UInt8}), ({$1:UInt32})", [0, 1])

%Ch.Result{num_rows: 2} =
Ch.query!(pid, "INSERT INTO ch_demo(id) VALUES ({a:UInt16}), ({b:UInt64})", %{"a" => 0, "b" => 1})

%Ch.Result{num_rows: 2} =
Ch.query!(pid, "INSERT INTO ch_demo(id) SELECT number FROM system.numbers LIMIT {limit:UInt8}", %{"limit" => 2})
```

#### Insert rows as [RowBinary](https://clickhouse.com/docs/en/interfaces/formats#rowbinary) (efficient)
#### Insert [RowBinary](https://clickhouse.com/docs/en/interfaces/formats#rowbinary)

```elixir
{:ok, pid} = Ch.start_link()

Ch.query!(pid, "CREATE TABLE IF NOT EXISTS ch_demo(id UInt64) ENGINE Null")
Ch.query!(pid, "CREATE TABLE IF NOT EXISTS ch_demo(id UInt64, text String) ENGINE Null")

types = ["UInt64"]
rows = [
[0, "a"],
[1, "b"]
]

types = ["UInt64", "String"]
# or
types = [Ch.Types.u64()]
types = [Ch.Types.u64(), Ch.Types.string()]
# or
types = [:u64]
types = [:u64, :string]

rowbinary = Ch.RowBinary.encode_rows(rows, types)

%Ch.Result{num_rows: 2} =
Ch.query!(pid, "INSERT INTO ch_demo(id) FORMAT RowBinary", [[0], [1]], types: types)
Ch.query!(pid, ["INSERT INTO ch_demo(id) FORMAT RowBinary\n" | rowbinary])
```

Note that RowBinary format encoding requires `:types` option to be provided.

Similarly, you can use [`RowBinaryWithNamesAndTypes`](https://clickhouse.com/docs/en/interfaces/formats#rowbinarywithnamesandtypes) which would additionally do something like a type check.

```elixir
sql = "INSERT INTO ch_demo FORMAT RowBinaryWithNamesAndTypes"
opts = [names: ["id"], types: ["UInt64"]]
rows = [[0], [1]]
sql = "INSERT INTO ch_demo FORMAT RowBinaryWithNamesAndTypes\n"

rows = [
[0, "a"],
[1, "b"]
]

%Ch.Result{num_rows: 2} = Ch.query!(pid, sql, rows, opts)
types = ["UInt64", "String"]
names = ["id", "text"]

data = [
Ch.RowBinary.encode_names_and_types(names, types),
Ch.RowBinary.encode_rows(rows, types)
]

%Ch.Result{num_rows: 2} = Ch.query!(pid, [sql | data])
```

#### Insert rows in custom [format](https://clickhouse.com/docs/en/interfaces/formats)
#### Insert rows in some other [format](https://clickhouse.com/docs/en/interfaces/formats)

```elixir
{:ok, pid} = Ch.start_link()
Expand All @@ -124,26 +134,27 @@ Ch.query!(pid, "CREATE TABLE IF NOT EXISTS ch_demo(id UInt64) ENGINE Null")
csv = [0, 1] |> Enum.map(&to_string/1) |> Enum.intersperse(?\n)

%Ch.Result{num_rows: 2} =
Ch.query!(pid, "INSERT INTO ch_demo(id) FORMAT CSV", csv, encode: false)
Ch.query!(pid, ["INSERT INTO ch_demo(id) FORMAT CSV\n" | csv])
```

#### Insert rows as chunked RowBinary stream
#### Insert [chunked](https://en.wikipedia.org/wiki/Chunked_transfer_encoding) RowBinary stream

```elixir
{:ok, pid} = Ch.start_link()

Ch.query!(pid, "CREATE TABLE IF NOT EXISTS ch_demo(id UInt64) ENGINE Null")

stream = Stream.repeatedly(fn -> [:rand.uniform(100)] end)
chunked = Stream.chunk_every(stream, 100)
encoded = Stream.map(chunked, fn chunk -> Ch.RowBinary.encode_rows(chunk, _types = ["UInt64"]) end)
ten_encoded_chunks = Stream.take(encoded, 10)

%Ch.Result{num_rows: 1000} =
Ch.query(pid, "INSERT INTO ch_demo(id) FORMAT RowBinary", ten_encoded_chunks, encode: false)
DBConnection.run(pid, fn conn ->
Stream.repeatedly(fn -> [:rand.uniform(100)] end)
|> Stream.chunk_every(100_000)
|> Stream.map(fn chunk -> Ch.RowBinary.encode_rows(chunk, _types = ["UInt64"]) end)
|> Stream.take(10)
|> Stream.into(Ch.stream(conn, "INSERT INTO ch_demo(id) FORMAT RowBinary\n"))
|> Stream.run()
end)
```

This query makes a [`transfer-encoding: chunked`](https://en.wikipedia.org/wiki/Chunked_transfer_encoding) HTTP request while unfolding the stream resulting in lower memory usage.
This query makes a [`transfer-encoding: chunked`] HTTP request while unfolding the stream resulting in lower memory usage.

#### Query with custom [settings](https://clickhouse.com/docs/en/operations/settings/settings)

Expand All @@ -156,7 +167,7 @@ settings = [async_insert: 1]
Ch.query!(pid, "SHOW SETTINGS LIKE 'async_insert'")

%Ch.Result{rows: [["async_insert", "Bool", "1"]]} =
Ch.query!(pid, "SHOW SETTINGS LIKE 'async_insert'", [], settings: settings)
Ch.query!(pid, "SHOW SETTINGS LIKE 'async_insert'", _params = [], settings: settings)
```

## Caveats
Expand All @@ -179,13 +190,13 @@ CREATE TABLE ch_nulls (
""")

types = ["Nullable(UInt8)", "UInt8", "UInt8"]
inserted_rows = [[nil, nil, nil]]
selected_rows = [[nil, 0, 0]]
row = [nil, nil, nil]
rowbinary = Ch.RowBinary.encode_row(row, types)

%Ch.Result{num_rows: 1} =
Ch.query!(pid, "INSERT INTO ch_nulls(a, b, c) FORMAT RowBinary", inserted_rows, types: types)
Ch.query!(pid, ["INSERT INTO ch_nulls(a, b, c) FORMAT RowBinary\n" | rowbinary])

%Ch.Result{rows: ^selected_rows} =
%Ch.Result{rows: [[nil, _not_10 = 0, 0]]} =
Ch.query!(pid, "SELECT * FROM ch_nulls")
```

Expand All @@ -197,12 +208,16 @@ However, [`input()`](https://clickhouse.com/docs/en/sql-reference/table-function
sql = """
INSERT INTO ch_nulls
SELECT * FROM input('a Nullable(UInt8), b Nullable(UInt8), c UInt8')
FORMAT RowBinary\
FORMAT RowBinary
"""

Ch.query!(pid, sql, inserted_rows, types: ["Nullable(UInt8)", "Nullable(UInt8)", "UInt8"])
types = ["Nullable(UInt8)", "Nullable(UInt8)", "UInt8"]
rowbinary = Ch.RowBinary.encode_row(row, types)

%Ch.Result{num_rows: 1} =
Ch.query!(pid, [sql | rowbinary])

%Ch.Result{rows: [[0], [10]]} =
%Ch.Result{rows: [_before = [0], _after = [10]]} =
Ch.query!(pid, "SELECT b FROM ch_nulls ORDER BY b")
```

Expand All @@ -215,26 +230,18 @@ When decoding [`String`](https://clickhouse.com/docs/en/sql-reference/data-types

Ch.query!(pid, "CREATE TABLE ch_utf8(str String) ENGINE Memory")

bin = "\x61\xF0\x80\x80\x80b"
utf8 = "a�b"
rowbinary = Ch.RowBinary.encode(:string, "\x61\xF0\x80\x80\x80b")

%Ch.Result{num_rows: 1} =
Ch.query!(pid, "INSERT INTO ch_utf8(str) FORMAT RowBinary", [[bin]], types: ["String"])
Ch.query!(pid, ["INSERT INTO ch_utf8(str) FORMAT RowBinary\n" | rowbinary])

%Ch.Result{rows: [[^utf8]]} =
%Ch.Result{rows: [["a�b"]]} =
Ch.query!(pid, "SELECT * FROM ch_utf8")

%Ch.Result{rows: %{"data" => [[^utf8]]}} =
%Ch.Result{rows: %{"data" => [["a�b"]]}} =
pid |> Ch.query!("SELECT * FROM ch_utf8 FORMAT JSONCompact") |> Map.update!(:rows, &Jason.decode!/1)
```

To get raw binary from `String` columns use `:binary` type that skips UTF-8 checks.

```elixir
%Ch.Result{rows: [[^bin]]} =
Ch.query!(pid, "SELECT * FROM ch_utf8", [], types: [:binary])
```

#### Timezones in RowBinary

Decoding non-UTC datetimes like `DateTime('Asia/Taipei')` requires a [timezone database.](https://hexdocs.pm/elixir/DateTime.html#module-time-zone-database)
Expand Down Expand Up @@ -268,124 +275,9 @@ utc = DateTime.utc_now()
taipei = DateTime.shift_zone!(utc, "Asia/Taipei")

# ** (ArgumentError) non-UTC timezones are not supported for encoding: 2023-04-26 01:49:43.044569+08:00 CST Asia/Taipei
Ch.query!(pid, "INSERT INTO ch_datetimes(datetime) FORMAT RowBinary", [[naive], [utc], [taipei]], types: ["DateTime"])
Ch.RowBinary.encode_rows([[naive], [utc], [taipei]], ["DateTime"])
```

## Benchmarks

<details>
<summary><code>INSERT</code> 1 million rows <a href="https://github.com/ClickHouse/clickhouse-go#benchmark">(original)</a></summary>

<pre><code>
$ MIX_ENV=bench mix run bench/insert.exs

This benchmark is based on https://github.com/ClickHouse/clickhouse-go#benchmark

Operating System: macOS
CPU Information: Apple M1
Number of Available Cores: 8
Available memory: 8 GB
Elixir 1.14.4
Erlang 25.3

Benchmark suite executing with the following configuration:
warmup: 2 s
time: 5 s
memory time: 0 ns
reduction time: 0 ns
parallel: 1
inputs: 1_000_000 rows
Estimated total run time: 28 s

Benchmarking encode with input 1_000_000 rows ...
Benchmarking encode stream with input 1_000_000 rows ...
Benchmarking insert with input 1_000_000 rows ...
Benchmarking insert stream with input 1_000_000 rows ...

##### With input 1_000_000 rows #####
Name ips average deviation median 99th %
encode stream 1.63 612.96 ms ±11.30% 583.03 ms 773.01 ms
insert stream 1.22 819.82 ms ±9.41% 798.94 ms 973.45 ms
encode 1.09 915.75 ms ±44.13% 750.98 ms 1637.02 ms
insert 0.73 1373.84 ms ±31.01% 1331.86 ms 1915.76 ms

Comparison:
encode stream 1.63
insert stream 1.22 - 1.34x slower +206.87 ms
encode 1.09 - 1.49x slower +302.79 ms
insert 0.73 - 2.24x slower +760.88 ms</code>
</pre>

</details>

<details>
<summary><code>SELECT</code> 500, 500 thousand, and 500 million rows <a href="https://github.com/ClickHouse/ch-bench">(original)</a></summary>

<pre><code>
$ MIX_ENV=bench mix run bench/stream.exs

This benchmark is based on https://github.com/ClickHouse/ch-bench

Operating System: macOS
CPU Information: Apple M1
Number of Available Cores: 8
Available memory: 8 GB
Elixir 1.14.4
Erlang 25.3

Benchmark suite executing with the following configuration:
warmup: 2 s
time: 5 s
memory time: 0 ns
reduction time: 0 ns
parallel: 1
inputs: 500 rows, 500_000 rows, 500_000_000 rows
Estimated total run time: 1.05 min

Benchmarking stream with decode with input 500 rows ...
Benchmarking stream with decode with input 500_000 rows ...
Benchmarking stream with decode with input 500_000_000 rows ...
Benchmarking stream with manual decode with input 500 rows ...
Benchmarking stream with manual decode with input 500_000 rows ...
Benchmarking stream with manual decode with input 500_000_000 rows ...
Benchmarking stream without decode with input 500 rows ...
Benchmarking stream without decode with input 500_000 rows ...
Benchmarking stream without decode with input 500_000_000 rows ...

##### With input 500 rows #####
Name ips average deviation median 99th %
stream with decode 4.69 K 213.34 μs ±12.49% 211.38 μs 290.94 μs
stream with manual decode 4.69 K 213.43 μs ±17.40% 210.96 μs 298.75 μs
stream without decode 4.65 K 215.08 μs ±10.79% 213.79 μs 284.66 μs

Comparison:
stream with decode 4.69 K
stream with manual decode 4.69 K - 1.00x slower +0.0838 μs
stream without decode 4.65 K - 1.01x slower +1.74 μs

##### With input 500_000 rows #####
Name ips average deviation median 99th %
stream without decode 234.58 4.26 ms ±13.99% 4.04 ms 5.95 ms
stream with manual decode 64.26 15.56 ms ±8.36% 15.86 ms 17.97 ms
stream with decode 41.03 24.37 ms ±6.27% 24.39 ms 26.60 ms

Comparison:
stream without decode 234.58
stream with manual decode 64.26 - 3.65x slower +11.30 ms
stream with decode 41.03 - 5.72x slower +20.11 ms

##### With input 500_000_000 rows #####
Name ips average deviation median 99th %
stream without decode 0.32 3.17 s ±0.20% 3.17 s 3.17 s
stream with manual decode 0.0891 11.23 s ±0.00% 11.23 s 11.23 s
stream with decode 0.0462 21.66 s ±0.00% 21.66 s 21.66 s

Comparison:
stream without decode 0.32
stream with manual decode 0.0891 - 3.55x slower +8.06 s
stream with decode 0.0462 - 6.84x slower +18.50 s</code>
</pre>

</details>

[CI Results](https://github.com/plausible/ch/actions/workflows/bench.yml) (click the latest workflow run and scroll down to "Artifacts")
Please see [CI Results](https://github.com/plausible/ch/actions/workflows/bench.yml) (make sure to click the latest workflow run and scroll down to "Artifacts") for [some of our benchmarks.](./bench/) :)
11 changes: 4 additions & 7 deletions lib/ch.ex
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ defmodule Ch do
| {:scheme, String.t()}
| {:hostname, String.t()}
| {:port, :inet.port_number()}
| {:transport_opts, :gen_tcp.connect_option()}
| {:transport_opts, :gen_tcp.connect_option() | :ssl.tls_client_option()}
| DBConnection.start_option()

@doc """
Expand All @@ -25,11 +25,10 @@ defmodule Ch do
* `:scheme` - HTTP scheme, defaults to `"http"`
* `:hostname` - server hostname, defaults to `"localhost"`
* `:port` - HTTP port, defualts to `8123`
* `:transport_opts` - options to be given to the transport being used. See `Mint.HTTP1.connect/4` for more info
* `:database` - Database, defaults to `"default"`
* `:username` - Username
* `:password` - User password
* `:settings` - Keyword list of ClickHouse settings
* `:settings` - Keyword list of ClickHouse settings to send wtih every query
* `:timeout` - HTTP receive timeout in milliseconds
* `:transport_opts` - options to be given to the transport being used. See `Mint.HTTP1.connect/4` for more info
* [`DBConnection.start_option()`](https://hexdocs.pm/db_connection/DBConnection.html#t:start_option/0)
Expand All @@ -55,8 +54,6 @@ defmodule Ch do
| {:command, Ch.Query.command()}
| {:headers, [{String.t(), String.t()}]}
| {:format, String.t()}
# TODO remove
| {:encode, boolean}
| {:decode, boolean}
| DBConnection.connection_option()

Expand All @@ -69,8 +66,8 @@ defmodule Ch do
* `:database` - Database
* `:username` - Username
* `:password` - User password
* `:settings` - Keyword list of settings
* `:timeout` - Query request timeout
* `:settings` - Keyword list of settings to merge with `:settings` from `start_link` and send with this query
* `:timeout` - Configures both query request timeout and HTTP receive timeout in milliseconds, whichever happens faster
* `:command` - Command tag for the query
* `:headers` - Custom HTTP headers for the request
* `:format` - Custom response format for the request
Expand Down

0 comments on commit 68b9dee

Please sign in to comment.