From 56ea38ad90e3acb8e5f705a404aea8aa3f75ab85 Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Tue, 15 Nov 2022 15:00:30 +0100 Subject: [PATCH 1/7] Add duckdb-parquet --- duckdb-parquet/benchmark.sh | 23 ++++++++++++++++++++ duckdb-parquet/create.sql | 1 + duckdb-parquet/load.py | 14 ++++++++++++ duckdb-parquet/queries.sql | 43 +++++++++++++++++++++++++++++++++++++ duckdb-parquet/run.sh | 9 ++++++++ 5 files changed, 90 insertions(+) create mode 100755 duckdb-parquet/benchmark.sh create mode 100644 duckdb-parquet/create.sql create mode 100755 duckdb-parquet/load.py create mode 100644 duckdb-parquet/queries.sql create mode 100755 duckdb-parquet/run.sh diff --git a/duckdb-parquet/benchmark.sh b/duckdb-parquet/benchmark.sh new file mode 100755 index 000000000..de080e6da --- /dev/null +++ b/duckdb-parquet/benchmark.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# Install + +# FIXME: uncomment +# sudo apt-get update +# sudo apt-get install -y python3-pip +# pip install duckdb psutil + +# Load the data +# FIXME: uncomment +# seq 0 99 | xargs -P100 -I{} bash -c 'wget --continue https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet' + +./load.py + +# Run the queries + +./run.sh 2>&1 | tee log.txt + +wc -c my-db.duckdb + +cat log.txt | grep -P '^\d|Killed|Segmentation' | sed -r -e 's/^.*(Killed|Segmentation).*$/null\nnull\nnull/' | + awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' diff --git a/duckdb-parquet/create.sql b/duckdb-parquet/create.sql new file mode 100644 index 000000000..f45a64a5f --- /dev/null +++ b/duckdb-parquet/create.sql @@ -0,0 +1 @@ +CREATE VIEW hits AS SELECT * FROM read_parquet('hits_*.parquet', binary_as_string=True); \ No newline at end of file diff --git a/duckdb-parquet/load.py b/duckdb-parquet/load.py new file mode 100755 index 000000000..82f047dca --- /dev/null +++ b/duckdb-parquet/load.py @@ -0,0 +1,14 @@ +#!/usr/bin/env python3 + +import duckdb +import timeit +import psutil + +con = duckdb.connect(database="my-db.duckdb", read_only=False) + +print("Set up a view over the Parquet files") + +start = timeit.default_timer() +con.execute(open("create.sql").read()) +end = timeit.default_timer() +print(end - start) diff --git a/duckdb-parquet/queries.sql b/duckdb-parquet/queries.sql new file mode 100644 index 000000000..2a2f1cc07 --- /dev/null +++ b/duckdb-parquet/queries.sql @@ -0,0 +1,43 @@ +SELECT COUNT(*) FROM hits; +SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0; +SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits; +SELECT AVG(UserID) FROM hits; +SELECT COUNT(DISTINCT UserID) FROM hits; +SELECT COUNT(DISTINCT SearchPhrase) FROM hits; +SELECT MIN(EventDate), MAX(EventDate) FROM hits; +SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; +SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10; +SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10; +SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10; +SELECT UserID, extract(minute FROM epoch_ms(EventTime * 1000)) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID FROM hits WHERE UserID = 435090932899640449; +SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%'; +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; +SELECT CounterID, AVG(STRLEN(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(STRLEN(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits; +SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10; +SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; +SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; +SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; +SELECT DATE_TRUNC('minute', epoch_ms(EventTime * 1000)) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', epoch_ms(EventTime * 1000)) ORDER BY DATE_TRUNC('minute', epoch_ms(EventTime * 1000)) LIMIT 10 OFFSET 1000; diff --git a/duckdb-parquet/run.sh b/duckdb-parquet/run.sh new file mode 100755 index 000000000..97ad49f85 --- /dev/null +++ b/duckdb-parquet/run.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +cat queries.sql | while read query; do + # FIXME: uncomment + # sync + # echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null + + ./query.py <<< "${query}" +done From a7e62cdb22e36d155e73497749d7843bbbfaebe7 Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Tue, 15 Nov 2022 15:01:51 +0100 Subject: [PATCH 2/7] Uncomment --- duckdb-parquet/benchmark.sh | 10 ++++------ duckdb-parquet/run.sh | 5 ++--- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/duckdb-parquet/benchmark.sh b/duckdb-parquet/benchmark.sh index de080e6da..16bd969aa 100755 --- a/duckdb-parquet/benchmark.sh +++ b/duckdb-parquet/benchmark.sh @@ -2,14 +2,12 @@ # Install -# FIXME: uncomment -# sudo apt-get update -# sudo apt-get install -y python3-pip -# pip install duckdb psutil +sudo apt-get update +sudo apt-get install -y python3-pip +pip install duckdb psutil # Load the data -# FIXME: uncomment -# seq 0 99 | xargs -P100 -I{} bash -c 'wget --continue https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet' +seq 0 99 | xargs -P100 -I{} bash -c 'wget --continue https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet' ./load.py diff --git a/duckdb-parquet/run.sh b/duckdb-parquet/run.sh index 97ad49f85..64df8c608 100755 --- a/duckdb-parquet/run.sh +++ b/duckdb-parquet/run.sh @@ -1,9 +1,8 @@ #!/bin/bash cat queries.sql | while read query; do - # FIXME: uncomment - # sync - # echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null + sync + echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null ./query.py <<< "${query}" done From ac0e929a4c24d7aacea0dcb42b6dd353a106a73c Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Tue, 15 Nov 2022 15:04:58 +0100 Subject: [PATCH 3/7] Add query.py --- duckdb-parquet/query.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100755 duckdb-parquet/query.py diff --git a/duckdb-parquet/query.py b/duckdb-parquet/query.py new file mode 100755 index 000000000..c47c3aa56 --- /dev/null +++ b/duckdb-parquet/query.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python3 + +import duckdb +import timeit +import psutil +import sys + +query = sys.stdin.read() +print(query) + +con = duckdb.connect(database="my-db.duckdb", read_only=False) +con.execute("PRAGMA enable_object_cache") + +for try_num in range(3): + start = timeit.default_timer() + results = con.execute(query).fetchall() + end = timeit.default_timer() + print(end - start) + print(results) From a45b3662504952d324f583abb73372f1d3777105 Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Tue, 15 Nov 2022 15:13:32 +0100 Subject: [PATCH 4/7] Remove result printing and add results --- duckdb-parquet/query.py | 1 - duckdb-parquet/results/c6a.4xlarge.json | 58 +++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 duckdb-parquet/results/c6a.4xlarge.json diff --git a/duckdb-parquet/query.py b/duckdb-parquet/query.py index c47c3aa56..a27ec5941 100755 --- a/duckdb-parquet/query.py +++ b/duckdb-parquet/query.py @@ -16,4 +16,3 @@ results = con.execute(query).fetchall() end = timeit.default_timer() print(end - start) - print(results) diff --git a/duckdb-parquet/results/c6a.4xlarge.json b/duckdb-parquet/results/c6a.4xlarge.json new file mode 100644 index 000000000..7b4b30a30 --- /dev/null +++ b/duckdb-parquet/results/c6a.4xlarge.json @@ -0,0 +1,58 @@ +{ + "system": "DuckDB (Parquet)", + "date": "2022-11-15", + "machine": "c6a.4xlarge, 500gb gp2", + "cluster_size": 1, + "comment": "", + + "tags": ["C++", "column-oriented", "embedded", "stateless"], + + "load_time": 0, + "data_size": 14737666736, + + "result": [ +[0.16168120500003624,0.035210315999961495,0.034063800000012634], +[0.16167625399998542,0.05377198899998348,0.054964605000009215], +[0.1892881350000266,0.0978321069999879,0.09758750200001032], +[0.3283154570000306,0.07877335299997412,0.07847510499999544], +[1.3122430980000104,0.8208032140000228,0.796963025000025], +[4.895974346999992,0.9223180289999959,0.8956201309999869], +[0.15683267400004297,0.05655506499999774,0.055968712999970194], +[0.16303855000001022,0.05514446500001213,0.05520890600001849], +[1.3237551689999805,0.928574636999997,0.9201574619999633], +[2.829673707999973,1.307827498999984,1.2822413250000295], +[0.5091962340000009,0.27178192800005263,0.24992075099999056], +[0.7770936019999795,0.311026346999995,0.2958223329999896], +[1.1973447080000028,0.7751119170000038,0.7795235870000283], +[4.069281185000023,1.2650335739999718,1.2476965710000059], +[1.2372995549999928,0.8697392149999814,0.8682284690000301], +[1.4103875689999654,0.8572837090000007,3.855091768999955], +[6.782516894000025,1.8570032560000413,2.2208985530000405], +[2.8736495839999634,1.8387350209999909,1.8347625829999856], +[7.7933057059999555,3.453838587000007,3.5254456339999933], +[1.0701669519999655,0.6210940909999749,0.6048664339999732], +[9.434266933000004,1.8513650950000056,1.822414589999994], +[11.097276300000033,1.701221796000027,1.7192594159999999], +[21.813821072999986,3.63487875200002,3.6129823899999565], +[55.86833970700002,11.366724750000003,11.340693486000077], +[2.623555624000005,0.4793877339999426,0.47395958399999927], +[0.7137676789999432,0.37197395099997266,0.3764627110000447], +[2.674838108000017,0.49673427100003664,0.4909879040000078], +[9.42473880099999,1.4894330269999045,1.4968619509999144], +[8.406158290999997,4.476945311999998,4.468175935999966], +[7.92713281500005,1.5328978850000112,1.5372800330000018], +[2.4851479609999387,0.9031012200000532,0.8943542770000477], +[6.335724623000033,1.1794555459999856,1.1771230450000303], +[8.797936313000037,6.372003423000024,5.232719126000006], +[11.938770673000022,3.3198122059999378,3.297666481999954], +[10.867663978999985,3.2635805390000314,3.2636204789999965], +[2.5721828759999426,0.9156106980000231,0.8919071269999677], +[0.010698502999957782,0.0013757969999232955,0.0014557990000412246], +[0.007774074000053588,0.0013829470000246147,0.001443538999978955], +[0.00666498300006424,0.0014766290000807203,0.0015407500000037544], +[0.009571720999929312,0.001920247999919411,0.0017587050000429372], +[0.0065415999999913765,0.0014684889999898587,0.0013960179999230604], +[0.007053870000049756,0.0014614700000947778,0.0014559199998984695], +[0.007413347000010617,0.0016273229999796968,0.0015346100000215301] +] +} From 37a698d633cc6a5e6ed53458ee3ded384070556d Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Tue, 15 Nov 2022 15:20:32 +0100 Subject: [PATCH 5/7] Handle replacement in the view creation --- duckdb-parquet/create.sql | 7 ++++++- duckdb-parquet/queries.sql | 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/duckdb-parquet/create.sql b/duckdb-parquet/create.sql index f45a64a5f..6f953ce31 100644 --- a/duckdb-parquet/create.sql +++ b/duckdb-parquet/create.sql @@ -1 +1,6 @@ -CREATE VIEW hits AS SELECT * FROM read_parquet('hits_*.parquet', binary_as_string=True); \ No newline at end of file +CREATE VIEW hits AS +SELECT * + REPLACE + (epoch_ms(EventTime * 1000) AS EventTime, + DATE '1970-01-01' + INTERVAL (EventDate) DAYS AS EventDate) +FROM read_parquet('hits_*.parquet', binary_as_string=True); \ No newline at end of file diff --git a/duckdb-parquet/queries.sql b/duckdb-parquet/queries.sql index 2a2f1cc07..b4115ee3a 100644 --- a/duckdb-parquet/queries.sql +++ b/duckdb-parquet/queries.sql @@ -16,7 +16,7 @@ SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10; -SELECT UserID, extract(minute FROM epoch_ms(EventTime * 1000)) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; SELECT UserID FROM hits WHERE UserID = 435090932899640449; SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%'; SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; @@ -40,4 +40,4 @@ SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate > SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; -SELECT DATE_TRUNC('minute', epoch_ms(EventTime * 1000)) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', epoch_ms(EventTime * 1000)) ORDER BY DATE_TRUNC('minute', epoch_ms(EventTime * 1000)) LIMIT 10 OFFSET 1000; +SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000; From ab3bb4d117f75c154e819b160f03ad91689e79ea Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Tue, 15 Nov 2022 15:27:41 +0100 Subject: [PATCH 6/7] Correct results --- duckdb-parquet/results/c6a.4xlarge.json | 86 ++++++++++++------------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/duckdb-parquet/results/c6a.4xlarge.json b/duckdb-parquet/results/c6a.4xlarge.json index 7b4b30a30..05d85be91 100644 --- a/duckdb-parquet/results/c6a.4xlarge.json +++ b/duckdb-parquet/results/c6a.4xlarge.json @@ -11,48 +11,48 @@ "data_size": 14737666736, "result": [ -[0.16168120500003624,0.035210315999961495,0.034063800000012634], -[0.16167625399998542,0.05377198899998348,0.054964605000009215], -[0.1892881350000266,0.0978321069999879,0.09758750200001032], -[0.3283154570000306,0.07877335299997412,0.07847510499999544], -[1.3122430980000104,0.8208032140000228,0.796963025000025], -[4.895974346999992,0.9223180289999959,0.8956201309999869], -[0.15683267400004297,0.05655506499999774,0.055968712999970194], -[0.16303855000001022,0.05514446500001213,0.05520890600001849], -[1.3237551689999805,0.928574636999997,0.9201574619999633], -[2.829673707999973,1.307827498999984,1.2822413250000295], -[0.5091962340000009,0.27178192800005263,0.24992075099999056], -[0.7770936019999795,0.311026346999995,0.2958223329999896], -[1.1973447080000028,0.7751119170000038,0.7795235870000283], -[4.069281185000023,1.2650335739999718,1.2476965710000059], -[1.2372995549999928,0.8697392149999814,0.8682284690000301], -[1.4103875689999654,0.8572837090000007,3.855091768999955], -[6.782516894000025,1.8570032560000413,2.2208985530000405], -[2.8736495839999634,1.8387350209999909,1.8347625829999856], -[7.7933057059999555,3.453838587000007,3.5254456339999933], -[1.0701669519999655,0.6210940909999749,0.6048664339999732], -[9.434266933000004,1.8513650950000056,1.822414589999994], -[11.097276300000033,1.701221796000027,1.7192594159999999], -[21.813821072999986,3.63487875200002,3.6129823899999565], -[55.86833970700002,11.366724750000003,11.340693486000077], -[2.623555624000005,0.4793877339999426,0.47395958399999927], -[0.7137676789999432,0.37197395099997266,0.3764627110000447], -[2.674838108000017,0.49673427100003664,0.4909879040000078], -[9.42473880099999,1.4894330269999045,1.4968619509999144], -[8.406158290999997,4.476945311999998,4.468175935999966], -[7.92713281500005,1.5328978850000112,1.5372800330000018], -[2.4851479609999387,0.9031012200000532,0.8943542770000477], -[6.335724623000033,1.1794555459999856,1.1771230450000303], -[8.797936313000037,6.372003423000024,5.232719126000006], -[11.938770673000022,3.3198122059999378,3.297666481999954], -[10.867663978999985,3.2635805390000314,3.2636204789999965], -[2.5721828759999426,0.9156106980000231,0.8919071269999677], -[0.010698502999957782,0.0013757969999232955,0.0014557990000412246], -[0.007774074000053588,0.0013829470000246147,0.001443538999978955], -[0.00666498300006424,0.0014766290000807203,0.0015407500000037544], -[0.009571720999929312,0.001920247999919411,0.0017587050000429372], -[0.0065415999999913765,0.0014684889999898587,0.0013960179999230604], -[0.007053870000049756,0.0014614700000947778,0.0014559199998984695], -[0.007413347000010617,0.0016273229999796968,0.0015346100000215301] +[0.15255265499990855,0.032177488999877824,0.03128326899991407], +[0.1512561879999339,0.05458722900016255,0.05392151399996692], +[0.17167986499998733,0.09995069000001422,0.09747308599980897], +[0.32638904600003116,0.08025165799995193,0.07781587599993145], +[1.3622741589999805,0.8020002760001717,0.7882643820000794], +[1.5986611480000192,0.905206084999918,0.8923251289998007], +[0.19578096999998706,0.1727834680000342,0.17144673999996485], +[0.14916418200004955,0.05640668699993512,0.05402569700004278], +[1.2932305809999889,0.9019594260000758,0.902378624999983], +[1.962573343000031,1.29343327100014,1.3015213239998502], +[0.504676014999859,0.2644338150000749,0.2537441370000124], +[0.8143152309999095,0.31369776199994703,0.29620394999983546], +[1.1580296610000005,0.7799177189999682,0.7586700070000916], +[3.4789537280000786,1.2198064450001311,1.206218268000157], +[1.2482245060000423,0.8424065669998981,0.8678128400001697], +[2.1837149620000673,0.8448476789999404,0.8492143610001222], +[2.9010631749999902,1.8770421239998996,1.859835455999928], +[4.6367616359998465,1.8144295800000236,2.977320546000101], +[5.932184310000139,3.7547057240001322,3.4905751789999613], +[1.0825107470000148,0.6089946629999758,0.6027930899999774], +[9.431518925000091,1.8313576000000467,1.8272091119999914], +[11.088325851000036,1.7101613590000397,1.7332129410001471], +[21.759323214000005,3.6137633480000204,3.5981952510001065], +[55.858323720000044,11.130785724999896,11.137916557000153], +[2.62945403599997,0.48112061599999834,0.47660431999997854], +[0.724943537999934,0.3750543330002074,0.37342856700001903], +[2.68823736000013,0.4855612100000144,0.5017041849998805], +[9.432771488999833,1.4729457909998018,1.4764195239999935], +[8.373684226000023,4.492152146999842,4.521108621999929], +[7.942450724999844,1.5387269580000975,1.547872401999939], +[2.4811867640000855,0.921913929000084,1.4909157100000812], +[6.329192314000011,1.1808154699999704,1.142661366000084], +[9.606873708999956,5.282511374999785,6.381937246000007], +[10.85659978700005,3.1545218410001326,3.1670173970001088], +[10.854515859000003,3.186118347999809,3.2063636679999945], +[1.615416661999916,0.8797393360000569,0.8869118689999596], +[0.27963824800008297,0.1805793179999,0.18357048100006068], +[0.18112159000020256,0.1387902070000564,0.13812656300001436], +[0.19202985199990508,0.13907866399995328,0.13576146300010805], +[0.4347770939998554,0.3260163970001031,0.3153109580000546], +[0.15686156200013102,0.08243094600015866,0.08062338900003851], +[0.1538962489998994,0.0836899139999332,0.08601155300016217], +[0.16666654100004052,0.08488733899980616,0.08224888199993075] ] } From 8fd9d90aac53caf64b8e068d2d5cd6292641a0dc Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Tue, 15 Nov 2022 15:29:30 +0100 Subject: [PATCH 7/7] Document what this does --- duckdb-parquet/query.py | 1 + 1 file changed, 1 insertion(+) diff --git a/duckdb-parquet/query.py b/duckdb-parquet/query.py index a27ec5941..c2d6c4a9b 100755 --- a/duckdb-parquet/query.py +++ b/duckdb-parquet/query.py @@ -9,6 +9,7 @@ print(query) con = duckdb.connect(database="my-db.duckdb", read_only=False) +# enable parquet metadata cache con.execute("PRAGMA enable_object_cache") for try_num in range(3):