From b5b34de45f98d216153494eea07b1620b5c343c2 Mon Sep 17 00:00:00 2001 From: Mark Raasveldt <mark.raasveldt@gmail.com> Date: Tue, 15 Nov 2022 10:02:06 +0100 Subject: [PATCH 1/3] Update DuckDB to v0.6.0 --- duckdb/README.md | 2 -- duckdb/benchmark.sh | 2 +- duckdb/create.sql | 3 +-- duckdb/load.py | 16 +++++++++++----- duckdb/queries.sql | 4 ++-- duckdb/query.py | 7 ++----- 6 files changed, 17 insertions(+), 17 deletions(-) delete mode 100644 duckdb/README.md diff --git a/duckdb/README.md b/duckdb/README.md deleted file mode 100644 index d2d7b22c8..000000000 --- a/duckdb/README.md +++ /dev/null @@ -1,2 +0,0 @@ -DuckDB cannot load parquet file due to OOM. -The only option is to load a CSV file, but sometimes it also fails with OOM. diff --git a/duckdb/benchmark.sh b/duckdb/benchmark.sh index 392f084c5..e0007dd6e 100755 --- a/duckdb/benchmark.sh +++ b/duckdb/benchmark.sh @@ -12,7 +12,7 @@ wget --continue 'https://datasets.clickhouse.com/hits_compatible/hits.csv.gz' gzip -d hits.csv.gz ./load.py -# 4216.5390389899985 seconds +# 414 seconds # Run the queries diff --git a/duckdb/create.sql b/duckdb/create.sql index 744d595ec..4d23eaac6 100644 --- a/duckdb/create.sql +++ b/duckdb/create.sql @@ -104,6 +104,5 @@ CREATE TABLE hits HasGCLID SMALLINT NOT NULL, RefererHash BIGINT NOT NULL, URLHash BIGINT NOT NULL, - CLID INTEGER NOT NULL, - PRIMARY KEY (CounterID, EventDate, UserID, EventTime, WatchID) + CLID INTEGER NOT NULL ); diff --git a/duckdb/load.py b/duckdb/load.py index d4265d15f..5b581a83f 100755 --- a/duckdb/load.py +++ b/duckdb/load.py @@ -5,14 +5,20 @@ import psutil con = duckdb.connect(database="my-db.duckdb", read_only=False) -# See https://github.com/duckdb/duckdb/issues/3969 -con.execute("PRAGMA memory_limit='{}b'".format(psutil.virtual_memory().total / 4)) -con.execute("PRAGMA threads={}".format(psutil.cpu_count(logical=False))) -print("Will load the data") +# enable the progress bar +con.execute('PRAGMA enable_progress_bar') +con.execute('PRAGMA enable_print_progress_bar;') +# enable parallel CSV loading +con.execute("SET experimental_parallel_csv=true") +# disable preservation of insertion order +con.execute("SET preserve_insertion_order=false") + +# perform the actual load +print("Will load the data") start = timeit.default_timer() con.execute(open("create.sql").read()) -con.execute("INSERT INTO hits SELECT * FROM read_csv_auto('hits.csv')") +con.execute("COPY hits FROM 'hits.csv'") end = timeit.default_timer() print(end - start) diff --git a/duckdb/queries.sql b/duckdb/queries.sql index 31f65fc89..b4115ee3a 100644 --- a/duckdb/queries.sql +++ b/duckdb/queries.sql @@ -25,8 +25,8 @@ SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; -SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; -SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT CounterID, AVG(STRLEN(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(STRLEN(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits; SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; diff --git a/duckdb/query.py b/duckdb/query.py index 43739be56..85b9eeb15 100755 --- a/duckdb/query.py +++ b/duckdb/query.py @@ -9,12 +9,9 @@ print(query) con = duckdb.connect(database="my-db.duckdb", read_only=False) -# See https://github.com/duckdb/duckdb/issues/3969 -con.execute("PRAGMA memory_limit='{}b'".format(psutil.virtual_memory().total / 4)) -con.execute("PRAGMA threads={}".format(psutil.cpu_count(logical=False))) - for try_num in range(3): start = timeit.default_timer() - con.execute(query) + results = con.execute(query).fetchall() end = timeit.default_timer() print(end - start) + del results From 315e43414ebded693bc22c0d0e520e2915c40280 Mon Sep 17 00:00:00 2001 From: Mark Raasveldt <mark.raasveldt@gmail.com> Date: Tue, 15 Nov 2022 11:05:46 +0100 Subject: [PATCH 2/3] Update DuckDB results to v0.6.0 --- duckdb/results/c6a.4xlarge.json | 94 ++++++++++++++++----------------- 1 file changed, 46 insertions(+), 48 deletions(-) diff --git a/duckdb/results/c6a.4xlarge.json b/duckdb/results/c6a.4xlarge.json index 1ebe0d129..be8635332 100644 --- a/duckdb/results/c6a.4xlarge.json +++ b/duckdb/results/c6a.4xlarge.json @@ -1,58 +1,56 @@ { "system": "DuckDB", - "date": "2022-07-01", + "date": "2022-11-15", "machine": "c6a.4xlarge, 500gb gp2", "cluster_size": 1, - "comment": "Many queries triggered OOM", "tags": ["C++", "column-oriented", "embedded"], - "load_time": 4217, - "data_size": 27241492480, + "load_time": 416, + "data_size": 25024802816, "result": [ -[0.005694353996659629,0.003944558004150167,0.003837226002360694], -[0.16991353100456763,0.03919722700084094,0.03835860399703961], -[0.44898432699847035,0.04947217500011902,0.04852217998995911], -[0.07586832098604646,0.07051395199960098,0.07007493599667214], -[9.554053236002801,8.153356187991449,8.73448242500308], -[7.66042533799191,6.931124911992811,7.103380946995458], -[0.030703739990713075,0.027668555994750932,0.027583695002249442], -[0.1778664360026596,0.03942437999648973,0.03882004099432379], -[8.53439180701389,8.869582625004114,9.020313234999776], -[10.40215514000738,11.125320470004226,8.941559945000336], -[1.1747649609897053,1.04221136700653,1.004799570000614], -[1.2380354650085792,1.1211603130068397,2.4278587239969056], -[3.1751541379926493,0.9360461989999749,0.8868292279948946], -[6.855684430003748,7.300301584007684,5.712960822012974], -[3.70588762400439,1.0249276379909134,0.9473389159975341], -[2.1037107890006155,1.6215517020027619,1.5671920729946578], -[null,null,null], -[null,null,null], -[null,null,null], -[0.0002772739971987903,0.00016792300448287278,0.0001574420020915568], -[null,null,null], -[null,null,null], -[null,null,null], -[null,null,null], -[2.9310110910009826,0.19020285899750888,0.1736805049877148], -[2.939304119994631,0.18754731099761557,0.18073286200524308], -[2.8706370779982535,0.18822155400994234,0.17905898999015335], -[null,null,null], -[null,null,null], -[0.884408778991201,0.714329167996766,0.7135983259940986], -[5.3762675570033025,0.8803737630078103,0.8728962720051641], -[7.249190265996731,2.9648747390019707,2.866687831003219], -[null,null,null], -[null,null,null], -[null,null,null], -[4.515183198003797,4.030519469000865,4.014251719010645], -[0.11604027298744768,0.040539135996368714,0.04280066800129134], -[0.0457908230018802,0.021069509006338194,0.019683108999743126], -[0.0680370800109813,0.011889394998434,0.01056639499438461], -[0.22029169600864407,0.08547276000899728,0.09095505000732373], -[0.03759863799496088,0.008373684002435766,0.007633563989656977], -[0.025631797994719818,0.008081699008471332,0.007858585988287814], -[0.034359957004198804,0.025543516996549442,0.02533275399764534] -] +[0.007988478000015675,0.004515659999924537,0.004503920000161088], +[0.11100315900011992,0.0329610019998654,0.028879086999950232], +[0.6478460649998397,0.05915705200004595,0.05621285400002307], +[1.323183034000067,0.0455285499999718,0.04504826400011552], +[1.222814291000077,0.7548531599998114,0.7589927649999026], +[2.8619665539999914,0.7005150949999006,0.6948298479999266], +[0.11942326999997022,0.05013811599997098,0.04905426100003751], +[0.14966445099980774,0.034241171000076065,0.03345654100007778], +[4.244694572000071,0.8486790140000267,0.8334544100000585], +[3.0409637129998828,1.1786685279998892,1.1486964670000361], +[1.2471184640000956,0.42324365099989336,0.3113840030000574], +[4.312250543000118,0.2591428690000157,0.25120333299992126], +[3.587442906999968,0.5870498879999104,0.5784097270000075], +[6.9048232530001314,1.0046518700000888,0.987080990999857], +[4.13467192600001,0.6404815850000887,0.6235957799999596], +[2.0105650860000424,0.8113902259999577,0.78756950900015], +[3.248947224999938,2.7703119679999872,1.6008261179999863], +[3.2351198680000834,1.5603282849999687,1.5319727170001443], +[10.72432485399986,3.070198415999812,3.065633552999998], +[2.3459199960000205,0.43844037300004857,0.44263849999993], +[22.106918537999945,0.8214904700000716,0.815211096999974], +[22.445440309000105,0.5390162549999786,0.5336183530000653], +[38.162022707999995,0.7763018340001508,0.7754006909999589], +[96.78093567899987,4.105949430999999,4.102333362000081], +[3.6950249729998177,0.1867992550000963,0.18109103799997683], +[1.0970243589999882,0.1984391500000129,0.18246500600002946], +[3.6618248540000877,0.19925248999993528,0.185544137000079], +[21.191481531999898,0.478393307000033,0.4700544350000655], +[14.240373419999969,3.5203460679997534,3.638875715999802], +[0.8780410869999287,0.547536578999825,0.4830867450000369], +[4.3269140569996125,0.5835747720002473,0.5723880420000569], +[9.054497873999935,0.8204964669998844,0.8048715669997364], +[16.009907588999795,4.723961488999976,7.069065186999978], +[21.76933206700005,2.2042322819997935,2.213854970000284], +[21.591168588000073,2.22991274900005,2.237306400000307], +[1.5004984629999853,0.8202248339998732,0.8199325200002932], +[0.1191018049999002,0.0322099319996596,0.029625235999901633], +[0.06348333900041325,0.016720733999591175,0.01323864699998012], +[0.08746766900003422,0.021063181000045006,0.01836599499984004], +[0.21232219900002747,0.06138483099994119,0.05096832199978962], +[0.06383551399994758,0.015361045000190643,0.009370135000153823], +[0.05804349699974409,0.010943656000108604,0.008688407000136067], +[0.04755750599997555,0.010486199999832024,0.008859258000029513],] } From e29c31676faba52b61531074a11cab6d9278fe03 Mon Sep 17 00:00:00 2001 From: Mark Raasveldt <mark.raasveldt@gmail.com> Date: Tue, 15 Nov 2022 11:12:41 +0100 Subject: [PATCH 3/3] Remove trailing comma --- duckdb/results/c6a.4xlarge.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/duckdb/results/c6a.4xlarge.json b/duckdb/results/c6a.4xlarge.json index be8635332..9b977e5c7 100644 --- a/duckdb/results/c6a.4xlarge.json +++ b/duckdb/results/c6a.4xlarge.json @@ -52,5 +52,6 @@ [0.21232219900002747,0.06138483099994119,0.05096832199978962], [0.06383551399994758,0.015361045000190643,0.009370135000153823], [0.05804349699974409,0.010943656000108604,0.008688407000136067], -[0.04755750599997555,0.010486199999832024,0.008859258000029513],] +[0.04755750599997555,0.010486199999832024,0.008859258000029513] +] }