From cbe74dd8f863f8208ecb69e061de32e6c74d6fae Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Fri, 23 Aug 2024 13:12:46 +0530 Subject: [PATCH] clickbench testing scripts for ingestion and query --- clickbench/clickbench_staticschema.json | 420 ++++++++++++++++++++++++ clickbench/ingestion.sh | 28 ++ clickbench/queries.sql | 43 +++ clickbench/run_query.sh | 19 ++ 4 files changed, 510 insertions(+) create mode 100644 clickbench/clickbench_staticschema.json create mode 100644 clickbench/ingestion.sh create mode 100644 clickbench/queries.sql create mode 100644 clickbench/run_query.sh diff --git a/clickbench/clickbench_staticschema.json b/clickbench/clickbench_staticschema.json new file mode 100644 index 0000000..06bbfe0 --- /dev/null +++ b/clickbench/clickbench_staticschema.json @@ -0,0 +1,420 @@ +{ + "fields":[ + { + "name": "AdvEngineID", + "data_type": "int" + }, + { + "name": "Age", + "data_type": "int" + }, + { + "name": "BrowserCountry", + "data_type": "string" + }, + { + "name": "BrowserLanguage", + "data_type": "string" + }, + { + "name": "CLID", + "data_type": "int" + }, + { + "name": "ClientEventTime", + "data_type": "datetime" + }, + { + "name": "ClientIP", + "data_type": "int" + }, + { + "name": "ClientTimeZone", + "data_type": "int" + }, + { + "name": "CodeVersion", + "data_type": "int" + }, + { + "name": "ConnectTiming", + "data_type": "int" + }, + { + "name": "CookieEnable", + "data_type": "int" + }, + { + "name": "CounterClass", + "data_type": "int" + }, + { + "name": "CounterID", + "data_type": "int" + }, + { + "name": "DNSTiming", + "data_type": "int" + }, + { + "name": "DontCountHits", + "data_type": "int" + }, + { + "name": "EventDate", + "data_type": "string" + }, + { + "name": "EventTime", + "data_type": "datetime" + }, + { + "name": "FUniqID", + "data_type": "string" + }, + { + "name": "FetchTiming", + "data_type": "int" + }, + { + "name": "FlashMajor", + "data_type": "int" + }, + { + "name": "FlashMinor", + "data_type": "int" + }, + { + 
"name": "FlashMinor2", + "data_type": "string" + }, + { + "name": "FromTag", + "data_type": "string" + }, + { + "name": "GoodEvent", + "data_type": "int" + }, + { + "name": "HID", + "data_type": "int" + }, + { + "name": "HTTPError", + "data_type": "int" + }, + { + "name": "HasGCLID", + "data_type": "int" + }, + { + "name": "HistoryLength", + "data_type": "int" + }, + { + "name": "HitColor", + "data_type": "string" + }, + { + "name": "IPNetworkID", + "data_type": "int" + }, + { + "name": "Income", + "data_type": "int" + }, + { + "name": "Interests", + "data_type": "int" + }, + { + "name": "IsArtifical", + "data_type": "int" + }, + { + "name": "IsDownload", + "data_type": "int" + }, + { + "name": "IsEvent", + "data_type": "int" + }, + { + "name": "IsLink", + "data_type": "int" + }, + { + "name": "IsMobile", + "data_type": "int" + }, + { + "name": "IsNotBounce", + "data_type": "int" + },{ + "name": "IsOldCounter", + "data_type": "int" + }, + { + "name": "IsParameter", + "data_type": "int" + }, + { + "name": "IsRefresh", + "data_type": "int" + }, + { + "name": "JavaEnable", + "data_type": "int" + }, + { + "name": "JavascriptEnable", + "data_type": "int" + }, + { + "name": "LocalEventTime", + "data_type": "datetime" + }, + { + "name": "MobilePhone", + "data_type": "int" + }, + { + "name": "MobilePhoneModel", + "data_type": "string" + }, + { + "name": "NetMajor", + "data_type": "int" + }, + { + "name": "NetMinor", + "data_type": "int" + }, + { + "name": "OS", + "data_type": "int" + }, + { + "name": "OpenerName", + "data_type": "int" + }, + { + "name": "OpenstatAdID", + "data_type": "string" + }, + { + "name": "OpenstatCampaignID", + "data_type": "string" + }, + { + "name": "OpenstatServiceName", + "data_type": "string" + }, + { + "name": "OpenstatSourceID", + "data_type": "string" + }, + { + "name": "OriginalURL", + "data_type": "string" + }, + { + "name": "PageCharset", + "data_type": "string" + }, + { + "name": "ParamCurrency", + "data_type": "string" + },{ + "name": 
"ParamCurrencyID", + "data_type": "int" + }, + { + "name": "ParamOrderID", + "data_type": "string" + }, + { + "name": "ParamPrice", + "data_type": "string" + }, + { + "name": "Params", + "data_type": "string" + }, + { + "name": "Referer", + "data_type": "string" + }, + { + "name": "RefererCategoryID", + "data_type": "int" + }, + { + "name": "RefererHash", + "data_type": "string" + }, + { + "name": "RefererRegionID", + "data_type": "int" + }, + { + "name": "RegionID", + "data_type": "int" + }, + { + "name": "RemoteIP", + "data_type": "int" + }, + { + "name": "ResolutionDepth", + "data_type": "int" + }, + { + "name": "ResolutionHeight", + "data_type": "int" + }, + { + "name": "ResolutionWidth", + "data_type": "int" + }, + { + "name": "ResponseEndTiming", + "data_type": "int" + }, + { + "name": "ResponseStartTiming", + "data_type": "int" + }, + { + "name": "Robotness", + "data_type": "int" + }, + { + "name": "SearchEngineID", + "data_type": "int" + }, + { + "name": "SearchPhrase", + "data_type": "string" + }, + { + "name": "SendTiming", + "data_type": "int" + }, + { + "name": "Sex", + "data_type": "int" + }, + { + "name": "SilverlightVersion1", + "data_type": "int" + }, + { + "name": "SilverlightVersion2", + "data_type": "int" + }, + { + "name": "SilverlightVersion3", + "data_type": "int" + },{ + "name": "SilverlightVersion4", + "data_type": "int" + }, + { + "name": "SocialAction", + "data_type": "string" + }, + { + "name": "SocialNetwork", + "data_type": "string" + }, + { + "name": "SocialSourceNetworkID", + "data_type": "int" + }, + { + "name": "SocialSourcePage", + "data_type": "string" + }, + { + "name": "Title", + "data_type": "string" + }, + { + "name": "TraficSourceID", + "data_type": "int" + }, + { + "name": "URL", + "data_type": "string" + }, + { + "name": "URLCategoryID", + "data_type": "int" + }, + { + "name": "URLHash", + "data_type": "string" + }, + { + "name": "URLRegionID", + "data_type": "int" + }, + { + "name": "UTMCampaign", + "data_type": "string" + 
}, + { + "name": "UTMContent", + "data_type": "string" + }, + { + "name": "UTMMedium", + "data_type": "string" + }, + { + "name": "UTMSource", + "data_type": "string" + }, + { + "name": "UTMTerm", + "data_type": "string" + }, + { + "name": "UserAgent", + "data_type": "int" + }, + { + "name": "UserAgentMajor", + "data_type": "int" + }, + { + "name": "UserAgentMinor", + "data_type": "string" + },{ + "name": "UserID", + "data_type": "int" + }, + { + "name": "WatchID", + "data_type": "string" + }, + { + "name": "WindowClientHeight", + "data_type": "int" + }, + { + "name": "WindowClientWidth", + "data_type": "int" + }, + { + "name": "WindowName", + "data_type": "int" + }, + { + "name": "WithHash", + "data_type": "int" + } + ] + } \ No newline at end of file
diff --git a/clickbench/ingestion.sh b/clickbench/ingestion.sh
new file mode 100644
index 0000000..1e7d29c
--- /dev/null
+++ b/clickbench/ingestion.sh
@@ -0,0 +1,59 @@
+#!/usr/bin/env bash
+#
+# ingestion.sh - download the ClickBench "hits" dataset, cut it into
+# JSON-array batches, POST every batch to the ingest endpoint and print
+# the wall-clock seconds spent on the POSTs.
+#
+# Optional environment overrides (defaults are the original values):
+#   INGEST_URL   ingest endpoint  (http://3.140.239.140:8000/api/v1/ingest)
+#   INGEST_AUTH  user:password    (admin:admin)
+#   STREAM_NAME  X-P-Stream value (hits)
+#   BATCH_LINES  lines per batch  (2500)
+
+# Abort if the download, unpack or batch preparation fails instead of
+# ingesting (and timing) partial data.
+set -euo pipefail
+
+INGEST_URL="${INGEST_URL:-http://3.140.239.140:8000/api/v1/ingest}"
+INGEST_AUTH="${INGEST_AUTH:-admin:admin}"
+STREAM_NAME="${STREAM_NAME:-hits}"
+BATCH_LINES="${BATCH_LINES:-2500}"
+
+# Download and unzip dataset
+wget https://datasets.clickhouse.com/hits_compatible/hits.json.gz
+gzip -d hits.json.gz
+split -l "${BATCH_LINES}" hits.json hits_
+
+# Turn each newline-delimited batch into one JSON array: add a comma to
+# every line except the last and wrap the result in "[" ... "]".
+for file in hits_*; do
+  # The temporary name must not match hits_*, otherwise a file left by an
+  # interrupted run would later be picked up as a batch.
+  tmp="tmp_${file}"
+  { echo "["; sed '$!s/$/,/' "$file"; echo "]"; } > "$tmp"
+  mv -- "$tmp" "$file"
+done
+
+failed=0
+start_time=$(date +%s)
+
+for file in hits_*; do
+  # --fail makes curl exit non-zero on an HTTP error status, so rejected
+  # batches are counted instead of passing unnoticed.
+  if ! curl --fail -H "Content-Type: application/json" \
+    -H "X-P-Stream: ${STREAM_NAME}" -k -XPOST -u "${INGEST_AUTH}" \
+    "${INGEST_URL}" --data-binary @"${file}"; then
+    echo "ingest failed for ${file}" >&2
+    failed=$((failed + 1))
+  fi
+done
+
+end_time=$(date +%s)
+total_time=$((end_time - start_time))
+
+echo "Total time: ${total_time} seconds"
+
+if (( failed > 0 )); then
+  echo "${failed} batch(es) failed to ingest" >&2
+  exit 1
+fi
diff --git a/clickbench/queries.sql b/clickbench/queries.sql new file mode 100644 index 0000000..62f7684 --- /dev/null +++ b/clickbench/queries.sql @@ -0,0 +1,43 @@ +SELECT COUNT(*) FROM hits; +SELECT COUNT(*) FROM hits WHERE "AdvEngineID"
<> 0; +SELECT SUM("AdvEngineID"), COUNT(*), AVG("ResolutionWidth") FROM hits; +SELECT AVG("UserID") FROM hits; +SELECT COUNT(DISTINCT "UserID") FROM hits; +SELECT COUNT(DISTINCT "SearchPhrase") FROM hits; +SELECT MIN("EventDate"::DATE), MAX("EventDate"::DATE) FROM hits; +SELECT "AdvEngineID", COUNT(*) FROM hits WHERE "AdvEngineID" <> 0 GROUP BY "AdvEngineID" ORDER BY COUNT(*) DESC; +SELECT "RegionID", COUNT(DISTINCT "UserID") AS u FROM hits GROUP BY "RegionID" ORDER BY u DESC LIMIT 10; +SELECT "RegionID", SUM("AdvEngineID"), COUNT(*) AS c, AVG("ResolutionWidth"), COUNT(DISTINCT "UserID") FROM hits GROUP BY "RegionID" ORDER BY c DESC LIMIT 10; +SELECT "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhoneModel" ORDER BY u DESC LIMIT 10; +SELECT "MobilePhone", "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhone", "MobilePhoneModel" ORDER BY u DESC LIMIT 10; +SELECT "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; +SELECT "SearchPhrase", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY u DESC LIMIT 10; +SELECT "SearchEngineID", "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "SearchPhrase" ORDER BY c DESC LIMIT 10; +SELECT "UserID", COUNT(*) FROM hits GROUP BY "UserID" ORDER BY COUNT(*) DESC LIMIT 10; +SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10; +SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" LIMIT 10; +SELECT "UserID", extract(minute FROM to_timestamp_seconds("EventTime")) AS m, "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", m, "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10; +SELECT "UserID" FROM hits WHERE "UserID" = 435090932899640449; +SELECT COUNT(*) FROM hits WHERE 
"URL" LIKE '%google%'; +SELECT "SearchPhrase", MIN("URL"), COUNT(*) AS c FROM hits WHERE "URL" LIKE '%google%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; +SELECT "SearchPhrase", MIN("URL"), MIN("Title"), COUNT(*) AS c, COUNT(DISTINCT "UserID") FROM hits WHERE "Title" LIKE '%Google%' AND "URL" NOT LIKE '%.google.%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE "URL" LIKE '%google%' ORDER BY to_timestamp_seconds("EventTime") LIMIT 10; +SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY to_timestamp_seconds("EventTime") LIMIT 10; +SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "SearchPhrase" LIMIT 10; +SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY to_timestamp_seconds("EventTime"), "SearchPhrase" LIMIT 10; +SELECT "CounterID", AVG(length("URL")) AS l, COUNT(*) AS c FROM hits WHERE "URL" <> '' GROUP BY "CounterID" HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE("Referer", '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length("Referer")) AS l, COUNT(*) AS c, MIN("Referer") FROM hits WHERE "Referer" <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM("ResolutionWidth"), SUM("ResolutionWidth" + 1), SUM("ResolutionWidth" + 2), SUM("ResolutionWidth" + 3), SUM("ResolutionWidth" + 4), SUM("ResolutionWidth" + 5), SUM("ResolutionWidth" + 6), SUM("ResolutionWidth" + 7), SUM("ResolutionWidth" + 8), SUM("ResolutionWidth" + 9), SUM("ResolutionWidth" + 10), SUM("ResolutionWidth" + 11), SUM("ResolutionWidth" + 12), SUM("ResolutionWidth" + 13), SUM("ResolutionWidth" + 14), SUM("ResolutionWidth" + 15), SUM("ResolutionWidth" + 16), SUM("ResolutionWidth" + 17), SUM("ResolutionWidth" + 18), SUM("ResolutionWidth" + 19), SUM("ResolutionWidth" + 20), SUM("ResolutionWidth" + 21), SUM("ResolutionWidth" + 22), SUM("ResolutionWidth" + 23), SUM("ResolutionWidth" + 24), SUM("ResolutionWidth" + 
25), SUM("ResolutionWidth" + 26), SUM("ResolutionWidth" + 27), SUM("ResolutionWidth" + 28), SUM("ResolutionWidth" + 29), SUM("ResolutionWidth" + 30), SUM("ResolutionWidth" + 31), SUM("ResolutionWidth" + 32), SUM("ResolutionWidth" + 33), SUM("ResolutionWidth" + 34), SUM("ResolutionWidth" + 35), SUM("ResolutionWidth" + 36), SUM("ResolutionWidth" + 37), SUM("ResolutionWidth" + 38), SUM("ResolutionWidth" + 39), SUM("ResolutionWidth" + 40), SUM("ResolutionWidth" + 41), SUM("ResolutionWidth" + 42), SUM("ResolutionWidth" + 43), SUM("ResolutionWidth" + 44), SUM("ResolutionWidth" + 45), SUM("ResolutionWidth" + 46), SUM("ResolutionWidth" + 47), SUM("ResolutionWidth" + 48), SUM("ResolutionWidth" + 49), SUM("ResolutionWidth" + 50), SUM("ResolutionWidth" + 51), SUM("ResolutionWidth" + 52), SUM("ResolutionWidth" + 53), SUM("ResolutionWidth" + 54), SUM("ResolutionWidth" + 55), SUM("ResolutionWidth" + 56), SUM("ResolutionWidth" + 57), SUM("ResolutionWidth" + 58), SUM("ResolutionWidth" + 59), SUM("ResolutionWidth" + 60), SUM("ResolutionWidth" + 61), SUM("ResolutionWidth" + 62), SUM("ResolutionWidth" + 63), SUM("ResolutionWidth" + 64), SUM("ResolutionWidth" + 65), SUM("ResolutionWidth" + 66), SUM("ResolutionWidth" + 67), SUM("ResolutionWidth" + 68), SUM("ResolutionWidth" + 69), SUM("ResolutionWidth" + 70), SUM("ResolutionWidth" + 71), SUM("ResolutionWidth" + 72), SUM("ResolutionWidth" + 73), SUM("ResolutionWidth" + 74), SUM("ResolutionWidth" + 75), SUM("ResolutionWidth" + 76), SUM("ResolutionWidth" + 77), SUM("ResolutionWidth" + 78), SUM("ResolutionWidth" + 79), SUM("ResolutionWidth" + 80), SUM("ResolutionWidth" + 81), SUM("ResolutionWidth" + 82), SUM("ResolutionWidth" + 83), SUM("ResolutionWidth" + 84), SUM("ResolutionWidth" + 85), SUM("ResolutionWidth" + 86), SUM("ResolutionWidth" + 87), SUM("ResolutionWidth" + 88), SUM("ResolutionWidth" + 89) FROM hits; +SELECT "SearchEngineID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> 
'' GROUP BY "SearchEngineID", "ClientIP" ORDER BY c DESC LIMIT 10; +SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10; +SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10; +SELECT "URL", COUNT(*) AS c FROM hits GROUP BY "URL" ORDER BY c DESC LIMIT 10; +SELECT 1, "URL", COUNT(*) AS c FROM hits GROUP BY 1, "URL" ORDER BY c DESC LIMIT 10; +SELECT "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3, COUNT(*) AS c FROM hits GROUP BY "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3 ORDER BY c DESC LIMIT 10; +SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::DATE >= '2013-07-01' AND "EventDate"::DATE <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "URL" <> '' GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10; +SELECT "Title", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::DATE >= '2013-07-01' AND "EventDate"::DATE <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "Title" <> '' GROUP BY "Title" ORDER BY PageViews DESC LIMIT 10; +SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::DATE >= '2013-07-01' AND "EventDate"::DATE <= '2013-07-31' AND "IsRefresh" = 0 AND "IsLink" <> 0 AND "IsDownload" = 0 GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT "TraficSourceID", "SearchEngineID", "AdvEngineID", CASE WHEN ("SearchEngineID" = 0 AND "AdvEngineID" = 0) THEN "Referer" ELSE '' END AS Src, "URL" AS Dst, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::DATE >= '2013-07-01' AND "EventDate"::DATE <= '2013-07-31' AND "IsRefresh" = 0 GROUP BY "TraficSourceID", "SearchEngineID", "AdvEngineID", Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT "URLHash",
"EventDate"::DATE, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::DATE >= '2013-07-01' AND "EventDate"::DATE <= '2013-07-31' AND "IsRefresh" = 0 AND "TraficSourceID" IN (-1, 6) AND "RefererHash" = 3594120000172545465 GROUP BY "URLHash", "EventDate"::DATE ORDER BY PageViews DESC LIMIT 10 OFFSET 100; +SELECT "WindowClientWidth", "WindowClientHeight", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::DATE >= '2013-07-01' AND "EventDate"::DATE <= '2013-07-31' AND "IsRefresh" = 0 AND "DontCountHits" = 0 AND "URLHash" = 2868770270353813622 GROUP BY "WindowClientWidth", "WindowClientHeight" ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; +SELECT DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) AS M, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::DATE >= '2013-07-14' AND "EventDate"::DATE <= '2013-07-15' AND "IsRefresh" = 0 AND "DontCountHits" = 0 GROUP BY DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) ORDER BY DATE_TRUNC('minute', M) LIMIT 10 OFFSET 1000;
diff --git a/clickbench/run_query.sh b/clickbench/run_query.sh
new file mode 100644
index 0000000..b4a4714
--- /dev/null
+++ b/clickbench/run_query.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+#
+# run_query.sh - send every query in queries.sql (one query per line) to
+# the query endpoint and append "<query number>,<elapsed ms>" to
+# result.csv. Each response body is written to stdout.
+#
+# Optional environment overrides (defaults are the original values):
+#   QUERY_URL    query endpoint  (http://3.140.239.140:8000/api/v1/query)
+#   QUERY_AUTH   user:password   (admin:admin)
+#   QUERY_START  startTime field (2024-08-22T00:00:00.000Z)
+#   QUERY_END    endTime field   (2024-08-22T23:00:00.000Z)
+
+QUERY_URL="${QUERY_URL:-http://3.140.239.140:8000/api/v1/query}"
+QUERY_AUTH="${QUERY_AUTH:-admin:admin}"
+QUERY_START="${QUERY_START:-2024-08-22T00:00:00.000Z}"
+QUERY_END="${QUERY_END:-2024-08-22T23:00:00.000Z}"
+
+QUERY_NUM=1
+# Feed the file to the loop by redirection; "|| [[ -n ... ]]" keeps the
+# last query when the file has no trailing newline.
+while IFS= read -r QUERY || [[ -n "${QUERY}" ]]; do
+  # A blank line is not a query; skip it without using up a number.
+  [[ -n "${QUERY}" ]] || continue
+
+  JSON=$(jq -n --arg query "${QUERY}" \
+    --arg start_ts "${QUERY_START}" --arg end_ts "${QUERY_END}" \
+    '{query: $query, startTime: $start_ts, endTime: $end_ts}')
+  start_time=$(date +%s%3N)
+
+  # </dev/null guards the loop's input against being read by curl. Only
+  # curl's own exit status is checked; an HTTP error status is not detected.
+  if ! curl -H "Content-Type: application/json" -k -XPOST \
+    -u "${QUERY_AUTH}" "${QUERY_URL}" --data "${JSON}" < /dev/null; then
+    echo "query ${QUERY_NUM}: curl failed" >&2
+  fi
+
+  end_time=$(date +%s%3N)
+  elapsed_time=$((end_time - start_time))
+
+  echo "${QUERY_NUM},${elapsed_time}" >> result.csv
+
+  QUERY_NUM=$((QUERY_NUM + 1))
+
+done < queries.sql