-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtsworkload_smw3lt
130 lines (120 loc) · 4.79 KB
/
tsworkload_smw3lt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# Copyright (c) 2020 YCSB contributors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You
# may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License. See accompanying
# LICENSE file.
# Yahoo! Cloud Serving Benchmark
#
# Workload Definition: Smart Metering Workload 3 (SMW3LT)
#
# This workload was created as part of a bachelor thesis
# at the Technical University of Berlin, by Michael Smart
# https://github.com/smartygus
#
# Data to load as basis:
#
# There are 50000 time series, each with one year of
# data at half-hourly (1800 seconds) interval.
# We are using a fixed starting time stamp representing
# 1/1/2019 at midnight.
# This amounts to a total of:
# 50000 (time series) * 2 (points per hour) * 24 (hours in a day) * 365 (days in 2019) = 876 million records
#
# Each time series has three tags:
#
# - se_segment: representing some socio-economic category defined by the provider
# - cardinality = 5 (ie. there are 5 different categories)
# - pricing_strategy: some customers have different pricing strategies
# - cardinality = 2 (ie. there are 2 different strategies, eg. fixed, and dynamic)
# - household_id: an ID representing the household. The ID is unique within the context of the other two tags
# - cardinality = 5000
#
# Run phase of the workload:
#
# This is an "online" workload, in that it assumes that
# new measurements are incoming at the same time as the
# existing data is being queried. Therefore this workload
# includes an INSERT query in the run phase, as well as
# one other SCAN query.
# The idea behind this workload is from the perspective of
# an energy provider. We assume that they have secondary
# systems that perform further analysis for things like
# fraud detection, and these analysis systems need to
# constantly query the up-to-date data to perform their tasks.
# The query from the analysis system always queries for
# the most recent 24 hours worth of data per time series,
# in case a smart meter was offline for a while due to an
# outage, and sends a bunch of data after-the-fact.
# The proportions (50% INSERTs, 50% SCANS) have been calculated
# based on the fact that each time series sends a new data point
# every half hour, and the assumption that the analysis systems
# would also query each time series approximately every half
# hour, to ensure they always have the most up-to-date data
# for any time-critical analysis.
# Because the secondary analysis systems work with the
# raw time series data, there are no additional
# aggregation or summary operations on the SCAN query
# in this workload.
# Workload implementation class to run.
workload=site.ycsb.workloads.TimeSeriesWorkload
# Total records for the load phase:
# 50000 * 2 * 24 * 365 = 876,000,000.
recordcount=876000000
# If setting a pseudo-real-world target rate of ~56req/sec,
# then a 20 minute benchmark run would be 67200 operations
# total. This could serve well for measuring latency.
# If however one wanted to test throughput at faster-than-
# real-time, then the operationcount could be increased
# accordingly.
operationcount=67200
# Same value as recordcount.
insertcount=876000000
# 1546300800 is 2019-01-01 00:00:00 UTC in Unix epoch seconds,
# i.e. the fixed starting timestamp of the loaded data.
insertstart=1546300800
# This workload is "online", so during the run phase
# there are INSERTs as well as SCANs.
# Therefore we need to specify
# an inserttransactionstart timestamp
# Here we choose 1/1/2020 @ midnight, because this
# is the end of the data loaded in the load phase
# and would therefore represent a continuation of
# the measurements from that point.
# (1577836800 is 2020-01-01 00:00:00 UTC in Unix epoch seconds.)
inserttransactionstart=1577836800
# We only have a single metric, so we are
# dividing up the write load into groups of
# individual time series instead of groups
# of metrics
threadedwritedistribution=timeseries
# NOTE(review): presumably the length of the metric name
# ('kwh' is 3 characters) - confirm against TimeSeriesWorkload.
fieldlength=3
# single metric: 'kwh'
fieldcount=1
# three tags, "real world" names: se_segment, pricing_strategy, household_id
tagcount=3
# One cardinality per tag, in the order listed above.
# 5 * 2 * 5000 = 50000 distinct time series.
tagcardinality=5,2,5000
# NOTE(review): sparsity and both delay settings are zero;
# presumably this means no gaps and no delayed series in the
# generated data - confirm against TimeSeriesWorkload.
sparsity=0.0
delayedseries=0.0
delayedintervals=0
# Timestamp values in this file are expressed in seconds.
timestampunits=SECONDS
# one measurement every half hour
timestampinterval=1800
randomtimeseriesorder=false
# SCANs make up half of the run-phase operations;
# INSERTs (insertproportion) make up the other half.
scanproportion=0.5
# Each query should have a random selection of each tag,
# selecting one individual time series per query.
scantags=0,0,0
# A negative starting timestamp is a special notation
# indicating that the timestamp should be this many
# units behind the most-currently written timestamp
# from the INSERT queries, so that the most recent
# data is always being queried
# NOTE(review): this key carries a numeric infix ("scan1")
# while the other scan keys in this file do not
# (scanproportion, scantags, scanquerytimespan) - confirm
# this is the property name the workload actually reads.
scan1startts=-86400
# 86400 seconds = 24 hours, the window covered by each SCAN.
scanquerytimespan=86400
scanqueryrandomtimespan=false
# INSERTs to represent new measurements incoming
insertproportion=0.50
# READ and UPDATE operations are not part of this workload.
readproportion=0.00
updateproportion=0.00