tsworkload_smw3lt

# Copyright (c) 2020 YCSB contributors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You
# may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License. See accompanying
# LICENSE file.

# Yahoo! Cloud System Benchmark
#
# Workload Definition: Smart Metering Workload 3 (SMW3LT)
#
# This workload was created as part of a bachelor thesis
# at the Technical University of Berlin, by Michael Smart
# https://github.com/smartygus
#
# Data to load as basis:
#
# There are 50000 time series, each with one year of
# data at half-hourly (1800 seconds) interval.
# We are using a fixed starting time stamp representing
# 1/1/2019 at midnight.
# This amounts to a total of:
# 50000 (time series) * 2 (points per hour) * 24 (hours in a day) * 365 (days in 2019) = 876 million records
#
# Each time series has three tags:
#
# - se_segment: representing some socio-economic category defined by the provider
#   - cardinality = 5 (ie. there are 5 different categories)
# - pricing_strategy: some customers have different pricing strategies
#   - cardinality = 2 (ie. there are 2 different strategies, eg. fixed, and dynamic)
# - household_id: an ID representing the household. The ID is unique within the context of the other two tags
#   - cardinality = 5000
#
# Run phase of the workload:
#
# This is an "online" workload, in that it assumes that 
# new measurements are incoming at the same time as the
# existing data is being queried. Therefore this workload
# includes an INSERT query in the run phase, as well as
# one other SCAN query.
# The idea behind this workload is from the perspective of
# an energy provider. We assume that they have secondary
# systems that perform further analysis for things like
# fraud detection, and these analysis systems need to
# constantly query the up-to-date data to perform their tasks.
# The query from the analysis system always queries for 
# the most recent 24 hours worth of data per time series,
# in case a smart meter was offline for a while due to an
# outage, and sends a bunch of data after-the-fact.
# The proportions (50% INSERTs, 50% SCANs) have been calculated
# based on the fact that each time series sends a new data point
# every half hour, and the assumption that the analysis systems
# would also query each time series approximately every half
# hour, to ensure they always have the most up-to-date data
# for any time-critical analysis.
# Because the secondary analysis systems work with the
# raw time series data, there are no additional
# aggregation or summary operations on the SCAN query
# in this workload.
# Workload implementation class that this configuration is written for.
workload=site.ycsb.workloads.TimeSeriesWorkload

# Equals the record total derived in the header:
# 50000 time series * 2 * 24 * 365 points = 876000000
recordcount=876000000
# If setting a pseudo-real-world target rate of ~56 req/sec
# (50000 time series / 1800 s is roughly 28 INSERTs/sec, and the
# 50/50 operation mix contributes about as many SCANs),
# then a 20 minute benchmark run would be 56 * 1200 s = 67200
# operations total. This could serve well for measuring latency.
# If however one wanted to test throughput at faster-than-
# real-time, then the operationcount could be increased
# accordingly.
operationcount=67200
# Same value as recordcount.
insertcount=876000000
# 1546300800 is 2019-01-01 00:00:00 UTC in epoch seconds, i.e. the
# fixed starting time stamp (1/1/2019 at midnight) described in the
# header.
insertstart=1546300800

# This workload is "online", so during the run phase
# there are INSERTs as well as SCANs.
# Therefore we need to specify
# an inserttransactionstart timestamp.
# Here we choose 1/1/2020 @ midnight, because this
# is the end of the data loaded in the load phase
# and would therefore represent a continuation of
# the measurements from that point.
# 1577836800 is 2020-01-01 00:00:00 UTC in epoch seconds,
# which is insertstart (1546300800) + 365 * 86400.
inserttransactionstart=1577836800

# We only have a single metric (see fieldcount), so we
# divide up the write load into groups of individual
# time series instead of groups of metrics.
threadedwritedistribution=timeseries

# NOTE(review): presumably the length of the metric name in
# characters ('kwh' is 3 characters) - confirm against the
# workload class.
fieldlength=3
# single metric: 'kwh'
fieldcount=1
# three tags, "real world" names: se_segment, pricing_strategy, household_id
tagcount=3
# One cardinality per tag, in the order listed above:
# 5 (se_segment), 2 (pricing_strategy), 5000 (household_id).
# 5 * 2 * 5000 = 50000, the number of time series stated in the header.
tagcardinality=5,2,5000

# Sparsity and delay settings are all zero for this workload.
# NOTE(review): presumably this means every time series has a
# data point at every interval with no delayed series, which is
# what the 876000000 record total in the header assumes - confirm
# against the workload class documentation.
sparsity=0.0
delayedseries=0.0
delayedintervals=0

# Timestamps are expressed in seconds; the epoch values and
# 86400-second spans used elsewhere in this file are in the same unit.
timestampunits=SECONDS
# one measurement every half hour (1800 seconds)
timestampinterval=1800
# NOTE(review): presumably keeps the time series in a fixed rather
# than randomised order - confirm against the workload class.
randomtimeseriesorder=false

# Half of the run-phase operations are SCANs
# (the other half are INSERTs, see insertproportion).
scanproportion=0.5
# Each query should have a random selection of each tag,
# selecting one individual time series per query.
scantags=0,0,0
# A negative starting timestamp is a special notation
# indicating that the timestamp should be this many
# units behind the most-currently written timestamp
# from the INSERT queries, so that the most recent
# data is always being queried.
# -86400 seconds = 24 hours back.
# NOTE(review): this key is spelled "scan1startts" while the
# neighbouring scan keys carry no number ("scantags",
# "scanquerytimespan"). Confirm that the workload class reads
# this exact key name; if it does not, the setting has no effect.
scan1startts=-86400
# 86400 seconds = 24 hours, the same magnitude as the start offset,
# so each SCAN covers the most recent 24 hours of one time series.
scanquerytimespan=86400
scanqueryrandomtimespan=false

# INSERTs to represent new measurements incoming.
# Together with scanproportion=0.5 the proportions sum to 1.0.
insertproportion=0.50

# READ and UPDATE operations are not part of this workload.
readproportion=0.00
updateproportion=0.00