Skip to content

Commit

Permalink
PERF-3056 Add workloads for fragmented inserts in presence of columns…
Browse files Browse the repository at this point in the history
…tore index (#787)
  • Loading branch information
IrinaYatsenko authored Dec 16, 2022
1 parent 7a29faf commit f449258
Show file tree
Hide file tree
Showing 2 changed files with 349 additions and 0 deletions.
182 changes: 182 additions & 0 deletions src/workloads/query/CsiFragmentedInsertsFlat.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
# Workload metadata: schema version, owning team, description and search keywords.
SchemaVersion: 2018-07-01
Owner: "@mongodb/query-execution"
Description: |
  This workload compares performance of inserts into a collection with only the default _id index,
  and in presence of the columnstore index. It uses an artificial data set with a wide overall
  schema and narrow individual objects to model fragmented access to CSI, which clusters entries by
  path. The data size is relatively small (1e6 documents yield ~175MB data size and ~105MB storage
  size).
  We would like to be able to correlate the results of this workload with the similar one that uses
  nested data (CsiFragmentedInsertsNested.yml). Please make sure to update both when making changes.
Keywords:
- columnstore
- insert

# Automatic scheduling filter: one 'When' clause that names the 'standalone-all-feature-flags'
# setup under '$eq' and the v4.0 - v6.0 branches under '$neq'.
# NOTE(review): presumably this means "run on that setup, on every branch except the listed
# ones" - confirm against the AutoRun documentation.
AutoRun:
- When:
    mongodb_setup:
      $eq:
      - standalone-all-feature-flags
    branch_name:
      $neq:
      - v4.0
      - v4.2
      - v4.4
      - v5.0
      - v6.0

# Connection options for the 'Default' client.
Clients:
  Default:
    QueryOptions:
      # Allow for longer duration since index builds may take a while.
      socketTimeoutMS: 600_000  # = 10 min
      connectTimeoutMS: 600_000

# Shared constants. Each value is exposed through a YAML anchor and referenced by alias from the
# document template, the actor template and the actors.
GlobalDefaults:
  # Highest phase number; every actor below passes it as 'NopInPhasesUpTo'.
  MaxPhases: &maxPhases 5
  Database: &db csiFragmentedInsertsFlat

  # The Loader actor creates collections named "Collection<N>" where N corresponds to the thread's
  # number. We'll use a single collection, created by a single thread, so it becomes 'Collection0'.
  Collection: &coll Collection0

  # If modifying any of these parameters, please review 'CsiFragmentedInsertsNested.yml' to
  # ensure that the results of these two workloads can still be correlated.
  DocumentCount: &docCount 1e6
  SchemaWidth: &schemaWidth 10000
  ObjectWidth: &objectWidth 10
  SampleSize: &sampleSize 10

# Template of the documents generated by the Loader actor (referenced via the 'document' anchor).
# A generated document looks like:
#   {
#     _id: ObjectId(...),
#     root: {
#       x271: NumberInt(9917),
#       x6305: NumberInt(11),
#       x8: NumberInt(1022),
#       <7 more fields like this>
#     }
#   }
Document: &document
  root:
    ^Object:
      withNEntries: *objectWidth

      # We are using uniform distribution of fields to make the sampling more stable.
      havingKeys:
        ^FormatString:
          format: "x%d"
          withArgs:
          - ^RandomInt: {min: 0, max: *schemaWidth}

      # We don't expect the actual values to matter.
      andValues: {^RandomInt: {min: 0, max: *schemaWidth}}

      # Occasionally, the key generator might produce the same key name. For this workload,
      # it's OK to have some of the objects with fewer than 'objectWidth' fields (and we don't
      # want to test parsing of the duplicated keys for the index as it's not a common user
      # scenario).
      duplicatedKeys: skip

ActorTemplates:
# We want to be able to compare results from the same test between the runs in presence of different
# indexes. For this we'll have to instantiate the same actor multiple times with a unique name.
- TemplateName: InsertFromSample
  Config:
    Name: {^Parameter: {Name: "Name", Default: "Insert"}}
    Type: SamplingLoader
    Threads: {^Parameter: {Name: "Threads", Default: 1}}
    Phases:
      OnlyActiveInPhases:
        # NOTE(review): the default phase (1024) is far above MaxPhases, presumably so that an
        # instance which does not set 'OnlyActiveInPhase' never becomes active - confirm.
        Active: [{^Parameter: {Name: "OnlyActiveInPhase", Default: 1024}}]
        NopInPhasesUpTo: *maxPhases
        PhaseConfig:
          Database: *db
          Collection: *coll

          # No more than sampleSize*objectWidth paths can be "touched" by a sample.
          SampleSize: *sampleSize

          # When the batch size is the same as sample size, each sampled document will be used in
          # the batch exactly once, and this will be repeated for the number of batches. We expect
          # that the first batch might be affected by "cold" caches but the subsequent batches
          # would be fully warmed up.
          InsertBatchSize: *sampleSize

          # The 'SamplingLoader' actor re-samples on repeat, meaning that it would get a new set of
          # documents likely with different paths and values. This makes each repeat hit different
          # parts of the indexes, causing a long warm up tail. To avoid this we test with a single
          # repeat but multiple batches.
          Repeat: {^Parameter: {Name: "Repeats", Default: 1}}

          # Instances of the template may override the number of batches (500 when omitted) to
          # make it clear what they are testing with respect to cold/warm state. The stats per
          # batch will be available in the 'IndividualBulkInsert' measurement.
          Batches: {^Parameter: {Name: "Batches", Default: 500}}

Actors:
# Phase 0: a single-threaded Loader fills one collection with 'DocumentCount' documents built from
# the 'document' template.
- Name: Loader
  Type: Loader
  Threads: 1
  Phases:
    OnlyActiveInPhases:
      Active: [0]
      NopInPhasesUpTo: *maxPhases
      PhaseConfig:
        # Cannot have more threads than the actor itself.
        Threads: 1
        Repeat: 1
        Database: *db
        CollectionCount: 1
        DocumentCount: *docCount
        BatchSize: 1000
        Document: *document

# Phases 1 and 4: run a QuiesceActor once, i.e. right after the initial load (phase 0) and right
# after the index build (phase 3), before each of the measured insert phases.
# NOTE(review): 'Quiese' looks like a typo for 'Quiesce'; the name is left unchanged here because
# actor names are runtime identifiers.
- Name: Quiese
  Type: QuiesceActor
  Threads: 1
  Database: *db
  Phases:
    OnlyActiveInPhases:
      Active: [1, 4]
      NopInPhasesUpTo: *maxPhases
      PhaseConfig:
        Repeat: 1

# Phase 2 (before the columnstore index exists): do multiple inserts of documents with the same
# schema. The performance of the first insert might be affected by the inserts in the previous
# stages, but it should stabilize after that, so with a large number of batches the cost should
# amortize sufficiently for the P90 latency to be similar to P50.
# Target measurements: Latency50thPercentile
- ActorFromTemplate:
    TemplateName: InsertFromSample
    TemplateParameters:
      Name: NoIndexes
      OnlyActiveInPhase: 2

# Phase 3: issue a single 'createIndexes' command that builds a columnstore index named 'csi' over
# all paths ("$**") of the loaded collection. Reported under 'BulkBuildColumnStoreIndex'.
- Name: BuildColumnStoreIndex
  Type: RunCommand
  Threads: 1
  Phases:
    OnlyActiveInPhases:
      Active: [3]
      NopInPhasesUpTo: *maxPhases
      PhaseConfig:
        Repeat: 1
        Database: *db
        Operations:
        - OperationMetricsName: BulkBuildColumnStoreIndex
          OperationName: RunCommand
          OperationCommand:
            createIndexes: *coll
            indexes:
            - key: {"$**": "columnstore"}
              name: csi

# Phase 5: repeat the same inserts as in the no-index phase, now in presence of CSI.
- ActorFromTemplate:
    TemplateName: InsertFromSample
    TemplateParameters:
      Name: Csi
      OnlyActiveInPhase: 5


167 changes: 167 additions & 0 deletions src/workloads/query/CsiFragmentedInsertsNested.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
# Workload metadata: schema version, owning team, description and search keywords.
SchemaVersion: 2018-07-01
Owner: "@mongodb/query-execution"
Description: |
  This workload compares performance of inserts into a collection with only the default _id index
  and in presence of a full columnstore index. We are not comparing to wildcard index because the
  nested data makes creating of a wildcard index too slow. Before changing any of the parameters in
  this workload please make sure the results can be correlated with 'CsiFragmentedInsertsFlat.yml'.
  As the approach in this workload is the same as in 'CsiFragmentedInsertsFlat.yml' with the exception
  of data used by the loader (and not comparing to the wildcard index), comments are intentionally
  omitted, please refer to the "flat" workload for the details.
Keywords:
- columnstore
- insert

# Automatic scheduling filter: one 'When' clause that names the 'standalone-all-feature-flags'
# setup under '$eq' and the v4.0 - v6.0 branches under '$neq'.
# NOTE(review): presumably this means "run on that setup, on every branch except the listed
# ones" - confirm against the AutoRun documentation.
AutoRun:
- When:
    mongodb_setup:
      $eq:
      - standalone-all-feature-flags
    branch_name:
      $neq:
      - v4.0
      - v4.2
      - v4.4
      - v5.0
      - v6.0

# Connection options for the 'Default' client; both timeouts are raised to 10 minutes.
Clients:
  Default:
    QueryOptions:
      socketTimeoutMS: 600_000  # = 10 min
      connectTimeoutMS: 600_000

# Shared constants. Each value is exposed through a YAML anchor and referenced by alias from the
# document template, the actor template and the actors.
GlobalDefaults:
  MaxPhases: &maxPhases 5
  Database: &db csiFragmentedInsertsNested
  Collection: &coll Collection0
  DocumentCount: &docCount 1e6
  SchemaWidth: &schemaWidth 10000
  ObjectWidth: &objectWidth 5  # with two nested paths, get 10 paths_per_object
  SampleSize: &sampleSize 10

# Template of the documents generated by the Loader actor (referenced via the 'document' anchor).
# Generate documents that would produce non-trivial array info strings. When these docs
# are inserted, we want them to affect about 10 separated locations in the index, which is
# achieved by uniform distribution of xN fields and because for each xN both "a" and "b"
# subpaths are likely to be generated.
# The documents will look like:
#   {
#     _id: ObjectId("63890d0df7b608a2d303b941"),
#     root: [
#       {
#         x8372 : [{a: [42, *]}, {b: {obj: *}}, {a: [42, *], b: [42, *]}, {b: *}, {b: *}],
#         <4 more xN fields>
#       }
#     ]
#   }
Document: &document
  root:
    ^Array:
      of:
        ^Object:
          withNEntries: *objectWidth
          havingKeys:
            ^FormatString:
              format: "x%d"
              withArgs:
              - ^RandomInt: {min: 0, max: *schemaWidth}
          andValues:
            ^Array:
              # Create an object with either "a" or "b" key, or both. This is achieved by using
              # 'duplicatedKeys: skip' parameter, which would ignore the second generated key if
              # it has the same name as the first, so with probability 1/2 we'll get both fields
              # and with probability 1/4 each of {a: } and {b: }.
              of:
                ^Object:
                  withNEntries: 2
                  havingKeys: {^RandomString: {length: 1, alphabet: ab}}
                  andValues:
                    ^Choose:
                      from:
                      - ^RandomInt: {min: 0, max: *schemaWidth}
                      - [42, {^RandomInt: {min: 0, max: *schemaWidth}}]
                      - obj: {^RandomInt: {min: 0, max: *schemaWidth}}
                  duplicatedKeys: skip
              # 5 elements make it almost certain that both "a" and "b" are used in the array to
              # yield ~10 paths per object.
              number: 5
          duplicatedKeys: skip
      number: 1

ActorTemplates:
# SamplingLoader template instantiated once per measured insert phase. Name, thread count, active
# phase, repeat count and number of batches are template parameters with the defaults shown.
- TemplateName: InsertFromSample
  Config:
    Name: {^Parameter: {Name: "Name", Default: "Insert"}}
    Type: SamplingLoader
    Threads: {^Parameter: {Name: "Threads", Default: 1}}
    Phases:
      OnlyActiveInPhases:
        Active: [{^Parameter: {Name: "OnlyActiveInPhase", Default: 1024}}]
        NopInPhasesUpTo: *maxPhases
        PhaseConfig:
          Database: *db
          Collection: *coll
          SampleSize: *sampleSize
          # Batch size equals the sample size (both alias 'sampleSize').
          InsertBatchSize: *sampleSize
          Repeat: {^Parameter: {Name: "Repeats", Default: 1}}
          Batches: {^Parameter: {Name: "Batches", Default: 500}}

Actors:
# Phase 0: a single-threaded Loader fills one collection with 'DocumentCount' documents built from
# the 'document' template.
- Name: Loader
  Type: Loader
  Threads: 1
  Phases:
    OnlyActiveInPhases:
      Active: [0]
      NopInPhasesUpTo: *maxPhases
      PhaseConfig:
        Threads: 1
        Repeat: 1
        Database: *db
        CollectionCount: 1
        DocumentCount: *docCount
        BatchSize: 1000
        Document: *document

# Phases 1 and 4: run a QuiesceActor once, i.e. right after the initial load (phase 0) and right
# after the index build (phase 3), before each of the measured insert phases.
# NOTE(review): 'Quiese' looks like a typo for 'Quiesce'; the name is left unchanged here because
# actor names are runtime identifiers.
- Name: Quiese
  Type: QuiesceActor
  Threads: 1
  Database: *db
  Phases:
    OnlyActiveInPhases:
      Active: [1, 4]
      NopInPhasesUpTo: *maxPhases
      PhaseConfig:
        Repeat: 1

# Phase 2: measured inserts from the 'InsertFromSample' template before the columnstore index is
# built (the index build runs in phase 3).
- ActorFromTemplate:
    TemplateName: InsertFromSample
    TemplateParameters:
      Name: NoIndexes
      OnlyActiveInPhase: 2

# Phase 3: issue a single 'createIndexes' command that builds a columnstore index named 'csi' over
# all paths ("$**") of the loaded collection. Reported under 'BulkBuildColumnStoreIndex'.
- Name: BuildColumnStoreIndex
  Type: RunCommand
  Threads: 1
  Phases:
    OnlyActiveInPhases:
      Active: [3]
      NopInPhasesUpTo: *maxPhases
      PhaseConfig:
        Repeat: 1
        Database: *db
        Operations:
        - OperationMetricsName: BulkBuildColumnStoreIndex
          OperationName: RunCommand
          OperationCommand:
            createIndexes: *coll
            indexes:
            - key: {"$**": "columnstore"}
              name: csi

# Phase 5: the same measured inserts as in phase 2, now with the columnstore index in place.
- ActorFromTemplate:
    TemplateName: InsertFromSample
    TemplateParameters:
      Name: Csi
      OnlyActiveInPhase: 5


0 comments on commit f449258

Please sign in to comment.