diff --git a/src/workloads/query/CsiFragmentedInsertsFlat.yml b/src/workloads/query/CsiFragmentedInsertsFlat.yml
new file mode 100644
index 0000000000..18c618fb11
--- /dev/null
+++ b/src/workloads/query/CsiFragmentedInsertsFlat.yml
@@ -0,0 +1,182 @@
+SchemaVersion: 2018-07-01
+Owner: "@mongodb/query-execution"
+Description: |
+  This workload compares performance of inserts into a collection with only the default _id index,
+  and in presence of the columnstore index. It uses an artificial data set with a wide overall
+  schema and narrow individual objects to model fragmented access to CSI, which clusters entries by
+  path. The data size is relatively small (1e6 documents yield ~175MB data size and ~105MB storage
+  size).
+  We would like to be able to correlate the results of this workload with the similar one that uses
+  nested data (CsiFragmentedInsertsNested.yml). Please make sure to update both when making changes.
+
+Keywords:
+- columnstore
+- insert
+
+AutoRun:
+- When:
+    mongodb_setup:
+      $eq:
+      - standalone-all-feature-flags
+    branch_name:
+      $neq:
+      - v4.0
+      - v4.2
+      - v4.4
+      - v5.0
+      - v6.0
+
+Clients:
+  Default:
+    QueryOptions:
+      # Allow for longer duration since index builds may take a while.
+      socketTimeoutMS: 600_000  # = 10 min
+      connectTimeoutMS: 600_000
+
+GlobalDefaults:
+  MaxPhases: &maxPhases 5
+  Database: &db csiFragmentedInsertsFlat
+
+  # The Loader actor creates collections named "Collection<N>" where N corresponds to the thread's
+  # number. We'll use a single collection, created by a single thread, so it becomes 'Collection0'.
+  Collection: &coll Collection0
+
+  # If modifying any of these parameters, please review 'CsiFragmentedInsertsNested.yml' to
+  # ensure that the results of these two workloads can still be correlated.
+  DocumentCount: &docCount 1e6
+  SchemaWidth: &schemaWidth 10000
+  ObjectWidth: &objectWidth 10
+  SampleSize: &sampleSize 10
+
+  Document: &document
+    # {
+    #   _id: ObjectId(...),
+    #   root: {
+    #     x271: NumberInt(9917),
+    #     x6305: NumberInt(11),
+    #     x8: NumberInt(1022),
+    #     <7 more fields like this>
+    #   }
+    # }
+    root: {^Object: {
+      withNEntries: *objectWidth,
+
+      # We are using uniform distribution of fields to make the sampling more stable.
+      havingKeys: {^FormatString: {
+        "format": "x%d",
+        "withArgs": [{^RandomInt: {min: 0, max: *schemaWidth}}]
+      }},
+
+      # We don't expect the actual values to matter.
+      andValues: {^RandomInt: {min: 0, max: *schemaWidth}},
+
+      # Occasionally, the key generator might produce the same key name. For this workload,
+      # it's OK to have some of the objects with fewer than 'objectWidth' fields (and we don't
+      # want to test parsing of the duplicated keys for the index as it's not a common user
+      # scenario).
+      duplicatedKeys: skip
+    }}
+
+ActorTemplates:
+# We want to be able to compare results from the same test between the runs in presence of different
+# indexes. For this we'll have to instantiate the same actor multiple times with a unique name.
+- TemplateName: InsertFromSample
+  Config:
+    Name: {^Parameter: {Name: "Name", Default: "Insert"}}
+    Type: SamplingLoader
+    Threads: {^Parameter: {Name: "Threads", Default: 1}}
+    Phases:
+      OnlyActiveInPhases:
+        Active: [{^Parameter: {Name: "OnlyActiveInPhase", Default: 1024}}]
+        NopInPhasesUpTo: *maxPhases
+        PhaseConfig:
+          Database: *db
+          Collection: *coll
+
+          # No more than sampleSize*objectWidth paths can be "touched" by a sample.
+          SampleSize: *sampleSize
+
+          # When the batch size is the same as sample size, each sampled document will be used in
+          # the batch exactly once, and this will be repeated for the number of batches. We expect
+          # that the first batch might be affected by "cold" caches but the subsequent batches
+          # would be fully warmed up.
+          InsertBatchSize: *sampleSize
+
+          # The 'SamplingLoader' actor re-samples on repeat, meaning that it would get a new set of
+          # documents likely with different paths and values. This makes each repeat hit different
+          # parts of the indexes, causing a long warm up tail. To avoid this we test with a single
+          # repeat but multiple batches.
+          Repeat: {^Parameter: {Name: "Repeats", Default: 1}}
+
+          # The instances of the template must specify the number of batches to make it clear what
+          # they are testing with respect to cold/warm state. The stats per batch will be available in the
+          # 'IndividualBulkInsert' measurement.
+          Batches: {^Parameter: {Name: "Batches", Default: 500}}
+
+Actors:
+- Name: Loader
+  Type: Loader
+  Threads: 1
+  Phases:
+    OnlyActiveInPhases:
+      Active: [0]
+      NopInPhasesUpTo: *maxPhases
+      PhaseConfig:
+        # Cannot have more threads than the actor itself.
+        Threads: 1
+        Repeat: 1
+        Database: *db
+        CollectionCount: 1
+        DocumentCount: *docCount
+        BatchSize: 1000
+        Document: *document
+
+- Name: Quiese  # NOTE(review): likely a typo for "Quiesce"; renaming may change result labels - confirm.
+  Type: QuiesceActor
+  Threads: 1
+  Database: *db
+  Phases:
+    OnlyActiveInPhases:
+      Active: [1, 4]
+      NopInPhasesUpTo: *maxPhases
+      PhaseConfig:
+        Repeat: 1
+
+# Do multiple inserts of documents with the same schema. The performance of the first insert might
+# be affected by the inserts in the previous stages, but it should stabilize after that so, by using
+# a large number of batches, it should amortize sufficiently to have P90 similar to P50 for the
+# latency.
+# Target measurements: Latency50thPercentile
+- ActorFromTemplate:
+    TemplateName: InsertFromSample
+    TemplateParameters:
+      Name: NoIndexes
+      OnlyActiveInPhase: 2
+
+- Name: BuildColumnStoreIndex
+  Type: RunCommand
+  Threads: 1
+  Phases:
+    OnlyActiveInPhases:
+      Active: [3]
+      NopInPhasesUpTo: *maxPhases
+      PhaseConfig:
+        Repeat: 1
+        Database: *db
+        Operations:
+        - OperationMetricsName: BulkBuildColumnStoreIndex
+          OperationName: RunCommand
+          OperationCommand:
+            createIndexes: *coll
+            indexes:
+            - key: {"$**": "columnstore"}
+              name: csi
+
+# Repeat the same tests as with no index in presence of CSI.
+- ActorFromTemplate:
+    TemplateName: InsertFromSample
+    TemplateParameters:
+      Name: Csi
+      OnlyActiveInPhase: 5
+
+
diff --git a/src/workloads/query/CsiFragmentedInsertsNested.yml b/src/workloads/query/CsiFragmentedInsertsNested.yml
new file mode 100644
index 0000000000..e75a8e4009
--- /dev/null
+++ b/src/workloads/query/CsiFragmentedInsertsNested.yml
@@ -0,0 +1,167 @@
+SchemaVersion: 2018-07-01
+Owner: "@mongodb/query-execution"
+Description: |
+  This workload compares performance of inserts into a collection with only the default _id index
+  and in presence of a full columnstore index. We are not comparing to wildcard index because the
+  nested data makes creating of a wildcard index too slow. Before changing any of the parameters in
+  this workload please make sure the results can be correlated with 'CsiFragmentedInsertsFlat.yml'.
+  As the approach in this workload is the same as in 'CsiFragmentedInsertsFlat.yml' with the exception
+  of data used by the loader (and not comparing to the wildcard index), comments are intentionally
+  omitted, please refer to the "flat" workload for the details.
+
+Keywords:
+- columnstore
+- insert
+
+AutoRun:
+- When:
+    mongodb_setup:
+      $eq:
+      - standalone-all-feature-flags
+    branch_name:
+      $neq:
+      - v4.0
+      - v4.2
+      - v4.4
+      - v5.0
+      - v6.0
+
+Clients:
+  Default:
+    QueryOptions:
+      socketTimeoutMS: 600_000  # = 10 min
+      connectTimeoutMS: 600_000
+
+GlobalDefaults:
+  MaxPhases: &maxPhases 5
+  Database: &db csiFragmentedInsertsNested
+  Collection: &coll Collection0
+  DocumentCount: &docCount 1e6
+  SchemaWidth: &schemaWidth 10000
+  ObjectWidth: &objectWidth 5  # with two nested paths, get 10 paths_per_object
+  SampleSize: &sampleSize 10
+
+  Document: &document
+    # Generate documents that would produce non-trivial array info strings. When these docs
+    # are inserted, we want them to affect about 10 separated locations in the index, which is
+    # achieved by uniform distribution of xN fields and because for each xN both "a" and "b"
+    # subpaths are likely to be generated.
+    # The documents will look like:
+    # {
+    #   _id: ObjectId("63890d0df7b608a2d303b941"),
+    #   root: [
+    #     {
+    #       x8372 : [{a: [42, *]}, {b: {obj: *}}, {a: [42, *], b: [42, *]}, {b: *}, {b: *}],
+    #       <4 more xN fields>
+    #     }
+    #   ]
+    # }
+    root: {^Array: {
+      of: {^Object: {
+        withNEntries: *objectWidth,
+        havingKeys: {^FormatString: {
+          "format": "x%d", "withArgs": [{^RandomInt: {min: 0, max: *schemaWidth}}]
+        }},
+        andValues: {^Array: {
+          of:
+            # Create an object with either "a" or "b" key, or both. This is achieved by using
+            # 'duplicatedKeys: skip' parameter, which would ignore the second generated key if
+            # it has the same name as the first, so with probability 1/2 we'll get both fields
+            # and with probability 1/4 each of {a: *} and {b: *}.
+            {^Object: {
+              withNEntries: 2,
+              havingKeys: {^RandomString: {length: 1, alphabet: ab}},
+              andValues: {^Choose: {from: [
+                {^RandomInt: {min: 0, max: *schemaWidth}},
+                [42, {^RandomInt: {min: 0, max: *schemaWidth}}],
+                {obj: {^RandomInt: {min: 0, max: *schemaWidth}}}
+              ]}},
+              duplicatedKeys: skip
+            }},
+          # 5 elements make it almost certain that both "a" and "b" are used in the array to
+          # yield ~10 paths per object.
+          number: 5
+        }},
+        duplicatedKeys: skip
+      }},
+      number: 1
+    }}
+
+ActorTemplates:
+- TemplateName: InsertFromSample
+  Config:
+    Name: {^Parameter: {Name: "Name", Default: "Insert"}}
+    Type: SamplingLoader
+    Threads: {^Parameter: {Name: "Threads", Default: 1}}
+    Phases:
+      OnlyActiveInPhases:
+        Active: [{^Parameter: {Name: "OnlyActiveInPhase", Default: 1024}}]
+        NopInPhasesUpTo: *maxPhases
+        PhaseConfig:
+          Database: *db
+          Collection: *coll
+          SampleSize: *sampleSize
+          InsertBatchSize: *sampleSize
+          Repeat: {^Parameter: {Name: "Repeats", Default: 1}}
+          Batches: {^Parameter: {Name: "Batches", Default: 500}}
+
+Actors:
+- Name: Loader
+  Type: Loader
+  Threads: 1
+  Phases:
+    OnlyActiveInPhases:
+      Active: [0]
+      NopInPhasesUpTo: *maxPhases
+      PhaseConfig:
+        Threads: 1
+        Repeat: 1
+        Database: *db
+        CollectionCount: 1
+        DocumentCount: *docCount
+        BatchSize: 1000
+        Document: *document
+
+- Name: Quiese  # NOTE(review): likely a typo for "Quiesce"; renaming may change result labels - confirm.
+  Type: QuiesceActor
+  Threads: 1
+  Database: *db
+  Phases:
+    OnlyActiveInPhases:
+      Active: [1, 4]
+      NopInPhasesUpTo: *maxPhases
+      PhaseConfig:
+        Repeat: 1
+
+- ActorFromTemplate:
+    TemplateName: InsertFromSample
+    TemplateParameters:
+      Name: NoIndexes
+      OnlyActiveInPhase: 2
+
+- Name: BuildColumnStoreIndex
+  Type: RunCommand
+  Threads: 1
+  Phases:
+    OnlyActiveInPhases:
+      Active: [3]
+      NopInPhasesUpTo: *maxPhases
+      PhaseConfig:
+        Repeat: 1
+        Database: *db
+        Operations:
+        - OperationMetricsName: BulkBuildColumnStoreIndex
+          OperationName: RunCommand
+          OperationCommand:
+            createIndexes: *coll
+            indexes:
+            - key: {"$**": "columnstore"}
+              name: csi
+
+- ActorFromTemplate:
+    TemplateName: InsertFromSample
+    TemplateParameters:
+      Name: Csi
+      OnlyActiveInPhase: 5
+
+