From 2018a4d5e8d9c0e12f4e3f9afa0fa2afd9f0b34e Mon Sep 17 00:00:00 2001
From: Yu Jin Kang Park
Date: Mon, 4 Dec 2023 10:30:03 +0000
Subject: [PATCH 1/2] ExternalScriptRunner modified to run script against
 remote + loadShed POC

---
 .../cast_core/actors/ExternalScriptRunner.hpp |   3 +-
 src/cast_core/src/ExternalScriptRunner.cpp    |  15 +-
 .../scale/ReadMemoryStressUntilFailure.yml    | 168 +++++++++++++++++-
 3 files changed, 177 insertions(+), 9 deletions(-)

diff --git a/src/cast_core/include/cast_core/actors/ExternalScriptRunner.hpp b/src/cast_core/include/cast_core/actors/ExternalScriptRunner.hpp
index 8bdf8c519b..63935287fe 100644
--- a/src/cast_core/include/cast_core/actors/ExternalScriptRunner.hpp
+++ b/src/cast_core/include/cast_core/actors/ExternalScriptRunner.hpp
@@ -18,7 +18,7 @@
 #include
 #include
-#include
+#include
 #include
 #include
@@ -50,6 +50,7 @@ class ExternalScriptRunner : public Actor {
     /** @private */
     struct PhaseConfig;
+    mongocxx::pool::entry _client;
     PhaseLoop<PhaseConfig> _loop;
     std::string _command;
 };
diff --git a/src/cast_core/src/ExternalScriptRunner.cpp b/src/cast_core/src/ExternalScriptRunner.cpp
index b89a259f61..9c1ec2906c 100644
--- a/src/cast_core/src/ExternalScriptRunner.cpp
+++ b/src/cast_core/src/ExternalScriptRunner.cpp
@@ -16,6 +16,7 @@
 #include
 #include
+#include
 #include
 
 namespace genny::actor {
@@ -117,7 +118,7 @@ class ScriptRunner {
 
 class GeneralRunner: public ScriptRunner {
 public:
-    GeneralRunner(PhaseContext& phaseContext, ActorId id, const std::string& workloadPath)
+    GeneralRunner(PhaseContext& phaseContext, ActorId id, const std::string& workloadPath, const mongocxx::uri& uri)
         : ScriptRunner(phaseContext, id, workloadPath), _script{phaseContext["Script"].to<std::string>()} {
 
         std::string command{phaseContext["Command"].to<std::string>()};
@@ -127,7 +128,10 @@
             // No --file argument is required here, the script is run like
             // sh /path/to/file
             _invocation = "sh";
-        } else {
+        } else if (command == "mongo") {
+            // Run the script against the remote cluster through the legacy shell.
+            _invocation = "/data/workdir/bin/mongo --quiet --tls --tlsAllowInvalidCertificates \"" + uri.to_string() + "\"";
+        } else {
             throw std::runtime_error("Script type " + command + " is not supported.");
         }
     }
@@ -179,12 +183,12 @@ struct ExternalScriptRunner::PhaseConfig {
     // ignored.
     metrics::Operation operation;
 
-    PhaseConfig(PhaseContext& phaseContext, ActorId id, const std::string& workloadPath, const std::string& type)
+    PhaseConfig(PhaseContext& phaseContext, ActorId id, const std::string& workloadPath, const std::string& type, const mongocxx::uri& uri)
         : operation{phaseContext.operation("DefaultMetricsName", id)} {
         if(type == "Python") {
             _scriptRunner = std::make_unique<PythonRunner>(phaseContext, id, workloadPath);
         } else {
-            _scriptRunner = std::make_unique<GeneralRunner>(phaseContext, id, workloadPath);
+            _scriptRunner = std::make_unique<GeneralRunner>(phaseContext, id, workloadPath, uri);
         }
     }
     std::string runScript() {
@@ -218,7 +222,8 @@ void ExternalScriptRunner::run() {
 ExternalScriptRunner::ExternalScriptRunner(genny::ActorContext& context)
     // These are the attributes for the actor.
     : Actor{context},
-      _loop{context, ExternalScriptRunner::id(), context.workload().workloadPath(), context["Type"].to<std::string>()}{}
+      _client{context.client()},
+      _loop{context, ExternalScriptRunner::id(), context.workload().workloadPath(), context["Type"].to<std::string>(), _client->uri()}{}
 
 namespace {
 auto registerExternalScriptRunner = Cast::registerDefault<ExternalScriptRunner>();
diff --git a/src/workloads/scale/ReadMemoryStressUntilFailure.yml b/src/workloads/scale/ReadMemoryStressUntilFailure.yml
index 03b678213d..d01c8d55de 100644
--- a/src/workloads/scale/ReadMemoryStressUntilFailure.yml
+++ b/src/workloads/scale/ReadMemoryStressUntilFailure.yml
@@ -61,8 +61,8 @@ Clients:
   Default:
     QueryOptions:
       socketTimeoutMS: -1
-      maxPoolSize: 500
-
+      maxPoolSize: 11000
+
 Actors:
 # Drop database to get rid of stale data. Useful when running locally multiple times.
 - Name: Setup
@@ -79,6 +79,21 @@ Actors:
     - OperationName: RunCommand
       OperationCommand: {dropDatabase: 1}
 
+# Profile everything so we can see $currentOp execution stats. The workload is expected to fail, so analysis won't work anyway.
+- Name: LogLevel
+  Type: RunCommand
+  Threads: 1
+  Phases:
+    OnlyActiveInPhases:
+      Active: [0]
+      NopInPhasesUpTo: *MaxPhases
+      PhaseConfig:
+        Repeat: 1
+        Database: admin
+        Operations:
+        - OperationName: RunCommand
+          OperationCommand: { profile: 2 }
+
 # Load 20,096 documents around 520KB as described by the structure in GlobalDefaults.
 - Name: LoadDocuments
   Type: Loader
@@ -96,6 +111,26 @@
       DocumentCount: *NumDocs
       BatchSize: *LoadBatchSize
 
+# Create 10,000 idle cursors (one per thread) to simulate a situation where we have some big memory consumers and many small ops.
+- Name: CreateCursors
+  Type: CrudActor
+  Threads: 10000
+  Phases:
+    OnlyActiveInPhases:
+      Active: [1]
+      NopInPhasesUpTo: *MaxPhases
+      PhaseConfig:
+        Repeat: 1
+        Database: someDb
+        Collection: doesNotMatter
+        ThrowOnFailure: false
+        Operations:
+        - OperationName: find
+          OperationCommand:
+            Filter: {}
+            Options:
+              BatchSize: 0  # We just want a cursor established; the collection does not contain data.
+
 # Spawn many threads to sort enough documents to test server's capacity to handle memory pressure.
 - Name: SortMany
   Type: RunCommand
@@ -105,8 +140,10 @@
       Active: [2]
      NopInPhasesUpTo: *MaxPhases
       PhaseConfig:
-        Repeat: *SortRepeat
+        Duration: 2 minutes
+        # Repeat: *SortRepeat
         Database: *DBName
+        ThrowOnFailure: false
         Operations:
         - OperationMetricsName: SortMany
           OperationName: RunCommand
@@ -128,6 +165,131 @@
             {$sort: {b: 1}}]
           cursor: {batchSize: *SortBatchSize}
 
+# - Name: currentOp
+#   Type: AdminCommand
+#   # Use a single thread. Multiple $currentOp will block due to the service context mutex (LockedClientCursor), which makes reported times meaningless.
+#   Threads: 1
+#   Phases:
+#     OnlyActiveInPhases:
+#       Active: [2]
+#       NopInPhasesUpTo: *MaxPhases
+#       PhaseConfig:
+#         Duration: 2 minutes
+#         Operations:
+#         - OperationMetricsName: currentOp
+#           OperationName: RunCommand
+#           OperationCommand:
+#             aggregate: 1
+#             pipeline:
+#               [{$currentOp: {allUsers: true, idleConnections: true, idleCursors: true, idleSessions: true}}, {$match: {type: "op"}}]
+#             cursor: {}
+
+# Force system.profile to be constantly flushed to disk.
+- Name: fsync
+  Type: AdminCommand
+  Threads: 1
+  Phases:
+    OnlyActiveInPhases:
+      Active: [2]
+      NopInPhasesUpTo: *MaxPhases
+      PhaseConfig:
+        Duration: 2 minutes
+        Operations:
+        - OperationMetricsName: fsync
+          OperationName: RunCommand
+          OperationCommand:
+            fsync: 1
+
+# Connect to the MongoDB server and run the load-shedding JS script.
+- Name: MongoshScriptRunnerWithDB
+  Type: ExternalScriptRunner
+  Threads: 1
+  Phases:
+    OnlyActiveInPhases:
+      Active: [2]
+      NopInPhasesUpTo: *MaxPhases
+      PhaseConfig:
+        Duration: 2 minutes
+        Command: "mongo"
+        MetricsName: ScriptMetrics
+        Script: |
+          function shouldShedOps() {
+            // Shed load while resident memory exceeds ~55 GB.
+            let residentMemMb = db.serverStatus().mem.resident;
+            //jsTestLog("Resident " + residentMemMb);
+            return residentMemMb > 55000;
+          }
+
+          // Order ops oldest-first by their reported currentOpTime.
+          function compareOps(a, b) {
+            return Date.parse(a.currentOpTime) - Date.parse(b.currentOpTime);
+          }
+
+          const testDB = db.getSiblingDB("memorystress");
+          function killCursors(cursorArr) {
+            //jsTestLog("Kill cursors " + cursorArr.length);
+            testDB.runCommand( { killCursors: "Collection0", cursors: cursorArr } );
+          }
+
+          function loadShedding() {
+            // Target only Collection0.
+            //jsTestLog("Running currentOp");
+            let pipeline = [{$currentOp: {allUsers: true, idleCursors: true}}, {$match: {ns: "memorystress.Collection0"}}];
+            let currOpResult = db.aggregate(pipeline);
+
+            // Sort the ops so the oldest are shed first.
+            //jsTestLog("sorting");
+            let opsArr = currOpResult.toArray().sort(compareOps);
+            //jsTestLog("shedding");
+
+            let n = 0;
+            let cursorArr = [];
+
+            for (const op of opsArr) {
+              //printjson(op);
+              if(op.opid) {
+                // Active op: kill it directly.
+                db.killOp(op.opid);
+                n++;
+              } else if(op?.cursor?.cursorId) {
+                // Idle cursor: batch it up for a killCursors call.
+                cursorArr.push(op.cursor.cursorId);
+                n++;
+              }
+
+              if(cursorArr.length >= 20 || (cursorArr.length && op.opid)) {
+                killCursors(cursorArr);
+                cursorArr = [];
+              }
+
+              // Every 20 sheds (n % 20 == 0), re-check whether memory pressure has subsided.
+              if((n%20==0) && !shouldShedOps()) {
+                sleep(1);
+                if(shouldShedOps()) {
+                  n = 0;
+                  continue;
+                }
+                jsTestLog("Break shedding");
+                cursorArr = [];
+                break;
+              }
+            }
+            if(cursorArr.length) {
+              killCursors(cursorArr);
+            }
+          }
+
+          while(true) {
+            if(!shouldShedOps()) {
+              sleep(1);
+              continue;
+            }
+            loadShedding();
+          }
+
 # Commented out because this should not be regularly scheduled, as the task is expected to fail.
 # Uncomment the lines below (and possibly change the build variant) to run the workload.
 # AutoRun:

From c9ac5f43d52710b2ced5febacbb7eca8a8926318 Mon Sep 17 00:00:00 2001
From: Yu Jin Kang Park
Date: Wed, 13 Dec 2023 14:37:29 +0000
Subject: [PATCH 2/2] limit runtime in script

---
 .../scale/ReadMemoryStressUntilFailure.yml | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/workloads/scale/ReadMemoryStressUntilFailure.yml b/src/workloads/scale/ReadMemoryStressUntilFailure.yml
index d01c8d55de..04f1e5c09b 100644
--- a/src/workloads/scale/ReadMemoryStressUntilFailure.yml
+++ b/src/workloads/scale/ReadMemoryStressUntilFailure.yml
@@ -213,6 +213,13 @@ Actors:
         Command: "mongo"
         MetricsName: ScriptMetrics
         Script: |
+          const minutes = 4;
+          const startTime = new Date().getTime();
+          const targetEndTime = startTime + (1000 * 60 * minutes);
+          function shouldStopTest() {
+            return (new Date().getTime()) > targetEndTime;
+          }
+
           function shouldShedOps() {
             let residentMemMb = db.serverStatus().mem.resident;
             //jsTestLog("Resident " + residentMemMb);
@@ -274,12 +281,10 @@
             }
           }
 
-          while(true) {
-            if(!shouldShedOps()) {
-              sleep(1);
-              continue;
+          while(!shouldStopTest()) {
+            if(shouldShedOps()) {
+              loadShedding();
             }
-            loadShedding();
           }
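
---

Reviewer note, not part of the patch series: a minimal sketch of how another
workload could use the `Command: "mongo"` mode that PATCH 1/2 adds to
ExternalScriptRunner. The actor name, metrics name, and script body below are
hypothetical. What the patch does establish (see GeneralRunner above) is that
the invocation becomes `/data/workdir/bin/mongo --quiet --tls
--tlsAllowInvalidCertificates "<URI from the client pool>"`, so `db` inside the
script is bound to the workload's target cluster; the script file is presumably
appended to that invocation the same way the `sh` command receives it.

    # Hypothetical usage sketch; names are illustrative, not from this patch.
    - Name: ExampleMongoScript
      Type: ExternalScriptRunner
      Threads: 1
      Phases:
      - Repeat: 1
        Command: "mongo"      # selects the new remote legacy-shell invocation
        MetricsName: ExampleScriptMetrics
        Script: |
          // Runs on the target cluster; report resident memory like the POC does.
          jsTestLog("Resident MB: " + db.serverStatus().mem.resident);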