From 2018a4d5e8d9c0e12f4e3f9afa0fa2afd9f0b34e Mon Sep 17 00:00:00 2001
From: Yu Jin Kang Park
Date: Mon, 4 Dec 2023 10:30:03 +0000
Subject: [PATCH 1/2] ExternalScriptRunner modified to run script against
 remote + loadShed POC

---
 .../cast_core/actors/ExternalScriptRunner.hpp |   3 +-
 src/cast_core/src/ExternalScriptRunner.cpp    |  15 +-
 .../scale/ReadMemoryStressUntilFailure.yml    | 168 +++++++++++++++++-
 3 files changed, 177 insertions(+), 9 deletions(-)

diff --git a/src/cast_core/include/cast_core/actors/ExternalScriptRunner.hpp b/src/cast_core/include/cast_core/actors/ExternalScriptRunner.hpp
index 8bdf8c519b..63935287fe 100644
--- a/src/cast_core/include/cast_core/actors/ExternalScriptRunner.hpp
+++ b/src/cast_core/include/cast_core/actors/ExternalScriptRunner.hpp
@@ -18,7 +18,7 @@
 #include
 #include
-#include
+#include
 #include
 #include
@@ -50,6 +50,7 @@ class ExternalScriptRunner : public Actor {
     /** @private */
     struct PhaseConfig;
+    mongocxx::pool::entry _client;
     PhaseLoop<PhaseConfig> _loop;
     std::string _command;
 };
diff --git a/src/cast_core/src/ExternalScriptRunner.cpp b/src/cast_core/src/ExternalScriptRunner.cpp
index b89a259f61..9c1ec2906c 100644
--- a/src/cast_core/src/ExternalScriptRunner.cpp
+++ b/src/cast_core/src/ExternalScriptRunner.cpp
@@ -16,6 +16,7 @@
 #include
 #include
+#include
 #include
 
 namespace genny::actor {
@@ -117,7 +118,7 @@ class ScriptRunner {
 
 class GeneralRunner: public ScriptRunner {
 public:
-    GeneralRunner(PhaseContext& phaseContext, ActorId id, const std::string& workloadPath)
+    GeneralRunner(PhaseContext& phaseContext, ActorId id, const std::string& workloadPath, const mongocxx::uri& uri)
         : ScriptRunner(phaseContext, id, workloadPath), _script{phaseContext["Script"].to<std::string>()} {
 
         std::string command{phaseContext["Command"].to<std::string>()};
@@ -127,7 +128,10 @@
             // No --file argument is required here, the script is run like
             // sh /path/to/file
             _invocation = "sh";
-        } else {
+        } else if (command == "mongo") {
+            // Run the script against the remote cluster through the legacy shell.
+            _invocation = "/data/workdir/bin/mongo --quiet --tls --tlsAllowInvalidCertificates \"" + uri.to_string() + "\"";
+        } else {
             throw std::runtime_error("Script type " + command + " is not supported.");
         }
     }
@@ -179,12 +183,12 @@ struct ExternalScriptRunner::PhaseConfig {
     // ignored.
     metrics::Operation operation;
 
-    PhaseConfig(PhaseContext& phaseContext, ActorId id, const std::string& workloadPath, const std::string& type)
+    PhaseConfig(PhaseContext& phaseContext, ActorId id, const std::string& workloadPath, const std::string& type, const mongocxx::uri& uri)
         : operation{phaseContext.operation("DefaultMetricsName", id)} {
         if(type == "Python") {
             _scriptRunner = std::make_unique<PythonRunner>(phaseContext, id, workloadPath);
         } else {
-            _scriptRunner = std::make_unique<GeneralRunner>(phaseContext, id, workloadPath);
+            _scriptRunner = std::make_unique<GeneralRunner>(phaseContext, id, workloadPath, uri);
         }
     }
     std::string runScript() {
@@ -218,7 +222,8 @@ void ExternalScriptRunner::run() {
 ExternalScriptRunner::ExternalScriptRunner(genny::ActorContext& context)
     // These are the attributes for the actor.
     : Actor{context},
-      _loop{context, ExternalScriptRunner::id(), context.workload().workloadPath(), context["Type"].to<std::string>()}{}
+      _client{context.client()},
+      _loop{context, ExternalScriptRunner::id(), context.workload().workloadPath(), context["Type"].to<std::string>(), _client->uri()}{}
 
 namespace {
 auto registerExternalScriptRunner = Cast::registerDefault<ExternalScriptRunner>();
diff --git a/src/workloads/scale/ReadMemoryStressUntilFailure.yml b/src/workloads/scale/ReadMemoryStressUntilFailure.yml
index 03b678213d..d01c8d55de 100644
--- a/src/workloads/scale/ReadMemoryStressUntilFailure.yml
+++ b/src/workloads/scale/ReadMemoryStressUntilFailure.yml
@@ -61,8 +61,8 @@ Clients:
   Default:
     QueryOptions:
       socketTimeoutMS: -1
-      maxPoolSize: 500
-
+      maxPoolSize: 11000
+
 Actors:
 # Drop database to get rid of stale data. Useful when running locally multiple times.
 - Name: Setup
@@ -79,6 +79,21 @@ Actors:
     - OperationName: RunCommand
       OperationCommand: {dropDatabase: 1}
 
+# Profile everything so we can see $currentOp execution stats. The workload is expected to fail, so analysis won't work anyway.
+- Name: LogLevel
+  Type: RunCommand
+  Threads: 1
+  Phases:
+    OnlyActiveInPhases:
+      Active: [0]
+      NopInPhasesUpTo: *MaxPhases
+      PhaseConfig:
+        Repeat: 1
+        Database: admin
+        Operations:
+        - OperationName: RunCommand
+          OperationCommand: { profile: 2 }
+
 # Load 20,096 documents around 520KB as described by the structure in GlobalDefaults.
 - Name: LoadDocuments
   Type: Loader
@@ -96,6 +111,26 @@
       DocumentCount: *NumDocs
       BatchSize: *LoadBatchSize
 
+# Create 10,000 idle cursors (one per thread) to simulate a situation where we have some big memory consumers and many small ops.
+- Name: CreateCursors
+  Type: CrudActor
+  Threads: 10000
+  Phases:
+    OnlyActiveInPhases:
+      Active: [1]
+      NopInPhasesUpTo: *MaxPhases
+      PhaseConfig:
+        Repeat: 1
+        Database: someDb
+        Collection: doesNotMatter
+        ThrowOnFailure: false
+        Operations:
+        - OperationName: find
+          OperationCommand:
+            Filter: {}
+            Options:
+              BatchSize: 0  # We just want a cursor established; the collection does not contain data.
+
 # Spawn many threads to sort enough documents to test server's capacity to handle memory pressure.
 - Name: SortMany
   Type: RunCommand
@@ -105,8 +140,10 @@
       Active: [2]
      NopInPhasesUpTo: *MaxPhases
       PhaseConfig:
-        Repeat: *SortRepeat
+        Duration: 2 minutes
+        # Repeat: *SortRepeat
         Database: *DBName
+        ThrowOnFailure: false
         Operations:
         - OperationMetricsName: SortMany
           OperationName: RunCommand
@@ -128,6 +165,131 @@
             {$sort: {b: 1}}]
           cursor: {batchSize: *SortBatchSize}
 
+# - Name: currentOp
+#   Type: AdminCommand
+#   # Use a single thread. Multiple $currentOp will block due to the service context mutex (LockedClientCursor), which makes reported times meaningless.
+#   Threads: 1
+#   Phases:
+#     OnlyActiveInPhases:
+#       Active: [2]
+#       NopInPhasesUpTo: *MaxPhases
+#       PhaseConfig:
+#         Duration: 2 minutes
+#         Operations:
+#         - OperationMetricsName: currentOp
+#           OperationName: RunCommand
+#           OperationCommand:
+#             aggregate: 1
+#             pipeline:
+#               [{$currentOp: {allUsers: true, idleConnections: true, idleCursors: true, idleSessions: true}}, {$match: {type: "op"}}]
+#             cursor: {}
+
+# Force system.profile to be constantly flushed to disk.
+- Name: fsync
+  Type: AdminCommand
+  Threads: 1
+  Phases:
+    OnlyActiveInPhases:
+      Active: [2]
+      NopInPhasesUpTo: *MaxPhases
+      PhaseConfig:
+        Duration: 2 minutes
+        Operations:
+        - OperationMetricsName: fsync
+          OperationName: RunCommand
+          OperationCommand:
+            fsync: 1
+
+# Connect to the MongoDB server and run the load-shedding JS script.
+- Name: MongoshScriptRunnerWithDB
+  Type: ExternalScriptRunner
+  Threads: 1
+  Phases:
+    OnlyActiveInPhases:
+      Active: [2]
+      NopInPhasesUpTo: *MaxPhases
+      PhaseConfig:
+        Duration: 2 minutes
+        Command: "mongo"
+        MetricsName: ScriptMetrics
+        Script: |
+          function shouldShedOps() {
+            // Shed load while resident memory exceeds ~55 GB.
+            let residentMemMb = db.serverStatus().mem.resident;
+            //jsTestLog("Resident " + residentMemMb);
+            return residentMemMb > 55000;
+          }
+
+          // Order ops oldest-first by their reported currentOpTime.
+          function compareOps(a, b) {
+            return Date.parse(a.currentOpTime) - Date.parse(b.currentOpTime);
+          }
+
+          const testDB = db.getSiblingDB("memorystress");
+          function killCursors(cursorArr) {
+            //jsTestLog("Kill cursors " + cursorArr.length);
+            testDB.runCommand( { killCursors: "Collection0", cursors: cursorArr } );
+          }
+
+          function loadShedding() {
+            // Target only Collection0.
+            //jsTestLog("Running currentOp");
+            let pipeline = [{$currentOp: {allUsers: true, idleCursors: true}}, {$match: {ns: "memorystress.Collection0"}}];
+            let currOpResult = db.aggregate(pipeline);
+
+            // Sort the ops so the oldest are shed first.
+            //jsTestLog("sorting");
+            let opsArr = currOpResult.toArray().sort(compareOps);
+            //jsTestLog("shedding");
+
+            let n = 0;
+            let cursorArr = [];
+
+            for (const op of opsArr) {
+              //printjson(op);
+              if(op.opid) {
+                // Active op: kill it directly.
+                db.killOp(op.opid);
+                n++;
+              } else if(op?.cursor?.cursorId) {
+                // Idle cursor: batch it up for a killCursors call.
+                cursorArr.push(op.cursor.cursorId);
+                n++;
+              }
+
+              if(cursorArr.length >= 20 || (cursorArr.length && op.opid)) {
+                killCursors(cursorArr);
+                cursorArr = [];
+              }
+
+              // Every 20 sheds (n % 20 == 0), re-check whether memory pressure has subsided.
+              if((n%20==0) && !shouldShedOps()) {
+                sleep(1);
+                if(shouldShedOps()) {
+                  n = 0;
+                  continue;
+                }
+                jsTestLog("Break shedding");
+                cursorArr = [];
+                break;
+              }
+            }
+            if(cursorArr.length) {
+              killCursors(cursorArr);
+            }
+          }
+
+          while(true) {
+            if(!shouldShedOps()) {
+              sleep(1);
+              continue;
+            }
+            loadShedding();
+          }
+
 # Commented out because this should not be regularly scheduled, as the task is expected to fail.
 # Uncomment the lines below (and possibly change the build variant) to run the workload.
 # AutoRun:

From c9ac5f43d52710b2ced5febacbb7eca8a8926318 Mon Sep 17 00:00:00 2001
From: Yu Jin Kang Park
Date: Wed, 13 Dec 2023 14:37:29 +0000
Subject: [PATCH 2/2] limit runtime in script

---
 .../scale/ReadMemoryStressUntilFailure.yml | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/workloads/scale/ReadMemoryStressUntilFailure.yml b/src/workloads/scale/ReadMemoryStressUntilFailure.yml
index d01c8d55de..04f1e5c09b 100644
--- a/src/workloads/scale/ReadMemoryStressUntilFailure.yml
+++ b/src/workloads/scale/ReadMemoryStressUntilFailure.yml
@@ -213,6 +213,13 @@ Actors:
         Command: "mongo"
         MetricsName: ScriptMetrics
         Script: |
+          const minutes = 4;
+          const startTime = new Date().getTime();
+          const targetEndTime = startTime + (1000 * 60 * minutes);
+          function shouldStopTest() {
+            return (new Date().getTime()) > targetEndTime;
+          }
+
           function shouldShedOps() {
             let residentMemMb = db.serverStatus().mem.resident;
             //jsTestLog("Resident " + residentMemMb);
@@ -274,12 +281,10 @@
             }
           }
 
-          while(true) {
-            if(!shouldShedOps()) {
-              sleep(1);
-              continue;
+          while(!shouldStopTest()) {
+            if(shouldShedOps()) {
+              loadShedding();
             }
-            loadShedding();
           }
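
---

Reviewer note, not part of the patch series: a minimal sketch of how another
workload could use the `Command: "mongo"` mode that PATCH 1/2 adds to
ExternalScriptRunner. The actor name, metrics name, and script body below are
hypothetical. What the patch does establish (see GeneralRunner above) is that
the invocation becomes `/data/workdir/bin/mongo --quiet --tls
--tlsAllowInvalidCertificates "<URI from the client pool>"`, so `db` inside the
script is bound to the workload's target cluster; the script file is presumably
appended to that invocation the same way the `sh` command receives it.

    # Hypothetical usage sketch; names are illustrative, not from this patch.
    - Name: ExampleMongoScript
      Type: ExternalScriptRunner
      Threads: 1
      Phases:
      - Repeat: 1
        Command: "mongo"      # selects the new remote legacy-shell invocation
        MetricsName: ExampleScriptMetrics
        Script: |
          // Runs on the target cluster; report resident memory like the POC does.
          jsTestLog("Resident MB: " + db.serverStatus().mem.resident);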