From 23991e7454be2c875337b8fb75a2a267d0efa4fc Mon Sep 17 00:00:00 2001 From: phisn Date: Tue, 30 Apr 2024 20:04:59 +0200 Subject: [PATCH] Continue refactoring ppo-tfjs --- packages/learning/src/ppo/base-ppo.ts | 2 +- packages/learning/src/ppo/ppo.ts | 219 ++++++++++++++++++++++++-- 2 files changed, 210 insertions(+), 11 deletions(-) diff --git a/packages/learning/src/ppo/base-ppo.ts b/packages/learning/src/ppo/base-ppo.ts index 57d3935f..b7d9bf2b 100644 --- a/packages/learning/src/ppo/base-ppo.ts +++ b/packages/learning/src/ppo/base-ppo.ts @@ -240,7 +240,7 @@ class PPO { this.lastObservation = null // Initialize buffer - this.buffer = new ReplayBuffer(config) + this.buffer = new Buffer(config) // Initialize models for actor and critic this.actor = this.createActor() diff --git a/packages/learning/src/ppo/ppo.ts b/packages/learning/src/ppo/ppo.ts index 49d4a370..71188139 100644 --- a/packages/learning/src/ppo/ppo.ts +++ b/packages/learning/src/ppo/ppo.ts @@ -109,15 +109,18 @@ class ReplayBuffer { interface DiscreteSpace { class: "Discrete" dtype?: "int32" - n: number + + len: number } interface BoxSpace { class: "Box" dtype?: "float32" - shape: number[] - low: number[] - high: number[] + + low: number + high: number + + len: number } type Space = DiscreteSpace | BoxSpace @@ -134,10 +137,36 @@ interface PPOConfig { clipRatio: number targetKL: number - observationSpace: Space + observationDimension: number actionSpace: Space } +interface Environment { + reset(): number[] + step(action: number | number[]): [number[], number, boolean] +} + +const ppo = new PPO( + {} as PPOConfig, + {} as Space, + [ + { + class: "Box", + len: 2, + low: [0, 0], + high: [1, 1], + }, + { + class: "Discrete", + len: 2, + }, + ], + {} as tf.LayersModel, + {} as tf.LayersModel, +) + +ppo.act([1, 2, 3]) + class PPO { private numTimeSteps: number private lastObservation: number[] @@ -154,6 +183,9 @@ class PPO { constructor( private config: PPOConfig, + + private env: Environment, + private actorModel: tf.LayersModel, private criticModel: tf.LayersModel, ) { @@ -167,7 +199,7 @@ class PPO { layers: [ actorModel, tf.layers.dense({ - units: config.actionSpace.n, + units: config.actionSpace.len, }), ], }) @@ -176,7 +208,7 @@ class PPO { layers: [ actorModel, tf.layers.dense({ - units: config.actionSpace.shape[0], + units: config.actionSpace.len, }), ], }) @@ -195,14 +227,181 @@ class PPO { }) if (config.actionSpace.class === "Box") { - this.logStd = tf.variable(tf.zeros([config.actionSpace.shape[0]]), true, "logStd") + this.logStd = tf.variable(tf.zeros([config.actionSpace.len]), true, "logStd") } this.optimizerPolicy = tf.train.adam(config.policyLearningRate) this.optimizerValue = tf.train.adam(config.valueLearningRate) } - act(observation: number[]) {} + act(observation: number[]): GetPPOSpaceType {} + + private collectRollouts() { + this.buffer.reset() + + let sumReturn = 0 + let sumReward = 0 + let numEpisodes = 0 + + for (let step = 0; step < this.config.steps; ++step) { + tf.tidy(() => { + const observation = tf.tensor2d(this.lastObservation) + + const [predictions, action, actionSynced] = this.sampleAction(observation) + const value = this.critic.predict(observation) as tf.Tensor1D + + // TODO verify types + const logProbability = this.logProb(predictions as any, action as any) + + const [nextObservation, reward, done] = this.env.step(actionSynced) + + sumReturn += reward + sumReward += reward + }) + } + } + + private trainValue(observationBuffer: tf.Tensor2D, returnBuffer: tf.Tensor1D) { + const optimize = () => { + 
const valuesPredictions = this.critic.predict(observationBuffer) as tf.Tensor1D + return tf.losses.meanSquaredError(returnBuffer, valuesPredictions) as tf.Scalar + } + + tf.tidy(() => { + const { grads } = this.optimizerValue.computeGradients(optimize) + this.optimizerValue.applyGradients(grads) + }) + } + + private trainPolicy( + observationBuffer: tf.Tensor2D, + actionBuffer: tf.Tensor2D, + logProbabilityBuffer: tf.Tensor1D, + advantageBuffer: tf.Tensor1D, + ) { + const optimize = () => { + const predictions = this.actor.predict(observationBuffer) as tf.Tensor2D + + const logProbDiff = tf.sub( + this.logProb(predictions, actionBuffer), + logProbabilityBuffer, + ) + + const ratio = tf.exp(logProbDiff) + + const minAdvantage = tf.where( + tf.greater(advantageBuffer, 0), + tf.mul(tf.add(1, this.config.clipRatio), advantageBuffer), + tf.mul(tf.sub(1, this.config.clipRatio), advantageBuffer), + ) + + const policyLoss = tf.neg( + tf.mean(tf.minimum(tf.mul(ratio, advantageBuffer), minAdvantage)), + ) + + return policyLoss as tf.Scalar + } + + return tf.tidy(() => { + const { grads } = this.optimizerPolicy.computeGradients(optimize) + this.optimizerPolicy.applyGradients(grads) + + const kl = tf.mean( + tf.sub( + logProbabilityBuffer, + this.logProb( + this.actor.predict(observationBuffer) as tf.Tensor2D, + actionBuffer, + ), + ), + ) + + return kl.arraySync() + }) + } + + private logProb(predictions: tf.Tensor2D, actions: tf.Tensor2D) { + if (this.config.actionSpace.class === "Discrete") { + return this.logProbCategorical(predictions, actions) + } else if (this.config.actionSpace.class === "Box") { + return this.logProbNormal(predictions, actions) + } else { + throw new Error("Unsupported action space") + } + } + + private logProbCategorical(predictions: tf.Tensor2D, actions: tf.Tensor2D) { + return tf.tidy(() => { + const numActions = predictions.shape[predictions.shape.length - 1] + const logprobabilitiesAll = tf.logSoftmax(predictions) + + return tf.sum( + tf.mul(tf.oneHot(actions, numActions), logprobabilitiesAll), + logprobabilitiesAll.shape.length - 1, + ) + }) + } + + private logProbNormal(predictions: tf.Tensor2D, actions: tf.Tensor2D) { + return tf.tidy(() => { + if (this.logStd === undefined) { + throw new Error("logStd is not initialized") + } + + const scale = tf.exp(this.logStd) + + const logUnnormalized = tf.mul( + -0.5, + tf.square(tf.sub(tf.div(actions, scale), tf.div(predictions, scale))), + ) - private sampleAction(observation: tf.Tensor2D): [tf.Tensor2D, tf.Tensor2D] {} + const logNormalization = tf.add(tf.scalar(0.5 * Math.log(2.0 * Math.PI)), tf.log(scale)) + + return tf.sum( + tf.sub(logUnnormalized, logNormalization), + logUnnormalized.shape.length - 1, + ) + }) + } + + private sampleAction(observation: tf.Tensor2D) { + return tf.tidy(() => { + const predictions = tf.squeeze( + this.actor.predict(observation) as tf.Tensor2D, + ) as tf.Tensor1D + + const actionSpace = this.config.actionSpace + + if (actionSpace.class === "Discrete") { + const action = tf.squeeze(tf.multinomial(predictions, 1)) as tf.Scalar + const actionSynced = action.arraySync() + + return [predictions, action, actionSynced] as const + } else if (actionSpace.class === "Box") { + if (this.logStd === undefined) { + throw new Error("logStd is not initialized") + } + + const action = tf.add( + tf.mul(tf.randomNormal([actionSpace.len]), tf.exp(this.logStd)), + predictions, + ) as tf.Tensor1D + + const actionClipped = action.arraySync().map((x, i) => { + const low = + typeof actionSpace.low === "number" ? 
actionSpace.low : actionSpace.low[i] + const high = + typeof actionSpace.high === "number" + ? actionSpace.high + : actionSpace.high[i] + + return Math.min(Math.max(x, low), high) + }) + + return [predictions, action, actionClipped] as const + } else { + throw new Error("Unsupported action space") + } + }) + } }
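
The inline example constructs a Box space with `low: [0, 0]` and `high: [1, 1]`, and sampleAction branches on `typeof actionSpace.low === "number"`, while the BoxSpace interface in this hunk declares `low: number` and `high: number`. A widened declaration that would satisfy both usages could look like the sketch below; the `number | number[]` union and the clipToBox helper are assumptions for illustration, not what the patch declares.

// Suggested widening of BoxSpace so scalar and per-dimension bounds
// both type-check; this mirrors sampleAction's typeof check and is
// not part of the patch itself.
interface DiscreteSpace {
    class: "Discrete"
    dtype?: "int32"
    len: number
}

interface BoxSpace {
    class: "Box"
    dtype?: "float32"
    // A scalar applies the same bound to every dimension; an array
    // supplies one bound per dimension and should have length `len`.
    low: number | number[]
    high: number | number[]
    len: number
}

type Space = DiscreteSpace | BoxSpace

// Per-dimension clipping helper mirroring the clamping at the end of
// sampleAction (illustrative only).
function clipToBox(action: number[], space: BoxSpace): number[] {
    return action.map((x, i) => {
        const low = typeof space.low === "number" ? space.low : space.low[i]
        const high = typeof space.high === "number" ? space.high : space.high[i]
        return Math.min(Math.max(x, low), high)
    })
}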
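
The new Environment interface expects reset() to return the initial observation and step(action) to return [nextObservation, reward, done], which is how collectRollouts consumes it. Below is a toy environment satisfying that contract, purely for illustration; PointEnv is hypothetical and not part of the codebase.

// Environment interface as declared in the patch, copied here so the
// sketch compiles on its own.
interface Environment {
    reset(): number[]
    step(action: number | number[]): [number[], number, boolean]
}

// Hypothetical 1-D environment: the agent nudges a point toward the
// origin, reward is the negative distance, episodes last 20 steps.
class PointEnv implements Environment {
    private position = 0
    private stepCount = 0

    reset(): number[] {
        this.position = Math.random() * 2 - 1
        this.stepCount = 0
        return [this.position]
    }

    step(action: number | number[]): [number[], number, boolean] {
        const a = Array.isArray(action) ? action[0] : action
        this.position += 0.1 * Math.max(-1, Math.min(1, a))
        this.stepCount += 1
        return [[this.position], -Math.abs(this.position), this.stepCount >= 20]
    }
}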
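
logProbNormal evaluates a diagonal Gaussian: with scale = exp(logStd), it sums -0.5 * ((a - mean) / scale)^2 - 0.5 * log(2π) - log(scale) over the action dimensions. A plain-number restatement is handy for sanity-checking the tensor version; logProbNormalScalar is a hypothetical helper, not part of the patch.

// Plain-number version of the math in logProbNormal: the log density
// of a diagonal Gaussian with mean `mean` and std `exp(logStd)`,
// summed over action dimensions.
function logProbNormalScalar(mean: number[], logStd: number[], action: number[]): number {
    return action.reduce((sum, a, i) => {
        const scale = Math.exp(logStd[i])
        const z = (a - mean[i]) / scale
        return sum - 0.5 * z * z - 0.5 * Math.log(2 * Math.PI) - Math.log(scale)
    }, 0)
}

// Example: a 2-D standard normal (mean 0, logStd 0) evaluated at the
// mean gives 2 * (-0.5 * log(2π)) ≈ -1.8379
console.log(logProbNormalScalar([0, 0], [0, 0], [0, 0]))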
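
trainPolicy builds the standard PPO clipped surrogate loss. The tf.where(advantage > 0, (1 + clipRatio) * A, (1 - clipRatio) * A) term is the usual shortcut: inside the min it is equivalent to clip(ratio, 1 - clipRatio, 1 + clipRatio) * A. The KL value returned afterwards is the sample estimate of E[logProb_old - logProb_new], presumably compared against config.targetKL by the caller to stop policy epochs early. A scalar restatement of the per-sample loss follows; the clippedSurrogate name is illustrative only.

// Per-sample clipped surrogate loss from trainPolicy, written with
// plain numbers: -min(ratio * A, clip(ratio, 1 - ε, 1 + ε) * A).
function clippedSurrogate(ratio: number, advantage: number, clipRatio: number): number {
    // Same shortcut as the tf.where in the patch: for positive advantages
    // the clipped branch is (1 + clipRatio) * A, otherwise (1 - clipRatio) * A.
    const clipped =
        advantage > 0
            ? (1 + clipRatio) * advantage
            : (1 - clipRatio) * advantage

    return -Math.min(ratio * advantage, clipped)
}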