Merge pull request #6 from ak-1203/main
added summer-intern folder
ChinmayK0607 authored Dec 24, 2024
2 parents 28b312b + 3c5b41d commit 1feb297
Showing 12 changed files with 268 additions and 0 deletions.
@@ -0,0 +1,110 @@
from collections import defaultdict
import random

import gym
import gym_kuiper_escape  # importing registers the kuiper-escape environments with gym
import numpy as np
import matplotlib
matplotlib.use('Agg')  # non-interactive backend so figures can be saved without a display
import matplotlib.pyplot as plt

env = gym.make(
    'kuiper-escape-base-v0',
    mode='None',
    rock_rate=0.4,
    player_speed=0.5,
    rock_size_min=0.08,
    rock_size_max=0.12,
)

max_steps = 1000  # max steps in an episode

# hyperparameters
initial_alpha = 0.4
min_alpha = 0.1
gamma = 0.9
numOfepisodes = 5000
min_epsilon = 1e-2
max_epsilon = 1.0
decay_rate = (max_epsilon - min_epsilon) / numOfepisodes  # epsilon decay rate (for linear decay)

# Q-table initialization
def default_q_value():
    return np.zeros(5)

Q = defaultdict(default_q_value)  # each newly visited state starts with all five action values at 0
policy = {}

# learning-rate decay (linear from initial_alpha down to min_alpha)
def linear_decay_alpha(episode_num, initial_alpha, min_alpha, num_episodes):
    return max(min_alpha, initial_alpha - (initial_alpha - min_alpha) * (episode_num / num_episodes))

# training bookkeeping
epi_count = 0
episodic_rewards = []
steps_per_epi = []
epsilon = max_epsilon
alpha = initial_alpha

while True:
    state = env.reset()
    episode_history = []
    steps = 0
    total_reward = 0
    # The number of lidar beams is set to 8 in the env_base file, giving a 16x1 observation.
    # Only the first half contains the beam distances, so it is sliced to 8 values and
    # flattened into a tuple to serve as a hashable Q-table key.
    state = tuple(state[:8].flatten())
    epi_count += 1
    while True:
        # --- choose action with epsilon-greedy
        p = random.random()
        if p <= epsilon:
            action = random.randint(0, 4)
        else:
            action = np.argmax(Q[state])

        next_obs, reward, done, info = env.step(action)
        steps += 1
        next_state = tuple(next_obs[:8].flatten())

        # --- reward modification: penalty for hitting an asteroid
        if done:
            reward -= 10

        episode_history.append(((state, action), reward, next_state, done))  # store episodic data
        # Q-learning update (Bellman optimality target)
        Q[state][action] += alpha * (reward + gamma * np.max(Q[next_state]) - Q[state][action])

        total_reward += reward  # accumulate total reward
        state = next_state

        if done or steps >= max_steps:  # episode termination
            break

    epsilon = max(min_epsilon, max_epsilon - epi_count * decay_rate)  # epsilon decay
    alpha = linear_decay_alpha(epi_count, initial_alpha, min_alpha, numOfepisodes)  # alpha decay

    if epi_count % 100 == 0:
        print(f"Episode: {epi_count}, Total reward: {total_reward}, Epsilon: {epsilon}")  # terminal feedback on learning progress

    episodic_rewards.append(total_reward)
    steps_per_epi.append(steps)

    # greedy policy extraction from the Q-table
    for s in Q:
        policy[s] = np.argmax(Q[s])

    if epi_count >= numOfepisodes:  # termination condition
        break

epi = np.arange(1, len(steps_per_epi) + 1)  # episode indices

# simple moving average for plotting
def moving_average(data, window_size):
    return np.convolve(data, np.ones(window_size) / window_size, mode='valid')

window_size = 90

# ------- Plotting results -----------------
moving_avg = moving_average(episodic_rewards, window_size)
plt.figure(dpi=200)
plt.plot(epi, episodic_rewards)
plt.title("Rewards/episode")
plt.xlabel("Episodes")
plt.ylabel("Rewards")

plt.suptitle("Training")
plt.plot(epi[window_size - 1:], moving_avg, label=f'Moving Average (window={window_size})', color='red', linewidth=2)
plt.legend()
plt.savefig('rewardVepisodes.png')
158 changes: 158 additions & 0 deletions Summer 2024/Kuiper Belt Escape using Reinforcement Learning/readme.md
@@ -0,0 +1,158 @@
# Kuiper Belt Escape Using Reinforcement Learning

## Project Overview:
In this project, we trained an agent in the challenging and dynamic **Kuiper Escape environment** using the **Q-learning algorithm**. The environment has a **continuous state space**, which we discretized to keep learning tractable, and contains obstacles (rocks) that the agent must strategically avoid. The goal is to maximize reward through efficient exploration and precise navigation.

To learn more about the environment, see this repository:
[Kuiper Escape Environment Documentation](https://github.com/jdegregorio/gym-kuiper-escape)

## Aim:
The project aims to simulate and analyze an RL agent’s navigation within a **custom-built Kuiper Belt environment**. The agent learns optimal strategies for:
- **Avoiding obstacles (rocks)**
- **Maximizing rewards through efficient movement**

## Kuiper Escape Environment Overview
The environment is a dynamic space populated with asteroids of varying sizes, speeds, and trajectories. The agent, represented as a spaceship, aims to navigate this challenging environment while avoiding collisions.
The primary objective is to survive for as long as possible, thereby improving its game score and maximizing the total cumulative reward.

<img src="results/gifs/env_overview_gif.gif" width="350" height="350" alt="env overview gif">
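
For reference, the environment is created through Gym's standard `make` interface. The snippet below is a minimal construction sketch mirroring the parameter values used in the training script in this commit; the specific settings (rock rate, player speed, rock sizes) are tuning choices rather than requirements of the environment.

```python
import gym
import gym_kuiper_escape  # importing registers the kuiper-escape environments with gym

# Minimal construction sketch; parameter values mirror the training script in this commit.
env = gym.make(
    'kuiper-escape-base-v0',
    mode='None',          # mode string used in the training script
    rock_rate=0.4,        # rock spawn rate
    player_speed=0.5,
    rock_size_min=0.08,
    rock_size_max=0.12,
)
```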

## Observation Space
<img src="results/env_overview02.png" width="350" height="350" alt="obs space">
The state is produced by a virtual lidar system that emits beams in all directions to capture distances and object characteristics; the number of beams, and hence the size of the observation array, is configurable.

The observation data (for each beam in the lidar array):
* Distance (i.e. the radial distance from the player to the terminating point of the lidar beam)
* Collision detection
  * 0 if the beam terminated at the edge of the screen or at the maximum radius
  * 1 if it collided with a rock

For example, if the number of lidar beams is set to 8, the observation space would be a 16x1 array.
The observation data might look like this:
`[0, 0.1, 0.2, 0.3, 0.5, 0.6, 0.2, 0.3 | 0, 1, 0, 0, 0, 1, 0, 1]`

Here,
- The **first half** (8 elements) represents the normalized distances of obstacles (e.g., rocks) from the agent.
- The **second half** (8 elements) indicates collision status, where 1 represents a collision and 0 represents no collision.
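
A small sketch of how such an observation can be split and reduced to a hashable state key is shown below (variable names are illustrative); the training script in this commit keeps only the distance half and uses it as the Q-table key.

```python
import numpy as np

# Example observation for 8 lidar beams: 8 normalized distances followed by 8 collision flags.
obs = np.array([0, 0.1, 0.2, 0.3, 0.5, 0.6, 0.2, 0.3,
                0, 1, 0, 0, 0, 1, 0, 1])

distances = obs[:8]    # first half: how far each beam travelled before terminating
collisions = obs[8:]   # second half: 1 if the beam ended on a rock, 0 otherwise

# The training script uses only the distance half, flattened into a tuple,
# as the (hashable) state key for the Q-table:
state = tuple(distances.flatten())
```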

---

## Action Space
The agent has the following discrete set of actions:

<!--- **0:** Don't move
- **1:** Up
- **2:** Right
- **3:** Down
- **4:** Left
-->
| Value | Action |
|-------|----------- |
| 0 | Don't move |
| 1 | Up |
| 2 | Right |
| 3 | Down |
| 4 | Left |

---

## **Reward Function**

The reward function was designed to maximize the agent's survival time in the environment. The key components of the reward function are:

- A strong **negative reward** was given as a penalty for collisions, discouraging the agent from hitting obstacles.
- A **positive reward** was defined based on the agent's proximity to the center of the screen. The closer the agent remained to the center, the higher the reward.

For example, the following reward function, which varies inversely with the distance from the center, was used:

```python
if dist_from_center < 0.35:
    reward = 1 / (0.65 + dist_from_center)
else:
    reward = 0
```
**Note**: The above reward modification was implemented directly in the `step` function of the `env_base` Python file of the Kuiper Escape environment.

---
## Learning Process Results

Initially, the agent performs random actions, primarily navigating along the edges of the environment. During this phase, it explores the space without a clear strategy, gradually gathering information about its surroundings to inform future decision-making:

<img src="results/gifs/initial_exploration.gif" width="300" height="300" alt="Gif description">

As the number of episodes increases, the agent progressively enhances its navigation skills, adopting a more strategic approach to effectively avoid obstacles:

<div style="display: flex; justify-content: space-between;">
<img src="results/gifs/learning01.gif" width="300" height="300" alt="Gif 1">
<img src="results/gifs/learning02.gif" width="300" height="300" alt="Gif 2">
</div>



After a substantial number of episodes (e.g., 12,000), the agent has learned enough to navigate the environment with skill and precision, effectively avoiding obstacles. Its movements become increasingly efficient, reflecting a clear understanding of its surroundings and an improved ability to adapt to challenges.

**Final Results:**

<div style="display: flex; justify-content: space-between;">
<img src="results/gifs/result02.gif" width="300" height="300" alt="Gif 1">
<img src="results/gifs/result01.gif" width="300" height="300" alt="Gif 2">
</div>


---

## Algorithm Used : Q-Learning
Reinforcement learning provides a range of algorithms ideal for solving problems in model-free environments, where the agent lacks prior knowledge of the environment's dynamics.

For this project, **Q-learning** was chosen as the primary algorithm due to its reliable convergence to the optimal policy over time, especially in simpler environments.
As an off-policy algorithm, Q-learning learns the optimal policy by considering actions that may not necessarily follow the current policy. This allows for a balanced exploration of the environment while still exploiting the best-known strategies. By using techniques like the **epsilon-greedy strategy**, Q-learning effectively manages the trade-off between exploration and exploitation. Given that Q-learning emphasizes gathering information through exploration, it is well-suited for this project, where the agent must continuously adapt and refine its navigation strategy in a dynamic, unknown environment.
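
To make this concrete, the sketch below condenses one epsilon-greedy tabular Q-learning step; names such as `q_learning_step` are illustrative, and the full training loop additionally decays epsilon and the learning rate over episodes.

```python
import random
import numpy as np

# One epsilon-greedy tabular Q-learning step (illustrative sketch).
def q_learning_step(Q, state, env, epsilon, alpha, gamma, n_actions=5):
    # Explore with probability epsilon, otherwise act greedily w.r.t. the current Q-table.
    if random.random() <= epsilon:
        action = random.randint(0, n_actions - 1)
    else:
        action = int(np.argmax(Q[state]))

    next_obs, reward, done, info = env.step(action)
    next_state = tuple(next_obs[:8].flatten())  # lidar distances as the state key

    # Off-policy TD target: bootstrap from the greedy value of the next state.
    td_target = reward + gamma * np.max(Q[next_state])
    Q[state][action] += alpha * (td_target - Q[state][action])
    return next_state, reward, done
```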
The full training code is linked below.

[Implementation Code](final_implementation.py)



---

## Discretisation of the Continuous State Space
#### Convergence problem faced with continuous observation space:
Since the observation space in the Kuiper Escape environment is continuous, the number of possible states is effectively unbounded. Computing action values for such a large state space and storing them all in a Q-table is **computationally infeasible** and would lead to **memory overflow**.

For instance, in a continuous observation space, values like `2.213` and `2.21345` would be treated as distinct states, which dramatically inflates the number of states to store and prevents the Q-table from converging within practical memory limits.

#### Solution:
To reduce the computational load, we **discretized** the continuous observation space by dividing it into smaller, finite intervals. This approach allowed the algorithm to learn efficiently without overwhelming computational resources or memory capacity.
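
As an illustration of the idea (the exact number of bins and the bin edges are a design choice, not something fixed by the project), each normalized lidar distance can be mapped to a small set of intervals so that near-identical readings collapse into the same discrete state:

```python
import numpy as np

# Illustrative binning sketch: map each normalized lidar distance in [0, 1]
# to one of 5 intervals; the bin count and edges here are assumptions for the example.
BIN_EDGES = np.linspace(0.0, 1.0, num=6)[1:-1]  # interior edges at 0.2, 0.4, 0.6, 0.8

def discretize(observation):
    distances = np.asarray(observation[:8])   # first half of the lidar array: distances
    bins = np.digitize(distances, BIN_EDGES)  # each reading -> bin index in 0..4
    return tuple(int(b) for b in bins)        # hashable state key for the Q-table

# e.g. readings 0.213 and 0.21345 both fall in the 0.2-0.4 bin and map to the same state.
```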

---

## Graphical Results:

![Rewards Vs Episodes](results/Training_graph.png)
![Testing Graph](results/Testing_graph.png)

### **Requirements** 🛠️

- **Python 3.8+**
- **Libraries**:
- `numpy`
- `matplotlib`
  - `gym`
- **Custom Environment**: `gym_kuiper_escape`
  To set up the environment, follow the
  [Kuiper Escape Environment Documentation](https://github.com/jdegregorio/gym-kuiper-escape).

---

### **Future Improvements** 🚀

- Implement **Deep Q-learning (DQN)** to handle larger and more complex environments.
- Explore **reward shaping** techniques for more refined learning.
- Extend the environment to include dynamic obstacles and more complex challenges.
- Optimize the training process for faster convergence and better performance in larger-scale environments.

---






