forked from IvLabs/Summer-Projects
Merge pull request #6 from ak-1203/main
added summer-intern folder
Showing 12 changed files with 268 additions and 0 deletions.
110 changes: 110 additions & 0 deletions
Summer 2024/Kuiper Belt Escape using Reinforcement Learning/final_implementation.py
from collections import defaultdict
import random

import gym
import gym_kuiper_escape
import numpy as np
import matplotlib
matplotlib.use('Agg')  # non-interactive backend so the figure can be saved without a display
import matplotlib.pyplot as plt

env = gym.make(
    'kuiper-escape-base-v0',
    mode='None',
    rock_rate=0.4,
    player_speed=0.5,
    rock_size_min=0.08,
    rock_size_max=0.12,
)

max_steps = 1000  # max steps in an episode

# Hyperparameters
initial_alpha = 0.4   # initial learning rate
min_alpha = 0.1       # floor for the decayed learning rate
gamma = 0.9           # discount factor
numOfepisodes = 5000  # total training episodes
min_epsilon = 1e-2
max_epsilon = 1.0
decay_rate = (max_epsilon - min_epsilon) / numOfepisodes  # epsilon decay rate (linear decay)

# Q-table initialization: every newly visited state starts with all five action values at 0
def default_q_value():
    return np.zeros(5)

Q = defaultdict(default_q_value)
policy = {}

# Learning-rate decay
def linear_decay_alpha(episode_num, initial_alpha, min_alpha, num_episodes):
    return max(min_alpha, initial_alpha - (initial_alpha - min_alpha) * (episode_num / num_episodes))

# Initializing counters and trackers
epi_count = 0
episodic_rewards = []
steps_per_epi = []
epsilon = max_epsilon
alpha = initial_alpha

while True:
    state = env.reset()
    episode_history = []
    steps = 0
    total_reward = 0
    # The number of lidar beams is set to 8 in the env_base file, which yields a 16x1 observation.
    # Only the first half holds collision distances, so it is sliced to 8x1 and flattened into a
    # tuple to serve as a hashable Q-table key.
    state = tuple(state[:8].flatten())
    epi_count += 1
    while True:
        # --- Choose an action with an epsilon-greedy policy
        p = random.random()
        if p <= epsilon:
            action = random.randint(0, 4)
        else:
            action = np.argmax(Q[state])

        next_obs, reward, done, info = env.step(action)
        steps += 1
        next_state = tuple(next_obs[:8].flatten())

        # --- Reward modification: penalty for hitting an asteroid
        if done:
            reward -= 10
        episode_history.append(((state, action), reward, next_state, done))  # store episodic data
        # Q-learning update (Bellman optimality equation)
        Q[state][action] += alpha * (reward + gamma * np.max(Q[next_state]) - Q[state][action])

        total_reward += reward  # accumulate episode reward
        state = next_state

        if done or steps >= max_steps:  # episode termination
            break

    epsilon = max(min_epsilon, max_epsilon - (epi_count * decay_rate))                # epsilon decay
    alpha = linear_decay_alpha(epi_count, initial_alpha, min_alpha, numOfepisodes)    # alpha decay

    if epi_count % 100 == 0:
        print(f"Total reward: {total_reward}, Epsilon: {epsilon}, Episode: {epi_count}")  # terminal feedback on learning progress

    episodic_rewards.append(total_reward)
    steps_per_epi.append(steps)
    # Greedy policy extraction from the Q-table
    for s in Q:
        policy[s] = np.argmax(Q[s])
    if epi_count >= numOfepisodes:  # termination condition
        break

epi = np.arange(1, len(steps_per_epi) + 1)  # episode indices for plotting

# Simple moving average used to smooth the reward curve
def moving_average(data, window_size):
    return np.convolve(data, np.ones(window_size) / window_size, mode='valid')

window_size = 90

# ------- Plotting results -----------------
moving_avg = moving_average(episodic_rewards, window_size)
plt.figure(dpi=200)
plt.plot(epi, episodic_rewards)
plt.suptitle("Training")
plt.title("Rewards/episode")
plt.xlabel("Episodes")
plt.ylabel("Rewards")
plt.plot(epi[window_size - 1:], moving_avg, label=f'Moving Average (window={window_size})', color='red', linewidth=2)
plt.legend()
plt.savefig('rewardVepisodes.png')
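For reference, here is a minimal evaluation sketch (not part of this commit): it reuses the trained `Q` table, the same `env`, and the same state construction to run the greedy policy for a handful of episodes. The episode count and the random fallback for states never seen during training are illustrative assumptions.

```python
# Illustrative greedy-policy evaluation; assumes the training script above has
# already populated `Q` and created `env`.
def evaluate(num_eval_episodes=20):
    returns = []
    for _ in range(num_eval_episodes):
        obs = env.reset()
        state = tuple(obs[:8].flatten())
        done, ep_return, steps = False, 0.0, 0
        while not done and steps < max_steps:
            # Greedy action; unseen states fall back to a random action.
            action = int(np.argmax(Q[state])) if state in Q else random.randint(0, 4)
            obs, reward, done, info = env.step(action)
            state = tuple(obs[:8].flatten())
            ep_return += reward
            steps += 1
        returns.append(ep_return)
    return np.mean(returns)

print("Average evaluation return:", evaluate())
```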
158 changes: 158 additions & 0 deletions
Summer 2024/Kuiper Belt Escape using Reinforcement Learning/readme.md
# Kuiper Belt Escape Using Reinforcement Learning

## Project Overview
In this project, we trained an agent in the challenging and dynamic **Kuiper Escape environment** using the **Q-learning algorithm**. The environment has a **continuous state space**, which we discretized to keep learning tractable, and contains obstacles (rocks) that the agent must strategically avoid. The goal is to maximize rewards through efficient exploration and precise navigation.

To learn more about the environment, see this repository:
[Kuiper Escape Environment Documentation](https://github.com/jdegregorio/gym-kuiper-escape)

## Aim
The project aims to simulate and analyze an RL agent's navigation within a **custom-built Kuiper Belt environment**. The agent learns optimal strategies for:
- **Avoiding obstacles (rocks)**
- **Maximizing rewards through efficient movement**

## Kuiper Escape Environment Overview
The environment is a dynamic space populated with asteroids of varying sizes, speeds, and trajectories. The agent, represented as a spaceship, must navigate this challenging environment while avoiding collisions.
The primary objective is to survive for as long as possible, thereby improving the game score and maximizing the total cumulative reward.

<img src="results/gifs/env_overview_gif.gif" width="350" height="350" alt="env overview gif">

## Observation Space
<img src="results/env_overview02.png" width="350" height="350" alt="obs space">
The state comes from a virtual lidar system that emits beams in all directions and records, for each beam, the distance to the nearest object and what the beam hit; the number of beams (and hence the size of the observation array) is configurable.

The observation data (for each beam in the lidar array):
* Distance (i.e. radial distance from the player to the terminating point of the lidar beam)
* Collision detection
  * 0 if the beam terminated at the edge of the screen or at the maximum radius
  * 1 if it collided with a rock

For example, if the number of lidar beams is set to 8, the observation space is a 16x1 array.
The observation data might look like this:
`[0, 0.1, 0.2, 0.3, 0.5, 0.6, 0.2, 0.3 | 0, 1, 0, 0, 0, 1, 0, 1]`

Here,
- The **first half** (8 elements) represents the normalized distances of obstacles (e.g., rocks) from the agent.
- The **second half** (8 elements) indicates collision status, where 1 represents a collision and 0 represents no collision.
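In `final_implementation.py`, only the first half of this array (the eight distance readings) is used as the Q-table key. A minimal sketch of that state construction, assuming the 8-beam configuration described above:

```python
import numpy as np

def observation_to_state(obs, n_beams=8):
    """Keep only the distance half of the lidar reading and return a hashable tuple.

    `obs` is assumed to be the 2*n_beams-element lidar array described above;
    the collision flags in the second half are discarded here.
    """
    return tuple(np.asarray(obs)[:n_beams].flatten())

# Example with the sample observation from the text:
obs = [0, 0.1, 0.2, 0.3, 0.5, 0.6, 0.2, 0.3, 0, 1, 0, 0, 0, 1, 0, 1]
print(observation_to_state(obs))  # -> (0.0, 0.1, 0.2, 0.3, 0.5, 0.6, 0.2, 0.3)
```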
---

## Action Space
The agent has the following discrete set of actions:

<!--- **0:** Don't move
- **1:** Up
- **2:** Right
- **3:** Down
- **4:** Left
-->
| Value | Action     |
|-------|------------|
| 0     | Don't move |
| 1     | Up         |
| 2     | Right      |
| 3     | Down       |
| 4     | Left       |

---

## **Reward Function**

The reward function was designed to maximize the agent's survival time in the environment. Its key components are:

- A strong **negative reward** is given as a penalty for collisions, discouraging the agent from hitting obstacles.
- A **positive reward** is given based on the agent's proximity to the center of the screen: the closer the agent stays to the center, the higher the reward.

For example, the following reward function, which varies inversely with the distance from the center, was used:

```python
if dist_from_center < 0.35:
    reward = 1 / (0.65 + dist_from_center)
else:
    reward = 0
```
**Note**: The above reward modification was implemented directly in the `step` function of the `env_base` Python file of the Kuiper Escape environment.
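For reference (this is not the edited environment file), the same shaping term can be written as a standalone function; `dist_from_center` is assumed to be the player's normalized distance from the screen center:

```python
def center_proximity_reward(dist_from_center: float) -> float:
    """Shaping term from the snippet above: highest at the center, zero beyond 0.35.

    At the center it yields 1 / 0.65 ≈ 1.54, decays toward 1.0 as the distance
    approaches 0.35, and is 0 outside that radius.
    """
    if dist_from_center < 0.35:
        return 1 / (0.65 + dist_from_center)
    return 0.0
```

During training, `final_implementation.py` additionally subtracts 10 from the reward on the terminal step as the collision penalty.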
---
## Learning Process Results

Initially, the agent takes random actions and mostly drifts along the edges of the environment. During this phase it explores the space without a clear strategy, gradually gathering information about its surroundings to inform future decisions:

<img src="results/gifs/initial_exploration.gif" width="300" height="300" alt="Gif description">

As the number of episodes increases, the agent progressively improves its navigation, adopting a more strategic approach to avoid obstacles:

<div style="display: flex; justify-content: space-between;">
  <img src="results/gifs/learning01.gif" width="300" height="300" alt="Gif 1">
  <img src="results/gifs/learning02.gif" width="300" height="300" alt="Gif 2">
</div>

After a substantial number of episodes (e.g., 12k), the agent has learned enough to navigate the environment with skill and precision, effectively avoiding obstacles. Its movements become increasingly efficient, showing a clear understanding of its surroundings and an improved ability to adapt to challenges.

**Final Results:**

<div style="display: flex; justify-content: space-between;">
  <img src="results/gifs/result02.gif" width="300" height="300" alt="Gif 1">
  <img src="results/gifs/result01.gif" width="300" height="300" alt="Gif 2">
</div>

---

## Algorithm Used: Q-Learning
Reinforcement learning provides a range of algorithms suited to model-free settings, where the agent has no prior knowledge of the environment's dynamics.

For this project, **Q-learning** was chosen as the primary algorithm due to its reliable convergence to the optimal policy over time, especially in simpler environments.
As an off-policy algorithm, Q-learning learns the optimal policy while following a different behavior policy, which allows broad exploration of the environment while still exploiting the best-known strategies. Using an **epsilon-greedy strategy**, it manages the trade-off between exploration and exploitation. Because Q-learning emphasizes gathering information through exploration, it is well suited to this project, where the agent must continuously adapt and refine its navigation strategy in a dynamic, unknown environment.
The implementation is linked below.

[Implementation Code](final_implementation.py)
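Concretely, after every environment step the implementation applies the standard tabular Q-learning update, with learning rate $\alpha$ and discount factor $\gamma$:

$$
Q(s, a) \leftarrow Q(s, a) + \alpha \left[ r + \gamma \max_{a'} Q(s', a') - Q(s, a) \right]
$$

In `final_implementation.py`, both $\alpha$ and the exploration rate $\epsilon$ are decayed linearly over the course of training.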
---

## Discretisation of the Continuous State Space
#### Convergence problem with a continuous observation space
Since the observation space in the Kuiper Escape environment is continuous, there is effectively an unbounded number of possible states. Computing action values for such a large state space and storing them in a Q-table is **computationally infeasible** and would lead to **memory overflow**.

For instance, in a continuous observation space, values like `2.213` and `2.21345` would be treated as different states, which drastically increases the number of states to store and makes the **Q-table too large to converge** within memory limits.

#### Solution
To reduce the computational load, we **discretized** the continuous observation space by dividing it into smaller, finite intervals. This allowed the algorithm to learn efficiently without overwhelming computational resources or memory capacity; a minimal sketch of such a binning scheme is shown below.
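The exact binning used in this project is not spelled out here, but a common scheme looks like the following: normalized lidar distances in [0, 1] are mapped onto a fixed number of equal-width bins (the bin count below is an illustrative choice):

```python
import numpy as np

N_BINS = 10  # illustrative; finer bins trade memory for precision
BIN_EDGES = np.linspace(0.0, 1.0, N_BINS + 1)[1:-1]  # interior edges for np.digitize

def discretize(distances):
    """Map normalized lidar distances in [0, 1] to integer bin indices (a hashable state)."""
    return tuple(np.digitize(np.asarray(distances), BIN_EDGES))

print(discretize([0.0, 0.213, 0.21345, 0.95]))  # -> (0, 2, 2, 9); nearby readings share a bin
```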
---

## Graphical Results

![Rewards Vs Episodes](results/Training_graph.png)
![Testing Graph](results/Testing_graph.png)

### **Requirements** 🛠️

- **Python 3.8+**
- **Libraries**:
  - `numpy`
  - `matplotlib`
  - `gym`
- **Custom Environment**: `gym_kuiper_escape`
  To set up the environment, follow the
  [Kuiper Escape Environment Documentation](https://github.com/jdegregorio/gym-kuiper-escape).
---

### **Future Improvements** 🚀

- Implement **Deep Q-learning (DQN)** to handle larger and more complex environments.
- Explore **reward shaping** techniques for more refined learning.
- Extend the environment to include dynamic obstacles and more complex challenges.
- Optimize the training process for faster convergence and better performance in larger-scale environments.

---
Binary file added (+58.9 KB): ... 2024/Kuiper Belt Escape using Reinforcement Learning/results/Testing_graph.png
Binary file added (+47.4 KB): ...2024/Kuiper Belt Escape using Reinforcement Learning/results/Training_graph.png
Binary file added (+38.3 KB): ...r 2024/Kuiper Belt Escape using Reinforcement Learning/results/env_overview.png
Binary file added (+39.9 KB): ...2024/Kuiper Belt Escape using Reinforcement Learning/results/env_overview02.png
Binary file added (+1.75 MB): ...iper Belt Escape using Reinforcement Learning/results/gifs/env_overview_gif.gif
Binary file added (+2.02 MB): ...r Belt Escape using Reinforcement Learning/results/gifs/initial_exploration.gif
Binary file added (+2.14 MB): ...024/Kuiper Belt Escape using Reinforcement Learning/results/gifs/learning01.gif
Binary file added (+2.05 MB): ...024/Kuiper Belt Escape using Reinforcement Learning/results/gifs/learning02.gif
Binary file added (+2.91 MB): ... 2024/Kuiper Belt Escape using Reinforcement Learning/results/gifs/result01.gif
Binary file added (+2.81 MB): ... 2024/Kuiper Belt Escape using Reinforcement Learning/results/gifs/result02.gif