diff --git a/Reinforcement Learning/LunarLander/lunarlander_random-sampling.ipynb b/Reinforcement Learning/LunarLander/lunarlander_random-sampling.ipynb index 3fa5614..07f6153 100644 --- a/Reinforcement Learning/LunarLander/lunarlander_random-sampling.ipynb +++ b/Reinforcement Learning/LunarLander/lunarlander_random-sampling.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -33,7 +33,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -54,12 +54,57 @@ " \n", " return actions\n", "\n", + "def get_random_rewards(env, seed):\n", + " actions = []\n", + " rewards = []\n", + "\n", + " env.seed(seed)\n", + " env.reset()\n", + "\n", + " step=0\n", + " while True:\n", + " step+=1\n", + " action = env.action_space.sample()\n", + " ob, reward, terminated, info = env.step(action)\n", + " actions.append(action)\n", + " rewards.append(reward)\n", + " if terminated: \n", + " break\n", + " \n", + " return rewards, actions\n", + "\n", + "\n", + "def explore_rewards(env, change_node, actions, old_rewards):\n", + " env.seed(seed)\n", + " env.reset()\n", + "\n", + " new_actions = []\n", + " rewards = old_rewards[:-change_node]\n", + " \n", + " step=0\n", + " for action in actions[:-change_node]:\n", + " step+=1\n", + " ob, reward, terminated, info = env.step(action)\n", + " new_actions.append(action)\n", + " if terminated: break\n", + "\n", + " if not terminated:\n", + " # print(\"continue to explore\", len(new_actions))\n", + " while True:\n", + " step+=1\n", + " action = env.action_space.sample()\n", + " ob, reward, terminated, info = env.step(action)\n", + " new_actions.append(action), rewards.append(reward)\n", + " if terminated: break\n", "\n", + " return rewards, new_actions\n", + " \n", "def explore_actions(env, change_node, actions):\n", " env.seed(seed)\n", " env.reset()\n", "\n", " new_actions = []\n", + "\n", " step=0\n", " for action in actions[:-change_node]:\n", " step+=1\n", @@ -79,27 +124,31 @@ " return new_actions\n", "\n", "\n", - "def find_best_actions(env, actions):\n", + "def find_best_rewards(n, env, actions, rewards):\n", " best_actions=[]\n", - " best_obs = []\n", - " for ep in range(30):\n", - " # print(f\" {ep} \".center(80, '*'))\n", + " best_rewards = []\n", + " for ep in range(n):\n", + " print(f\" {ep} \".center(80, '*'))\n", " change_node=1\n", - " if len(actions) == 500:\n", - " return actions\n", + " # if len(actions) == 500:\n", + " # return actions\n", " if len(best_actions)>0:\n", " actions = best_actions\n", + " if len(best_rewards)>1:\n", + " rewards = best_rewards\n", " best_actions = []\n", - " while change_node<len(actions):\n", - " new_actions = explore_actions(env, change_node, actions)\n", - " if len(new_actions)>len(actions):\n", + " best_rewards = [-1e6] \n", + " while change_node<len(actions):\n", + " new_rewards, new_actions = explore_rewards(env, change_node, actions, rewards)\n", + " if sum(new_rewards)>sum(best_rewards):\n", " # print(len(new_actions), len(actions), change_node)\n", - " if len(new_actions)>len(best_actions):\n", - " # print(len(new_actions), len(actions), change_node)\n", - " best_actions=new_actions\n", + " #if len(new_actions)>len(best_actions):\n", + " print(len(new_actions), len(actions), change_node, sum(new_rewards))\n", + " best_actions=new_actions\n", + " best_rewards=new_rewards\n", " change_node+=1\n", " \n", - " return best_actions if len(best_actions)>len(actions) else actions\n", + " return best_actions, best_rewards if len(best_actions)>len(actions) else actions\n", "\n", "\n", "def get_obs(env, actions_result):\n", @@ -123,54 +172,97 @@ }, { "cell_type": "code", - "execution_count": 6, + 
"execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "************************************** 0 ***************************************\n", + "102 102 1 -149.00837859088466\n", + "102 102 5 -141.56807267363666\n", + "102 102 16 -128.0574026152969\n", + "102 102 24 -125.55624193964569\n", + "104 102 28 -123.26043672630061\n", + "103 102 30 -115.39515325051048\n", + "124 102 32 -13.980617247773807\n", + "120 102 34 10.231381970022568\n", + "************************************** 1 ***************************************\n", + "120 120 1 10.231381970022568\n", + "120 120 4 11.686774316464266\n", + "1000 120 34 73.23397092383432\n", + "************************************** 2 ***************************************\n", + "1000 1000 1 73.60809107572727\n", + "1000 1000 3 75.35113808488286\n", + "1000 1000 9 77.36513234831155\n", + "1000 1000 13 77.63069625639572\n", + "1000 1000 16 78.33312204631495\n", + "1000 1000 59 78.48170719275846\n" + ] + } + ], "source": [ - "# seed = 10\n", - "# actions = get_random_actions(env, seed)\n", - "# best_actions = find_best_actions(env, actions)" + "seed=0\n", + "rewards, actions = get_random_rewards(env, seed)\n", + "\n", + "new_r, new_act = explore_rewards(env, 50, actions, rewards)\n", + "# print(sum(new_r), sum(rewards))\n", + "best_actions, best_rewards = find_best_rewards(3, env, actions, rewards)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 13, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "98" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[1;32mc:\\Users\\stefano.giannini_ama\\Documents\\Python\\Learn\\data-science_projects\\Reinforcement Learning\\LunarLander\\lunarlander_random-sampling.ipynb Cell 5\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 12\u001b[0m step\u001b[39m+\u001b[39m\u001b[39m=\u001b[39m\u001b[39m1\u001b[39m\n\u001b[0;32m 13\u001b[0m env\u001b[39m.\u001b[39mrender()\n\u001b[1;32m---> 14\u001b[0m time\u001b[39m.\u001b[39;49msleep(\u001b[39m0.2\u001b[39;49m)\n\u001b[0;32m 15\u001b[0m \u001b[39mif\u001b[39;00m terminated: \u001b[39mbreak\u001b[39;00m\u001b[39m#steps.append(step);break\u001b[39;00m\n\u001b[0;32m 17\u001b[0m env\u001b[39m.\u001b[39mclose()\n", + "\u001b[1;31mKeyboardInterrupt\u001b[0m: " + ] } ], "source": [ - "actions = get_random_actions(env, seed)\n", - "len(actions)" + "import time\n", + "step=0\n", + "env = gym.make(\"LunarLander-v2\")\n", + "env.seed(seed)\n", + "env.reset()\n", + "\n", + "while True:\n", + " # print(ob.reshape(1, -1).shape)\n", + " action = best_actions[step]\n", + " # action = res[seeds[4]][step]\n", + " ob, reward, terminated, info = env.step(action)\n", + " step+=1\n", + " env.render()\n", + " time.sleep(0.05)\n", + " print(step)\n", + " if terminated: break#steps.append(step);break\n", + "\n", + "env.close()" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [ { - "ename": "KeyboardInterrupt", - "evalue": "", + "ename": "NameError", + "evalue": "name 'find_best_actions' is not defined", "output_type": "error", "traceback": [ 
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[1;32mc:\\Users\\stefano.giannini_ama\\Documents\\Python\\Learn\\data-science_projects\\Reinforcement Learning\\LunarLander\\lunarlander_random-sampling.ipynb Cell 6\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[39mfor\u001b[39;00m seed \u001b[39min\u001b[39;00m seeds[:\u001b[39m2\u001b[39m]:\n\u001b[0;32m 4\u001b[0m actions \u001b[39m=\u001b[39m get_random_actions(env, seed)\n\u001b[1;32m----> 5\u001b[0m best_actions \u001b[39m=\u001b[39m find_best_actions(env, actions)\n\u001b[0;32m 6\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m[\u001b[39m\u001b[39m{\u001b[39;00mseed\u001b[39m}\u001b[39;00m\u001b[39m] Actions length improvement:\u001b[39m\u001b[39m\"\u001b[39m,\u001b[39mlen\u001b[39m(actions), \u001b[39m\"\u001b[39m\u001b[39m->\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mlen\u001b[39m(best_actions))\n\u001b[0;32m 7\u001b[0m res[seed] \u001b[39m=\u001b[39m best_actions\n", - "\u001b[1;32mc:\\Users\\stefano.giannini_ama\\Documents\\Python\\Learn\\data-science_projects\\Reinforcement Learning\\LunarLander\\lunarlander_random-sampling.ipynb Cell 6\u001b[0m in \u001b[0;36mfind_best_actions\u001b[1;34m(env, actions)\u001b[0m\n\u001b[0;32m 53\u001b[0m best_actions \u001b[39m=\u001b[39m []\n\u001b[0;32m 54\u001b[0m \u001b[39mwhile\u001b[39;00m change_node\u001b[39m<\u001b[39m\u001b[39mlen\u001b[39m(actions):\n\u001b[1;32m---> 55\u001b[0m new_actions \u001b[39m=\u001b[39m explore_actions(env, change_node, actions)\n\u001b[0;32m 56\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(new_actions)\u001b[39m>\u001b[39m\u001b[39mlen\u001b[39m(actions):\n\u001b[0;32m 57\u001b[0m \u001b[39m# print(len(new_actions), len(actions), change_node)\u001b[39;00m\n\u001b[0;32m 58\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(new_actions)\u001b[39m>\u001b[39m\u001b[39mlen\u001b[39m(best_actions):\n\u001b[0;32m 59\u001b[0m \u001b[39m# print(len(new_actions), len(actions), change_node)\u001b[39;00m\n", - "\u001b[1;32mc:\\Users\\stefano.giannini_ama\\Documents\\Python\\Learn\\data-science_projects\\Reinforcement Learning\\LunarLander\\lunarlander_random-sampling.ipynb Cell 6\u001b[0m in \u001b[0;36mexplore_actions\u001b[1;34m(env, change_node, actions)\u001b[0m\n\u001b[0;32m 25\u001b[0m \u001b[39mfor\u001b[39;00m action \u001b[39min\u001b[39;00m actions[:\u001b[39m-\u001b[39mchange_node]:\n\u001b[0;32m 26\u001b[0m step\u001b[39m+\u001b[39m\u001b[39m=\u001b[39m\u001b[39m1\u001b[39m\n\u001b[1;32m---> 27\u001b[0m ob, rewards, terminated, info \u001b[39m=\u001b[39m env\u001b[39m.\u001b[39;49mstep(action)\n\u001b[0;32m 28\u001b[0m new_actions\u001b[39m.\u001b[39mappend(action)\n\u001b[0;32m 29\u001b[0m \u001b[39mif\u001b[39;00m terminated: \u001b[39mbreak\u001b[39;00m\n", - "File \u001b[1;32mc:\\Users\\stefano.giannini_ama\\Anaconda3\\lib\\site-packages\\gym\\wrappers\\time_limit.py:18\u001b[0m, in \u001b[0;36mTimeLimit.step\u001b[1;34m(self, action)\u001b[0m\n\u001b[0;32m 14\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mstep\u001b[39m(\u001b[39mself\u001b[39m, action):\n\u001b[0;32m 15\u001b[0m \u001b[39massert\u001b[39;00m (\n\u001b[0;32m 16\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_elapsed_steps \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m 
\u001b[39mNone\u001b[39;00m\n\u001b[0;32m 17\u001b[0m ), \u001b[39m\"\u001b[39m\u001b[39mCannot call env.step() before calling reset()\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m---> 18\u001b[0m observation, reward, done, info \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49menv\u001b[39m.\u001b[39;49mstep(action)\n\u001b[0;32m 19\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_elapsed_steps \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m1\u001b[39m\n\u001b[0;32m 20\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_elapsed_steps \u001b[39m>\u001b[39m\u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_max_episode_steps:\n", - "File \u001b[1;32mc:\\Users\\stefano.giannini_ama\\Anaconda3\\lib\\site-packages\\gym\\envs\\box2d\\lunar_lander.py:350\u001b[0m, in \u001b[0;36mLunarLander.step\u001b[1;34m(self, action)\u001b[0m\n\u001b[0;32m 346\u001b[0m pos \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlander\u001b[39m.\u001b[39mposition\n\u001b[0;32m 347\u001b[0m vel \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlander\u001b[39m.\u001b[39mlinearVelocity\n\u001b[0;32m 348\u001b[0m state \u001b[39m=\u001b[39m [\n\u001b[0;32m 349\u001b[0m (pos\u001b[39m.\u001b[39mx \u001b[39m-\u001b[39m VIEWPORT_W \u001b[39m/\u001b[39m SCALE \u001b[39m/\u001b[39m \u001b[39m2\u001b[39m) \u001b[39m/\u001b[39m (VIEWPORT_W \u001b[39m/\u001b[39m SCALE \u001b[39m/\u001b[39m \u001b[39m2\u001b[39m),\n\u001b[1;32m--> 350\u001b[0m (pos\u001b[39m.\u001b[39my \u001b[39m-\u001b[39m (\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39mhelipad_y \u001b[39m+\u001b[39m LEG_DOWN \u001b[39m/\u001b[39m SCALE)) \u001b[39m/\u001b[39m (VIEWPORT_H \u001b[39m/\u001b[39m SCALE \u001b[39m/\u001b[39m \u001b[39m2\u001b[39m),\n\u001b[0;32m 351\u001b[0m vel\u001b[39m.\u001b[39mx \u001b[39m*\u001b[39m (VIEWPORT_W \u001b[39m/\u001b[39m SCALE \u001b[39m/\u001b[39m \u001b[39m2\u001b[39m) \u001b[39m/\u001b[39m FPS,\n\u001b[0;32m 352\u001b[0m vel\u001b[39m.\u001b[39my \u001b[39m*\u001b[39m (VIEWPORT_H \u001b[39m/\u001b[39m SCALE \u001b[39m/\u001b[39m \u001b[39m2\u001b[39m) \u001b[39m/\u001b[39m FPS,\n\u001b[0;32m 353\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlander\u001b[39m.\u001b[39mangle,\n\u001b[0;32m 354\u001b[0m \u001b[39m20.0\u001b[39m \u001b[39m*\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlander\u001b[39m.\u001b[39mangularVelocity \u001b[39m/\u001b[39m FPS,\n\u001b[0;32m 355\u001b[0m \u001b[39m1.0\u001b[39m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlegs[\u001b[39m0\u001b[39m]\u001b[39m.\u001b[39mground_contact \u001b[39melse\u001b[39;00m \u001b[39m0.0\u001b[39m,\n\u001b[0;32m 356\u001b[0m \u001b[39m1.0\u001b[39m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlegs[\u001b[39m1\u001b[39m]\u001b[39m.\u001b[39mground_contact \u001b[39melse\u001b[39;00m \u001b[39m0.0\u001b[39m,\n\u001b[0;32m 357\u001b[0m ]\n\u001b[0;32m 358\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39mlen\u001b[39m(state) \u001b[39m==\u001b[39m \u001b[39m8\u001b[39m\n\u001b[0;32m 360\u001b[0m reward \u001b[39m=\u001b[39m \u001b[39m0\u001b[39m\n", - "\u001b[1;31mKeyboardInterrupt\u001b[0m: " + "\u001b[1;31mNameError\u001b[0m: name 'find_best_actions' is not defined" ] } ], @@ -179,9 +271,9 @@ "seeds = [0, 1, 5, 10, 21, 42, 47, 63, 84, 100, 121, 144]\n", "for seed in seeds[:2]:\n", " actions = get_random_actions(env, seed)\n", - " best_actions = find_best_actions(env, actions)\n", - " 
print(f\"[{seed}] Actions length improvement:\",len(actions), \"->\", len(best_actions))\n", - " res[seed] = best_actions" + " # best_actions = find_best_actions(env, actions)\n", + " # print(f\"[{seed}] Actions length improvement:\",len(actions), \"->\", len(best_actions))\n", + " # res[seed] = best_actions" ] }, {