
I'm new to reinforcement learning. Below is the code I wrote for deep Q-learning:

import torch
from torch import nn
from torch import optim
torch.set_default_device("cuda")
from collections import deque
import random

# Define the model
class Neural_Network(nn.Module):
    def __init__(self, input_size, output_size):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, output_size))

    def forward(self, x):
        return self.network(x)

# Define state to tensor
def state_to_tensor(state):
    return torch.as_tensor(state, dtype=torch.float32)

def optimize(optimizer, mini_batch, loss_function, gamma, target_net, online_net):
    current_q = []
    target_q = []
    for state, action, next_state, reward, terminated in mini_batch:
        if terminated:
            # episode over. Target q value should be set to the reward
            target = torch.tensor([reward]).clone().detach()
        else:
            # Calculate target q value
            with torch.no_grad():
                next_state_tensor = state_to_tensor(next_state)
                target = torch.tensor(
                    reward + gamma * target_net(next_state_tensor).max()  # Q learning
                ).clone().detach()
        # Get the current set of Q values
        current_state_tensor = state_to_tensor(state)
        currrent_q_values = online_net(current_state_tensor)
        current_q.append(currrent_q_values)
        # print('current_q_values', currrent_q_values)
        # Get the target set of Q values
        target_q_values = target_net(current_state_tensor)
        # print('target_q_values', target_q_values)
        target_q_values[action] = target
        target_q.append(target_q_values)
    # Compute loss for the minibatch
    loss = loss_function(torch.stack(current_q), torch.stack(target_q))
    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # In-place gradient clipping
    torch.nn.utils.clip_grad_value_(online_net.parameters(), 100)
    optimizer.step()

def deep_Q_learning(env, gamma, alpha, epsilon, episodes, sync_rate, batch_size):
    loss_function = nn.SmoothL1Loss()  # Loss function
    num_actions = env.action_space.n  # Number of discrete actions
    replay_buffer = deque(maxlen=20000)  # For replay
    reward_buffer = deque([0], maxlen=100)  # For last 100 rewards
    sample_observation = env.observation_space.sample()  # Get a sample observation from continuous space
    # Create policy and q network
    online_qnet = Neural_Network(input_size=len(sample_observation), output_size=num_actions)
    target_qnet = Neural_Network(input_size=len(sample_observation), output_size=num_actions)
    target_qnet.load_state_dict(online_qnet.state_dict())  # Make the same weights and biases for both networks
    # Set the optimizer
    optimizer = optim.AdamW(online_qnet.parameters(), lr=alpha, amsgrad=True)
    # List to keep track of rewards collected per episode
    # rewards_per_episode = torch.zeros(episodes)

    # Track number of steps taken. Used for syncing policy
    total_steps = 0
    episode_rewards = 0  # total rewards of an episode
    for ep in range(episodes):
        S, _ = env.reset()  # Initial state
        # Epsilon decay
        epsilon = max(epsilon - 1/episodes, 0.05)

        while True:
            # Epsilon greedy action selection
            if random.random() < epsilon:
                A = env.action_space.sample()  # Take random action
                # print("random action", A)
            else:
                with torch.no_grad():
                    S_tensor = state_to_tensor(S)
                    A = online_qnet(S_tensor).argmax().item()
                    # print("greedy action", A)
            # Execute action, take step
            Sp, R, terminated, truncated, _ = env.step(A)
            # Running reward after taking action
            print("Reward, optimal action:", R, A)
            # Save it in memory
            replay_buffer.append((S, A, Sp, R, terminated))
            # Move to the next state
            S = Sp
            # Increment step counter and add reward
            total_steps += 1
            episode_rewards += R
            if terminated or truncated:  # checking terminal
                reward_buffer.append(episode_rewards)
                episode_rewards = 0
                break
            # Check if enough experience or at least 1 reward has been collected
            if len(replay_buffer) > batch_size:
                mini_batch = random.sample(replay_buffer, batch_size)
                optimize(optimizer=optimizer, mini_batch=mini_batch, loss_function=loss_function, gamma=gamma, target_net=target_qnet, online_net=online_qnet)

            # Copy policy network to target network
            if total_steps > sync_rate:
                target_qnet.load_state_dict(online_qnet.state_dict())
                total_steps = 0
        if ep % 1 == 0:
            avg_reward = sum(reward_buffer)/len(reward_buffer)
            print("Running episode:", ep, "Average reward:", avg_reward, end='\r')
    # Save policy
    torch.save(target_qnet.state_dict(), "frozen_lake_deepq_lunar.pt")
    # finished
    print("Training finished! Latest rewards: ", reward_buffer)
    return target_qnet

# Deep Q Learning
import gymnasium as gym
env = gym.make("LunarLander-v2", render_mode='human')
env.metadata['render_fps'] = 0
target_qnet = deep_Q_learning(env=env, gamma=0.99, alpha=0.001, epsilon=0.9, episodes=200, sync_rate=100, batch_size=128)
env.close()

The animation runs very slowly while training, and each step is also very slow. I checked by running the code with both torch.set_default_device("cuda") and torch.set_default_device("cpu"). Note that I have PyTorch installed properly and an Nvidia RTX 3060 GPU (torch.cuda.is_available() returns True). It is slow even when I run everything on the CPU. Please give me some suggestions on how I can make it faster.

asked May 2, 2024 at 20:12
  • Please edit your title to describe what the code does, not your concerns about it: codereview.stackexchange.com/help/how-to-ask (Commented May 2, 2024 at 23:05)
  • Is it fine now? (Commented May 2, 2024 at 23:38)

1 Answer


Here are some general suggestions:

  1. Use a library like Numba or similar (a small sketch follows this list);
  2. try PyPy, which is a JIT compiler;
  3. if possible, move hot code paths into C or C++ modules.
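For suggestion 1, a hot numeric loop can be JIT-compiled with Numba. This is only a generic sketch of the pattern; the discounted-return helper below is a hypothetical example, not part of the code above:

import numpy as np
from numba import njit

@njit
def discounted_returns(rewards, gamma):
    # Hypothetical helper used only to illustrate the Numba pattern:
    # walk the episode backwards and accumulate gamma-discounted returns.
    out = np.empty_like(rewards)
    running = 0.0
    for i in range(rewards.shape[0] - 1, -1, -1):
        running = rewards[i] + gamma * running
        out[i] = running
    return out

print(discounted_returns(np.array([1.0, 0.0, 2.0]), 0.99))

Note that Numba helps with plain Python/NumPy loops; it does not speed up the PyTorch forward and backward passes themselves.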

And here is the code with some improvements:

import torch
from torch import nn
from torch import optim
import numpy as np
from collections import deque
import random

# Define the model
class Neural_Network(nn.Module):
    def __init__(self, input_size, output_size):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, output_size))

    def forward(self, x):
        return self.network(x)

# Define state to tensor
def state_to_tensor(state):
    return torch.from_numpy(np.array(state, dtype=np.float32))

def calculate_q_values(state, action, reward, next_state, terminated, gamma, target_net, online_net):
    if terminated:
        target = torch.tensor([reward]).clone().detach()
    else:
        with torch.no_grad():
            next_state_tensor = state_to_tensor(next_state)
            target = torch.tensor(reward + gamma * target_net(next_state_tensor).max()).clone().detach()
    current_state_tensor = state_to_tensor(state)
    current_q_values = online_net(current_state_tensor)
    target_q_values = target_net(current_state_tensor)
    target_q_values[action] = target
    return current_q_values, target_q_values

def optimize(optimizer, mini_batch, loss_function, gamma, target_net, online_net):
    current_q = []
    target_q = []
    for state, action, next_state, reward, terminated in mini_batch:
        current_q_values, target_q_values = calculate_q_values(state, action, reward, next_state, terminated, gamma, target_net, online_net)
        current_q.append(current_q_values)
        target_q.append(target_q_values)
    loss = loss_function(torch.stack(current_q), torch.stack(target_q))
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_value_(online_net.parameters(), 100)
    optimizer.step()

def deep_Q_learning(env, gamma, alpha, epsilon, episodes, sync_rate, batch_size):
    loss_function = nn.SmoothL1Loss()
    num_actions = env.action_space.n
    replay_buffer = deque(maxlen=20000)
    reward_buffer = deque([0], maxlen=100)
    sample_observation = env.observation_space.sample()
    online_qnet = Neural_Network(input_size=len(sample_observation), output_size=num_actions)
    target_qnet = Neural_Network(input_size=len(sample_observation), output_size=num_actions)
    target_qnet.load_state_dict(online_qnet.state_dict())
    optimizer = optim.AdamW(online_qnet.parameters(), lr=alpha, amsgrad=True)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.1)
    total_steps = 0
    episode_rewards = 0
    for ep in range(episodes):
        S, _ = env.reset()
        epsilon = max(epsilon - 1/episodes, 0.05)
        while True:
            if random.random() < epsilon:
                A = env.action_space.sample()
            else:
                with torch.no_grad():
                    S_tensor = state_to_tensor(S)
                    A = online_qnet(S_tensor).argmax().item()
            Sp, R, terminated, truncated, _ = env.step(A)
            replay_buffer.append((S, A, Sp, R, terminated))
            S = Sp
            total_steps += 1
            episode_rewards += R
            if terminated or truncated:
                reward_buffer.append(episode_rewards)
                episode_rewards = 0
                break
            if len(replay_buffer) > batch_size:
                mini_batch = random.sample(replay_buffer, batch_size)
                optimize(optimizer=optimizer, mini_batch=mini_batch, loss_function=loss_function, gamma=gamma, target_net=target_qnet, online_net=online_qnet)
            if total_steps > sync_rate:
                target_qnet.load_state_dict(online_qnet.state_dict())
                total_steps = 0
        scheduler.step()
        if ep % 1 == 0:
            avg_reward = sum(reward_buffer)/len(reward_buffer)
            print(f"Running episode: {ep}, Average reward: {avg_reward}", end='\r')
    torch.save(target_qnet.state_dict(), "frozen_lake_deepq_lunar.pt")
    print(f"Training finished! Latest rewards: {reward_buffer}")
    return target_qnet

import gymnasium as gym
env = gym.make("LunarLander-v2", render_mode='human')
env.metadata['render_fps'] = 0
target_qnet = deep_Q_learning(env=env, gamma=0.99, alpha=0.001, epsilon=0.9, episodes=200, sync_rate=100, batch_size=128)
env.close()
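Two further points are likely to dominate the runtime, although they go beyond the changes above. First, rendering with render_mode='human' redraws the window on every env.step() call; training with gym.make("LunarLander-v2") (no rendering) and only turning on human rendering for evaluation removes that per-frame cost. Second, optimize runs a separate forward pass per transition inside a Python loop; the whole minibatch can instead be processed with a few batched tensor operations. Below is a minimal sketch of such a batched update, assuming the same (state, action, next_state, reward, terminated) tuples stored in the replay buffer; optimize_batched is a hypothetical drop-in alternative, not part of the original answer:

import numpy as np
import torch

def optimize_batched(optimizer, mini_batch, loss_function, gamma, target_net, online_net):
    # Sketch of a batched DQN update (hypothetical replacement for optimize above).
    # Unpack the list of transition tuples into batched tensors.
    states, actions, next_states, rewards, terminated = zip(*mini_batch)
    states = torch.as_tensor(np.array(states), dtype=torch.float32)
    next_states = torch.as_tensor(np.array(next_states), dtype=torch.float32)
    actions = torch.as_tensor(actions, dtype=torch.int64).unsqueeze(1)
    rewards = torch.as_tensor(rewards, dtype=torch.float32)
    done = torch.as_tensor(terminated, dtype=torch.float32)

    # Q(s, a) of the actions that were actually taken, in a single forward pass.
    current_q = online_net(states).gather(1, actions).squeeze(1)

    # Bootstrapped target: r + gamma * max_a' Q_target(s', a'), cut off at terminal states.
    with torch.no_grad():
        next_max_q = target_net(next_states).max(dim=1).values
        target_q = rewards + gamma * next_max_q * (1.0 - done)

    loss = loss_function(current_q, target_q)
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_value_(online_net.parameters(), 100)
    optimizer.step()

Note that this computes the loss only on the action that was actually taken (the standard DQN formulation), rather than comparing full Q-value vectors as in the per-sample loop above.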
answered May 25, 2024 at 15:15
