I'm new to reinforcement learning. Below is the code I wrote for deep Q-learning:
import torch
from torch import nn
from torch import optim
torch.set_default_device("cuda")
from collections import deque
import random
# Define the model
class Neural_Network(nn.Module):
    def __init__(self, input_size, output_size):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, output_size))

    def forward(self, x):
        return self.network(x)
# Define state to tensor
def state_to_tensor(state):
    return torch.as_tensor(state, dtype=torch.float32)
def optimize(optimizer, mini_batch, loss_function, gamma, target_net, online_net):
    current_q = []
    target_q = []
    for state, action, next_state, reward, terminated in mini_batch:
        if terminated:
            # Episode over: the target Q value is just the reward
            target = torch.tensor([reward]).clone().detach()
        else:
            # Calculate the target Q value (Q-learning bootstrap)
            with torch.no_grad():
                next_state_tensor = state_to_tensor(next_state)
                target = torch.tensor(
                    reward + gamma * target_net(next_state_tensor).max()
                ).clone().detach()
        # Get the current set of Q values
        current_state_tensor = state_to_tensor(state)
        current_q_values = online_net(current_state_tensor)
        current_q.append(current_q_values)
        # Get the target set of Q values
        target_q_values = target_net(current_state_tensor)
        target_q_values[action] = target
        target_q.append(target_q_values)
    # Compute the loss for the minibatch
    loss = loss_function(torch.stack(current_q), torch.stack(target_q))
    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # In-place gradient clipping
    torch.nn.utils.clip_grad_value_(online_net.parameters(), 100)
    optimizer.step()
def deep_Q_learning(env, gamma, alpha, epsilon, episodes, sync_rate, batch_size):
    loss_function = nn.SmoothL1Loss()  # Loss function
    num_actions = env.action_space.n  # Number of discrete actions
    replay_buffer = deque(maxlen=20000)  # Experience replay memory
    reward_buffer = deque([0], maxlen=100)  # Last 100 episode rewards
    sample_observation = env.observation_space.sample()  # Sample observation from the continuous space

    # Create the online (policy) and target Q networks
    online_qnet = Neural_Network(input_size=len(sample_observation), output_size=num_actions)
    target_qnet = Neural_Network(input_size=len(sample_observation), output_size=num_actions)
    target_qnet.load_state_dict(online_qnet.state_dict())  # Start both networks with the same weights

    # Set the optimizer
    optimizer = optim.AdamW(online_qnet.parameters(), lr=alpha, amsgrad=True)

    # Track the number of steps taken; used for syncing the target network
    total_steps = 0
    episode_rewards = 0  # Total reward of the current episode

    for ep in range(episodes):
        S, _ = env.reset()  # Initial state
        # Epsilon decay
        epsilon = max(epsilon - 1/episodes, 0.05)
        while True:
            # Epsilon-greedy action selection
            if random.random() < epsilon:
                A = env.action_space.sample()  # Take a random action
            else:
                with torch.no_grad():
                    S_tensor = state_to_tensor(S)
                    A = online_qnet(S_tensor).argmax().item()
            # Execute the action, take a step
            Sp, R, terminated, truncated, _ = env.step(A)
            # Running reward after taking the action
            print("Reward, action:", R, A)
            # Save the transition in memory
            replay_buffer.append((S, A, Sp, R, terminated))
            # Move to the next state
            S = Sp
            # Increment the step counter and accumulate the reward
            total_steps += 1
            episode_rewards += R
            if terminated or truncated:  # Terminal check
                reward_buffer.append(episode_rewards)
                episode_rewards = 0
                break
        # Optimize once enough experience has been collected
        if len(replay_buffer) > batch_size:
            mini_batch = random.sample(replay_buffer, batch_size)
            optimize(optimizer=optimizer, mini_batch=mini_batch, loss_function=loss_function,
                     gamma=gamma, target_net=target_qnet, online_net=online_qnet)
            # Copy the policy network to the target network
            if total_steps > sync_rate:
                target_qnet.load_state_dict(online_qnet.state_dict())
                total_steps = 0
        if ep % 1 == 0:
            avg_reward = sum(reward_buffer)/len(reward_buffer)
            print("Running episode:", ep, "Average reward:", avg_reward, end='\r')
    # Save the policy
    torch.save(target_qnet.state_dict(), "frozen_lake_deepq_lunar.pt")
    # Finished
    print("Training finished! Latest rewards:", reward_buffer)
    return target_qnet
# Deep Q Learning
import gymnasium as gym
env = gym.make("LunarLander-v2", render_mode='human')
env.metadata['render_fps'] = 0
target_qnet = deep_Q_learning(env=env, gamma=0.99, alpha=0.001, epsilon=0.9, episodes=200, sync_rate=100, batch_size=128)
env.close()
The animation runs VERY slowly while training, and each individual step is also slow. I checked this by running the code with both torch.set_default_device("cuda") and torch.set_default_device("cpu"). Note that PyTorch is installed properly and I have an Nvidia RTX 3060 GPU (torch.cuda.is_available() returns True). It is slow even when I run everything on the CPU. Please give me some suggestions on how I can make it faster.
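For reference, the CUDA/CPU comparison mentioned above was set up roughly like this (run separately from the training script):

import torch

print(torch.cuda.is_available())   # prints True on the RTX 3060
torch.set_default_device("cuda")   # swapped for "cpu" in the comparison run
print(torch.ones(1).device)        # confirms where new tensors are allocated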
1 Answer
Here are some general suggestions:
- use a library like Numba or similar (a minimal illustration follows this list);
- try PyPy, which is a JIT compiler;
- if possible, use C or C++ modules.
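As a rough illustration of the Numba suggestion only: numba.njit compiles plain Python/NumPy numeric loops to machine code. It cannot wrap the torch calls in your code, so the helper below is purely hypothetical and not part of your script.

import numpy as np
from numba import njit

@njit
def discounted_return(rewards, gamma):
    # Plain Python loop over a NumPy array; Numba compiles it to machine code on the first call.
    total = 0.0
    for i in range(rewards.shape[0] - 1, -1, -1):
        total = rewards[i] + gamma * total
    return total

print(discounted_return(np.array([1.0, 0.0, 2.0]), 0.99))  # 2.9602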
And here is your code with some improvements:
import torch
from torch import nn
from torch import optim
import numpy as np
from collections import deque
import random
# Define the model
class Neural_Network(nn.Module):
    def __init__(self, input_size, output_size):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, output_size))

    def forward(self, x):
        return self.network(x)
# Define state to tensor
def state_to_tensor(state):
    return torch.from_numpy(np.array(state, dtype=np.float32))
def calculate_q_values(state, action, reward, next_state, terminated, gamma, target_net, online_net):
    if terminated:
        target = torch.tensor([reward]).clone().detach()
    else:
        with torch.no_grad():
            next_state_tensor = state_to_tensor(next_state)
            target = torch.tensor(reward + gamma * target_net(next_state_tensor).max()).clone().detach()
    current_state_tensor = state_to_tensor(state)
    current_q_values = online_net(current_state_tensor)
    target_q_values = target_net(current_state_tensor)
    target_q_values[action] = target
    return current_q_values, target_q_values
def optimize(optimizer, mini_batch, loss_function, gamma, target_net, online_net):
    current_q = []
    target_q = []
    for state, action, next_state, reward, terminated in mini_batch:
        current_q_values, target_q_values = calculate_q_values(state, action, reward, next_state, terminated, gamma, target_net, online_net)
        current_q.append(current_q_values)
        target_q.append(target_q_values)
    loss = loss_function(torch.stack(current_q), torch.stack(target_q))
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_value_(online_net.parameters(), 100)
    optimizer.step()
def deep_Q_learning(env, gamma, alpha, epsilon, episodes, sync_rate, batch_size):
    loss_function = nn.SmoothL1Loss()
    num_actions = env.action_space.n
    replay_buffer = deque(maxlen=20000)
    reward_buffer = deque([0], maxlen=100)
    sample_observation = env.observation_space.sample()
    online_qnet = Neural_Network(input_size=len(sample_observation), output_size=num_actions)
    target_qnet = Neural_Network(input_size=len(sample_observation), output_size=num_actions)
    target_qnet.load_state_dict(online_qnet.state_dict())
    optimizer = optim.AdamW(online_qnet.parameters(), lr=alpha, amsgrad=True)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.1)
    total_steps = 0
    episode_rewards = 0
    for ep in range(episodes):
        S, _ = env.reset()
        epsilon = max(epsilon - 1/episodes, 0.05)
        while True:
            if random.random() < epsilon:
                A = env.action_space.sample()
            else:
                with torch.no_grad():
                    S_tensor = state_to_tensor(S)
                    A = online_qnet(S_tensor).argmax().item()
            Sp, R, terminated, truncated, _ = env.step(A)
            replay_buffer.append((S, A, Sp, R, terminated))
            S = Sp
            total_steps += 1
            episode_rewards += R
            if terminated or truncated:
                reward_buffer.append(episode_rewards)
                episode_rewards = 0
                break
        if len(replay_buffer) > batch_size:
            mini_batch = random.sample(replay_buffer, batch_size)
            optimize(optimizer=optimizer, mini_batch=mini_batch, loss_function=loss_function,
                     gamma=gamma, target_net=target_qnet, online_net=online_qnet)
            if total_steps > sync_rate:
                target_qnet.load_state_dict(online_qnet.state_dict())
                total_steps = 0
        scheduler.step()
        if ep % 1 == 0:
            avg_reward = sum(reward_buffer)/len(reward_buffer)
            print(f"Running episode: {ep}, Average reward: {avg_reward}", end='\r')
    torch.save(target_qnet.state_dict(), "frozen_lake_deepq_lunar.pt")
    print(f"Training finished! Latest rewards: {reward_buffer}")
    return target_qnet
import gymnasium as gym
env = gym.make("LunarLander-v2", render_mode='human')
env.metadata['render_fps'] = 0
target_qnet = deep_Q_learning(env=env, gamma=0.99, alpha=0.001, epsilon=0.9, episodes=200, sync_rate=100, batch_size=128)
env.close()
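One change in the code above is the added learning-rate scheduler. As a standalone sketch (with illustrative values, separate from the training code), StepLR multiplies the optimizer's learning rate by gamma every step_size calls to scheduler.step():

import torch
from torch import nn, optim

params = [nn.Parameter(torch.zeros(1))]
optimizer = optim.AdamW(params, lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.1)

for episode in range(300):
    # ... one episode of training would go here ...
    scheduler.step()
    if episode in (0, 99, 199):
        print(episode, scheduler.get_last_lr())  # 0.001, then 0.0001, then 1e-05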