I am new to PyTorch distributed training, and any input will help. I have code that works on a single GPU and I am trying to make it distributed, but the launch fails with what I suspect is a socket connection error. The launch command, the code (with the parts that are unlikely to be the issue omitted), and the full error are below.
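For reference, here is a tiny per-rank sanity check I put together separately (the file name check_env.py and the gloo backend are my own choices, not part of the project). It only prints what torchrun hands each process and performs a CPU-side rendezvous:

# check_env.py -- run with: torchrun --nproc_per_node=4 --nnodes=1 check_env.py
import os
import torch
import torch.distributed as dist

rank = int(os.environ["RANK"])              # set by torchrun
world_size = int(os.environ["WORLD_SIZE"])  # set by torchrun
local_rank = int(os.environ["LOCAL_RANK"])  # set by torchrun

# report what each process sees before touching any GPU
print(f"rank={rank} world_size={world_size} local_rank={local_rank} "
      f"CUDA_VISIBLE_DEVICES={os.environ.get('CUDA_VISIBLE_DEVICES')} "
      f"visible GPUs={torch.cuda.device_count()}")

# a gloo rendezvous exercises the TCP sockets without needing the GPUs at all
dist.init_process_group(backend="gloo", rank=rank, world_size=world_size)
dist.barrier()
dist.destroy_process_group()

Using gloo here is deliberate: it keeps the rendezvous check independent of CUDA.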

$> torchrun --nproc_per_node=4 --nnodes=1 train_dist.py

CODE:

import datetime
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import time
import sys
import numpy as np
import torch
from torch.utils.data import DataLoader, DistributedSampler 
from torch.utils.data.dataloader import default_collate
from torch import nn
import torch.nn.functional as F
import torchvision
from torchvision import transforms
import torch.distributed as dist
import utils
from scheduler import WarmupMultiStepLR
from datasets.ntu60_hoi import NTU60Subject
import models.AR_pcd_flow as Models

# Function to initialize the distributed environment
def init_distributed():
    # RANK, WORLD_SIZE and LOCAL_RANK are set by torchrun (or torch.distributed.launch)
    rank = int(os.environ['RANK'])
    world_size = int(os.environ['WORLD_SIZE'])
    dist.init_process_group(backend='nccl', rank=rank, world_size=world_size)
    # dist.init_process_group(backend='nccl')
    local_rank = int(os.environ['LOCAL_RANK'])
    torch.cuda.set_device(local_rank)
    device = torch.device("cuda", local_rank)
    return device, rank, world_size

# training step
def train_one_epoch(model, criterion, optimizer, lr_scheduler, data_loader, device, epoch, print_freq):
    ...  # training code omitted


def evaluate(model, criterion, data_loader, device, dataset_size, print_freq):
    ...  # evaluation code omitted

# put it all together... Define data and network models
def main(args):
    if args.output_dir:
        utils.mkdir(args.output_dir)

    print(args)
    print("torch version: ", torch.__version__)
    print("torchvision version: ", torchvision.__version__)
    print("Number of GPUs:", torch.cuda.device_count())

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    print("Creating model")
    Model = getattr(Models, args.model)
    model = Model(radius=args.radius, nsamples=args.nsamples, spatial_stride=args.spatial_stride,
                  temporal_kernel_size=args.temporal_kernel_size, temporal_stride=args.temporal_stride,
                  emb_relu=args.emb_relu,
                  dim=args.dim, depth=args.depth, heads=args.heads, dim_head=args.dim_head,
                  mlp_dim=args.mlp_dim, num_classes=60)

    if torch.cuda.device_count() > 1:
        device, rank, world_size = init_distributed()
        model.to(device)
        # model = nn.DataParallel(model)
        model = nn.parallel.DistributedDataParallel(model, device_ids=[device.index], output_device=device.index)  # local_rank in place of devices
    else:
        device = torch.device('cuda')
        model.to(device)

    # Data loading code
    print("Loading data")
    st = time.time()
    dataset_train = NTU60Subject(root='/scratch/NTU60/', train=True)
    dataset_test = NTU60Subject(root='/scratch/NTU60/', train=False)
    # dataset_test = SegDataset(root='/scratch/pgouripe/AS_data_base', train=False)

    print("Creating data loaders")
    if torch.cuda.device_count() > 1:
        sampler_train = DistributedSampler(dataset_train, num_replicas=world_size, rank=rank, shuffle=True)
        sampler_test = DistributedSampler(dataset_test, num_replicas=world_size, rank=rank, shuffle=False)
    else:
        sampler_train = None
        sampler_test = None
    data_loader = torch.utils.data.DataLoader(dataset_train, batch_size=args.batch_size, sampler=sampler_train, num_workers=args.workers, pin_memory=True)
    data_loader_test = torch.utils.data.DataLoader(dataset_test, batch_size=args.batch_size, sampler=sampler_test, num_workers=args.workers, pin_memory=True)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)

    # convert scheduler to be per iteration, not per epoch, for warmup that lasts
    # between different epochs
    warmup_iters = args.lr_warmup_epochs * len(data_loader)
    lr_milestones = [len(data_loader) * m for m in args.lr_milestones]
    lr_scheduler = WarmupMultiStepLR(optimizer, milestones=lr_milestones, gamma=args.lr_gamma, warmup_iters=warmup_iters, warmup_factor=1e-5)
    # model_without_ddp = model

    print("Start training")
    start_time = time.time()
    cur_acc = 0
    acc = 0
    for epoch in range(args.start_epoch, args.epochs):
        train_one_epoch(model, criterion, optimizer, lr_scheduler, data_loader, device, epoch, args.print_freq)
        cur_acc = max(acc, evaluate(model, criterion, data_loader_test, device, len(dataset_test), args.print_freq))
        if cur_acc > acc:  # > 0.7 and cur_acc > acc:
            acc = cur_acc
            path = os.path.join(args.output_dir, f"model_{epoch}_ntu60_DTr.pth")
            torch.save(model.state_dict(), path)
            print("model saved")
            with open('NTU60_epoch.txt', 'a') as f:
                f.write(str(epoch) + '\n')
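
For comparison, the commented-out call in init_distributed relies on torchrun's default env:// rendezvous, so the rank and world size do not have to be passed explicitly. A sketch of that variant (the name init_distributed_env is mine, and this is not the code that produced the error below):

def init_distributed_env():
    # torchrun exports RANK, WORLD_SIZE, LOCAL_RANK, MASTER_ADDR and MASTER_PORT,
    # so the default env:// init method can read them on its own
    dist.init_process_group(backend='nccl')
    local_rank = int(os.environ['LOCAL_RANK'])
    torch.cuda.set_device(local_rank)
    device = torch.device("cuda", local_rank)
    return device, dist.get_rank(), dist.get_world_size()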

Below is the ERROR:

[2025-01-15 22:44:52,198] torch.distributed.run: [WARNING] 
[2025-01-15 22:44:52,198] torch.distributed.run: [WARNING] *****************************************
[2025-01-15 22:44:52,198] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
[2025-01-15 22:44:52,198] torch.distributed.run: [WARNING] *****************************************
Traceback (most recent call last):
 File "/home/nam123/.conda/envs/py39/lib/python3.9/site-packages/torch/cuda/__init__.py", line 315, in _lazy_init
 queued_call()
 File "/home/nam123/.conda/envs/py39/lib/python3.9/site-packages/torch/cuda/__init__.py", line 183, in _check_capability
 capability = get_device_capability(d)
 File "/home/nam123/.conda/envs/py39/lib/python3.9/site-packages/torch/cuda/__init__.py", line 439, in get_device_capability
 prop = get_device_properties(device)
 File "/home/nam123/.conda/envs/py39/lib/python3.9/site-packages/torch/cuda/__init__.py", line 457, in get_device_properties
 return _get_device_properties(device) # type: ignore[name-defined]
RuntimeError: device >= 0 && device < num_gpus INTERNAL ASSERT FAILED at "../aten/src/ATen/cuda/CUDAContext.cpp":50, please report a bug to PyTorch. device=1, num_gpus=
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
 File "/scratch/nam123/HOI4D_ctr/train_dist.py", line 284, in <module>
 main(args)
 File "/scratch/nam123/HOI4D_ctr/train_dist.py", line 177, in main
 device, rank, world_size = init_distributed()
 File "/scratch/nam123/HOI4D_ctr/train_dist.py", line 33, in init_distributed
 torch.cuda.set_device(local_rank)
 File "/home/nam123/.conda/envs/py39/lib/python3.9/site-packages/torch/cuda/__init__.py", line 408, in set_device
 torch._C._cuda_setDevice(device)
 File "/home/nam123/.conda/envs/py39/lib/python3.9/site-packages/torch/cuda/__init__.py", line 321, in _lazy_init
 raise DeferredCudaCallError(msg) from e
torch.cuda.DeferredCudaCallError: CUDA call failed lazily at initialization with error: device >= 0 && device < num_gpus INTERNAL ASSERT FAILED at "../aten/src/ATen/cuda/CUDAContext.cpp":50, please report a bug to PyTorch. device=1, num_gpus=
CUDA call was originally invoked at:
 File "/scratch/nam123/HOI4D_ctr/train_dist.py", line 9, in <module>
 import torch
 File "<frozen importlib._bootstrap>", line 1007, in _find_and_load
 File "<frozen importlib._bootstrap>", line 986, in _find_and_load_unlocked
 File "<frozen importlib._bootstrap>", line 680, in _load_unlocked
 File "<frozen importlib._bootstrap_external>", line 850, in exec_module
 File "<frozen importlib._bootstrap>", line 228, in _call_with_frames_removed
 File "/home/nam123/.conda/envs/py39/lib/python3.9/site-packages/torch/__init__.py", line 1427, in <module>
 _C._initExtension(manager_path())
 File "<frozen importlib._bootstrap>", line 1007, in _find_and_load
 File "<frozen importlib._bootstrap>", line 986, in _find_and_load_unlocked
 File "<frozen importlib._bootstrap>", line 680, in _load_unlocked
 File "<frozen importlib._bootstrap_external>", line 850, in exec_module
 File "<frozen importlib._bootstrap>", line 228, in _call_with_frames_removed
 File "/home/nam123/.conda/envs/py39/lib/python3.9/site-packages/torch/cuda/__init__.py", line 247, in <module>
 _lazy_call(_check_capability)
 File "/home/nam123/.conda/envs/py39/lib/python3.9/site-packages/torch/cuda/__init__.py", line 244, in _lazy_call
 _queued_calls.append((callable, traceback.format_stack()))
Traceback (most recent call last):
 File "/home/nam123/.conda/envs/py39/lib/python3.9/site-packages/torch/cuda/__init__.py", line 315, in _lazy_init
 queued_call()
 File "/home/nam123/.conda/envs/py39/lib/python3.9/site-packages/torch/cuda/__init__.py", line 183, in _check_capability
 capability = get_device_capability(d)
 File "/home/nam123/.conda/envs/py39/lib/python3.9/site-packages/torch/cuda/__init__.py", line 439, in get_device_capability
 prop = get_device_properties(device)
 File "/home/nam123/.conda/envs/py39/lib/python3.9/site-packages/torch/cuda/__init__.py", line 457, in get_device_properties
 return _get_device_properties(device) # type: ignore[name-defined]
RuntimeError: device >= 0 && device < num_gpus INTERNAL ASSERT FAILED at "../aten/src/ATen/cuda/CUDAContext.cpp":50, please report a bug to PyTorch. device=1, num_gpus=
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
 File "/scratch/nam123/HOI4D_ctr/train_dist.py", line 284, in <module>
 main(args)
 File "/scratch/nam123/HOI4D_ctr/train_dist.py", line 177, in main
 device, rank, world_size = init_distributed()
 File "/scratch/nam123/HOI4D_ctr/train_dist.py", line 33, in init_distributed
 torch.cuda.set_device(local_rank)
 File "/home/nam123/.conda/envs/py39/lib/python3.9/site-packages/torch/cuda/__init__.py", line 408, in set_device
 torch._C._cuda_setDevice(device)
 File "/home/nam123/.conda/envs/py39/lib/python3.9/site-packages/torch/cuda/__init__.py", line 321, in _lazy_init
 raise DeferredCudaCallError(msg) from e
torch.cuda.DeferredCudaCallError: CUDA call failed lazily at initialization with error: device >= 0 && device < num_gpus INTERNAL ASSERT FAILED at "../aten/src/ATen/cuda/CUDAContext.cpp":50, please report a bug to PyTorch. device=1, num_gpus=
CUDA call was originally invoked at:
 File "/scratch/nam123/HOI4D_ctr/train_dist.py", line 9, in <module>
 import torch
 File "<frozen importlib._bootstrap>", line 1007, in _find_and_load
 File "<frozen importlib._bootstrap>", line 986, in _find_and_load_unlocked
 File "<frozen importlib._bootstrap>", line 680, in _load_unlocked
 File "<frozen importlib._bootstrap_external>", line 850, in exec_module
 File "<frozen importlib._bootstrap>", line 228, in _call_with_frames_removed
 File "/home/nam123/.conda/envs/py39/lib/python3.9/site-packages/torch/__init__.py", line 1427, in <module>
 _C._initExtension(manager_path())
 File "<frozen importlib._bootstrap>", line 1007, in _find_and_load
 File "<frozen importlib._bootstrap>", line 986, in _find_and_load_unlocked
 File "<frozen importlib._bootstrap>", line 680, in _load_unlocked
 File "<frozen importlib._bootstrap_external>", line 850, in exec_module
 File "<frozen importlib._bootstrap>", line 228, in _call_with_frames_removed
 File "/home/nam123/.conda/envs/py39/lib/python3.9/site-packages/torch/cuda/__init__.py", line 247, in <module>
 _lazy_call(_check_capability)
 File "/home/nam123/.conda/envs/py39/lib/python3.9/site-packages/torch/cuda/__init__.py", line 244, in _lazy_call
 _queued_calls.append((callable, traceback.format_stack()))
[2025-01-15 22:44:57,235] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 2018871) of binary: /home/nam123/.conda/envs/py39/bin/python
Traceback (most recent call last):
 File "/home/nam123/.conda/envs/py39/bin/torchrun", line 8, in <module>
 sys.exit(main())
 File "/home/nam123/.conda/envs/py39/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
 return f(*args, **kwargs)
 File "/home/nam123/.conda/envs/py39/lib/python3.9/site-packages/torch/distributed/run.py", line 812, in main
 run(args)
 File "/home/nam123/.conda/envs/py39/lib/python3.9/site-packages/torch/distributed/run.py", line 803, in run
 elastic_launch(
 File "/home/nam123/.conda/envs/py39/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 135, in __call__
 return launch_agent(self._config, self._entrypoint, list(args))
 File "/home/nam123/.conda/envs/py39/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 268, in launch_agent
 raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
============================================================
train_dist.py FAILED
------------------------------------------------------------
Failures:
[1]:
 time : 2025-01-15_22:44:57
 host : sg049.sol.rc.asu.edu
 rank : 1 (local_rank: 1)
 exitcode : 1 (pid: 2018872)
 error_file: <N/A>
 traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
 time : 2025-01-15_22:44:57
 host : sg049.sol.rc.asu.edu
 rank : 0 (local_rank: 0)
 exitcode : 1 (pid: 2018871)
 error_file: <N/A>
 traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================

I have checked LINK and none of the solutions there have helped.
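In case it helps to narrow things down, here is a stripped-down reproducer I can run in the same environment. The dataset and model are dummies I made up; only the distributed plumbing (environment variables, process group, DistributedSampler, DDP wrapping) mirrors the script above:

# repro_ddp.py -- dummy model/data, same init path as train_dist.py
import os
import torch
import torch.distributed as dist
from torch import nn
from torch.utils.data import DataLoader, DistributedSampler, TensorDataset

def main():
    rank = int(os.environ['RANK'])
    world_size = int(os.environ['WORLD_SIZE'])
    local_rank = int(os.environ['LOCAL_RANK'])
    dist.init_process_group(backend='nccl', rank=rank, world_size=world_size)
    torch.cuda.set_device(local_rank)
    device = torch.device('cuda', local_rank)

    # dummy data: 256 samples, 16 features, 4 classes
    data = TensorDataset(torch.randn(256, 16), torch.randint(0, 4, (256,)))
    sampler = DistributedSampler(data, num_replicas=world_size, rank=rank, shuffle=True)
    loader = DataLoader(data, batch_size=32, sampler=sampler)

    model = nn.Linear(16, 4).to(device)
    model = nn.parallel.DistributedDataParallel(model, device_ids=[local_rank], output_device=local_rank)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    for epoch in range(2):
        sampler.set_epoch(epoch)  # reshuffle consistently across ranks each epoch
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            loss = criterion(model(x), y)
            loss.backward()
            optimizer.step()
        if rank == 0:
            print(f"epoch {epoch} done, last loss {loss.item():.4f}")

    dist.destroy_process_group()

if __name__ == '__main__':
    main()

It is launched the same way: torchrun --nproc_per_node=4 --nnodes=1 repro_ddp.py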
