resnet50 LXP
In [ ]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
# from datautils import MyTrainDataset
In [ ]:
import torch.multiprocessing as mp
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
import torch.distributed as dist
import os
from torchvision import models
In [ ]:
def ddp_setup():
    # torchrun sets RANK, LOCAL_RANK and WORLD_SIZE in the environment
    init_process_group(backend="nccl")
    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
    world_size = dist.get_world_size()
    rank = dist.get_rank()
    return world_size, rank
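ddp_setup assumes a torchrun launch, which populates RANK, LOCAL_RANK and WORLD_SIZE, and it requires CUDA because of the nccl backend. For smoke-testing on a machine without GPUs, a minimal CPU-only variant might look like the sketch below; the gloo backend choice and the env-var defaults are my assumptions, not part of the original script:

def ddp_setup_cpu():
    # Defaults let this run as a single process outside torchrun.
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "29500")
    os.environ.setdefault("RANK", "0")
    os.environ.setdefault("WORLD_SIZE", "1")
    os.environ.setdefault("LOCAL_RANK", "0")
    init_process_group(backend="gloo")  # gloo works without CUDA
    return dist.get_world_size(), dist.get_rank()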
In [ ]:
class Trainer:
    def __init__(
        self,
        model: torch.nn.Module,
        train_data: DataLoader,
        optimizer: torch.optim.Optimizer,
        save_every: int,
        snapshot_path: str,
    ) -> None:
        self.local_rank = int(os.environ["LOCAL_RANK"])  # GPU index on this node
        self.global_rank = int(os.environ["RANK"])       # rank across all nodes
        self.model = model.to(self.local_rank)
        self.train_data = train_data
        self.optimizer = optimizer
        self.save_every = save_every
        self.epochs_run = 0
        self.snapshot_path = snapshot_path
        if os.path.exists(snapshot_path):
            print("Loading snapshot")
            self._load_snapshot(snapshot_path)
        self.model = DDP(self.model, device_ids=[self.local_rank])

    def _load_snapshot(self, snapshot_path):
        loc = f"cuda:{self.local_rank}"
        snapshot = torch.load(snapshot_path, map_location=loc)
        self.model.load_state_dict(snapshot["MODEL_STATE"])
        self.epochs_run = snapshot["EPOCHS_RUN"]
        print(f"Resuming training from snapshot at Epoch {self.epochs_run}")

    def _run_batch(self, source, targets):
        self.optimizer.zero_grad()
        output = self.model(source)
        loss = F.cross_entropy(output, targets)
        loss.backward()
        self.optimizer.step()

    def _run_epoch(self, epoch):
        b_sz = self.train_data.batch_size  # avoids materializing an extra batch just to measure one
        print(f"[GPU{self.global_rank}] Epoch {epoch} | Batchsize: {b_sz} | Steps: {len(self.train_data)}")
        self.train_data.sampler.set_epoch(epoch)  # reshuffle shards each epoch
        for source, targets in self.train_data:
            source = source.to(self.local_rank)
            targets = targets.to(self.local_rank)
            self._run_batch(source, targets)

    def _save_snapshot(self, epoch):
        snapshot = {
            "MODEL_STATE": self.model.module.state_dict(),  # unwrap the DDP wrapper
            "EPOCHS_RUN": epoch,
        }
        torch.save(snapshot, self.snapshot_path)
        print(f"Epoch {epoch} | Training snapshot saved at {self.snapshot_path}")

    def train(self, max_epochs: int):
        for epoch in range(self.epochs_run, max_epochs):
            self._run_epoch(epoch)
            # rank 0 on each node writes the snapshot
            if self.local_rank == 0 and epoch % self.save_every == 0:
                self._save_snapshot(epoch)
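Note that the snapshot above stores only the model weights and the epoch counter, so SGD momentum buffers are reinitialized on resume. If that matters, a sketch of an extended snapshot follows; the OPTIMIZER_STATE key is my naming, not part of the original:

def _save_snapshot(self, epoch):
    snapshot = {
        "MODEL_STATE": self.model.module.state_dict(),
        "OPTIMIZER_STATE": self.optimizer.state_dict(),  # momentum buffers, LR state
        "EPOCHS_RUN": epoch,
    }
    torch.save(snapshot, self.snapshot_path)

# ...with the matching line added to _load_snapshot:
#     self.optimizer.load_state_dict(snapshot["OPTIMIZER_STATE"])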
In [ ]:
def load_train_objs(num_epochs, batch_size_per_gpu, world_size):
    train_set = SyntheticDataset(num_epochs, batch_size_per_gpu, world_size)  # defined below
    model = models.resnet50(weights=None)  # random init; the `pretrained=` flag is deprecated
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    return train_set, model, optimizer
In [ ]:
def prepare_dataloader(dataset: Dataset, batch_size: int):
    return DataLoader(
        dataset,
        batch_size=batch_size,  # per-process batch size under DDP
        pin_memory=True,
        shuffle=False,          # the DistributedSampler handles shuffling
        sampler=DistributedSampler(dataset),
        num_workers=32,         # tune to the host's CPU count
    )
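With a DistributedSampler, batch_size is per process: each rank draws its own batches from a disjoint shard of the dataset, so the effective global batch per optimizer step is batch_size * world_size. Illustrative arithmetic, assuming 4 processes:

# Per-rank loader batch of 8 on 4 ranks => 32 samples per synchronized step.
world_size = 4
per_gpu_batch = 8
global_batch = per_gpu_batch * world_size  # 32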
In [ ]:
def main(save_every: int,
         total_epochs: int,
         batch_size: int,
         snapshot_path: str = os.path.join(os.getcwd(), 'snapshot.pt')):
    world_size, rank = ddp_setup()
    # Split the global batch across ranks (was a hard-coded division by 4).
    batch_size_per_gpu = batch_size // world_size
    dataset, model, optimizer = load_train_objs(total_epochs, batch_size_per_gpu, world_size)
    train_data = prepare_dataloader(dataset, batch_size_per_gpu)
    trainer = Trainer(model, train_data, optimizer, save_every, snapshot_path)
    trainer.train(total_epochs)
    print('Will now destroy the process group\n')
    destroy_process_group()
    print('Process group destroyed')
    import sys
    # workaround for hangs at interpreter exit: https://github.com/pytorch/pytorch/issues/76287
    sys.exit(0)
In [ ]:
import random

class SyntheticDataset(Dataset):
    # Random ImageNet-shaped tensors with random class labels, sized so each
    # rank sees batch_size_per_gpu * num_epochs samples after sharding.
    def __init__(self, num_epochs, batch_size_per_gpu, world_size):
        self.num_epochs = num_epochs
        self.batch_size_per_gpu = batch_size_per_gpu
        self.world_size = world_size
        self.len = self.num_epochs * self.batch_size_per_gpu * self.world_size

    def __getitem__(self, idx):
        data = torch.randn(3, 224, 224)  # fake 224x224 RGB image
        target = random.randint(0, 999)  # fake ImageNet class index
        return (data, target)

    def __len__(self):
        return self.len
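A quick single-process sanity check of the dataset; the sizes here are illustrative:

ds = SyntheticDataset(num_epochs=1, batch_size_per_gpu=8, world_size=4)
x, y = ds[0]
print(len(ds), x.shape, y)  # 32 torch.Size([3, 224, 224]) <class index in 0..999>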
In [ ]:
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description='simple distributed training job')
    parser.add_argument('total_epochs', type=int, help='Total epochs to train the model')
    parser.add_argument('save_every', type=int, help='How often to save a snapshot')
    # nargs='?' is needed for a positional argument's default to take effect
    parser.add_argument('batch_size', nargs='?', default=32, type=int, help='Global batch size across all devices (default: 32)')
    args = parser.parse_args()
    main(args.save_every, args.total_epochs, args.batch_size)
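The script is meant to be launched with torchrun, which sets the RANK, LOCAL_RANK and WORLD_SIZE environment variables that ddp_setup and Trainer read. A typical single-node launch, where the script name resnet50_ddp.py and the GPU count are my assumptions:

torchrun --standalone --nproc_per_node=4 resnet50_ddp.py 10 2 32

This trains for 10 epochs, snapshots every 2 epochs, and splits a global batch of 32 across the 4 GPUs.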