diff --git a/README.md b/README.md
index 0287110..74d4bfd 100644
--- a/README.md
+++ b/README.md
@@ -47,3 +47,7 @@ In this lab you will build Cloud Native infrastructure required for running dist
 [Torch Elastic Docs](https://pytorch.org/elastic/0.2.2/index.html)
 
 [Azure Spot VMs](https://docs.microsoft.com/en-us/azure/virtual-machines/spot-vms)
+
+kustomize build kube/overlays/4gpu | kubectl apply -f -
+
+kubectl apply -f kube/imagenet.yaml
diff --git a/examples/imagenet/main.py b/examples/imagenet/main.py
index afa9f0f..a65d63b 100644
--- a/examples/imagenet/main.py
+++ b/examples/imagenet/main.py
@@ -68,6 +68,7 @@
 from torch.nn.parallel import DistributedDataParallel
 from torch.optim import SGD
 from torch.utils.data import DataLoader
+import random
 
 
 model_names = sorted(
@@ -145,6 +146,9 @@
     type=str,
     help="checkpoint file path, to load and save to",
 )
+parser.add_argument("--amp", action="store_true", help="use automatic mixed precision")
+parser.add_argument("--seed", type=int, default=42)
+parser.add_argument("--accum-steps", type=int, default=1)
 
 
 def main():
@@ -590,5 +594,102 @@ def accuracy(output, target, topk=(1,)):
     return res
 
 
+def parse_args() -> argparse.Namespace:
+    """Build the ImageNet training argument parser and parse the command line.
+
+    NOTE(review): this appears to repeat the options registered on the
+    module-level ``parser`` earlier in this file; presumably only one of the
+    two should survive -- confirm which one ``main()`` is meant to use.
+
+    Returns:
+        The parsed command-line options.
+    """
+    parser = argparse.ArgumentParser(description="PyTorch Elastic ImageNet Training")
+    parser.add_argument("data", metavar="DIR", help="path to dataset")
+    parser.add_argument(
+        "-a",
+        "--arch",
+        metavar="ARCH",
+        default="resnet18",
+        choices=model_names,
+        help="model architecture: " + " | ".join(model_names) + " (default: resnet18)",
+    )
+    parser.add_argument(
+        "-j",
+        "--workers",
+        default=0,
+        type=int,
+        metavar="N",
+        help="number of data loading workers",
+    )
+    parser.add_argument(
+        "--epochs", default=90, type=int, metavar="N", help="number of total epochs to run"
+    )
+    parser.add_argument(
+        "-b",
+        "--batch-size",
+        default=32,
+        type=int,
+        metavar="N",
+        help="mini-batch size (default: 32), per worker (GPU)",
+    )
+    parser.add_argument(
+        "--lr",
+        "--learning-rate",
+        default=0.1,
+        type=float,
+        metavar="LR",
+        help="initial learning rate",
+        dest="lr",
+    )
+    parser.add_argument("--momentum", default=0.9, type=float, metavar="M", help="momentum")
+    parser.add_argument(
+        "--wd",
+        "--weight-decay",
+        default=1e-4,
+        type=float,
+        metavar="W",
+        help="weight decay (default: 1e-4)",
+        dest="weight_decay",
+    )
+    parser.add_argument(
+        "-p",
+        "--print-freq",
+        default=10,
+        type=int,
+        metavar="N",
+        help="print frequency (default: 10)",
+    )
+    parser.add_argument(
+        "--dist-backend",
+        default="nccl",
+        choices=["nccl", "gloo"],
+        type=str,
+        help="distributed backend",
+    )
+    parser.add_argument(
+        "--checkpoint-file",
+        default="/tmp/checkpoint.pth.tar",
+        type=str,
+        help="checkpoint file path, to load and save to",
+    )
+    parser.add_argument("--amp", action="store_true", help="use automatic mixed precision")
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--accum-steps", type=int, default=1)
+    return parser.parse_args()
+
+
+def set_seed(seed: int) -> None:
+    """Seed the Python, NumPy and PyTorch (CPU and all CUDA devices) RNGs.
+
+    Args:
+        seed: Value passed unchanged to each library's seeding call.
+    """
+    random.seed(seed)
+    numpy.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+
+
 if __name__ == "__main__":
     main()
diff --git a/kube/imagenet.yaml b/kube/imagenet.yaml
index 67ae2d0..a847718 100644
--- a/kube/imagenet.yaml
+++ b/kube/imagenet.yaml
@@ -33,7 +33,7 @@ spec:
               image: torchelastic/examples:0.2.0
               imagePullPolicy: Always
               args:
-                - "--nproc_per_node=1"
+                - "--nproc_per_node=4"
                 - "/workspace/examples/imagenet/main.py"
                 - "--arch=resnet18"
                 - "--epochs=3"
@@ -47,7 +47,7 @@ spec:
                 - "--checkpoint-file=/mnt/blob/data/checkpoint.pth.tar"
               resources:
                 limits:
-                  nvidia.com/gpu: 1
+                  nvidia.com/gpu: 4
               volumeMounts:
                 - name: trainingdata
                   mountPath: "/mnt/blob/data"