diff --git a/PyTorch/examples/computer_vision/hello_world/README.md b/PyTorch/examples/computer_vision/hello_world/README.md
index 53031a745..6cbebd77f 100644
--- a/PyTorch/examples/computer_vision/hello_world/README.md
+++ b/PyTorch/examples/computer_vision/hello_world/README.md
@@ -42,34 +42,56 @@ export PYTHONPATH=$PYTHONPATH:/path/to/Model-References
 **Run training on 1 HPU:**
 
-- 1 HPU in FP32 Lazy mode:
+- 1 HPU in FP32 Eager mode:
 
 ```bash
-PT_HPU_LAZY_MODE=1 $PYTHON mnist.py --batch-size=64 --epochs=1 --lr=1.0 --gamma=0.7 --hpu
+PT_HPU_LAZY_MODE=0 $PYTHON mnist.py --batch-size=64 --epochs=1 --lr=1.0 --gamma=0.7 --hpu
 ```
 
-- 1 HPU in BF16 Lazy mode:
+- 1 HPU in BF16 Eager mode:
 
 ```bash
-PT_HPU_LAZY_MODE=1 $PYTHON mnist.py --batch-size=64 --epochs=1 --lr=1.0 --gamma=0.7 --hpu --autocast
+PT_HPU_LAZY_MODE=0 $PYTHON mnist.py --batch-size=64 --epochs=1 --lr=1.0 --gamma=0.7 --hpu --autocast
+```
+
+- 1 HPU in FP32 using `torch.compile()`:
+
+```bash
+PT_HPU_LAZY_MODE=0 $PYTHON mnist.py --batch-size=64 --epochs=1 --lr=1.0 --gamma=0.7 --hpu --use-torch-compile
+```
+
+- 1 HPU in BF16 using `torch.compile()`:
+
+```bash
+PT_HPU_LAZY_MODE=0 $PYTHON mnist.py --batch-size=64 --epochs=1 --lr=1.0 --gamma=0.7 --hpu --autocast --use-torch-compile
 ```
 
 **Run training on 8 HPUs:**
 
 **NOTE:** mpirun map-by PE attribute value may vary on your setup. For the recommended calculation, refer to the instructions detailed in [mpirun Configuration](https://docs.habana.ai/en/latest/PyTorch/PyTorch_Scaling_Guide/mpirun_Configuration.html#mpirun-configuration).
 
+- 8 HPUs, 1 server in FP32 Eager mode:
+
+```bash
+mpirun -n 8 --bind-to core --map-by socket:PE=6 --rank-by core --report-bindings --allow-run-as-root -x PT_HPU_LAZY_MODE=0 $PYTHON mnist.py --batch-size=64 --epochs=1 --lr=1.0 --gamma=0.7 --hpu
+```
+
+- 8 HPUs, 1 server in BF16 Eager mode:
+
+```bash
+mpirun -n 8 --bind-to core --map-by socket:PE=6 --rank-by core --report-bindings --allow-run-as-root -x PT_HPU_LAZY_MODE=0 $PYTHON mnist.py --batch-size=64 --epochs=1 --lr=1.0 --gamma=0.7 --hpu --autocast
+```
 
-- 8 HPUs, 1 server in FP32 Lazy mode:
+- 8 HPUs, 1 server in FP32 using `torch.compile()`:
 
 ```bash
-mpirun -n 8 --bind-to core --map-by socket:PE=6 --rank-by core --report-bindings --allow-run-as-root -x PT_HPU_LAZY_MODE=1 $PYTHON mnist.py --batch-size=64 --epochs=1 --lr=1.0 --gamma=0.7 --hpu
+mpirun -n 8 --bind-to core --map-by socket:PE=6 --rank-by core --report-bindings --allow-run-as-root -x PT_HPU_LAZY_MODE=0 $PYTHON mnist.py --batch-size=64 --epochs=1 --lr=1.0 --gamma=0.7 --hpu --use-torch-compile
 ```
 
-- 8 HPU, 1 server in BF16 Lazy mode:
+- 8 HPUs, 1 server in BF16 using `torch.compile()`:
 
 ```bash
-mpirun -n 8 --bind-to core --map-by socket:PE=6 --rank-by core --report-bindings --allow-run-as-root -x PT_HPU_LAZY_MODE=1 $PYTHON mnist.py --batch-size=64 --epochs=1 --lr=1.0 --gamma=0.7 --hpu --autocast
+mpirun -n 8 --bind-to core --map-by socket:PE=6 --rank-by core --report-bindings --allow-run-as-root -x PT_HPU_LAZY_MODE=0 $PYTHON mnist.py --batch-size=64 --epochs=1 --lr=1.0 --gamma=0.7 --hpu --autocast --use-torch-compile
 ```
 
 #### Examples in Python Script
 
@@ -81,7 +103,7 @@ The `example.py` presents a basic PyTorch code example. For more details, refer
-On 1 HPU in Lazy mode, run the following command:
+On 1 HPU in Eager mode, run the following command:
 
 ```bash
-$PYTHON example.py
+PT_HPU_LAZY_MODE=0 $PYTHON example.py
 ```
 
 ## Changelog
diff --git a/PyTorch/examples/computer_vision/hello_world/example.py b/PyTorch/examples/computer_vision/hello_world/example.py
index c3f5c0385..b878fc1ae 100644
--- a/PyTorch/examples/computer_vision/hello_world/example.py
+++ b/PyTorch/examples/computer_vision/hello_world/example.py
@@ -47,18 +47,8 @@ def train(net,criterion,optimizer,trainloader,device,lazy_mode):
 
         loss.backward()
 
-        ##############################################################################
-        if(lazy_mode):
-            htcore.mark_step()
-        ##############################################################################
-
         optimizer.step()
 
-        ##############################################################################
-        if(lazy_mode):
-            htcore.mark_step()
-        ##############################################################################
-
         train_loss += loss.item()
         _, predicted = outputs.max(1)
         total += targets.size(0)
diff --git a/PyTorch/examples/computer_vision/hello_world/mnist.py b/PyTorch/examples/computer_vision/hello_world/mnist.py
index 15e58c406..09f66dfc6 100644
--- a/PyTorch/examples/computer_vision/hello_world/mnist.py
+++ b/PyTorch/examples/computer_vision/hello_world/mnist.py
@@ -18,9 +18,6 @@
 # todo: [SW-165872] revert below W/A when PR 113374 included in pytorch fork
 torch._dynamo.config.optimize_ddp = False
 
-def is_lazy():
-    return os.getenv("PT_HPU_LAZY_MODE", "1") != "0"
-
 
 class Net(nn.Module):
     def __init__(self, use_autocast=False):
@@ -67,8 +64,6 @@ def train_function(data, target):
     for batch_idx, (data, target) in enumerate(train_loader):
         data, target = data.to(device), target.to(device)
         loss = train_function(data, target)
-        if is_lazy():
-            htcore.mark_step()
         if batch_idx % args.log_interval == 0:
             print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                 epoch, batch_idx *
@@ -183,9 +178,6 @@ def main():
 
     if args.use_torch_compile:
         assert int(torch.__version__.split('.')[
             0]) >= 2, "Graph mode is available only in PyTorch 2.x."
-        assert not is_lazy(), "Dynamo and lazy are mutually exclusive."
-        # Note: PT_HPU_LAZY_MODE=0 needs to be set before library is loaded,
-        # setting it here would be too late - hence assertion.
 
     utils.init_distributed_mode(args)
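
The removed `mark_step()` calls and the switch to `PT_HPU_LAZY_MODE=0` are two sides of the same migration, so a side-by-side sketch may help reviewers. This is a minimal illustration under stated assumptions, not code from the PR: the `Linear` stand-in model, the synthetic batch, and the `hpu_backend` string (the backend name Habana's documentation uses for `torch.compile` on Gaudi) are assumptions, while the `htcore.mark_step()` pattern and the Adadelta/`lr=1.0` settings come from the files above.

```python
# Run with PT_HPU_LAZY_MODE=0 in the environment; per the note removed from
# mnist.py, the variable must be set before the Habana libraries are imported.
import torch
import torch.nn.functional as F
import habana_frameworks.torch.core as htcore  # same bridge example.py imports

device = torch.device("hpu")
model = torch.nn.Linear(784, 10).to(device)   # hypothetical stand-in for Net
optimizer = torch.optim.Adadelta(model.parameters(), lr=1.0)

# Pattern this PR removes (lazy mode, PT_HPU_LAZY_MODE=1): ops accumulate into
# a graph and only execute when htcore.mark_step() is reached, so the training
# loop had to flush explicitly after backward() and after the optimizer update.
def lazy_train_step(data, target):
    optimizer.zero_grad()
    loss = F.nll_loss(F.log_softmax(model(data), dim=-1), target)
    loss.backward()
    htcore.mark_step()   # flush the backward graph
    optimizer.step()
    htcore.mark_step()   # flush the optimizer update
    return loss

# Pattern the examples standardize on (eager mode, PT_HPU_LAZY_MODE=0): ops run
# as they are issued and no mark_step() is needed; --use-torch-compile recovers
# graph-level performance by wrapping the step in torch.compile() (PyTorch 2.x).
def train_step(data, target):
    optimizer.zero_grad()
    loss = F.nll_loss(F.log_softmax(model(data), dim=-1), target)
    loss.backward()
    optimizer.step()
    return loss

compiled_step = torch.compile(train_step, backend="hpu_backend")  # backend name assumed

data = torch.randn(64, 784, device=device)       # synthetic stand-in batch
target = torch.randint(0, 10, (64,), device=device)
print(compiled_step(data, target).item())
```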