Linear/Dense performance for PyTorch vs JAX (flax/stax) #8497
-
I have quite a lot of training code where JAX (in combination with flax) is faster than PyTorch. However, the code below is an example where JAX is much slower:

```python
import functools
import time
from typing import Any
import flax.linen as jnn
import jax
import jax.numpy as jnp
import jax.experimental.stax as stax
import numpy as np
import torch
import torch.nn as tnn
Array = Any

class TorchLinear(tnn.Module):

    def __init__(self, layer_size):
        super().__init__()
        self.ff = tnn.Linear(layer_size, layer_size, bias=False)

    def forward(self, input_):
        output = self.ff(input_)
        return output


class JaxLinear(jnn.Module):
    layer_size: int

    @jnn.compact
    def __call__(self, input_: Array) -> Array:
        output = jnn.Dense(self.layer_size, use_bias=False)(input_)
        return output


def benchmark_torch_linear(inputs, layer_size, device, label):
    with torch.no_grad():
        # create model
        model = TorchLinear(layer_size)
        model.to(device)
        # run
        start = time.time()
        outputs = []
        for input_ in inputs:
            output = model(input_)
            outputs.append(output)
        outputs = torch.stack(outputs)
        end = time.time()
        duration = end - start
        print(f"{label}: {duration}")
        return duration


def benchmark_jax_linear(inputs, layer_size, device, label):
    # create model
    rng = jax.random.PRNGKey(0)
    model = JaxLinear(layer_size)

    @jax.jit
    def init(*args):
        return model.init(*args)

    input_shape = (batch_size, layer_size)
    params = init(rng, jnp.ones(input_shape))

    @functools.partial(jax.jit, static_argnums=(0,))
    def step(model, params, input_):
        output = model.apply(params, input_)
        return output

    # run
    start = time.time()
    outputs = []
    for input_ in inputs:
        output = step(model, params, input_)
        outputs.append(output)
    outputs = jnp.stack(outputs)
    outputs[0].block_until_ready()
    end = time.time()
    duration = end - start
    print(f"{label}: {duration}")
    return duration


def benchmark_stax_linear(inputs, layer_size, device, label):
    # create model
    rng = jax.random.PRNGKey(0)
    init_fn, apply_fn = stax.Dense(layer_size)  # TODO: has to have bias
    input_shape = (batch_size, layer_size)
    _, params = init_fn(rng, input_shape)

    @functools.partial(jax.jit, static_argnums=(0,))
    def step(apply_fn, params, input_):
        output = apply_fn(params, input_)
        return output

    # run
    start = time.time()
    outputs = []
    for input_ in inputs:
        output = step(apply_fn, params, input_)
        outputs.append(output)
    outputs = jnp.stack(outputs)
    outputs[0].block_until_ready()
    end = time.time()
    duration = end - start
    print(f"{label}: {duration}")
    return duration


if __name__ == "__main__":
    # benchmark
    runs = 10
    batch_size = 32
    seq_len = 100
    layer_size = 250
    device = torch.device("cuda")
    # input data
    input_torch = (torch.rand(seq_len, batch_size, layer_size, device=device) < 0.2).float().contiguous()
    input_np = np.asarray(input_torch.cpu(), dtype=jnp.float32)
    input_jnp = jnp.asarray(input_torch.cpu())
    # pytorch / loop outside module
    for _ in range(runs):
        benchmark_torch_linear(input_torch, layer_size, device, "torch, loop outside")
    # jax / np input / loop outside module
    for _ in range(runs):
        benchmark_jax_linear(input_np, layer_size, device, "jax, np input, loop outside")
    # stax / np input / loop outside module
    for _ in range(runs):
        benchmark_stax_linear(input_np, layer_size, device, "stax, np input, loop outside")
```

The output of the above, when run on a laptop GPU, is:

```
torch, loop outside: 0.004362821578979492
torch, loop outside: 0.002480745315551758
torch, loop outside: 0.002397775650024414
torch, loop outside: 0.0025179386138916016
torch, loop outside: 0.002394437789916992
torch, loop outside: 0.002406597137451172
torch, loop outside: 0.0026977062225341797
torch, loop outside: 0.002962827682495117
torch, loop outside: 0.0031156539916992188
torch, loop outside: 0.0030014514923095703
jax, np input, loop outside: 0.8470847606658936
jax, np input, loop outside: 0.0435943603515625
jax, np input, loop outside: 0.043665170669555664
jax, np input, loop outside: 0.042426347732543945
jax, np input, loop outside: 0.04245901107788086
jax, np input, loop outside: 0.045491933822631836
jax, np input, loop outside: 0.04279947280883789
jax, np input, loop outside: 0.042189836502075195
jax, np input, loop outside: 0.042439937591552734
jax, np input, loop outside: 0.04413032531738281
stax, np input, loop outside: 0.09287095069885254
stax, np input, loop outside: 0.03866934776306152
stax, np input, loop outside: 0.03732895851135254
stax, np input, loop outside: 0.03871488571166992
stax, np input, loop outside: 0.03256487846374512
stax, np input, loop outside: 0.03566265106201172
stax, np input, loop outside: 0.03671145439147949
stax, np input, loop outside: 0.03417348861694336
stax, np input, loop outside: 0.03485298156738281
stax, np input, loop outside: 0.03808856010437012
```

As you can see, the difference for feeding a sequence through a simple linear layer is substantial.
-
For a proper comparison you should pre-heat the jit: call the jitted function once before starting the timer, so that you don't profile jit time. For a fair comparison you should also feed `jnp.array`s and not numpy arrays to jax.
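
As a concrete illustration, here is a minimal sketch (not code from this thread) of how the flax benchmark above could incorporate both suggestions: the inputs are converted with `jnp.asarray` up front, and the jitted step function is called once before the timer starts so that compilation is excluded from the measurement. It reuses the `JaxLinear` module defined in the original post; the name `benchmark_jax_linear_warm` is made up for illustration.

```python
# Sketch only, assuming the JaxLinear module from the original post.
import time

import jax
import jax.numpy as jnp


def benchmark_jax_linear_warm(inputs, layer_size, label):
    rng = jax.random.PRNGKey(0)
    model = JaxLinear(layer_size)  # defined in the original post
    # inputs has shape (seq_len, batch_size, layer_size)
    params = model.init(rng, jnp.ones(inputs.shape[1:]))

    @jax.jit
    def step(params, input_):
        return model.apply(params, input_)

    # feed jnp arrays, not numpy arrays
    inputs = jnp.asarray(inputs)
    # pre-heat the jit: trigger compilation before the timer starts
    step(params, inputs[0]).block_until_ready()

    start = time.time()
    outputs = []
    for input_ in inputs:
        outputs.append(step(params, input_))
    outputs = jnp.stack(outputs)
    outputs.block_until_ready()
    duration = time.time() - start
    print(f"{label}: {duration}")
    return duration
```

Used in place of `benchmark_jax_linear`, the measured duration should then cover only the per-step dispatch and matmul work, not the one-off compilation.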