From 4c58e37579ba2db865cf52b8486501822aa0c516 Mon Sep 17 00:00:00 2001 From: ASEM000 Date: Wed, 19 Jul 2023 19:32:19 +0900 Subject: [PATCH] add batch norm , norm layers init func additions add batch norm , norm layers init func additions - add `tree_evaluation` to return a tree with layers modified to operate during eval. also enable `tree_evaluation.def_evaluation` to determine the evaluation rule by dispatch. - add `tree_state` to return a tree with state corresponds to layer (if exists), and defines `tree_state.def_state` to define state rules for layers by dispatch. --- .github/workflows/pypi.yml | 13 +- .github/workflows/tests.yml | 4 +- serket/__init__.py | 8 +- serket/nn/__init__.py | 14 +- serket/nn/contrast.py | 7 +- serket/nn/convolution.py | 67 ++++----- serket/nn/dropout.py | 11 +- serket/nn/evaluation.py | 58 ++++++++ serket/nn/fft_convolution.py | 38 +++-- serket/nn/initialization.py | 4 +- serket/nn/linear.py | 65 ++------- serket/nn/normalization.py | 260 +++++++++++++++++++++++++++-------- serket/nn/recurrent.py | 198 ++++++++++++++++++-------- serket/nn/state.py | 91 ++++++++++++ serket/nn/utils.py | 11 +- tests/test_linear.py | 18 --- tests/test_normalization.py | 53 +++++-- tests/test_rnn.py | 43 +++--- tests/test_utils.py | 2 +- 19 files changed, 643 insertions(+), 322 deletions(-) create mode 100644 serket/nn/evaluation.py create mode 100644 serket/nn/state.py diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index 8e9e36f..7a52828 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -1,9 +1,5 @@ name: pypi - -on: - release: - types: [created] - +on: workflow_dispatch jobs: deploy: runs-on: ubuntu-latest @@ -16,11 +12,12 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install setuptools wheel twine + pip install build + pip install twine - name: Build and publish env: TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} run: | - python setup.py sdist bdist_wheel - twine upload dist/* + python -m build + python -m twine upload dist/* diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index a37e82e..85f104b 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -22,8 +22,8 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install git+https://github.com/ASEM000/PyTreeClass - python -m pip install tensorflow + python -m pip install pytreeclass>=0.4.0 + python -m pip install keras_core>=0.1.1 python -m pip install pytest wheel optax jaxlib coverage kernex - name: Pytest Check run: | diff --git a/serket/__init__.py b/serket/__init__.py index e3ca36b..8d7cffb 100644 --- a/serket/__init__.py +++ b/serket/__init__.py @@ -27,6 +27,7 @@ is_tree_equal, tree_diagram, tree_flatten_with_trace, + tree_graph, tree_leaves_with_trace, tree_map_with_trace, tree_mask, @@ -40,6 +41,8 @@ ) from . 
import nn +from .nn.evaluation import tree_evaluation +from .nn.state import tree_state __all__ = ( # general utils @@ -49,6 +52,7 @@ "fields", # pprint utils "tree_diagram", + "tree_graph", "tree_mermaid", "tree_repr", "tree_str", @@ -72,7 +76,9 @@ "Partial", # serket "nn", + "tree_evaluation", + "tree_state", ) -__version__ = "0.2.0b7" +__version__ = "0.4.0b1" diff --git a/serket/nn/__init__.py b/serket/nn/__init__.py index 6330869..d685f35 100644 --- a/serket/nn/__init__.py +++ b/serket/nn/__init__.py @@ -86,16 +86,8 @@ from .flatten import Flatten, Unflatten from .flip import FlipLeftRight2D, FlipUpDown2D from .fully_connected import FNN, MLP -from .linear import ( - Bilinear, - Embedding, - GeneralLinear, - Identity, - Linear, - MergeLinear, - Multilinear, -) -from .normalization import GroupNorm, InstanceNorm, LayerNorm +from .linear import Bilinear, Embedding, GeneralLinear, Identity, Linear, Multilinear +from .normalization import BatchNorm, GroupNorm, InstanceNorm, LayerNorm from .padding import Pad1D, Pad2D, Pad3D from .pooling import ( AdaptiveAvgPool1D, @@ -149,7 +141,6 @@ "Multilinear", "GeneralLinear", "Embedding", - "MergeLinear", # Dropout "Dropout", "Dropout1D", @@ -215,6 +206,7 @@ "LayerNorm", "InstanceNorm", "GroupNorm", + "BatchNorm", # Blur "AvgBlur2D", "GaussianBlur2D", diff --git a/serket/nn/contrast.py b/serket/nn/contrast.py index 0e9238c..d96af36 100644 --- a/serket/nn/contrast.py +++ b/serket/nn/contrast.py @@ -95,9 +95,10 @@ def __init__(self, contrast_range=(0.5, 1)): and len(contrast_range) == 2 and contrast_range[0] <= contrast_range[1] ): - msg = "contrast_range must be a tuple of two floats, " - msg += "with the first one smaller than the second one." - raise ValueError(msg) + raise ValueError( + "`contrast_range` must be a tuple of two floats, " + "with the first one smaller than the second one." 
+ ) self.contrast_range = contrast_range diff --git a/serket/nn/convolution.py b/serket/nn/convolution.py index 9aba873..3c50438 100644 --- a/serket/nn/convolution.py +++ b/serket/nn/convolution.py @@ -81,24 +81,20 @@ def __init__( self.spatial_ndim, name="kernel_dilation", ) - self.weight_init_func = resolve_init_func(weight_init_func) - self.bias_init_func = resolve_init_func(bias_init_func) + + weight_init_func = resolve_init_func(weight_init_func) + bias_init_func = resolve_init_func(bias_init_func) + self.groups = positive_int_cb(groups) if self.out_features % self.groups != 0: - raise ValueError( - f"Expected out_features % groups == 0, \n" - f"got {self.out_features % self.groups}" - ) + raise ValueError(f"{(out_features % groups == 0)=}") weight_shape = (out_features, in_features // groups, *self.kernel_size) - self.weight = self.weight_init_func(key, weight_shape) + self.weight = weight_init_func(key, weight_shape) - if bias_init_func is None: - self.bias = None - else: - bias_shape = (out_features, *(1,) * self.spatial_ndim) - self.bias = self.bias_init_func(key, bias_shape) + bias_shape = (out_features, *(1,) * self.spatial_ndim) + self.bias = bias_init_func(key, bias_shape) @ft.partial(validate_spatial_ndim, attribute_name="spatial_ndim") @ft.partial(validate_axis_shape, attribute_name="in_features", axis=0) @@ -432,24 +428,18 @@ def __init__( self.spatial_ndim, name="kernel_dilation", ) - self.weight_init_func = resolve_init_func(weight_init_func) - self.bias_init_func = resolve_init_func(bias_init_func) + weight_init_func = resolve_init_func(weight_init_func) + bias_init_func = resolve_init_func(bias_init_func) self.groups = positive_int_cb(groups) if self.out_features % self.groups != 0: - raise ValueError( - "Expected out_features % groups == 0," - f"got {self.out_features % self.groups}" - ) + raise ValueError(f"{(self.out_features % self.groups ==0)=}") weight_shape = (out_features, in_features // groups, *self.kernel_size) # OIHW - self.weight = self.weight_init_func(key, weight_shape) + self.weight = weight_init_func(key, weight_shape) - if bias_init_func is None: - self.bias = None - else: - bias_shape = (out_features, *(1,) * self.spatial_ndim) - self.bias = self.bias_init_func(key, bias_shape) + bias_shape = (out_features, *(1,) * self.spatial_ndim) + self.bias = bias_init_func(key, bias_shape) @ft.partial(validate_spatial_ndim, attribute_name="spatial_ndim") @ft.partial(validate_axis_shape, attribute_name="in_features", axis=0) @@ -774,19 +764,18 @@ def __init__( self.padding = padding # delayed canonicalization self.input_dilation = canonicalize(1, self.spatial_ndim, name="input_dilation") self.kernel_dilation = canonicalize( - 1, self.spatial_ndim, name="kernel_dilation" + 1, + self.spatial_ndim, + name="kernel_dilation", ) - self.weight_init_func = resolve_init_func(weight_init_func) - self.bias_init_func = resolve_init_func(bias_init_func) + weight_init_func = resolve_init_func(weight_init_func) + bias_init_func = resolve_init_func(bias_init_func) weight_shape = (depth_multiplier * in_features, 1, *self.kernel_size) # OIHW - self.weight = self.weight_init_func(key, weight_shape) + self.weight = weight_init_func(key, weight_shape) - if bias_init_func is None: - self.bias = None - else: - bias_shape = (depth_multiplier * in_features, *(1,) * self.spatial_ndim) - self.bias = self.bias_init_func(key, bias_shape) + bias_shape = (depth_multiplier * in_features, *(1,) * self.spatial_ndim) + self.bias = bias_init_func(key, bias_shape) @ft.partial(validate_spatial_ndim, 
attribute_name="spatial_ndim") @ft.partial(validate_axis_shape, attribute_name="in_features", axis=0) @@ -1359,8 +1348,8 @@ def __init__( self.spatial_ndim, name="kernel_dilation", ) - self.weight_init_func = resolve_init_func(weight_init_func) - self.bias_init_func = resolve_init_func(bias_init_func) + weight_init_func = resolve_init_func(weight_init_func) + bias_init_func = resolve_init_func(bias_init_func) out_size = calculate_convolution_output_shape( shape=self.in_size, @@ -1376,14 +1365,10 @@ def __init__( *out_size, ) - self.weight = self.weight_init_func(key, weight_shape) + self.weight = weight_init_func(key, weight_shape) bias_shape = (self.out_features, *out_size) - - if bias_init_func is None: - self.bias = None - else: - self.bias = self.bias_init_func(key, bias_shape) + self.bias = bias_init_func(key, bias_shape) @ft.partial(validate_spatial_ndim, attribute_name="spatial_ndim") @ft.partial(validate_axis_shape, attribute_name="in_features", axis=0) diff --git a/serket/nn/dropout.py b/serket/nn/dropout.py index ff40bfd..721e2eb 100644 --- a/serket/nn/dropout.py +++ b/serket/nn/dropout.py @@ -22,6 +22,8 @@ from jax import lax import serket as sk +from serket.nn.evaluation import tree_evaluation +from serket.nn.linear import Identity from serket.nn.utils import Range, validate_spatial_ndim @@ -38,7 +40,7 @@ class Dropout(sk.TreeClass): >>> import jax.numpy as jnp >>> layer = sk.nn.Dropout(0.5) >>> # change `p` to 0.0 to turn off dropout - >>> layer = layer.at["p"].set(0.0, is_leaf=pytc.is_frozen) + >>> layer = layer.at["p"].set(0.0, is_leaf=sk.is_frozen) Note: Use `p`= 0.0 to turn off dropout. @@ -157,3 +159,10 @@ def __init__(self, p: float = 0.5): @property def spatial_ndim(self) -> int: return 3 + + +@tree_evaluation.def_evalutation(Dropout) +@tree_evaluation.def_evalutation(DropoutND) +def dropout_evaluation(_): + # dropout is a no-op during evaluation + return Identity() diff --git a/serket/nn/evaluation.py b/serket/nn/evaluation.py new file mode 100644 index 0000000..a3206c4 --- /dev/null +++ b/serket/nn/evaluation.py @@ -0,0 +1,58 @@ +# Copyright 2023 Serket authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Define dispatchers for custom tree evaluation.""" + +from __future__ import annotations + +import functools as ft +from typing import Any, Callable, TypeVar + +import jax + +T = TypeVar("T") + + +def tree_evaluation(tree: T) -> T: + """Modify tree layers to disable any trainning related behavior. + + For example, `Dropout` layers drop probability is set to 0.0. and `BatchNorm` + layer `track_running_stats` is set to False when evaluating the tree. + + Args: + tree: A tree of layers. + + Returns: + A tree of layers with evaluation behavior. 
+ + Example: + >>> # dropout is replaced by an identity layer in evaluation mode + >>> # by registering `tree_evaluation.def_evaluation(sk.nn.Dropout, sk.nn.Identity)` + >>> import jax.numpy as jnp + >>> import serket as sk + >>> layer = sk.nn.Dropout(0.5) + >>> sk.tree_evaluation(layer) + Identity() + """ + + def is_leaf(x: Callable[[Any], bool]) -> bool: + types = set(tree_evaluation.evaluation_dispatcher.registry.keys()) + types.discard(object) + return isinstance(x, tuple(types)) + + return jax.tree_map(tree_evaluation.evaluation_dispatcher, tree, is_leaf=is_leaf) + + +tree_evaluation.evaluation_dispatcher = ft.singledispatch(lambda x: x) +tree_evaluation.def_evalutation = tree_evaluation.evaluation_dispatcher.register diff --git a/serket/nn/fft_convolution.py b/serket/nn/fft_convolution.py index c49b12f..6fc15c5 100644 --- a/serket/nn/fft_convolution.py +++ b/serket/nn/fft_convolution.py @@ -179,8 +179,8 @@ def __init__( self.spatial_ndim, name="kernel_dilation", ) - self.weight_init_func = resolve_init_func(weight_init_func) - self.bias_init_func = resolve_init_func(bias_init_func) + weight_init_func = resolve_init_func(weight_init_func) + bias_init_func = resolve_init_func(bias_init_func) self.groups = positive_int_cb(groups) if self.out_features % self.groups != 0: @@ -188,13 +188,10 @@ def __init__( raise ValueError(msg) weight_shape = (out_features, in_features // groups, *self.kernel_size) - self.weight = self.weight_init_func(key, weight_shape) + self.weight = weight_init_func(key, weight_shape) - if bias_init_func is None: - self.bias = None - else: - bias_shape = (out_features, *(1,) * self.spatial_ndim) - self.bias = self.bias_init_func(key, bias_shape) + bias_shape = (out_features, *(1,) * self.spatial_ndim) + self.bias = bias_init_func(key, bias_shape) @ft.partial(validate_spatial_ndim, attribute_name="spatial_ndim") @ft.partial(validate_axis_shape, attribute_name="in_features", axis=0) @@ -513,8 +510,8 @@ def __init__( self.spatial_ndim, name="kernel_dilation", ) - self.weight_init_func = resolve_init_func(weight_init_func) - self.bias_init_func = resolve_init_func(bias_init_func) + weight_init_func = resolve_init_func(weight_init_func) + bias_init_func = resolve_init_func(bias_init_func) self.groups = positive_int_cb(groups) if self.in_features % self.groups != 0: @@ -524,13 +521,13 @@ def __init__( ) weight_shape = (out_features, in_features // groups, *self.kernel_size) # OIHW - self.weight = self.weight_init_func(key, weight_shape) + self.weight = weight_init_func(key, weight_shape) if bias_init_func is None: self.bias = None else: bias_shape = (out_features, *(1,) * self.spatial_ndim) - self.bias = self.bias_init_func(key, bias_shape) + self.bias = bias_init_func(key, bias_shape) @ft.partial(validate_spatial_ndim, attribute_name="spatial_ndim") @ft.partial(validate_axis_shape, attribute_name="in_features", axis=0) @@ -857,19 +854,18 @@ def __init__( self.padding = padding self.input_dilation = canonicalize(1, self.spatial_ndim, name="input_dilation") self.kernel_dilation = canonicalize( - 1, self.spatial_ndim, name="kernel_dilation" + 1, + self.spatial_ndim, + name="kernel_dilation", ) - self.weight_init_func = resolve_init_func(weight_init_func) - self.bias_init_func = resolve_init_func(bias_init_func) + weight_init_func = resolve_init_func(weight_init_func) + bias_init_func = resolve_init_func(bias_init_func) weight_shape = (depth_multiplier * in_features, 1, *self.kernel_size) # OIHW - self.weight = self.weight_init_func(key, weight_shape) + self.weight = 
weight_init_func(key, weight_shape) - if bias_init_func is None: - self.bias = None - else: - bias_shape = (depth_multiplier * in_features, *(1,) * self.spatial_ndim) - self.bias = self.bias_init_func(key, bias_shape) + bias_shape = (depth_multiplier * in_features, *(1,) * self.spatial_ndim) + self.bias = bias_init_func(key, bias_shape) @ft.partial(validate_spatial_ndim, attribute_name="spatial_ndim") @ft.partial(validate_axis_shape, attribute_name="in_features", axis=0) diff --git a/serket/nn/initialization.py b/serket/nn/initialization.py index ff838d7..2f91741 100644 --- a/serket/nn/initialization.py +++ b/serket/nn/initialization.py @@ -62,7 +62,7 @@ init_map: dict[str, InitType] = dict(zip(get_args(InitLiteral), inits)) -def resolve_init_func(init_func: str | Callable) -> Callable: +def resolve_init_func(init_func: str | InitFuncType) -> Callable: if isinstance(init_func, FunctionType): return jtu.Partial(init_func) @@ -74,6 +74,6 @@ def resolve_init_func(init_func: str | Callable) -> Callable: raise ValueError(f"value must be one of ({', '.join(init_map.keys())})") if init_func is None: - return None + return jtu.Partial(lambda key, shape, dtype=None: None) raise ValueError("Value must be a string or a function.") diff --git a/serket/nn/linear.py b/serket/nn/linear.py index 5aeea2f..8deb872 100644 --- a/serket/nn/linear.py +++ b/serket/nn/linear.py @@ -110,16 +110,14 @@ def __init__( self.in_features = in_features self.out_features = out_features - self.weight_init_func = resolve_init_func(weight_init_func) - self.bias_init_func = resolve_init_func(bias_init_func) + weight_init_func = resolve_init_func(weight_init_func) + bias_init_func = resolve_init_func(bias_init_func) weight_shape = (*self.in_features, out_features) - self.weight = self.weight_init_func(key, weight_shape) + self.weight = weight_init_func(key, weight_shape) self.bias = ( - None - if bias_init_func is None - else self.bias_init_func(key, (out_features,)) + None if bias_init_func is None else bias_init_func(key, (out_features,)) ) def __call__(self, *x, **k) -> jax.Array: @@ -239,15 +237,10 @@ def __init__( f"got {len(in_axes)=} and {len(in_features)=}" ) - self.weight_init_func = resolve_init_func(weight_init_func) - self.bias_init_func = resolve_init_func(bias_init_func) - self.weight = self.weight_init_func(key, (*self.in_features, self.out_features)) - - self.bias = ( - None - if self.bias_init_func is None - else self.bias_init_func(key, (self.out_features,)) - ) + weight_init_func = resolve_init_func(weight_init_func) + bias_init_func = resolve_init_func(bias_init_func) + self.weight = weight_init_func(key, (*self.in_features, self.out_features)) + self.bias = bias_init_func(key, (self.out_features,)) def __call__(self, x: jax.Array, **k) -> jax.Array: # ensure negative axes @@ -305,45 +298,3 @@ def __call__(self, x: jax.Array, **k) -> jax.Array: raise TypeError("Input must be an integer array.") return jnp.take(self.weight, x, axis=0) - - -class MergeLinear(sk.TreeClass): - """Merge multiple linear layers with the same `out_features`. 
- - Args: - layers: linear layers to merge - - Example: - >>> import serket as sk - >>> import numpy.testing as npt - >>> layer1 = sk.nn.Linear(5, 6) # 5 input features, 6 output features - >>> layer2 = sk.nn.Linear(7, 6) # 7 input features, 6 output features - >>> merged_layer = sk.nn.MergeLinear(layer1, layer2) # 12 input features, 6 output features - >>> x1 = jnp.ones([1, 5]) # 1 sample, 5 features - >>> x2 = jnp.ones([1, 7]) # 1 sample, 7 features - >>> y = merged_layer(x1, x2) # one matrix multiplication - >>> z = layer1(x1) + layer2(x2) # two matrix multiplications - >>> npt.assert_allclose(y, z, atol=1e-6) - - Note: - Use this layer to reduce the matrix multiplication operations in the forward pass. - """ - - def __init__(self, *layers: tuple[Linear, ...]): - out_dim0 = layers[0].out_features - if not all(isinstance(layer, Linear) for layer in layers): - raise TypeError("All layers must be instances of Linear.") - - for layer in layers[1:]: - if layer.out_features != out_dim0: - raise ValueError( - "All layers must have the same output dimension." - f" Got {out_dim0} and {layer.out_features}" - ) - - self.weight = jnp.concatenate([L.weight for L in layers], axis=0) - self.bias = sum([L.bias for L in layers if L.bias_init_func]) - - def __call__(self, *xs: tuple[jax.Array, ...], **k) -> jax.Array: - xs = jnp.concatenate(xs, axis=-1) - return xs @ self.weight + self.bias diff --git a/serket/nn/normalization.py b/serket/nn/normalization.py index 69cecd3..00f80ca 100644 --- a/serket/nn/normalization.py +++ b/serket/nn/normalization.py @@ -14,14 +14,16 @@ from __future__ import annotations -from typing import NamedTuple - import jax import jax.numpy as jnp +import jax.random as jr from jax.custom_batching import custom_vmap import serket as sk -from serket.nn.utils import IsInstance, Range, ScalarLike, positive_int_cb +from serket.nn.evaluation import tree_evaluation +from serket.nn.initialization import InitType, resolve_init_func +from serket.nn.state import tree_state +from serket.nn.utils import Range, ScalarLike, positive_int_cb def layer_norm( @@ -49,8 +51,12 @@ def layer_norm( σ_2 = jnp.var(x, axis=dims, keepdims=True) x̂ = (x - μ) * jax.lax.rsqrt((σ_2 + eps)) - if gamma is not None and beta is not None: - return x̂ * gamma + beta + if gamma is not None: + x̂ = x̂ * gamma + + if beta is not None: + x̂ = x̂ + beta + return x̂ @@ -80,10 +86,13 @@ def group_norm( x̂ = (xx - μ) * jax.lax.rsqrt((σ_2 + eps)) x̂ = x̂.reshape(*x.shape) - if gamma is not None and beta is not None: + if gamma is not None: gamma = jnp.expand_dims(gamma, axis=range(1, x.ndim)) + x̂ *= gamma + + if beta is not None: beta = jnp.expand_dims(beta, axis=range(1, x.ndim)) - x̂ = x̂ * gamma + beta + x̂ += beta return x̂ @@ -95,17 +104,23 @@ class LayerNorm(sk.TreeClass): Args: normalized_shape: the shape of the input to be normalized. eps: a value added to the denominator for numerical stability. - affine: a boolean value that when set to True, this module has learnable affine parameters. + gamma_init_func: a function to initialize the scale. Defaults to ones. + if None, the scale is not trainable. + beta_init_func: a function to initialize the shift. Defaults to zeros. + if None, the shift is not trainable. + key: a random key for initialization. Defaults to jax.random.PRNGKey(0). 
""" - eps: float = sk.field(callbacks=[Range(0), ScalarLike()]) + eps: float = sk.field(callbacks=[Range(0, min_inclusive=False), ScalarLike()]) def __init__( self, normalized_shape: int | tuple[int, ...], *, eps: float = 1e-5, - affine: bool = True, + gamma_init_func: InitType = "ones", + beta_init_func: InitType = "zeros", + key: jr.KeyArray = jr.PRNGKey(0), ): self.normalized_shape = ( normalized_shape @@ -113,11 +128,8 @@ def __init__( else (normalized_shape,) ) self.eps = eps - self.affine = affine - - # make gamma and beta trainable - self.gamma = jnp.ones(normalized_shape) if self.affine else None - self.beta = jnp.zeros(normalized_shape) if self.affine else None + self.gamma = resolve_init_func(gamma_init_func)(key, self.normalized_shape) + self.beta = resolve_init_func(beta_init_func)(key, self.normalized_shape) def __call__(self, x: jax.Array, **kwargs) -> jax.Array: return layer_norm( @@ -138,7 +150,11 @@ class GroupNorm(sk.TreeClass): in_features : the shape of the input to be normalized. groups : number of groups to separate the channels into. eps : a value added to the denominator for numerical stability. - affine : a boolean value that when set to True, this module has learnable affine parameters. + gamma_init_func: a function to initialize the scale. Defaults to ones. + if None, the scale is not trainable. + beta_init_func: a function to initialize the shift. Defaults to zeros. + if None, the shift is not trainable. + key: a random key for initialization. Defaults to jax.random.PRNGKey(0). """ eps: float = sk.field(callbacks=[Range(0), ScalarLike()]) @@ -149,21 +165,20 @@ def __init__( *, groups: int, eps: float = 1e-5, - affine: bool = True, + gamma_init_func: InitType = "ones", + beta_init_func: InitType = "zeros", + key: jr.KeyArray = jr.PRNGKey(0), ): self.in_features = positive_int_cb(in_features) self.groups = positive_int_cb(groups) - self.affine = affine self.eps = eps # needs more info for checking if in_features % groups != 0: - msg = f"in_features must be divisible by groups. Got {in_features} and {groups}" - raise ValueError(msg) + raise ValueError(f"{in_features} must be divisible by {groups=}.") - # make gamma and beta trainable - self.gamma = jnp.ones(self.in_features) if self.affine else None - self.beta = jnp.zeros(self.in_features) if self.affine else None + self.gamma = resolve_init_func(gamma_init_func)(key, (in_features,)) + self.beta = resolve_init_func(beta_init_func)(key, (in_features,)) def __call__(self, x: jax.Array, **k) -> jax.Array: return group_norm( @@ -183,7 +198,11 @@ class InstanceNorm(GroupNorm): Args: in_features : the shape of the input to be normalized. eps : a value added to the denominator for numerical stability. - affine : a boolean value that when set to True, this module has learnable affine parameters. + gamma_init_func: a function to initialize the scale. Defaults to ones. + if None, the scale is not trainable. + beta_init_func: a function to initialize the shift. Defaults to zeros. + if None, the shift is not trainable. + key: a random key for initialization. Defaults to jax.random.PRNGKey(0). 
""" def __init__( @@ -191,33 +210,86 @@ def __init__( in_features: int, *, eps: float = 1e-5, - affine: bool = True, + gamma_init_func: InitType = "ones", + beta_init_func: InitType = "zeros", + key: jr.KeyArray = jr.PRNGKey(0), ): super().__init__( in_features=in_features, groups=in_features, eps=eps, - affine=affine, + gamma_init_func=gamma_init_func, + beta_init_func=beta_init_func, + key=key, ) -class BatchNormState(NamedTuple): +class BatchNormState(sk.TreeClass): running_mean: jax.Array running_var: jax.Array +def _batchnorm_impl( + x: jax.Array, + state: BatchNormState, + momentum: float = 0.1, + eps: float = 1e-3, + gamma: jax.Array = None, + beta: jax.Array = None, + evalution: bool = False, + axis: int = 1, +): + # reduce over axis=1 + broadcast_shape = [1] * x.ndim + broadcast_shape[axis] = x.shape[axis] + + def bn_eval_step(x, state): + run_mean, run_var = state.running_mean, state.running_var + run_mean = jnp.reshape(run_mean, broadcast_shape) + run_var = jnp.reshape(run_var, broadcast_shape) + output = (x - run_mean) / jnp.sqrt(run_var + eps) + + return output, state + + def bn_train_step(x, state): + # maybe support axes option + run_mean, run_var = state.running_mean, state.running_var + axes = list(range(x.ndim)) + with jax.ensure_compile_time_eval(): + del axes[axis] + batch_mean = jnp.mean(x, axis=axes, keepdims=True) + batch_var = jnp.mean(jnp.square(x), axis=axes, keepdims=True) - batch_mean**2 + output = (x - batch_mean) / jnp.sqrt(batch_var + eps) + run_mean = momentum * run_mean + (1 - momentum) * jnp.squeeze(batch_mean) + run_var = momentum * run_var + (1 - momentum) * jnp.squeeze(batch_var) + return output, BatchNormState(run_mean, run_var) + + output, state = jax.lax.cond(evalution, bn_eval_step, bn_train_step, x, state) + + state = jax.lax.stop_gradient(state) + + if gamma is not None: + output *= jnp.reshape(gamma, broadcast_shape) + + if beta is not None: + output += jnp.reshape(beta, broadcast_shape) + + return output, state + + @custom_vmap def batchnorm( x: jax.Array, - state: tuple[jax.Array, jax.Array], - *, + state: BatchNormState, momentum: float = 0.1, eps: float = 1e-5, gamma: jax.Array | None = None, beta: jax.Array | None = None, - track_running_stats: bool = False, -): - del momentum, eps, gamma, beta, track_running_stats + evaluation: bool = False, + axis: int = 1, +) -> tuple[jax.Array, BatchNormState]: + del momentum, eps, gamma, beta, evaluation, axis + # no-op when unbatched return x, state @@ -226,40 +298,106 @@ def _( axis_size, in_batched, x: jax.Array, - state: tuple[jax.Array, jax.Array], - *, - momentum: float = 0.1, - eps: float = 1e-5, - track_running_stats: bool = True, -): - run_mean, run_var = state + state: BatchNormState, + momentum: float = 0.99, + eps: float = 1e-3, + gamma: jax.Array | None = None, + beta: jax.Array | None = None, + evaluation: bool = True, + axis: int = 1, +) -> tuple[jax.Array, BatchNormState]: + output = _batchnorm_impl( + x=x, + state=state, + momentum=momentum, + eps=eps, + gamma=gamma, + beta=beta, + evalution=evaluation, + axis=axis, + ) + return output, (True, BatchNormState(True, True)) - axes = [0] + list(range(2, x.ndim)) - batch_mean, batch_var = jnp.mean(x, axis=axes), jnp.var(x, axis=axes) +class BatchNorm(sk.TreeClass): + """Applies normalization over batched inputs` - run_mean = jnp.where( - track_running_stats, - (1 - momentum) * run_mean + momentum * batch_mean, - batch_mean, - ) + Works under ``jax.vmap(BatchNorm(...), in_axes=(0, None))``, otherwise will be a no-op. 
- run_var = jnp.where( - track_running_stats, - (1 - momentum) * run_var + momentum * batch_var * (axis_size / (axis_size - 1)), - batch_var, - ) - x_normalized = (x - batch_mean) * jax.lax.rsqrt(batch_var + eps) - return (x_normalized, (run_mean, run_var)), (True, (True, True)) + Evaluation behavior: + ``output = (x - running_mean) / sqrt(running_var + eps)`` + Training behavior: + ``output = (x - batch_mean) / sqrt(batch_var + eps)`` + ``running_mean = momentum * running_mean + (1 - momentum) * batch_mean`` + ``running_var = momentum * running_var + (1 - momentum) * batch_var`` -class BatchNorm(sk.TreeClass): - in_features: int = sk.field(callbacks=[IsInstance(int), Range(1)]) - momentum: float = sk.field(callbacks=[Range(0, 1), ScalarLike()]) - eps: float = sk.field(callbacks=[Range(0), ScalarLike()]) - track_running_stats: bool = sk.field(callbacks=[IsInstance(bool)]) + Args: + in_features : the shape of the input to be normalized. + momentum : the value used for the ``running_mean`` and ``running_var`` + computation. must be a number between ``0`` and ``1``. + eps : a value added to the denominator for numerical stability. + gamma_init_func: a function to initialize the scale. Defaults to ones. + if None, the scale is not trainable. + beta_init_func: a function to initialize the shift. Defaults to zeros. + if None, the shift is not trainable. + axis: the axis that should be normalized. Defaults to 1. + evaluation : a boolean value that when set to True, this module will run in + evaluation mode. In this case, this module will always use the running + estimates of the batch statistics during training. + + Note: + https://keras.io/api/layers/normalization_layers/batch_normalization/ + """ - def __post_init__(self): - self.state = BatchNormState( - jnp.zeros(self.in_features), jnp.ones(self.in_features) + def __init__( + self, + in_features: int, + *, + momentum: float = 0.99, + eps: float = 1e-3, + gamma_init_func: InitType = "ones", + beta_init_func: InitType = "zeros", + axis: int = 1, + evaluation: bool = False, + key: jr.KeyArray = jr.PRNGKey(0), + ) -> None: + self.in_features = in_features + self.momentum = momentum + self.eps = eps + self.gamma = resolve_init_func(gamma_init_func)(key, (in_features,)) + self.beta = resolve_init_func(beta_init_func)(key, (in_features,)) + self.axis = axis + self.evaluation = evaluation + + def __call__( + self, + x: jax.Array, + state: BatchNormState | None = None, + **k, + ) -> jax.Array: + state = sk.tree_state(self) if state is None else state + + x, state = batchnorm( + x, + state, + self.momentum, + self.eps, + self.gamma, + self.beta, + self.evaluation, + self.axis, ) + return x, state + + +@tree_evaluation.def_evalutation(BatchNorm) +def _(batchnorm: BatchNorm) -> BatchNorm: + return batchnorm.at["evaluation"].set(True) + + +@tree_state.def_state(BatchNorm) +def batchnorm_init_state(batchnorm: BatchNorm, _) -> BatchNormState: + running_mean = jnp.zeros([batchnorm.in_features]) + running_var = jnp.ones([batchnorm.in_features]) + return BatchNormState(running_mean, running_var) diff --git a/serket/nn/recurrent.py b/serket/nn/recurrent.py index 84832a9..6d33cb2 100644 --- a/serket/nn/recurrent.py +++ b/serket/nn/recurrent.py @@ -25,6 +25,7 @@ import serket as sk from serket.nn.activation import ActivationType, resolve_activation from serket.nn.initialization import InitType +from serket.nn.state import tree_state from serket.nn.utils import ( DilationType, KernelSizeType, @@ -62,13 +63,6 @@ class RNNCell(sk.TreeClass): def __call__(self, x: 
jax.Array, state: RNNState, **k) -> RNNState: ... - @abc.abstractclassmethod - def init_state(self, spatial_shape: tuple[int, ...]) -> RNNState: - # return the initial state of the RNN for a given input - # for non-spatial RNNs, output shape is (hidden_features,) - # for spatial RNNs, output shape is (hidden_features, *spatial_shape) - ... - @property @abc.abstractclassmethod def spatial_ndim(self) -> int: @@ -104,8 +98,10 @@ def __init__( key: the key to use to initialize the weights Example: + >>> import serket as sk + >>> import jax.numpy as jnp >>> cell = SimpleRNNCell(10, 20) # 10-dimensional input, 20-dimensional hidden state - >>> rnn_state = cell.init_state() # 20-dimensional hidden state + >>> rnn_state = sk.tree_state(cell) # 20-dimensional hidden state >>> x = jnp.ones((10,)) # 10 features >>> result = cell(x, rnn_state) >>> result.hidden_state.shape # 20 features @@ -120,7 +116,7 @@ def __init__( self.hidden_features = positive_int_cb(hidden_features) self.act_func = resolve_activation(act_func) - in_to_hidden = sk.nn.Linear( + i2h = sk.nn.Linear( in_features, hidden_features, weight_init_func=weight_init_func, @@ -128,7 +124,7 @@ def __init__( key=k1, ) - hidden_to_hidden = sk.nn.Linear( + h2h = sk.nn.Linear( hidden_features, hidden_features, weight_init_func=recurrent_weight_init_func, @@ -136,7 +132,8 @@ def __init__( key=k2, ) - self.in_and_hidden_to_hidden = sk.nn.MergeLinear(in_to_hidden, hidden_to_hidden) + self.ih2h_weight = jnp.concatenate([i2h.weight, h2h.weight], axis=0) + self.ih2h_bias = i2h.bias @property def spatial_ndim(self) -> int: @@ -148,13 +145,14 @@ def __call__(self, x: jax.Array, state: SimpleRNNState, **k) -> SimpleRNNState: if not isinstance(state, SimpleRNNState): raise TypeError(f"Expected {state=} to be an instance of `SimpleRNNState`") - h = self.act_func(self.in_and_hidden_to_hidden(x, state.hidden_state)) - return SimpleRNNState(h) + ih = jnp.concatenate([x, state.hidden_state], axis=-1) + h = ih @ self.ih2h_weight + self.ih2h_bias + return SimpleRNNState(self.act_func(h)) - def init_state(self, spatial_dim: tuple[int, ...] = ()) -> SimpleRNNState: - del spatial_dim - shape = (self.hidden_features,) - return SimpleRNNState(jnp.zeros(shape)) + +@tree_state.def_state(SimpleRNNCell) +def simple_rnn_init_state(cell: SimpleRNNCell, _) -> SimpleRNNState: + return SimpleRNNState(jnp.zeros([cell.hidden_features])) class DenseState(RNNState): @@ -174,8 +172,10 @@ class DenseCell(RNNCell): key: the key to use to initialize the weights Example: + >>> import serket as sk + >>> import jax.numpy as jnp >>> cell = DenseCell(10, 20) # 10-dimensional input, 20-dimensional hidden state - >>> dummy_state = cell.init_state() # 20-dimensional hidden state + >>> dummy_state = sk.tree_state(cell) # 20-dimensional hidden state >>> x = jnp.ones((10,)) # 10 features >>> result = cell(x, dummy_state) >>> result.hidden_state.shape # 20 features @@ -217,10 +217,10 @@ def __call__(self, x: jax.Array, state: DenseState, **k) -> DenseState: h = self.act_func(self.in_to_hidden(x)) return DenseState(h) - def init_state(self, spatial_dim: tuple[int, ...] 
= ()) -> DenseState: - del spatial_dim - shape = (self.hidden_features,) - return DenseState(jnp.empty(shape)) # dummy state + +@tree_state.def_state(DenseCell) +def dense_init_state(cell: DenseCell, _) -> DenseState: + return DenseState(jnp.empty([cell.hidden_features])) class LSTMState(RNNState): @@ -264,7 +264,7 @@ def __init__( self.act_func = resolve_activation(act_func) self.recurrent_act_func = resolve_activation(recurrent_act_func) - in_to_hidden = sk.nn.Linear( + i2h = sk.nn.Linear( in_features, hidden_features * 4, weight_init_func=weight_init_func, @@ -272,7 +272,7 @@ def __init__( key=k1, ) - hidden_to_hidden = sk.nn.Linear( + h2h = sk.nn.Linear( hidden_features, hidden_features * 4, weight_init_func=recurrent_weight_init_func, @@ -280,7 +280,8 @@ def __init__( key=k2, ) - self.in_and_hidden_to_hidden = sk.nn.MergeLinear(in_to_hidden, hidden_to_hidden) + self.ih2h_weight = jnp.concatenate([i2h.weight, h2h.weight], axis=0) + self.ih2h_bias = i2h.bias @ft.partial(validate_spatial_ndim, attribute_name="spatial_ndim") @ft.partial(validate_axis_shape, attribute_name="in_features", axis=0) @@ -289,7 +290,8 @@ def __call__(self, x: jax.Array, state: LSTMState, **k) -> LSTMState: raise TypeError(f"Expected {state=} to be an instance of `LSTMState`") h, c = state.hidden_state, state.cell_state - h = self.in_and_hidden_to_hidden(x, h) + ih = jnp.concatenate([x, h], axis=-1) + h = ih @ self.ih2h_weight + self.ih2h_bias i, f, g, o = jnp.split(h, 4, axis=-1) i = self.recurrent_act_func(i) f = self.recurrent_act_func(f) @@ -309,6 +311,12 @@ def spatial_ndim(self) -> int: return 0 +@tree_state.def_state(LSTMCell) +def lstm_init_state(cell: LSTMCell, _) -> LSTMState: + shape = [cell.hidden_features] + return LSTMState(jnp.zeros(shape), jnp.zeros(shape)) + + class GRUState(RNNState): ... @@ -384,10 +392,10 @@ def __call__(self, x: jax.Array, state: GRUState, **k) -> GRUState: h = (1 - u) * o + u * h return GRUState(hidden_state=h) - def init_state(self, spatial_dim: tuple[int, ...]) -> GRUState: - del spatial_dim - shape = (self.hidden_features,) - return GRUState(jnp.zeros(shape, dtype=jnp.float32)) + +@tree_state.def_state(GRUCell) +def gru_init_state(cell: GRUCell, _) -> GRUState: + return GRUState(jnp.zeros([cell.hidden_features])) # Spatial RNN @@ -487,15 +495,33 @@ def __call__(self, x: jax.Array, state: ConvLSTMNDState, **k) -> ConvLSTMNDState h = o * self.act_func(c) return ConvLSTMNDState(h, c) - def init_state(self, spatial_dim: tuple[int, ...]) -> ConvLSTMNDState: - msg = f"Expected spatial_dim to be a tuple of length {self.spatial_ndim}, got {spatial_dim}" - assert len(spatial_dim) == self.spatial_ndim, msg - shape = (self.hidden_features, *spatial_dim) - return ConvLSTMNDState(jnp.zeros(shape), jnp.zeros(shape)) + +@tree_state.def_state(ConvLSTMNDCell) +def conv_lstm_init_state(cell: ConvLSTMNDCell, x: jax.Array | None) -> ConvLSTMNDState: + if not (hasattr(x, "ndim") and hasattr(x, "shape")): + raise TypeError( + f"Expected {x=} to have ndim and shape attributes.", + "To initialize the `ConvLSTMNDCell` state.\n" + "pass a single sample array to `tree_state` second argument.", + ) + + if x.ndim != cell.spatial_ndim + 1: + raise ValueError( + f"{x.ndim=} != {(cell.spatial_ndim + 1)=}.", + "Expected input to have shape (channel, *spatial_dim)." 
+ "Pass a single sample array to `tree_state", + ) + + spatial_dim = x.shape[1:] + if len(spatial_dim) != cell.spatial_ndim: + raise ValueError(f"{len(spatial_dim)=} != {cell.spatial_ndim=}.") + shape = (cell.hidden_features, *spatial_dim) + return ConvLSTMNDState(jnp.zeros(shape), jnp.zeros(shape)) class ConvLSTM1DCell(ConvLSTMNDCell): """1D Convolution LSTM cell that defines the update rule for the hidden state and cell state + Args: in_features: Number of input features hidden_features: Number of output features @@ -510,7 +536,6 @@ class ConvLSTM1DCell(ConvLSTMNDCell): act_func: Activation function recurrent_act_func: Recurrent activation function key: PRNG key - spatial_ndim: Number of spatial dimensions. Note: https://www.tensorflow.org/api_docs/python/tf/keras/layers/ConvLSTM1D @@ -557,6 +582,7 @@ def spatial_ndim(self) -> int: class ConvLSTM2DCell(ConvLSTMNDCell): """2D Convolution LSTM cell that defines the update rule for the hidden state and cell state + Args: in_features: Number of input features hidden_features: Number of output features @@ -571,7 +597,6 @@ class ConvLSTM2DCell(ConvLSTMNDCell): act_func: Activation function recurrent_act_func: Recurrent activation function key: PRNG key - spatial_ndim: Number of spatial dimensions. Note: https://www.tensorflow.org/api_docs/python/tf/keras/layers/ConvLSTM1D @@ -618,6 +643,7 @@ def spatial_ndim(self) -> int: class ConvLSTM3DCell(ConvLSTMNDCell): """3D Convolution LSTM cell that defines the update rule for the hidden state and cell state + Args: in_features: Number of input features hidden_features: Number of output features @@ -632,7 +658,6 @@ class ConvLSTM3DCell(ConvLSTMNDCell): act_func: Activation function recurrent_act_func: Recurrent activation function key: PRNG key - spatial_ndim: Number of spatial dimensions. Note: https://www.tensorflow.org/api_docs/python/tf/keras/layers/ConvLSTM1D @@ -683,6 +708,7 @@ class ConvGRUNDState(RNNState): class ConvGRUNDCell(RNNCell): """Convolution GRU cell that defines the update rule for the hidden state and cell state + Args: in_features: Number of input features hidden_features: Number of output features @@ -698,7 +724,6 @@ class ConvGRUNDCell(RNNCell): recurrent_act_func: Recurrent activation function key: PRNG key spatial_ndim: Number of spatial dimensions. 
- """ def __init__( @@ -755,7 +780,7 @@ def __init__( @ft.partial(validate_spatial_ndim, attribute_name="spatial_ndim") def __call__(self, x: jax.Array, state: ConvGRUNDState, **k) -> ConvGRUNDState: if not isinstance(state, ConvGRUNDState): - raise TypeError(f"Expected {state=} to be an instance of GRUState") + raise TypeError(f"Expected {state=} to be an instance of `GRUState`") h = state.hidden_state xe, xu, xo = jnp.split(self.in_to_hidden(x), 3, axis=0) @@ -766,15 +791,35 @@ def __call__(self, x: jax.Array, state: ConvGRUNDState, **k) -> ConvGRUNDState: h = (1 - u) * o + u * h return ConvGRUNDState(hidden_state=h) - def init_state(self, spatial_dim: tuple[int, ...]) -> ConvGRUNDState: - msg = f"Expected spatial_dim to be a tuple of length {self.spatial_ndim}, got {spatial_dim}" - assert len(spatial_dim) == self.spatial_ndim, msg - shape = (self.hidden_features, *spatial_dim) - return ConvGRUNDState(hidden_state=jnp.zeros(shape)) + +@tree_state.def_state(ConvGRUNDCell) +def conv_gru_init_state(cell: ConvGRUNDCell, x: jax.Array | None) -> ConvGRUNDState: + if not (hasattr(x, "ndim") and hasattr(x, "shape")): + # maybe the input is not an array + raise TypeError( + f"Expected {x=} to have ndim and shape attributes.", + "To initialize the `ConvGRUNDCell` state.\n" + "pass a single sample array to `tree_state` second argument.", + ) + + if x.ndim != cell.spatial_ndim + 1: + # channel, *spatial_dim + raise ValueError( + f"{x.ndim=} != {(cell.spatial_ndim + 1)=}.", + "Expected input to have shape (channel, *spatial_dim)." + "Pass a single sample array to `tree_state", + ) + + spatial_dim = x.shape[1:] + if len(spatial_dim) != cell.spatial_ndim: + raise ValueError(f"{len(spatial_dim)=} != {cell.spatial_ndim=}.") + shape = (cell.hidden_features, *spatial_dim) + return ConvGRUNDState(jnp.zeros(shape), jnp.zeros(shape)) class ConvGRU1DCell(ConvGRUNDCell): """1D Convolution GRU cell that defines the update rule for the hidden state and cell state + Args: in_features: Number of input features hidden_features: Number of output features @@ -790,7 +835,6 @@ class ConvGRU1DCell(ConvGRUNDCell): recurrent_act_func: Recurrent activation function key: PRNG key spatial_ndim: Number of spatial dimensions. - """ def __init__( @@ -893,6 +937,7 @@ def spatial_ndim(self) -> int: class ConvGRU3DCell(ConvGRUNDCell): """3D Convolution GRU cell that defines the update rule for the hidden state and cell state + Args: in_features: Number of input features hidden_features: Number of output features @@ -907,8 +952,6 @@ class ConvGRU3DCell(ConvGRUNDCell): act_func: Activation function recurrent_act_func: Recurrent activation function key: PRNG key - spatial_ndim: Number of spatial dimensions. - """ def __init__( @@ -966,7 +1009,15 @@ class ScanRNN(sk.TreeClass): >>> cell = SimpleRNNCell(10, 20) # 10-dimensional input, 20-dimensional hidden state >>> rnn = ScanRNN(cell) >>> x = jnp.ones((5, 10)) # 5 timesteps, 10 features - >>> result = rnn(x) # 20 features + >>> result, state = rnn(x) # 20 features + >>> print(result.shape) + (20,) + >>> cell = SimpleRNNCell(10, 20) + >>> rnn = ScanRNN(cell, return_sequences=True) + >>> x = jnp.ones((5, 10)) # 5 timesteps, 10 features + >>> result, state = rnn(x) # 5 timesteps, 20 features + >>> print(result.shape) + (5, 20) """ # cell: RNN @@ -994,7 +1045,20 @@ def __call__( state: RNNState | None = None, backward_state: RNNState | None = None, **k, - ) -> jax.Array: + ) -> tuple[jax.Array, tuple[RNNState, RNNState] | RNNState]: + """Scans the RNN cell over a sequence. 
+ + Args: + x: the input sequence. + state: the initial state. if None, a zero state is used. + backward_state: the initial backward state. if None, a zero state is used. + + Returns: + the output sequence and the final two states tuple if backward_cell + is not ``None``, otherwise return the final state of the forward + cell. + """ + if not isinstance(state, (RNNState, type(None))): raise TypeError(f"Expected state to be an instance of RNNState, {state=}") @@ -1013,20 +1077,29 @@ def __call__( f"Expected x to have shape (timesteps, {self.cell.in_features}," f"{'*'*self.cell.spatial_ndim}), got {x.shape=}" ) - - state = state or self.cell.init_state(x.shape[2:]) + # pass a sample not the whole sequence + state = state or tree_state(self.cell, x[0]) if self.backward_cell is not None and backward_state is None: - backward_state = self.backward_cell.init_state(x.shape[2:]) + # pass a sample not the whole sequence + backward_state = tree_state(self.backward_cell, x[0]) scan_func = _accumulate_scan if self.return_sequences else _no_accumulate_scan - result = scan_func(x, self.cell, state) + result, state = scan_func(x, self.cell, state) + + states = state if self.backward_cell is not None: - back_result = scan_func(x, self.backward_cell, backward_state) + backward_result, backward_state = scan_func( + x, + self.backward_cell, + backward_state, + ) + states = (state, backward_state) concat_axis = int(self.return_sequences) - result = jnp.concatenate((result, back_result), axis=concat_axis) - return result + result = jnp.concatenate((result, backward_result), axis=concat_axis) + + return result, states def _accumulate_scan( @@ -1034,14 +1107,17 @@ def _accumulate_scan( cell: RNNCell, state: RNNState, reverse: bool = False, -) -> jax.Array: +) -> tuple[jax.Array, RNNState]: def scan_func(carry, x): state = cell(x, state=carry) return state, state x = jnp.flip(x, axis=0) if reverse else x # flip over time axis result = jax.lax.scan(scan_func, state, x)[1].hidden_state - return jnp.flip(result, axis=-1) if reverse else result + carry, result = jax.lax.scan(scan_func, state, x) + result = result.hidden_state + result = jnp.flip(result, axis=-1) if reverse else result + return result, carry def _no_accumulate_scan( @@ -1055,4 +1131,6 @@ def scan_func(carry, x): return state, None x = jnp.flip(x, axis=0) if reverse else x - return jax.lax.scan(scan_func, state, x)[0].hidden_state + carry, _ = jax.lax.scan(scan_func, state, x) + result = carry.hidden_state + return result, carry diff --git a/serket/nn/state.py b/serket/nn/state.py new file mode 100644 index 0000000..d157a68 --- /dev/null +++ b/serket/nn/state.py @@ -0,0 +1,91 @@ +# Copyright 2023 Serket authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Define dispatchers for custom tree state.""" + +from __future__ import annotations + +import functools as ft +from typing import Any, Callable, TypeVar + +import jax + +import serket as sk + +T = TypeVar("T") + + +class NoState(sk.TreeClass): + """No state placeholder.""" + + def __init__(self, _: Any, __: Any): + del _, __ + + +def tree_state(tree: T, array: jax.Array | None = None) -> T: + """Build state for a tree of layers. + + Some layers require state to be initialized before training. For example, + `BatchNorm` layers require `running_mean` and `running_var` to be initialized + before training. This function initializes the state for a tree of layers, + based on the layer defined ``state`` rule using ``tree_state.def_state``. + + Args: + tree: A tree of layers. + array: An array to use for initializing state required by some layers + (e.g. ConvGRUNDCell). default: ``None``. + + Returns: + A tree of state leaves if it has state, otherwise ``None``. + + Example: + >>> import jax.numpy as jnp + >>> import serket as sk + >>> tree = [1, 2, sk.nn.BatchNorm(5)] + >>> sk.tree_state(tree) + [NoState(), NoState(), BatchNormState( + running_mean=f32[5](μ=0.00, σ=0.00, ∈(0.00,0.00)), + running_var=f32[5](μ=1.00, σ=0.00, ∈(1.00,1.00)) + )] + + Example: + >>> # define state initialization rule for a custom layer + >>> import jax + >>> import serket as sk + >>> class LayerWithState(sk.TreeClass): + ... pass + >>> # state function accept the `layer` and optional input array as arguments + >>> @sk.tree_state.def_state(LayerWithState) + ... def _(leaf, _): + ... del _ # array is not used + ... return "some state" + >>> sk.tree_state(LayerWithState()) + 'some state' + >>> sk.tree_state(LayerWithState(), jax.numpy.ones((1, 1))) + 'some state' + """ + + def is_leaf(x: Callable[[Any], bool]) -> bool: + types = set(tree_state.state_dispatcher.registry.keys()) + types.discard(object) + return isinstance(x, tuple(types)) + + def dispatch_func(node): + return tree_state.state_dispatcher(node, array) + + return jax.tree_map(dispatch_func, tree, is_leaf=is_leaf) + + +tree_state.state_dispatcher = ft.singledispatch(NoState) +tree_state.def_state = tree_state.state_dispatcher.register diff --git a/serket/nn/utils.py b/serket/nn/utils.py index 0ba2a1f..4908498 100644 --- a/serket/nn/utils.py +++ b/serket/nn/utils.py @@ -15,6 +15,7 @@ from __future__ import annotations import functools as ft +import operator as op from typing import Any, Sequence, Tuple, Union import jax @@ -184,11 +185,17 @@ class Range(sk.TreeClass): min_val: float = -float("inf") max_val: float = float("inf") + min_inclusive: bool = True + max_inclusive: bool = True def __call__(self, value: Any): - if self.min_val <= value <= self.max_val: + lop, ls = (op.ge, "[") if self.min_inclusive else (op.gt, "(") + rop, rs = (op.le, "]") if self.max_inclusive else (op.lt, ")") + + if lop(value, self.min_val) and rop(value, self.max_val): return value - raise ValueError(f"Not in range[{self.min_val}, {self.max_val}] got {value=}.") + + raise ValueError(f"Not in {ls}{self.min_val}, {self.max_val}{rs} got {value=}.") class IsInstance(sk.TreeClass): diff --git a/tests/test_linear.py b/tests/test_linear.py index a01b137..6556611 100644 --- a/tests/test_linear.py +++ b/tests/test_linear.py @@ -26,7 +26,6 @@ GeneralLinear, Identity, Linear, - MergeLinear, Multilinear, ) @@ -144,20 +143,3 @@ def test_general_linear(): with pytest.raises(ValueError): GeneralLinear(in_features=(1,), in_axes=(0, -3), out_features=5) - - -def test_merge_linear(): - layer1 = 
Linear(5, 6) # 5 input features, 6 output features - layer2 = Linear(7, 6) # 7 input features, 6 output features - merged_layer = MergeLinear(layer1, layer2) # 12 input features, 6 output features - x1 = jnp.ones([1, 5]) # 1 sample, 5 features - x2 = jnp.ones([1, 7]) # 1 sample, 7 features - y = merged_layer(x1, x2) - z = layer1(x1) + layer2(x2) - npt.assert_allclose(y, z, atol=1e-6) - - with pytest.raises(ValueError): - # output features of layer1 and layer2 mismatch - l1 = Linear(5, 6) - l2 = Linear(7, 8) - MergeLinear(l1, l2) diff --git a/tests/test_normalization.py b/tests/test_normalization.py index 8af1d6d..7492e39 100644 --- a/tests/test_normalization.py +++ b/tests/test_normalization.py @@ -12,15 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os + +import jax import jax.numpy as jnp import numpy.testing as npt import pytest -from serket.nn import GroupNorm, InstanceNorm, LayerNorm +import serket as sk +from serket.nn import BatchNorm, GroupNorm, InstanceNorm, LayerNorm + +os.environ["KERAS_BACKEND"] = "jax" def test_LayerNorm(): - layer = LayerNorm((5, 2), affine=False) + layer = LayerNorm((5, 2), beta_init_func=None, gamma_init_func=None) x = jnp.array( [ @@ -96,7 +102,7 @@ def test_InstanceNorm(): npt.assert_allclose(layer(x), y, atol=1e-5) - layer = InstanceNorm(in_features=3, affine=False) + layer = InstanceNorm(in_features=3, gamma_init_func=None, beta_init_func=None) npt.assert_allclose(layer(x), y, atol=1e-5) @@ -210,10 +216,39 @@ def test_group_norm(): layer = GroupNorm(in_features=-1, groups=0) -# def test_lazy_normalization(): -# layer = GroupNorm(None, groups=1) -# assert layer(jnp.ones([1, 2, 3, 4])).shape == (1, 2, 3, 4) +@pytest.mark.parametrize("axis", [0, 1, 2, 3]) +def test_batchnorm(axis): + import math + + from keras_core.layers import BatchNormalization + + mat_jax = lambda n: jnp.arange(1, math.prod(n) + 1).reshape(*n).astype(jnp.float32) + + x_keras = mat_jax((5, 10, 7, 8)) + + bn_keras = BatchNormalization(axis=axis, momentum=0.5, center=False, scale=False) + + for i in range(5): + x_keras = bn_keras(x_keras, training=True) + + bn_sk = BatchNorm( + x_keras.shape[axis], + momentum=0.5, + axis=axis, + beta_init_func=None, + gamma_init_func=None, + ) + state = sk.tree_state(bn_sk) + x_sk = mat_jax((5, 10, 7, 8)) + + for _ in range(5): + x_sk, state = jax.vmap(bn_sk, in_axes=(0, None))(x_sk, state) + + npt.assert_allclose(x_keras, x_sk, atol=1e-5) + npt.assert_allclose(bn_keras.moving_mean, state.running_mean, atol=1e-5) + npt.assert_allclose(bn_keras.moving_variance, state.running_var, rtol=1e-5) + + x_keras = bn_keras(x_keras, training=False) + x_sk, _ = jax.vmap(bn_sk.at["evaluation"].set(True), in_axes=(0, None))(x_sk, state) -# with pytest.raises(ConcretizationTypeError): -# layer = jax.jit(GroupNorm(None, groups=1)) -# layer(jnp.ones([1, 2, 3, 4])) + npt.assert_allclose(x_keras, x_sk, rtol=1e-5) diff --git a/tests/test_rnn.py b/tests/test_rnn.py index 9221723..7f9a0c8 100644 --- a/tests/test_rnn.py +++ b/tests/test_rnn.py @@ -114,10 +114,10 @@ def test_vanilla_rnn(): ) w_combined = jnp.concatenate([w_in_to_hidden, w_hidden_to_hidden], axis=0) - cell = cell.at["in_and_hidden_to_hidden"].at["weight"].set(w_combined) + cell = cell.at["ih2h_weight"].set(w_combined) sk_layer = ScanRNN(cell) y = jnp.array([0.9637042, -0.8282256, 0.7314449]) - npt.assert_allclose(sk_layer(x), y) + npt.assert_allclose(sk_layer(x)[0], y) def test_lstm(): @@ -228,12 +228,13 @@ def test_lstm(): 
recurrent_weight_init_func="glorot_uniform", ) w_combined = jnp.concatenate([w_in_to_hidden, w_hidden_to_hidden], axis=0) - cell = cell.at["in_and_hidden_to_hidden"].at["weight"].set(w_combined) - cell = cell.at["in_and_hidden_to_hidden"].at["bias"].set(b_hidden_to_hidden) + cell = cell.at["ih2h_weight"].set(w_combined) + cell = cell.at["ih2h_bias"].set(b_hidden_to_hidden) sk_layer = ScanRNN(cell, return_sequences=False) + y = jnp.array([0.18658024, -0.6338659, 0.3445018]) - npt.assert_allclose(y, sk_layer(x), atol=1e-5) + npt.assert_allclose(y, sk_layer(x)[0], atol=1e-5) w_in_to_hidden = jnp.array( [ @@ -327,8 +328,8 @@ def test_lstm(): w_combined = jnp.concatenate([w_in_to_hidden, w_hidden_to_hidden], axis=0) - cell = cell.at["in_and_hidden_to_hidden"].at["weight"].set(w_combined) - cell = cell.at["in_and_hidden_to_hidden"].at["bias"].set(b_hidden_to_hidden) + cell = cell.at["ih2h_weight"].set(w_combined) + cell = cell.at["ih2h_bias"].set(b_hidden_to_hidden) sk_layer = ScanRNN(cell, return_sequences=True) @@ -347,7 +348,7 @@ def test_lstm(): ] ) - npt.assert_allclose(y, sk_layer(x), atol=1e-5) + npt.assert_allclose(y, sk_layer(x)[0], atol=1e-5) cell = LSTMCell( in_features=in_features, @@ -356,7 +357,7 @@ def test_lstm(): ) sk_layer = ScanRNN(cell, return_sequences=True) - assert sk_layer(x).shape == (10, 3) + assert sk_layer(x)[0].shape == (10, 3) def test_gru(): @@ -418,7 +419,7 @@ def test_gru(): cell = cell.at["in_to_hidden"].at["weight"].set(w1) cell = cell.at["hidden_to_hidden"].at["weight"].set(w2) y = jnp.array([[-0.00142191, 0.11011646, 0.1613554]]) - ypred = ScanRNN(cell, return_sequences=True)(jnp.ones([1, 1])) + ypred, _ = ScanRNN(cell, return_sequences=True)(jnp.ones([1, 1])) npt.assert_allclose(y, ypred, atol=1e-4) @@ -586,7 +587,7 @@ def test_conv_lstm1d(): x = jnp.ones([time_steps, in_features, *spatial_dim]) - res_sk = ScanRNN(cell, return_sequences=False)(x) + res_sk, _ = ScanRNN(cell, return_sequences=False)(x) y = jnp.array( [ @@ -609,7 +610,7 @@ def test_conv_lstm1d(): bias_init_func="zeros", ) - res_sk = ScanRNN(cell, return_sequences=False)(x) + res_sk, _ = ScanRNN(cell, return_sequences=False)(x) assert res_sk.shape == (3, 3) @@ -748,22 +749,16 @@ def test_bilstm(): b_hidden_to_hidden_reverse = jnp.array([0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]) combined_w = jnp.concatenate([w_in_to_hidden, w_hidden_to_hidden], axis=0) - cell = cell.at["in_and_hidden_to_hidden"].at["weight"].set(combined_w) - cell = cell.at["in_and_hidden_to_hidden"].at["bias"].set(b_hidden_to_hidden) + cell = cell.at["ih2h_weight"].set(combined_w) + cell = cell.at["ih2h_bias"].set(b_hidden_to_hidden) combined_w_reverse = jnp.concatenate( [w_in_to_hidden_reverse, w_hidden_to_hidden_reverse], axis=0 ) - reverse_cell = ( - reverse_cell.at["in_and_hidden_to_hidden"].at["weight"].set(combined_w_reverse) - ) - reverse_cell = ( - reverse_cell.at["in_and_hidden_to_hidden"] - .at["bias"] - .set(b_hidden_to_hidden_reverse) - ) + reverse_cell = reverse_cell.at["ih2h_weight"].set(combined_w_reverse) + reverse_cell = reverse_cell.at["ih2h_bias"].set(b_hidden_to_hidden_reverse) - res = ScanRNN(cell, backward_cell=reverse_cell, return_sequences=False)(x) + res, _ = ScanRNN(cell, backward_cell=reverse_cell, return_sequences=False)(x) y = jnp.array([0.35901642, 0.00826644, -0.3015435, -0.13661332]) @@ -794,6 +789,6 @@ def test_dense_cell(): bias_init_func=None, ) x = jnp.ones([10, 10]) - res = ScanRNN(cell=cell)(x) + res, _ = ScanRNN(cell=cell)(x) # 1x10 @ 10x10 => 1x10 npt.assert_allclose(res, jnp.ones([10]) * 
10.0) diff --git a/tests/test_utils.py b/tests/test_utils.py index 7b45a69..93f6c5c 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -38,7 +38,7 @@ def test_canonicalize_init_func(): assert resolve_init_func("xavier_uniform")(k, (2, 2)).shape == (2, 2) assert isinstance(resolve_init_func(jax.nn.initializers.he_normal()), jtu.Partial) - assert isinstance(resolve_init_func(None), type(None)) + assert isinstance(resolve_init_func(None), jtu.Partial) with pytest.raises(ValueError): resolve_init_func("invalid")
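
Usage notes (sketches assembled from the docstrings and tests added in this patch, not part of the diff itself):

The new `BatchNorm`, `tree_state`, and `tree_evaluation` pieces fit together as below. Array shapes are illustrative; `BatchNorm` only normalizes when wrapped in `jax.vmap` over the batch axis, exactly as exercised in tests/test_normalization.py.

    import jax
    import jax.numpy as jnp
    import serket as sk

    bn = sk.nn.BatchNorm(5)            # normalizes `axis=1` of the batched input by default
    state = sk.tree_state(bn)          # BatchNormState(running_mean=zeros, running_var=ones)

    x = jnp.ones((4, 5, 8))            # (batch, channels, length) -- illustrative shape
    # training step: batch statistics are used and the running statistics are updated
    y, state = jax.vmap(bn, in_axes=(0, None))(x, state)

    # inference: `tree_evaluation` flips `evaluation=True`, so running statistics are used
    bn_eval = sk.tree_evaluation(bn)
    y_eval, _ = jax.vmap(bn_eval, in_axes=(0, None))(x, state)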
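
Both dispatchers are extensible. A user-defined layer can register its own rules the same way dropout.py and normalization.py do above; the `Counter` layer below is hypothetical and only illustrates the registration pattern (note the attribute is spelled `def_evalutation` in evaluation.py):

    import serket as sk

    class Counter(sk.TreeClass):
        steps: int = 0

    @sk.tree_state.def_state(Counter)
    def counter_state(layer: Counter, _):
        del _                        # optional sample array, unused by this rule
        return 0                     # whatever initial state the layer needs

    @sk.tree_evaluation.def_evalutation(Counter)
    def counter_evaluation(layer: Counter):
        return layer                 # unchanged in evaluation mode

    sk.tree_state(Counter())                 # -> 0
    sk.tree_evaluation(sk.nn.Dropout(0.5))   # -> Identity()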
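
`ScanRNN` now returns the final cell state alongside the output, and initial states are built with `tree_state` instead of the removed `init_state` methods. A short sketch mirroring the updated docstring (assuming the cells are exported from `serket.nn` as the tests do):

    import jax.numpy as jnp
    import serket as sk

    cell = sk.nn.SimpleRNNCell(10, 20)       # 10 input features, 20 hidden features
    rnn = sk.nn.ScanRNN(cell, return_sequences=True)
    x = jnp.ones((5, 10))                    # (timesteps, features)

    output, state = rnn(x)                   # output: (5, 20), state: SimpleRNNState
    state0 = sk.tree_state(cell)             # explicit zero state, replaces cell.init_state()
    output, state = rnn(x, state=state0)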
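
Finally, `resolve_init_func(None)` now returns a `jtu.Partial` that yields `None` instead of returning `None` itself, which is why the layers above can call the bias initializer unconditionally:

    import jax.random as jr
    import jax.tree_util as jtu
    from serket.nn.initialization import resolve_init_func

    init = resolve_init_func(None)
    assert isinstance(init, jtu.Partial)     # as asserted in tests/test_utils.py
    assert init(jr.PRNGKey(0), (3,)) is None # the bias attribute simply ends up as None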