diff --git a/CHANGELOG.md b/CHANGELOG.md
index e6553c7..ec4db72 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,7 @@ Keep it human-readable, your future self will thank you!
 ### Added
 
+- configurability of the dropout probability in the MultiHeadSelfAttention module
 - CI workflow to update the changelog on release
 - Remapper: Preprocessor for remapping one variable to multiple ones. Includes changes to the data indices since the remapper changes the number of variables. With optional config keywords.
 - Codeowners file
@@ -21,6 +22,7 @@ Keep it human-readable, your future self will thank you!
 - Docsig precommit hooks
 - Changelog merge strategy
+
 ### Changed
 
 - Update CI to inherit from common infrastructue reusable workflows
diff --git a/src/anemoi/models/layers/attention.py b/src/anemoi/models/layers/attention.py
index 2063ad0..931e098 100644
--- a/src/anemoi/models/layers/attention.py
+++ b/src/anemoi/models/layers/attention.py
@@ -40,7 +40,7 @@ def __init__(
         bias: bool = False,
         is_causal: bool = False,
         window_size: Optional[int] = None,
-        dropout: float = 0.0,
+        dropout_p: float = 0.0,
     ):
         super().__init__()
 
@@ -48,11 +48,11 @@ def __init__(
             embed_dim % num_heads == 0
         ), f"Embedding dimension ({embed_dim}) must be divisible by number of heads ({num_heads})"
 
-        self.dropout = dropout
         self.num_heads = num_heads
         self.embed_dim = embed_dim
         self.head_dim = embed_dim // num_heads  # q k v
         self.window_size = (window_size, window_size)  # flash attention
+        self.dropout_p = dropout_p
         self.is_causal = is_causal
 
         self.lin_qkv = nn.Linear(embed_dim, 3 * embed_dim, bias=bias)
@@ -86,15 +86,22 @@ def forward(
         query = shard_heads(query, shapes=shapes, mgroup=model_comm_group)
         key = shard_heads(key, shapes=shapes, mgroup=model_comm_group)
         value = shard_heads(value, shapes=shapes, mgroup=model_comm_group)
+        dropout_p = self.dropout_p if self.training else 0.0
 
         if _FLASH_ATTENTION_AVAILABLE:
             query, key, value = (
                 einops.rearrange(t, "batch heads grid vars -> batch grid heads vars")
                 for t in (query, key, value)
             )
-            out = self.attention(query, key, value, causal=False, window_size=self.window_size)
+            out = self.attention(query, key, value, causal=False, window_size=self.window_size, dropout_p=dropout_p)
             out = einops.rearrange(out, "batch grid heads vars -> batch heads grid vars")
         else:
-            out = self.attention(query, key, value, is_causal=False)  # expects (batch heads grid variable) format
+            out = self.attention(
+                query,
+                key,
+                value,
+                is_causal=False,
+                dropout_p=dropout_p,
+            )  # expects (batch heads grid variable) format
 
         out = shard_sequence(out, shapes=shapes, mgroup=model_comm_group)
         out = einops.rearrange(out, "batch heads grid vars -> (batch grid) (heads vars)")
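Reviewer note: a minimal usage sketch of the renamed knob, relying only on what the hunks above show (the dropout_p constructor argument, the stored self.dropout_p attribute, and the train/eval gating in forward()); the sizes below are made up for illustration.

import torch.nn as nn

from anemoi.models.layers.attention import MultiHeadSelfAttention

# Illustrative sizes only; embed_dim must remain divisible by num_heads.
mhsa = MultiHeadSelfAttention(num_heads=4, embed_dim=64, dropout_p=0.1)
assert isinstance(mhsa, nn.Module)
assert mhsa.dropout_p == 0.1

# Per the forward() change above, the probability handed to the attention
# kernel is self.dropout_p while training and 0.0 in eval mode.
mhsa.train()  # the attention call would receive dropout_p=0.1
mhsa.eval()   # the attention call would receive dropout_p=0.0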
diff --git a/src/anemoi/models/layers/block.py b/src/anemoi/models/layers/block.py
index ba29607..7fd3627 100644
--- a/src/anemoi/models/layers/block.py
+++ b/src/anemoi/models/layers/block.py
@@ -55,7 +55,15 @@ def forward(
 class TransformerProcessorBlock(BaseBlock):
     """Transformer block with MultiHeadSelfAttention and MLPs."""
 
-    def __init__(self, num_channels, hidden_dim, num_heads, activation, window_size: int):
+    def __init__(
+        self,
+        num_channels: int,
+        hidden_dim: int,
+        num_heads: int,
+        activation: str,
+        window_size: int,
+        dropout_p: float = 0.0,
+    ):
         super().__init__()
 
         try:
@@ -72,7 +80,7 @@ def __init__(self, num_channels, hidden_dim, num_heads, activation, window_size: int):
             window_size=window_size,
             bias=False,
             is_causal=False,
-            dropout=0.0,
+            dropout_p=dropout_p,
         )
 
         self.mlp = nn.Sequential(
diff --git a/src/anemoi/models/layers/chunk.py b/src/anemoi/models/layers/chunk.py
index 61dec34..87d0ac7 100644
--- a/src/anemoi/models/layers/chunk.py
+++ b/src/anemoi/models/layers/chunk.py
@@ -73,6 +73,7 @@ def __init__(
         num_heads: int = 16,
         mlp_hidden_ratio: int = 4,
         activation: str = "GELU",
+        dropout_p: float = 0.0,
     ) -> None:
         """Initialize TransformerProcessor.
 
@@ -88,6 +89,8 @@ def __init__(
             ratio of mlp hidden dimension to embedding dimension, default 4
         activation : str, optional
             Activation function, by default "GELU"
+        dropout_p : float, optional
+            Dropout probability used for multi-head self attention, default 0.0
         """
         super().__init__(num_channels=num_channels, num_layers=num_layers)
 
@@ -98,6 +101,7 @@ def __init__(
             num_heads=num_heads,
             activation=activation,
             window_size=window_size,
+            dropout_p=dropout_p,
         )
 
     def forward(
diff --git a/src/anemoi/models/layers/processor.py b/src/anemoi/models/layers/processor.py
index bb33609..6ba8eb1 100644
--- a/src/anemoi/models/layers/processor.py
+++ b/src/anemoi/models/layers/processor.py
@@ -95,6 +95,7 @@ def __init__(
         cpu_offload: bool = False,
         num_heads: int = 16,
         mlp_hidden_ratio: int = 4,
+        dropout_p: float = 0.1,
         **kwargs,
     ) -> None:
         """Initialize TransformerProcessor.
@@ -113,6 +114,8 @@ def __init__(
             ratio of mlp hidden dimension to embedding dimension, default 4
         activation : str, optional
             Activation function, by default "GELU"
+        dropout_p : float, optional
+            Dropout probability used for multi-head self attention, default 0.1
         """
         super().__init__(
             num_channels=num_channels,
@@ -133,6 +136,7 @@ def __init__(
             num_layers=self.chunk_size,
             window_size=window_size,
             activation=activation,
+            dropout_p=dropout_p,
         )
 
         self.offload_layers(cpu_offload)
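Reviewer note: a hedged sketch of how the new setting flows top-down. The keyword names mirror the fixture in tests/layers/processor/test_transformer_processor.py; the concrete values are illustrative, not defaults. Note that the processor-level default is 0.1, while MultiHeadSelfAttention itself defaults to 0.0.

from anemoi.models.layers.processor import TransformerProcessor

# dropout_p set here is forwarded to every TransformerProcessorChunk and on
# to each TransformerProcessorBlock's MultiHeadSelfAttention.
processor = TransformerProcessor(
    num_layers=2,
    window_size=10,
    num_channels=128,  # must be divisible by num_heads
    num_chunks=2,
    activation="GELU",
    cpu_offload=False,
    num_heads=16,
    mlp_hidden_ratio=4,
    dropout_p=0.1,
)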
diff --git a/tests/layers/block/test_block_transformer.py b/tests/layers/block/test_block_transformer.py
index 97c274f..2e63386 100644
--- a/tests/layers/block/test_block_transformer.py
+++ b/tests/layers/block/test_block_transformer.py
@@ -29,11 +29,14 @@ class TestTransformerProcessorBlock:
         num_heads=st.integers(min_value=1, max_value=10),
         activation=st.sampled_from(["ReLU", "GELU", "Tanh"]),
         window_size=st.integers(min_value=1, max_value=512),
+        dropout_p=st.floats(min_value=0.0, max_value=1.0),
     )
     @settings(max_examples=10)
-    def test_init(self, factor_attention_heads, hidden_dim, num_heads, activation, window_size):
+    def test_init(self, factor_attention_heads, hidden_dim, num_heads, activation, window_size, dropout_p):
         num_channels = num_heads * factor_attention_heads
-        block = TransformerProcessorBlock(num_channels, hidden_dim, num_heads, activation, window_size)
+        block = TransformerProcessorBlock(
+            num_channels, hidden_dim, num_heads, activation, window_size, dropout_p=dropout_p
+        )
         assert isinstance(block, TransformerProcessorBlock)
 
         assert isinstance(block.layer_norm1, nn.LayerNorm)
@@ -49,6 +52,7 @@ def test_init(self, factor_attention_heads, hidden_dim, num_heads, activation, window_size):
         window_size=st.integers(min_value=1, max_value=512),
         shapes=st.lists(st.integers(min_value=1, max_value=10), min_size=3, max_size=3),
         batch_size=st.integers(min_value=1, max_value=40),
+        dropout_p=st.floats(min_value=0.0, max_value=1.0),
     )
     @settings(max_examples=10)
     def test_forward_output(
@@ -60,9 +64,12 @@ def test_forward_output(
         window_size,
         shapes,
         batch_size,
+        dropout_p,
     ):
         num_channels = num_heads * factor_attention_heads
-        block = TransformerProcessorBlock(num_channels, hidden_dim, num_heads, activation, window_size)
+        block = TransformerProcessorBlock(
+            num_channels, hidden_dim, num_heads, activation, window_size, dropout_p=dropout_p
+        )
 
         x = torch.randn((batch_size, num_channels))
diff --git a/tests/layers/chunk/test_chunk_transformer.py b/tests/layers/chunk/test_chunk_transformer.py
index 1fe7c6d..5449e97 100644
--- a/tests/layers/chunk/test_chunk_transformer.py
+++ b/tests/layers/chunk/test_chunk_transformer.py
@@ -20,6 +20,7 @@ def init(self):
        mlp_hidden_ratio: int = 4
        activation: str = "GELU"
        window_size: int = 13
+       dropout_p: float = 0.1
 
        # num_heads must be evenly divisible by num_channels for MHSA
        return (
@@ -29,6 +30,7 @@ def init(self):
            mlp_hidden_ratio,
            activation,
            window_size,
+           dropout_p,
        )
 
    @pytest.fixture
@@ -40,6 +42,7 @@ def processor_chunk(self, init):
            mlp_hidden_ratio,
            activation,
            window_size,
+           dropout_p,
        ) = init
        return TransformerProcessorChunk(
            num_channels=num_channels,
@@ -48,6 +51,7 @@ def processor_chunk(self, init):
            mlp_hidden_ratio=mlp_hidden_ratio,
            activation=activation,
            window_size=window_size,
+           dropout_p=dropout_p,
        )
 
    def test_all_blocks(self, processor_chunk):
diff --git a/tests/layers/processor/test_transformer_processor.py b/tests/layers/processor/test_transformer_processor.py
index d359c27..305af41 100644
--- a/tests/layers/processor/test_transformer_processor.py
+++ b/tests/layers/processor/test_transformer_processor.py
@@ -21,6 +21,7 @@ def transformer_processor_init():
     cpu_offload = False
     num_heads = 16
     mlp_hidden_ratio = 4
+    dropout_p = 0.1
     return (
         num_layers,
         window_size,
@@ -30,6 +31,7 @@ def transformer_processor_init():
         cpu_offload,
         num_heads,
         mlp_hidden_ratio,
+        dropout_p,
     )
 
 
@@ -44,6 +46,7 @@ def transformer_processor(transformer_processor_init):
         cpu_offload,
         num_heads,
         mlp_hidden_ratio,
+        dropout_p,
     ) = transformer_processor_init
     return TransformerProcessor(
         num_layers=num_layers,
@@ -54,6 +57,7 @@ def transformer_processor(transformer_processor_init):
         cpu_offload=cpu_offload,
         num_heads=num_heads,
         mlp_hidden_ratio=mlp_hidden_ratio,
+        dropout_p=dropout_p,
     )
 
 
@@ -67,6 +71,7 @@ def test_transformer_processor_init(transformer_processor, transformer_processor_init):
         _cpu_offload,
         _num_heads,
         _mlp_hidden_ratio,
+        _dropout_p,
     ) = transformer_processor_init
     assert isinstance(transformer_processor, TransformerProcessor)
     assert transformer_processor.num_chunks == num_chunks
@@ -84,6 +89,7 @@ def test_transformer_processor_forward(transformer_processor, transformer_processor_init):
         _cpu_offload,
         _num_heads,
         _mlp_hidden_ratio,
+        _dropout_p,
     ) = transformer_processor_init
     gridsize = 100
     batch_size = 1
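Reviewer note: the property-based tests above only exercise construction. A small follow-up check one might add is that the probability actually reaches the attention module; this is a sketch only, and it assumes the block stores its MHSA layer in an attribute named attention, which this diff does not show.

from anemoi.models.layers.block import TransformerProcessorBlock

# Constructor call mirrors test_init above; the numbers are illustrative and
# num_channels stays divisible by num_heads.
block = TransformerProcessorBlock(
    num_channels=64, hidden_dim=128, num_heads=4, activation="GELU", window_size=16, dropout_p=0.25
)
# Hypothetical attribute name: assumes the block keeps the MHSA layer as self.attention.
assert block.attention.dropout_p == 0.25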
diff --git a/tests/layers/test_attention.py b/tests/layers/test_attention.py
index ffeaebc..9457317 100644
--- a/tests/layers/test_attention.py
+++ b/tests/layers/test_attention.py
@@ -18,17 +18,19 @@
 @given(
     num_heads=st.integers(min_value=1, max_value=50),
     embed_dim_multiplier=st.integers(min_value=1, max_value=10),
+    dropout_p=st.floats(min_value=0.0, max_value=1.0),
 )
-def test_multi_head_self_attention_init(num_heads, embed_dim_multiplier):
+def test_multi_head_self_attention_init(num_heads, embed_dim_multiplier, dropout_p):
     embed_dim = (
         num_heads * embed_dim_multiplier
     )  # TODO: Make assert in MHSA to check if embed_dim is divisible by num_heads
-    mhsa = MultiHeadSelfAttention(num_heads, embed_dim)
+    mhsa = MultiHeadSelfAttention(num_heads, embed_dim, dropout_p=dropout_p)
 
     assert isinstance(mhsa, nn.Module)
     assert mhsa.num_heads == num_heads
     assert mhsa.embed_dim == embed_dim
     assert mhsa.head_dim == embed_dim // num_heads
+    assert dropout_p == mhsa.dropout_p
 
 
 @pytest.mark.gpu
@@ -36,11 +38,12 @@ def test_multi_head_self_attention_init(num_heads, embed_dim_multiplier):
     batch_size=st.integers(min_value=1, max_value=64),
     num_heads=st.integers(min_value=1, max_value=20),
     embed_dim_multiplier=st.integers(min_value=1, max_value=10),
+    dropout_p=st.floats(min_value=0.0, max_value=1.0),
 )
 @settings(deadline=None)
-def test_multi_head_self_attention_forward(batch_size, num_heads, embed_dim_multiplier):
+def test_multi_head_self_attention_forward(batch_size, num_heads, embed_dim_multiplier, dropout_p):
     embed_dim = num_heads * embed_dim_multiplier
-    mhsa = MultiHeadSelfAttention(num_heads, embed_dim)
+    mhsa = MultiHeadSelfAttention(num_heads, embed_dim, dropout_p=dropout_p)
 
     x = torch.randn(batch_size * 2, embed_dim)
     shapes = [list(x.shape)]
@@ -54,10 +57,11 @@ def test_multi_head_self_attention_forward(batch_size, num_heads, embed_dim_multiplier):
     batch_size=st.integers(min_value=1, max_value=64),
     num_heads=st.integers(min_value=1, max_value=20),
     embed_dim_multiplier=st.integers(min_value=1, max_value=10),
+    dropout_p=st.floats(min_value=0.0, max_value=1.0),
 )
-def test_multi_head_self_attention_backward(batch_size, num_heads, embed_dim_multiplier):
+def test_multi_head_self_attention_backward(batch_size, num_heads, embed_dim_multiplier, dropout_p):
     embed_dim = num_heads * embed_dim_multiplier
-    mhsa = MultiHeadSelfAttention(num_heads, embed_dim)
+    mhsa = MultiHeadSelfAttention(num_heads, embed_dim, dropout_p=dropout_p)
 
     x = torch.randn(batch_size * 2, embed_dim, requires_grad=True)
     shapes = [list(x.shape)]