diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 947fd950632..76b51d617d1 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -27,8 +27,6 @@ updates:
       # https://github.com/pytorch/pytorch_sphinx_theme/issues/175
       - dependency-name: 'sphinx'
         versions: '>=6'
-      # segmentation-models-pytorch pins timm, must update in unison
-      - dependency-name: 'timm'
   - package-ecosystem: 'npm'
     directory: '/'
     schedule:
diff --git a/experiments/ssl4eo/flops.py b/experiments/ssl4eo/flops.py
index 5c7a77b78d5..c31352189c8 100755
--- a/experiments/ssl4eo/flops.py
+++ b/experiments/ssl4eo/flops.py
@@ -17,7 +17,7 @@
 for model in models:
     print(f'Model: {model}')
 
-    m = timm.create_model(model, num_classes=num_classes, in_chans=in_channels)
+    m = timm.create_model(model, num_classes=num_classes, in_chans=in_channels)  # type: ignore[attr-defined]
 
     # Calculate memory requirements of model
     mem_params = sum([p.nelement() * p.element_size() for p in m.parameters()])
diff --git a/requirements/required.txt b/requirements/required.txt
index 62295bb4c71..6f705fef858 100644
--- a/requirements/required.txt
+++ b/requirements/required.txt
@@ -14,9 +14,9 @@ pillow==11.1.0
 pyproj==3.7.0
 rasterio==1.4.3
 rtree==1.3.0
-segmentation-models-pytorch==0.3.4
+segmentation-models-pytorch==0.4.0
 shapely==2.0.6
-timm==0.9.7
+timm==1.0.13
 torch==2.5.1
 torchmetrics==1.6.1
 torchvision==0.20.1
diff --git a/tests/models/test_resnet.py b/tests/models/test_resnet.py
index 17cfe520ad9..55e6290882a 100644
--- a/tests/models/test_resnet.py
+++ b/tests/models/test_resnet.py
@@ -34,7 +34,7 @@ def mocked_weights(
         load_state_dict_from_url: None,
     ) -> WeightsEnum:
         path = tmp_path / f'{weights}.pth'
-        model = timm.create_model('resnet18', in_chans=weights.meta['in_chans'])
+        model = timm.create_model('resnet18', in_chans=weights.meta['in_chans'])  # type: ignore[attr-defined]
         torch.save(model.state_dict(), path)
         try:
             monkeypatch.setattr(weights.value, 'url', str(path))
@@ -78,7 +78,7 @@ def mocked_weights(
         load_state_dict_from_url: None,
     ) -> WeightsEnum:
         path = tmp_path / f'{weights}.pth'
-        model = timm.create_model('resnet50', in_chans=weights.meta['in_chans'])
+        model = timm.create_model('resnet50', in_chans=weights.meta['in_chans'])  # type: ignore[attr-defined]
         torch.save(model.state_dict(), path)
         try:
             monkeypatch.setattr(weights.value, 'url', str(path))
@@ -122,7 +122,7 @@ def mocked_weights(
         load_state_dict_from_url: None,
     ) -> WeightsEnum:
         path = tmp_path / f'{weights}.pth'
-        model = timm.create_model('resnet152', in_chans=weights.meta['in_chans'])
+        model = timm.create_model('resnet152', in_chans=weights.meta['in_chans'])  # type: ignore[attr-defined]
         torch.save(model.state_dict(), path)
         try:
             monkeypatch.setattr(weights.value, 'url', str(path))
diff --git a/tests/models/test_vit.py b/tests/models/test_vit.py
index 4ae0e47bfbc..55987d31754 100644
--- a/tests/models/test_vit.py
+++ b/tests/models/test_vit.py
@@ -27,7 +27,7 @@ def mocked_weights(
         load_state_dict_from_url: None,
     ) -> WeightsEnum:
         path = tmp_path / f'{weights}.pth'
-        model = timm.create_model(
+        model = timm.create_model(  # type: ignore[attr-defined]
             weights.meta['model'], in_chans=weights.meta['in_chans']
         )
         torch.save(model.state_dict(), path)
diff --git a/tests/trainers/test_byol.py b/tests/trainers/test_byol.py
index 808bf937220..bac512af031 100644
--- a/tests/trainers/test_byol.py
+++ b/tests/trainers/test_byol.py
@@ -89,7 +89,7 @@ def mocked_weights(
         load_state_dict_from_url: None,
     ) -> WeightsEnum:
         path = tmp_path / f'{weights}.pth'
-        model = timm.create_model(
+        model = timm.create_model(  # type: ignore[attr-defined]
             weights.meta['model'], in_chans=weights.meta['in_chans']
         )
         torch.save(model.state_dict(), path)
diff --git a/tests/trainers/test_classification.py b/tests/trainers/test_classification.py
index e2e2d9bb3e5..a2f4915d467 100644
--- a/tests/trainers/test_classification.py
+++ b/tests/trainers/test_classification.py
@@ -126,7 +126,7 @@ def mocked_weights(
         load_state_dict_from_url: None,
     ) -> WeightsEnum:
         path = tmp_path / f'{weights}.pth'
-        model = timm.create_model(
+        model = timm.create_model(  # type: ignore[attr-defined]
             weights.meta['model'], in_chans=weights.meta['in_chans']
         )
         torch.save(model.state_dict(), path)
diff --git a/tests/trainers/test_moco.py b/tests/trainers/test_moco.py
index 002944b929e..bf898efa6ad 100644
--- a/tests/trainers/test_moco.py
+++ b/tests/trainers/test_moco.py
@@ -91,7 +91,7 @@ def mocked_weights(
         load_state_dict_from_url: None,
     ) -> WeightsEnum:
         path = tmp_path / f'{weights}.pth'
-        model = timm.create_model(
+        model = timm.create_model(  # type: ignore[attr-defined]
             weights.meta['model'], in_chans=weights.meta['in_chans']
         )
         torch.save(model.state_dict(), path)
diff --git a/tests/trainers/test_regression.py b/tests/trainers/test_regression.py
index f4089283242..4e78b54ff61 100644
--- a/tests/trainers/test_regression.py
+++ b/tests/trainers/test_regression.py
@@ -115,7 +115,7 @@ def mocked_weights(
         load_state_dict_from_url: None,
     ) -> WeightsEnum:
         path = tmp_path / f'{weights}.pth'
-        model = timm.create_model(
+        model = timm.create_model(  # type: ignore[attr-defined]
             weights.meta['model'], in_chans=weights.meta['in_chans']
         )
         torch.save(model.state_dict(), path)
@@ -273,7 +273,7 @@ def mocked_weights(
         load_state_dict_from_url: None,
     ) -> WeightsEnum:
         path = tmp_path / f'{weights}.pth'
-        model = timm.create_model(
+        model = timm.create_model(  # type: ignore[attr-defined]
             weights.meta['model'], in_chans=weights.meta['in_chans']
         )
         torch.save(model.state_dict(), path)
diff --git a/tests/trainers/test_segmentation.py b/tests/trainers/test_segmentation.py
index 4bdd966a1bb..dd5d94559a6 100644
--- a/tests/trainers/test_segmentation.py
+++ b/tests/trainers/test_segmentation.py
@@ -138,7 +138,7 @@ def mocked_weights(
         load_state_dict_from_url: None,
     ) -> WeightsEnum:
         path = tmp_path / f'{weights}.pth'
-        model = timm.create_model(
+        model = timm.create_model(  # type: ignore[attr-defined]
             weights.meta['model'], in_chans=weights.meta['in_chans']
         )
         torch.save(model.state_dict(), path)
diff --git a/tests/trainers/test_simclr.py b/tests/trainers/test_simclr.py
index 3924b6e3785..aa380eaf069 100644
--- a/tests/trainers/test_simclr.py
+++ b/tests/trainers/test_simclr.py
@@ -89,7 +89,7 @@ def mocked_weights(
         load_state_dict_from_url: None,
     ) -> WeightsEnum:
         path = tmp_path / f'{weights}.pth'
-        model = timm.create_model(
+        model = timm.create_model(  # type: ignore[attr-defined]
             weights.meta['model'], in_chans=weights.meta['in_chans']
         )
         torch.save(model.state_dict(), path)
diff --git a/tests/trainers/test_utils.py b/tests/trainers/test_utils.py
index 0b5fbe15b55..e62220e17b1 100644
--- a/tests/trainers/test_utils.py
+++ b/tests/trainers/test_utils.py
@@ -34,7 +34,7 @@ def test_extract_backbone_unsupported_model(tmp_path: Path) -> None:
 
 
 def test_get_input_layer_name_and_module() -> None:
-    key, module = _get_input_layer_name_and_module(timm.create_model('resnet18'))
+    key, module = _get_input_layer_name_and_module(timm.create_model('resnet18'))  # type: ignore[attr-defined]
     assert key == 'conv1'
     assert isinstance(module, nn.Conv2d)
     assert module.in_channels == 3
diff --git a/torchgeo/models/dofa.py b/torchgeo/models/dofa.py
index 7184429aff7..8785de00488 100644
--- a/torchgeo/models/dofa.py
+++ b/torchgeo/models/dofa.py
@@ -311,7 +311,7 @@ def __init__(
                     num_heads,
                     mlp_ratio,
                     qkv_bias=True,
-                    norm_layer=norm_layer,
+                    norm_layer=norm_layer,  # type: ignore[arg-type]
                 )
                 for i in range(depth)
             ]
diff --git a/torchgeo/models/resnet.py b/torchgeo/models/resnet.py
index 7bbeab22dc8..8aee1f4a59a 100644
--- a/torchgeo/models/resnet.py
+++ b/torchgeo/models/resnet.py
@@ -768,7 +768,7 @@ def resnet18(
     if weights:
         kwargs['in_chans'] = weights.meta['in_chans']
 
-    model: ResNet = timm.create_model('resnet18', *args, **kwargs)
+    model: ResNet = timm.create_model('resnet18', *args, **kwargs)  # type: ignore[attr-defined]
 
     if weights:
         missing_keys, unexpected_keys = model.load_state_dict(
@@ -803,7 +803,7 @@ def resnet50(
     if weights:
         kwargs['in_chans'] = weights.meta['in_chans']
 
-    model: ResNet = timm.create_model('resnet50', *args, **kwargs)
+    model: ResNet = timm.create_model('resnet50', *args, **kwargs)  # type: ignore[attr-defined]
 
     if weights:
         missing_keys, unexpected_keys = model.load_state_dict(
@@ -837,7 +837,7 @@ def resnet152(
     if weights:
         kwargs['in_chans'] = weights.meta['in_chans']
 
-    model: ResNet = timm.create_model('resnet152', *args, **kwargs)
+    model: ResNet = timm.create_model('resnet152', *args, **kwargs)  # type: ignore[attr-defined]
 
     if weights:
         missing_keys, unexpected_keys = model.load_state_dict(
diff --git a/torchgeo/models/scale_mae.py b/torchgeo/models/scale_mae.py
index 7dd689e0f1e..91caa903c30 100644
--- a/torchgeo/models/scale_mae.py
+++ b/torchgeo/models/scale_mae.py
@@ -92,7 +92,7 @@ def get_1d_sincos_pos_embed_from_grid_torch(embed_dim: int, pos: Tensor) -> Tens
     return emb
 
 
-class ScaleMAE(VisionTransformer):  # type: ignore[misc]
+class ScaleMAE(VisionTransformer):
     """Custom Vision Transformer for Scale-MAE with GSD positional embeddings.
 
     This is a ViT encoder only model of the Scale-MAE architecture with GSD positional embeddings.
@@ -117,7 +117,8 @@ def __init__(self, res: float = 1.0, *args: Any, **kwargs: Any) -> None:
         self.res = res
 
         # Scale MAE uses resolution specific positional embeddings
-        self.pos_embed.requires_grad = False
+        if self.pos_embed is not None:
+            self.pos_embed.requires_grad = False
 
     def _pos_embed(self, x: Tensor) -> Tensor:
         """Apply GSD positional embeddings to the input tensor."""
@@ -133,8 +134,9 @@ def _pos_embed(self, x: Tensor) -> Tensor:
             .to(x.dtype)
             .to(x.device)
         )
-        cls_tokens = self.cls_token.expand(x.shape[0], -1, -1)
-        x = torch.cat((cls_tokens, x), dim=1)
+        if self.cls_token is not None:
+            cls_tokens = self.cls_token.expand(x.shape[0], -1, -1)
+            x = torch.cat((cls_tokens, x), dim=1)
         x = x + pos_embed
         x = self.pos_drop(x)
         return x
@@ -155,7 +157,9 @@ def interpolate_pos_embed(
     pos_embed_checkpoint = state_dict['pos_embed']
     embedding_size = pos_embed_checkpoint.shape[-1]
     num_patches = model.patch_embed.num_patches
-    num_extra_tokens = model.pos_embed.shape[-2] - num_patches
+    num_extra_tokens = 0
+    if model.pos_embed is not None:
+        num_extra_tokens = model.pos_embed.shape[-2] - num_patches
     # height (== width) for the checkpoint position embedding
     orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
     # height (== width) for the new position embedding
diff --git a/torchgeo/models/vit.py b/torchgeo/models/vit.py
index 3c876ed3fe7..5a769ff615c 100644
--- a/torchgeo/models/vit.py
+++ b/torchgeo/models/vit.py
@@ -243,7 +243,7 @@ def vit_small_patch16_224(
     if weights:
         kwargs['in_chans'] = weights.meta['in_chans']
 
-    model: VisionTransformer = timm.create_model(
+    model: VisionTransformer = timm.create_model(  # type: ignore[attr-defined]
         'vit_small_patch16_224', *args, **kwargs
     )
 
diff --git a/torchgeo/trainers/byol.py b/torchgeo/trainers/byol.py
index 18df10e02f0..f568c777581 100644
--- a/torchgeo/trainers/byol.py
+++ b/torchgeo/trainers/byol.py
@@ -332,7 +332,7 @@ def configure_models(self) -> None:
         in_channels: int = self.hparams['in_channels']
 
         # Create backbone
-        backbone = timm.create_model(
+        backbone = timm.create_model(  # type: ignore[attr-defined]
             self.hparams['model'], in_chans=in_channels, pretrained=weights is True
         )
 
diff --git a/torchgeo/trainers/classification.py b/torchgeo/trainers/classification.py
index 2e2766419a5..7d3a97d5641 100644
--- a/torchgeo/trainers/classification.py
+++ b/torchgeo/trainers/classification.py
@@ -80,7 +80,7 @@ def configure_models(self) -> None:
         weights = self.weights
 
         # Create model
-        self.model = timm.create_model(
+        self.model = timm.create_model(  # type: ignore[attr-defined]
             self.hparams['model'],
             num_classes=self.hparams['num_classes'],
             in_chans=self.hparams['in_channels'],
diff --git a/torchgeo/trainers/moco.py b/torchgeo/trainers/moco.py
index ce35855c12f..2e7e6907e37 100644
--- a/torchgeo/trainers/moco.py
+++ b/torchgeo/trainers/moco.py
@@ -238,10 +238,10 @@ def configure_models(self) -> None:
         output_dim: int = self.hparams['output_dim']
 
         # Create backbone
-        self.backbone = timm.create_model(
+        self.backbone = timm.create_model(  # type: ignore[attr-defined]
             model, in_chans=in_channels, num_classes=0, pretrained=weights is True
         )
-        self.backbone_momentum = timm.create_model(
+        self.backbone_momentum = timm.create_model(  # type: ignore[attr-defined]
             model, in_chans=in_channels, num_classes=0, pretrained=weights is True
         )
         deactivate_requires_grad(self.backbone_momentum)
diff --git a/torchgeo/trainers/regression.py b/torchgeo/trainers/regression.py
index 0381316050b..bf6b8a6f40e 100644
--- a/torchgeo/trainers/regression.py
+++ b/torchgeo/trainers/regression.py
@@ -83,7 +83,7 @@ def configure_models(self) -> None:
         """Initialize the model."""
         # Create model
         weights = self.weights
-        self.model = timm.create_model(
+        self.model = timm.create_model(  # type: ignore[attr-defined]
             self.hparams['model'],
             num_classes=self.hparams['num_outputs'],
             in_chans=self.hparams['in_channels'],
diff --git a/torchgeo/trainers/simclr.py b/torchgeo/trainers/simclr.py
index a0625f26ebb..107372dbcc7 100644
--- a/torchgeo/trainers/simclr.py
+++ b/torchgeo/trainers/simclr.py
@@ -153,7 +153,7 @@ def configure_models(self) -> None:
         weights = self.weights
 
         # Create backbone
-        self.backbone = timm.create_model(
+        self.backbone = timm.create_model(  # type: ignore[attr-defined]
             self.hparams['model'],
             in_chans=self.hparams['in_channels'],
             num_classes=0,