diff --git a/plugins/extract/_base.py b/plugins/extract/_base.py index 4abe8a5e9d..36902e9932 100644 --- a/plugins/extract/_base.py +++ b/plugins/extract/_base.py @@ -9,11 +9,10 @@ from dataclasses import dataclass, field import numpy as np -from tensorflow.python.framework import errors_impl as tf_errors # pylint:disable=no-name-in-module # noqa from lib.multithreading import MultiThread from lib.queue_manager import queue_manager -from lib.utils import GetModel, FaceswapError +from lib.utils import GetModel from ._config import Config from .pipeline import ExtractMedia @@ -109,7 +108,7 @@ class Extractor(): model_filename: str The name of the model file to be loaded exclude_gpus: list, optional - A list of indices correlating to connected GPUs that Tensorflow should not use. Pass + A list of indices correlating to connected GPUs that PyTorch should not use. Pass ``None`` to not exclude any GPUs. Default: ``None`` configfile: str, optional Path to a custom configuration ``ini`` file. Default: Use system configfile @@ -135,9 +134,6 @@ class Extractor(): vram: int Approximate VRAM used by the model at :attr:`input_size`. Used to calculate the :attr:`batchsize`. Be conservative to avoid OOM. - vram_warnings: int - Approximate VRAM used by the model at :attr:`input_size` that will still run, but generates - warnings. Used to calculate the :attr:`batchsize`. Be conservative to avoid OOM. vram_per_batch: int Approximate additional VRAM used by the model for each additional batch. Used to calculate the :attr:`batchsize`. Be conservative to avoid OOM. @@ -174,7 +170,6 @@ def __init__(self, self.input_size = 0 self.color_format: T.Literal["BGR", "RGB", "GRAY"] = "BGR" self.vram = 0 - self.vram_warnings = 0 # Will run at this with warnings self.vram_per_batch = 0 # << THE FOLLOWING ARE SET IN self.initialize METHOD >> # @@ -470,20 +465,7 @@ def initialize(self, *args, **kwargs) -> None: kwargs["out_queue"], [f"predict_{name}", f"post_{name}"]) self._compile_threads() - try: - self.init_model() - except tf_errors.UnknownError as err: - if "failed to get convolution algorithm" in str(err).lower(): - msg = ("Tensorflow raised an unknown error. This is most likely caused by a " - "failure to launch cuDNN which can occur for some GPU/Tensorflow " - "combinations. You should enable `allow_growth` to attempt to resolve this " - "issue:" - "\nGUI: Go to Settings > Extract Plugins > Global and enable the " - "`allow_growth` option." - "\nCLI: Go to `faceswap/config/extract.ini` and change the `allow_growth " - "option to `True`.") - raise FaceswapError(msg) from err - raise err + self.init_model() self._is_initialized = True logger.info("Initialized %s (%s) with batchsize of %s", self.name, self._plugin_type.title(), self.batchsize) @@ -598,20 +580,7 @@ def _thread_process(self, break if not batch.filename: # Batch not populated. Possible during re-aligns continue - try: - batch = function(batch) - except tf_errors.UnknownError as err: - if "failed to get convolution algorithm" in str(err).lower(): - msg = ("Tensorflow raised an unknown error. This is most likely caused by a " - "failure to launch cuDNN which can occur for some GPU/Tensorflow " - "combinations. You should enable `allow_growth` to attempt to resolve " - "this issue:" - "\nGUI: Go to Settings > Extract Plugins > Global and enable the " - "`allow_growth` option." 
-                           "\nCLI: Go to `faceswap/config/extract.ini` and change the "
-                           "`allow_growth option to `True`.")
-                    raise FaceswapError(msg) from err
-                raise err
+            batch = function(batch)
             if function.__name__ == "_process_output":
                 # Process output items to individual items from batch
                 for item in self.finalize(batch):
diff --git a/plugins/extract/_config.py b/plugins/extract/_config.py
index 8314cab2f0..528268fb5a 100644
--- a/plugins/extract/_config.py
+++ b/plugins/extract/_config.py
@@ -30,16 +30,6 @@ def set_globals(self) -> None:
         logger.debug("Setting global config")
         section = "global"
         self.add_section(section, _("Options that apply to all extraction plugins"))
-        self.add_item(
-            section=section,
-            title="allow_growth",
-            datatype=bool,
-            default=False,
-            group=_("settings"),
-            info=_("Enable the Tensorflow GPU `allow_growth` configuration option. "
-                   "This option prevents Tensorflow from allocating all of the GPU VRAM at launch "
-                   "but can lead to higher VRAM fragmentation and slower performance. Should only "
-                   "be enabled if you are having problems running extraction."))
         self.add_item(
             section=section,
             title="aligner_min_scale",
diff --git a/plugins/extract/align/_base/aligner.py b/plugins/extract/align/_base/aligner.py
index 3eec920c08..c249020924 100644
--- a/plugins/extract/align/_base/aligner.py
+++ b/plugins/extract/align/_base/aligner.py
@@ -21,8 +21,7 @@
 import cv2
 import numpy as np
-
-from tensorflow.python.framework import errors_impl as tf_errors  # pylint:disable=no-name-in-module # noqa
+from torch.cuda import OutOfMemoryError
 
 from lib.utils import FaceswapError
 from plugins.extract._base import BatchType, Extractor, ExtractMedia, ExtractorBatch
@@ -534,6 +533,7 @@ def _predict(self, batch: BatchType) -> AlignerBatch:
             preds = [self.predict(feed) for feed in batch.refeeds]
             try:
                 batch.prediction = np.array(preds)
+                logger.trace("Aligner out: %s", batch.prediction.shape)  # type:ignore[attr-defined]
             except ValueError as err:
                 # If refeed batches are different sizes, Numpy will error, so we need to explicitly
                 # set the dtype to 'object' rather than let it infer
@@ -549,8 +549,7 @@ def _predict(self, batch: BatchType) -> AlignerBatch:
             else:
                 raise
 
-            return batch
-        except tf_errors.ResourceExhaustedError as err:
+        except OutOfMemoryError as err:
             msg = ("You do not have enough GPU memory available to run detection at the "
                    "selected batch size. You can try a number of things:"
                    "\n1) Close any other application that is using your GPU (web browsers are "
@@ -561,6 +560,8 @@
                    "\n3) Enable 'Single Process' mode.")
             raise FaceswapError(msg) from err
 
+        return batch
+
     def _process_refeeds(self, batch: AlignerBatch) -> list[AlignerBatch]:
         """ Process the output for each selected re-feed
diff --git a/plugins/extract/align/fan.py b/plugins/extract/align/fan.py
index a829f3bcac..e446b7c71e 100644
--- a/plugins/extract/align/fan.py
+++ b/plugins/extract/align/fan.py
@@ -29,9 +29,8 @@ def __init__(self, **kwargs) -> None:
         self.name = "FAN"
         self.input_size = 256
         self.color_format = "RGB"
-        self.vram = 2240
-        self.vram_warnings = 512  # Will run at this with warnings
-        self.vram_per_batch = 64
+        self.vram = 896  # 810 in testing
+        self.vram_per_batch = 768  # ~720 in testing
         self.realign_centering = "head"
         self.batchsize: int = self.config["batch-size"]
         self.reference_scale = 200. / 195.
@@ -42,7 +41,6 @@ def init_model(self) -> None: assert isinstance(self.model_path, str) self.model = KSession(self.name, self.model_path, - allow_growth=self.config["allow_growth"], exclude_gpus=self._exclude_gpus) self.model.load_model() # Feed a placeholder so Aligner is primed for Manual tool @@ -224,7 +222,6 @@ def predict(self, feed: np.ndarray) -> np.ndarray: # TODO Remove lazy transpose and change points from predict to use the correct # order retval = self.model.predict(feed)[-1].transpose(0, 3, 1, 2) - logger.trace(retval.shape) # type:ignore[attr-defined] return retval def process_output(self, batch: BatchType) -> None: diff --git a/plugins/extract/detect/_base.py b/plugins/extract/detect/_base.py index c85f75776b..2e44bab0c1 100644 --- a/plugins/extract/detect/_base.py +++ b/plugins/extract/detect/_base.py @@ -23,8 +23,7 @@ import cv2 import numpy as np - -from tensorflow.python.framework import errors_impl as tf_errors # pylint:disable=no-name-in-module # noqa +from torch.cuda import OutOfMemoryError from lib.align import DetectedFace from lib.utils import FaceswapError @@ -280,15 +279,7 @@ def _predict(self, batch: BatchType) -> DetectorBatch: self._rotate_batch(batch, angle) try: pred = self.predict(batch.feed) - if angle == 0: - batch.prediction = pred - else: - batch.prediction = np.array([b if b.any() else p - for b, p in zip(batch.prediction, pred)]) - logger.trace("angle: %s, filenames: %s, " # type:ignore[attr-defined] - "prediction: %s", - angle, batch.filename, pred) - except tf_errors.ResourceExhaustedError as err: + except OutOfMemoryError as err: msg = ("You do not have enough GPU memory available to run detection at the " "selected batch size. You can try a number of things:" "\n1) Close any other application that is using your GPU (web browsers are " @@ -299,6 +290,15 @@ def _predict(self, batch: BatchType) -> DetectorBatch: "\n3) Enable 'Single Process' mode.") raise FaceswapError(msg) from err + if angle == 0: + batch.prediction = pred + else: + batch.prediction = np.array([b if b.any() else p + for b, p in zip(batch.prediction, pred)]) + logger.trace("angle: %s, filenames: %s, " # type:ignore[attr-defined] + "prediction: %s", + angle, batch.filename, pred) + if angle != 0 and any(face.any() for face in batch.prediction): logger.verbose("found face(s) by rotating image %s " # type:ignore[attr-defined] "degrees", diff --git a/plugins/extract/detect/mtcnn.py b/plugins/extract/detect/mtcnn.py index 8af8a41bef..b0efe8d80d 100644 --- a/plugins/extract/detect/mtcnn.py +++ b/plugins/extract/detect/mtcnn.py @@ -7,14 +7,13 @@ import cv2 import numpy as np -# Ignore linting errors from Tensorflow's thoroughly broken import system -from tensorflow.keras.layers import Conv2D, Dense, Flatten, Input, MaxPool2D, Permute, PReLU # noqa:E501 # pylint:disable=import-error +from keras.layers import Conv2D, Dense, Flatten, Input, MaxPooling2D, Permute, PReLU from lib.model.session import KSession from ._base import BatchType, Detector if T.TYPE_CHECKING: - from tensorflow import Tensor + from torch import Tensor logger = logging.getLogger(__name__) @@ -27,9 +26,8 @@ def __init__(self, **kwargs) -> None: super().__init__(git_model_id=git_model_id, model_filename=model_filename, **kwargs) self.name = "MTCNN" self.input_size = 640 - self.vram = 320 if not self.config["cpu"] else 0 - self.vram_warnings = 64 if not self.config["cpu"] else 0 # Will run at this with warnings - self.vram_per_batch = 32 if not self.config["cpu"] else 0 + self.vram = 128 if not self.config["cpu"] else 0 # 66 
in testing + self.vram_per_batch = 64 if not self.config["cpu"] else 0 # ~50 in testing self.batchsize = self.config["batch-size"] self.kwargs = self._validate_kwargs() self.color_format = "RGB" @@ -63,7 +61,6 @@ def init_model(self) -> None: """ Initialize MTCNN Model. """ assert isinstance(self.model_path, list) self.model = MTCNN(self.model_path, - self.config["allow_growth"], self._exclude_gpus, self.config["cpu"], **self.kwargs) # type:ignore @@ -145,10 +142,6 @@ class PNet(KSession): ---------- model_path: str The path to the keras model file - allow_growth: bool, optional - Enable the Tensorflow GPU allow_growth configuration option. This option prevents - Tensorflow from allocating all of the GPU VRAM, but can lead to higher fragmentation and - slower performance. Default: ``False`` exclude_gpus: list, optional A list of indices correlating to connected GPUs that Tensorflow should not use. Pass ``None`` to not exclude any GPUs. Default: ``None`` @@ -163,7 +156,6 @@ class PNet(KSession): """ def __init__(self, model_path: str, - allow_growth: bool, exclude_gpus: list[int] | None, cpu_mode: bool, input_size: int, @@ -172,7 +164,6 @@ def __init__(self, threshold: float) -> None: super().__init__("MTCNN-PNet", model_path, - allow_growth=allow_growth, exclude_gpus=exclude_gpus, cpu_mode=cpu_mode) @@ -193,7 +184,7 @@ def model_definition() -> tuple[list[Tensor], list[Tensor]]: input_ = Input(shape=(None, None, 3)) var_x = Conv2D(10, (3, 3), strides=1, padding='valid', name='conv1')(input_) var_x = PReLU(shared_axes=[1, 2], name='PReLU1')(var_x) - var_x = MaxPool2D(pool_size=2)(var_x) + var_x = MaxPooling2D(pool_size=2)(var_x) var_x = Conv2D(16, (3, 3), strides=1, padding='valid', name='conv2')(var_x) var_x = PReLU(shared_axes=[1, 2], name='PReLU2')(var_x) var_x = Conv2D(32, (3, 3), strides=1, padding='valid', name='conv3')(var_x) @@ -326,10 +317,6 @@ class RNet(KSession): ---------- model_path: str The path to the keras model file - allow_growth: bool, optional - Enable the Tensorflow GPU allow_growth configuration option. This option prevents - Tensorflow from allocating all of the GPU VRAM, but can lead to higher fragmentation and - slower performance. Default: ``False`` exclude_gpus: list, optional A list of indices correlating to connected GPUs that Tensorflow should not use. Pass ``None`` to not exclude any GPUs. 
Default: ``None`` @@ -343,14 +330,12 @@ class RNet(KSession): """ def __init__(self, model_path: str, - allow_growth: bool, exclude_gpus: list[int] | None, cpu_mode: bool, input_size: int, threshold: float) -> None: super().__init__("MTCNN-RNet", model_path, - allow_growth=allow_growth, exclude_gpus=exclude_gpus, cpu_mode=cpu_mode) self.define_model(self.model_definition) @@ -365,11 +350,11 @@ def model_definition() -> tuple[list[Tensor], list[Tensor]]: input_ = Input(shape=(24, 24, 3)) var_x = Conv2D(28, (3, 3), strides=1, padding='valid', name='conv1')(input_) var_x = PReLU(shared_axes=[1, 2], name='prelu1')(var_x) - var_x = MaxPool2D(pool_size=3, strides=2, padding='same')(var_x) + var_x = MaxPooling2D(pool_size=3, strides=2, padding='same')(var_x) var_x = Conv2D(48, (3, 3), strides=1, padding='valid', name='conv2')(var_x) var_x = PReLU(shared_axes=[1, 2], name='prelu2')(var_x) - var_x = MaxPool2D(pool_size=3, strides=2)(var_x) + var_x = MaxPooling2D(pool_size=3, strides=2)(var_x) var_x = Conv2D(64, (2, 2), strides=1, padding='valid', name='conv3')(var_x) var_x = PReLU(shared_axes=[1, 2], name='prelu3')(var_x) @@ -457,10 +442,6 @@ class ONet(KSession): ---------- model_path: str The path to the keras model file - allow_growth: bool, optional - Enable the Tensorflow GPU allow_growth configuration option. This option prevents - Tensorflow from allocating all of the GPU VRAM, but can lead to higher fragmentation and - slower performance. Default: ``False`` exclude_gpus: list, optional A list of indices correlating to connected GPUs that Tensorflow should not use. Pass ``None`` to not exclude any GPUs. Default: ``None`` @@ -473,14 +454,12 @@ class ONet(KSession): """ def __init__(self, model_path: str, - allow_growth: bool, exclude_gpus: list[int] | None, cpu_mode: bool, input_size: int, threshold: float) -> None: super().__init__("MTCNN-ONet", model_path, - allow_growth=allow_growth, exclude_gpus=exclude_gpus, cpu_mode=cpu_mode) self.define_model(self.model_definition) @@ -495,13 +474,13 @@ def model_definition() -> tuple[list[Tensor], list[Tensor]]: input_ = Input(shape=(48, 48, 3)) var_x = Conv2D(32, (3, 3), strides=1, padding='valid', name='conv1')(input_) var_x = PReLU(shared_axes=[1, 2], name='prelu1')(var_x) - var_x = MaxPool2D(pool_size=3, strides=2, padding='same')(var_x) + var_x = MaxPooling2D(pool_size=3, strides=2, padding='same')(var_x) var_x = Conv2D(64, (3, 3), strides=1, padding='valid', name='conv2')(var_x) var_x = PReLU(shared_axes=[1, 2], name='prelu2')(var_x) - var_x = MaxPool2D(pool_size=3, strides=2)(var_x) + var_x = MaxPooling2D(pool_size=3, strides=2)(var_x) var_x = Conv2D(64, (3, 3), strides=1, padding='valid', name='conv3')(var_x) var_x = PReLU(shared_axes=[1, 2], name='prelu3')(var_x) - var_x = MaxPool2D(pool_size=2)(var_x) + var_x = MaxPooling2D(pool_size=2)(var_x) var_x = Conv2D(128, (2, 2), strides=1, padding='valid', name='conv4')(var_x) var_x = PReLU(shared_axes=[1, 2], name='prelu4')(var_x) var_x = Permute((3, 2, 1))(var_x) @@ -603,10 +582,6 @@ class MTCNN(): # pylint: disable=too-few-public-methods ---------- model_path: list List of paths to the 3 MTCNN subnet weights - allow_growth: bool, optional - Enable the Tensorflow GPU allow_growth configuration option. This option prevents - Tensorflow from allocating all of the GPU VRAM, but can lead to higher fragmentation and - slower performance. Default: ``False`` exclude_gpus: list, optional A list of indices correlating to connected GPUs that Tensorflow should not use. Pass ``None`` to not exclude any GPUs. 
Default: ``None`` @@ -624,21 +599,19 @@ class MTCNN(): # pylint: disable=too-few-public-methods """ def __init__(self, model_path: list[str], - allow_growth: bool, exclude_gpus: list[int] | None, cpu_mode: bool, input_size: int = 640, minsize: int = 20, threshold: list[float] | None = None, factor: float = 0.709) -> None: - logger.debug("Initializing: %s: (model_path: '%s', allow_growth: %s, exclude_gpus: %s, " + logger.debug("Initializing: %s: (model_path: '%s', exclude_gpus: %s, " "input_size: %s, minsize: %s, threshold: %s, factor: %s)", - self.__class__.__name__, model_path, allow_growth, exclude_gpus, - input_size, minsize, threshold, factor) + self.__class__.__name__, model_path, exclude_gpus, input_size, minsize, + threshold, factor) threshold = [0.6, 0.7, 0.7] if threshold is None else threshold self._pnet = PNet(model_path[0], - allow_growth, exclude_gpus, cpu_mode, input_size, @@ -646,13 +619,11 @@ def __init__(self, factor, threshold[0]) self._rnet = RNet(model_path[1], - allow_growth, exclude_gpus, cpu_mode, input_size, threshold[1]) self._onet = ONet(model_path[2], - allow_growth, exclude_gpus, cpu_mode, input_size, diff --git a/plugins/extract/detect/s3fd.py b/plugins/extract/detect/s3fd.py index 89d538b76f..676f7999fe 100644 --- a/plugins/extract/detect/s3fd.py +++ b/plugins/extract/detect/s3fd.py @@ -12,17 +12,14 @@ from scipy.special import logsumexp import numpy as np -# Ignore linting errors from Tensorflow's thoroughly broken import system -from tensorflow import keras -from tensorflow.keras import backend as K # pylint:disable=import-error -from tensorflow.keras.layers import ( # pylint:disable=import-error - Concatenate, Conv2D, Input, Maximum, MaxPooling2D, ZeroPadding2D) +from keras.layers import (Concatenate, Conv2D, Input, Layer, Maximum, MaxPooling2D, ZeroPadding2D) +from keras import initializers, ops from lib.model.session import KSession from ._base import BatchType, Detector if T.TYPE_CHECKING: - from tensorflow import Tensor + from torch import Tensor logger = logging.getLogger(__name__) @@ -35,9 +32,8 @@ def __init__(self, **kwargs) -> None: super().__init__(git_model_id=git_model_id, model_filename=model_filename, **kwargs) self.name = "S3FD" self.input_size = 640 - self.vram = 4112 - self.vram_warnings = 1024 # Will run at this with warnings - self.vram_per_batch = 208 + self.vram = 1088 # 1034 in testing + self.vram_per_batch = 960 # 922 in testing self.batchsize = self.config["batch-size"] def init_model(self) -> None: @@ -47,7 +43,6 @@ def init_model(self) -> None: model_kwargs = {"custom_objects": {"L2Norm": L2Norm, "SliceO2K": SliceO2K}} self.model = S3fd(self.model_path, model_kwargs, - self.config["allow_growth"], self._exclude_gpus, confidence) @@ -71,7 +66,7 @@ def process_output(self, batch) -> None: ################################################################################ # CUSTOM KERAS LAYERS ################################################################################ -class L2Norm(keras.layers.Layer): +class L2Norm(Layer): """ L2 Normalization layer for S3FD. 
Parameters
@@ -85,10 +80,10 @@ def __init__(self, n_channels: int, scale: float = 1.0, **kwargs) -> None:
         super().__init__(**kwargs)
         self._n_channels = n_channels
         self._scale = scale
-        self.w = self.add_weight("l2norm",  # pylint:disable=invalid-name
-                                 (self._n_channels, ),
+        self.w = self.add_weight(name="l2norm",
+                                 shape=(self._n_channels, ),
                                  trainable=True,
-                                 initializer=keras.initializers.Constant(value=self._scale),
+                                 initializer=initializers.Constant(value=self._scale),
                                  dtype="float32")
 
     def call(self, inputs: Tensor) -> Tensor:  # pylint:disable=arguments-differ
@@ -104,10 +99,26 @@ def call(self, inputs: Tensor) -> Tensor:  # pylint:disable=arguments-differ
         tensor:
             The output from the L2 Normalization Layer
         """
-        norm = K.sqrt(K.sum(K.pow(inputs, 2), axis=-1, keepdims=True)) + 1e-10
+        norm = ops.sqrt(ops.sum(ops.power(inputs, 2), axis=-1, keepdims=True)) + 1e-10
         var_x = inputs / norm * self.w
         return var_x
 
+    def compute_output_shape(self,
+                             input_shape: tuple[None, int, int, int]) -> tuple[None, int, int, int]:
+        """ Input shape and output shape are the same
+
+        Parameters
+        ----------
+        input_shape: tuple[None, int, int, int]
+            Shape of the input tensor
+
+        Returns
+        -------
+        tuple[None, int, int, int]
+            The shape of the output tensor
+        """
+        return input_shape
+
     def get_config(self) -> dict:
         """ Returns the config of the layer.
@@ -122,7 +133,7 @@ def get_config(self) -> dict:
         return config
 
 
-class SliceO2K(keras.layers.Layer):
+class SliceO2K(Layer):
     """ Custom Keras Slice layer generated by onnx2keras. """
     def __init__(self,
                  starts: list[int],
@@ -202,8 +213,8 @@ def call(self, inputs, **kwargs):  # pylint:disable=unused-argument,arguments-di
            A tensor or list/tuple of tensors. The layer output
        """
-        ax_map = dict((x[0], slice(*x[1:])) for x in self._get_slices(K.ndim(inputs)))
-        shape = K.int_shape(inputs)
+        ax_map = dict((x[0], slice(*x[1:])) for x in self._get_slices(ops.ndim(inputs)))
+        shape = inputs.shape
         slices = [(ax_map[a] if a in ax_map else slice(None)) for a in range(len(shape))]
         retval = inputs[tuple(slices)]
         return retval
@@ -229,16 +240,14 @@ class S3fd(KSession):
     def __init__(self,
                  model_path: str,
                  model_kwargs: dict,
-                 allow_growth: bool,
                  exclude_gpus: list[int] | None,
                  confidence: float) -> None:
-        logger.debug("Initializing: %s: (model_path: '%s', model_kwargs: %s, allow_growth: %s, "
-                     "exclude_gpus: %s, confidence: %s)", self.__class__.__name__, model_path,
-                     model_kwargs, allow_growth, exclude_gpus, confidence)
+        logger.debug("Initializing: %s: (model_path: '%s', model_kwargs: %s, exclude_gpus: %s, "
+                     "confidence: %s)", self.__class__.__name__, model_path, model_kwargs,
+                     exclude_gpus, confidence)
         super().__init__("S3FD",
                          model_path,
                          model_kwargs=model_kwargs,
-                         allow_growth=allow_growth,
                          exclude_gpus=exclude_gpus)
         self.define_model(self.model_definition)
         self.load_model_weights()
diff --git a/plugins/extract/mask/_base.py b/plugins/extract/mask/_base.py
index 837b6812e7..cc6d7f7874 100644
--- a/plugins/extract/mask/_base.py
+++ b/plugins/extract/mask/_base.py
@@ -20,8 +20,7 @@
 import cv2
 import numpy as np
-
-from tensorflow.python.framework import errors_impl as tf_errors  # pylint:disable=no-name-in-module # noqa
+from torch.cuda import OutOfMemoryError
 
 from lib.align import AlignedFace, transform_image
 from lib.utils import FaceswapError
@@ -204,18 +203,17 @@ def _predict(self, batch: BatchType) -> MaskerBatch:
         """ Just return the masker's predict function """
         assert isinstance(batch, MaskerBatch)
         assert self.name is not None
-        try:
-            # slightly hacky 
workaround to deal with landmarks based masks: - if self.name.lower() in ("components", "extended"): - feed = np.empty(2, dtype="object") - feed[0] = batch.feed - feed[1] = batch.feed_faces - else: - feed = batch.feed + # slightly hacky workaround to deal with landmarks based masks: + if self.name.lower() in ("components", "extended"): + feed = np.empty(2, dtype="object") + feed[0] = batch.feed + feed[1] = batch.feed_faces + else: + feed = batch.feed + try: batch.prediction = self.predict(feed) - return batch - except tf_errors.ResourceExhaustedError as err: + except OutOfMemoryError as err: msg = ("You do not have enough GPU memory available to run detection at the " "selected batch size. You can try a number of things:" "\n1) Close any other application that is using your GPU (web browsers are " @@ -224,7 +222,9 @@ def _predict(self, batch: BatchType) -> MaskerBatch: "editing the plugin settings (GUI: Settings > Configure extract settings, " "CLI: Edit the file faceswap/config/extract.ini)." "\n3) Enable 'Single Process' mode.") - raise FaceswapError(msg) from err + raise FaceswapError(msg) from err + + return batch def finalize(self, batch: BatchType) -> Generator[ExtractMedia, None, None]: """ Finalize the output from Masker diff --git a/plugins/extract/mask/bisenet_fp.py b/plugins/extract/mask/bisenet_fp.py index cf8a177fe6..ab694b9958 100644 --- a/plugins/extract/mask/bisenet_fp.py +++ b/plugins/extract/mask/bisenet_fp.py @@ -10,9 +10,8 @@ import numpy as np -# Ignore linting errors from Tensorflow's thoroughly broken import system -from tensorflow.keras import backend as K # pylint:disable=import-error -from tensorflow.keras.layers import ( # pylint:disable=import-error +import keras.backend as K +from keras.layers import ( # pylint:disable=import-error Activation, Add, BatchNormalization, Concatenate, Conv2D, GlobalAveragePooling2D, Input, MaxPooling2D, Multiply, Reshape, UpSampling2D, ZeroPadding2D) @@ -21,7 +20,7 @@ from ._base import BatchType, Masker, MaskerBatch if T.TYPE_CHECKING: - from tensorflow import Tensor + from torch import Tensor logger = logging.getLogger(__name__) @@ -39,9 +38,8 @@ def __init__(self, **kwargs) -> None: self.name = "BiSeNet - Face Parsing" self.input_size = 512 self.color_format = "RGB" - self.vram = 2304 if not self.config["cpu"] else 0 - self.vram_warnings = 256 if not self.config["cpu"] else 0 - self.vram_per_batch = 64 if not self.config["cpu"] else 0 + self.vram = 384 if not self.config["cpu"] else 0 # 378 in testing + self.vram_per_batch = 384 if not self.config["cpu"] else 0 # ~328 in testing self.batchsize = self.config["batch-size"] self._segment_indices = self._get_segment_indices() @@ -108,7 +106,6 @@ def init_model(self) -> None: assert isinstance(self.model_path, str) lbls = 5 if self._is_faceswap else 19 self.model = BiSeNet(self.model_path, - self.config["allow_growth"], self._exclude_gpus, self.input_size, lbls, @@ -299,7 +296,7 @@ def _basic_block(self, inputs: Tensor, prefix: str, filters: int, strides: int = res = ConvBn(filters, strides=1, padding=1, activation=False, prefix=prefix)(res) shortcut = inputs - filts = (K.int_shape(shortcut)[self._feature_index], K.int_shape(res)[self._feature_index]) + filts = (shortcut.shape[self._feature_index], res.shape[self._feature_index]) if strides != 1 or filts[0] != filts[1]: # Downsample name = f"{prefix}.downsample." 
shortcut = Conv2D(filters, 1, @@ -400,7 +397,7 @@ def __call__(self, inputs: Tensor, feats: int) -> Tensor: prefix = f"cp.arm{feats}" feat = ConvBn(self._filters, prefix=f"{prefix}.conv", start_idx=-1, padding=-1)(inputs) atten = GlobalAveragePooling2D(name=f"{prefix}.avgpool")(feat) - atten = Reshape((1, 1, K.int_shape(atten)[-1]))(atten) + atten = Reshape((1, 1, atten.shape[-1]))(atten) atten = Conv2D(self._filters, 1, use_bias=False, name=f"{prefix}.conv_atten")(atten) atten = BatchNormalization(epsilon=1e-5, name=f"{prefix}.bn_atten")(atten) atten = Activation("sigmoid", name=f"{prefix}.sigmoid")(atten) @@ -429,10 +426,10 @@ def __call__(self, inputs: Tensor) -> Tensor: feat8, feat16, feat32 = self._resnet(inputs) avg = GlobalAveragePooling2D(name="cp.avgpool")(feat32) - avg = Reshape((1, 1, K.int_shape(avg)[-1]))(avg) + avg = Reshape((1, 1, avg.shape[-1]))(avg) avg = ConvBn(128, kernel_size=1, padding=0, prefix="cp.conv_avg", start_idx=-1)(avg) - avg_up = UpSampling2D(size=K.int_shape(feat32)[1:3], name="cp.upsample")(avg) + avg_up = UpSampling2D(size=feat32.shape[1:3], name="cp.upsample")(avg) feat32 = AttentionRefinementModule(128)(feat32, 32) feat32 = Add(name="cp.add")([feat32, avg_up]) @@ -480,7 +477,7 @@ def __call__(self, inputs: Tensor) -> Tensor: start_idx=-1)(feat) atten = GlobalAveragePooling2D(name="ffm.avgpool")(feat) - atten = Reshape((1, 1, K.int_shape(atten)[-1]))(atten) + atten = Reshape((1, 1, atten.shape[-1]))(atten) atten = Conv2D(self._filters // 4, 1, use_bias=False, name="ffm.conv1")(atten) atten = Activation("relu", name="ffm.relu")(atten) atten = Conv2D(self._filters, 1, use_bias=False, name="ffm.conv2")(atten) @@ -537,10 +534,6 @@ class BiSeNet(KSession): ---------- model_path: str The path to the keras model file - allow_growth: bool - Enable the Tensorflow GPU allow_growth configuration option. This option prevents - Tensorflow from allocating all of the GPU VRAM, but can lead to higher fragmentation and - slower performance exclude_gpus: list A list of indices correlating to connected GPUs that Tensorflow should not use. 
Pass ``None`` to not exclude any GPUs @@ -553,14 +546,12 @@ class BiSeNet(KSession): """ def __init__(self, model_path: str, - allow_growth: bool, exclude_gpus: list[int] | None, input_size: int, num_classes: int, cpu_mode: bool) -> None: super().__init__("BiSeNet Face Parsing", model_path, - allow_growth=allow_growth, exclude_gpus=exclude_gpus, cpu_mode=cpu_mode) self._input_size = input_size @@ -587,10 +578,10 @@ def _model_definition(self) -> tuple[Tensor, list[Tensor]]: feat_out16 = BiSeNetOutput(64, self._num_classes, label="16")(features[1]) feat_out32 = BiSeNetOutput(64, self._num_classes, label="32")(features[2]) - height, width = K.int_shape(input_)[1:3] - f_h, f_w = K.int_shape(feat_out)[1:3] - f_h16, f_w16 = K.int_shape(feat_out16)[1:3] - f_h32, f_w32 = K.int_shape(feat_out32)[1:3] + height, width = input_.shape[1:3] + f_h, f_w = feat_out.shape[1:3] + f_h16, f_w16 = feat_out16.shape[1:3] + f_h32, f_w32 = feat_out32.shape[1:3] feat_out = UpSampling2D(size=(height // f_h, width // f_w), interpolation="bilinear")(feat_out) diff --git a/plugins/extract/mask/unet_dfl.py b/plugins/extract/mask/unet_dfl.py index 4ca2f3dc07..598c630889 100644 --- a/plugins/extract/mask/unet_dfl.py +++ b/plugins/extract/mask/unet_dfl.py @@ -31,9 +31,8 @@ def __init__(self, **kwargs) -> None: self.model: KSession self.name = "U-Net" self.input_size = 256 - self.vram = 3424 - self.vram_warnings = 256 - self.vram_per_batch = 80 + self.vram = 320 # 276 in testing + self.vram_per_batch = 256 # ~215 in testing self.batchsize = self.config["batch-size"] self._storage_centering = "legacy" @@ -42,7 +41,6 @@ def init_model(self) -> None: self.model = KSession(self.name, self.model_path, model_kwargs={}, - allow_growth=self.config["allow_growth"], exclude_gpus=self._exclude_gpus) self.model.load_model() placeholder = np.zeros((self.batchsize, self.input_size, self.input_size, 3), diff --git a/plugins/extract/mask/vgg_clear.py b/plugins/extract/mask/vgg_clear.py index 50165f8015..ba2535f481 100644 --- a/plugins/extract/mask/vgg_clear.py +++ b/plugins/extract/mask/vgg_clear.py @@ -6,8 +6,7 @@ import numpy as np -# Ignore linting errors from Tensorflow's thoroughly broken import system -from tensorflow.keras.layers import ( # pylint:disable=import-error +from keras.layers import ( Add, Conv2D, Conv2DTranspose, Cropping2D, Dropout, Input, Lambda, MaxPooling2D, ZeroPadding2D) @@ -15,7 +14,7 @@ from ._base import BatchType, Masker, MaskerBatch if T.TYPE_CHECKING: - from tensorflow import Tensor + from torch import Tensor logger = logging.getLogger(__name__) @@ -29,15 +28,13 @@ def __init__(self, **kwargs) -> None: self.model: KSession self.name = "VGG Clear" self.input_size = 300 - self.vram = 2944 - self.vram_warnings = 1088 # at BS 1. OOMs at higher batch sizes - self.vram_per_batch = 400 + self.vram = 1344 # 1308 in testing + self.vram_per_batch = 448 # ~402 in testing self.batchsize = self.config["batch-size"] def init_model(self) -> None: assert isinstance(self.model_path, str) self.model = VGGClear(self.model_path, - allow_growth=self.config["allow_growth"], exclude_gpus=self._exclude_gpus) self.model.append_softmax_activation(layer_index=-1) placeholder = np.zeros((self.batchsize, self.input_size, self.input_size, 3), @@ -73,10 +70,6 @@ class VGGClear(KSession): ---------- model_path: str The path to the keras model file - allow_growth: bool - Enable the Tensorflow GPU allow_growth configuration option. 
This option prevents - Tensorflow from allocating all of the GPU VRAM, but can lead to higher fragmentation and - slower performance exclude_gpus: list A list of indices correlating to connected GPUs that Tensorflow should not use. Pass ``None`` to not exclude any GPUs @@ -93,11 +86,9 @@ class VGGClear(KSession): """ def __init__(self, model_path: str, - allow_growth: bool, exclude_gpus: list[int] | None): super().__init__("VGG Obstructed", model_path, - allow_growth=allow_growth, exclude_gpus=exclude_gpus) self.define_model(self._model_definition) self.load_model_weights() diff --git a/plugins/extract/mask/vgg_obstructed.py b/plugins/extract/mask/vgg_obstructed.py index a3f543d7e8..54d30af3ed 100644 --- a/plugins/extract/mask/vgg_obstructed.py +++ b/plugins/extract/mask/vgg_obstructed.py @@ -6,8 +6,7 @@ import numpy as np -# Ignore linting errors from Tensorflow's thoroughly broken import system -from tensorflow.keras.layers import ( # pylint:disable=import-error +from keras.layers import ( Add, Conv2D, Conv2DTranspose, Cropping2D, Dropout, Input, Lambda, MaxPooling2D, ZeroPadding2D) @@ -15,7 +14,7 @@ from ._base import BatchType, Masker, MaskerBatch if T.TYPE_CHECKING: - from tensorflow import Tensor + from torch import Tensor logger = logging.getLogger(__name__) @@ -29,15 +28,13 @@ def __init__(self, **kwargs) -> None: self.model: KSession self.name = "VGG Obstructed" self.input_size = 500 - self.vram = 3936 - self.vram_warnings = 1088 # at BS 1. OOMs at higher batch sizes - self.vram_per_batch = 304 + self.vram = 1728 # 1710 in testing + self.vram_per_batch = 896 # ~886 in testing self.batchsize = self.config["batch-size"] def init_model(self) -> None: assert isinstance(self.model_path, str) self.model = VGGObstructed(self.model_path, - allow_growth=self.config["allow_growth"], exclude_gpus=self._exclude_gpus) self.model.append_softmax_activation(layer_index=-1) placeholder = np.zeros((self.batchsize, self.input_size, self.input_size, 3), @@ -72,10 +69,6 @@ class VGGObstructed(KSession): ---------- model_path: str The path to the keras model file - allow_growth: bool - Enable the Tensorflow GPU allow_growth configuration option. This option prevents - Tensorflow from allocating all of the GPU VRAM, but can lead to higher fragmentation and - slower performance exclude_gpus: list A list of indices correlating to connected GPUs that Tensorflow should not use. Pass ``None`` to not exclude any GPUs @@ -89,11 +82,9 @@ class VGGObstructed(KSession): """ def __init__(self, model_path: str, - allow_growth: bool, exclude_gpus: list[int] | None) -> None: super().__init__("VGG Obstructed", model_path, - allow_growth=allow_growth, exclude_gpus=exclude_gpus) self.define_model(self._model_definition) self.load_model_weights() diff --git a/plugins/extract/pipeline.py b/plugins/extract/pipeline.py index 80f598acc9..b4e5db64b5 100644 --- a/plugins/extract/pipeline.py +++ b/plugins/extract/pipeline.py @@ -692,12 +692,64 @@ def _launch_plugin(self, phase: str) -> None: plugin.start() logger.debug("Launched %s plugin", phase) + def _set_plugins_batchsize(self, gpu_plugins: list[str], vram_free: int) -> None: + """ Set the batch size for the current phase so that it will fit in available VRAM. + + Do not update plugins which have a vram_per_batch of 0 (CPU plugins) due to + zero division error. + + Reduces the batchsize of the plugin which has a batch size > 1 and the largest VRAM + requirements. 
The final reduction is applied to the plugin
+        which has a batch size > 1 and the smallest VRAM requirement that still fits the
+        pipeline inside the available VRAM.
+
+        Parameters
+        ----------
+        gpu_plugins: list[str]
+            The name of the plugins that use the GPU for the current phase
+        vram_free: int
+            The amount of available VRAM, in MBs
+        """
+        logger.debug("GPU plugins: %s, Available vram: %s", gpu_plugins, vram_free)
+        plugins = [self._active_plugins[idx]
+                   for idx, plugin in enumerate(self._current_phase)
+                   if plugin in gpu_plugins]
+        base_vram = sum(p.vram for p in plugins)
+        vram_free = vram_free - base_vram
+        logger.debug("Base vram: %s, remaining vram: %s", base_vram, vram_free)
+
+        to_allocate = [(p.batchsize, p.vram_per_batch) for p in plugins]
+        excess = sum(a[0] * a[1] for a in to_allocate) - vram_free
+        logger.debug("Plugins to allocate: %s, excess vram: %s", to_allocate, excess)
+
+        while excess > 0:
+            chosen = next(p for p in to_allocate
+                          if p[0] > 1 and p[1] == max(p[1] for p in to_allocate if p[0] > 1))
+
+            if excess - chosen[1] <= 0:
+                chosen = next(p for p in to_allocate
+                              if p[0] > 1 and p[1] == min(p[1] for p in to_allocate
+                                                          if p[0] > 1 and p[1] >= excess))
+
+            excess -= chosen[1]
+            logger.debug("Reducing batch size for item %s. Remaining %s", chosen, excess)
+            to_allocate[to_allocate.index(chosen)] = (chosen[0] - 1, chosen[1])
+
+        msg = []
+        for plugin, alloc in zip(plugins, to_allocate):
+            if plugin.batchsize != alloc[0]:
+                logger.debug("Updating batchsize for plugin %s from %s to %s",
+                             plugin.name, plugin.batchsize, alloc[0])
+                plugin.batchsize = alloc[0]
+            msg.append(f"{plugin.__class__.__name__}: {plugin.batchsize}")
+
+        logger.info("Reset batch sizes due to available VRAM: %s", ", ".join(msg))
+
+
     def _set_extractor_batchsize(self) -> None:
         """ Sets the batch size of the requested plugins based on their vram, their
         vram_per_batch_requirements and the number of plugins being loaded in the current phase.
-        Only adjusts if the the configured batch size requires more vram than is available. Nvidia
-        only.
+        Only adjusts if the configured batch size requires more vram than is available.
         """
         backend = get_backend()
         if backend not in ("nvidia", "directml", "rocm"):
@@ -709,62 +761,20 @@
         batch_required = sum(plugin.vram_per_batch * plugin.batchsize
                              for plugin in self._active_plugins)
+
         gpu_plugins = [p for p in self._current_phase if self._vram_per_phase[p] > 0]
+
         scaling = self._parallel_scaling.get(len(gpu_plugins), self._scaling_fallback)
         plugins_required = sum(self._vram_per_phase[p] for p in gpu_plugins) * scaling
-        if plugins_required + batch_required <= T.cast(int, self._vram_stats["vram_free"]):
+
+        vram_free = T.cast(int, self._vram_stats["vram_free"])
+        total_required = plugins_required + batch_required
+        if total_required <= vram_free:
             logger.debug("Plugin requirements within threshold: (plugins_required: %sMB, "
                          "vram_free: %sMB)", plugins_required, self._vram_stats["vram_free"])
             return
-        # Hacky split across plugins that use vram
-        available_vram = (T.cast(int, self._vram_stats["vram_free"])
-                          - plugins_required) // len(gpu_plugins)
-        self._set_plugin_batchsize(gpu_plugins, available_vram)
 
-    def _set_plugin_batchsize(self, gpu_plugins: list[str], available_vram: float) -> None:
-        """ Set the batch size for the given plugin based on given available vram.
-        Do not update plugins which have a vram_per_batch of 0 (CPU plugins) due to
-        zero division error. 
- """ - plugins = [self._active_plugins[idx] - for idx, plugin in enumerate(self._current_phase) - if plugin in gpu_plugins] - vram_per_batch = [plugin.vram_per_batch for plugin in plugins] - ratios = [vram / sum(vram_per_batch) for vram in vram_per_batch] - requested_batchsizes = [plugin.batchsize for plugin in plugins] - batchsizes = [min(requested, max(1, int((available_vram * ratio) / plugin.vram_per_batch))) - for ratio, plugin, requested in zip(ratios, plugins, requested_batchsizes)] - remaining = available_vram - sum(batchsize * plugin.vram_per_batch - for batchsize, plugin in zip(batchsizes, plugins)) - sorted_indices = [i[0] for i in sorted(enumerate(plugins), - key=lambda x: x[1].vram_per_batch, reverse=True)] - - logger.debug("requested_batchsizes: %s, batchsizes: %s, remaining vram: %s", - requested_batchsizes, batchsizes, remaining) - - while remaining > min(plugin.vram_per_batch - for plugin in plugins) and requested_batchsizes != batchsizes: - for idx in sorted_indices: - plugin = plugins[idx] - if plugin.vram_per_batch > remaining: - logger.debug("Not enough VRAM to increase batch size of %s. Required: %sMB, " - "Available: %sMB", plugin, plugin.vram_per_batch, remaining) - continue - if plugin.batchsize == batchsizes[idx]: - logger.debug("Threshold reached for %s. Batch size: %s", - plugin, plugin.batchsize) - continue - logger.debug("Incrementing batch size of %s to %s", plugin, batchsizes[idx] + 1) - batchsizes[idx] += 1 - remaining -= plugin.vram_per_batch - logger.debug("Remaining VRAM to allocate: %sMB", remaining) - - if batchsizes != requested_batchsizes: - text = ", ".join([f"{plugin.__class__.__name__}: {batchsize}" - for plugin, batchsize in zip(plugins, batchsizes)]) - for plugin, batchsize in zip(plugins, batchsizes): - plugin.batchsize = batchsize - logger.info("Reset batch sizes due to available VRAM: %s", text) + self._set_plugins_batchsize(gpu_plugins, vram_free) def _join_threads(self): """ Join threads for current pass """ diff --git a/plugins/extract/recognition/_base.py b/plugins/extract/recognition/_base.py index 3630607b98..a0bde49cd1 100644 --- a/plugins/extract/recognition/_base.py +++ b/plugins/extract/recognition/_base.py @@ -22,7 +22,7 @@ from dataclasses import dataclass, field import numpy as np -from tensorflow.python.framework import errors_impl as tf_errors # pylint:disable=no-name-in-module # noqa +from torch.cuda import OutOfMemoryError from lib.align import AlignedFace, DetectedFace from lib.image import read_image_meta @@ -205,11 +205,10 @@ def get_batch(self, queue: Queue) -> tuple[bool, RecogBatch]: def _predict(self, batch: BatchType) -> RecogBatch: """ Just return the recognition's predict function """ assert isinstance(batch, RecogBatch) + # slightly hacky workaround to deal with landmarks based masks: try: - # slightly hacky workaround to deal with landmarks based masks: batch.prediction = self.predict(batch.feed) - return batch - except tf_errors.ResourceExhaustedError as err: + except OutOfMemoryError as err: msg = ("You do not have enough GPU memory available to run recognition at the " "selected batch size. You can try a number of things:" "\n1) Close any other application that is using your GPU (web browsers are " @@ -218,7 +217,9 @@ def _predict(self, batch: BatchType) -> RecogBatch: "editing the plugin settings (GUI: Settings > Configure extract settings, " "CLI: Edit the file faceswap/config/extract.ini)." 
"\n3) Enable 'Single Process' mode.") - raise FaceswapError(msg) from err + raise FaceswapError(msg) from err + + return batch def finalize(self, batch: BatchType) -> Generator[ExtractMedia, None, None]: """ Finalize the output from Masker diff --git a/plugins/extract/recognition/vgg_face2.py b/plugins/extract/recognition/vgg_face2.py index ae717c75a2..84c3d219f7 100644 --- a/plugins/extract/recognition/vgg_face2.py +++ b/plugins/extract/recognition/vgg_face2.py @@ -8,13 +8,17 @@ import numpy as np import psutil from fastcluster import linkage, linkage_vector +from keras.layers import (Activation, add, AveragePooling2D, BatchNormalization, Conv2D, Dense, + Flatten, Input, MaxPooling2D) +from keras.regularizers import L2 -from lib.model.layers import L2_normalize +from lib.model.layers import L2Normalize from lib.model.session import KSession from lib.utils import FaceswapError from ._base import BatchType, RecogBatch, Identity if T.TYPE_CHECKING: + import torch from collections.abc import Generator logger = logging.getLogger(__name__) # pylint: disable=invalid-name @@ -47,9 +51,8 @@ def __init__(self, *args, **kwargs) -> None: # pylint:disable=unused-argument self.input_size = 224 self.color_format = "BGR" - self.vram = 2468 if not self.config["cpu"] else 0 - self.vram_warnings = 192 if not self.config["cpu"] else 0 - self.vram_per_batch = 32 if not self.config["cpu"] else 0 + self.vram = 384 if not self.config["cpu"] else 0 # 334 in testing + self.vram_per_batch = 192 if not self.config["cpu"] else 0 # ~155 in testing self.batchsize = self.config["batch-size"] # Average image provided in https://github.com/ox-vgg/vgg_face2 @@ -60,14 +63,13 @@ def __init__(self, *args, **kwargs) -> None: # pylint:disable=unused-argument def init_model(self) -> None: """ Initialize VGG Face 2 Model. """ assert isinstance(self.model_path, str) - model_kwargs = {"custom_objects": {"L2_normalize": L2_normalize}} - self.model = KSession(self.name, + self.model = VGGFace2(self.input_size, self.model_path, - model_kwargs=model_kwargs, - allow_growth=self.config["allow_growth"], exclude_gpus=self._exclude_gpus, cpu_mode=self.config["cpu"]) - self.model.load_model() + placeholder = np.zeros((self.batchsize, self.input_size, self.input_size, 3), + dtype="float32") + self.model.predict(placeholder) def process_input(self, batch: BatchType) -> None: """ Compile the detected faces for prediction """ @@ -99,6 +101,274 @@ def process_output(self, batch: BatchType) -> None: return +class ResNet50: + """ ResNet50 imported for VGG-Face2 adapted from + https://github.com/WeidiXie/Keras-VGGFace2-ResNet50 + + Parameters + ---------- + input_shape, Tuple[int, int, int] | None, optional + The input shape for the model. Default: ``None`` + use_truncated: bool, optional + ``True`` to use a truncated version of resnet. Default ``False`` + weight_decay: float + L2 Regularizer weight decay. Default: 1e-4 + trainable: bool, optional + ``True`` if the block should be trainable. 
Default: ``True``
+    """
+    def __init__(self,
+                 input_shape: tuple[int, int, int] | None = None,
+                 use_truncated: bool = False,
+                 weight_decay: float = 1e-4,
+                 trainable: bool = True) -> None:
+        logger.debug("Initializing %s: input_shape: %s, use_truncated: %s, weight_decay: %s, "
+                     "trainable: %s", self.__class__.__name__, input_shape, use_truncated,
+                     weight_decay, trainable)
+
+        self._input_shape = (None, None, 3) if input_shape is None else input_shape
+        self._weight_decay = weight_decay
+        self._trainable = trainable
+
+        self._kernel_initializer = "orthogonal"
+        self._use_bias = False
+        self._bn_axis = 3
+        self._block_suffix = {0: "_reduce", 1: "", 2: "_increase"}
+
+        self._identity_calls = [2, 3, 5, 2]
+        self._filters = [(64, 64, 256), (128, 128, 512), (256, 256, 1024), (512, 512, 2048)]
+        if use_truncated:
+            self._identity_calls = self._identity_calls[:-1]
+            self._filters = self._filters[:-1]
+
+        logger.debug("Initialized %s", self.__class__.__name__)
+
+    def _identity_block(self,
+                        inputs: torch.Tensor,
+                        kernel_size: int,
+                        filters: tuple[int, int, int],
+                        stage: int,
+                        block: int) -> torch.Tensor:
+        """ The identity block is the block that has no conv layer at shortcut.
+
+        Parameters
+        ----------
+        inputs: :class:`torch.Tensor`
+            Input tensor
+        kernel_size: int
+            The kernel size of the middle conv layer of the block
+        filters: tuple[int, int, int]
+            The filters of the 3 conv layers in the main path
+        stage: int
+            The current stage label, used for generating layer names
+        block: int
+            The current block label, used for generating layer names
+
+        Returns
+        -------
+        :class:`torch.Tensor`
+            Output tensor for the block
+        """
+        assert len(filters) == 3
+        var_x = inputs
+
+        for idx, filts in enumerate(filters):
+            k_size = kernel_size if idx == 1 else 1
+            conv_name = f"conv{stage}_{block}_{k_size}x{k_size}{self._block_suffix[idx]}"
+            bn_name = f"{conv_name}_bn"
+
+            var_x = Conv2D(filts,
+                           k_size,
+                           padding="same" if idx == 1 else "valid",
+                           kernel_initializer=self._kernel_initializer,
+                           use_bias=self._use_bias,
+                           kernel_regularizer=L2(self._weight_decay),
+                           trainable=self._trainable,
+                           name=conv_name)(var_x)
+            var_x = BatchNormalization(axis=self._bn_axis, name=bn_name)(var_x)
+            if idx < 2:
+                var_x = Activation("relu")(var_x)
+
+        var_x = add([var_x, inputs])
+        var_x = Activation("relu")(var_x)
+        return var_x
+
+    def _conv_block(self,
+                    inputs: torch.Tensor,
+                    kernel_size: int,
+                    filters: tuple[int, int, int],
+                    stage: int,
+                    block: int,
+                    strides: tuple[int, int] = (2, 2)) -> torch.Tensor:
+        """ A block that has a conv layer at shortcut.
+
+        Parameters
+        ----------
+        inputs: :class:`torch.Tensor`
+            Input tensor
+        kernel_size: int
+            The kernel size of the middle conv layer of the block
+        filters: tuple[int, int, int]
+            The filters of the 3 conv layers in the main path
+        stage: int
+            The current stage label, used for generating layer names
+        block: int
+            The current block label, used for generating layer names
+        strides: tuple[int, int], optional
+            The stride length for the first and last convolution. 
Default: (2, 2)
+
+        Returns
+        -------
+        :class:`torch.Tensor`
+            Output tensor for the block
+
+        Notes
+        -----
+        From stage 3, the first conv layer in the main path has `strides = (2,2)`, and the
+        shortcut should have `strides = (2,2)` as well
+        """
+        assert len(filters) == 3
+        var_x = inputs
+
+        for idx, filts in enumerate(filters):
+            k_size = kernel_size if idx == 1 else 1
+            conv_name = f"conv{stage}_{block}_{k_size}x{k_size}{self._block_suffix[idx]}"
+            bn_name = f"{conv_name}_bn"
+
+            var_x = Conv2D(filts,
+                           k_size,
+                           strides=strides if idx == 0 else (1, 1),
+                           padding="same" if idx == 1 else "valid",
+                           kernel_initializer=self._kernel_initializer,
+                           use_bias=self._use_bias,
+                           kernel_regularizer=L2(self._weight_decay),
+                           trainable=self._trainable,
+                           name=conv_name)(var_x)
+            var_x = BatchNormalization(axis=self._bn_axis, name=bn_name)(var_x)
+            if idx < 2:
+                var_x = Activation("relu")(var_x)
+
+        conv_name = f"conv{stage}_{block}_1x1_proj"
+        bn_name = f"{conv_name}_bn"
+
+        shortcut = Conv2D(filters[-1],
+                          (1, 1),
+                          strides=strides,
+                          kernel_initializer=self._kernel_initializer,
+                          use_bias=self._use_bias,
+                          kernel_regularizer=L2(self._weight_decay),
+                          trainable=self._trainable,
+                          name=conv_name)(inputs)
+        shortcut = BatchNormalization(axis=self._bn_axis, name=bn_name)(shortcut)
+
+        var_x = add([var_x, shortcut])
+        var_x = Activation("relu")(var_x)
+        return var_x
+
+    def __call__(self, inputs: torch.Tensor) -> torch.Tensor:
+        """ Call the ResNet50 network
+
+        Parameters
+        ----------
+        inputs: :class:`torch.Tensor`
+            Input tensor
+
+        Returns
+        -------
+        :class:`torch.Tensor`
+            Output tensor from ResNet50
+        """
+        var_x = Conv2D(64,
+                       (7, 7),
+                       strides=(2, 2),
+                       padding="same",
+                       use_bias=self._use_bias,
+                       kernel_initializer=self._kernel_initializer,
+                       kernel_regularizer=L2(self._weight_decay),
+                       trainable=self._trainable,
+                       name="conv1_7x7_s2")(inputs)
+
+        var_x = BatchNormalization(axis=self._bn_axis, name="conv1_7x7_s2_bn")(var_x)
+        var_x = Activation("relu")(var_x)
+        var_x = MaxPooling2D((3, 3), strides=(2, 2))(var_x)
+
+        for idx, (recursions, filters) in enumerate(zip(self._identity_calls, self._filters)):
+            stage = idx + 2
+            strides = (1, 1) if stage == 2 else (2, 2)
+            var_x = self._conv_block(var_x, 3, filters, stage=stage, block=1, strides=strides)
+
+            for recursion in range(recursions):
+                block = recursion + 2
+                var_x = self._identity_block(var_x, 3, filters, stage=stage, block=block)
+
+        return var_x
+
+
+class VGGFace2(KSession):
+    """ VGG-Face 2 model with ResNet50 backbone. Adapted from
+    https://github.com/WeidiXie/Keras-VGGFace2-ResNet50
+
+    Parameters
+    ----------
+    input_size: int
+        The input size for the model.
+    model_path: str
+        The path to the keras model file
+    exclude_gpus: list
+        A list of indices correlating to connected GPUs that PyTorch should not use. Pass
+        ``None`` to not exclude any GPUs
+    cpu_mode: bool, optional
+        ``True`` to run the model on CPU. Default: ``False``
+    num_classes: int, optional
+        Number of classes to train the model on. Default: 8631
+    weight_decay: float, optional
+        L2 Regularizer weight decay. 
Default: 1e-4
+    """
+    def __init__(self,
+                 input_size: int,
+                 model_path: str,
+                 exclude_gpus: list[int] | None,
+                 cpu_mode: bool,
+                 num_classes: int = 8631,
+                 weight_decay: float = 1e-4) -> None:
+        logger.debug("Initializing %s: input_size: %s, model_path: %s, exclude_gpus: %s, "
+                     "cpu_mode: %s, num_classes: %s, weight_decay: %s",
+                     self.__class__.__name__, input_size, model_path, exclude_gpus, cpu_mode,
+                     num_classes, weight_decay)
+        super().__init__("VGG Face 2",
+                         model_path,
+                         exclude_gpus=exclude_gpus,
+                         cpu_mode=cpu_mode)
+        self._input_shape = (input_size, input_size, 3)
+        self._weight_decay = weight_decay
+        self._num_classes = num_classes
+        self._resnet = ResNet50(input_shape=self._input_shape, weight_decay=self._weight_decay)
+
+        self.define_model(self._model_definition)
+        self.load_model_weights()
+
+        logger.debug("Initialized %s", self.__class__.__name__)
+
+    def _model_definition(self) -> tuple[torch.Tensor, list[torch.Tensor]]:
+        """ Definition of the VGG-Face2 model
+
+        Returns
+        -------
+        :class:`torch.Tensor`
+            The input tensor to vgg-face2
+        list[`torch.Tensor`]
+            The output from vgg-face2
+        """
+        inputs = Input(self._input_shape)
+        var_x = self._resnet(inputs)
+
+        var_x = AveragePooling2D((7, 7), name="avg_pool")(var_x)
+        var_x = Flatten()(var_x)
+        var_x = Dense(512, activation="relu", name="dim_proj")(var_x)
+
+        var_x = L2Normalize(axis=1)(var_x)
+        return inputs, [var_x]
+
+
 class Cluster():  # pylint: disable=too-few-public-methods
     """ Cluster the outputs from a VGG-Face 2 Model
diff --git a/scripts/extract.py b/scripts/extract.py
index 44a7434bf6..b963d498ba 100644
--- a/scripts/extract.py
+++ b/scripts/extract.py
@@ -12,6 +12,7 @@
 import numpy as np
 from tqdm import tqdm
+import torch
 
 from lib.align.alignments import PNGHeaderDict
 from lib.image import encode_image, generate_thumbnail, ImagesLoader, ImagesSaver, read_image_meta
@@ -740,7 +741,8 @@ def _run_extraction(self) -> None:
                 detected_faces[extract_media.filename] = extract_media
 
         if not is_final:
-            logger.debug("Reloading images")
+            logger.debug("Reloading images and resetting PyTorch memory cache")
+            torch.cuda.empty_cache()
             self._loader.reload(detected_faces)
         if saver is not None:
             saver.close()
diff --git a/tests/lib/utils_test.py b/tests/lib/utils_test.py
index 34f9be5f4f..4db96cfcc4 100644
--- a/tests/lib/utils_test.py
+++ b/tests/lib/utils_test.py
@@ -19,7 +19,7 @@
 from lib import utils
 from lib.utils import (
     _Backend, camel_case_split, convert_to_secs, DebugTimes, deprecation_warning, FaceswapError,
-    full_path_split, get_backend, get_dpi, get_folder, get_image_paths, get_tf_version, GetModel,
+    full_path_split, get_backend, get_dpi, get_folder, get_image_paths, get_torch_version, GetModel,
     safe_shutdown, set_backend, set_system_verbosity)
 from lib.logger import log_setup
@@ -205,10 +205,10 @@ def test_camel_case_split(text: str, result: list[str]) -> None:
 
 # General utils
-def test_get_tf_version() -> None:
-    """ Test the :func:`~lib.utils.get_tf_version` function version returns correctly in range """
-    tf_version = get_tf_version()
-    assert (2, 10) <= tf_version < (2, 11)
+def test_get_torch_version() -> None:
+    """ Test that :func:`~lib.utils.get_torch_version` returns a version in the expected range """
+    torch_version = get_torch_version()
+    assert (2, 1) <= torch_version < (2, 3)
 
 
 def test_get_dpi() -> None:
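
The greedy reduction introduced in plugins/extract/pipeline.py:_set_plugins_batchsize can be exercised outside the pipeline. Below is a minimal standalone sketch of the same loop; the batch sizes and per-batch VRAM figures are illustrative, and the guard for the case where no plugin is left to shrink is an addition here, not part of the patch.

    # Sketch of the greedy batch-size reduction from _set_plugins_batchsize.
    # "allocations" pairs each GPU plugin's requested batch size with its
    # approximate per-batch VRAM cost in MB (illustrative figures only).
    def reduce_batchsizes(allocations: list[tuple[int, int]],
                          vram_free: int) -> list[tuple[int, int]]:
        allocs = list(allocations)
        excess = sum(batch * vram for batch, vram in allocs) - vram_free
        while excess > 0:
            reducible = [a for a in allocs if a[0] > 1]
            if not reducible:  # added guard: nothing left to shrink
                break
            # Shrink the reducible plugin with the largest per-batch cost first
            chosen = max(reducible, key=lambda a: a[1])
            if excess - chosen[1] <= 0:
                # Final step: smallest per-batch cost that still clears the excess
                fitting = [a for a in reducible if a[1] >= excess]
                if fitting:
                    chosen = min(fitting, key=lambda a: a[1])
            excess -= chosen[1]
            allocs[allocs.index(chosen)] = (chosen[0] - 1, chosen[1])
        return allocs

    # e.g. an S3FD-style detector (batch 8 @ 960MB/batch) and a FAN-style aligner
    # (batch 12 @ 768MB/batch) squeezed into 12288MB of free VRAM:
    print(reduce_batchsizes([(8, 960), (12, 768)], vram_free=12288))
    # -> [(4, 960), (11, 768)]  (4*960 + 11*768 = 12288MB)

Shrinking the most expensive reducible plugin first preserves throughput on the cheaper plugins, while the final pick of the smallest per-batch cost that still clears the excess avoids undershooting the budget by more than one batch.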