diff --git a/plugins/extract/_base.py b/plugins/extract/_base.py index 4abe8a5e9d..36902e9932 100644 --- a/plugins/extract/_base.py +++ b/plugins/extract/_base.py @@ -9,11 +9,10 @@ from dataclasses import dataclass, field import numpy as np -from tensorflow.python.framework import errors_impl as tf_errors # pylint:disable=no-name-in-module # noqa from lib.multithreading import MultiThread from lib.queue_manager import queue_manager -from lib.utils import GetModel, FaceswapError +from lib.utils import GetModel from ._config import Config from .pipeline import ExtractMedia @@ -109,7 +108,7 @@ class Extractor(): model_filename: str The name of the model file to be loaded exclude_gpus: list, optional - A list of indices correlating to connected GPUs that Tensorflow should not use. Pass + A list of indices correlating to connected GPUs that PyTorch should not use. Pass ``None`` to not exclude any GPUs. Default: ``None`` configfile: str, optional Path to a custom configuration ``ini`` file. Default: Use system configfile @@ -135,9 +134,6 @@ class Extractor(): vram: int Approximate VRAM used by the model at :attr:`input_size`. Used to calculate the :attr:`batchsize`. Be conservative to avoid OOM. - vram_warnings: int - Approximate VRAM used by the model at :attr:`input_size` that will still run, but generates - warnings. Used to calculate the :attr:`batchsize`. Be conservative to avoid OOM. vram_per_batch: int Approximate additional VRAM used by the model for each additional batch. Used to calculate the :attr:`batchsize`. Be conservative to avoid OOM. @@ -174,7 +170,6 @@ def __init__(self, self.input_size = 0 self.color_format: T.Literal["BGR", "RGB", "GRAY"] = "BGR" self.vram = 0 - self.vram_warnings = 0 # Will run at this with warnings self.vram_per_batch = 0 # << THE FOLLOWING ARE SET IN self.initialize METHOD >> # @@ -470,20 +465,7 @@ def initialize(self, *args, **kwargs) -> None: kwargs["out_queue"], [f"predict_{name}", f"post_{name}"]) self._compile_threads() - try: - self.init_model() - except tf_errors.UnknownError as err: - if "failed to get convolution algorithm" in str(err).lower(): - msg = ("Tensorflow raised an unknown error. This is most likely caused by a " - "failure to launch cuDNN which can occur for some GPU/Tensorflow " - "combinations. You should enable `allow_growth` to attempt to resolve this " - "issue:" - "\nGUI: Go to Settings > Extract Plugins > Global and enable the " - "`allow_growth` option." - "\nCLI: Go to `faceswap/config/extract.ini` and change the `allow_growth " - "option to `True`.") - raise FaceswapError(msg) from err - raise err + self.init_model() self._is_initialized = True logger.info("Initialized %s (%s) with batchsize of %s", self.name, self._plugin_type.title(), self.batchsize) @@ -598,20 +580,7 @@ def _thread_process(self, break if not batch.filename: # Batch not populated. Possible during re-aligns continue - try: - batch = function(batch) - except tf_errors.UnknownError as err: - if "failed to get convolution algorithm" in str(err).lower(): - msg = ("Tensorflow raised an unknown error. This is most likely caused by a " - "failure to launch cuDNN which can occur for some GPU/Tensorflow " - "combinations. You should enable `allow_growth` to attempt to resolve " - "this issue:" - "\nGUI: Go to Settings > Extract Plugins > Global and enable the " - "`allow_growth` option." 
-                           "\nCLI: Go to `faceswap/config/extract.ini` and change the "
-                           "`allow_growth option to `True`.")
-                    raise FaceswapError(msg) from err
-                raise err
+            batch = function(batch)
             if function.__name__ == "_process_output":
                 # Process output items to individual items from batch
                 for item in self.finalize(batch):
diff --git a/plugins/extract/_config.py b/plugins/extract/_config.py
index 8314cab2f0..528268fb5a 100644
--- a/plugins/extract/_config.py
+++ b/plugins/extract/_config.py
@@ -30,16 +30,6 @@ def set_globals(self) -> None:
         logger.debug("Setting global config")
         section = "global"
         self.add_section(section, _("Options that apply to all extraction plugins"))
-        self.add_item(
-            section=section,
-            title="allow_growth",
-            datatype=bool,
-            default=False,
-            group=_("settings"),
-            info=_("Enable the Tensorflow GPU `allow_growth` configuration option. "
-                   "This option prevents Tensorflow from allocating all of the GPU VRAM at launch "
-                   "but can lead to higher VRAM fragmentation and slower performance. Should only "
-                   "be enabled if you are having problems running extraction."))
         self.add_item(
             section=section,
             title="aligner_min_scale",
diff --git a/plugins/extract/align/_base/aligner.py b/plugins/extract/align/_base/aligner.py
index 3eec920c08..c249020924 100644
--- a/plugins/extract/align/_base/aligner.py
+++ b/plugins/extract/align/_base/aligner.py
@@ -21,8 +21,7 @@
 import cv2
 import numpy as np
-
-from tensorflow.python.framework import errors_impl as tf_errors  # pylint:disable=no-name-in-module # noqa
+from torch.cuda import OutOfMemoryError
 
 from lib.utils import FaceswapError
 from plugins.extract._base import BatchType, Extractor, ExtractMedia, ExtractorBatch
@@ -534,6 +533,7 @@ def _predict(self, batch: BatchType) -> AlignerBatch:
             preds = [self.predict(feed) for feed in batch.refeeds]
             try:
                 batch.prediction = np.array(preds)
+                logger.trace("Aligner out: %s", batch.prediction.shape)  # type:ignore[attr-defined]
             except ValueError as err:
                 # If refeed batches are different sizes, Numpy will error, so we need to explicitly
                 # set the dtype to 'object' rather than let it infer
@@ -549,8 +549,7 @@ def _predict(self, batch: BatchType) -> AlignerBatch:
             else:
                 raise
 
-            return batch
-        except tf_errors.ResourceExhaustedError as err:
+        except OutOfMemoryError as err:
             msg = ("You do not have enough GPU memory available to run detection at the "
                    "selected batch size. You can try a number of things:"
                    "\n1) Close any other application that is using your GPU (web browsers are "
@@ -561,6 +560,8 @@
                    "\n3) Enable 'Single Process' mode.")
             raise FaceswapError(msg) from err
 
+        return batch
+
     def _process_refeeds(self, batch: AlignerBatch) -> list[AlignerBatch]:
         """ Process the output for each selected re-feed
diff --git a/plugins/extract/align/fan.py b/plugins/extract/align/fan.py
index a829f3bcac..e446b7c71e 100644
--- a/plugins/extract/align/fan.py
+++ b/plugins/extract/align/fan.py
@@ -29,9 +29,8 @@ def __init__(self, **kwargs) -> None:
         self.name = "FAN"
         self.input_size = 256
         self.color_format = "RGB"
-        self.vram = 2240
-        self.vram_warnings = 512  # Will run at this with warnings
-        self.vram_per_batch = 64
+        self.vram = 896  # 810 in testing
+        self.vram_per_batch = 768  # ~720 in testing
         self.realign_centering = "head"
         self.batchsize: int = self.config["batch-size"]
         self.reference_scale = 200. / 195.
@@ -42,7 +41,6 @@ def init_model(self) -> None: assert isinstance(self.model_path, str) self.model = KSession(self.name, self.model_path, - allow_growth=self.config["allow_growth"], exclude_gpus=self._exclude_gpus) self.model.load_model() # Feed a placeholder so Aligner is primed for Manual tool @@ -224,7 +222,6 @@ def predict(self, feed: np.ndarray) -> np.ndarray: # TODO Remove lazy transpose and change points from predict to use the correct # order retval = self.model.predict(feed)[-1].transpose(0, 3, 1, 2) - logger.trace(retval.shape) # type:ignore[attr-defined] return retval def process_output(self, batch: BatchType) -> None: diff --git a/plugins/extract/detect/_base.py b/plugins/extract/detect/_base.py index c85f75776b..2e44bab0c1 100644 --- a/plugins/extract/detect/_base.py +++ b/plugins/extract/detect/_base.py @@ -23,8 +23,7 @@ import cv2 import numpy as np - -from tensorflow.python.framework import errors_impl as tf_errors # pylint:disable=no-name-in-module # noqa +from torch.cuda import OutOfMemoryError from lib.align import DetectedFace from lib.utils import FaceswapError @@ -280,15 +279,7 @@ def _predict(self, batch: BatchType) -> DetectorBatch: self._rotate_batch(batch, angle) try: pred = self.predict(batch.feed) - if angle == 0: - batch.prediction = pred - else: - batch.prediction = np.array([b if b.any() else p - for b, p in zip(batch.prediction, pred)]) - logger.trace("angle: %s, filenames: %s, " # type:ignore[attr-defined] - "prediction: %s", - angle, batch.filename, pred) - except tf_errors.ResourceExhaustedError as err: + except OutOfMemoryError as err: msg = ("You do not have enough GPU memory available to run detection at the " "selected batch size. You can try a number of things:" "\n1) Close any other application that is using your GPU (web browsers are " @@ -299,6 +290,15 @@ def _predict(self, batch: BatchType) -> DetectorBatch: "\n3) Enable 'Single Process' mode.") raise FaceswapError(msg) from err + if angle == 0: + batch.prediction = pred + else: + batch.prediction = np.array([b if b.any() else p + for b, p in zip(batch.prediction, pred)]) + logger.trace("angle: %s, filenames: %s, " # type:ignore[attr-defined] + "prediction: %s", + angle, batch.filename, pred) + if angle != 0 and any(face.any() for face in batch.prediction): logger.verbose("found face(s) by rotating image %s " # type:ignore[attr-defined] "degrees", diff --git a/plugins/extract/detect/mtcnn.py b/plugins/extract/detect/mtcnn.py index 8af8a41bef..b0efe8d80d 100644 --- a/plugins/extract/detect/mtcnn.py +++ b/plugins/extract/detect/mtcnn.py @@ -7,14 +7,13 @@ import cv2 import numpy as np -# Ignore linting errors from Tensorflow's thoroughly broken import system -from tensorflow.keras.layers import Conv2D, Dense, Flatten, Input, MaxPool2D, Permute, PReLU # noqa:E501 # pylint:disable=import-error +from keras.layers import Conv2D, Dense, Flatten, Input, MaxPooling2D, Permute, PReLU from lib.model.session import KSession from ._base import BatchType, Detector if T.TYPE_CHECKING: - from tensorflow import Tensor + from torch import Tensor logger = logging.getLogger(__name__) @@ -27,9 +26,8 @@ def __init__(self, **kwargs) -> None: super().__init__(git_model_id=git_model_id, model_filename=model_filename, **kwargs) self.name = "MTCNN" self.input_size = 640 - self.vram = 320 if not self.config["cpu"] else 0 - self.vram_warnings = 64 if not self.config["cpu"] else 0 # Will run at this with warnings - self.vram_per_batch = 32 if not self.config["cpu"] else 0 + self.vram = 128 if not self.config["cpu"] else 0 # 66 
in testing + self.vram_per_batch = 64 if not self.config["cpu"] else 0 # ~50 in testing self.batchsize = self.config["batch-size"] self.kwargs = self._validate_kwargs() self.color_format = "RGB" @@ -63,7 +61,6 @@ def init_model(self) -> None: """ Initialize MTCNN Model. """ assert isinstance(self.model_path, list) self.model = MTCNN(self.model_path, - self.config["allow_growth"], self._exclude_gpus, self.config["cpu"], **self.kwargs) # type:ignore @@ -145,10 +142,6 @@ class PNet(KSession): ---------- model_path: str The path to the keras model file - allow_growth: bool, optional - Enable the Tensorflow GPU allow_growth configuration option. This option prevents - Tensorflow from allocating all of the GPU VRAM, but can lead to higher fragmentation and - slower performance. Default: ``False`` exclude_gpus: list, optional A list of indices correlating to connected GPUs that Tensorflow should not use. Pass ``None`` to not exclude any GPUs. Default: ``None`` @@ -163,7 +156,6 @@ class PNet(KSession): """ def __init__(self, model_path: str, - allow_growth: bool, exclude_gpus: list[int] | None, cpu_mode: bool, input_size: int, @@ -172,7 +164,6 @@ def __init__(self, threshold: float) -> None: super().__init__("MTCNN-PNet", model_path, - allow_growth=allow_growth, exclude_gpus=exclude_gpus, cpu_mode=cpu_mode) @@ -193,7 +184,7 @@ def model_definition() -> tuple[list[Tensor], list[Tensor]]: input_ = Input(shape=(None, None, 3)) var_x = Conv2D(10, (3, 3), strides=1, padding='valid', name='conv1')(input_) var_x = PReLU(shared_axes=[1, 2], name='PReLU1')(var_x) - var_x = MaxPool2D(pool_size=2)(var_x) + var_x = MaxPooling2D(pool_size=2)(var_x) var_x = Conv2D(16, (3, 3), strides=1, padding='valid', name='conv2')(var_x) var_x = PReLU(shared_axes=[1, 2], name='PReLU2')(var_x) var_x = Conv2D(32, (3, 3), strides=1, padding='valid', name='conv3')(var_x) @@ -326,10 +317,6 @@ class RNet(KSession): ---------- model_path: str The path to the keras model file - allow_growth: bool, optional - Enable the Tensorflow GPU allow_growth configuration option. This option prevents - Tensorflow from allocating all of the GPU VRAM, but can lead to higher fragmentation and - slower performance. Default: ``False`` exclude_gpus: list, optional A list of indices correlating to connected GPUs that Tensorflow should not use. Pass ``None`` to not exclude any GPUs. 
Default: ``None`` @@ -343,14 +330,12 @@ class RNet(KSession): """ def __init__(self, model_path: str, - allow_growth: bool, exclude_gpus: list[int] | None, cpu_mode: bool, input_size: int, threshold: float) -> None: super().__init__("MTCNN-RNet", model_path, - allow_growth=allow_growth, exclude_gpus=exclude_gpus, cpu_mode=cpu_mode) self.define_model(self.model_definition) @@ -365,11 +350,11 @@ def model_definition() -> tuple[list[Tensor], list[Tensor]]: input_ = Input(shape=(24, 24, 3)) var_x = Conv2D(28, (3, 3), strides=1, padding='valid', name='conv1')(input_) var_x = PReLU(shared_axes=[1, 2], name='prelu1')(var_x) - var_x = MaxPool2D(pool_size=3, strides=2, padding='same')(var_x) + var_x = MaxPooling2D(pool_size=3, strides=2, padding='same')(var_x) var_x = Conv2D(48, (3, 3), strides=1, padding='valid', name='conv2')(var_x) var_x = PReLU(shared_axes=[1, 2], name='prelu2')(var_x) - var_x = MaxPool2D(pool_size=3, strides=2)(var_x) + var_x = MaxPooling2D(pool_size=3, strides=2)(var_x) var_x = Conv2D(64, (2, 2), strides=1, padding='valid', name='conv3')(var_x) var_x = PReLU(shared_axes=[1, 2], name='prelu3')(var_x) @@ -457,10 +442,6 @@ class ONet(KSession): ---------- model_path: str The path to the keras model file - allow_growth: bool, optional - Enable the Tensorflow GPU allow_growth configuration option. This option prevents - Tensorflow from allocating all of the GPU VRAM, but can lead to higher fragmentation and - slower performance. Default: ``False`` exclude_gpus: list, optional A list of indices correlating to connected GPUs that Tensorflow should not use. Pass ``None`` to not exclude any GPUs. Default: ``None`` @@ -473,14 +454,12 @@ class ONet(KSession): """ def __init__(self, model_path: str, - allow_growth: bool, exclude_gpus: list[int] | None, cpu_mode: bool, input_size: int, threshold: float) -> None: super().__init__("MTCNN-ONet", model_path, - allow_growth=allow_growth, exclude_gpus=exclude_gpus, cpu_mode=cpu_mode) self.define_model(self.model_definition) @@ -495,13 +474,13 @@ def model_definition() -> tuple[list[Tensor], list[Tensor]]: input_ = Input(shape=(48, 48, 3)) var_x = Conv2D(32, (3, 3), strides=1, padding='valid', name='conv1')(input_) var_x = PReLU(shared_axes=[1, 2], name='prelu1')(var_x) - var_x = MaxPool2D(pool_size=3, strides=2, padding='same')(var_x) + var_x = MaxPooling2D(pool_size=3, strides=2, padding='same')(var_x) var_x = Conv2D(64, (3, 3), strides=1, padding='valid', name='conv2')(var_x) var_x = PReLU(shared_axes=[1, 2], name='prelu2')(var_x) - var_x = MaxPool2D(pool_size=3, strides=2)(var_x) + var_x = MaxPooling2D(pool_size=3, strides=2)(var_x) var_x = Conv2D(64, (3, 3), strides=1, padding='valid', name='conv3')(var_x) var_x = PReLU(shared_axes=[1, 2], name='prelu3')(var_x) - var_x = MaxPool2D(pool_size=2)(var_x) + var_x = MaxPooling2D(pool_size=2)(var_x) var_x = Conv2D(128, (2, 2), strides=1, padding='valid', name='conv4')(var_x) var_x = PReLU(shared_axes=[1, 2], name='prelu4')(var_x) var_x = Permute((3, 2, 1))(var_x) @@ -603,10 +582,6 @@ class MTCNN(): # pylint: disable=too-few-public-methods ---------- model_path: list List of paths to the 3 MTCNN subnet weights - allow_growth: bool, optional - Enable the Tensorflow GPU allow_growth configuration option. This option prevents - Tensorflow from allocating all of the GPU VRAM, but can lead to higher fragmentation and - slower performance. Default: ``False`` exclude_gpus: list, optional A list of indices correlating to connected GPUs that Tensorflow should not use. Pass ``None`` to not exclude any GPUs. 
Default: ``None`` @@ -624,21 +599,19 @@ class MTCNN(): # pylint: disable=too-few-public-methods """ def __init__(self, model_path: list[str], - allow_growth: bool, exclude_gpus: list[int] | None, cpu_mode: bool, input_size: int = 640, minsize: int = 20, threshold: list[float] | None = None, factor: float = 0.709) -> None: - logger.debug("Initializing: %s: (model_path: '%s', allow_growth: %s, exclude_gpus: %s, " + logger.debug("Initializing: %s: (model_path: '%s', exclude_gpus: %s, " "input_size: %s, minsize: %s, threshold: %s, factor: %s)", - self.__class__.__name__, model_path, allow_growth, exclude_gpus, - input_size, minsize, threshold, factor) + self.__class__.__name__, model_path, exclude_gpus, input_size, minsize, + threshold, factor) threshold = [0.6, 0.7, 0.7] if threshold is None else threshold self._pnet = PNet(model_path[0], - allow_growth, exclude_gpus, cpu_mode, input_size, @@ -646,13 +619,11 @@ def __init__(self, factor, threshold[0]) self._rnet = RNet(model_path[1], - allow_growth, exclude_gpus, cpu_mode, input_size, threshold[1]) self._onet = ONet(model_path[2], - allow_growth, exclude_gpus, cpu_mode, input_size, diff --git a/plugins/extract/detect/s3fd.py b/plugins/extract/detect/s3fd.py index 89d538b76f..676f7999fe 100644 --- a/plugins/extract/detect/s3fd.py +++ b/plugins/extract/detect/s3fd.py @@ -12,17 +12,14 @@ from scipy.special import logsumexp import numpy as np -# Ignore linting errors from Tensorflow's thoroughly broken import system -from tensorflow import keras -from tensorflow.keras import backend as K # pylint:disable=import-error -from tensorflow.keras.layers import ( # pylint:disable=import-error - Concatenate, Conv2D, Input, Maximum, MaxPooling2D, ZeroPadding2D) +from keras.layers import (Concatenate, Conv2D, Input, Layer, Maximum, MaxPooling2D, ZeroPadding2D) +from keras import initializers, ops from lib.model.session import KSession from ._base import BatchType, Detector if T.TYPE_CHECKING: - from tensorflow import Tensor + from torch import Tensor logger = logging.getLogger(__name__) @@ -35,9 +32,8 @@ def __init__(self, **kwargs) -> None: super().__init__(git_model_id=git_model_id, model_filename=model_filename, **kwargs) self.name = "S3FD" self.input_size = 640 - self.vram = 4112 - self.vram_warnings = 1024 # Will run at this with warnings - self.vram_per_batch = 208 + self.vram = 1088 # 1034 in testing + self.vram_per_batch = 960 # 922 in testing self.batchsize = self.config["batch-size"] def init_model(self) -> None: @@ -47,7 +43,6 @@ def init_model(self) -> None: model_kwargs = {"custom_objects": {"L2Norm": L2Norm, "SliceO2K": SliceO2K}} self.model = S3fd(self.model_path, model_kwargs, - self.config["allow_growth"], self._exclude_gpus, confidence) @@ -71,7 +66,7 @@ def process_output(self, batch) -> None: ################################################################################ # CUSTOM KERAS LAYERS ################################################################################ -class L2Norm(keras.layers.Layer): +class L2Norm(Layer): """ L2 Normalization layer for S3FD. 
Parameters
@@ -85,10 +80,10 @@ def __init__(self, n_channels: int, scale: float = 1.0, **kwargs) -> None:
         super().__init__(**kwargs)
         self._n_channels = n_channels
         self._scale = scale
-        self.w = self.add_weight("l2norm",  # pylint:disable=invalid-name
-                                 (self._n_channels, ),
+        self.w = self.add_weight(name="l2norm",
+                                 shape=(self._n_channels, ),
                                  trainable=True,
-                                 initializer=keras.initializers.Constant(value=self._scale),
+                                 initializer=initializers.Constant(value=self._scale),
                                  dtype="float32")
 
     def call(self, inputs: Tensor) -> Tensor:  # pylint:disable=arguments-differ
@@ -104,10 +99,26 @@ def call(self, inputs: Tensor) -> Tensor:  # pylint:disable=arguments-differ
         tensor:
             The output from the L2 Normalization Layer
         """
-        norm = K.sqrt(K.sum(K.pow(inputs, 2), axis=-1, keepdims=True)) + 1e-10
+        norm = ops.sqrt(ops.sum(ops.power(inputs, 2), axis=-1, keepdims=True)) + 1e-10
         var_x = inputs / norm * self.w
         return var_x
 
+    def compute_output_shape(self,
+                             input_shape: tuple[None, int, int, int]) -> tuple[None, int, int, int]:
+        """ Input shape and output shape are the same
+
+        Parameters
+        ----------
+        input_shape: tuple[None, int, int, int]
+            Shape of the input tensor
+
+        Returns
+        -------
+        tuple[None, int, int, int]
+            The shape of the output tensor
+        """
+        return input_shape
+
     def get_config(self) -> dict:
         """ Returns the config of the layer.
@@ -122,7 +133,7 @@ def get_config(self) -> dict:
         return config
 
 
-class SliceO2K(keras.layers.Layer):
+class SliceO2K(Layer):
     """ Custom Keras Slice layer generated by onnx2keras. """
     def __init__(self,
                  starts: list[int],
@@ -202,8 +213,8 @@ def call(self, inputs, **kwargs):  # pylint:disable=unused-argument,arguments-di
            A tensor or list/tuple of tensors. The layer output
        """
-        ax_map = dict((x[0], slice(*x[1:])) for x in self._get_slices(K.ndim(inputs)))
-        shape = K.int_shape(inputs)
+        ax_map = dict((x[0], slice(*x[1:])) for x in self._get_slices(ops.ndim(inputs)))
+        shape = inputs.shape
         slices = [(ax_map[a] if a in ax_map else slice(None)) for a in range(len(shape))]
         retval = inputs[tuple(slices)]
         return retval
@@ -229,16 +240,14 @@ class S3fd(KSession):
     def __init__(self,
                  model_path: str,
                  model_kwargs: dict,
-                 allow_growth: bool,
                  exclude_gpus: list[int] | None,
                  confidence: float) -> None:
-        logger.debug("Initializing: %s: (model_path: '%s', model_kwargs: %s, allow_growth: %s, "
-                     "exclude_gpus: %s, confidence: %s)", self.__class__.__name__, model_path,
-                     model_kwargs, allow_growth, exclude_gpus, confidence)
+        logger.debug("Initializing: %s: (model_path: '%s', model_kwargs: %s, exclude_gpus: %s, "
+                     "confidence: %s)", self.__class__.__name__, model_path, model_kwargs,
+                     exclude_gpus, confidence)
         super().__init__("S3FD",
                          model_path,
                          model_kwargs=model_kwargs,
-                         allow_growth=allow_growth,
                          exclude_gpus=exclude_gpus)
         self.define_model(self.model_definition)
         self.load_model_weights()
diff --git a/plugins/extract/mask/_base.py b/plugins/extract/mask/_base.py
index 837b6812e7..cc6d7f7874 100644
--- a/plugins/extract/mask/_base.py
+++ b/plugins/extract/mask/_base.py
@@ -20,8 +20,7 @@
 import cv2
 import numpy as np
-
-from tensorflow.python.framework import errors_impl as tf_errors  # pylint:disable=no-name-in-module # noqa
+from torch.cuda import OutOfMemoryError
 
 from lib.align import AlignedFace, transform_image
 from lib.utils import FaceswapError
@@ -204,18 +203,17 @@ def _predict(self, batch: BatchType) -> MaskerBatch:
         """ Just return the masker's predict function """
         assert isinstance(batch, MaskerBatch)
         assert self.name is not None
-        try:
-            # slightly hacky 
workaround to deal with landmarks based masks: - if self.name.lower() in ("components", "extended"): - feed = np.empty(2, dtype="object") - feed[0] = batch.feed - feed[1] = batch.feed_faces - else: - feed = batch.feed + # slightly hacky workaround to deal with landmarks based masks: + if self.name.lower() in ("components", "extended"): + feed = np.empty(2, dtype="object") + feed[0] = batch.feed + feed[1] = batch.feed_faces + else: + feed = batch.feed + try: batch.prediction = self.predict(feed) - return batch - except tf_errors.ResourceExhaustedError as err: + except OutOfMemoryError as err: msg = ("You do not have enough GPU memory available to run detection at the " "selected batch size. You can try a number of things:" "\n1) Close any other application that is using your GPU (web browsers are " @@ -224,7 +222,9 @@ def _predict(self, batch: BatchType) -> MaskerBatch: "editing the plugin settings (GUI: Settings > Configure extract settings, " "CLI: Edit the file faceswap/config/extract.ini)." "\n3) Enable 'Single Process' mode.") - raise FaceswapError(msg) from err + raise FaceswapError(msg) from err + + return batch def finalize(self, batch: BatchType) -> Generator[ExtractMedia, None, None]: """ Finalize the output from Masker diff --git a/plugins/extract/mask/bisenet_fp.py b/plugins/extract/mask/bisenet_fp.py index cf8a177fe6..ab694b9958 100644 --- a/plugins/extract/mask/bisenet_fp.py +++ b/plugins/extract/mask/bisenet_fp.py @@ -10,9 +10,8 @@ import numpy as np -# Ignore linting errors from Tensorflow's thoroughly broken import system -from tensorflow.keras import backend as K # pylint:disable=import-error -from tensorflow.keras.layers import ( # pylint:disable=import-error +import keras.backend as K +from keras.layers import ( # pylint:disable=import-error Activation, Add, BatchNormalization, Concatenate, Conv2D, GlobalAveragePooling2D, Input, MaxPooling2D, Multiply, Reshape, UpSampling2D, ZeroPadding2D) @@ -21,7 +20,7 @@ from ._base import BatchType, Masker, MaskerBatch if T.TYPE_CHECKING: - from tensorflow import Tensor + from torch import Tensor logger = logging.getLogger(__name__) @@ -39,9 +38,8 @@ def __init__(self, **kwargs) -> None: self.name = "BiSeNet - Face Parsing" self.input_size = 512 self.color_format = "RGB" - self.vram = 2304 if not self.config["cpu"] else 0 - self.vram_warnings = 256 if not self.config["cpu"] else 0 - self.vram_per_batch = 64 if not self.config["cpu"] else 0 + self.vram = 384 if not self.config["cpu"] else 0 # 378 in testing + self.vram_per_batch = 384 if not self.config["cpu"] else 0 # ~328 in testing self.batchsize = self.config["batch-size"] self._segment_indices = self._get_segment_indices() @@ -108,7 +106,6 @@ def init_model(self) -> None: assert isinstance(self.model_path, str) lbls = 5 if self._is_faceswap else 19 self.model = BiSeNet(self.model_path, - self.config["allow_growth"], self._exclude_gpus, self.input_size, lbls, @@ -299,7 +296,7 @@ def _basic_block(self, inputs: Tensor, prefix: str, filters: int, strides: int = res = ConvBn(filters, strides=1, padding=1, activation=False, prefix=prefix)(res) shortcut = inputs - filts = (K.int_shape(shortcut)[self._feature_index], K.int_shape(res)[self._feature_index]) + filts = (shortcut.shape[self._feature_index], res.shape[self._feature_index]) if strides != 1 or filts[0] != filts[1]: # Downsample name = f"{prefix}.downsample." 
shortcut = Conv2D(filters, 1, @@ -400,7 +397,7 @@ def __call__(self, inputs: Tensor, feats: int) -> Tensor: prefix = f"cp.arm{feats}" feat = ConvBn(self._filters, prefix=f"{prefix}.conv", start_idx=-1, padding=-1)(inputs) atten = GlobalAveragePooling2D(name=f"{prefix}.avgpool")(feat) - atten = Reshape((1, 1, K.int_shape(atten)[-1]))(atten) + atten = Reshape((1, 1, atten.shape[-1]))(atten) atten = Conv2D(self._filters, 1, use_bias=False, name=f"{prefix}.conv_atten")(atten) atten = BatchNormalization(epsilon=1e-5, name=f"{prefix}.bn_atten")(atten) atten = Activation("sigmoid", name=f"{prefix}.sigmoid")(atten) @@ -429,10 +426,10 @@ def __call__(self, inputs: Tensor) -> Tensor: feat8, feat16, feat32 = self._resnet(inputs) avg = GlobalAveragePooling2D(name="cp.avgpool")(feat32) - avg = Reshape((1, 1, K.int_shape(avg)[-1]))(avg) + avg = Reshape((1, 1, avg.shape[-1]))(avg) avg = ConvBn(128, kernel_size=1, padding=0, prefix="cp.conv_avg", start_idx=-1)(avg) - avg_up = UpSampling2D(size=K.int_shape(feat32)[1:3], name="cp.upsample")(avg) + avg_up = UpSampling2D(size=feat32.shape[1:3], name="cp.upsample")(avg) feat32 = AttentionRefinementModule(128)(feat32, 32) feat32 = Add(name="cp.add")([feat32, avg_up]) @@ -480,7 +477,7 @@ def __call__(self, inputs: Tensor) -> Tensor: start_idx=-1)(feat) atten = GlobalAveragePooling2D(name="ffm.avgpool")(feat) - atten = Reshape((1, 1, K.int_shape(atten)[-1]))(atten) + atten = Reshape((1, 1, atten.shape[-1]))(atten) atten = Conv2D(self._filters // 4, 1, use_bias=False, name="ffm.conv1")(atten) atten = Activation("relu", name="ffm.relu")(atten) atten = Conv2D(self._filters, 1, use_bias=False, name="ffm.conv2")(atten) @@ -537,10 +534,6 @@ class BiSeNet(KSession): ---------- model_path: str The path to the keras model file - allow_growth: bool - Enable the Tensorflow GPU allow_growth configuration option. This option prevents - Tensorflow from allocating all of the GPU VRAM, but can lead to higher fragmentation and - slower performance exclude_gpus: list A list of indices correlating to connected GPUs that Tensorflow should not use. 
Pass ``None`` to not exclude any GPUs @@ -553,14 +546,12 @@ class BiSeNet(KSession): """ def __init__(self, model_path: str, - allow_growth: bool, exclude_gpus: list[int] | None, input_size: int, num_classes: int, cpu_mode: bool) -> None: super().__init__("BiSeNet Face Parsing", model_path, - allow_growth=allow_growth, exclude_gpus=exclude_gpus, cpu_mode=cpu_mode) self._input_size = input_size @@ -587,10 +578,10 @@ def _model_definition(self) -> tuple[Tensor, list[Tensor]]: feat_out16 = BiSeNetOutput(64, self._num_classes, label="16")(features[1]) feat_out32 = BiSeNetOutput(64, self._num_classes, label="32")(features[2]) - height, width = K.int_shape(input_)[1:3] - f_h, f_w = K.int_shape(feat_out)[1:3] - f_h16, f_w16 = K.int_shape(feat_out16)[1:3] - f_h32, f_w32 = K.int_shape(feat_out32)[1:3] + height, width = input_.shape[1:3] + f_h, f_w = feat_out.shape[1:3] + f_h16, f_w16 = feat_out16.shape[1:3] + f_h32, f_w32 = feat_out32.shape[1:3] feat_out = UpSampling2D(size=(height // f_h, width // f_w), interpolation="bilinear")(feat_out) diff --git a/plugins/extract/mask/unet_dfl.py b/plugins/extract/mask/unet_dfl.py index 4ca2f3dc07..598c630889 100644 --- a/plugins/extract/mask/unet_dfl.py +++ b/plugins/extract/mask/unet_dfl.py @@ -31,9 +31,8 @@ def __init__(self, **kwargs) -> None: self.model: KSession self.name = "U-Net" self.input_size = 256 - self.vram = 3424 - self.vram_warnings = 256 - self.vram_per_batch = 80 + self.vram = 320 # 276 in testing + self.vram_per_batch = 256 # ~215 in testing self.batchsize = self.config["batch-size"] self._storage_centering = "legacy" @@ -42,7 +41,6 @@ def init_model(self) -> None: self.model = KSession(self.name, self.model_path, model_kwargs={}, - allow_growth=self.config["allow_growth"], exclude_gpus=self._exclude_gpus) self.model.load_model() placeholder = np.zeros((self.batchsize, self.input_size, self.input_size, 3), diff --git a/plugins/extract/mask/vgg_clear.py b/plugins/extract/mask/vgg_clear.py index 50165f8015..ba2535f481 100644 --- a/plugins/extract/mask/vgg_clear.py +++ b/plugins/extract/mask/vgg_clear.py @@ -6,8 +6,7 @@ import numpy as np -# Ignore linting errors from Tensorflow's thoroughly broken import system -from tensorflow.keras.layers import ( # pylint:disable=import-error +from keras.layers import ( Add, Conv2D, Conv2DTranspose, Cropping2D, Dropout, Input, Lambda, MaxPooling2D, ZeroPadding2D) @@ -15,7 +14,7 @@ from ._base import BatchType, Masker, MaskerBatch if T.TYPE_CHECKING: - from tensorflow import Tensor + from torch import Tensor logger = logging.getLogger(__name__) @@ -29,15 +28,13 @@ def __init__(self, **kwargs) -> None: self.model: KSession self.name = "VGG Clear" self.input_size = 300 - self.vram = 2944 - self.vram_warnings = 1088 # at BS 1. OOMs at higher batch sizes - self.vram_per_batch = 400 + self.vram = 1344 # 1308 in testing + self.vram_per_batch = 448 # ~402 in testing self.batchsize = self.config["batch-size"] def init_model(self) -> None: assert isinstance(self.model_path, str) self.model = VGGClear(self.model_path, - allow_growth=self.config["allow_growth"], exclude_gpus=self._exclude_gpus) self.model.append_softmax_activation(layer_index=-1) placeholder = np.zeros((self.batchsize, self.input_size, self.input_size, 3), @@ -73,10 +70,6 @@ class VGGClear(KSession): ---------- model_path: str The path to the keras model file - allow_growth: bool - Enable the Tensorflow GPU allow_growth configuration option. 
This option prevents - Tensorflow from allocating all of the GPU VRAM, but can lead to higher fragmentation and - slower performance exclude_gpus: list A list of indices correlating to connected GPUs that Tensorflow should not use. Pass ``None`` to not exclude any GPUs @@ -93,11 +86,9 @@ class VGGClear(KSession): """ def __init__(self, model_path: str, - allow_growth: bool, exclude_gpus: list[int] | None): super().__init__("VGG Obstructed", model_path, - allow_growth=allow_growth, exclude_gpus=exclude_gpus) self.define_model(self._model_definition) self.load_model_weights() diff --git a/plugins/extract/mask/vgg_obstructed.py b/plugins/extract/mask/vgg_obstructed.py index a3f543d7e8..54d30af3ed 100644 --- a/plugins/extract/mask/vgg_obstructed.py +++ b/plugins/extract/mask/vgg_obstructed.py @@ -6,8 +6,7 @@ import numpy as np -# Ignore linting errors from Tensorflow's thoroughly broken import system -from tensorflow.keras.layers import ( # pylint:disable=import-error +from keras.layers import ( Add, Conv2D, Conv2DTranspose, Cropping2D, Dropout, Input, Lambda, MaxPooling2D, ZeroPadding2D) @@ -15,7 +14,7 @@ from ._base import BatchType, Masker, MaskerBatch if T.TYPE_CHECKING: - from tensorflow import Tensor + from torch import Tensor logger = logging.getLogger(__name__) @@ -29,15 +28,13 @@ def __init__(self, **kwargs) -> None: self.model: KSession self.name = "VGG Obstructed" self.input_size = 500 - self.vram = 3936 - self.vram_warnings = 1088 # at BS 1. OOMs at higher batch sizes - self.vram_per_batch = 304 + self.vram = 1728 # 1710 in testing + self.vram_per_batch = 896 # ~886 in testing self.batchsize = self.config["batch-size"] def init_model(self) -> None: assert isinstance(self.model_path, str) self.model = VGGObstructed(self.model_path, - allow_growth=self.config["allow_growth"], exclude_gpus=self._exclude_gpus) self.model.append_softmax_activation(layer_index=-1) placeholder = np.zeros((self.batchsize, self.input_size, self.input_size, 3), @@ -72,10 +69,6 @@ class VGGObstructed(KSession): ---------- model_path: str The path to the keras model file - allow_growth: bool - Enable the Tensorflow GPU allow_growth configuration option. This option prevents - Tensorflow from allocating all of the GPU VRAM, but can lead to higher fragmentation and - slower performance exclude_gpus: list A list of indices correlating to connected GPUs that Tensorflow should not use. Pass ``None`` to not exclude any GPUs @@ -89,11 +82,9 @@ class VGGObstructed(KSession): """ def __init__(self, model_path: str, - allow_growth: bool, exclude_gpus: list[int] | None) -> None: super().__init__("VGG Obstructed", model_path, - allow_growth=allow_growth, exclude_gpus=exclude_gpus) self.define_model(self._model_definition) self.load_model_weights() diff --git a/plugins/extract/pipeline.py b/plugins/extract/pipeline.py index 80f598acc9..b4e5db64b5 100644 --- a/plugins/extract/pipeline.py +++ b/plugins/extract/pipeline.py @@ -692,12 +692,64 @@ def _launch_plugin(self, phase: str) -> None: plugin.start() logger.debug("Launched %s plugin", phase) + def _set_plugins_batchsize(self, gpu_plugins: list[str], vram_free: int) -> None: + """ Set the batch size for the current phase so that it will fit in available VRAM. + + Do not update plugins which have a vram_per_batch of 0 (CPU plugins) due to + zero division error. + + Reduces the batchsize of the plugin which has a batch size > 1 and the largest VRAM + requirements. 
The final reduction is applied to the plugin
+        which has a batch size > 1 and the smallest VRAM requirement that still fits the
+        pipeline inside the available VRAM.
+
+        Parameters
+        ----------
+        gpu_plugins: list[str]
+            The name of the plugins that use the GPU for the current phase
+        vram_free: int
+            The amount of available VRAM, in MBs
+        """
+        logger.debug("GPU plugins: %s, Available vram: %s", gpu_plugins, vram_free)
+        plugins = [self._active_plugins[idx]
+                   for idx, plugin in enumerate(self._current_phase)
+                   if plugin in gpu_plugins]
+        base_vram = sum(p.vram for p in plugins)
+        vram_free = vram_free - base_vram
+        logger.debug("Base vram: %s, remaining vram: %s", base_vram, vram_free)
+
+        to_allocate = [(p.batchsize, p.vram_per_batch) for p in plugins]
+        excess = sum(a[0] * a[1] for a in to_allocate) - vram_free
+        logger.debug("Plugins to allocate: %s, excess vram: %s", to_allocate, excess)
+
+        while excess > 0:
+            chosen = next(p for p in to_allocate
+                          if p[0] > 1 and p[1] == max(p[1] for p in to_allocate if p[0] > 1))
+
+            if excess - chosen[1] <= 0:
+                chosen = next(p for p in to_allocate
+                              if p[0] > 1 and p[1] == min(p[1] for p in to_allocate
+                                                          if p[0] > 1 and p[1] >= excess))
+
+            excess -= chosen[1]
+            logger.debug("Reducing batch size for item %s. Remaining %s", chosen, excess)
+            to_allocate[to_allocate.index(chosen)] = (chosen[0] - 1, chosen[1])
+
+        msg = []
+        for plugin, alloc in zip(plugins, to_allocate):
+            if plugin.batchsize != alloc[0]:
+                logger.debug("Updating batchsize for plugin %s from %s to %s",
+                             plugin.name, plugin.batchsize, alloc[0])
+                plugin.batchsize = alloc[0]
+            msg.append(f"{plugin.__class__.__name__}: {plugin.batchsize}")
+
+        logger.info("Reset batch sizes due to available VRAM: %s", ", ".join(msg))
+
+
     def _set_extractor_batchsize(self) -> None:
         """ Sets the batch size of the requested plugins based on their vram, their
         vram_per_batch_requirements and the number of plugins being loaded in the current phase.
-        Only adjusts if the the configured batch size requires more vram than is available. Nvidia
-        only.
+        Only adjusts if the configured batch size requires more vram than is available.
         """
         backend = get_backend()
         if backend not in ("nvidia", "directml", "rocm"):
@@ -709,62 +761,20 @@
         batch_required = sum(plugin.vram_per_batch * plugin.batchsize
                              for plugin in self._active_plugins)
+
         gpu_plugins = [p for p in self._current_phase if self._vram_per_phase[p] > 0]
+
         scaling = self._parallel_scaling.get(len(gpu_plugins), self._scaling_fallback)
         plugins_required = sum(self._vram_per_phase[p] for p in gpu_plugins) * scaling
-        if plugins_required + batch_required <= T.cast(int, self._vram_stats["vram_free"]):
+
+        vram_free = T.cast(int, self._vram_stats["vram_free"])
+        total_required = plugins_required + batch_required
+        if total_required <= vram_free:
             logger.debug("Plugin requirements within threshold: (plugins_required: %sMB, "
                          "vram_free: %sMB)", plugins_required, self._vram_stats["vram_free"])
             return
-        # Hacky split across plugins that use vram
-        available_vram = (T.cast(int, self._vram_stats["vram_free"])
-                          - plugins_required) // len(gpu_plugins)
-        self._set_plugin_batchsize(gpu_plugins, available_vram)
 
-    def _set_plugin_batchsize(self, gpu_plugins: list[str], available_vram: float) -> None:
-        """ Set the batch size for the given plugin based on given available vram.
-        Do not update plugins which have a vram_per_batch of 0 (CPU plugins) due to
-        zero division error. 
- """ - plugins = [self._active_plugins[idx] - for idx, plugin in enumerate(self._current_phase) - if plugin in gpu_plugins] - vram_per_batch = [plugin.vram_per_batch for plugin in plugins] - ratios = [vram / sum(vram_per_batch) for vram in vram_per_batch] - requested_batchsizes = [plugin.batchsize for plugin in plugins] - batchsizes = [min(requested, max(1, int((available_vram * ratio) / plugin.vram_per_batch))) - for ratio, plugin, requested in zip(ratios, plugins, requested_batchsizes)] - remaining = available_vram - sum(batchsize * plugin.vram_per_batch - for batchsize, plugin in zip(batchsizes, plugins)) - sorted_indices = [i[0] for i in sorted(enumerate(plugins), - key=lambda x: x[1].vram_per_batch, reverse=True)] - - logger.debug("requested_batchsizes: %s, batchsizes: %s, remaining vram: %s", - requested_batchsizes, batchsizes, remaining) - - while remaining > min(plugin.vram_per_batch - for plugin in plugins) and requested_batchsizes != batchsizes: - for idx in sorted_indices: - plugin = plugins[idx] - if plugin.vram_per_batch > remaining: - logger.debug("Not enough VRAM to increase batch size of %s. Required: %sMB, " - "Available: %sMB", plugin, plugin.vram_per_batch, remaining) - continue - if plugin.batchsize == batchsizes[idx]: - logger.debug("Threshold reached for %s. Batch size: %s", - plugin, plugin.batchsize) - continue - logger.debug("Incrementing batch size of %s to %s", plugin, batchsizes[idx] + 1) - batchsizes[idx] += 1 - remaining -= plugin.vram_per_batch - logger.debug("Remaining VRAM to allocate: %sMB", remaining) - - if batchsizes != requested_batchsizes: - text = ", ".join([f"{plugin.__class__.__name__}: {batchsize}" - for plugin, batchsize in zip(plugins, batchsizes)]) - for plugin, batchsize in zip(plugins, batchsizes): - plugin.batchsize = batchsize - logger.info("Reset batch sizes due to available VRAM: %s", text) + self._set_plugins_batchsize(gpu_plugins, vram_free) def _join_threads(self): """ Join threads for current pass """ diff --git a/plugins/extract/recognition/_base.py b/plugins/extract/recognition/_base.py index 3630607b98..a0bde49cd1 100644 --- a/plugins/extract/recognition/_base.py +++ b/plugins/extract/recognition/_base.py @@ -22,7 +22,7 @@ from dataclasses import dataclass, field import numpy as np -from tensorflow.python.framework import errors_impl as tf_errors # pylint:disable=no-name-in-module # noqa +from torch.cuda import OutOfMemoryError from lib.align import AlignedFace, DetectedFace from lib.image import read_image_meta @@ -205,11 +205,10 @@ def get_batch(self, queue: Queue) -> tuple[bool, RecogBatch]: def _predict(self, batch: BatchType) -> RecogBatch: """ Just return the recognition's predict function """ assert isinstance(batch, RecogBatch) + # slightly hacky workaround to deal with landmarks based masks: try: - # slightly hacky workaround to deal with landmarks based masks: batch.prediction = self.predict(batch.feed) - return batch - except tf_errors.ResourceExhaustedError as err: + except OutOfMemoryError as err: msg = ("You do not have enough GPU memory available to run recognition at the " "selected batch size. You can try a number of things:" "\n1) Close any other application that is using your GPU (web browsers are " @@ -218,7 +217,9 @@ def _predict(self, batch: BatchType) -> RecogBatch: "editing the plugin settings (GUI: Settings > Configure extract settings, " "CLI: Edit the file faceswap/config/extract.ini)." 
"\n3) Enable 'Single Process' mode.") - raise FaceswapError(msg) from err + raise FaceswapError(msg) from err + + return batch def finalize(self, batch: BatchType) -> Generator[ExtractMedia, None, None]: """ Finalize the output from Masker diff --git a/plugins/extract/recognition/vgg_face2.py b/plugins/extract/recognition/vgg_face2.py index ae717c75a2..84c3d219f7 100644 --- a/plugins/extract/recognition/vgg_face2.py +++ b/plugins/extract/recognition/vgg_face2.py @@ -8,13 +8,17 @@ import numpy as np import psutil from fastcluster import linkage, linkage_vector +from keras.layers import (Activation, add, AveragePooling2D, BatchNormalization, Conv2D, Dense, + Flatten, Input, MaxPooling2D) +from keras.regularizers import L2 -from lib.model.layers import L2_normalize +from lib.model.layers import L2Normalize from lib.model.session import KSession from lib.utils import FaceswapError from ._base import BatchType, RecogBatch, Identity if T.TYPE_CHECKING: + import torch from collections.abc import Generator logger = logging.getLogger(__name__) # pylint: disable=invalid-name @@ -47,9 +51,8 @@ def __init__(self, *args, **kwargs) -> None: # pylint:disable=unused-argument self.input_size = 224 self.color_format = "BGR" - self.vram = 2468 if not self.config["cpu"] else 0 - self.vram_warnings = 192 if not self.config["cpu"] else 0 - self.vram_per_batch = 32 if not self.config["cpu"] else 0 + self.vram = 384 if not self.config["cpu"] else 0 # 334 in testing + self.vram_per_batch = 192 if not self.config["cpu"] else 0 # ~155 in testing self.batchsize = self.config["batch-size"] # Average image provided in https://github.com/ox-vgg/vgg_face2 @@ -60,14 +63,13 @@ def __init__(self, *args, **kwargs) -> None: # pylint:disable=unused-argument def init_model(self) -> None: """ Initialize VGG Face 2 Model. """ assert isinstance(self.model_path, str) - model_kwargs = {"custom_objects": {"L2_normalize": L2_normalize}} - self.model = KSession(self.name, + self.model = VGGFace2(self.input_size, self.model_path, - model_kwargs=model_kwargs, - allow_growth=self.config["allow_growth"], exclude_gpus=self._exclude_gpus, cpu_mode=self.config["cpu"]) - self.model.load_model() + placeholder = np.zeros((self.batchsize, self.input_size, self.input_size, 3), + dtype="float32") + self.model.predict(placeholder) def process_input(self, batch: BatchType) -> None: """ Compile the detected faces for prediction """ @@ -99,6 +101,274 @@ def process_output(self, batch: BatchType) -> None: return +class ResNet50: + """ ResNet50 imported for VGG-Face2 adapted from + https://github.com/WeidiXie/Keras-VGGFace2-ResNet50 + + Parameters + ---------- + input_shape, Tuple[int, int, int] | None, optional + The input shape for the model. Default: ``None`` + use_truncated: bool, optional + ``True`` to use a truncated version of resnet. Default ``False`` + weight_decay: float + L2 Regularizer weight decay. Default: 1e-4 + trainable: bool, optional + ``True`` if the block should be trainable. 
Default: ``True``
+    """
+    def __init__(self,
+                 input_shape: tuple[int, int, int] | None = None,
+                 use_truncated: bool = False,
+                 weight_decay: float = 1e-4,
+                 trainable: bool = True) -> None:
+        logger.debug("Initializing %s: input_shape: %s, use_truncated: %s, weight_decay: %s, "
+                     "trainable: %s", self.__class__.__name__, input_shape, use_truncated,
+                     weight_decay, trainable)
+
+        self._input_shape = (None, None, 3) if input_shape is None else input_shape
+        self._weight_decay = weight_decay
+        self._trainable = trainable
+
+        self._kernel_initializer = "orthogonal"
+        self._use_bias = False
+        self._bn_axis = 3
+        self._block_suffix = {0: "_reduce", 1: "", 2: "_increase"}
+
+        self._identity_calls = [2, 3, 5, 2]
+        self._filters = [(64, 64, 256), (128, 128, 512), (256, 256, 1024), (512, 512, 2048)]
+        if use_truncated:
+            self._identity_calls = self._identity_calls[:-1]
+            self._filters = self._filters[:-1]
+
+        logger.debug("Initialized %s", self.__class__.__name__)
+
+    def _identity_block(self,
+                        inputs: torch.Tensor,
+                        kernel_size: int,
+                        filters: tuple[int, int, int],
+                        stage: int,
+                        block: int) -> torch.Tensor:
+        """ The identity block is the block that has no conv layer at shortcut.
+
+        Parameters
+        ----------
+        inputs: :class:`torch.Tensor`
+            Input tensor
+        kernel_size: int
+            The kernel size of the middle conv layer of the block
+        filters: tuple[int, int, int]
+            The filters of the 3 conv layers in the main path
+        stage: int
+            The current stage label, used for generating layer names
+        block: int
+            The current block label, used for generating layer names
+
+        Returns
+        -------
+        :class:`torch.Tensor`
+            Output tensor for the block
+        """
+        assert len(filters) == 3
+        var_x = inputs
+
+        for idx, filts in enumerate(filters):
+            k_size = kernel_size if idx == 1 else 1
+            conv_name = f"conv{stage}_{block}_{k_size}x{k_size}{self._block_suffix[idx]}"
+            bn_name = f"{conv_name}_bn"
+
+            var_x = Conv2D(filts,
+                           k_size,
+                           padding="same" if idx == 1 else "valid",
+                           kernel_initializer=self._kernel_initializer,
+                           use_bias=self._use_bias,
+                           kernel_regularizer=L2(self._weight_decay),
+                           trainable=self._trainable,
+                           name=conv_name)(var_x)
+            var_x = BatchNormalization(axis=self._bn_axis, name=bn_name)(var_x)
+            if idx < 2:
+                var_x = Activation("relu")(var_x)
+
+        var_x = add([var_x, inputs])
+        var_x = Activation("relu")(var_x)
+        return var_x
+
+    def _conv_block(self,
+                    inputs: torch.Tensor,
+                    kernel_size: int,
+                    filters: tuple[int, int, int],
+                    stage: int,
+                    block: int,
+                    strides: tuple[int, int] = (2, 2)) -> torch.Tensor:
+        """ A block that has a conv layer at shortcut.
+
+        Parameters
+        ----------
+        inputs: :class:`torch.Tensor`
+            Input tensor
+        kernel_size: int
+            The kernel size of the middle conv layer of the block
+        filters: tuple[int, int, int]
+            The filters of the 3 conv layers in the main path
+        stage: int
+            The current stage label, used for generating layer names
+        block: int
+            The current block label, used for generating layer names
+        strides: tuple[int, int], optional
+            The stride length for the first and last convolution. 
Default: (2, 2)
+
+        Returns
+        -------
+        :class:`torch.Tensor`
+            Output tensor for the block
+
+        Notes
+        -----
+        From stage 3, the first conv layer in the main path has `strides = (2,2)`, and the
+        shortcut should have `strides = (2,2)` as well
+        """
+        assert len(filters) == 3
+        var_x = inputs
+
+        for idx, filts in enumerate(filters):
+            k_size = kernel_size if idx == 1 else 1
+            conv_name = f"conv{stage}_{block}_{k_size}x{k_size}{self._block_suffix[idx]}"
+            bn_name = f"{conv_name}_bn"
+
+            var_x = Conv2D(filts,
+                           k_size,
+                           strides=strides if idx == 0 else (1, 1),
+                           padding="same" if idx == 1 else "valid",
+                           kernel_initializer=self._kernel_initializer,
+                           use_bias=self._use_bias,
+                           kernel_regularizer=L2(self._weight_decay),
+                           trainable=self._trainable,
+                           name=conv_name)(var_x)
+            var_x = BatchNormalization(axis=self._bn_axis, name=bn_name)(var_x)
+            if idx < 2:
+                var_x = Activation("relu")(var_x)
+
+        conv_name = f"conv{stage}_{block}_1x1_proj"
+        bn_name = f"{conv_name}_bn"
+
+        shortcut = Conv2D(filters[-1],
+                          (1, 1),
+                          strides=strides,
+                          kernel_initializer=self._kernel_initializer,
+                          use_bias=self._use_bias,
+                          kernel_regularizer=L2(self._weight_decay),
+                          trainable=self._trainable,
+                          name=conv_name)(inputs)
+        shortcut = BatchNormalization(axis=self._bn_axis, name=bn_name)(shortcut)
+
+        var_x = add([var_x, shortcut])
+        var_x = Activation("relu")(var_x)
+        return var_x
+
+    def __call__(self, inputs: torch.Tensor) -> torch.Tensor:
+        """ Call the ResNet50 network
+
+        Parameters
+        ----------
+        inputs: :class:`torch.Tensor`
+            Input tensor
+
+        Returns
+        -------
+        :class:`torch.Tensor`
+            Output tensor from ResNet50
+        """
+        var_x = Conv2D(64,
+                       (7, 7),
+                       strides=(2, 2),
+                       padding="same",
+                       use_bias=self._use_bias,
+                       kernel_initializer=self._kernel_initializer,
+                       kernel_regularizer=L2(self._weight_decay),
+                       trainable=self._trainable,
+                       name="conv1_7x7_s2")(inputs)
+
+        var_x = BatchNormalization(axis=self._bn_axis, name="conv1_7x7_s2_bn")(var_x)
+        var_x = Activation("relu")(var_x)
+        var_x = MaxPooling2D((3, 3), strides=(2, 2))(var_x)
+
+        for idx, (recursions, filters) in enumerate(zip(self._identity_calls, self._filters)):
+            stage = idx + 2
+            strides = (1, 1) if stage == 2 else (2, 2)
+            var_x = self._conv_block(var_x, 3, filters, stage=stage, block=1, strides=strides)
+
+            for recursion in range(recursions):
+                block = recursion + 2
+                var_x = self._identity_block(var_x, 3, filters, stage=stage, block=block)
+
+        return var_x
+
+
+class VGGFace2(KSession):
+    """ VGG-Face 2 model with ResNet50 backbone. Adapted from
+    https://github.com/WeidiXie/Keras-VGGFace2-ResNet50
+
+    Parameters
+    ----------
+    input_size: int
+        The input size for the model.
+    model_path: str
+        The path to the keras model file
+    exclude_gpus: list
+        A list of indices correlating to connected GPUs that PyTorch should not use. Pass
+        ``None`` to not exclude any GPUs
+    cpu_mode: bool, optional
+        ``True`` to run the model on CPU. Default: ``False``
+    num_classes: int, optional
+        Number of classes to train the model on. Default: 8631
+    weight_decay: float, optional
+        L2 Regularizer weight decay. 
Default: 1e-4
+    """
+    def __init__(self,
+                 input_size: int,
+                 model_path: str,
+                 exclude_gpus: list[int] | None,
+                 cpu_mode: bool,
+                 num_classes: int = 8631,
+                 weight_decay: float = 1e-4) -> None:
+        logger.debug("Initializing %s: input_size: %s, model_path: %s, exclude_gpus: %s, "
+                     "cpu_mode: %s, num_classes: %s, weight_decay: %s",
+                     self.__class__.__name__, input_size, model_path, exclude_gpus, cpu_mode,
+                     num_classes, weight_decay)
+        super().__init__("VGG Face 2",
+                         model_path,
+                         exclude_gpus=exclude_gpus,
+                         cpu_mode=cpu_mode)
+        self._input_shape = (input_size, input_size, 3)
+        self._weight_decay = weight_decay
+        self._num_classes = num_classes
+        self._resnet = ResNet50(input_shape=self._input_shape, weight_decay=self._weight_decay)
+
+        self.define_model(self._model_definition)
+        self.load_model_weights()
+
+        logger.debug("Initialized %s", self.__class__.__name__)
+
+    def _model_definition(self) -> tuple[torch.Tensor, list[torch.Tensor]]:
+        """ Definition of the VGG-Face2 model
+
+        Returns
+        -------
+        :class:`torch.Tensor`
+            The input tensor to vgg-face2
+        list[`torch.Tensor`]
+            The output from vgg-face2
+        """
+        inputs = Input(self._input_shape)
+        var_x = self._resnet(inputs)
+
+        var_x = AveragePooling2D((7, 7), name="avg_pool")(var_x)
+        var_x = Flatten()(var_x)
+        var_x = Dense(512, activation="relu", name="dim_proj")(var_x)
+
+        var_x = L2Normalize(axis=1)(var_x)
+        return inputs, [var_x]
+
+
 class Cluster():  # pylint: disable=too-few-public-methods
     """ Cluster the outputs from a VGG-Face 2 Model
diff --git a/scripts/extract.py b/scripts/extract.py
index 44a7434bf6..b963d498ba 100644
--- a/scripts/extract.py
+++ b/scripts/extract.py
@@ -12,6 +12,7 @@
 import numpy as np
 from tqdm import tqdm
+import torch
 
 from lib.align.alignments import PNGHeaderDict
 from lib.image import encode_image, generate_thumbnail, ImagesLoader, ImagesSaver, read_image_meta
@@ -740,7 +741,8 @@ def _run_extraction(self) -> None:
                 detected_faces[extract_media.filename] = extract_media
 
         if not is_final:
-            logger.debug("Reloading images")
+            logger.debug("Reloading images and resetting PyTorch memory cache")
+            torch.cuda.empty_cache()
             self._loader.reload(detected_faces)
         if saver is not None:
             saver.close()
diff --git a/tests/lib/utils_test.py b/tests/lib/utils_test.py
index 34f9be5f4f..4db96cfcc4 100644
--- a/tests/lib/utils_test.py
+++ b/tests/lib/utils_test.py
@@ -19,7 +19,7 @@
 from lib import utils
 from lib.utils import (
     _Backend, camel_case_split, convert_to_secs, DebugTimes, deprecation_warning, FaceswapError,
-    full_path_split, get_backend, get_dpi, get_folder, get_image_paths, get_tf_version, GetModel,
+    full_path_split, get_backend, get_dpi, get_folder, get_image_paths, get_torch_version, GetModel,
     safe_shutdown, set_backend, set_system_verbosity)
 from lib.logger import log_setup
@@ -205,10 +205,10 @@ def test_camel_case_split(text: str, result: list[str]) -> None:
 
 # General utils
-def test_get_tf_version() -> None:
-    """ Test the :func:`~lib.utils.get_tf_version` function version returns correctly in range """
-    tf_version = get_tf_version()
-    assert (2, 10) <= tf_version < (2, 11)
+def test_get_torch_version() -> None:
+    """ Test that :func:`~lib.utils.get_torch_version` returns a version in the expected range """
+    torch_version = get_torch_version()
+    assert (2, 1) <= torch_version < (2, 3)
 
 
 def test_get_dpi() -> None:
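
The greedy reduction introduced in plugins/extract/pipeline.py:_set_plugins_batchsize can be exercised outside the pipeline. Below is a minimal standalone sketch of the same loop; the batch sizes and per-batch VRAM figures are illustrative, and the guard for the case where no plugin is left to shrink is an addition here, not part of the patch.

    # Sketch of the greedy batch-size reduction from _set_plugins_batchsize.
    # "allocations" pairs each GPU plugin's requested batch size with its
    # approximate per-batch VRAM cost in MB (illustrative figures only).
    def reduce_batchsizes(allocations: list[tuple[int, int]],
                          vram_free: int) -> list[tuple[int, int]]:
        allocs = list(allocations)
        excess = sum(batch * vram for batch, vram in allocs) - vram_free
        while excess > 0:
            reducible = [a for a in allocs if a[0] > 1]
            if not reducible:  # added guard: nothing left to shrink
                break
            # Shrink the reducible plugin with the largest per-batch cost first
            chosen = max(reducible, key=lambda a: a[1])
            if excess - chosen[1] <= 0:
                # Final step: smallest per-batch cost that still clears the excess
                fitting = [a for a in reducible if a[1] >= excess]
                if fitting:
                    chosen = min(fitting, key=lambda a: a[1])
            excess -= chosen[1]
            allocs[allocs.index(chosen)] = (chosen[0] - 1, chosen[1])
        return allocs

    # e.g. an S3FD-style detector (batch 8 @ 960MB/batch) and a FAN-style aligner
    # (batch 12 @ 768MB/batch) squeezed into 12288MB of free VRAM:
    print(reduce_batchsizes([(8, 960), (12, 768)], vram_free=12288))
    # -> [(4, 960), (11, 768)]  (4*960 + 11*768 = 12288MB)

Shrinking the most expensive reducible plugin first preserves throughput on the cheaper plugins, while the final pick of the smallest per-batch cost that still clears the excess avoids undershooting the budget by more than one batch.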