From 7344bf631decdacc0f8d7284031f772efa477ea7 Mon Sep 17 00:00:00 2001 From: getzze Date: Sat, 29 Jun 2024 23:32:57 +0100 Subject: [PATCH 1/2] use knowit to parse video metadata fix test due to a bug in enzyme --- Dockerfile | 4 + changelog.d/1154.change.rst | 8 ++ docs/user/how_it_works.rst | 4 +- pyproject.toml | 4 +- subliminal/refiners/metadata.py | 230 ++++++++++++++++++++++---------- tests/refiners/test_metadata.py | 155 ++++++++++++++++++--- 6 files changed, 312 insertions(+), 93 deletions(-) create mode 100644 changelog.d/1154.change.rst diff --git a/Dockerfile b/Dockerfile index 47d065c7..60f39036 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,12 +20,16 @@ if [ "$BUILD_WITH_UNRAR" = true ]; then \ rm -rf /tmp/unrar /tmp/unrar.tar.gz; \ fi +# install libmediainfo for metadata refiner +RUN apk add --no-cache libmediainfo + RUN mkdir -p /usr/src/app /usr/src/cache WORKDIR /usr/src/app VOLUME /usr/src/cache COPY . /usr/src/app +RUN python -m pip install -U pip RUN python -m pip install . diff --git a/changelog.d/1154.change.rst b/changelog.d/1154.change.rst new file mode 100644 index 00000000..a0b8f95c --- /dev/null +++ b/changelog.d/1154.change.rst @@ -0,0 +1,8 @@ +Use `knowit` to extract information from video file, instead of `enzyme`: +frame rate, duration and subtitles. +`knowit` relies on external programs (`mediainfo`, `ffmpeg`, `mkvmerge`) +and falls back to using `enzyme` if none is installed. +On Windows and MacOS, `libmediainfo` is installed automatically +via the `pymediainfo` python package dependency. +On Linux, the `libmediainfo` or `mediainfo` package needs to be installed +with the package manager of your distribution. 
diff --git a/docs/user/how_it_works.rst b/docs/user/how_it_works.rst index 0dd33718..e23a13c2 100644 --- a/docs/user/how_it_works.rst +++ b/docs/user/how_it_works.rst @@ -30,7 +30,7 @@ Scoring Rating subtitles and comparing them is probably the most difficult part and this is where subliminal excels with its powerful scoring algorithm. -Using `guessit `_ and `enzyme `_, subliminal extracts +Using `guessit `_ and `knowit `_, subliminal extracts properties of the video and match them with the properties of the subtitles found with the providers. Equations in :mod:`subliminal.score` give a score to each property (called a match). The more matches the video and @@ -42,7 +42,7 @@ Libraries Various libraries are used by subliminal and are key to its success: * `guessit `_ to guess information from filenames -* `enzyme `_ to detect embedded subtitles in videos and read other video metadata +* `knowit `_ to detect embedded subtitles in videos and read other video metadata * `babelfish `_ to work with languages * `requests `_ to make human readable HTTP requests * `BeautifulSoup `_ to parse HTML and XML diff --git a/pyproject.toml b/pyproject.toml index 1963cccf..6e59e071 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,8 +41,9 @@ dependencies = [ "click>=8.0", "click-option-group>=0.5.6", "dogpile.cache>=1.0", - "enzyme>=0.5.0", "guessit>=3.0.0", + "knowit>=0.5.2; python_version <= '3.8'", + "knowit>=0.5.5; python_version > '3.8'", "platformdirs>=3", "pysubs2>=1.7", "rarfile>=2.7", @@ -202,6 +203,7 @@ select = [ "TCH", # flake8-typecheck "TID", # flake8-tidy-imports "RUF", # ruff-specific rules + "T", # flake8-print "ISC", # flake8-implicit-str-concat "PT", # flake8-pytest-style "FA", # flake8-future-annotations diff --git a/subliminal/refiners/metadata.py b/subliminal/refiners/metadata.py index 47f83982..9e93b74c 100644 --- a/subliminal/refiners/metadata.py +++ b/subliminal/refiners/metadata.py @@ -3,113 +3,207 @@ from __future__ import annotations import logging 
-import os +from datetime import timedelta from typing import TYPE_CHECKING, Any -from babelfish import Error as BabelfishError # type: ignore[import-untyped] from babelfish import Language # type: ignore[import-untyped] -from enzyme import MKV # type: ignore[import-untyped] +from knowit.api import available_providers, dependencies, know # type: ignore[import-untyped] from subliminal.subtitle import EmbeddedSubtitle if TYPE_CHECKING: + from collections.abc import Mapping + from subliminal.video import Video logger = logging.getLogger(__name__) -def refine(video: Video, *, embedded_subtitles: bool = True, **kwargs: Any) -> Video: +def loaded_providers(options: dict[str, Any] | None = None) -> dict[str, bool]: + """Return a dict with knowit providers and if they are installed.""" + # clear knowit cached available providers + available_providers.clear() + # find knowit providers with options + deps = dependencies(options) + # mediainfo requires more work, because 'pymediainfo' is always installed + # but it's not working alone. + return {k: len({v for v in d if v != 'pymediainfo'}) > 0 for k, d in deps.items()} + + +def refine( + video: Video, + *, + embedded_subtitles: bool = True, + metadata_provider: str | None = None, + metadata_options: Mapping[str, Any] | None = None, + **kwargs: Any, +) -> Video: """Refine a video by searching its metadata. + For better metadata discovery, at least one of the following external tool + needs to be installed: + + - ``mediainfo``: best capabilities, works with any video file format. + Automatically installed on Windows and MacOS (bundled with + the ``pymediainfo`` python package). + Needs to be installed on Linux. + - ``ffmpeg``: similar capabilities, works with any video file format. + Needs to be installed on Windows, MacOS and Linux. + - ``mkvmerge``: only works with ``mkv`` files. + Needs to be installed on Windows, MacOS and Linux. 
+ Several :class:`~subliminal.video.Video` attributes can be found: * :attr:`~subliminal.video.Video.resolution` + * :attr:`~subliminal.video.Video.duration` + * :attr:`~subliminal.video.Video.frame_rate` * :attr:`~subliminal.video.Video.video_codec` * :attr:`~subliminal.video.Video.audio_codec` * :attr:`~subliminal.video.Video.subtitles` :param bool embedded_subtitles: search for embedded subtitles. + :param (str | None) metadata_provider: provider used to retrieve information from video metadata. + Should be one of ['mediainfo', 'ffmpeg', 'mkvmerge', 'enzyme']. None defaults to `mediainfo`. + :param dict metadata_options: keyword arguments to pass to knowit, like executable paths: + `metadata_options={'ffmpeg': '/opt/bin/ffmpeg'}`. """ # skip non existing videos - if not video.exists: - return video - - # check extensions - extension = os.path.splitext(video.name)[1] - if extension != '.mkv': - logger.debug('Unsupported video extension %s', extension) + if not video.exists: # pragma: no cover return video - with open(video.name, 'rb') as f: - mkv = MKV(f) + # metadata options + options = dict(metadata_options) if metadata_options is not None else {} + # a dict of providers installed on the system + providers = loaded_providers(options) + # check if the specified metadata provider is installed, otherwise use default + if metadata_provider is not None: + # not a valid provider name + if metadata_provider not in providers: + msg = ( + f'metadata_provider={metadata_provider!r} is not a valid argument to `refine`, ' + f'needs to be None or one of:\n{list(providers.keys())}' + ) + logger.warning(msg) + # provider library or executable not found + elif not providers[metadata_provider]: + msg = ( + 'The metadata_provider library or executable was not found, ' + 'you can specify the path with the argument to the refine function: ' + f'`metadata_options={{{metadata_provider!r}: }}' + ) + logger.warning(msg) + # provider installed, force using it + else: + 
options['provider'] = metadata_provider + + # get video metadata + logger.debug('Retrieving metadata from %r', video.name) + media = know(video.name, options) + + provider_info = media['provider'] + logger.debug('Using provider %r', provider_info) + + # duration, in seconds + # more reliable to take it from here than from the 'video' track + if 'duration' in media: + video.duration = get_float(media['duration']) + logger.debug('Found duration %.2f', video.duration) # main video track - if mkv.video_tracks: - video_track = mkv.video_tracks[0] + if 'video' in media and len(media['video']) > 0: + # pick the default track if defined, otherwise just pick the first track + default_videos = [track for track in media['video'] if track.get('default', False) is True] + video_track = default_videos[0] if len(default_videos) > 0 else media['video'][0] # resolution - if video_track.height in (480, 720, 1080): - if video_track.interlaced: - video.resolution = f'{video_track.height:d}i' - else: - video.resolution = f'{video_track.height:d}p' - logger.debug('Found resolution %s', video.resolution) + if 'resolution' in video_track: + resolution = str(video_track['resolution']) + if resolution in ('480p', '720p', '1080p'): + video.resolution = resolution + logger.debug('Found resolution %s', video.resolution) + + # frame rate + if 'frame_rate' in video_track: + video.frame_rate = get_float(video_track['frame_rate']) + logger.debug('Found frame_rate %.2f', video.frame_rate) # video codec - if video_track.codec_id == 'V_MPEG4/ISO/AVC': - video.video_codec = 'H.264' - logger.debug('Found video_codec %s', video.video_codec) - elif video_track.codec_id == 'V_MPEG4/ISO/SP': - video.video_codec = 'DivX' - logger.debug('Found video_codec %s', video.video_codec) - elif video_track.codec_id == 'V_MPEG4/ISO/ASP': - video.video_codec = 'Xvid' + if 'codec' in video_track: + video.video_codec = video_track['codec'] logger.debug('Found video_codec %s', video.video_codec) - else: - 
logger.warning('MKV has no video track') + else: # pragma: no cover + logger.warning('Video has no video track') # main audio track - if mkv.audio_tracks: - audio_track = mkv.audio_tracks[0] + if 'audio' in media and len(media['audio']) > 0: + # pick the default track if defined, otherwise just pick the first track + default_audios = [track for track in media['audio'] if track.get('default', False) is True] + audio_track = default_audios[0] if len(default_audios) > 0 else media['audio'][0] + # audio codec - if audio_track.codec_id == 'A_AC3': - video.audio_codec = 'Dolby Digital' - logger.debug('Found audio_codec %s', video.audio_codec) - elif audio_track.codec_id == 'A_DTS': - video.audio_codec = 'DTS' + if 'codec' in audio_track: + video.audio_codec = audio_track['codec'] logger.debug('Found audio_codec %s', video.audio_codec) - elif audio_track.codec_id == 'A_AAC': - video.audio_codec = 'AAC' - logger.debug('Found audio_codec %s', video.audio_codec) - else: - logger.warning('MKV has no audio track') + else: # pragma: no cover + logger.warning('Video has no audio track') # subtitle tracks - if mkv.subtitle_tracks: - if embedded_subtitles: - embedded_subtitle_languages = set() - for st in mkv.subtitle_tracks: - if st.language: - try: - embedded_subtitle_languages.add(Language.fromalpha3b(st.language)) - except BabelfishError: - logger.exception( - 'Embedded subtitle track language %r is not a valid language', - st.language, - ) - embedded_subtitle_languages.add(Language('und')) - elif st.name: - try: - embedded_subtitle_languages.add(Language.fromname(st.name)) - except BabelfishError: - logger.debug('Embedded subtitle track name %r is not a valid language', st.name) - embedded_subtitle_languages.add(Language('und')) - else: - embedded_subtitle_languages.add(Language('und')) - logger.debug('Found embedded subtitle %r', embedded_subtitle_languages) - video.subtitles |= {EmbeddedSubtitle(lang) for lang in embedded_subtitle_languages} - else: - logger.debug('MKV has 
no subtitle track') + if embedded_subtitles: + if 'subtitle' in media and len(media['subtitle']) > 0: + embedded_subtitles_set: set[EmbeddedSubtitle] = set() + for st in media['subtitle']: + # language + lang = st.get('language', Language('und')) + + sub = EmbeddedSubtitle( + lang, + hearing_impaired=st.get('hearing_impaired', st.get('closed_caption')), + foreign_only=st.get('forced'), + subtitle_format=get_subtitle_format(st.get('format', 'srt')), + ) + + # add to set + embedded_subtitles_set.add(sub) + + logger.debug('Found embedded subtitles %r', embedded_subtitles_set) + video.subtitles |= embedded_subtitles_set + else: + logger.debug('Video has no subtitle track') return video + + +def get_float(value: Any) -> float | None: + """Get the float value from a quantity.""" + if value is None: + return None + # already a float + if isinstance(value, (int, float, str)): + return float(value) + + # timedelta + if isinstance(value, timedelta): + return float(value.total_seconds()) + + # pint.Quantity + try: + return float(value.magnitude) + except AttributeError: + pass + return float(value) + + +def get_subtitle_format(value: str | None) -> str | None: + """Normalize the subtitle format name.""" + if value is None: + return None + + # lower case + value = value.lower() + + # knowit uses 'SubRip', subliminal uses 'srt' + if value == 'subrip': + return 'srt' + + return value diff --git a/tests/refiners/test_metadata.py b/tests/refiners/test_metadata.py index 79e88d8a..ab2ceac0 100644 --- a/tests/refiners/test_metadata.py +++ b/tests/refiners/test_metadata.py @@ -1,31 +1,142 @@ +from __future__ import annotations + +import logging +from datetime import timedelta +from typing import Any + +import pytest from babelfish import Language # type: ignore[import-untyped] +from knowit.units import units # type: ignore[import-untyped] from subliminal.core import scan_video -from subliminal.refiners.metadata import refine -from subliminal.video import Movie +from 
subliminal.refiners.metadata import ( + get_float, + get_subtitle_format, + loaded_providers, + refine, +) +providers = ['mediainfo', 'ffmpeg', 'mkvmerge', 'enzyme'] -def test_refine_video_metadata(mkv): - scanned_video = scan_video(mkv['test5']) - refine(scanned_video, embedded_subtitles=True) - assert type(scanned_video) is Movie +@pytest.mark.parametrize( + ('value', 'expected'), + [ + (None, None), + (1, 1.0), + (2, 2.0), + ('3.14', 3.14), + (timedelta(hours=1, minutes=60, seconds=60), 7260.0), + (24 * units.FPS, 24.0), + ], +) +def test_get_float(value: Any, expected: float | None) -> None: + ret = get_float(value) + assert ret is None or isinstance(ret, float) + assert ret == expected + + +def test_get_float_error() -> None: + with pytest.raises(TypeError): + get_float((1.0,)) + + +@pytest.mark.parametrize( + ('value', 'expected'), + [ + (None, None), + ('ass', 'ass'), + ('ssa', 'ssa'), + ('subrip', 'srt'), + ('pgs', 'pgs'), + ], +) +def test_get_subtitle_format(value: str | None, expected: str | None) -> None: + """Convert str subrip -> srt""" + subtitle_format = get_subtitle_format(value) + assert subtitle_format == expected + + +@pytest.mark.parametrize('provider', providers) +def test_refine_video_metadata(mkv: dict[str, Any], provider: str) -> None: + # Skip test if provider is not installed + if not loaded_providers().get(provider, False): + pytest.skip(f'uninstalled provider {provider}') + + # Scan video + scanned_video = scan_video(mkv['test5']) assert scanned_video.name == mkv['test5'] - assert scanned_video.source is None - assert scanned_video.release_group is None assert scanned_video.resolution is None + assert scanned_video.size == 31762747 + + # Refine with file metadata + refine(scanned_video, embedded_subtitles=True, metadata_provider=provider) + assert scanned_video.resolution is None + assert scanned_video.duration == 46.665 assert scanned_video.video_codec == 'H.264' assert scanned_video.audio_codec == 'AAC' - assert scanned_video.imdb_id 
is None - assert scanned_video.size == 31762747 - assert scanned_video.subtitle_languages == { - # Language('eng'), # bug in enzyme - Language('spa'), - Language('deu'), - Language('jpn'), - Language('und'), - Language('ita'), - Language('fra'), - Language('hun'), - } - assert scanned_video.title == 'test5' - assert scanned_video.year is None + + # Enzyme has limited functionalities + if provider == 'enzyme': + assert scanned_video.subtitle_languages == { + # Language('eng'), # bug with enzyme + Language('spa'), + Language('deu'), + Language('jpn'), + Language('und'), + Language('ita'), + Language('fra'), + Language('hun'), + } + + # other providers + else: + if provider != 'mkvmerge': + assert scanned_video.frame_rate == 24 + + assert scanned_video.subtitle_languages == { + Language('eng'), + Language('spa'), + Language('deu'), + Language('jpn'), + Language('und'), + Language('ita'), + Language('fra'), + Language('hun'), + } + for subtitle in scanned_video.subtitles: + assert subtitle.subtitle_format == 'srt' + + +def test_refine_video_metadata_no_provider(mkv: dict[str, Any]) -> None: + scanned_video = scan_video(mkv['test5']) + refine(scanned_video, embedded_subtitles=True) + + assert scanned_video.duration == 46.665 + # cannot put 8, because if enzyme is used it finds only 7 + assert len(scanned_video.subtitle_languages) >= 7 + + +def test_refine_video_metadata_wrong_provider(mkv: dict[str, Any], caplog) -> None: + scanned_video = scan_video(mkv['test5']) + with caplog.at_level(logging.WARNING): + refine(scanned_video, embedded_subtitles=True, metadata_provider='not-a-provider') + + assert "metadata_provider='not-a-provider' is not a valid argument" in caplog.text + + +def test_refine_video_metadata_no_embedded_subtitles(mkv: dict[str, Any]) -> None: + scanned_video = scan_video(mkv['test5']) + refine(scanned_video, embedded_subtitles=False) + + assert scanned_video.duration == 46.665 + assert len(scanned_video.subtitle_languages) == 0 + + +def 
test_refine_no_subtitle_track(mkv: dict[str, Any]) -> None: + """Also tests resolution 480p.""" + scanned_video = scan_video(mkv['test1']) + refine(scanned_video, embedded_subtitles=True) + + assert scanned_video.duration == 87.336 + assert scanned_video.resolution == '480p' + assert len(scanned_video.subtitle_languages) == 0 From 43b9584993643b032ce948c0f9af005ff6669934 Mon Sep 17 00:00:00 2001 From: getzze Date: Sat, 21 Sep 2024 22:07:17 +0100 Subject: [PATCH 2/2] add doc about knowit installation --- README.rst | 2 ++ docs/index.rst | 1 + docs/user/installation.rst | 30 ++++++++++++++++++++++++++++++ 3 files changed, 33 insertions(+) create mode 100644 docs/user/installation.rst diff --git a/README.rst b/README.rst index c0c5cee2..aedf602d 100644 --- a/README.rst +++ b/README.rst @@ -91,6 +91,8 @@ clone your fork locally and install a development version:: $ cd subliminal $ pip install --user -e '.[dev,test,docs]' +To extract information about the video files, `subliminal` uses `knowit `. Integrations ------------ diff --git a/docs/index.rst b/docs/index.rst index 8e021dc2..78ff6520 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -15,6 +15,7 @@ Documentation :maxdepth: 2 user/usage + user/installation user/how_it_works user/cli user/provider_guide diff --git a/docs/user/installation.rst b/docs/user/installation.rst new file mode 100644 index 00000000..c1cf836e --- /dev/null +++ b/docs/user/installation.rst @@ -0,0 +1,30 @@ +Installation +============ + +From Pypi +--------- + +For a better isolation with your system you should use a dedicated virtualenv. 
+The preferred installation method is to use `pipx `_ that does that for you:: + + $ pipx install subliminal + +Subliminal can also be installed as a regular python module by running:: + + $ pip install --user subliminal + +From source +----------- + +If you want to modify the code, `fork `_ this repo, +clone your fork locally and install a development version:: + + $ git clone https://github.com//subliminal + $ cd subliminal + $ pip install --user -e '.[dev,test,docs]' + +External dependencies +--------------------- + +To extract information about the video files, `subliminal` uses `knowit `.