From ac9b48ef92bb645e1366fa17bd08de16451d9786 Mon Sep 17 00:00:00 2001 From: Dariush Date: Fri, 15 Nov 2024 17:45:04 +0100 Subject: [PATCH 01/10] improve title filtering and tv shows results --- comet/api/stream.py | 2 ++ comet/utils/general.py | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/comet/api/stream.py b/comet/api/stream.py index edeb31c..29c7ef9 100644 --- a/comet/api/stream.py +++ b/comet/api/stream.py @@ -264,8 +264,10 @@ async def stream(request: Request, b64config: str, type: str, id: str): search_terms = [name] if type == "series": + search_terms = [] if not kitsu: search_terms.append(f"{name} S0{season}E0{episode}") + search_terms.append(f"{name} s0{season}e0{episode}") else: search_terms.append(f"{name} {episode}") tasks.extend( diff --git a/comet/utils/general.py b/comet/utils/general.py index c4c02d8..1269385 100644 --- a/comet/utils/general.py +++ b/comet/utils/general.py @@ -477,7 +477,12 @@ async def filter(torrents: list, name: str, year: int): parsed = parse(title) - if parsed.parsed_title and not title_match(name, parsed.parsed_title): + def title_sub_match(correct_title: str, torrent_title: str): + correct_title = correct_title.lower() + torrent_title = torrent_title.lower() + return correct_title in torrent_title or torrent_title in correct_title + + if parsed.parsed_title and not (title_match(name, parsed.parsed_title) or title_sub_match(name, parsed.parsed_title)): results.append((index, False)) continue From a8cf4cb75e5e8734b89319cf00d57f25352c1325 Mon Sep 17 00:00:00 2001 From: Dariush Date: Fri, 15 Nov 2024 18:14:25 +0100 Subject: [PATCH 02/10] improve tv show strings numbering --- comet/api/stream.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/comet/api/stream.py b/comet/api/stream.py index 29c7ef9..541bce1 100644 --- a/comet/api/stream.py +++ b/comet/api/stream.py @@ -108,7 +108,7 @@ async def stream(request: Request, b64config: str, type: str, id: str): name = translate(name) 
log_name = name if type == "series": - log_name = f"{name} S0{season}E0{episode}" + log_name = f"{name} S{season:02d}E{episode:02d}" if ( settings.PROXY_DEBRID_STREAM @@ -266,8 +266,8 @@ async def stream(request: Request, b64config: str, type: str, id: str): if type == "series": search_terms = [] if not kitsu: - search_terms.append(f"{name} S0{season}E0{episode}") - search_terms.append(f"{name} s0{season}e0{episode}") + search_terms.append(f"{name} S{season:02d}E{episode:02d}") + search_terms.append(f"{name} s{season:02d}e{episode:02d}") else: search_terms.append(f"{name} {episode}") tasks.extend( From aadde165c451bce4942f9c3dfb02dad29ba9f37b Mon Sep 17 00:00:00 2001 From: Dariush Date: Fri, 15 Nov 2024 18:18:54 +0100 Subject: [PATCH 03/10] move function --- comet/utils/general.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/comet/utils/general.py b/comet/utils/general.py index 1269385..c2ffd0f 100644 --- a/comet/utils/general.py +++ b/comet/utils/general.py @@ -467,6 +467,11 @@ async def get_mediafusion(log_name: str, type: str, full_id: str): async def filter(torrents: list, name: str, year: int): + def title_sub_match(correct_title: str, torrent_title: str): + correct_title = correct_title.lower() + torrent_title = torrent_title.lower() + return correct_title in torrent_title or torrent_title in correct_title + results = [] for torrent in torrents: index = torrent[0] @@ -477,11 +482,6 @@ async def filter(torrents: list, name: str, year: int): parsed = parse(title) - def title_sub_match(correct_title: str, torrent_title: str): - correct_title = correct_title.lower() - torrent_title = torrent_title.lower() - return correct_title in torrent_title or torrent_title in correct_title - if parsed.parsed_title and not (title_match(name, parsed.parsed_title) or title_sub_match(name, parsed.parsed_title)): results.append((index, False)) continue From 630a5fee7604a20e952b9ddda2efdcc370c7874a Mon Sep 17 00:00:00 2001 From: Dariush Date: Fri, 
15 Nov 2024 18:20:07 +0100 Subject: [PATCH 04/10] rename variable --- comet/utils/general.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/comet/utils/general.py b/comet/utils/general.py index c2ffd0f..3ff56b6 100644 --- a/comet/utils/general.py +++ b/comet/utils/general.py @@ -467,10 +467,10 @@ async def get_mediafusion(log_name: str, type: str, full_id: str): async def filter(torrents: list, name: str, year: int): - def title_sub_match(correct_title: str, torrent_title: str): - correct_title = correct_title.lower() + def title_sub_match(imdb_title: str, torrent_title: str): + imdb_title = imdb_title.lower() torrent_title = torrent_title.lower() - return correct_title in torrent_title or torrent_title in correct_title + return imdb_title in torrent_title or torrent_title in imdb_title results = [] for torrent in torrents: From 027bb54d1b91a2141ad6dfbd77f2c74b2ba75fad Mon Sep 17 00:00:00 2001 From: Dariush Date: Fri, 15 Nov 2024 19:19:46 +0100 Subject: [PATCH 05/10] improve title matching function --- comet/utils/general.py | 32 ++++++++++++++++++++++++++------ pyproject.toml | 1 + 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/comet/utils/general.py b/comet/utils/general.py index 3ff56b6..e9002e3 100644 --- a/comet/utils/general.py +++ b/comet/utils/general.py @@ -10,6 +10,7 @@ from RTN import parse, title_match from curl_cffi import requests from fastapi import Request +from fuzzywuzzy import fuzz from comet.utils.logger import logger from comet.utils.models import settings, ConfigModel @@ -465,13 +466,32 @@ async def get_mediafusion(log_name: str, type: str, full_id: str): return results +def match_titles(imdb_title, torrent_title, threshold=80, token_overlap_threshold=0.5): + """ + Match movie/TV show titles using a combination of fuzzy string matching and token overlap. 
-async def filter(torrents: list, name: str, year: int): - def title_sub_match(imdb_title: str, torrent_title: str): - imdb_title = imdb_title.lower() - torrent_title = torrent_title.lower() - return imdb_title in torrent_title or torrent_title in imdb_title + Parameters: + imdb_title (str): The title from the IMDB data source. + torrent_title (str): The title from the torrent data source. + threshold (int): The minimum fuzzy match ratio to consider the titles a match. + token_overlap_threshold (float): The minimum proportion of overlapping tokens to consider the titles a match. + + Returns: + bool: True if the titles match, False otherwise. + """ + # Calculate the fuzzy match ratio + match_ratio = fuzz.token_set_ratio(imdb_title, torrent_title) + + # Calculate the proportion of overlapping tokens + imdb_tokens = set(imdb_title.lower().split()) + torrent_tokens = set(torrent_title.lower().split()) + common_tokens = imdb_tokens.intersection(torrent_tokens) + token_overlap_ratio = len(common_tokens) / max(len(imdb_tokens), len(torrent_tokens)) + # Check if both the fuzzy match ratio and token overlap ratio meet the thresholds + return match_ratio >= threshold and token_overlap_ratio >= token_overlap_threshold + +async def filter(torrents: list, name: str, year: int): results = [] for torrent in torrents: index = torrent[0] @@ -482,7 +502,7 @@ def title_sub_match(imdb_title: str, torrent_title: str): parsed = parse(title) - if parsed.parsed_title and not (title_match(name, parsed.parsed_title) or title_sub_match(name, parsed.parsed_title)): + if parsed.parsed_title and not (match_titles(name, parsed.parsed_title)): results.append((index, False)) continue diff --git a/pyproject.toml b/pyproject.toml index 222dbe4..543895a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ aiosqlite = "*" jinja2 = "*" rank-torrent-name = "*" parsett = "*" +fuzzywuzzy = "*" [tool.poetry.group.dev.dependencies] From 0567166f842b20dd3daf56973b3923032a65ef3d Mon Sep 17 
00:00:00 2001 From: Dariush Date: Fri, 15 Nov 2024 19:21:54 +0100 Subject: [PATCH 06/10] add python-Levenshtein for fuzzywuzzy --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 543895a..6ddd8f6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,7 @@ jinja2 = "*" rank-torrent-name = "*" parsett = "*" fuzzywuzzy = "*" +python-Levenshtein = "*" [tool.poetry.group.dev.dependencies] From 1123f086d9112fcccf65b373e34e9f889eb7fd61 Mon Sep 17 00:00:00 2001 From: Dariush Date: Fri, 15 Nov 2024 21:13:39 +0100 Subject: [PATCH 07/10] remove useless parenthesis --- comet/utils/general.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comet/utils/general.py b/comet/utils/general.py index e9002e3..2fa32d2 100644 --- a/comet/utils/general.py +++ b/comet/utils/general.py @@ -502,7 +502,7 @@ async def filter(torrents: list, name: str, year: int): parsed = parse(title) - if parsed.parsed_title and not (match_titles(name, parsed.parsed_title)): + if parsed.parsed_title and not match_titles(name, parsed.parsed_title): results.append((index, False)) continue From 1c9ed405698f0acd2788db281f832295f7c6fc85 Mon Sep 17 00:00:00 2001 From: Goldy <153996346+g0ldyy@users.noreply.github.com> Date: Sat, 16 Nov 2024 00:08:25 +0100 Subject: [PATCH 08/10] Update pyproject.toml --- pyproject.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6ddd8f6..64311c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,8 +25,7 @@ aiosqlite = "*" jinja2 = "*" rank-torrent-name = "*" parsett = "*" -fuzzywuzzy = "*" -python-Levenshtein = "*" +fuzzywuzzy = {extras = ["speedup"], version = "*"} [tool.poetry.group.dev.dependencies] From f93fd6a6884236970b4e60f947419ccf67633f3a Mon Sep 17 00:00:00 2001 From: Dariush Date: Sat, 16 Nov 2024 01:59:59 +0100 Subject: [PATCH 09/10] improve title matching --- comet/utils/general.py | 19 +++++++------------ 1 file changed, 7 
insertions(+), 12 deletions(-) diff --git a/comet/utils/general.py b/comet/utils/general.py index 2fa32d2..2bd3b75 100644 --- a/comet/utils/general.py +++ b/comet/utils/general.py @@ -7,7 +7,7 @@ import asyncio import orjson -from RTN import parse, title_match +from RTN import parse from curl_cffi import requests from fastapi import Request from fuzzywuzzy import fuzz @@ -466,30 +466,25 @@ async def get_mediafusion(log_name: str, type: str, full_id: str): return results -def match_titles(imdb_title, torrent_title, threshold=80, token_overlap_threshold=0.5): + +def match_titles(imdb_title: str, torrent_title: str, threshold: int = 80) -> bool: """ - Match movie/TV show titles using a combination of fuzzy string matching and token overlap. + Match movie/TV show titles using fuzzy string matching. Parameters: imdb_title (str): The title from the IMDB data source. torrent_title (str): The title from the torrent data source. threshold (int): The minimum fuzzy match ratio to consider the titles a match. - token_overlap_threshold (float): The minimum proportion of overlapping tokens to consider the titles a match. Returns: bool: True if the titles match, False otherwise. 
""" # Calculate the fuzzy match ratio - match_ratio = fuzz.token_set_ratio(imdb_title, torrent_title) + match_ratio = fuzz.ratio(imdb_title, torrent_title) - # Calculate the proportion of overlapping tokens - imdb_tokens = set(imdb_title.lower().split()) - torrent_tokens = set(torrent_title.lower().split()) - common_tokens = imdb_tokens.intersection(torrent_tokens) - token_overlap_ratio = len(common_tokens) / max(len(imdb_tokens), len(torrent_tokens)) + # Check if the fuzzy match ratio meets the thresholds + return match_ratio >= threshold - # Check if both the fuzzy match ratio and token overlap ratio meet the thresholds - return match_ratio >= threshold and token_overlap_ratio >= token_overlap_threshold async def filter(torrents: list, name: str, year: int): results = [] From 7f19c4c3839b6b46aab3cee3efb283469d7b68f4 Mon Sep 17 00:00:00 2001 From: Dariush Date: Sat, 16 Nov 2024 14:36:25 +0100 Subject: [PATCH 10/10] improve title matching with better ratios --- comet/utils/general.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/comet/utils/general.py b/comet/utils/general.py index 2bd3b75..7e9cfe6 100644 --- a/comet/utils/general.py +++ b/comet/utils/general.py @@ -480,10 +480,15 @@ def match_titles(imdb_title: str, torrent_title: str, threshold: int = 80) -> bo bool: True if the titles match, False otherwise. """ # Calculate the fuzzy match ratio - match_ratio = fuzz.ratio(imdb_title, torrent_title) - - # Check if the fuzzy match ratio meets the thresholds - return match_ratio >= threshold + # The idea is that ratio will give very low score to garbage ratio but will also give mid/average + # score to some good results. The WRatio will make sure these mid score passes the filter. + base_ratio = fuzz.ratio(imdb_title, torrent_title) # strict ratio + w_ratio = fuzz.WRatio(imdb_title, torrent_title) # less strict ratio + # The weight of the ratios needs to be adjusted because basic ratio is too strict. 
+ match_ratio = (base_ratio*0.7 + w_ratio*1.3)/2 + + # Check if the fuzzy match ratio meets the threshold + return match_ratio >= threshold async def filter(torrents: list, name: str, year: int):