From 2f86f1f8539a26772daa657a6f98f4b9e1084d5f Mon Sep 17 00:00:00 2001 From: Adam Miller Date: Thu, 2 Jan 2025 16:55:35 -0800 Subject: [PATCH 1/6] feat: override yt-dlp generic extractor to add redirect loop detection logic --- brozzler/ydl.py | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 861b6e68..015ea3d6 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -18,7 +18,7 @@ import logging import yt_dlp -from yt_dlp.utils import match_filter_func +from yt_dlp.utils import match_filter_func, ExtractorError import brozzler import urllib.request import tempfile @@ -110,6 +110,31 @@ def _build_youtube_dl(worker, destdir, site, page): a yt-dlp `yt_dlp.YoutubeDL` instance """ + # Custom GenericIE to handle redirect loops with shared state + class CustomGenericIE(yt_dlp.extractor.generic.GenericIE): + """Custom Generic Information Extractor to detect redirect loops.""" + + logger = logging.getLogger(__module__ + "." + __qualname__) + shared_visited_urls = set() # Shared state for all instances + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.logger.info("[CustomGenericIE] Initialized") + + def _real_extract(self, url): + # Check for redirect loops in the shared state + if url in self.shared_visited_urls: + self.logger.error("Redirect loop detected for URL: {url}") + raise ExtractorError( + f"Redirect loop detected for URL: {url}", + expected=True, # Marks the error as non-fatal + ) + self.shared_visited_urls.add(url) + self.logger.info(f"[CustomGenericIE] Extracting URL: {url}") + return super()._real_extract(url) + + yt_dlp.extractor.generic.GenericIE = CustomGenericIE + class _YoutubeDL(yt_dlp.YoutubeDL): logger = logging.getLogger(__module__ + "." + __qualname__) @@ -361,6 +386,11 @@ def _try_youtube_dl(worker, ydl, site, page): and e.exc_info[1].code == 420 ): raise brozzler.ReachedLimit(e.exc_info[1]) + elif ( + isinstance(e, yt_dlp.utils.DownloadError) + and "Redirect loop detected" in e.msg + ): + raise brozzler.VideoExtractorError(e.msg) else: # todo: other errors to handle separately? # OSError('Tunnel connection failed: 464 Host Not Allowed') (caused by ProxyError...) From 426570b0843c30767ab83fcd84007c8e22594395 Mon Sep 17 00:00:00 2001 From: Adam Miller Date: Mon, 6 Jan 2025 11:30:46 -0800 Subject: [PATCH 2/6] feat: Handle too many redirects as well --- brozzler/ydl.py | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 015ea3d6..87a672f2 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -37,6 +37,7 @@ YTDLP_PROXY = "" PROXY_ATTEMPTS = 4 YTDLP_WAIT = 10 +YTDLP_MAX_REDIRECTS = 5 def should_ytdlp(site, page, page_status, skip_av_seeds): @@ -115,24 +116,34 @@ class CustomGenericIE(yt_dlp.extractor.generic.GenericIE): """Custom Generic Information Extractor to detect redirect loops.""" logger = logging.getLogger(__module__ + "." + __qualname__) - shared_visited_urls = set() # Shared state for all instances + visited_redirect_urls = set() def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.logger.info("[CustomGenericIE] Initialized") def _real_extract(self, url): - # Check for redirect loops in the shared state - if url in self.shared_visited_urls: - self.logger.error("Redirect loop detected for URL: {url}") - raise ExtractorError( - f"Redirect loop detected for URL: {url}", - expected=True, # Marks the error as non-fatal - ) - self.shared_visited_urls.add(url) + # self.visited_redirect_urls.clear() self.logger.info(f"[CustomGenericIE] Extracting URL: {url}") return super()._real_extract(url) + def report_following_redirect(self, new_url): + self.logger.info( + f"[CustomGenericIE] Following redirect URL: {new_url} " + f"redirect_count: {len(self.visited_redirect_urls)}" + ) + if new_url in self.visited_redirect_urls: + raise ExtractorError( + f"Redirect loop detected for URL: {new_url}", + expected=True, + ) + if len(self.visited_redirect_urls) > YTDLP_MAX_REDIRECTS: + raise ExtractorError( + f"Too many redirects for URL: {new_url}", + expected=True, + ) + self.visited_redirect_urls.add(new_url) + return super().report_following_redirect(new_url) + yt_dlp.extractor.generic.GenericIE = CustomGenericIE class _YoutubeDL(yt_dlp.YoutubeDL): @@ -386,9 +397,8 @@ def _try_youtube_dl(worker, ydl, site, page): and e.exc_info[1].code == 420 ): raise brozzler.ReachedLimit(e.exc_info[1]) - elif ( - isinstance(e, yt_dlp.utils.DownloadError) - and "Redirect loop detected" in e.msg + elif isinstance(e, yt_dlp.utils.DownloadError) and ( + "Redirect loop detected" in e.msg or "Too many redirects" in e.msg ): raise brozzler.VideoExtractorError(e.msg) else: From 159666791919169c5bf8e4db261f7557ec805581 Mon Sep 17 00:00:00 2001 From: Adam Miller Date: Mon, 6 Jan 2025 18:20:30 -0800 Subject: [PATCH 3/6] chore: rewrite approach using process_ie_result --- brozzler/ydl.py | 40 ++++++++++------------------------------ 1 file changed, 10 insertions(+), 30 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 87a672f2..563842f5 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -111,43 +111,23 @@ def _build_youtube_dl(worker, destdir, site, page): a yt-dlp `yt_dlp.YoutubeDL` instance """ - # Custom GenericIE to handle redirect loops with shared state - class CustomGenericIE(yt_dlp.extractor.generic.GenericIE): - """Custom Generic Information Extractor to detect redirect loops.""" - + class _YoutubeDL(yt_dlp.YoutubeDL): logger = logging.getLogger(__module__ + "." + __qualname__) - visited_redirect_urls = set() - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - def _real_extract(self, url): - # self.visited_redirect_urls.clear() - self.logger.info(f"[CustomGenericIE] Extracting URL: {url}") - return super()._real_extract(url) - - def report_following_redirect(self, new_url): - self.logger.info( - f"[CustomGenericIE] Following redirect URL: {new_url} " - f"redirect_count: {len(self.visited_redirect_urls)}" - ) - if new_url in self.visited_redirect_urls: + def process_ie_result(self, ie_result, download=True, extra_info=None): + if extra_info is None: + extra_info = {} + if 'redirect_count' in extra_info: + self.logger.info(f"Following redirect URL: {ie_result['url']} redirect_count: {extra_info['redirect_count']}") + extra_info['redirect_count'] = 1 + extra_info.get('redirect_count', 0) + if extra_info["redirect_count"] > YTDLP_MAX_REDIRECTS: raise ExtractorError( - f"Redirect loop detected for URL: {new_url}", + f"Too many redirects for URL: {ie_result['url']}", expected=True, ) - if len(self.visited_redirect_urls) > YTDLP_MAX_REDIRECTS: - raise ExtractorError( - f"Too many redirects for URL: {new_url}", - expected=True, - ) - self.visited_redirect_urls.add(new_url) - return super().report_following_redirect(new_url) - yt_dlp.extractor.generic.GenericIE = CustomGenericIE + super().process_ie_result(ie_result, download, extra_info) - class _YoutubeDL(yt_dlp.YoutubeDL): - logger = logging.getLogger(__module__ + "." + __qualname__) def add_default_extra_info(self, ie_result, ie, url): # hook in some logging From 5be1b3b22a0a42d3f2ff1e9bda9b04773132e0cb Mon Sep 17 00:00:00 2001 From: Adam Miller Date: Mon, 6 Jan 2025 18:23:17 -0800 Subject: [PATCH 4/6] chore: formatting --- brozzler/ydl.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 563842f5..01c8b7b3 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -117,9 +117,11 @@ class _YoutubeDL(yt_dlp.YoutubeDL): def process_ie_result(self, ie_result, download=True, extra_info=None): if extra_info is None: extra_info = {} - if 'redirect_count' in extra_info: - self.logger.info(f"Following redirect URL: {ie_result['url']} redirect_count: {extra_info['redirect_count']}") - extra_info['redirect_count'] = 1 + extra_info.get('redirect_count', 0) + if "redirect_count" in extra_info: + self.logger.info( + f"Following redirect URL: {ie_result['url']} redirect_count: {extra_info['redirect_count']}" + ) + extra_info["redirect_count"] = 1 + extra_info.get("redirect_count", 0) if extra_info["redirect_count"] > YTDLP_MAX_REDIRECTS: raise ExtractorError( f"Too many redirects for URL: {ie_result['url']}", @@ -128,7 +130,6 @@ def process_ie_result(self, ie_result, download=True, extra_info=None): super().process_ie_result(ie_result, download, extra_info) - def add_default_extra_info(self, ie_result, ie, url): # hook in some logging super().add_default_extra_info(ie_result, ie, url) From a250eb2b68fe09e8cdf8a994ec95b3dd4db5cea6 Mon Sep 17 00:00:00 2001 From: Adam Miller Date: Mon, 6 Jan 2025 18:56:22 -0800 Subject: [PATCH 5/6] fix: ensure url is not a video when determining if we are in a redirect --- brozzler/ydl.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 01c8b7b3..5b645be9 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -117,11 +117,17 @@ class _YoutubeDL(yt_dlp.YoutubeDL): def process_ie_result(self, ie_result, download=True, extra_info=None): if extra_info is None: extra_info = {} - if "redirect_count" in extra_info: + if ( + "redirect_count" in extra_info + and "_type" in ie_result + and ie_result.get("_type") in ("url", "url_transparent") + ): self.logger.info( f"Following redirect URL: {ie_result['url']} redirect_count: {extra_info['redirect_count']}" ) - extra_info["redirect_count"] = 1 + extra_info.get("redirect_count", 0) + extra_info["redirect_count"] = 1 + extra_info.get("redirect_count", 0) + else: + extra_info["redirect_count"] = 0 if extra_info["redirect_count"] > YTDLP_MAX_REDIRECTS: raise ExtractorError( f"Too many redirects for URL: {ie_result['url']}", From 493587ca2cf95afdbfe14a6cda17370b53937e9d Mon Sep 17 00:00:00 2001 From: Adam Miller Date: Wed, 15 Jan 2025 12:00:07 -0800 Subject: [PATCH 6/6] fix: return ie_result and cleanup variable names to properly represent hop depth instead of redirects --- brozzler/ydl.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 5b645be9..55eb90d8 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -117,24 +117,24 @@ class _YoutubeDL(yt_dlp.YoutubeDL): def process_ie_result(self, ie_result, download=True, extra_info=None): if extra_info is None: extra_info = {} - if ( - "redirect_count" in extra_info - and "_type" in ie_result - and ie_result.get("_type") in ("url", "url_transparent") - ): - self.logger.info( - f"Following redirect URL: {ie_result['url']} redirect_count: {extra_info['redirect_count']}" - ) - extra_info["redirect_count"] = 1 + extra_info.get("redirect_count", 0) - else: - extra_info["redirect_count"] = 0 - if extra_info["redirect_count"] > YTDLP_MAX_REDIRECTS: - raise ExtractorError( - f"Too many redirects for URL: {ie_result['url']}", - expected=True, - ) + result_type = ie_result.get("_type", "video") - super().process_ie_result(ie_result, download, extra_info) + if result_type in ("url", "url_transparent"): + if "extraction_depth" in extra_info: + self.logger.info( + f"Following redirect URL: {ie_result['url']} extraction_depth: {extra_info['extraction_depth']}" + ) + extra_info["extraction_depth"] = 1 + extra_info.get( + "extraction_depth", 0 + ) + else: + extra_info["extraction_depth"] = 0 + if extra_info["extraction_depth"] >= YTDLP_MAX_REDIRECTS: + raise ExtractorError( + f"Too many hops for URL: {ie_result['url']}", + expected=True, + ) + return super().process_ie_result(ie_result, download, extra_info) def add_default_extra_info(self, ie_result, ie, url): # hook in some logging