feat: mark the url as downloaded
ZhenShuo2021 committed Dec 10, 2024
1 parent d61567a, commit a39d543
Showing 1 changed file with 15 additions and 5 deletions.

v2dl/core/scraper.py: 20 changes (15 additions & 5 deletions)

@@ -39,18 +39,23 @@ def __init__(
     ) -> None:
         self.config = config
         self.runtime_config = config.runtime_config

         self.web_bot = web_bot
         self.dry_run = config.static_config.dry_run
         self.logger = config.runtime_config.logger

+        self.no_log = False  # flag to not log download status
+
         self.download_service = config.runtime_config.download_service
         self.scrape_handler = ScrapeHandler(self.config, self.web_bot)

     def start_scraping(self) -> None:
         """Start scraping based on URL type."""
         try:
             urls = self._load_urls()
+            if not urls:
+                self.logger.info(f"No valid urls found in {self.runtime_config.url_file}")
+                self.no_log = True

             for url in urls:
                 url = LinkParser.update_language(url, self.config.static_config.language)
                 self.runtime_config.url = url

@@ -68,6 +73,9 @@ def start_scraping(self) -> None:
             self.web_bot.close_driver()

     def log_final_status(self) -> None:
+        if self.no_log:
+            return
+
         self.logger.info("Download finished, showing download status")
         download_status = self.get_download_status
         for url, album_status in download_status.items():
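
Taken together, the two hunks above add a simple suppression flag: start_scraping sets no_log when the URL file yields nothing, and log_final_status then returns early instead of printing an empty status report. A minimal, self-contained sketch of the same pattern (the Runner class and its names are illustrative, not the project's API):

    # Sketch of the suppression-flag pattern; Runner and its names are illustrative.
    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("sketch")


    class Runner:
        def __init__(self) -> None:
            self.no_log = False  # set when there is nothing worth reporting

        def run(self, urls: list[str]) -> None:
            if not urls:
                logger.info("No valid urls found")
                self.no_log = True
            for url in urls:
                pass  # process each url here

        def log_final_status(self) -> None:
            if self.no_log:
                return  # nothing was processed, so skip the report
            logger.info("Download finished, showing download status")


    runner = Runner()
    runner.run([])             # empty input sets the flag
    runner.log_final_status()  # returns early, prints nothing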

@@ -112,7 +120,7 @@ def _load_urls(self) -> list[str]:
         """Load URLs from runtime_config (URL or txt file)."""
         if self.runtime_config.url_file:
             with open(self.runtime_config.url_file) as file:
-                urls = [line.strip() for line in file if line.strip()]
+                urls = [line.strip() for line in file if line.strip() and not line.startswith("#")]
         else:
             urls = [self.runtime_config.url]
         return urls
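
The new comprehension skips blank lines and full-line comments when reading the URL file. One subtlety: startswith("#") is checked against the raw line, so a comment indented with leading whitespace is not filtered out. A small sketch, using sample lines that stand in for a hypothetical urls.txt:

    # Sketch of the new filtering; sample stands in for a hypothetical urls.txt.
    sample = [
        "https://example.com/album/1\n",
        "\n",                      # blank: removed by the line.strip() test
        "# disabled entry\n",      # full-line comment: removed by startswith("#")
        "  # indented comment\n",  # kept: startswith sees the raw, unstripped line
        "https://example.com/album/2\n",
    ]

    urls = [line.strip() for line in sample if line.strip() and not line.startswith("#")]
    print(urls)
    # ['https://example.com/album/1', '# indented comment', 'https://example.com/album/2']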

@@ -171,7 +179,8 @@ def __init__(

     def scrape(self, url: str, dry_run: bool = False) -> None:
         """Main entry point for scraping operations."""
-        scrape_type = self._get_scrape_type()
+        if (scrape_type := self._get_scrape_type()) is None:
+            return
         _, start_page = LinkParser.parse_input_url(self.runtime_config.url)

         if scrape_type == "album_list":

@@ -336,13 +345,14 @@ def _handle_pagination(
             time.sleep(consecutive_sleep)
         return next_page

-    def _get_scrape_type(self) -> ScrapeType:
+    def _get_scrape_type(self) -> ScrapeType | None:
         """Get the appropriate handler method based on URL path."""
         path_parts, _ = LinkParser.parse_input_url(self.runtime_config.url)
         for part in path_parts:
             if part in self.URL_HANDLERS:
                 return self.URL_HANDLERS[part]
-        raise ValueError(f"Unsupported URL type: {self.runtime_config.url}")
+        self.logger.error(f"Unsupported URL type: {self.runtime_config.url}")
+        return None


 class BaseScraper(Generic[LinkType], ABC):
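
The last two hunks change the failure mode for unsupported URLs: _get_scrape_type logs an error and returns None instead of raising ValueError, and scrape uses an assignment expression to bind and test the result in one step. A minimal sketch of the walrus-plus-Optional pattern (ScrapeType and URL_HANDLERS below are simplified stand-ins, not the project's definitions):

    # Sketch of the early-return pattern; ScrapeType and URL_HANDLERS are stand-ins.
    from __future__ import annotations

    import logging

    logging.basicConfig(level=logging.ERROR)
    logger = logging.getLogger("sketch")

    ScrapeType = str  # stand-in for the project's ScrapeType alias
    URL_HANDLERS: dict[str, ScrapeType] = {"album": "album_image", "actor": "album_list"}

    def get_scrape_type(path_parts: list[str]) -> ScrapeType | None:
        for part in path_parts:
            if part in URL_HANDLERS:
                return URL_HANDLERS[part]
        logger.error("Unsupported URL type: %s", "/".join(path_parts))
        return None  # signal "unsupported" to the caller instead of raising

    def scrape(path_parts: list[str]) -> None:
        # Walrus operator: assign and None-check in a single condition.
        if (scrape_type := get_scrape_type(path_parts)) is None:
            return
        print(f"dispatching to the {scrape_type} handler")

    scrape(["album", "123"])  # dispatches
    scrape(["unknown"])       # logs the error and returns without raising

Returning None instead of raising keeps one bad URL from aborting a whole batch run, which fits the per-URL loop in start_scraping.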