From 493426ba5290a5a6bd33f07b87163f29a408ad6e Mon Sep 17 00:00:00 2001 From: Satyam Kumar Date: Tue, 11 Aug 2020 19:55:19 +0530 Subject: [PATCH] Fixes #104 - Prevent duplicates by downloading everything to single location --- openedx2zim/html_processor.py | 22 ++++++++++++++-- openedx2zim/scraper.py | 25 +++++++++++-------- openedx2zim/utils.py | 6 ++--- openedx2zim/xblocks_extractor/discussion.py | 12 ++++----- .../xblocks_extractor/drag_and_drop_v2.py | 9 +++---- .../xblocks_extractor/free_text_response.py | 4 +-- openedx2zim/xblocks_extractor/html.py | 4 +-- openedx2zim/xblocks_extractor/problem.py | 4 +-- openedx2zim/xblocks_extractor/vertical.py | 7 ++---- 9 files changed, 56 insertions(+), 37 deletions(-) diff --git a/openedx2zim/html_processor.py b/openedx2zim/html_processor.py index f443dba..b361a5b 100644 --- a/openedx2zim/html_processor.py +++ b/openedx2zim/html_processor.py @@ -149,6 +149,23 @@ def download_images_from_html(self, html_body, output_path, path_from_html): img.attrib["style"] = " max-width:100%" return bool(imgs) + def get_root_from_asset(self, path_from_html, root_from_html): + """ get path to root from the downloaded/generated asset """ + + nb_jumps_root_from_html = root_from_html.count("../") + nb_back_jumps_output_path = path_from_html.count("../") + + # the path to the asset from HTML, minus the back jumps + path_without_back_jumps = path_from_html[ + (nb_back_jumps_output_path) * len("../") : + ] + + return ( + nb_jumps_root_from_html + - nb_back_jumps_output_path + + len(pathlib.Path(path_without_back_jumps).parts) + ) * "../" + def download_documents_from_html( self, html_body, output_path, path_from_html, root_from_html ): @@ -174,8 +191,9 @@ def download_documents_from_html( "audio_player.html", False, audio_path=filename, - path_to_root=root_from_html - + len(pathlib.Path(path_from_html).parts) * "../", + path_to_root=self.get_root_from_asset( + path_from_html, root_from_html + ), audio_format=file_format, ) filename = html_fpath.name diff --git a/openedx2zim/scraper.py b/openedx2zim/scraper.py index b0f85aa..2105f31 100644 --- a/openedx2zim/scraper.py +++ b/openedx2zim/scraper.py @@ -179,6 +179,10 @@ def __init__( self.wiki = None self.forum = None + @property + def instance_assets_dir(self): + return self.build_dir.joinpath("instance_assets") + def get_course_id(self, url, course_page_name, course_prefix, instance_url): clean_url = re.match( instance_url + course_prefix + ".*" + course_page_name, url @@ -394,14 +398,14 @@ def annex(self): self.get_course_tabs() logger.info("Downloading content for extra pages ...") for page in self.annexed_pages: + root_from_html = ( + len(page["output_path"].relative_to(self.build_dir).parts) * "../" + ) page["content"] = self.html_processor.dl_dependencies_and_fix_links( content=page["content"], - output_path=page["output_path"], - path_from_html="", - root_from_html=len( - page["output_path"].relative_to(self.build_dir).parts - ) - * "../", + output_path=self.instance_assets_dir, + path_from_html=root_from_html + "instance_assets", + root_from_html=root_from_html, ) logger.info("Processing book lists ...") @@ -478,8 +482,8 @@ def clean_content(html_article): self.homepage_html.append( self.html_processor.dl_dependencies_and_fix_links( content=article.prettify(), - output_path=self.build_dir.joinpath("home"), - path_from_html="home", + output_path=self.instance_assets_dir, + path_from_html="instance_assets", root_from_html="", ) ) @@ -490,8 +494,8 @@ def clean_content(html_article): self.homepage_html.append( self.html_processor.dl_dependencies_and_fix_links( content=welcome_message.prettify(), - output_path=self.build_dir.joinpath("home"), - path_from_html="home", + output_path=self.instance_assets_dir, + path_from_html="instance_assets", root_from_html="", ) ) @@ -763,6 +767,7 @@ def run(self): ) self.instance_connection.establish_connection() jinja_init() + self.instance_assets_dir.mkdir(exist_ok=True, parents=True) self.html_processor = HtmlProcessor(self) self.prepare_mooc_data() self.parse_course_xblocks() diff --git a/openedx2zim/utils.py b/openedx2zim/utils.py index 1e94ead..2ad2aed 100644 --- a/openedx2zim/utils.py +++ b/openedx2zim/utils.py @@ -78,6 +78,9 @@ def download_and_convert_subtitles(output_path, subtitles, instance_connection): if not subtitle_file.exists(): try: raw_subtitle = instance_connection.get_page(subtitles[lang]) + if not raw_subtitle: + logger.error(f"Subtitle fetch failed from {subtitles[lang]}") + continue subtitle = html.unescape( re.sub(r"^0$", "1", str(raw_subtitle), flags=re.M) ) @@ -87,9 +90,6 @@ def download_and_convert_subtitles(output_path, subtitles, instance_connection): webvtt = WebVTT().from_srt(subtitle_file) webvtt.save() processed_subtitles[lang] = f"{lang}.vtt" - except urllib.error.HTTPError as exc: - if exc.code == 404 or exc.code == 403: - logger.error(f"Failed to get subtitle from {subtitles[lang]}") except Exception as exc: logger.error( f"Error while converting subtitle {subtitles[lang]} : {exc}" diff --git a/openedx2zim/xblocks_extractor/discussion.py b/openedx2zim/xblocks_extractor/discussion.py index e9404f4..1ab935c 100644 --- a/openedx2zim/xblocks_extractor/discussion.py +++ b/openedx2zim/xblocks_extractor/discussion.py @@ -42,8 +42,8 @@ def get_extra_content(self, soup): remove_autogenerated_tags( self.scraper.html_processor.dl_dependencies_and_fix_links( content=str(element), - output_path=self.output_path, - path_from_html=self.folder_name, + output_path=self.scraper.instance_assets_dir, + path_from_html="../" * 5 + "instance_assets", root_from_html="../" * 5, ) ) @@ -53,8 +53,8 @@ def get_extra_content(self, soup): remove_autogenerated_tags( self.scraper.html_processor.dl_dependencies_and_fix_links( content=str(child_node), - output_path=self.output_path, - path_from_html=self.folder_name, + output_path=self.scraper.instance_assets_dir, + path_from_html="../" * 5 + "instance_assets", root_from_html="../" * 5, ) ) @@ -64,8 +64,8 @@ def get_extra_content(self, soup): remove_autogenerated_tags( self.scraper.html_processor.dl_dependencies_and_fix_links( content=str(child_node), - output_path=self.output_path, - path_from_html=self.folder_name, + output_path=self.scraper.instance_assets_dir, + path_from_html="../" * 5 + "instance_assets", root_from_html="../" * 5, ) ) diff --git a/openedx2zim/xblocks_extractor/drag_and_drop_v2.py b/openedx2zim/xblocks_extractor/drag_and_drop_v2.py index 9371ff2..0815a2e 100644 --- a/openedx2zim/xblocks_extractor/drag_and_drop_v2.py +++ b/openedx2zim/xblocks_extractor/drag_and_drop_v2.py @@ -2,7 +2,6 @@ import json from bs4 import BeautifulSoup -from slugify import slugify from .base_xblock import BaseXblock from ..utils import jinja, prepare_url @@ -34,18 +33,18 @@ def download(self, instance_connection): name = pathlib.Path(item["expandedImageURL"]).name self.scraper.download_file( prepare_url(item["expandedImageURL"], self.scraper.instance_url), - self.output_path.joinpath(name), + self.scraper.instance_assets_dir.joinpath(name), ) - item["expandedImageURL"] = f"{self.folder_name}/{name}" + item["expandedImageURL"] = "../" * 5 + f"instance_assets/{name}" # Grid name = pathlib.Path(self.content["target_img_expanded_url"]).name self.scraper.download_file( prepare_url( self.content["target_img_expanded_url"], self.scraper.instance_url ), - self.output_path.joinpath(name), + self.scraper.instance_assets_dir.joinpath(name), ) - self.content["target_img_expanded_url"] = f"{self.folder_name}/{name}" + self.content["target_img_expanded_url"] = "../" * 5 + f"instance_assets/{name}" def render(self): return jinja(None, "DragAndDropV2.html", False, dragdrop_content=self.content) diff --git a/openedx2zim/xblocks_extractor/free_text_response.py b/openedx2zim/xblocks_extractor/free_text_response.py index 686b59a..5b23782 100644 --- a/openedx2zim/xblocks_extractor/free_text_response.py +++ b/openedx2zim/xblocks_extractor/free_text_response.py @@ -35,8 +35,8 @@ def download(self, instance_connection): html_no_answers + self.scraper.html_processor.dl_dependencies_and_fix_links( content=str(soup), - output_path=self.output_path, - path_from_html=self.folder_name, + output_path=self.scraper.instance_assets_dir, + path_from_html="../" * 5 + "instance_assets", root_from_html="../" * 5, ) ) diff --git a/openedx2zim/xblocks_extractor/html.py b/openedx2zim/xblocks_extractor/html.py index ca7b9bf..f84c160 100644 --- a/openedx2zim/xblocks_extractor/html.py +++ b/openedx2zim/xblocks_extractor/html.py @@ -24,8 +24,8 @@ def download(self, instance_connection): html_content = str(soup.find("div", attrs={"class": "course-wrapper"})) self.html = self.scraper.html_processor.dl_dependencies_and_fix_links( content=html_content, - output_path=self.output_path, - path_from_html=self.folder_name, + output_path=self.scraper.instance_assets_dir, + path_from_html="../" * 5 + "instance_assets", root_from_html="../" * 5, ) diff --git a/openedx2zim/xblocks_extractor/problem.py b/openedx2zim/xblocks_extractor/problem.py index d7aec64..e8a6dc9 100644 --- a/openedx2zim/xblocks_extractor/problem.py +++ b/openedx2zim/xblocks_extractor/problem.py @@ -120,8 +120,8 @@ def download(self, instance_connection): # process final HTML content html_content = self.scraper.html_processor.dl_dependencies_and_fix_links( content=str(soup.find("div", attrs={"class": "problem"})), - output_path=self.output_path, - path_from_html=self.folder_name, + output_path=self.scraper.instance_assets_dir, + path_from_html="../" * 5 + "instance_assets", root_from_html="../" * 5, ) diff --git a/openedx2zim/xblocks_extractor/vertical.py b/openedx2zim/xblocks_extractor/vertical.py index 0659fe1..c682f4f 100644 --- a/openedx2zim/xblocks_extractor/vertical.py +++ b/openedx2zim/xblocks_extractor/vertical.py @@ -28,9 +28,6 @@ def __init__( self.icon_type = "fa-book" def download(self, instance_connection): - instance_assets_path = self.scraper.build_dir.joinpath("instance_assets") - instance_assets_path.mkdir(parents=True, exist_ok=True) - # get the LMS content for the vertical content = instance_connection.get_page(self.xblock_json["lms_web_url"]) soup = BeautifulSoup(content, "lxml") @@ -47,7 +44,7 @@ def download(self, instance_connection): remove_autogenerated_tags( self.scraper.html_processor.dl_dependencies_and_fix_links( content=str(header_element), - output_path=instance_assets_path, + output_path=self.scraper.instance_assets_dir, path_from_html=f"{self.root_url}instance_assets", root_from_html=self.root_url, ) @@ -62,7 +59,7 @@ def download(self, instance_connection): remove_autogenerated_tags( self.scraper.html_processor.dl_dependencies_and_fix_links( content=str(script), - output_path=instance_assets_path, + output_path=self.scraper.instance_assets_dir, path_from_html=f"{self.root_url}instance_assets", root_from_html=self.root_url, )