Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prevent duplicate files #108

Merged
merged 2 commits into from
Aug 14, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 21 additions & 17 deletions openedx2zim/annex.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from bs4 import BeautifulSoup

from .constants import getLogger
from .utils import jinja, markdown
from .utils import jinja, markdown, get_back_jumps

logger = getLogger()

Expand Down Expand Up @@ -157,10 +157,9 @@ def update_thread_children(self, thread):
content=markdown(children["body"]),
output_path=self.output_path.joinpath(thread["id"]),
path_from_html="",
root_from_html=len(
self.output_path.relative_to(self.scraper.build_dir).parts
)
* "../",
root_from_html=get_back_jumps(
len(self.output_path.relative_to(self.scraper.build_dir).parts)
),
)
if "children" in children:
for children_children in children["children"]:
Expand All @@ -170,10 +169,13 @@ def update_thread_children(self, thread):
content=markdown(children_children["body"]),
output_path=self.output_path.joinpath(thread["id"]),
path_from_html="",
root_from_html=len(
self.output_path.relative_to(self.scraper.build_dir).parts
)
* "../",
root_from_html=get_back_jumps(
len(
self.output_path.relative_to(
self.scraper.build_dir
).parts
)
),
)

def annex_forum(self):
Expand Down Expand Up @@ -210,10 +212,9 @@ def annex_forum(self):
content=markdown(thread["data_thread"]["content"]["body"]),
output_path=self.output_path.joinpath(thread["id"]),
path_from_html="",
root_from_html=len(
self.output_path.relative_to(self.scraper.build_dir).parts
)
* "../",
root_from_html=get_back_jumps(
len(self.output_path.relative_to(self.scraper.build_dir).parts)
),
)
self.update_thread_children(thread)

Expand Down Expand Up @@ -308,10 +309,13 @@ def update_wiki_page(self, soup, text, url, page_to_visit):
content=str(text),
output_path=self.wiki_data[url]["path"],
path_from_html="",
root_from_html=len(
self.wiki_data[url]["path"].relative_to(self.scraper.build_dir).parts
)
* "../",
root_from_html=get_back_jumps(
len(
self.wiki_data[url]["path"]
.relative_to(self.scraper.build_dir)
.parts
)
),
)
self.wiki_data[url]["title"] = soup.find("title").text
self.wiki_data[url]["last-modif"] = soup.find(
Expand Down
24 changes: 21 additions & 3 deletions openedx2zim/html_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from bs4 import BeautifulSoup

from .constants import DOWNLOADABLE_EXTENSIONS, AUDIO_FORMATS
from .utils import jinja, prepare_url
from .utils import jinja, prepare_url, get_back_jumps


class HtmlProcessor:
Expand Down Expand Up @@ -149,6 +149,23 @@ def download_images_from_html(self, html_body, output_path, path_from_html):
img.attrib["style"] = " max-width:100%"
return bool(imgs)

def get_root_from_asset(self, path_from_html, root_from_html):
    """Return the "../" chain leading from a downloaded asset back to the root.

    path_from_html: path to the asset as written in the HTML document; it may
        begin with a run of "../" back jumps.
    root_from_html: path from the HTML document to the root, expressed purely
        as repeated "../" segments.
    """
    root_jumps = root_from_html.count("../")
    asset_back_jumps = path_from_html.count("../")

    # Drop the leading back jumps to keep only the forward part of the path.
    forward_part = path_from_html[asset_back_jumps * len("../") :]
    forward_depth = len(pathlib.Path(forward_part).parts)

    return get_back_jumps(root_jumps - asset_back_jumps + forward_depth)

def download_documents_from_html(
self, html_body, output_path, path_from_html, root_from_html
):
Expand All @@ -174,8 +191,9 @@ def download_documents_from_html(
"audio_player.html",
False,
audio_path=filename,
path_to_root=root_from_html
+ len(pathlib.Path(path_from_html).parts) * "../",
path_to_root=self.get_root_from_asset(
path_from_html, root_from_html
),
audio_format=file_format,
)
filename = html_fpath.name
Expand Down
28 changes: 17 additions & 11 deletions openedx2zim/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
jinja,
jinja_init,
prepare_url,
get_back_jumps,
)
from .xblocks_extractor.chapter import Chapter
from .xblocks_extractor.course import Course
Expand Down Expand Up @@ -179,6 +180,10 @@ def __init__(
self.wiki = None
self.forum = None

@property
def instance_assets_dir(self):
    """Directory inside the build dir holding assets shared across the instance."""
    return self.build_dir / "instance_assets"

def get_course_id(self, url, course_page_name, course_prefix, instance_url):
clean_url = re.match(
instance_url + course_prefix + ".*" + course_page_name, url
Expand Down Expand Up @@ -394,14 +399,14 @@ def annex(self):
self.get_course_tabs()
logger.info("Downloading content for extra pages ...")
for page in self.annexed_pages:
root_from_html = get_back_jumps(
len(page["output_path"].relative_to(self.build_dir).parts)
)
page["content"] = self.html_processor.dl_dependencies_and_fix_links(
content=page["content"],
output_path=page["output_path"],
path_from_html="",
root_from_html=len(
page["output_path"].relative_to(self.build_dir).parts
)
* "../",
output_path=self.instance_assets_dir,
path_from_html=root_from_html + "instance_assets",
root_from_html=root_from_html,
)

logger.info("Processing book lists ...")
Expand Down Expand Up @@ -478,8 +483,8 @@ def clean_content(html_article):
self.homepage_html.append(
self.html_processor.dl_dependencies_and_fix_links(
content=article.prettify(),
output_path=self.build_dir.joinpath("home"),
path_from_html="home",
output_path=self.instance_assets_dir,
path_from_html="instance_assets",
root_from_html="",
)
)
Expand All @@ -490,8 +495,8 @@ def clean_content(html_article):
self.homepage_html.append(
self.html_processor.dl_dependencies_and_fix_links(
content=welcome_message.prettify(),
output_path=self.build_dir.joinpath("home"),
path_from_html="home",
output_path=self.instance_assets_dir,
path_from_html="instance_assets",
root_from_html="",
)
)
Expand Down Expand Up @@ -673,7 +678,7 @@ def render_booknav(self):
book_list=book_nav["book_list"],
dir_path=book_nav["dir_path"],
mooc=self,
rooturl="../../../",
rooturl=get_back_jumps(3),
)

def render(self):
Expand Down Expand Up @@ -763,6 +768,7 @@ def run(self):
)
self.instance_connection.establish_connection()
jinja_init()
self.instance_assets_dir.mkdir(exist_ok=True, parents=True)
self.html_processor = HtmlProcessor(self)
self.prepare_mooc_data()
self.parse_course_xblocks()
Expand Down
13 changes: 9 additions & 4 deletions openedx2zim/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import re
import shlex
import subprocess
import urllib
import zlib

import requests
Expand Down Expand Up @@ -78,6 +77,9 @@ def download_and_convert_subtitles(output_path, subtitles, instance_connection):
if not subtitle_file.exists():
try:
raw_subtitle = instance_connection.get_page(subtitles[lang])
if not raw_subtitle:
logger.error(f"Subtitle fetch failed from {subtitles[lang]}")
continue
subtitle = html.unescape(
re.sub(r"^0$", "1", str(raw_subtitle), flags=re.M)
)
Expand All @@ -87,9 +89,6 @@ def download_and_convert_subtitles(output_path, subtitles, instance_connection):
webvtt = WebVTT().from_srt(subtitle_file)
webvtt.save()
processed_subtitles[lang] = f"{lang}.vtt"
except urllib.error.HTTPError as exc:
if exc.code == 404 or exc.code == 403:
logger.error(f"Failed to get subtitle from {subtitles[lang]}")
except Exception as exc:
logger.error(
f"Error while converting subtitle {subtitles[lang]} : {exc}"
Expand Down Expand Up @@ -172,3 +171,9 @@ def remove_autogenerated_tags(html_string):
if html_string.endswith(search_string):
html_string = html_string[: -len(search_string)].strip()
return html_string


def get_back_jumps(nb_jumps):
    """Return a relative path made of ``nb_jumps`` consecutive "../" segments."""
    return "".join("../" for _ in range(nb_jumps))
23 changes: 12 additions & 11 deletions openedx2zim/xblocks_extractor/discussion.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from bs4 import BeautifulSoup, NavigableString

from .base_xblock import BaseXblock
from ..utils import jinja, remove_autogenerated_tags
from ..utils import jinja, remove_autogenerated_tags, get_back_jumps


class Discussion(BaseXblock):
Expand Down Expand Up @@ -42,9 +42,10 @@ def get_extra_content(self, soup):
remove_autogenerated_tags(
self.scraper.html_processor.dl_dependencies_and_fix_links(
content=str(element),
output_path=self.output_path,
path_from_html=self.folder_name,
root_from_html="../" * 5,
output_path=self.scraper.instance_assets_dir,
path_from_html=get_back_jumps(5)
+ "instance_assets",
root_from_html=get_back_jumps(5),
)
)
)
Expand All @@ -53,9 +54,9 @@ def get_extra_content(self, soup):
remove_autogenerated_tags(
self.scraper.html_processor.dl_dependencies_and_fix_links(
content=str(child_node),
output_path=self.output_path,
path_from_html=self.folder_name,
root_from_html="../" * 5,
output_path=self.scraper.instance_assets_dir,
path_from_html=get_back_jumps(5) + "instance_assets",
root_from_html=get_back_jumps(5),
)
)
)
Expand All @@ -64,9 +65,9 @@ def get_extra_content(self, soup):
remove_autogenerated_tags(
self.scraper.html_processor.dl_dependencies_and_fix_links(
content=str(child_node),
output_path=self.output_path,
path_from_html=self.folder_name,
root_from_html="../" * 5,
output_path=self.scraper.instance_assets_dir,
path_from_html=get_back_jumps(5) + "instance_assets",
root_from_html=get_back_jumps(5),
)
)
)
Expand Down Expand Up @@ -102,7 +103,7 @@ def render(self):
threads=self.data,
discussion=self,
staff_user=self.scraper.forum.staff_user,
rooturl="../" * 5,
rooturl=get_back_jumps(5),
pre_discussion_content=self.pre_discussion_content,
post_discussion_content=self.post_discussion_content,
discussion_header=self.discussion_header,
Expand Down
13 changes: 7 additions & 6 deletions openedx2zim/xblocks_extractor/drag_and_drop_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,9 @@
import json

from bs4 import BeautifulSoup
from slugify import slugify

from .base_xblock import BaseXblock
from ..utils import jinja, prepare_url
from ..utils import jinja, prepare_url, get_back_jumps


class DragAndDropV2(
Expand Down Expand Up @@ -34,18 +33,20 @@ def download(self, instance_connection):
name = pathlib.Path(item["expandedImageURL"]).name
self.scraper.download_file(
prepare_url(item["expandedImageURL"], self.scraper.instance_url),
self.output_path.joinpath(name),
self.scraper.instance_assets_dir.joinpath(name),
)
item["expandedImageURL"] = f"{self.folder_name}/{name}"
item["expandedImageURL"] = get_back_jumps(5) + f"instance_assets/{name}"
# Grid
name = pathlib.Path(self.content["target_img_expanded_url"]).name
self.scraper.download_file(
prepare_url(
self.content["target_img_expanded_url"], self.scraper.instance_url
),
self.output_path.joinpath(name),
self.scraper.instance_assets_dir.joinpath(name),
)
self.content["target_img_expanded_url"] = (
get_back_jumps(5) + f"instance_assets/{name}"
)
self.content["target_img_expanded_url"] = f"{self.folder_name}/{name}"

def render(self):
    """Render the drag-and-drop xblock template with the extracted content."""
    content = self.content
    return jinja(None, "DragAndDropV2.html", False, dragdrop_content=content)
8 changes: 4 additions & 4 deletions openedx2zim/xblocks_extractor/free_text_response.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from bs4 import BeautifulSoup

from .base_xblock import BaseXblock
from ..utils import jinja
from ..utils import jinja, get_back_jumps


class FreeTextResponse(BaseXblock):
Expand Down Expand Up @@ -35,9 +35,9 @@ def download(self, instance_connection):
html_no_answers
+ self.scraper.html_processor.dl_dependencies_and_fix_links(
content=str(soup),
output_path=self.output_path,
path_from_html=self.folder_name,
root_from_html="../" * 5,
output_path=self.scraper.instance_assets_dir,
path_from_html=get_back_jumps(5) + "instance_assets",
root_from_html=get_back_jumps(5),
)
)

Expand Down
7 changes: 4 additions & 3 deletions openedx2zim/xblocks_extractor/html.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from bs4 import BeautifulSoup

from .base_xblock import BaseXblock
from ..utils import get_back_jumps


class Html(BaseXblock):
Expand All @@ -24,9 +25,9 @@ def download(self, instance_connection):
html_content = str(soup.find("div", attrs={"class": "course-wrapper"}))
self.html = self.scraper.html_processor.dl_dependencies_and_fix_links(
content=html_content,
output_path=self.output_path,
path_from_html=self.folder_name,
root_from_html="../" * 5,
output_path=self.scraper.instance_assets_dir,
path_from_html=get_back_jumps(5) + "instance_assets",
root_from_html=get_back_jumps(5),
)

def render(self):
Expand Down
8 changes: 2 additions & 6 deletions openedx2zim/xblocks_extractor/libcast.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
from bs4 import BeautifulSoup

from .base_xblock import BaseXblock
from ..utils import (
jinja,
download_and_convert_subtitles,
prepare_url,
)
from ..utils import jinja, download_and_convert_subtitles, prepare_url, get_back_jumps


class Libcast(BaseXblock):
Expand Down Expand Up @@ -62,5 +58,5 @@ def render(self):
title=self.xblock_json["display_name"],
subs=self.subs,
autoplay=self.scraper.autoplay,
path_to_root="../" * 5,
path_to_root=get_back_jumps(5),
)
Loading