
Commit

Fixes #104 - Prevent duplicates by downloading everything to single location
satyamtg committed Aug 13, 2020
1 parent 0adf585 commit 493426b
Showing 9 changed files with 56 additions and 37 deletions.
22 changes: 20 additions & 2 deletions openedx2zim/html_processor.py
@@ -149,6 +149,23 @@ def download_images_from_html(self, html_body, output_path, path_from_html):
img.attrib["style"] = " max-width:100%"
return bool(imgs)

+ def get_root_from_asset(self, path_from_html, root_from_html):
+     """ get path to root from the downloaded/generated asset """
+
+     nb_jumps_root_from_html = root_from_html.count("../")
+     nb_back_jumps_output_path = path_from_html.count("../")
+
+     # the path to the asset from HTML, minus the back jumps
+     path_without_back_jumps = path_from_html[
+         (nb_back_jumps_output_path) * len("../") :
+     ]
+
+     return (
+         nb_jumps_root_from_html
+         - nb_back_jumps_output_path
+         + len(pathlib.Path(path_without_back_jumps).parts)
+     ) * "../"
+
def download_documents_from_html(
self, html_body, output_path, path_from_html, root_from_html
):
@@ -174,8 +191,9 @@ def download_documents_from_html
"audio_player.html",
False,
audio_path=filename,
- path_to_root=root_from_html
- + len(pathlib.Path(path_from_html).parts) * "../",
+ path_to_root=self.get_root_from_asset(
+     path_from_html, root_from_html
+ ),
audio_format=file_format,
)
filename = html_fpath.name
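A quick standalone check of the new helper (a copy for illustration, not part of this commit), using the two argument pairs that appear later in this diff: an xblock page five levels below the build dir and a homepage fragment at the root. In both cases an asset stored under instance_assets/ reaches the ZIM root with a single back-jump.

import pathlib

def get_root_from_asset(path_from_html, root_from_html):
    # mirrors HtmlProcessor.get_root_from_asset added above
    nb_jumps_root_from_html = root_from_html.count("../")
    nb_back_jumps_output_path = path_from_html.count("../")
    path_without_back_jumps = path_from_html[nb_back_jumps_output_path * len("../") :]
    return (
        nb_jumps_root_from_html
        - nb_back_jumps_output_path
        + len(pathlib.Path(path_without_back_jumps).parts)
    ) * "../"

print(get_root_from_asset("../" * 5 + "instance_assets", "../" * 5))  # -> ../
print(get_root_from_asset("instance_assets", ""))                     # -> ../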
25 changes: 15 additions & 10 deletions openedx2zim/scraper.py
@@ -179,6 +179,10 @@ def __init__(
self.wiki = None
self.forum = None

+ @property
+ def instance_assets_dir(self):
+     return self.build_dir.joinpath("instance_assets")
+
def get_course_id(self, url, course_page_name, course_prefix, instance_url):
clean_url = re.match(
instance_url + course_prefix + ".*" + course_page_name, url
@@ -394,14 +398,14 @@ def annex(self):
self.get_course_tabs()
logger.info("Downloading content for extra pages ...")
for page in self.annexed_pages:
+ root_from_html = (
+     len(page["output_path"].relative_to(self.build_dir).parts) * "../"
+ )
page["content"] = self.html_processor.dl_dependencies_and_fix_links(
content=page["content"],
- output_path=page["output_path"],
- path_from_html="",
- root_from_html=len(
-     page["output_path"].relative_to(self.build_dir).parts
- )
- * "../",
+ output_path=self.instance_assets_dir,
+ path_from_html=root_from_html + "instance_assets",
+ root_from_html=root_from_html,
)

logger.info("Processing book lists ...")
@@ -478,8 +482,8 @@ def clean_content(html_article):
self.homepage_html.append(
self.html_processor.dl_dependencies_and_fix_links(
content=article.prettify(),
- output_path=self.build_dir.joinpath("home"),
- path_from_html="home",
+ output_path=self.instance_assets_dir,
+ path_from_html="instance_assets",
root_from_html="",
)
)
@@ -490,8 +494,8 @@
self.homepage_html.append(
self.html_processor.dl_dependencies_and_fix_links(
content=welcome_message.prettify(),
- output_path=self.build_dir.joinpath("home"),
- path_from_html="home",
+ output_path=self.instance_assets_dir,
+ path_from_html="instance_assets",
root_from_html="",
)
)
@@ -763,6 +767,7 @@ def run(self):
)
self.instance_connection.establish_connection()
jinja_init()
+ self.instance_assets_dir.mkdir(exist_ok=True, parents=True)
self.html_processor = HtmlProcessor(self)
self.prepare_mooc_data()
self.parse_course_xblocks()
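For the annexed pages, both relative paths are now derived once from the page's depth below the build dir, while the downloaded files themselves all land in the shared instance_assets/ folder — which is what deduplicates assets referenced from several pages (#104). A small sketch with a hypothetical page location (the real output_path values come from get_course_tabs):

import pathlib

build_dir = pathlib.Path("build")
output_path = build_dir / "tabs" / "course_progress"  # hypothetical annexed page

root_from_html = len(output_path.relative_to(build_dir).parts) * "../"  # "../../"
path_from_html = root_from_html + "instance_assets"                     # "../../instance_assets"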
6 changes: 3 additions & 3 deletions openedx2zim/utils.py
@@ -78,6 +78,9 @@ def download_and_convert_subtitles(output_path, subtitles, instance_connection):
if not subtitle_file.exists():
try:
raw_subtitle = instance_connection.get_page(subtitles[lang])
+ if not raw_subtitle:
+     logger.error(f"Subtitle fetch failed from {subtitles[lang]}")
+     continue
subtitle = html.unescape(
re.sub(r"^0$", "1", str(raw_subtitle), flags=re.M)
)
@@ -87,9 +90,6 @@ def download_and_convert_subtitles(output_path, subtitles, instance_connection):
webvtt = WebVTT().from_srt(subtitle_file)
webvtt.save()
processed_subtitles[lang] = f"{lang}.vtt"
- except urllib.error.HTTPError as exc:
-     if exc.code == 404 or exc.code == 403:
-         logger.error(f"Failed to get subtitle from {subtitles[lang]}")
except Exception as exc:
logger.error(
f"Error while converting subtitle {subtitles[lang]} : {exc}"
12 changes: 6 additions & 6 deletions openedx2zim/xblocks_extractor/discussion.py
@@ -42,8 +42,8 @@ def get_extra_content(self, soup):
remove_autogenerated_tags(
self.scraper.html_processor.dl_dependencies_and_fix_links(
content=str(element),
- output_path=self.output_path,
- path_from_html=self.folder_name,
+ output_path=self.scraper.instance_assets_dir,
+ path_from_html="../" * 5 + "instance_assets",
root_from_html="../" * 5,
)
)
@@ -53,8 +53,8 @@ def get_extra_content(self, soup):
remove_autogenerated_tags(
self.scraper.html_processor.dl_dependencies_and_fix_links(
content=str(child_node),
- output_path=self.output_path,
- path_from_html=self.folder_name,
+ output_path=self.scraper.instance_assets_dir,
+ path_from_html="../" * 5 + "instance_assets",
root_from_html="../" * 5,
)
)
@@ -64,8 +64,8 @@
remove_autogenerated_tags(
self.scraper.html_processor.dl_dependencies_and_fix_links(
content=str(child_node),
- output_path=self.output_path,
- path_from_html=self.folder_name,
+ output_path=self.scraper.instance_assets_dir,
+ path_from_html="../" * 5 + "instance_assets",
root_from_html="../" * 5,
)
)
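All of the xblock extractors below now pass the same pair of relative paths: the generated pages sit five directories below the build root, so they reach both the ZIM root and the shared asset folder with five back-jumps. A short illustration (the directory names are hypothetical):

# xblock pages are assumed to be written five levels below the build dir, e.g.
# build/<course>/<section>/<subsection>/<unit>/<xblock>/index.html
root_from_html = "../" * 5                      # page -> ZIM root
path_from_html = "../" * 5 + "instance_assets"  # page -> shared asset folder

assert path_from_html == "../../../../../instance_assets"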
9 changes: 4 additions & 5 deletions openedx2zim/xblocks_extractor/drag_and_drop_v2.py
@@ -2,7 +2,6 @@
import json

from bs4 import BeautifulSoup
- from slugify import slugify

from .base_xblock import BaseXblock
from ..utils import jinja, prepare_url
@@ -34,18 +33,18 @@ def download(self, instance_connection):
name = pathlib.Path(item["expandedImageURL"]).name
self.scraper.download_file(
prepare_url(item["expandedImageURL"], self.scraper.instance_url),
- self.output_path.joinpath(name),
+ self.scraper.instance_assets_dir.joinpath(name),
)
- item["expandedImageURL"] = f"{self.folder_name}/{name}"
+ item["expandedImageURL"] = "../" * 5 + f"instance_assets/{name}"
# Grid
name = pathlib.Path(self.content["target_img_expanded_url"]).name
self.scraper.download_file(
prepare_url(
self.content["target_img_expanded_url"], self.scraper.instance_url
),
- self.output_path.joinpath(name),
+ self.scraper.instance_assets_dir.joinpath(name),
)
- self.content["target_img_expanded_url"] = f"{self.folder_name}/{name}"
+ self.content["target_img_expanded_url"] = "../" * 5 + f"instance_assets/{name}"

def render(self):
return jinja(None, "DragAndDropV2.html", False, dragdrop_content=self.content)
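The drag-and-drop assets follow the same convention, except the rewriting happens inside the JSON handed to the DragAndDropV2.html template rather than in HTML. A minimal sketch with a hypothetical image URL:

import pathlib

# hypothetical value; real URLs come from the xblock's JSON content
expanded_image_url = "/asset-v1:demo+type@asset+block/dnd_target.png"
instance_assets_dir = pathlib.Path("build/instance_assets")

name = pathlib.Path(expanded_image_url).name
destination = instance_assets_dir.joinpath(name)    # file saved once: build/instance_assets/dnd_target.png
item_url = "../" * 5 + f"instance_assets/{name}"    # JSON points back: ../../../../../instance_assets/dnd_target.png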
4 changes: 2 additions & 2 deletions openedx2zim/xblocks_extractor/free_text_response.py
@@ -35,8 +35,8 @@ def download(self, instance_connection):
html_no_answers
+ self.scraper.html_processor.dl_dependencies_and_fix_links(
content=str(soup),
- output_path=self.output_path,
- path_from_html=self.folder_name,
+ output_path=self.scraper.instance_assets_dir,
+ path_from_html="../" * 5 + "instance_assets",
root_from_html="../" * 5,
)
)
4 changes: 2 additions & 2 deletions openedx2zim/xblocks_extractor/html.py
@@ -24,8 +24,8 @@ def download(self, instance_connection):
html_content = str(soup.find("div", attrs={"class": "course-wrapper"}))
self.html = self.scraper.html_processor.dl_dependencies_and_fix_links(
content=html_content,
- output_path=self.output_path,
- path_from_html=self.folder_name,
+ output_path=self.scraper.instance_assets_dir,
+ path_from_html="../" * 5 + "instance_assets",
root_from_html="../" * 5,
)

4 changes: 2 additions & 2 deletions openedx2zim/xblocks_extractor/problem.py
@@ -120,8 +120,8 @@ def download(self, instance_connection):
# process final HTML content
html_content = self.scraper.html_processor.dl_dependencies_and_fix_links(
content=str(soup.find("div", attrs={"class": "problem"})),
- output_path=self.output_path,
- path_from_html=self.folder_name,
+ output_path=self.scraper.instance_assets_dir,
+ path_from_html="../" * 5 + "instance_assets",
root_from_html="../" * 5,
)

7 changes: 2 additions & 5 deletions openedx2zim/xblocks_extractor/vertical.py
@@ -28,9 +28,6 @@ def __init__(
self.icon_type = "fa-book"

def download(self, instance_connection):
- instance_assets_path = self.scraper.build_dir.joinpath("instance_assets")
- instance_assets_path.mkdir(parents=True, exist_ok=True)
-
# get the LMS content for the vertical
content = instance_connection.get_page(self.xblock_json["lms_web_url"])
soup = BeautifulSoup(content, "lxml")
@@ -47,7 +44,7 @@
remove_autogenerated_tags(
self.scraper.html_processor.dl_dependencies_and_fix_links(
content=str(header_element),
- output_path=instance_assets_path,
+ output_path=self.scraper.instance_assets_dir,
path_from_html=f"{self.root_url}instance_assets",
root_from_html=self.root_url,
)
@@ -62,7 +59,7 @@
remove_autogenerated_tags(
self.scraper.html_processor.dl_dependencies_and_fix_links(
content=str(script),
- output_path=instance_assets_path,
+ output_path=self.scraper.instance_assets_dir,
path_from_html=f"{self.root_url}instance_assets",
root_from_html=self.root_url,
)
