Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prevent duplicate files #108

Merged
merged 2 commits into from
Aug 14, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 21 additions & 17 deletions openedx2zim/annex.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from bs4 import BeautifulSoup

from .constants import getLogger
from .utils import jinja, markdown
from .utils import jinja, markdown, get_back_jumps

logger = getLogger()

Expand Down Expand Up @@ -157,10 +157,9 @@ def update_thread_children(self, thread):
content=markdown(children["body"]),
output_path=self.output_path.joinpath(thread["id"]),
path_from_html="",
root_from_html=len(
self.output_path.relative_to(self.scraper.build_dir).parts
)
* "../",
root_from_html=get_back_jumps(
len(self.output_path.relative_to(self.scraper.build_dir).parts)
),
)
if "children" in children:
for children_children in children["children"]:
Expand All @@ -170,10 +169,13 @@ def update_thread_children(self, thread):
content=markdown(children_children["body"]),
output_path=self.output_path.joinpath(thread["id"]),
path_from_html="",
root_from_html=len(
self.output_path.relative_to(self.scraper.build_dir).parts
)
* "../",
root_from_html=get_back_jumps(
len(
self.output_path.relative_to(
self.scraper.build_dir
).parts
)
),
)

def annex_forum(self):
Expand Down Expand Up @@ -210,10 +212,9 @@ def annex_forum(self):
content=markdown(thread["data_thread"]["content"]["body"]),
output_path=self.output_path.joinpath(thread["id"]),
path_from_html="",
root_from_html=len(
self.output_path.relative_to(self.scraper.build_dir).parts
)
* "../",
root_from_html=get_back_jumps(
len(self.output_path.relative_to(self.scraper.build_dir).parts)
),
)
self.update_thread_children(thread)

Expand Down Expand Up @@ -308,10 +309,13 @@ def update_wiki_page(self, soup, text, url, page_to_visit):
content=str(text),
output_path=self.wiki_data[url]["path"],
path_from_html="",
root_from_html=len(
self.wiki_data[url]["path"].relative_to(self.scraper.build_dir).parts
)
* "../",
root_from_html=get_back_jumps(
len(
self.wiki_data[url]["path"]
.relative_to(self.scraper.build_dir)
.parts
)
),
)
self.wiki_data[url]["title"] = soup.find("title").text
self.wiki_data[url]["last-modif"] = soup.find(
Expand Down
24 changes: 21 additions & 3 deletions openedx2zim/html_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from bs4 import BeautifulSoup

from .constants import DOWNLOADABLE_EXTENSIONS, AUDIO_FORMATS
from .utils import jinja, prepare_url
from .utils import jinja, prepare_url, get_back_jumps


class HtmlProcessor:
Expand Down Expand Up @@ -149,6 +149,23 @@ def download_images_from_html(self, html_body, output_path, path_from_html):
img.attrib["style"] = " max-width:100%"
return bool(imgs)

def get_root_from_asset(self, path_from_html, root_from_html):
    """Return the "../" chain leading from a downloaded asset back to the root.

    path_from_html: path to the asset as written in the HTML document; it may
        begin with a run of "../" back jumps.
    root_from_html: path from the HTML document to the root, expressed purely
        as repeated "../" segments.
    """
    root_jumps = root_from_html.count("../")
    asset_back_jumps = path_from_html.count("../")

    # Drop the leading back jumps to keep only the forward part of the path.
    forward_part = path_from_html[asset_back_jumps * len("../") :]
    forward_depth = len(pathlib.Path(forward_part).parts)

    return get_back_jumps(root_jumps - asset_back_jumps + forward_depth)

def download_documents_from_html(
self, html_body, output_path, path_from_html, root_from_html
):
Expand All @@ -174,8 +191,9 @@ def download_documents_from_html(
"audio_player.html",
False,
audio_path=filename,
path_to_root=root_from_html
+ len(pathlib.Path(path_from_html).parts) * "../",
path_to_root=self.get_root_from_asset(
path_from_html, root_from_html
),
audio_format=file_format,
)
filename = html_fpath.name
Expand Down
28 changes: 17 additions & 11 deletions openedx2zim/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
jinja,
jinja_init,
prepare_url,
get_back_jumps,
)
from .xblocks_extractor.chapter import Chapter
from .xblocks_extractor.course import Course
Expand Down Expand Up @@ -179,6 +180,10 @@ def __init__(
self.wiki = None
self.forum = None

@property
def instance_assets_dir(self):
    """Directory inside the build dir holding assets shared across the instance."""
    return self.build_dir / "instance_assets"

def get_course_id(self, url, course_page_name, course_prefix, instance_url):
clean_url = re.match(
instance_url + course_prefix + ".*" + course_page_name, url
Expand Down Expand Up @@ -394,14 +399,14 @@ def annex(self):
self.get_course_tabs()
logger.info("Downloading content for extra pages ...")
for page in self.annexed_pages:
root_from_html = get_back_jumps(
len(page["output_path"].relative_to(self.build_dir).parts)
)
page["content"] = self.html_processor.dl_dependencies_and_fix_links(
content=page["content"],
output_path=page["output_path"],
path_from_html="",
root_from_html=len(
page["output_path"].relative_to(self.build_dir).parts
)
* "../",
output_path=self.instance_assets_dir,
path_from_html=root_from_html + "instance_assets",
root_from_html=root_from_html,
)

logger.info("Processing book lists ...")
Expand Down Expand Up @@ -478,8 +483,8 @@ def clean_content(html_article):
self.homepage_html.append(
self.html_processor.dl_dependencies_and_fix_links(
content=article.prettify(),
output_path=self.build_dir.joinpath("home"),
path_from_html="home",
output_path=self.instance_assets_dir,
path_from_html="instance_assets",
root_from_html="",
)
)
Expand All @@ -490,8 +495,8 @@ def clean_content(html_article):
self.homepage_html.append(
self.html_processor.dl_dependencies_and_fix_links(
content=welcome_message.prettify(),
output_path=self.build_dir.joinpath("home"),
path_from_html="home",
output_path=self.instance_assets_dir,
path_from_html="instance_assets",
root_from_html="",
)
)
Expand Down Expand Up @@ -673,7 +678,7 @@ def render_booknav(self):
book_list=book_nav["book_list"],
dir_path=book_nav["dir_path"],
mooc=self,
rooturl="../../../",
rooturl=get_back_jumps(3),
)

def render(self):
Expand Down Expand Up @@ -763,6 +768,7 @@ def run(self):
)
self.instance_connection.establish_connection()
jinja_init()
self.instance_assets_dir.mkdir(exist_ok=True, parents=True)
self.html_processor = HtmlProcessor(self)
self.prepare_mooc_data()
self.parse_course_xblocks()
Expand Down
13 changes: 9 additions & 4 deletions openedx2zim/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import re
import shlex
import subprocess
import urllib
import zlib

import requests
Expand Down Expand Up @@ -78,6 +77,9 @@ def download_and_convert_subtitles(output_path, subtitles, instance_connection):
if not subtitle_file.exists():
try:
raw_subtitle = instance_connection.get_page(subtitles[lang])
if not raw_subtitle:
logger.error(f"Subtitle fetch failed from {subtitles[lang]}")
continue
subtitle = html.unescape(
re.sub(r"^0$", "1", str(raw_subtitle), flags=re.M)
)
Expand All @@ -87,9 +89,6 @@ def download_and_convert_subtitles(output_path, subtitles, instance_connection):
webvtt = WebVTT().from_srt(subtitle_file)
webvtt.save()
processed_subtitles[lang] = f"{lang}.vtt"
except urllib.error.HTTPError as exc:
if exc.code == 404 or exc.code == 403:
logger.error(f"Failed to get subtitle from {subtitles[lang]}")
except Exception as exc:
logger.error(
f"Error while converting subtitle {subtitles[lang]} : {exc}"
Expand Down Expand Up @@ -172,3 +171,9 @@ def remove_autogenerated_tags(html_string):
if html_string.endswith(search_string):
html_string = html_string[: -len(search_string)].strip()
return html_string


def get_back_jumps(nb_jumps):
    """Return a relative path made of ``nb_jumps`` consecutive "../" segments."""
    return "".join("../" for _ in range(nb_jumps))
23 changes: 12 additions & 11 deletions openedx2zim/xblocks_extractor/discussion.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from bs4 import BeautifulSoup, NavigableString

from .base_xblock import BaseXblock
from ..utils import jinja, remove_autogenerated_tags
from ..utils import jinja, remove_autogenerated_tags, get_back_jumps


class Discussion(BaseXblock):
Expand Down Expand Up @@ -42,9 +42,10 @@ def get_extra_content(self, soup):
remove_autogenerated_tags(
self.scraper.html_processor.dl_dependencies_and_fix_links(
content=str(element),
output_path=self.output_path,
path_from_html=self.folder_name,
root_from_html="../" * 5,
output_path=self.scraper.instance_assets_dir,
path_from_html=get_back_jumps(5)
+ "instance_assets",
root_from_html=get_back_jumps(5),
)
)
)
Expand All @@ -53,9 +54,9 @@ def get_extra_content(self, soup):
remove_autogenerated_tags(
self.scraper.html_processor.dl_dependencies_and_fix_links(
content=str(child_node),
output_path=self.output_path,
path_from_html=self.folder_name,
root_from_html="../" * 5,
output_path=self.scraper.instance_assets_dir,
path_from_html=get_back_jumps(5) + "instance_assets",
root_from_html=get_back_jumps(5),
)
)
)
Expand All @@ -64,9 +65,9 @@ def get_extra_content(self, soup):
remove_autogenerated_tags(
self.scraper.html_processor.dl_dependencies_and_fix_links(
content=str(child_node),
output_path=self.output_path,
path_from_html=self.folder_name,
root_from_html="../" * 5,
output_path=self.scraper.instance_assets_dir,
path_from_html=get_back_jumps(5) + "instance_assets",
root_from_html=get_back_jumps(5),
)
)
)
Expand Down Expand Up @@ -102,7 +103,7 @@ def render(self):
threads=self.data,
discussion=self,
staff_user=self.scraper.forum.staff_user,
rooturl="../" * 5,
rooturl=get_back_jumps(5),
pre_discussion_content=self.pre_discussion_content,
post_discussion_content=self.post_discussion_content,
discussion_header=self.discussion_header,
Expand Down
13 changes: 7 additions & 6 deletions openedx2zim/xblocks_extractor/drag_and_drop_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,9 @@
import json

from bs4 import BeautifulSoup
from slugify import slugify

from .base_xblock import BaseXblock
from ..utils import jinja, prepare_url
from ..utils import jinja, prepare_url, get_back_jumps


class DragAndDropV2(
Expand Down Expand Up @@ -34,18 +33,20 @@ def download(self, instance_connection):
name = pathlib.Path(item["expandedImageURL"]).name
self.scraper.download_file(
prepare_url(item["expandedImageURL"], self.scraper.instance_url),
self.output_path.joinpath(name),
self.scraper.instance_assets_dir.joinpath(name),
)
item["expandedImageURL"] = f"{self.folder_name}/{name}"
item["expandedImageURL"] = get_back_jumps(5) + f"instance_assets/{name}"
# Grid
name = pathlib.Path(self.content["target_img_expanded_url"]).name
self.scraper.download_file(
prepare_url(
self.content["target_img_expanded_url"], self.scraper.instance_url
),
self.output_path.joinpath(name),
self.scraper.instance_assets_dir.joinpath(name),
)
self.content["target_img_expanded_url"] = (
get_back_jumps(5) + f"instance_assets/{name}"
)
self.content["target_img_expanded_url"] = f"{self.folder_name}/{name}"

def render(self):
    """Render the drag-and-drop xblock template with the extracted content."""
    content = self.content
    return jinja(None, "DragAndDropV2.html", False, dragdrop_content=content)
8 changes: 4 additions & 4 deletions openedx2zim/xblocks_extractor/free_text_response.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from bs4 import BeautifulSoup

from .base_xblock import BaseXblock
from ..utils import jinja
from ..utils import jinja, get_back_jumps


class FreeTextResponse(BaseXblock):
Expand Down Expand Up @@ -35,9 +35,9 @@ def download(self, instance_connection):
html_no_answers
+ self.scraper.html_processor.dl_dependencies_and_fix_links(
content=str(soup),
output_path=self.output_path,
path_from_html=self.folder_name,
root_from_html="../" * 5,
output_path=self.scraper.instance_assets_dir,
path_from_html=get_back_jumps(5) + "instance_assets",
root_from_html=get_back_jumps(5),
)
)

Expand Down
7 changes: 4 additions & 3 deletions openedx2zim/xblocks_extractor/html.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from bs4 import BeautifulSoup

from .base_xblock import BaseXblock
from ..utils import get_back_jumps


class Html(BaseXblock):
Expand All @@ -24,9 +25,9 @@ def download(self, instance_connection):
html_content = str(soup.find("div", attrs={"class": "course-wrapper"}))
self.html = self.scraper.html_processor.dl_dependencies_and_fix_links(
content=html_content,
output_path=self.output_path,
path_from_html=self.folder_name,
root_from_html="../" * 5,
output_path=self.scraper.instance_assets_dir,
path_from_html=get_back_jumps(5) + "instance_assets",
root_from_html=get_back_jumps(5),
)

def render(self):
Expand Down
8 changes: 2 additions & 6 deletions openedx2zim/xblocks_extractor/libcast.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
from bs4 import BeautifulSoup

from .base_xblock import BaseXblock
from ..utils import (
jinja,
download_and_convert_subtitles,
prepare_url,
)
from ..utils import jinja, download_and_convert_subtitles, prepare_url, get_back_jumps


class Libcast(BaseXblock):
Expand Down Expand Up @@ -62,5 +58,5 @@ def render(self):
title=self.xblock_json["display_name"],
subs=self.subs,
autoplay=self.scraper.autoplay,
path_to_root="../" * 5,
path_to_root=get_back_jumps(5),
)
Loading