forked from coursera-dl/edx-dl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
test_edx_dl.py
151 lines (119 loc) · 5.61 KB
/
test_edx_dl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pytest
from edx_dl import edx_dl, parsing
from edx_dl.common import Unit, Video, DEFAULT_FILE_FORMATS
def test_failed_login():
    """
    Logging in with the bogus "guest"/"guest" pair must not be reported
    as successful in the login response.
    """
    request_headers = edx_dl.edx_get_headers()
    login_response = edx_dl.edx_login(edx_dl.LOGIN_API, request_headers,
                                      "guest", "guest")
    # A missing 'success' key is treated the same as an explicit failure.
    login_succeeded = login_response.get('success', False)
    assert not login_succeeded
def test_remove_repeated_urls():
    """
    Extract the units from the multiple-units HTML fixture and check that
    remove_repeated_urls lowers the URL count from 18 to 16.
    """
    fixture_path = "test/html/multiple_units.html"
    with open(fixture_path, "r") as html_file:
        page_html = html_file.read()

    extractor = parsing.CurrentEdXPageExtractor()
    extracted_units = extractor.extract_units_from_html(
        page_html, 'https://courses.edx.org', DEFAULT_FILE_FORMATS)

    # The fixture path doubles as the key of the single-entry units dict.
    units_by_key = {fixture_path: extracted_units}
    deduplicated_units = edx_dl.remove_repeated_urls(units_by_key)

    urls_before = edx_dl.num_urls_in_units_dict(units_by_key)
    urls_after = edx_dl.num_urls_in_units_dict(deduplicated_units)

    assert urls_before == 18
    assert urls_after == 16
    assert urls_before != urls_after
@pytest.fixture
def all_units():
    """
    Provide a units dict with one empty section and one section of three
    units: a unit without videos, a unit whose only video has no mp4 URLs,
    and a unit whose video has two mp4 URLs plus one resource URL.
    """
    unit_without_videos = Unit(videos=[], resources_urls=[])

    video_without_mp4 = Video(video_youtube_url=None,
                              available_subs_url=None,
                              sub_template_url=None,
                              mp4_urls=[])
    unit_without_urls = Unit(videos=[video_without_mp4], resources_urls=[])

    video_with_mp4 = Video(video_youtube_url=None,
                           available_subs_url=None,
                           sub_template_url=None,
                           mp4_urls=['1', '2'])
    unit_with_urls = Unit(videos=[video_with_mp4], resources_urls=['3'])

    return {
        'empty_section': [],
        'nonempty_section': [
            unit_without_videos,
            unit_without_urls,
            unit_with_urls,
        ],
    }
@pytest.fixture
def unknown_units():
    """
    Provide a units dict whose only section holds a plain string where a
    Unit is expected.
    """
    not_a_unit = 'shouldfail'
    return {'nonempty_section': [not_a_unit]}
@pytest.fixture
def unknown_videos():
    """
    Provide a units dict whose only Unit lists a plain string where a
    Video is expected.
    """
    unit_with_bad_video = Unit(videos=['shoudfail'], resources_urls=['3'])
    return {'nonempty_section': [unit_with_bad_video]}
def test_extract_urls_from_units(all_units):
    """
    URLs have to be collected from the mp4_urls of the videos as well as
    from the resources_urls of the Unit class.
    """
    url_format = '%(url)s'
    extracted = edx_dl.extract_urls_from_units(all_units, url_format)
    wanted = ['1\n', '2\n', '3\n']
    # Compare order-insensitively: only the set of produced lines matters.
    assert sorted(extracted) == sorted(wanted)
def test_extract_urls_from_units_unknown_units(unknown_units):
    """
    Anything other than a Unit in the list of units must be rejected
    with a TypeError.
    """
    url_format = '%(url)s'
    with pytest.raises(TypeError):
        edx_dl.extract_urls_from_units(unknown_units, url_format)
def test_extract_urls_from_units_unknown_videos(unknown_videos):
    """
    Anything other than a Video in the videos of a Unit must be rejected
    with a TypeError.
    """
    url_format = '%(url)s'
    with pytest.raises(TypeError):
        edx_dl.extract_urls_from_units(unknown_videos, url_format)
def test_edx_get_subtitle():
    """
    Make sure Stanford subtitle URLs are distinguished from EdX ones.

    For the Stanford (lagunita) transcript URL the result must equal what
    the plain page fetcher returns; for any other URL the result must be
    the SRT text rendered from the JSON fetcher's start/end/text payload.
    """
    # Both mocks read `url` and `headers` from the enclosing scope when
    # they are *called*, so rebinding `url` further down retargets them.
    def mock_get_page_contents(u, h):
        assert u == url
        assert h == headers
        return u

    def mock_get_page_contents_as_json(u, h):
        assert u == url
        assert h == headers
        return {'start': [123], 'end': [456], 'text': ["subtitle content"]}

    url = "https://lagunita.stanford.edu/courses/Engineering/QMSE02./Winter2016/xblock/i4x:;_;_Engineering;_QMSE02.;_video;_7f4f16e3eb294538aa8db4c43877132b/handler/transcript/download"
    headers = {}
    expected = url
    actual = edx_dl.edx_get_subtitle(url, headers, mock_get_page_contents, mock_get_page_contents_as_json)
    assert expected == actual

    # Make sure Non-Stanford URLs still work
    url = "https://www.edx.org/could/be/more/realistic"
    expected = '0\n00:00:00,123 --> 00:00:00,456\nsubtitle content\n\n'
    actual = edx_dl.edx_get_subtitle(url, headers, mock_get_page_contents, mock_get_page_contents_as_json)
    assert expected == actual
def test_extract_subtitle_urls():
    """
    extract_subtitle_urls must return a pair whose first item is None and
    whose second item is the transcript download link from the snippet,
    made absolute against the given base URL.
    """
    # The snippet is kept flush-left so the literal's content is unchanged.
    html_snippet = """
<li class="video-tracks video-download-button">
<a href="/courses/Engineering/QMSE02./Winter2016/xblock/i4x:;_;_Engineering;_QMSE02.;_video;_1a4c7ff41e484a15927987b745a5c779/handler/transcript/download">Download transcript</a>
<div class="a11y-menu-container">
<a class="a11y-menu-button" href="#" title=".srt" role="button" aria-disabled="false">.srt</a>
<ol class="a11y-menu-list" role="menu">
<li class="a11y-menu-item active">
<a class="a11y-menu-item-link" href="#srt" title="SubRip (.srt) file" data-value="srt" role="menuitem" aria-disabled="false">
SubRip (.srt) file
</a>
</li>
<li class="a11y-menu-item">
<a class="a11y-menu-item-link" href="#txt" title="Text (.txt) file" data-value="txt" role="menuitem" aria-disabled="false">
Text (.txt) file
</a>
</li>
</ol>
</div>
</li>
"""
    extractor = parsing.CurrentEdXPageExtractor()
    transcript_url = 'https://base.url/courses/Engineering/QMSE02./Winter2016/xblock/i4x:;_;_Engineering;_QMSE02.;_video;_1a4c7ff41e484a15927987b745a5c779/handler/transcript/download'
    expected = (None, transcript_url)
    actual = extractor.extract_subtitle_urls(html_snippet, "https://base.url")
    print("actual", actual)
    assert expected == actual