From efaef4510ed333d5a1c12359c466162a2e780920 Mon Sep 17 00:00:00 2001 From: connorhaugh <49422820+connorhaugh@users.noreply.github.com> Date: Mon, 8 Jan 2024 18:14:03 -0500 Subject: [PATCH] fix: protect video transcript handler from scraper (#34017) Web scrapers do annoying stuff like visit urls they shouldn't know about and cause xblock handlers to break. I tested this by: Making sure video transcripts worked as normal while logged in making sure that I got no 500s in my logs while attempting to view it logged out. --- .../courseware/tests/test_video_handlers.py | 46 +++++++++++++------ xmodule/video_block/video_handlers.py | 7 +++ 2 files changed, 39 insertions(+), 14 deletions(-) diff --git a/lms/djangoapps/courseware/tests/test_video_handlers.py b/lms/djangoapps/courseware/tests/test_video_handlers.py index 3af1de706d4a..ef0fb862833d 100644 --- a/lms/djangoapps/courseware/tests/test_video_handlers.py +++ b/lms/djangoapps/courseware/tests/test_video_handlers.py @@ -12,9 +12,12 @@ import freezegun from django.core.files.base import ContentFile from django.utils.timezone import now +from django.test import RequestFactory from edxval import api +from xblock.django.request import DjangoWebobRequest from webob import Request, Response +from common.djangoapps.student.tests.factories import UserFactory from common.test.utils import normalize_repr from openedx.core.djangoapps.contentserver.caching import del_cached_content from xmodule.contentstore.content import StaticContent # lint-amnesty, pylint: disable=wrong-import-order @@ -36,6 +39,7 @@ from .helpers import BaseTestXmodule from .test_video_xml import SOURCE_XML +TEST_USERNAME = 'test-user' TRANSCRIPT = {"start": [10], "end": [100], "text": ["Hi, welcome to Edx."]} BUMPER_TRANSCRIPT = {"start": [1], "end": [10], "text": ["A bumper"]} SRT_content = textwrap.dedent(""" @@ -574,6 +578,16 @@ def test_download_fallback_transcript(self, mock_get_video_transcript_data): assert response.headers[attribute] == value +def _create_djangowebobrequest_object_for_url(url: str) -> DjangoWebobRequest: + """ + create a DjangoWebobRequest (The type of requests used by xblocks) + with a relevant user, + """ + django_request = RequestFactory().get(url) + django_request.user = UserFactory.create(username=TEST_USERNAME) + return DjangoWebobRequest(django_request) + + @ddt.ddt class TestTranscriptTranslationGetDispatch(TestVideo): # lint-amnesty, pylint: disable=test-inherits-tests """ @@ -620,7 +634,7 @@ def setUp(self): ) @ddt.unpack def test_translation_fails(self, url, dispatch, status_code): - request = Request.blank(url) + request = _create_djangowebobrequest_object_for_url(url) response = self.block.transcript(request=request, dispatch=dispatch) assert response.status == status_code @@ -635,7 +649,7 @@ def test_translaton_en_youtube_success(self, url, dispatch, attach): subs_id = _get_subs_id(good_sjson.name) attach(self.block, subs_id) - request = Request.blank(url.format(subs_id)) + request = _create_djangowebobrequest_object_for_url(url.format(subs_id)) response = self.block.transcript(request=request, dispatch=dispatch) self.assertDictEqual(json.loads(response.body.decode('utf-8')), subs) @@ -655,12 +669,12 @@ def test_translation_non_en_youtube_success(self): self.block.youtube_id_1_0 = subs_id self.block.youtube_id_0_75 = '0_75' self.store.update_item(self.block, self.user.id) - request = Request.blank(f'/translation/uk?videoId={subs_id}') + request = _create_djangowebobrequest_object_for_url(f'/translation/uk?videoId={subs_id}') response = self.block.transcript(request=request, dispatch='translation/uk') self.assertDictEqual(json.loads(response.body.decode('utf-8')), subs) # 0_75 subs are exist - request = Request.blank('/translation/uk?videoId={}'.format('0_75')) + request = _create_djangowebobrequest_object_for_url('/translation/uk?videoId={}'.format('0_75')) response = self.block.transcript(request=request, dispatch='translation/uk') calculated_0_75 = { 'end': [75], @@ -674,7 +688,7 @@ def test_translation_non_en_youtube_success(self): # 1_5 will be generated from 1_0 self.block.youtube_id_1_5 = '1_5' self.store.update_item(self.block, self.user.id) - request = Request.blank('/translation/uk?videoId={}'.format('1_5')) + request = _create_djangowebobrequest_object_for_url('/translation/uk?videoId={}'.format('1_5')) response = self.block.transcript(request=request, dispatch='translation/uk') calculated_1_5 = { 'end': [150], @@ -696,7 +710,7 @@ def test_translaton_en_html5_success(self, url, dispatch, attach): attach(self.block, subs_id) self.store.update_item(self.block, self.user.id) - request = Request.blank(url) + request = _create_djangowebobrequest_object_for_url(url) response = self.block.transcript(request=request, dispatch=dispatch) self.assertDictEqual(json.loads(response.body.decode('utf-8')), TRANSCRIPT) @@ -713,7 +727,7 @@ def test_translaton_non_en_html5_success(self): # manually clean youtube_id_1_0, as it has default value self.block.youtube_id_1_0 = "" - request = Request.blank('/translation/uk') + request = _create_djangowebobrequest_object_for_url('/translation/uk') response = self.block.transcript(request=request, dispatch='translation/uk') self.assertDictEqual(json.loads(response.body.decode('utf-8')), subs) @@ -731,21 +745,21 @@ def test_translation_static_transcript_xml_with_data_dirc(self): self.block.runtime.modulestore = test_modulestore # Test youtube style en - request = Request.blank('/translation/en?videoId=12345') + request = _create_djangowebobrequest_object_for_url('/translation/en?videoId=12345') response = self.block.transcript(request=request, dispatch='translation/en') assert response.status == '307 Temporary Redirect' assert ('Location', '/static/dummy/static/subs_12345.srt.sjson') in response.headerlist # Test HTML5 video style self.block.sub = 'OEoXaMPEzfM' - request = Request.blank('/translation/en') + request = _create_djangowebobrequest_object_for_url('/translation/en') response = self.block.transcript(request=request, dispatch='translation/en') assert response.status == '307 Temporary Redirect' assert ('Location', '/static/dummy/static/subs_OEoXaMPEzfM.srt.sjson') in response.headerlist # Test different language to ensure we are just ignoring it since we can't # translate with static fallback - request = Request.blank('/translation/uk') + request = _create_djangowebobrequest_object_for_url('/translation/uk') response = self.block.transcript(request=request, dispatch='translation/uk') assert response.status == '404 Not Found' @@ -773,7 +787,7 @@ def test_translation_static_transcript(self, url, dispatch, status_code, sub=Non if attach: attach(self.block, sub) - request = Request.blank(url) + request = _create_djangowebobrequest_object_for_url(url) response = self.block.transcript(request=request, dispatch=dispatch) assert response.status == status_code if sub: @@ -788,7 +802,7 @@ def test_translation_static_non_course(self, __): self._set_static_asset_path() # When course_id is not mocked out, these values would result in 307, as tested above. - request = Request.blank('/translation/en?videoId=12345') + request = _create_djangowebobrequest_object_for_url('/translation/en?videoId=12345') response = self.block.transcript(request=request, dispatch='translation/en') assert response.status == '404 Not Found' @@ -819,7 +833,8 @@ def test_translation_fallback_transcript(self, mock_get_video_transcript_data): mock_get_video_transcript_data.return_value = transcript # Make request to XModule transcript handler - response = self.block.transcript(request=Request.blank('/translation/en'), dispatch='translation/en') + request = _create_djangowebobrequest_object_for_url('/translation/en') + response = self.block.transcript(request=request, dispatch='translation/en') # Expected headers expected_headers = { @@ -840,7 +855,10 @@ def test_translation_fallback_transcript_feature_disabled(self): Verify that val transcript is not returned when its feature is disabled. """ # Make request to XModule transcript handler - response = self.block.transcript(request=Request.blank('/translation/en'), dispatch='translation/en') + response = self.block.transcript( + request=_create_djangowebobrequest_object_for_url('/translation/en'), + dispatch='translation/en' + ) # Assert the actual response assert response.status_code == 404 diff --git a/xmodule/video_block/video_handlers.py b/xmodule/video_block/video_handlers.py index c75171151ba4..f79820603043 100644 --- a/xmodule/video_block/video_handlers.py +++ b/xmodule/video_block/video_handlers.py @@ -315,6 +315,12 @@ def transcript(self, request, dispatch): if dispatch.startswith('translation'): language = dispatch.replace('translation', '').strip('/') + # Because scrapers hit video blocks, verify that a user exists. + # use the _request attr to get the django request object. + if not request._request.user: # pylint: disable=protected-access + log.info("Transcript: user must be logged or public view enabled to get transcript") + return Response(status=403) + if not language: log.info("Invalid /translation request: no language.") return Response(status=400) @@ -324,6 +330,7 @@ def transcript(self, request, dispatch): return Response(status=404) if language != self.transcript_language: + self.transcript_language = language try: