This repository has been archived by the owner on May 1, 2024. It is now read-only.

WIP-Spark Tasks #476

Status: Open. This pull request wants to merge 14 commits into base: master.
5 changes: 5 additions & 0 deletions config/devstack.cfg
@@ -131,3 +131,8 @@ api_root_url = http://localhost:8000/api/courses/v1/courses/

[course-blocks]
api_root_url = http://localhost:8000/api/courses/v1/blocks/

[spark]
driver-memory=3g
executor-memory=3g
executor-cores=1
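
These three keys are the standard Spark resource settings. Below is a minimal sketch of how a task might read this [spark] section with luigi's config API and map it onto the usual spark-submit flags; the forwarding itself presumably lives in the new SparkJobTask base (not rendered in this diff), so everything here except the config keys above is an assumption.

import luigi.configuration

# Illustrative only: read the [spark] section and build spark-submit arguments.
# The actual wiring is in edx/analytics/tasks/common/spark.py, which this diff
# does not render; the defaults here are invented.
config = luigi.configuration.get_config()
spark_submit_args = [
    'spark-submit',
    '--driver-memory', config.get('spark', 'driver-memory', '2g'),
    '--executor-memory', config.get('spark', 'executor-memory', '2g'),
    '--executor-cores', config.get('spark', 'executor-cores', '1'),
]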
480 changes: 480 additions & 0 deletions edx/analytics/tasks/common/spark.py

Large diffs are not rendered by default.
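
Since the 480-line file is not rendered, here is a loudly hypothetical skeleton of what edx/analytics/tasks/common/spark.py might provide, inferred only from how LastDailyIpAddressOfUserTaskSpark uses it below (a spark_job() hook, a _spark session attribute, and run() invoked via super). The real implementation certainly differs.

import luigi

class SparkJobTask(luigi.Task):
    """Hypothetical sketch of a base task that owns a SparkSession and
    delegates the actual work to a spark_job() hook."""

    def run(self):
        from pyspark.sql import SparkSession
        # Presumably configured from the [spark] section shown above.
        self._spark = SparkSession.builder.appName(type(self).__name__).getOrCreate()
        try:
            self.spark_job()
        finally:
            self._spark.stop()

    def spark_job(self, *args):
        raise NotImplementedError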

79 changes: 79 additions & 0 deletions edx/analytics/tasks/insights/location_per_course.py
@@ -15,6 +15,7 @@
from edx.analytics.tasks.common.pathutil import (
    EventLogSelectionDownstreamMixin, EventLogSelectionMixin, PathSelectionByDateIntervalTask
)
from edx.analytics.tasks.common.spark import EventLogSelectionMixinSpark, SparkJobTask
from edx.analytics.tasks.insights.database_imports import ImportStudentCourseEnrollmentTask
from edx.analytics.tasks.util import eventlog
from edx.analytics.tasks.util.decorators import workflow_entry_point
@@ -163,6 +164,84 @@ def run(self):
        target.open("w").close()  # touch the file


class LastDailyIpAddressOfUserTaskSpark(EventLogSelectionMixinSpark, WarehouseMixin, SparkJobTask):
    """Spark alternative to LastDailyIpAddressOfUserTask."""

    output_parent_dirname = 'last_ip_of_user_id'
    marker = luigi.Parameter(
        config_path={'section': 'map-reduce', 'name': 'marker'},
        significant=False,
        description='A URL location to a directory where a marker file will be written on task completion.',
    )

    def output_dir(self):
        """
        Output directory for spark task.
        """
        return get_target_from_url(
            url_path_join(
                self.warehouse_path,
                self.output_parent_dirname
            )
        )

Review thread:
  Contributor: Is there a reason this is a target instead of just a URL? Is that the easiest way to get .exists() functionality?
  Author: Yes, it is for .exists() functionality.

    def output(self):
        """
        Marker output path.
        """
        marker_url = url_path_join(self.marker, str(hash(self)))
        return get_target_from_url(marker_url, marker=True)

    def output_paths(self):
        """
        Output partition paths.
        """
        return map(
            lambda date: get_target_from_url(
                url_path_join(
                    self.hive_partition_path(self.output_parent_dirname, date.isoformat())
                )
            ),
            self.interval
        )

    def on_success(self):  # pragma: no cover
        self.output().touch_marker()

    def run(self):
        self.remove_output_on_overwrite()
        if self.output_dir().exists():  # only check partitions if parent dir exists
            for target in self.output_paths():
                if target.exists():
                    target.remove()
Review thread:
  Contributor: It might be good to put in a comment here, because I'm not remembering if we did this because we were writing in append mode, or because of issues if we reduced the number of partitions we coalesced down to on output, or because Spark really doesn't want the files around when it's planning on writing them out. Do you remember?
  Author (rao-abdul-mannan, Jun 14, 2018): This is done to avoid duplicates in case of append mode.
        super(LastDailyIpAddressOfUserTaskSpark, self).run()
Review thread:
  Contributor: In the original, each date in the interval is checked to see if it produces actual data. If the data is sparse enough, then not all date partitions will be created. This was done because the downstream class needed to have all dates exist, even if empty, so as to know what dates had been processed. This was also dealt with using the downstream_input_tasks() to do this check in the LastCountryOfUser task. But maybe this can be addressed when the follow-on (i.e. non-historic) workflow is written.
  Author: My approach is to handle it with the non-historic task, and if that isn't possible then I'll make adjustments here.

    def spark_job(self, *args):
        from edx.analytics.tasks.util.spark_util import validate_course_id
        from pyspark.sql.functions import udf
        from pyspark.sql.types import StringType

        df = self.get_dataframe(self._spark, *args)
        validate_courseid = udf(validate_course_id, StringType())
        df = df.withColumn('course_id', validate_courseid(df['course_id']))
        df.createOrReplaceTempView('location')
        query = """
            SELECT
                timestamp, ip, user_id, course_id, dt
            FROM (
                SELECT
                    event_date AS dt, user_id, course_id, timestamp, ip,
                    ROW_NUMBER() OVER (PARTITION BY event_date, user_id, course_id ORDER BY timestamp DESC) AS rank
                FROM location
                WHERE ip <> ''
            ) user_location
            WHERE rank = 1
        """
        result = self._spark.sql(query)
        # write 4 tsv files in each partitioned directory
        result.coalesce(4).write.partitionBy('dt').csv(self.output_dir().path, mode='append', sep='\t')
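
The heart of spark_job is the ROW_NUMBER() window query, which keeps only the most recent event per (event_date, user_id, course_id) and drops rows with an empty ip before ranking. Here is a minimal, runnable sketch of the same pattern; the sample rows and the 'ranked' alias are invented for illustration and do not come from the PR.

from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[1]').appName('last-ip-demo').getOrCreate()
rows = [
    ('2013-12-01', 1, 'course-v1:edX+DemoX+T1', '2013-12-01T10:00:00', '10.0.0.1'),
    ('2013-12-01', 1, 'course-v1:edX+DemoX+T1', '2013-12-01T18:30:00', '10.0.0.2'),
    ('2013-12-01', 2, 'course-v1:edX+DemoX+T1', '2013-12-01T09:15:00', ''),
]
df = spark.createDataFrame(rows, ['event_date', 'user_id', 'course_id', 'timestamp', 'ip'])
df.createOrReplaceTempView('location')
spark.sql("""
    SELECT event_date AS dt, user_id, course_id, timestamp, ip
    FROM (
        SELECT *, ROW_NUMBER() OVER (
            PARTITION BY event_date, user_id, course_id ORDER BY timestamp DESC
        ) AS rank
        FROM location
        WHERE ip <> ''
    ) ranked
    WHERE rank = 1
""").show()
# user 1 keeps only the 18:30 row (the last ip seen that day); user 2 is
# dropped entirely because empty ip strings are filtered before ranking.
spark.stop()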


class LastCountryOfUserDownstreamMixin(
        WarehouseMixin,
        OverwriteOutputMixin,
47 changes: 23 additions & 24 deletions edx/analytics/tasks/insights/tests/test_user_activity.py
@@ -11,9 +11,8 @@
from mock import Mock, call

from edx.analytics.tasks.common.tests.map_reduce_mixins import MapperTestMixin, ReducerTestMixin
-from edx.analytics.tasks.insights.user_activity import (
-    ACTIVE_LABEL, PLAY_VIDEO_LABEL, POST_FORUM_LABEL, PROBLEM_LABEL, InsertToMysqlCourseActivityTask, UserActivityTask
-)
+from edx.analytics.tasks.insights.user_activity import InsertToMysqlCourseActivityTask, UserActivityTask
+from edx.analytics.tasks.util.constants import PredicateLabels
from edx.analytics.tasks.util.tests.opaque_key_mixins import InitializeLegacyKeysMixin, InitializeOpaqueKeysMixin
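
The tests now pull labels from a shared PredicateLabels class in edx.analytics.tasks.util.constants instead of module-level constants. That module is not shown in this diff; from the usages below it presumably looks roughly like the following sketch (the attribute names come from the tests, the string values are guesses).

class PredicateLabels(object):
    """Hypothetical reconstruction of edx/analytics/tasks/util/constants.py."""
    ACTIVE_LABEL = 'ACTIVE'
    PLAY_VIDEO_LABEL = 'PLAY_VIDEO'
    POST_FORUM_LABEL = 'POST_FORUM'
    PROBLEM_LABEL = 'PROBLEM'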


@@ -84,21 +83,21 @@ def test_illegal_course_id(self):
    def test_good_dummy_event(self):
        line = self.create_event_log_line()
        event = tuple(self.task.mapper(line))
-        expected = ((self.expected_date_string, (str(self.user_id), self.encoded_course_id, self.expected_date_string, ACTIVE_LABEL)),)
+        expected = ((self.expected_date_string, (str(self.user_id), self.encoded_course_id, self.expected_date_string, PredicateLabels.ACTIVE_LABEL)),)
        self.assertEquals(event, expected)

    def test_play_video_event(self):
        line = self.create_event_log_line(event_source='browser', event_type='play_video')
        event = tuple(self.task.mapper(line))
-        expected = ((self.expected_date_string, (str(self.user_id), self.encoded_course_id, self.expected_date_string, ACTIVE_LABEL)),
-                    (self.expected_date_string, (str(self.user_id), self.encoded_course_id, self.expected_date_string, PLAY_VIDEO_LABEL)))
+        expected = ((self.expected_date_string, (str(self.user_id), self.encoded_course_id, self.expected_date_string, PredicateLabels.ACTIVE_LABEL)),
+                    (self.expected_date_string, (str(self.user_id), self.encoded_course_id, self.expected_date_string, PredicateLabels.PLAY_VIDEO_LABEL)))
        self.assertEquals(event, expected)

    def test_problem_event(self):
        line = self.create_event_log_line(event_source='server', event_type='problem_check')
        event = tuple(self.task.mapper(line))
-        expected = ((self.expected_date_string, (str(self.user_id), self.encoded_course_id, self.expected_date_string, ACTIVE_LABEL)),
-                    (self.expected_date_string, (str(self.user_id), self.encoded_course_id, self.expected_date_string, PROBLEM_LABEL)))
+        expected = ((self.expected_date_string, (str(self.user_id), self.encoded_course_id, self.expected_date_string, PredicateLabels.ACTIVE_LABEL)),
+                    (self.expected_date_string, (str(self.user_id), self.encoded_course_id, self.expected_date_string, PredicateLabels.PROBLEM_LABEL)))
        self.assertEquals(event, expected)

    @data(('edx.forum.thread.created', True), ('edx.forum.response.created', True), ('edx.forum.comment.created', True),
@@ -108,11 +107,11 @@ def test_post_forum_event(self, event_type, is_labeled_forum):
        line = self.create_event_log_line(event_source='server', event_type=event_type)
        event = tuple(self.task.mapper(line))
        if is_labeled_forum:
-            expected = ((self.expected_date_string, (str(self.user_id), self.encoded_course_id, self.expected_date_string, ACTIVE_LABEL)),
-                        (self.expected_date_string, (str(self.user_id), self.encoded_course_id, self.expected_date_string, POST_FORUM_LABEL)))
+            expected = ((self.expected_date_string, (str(self.user_id), self.encoded_course_id, self.expected_date_string, PredicateLabels.ACTIVE_LABEL)),
+                        (self.expected_date_string, (str(self.user_id), self.encoded_course_id, self.expected_date_string, PredicateLabels.POST_FORUM_LABEL)))
        else:
            # The voted event is not a "discussion activity" and thus does not get the POST_FORUM_LABEL
-            expected = ((self.expected_date_string, (str(self.user_id), self.encoded_course_id, self.expected_date_string, ACTIVE_LABEL)),)
+            expected = ((self.expected_date_string, (str(self.user_id), self.encoded_course_id, self.expected_date_string, PredicateLabels.ACTIVE_LABEL)),)
        self.assertEquals(event, expected)

    def test_exclusion_of_events_by_source(self):
@@ -147,13 +146,13 @@ def test_multiple(self):
            outputs.append(output)

        expected = (
-            (self.expected_date_string, (str(self.user_id), self.encoded_course_id, self.expected_date_string, ACTIVE_LABEL)),
-            (self.expected_date_string, (str(self.user_id), self.encoded_course_id, self.expected_date_string, PLAY_VIDEO_LABEL)),
-            (self.expected_date_string, (str(self.user_id), self.encoded_course_id, self.expected_date_string, ACTIVE_LABEL)),
-            (self.expected_date_string, (str(self.user_id), self.encoded_course_id, self.expected_date_string, PLAY_VIDEO_LABEL)),
-            ('2013-12-24', (str(self.user_id), self.encoded_course_id, '2013-12-24', ACTIVE_LABEL)),
-            ('2013-12-24', (str(self.user_id), self.encoded_course_id, '2013-12-24', PROBLEM_LABEL)),
-            ('2013-12-16', (str(self.user_id), self.encoded_course_id, '2013-12-16', ACTIVE_LABEL)),
+            (self.expected_date_string, (str(self.user_id), self.encoded_course_id, self.expected_date_string, PredicateLabels.ACTIVE_LABEL)),
+            (self.expected_date_string, (str(self.user_id), self.encoded_course_id, self.expected_date_string, PredicateLabels.PLAY_VIDEO_LABEL)),
+            (self.expected_date_string, (str(self.user_id), self.encoded_course_id, self.expected_date_string, PredicateLabels.ACTIVE_LABEL)),
+            (self.expected_date_string, (str(self.user_id), self.encoded_course_id, self.expected_date_string, PredicateLabels.PLAY_VIDEO_LABEL)),
+            ('2013-12-24', (str(self.user_id), self.encoded_course_id, '2013-12-24', PredicateLabels.ACTIVE_LABEL)),
+            ('2013-12-24', (str(self.user_id), self.encoded_course_id, '2013-12-24', PredicateLabels.PROBLEM_LABEL)),
+            ('2013-12-16', (str(self.user_id), self.encoded_course_id, '2013-12-16', PredicateLabels.ACTIVE_LABEL)),
        )
        self.assertItemsEqual(outputs, expected)

@@ -176,20 +175,20 @@ def setUp(self):

    def test_multiple(self):
        values = (
-            (self.user_id, self.encoded_course_id, '2013-12-01', ACTIVE_LABEL),
-            (self.user_id, self.encoded_course_id, '2013-12-01', ACTIVE_LABEL),
-            (self.user_id, self.encoded_course_id, '2013-12-01', PLAY_VIDEO_LABEL),
-            (self.user_id, self.encoded_course_id, '2013-12-01', PLAY_VIDEO_LABEL),
+            (self.user_id, self.encoded_course_id, '2013-12-01', PredicateLabels.ACTIVE_LABEL),
+            (self.user_id, self.encoded_course_id, '2013-12-01', PredicateLabels.ACTIVE_LABEL),
+            (self.user_id, self.encoded_course_id, '2013-12-01', PredicateLabels.PLAY_VIDEO_LABEL),
+            (self.user_id, self.encoded_course_id, '2013-12-01', PredicateLabels.PLAY_VIDEO_LABEL),
        )

        mock_output_file = Mock()

        self.task.multi_output_reducer('2013-12-01', values, mock_output_file)
        self.assertEquals(len(mock_output_file.write.mock_calls), 4)

-        expected_string = '\t'.join((self.user_id, self.encoded_course_id, '2013-12-01', ACTIVE_LABEL, '2'))
+        expected_string = '\t'.join((self.user_id, self.encoded_course_id, '2013-12-01', PredicateLabels.ACTIVE_LABEL, '2'))
        self.assertIn(call(expected_string), mock_output_file.write.mock_calls)
-        expected_string = '\t'.join((self.user_id, self.encoded_course_id, '2013-12-01', PLAY_VIDEO_LABEL, '2'))
+        expected_string = '\t'.join((self.user_id, self.encoded_course_id, '2013-12-01', PredicateLabels.PLAY_VIDEO_LABEL, '2'))
        self.assertIn(call(expected_string), mock_output_file.write.mock_calls)

