WIP-Spark Tasks #476
base: master
Changes from all commits
097b826
d615749
d06f87f
78a842f
0b8b937
01f6b2b
b863f58
744684e
e204e49
2b7b94b
64308be
1a42858
5f6726d
e5d3e87
@@ -15,6 +15,7 @@
from edx.analytics.tasks.common.pathutil import (
    EventLogSelectionDownstreamMixin, EventLogSelectionMixin, PathSelectionByDateIntervalTask
)
from edx.analytics.tasks.common.spark import EventLogSelectionMixinSpark, SparkJobTask
from edx.analytics.tasks.insights.database_imports import ImportStudentCourseEnrollmentTask
from edx.analytics.tasks.util import eventlog
from edx.analytics.tasks.util.decorators import workflow_entry_point

@@ -163,6 +164,84 @@ def run(self):
target.open("w").close() # touch the file | ||
|
||
|
||
class LastDailyIpAddressOfUserTaskSpark(EventLogSelectionMixinSpark, WarehouseMixin, SparkJobTask):
    """Spark alternative to LastDailyIpAddressOfUserTask."""

    output_parent_dirname = 'last_ip_of_user_id'
    marker = luigi.Parameter(
        config_path={'section': 'map-reduce', 'name': 'marker'},
        significant=False,
        description='A URL location to a directory where a marker file will be written on task completion.',
    )

    def output_dir(self):
        """
        Output directory for spark task
        """
        return get_target_from_url(
            url_path_join(
                self.warehouse_path,
                self.output_parent_dirname
            )
        )

    def output(self):
        """
        Marker output path
        """
        marker_url = url_path_join(self.marker, str(hash(self)))
        return get_target_from_url(marker_url, marker=True)

    def output_paths(self):
        """
        Output partition paths
        """
        return map(
            lambda date: get_target_from_url(
                url_path_join(
                    self.hive_partition_path(self.output_parent_dirname, date.isoformat())
                )
            ),
            self.interval
        )

    def on_success(self):  # pragma: no cover
        self.output().touch_marker()

    def run(self):
        self.remove_output_on_overwrite()
        if self.output_dir().exists():  # only check partitions if parent dir exists
            for target in self.output_paths():
                if target.exists():
                    target.remove()

[Review comment] It might be good to put in a comment here, because I'm not remembering if we did this because we were writing in append mode, or because of issues if we reduced the number of partitions we coalesced down to on output, or because Spark really doesn't want the files around when it's planning on writing them out. Do you remember?
[Reply] This is done to avoid duplicates in case of append mode.

        super(LastDailyIpAddressOfUserTaskSpark, self).run()

[Review comment] In the original, each date in the interval is checked to see if it produces actual data. If the data is sparse enough, then not all date partitions will be created. This was done because the downstream class needed to have all dates exist, even if empty, so as to know what dates had been processed. This was also dealt with using the downstream_input_tasks() to do this check in the LastCountryOfUser task. But maybe this can be addressed when the follow-on (i.e. non-historic) workflow is written.
[Reply] My approach is to handle it with the non-historic task, and if it isn't possible then I'll make adjustments here.
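
As a possible follow-up on that exchange (not part of this diff): a minimal sketch of how every date partition in the interval could be materialized even when Spark writes no rows for it, so downstream tasks can still tell which dates were processed. It assumes the same hive_partition_path, url_path_join, and get_target_from_url helpers used elsewhere in this class; the helper name itself is hypothetical.

    def _ensure_all_date_partitions(self):  # hypothetical helper, for illustration only
        """Create an empty placeholder in any date partition that Spark did not write."""
        for date in self.interval:
            partition_path = self.hive_partition_path(self.output_parent_dirname, date.isoformat())
            if not get_target_from_url(partition_path).exists():
                # Touch an empty file so the partition directory exists for downstream tasks.
                placeholder = get_target_from_url(url_path_join(partition_path, '_EMPTY'))
                placeholder.open('w').close()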

    def spark_job(self, *args):
        from edx.analytics.tasks.util.spark_util import validate_course_id
        from pyspark.sql.functions import udf
        from pyspark.sql.window import Window
        from pyspark.sql.types import StringType
        df = self.get_dataframe(self._spark, *args)
        validate_courseid = udf(validate_course_id, StringType())
        df = df.withColumn('course_id', validate_courseid(df['course_id']))
        df.createOrReplaceTempView('location')
        query = """
            SELECT
                timestamp, ip, user_id, course_id, dt
            FROM (
                SELECT
                    event_date AS dt, user_id, course_id, timestamp, ip,
                    ROW_NUMBER() OVER ( PARTITION BY event_date, user_id, course_id ORDER BY timestamp DESC ) AS rank
                FROM location
                WHERE ip <> ''
            ) user_location
            WHERE rank = 1
        """
        result = self._spark.sql(query)
        # write 4 tsv files in each partitioned directory
        result.coalesce(4).write.partitionBy('dt').csv(self.output_dir().path, mode='append', sep='\t')
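
For comparison only (not part of the diff): the same keep-latest-row-per-key selection can be written with the DataFrame window API, which may be why Window is imported above even though the SQL form is used. A minimal sketch, assuming df has the same columns as the location view:

from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window

window = Window.partitionBy('event_date', 'user_id', 'course_id').orderBy(col('timestamp').desc())
result = (
    df.where(col('ip') != '')
      .withColumn('rank', row_number().over(window))  # rank rows within each (date, user, course) group
      .where(col('rank') == 1)                        # keep only the most recent event per group
      .select('timestamp', 'ip', 'user_id', 'course_id', col('event_date').alias('dt'))
)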


class LastCountryOfUserDownstreamMixin(
        WarehouseMixin,
        OverwriteOutputMixin,

[Review comment] Is there a reason this is a target instead of just a URL? Is that the easiest way to get .exists() functionality?
[Reply] Yes, it is for .exists() functionality.
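
To illustrate the point in that exchange: wrapping the URL in a target gives a scheme-agnostic .exists() check (local, HDFS, or S3), whereas a bare URL string carries no behaviour. A minimal sketch, assuming the same get_target_from_url and url_path_join helpers used in the task above:

# Inside the task, the output directory URL would be built as in output_dir():
output_url = url_path_join(self.warehouse_path, self.output_parent_dirname)

# As a target, existence can be checked directly, whatever the URL scheme is.
if get_target_from_url(output_url).exists():
    pass  # partitions may already exist; see run() above

# As a plain string, the caller would have to dispatch on the scheme
# (file://, hdfs://, s3://) and implement the existence check itself.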