From e1b528ab895bdee2ce563db636407a7c4dce5436 Mon Sep 17 00:00:00 2001 From: Troy Sankey Date: Fri, 12 Jul 2019 13:20:59 -0400 Subject: [PATCH] Python 2/3 compatibility for finance tasks This does not change any infrastructure to actually run anything under python 3, it only represents the results of my testing of the finance unit tests and acceptance tests under python 3 and modernizing the code to become both python 2 and 3 compatible. --- .isort.cfg | 1 + .travis.yml | 32 ++++++- Makefile | 44 +++++++-- edx/analytics/tasks/common/bigquery_load.py | 10 ++- edx/analytics/tasks/common/mapreduce.py | 8 +- edx/analytics/tasks/common/mysql_load.py | 2 +- edx/analytics/tasks/common/pathutil.py | 8 +- edx/analytics/tasks/common/sqoop.py | 9 +- .../tasks/common/tests/test_sqoop.py | 6 +- edx/analytics/tasks/common/vertica_load.py | 6 +- .../tasks/export/data_obfuscation.py | 6 +- .../tasks/export/events_obfuscation.py | 4 +- edx/analytics/tasks/export/obfuscation.py | 4 +- edx/analytics/tasks/insights/calendar_task.py | 9 +- .../tasks/insights/database_imports.py | 9 +- .../tasks/insights/tests/test_answer_dist.py | 16 ++-- .../insights/tests/test_course_blocks.py | 18 ++-- .../tasks/insights/tests/test_course_list.py | 5 +- .../tasks/insights/tests/test_enrollments.py | 36 ++++---- .../tasks/insights/tests/test_video.py | 6 +- edx/analytics/tasks/insights/video.py | 7 +- edx/analytics/tasks/launchers/local.py | 14 ++- edx/analytics/tasks/launchers/remote.py | 4 +- .../monitor/tests/test_overall_events.py | 2 +- .../monitor/tests/test_total_events_report.py | 4 +- .../tasks/tests/acceptance/__init__.py | 14 +-- .../tasks/tests/acceptance/services/task.py | 12 +-- .../tests/acceptance/test_database_export.py | 10 ++- .../tests/acceptance/test_database_import.py | 10 +-- .../acceptance/test_enrollment_validation.py | 4 +- .../tests/acceptance/test_enrollments.py | 2 +- edx/analytics/tasks/tools/obfuscate_eval.py | 15 ++-- edx/analytics/tasks/tools/s3util.py | 5 +- edx/analytics/tasks/util/csv_util.py | 10 ++- edx/analytics/tasks/util/eventlog.py | 11 ++- edx/analytics/tasks/util/fast_json.py | 36 ++++++++ edx/analytics/tasks/util/hive.py | 1 + edx/analytics/tasks/util/id_codec.py | 15 +++- edx/analytics/tasks/util/obfuscate_util.py | 2 +- edx/analytics/tasks/util/opaque_key_util.py | 4 +- edx/analytics/tasks/util/overwrite.py | 2 + edx/analytics/tasks/util/record.py | 2 +- edx/analytics/tasks/util/retry.py | 1 + edx/analytics/tasks/util/s3_util.py | 5 +- edx/analytics/tasks/util/tests/config.py | 4 +- .../tasks/util/tests/opaque_key_mixins.py | 6 +- edx/analytics/tasks/util/tests/target.py | 12 ++- edx/analytics/tasks/util/tests/test_hive.py | 1 + .../tasks/util/tests/test_id_codec.py | 18 ++-- .../tasks/util/tests/test_opaque_key_util.py | 11 ++- edx/analytics/tasks/util/tests/test_record.py | 4 +- edx/analytics/tasks/util/tests/test_retry.py | 7 +- .../tasks/util/tests/test_s3_util.py | 6 +- edx/analytics/tasks/util/tests/test_url.py | 1 + edx/analytics/tasks/util/url.py | 18 +++- .../warehouse/financial/affiliate_window.py | 4 +- .../tasks/warehouse/financial/cybersource.py | 3 +- .../financial/ed_services_financial_report.py | 2 + .../tasks/warehouse/financial/fees.py | 3 +- .../warehouse/financial/finance_reports.py | 2 + .../warehouse/financial/orders_import.py | 1 + .../tasks/warehouse/financial/payment.py | 1 + .../tasks/warehouse/financial/paypal.py | 40 +++++---- .../tasks/warehouse/financial/reconcile.py | 57 ++++++++---- .../warehouse/financial/tests/test_paypal.py | 47 +++++++--- 
.../financial/tests/test_reconcile.py | 8 +- .../warehouse/lms_courseware_link_clicked.py | 5 +- .../warehouse/tests/test_course_subjects.py | 6 +- requirements/base.in | 2 +- requirements/base.txt | 13 +-- requirements/default.in | 14 +-- requirements/default.txt | 44 ++++----- requirements/docs.txt | 67 +++++++------- requirements/extra.txt | 8 ++ requirements/pip-tools.txt | 2 +- requirements/pip.txt | 2 +- requirements/test.in | 2 +- requirements/test.txt | 73 +++++++-------- share/task.yml | 89 +++++++++++-------- 79 files changed, 645 insertions(+), 369 deletions(-) create mode 100644 edx/analytics/tasks/util/fast_json.py diff --git a/.isort.cfg b/.isort.cfg index 241a19da9e..bdac169f2c 100644 --- a/.isort.cfg +++ b/.isort.cfg @@ -1,3 +1,4 @@ [settings] line_length=120 multi_line_output=5 +known_future_library=future diff --git a/.travis.yml b/.travis.yml index b22537b18f..bc0ad497bd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -18,6 +18,33 @@ env: - secure: NLqmm18NpV3JRwD4CaugXm5cMWgxjdOA88xRFocmmVrduv0QT9JxBZFGebLYmFQOoKNJ23hz6g3EHe1aWfhLYnr1iUYerrwIriSI1wzuqbXJBRN6gO2n3YW+IfG83OLMZkOIMswT8MEdT3JPWVJL3bsocjHp8bYhRCt1KTCMJjY= - secure: aG8l39jaLFWXB5CEOOAR9mJTT3GnqxCl/oFM/7NvTZCBoSWIPIztpFhSAkRE9xSIiKUKXakZcL5H349NLC28jdlHPVsNAaKKt2YNhB6MjmePihp3RPwZGn8c/SjslwY7DPVUKMdWsI7AVNJBH8ab30OPxKwXFAMOiJJza206CYQ= +# TODO: re-introduce the coverage test. +matrix: + # Mark travis build as finished before jobs under allow_failures complete. + fast_finish: true + + include: + # Standard unit tests. + - name: "Python 2.7 Unit Tests" + env: TEST_SUITE=test-docker + + # Python 3 whitelisted and full unit test jobs. Once python 3 support is + # complete, delete the whitelist job and remove the full job from + # allow_failures. + - name: "Python 3.x Whitelisted Unit Tests" + env: TEST_SUITE=test-docker-py3-whitelist + - name: "Python 3.x FULL Unit Tests" + env: TEST_SUITE=test-docker-py3 + + - name: "Quality Tests" + env: TEST_SUITE=quality-docker + + # Names of jobs (defined above) that cannot fail the travis build even if + # they fail. + allow_failures: + - name: "Python 3.x FULL Unit Tests" + - name: "Quality Tests" # This is here because isort is a hot mess right now. + # Do NOT install Python requirements. # Doing so is a waste of time since they won't be used. install: true @@ -37,10 +64,7 @@ before_install: # Ensure we have a place to store coverage output - mkdir -p coverage -script: - - make test-docker - - make quality-docker - - make coverage-docker +script: make $TEST_SUITE after_success: - pip install --upgrade codecov diff --git a/Makefile b/Makefile index 46488d5bf1..9ac03d1b6d 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ uninstall: pip install -r requirements/pip.txt - while pip uninstall -y edx.analytics.tasks; do true; done + pip uninstall -y edx.analytics.tasks python setup.py clean install: requirements uninstall @@ -28,7 +28,7 @@ docker-shell: system-requirements: ifeq (,$(wildcard /usr/bin/yum)) # This is not great, we can't use these libraries on slave nodes using this method. 
- sudo apt-get install -y -q libmysqlclient-dev libpq-dev python-dev libffi-dev libssl-dev libxml2-dev libxslt1-dev + sudo apt-get install -y -q libmysqlclient-dev libpq-dev python-dev python3-dev libffi-dev libssl-dev libxml2-dev libxslt1-dev else sudo yum install -y -q postgresql-devel libffi-devel endif @@ -56,20 +56,48 @@ upgrade: ## update the requirements/*.txt files with the latest packages satisfy CUSTOM_COMPILE_COMMAND="make upgrade" pip-compile --upgrade -o requirements/docs.txt requirements/docs.in CUSTOM_COMPILE_COMMAND="make upgrade" pip-compile --upgrade -o requirements/test.txt requirements/test.in -test-docker-local: - docker run --rm -u root -v `(pwd)`:/edx/app/analytics_pipeline/analytics_pipeline -it edxops/analytics_pipeline:latest make develop-local test-local - +# Entry point for running python 2 unit tests in CI. test-docker: - docker run --rm -u root -v `(pwd)`:/edx/app/analytics_pipeline/analytics_pipeline -it edxops/analytics_pipeline:latest make reset-virtualenv test-requirements develop-local test-local + docker run --rm -u root -v `(pwd)`:/edx/app/analytics_pipeline/analytics_pipeline -it edxops/analytics_pipeline:latest make system-requirements reset-virtualenv test-requirements develop-local test-local +# Entry point for running python 3 unit tests in CI. test-docker-py3: - docker run --rm -u root -v `(pwd)`:/edx/app/analytics_pipeline/analytics_pipeline -it edxops/analytics_pipeline:latest make reset-virtualenv-py3 test-requirements develop-local test-local + docker run --rm -u root -v `(pwd)`:/edx/app/analytics_pipeline/analytics_pipeline -it edxops/analytics_pipeline:latest make system-requirements reset-virtualenv-py3 test-requirements develop-local test-local + +# Entry point for running python 3 unit tests in CI. Only invokes a subset +# (whitelist) of unit tests which are known to pass under python 3. +test-docker-py3-whitelist: + docker run --rm -u root -v `(pwd)`:/edx/app/analytics_pipeline/analytics_pipeline -it edxops/analytics_pipeline:latest make system-requirements reset-virtualenv-py3 test-requirements develop-local test-local-py3-whitelist test-local: # TODO: when we have better coverage, modify this to actually fail when coverage is too low. rm -rf .coverage LUIGI_CONFIG_PATH='config/test.cfg' python -m coverage run --rcfile=./.coveragerc -m nose --with-xunit --xunit-file=unittests.xml -A 'not acceptance' +# Special test-local target specifically for running a whitelist of tests which +# are known to pass under python 3. +test-local-py3-whitelist: + # TODO: when we have better coverage, modify this to actually fail when coverage is too low.
+ rm -rf .coverage + LUIGI_CONFIG_PATH='config/test.cfg' python -m coverage run --rcfile=./.coveragerc -m nose --with-xunit --xunit-file=unittests.xml -A 'not acceptance' \ + edx.analytics.tasks.enterprise.tests \ + edx.analytics.tasks.insights.tests.test_database_imports \ + edx.analytics.tasks.insights.tests.test_grades \ + edx.analytics.tasks.monitor.tests.test_overall_events \ + edx.analytics.tasks.tests \ + edx.analytics.tasks.util.tests.helpers \ + edx.analytics.tasks.util.tests.opaque_key_mixins \ + edx.analytics.tasks.util.tests.test_decorators \ + edx.analytics.tasks.util.tests.test_geolocation \ + edx.analytics.tasks.util.tests.test_hive \ + edx.analytics.tasks.util.tests.test_retry \ + edx.analytics.tasks.util.tests.test_s3_util \ + edx.analytics.tasks.util.tests.test_url \ + edx.analytics.tasks.warehouse.financial.tests \ + edx.analytics.tasks.warehouse.tests.test_internal_reporting_active_users \ + edx.analytics.tasks.warehouse.tests.test_internal_reporting_database \ + edx.analytics.tasks.warehouse.tests.test_run_vertica_sql_scripts + test: test-requirements develop test-local test-acceptance: test-requirements @@ -98,7 +126,7 @@ quality-docker-local: docker run --rm -u root -v `(pwd)`:/edx/app/analytics_pipeline/analytics_pipeline -it edxops/analytics_pipeline:latest make develop-local quality-local quality-docker: - docker run --rm -u root -v `(pwd)`:/edx/app/analytics_pipeline/analytics_pipeline -it edxops/analytics_pipeline:latest make reset-virtualenv test-requirements develop-local quality-local + docker run --rm -u root -v `(pwd)`:/edx/app/analytics_pipeline/analytics_pipeline -it edxops/analytics_pipeline:latest make system-requirements reset-virtualenv test-requirements develop-local quality-local coverage-docker: docker run --rm -u root -v `(pwd)`:/edx/app/analytics_pipeline/analytics_pipeline -it edxops/analytics_pipeline:latest coverage xml diff --git a/edx/analytics/tasks/common/bigquery_load.py b/edx/analytics/tasks/common/bigquery_load.py index 27a9f4f762..a05de7cc42 100644 --- a/edx/analytics/tasks/common/bigquery_load.py +++ b/edx/analytics/tasks/common/bigquery_load.py @@ -1,10 +1,14 @@ +from __future__ import absolute_import +from future.standard_library import install_aliases +install_aliases() + import json import logging import os import subprocess import tempfile import time -import urlparse +from urllib.parse import urlparse import luigi @@ -216,7 +220,7 @@ def field_delimiter(self): @property def null_marker(self): - return '\N' + return r'\N' @property def quote_character(self): @@ -262,7 +266,7 @@ def init_copy(self, client): self.output().clear_marker_table() def _get_destination_from_source(self, source_path): - parsed_url = urlparse.urlparse(source_path) + parsed_url = urlparse(source_path) destination_path = url_path_join('gs://{}'.format(parsed_url.netloc), parsed_url.path) return destination_path diff --git a/edx/analytics/tasks/common/mapreduce.py b/edx/analytics/tasks/common/mapreduce.py index 071ee8eeea..19682280f1 100644 --- a/edx/analytics/tasks/common/mapreduce.py +++ b/edx/analytics/tasks/common/mapreduce.py @@ -7,7 +7,7 @@ import logging import logging.config import os -import StringIO +from io import StringIO from hashlib import md5 import luigi @@ -183,7 +183,7 @@ class EmulatedMapReduceJobRunner(luigi.contrib.hadoop.JobRunner): """ def group(self, input): - output = StringIO.StringIO() + output = StringIO() lines = [] for i, line in enumerate(input): parts = line.rstrip('\n').split('\t') @@ -197,7 +197,7 @@ def group(self, input): 
def run_job(self, job): job.init_hadoop() job.init_mapper() - map_output = StringIO.StringIO() + map_output = StringIO() input_targets = luigi.task.flatten(job.input_hadoop()) for input_target in input_targets: # if file is a directory, then assume that it's Hadoop output, @@ -232,7 +232,7 @@ def run_job(self, job): try: reduce_output = job.output().open('w') except Exception: - reduce_output = StringIO.StringIO() + reduce_output = StringIO() try: job._run_reducer(reduce_input, reduce_output) diff --git a/edx/analytics/tasks/common/mysql_load.py b/edx/analytics/tasks/common/mysql_load.py index 39fb14440c..f719f5321d 100644 --- a/edx/analytics/tasks/common/mysql_load.py +++ b/edx/analytics/tasks/common/mysql_load.py @@ -422,7 +422,7 @@ def coerce_for_mysql_connect(input): return input # Hive indicates a null value with the string "\N" # We represent an infinite value with the string "inf", MySQL has no such representation so we use NULL - if input in ('None', '\\N', 'inf', '-inf'): + if input in ('None', r'\N', 'inf', '-inf'): return None if isinstance(input, str): return input.decode('utf-8') diff --git a/edx/analytics/tasks/common/pathutil.py b/edx/analytics/tasks/common/pathutil.py index ee6cacd6eb..a87585c63b 100644 --- a/edx/analytics/tasks/common/pathutil.py +++ b/edx/analytics/tasks/common/pathutil.py @@ -268,12 +268,12 @@ def get_event_and_date_string(self, line): """Default mapper implementation, that always outputs the log line, but with a configurable key.""" event = eventlog.parse_json_event(line) if event is None: - self.incr_counter('Event', 'Discard Unparseable Event', 1) + self.incr_counter(u'Event', u'Discard Unparseable Event', 1) return None event_time = self.get_event_time(event) if not event_time: - self.incr_counter('Event', 'Discard Missing Time Field', 1) + self.incr_counter(u'Event', u'Discard Missing Time Field', 1) return None # Don't use strptime to parse the date, it is extremely slow @@ -283,7 +283,7 @@ def get_event_and_date_string(self, line): date_string = event_time.split("T")[0] if date_string < self.lower_bound_date_string or date_string >= self.upper_bound_date_string: - # Slow: self.incr_counter('Event', 'Discard Outside Date Interval', 1) + # Slow: self.incr_counter(u'Event', u'Discard Outside Date Interval', 1) return None return event, date_string @@ -307,5 +307,5 @@ def get_map_input_file(self): return os.environ['map_input_file'] except KeyError: log.warn('mapreduce_map_input_file not defined in os.environ, unable to determine input file path') - self.incr_counter('Event', 'Missing map_input_file', 1) + self.incr_counter(u'Event', u'Missing map_input_file', 1) return '' diff --git a/edx/analytics/tasks/common/sqoop.py b/edx/analytics/tasks/common/sqoop.py index 44a854bc40..2091a1cc0e 100644 --- a/edx/analytics/tasks/common/sqoop.py +++ b/edx/analytics/tasks/common/sqoop.py @@ -1,6 +1,8 @@ """ Gather data using Sqoop table dumps run on RDBMS databases. """ +from __future__ import absolute_import + import datetime import json import logging @@ -296,7 +298,12 @@ def run_job(self, job): metadata['end_time'] = datetime.datetime.utcnow().isoformat() try: with job.metadata_output().open('w') as metadata_file: - json.dump(metadata, metadata_file) + # Under python 2, json.dumps() will return ascii-only bytes, so .encode('utf-8') + # is a no-op. Under python 3, json.dumps() will return ascii-only unicode, so + # .encode('utf-8') will return bytes, thus normalizing the output to bytes + # across all python versions. 
+ metadata_file.write(json.dumps(metadata).encode('utf-8')) + metadata_file.flush() except Exception: log.exception("Unable to dump metadata information.") pass diff --git a/edx/analytics/tasks/common/tests/test_sqoop.py b/edx/analytics/tasks/common/tests/test_sqoop.py index 1c8d50128d..27de9daa1d 100644 --- a/edx/analytics/tasks/common/tests/test_sqoop.py +++ b/edx/analytics/tasks/common/tests/test_sqoop.py @@ -197,12 +197,12 @@ def test_connect_with_columns(self): self.assertEquals(arglist[-3], 'column1,column2') def test_connect_with_null_string(self): - self.create_and_run_mysql_task(null_string='\\\\N') + self.create_and_run_mysql_task(null_string=r'\\N') arglist = self.get_call_args_after_run() self.assertEquals(arglist[-6], '--null-string') - self.assertEquals(arglist[-5], '\\\\N') + self.assertEquals(arglist[-5], r'\\N') self.assertEquals(arglist[-4], '--null-non-string') - self.assertEquals(arglist[-3], '\\\\N') + self.assertEquals(arglist[-3], r'\\N') def test_connect_with_fields_terminations(self): self.create_and_run_mysql_task(fields_terminated_by='\x01') diff --git a/edx/analytics/tasks/common/vertica_load.py b/edx/analytics/tasks/common/vertica_load.py index 4626b81b4d..c2086d30b0 100644 --- a/edx/analytics/tasks/common/vertica_load.py +++ b/edx/analytics/tasks/common/vertica_load.py @@ -1,6 +1,7 @@ """ Support for loading data into an HP Vertica database. """ +from __future__ import absolute_import import logging import traceback @@ -12,6 +13,7 @@ from edx.analytics.tasks.util.overwrite import OverwriteOutputMixin from edx.analytics.tasks.util.url import ExternalURL from edx.analytics.tasks.util.vertica_target import CredentialFileVerticaTarget +import six log = logging.getLogger(__name__) @@ -416,7 +418,7 @@ def copy_delimiter(self): @property def copy_null_sequence(self): """The null sequence in the data to be copied. Default is Hive NULL (\\N)""" - return "'\\N'" + return r"'\N'" @property def copy_enclosed_by(self): @@ -437,7 +439,7 @@ def copy_escape_spec(self): def copy_data_table_from_target(self, cursor): """Performs the copy query from the insert source.""" - if isinstance(self.columns[0], basestring): + if isinstance(self.columns[0], six.string_types): column_names = ','.join([name for name in self.columns]) elif len(self.columns[0]) == 2: column_names = ','.join([name for name, _type in self.columns]) diff --git a/edx/analytics/tasks/export/data_obfuscation.py b/edx/analytics/tasks/export/data_obfuscation.py index 6d9261c44a..bfc0414e80 100644 --- a/edx/analytics/tasks/export/data_obfuscation.py +++ b/edx/analytics/tasks/export/data_obfuscation.py @@ -8,11 +8,11 @@ import tempfile import xml.etree.ElementTree -import cjson import luigi import yaml import edx.analytics.tasks.util.opaque_key_util as opaque_key_util +from edx.analytics.tasks.util.fast_json import FastJson from edx.analytics.tasks.common.pathutil import PathSetTask from edx.analytics.tasks.util.file_util import copy_file_to_file, read_config_file from edx.analytics.tasks.util.obfuscate_util import ( @@ -194,7 +194,7 @@ def filter_row(self, row): if state_str == 'NULL': updated_state_dict = {} else: - state_dict = cjson.decode(state_str, all_unicode=True) + state_dict = FastJson.loads(state_str) # Traverse the dictionary, looking for entries that need to be scrubbed. 
updated_state_dict = self.obfuscator.obfuscate_structure(state_dict, u"state", user_info) except Exception: # pylint: disable=broad-except @@ -204,7 +204,7 @@ def filter_row(self, row): if updated_state_dict is not None: # Can't reset values, so update original fields. - updated_state = cjson.encode(updated_state_dict).replace('\\', '\\\\') + updated_state = FastJson.dumps(updated_state_dict).replace('\\', '\\\\') row[4] = updated_state if self.obfuscator.is_logging_enabled(): log.info(u"Obfuscated state for user_id '%s' module_id '%s'", user_id, row[2]) diff --git a/edx/analytics/tasks/export/events_obfuscation.py b/edx/analytics/tasks/export/events_obfuscation.py index 1eb1000ff4..edb0632fa0 100644 --- a/edx/analytics/tasks/export/events_obfuscation.py +++ b/edx/analytics/tasks/export/events_obfuscation.py @@ -6,10 +6,10 @@ import re from collections import defaultdict, namedtuple -import cjson import luigi.date_interval import edx.analytics.tasks.util.opaque_key_util as opaque_key_util +from edx.analytics.tasks.util.fast_json import FastJson from edx.analytics.tasks.common.mapreduce import MapReduceJobTaskMixin, MultiOutputMapReduceJobTask from edx.analytics.tasks.common.pathutil import PathSetTask from edx.analytics.tasks.util import eventlog @@ -328,7 +328,7 @@ def _obfuscate_event(self, event): # Re-encode payload as a json string if it originally was one. # (This test works because we throw away string values that didn't parse as JSON.) if isinstance(event.get('event'), basestring): - event['event'] = cjson.encode(event_data) + event['event'] = FastJson.dumps(event_data) else: event['event'] = event_data diff --git a/edx/analytics/tasks/export/obfuscation.py b/edx/analytics/tasks/export/obfuscation.py index 0a189caffd..2a3a653e94 100644 --- a/edx/analytics/tasks/export/obfuscation.py +++ b/edx/analytics/tasks/export/obfuscation.py @@ -1,11 +1,13 @@ """Tasks to obfuscate course data for RDX.""" +from future.standard_library import install_aliases +install_aliases() import errno import json import logging import os import tarfile -import urlparse +from urllib.parse import urlparse import luigi diff --git a/edx/analytics/tasks/insights/calendar_task.py b/edx/analytics/tasks/insights/calendar_task.py index 0f5ce6fc01..1ac4811562 100644 --- a/edx/analytics/tasks/insights/calendar_task.py +++ b/edx/analytics/tasks/insights/calendar_task.py @@ -1,8 +1,11 @@ -"""A canonical calendar that can be joined with other tables to provide information about dates.""" - +""" +A canonical calendar that can be joined with other tables to provide information about dates. +""" +from __future__ import absolute_import import logging from datetime import timedelta +import six import luigi.configuration @@ -63,7 +66,7 @@ def run(self): (week.sunday() + timedelta(1)).isoformat(), iso_weekday ) - output_file.write('\t'.join([unicode(v).encode('utf8') for v in column_values]) + '\n') + output_file.write(b'\t'.join([six.text_type(v).encode('utf-8') for v in column_values]) + b'\n') class CalendarTableTask(CalendarDownstreamMixin, HiveTableTask): diff --git a/edx/analytics/tasks/insights/database_imports.py b/edx/analytics/tasks/insights/database_imports.py index fd6a7801f3..2896d674c6 100644 --- a/edx/analytics/tasks/insights/database_imports.py +++ b/edx/analytics/tasks/insights/database_imports.py @@ -1,6 +1,8 @@ """ Import data from external RDBMS databases into Hive. 
""" +from __future__ import absolute_import + import datetime import logging import textwrap @@ -91,7 +93,7 @@ def partition(self): def partition_location(self): """Provides location of Hive database table's partition data.""" # The actual folder name where the data is stored is expected to be in the format = - partition_name = '='.join(self.partition.items()[0]) + partition_name = '='.join(list(self.partition.items())[0]) # Make sure that input path ends with a slash, to indicate a directory. # (This is necessary for S3 paths that are output from Hadoop jobs.) return url_path_join(self.table_location, partition_name + '/') @@ -169,9 +171,8 @@ def requires(self): database=self.database, # Hive expects NULL to be represented by the string "\N" in the data. You have to pass in "\\N" to sqoop # since it uses that string directly in the generated Java code, so "\\N" actually looks like "\N" to the - # Java code. In order to get "\\N" onto the command line we have to use another set of escapes to tell the - # python code to pass through the "\" character. - null_string='\\\\N', + # Java code. In order to write "\\N" in python, we use a raw string prefix `r`. + null_string=r'\\N', # It's unclear why, but this setting prevents us from correctly substituting nulls with \N. mysql_delimiters=False, # This is a string that is interpreted as an octal number, so it is equivalent to the character Ctrl-A diff --git a/edx/analytics/tasks/insights/tests/test_answer_dist.py b/edx/analytics/tasks/insights/tests/test_answer_dist.py index b3691e233b..56088622a1 100644 --- a/edx/analytics/tasks/insights/tests/test_answer_dist.py +++ b/edx/analytics/tasks/insights/tests/test_answer_dist.py @@ -2,12 +2,13 @@ Tests for tasks that calculate answer distributions. """ +from __future__ import absolute_import import hashlib import json import math import os import shutil -import StringIO +from io import BytesIO import tempfile from unittest import TestCase @@ -20,6 +21,7 @@ ) from edx.analytics.tasks.util.tests.config import OPTION_REMOVED, with_luigi_config from edx.analytics.tasks.util.tests.opaque_key_mixins import InitializeLegacyKeysMixin, InitializeOpaqueKeysMixin +import six class ProblemCheckEventBaseTest(MapperTestMixin, ReducerTestMixin, TestCase): @@ -73,7 +75,7 @@ def _create_event_data_dict(self, **kwargs): @staticmethod def _update_with_kwargs(data_dict, **kwargs): """Updates a dict from kwargs only if it modifies a top-level value.""" - for key, value in kwargs.iteritems(): + for key, value in six.iteritems(kwargs): if key in data_dict: data_dict[key] = value @@ -317,7 +319,7 @@ def insert_answer_data(submission, attempt_category): submission: dictionary of all responses submitted at once for a user attempt_category: a string that is 'first' for a user's first submission and 'last' otherwise """ - for answer_id, submission_data in submission.iteritems(): + for answer_id, submission_data in six.iteritems(submission): answer_id_data = { "answer": submission_data['answer'], "problem_display_name": None, @@ -830,7 +832,7 @@ def _load_metadata(self, **kwargs): } } metadata_dict[self.answer_id].update(**kwargs) - answer_metadata = StringIO.StringIO(json.dumps(metadata_dict)) + answer_metadata = BytesIO(json.dumps(metadata_dict).encode('utf-8')) self.task.load_answer_metadata(answer_metadata) def test_non_submission_choice_with_metadata(self): @@ -944,7 +946,7 @@ def test_reduce_multiple_values(self): # To test sorting, the first sample is made to sort after the # second sample. 
- column_values_2 = [(k, unicode(k) + u'\u2603') for k in field_names] + column_values_2 = [(k, six.text_type(k) + u'\u2603') for k in field_names] column_values_2[3] = (column_values_2[3][0], 10) column_values_1 = list(column_values_2) column_values_1[4] = (column_values_1[4][0], u'ZZZZZZZZZZZ') @@ -958,9 +960,9 @@ def test_reduce_multiple_values(self): self.assertEquals(mock_output_file.write.mock_calls[0], call(expected_header_string)) # Confirm that the second sample appears before the first. - expected_row_1 = ','.join(unicode(v[1]).encode('utf8') for v in column_values_2) + '\r\n' + expected_row_1 = b','.join(six.text_type(v[1]).encode('utf8') for v in column_values_2) + b'\r\n' self.assertEquals(mock_output_file.write.mock_calls[1], call(expected_row_1)) - expected_row_2 = ','.join(unicode(v[1]).encode('utf8') for v in column_values_1) + '\r\n' + expected_row_2 = b','.join(six.text_type(v[1]).encode('utf8') for v in column_values_1) + b'\r\n' self.assertEquals(mock_output_file.write.mock_calls[2], call(expected_row_2)) def test_output_path_for_legacy_key(self): diff --git a/edx/analytics/tasks/insights/tests/test_course_blocks.py b/edx/analytics/tasks/insights/tests/test_course_blocks.py index 191172a573..93f39ad176 100644 --- a/edx/analytics/tasks/insights/tests/test_course_blocks.py +++ b/edx/analytics/tasks/insights/tests/test_course_blocks.py @@ -1,4 +1,6 @@ """Test course blocks tasks.""" +from future.standard_library import install_aliases +install_aliases() import json import logging @@ -6,7 +8,7 @@ import shutil import tempfile from unittest import TestCase -from urllib import urlencode +from urllib.parse import urlencode import httpretty from ddt import data, ddt, unpack @@ -53,7 +55,7 @@ def create_input_file(self, course_ids): """Create a tab-separated file containing the given course_ids.""" with open(self.input_file, 'w') as output: for course_id in course_ids: - output.write("\t".join([course_id, 'Name', 'Org', 'Number', 'http://'] + ['\\N'] * 9)) + output.write("\t".join([course_id, 'Name', 'Org', 'Number', 'http://'] + [r'\N'] * 9)) output.write("\r\n") def cleanup(self, dirname): @@ -174,18 +176,18 @@ class CourseBlocksApiDataReducerTaskTest(CourseBlocksTestMixin, ReducerTestMixin # data tuple fields are given in this order: # (block_id,block_type,display_name,is_root,is_orphan,is_dag,parent_block_id,course_path,sort_idx) @data( - ((('abc', 'course', 'ABC', '1', '0', '0', '\\N', '', '0'),), False), - ((('abc', 'course', 'ABC', '1', '0', '0', '\\N', '', '0'),), True), - ((('abc', 'block', 'ABC', '1', '0', '0', '\\N', '', '0'), + ((('abc', 'course', 'ABC', '1', '0', '0', r'\N', '', '0'),), False), + ((('abc', 'course', 'ABC', '1', '0', '0', r'\N', '', '0'),), True), + ((('abc', 'block', 'ABC', '1', '0', '0', r'\N', '', '0'), ('def', 'block', 'DEF', '0', '0', '0', 'abc', 'ABC', '1'), ('jkl', 'block', 'JKL', '0', '0', '1', 'def', 'ABC / DEF', '2'), ('vwx', 'block', 'VWX', '0', '0', '0', 'jkl', 'ABC / DEF / JKL', '3'), ('mno', 'block', 'MNO', '0', '0', '0', 'def', 'ABC / DEF', '4'), ('pqr', 'block', 'PQR', '0', '0', '0', 'mno', 'ABC / DEF / MNO', '5'), ('stu', 'block', 'STU', '0', '0', '0', 'abc', 'ABC', '6'), - ('ghi', 'block', 'GHI', '0', '1', '0', '\\N', '(Deleted block :)', '8')), False), - ((('ghi', 'block', 'GHI', '0', '1', '0', '\\N', '(Deleted block :)', '-1'), - ('abc', 'block', 'ABC', '1', '0', '0', '\\N', '', '0'), + ('ghi', 'block', 'GHI', '0', '1', '0', r'\N', '(Deleted block :)', '8')), False), + ((('ghi', 'block', 'GHI', '0', '1', '0', r'\N', '(Deleted 
block :)', '-1'), + ('abc', 'block', 'ABC', '1', '0', '0', r'\N', '', '0'), ('def', 'block', 'DEF', '0', '0', '0', 'abc', 'ABC', '1'), ('jkl', 'block', 'JKL', '0', '0', '1', 'def', 'ABC / DEF', '2'), ('vwx', 'block', 'VWX', '0', '0', '0', 'jkl', 'ABC / DEF / JKL', '3'), diff --git a/edx/analytics/tasks/insights/tests/test_course_list.py b/edx/analytics/tasks/insights/tests/test_course_list.py index 549b879463..0f879c97c4 100644 --- a/edx/analytics/tasks/insights/tests/test_course_list.py +++ b/edx/analytics/tasks/insights/tests/test_course_list.py @@ -1,4 +1,7 @@ """Test course list tasks.""" +from future.standard_library import install_aliases +install_aliases() + import json import logging import os @@ -6,7 +9,7 @@ import tempfile from datetime import datetime from unittest import TestCase -from urllib import urlencode +from urllib.parse import urlencode import httpretty from ddt import data, ddt, unpack diff --git a/edx/analytics/tasks/insights/tests/test_enrollments.py b/edx/analytics/tasks/insights/tests/test_enrollments.py index 06e1d893c0..25bea2125c 100644 --- a/edx/analytics/tasks/insights/tests/test_enrollments.py +++ b/edx/analytics/tasks/insights/tests/test_enrollments.py @@ -432,8 +432,8 @@ def test_no_events(self): def test_single_enrollment(self): inputs = [('2013-01-01T00:00:01', ACTIVATED, 'honor'), ] - expected = ((self.course_id, self.user_id, 'honor', '1', 'honor', '2013-01-01 00:00:01.000000', '\\N', '\\N', - '\\N', '2013-01-02 00:00:00.000000'),) + expected = ((self.course_id, self.user_id, 'honor', '1', 'honor', '2013-01-01 00:00:01.000000', r'\N', r'\N', + r'\N', '2013-01-02 00:00:00.000000'),) self._check_output_complete_tuple(inputs, expected) def create_enrollment_task(self, interval='2013-01-01'): @@ -461,7 +461,7 @@ def test_normal_multiple_event_sequence(self): ('2013-01-01T00:00:04', DEACTIVATED, 'honor'), ] expected = ((self.course_id, self.user_id, 'honor', '0', 'honor', '2013-01-01 00:00:01.000000', - '2013-01-01 00:00:04.000000', '\\N', '\\N', '2013-01-02 00:00:00.000000'),) + '2013-01-01 00:00:04.000000', r'\N', r'\N', '2013-01-02 00:00:00.000000'),) self._check_output_complete_tuple(inputs, expected) def test_ignore_conflicting_activate_after_mode_change(self): @@ -471,7 +471,7 @@ def test_ignore_conflicting_activate_after_mode_change(self): ('2013-01-01T00:00:03', ACTIVATED, 'honor'), ] expected = ((self.course_id, self.user_id, 'verified', '1', 'honor', '2013-01-01 00:00:01.000000', - '\\N', '2013-01-01 00:00:02.000000', '\\N', '2013-01-02 00:00:00.000000'),) + r'\N', '2013-01-01 00:00:02.000000', r'\N', '2013-01-02 00:00:00.000000'),) self._check_output_complete_tuple(inputs, expected) def test_ignore_redundant_unenroll_events(self): @@ -482,7 +482,7 @@ def test_ignore_redundant_unenroll_events(self): ('2013-01-01T00:00:04', ACTIVATED, 'honor'), ] expected = ((self.course_id, self.user_id, 'honor', '1', 'honor', '2013-01-01 00:00:01.000000', - '2013-01-01 00:00:02.000000', '\\N', '\\N', '2013-01-02 00:00:00.000000'),) + '2013-01-01 00:00:02.000000', r'\N', r'\N', '2013-01-02 00:00:00.000000'),) self._check_output_complete_tuple(inputs, expected) def test_events_out_of_order(self): @@ -494,7 +494,7 @@ def test_events_out_of_order(self): ('2013-01-01T00:00:02', DEACTIVATED, 'honor'), ] expected = ((self.course_id, self.user_id, 'honor', '0', 'honor', '2013-01-01 00:00:01.000000', - '2013-01-01 00:00:04.000000', '\\N', '\\N', '2013-01-02 00:00:00.000000'),) + '2013-01-01 00:00:04.000000', r'\N', r'\N', '2013-01-02 00:00:00.000000'),) 
self._check_output_complete_tuple(inputs, expected) def test_ignore_redundant_enroll_events(self): @@ -504,8 +504,8 @@ def test_ignore_redundant_enroll_events(self): ('2013-01-01T00:00:03', ACTIVATED, 'honor'), ('2013-01-01T00:00:04', ACTIVATED, 'honor'), ] - expected = ((self.course_id, self.user_id, 'honor', '1', 'honor', '2013-01-01 00:00:01.000000', '\\N', '\\N', - '\\N', '2013-01-02 00:00:00.000000'),) + expected = ((self.course_id, self.user_id, 'honor', '1', 'honor', '2013-01-01 00:00:01.000000', r'\N', r'\N', + r'\N', '2013-01-02 00:00:00.000000'),) self._check_output_complete_tuple(inputs, expected) def test_ignore_mode_change_on_redundant_enroll_events(self): @@ -513,8 +513,8 @@ def test_ignore_mode_change_on_redundant_enroll_events(self): ('2013-01-01T00:00:01', ACTIVATED, 'honor'), ('2013-01-01T00:00:02', ACTIVATED, 'verified'), ] - expected = ((self.course_id, self.user_id, 'honor', '1', 'honor', '2013-01-01 00:00:01.000000', '\\N', '\\N', - '\\N', '2013-01-02 00:00:00.000000'),) + expected = ((self.course_id, self.user_id, 'honor', '1', 'honor', '2013-01-01 00:00:01.000000', r'\N', r'\N', + r'\N', '2013-01-02 00:00:00.000000'),) self._check_output_complete_tuple(inputs, expected) def test_ignore_mode_change_while_deactivated(self): @@ -524,7 +524,7 @@ def test_ignore_mode_change_while_deactivated(self): ('2013-01-01T00:00:03', MODE_CHANGED, 'verified'), ] expected = ((self.course_id, self.user_id, 'honor', '0', 'honor', '2013-01-01 00:00:01.000000', - '2013-01-01 00:00:02.000000', '\\N', '\\N', '2013-01-02 00:00:00.000000'),) + '2013-01-01 00:00:02.000000', r'\N', r'\N', '2013-01-02 00:00:00.000000'),) self._check_output_complete_tuple(inputs, expected) def test_mode_change_via_activation_events(self): @@ -534,7 +534,7 @@ def test_mode_change_via_activation_events(self): ('2013-01-01T00:00:03', ACTIVATED, 'verified'), ] expected = ((self.course_id, self.user_id, 'verified', '1', 'honor', '2013-01-01 00:00:01.000000', - '2013-01-01 00:00:02.000000', '2013-01-01 00:00:03.000000', '\\N', '2013-01-02 00:00:00.000000'),) + '2013-01-01 00:00:02.000000', '2013-01-01 00:00:03.000000', r'\N', '2013-01-02 00:00:00.000000'),) self._check_output_complete_tuple(inputs, expected) def test_ignore_different_mode_on_unenroll_event(self): @@ -543,7 +543,7 @@ def test_ignore_different_mode_on_unenroll_event(self): ('2013-01-01T00:00:02', DEACTIVATED, 'verified'), ] expected = ((self.course_id, self.user_id, 'honor', '0', 'honor', '2013-01-01 00:00:01.000000', - '2013-01-01 00:00:02.000000', '\\N', '\\N', '2013-01-02 00:00:00.000000'),) + '2013-01-01 00:00:02.000000', r'\N', r'\N', '2013-01-02 00:00:00.000000'),) self._check_output_complete_tuple(inputs, expected) def test_normal_explicit_mode_change(self): @@ -552,8 +552,8 @@ def test_normal_explicit_mode_change(self): ('2013-01-01T00:00:01', ACTIVATED, 'honor'), ('2013-01-02T00:00:02', MODE_CHANGED, 'verified') ] - expected = ((self.course_id, self.user_id, 'verified', '1', 'honor', '2013-01-01 00:00:01.000000', '\\N', - '2013-01-02 00:00:02.000000', '\\N', '2013-01-03 00:00:00.000000'),) + expected = ((self.course_id, self.user_id, 'verified', '1', 'honor', '2013-01-01 00:00:01.000000', r'\N', + '2013-01-02 00:00:02.000000', r'\N', '2013-01-03 00:00:00.000000'),) self._check_output_complete_tuple(inputs, expected) def test_multiple_mode_change_events(self): @@ -565,7 +565,7 @@ def test_multiple_mode_change_events(self): ('2013-01-02T00:00:04', MODE_CHANGED, 'audit'), ('2013-01-02T00:00:05', MODE_CHANGED, 'credit') ] - expected = 
((self.course_id, self.user_id, 'credit', '1', 'honor', '2013-01-01 00:00:01.000000', '\\N', + expected = ((self.course_id, self.user_id, 'credit', '1', 'honor', '2013-01-01 00:00:01.000000', r'\N', '2013-01-02 00:00:02.000000', '2013-01-02 00:00:05.000000', '2013-01-03 00:00:00.000000'),) self._check_output_complete_tuple(inputs, expected) self._check_output_complete_tuple(inputs, expected) @@ -579,7 +579,7 @@ def test_capture_first_verified_time(self): ('2013-01-01T00:00:04', ACTIVATED, 'verified') ] expected = ((self.course_id, self.user_id, 'verified', '1', 'honor', '2013-01-01 00:00:01.000000', - '2013-01-01 00:00:03.000000', '2013-01-01 00:00:02.000000', '\\N', '2013-01-02 00:00:00.000000'),) + '2013-01-01 00:00:03.000000', '2013-01-01 00:00:02.000000', r'\N', '2013-01-02 00:00:00.000000'),) self._check_output_complete_tuple(inputs, expected) def test_capture_first_credit_time(self): @@ -591,7 +591,7 @@ def test_capture_first_credit_time(self): ('2013-01-01T00:00:04', ACTIVATED, 'credit') ] expected = ((self.course_id, self.user_id, 'credit', '1', 'honor', '2013-01-01 00:00:01.000000', - '2013-01-01 00:00:03.000000', '\\N', '2013-01-01 00:00:02.000000', '2013-01-02 00:00:00.000000'),) + '2013-01-01 00:00:03.000000', r'\N', '2013-01-01 00:00:02.000000', '2013-01-02 00:00:00.000000'),) self._check_output_complete_tuple(inputs, expected) diff --git a/edx/analytics/tasks/insights/tests/test_video.py b/edx/analytics/tasks/insights/tests/test_video.py index a1e2f4d02b..80f2ff2476 100644 --- a/edx/analytics/tasks/insights/tests/test_video.py +++ b/edx/analytics/tasks/insights/tests/test_video.py @@ -397,8 +397,8 @@ def setUp(self): super(UserVideoViewingTaskReducerTest, self).setUp() self.user_id = 10 self.reduce_key = (self.user_id, self.COURSE_ID, self.VIDEO_MODULE_ID) - patcher = patch('edx.analytics.tasks.insights.video.urllib') - self.mock_urllib = patcher.start() + patcher = patch('edx.analytics.tasks.insights.video.urlopen') + self.mock_urlopen = patcher.start() self.addCleanup(patcher.stop) def test_simple_viewing(self): @@ -634,7 +634,7 @@ def prepare_youtube_api_mock_raw(self, response_string): mock_response = MagicMock(spec=file) mock_response.code = 200 mock_response.read.side_effect = [response_string, ''] - self.mock_urllib.urlopen.return_value = mock_response + self.mock_urlopen.return_value = mock_response def test_pause_after_end_of_video(self): self.prepare_youtube_api_mock('PT1M2S') diff --git a/edx/analytics/tasks/insights/video.py b/edx/analytics/tasks/insights/video.py index 2636caa11b..26b9705178 100644 --- a/edx/analytics/tasks/insights/video.py +++ b/edx/analytics/tasks/insights/video.py @@ -1,11 +1,14 @@ """Tasks for aggregating statistics about video viewing.""" +from future.standard_library import install_aliases +install_aliases() + import datetime import json import logging import math import re import textwrap -import urllib +from urllib.request import urlopen from collections import namedtuple import ciso8601 @@ -475,7 +478,7 @@ def get_video_duration(self, youtube_id): video_url = "https://www.googleapis.com/youtube/v3/videos?id={0}&part=contentDetails&key={1}".format( youtube_id, self.api_key ) - video_file = urllib.urlopen(video_url) + video_file = urlopen(video_url) content = json.load(video_file) items = content.get('items', []) if len(items) > 0: diff --git a/edx/analytics/tasks/launchers/local.py b/edx/analytics/tasks/launchers/local.py index 90b854c69f..a61d179c0a 100644 --- a/edx/analytics/tasks/launchers/local.py +++ 
b/edx/analytics/tasks/launchers/local.py @@ -20,7 +20,6 @@ import certifi import chardet import ciso8601 -import cjson import filechunkio import idna import luigi @@ -34,6 +33,11 @@ import six import stevedore import urllib3 +cjson, ujson = None, None +try: + import ujson +except ImportError: + import cjson import edx.analytics.tasks @@ -93,7 +97,7 @@ def main(): # Tell luigi what dependencies to pass to the Hadoop nodes: # - edx.analytics.tasks is used to load the pipeline code, since we cannot trust all will be loaded automatically. # - boto is used for all direct interactions with s3. - # - cjson is used for all parsing event logs. + # - cjson/ujson is used for all parsing event logs. # - filechunkio is used for multipart uploads of large files to s3. # - opaque_keys is used to interpret serialized course_ids # - opaque_keys extensions: ccx_keys @@ -101,7 +105,11 @@ def main(): # - requests has several dependencies: # - chardet, urllib3, certifi, idna luigi.contrib.hadoop.attach(edx.analytics.tasks) - luigi.contrib.hadoop.attach(boto, cjson, filechunkio, opaque_keys, bson, stevedore, six, ciso8601, chardet, urllib3, certifi, idna, requests, pytz) + if cjson: + luigi.contrib.hadoop.attach(cjson) + if ujson: + luigi.contrib.hadoop.attach(ujson) + luigi.contrib.hadoop.attach(boto, filechunkio, opaque_keys, bson, stevedore, six, ciso8601, chardet, urllib3, certifi, idna, requests, pytz) if configuration.getboolean('ccx', 'enabled', default=False): import ccx_keys diff --git a/edx/analytics/tasks/launchers/remote.py b/edx/analytics/tasks/launchers/remote.py index f392494c7f..e62b90383a 100755 --- a/edx/analytics/tasks/launchers/remote.py +++ b/edx/analytics/tasks/launchers/remote.py @@ -1,5 +1,7 @@ #!/usr/bin/env python """Execute tasks on a remote EMR cluster.""" +from future.standard_library import install_aliases +install_aliases() import argparse import json @@ -8,7 +10,7 @@ import sys import uuid from subprocess import PIPE, Popen -from urlparse import parse_qsl, urlparse +from urllib.parse import urlparse, parse_qsl STATIC_FILES_PATH = os.path.join(sys.prefix, 'share', 'edx.analytics.tasks') EC2_INVENTORY_PATH = os.path.join(STATIC_FILES_PATH, 'ec2.py') diff --git a/edx/analytics/tasks/monitor/tests/test_overall_events.py b/edx/analytics/tasks/monitor/tests/test_overall_events.py index f0fde7bec3..4795c11550 100644 --- a/edx/analytics/tasks/monitor/tests/test_overall_events.py +++ b/edx/analytics/tasks/monitor/tests/test_overall_events.py @@ -2,7 +2,7 @@ import json import sys -from StringIO import StringIO +from io import StringIO from unittest import TestCase from edx.analytics.tasks.common.tests.map_reduce_mixins import MapperTestMixin, ReducerTestMixin diff --git a/edx/analytics/tasks/monitor/tests/test_total_events_report.py b/edx/analytics/tasks/monitor/tests/test_total_events_report.py index 5eb87a49c8..17510fd8ef 100644 --- a/edx/analytics/tasks/monitor/tests/test_total_events_report.py +++ b/edx/analytics/tasks/monitor/tests/test_total_events_report.py @@ -6,7 +6,7 @@ import shutil import tempfile import textwrap -from StringIO import StringIO +from io import BytesIO from unittest import TestCase import pandas @@ -86,7 +86,7 @@ def reformat(string): task.run() data = output_target.buffer.read() - result = pandas.read_csv(StringIO(data), + result = pandas.read_csv(BytesIO(data), na_values=['-'], index_col=False, header=None, diff --git a/edx/analytics/tasks/tests/acceptance/__init__.py b/edx/analytics/tasks/tests/acceptance/__init__.py index c1db23c20d..50b450f19f 100644 --- 
a/edx/analytics/tasks/tests/acceptance/__init__.py +++ b/edx/analytics/tasks/tests/acceptance/__init__.py @@ -1,3 +1,5 @@ +from __future__ import print_function + import csv import hashlib import json @@ -374,19 +376,19 @@ def assert_data_frames_equal(data, expected): assert_frame_equal(data, expected) except AssertionError: pandas.set_option('display.max_columns', None) - print '----- The report generated this data: -----' - print data - print '----- vs expected: -----' - print expected + print('----- The report generated this data: -----') + print(data) + print('----- vs expected: -----') + print(expected) if data.shape != expected.shape: - print "Data shapes differ." + print("Data shapes differ.") else: for index, _series in data.iterrows(): # Try to print a more helpful/localized difference message: try: assert_series_equal(data.iloc[index, :], expected.iloc[index, :]) except AssertionError: - print "First differing row: {index}".format(index=index) + print("First differing row: {index}".format(index=index)) raise @staticmethod diff --git a/edx/analytics/tasks/tests/acceptance/services/task.py b/edx/analytics/tasks/tests/acceptance/services/task.py index 15a1abb774..361b8d93eb 100644 --- a/edx/analytics/tasks/tests/acceptance/services/task.py +++ b/edx/analytics/tasks/tests/acceptance/services/task.py @@ -1,11 +1,13 @@ +from __future__ import absolute_import -import ConfigParser +import six.moves.configparser import logging import os import sys import tempfile from edx.analytics.tasks.tests.acceptance.services import shell +import six log = logging.getLogger(__name__) @@ -31,7 +33,7 @@ def __init__(self, config, task_config_override, identifier): def launch(self, task_args, config_override=None): self.delete_existing_logs() - config_parser = ConfigParser.ConfigParser() + config_parser = six.moves.configparser.ConfigParser() config_parser.read(os.environ['LUIGI_CONFIG_PATH']) self.override_config(config_parser, self.default_config_override) if config_override: @@ -103,18 +105,18 @@ def delete_existing_logs(self): pass def override_config(self, config_parser, overrides): - for section_name, section in overrides.iteritems(): + for section_name, section in six.iteritems(overrides): if not config_parser.has_section(section_name): config_parser.add_section(section_name) - for key, value in section.iteritems(): + for key, value in six.iteritems(section): config_parser.set(section_name, key, value) def write_logs_to_standard_streams(self): if not self.log_path: return - for filename, output_file in self.logs.iteritems(): + for filename, output_file in six.iteritems(self.logs): try: with open(os.path.join(self.log_path, filename), 'r') as src_file: while True: diff --git a/edx/analytics/tasks/tests/acceptance/test_database_export.py b/edx/analytics/tasks/tests/acceptance/test_database_export.py index 489a6cb5fa..f20e389332 100644 --- a/edx/analytics/tasks/tests/acceptance/test_database_export.py +++ b/edx/analytics/tasks/tests/acceptance/test_database_export.py @@ -3,14 +3,17 @@ validate user visible outputs. 
""" +from future.standard_library import install_aliases +install_aliases() import datetime import logging import os import shutil +import stat import tempfile import textwrap -import urlparse +from urllib.parse import urlparse import gnupg @@ -56,7 +59,10 @@ def create_temporary_directories(self): for dir_path in [self.external_files_dir, self.working_dir, self.validation_dir, self.gpg_dir]: os.makedirs(dir_path) - os.chmod(self.gpg_dir, 0700) + os.chmod( + self.gpg_dir, + stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR, # equivalent to "0700" in Unix chmod. + ) # The exporter expects this directory to already exist. os.makedirs(os.path.join(self.working_dir, 'course-data')) diff --git a/edx/analytics/tasks/tests/acceptance/test_database_import.py b/edx/analytics/tasks/tests/acceptance/test_database_import.py index e9197008a0..7639e9eef6 100644 --- a/edx/analytics/tasks/tests/acceptance/test_database_import.py +++ b/edx/analytics/tasks/tests/acceptance/test_database_import.py @@ -50,7 +50,7 @@ def test_import_from_mysql(self): )) def map_null_to_hive_null(row): - return ['\\N' if x == 'NULL' else x for x in row] + return [r'\N' if x == 'NULL' else x for x in row] output_rows = [x.split('\t') for x in hive_output.splitlines() if '\t' in x] output_rows = map(map_null_to_hive_null, output_rows) @@ -59,16 +59,16 @@ def map_null_to_hive_null(row): expected_rows = [ [ - '1', '1', 'edX/Open_DemoX/edx_demo_course', '\\N', 'version-1', 'grading-policy-1', + '1', '1', 'edX/Open_DemoX/edx_demo_course', r'\N', 'version-1', 'grading-policy-1', '0.7', 'C', '2017-01-31 00:05:00', '2017-02-01 00:00:00', '2017-02-01 00:00:00', ], [ - '2', '2', 'edX/Open_DemoX/edx_demo_course', '\\N', 'version-1', 'grading-policy-1', + '2', '2', 'edX/Open_DemoX/edx_demo_course', r'\N', 'version-1', 'grading-policy-1', '0.8', 'B', '2017-01-31 00:05:00', '2017-02-01 00:00:00', '2017-02-01 00:00:00', ], [ - '3', '3', 'edX/Open_DemoX/edx_demo_course', '\\N', 'version-1', 'grading-policy-1', - '0.2', 'Fail', '\\N', '2017-02-01 00:00:00', '2017-02-01 00:00:00', + '3', '3', 'edX/Open_DemoX/edx_demo_course', r'\N', 'version-1', 'grading-policy-1', + '0.2', 'Fail', r'\N', '2017-02-01 00:00:00', '2017-02-01 00:00:00', ], ] diff --git a/edx/analytics/tasks/tests/acceptance/test_enrollment_validation.py b/edx/analytics/tasks/tests/acceptance/test_enrollment_validation.py index 4cdcfceb16..b9d04379a6 100644 --- a/edx/analytics/tasks/tests/acceptance/test_enrollment_validation.py +++ b/edx/analytics/tasks/tests/acceptance/test_enrollment_validation.py @@ -4,7 +4,7 @@ import gzip import json import logging -import StringIO +from io import StringIO from collections import defaultdict from edx.analytics.tasks.tests.acceptance import AcceptanceTestCase, as_list_param @@ -108,7 +108,7 @@ def check_synthetic_events(self, output_dir): histogram = defaultdict(int) # int() returns 0 for output in outputs: # Read S3 file into a buffer, since the S3 file doesn't support seek() and tell(). 
- gzip_output = StringIO.StringIO() + gzip_output = StringIO() with output.open('r') as event_file: gzip_output.write(event_file.read()) gzip_output.seek(0) diff --git a/edx/analytics/tasks/tests/acceptance/test_enrollments.py b/edx/analytics/tasks/tests/acceptance/test_enrollments.py index b053f7299f..dfc6414273 100644 --- a/edx/analytics/tasks/tests/acceptance/test_enrollments.py +++ b/edx/analytics/tasks/tests/acceptance/test_enrollments.py @@ -5,7 +5,7 @@ import datetime import logging import os -from cStringIO import StringIO +from io import StringIO import pandas from ddt import data, ddt diff --git a/edx/analytics/tasks/tools/obfuscate_eval.py b/edx/analytics/tasks/tools/obfuscate_eval.py index 513c3d8c19..d31a3cdb1d 100644 --- a/edx/analytics/tasks/tools/obfuscate_eval.py +++ b/edx/analytics/tasks/tools/obfuscate_eval.py @@ -14,6 +14,7 @@ read from the same directory as the database dump being analyzed. """ +from __future__ import print_function import argparse import errno @@ -24,11 +25,11 @@ import os import sys from collections import defaultdict, namedtuple -from cStringIO import StringIO +from io import StringIO -import cjson from pyinstrument import Profiler +from edx.analytics.tasks.util.fast_json import FastJson from edx.analytics.tasks.common.pathutil import PathSetTask from edx.analytics.tasks.util import eventlog from edx.analytics.tasks.util.obfuscate_util import Obfuscator, backslash_decode_value, backslash_encode_value @@ -378,12 +379,12 @@ def obfuscate_event_entry(self, line): log.info(u"Obfuscated %s event with event_type = '%s'", event_source, event_type) if event_json_decoded: - # TODO: should really use cjson, if that were originally used for decoding the json. + # TODO: should really use FastJson, if that were originally used for decoding the json. updated_event_data = json.dumps(updated_event_data) event['event'] = updated_event_data - # TODO: should really use cjson, if that were originally used for decoding the json. + # TODO: should really use FastJson, if that were originally used for decoding the json. return json.dumps(event) def obfuscate_courseware_file(self, input_filepath, output_dir): @@ -440,7 +441,7 @@ def obfuscate_courseware_entry(self, line, user_profile): # is not escaped in the same way. In particular, we will not decode and encode it. state_str = record.state.replace('\\\\', '\\') try: - state_dict = cjson.decode(state_str, all_unicode=True) + state_dict = FastJson.loads(state_str) except Exception as exc: log.exception(u"Unable to parse state as JSON for record %s: type = %s, state = %r", record.id, type(state_str), state_str) return line @@ -539,7 +540,7 @@ def obfuscate_forum_entry(self, line, user_profile): # are also different, as to when \u notation is used for a character as # opposed to a utf8 encoding of the character. 
try: - entry = cjson.decode(line, all_unicode=True) + entry = FastJson.loads(line) except ValueError as exc: log.error("Failed to parse json for line: %r", line) return "" @@ -698,7 +699,7 @@ def main(): finally: if profiler: profiler.stop() - print >>sys.stderr, profiler.output_text(unicode=True, color=True) + print(profiler.output_text(unicode=True, color=True), file=sys.stderr) if __name__ == '__main__': diff --git a/edx/analytics/tasks/tools/s3util.py b/edx/analytics/tasks/tools/s3util.py index 3fc7286818..6452bc0bde 100644 --- a/edx/analytics/tasks/tools/s3util.py +++ b/edx/analytics/tasks/tools/s3util.py @@ -1,4 +1,5 @@ """Command-line utility for using (and testing) s3 utility methods.""" +from __future__ import print_function import argparse import os @@ -14,7 +15,7 @@ def list_s3_files(source_url, patterns): for bucket, root, path in generate_s3_sources(s3_conn, source_url, patterns): source = join_as_s3_url(bucket, root, path) src_key = get_s3_key(s3_conn, source) - print "%10d %s" % (src_key.size if src_key is not None else -1, path) + print("%10d %s" % (src_key.size if src_key is not None else -1, path)) def get_s3_files(source_url, dest_root, patterns): @@ -28,7 +29,7 @@ def get_s3_files(source_url, dest_root, patterns): if src_key is not None: src_key.get_contents_to_filename(destination) else: - print "No key for source " + source + print("No key for source " + source) def main(): diff --git a/edx/analytics/tasks/util/csv_util.py b/edx/analytics/tasks/util/csv_util.py index 408082b0eb..539106eb84 100644 --- a/edx/analytics/tasks/util/csv_util.py +++ b/edx/analytics/tasks/util/csv_util.py @@ -1,9 +1,11 @@ """ Simple CSV utilities. """ +from __future__ import absolute_import import csv -from StringIO import StringIO +from io import BytesIO +import six class MySQLDumpDialect(csv.Dialect): @@ -47,21 +49,21 @@ class MySQLExportDialect(MySQLPipeDialect): 'mysqlexport': MySQLExportDialect } -for dialect_name, dialect_class in DIALECTS.iteritems(): +for dialect_name, dialect_class in six.iteritems(DIALECTS): csv.register_dialect(dialect_name, dialect_class) def parse_line(line, dialect='excel'): """Parse one line of CSV in the dialect specified.""" # csv.reader requires an iterable per row, so we wrap the line in a list - parsed = csv.reader([line], dialect=dialect).next() + parsed = next(csv.reader([line], dialect=dialect)) return parsed def to_csv_line(row, dialect='excel'): """Return a CSV line by joining the values in row in the dialect specified.""" - output = StringIO() + output = BytesIO() csv.writer(output, dialect=dialect).writerow(row) output.seek(0) diff --git a/edx/analytics/tasks/util/eventlog.py b/edx/analytics/tasks/util/eventlog.py index 90b24301b8..29b18272b6 100644 --- a/edx/analytics/tasks/util/eventlog.py +++ b/edx/analytics/tasks/util/eventlog.py @@ -4,9 +4,8 @@ import logging import re -import cjson - import edx.analytics.tasks.util.opaque_key_util as opaque_key_util +from edx.analytics.tasks.util.fast_json import FastJson log = logging.getLogger(__name__) @@ -15,14 +14,14 @@ def decode_json(line): """Wrapper to decode JSON string in an implementation-independent way.""" - # TODO: Verify correctness of cjson - return cjson.decode(line) + # TODO: Verify correctness of FastJson + return FastJson.loads(line) def encode_json(obj): """Wrapper to re-encode JSON string in an implementation-independent way.""" - # TODO: Verify correctness of cjson - return cjson.encode(obj) + # TODO: Verify correctness of FastJson + return FastJson.dumps(obj) def parse_json_event(line, 
nested=False): diff --git a/edx/analytics/tasks/util/fast_json.py b/edx/analytics/tasks/util/fast_json.py new file mode 100644 index 0000000000..7f91f4b112 --- /dev/null +++ b/edx/analytics/tasks/util/fast_json.py @@ -0,0 +1,36 @@ +""" +Provide an abstraction layer for fast json implementations across python 2 and 3. +""" +try: + import ujson + ujson_found = True +except ImportError: + import cjson + ujson_found = False + +class FastJson(object): + """ + Abstraction layer on top of cjson (python 2 only) and ujson (python 3 only). + """ + @staticmethod + def dumps(obj): + """ + Dump/encode the Python object into a JSON message. + """ + if ujson_found: + return ujson.dumps(obj) + else: + return cjson.encode(obj) + + @staticmethod + def loads(msg): + """ + Load/decode the JSON message and return a Python object. + + All strings in the decoded object will be unicode strings! This + matches the behavior of python's built-in json library. + """ + if ujson_found: + return ujson.loads(msg) + else: + return cjson.decode(msg, all_unicode=True) diff --git a/edx/analytics/tasks/util/hive.py b/edx/analytics/tasks/util/hive.py index cbe434ac7d..3c15bb6660 100644 --- a/edx/analytics/tasks/util/hive.py +++ b/edx/analytics/tasks/util/hive.py @@ -1,4 +1,5 @@ """Various helper utilities that are commonly used when working with Hive""" +from __future__ import absolute_import import logging import textwrap diff --git a/edx/analytics/tasks/util/id_codec.py b/edx/analytics/tasks/util/id_codec.py index 79ba02233c..fca48da4e6 100644 --- a/edx/analytics/tasks/util/id_codec.py +++ b/edx/analytics/tasks/util/id_codec.py @@ -1,10 +1,13 @@ """Various helper utilities to calculate reversible one-to-one mappings of sensitive ids.""" +from __future__ import absolute_import import base64 import logging import random import luigi +from six.moves import map +from six.moves import range try: import numpy as np @@ -15,13 +18,17 @@ def encode_id(scope, id_type, id_value): - """Encode a scope-type-value tuple into a single ID string.""" - return base64.b32encode('|'.join([scope, id_type, id_value])) + """ + Encode a scope-type-value tuple into a single ID string. + + All inputs must be bytestrings (or `str` in python 2). + """ + return base64.b32encode(b'|'.join([scope, id_type, id_value])) def decode_id(encoded_id): """Decode an ID string back to the original scope-type-value tuple.""" - scope, id_type, id_value = base64.b32decode(encoded_id).split('|') + scope, id_type, id_value = base64.b32decode(encoded_id).split(b'|') return scope, id_type, id_value @@ -49,7 +56,7 @@ def random_permutation_matrix(self, seed, matrix_dim): """Return a random permutation matrix of dimension matrix_dim using seed.""" rng = random.Random(seed) # Decide where each bit goes. - mapping = range(matrix_dim) + mapping = list(range(matrix_dim)) rng.shuffle(mapping) # Then make a matrix that does that. permutation = np.zeros((matrix_dim, matrix_dim), dtype=int) diff --git a/edx/analytics/tasks/util/obfuscate_util.py b/edx/analytics/tasks/util/obfuscate_util.py index 0d6e7f1942..03a369add9 100644 --- a/edx/analytics/tasks/util/obfuscate_util.py +++ b/edx/analytics/tasks/util/obfuscate_util.py @@ -434,7 +434,7 @@ def find_name_context(text, log_context=DEFAULT_LOG_CONTEXT): # Find phone numbers. 
PHONE_CONTEXT = re.compile( - r'(\bphone:|\bp:|b\c:|\bcall me\b|\(home\)|\(cell\)|my phone|phone number)', + r'(\bphone:|\bp:|\bc:|\bcall me\b|\(home\)|\(cell\)|my phone|phone number)', re.IGNORECASE, ) diff --git a/edx/analytics/tasks/util/opaque_key_util.py b/edx/analytics/tasks/util/opaque_key_util.py index 7904384017..fccc45ef1d 100644 --- a/edx/analytics/tasks/util/opaque_key_util.py +++ b/edx/analytics/tasks/util/opaque_key_util.py @@ -1,4 +1,5 @@ """Utility functions that wrap opaque_keys in useful ways.""" +from __future__ import absolute_import import logging import re @@ -6,6 +7,7 @@ from opaque_keys import InvalidKeyError from opaque_keys.edx.keys import CourseKey from opaque_keys.edx.locator import CourseLocator +import six log = logging.getLogger(__name__) @@ -84,7 +86,7 @@ def get_filename_safe_course_id(course_id, replacement_char='_'): # The safest characters are A-Z, a-z, 0-9, , and . # We represent the first four with \w. # TODO: Once we support courses with unicode characters, we will need to revisit this. - return re.sub(r'[^\w\.\-]', unicode(replacement_char), filename) + return re.sub(r'[^\w\.\-]', six.text_type(replacement_char), filename) def get_course_key_from_url(url): diff --git a/edx/analytics/tasks/util/overwrite.py b/edx/analytics/tasks/util/overwrite.py index 68a264a9a1..c336b974fb 100644 --- a/edx/analytics/tasks/util/overwrite.py +++ b/edx/analytics/tasks/util/overwrite.py @@ -1,6 +1,8 @@ """ Provide support for overwriting existing output files. """ +from __future__ import absolute_import + import logging import luigi diff --git a/edx/analytics/tasks/util/record.py b/edx/analytics/tasks/util/record.py index 7250a23864..e16b5d92d7 100644 --- a/edx/analytics/tasks/util/record.py +++ b/edx/analytics/tasks/util/record.py @@ -18,7 +18,7 @@ bigquery_available = False # pylint: disable=invalid-name -DEFAULT_NULL_VALUE = '\\N' # This is the default string used by Hive to represent a NULL value. +DEFAULT_NULL_VALUE = r'\N' # This is the default string used by Hive to represent a NULL value. log = logging.getLogger(__name__) diff --git a/edx/analytics/tasks/util/retry.py b/edx/analytics/tasks/util/retry.py index d66c8c16a4..62c21e8a0d 100644 --- a/edx/analytics/tasks/util/retry.py +++ b/edx/analytics/tasks/util/retry.py @@ -1,4 +1,5 @@ """Utility decorator for retrying functions that fail.""" +from __future__ import absolute_import import logging import time diff --git a/edx/analytics/tasks/util/s3_util.py b/edx/analytics/tasks/util/s3_util.py index 280820a269..7814b0ca35 100644 --- a/edx/analytics/tasks/util/s3_util.py +++ b/edx/analytics/tasks/util/s3_util.py @@ -1,11 +1,14 @@ """ Utility methods for interacting with S3 via boto. 
""" +from future.standard_library import install_aliases +install_aliases() + import logging import os import time from fnmatch import fnmatch -from urlparse import urlparse, urlunparse +from urllib.parse import urlparse, urlunparse from luigi.contrib.hdfs.format import Plain from luigi.contrib.hdfs.target import HdfsTarget diff --git a/edx/analytics/tasks/util/tests/config.py b/edx/analytics/tasks/util/tests/config.py index c7eb87d2c1..6f1885d6fa 100644 --- a/edx/analytics/tasks/util/tests/config.py +++ b/edx/analytics/tasks/util/tests/config.py @@ -1,8 +1,10 @@ """Support modifying luigi configuration settings in tests.""" +from __future__ import absolute_import from functools import wraps from luigi.configuration import LuigiConfigParser +import six def with_luigi_config(*decorator_args): @@ -65,7 +67,7 @@ def modify_config(section, option, value): new_instance.set(section, option, str(value)) # Support the single override case: @with_luigi_config('section', 'option', 'value') - if isinstance(decorator_args[0], basestring): + if isinstance(decorator_args[0], six.string_types): section, option, value = decorator_args modify_config(section, option, value) else: diff --git a/edx/analytics/tasks/util/tests/opaque_key_mixins.py b/edx/analytics/tasks/util/tests/opaque_key_mixins.py index be874e34d7..be70dafe8b 100644 --- a/edx/analytics/tasks/util/tests/opaque_key_mixins.py +++ b/edx/analytics/tasks/util/tests/opaque_key_mixins.py @@ -1,6 +1,8 @@ """Mixin classes for providing opaque or legacy key values.""" +from __future__ import absolute_import from opaque_keys.edx.locator import CourseLocator +import six class InitializeOpaqueKeysMixin(object): @@ -9,13 +11,13 @@ class InitializeOpaqueKeysMixin(object): def initialize_ids(self): """Define set of id values for use in tests.""" course_key = CourseLocator(org=u'FooX\u00e9', course='1.23x', run='2013_Spring') - self.course_id = unicode(course_key) + self.course_id = six.text_type(course_key) self.encoded_course_id = self.course_id.encode('utf8') self.org_id = course_key.org self.encoded_org_id = self.org_id.encode('utf8') block_id = "9cee77a606ea4c1aa5440e0ea5d0f618" - self.problem_id = unicode(course_key.make_usage_key("problem", block_id)) + self.problem_id = six.text_type(course_key.make_usage_key("problem", block_id)) self.encoded_problem_id = self.problem_id.encode('utf8') self.answer_id = "{block_id}_2_1".format(block_id=block_id) self.second_answer_id = "{block_id}_3_1".format(block_id=block_id) diff --git a/edx/analytics/tasks/util/tests/target.py b/edx/analytics/tasks/util/tests/target.py index 5f6dfd4ca6..e237cd7826 100644 --- a/edx/analytics/tasks/util/tests/target.py +++ b/edx/analytics/tasks/util/tests/target.py @@ -1,14 +1,18 @@ """ Emulates a luigi target, storing all data in memory. """ +from __future__ import absolute_import from contextlib import contextmanager -from StringIO import StringIO +from io import BytesIO +import six class FakeTarget(object): - """Fake Luigi-like target that saves data in memory, using a StringIO buffer.""" - def __init__(self, path=None, value=''): + """ + Fake Luigi-like target that saves data in memory, using a BytesIO buffer. 
+ """ + def __init__(self, path=None, value=b''): self.value = value self.path = path @@ -18,7 +22,7 @@ def value(self): @value.setter def value(self, value): - self.buffer = StringIO(value) + self.buffer = BytesIO(value) # Rewind the buffer head so the value can be read self.buffer.seek(0) diff --git a/edx/analytics/tasks/util/tests/test_hive.py b/edx/analytics/tasks/util/tests/test_hive.py index 081f7c861a..0ac67ec2d7 100644 --- a/edx/analytics/tasks/util/tests/test_hive.py +++ b/edx/analytics/tasks/util/tests/test_hive.py @@ -1,4 +1,5 @@ """Tests for some hive related utilities""" +from __future__ import absolute_import from datetime import date from unittest import TestCase diff --git a/edx/analytics/tasks/util/tests/test_id_codec.py b/edx/analytics/tasks/util/tests/test_id_codec.py index 1fa942c5de..9155ffce95 100644 --- a/edx/analytics/tasks/util/tests/test_id_codec.py +++ b/edx/analytics/tasks/util/tests/test_id_codec.py @@ -1,15 +1,17 @@ """ Tests for encoding/decoding id values. """ +from __future__ import absolute_import + from unittest import TestCase from ddt import data, ddt import edx.analytics.tasks.util.id_codec as id_codec -SCOPE = "Arbitrary Scope" -TYPE = "Arbitrary Type" -VALUE = "Arbitrary Value" +SCOPE = b"Arbitrary Scope" +TYPE = b"Arbitrary Type" +VALUE = b"Arbitrary Value" @ddt @@ -17,15 +19,15 @@ class EncodeDecodeIdTest(TestCase): """Test that encoding works in round-trip.""" @data( - '', + b'', + b'test', u'\ufffd'.encode('utf8'), u'\u00e9'.encode('utf8'), - u'test', ) def test_round_trip(self, suffix): - encoded_id = id_codec.encode_id(SCOPE + suffix, TYPE + suffix, VALUE + suffix) - decoded = id_codec.decode_id(encoded_id) - self.assertEquals((SCOPE + suffix, TYPE + suffix, VALUE + suffix), decoded) + input_id = (SCOPE + suffix, TYPE + suffix, VALUE + suffix) + decoded_id = id_codec.decode_id(id_codec.encode_id(*input_id)) + self.assertEquals(input_id, decoded_id) class PermutationGeneratorTest(TestCase): diff --git a/edx/analytics/tasks/util/tests/test_opaque_key_util.py b/edx/analytics/tasks/util/tests/test_opaque_key_util.py index 2bc25fa5f5..862b69c834 100644 --- a/edx/analytics/tasks/util/tests/test_opaque_key_util.py +++ b/edx/analytics/tasks/util/tests/test_opaque_key_util.py @@ -1,6 +1,8 @@ """ Tests for utilities that parse event logs. 
""" +from __future__ import absolute_import + from unittest import TestCase from ccx_keys.locator import CCXLocator @@ -8,14 +10,15 @@ from opaque_keys.edx.locator import CourseLocator import edx.analytics.tasks.util.opaque_key_util as opaque_key_util +import six -VALID_COURSE_ID = unicode(CourseLocator(org='org', course='course_id', run='course_run')) +VALID_COURSE_ID = six.text_type(CourseLocator(org='org', course='course_id', run='course_run')) VALID_LEGACY_COURSE_ID = "org/course_id/course_run" INVALID_LEGACY_COURSE_ID = "org:course_id:course_run" INVALID_NONASCII_LEGACY_COURSE_ID = u"org/course\ufffd_id/course_run" VALID_NONASCII_LEGACY_COURSE_ID = u"org/cours\u00e9_id/course_run" -VALID_CCX_COURSE_ID = unicode(CCXLocator(org='org', course='course_id', run='course_run', ccx='13')) -COURSE_ID_WITH_COLONS = unicode(CourseLocator(org='org', course='course:id', run='course:run')) +VALID_CCX_COURSE_ID = six.text_type(CCXLocator(org='org', course='course_id', run='course_run', ccx='13')) +COURSE_ID_WITH_COLONS = six.text_type(CourseLocator(org='org', course='course:id', run='course:run')) @ddt @@ -97,7 +100,7 @@ def test_get_filename_with_default_separator(self, course_id, expected_filename, def test_get_course_key_from_url(self, course_id): url = u"https://courses.edx.org/courses/{course_id}/stuff".format(course_id=course_id) course_key = opaque_key_util.get_course_key_from_url(url) - self.assertEquals(unicode(course_key), course_id) + self.assertEquals(six.text_type(course_key), course_id) @data( INVALID_LEGACY_COURSE_ID, diff --git a/edx/analytics/tasks/util/tests/test_record.py b/edx/analytics/tasks/util/tests/test_record.py index 1daaff573a..cbf89408d1 100644 --- a/edx/analytics/tasks/util/tests/test_record.py +++ b/edx/analytics/tasks/util/tests/test_record.py @@ -134,7 +134,7 @@ def test_to_string_tuple_nulls(self): test_record = SampleStruct(None, 0, None) self.assertEqual( test_record.to_string_tuple(), - ('\\N', '0', '\\N') + (r'\N', '0', r'\N') ) def test_to_string_tuple_custom_nulls(self): @@ -152,7 +152,7 @@ def test_from_string_tuple(self): self.assertEqual(test_record.date, datetime.date(2015, 11, 1)) def test_from_string_tuple_nulls(self): - string_tuple = ('\\N', '0', '2015-11-01') + string_tuple = (r'\N', '0', '2015-11-01') test_record = SampleStruct.from_string_tuple(string_tuple) self.assertEqual(test_record.name, None) self.assertEqual(test_record.index, 0) diff --git a/edx/analytics/tasks/util/tests/test_retry.py b/edx/analytics/tasks/util/tests/test_retry.py index df28799dbf..198d320718 100644 --- a/edx/analytics/tasks/util/tests/test_retry.py +++ b/edx/analytics/tasks/util/tests/test_retry.py @@ -1,5 +1,8 @@ """Test the retry decorator""" +from __future__ import absolute_import + +import six from datetime import datetime, timedelta from unittest import TestCase @@ -73,7 +76,7 @@ def some_func(): some_func() self.assertEqual(self.func_call_counter, 4) - self.assertItemsEqual(self.mock_sleep.mock_calls, [call(0.5), call(1), call(2)]) + six.assertCountEqual(self, self.mock_sleep.mock_calls, [call(0.5), call(1), call(2)]) def test_different_base_delay(self): @@ -85,7 +88,7 @@ def some_func(): raise Exception('error') some_func() - self.assertItemsEqual(self.mock_sleep.mock_calls, [call(1), call(2), call(4), call(8)]) + six.assertCountEqual(self, self.mock_sleep.mock_calls, [call(1), call(2), call(4), call(8)]) def test_fatal_exception(self): diff --git a/edx/analytics/tasks/util/tests/test_s3_util.py b/edx/analytics/tasks/util/tests/test_s3_util.py index 
68e4c39120..c6504ca656 100644 --- a/edx/analytics/tasks/util/tests/test_s3_util.py +++ b/edx/analytics/tasks/util/tests/test_s3_util.py @@ -1,10 +1,10 @@ """ Tests for S3-related utility functionality. """ -from __future__ import print_function +from __future__ import absolute_import, print_function +import six from unittest import TestCase - from ddt import data, ddt, unpack from mock import MagicMock @@ -33,7 +33,7 @@ def _make_s3_generator(self, bucket_name, root, path_info, patterns): s3_bucket = MagicMock() s3_conn.get_bucket = MagicMock(return_value=s3_bucket) target_list = [self._make_key("{root}/{path}".format(root=root, path=path), size) - for path, size in path_info.iteritems()] + for path, size in six.iteritems(path_info)] s3_bucket.list = MagicMock(return_value=target_list) print([(k.key, k.size) for k in target_list]) diff --git a/edx/analytics/tasks/util/tests/test_url.py b/edx/analytics/tasks/util/tests/test_url.py index e036dcd17b..eca1298e87 100644 --- a/edx/analytics/tasks/util/tests/test_url.py +++ b/edx/analytics/tasks/util/tests/test_url.py @@ -1,4 +1,5 @@ """Tests for URL-related functionality.""" +from __future__ import absolute_import from unittest import TestCase diff --git a/edx/analytics/tasks/util/url.py b/edx/analytics/tasks/util/url.py index 1d70af0391..04ae6e8e3f 100644 --- a/edx/analytics/tasks/util/url.py +++ b/edx/analytics/tasks/util/url.py @@ -9,11 +9,14 @@ hdfs://some/directory/ """ from __future__ import absolute_import +from future.standard_library import install_aliases +install_aliases() import logging import os import time -import urlparse +from urllib.parse import urlparse, urlunparse +import six import luigi import luigi.configuration @@ -124,7 +127,7 @@ def open(self, mode='r'): def get_target_class_from_url(url, marker=False): """Returns a luigi target class based on the url scheme""" - parsed_url = urlparse.urlparse(url) + parsed_url = urlparse(url) if marker: target_class = URL_SCHEME_TO_MARKER_TARGET_CLASS.get(parsed_url.scheme, DEFAULT_MARKER_TARGET_CLASS) @@ -174,6 +177,13 @@ def url_path_join(url, *extra_path): Returns: The URL with the path component joined with `extra_path` argument. """ - (scheme, netloc, path, params, query, fragment) = urlparse.urlparse(url) + (scheme, netloc, path, params, query, fragment) = urlparse(url) joined_path = os.path.join(path, *extra_path) - return urlparse.urlunparse((scheme, netloc, joined_path, params, query, fragment)) + return urlunparse(( + six.text_type(scheme), + six.text_type(netloc), + six.text_type(joined_path), + six.text_type(params), + six.text_type(query), + six.text_type(fragment), + )) diff --git a/edx/analytics/tasks/warehouse/financial/affiliate_window.py b/edx/analytics/tasks/warehouse/financial/affiliate_window.py index 1af075c8ae..5bd6526f2a 100644 --- a/edx/analytics/tasks/warehouse/financial/affiliate_window.py +++ b/edx/analytics/tasks/warehouse/financial/affiliate_window.py @@ -1,6 +1,8 @@ """ Tasks to support pulling Affiliate Window reports from their REST API to the data warehouse. 
""" +from __future__ import absolute_import, print_function + import csv import datetime import json @@ -251,7 +253,7 @@ def run(self): json.dumps(row) ] - result = [col if col is not None else '\N' for col in result] + result = [col if col is not None else r'\N' for col in result] writer.writerow(result) def output(self): diff --git a/edx/analytics/tasks/warehouse/financial/cybersource.py b/edx/analytics/tasks/warehouse/financial/cybersource.py index 80d9eb5363..ae625a2acd 100644 --- a/edx/analytics/tasks/warehouse/financial/cybersource.py +++ b/edx/analytics/tasks/warehouse/financial/cybersource.py @@ -1,4 +1,5 @@ """Collect information about payments from third-party sources for financial reporting.""" +from __future__ import absolute_import import csv import datetime @@ -178,7 +179,7 @@ def run(self): row['currency'], row['amount'], # Transaction fee - '\\N', + r'\N', TRANSACTION_TYPE_MAP[row['transaction_type']], # We currently only process credit card transactions with Cybersource 'credit_card', diff --git a/edx/analytics/tasks/warehouse/financial/ed_services_financial_report.py b/edx/analytics/tasks/warehouse/financial/ed_services_financial_report.py index a14086e4fb..2e66ff74df 100644 --- a/edx/analytics/tasks/warehouse/financial/ed_services_financial_report.py +++ b/edx/analytics/tasks/warehouse/financial/ed_services_financial_report.py @@ -1,4 +1,6 @@ """Generates a financial report to be delivered to our good friends in Ed Services.""" +from __future__ import absolute_import + import luigi from edx.analytics.tasks.common.mapreduce import MapReduceJobTaskMixin diff --git a/edx/analytics/tasks/warehouse/financial/fees.py b/edx/analytics/tasks/warehouse/financial/fees.py index ab2530828d..61ea719f1d 100644 --- a/edx/analytics/tasks/warehouse/financial/fees.py +++ b/edx/analytics/tasks/warehouse/financial/fees.py @@ -1,8 +1,9 @@ """ Tasks associated with pulling and storing financial fees related data. 
""" -import logging +from __future__ import absolute_import +import logging import luigi from edx.analytics.tasks.common.vertica_load import VerticaCopyTask diff --git a/edx/analytics/tasks/warehouse/financial/finance_reports.py b/edx/analytics/tasks/warehouse/financial/finance_reports.py index ef8bb5b21e..6af1bb4a59 100644 --- a/edx/analytics/tasks/warehouse/financial/finance_reports.py +++ b/edx/analytics/tasks/warehouse/financial/finance_reports.py @@ -1,4 +1,6 @@ """Provide entry-point for generating finance reports.""" +from __future__ import absolute_import + import luigi from edx.analytics.tasks.common.mapreduce import MapReduceJobTaskMixin diff --git a/edx/analytics/tasks/warehouse/financial/orders_import.py b/edx/analytics/tasks/warehouse/financial/orders_import.py index 2f8ba06a19..c723525d05 100644 --- a/edx/analytics/tasks/warehouse/financial/orders_import.py +++ b/edx/analytics/tasks/warehouse/financial/orders_import.py @@ -1,4 +1,5 @@ """Import Orders: Shopping Cart Tables from the LMS, Orders from Otto.""" +from __future__ import absolute_import import luigi diff --git a/edx/analytics/tasks/warehouse/financial/payment.py b/edx/analytics/tasks/warehouse/financial/payment.py index 5f98c2a436..19f43debce 100644 --- a/edx/analytics/tasks/warehouse/financial/payment.py +++ b/edx/analytics/tasks/warehouse/financial/payment.py @@ -1,3 +1,4 @@ +from __future__ import absolute_import import luigi diff --git a/edx/analytics/tasks/warehouse/financial/paypal.py b/edx/analytics/tasks/warehouse/financial/paypal.py index 3d54fdf536..5531f39373 100644 --- a/edx/analytics/tasks/warehouse/financial/paypal.py +++ b/edx/analytics/tasks/warehouse/financial/paypal.py @@ -5,6 +5,7 @@ https://developer.paypal.com/docs/classic/payflow/reporting/ """ +from __future__ import absolute_import import datetime import logging @@ -12,7 +13,7 @@ import time import xml.etree.cElementTree as ET from collections import OrderedDict, namedtuple -from cStringIO import StringIO +from io import BytesIO from decimal import Decimal import luigi @@ -24,6 +25,8 @@ from edx.analytics.tasks.util.hive import WarehouseMixin from edx.analytics.tasks.util.overwrite import OverwriteOutputMixin from edx.analytics.tasks.util.url import ExternalURL, get_target_from_url, url_path_join +import six +from six.moves import range log = logging.getLogger(__name__) @@ -180,7 +183,10 @@ class PaypalMalformedResponseError(PaypalError): def __init__(self, message, root_node=None): with_tree = message if root_node: - with_tree = message + ':' + ET.tostring(root_node, encoding='UTF-8', method='xml') + with_tree = u'{}:{}'.format( + message, + ET.tostring(root_node, encoding='UTF-8', method='xml').decode('utf-8'), + ) super(PaypalMalformedResponseError, self).__init__(with_tree) @@ -216,11 +222,11 @@ def create_request_document(self): self.append_request_node(root_node) # NOTE: we have to use this API to get the XML declaration, it is suboptimal that we have to construct a - # StringIO buffer to write to. + # BytesIO buffer to write to. 
tree = ET.ElementTree(root_node) - string_buffer = StringIO() - tree.write(string_buffer, encoding='UTF-8', xml_declaration=True) - return string_buffer.getvalue() + bytes_buffer = BytesIO() + tree.write(bytes_buffer, encoding='UTF-8', xml_declaration=True) + return bytes_buffer.getvalue() def append_authentication_node(self, root_node): """Inject the authentication elements into the request.""" @@ -228,7 +234,7 @@ def append_authentication_node(self, root_node): for attribute in ('user', 'vendor', 'partner', 'password'): child_node = ET.SubElement(auth_node, attribute) - child_node.text = unicode(getattr(self, attribute)) + child_node.text = six.text_type(getattr(self, attribute)) def append_request_node(self, root_node): """Inject the request-specific elements into the request.""" @@ -339,17 +345,17 @@ def append_request_node(self, root_node): # WARNING: the paypal XML parser is position sensitive. Do NOT change the ordering of the fields in the request. request_node = ET.SubElement(root_node, 'runReportRequest') name_node = ET.SubElement(request_node, 'reportName') - name_node.text = unicode(self.report_name) + name_node.text = six.text_type(self.report_name) - for param_name, param_value in self.report_params.iteritems(): + for param_name, param_value in six.iteritems(self.report_params): param_node = ET.SubElement(request_node, 'reportParam') param_name_node = ET.SubElement(param_node, 'paramName') - param_name_node.text = unicode(param_name) + param_name_node.text = six.text_type(param_name) param_value_node = ET.SubElement(param_node, 'paramValue') - param_value_node.text = unicode(param_value) + param_value_node.text = six.text_type(param_value) page_size_node = ET.SubElement(request_node, 'pageSize') - page_size_node.text = unicode(self.page_size) + page_size_node.text = six.text_type(self.page_size) ColumnMetadata = namedtuple('ColumnMetadata', ('name', 'data_type')) # pylint: disable=invalid-name @@ -411,7 +417,7 @@ def __init__(self, report_id): def append_request_node(self, root_node): request_node = ET.SubElement(root_node, 'getMetaDataRequest') report_id_node = ET.SubElement(request_node, 'reportId') - report_id_node.text = unicode(self.report_id) + report_id_node.text = six.text_type(self.report_id) class PaypalReportDataResponse(PaypalApiResponse): @@ -465,9 +471,9 @@ def __init__(self, report_id, page_num=1): def append_request_node(self, root_node): request_node = ET.SubElement(root_node, 'getDataRequest') report_id_node = ET.SubElement(request_node, 'reportId') - report_id_node.text = unicode(self.report_id) + report_id_node.text = six.text_type(self.report_id) page_num_node = ET.SubElement(request_node, 'pageNum') - page_num_node.text = unicode(self.page_num) + page_num_node.text = six.text_type(self.page_num) class PaypalReportResultsRequest(PaypalApiRequest): @@ -487,7 +493,7 @@ def __init__(self, report_id): def append_request_node(self, root_node): request_node = ET.SubElement(root_node, 'getResultsRequest') report_id_node = ET.SubElement(request_node, 'reportId') - report_id_node.text = unicode(self.report_id) + report_id_node.text = six.text_type(self.report_id) BaseSettlementReportRecord = namedtuple('SettlementReportRecord', [ # pylint: disable=invalid-name @@ -655,7 +661,7 @@ def write_transaction_record(self, row, output_tsv_file): # identifier for the transaction payment_record.paypal_transaction_id, ] - output_tsv_file.write('\t'.join(record) + '\n') + output_tsv_file.write(b'\t'.join(field.encode('utf-8') for field in record) + b'\n') def output(self): # 
NOTE: both the cybersource and paypal tasks write to the payments folder diff --git a/edx/analytics/tasks/warehouse/financial/reconcile.py b/edx/analytics/tasks/warehouse/financial/reconcile.py index a62bf603e9..80a59678d8 100644 --- a/edx/analytics/tasks/warehouse/financial/reconcile.py +++ b/edx/analytics/tasks/warehouse/financial/reconcile.py @@ -1,4 +1,5 @@ """Perform reconciliation of transaction history against order history""" +from __future__ import absolute_import import csv import json @@ -169,7 +170,7 @@ def mapper(self, line): key_index = ORDERITEM_FIELDS.index('payment_ref_id') key = fields[key_index] - # Convert Hive null values ('\\N') in fields like 'product_detail': + # Convert Hive null values ("\N") in fields like 'product_detail': defaults = ( ('product_detail', ''), ('refunded_amount', '0.0'), @@ -184,7 +185,7 @@ def mapper(self, line): ) for field_name, default_value in defaults: index = ORDERITEM_FIELD_INDICES[field_name] - if fields[index] == '\\N': + if fields[index] == r'\N': fields[index] = default_value elif len(fields) == len(TRANSACTION_FIELDS): @@ -192,7 +193,7 @@ def mapper(self, line): record_type = TransactionRecord.__name__ key = fields[3] # payment_ref_id # Convert nulls in 'transaction_fee'. - if fields[6] == '\\N': + if fields[6] == r'\N': fields[6] = None # Edx-only: if the transaction was within a time period when @@ -677,12 +678,20 @@ def format_transaction_table_output(self, audit_code, transaction, orderitem, tr orderitem.partner_short_code if orderitem else self.default_partner_short_code, orderitem.payment_ref_id if orderitem else transaction.payment_ref_id, orderitem.order_id if orderitem else None, - encode_id(orderitem.order_processor, "order_id", orderitem.order_id) if orderitem else None, + encode_id( + orderitem.order_processor.encode('utf-8'), + b"order_id", + orderitem.order_id.encode('utf-8') + ) if orderitem else None, orderitem.date_placed if orderitem else None, # transaction information transaction.date if transaction else None, transaction.transaction_id if transaction else None, - encode_id(transaction.payment_gateway_id, "transaction_id", transaction.transaction_id) if transaction else None, + encode_id( + transaction.payment_gateway_id.encode('utf-8'), + b"transaction_id", + transaction.transaction_id.encode('utf-8') + ) if transaction else None, transaction.payment_gateway_id if transaction else None, transaction.payment_gateway_account_id if transaction else None, transaction.transaction_type if transaction else None, @@ -695,7 +704,11 @@ def format_transaction_table_output(self, audit_code, transaction, orderitem, tr str(transaction_fee_per_item) if transaction_fee_per_item is not None else None, # orderitem information orderitem.line_item_id if orderitem else None, - encode_id(orderitem.order_processor, "line_item_id", orderitem.line_item_id) if orderitem else None, + encode_id( + orderitem.order_processor.encode('utf-8'), + b"line_item_id", + orderitem.line_item_id.encode('utf-8') + ) if orderitem else None, orderitem.line_item_product_id if orderitem else None, orderitem.line_item_price if orderitem else None, orderitem.line_item_unit_price if orderitem else None, @@ -771,13 +784,13 @@ class OrderTransactionRecord(OrderTransactionRecordBase): def to_tsv(self): """Serializes the record to a TSV-formatted string.""" - return '\t'.join([str(v) if v is not None else "\\N" for v in self]) + return '\t'.join([str(v) if v is not None else r'\N' for v in self]) @staticmethod def from_job_output(tsv_str): """Constructor that reads 
format generated by to_tsv().""" record = tsv_str.split('\t') - nulled_record = [v if v != "\\N" else None for v in record] + nulled_record = [v if v != r'\N' else None for v in record] return OrderTransactionRecord(*nulled_record) @@ -1121,13 +1134,13 @@ class FullOrderTransactionRecord(FullOrderTransactionRecordBase): def to_tsv(self): """Serializes the record to a TSV-formatted string.""" - return '\t'.join([str(v) if v is not None else "\\N" for v in self]) + return '\t'.join([str(v) if v is not None else r'\N' for v in self]) @staticmethod def from_job_output(tsv_str): """Constructor that reads format generated by to_tsv().""" record = tsv_str.split('\t') - nulled_record = [v if v != "\\N" else None for v in record] + nulled_record = [v if v != r'\N' else None for v in record] return FullOrderTransactionRecord(*nulled_record) @@ -1159,7 +1172,7 @@ def mapper(self, line): key_index = FULLORDERITEM_FIELDS.index('payment_ref_id') key = fields[key_index] - # Convert Hive null values ('\\N') in fields like 'product_detail': + # Convert Hive null values ("\N") in fields like 'product_detail': defaults = ( ('product_detail', ''), ('refunded_amount', '0.0'), @@ -1172,7 +1185,7 @@ def mapper(self, line): ) for field_name, default_value in defaults: index = FULLORDERITEM_FIELD_INDICES[field_name] - if fields[index] == '\\N': + if fields[index] == r'\N': fields[index] = default_value elif len(fields) == len(TRANSACTION_FIELDS): @@ -1180,7 +1193,7 @@ def mapper(self, line): record_type = TransactionRecord.__name__ key = fields[3] # payment_ref_id # Convert nulls in 'transaction_fee'. - if fields[6] == '\\N': + if fields[6] == r'\N': fields[6] = None # Edx-only: if the transaction was within a time period when @@ -1238,12 +1251,20 @@ def format_transaction_table_output(self, audit_code, transaction, orderitem, tr orderitem.partner_short_code if orderitem else self.default_partner_short_code, orderitem.payment_ref_id if orderitem else transaction.payment_ref_id, orderitem.order_id if orderitem else None, - encode_id(orderitem.order_processor, "order_id", orderitem.order_id) if orderitem else None, + encode_id( + orderitem.order_processor.encode('utf-8'), + b"order_id", + orderitem.order_id.encode('utf-8'), + ) if orderitem else None, orderitem.date_placed if orderitem else None, # transaction information transaction.date if transaction else None, transaction.transaction_id if transaction else None, - encode_id(transaction.payment_gateway_id, "transaction_id", transaction.transaction_id) if transaction else None, + encode_id( + transaction.payment_gateway_id.encode('utf-8'), + b"transaction_id", + transaction.transaction_id.encode('utf-8'), + ) if transaction else None, transaction.payment_gateway_id if transaction else None, transaction.payment_gateway_account_id if transaction else None, transaction.transaction_type if transaction else None, @@ -1256,7 +1277,11 @@ def format_transaction_table_output(self, audit_code, transaction, orderitem, tr str(transaction_fee_per_item) if transaction_fee_per_item is not None else None, # orderitem information orderitem.line_item_id if orderitem else None, - encode_id(orderitem.order_processor, "line_item_id", orderitem.line_item_id) if orderitem else None, + encode_id( + orderitem.order_processor.encode('utf-8'), + b"line_item_id", + orderitem.line_item_id.encode('utf-8'), + ) if orderitem else None, orderitem.line_item_product_id if orderitem else None, orderitem.line_item_price if orderitem else None, orderitem.line_item_unit_price if orderitem else None, 
diff --git a/edx/analytics/tasks/warehouse/financial/tests/test_paypal.py b/edx/analytics/tasks/warehouse/financial/tests/test_paypal.py index faed863124..d65244fe2f 100644 --- a/edx/analytics/tasks/warehouse/financial/tests/test_paypal.py +++ b/edx/analytics/tasks/warehouse/financial/tests/test_paypal.py @@ -1,7 +1,8 @@ +from __future__ import absolute_import import xml.etree.cElementTree as ET from collections import OrderedDict -from cStringIO import StringIO +from io import BytesIO from unittest import TestCase import httpretty @@ -16,6 +17,8 @@ PaypalReportMetadataRequest, PaypalReportRequest, PaypalReportResultsRequest, PaypalTimeoutError, PaypalTransactionsByDayTask, SettlementReportRecord ) +import six +from six.moves import zip TEST_URL = 'http://test.api/endpoint' @@ -30,9 +33,9 @@ def setUp(self): def on_post_return_xml(self): element_tree = ET.ElementTree(self.response_xml_root) - string_buffer = StringIO() - element_tree.write(string_buffer, encoding='UTF-8', xml_declaration=True) - response_xml_root_string = string_buffer.getvalue() + bytes_buffer = BytesIO() + element_tree.write(bytes_buffer, encoding='UTF-8', xml_declaration=True) + response_xml_root_string = bytes_buffer.getvalue() httpretty.register_uri(httpretty.POST, TEST_URL, response_xml_root_string) def remove_xml_node(self, path): @@ -42,7 +45,7 @@ def remove_xml_node(self, path): def set_xml_node_text(self, path, value): element = self.response_xml_root.findall(path)[0] - element.text = unicode(value) + element.text = six.text_type(value) def parse_request_xml(self): http_request = httpretty.last_request() @@ -755,9 +758,20 @@ def test_normal_run(self): self.task.run() - expected_record = ['2015-08-28', 'paypal', 'testing', 'EDX-123456', 'USD', '50.00', '1.40', 'sale', - 'instant_transfer', 'paypal', '1FW12345678901234'] - self.assertEquals(self.output_target.value.strip(), '\t'.join(expected_record)) + expected_record = [ + b'2015-08-28', + b'paypal', + b'testing', + b'EDX-123456', + b'USD', + b'50.00', + b'1.40', + b'sale', + b'instant_transfer', + b'paypal', + b'1FW12345678901234', + ] + self.assertEquals(self.output_target.value.strip(), b'\t'.join(expected_record)) @data( (4, 'Report has failed'), @@ -864,9 +878,20 @@ def test_delayed_report(self, mock_time): call(5) ]) - expected_record = ['2015-08-28', 'paypal', 'testing', 'EDX-123456', 'USD', '50.00', '1.40', 'sale', - 'instant_transfer', 'paypal', '1FW12345678901234'] - self.assertEquals(self.output_target.value.strip(), '\t'.join(expected_record)) + expected_record = [ + b'2015-08-28', + b'paypal', + b'testing', + b'EDX-123456', + b'USD', + b'50.00', + b'1.40', + b'sale', + b'instant_transfer', + b'paypal', + b'1FW12345678901234', + ] + self.assertEquals(self.output_target.value.strip(), b'\t'.join(expected_record)) @with_luigi_config('paypal', 'timeout', '1') @patch('edx.analytics.tasks.warehouse.financial.paypal.time') diff --git a/edx/analytics/tasks/warehouse/financial/tests/test_reconcile.py b/edx/analytics/tasks/warehouse/financial/tests/test_reconcile.py index e2f4f07faf..356c473527 100644 --- a/edx/analytics/tasks/warehouse/financial/tests/test_reconcile.py +++ b/edx/analytics/tasks/warehouse/financial/tests/test_reconcile.py @@ -1,4 +1,6 @@ """Tests for Order-transaction reconciliation and reporting.""" +from __future__ import absolute_import + import uuid from unittest import TestCase @@ -10,11 +12,13 @@ LOW_ORDER_ID_SHOPPINGCART_ORDERS, BaseOrderItemRecord, BaseTransactionRecord, OrderItemRecord, OrderTransactionRecord, 
ReconcileOrdersAndTransactionsTask, TransactionRecord ) +import six +from six.moves import zip TEST_DATE = '2015-06-01' TEST_LATER_DATE = '2015-06-10' DEFAULT_REF_ID = "EDX-12345" -HIVE_NULL = '\\N' +HIVE_NULL = r'\N' FIRST_ORDER_ITEM = '2345678' SECOND_ORDER_ITEM = '2345679' FIRST_TRANSACTION = '123423453456' @@ -206,7 +210,7 @@ def record_sort_key(record): # so that column names can be used instead of numbers. output_dict = record._asdict() # pylint: disable=no-member,protected-access expected_columns.update(**extra_values) - for column_num, expected_value in expected_columns.iteritems(): + for column_num, expected_value in six.iteritems(expected_columns): self.assertEquals(output_dict[column_num], expected_value) def test_no_transaction(self): diff --git a/edx/analytics/tasks/warehouse/lms_courseware_link_clicked.py b/edx/analytics/tasks/warehouse/lms_courseware_link_clicked.py index c945f51558..010f79623d 100644 --- a/edx/analytics/tasks/warehouse/lms_courseware_link_clicked.py +++ b/edx/analytics/tasks/warehouse/lms_courseware_link_clicked.py @@ -1,8 +1,11 @@ """ Tasks for collecting link click data per course, per day, and uploading that data to Vertica. """ +from future.standard_library import install_aliases +install_aliases() + import logging -from urlparse import urlparse +from urllib.parse import urlparse import luigi.task diff --git a/edx/analytics/tasks/warehouse/tests/test_course_subjects.py b/edx/analytics/tasks/warehouse/tests/test_course_subjects.py index 591b2ed516..af16b70247 100644 --- a/edx/analytics/tasks/warehouse/tests/test_course_subjects.py +++ b/edx/analytics/tasks/warehouse/tests/test_course_subjects.py @@ -110,8 +110,8 @@ def test_course_no_subjects(self): expected = { 'course_id': 'foo', 'date': '2015-06-25', - 'subject_uri': '\N', # pylint: disable-msg=anomalous-unicode-escape-in-string - 'subject_title': '\N', # pylint: disable-msg=anomalous-unicode-escape-in-string + 'subject_uri': r'\N', # pylint: disable-msg=anomalous-unicode-escape-in-string + 'subject_title': r'\N', # pylint: disable-msg=anomalous-unicode-escape-in-string 'subject_language': 'en', } self.assertTrue(self.check_subject_entry(data, 0, expected)) @@ -250,7 +250,7 @@ def test_catalog_missing_keys(self): 'course_id': 'bar', 'date': '2015-06-25', 'subject_uri': '/course/subject/testing', - 'subject_title': '\N', # pylint: disable-msg=anomalous-unicode-escape-in-string + 'subject_title': r'\N', # pylint: disable-msg=anomalous-unicode-escape-in-string 'subject_language': 'en' } # We expect only one row, a row for the course with a course_id. 
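A minimal sketch of the install_aliases() pattern adopted above in lms_courseware_link_clicked.py (and likewise in s3_util.py and url.py), assuming the `future` backport package pinned in the requirements files is installed; the S3 URL below is a hypothetical example, not taken from the codebase:

    # After install_aliases(), the Python 3 stdlib layout is importable on Python 2,
    # so the same `urllib.parse` import works under both interpreters.
    from future.standard_library import install_aliases
    install_aliases()

    from urllib.parse import urlparse, urlunparse

    parsed = urlparse('s3://example-bucket/path/to/file.tsv')  # hypothetical URL
    print(parsed.scheme, parsed.netloc, parsed.path)  # -> s3 example-bucket /path/to/file.tsv
    print(urlunparse(parsed))                          # -> s3://example-bucket/path/to/file.tsv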
diff --git a/requirements/base.in b/requirements/base.in index d542595403..9189146072 100644 --- a/requirements/base.in +++ b/requirements/base.in @@ -3,7 +3,7 @@ # Workaround for https://github.com/ansible/ansible/issues/8875 --no-binary ansible -ansible==1.4.5 # GPL v3 License +ansible<2.9.0 # GPL v3 License boto==2.48.0 # MIT ecdsa==0.13 # MIT Jinja2 # BSD diff --git a/requirements/base.txt b/requirements/base.txt index 10249852c5..e369131ed9 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -6,23 +6,18 @@ # --no-binary ansible -ansible==1.4.5 +ansible==2.8.2 asn1crypto==0.24.0 # via cryptography -bcrypt==3.1.6 # via paramiko boto==2.48.0 -cffi==1.12.3 # via bcrypt, cryptography, pynacl -cryptography==2.6.1 # via paramiko +cffi==1.12.3 # via cryptography +cryptography==2.7 # via ansible ecdsa==0.13 enum34==1.1.6 # via cryptography -httplib2==0.12.3 # via ansible ipaddress==1.0.22 # via cryptography jinja2==2.10.1 markupsafe==1.1.1 # via jinja2 -paramiko==2.4.2 # via ansible -pyasn1==0.4.5 # via paramiko pycparser==2.19 # via cffi pycrypto==2.6.1 -pynacl==1.3.0 # via paramiko -pyyaml==5.1 # via ansible +pyyaml==5.1.1 # via ansible six==1.10.0 wheel==0.30.0 diff --git a/requirements/default.in b/requirements/default.in index 9660a50b47..4823aee915 100644 --- a/requirements/default.in +++ b/requirements/default.in @@ -4,7 +4,7 @@ -r base.txt argparse==1.2.1 # Python Software Foundation License -boto3==1.4.8 # Apache 2.0 +boto3 # Apache 2.0 ciso8601==1.0.3 # MIT edx-ccx-keys==0.2.1 # AGPL edx-opaque-keys==0.4 # AGPL @@ -18,24 +18,26 @@ html5lib==1.0b3 # MIT isoweek==1.3.3 # BSD numpy==1.11.3 # BSD paypalrestsdk==1.9.0 # Paypal SDK License -psycopg2==2.6.2 # LGPL +psycopg2 # LGPL pygeoip==0.3.2 # LGPL -python-cjson==1.1.0 # LGPL +python-cjson; python_version <= "2.7" # LGPL +ujson; python_version > "2.7" # BSD python-dateutil==2.6.1 # BSD # There seems to be an issue(see: https://pagure.io/python-daemon/issue/18) with dependency installation on the # latest version of python-daemon(2.2.0), so we pin it to an earlier version. 
-python-daemon==2.1.2 +python-daemon python-gnupg==0.3.9 # BSD pytz==2017.3 # ZPL requests==2.18.4 # Apache 2.0 -six==1.10.0 # MIT +six # MIT stevedore==1.19.1 # Apache 2.0 ua-parser==0.3.6 # Apache urllib3==1.22 # MIT user-agents==0.3.2 # MIT vertica-python==0.6.11 # MIT yarn-api-client==0.2.3 # BSD -snowflake-connector-python==1.7.9 +snowflake-connector-python +futures; python_version == "2.7" -e git+https://github.com/edx/luigi.git@eb45bcc52243de11b2b16a81229ac584fe1e601b#egg=luigi # Apache License 2.0 diff --git a/requirements/default.txt b/requirements/default.txt index 2d0904fe77..1447b57b72 100644 --- a/requirements/default.txt +++ b/requirements/default.txt @@ -8,25 +8,24 @@ -e git+https://github.com/edx/luigi.git@eb45bcc52243de11b2b16a81229ac584fe1e601b#egg=luigi -e git+https://github.com/edx/pyinstrument.git@a35ff76df4c3d5ff9a2876d859303e33d895e78f#egg=pyinstrument -ansible==1.4.5 +ansible==2.8.2 argparse==1.2.1 asn1crypto==0.24.0 -azure-common==1.1.21 # via azure-storage-blob, azure-storage-common, snowflake-connector-python +azure-common==1.1.23 # via azure-storage-blob, azure-storage-common, snowflake-connector-python azure-nspkg==3.0.2 # via azure-common, azure-storage-nspkg azure-storage-blob==2.0.1 # via snowflake-connector-python azure-storage-common==2.0.0 # via azure-storage-blob azure-storage-nspkg==3.1.0 # via azure-storage-common backports-abc==0.5 # via tornado -bcrypt==3.1.6 -boto3==1.4.8 +boto3==1.9.195 boto==2.48.0 -botocore==1.8.50 # via boto3, s3transfer, snowflake-connector-python +botocore==1.12.195 # via boto3, s3transfer, snowflake-connector-python cachetools==3.1.1 # via google-auth -certifi==2019.3.9 # via requests, snowflake-connector-python, tornado +certifi==2019.6.16 # via requests, snowflake-connector-python, tornado cffi==1.12.3 chardet==3.0.4 # via requests ciso8601==1.0.3 -cryptography==2.6.1 +cryptography==2.7 distlib==0.2.2 docutils==0.14 # via botocore, python-daemon ecdsa==0.13 @@ -36,7 +35,7 @@ elasticsearch==1.7.0 enum34==1.1.6 filechunkio==1.8 future==0.17.1 # via snowflake-connector-python, vertica-python -futures==3.2.0 # via azure-storage-blob, google-cloud-core, s3transfer +futures==3.3.0 ; python_version == "2.7" google-api-python-client==1.7.7 google-auth-httplib2==0.0.3 # via google-api-python-client google-auth==1.6.3 # via google-api-python-client, google-auth-httplib2, google-cloud-bigquery, google-cloud-core @@ -47,9 +46,9 @@ googleapis-common-protos==1.6.0 # via google-cloud-core graphitesend==0.10.0 gspread==3.1.0 html5lib==1.0b3 -httplib2==0.12.3 +httplib2==0.13.0 # via google-api-python-client, google-auth-httplib2 idna==2.6 # via requests, snowflake-connector-python -ijson==2.3 # via snowflake-connector-python +ijson==2.4 # via snowflake-connector-python ipaddress==1.0.22 isoweek==1.3.3 jinja2==2.10.1 @@ -57,33 +56,31 @@ jmespath==0.9.4 # via boto3, botocore lockfile==0.12.2 # via python-daemon markupsafe==1.1.1 numpy==1.11.3 -paramiko==2.4.2 paypalrestsdk==1.9.0 -pbr==5.2.1 # via stevedore -protobuf==3.8.0 # via google-cloud-core, googleapis-common-protos -psycopg2==2.6.2 +pbr==5.4.1 # via stevedore +protobuf==3.9.0 # via google-cloud-core, googleapis-common-protos +psycopg2==2.8.3 pyasn1-modules==0.2.5 # via google-auth, snowflake-connector-python -pyasn1==0.4.5 +pyasn1==0.4.5 # via pyasn1-modules, rsa, snowflake-connector-python pycparser==2.19 pycrypto==2.6.1 pycryptodomex==3.8.2 # via snowflake-connector-python pygeoip==0.3.2 pyjwt==1.7.1 # via snowflake-connector-python pymongo==3.8.0 # via edx-opaque-keys 
-pynacl==1.3.0 pyopenssl==19.0.0 # via paypalrestsdk, snowflake-connector-python -python-cjson==1.1.0 -python-daemon==2.1.2 +python-cjson==1.2.1 ; python_version <= "2.7" +python-daemon==2.2.3 python-dateutil==2.6.1 python-gnupg==0.3.9 pytz==2017.3 -pyyaml==5.1 +pyyaml==5.1.1 requests==2.18.4 rsa==4.0 # via google-auth -s3transfer==0.1.13 # via boto3 +s3transfer==0.2.1 # via boto3 singledispatch==3.4.0.3 # via tornado six==1.10.0 -snowflake-connector-python==1.7.9 +snowflake-connector-python==1.8.5 stevedore==1.19.1 tornado==4.5.3 ua-parser==0.3.6 @@ -92,4 +89,7 @@ urllib3==1.22 user-agents==0.3.2 vertica-python==0.6.11 wheel==0.30.0 -yarn-api-client==0.2.3 \ No newline at end of file +yarn-api-client==0.2.3 + +# The following packages are considered to be unsafe in a requirements file: +# setuptools==41.0.1 # via google-cloud-core, protobuf, python-daemon diff --git a/requirements/docs.txt b/requirements/docs.txt index baca202202..9773d9cc4f 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -9,26 +9,25 @@ -e git+https://github.com/edx/luigi.git@eb45bcc52243de11b2b16a81229ac584fe1e601b#egg=luigi -e git+https://github.com/edx/pyinstrument.git@a35ff76df4c3d5ff9a2876d859303e33d895e78f#egg=pyinstrument alabaster==0.7.12 # via sphinx -ansible==1.4.5 +ansible==2.8.2 argparse==1.2.1 asn1crypto==0.24.0 -azure-common==1.1.18 +azure-common==1.1.23 azure-nspkg==3.0.2 -azure-storage-blob==1.5.0 -azure-storage-common==1.4.0 +azure-storage-blob==2.0.1 +azure-storage-common==2.0.0 azure-storage-nspkg==3.1.0 -babel==2.6.0 # via sphinx +babel==2.7.0 # via sphinx backports-abc==0.5 -bcrypt==3.1.6 -boto3==1.4.8 +boto3==1.9.195 boto==2.48.0 -botocore==1.8.50 -cachetools==3.1.0 -certifi==2019.3.9 -cffi==1.12.2 +botocore==1.12.195 +cachetools==3.1.1 +certifi==2019.6.16 +cffi==1.12.3 chardet==3.0.4 ciso8601==1.0.3 -cryptography==2.6.1 +cryptography==2.7 distlib==0.2.2 docutils==0.14 ecdsa==0.13 @@ -38,56 +37,55 @@ elasticsearch==1.7.0 enum34==1.1.6 filechunkio==1.8 future==0.17.1 -futures==3.2.0 +futures==3.3.0 ; python_version == "2.7" google-api-python-client==1.7.7 google-auth-httplib2==0.0.3 google-auth==1.6.3 google-cloud-bigquery==0.27.0 google-cloud-core==0.27.1 google-resumable-media==0.3.2 -googleapis-common-protos==1.5.9 +googleapis-common-protos==1.6.0 graphitesend==0.10.0 +gspread==3.1.0 html5lib==1.0b3 -httplib2==0.12.1 +httplib2==0.13.0 idna==2.6 -ijson==2.3 +ijson==2.4 imagesize==1.1.0 # via sphinx ipaddress==1.0.22 isoweek==1.3.3 -jinja2==2.8.1 +jinja2==2.10.1 jmespath==0.9.4 lockfile==0.12.2 markupsafe==1.1.1 numpy==1.11.3 -paramiko==2.4.2 paypalrestsdk==1.9.0 -pbr==5.1.3 -protobuf==3.7.1 -psycopg2==2.6.2 -pyasn1-modules==0.2.4 +pbr==5.4.1 +protobuf==3.9.0 +psycopg2==2.8.3 +pyasn1-modules==0.2.5 pyasn1==0.4.5 pycparser==2.19 pycrypto==2.6.1 -pycryptodomex==3.8.0 +pycryptodomex==3.8.2 pygeoip==0.3.2 -pygments==2.3.1 # via sphinx +pygments==2.4.2 # via sphinx pyjwt==1.7.1 -pymongo==3.7.2 -pynacl==1.3.0 +pymongo==3.8.0 pyopenssl==19.0.0 -python-cjson==1.1.0 -python-daemon==2.1.2 +python-cjson==1.2.1 ; python_version <= "2.7" +python-daemon==2.2.3 python-dateutil==2.6.1 python-gnupg==0.3.9 pytz==2017.3 -pyyaml==5.1 +pyyaml==5.1.1 requests==2.18.4 rsa==4.0 -s3transfer==0.1.13 +s3transfer==0.2.1 singledispatch==3.4.0.3 six==1.10.0 -snowballstemmer==1.2.1 # via sphinx -snowflake-connector-python==1.7.9 +snowballstemmer==1.9.0 # via sphinx +snowflake-connector-python==1.8.5 sphinx==1.5.1 stevedore==1.19.1 tornado==4.5.3 @@ -97,4 +95,7 @@ urllib3==1.22 user-agents==0.3.2 
vertica-python==0.6.11 wheel==0.30.0 -yarn-api-client==0.2.3 \ No newline at end of file +yarn-api-client==0.2.3 + +# The following packages are considered to be unsafe in a requirements file: +# setuptools==41.0.1 # via google-cloud-core, protobuf, python-daemon diff --git a/requirements/extra.txt b/requirements/extra.txt index 43f6804127..2ee387709f 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -1,2 +1,10 @@ # Generic URLs are not yet supported by pip-compile. http://cdn.mysql.com/Downloads/Connector-Python/mysql-connector-python-1.2.2.zip # GPL v2 with FOSS License Exception + +# When pip-compile is run under python 3, it omits all packages with a python 2 +# condition. Re-add them here, pre-pinned. +python-cjson==1.1.0 ; python_version <= "2.7" + +# When pip-compile is run under python 2, it omits all packages with a python 3 +# condition. Re-add them here, pre-pinned. +ujson==1.35 ; python_version > "2.7" diff --git a/requirements/pip-tools.txt b/requirements/pip-tools.txt index 547630ffa4..a767d7d594 100644 --- a/requirements/pip-tools.txt +++ b/requirements/pip-tools.txt @@ -5,5 +5,5 @@ # make upgrade # click==7.0 # via pip-tools -pip-tools==3.7.0 +pip-tools==3.9.0 six==1.10.0 diff --git a/requirements/pip.txt b/requirements/pip.txt index 42d92b4bbb..893fe72454 100644 --- a/requirements/pip.txt +++ b/requirements/pip.txt @@ -1,2 +1,2 @@ -pip==9.0.1 +pip==19.1.1 setuptools==36.4.0 diff --git a/requirements/test.in b/requirements/test.in index ff6b7b828b..b0c3fe8547 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -23,7 +23,7 @@ pandas==0.13.0 # Stuff for quality -isort==4.2.15 +isort pycodestyle==2.3.1 pylint==1.6.4 diff --git a/requirements/test.txt b/requirements/test.txt index 269583e50b..41349026e1 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -8,31 +8,30 @@ -e git+https://github.com/edx/luigi.git@eb45bcc52243de11b2b16a81229ac584fe1e601b#egg=luigi -e git+https://github.com/edx/pyinstrument.git@a35ff76df4c3d5ff9a2876d859303e33d895e78f#egg=pyinstrument -ansible==1.4.5 +ansible==2.8.2 argparse==1.2.1 asn1crypto==0.24.0 astroid==1.4.9 # via pylint -azure-common==1.1.18 +azure-common==1.1.23 azure-nspkg==3.0.2 -azure-storage-blob==1.5.0 -azure-storage-common==1.4.0 +azure-storage-blob==2.0.1 +azure-storage-common==2.0.0 azure-storage-nspkg==3.1.0 backports-abc==0.5 -backports.functools-lru-cache==1.5 # via pylint -bcrypt==3.1.6 -boto3==1.4.8 +backports.functools-lru-cache==1.5 # via isort, pylint +boto3==1.9.195 boto==2.48.0 -botocore==1.8.50 -cachetools==3.1.0 -certifi==2019.3.9 -cffi==1.12.2 +botocore==1.12.195 +cachetools==3.1.1 +certifi==2019.6.16 +cffi==1.12.3 chardet==3.0.4 ciso8601==1.0.3 configparser==3.7.4 # via pylint coverage==4.3.1 -cryptography==2.6.1 +cryptography==2.7 ddt==1.1.1 -diff-cover==1.0.7 +diff-cover==2.3.0 distlib==0.2.2 docutils==0.14 ecdsa==0.13 @@ -44,28 +43,29 @@ filechunkio==1.8 freezegun==0.3.9 funcsigs==1.0.2 # via mock future==0.17.1 -futures==3.2.0 +futures==3.3.0 ; python_version == "2.7" google-api-python-client==1.7.7 google-auth-httplib2==0.0.3 google-auth==1.6.3 google-cloud-bigquery==0.27.0 google-cloud-core==0.27.1 google-resumable-media==0.3.2 -googleapis-common-protos==1.5.9 +googleapis-common-protos==1.6.0 graphitesend==0.10.0 +gspread==3.1.0 html5lib==1.0b3 -httplib2==0.12.1 +httplib2==0.13.0 httpretty==0.8.14 idna==2.6 -ijson==2.3 +ijson==2.4 inflect==2.1.0 # via jinja2-pluralize ipaddress==1.0.22 -isort==4.2.15 +isort==4.3.21 isoweek==1.3.3 jinja2-pluralize==0.3.0 # via diff-cover 
-jinja2==2.8.1 +jinja2==2.10.1 jmespath==0.9.4 -lazy-object-proxy==1.3.1 # via astroid +lazy-object-proxy==1.4.1 # via astroid lockfile==0.12.2 markupsafe==1.1.1 mccabe==0.6.1 # via pylint @@ -74,36 +74,34 @@ nose-ignore-docstring==0.2 nose==1.3.7 numpy==1.11.3 pandas==0.13.0 -paramiko==2.4.2 paypalrestsdk==1.9.0 -pbr==5.1.3 -protobuf==3.7.1 -psycopg2==2.6.2 -pyasn1-modules==0.2.4 +pbr==5.4.1 +protobuf==3.9.0 +psycopg2==2.8.3 +pyasn1-modules==0.2.5 pyasn1==0.4.5 pycodestyle==2.3.1 pycparser==2.19 pycrypto==2.6.1 -pycryptodomex==3.8.0 +pycryptodomex==3.8.2 pygeoip==0.3.2 -pygments==2.3.1 # via diff-cover +pygments==2.4.2 # via diff-cover pyjwt==1.7.1 pylint==1.6.4 -pymongo==3.7.2 -pynacl==1.3.0 +pymongo==3.8.0 pyopenssl==19.0.0 -python-cjson==1.1.0 -python-daemon==2.1.2 +python-cjson==1.2.1 ; python_version <= "2.7" +python-daemon==2.2.3 python-dateutil==2.6.1 python-gnupg==0.3.9 pytz==2017.3 -pyyaml==5.1 +pyyaml==5.1.1 requests==2.18.4 rsa==4.0 -s3transfer==0.1.13 +s3transfer==0.2.1 singledispatch==3.4.0.3 six==1.10.0 -snowflake-connector-python==1.7.9 +snowflake-connector-python==1.8.5 stevedore==1.19.1 tornado==4.5.3 ua-parser==0.3.6 @@ -112,5 +110,8 @@ urllib3==1.22 user-agents==0.3.2 vertica-python==0.6.11 wheel==0.30.0 -wrapt==1.11.1 # via astroid -yarn-api-client==0.2.3 \ No newline at end of file +wrapt==1.11.2 # via astroid +yarn-api-client==0.2.3 + +# The following packages are considered to be unsafe in a requirements file: +# setuptools==41.0.1 # via google-cloud-core, protobuf, python-daemon diff --git a/share/task.yml b/share/task.yml index 748ef890f1..74e449016e 100644 --- a/share/task.yml +++ b/share/task.yml @@ -3,7 +3,7 @@ - name: Configure luigi hosts: "{{ name }}" gather_facts: True - sudo: True + become: True vars: write_luigi_config: "yes" common_debian_variants: @@ -60,93 +60,100 @@ register: home_output - name: set the home variable - set_fact: home="{{ home_output.stdout }}" + set_fact: "home={{ home_output.stdout }}" - name: known_hosts file exists command: touch {{ home }}/.ssh/known_hosts creates={{ home }}/.ssh/known_hosts - name: git server in known_hosts file - lineinfile: > - dest={{ home }}/.ssh/known_hosts - regexp=^{{item.hostname}} - line="{{ item.hostname }} {{ item.public_key }}" - with_items: git_servers + lineinfile: + dest: "{{ home }}/.ssh/known_hosts" + regexp: "^{{item.hostname}}" + line: "{{ item.hostname }} {{ item.public_key }}" + loop: "{{ git_servers }}" - name: root directories created file: path={{ item }} state=directory owner=root group=root - sudo: True + become: True with_items: - "{{ root_data_dir }}" - "{{ root_log_dir }}" - name: working directories created file: path={{ item }} state=directory mode=777 owner={{ ansible_ssh_user }} group={{ ansible_ssh_user }} - sudo: True + become: True with_items: - "{{ working_dir }}" - "{{ working_venv_dir }}" - name: log directory created file: path={{ item }} state=directory mode=777 owner={{ ansible_ssh_user }} group={{ ansible_ssh_user }} - sudo: True + become: True with_items: - "{{ log_dir }}" - name: make sure git is available on the Debian server command: apt-get install -q -y git - sudo: True + become: True when: ansible_distribution in common_debian_variants - name: make sure git is available on the RHEL server yum: pkg=git state=present - sudo: True + become: True when: ansible_distribution in common_redhat_variants - name: repositories checked out - git: > - repo={{ item.url }} - dest={{ working_dir }}/{{ item.dir_name }} - version=master - with_items: repos + git: + repo: "{{ 
item.url }}" + dest: "{{ working_dir }}/{{ item.dir_name }}" + version: master + force: yes + loop: "{{ repos }}" - name: branches fetched - command: git fetch --all chdir={{ working_dir }}/{{ item.dir_name }} - with_items: repos + command: "git fetch --all chdir={{ working_dir }}/{{ item.dir_name }}" + loop: "{{ repos }}" - name: origin/HEAD updated - command: git remote set-head origin --auto chdir={{ working_dir }}/{{ item.dir_name }} - with_items: repos + command: "git remote set-head origin --auto chdir={{ working_dir }}/{{ item.dir_name }}" + loop: "{{ repos }}" - name: branches checked out - command: git checkout {{ item.branch }} chdir={{ working_dir }}/{{ item.dir_name }} - with_items: repos + command: "git checkout {{ item.branch }} chdir={{ working_dir }}/{{ item.dir_name }}" + loop: "{{ repos }}" - name: ensure system packages are installed command: make system-requirements chdir={{ working_repo_dir }} - sudo: True + become: True - name: bootstrap pip on Debian command: apt-get install -q -y python-pip - sudo: True + become: True when: ansible_distribution in common_debian_variants - name: bootstrap pip on RHEL command: yum install -q -y python-pip - sudo: True + become: True when: ansible_distribution in common_redhat_variants - name: virtualenv installed - pip: name=virtualenv version=1.10.1 - sudo: True + pip: name=virtualenv version=16.6.1 + become: True - - name: check if virtualenv already created - stat: path={{ working_venv_dir }}/bin/activate - register: virtualenv_created + #- name: check if virtualenv already created + # stat: + # path: "{{ working_venv_dir }}/bin/activate" + # register: virtualenv_created + + # This is required for running unit tests inside docker, at least until the + # default image comes with a py3 compatible virtualenv. + - name: virtualenv deleted + shell: "rm -rf {{ working_venv_dir }}/*" - name: virtualenv created command: > - virtualenv --python={{ virtualenv_python }} {{ virtualenv_extra_args }} {{ working_venv_dir }} - when: not virtualenv_created.stat.exists + virtualenv --clear --python={{ virtualenv_python }} {{ virtualenv_extra_args }} {{ working_venv_dir }} + #when: not virtualenv_created.stat.exists - name: update pip command: > @@ -155,28 +162,32 @@ - name: virtualenv initialized on Debian shell: > . {{ working_venv_dir }}/bin/activate && make install - chdir={{ working_repo_dir }} + args: + chdir: "{{ working_repo_dir }}" when: ansible_distribution in common_debian_variants - name: virtualenv initialized on RHEL shell: > . {{ working_venv_dir }}/bin/activate && make install - chdir={{ working_repo_dir }} + args: + chdir: "{{ working_repo_dir }}" when: ansible_distribution in common_redhat_variants - name: additional packages installed on Debian shell: > . {{ working_venv_dir }}/bin/activate && pip install {{ item }} - chdir={{ working_repo_dir }} + args: + chdir: "{{ working_repo_dir }}" when: ansible_distribution in common_debian_variants - with_items: packages + loop: "{{ packages }}" - name: additional packages installed on RHEL shell: > . {{ working_venv_dir }}/bin/activate && pip install {{ item }} - chdir={{ working_repo_dir }} + args: + chdir: "{{ working_repo_dir }}" when: ansible_distribution in common_redhat_variants - with_items: packages + loop: "{{ packages }}" - name: logging configured template: src=logging.cfg.j2 dest={{ working_repo_dir }}/logging.cfg