This repository has been archived by the owner on May 1, 2024. It is now read-only.

Python 2/3 compatibility for finance tasks #747

Open
wants to merge 14 commits into base: master
Changes from 8 commits
1 change: 1 addition & 0 deletions .isort.cfg
@@ -1,3 +1,4 @@
[settings]
line_length=120
multi_line_output=5
known_future_library=future
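
For context (not part of the diff): known_future_library presumably tells isort to group the future package with __future__ imports, so modules that call install_aliases() before any other imports, as several files in this PR do, still pass the import-ordering check. A minimal sketch of the ordering this allows:

from __future__ import absolute_import
from future.standard_library import install_aliases
install_aliases()  # must run before stdlib-style imports such as urllib.parse

import json
import logging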
32 changes: 28 additions & 4 deletions .travis.yml
@@ -18,6 +18,33 @@ env:
- secure: NLqmm18NpV3JRwD4CaugXm5cMWgxjdOA88xRFocmmVrduv0QT9JxBZFGebLYmFQOoKNJ23hz6g3EHe1aWfhLYnr1iUYerrwIriSI1wzuqbXJBRN6gO2n3YW+IfG83OLMZkOIMswT8MEdT3JPWVJL3bsocjHp8bYhRCt1KTCMJjY=
- secure: aG8l39jaLFWXB5CEOOAR9mJTT3GnqxCl/oFM/7NvTZCBoSWIPIztpFhSAkRE9xSIiKUKXakZcL5H349NLC28jdlHPVsNAaKKt2YNhB6MjmePihp3RPwZGn8c/SjslwY7DPVUKMdWsI7AVNJBH8ab30OPxKwXFAMOiJJza206CYQ=

# TODO: re-introduce the coverage test.
matrix:
# Mark travis build as finished before jobs under allow_failures complete.
fast_finish: true

include:
# Standard unit tests.
- name: "Python 2.7 Unit Tests"
env: TEST_SUITE=test-docker

# Python 3 whitelisted and full unit test jobs. Once python 3 support is
# complete, delete the whitelist job and remove the full job from
# allow_failures.
- name: "Python 3.x Whitelisted Unit Tests"
env: TEST_SUITE=test-docker-py3-whitelist
- name: "Python 3.x FULL Unit Tests"
env: TEST_SUITE=test-docker-py3

- name: "Quality Tests"
env: TEST_SUITE=quality-docker

# Names of jobs (defined above) that cannot fail the travis build even if
# they fail.
allow_failures:
- name: "Python 3.x FULL Unit Tests"
- name: "Quality Tests" # This is here because isort is a hot mess right now.

# Do NOT install Python requirements.
# Doing so is a waste of time since they won't be used.
install: true
@@ -37,10 +64,7 @@ before_install:
# Ensure we have a place to store coverage output
- mkdir -p coverage

script:
- make test-docker
- make quality-docker
- make coverage-docker
script: make $TEST_SUITE

after_success:
- pip install --upgrade codecov
44 changes: 36 additions & 8 deletions Makefile
@@ -3,7 +3,7 @@

uninstall:
pip install -r requirements/pip.txt
while pip uninstall -y edx.analytics.tasks; do true; done
pip uninstall -y edx.analytics.tasks
python setup.py clean

install: requirements uninstall
@@ -28,7 +28,7 @@ docker-shell:
system-requirements:
ifeq (,$(wildcard /usr/bin/yum))
# This is not great, we can't use these libraries on slave nodes using this method.
sudo apt-get install -y -q libmysqlclient-dev libpq-dev python-dev libffi-dev libssl-dev libxml2-dev libxslt1-dev
sudo apt-get install -y -q libmysqlclient-dev libpq-dev python-dev python3-dev libffi-dev libssl-dev libxml2-dev libxslt1-dev
else
sudo yum install -y -q postgresql-devel libffi-devel
endif
@@ -56,20 +56,48 @@ upgrade: ## update the requirements/*.txt files with the latest packages satisfy
CUSTOM_COMPILE_COMMAND="make upgrade" pip-compile --upgrade -o requirements/docs.txt requirements/docs.in
CUSTOM_COMPILE_COMMAND="make upgrade" pip-compile --upgrade -o requirements/test.txt requirements/test.in

test-docker-local:
docker run --rm -u root -v `(pwd)`:/edx/app/analytics_pipeline/analytics_pipeline -it edxops/analytics_pipeline:latest make develop-local test-local

Contributor Author

FYI, I deleted this make target because I couldn't figure out what it's useful for. It isn't programmatically called, nor referenced by any documentation that I could find. Since it doesn't set up requirements via make system-requirements reset-virtualenv test-requirements, I can't imagine that it works at all.

Contributor

The purpose of the target is precisely to run tests without performing setup again. When developing or debugging unit tests where only the code is changing, not the requirements, it suffices to run test-docker once and then run test-docker-local on subsequent runs. That saves the time spent re-installing the requirements on each run. The same motivation applies to many of the other "local" targets.

# Entry point for running python 2 unit tests in CI.
test-docker:
docker run --rm -u root -v `(pwd)`:/edx/app/analytics_pipeline/analytics_pipeline -it edxops/analytics_pipeline:latest make reset-virtualenv test-requirements develop-local test-local
docker run --rm -u root -v `(pwd)`:/edx/app/analytics_pipeline/analytics_pipeline -it edxops/analytics_pipeline:latest make system-requirements reset-virtualenv test-requirements develop-local test-local

# Entry point for running python 3 unit tests in CI.
test-docker-py3:
docker run --rm -u root -v `(pwd)`:/edx/app/analytics_pipeline/analytics_pipeline -it edxops/analytics_pipeline:latest make reset-virtualenv-py3 test-requirements develop-local test-local
docker run --rm -u root -v `(pwd)`:/edx/app/analytics_pipeline/analytics_pipeline -it edxops/analytics_pipeline:latest make system-requirements reset-virtualenv-py3 test-requirements develop-local test-local

# Entry point for running python 3 unit tests in CI. Only invokes a subset
# (whitelist) of unit tests which are known to pass under python 3.
test-docker-py3-whitelist:
docker run --rm -u root -v `(pwd)`:/edx/app/analytics_pipeline/analytics_pipeline -it edxops/analytics_pipeline:latest make system-requirements reset-virtualenv-py3 test-requirements develop-local test-local-py3-whitelist

test-local:
# TODO: when we have better coverage, modify this to actually fail when coverage is too low.
rm -rf .coverage
LUIGI_CONFIG_PATH='config/test.cfg' python -m coverage run --rcfile=./.coveragerc -m nose --with-xunit --xunit-file=unittests.xml -A 'not acceptance'

# Special test-local target specifically for running a whitelist of tests which
# are known to pass under python 3
test-local-py3-whitelist:
# TODO: when we have better coverage, modify this to actually fail when coverage is too low.
rm -rf .coverage
LUIGI_CONFIG_PATH='config/test.cfg' python -m coverage run --rcfile=./.coveragerc -m nose --with-xunit --xunit-file=unittests.xml -A 'not acceptance' \
edx.analytics.tasks.enterprise.tests \
edx.analytics.tasks.insights.tests.test_database_imports \
edx.analytics.tasks.insights.tests.test_grades \
edx.analytics.tasks.monitor.tests.test_overall_events \
edx.analytics.tasks.tests \
edx.analytics.tasks.util.tests.helpers \
edx.analytics.tasks.util.tests.opaque_key_mixins \
edx.analytics.tasks.util.tests.test_decorators \
edx.analytics.tasks.util.tests.test_geolocation \
edx.analytics.tasks.util.tests.test_hive \
edx.analytics.tasks.util.tests.test_retry \
edx.analytics.tasks.util.tests.test_s3_util \
edx.analytics.tasks.util.tests.test_url \
edx.analytics.tasks.warehouse.financial.tests \
edx.analytics.tasks.warehouse.tests.test_internal_reporting_active_users \
edx.analytics.tasks.warehouse.tests.test_internal_reporting_database \
edx.analytics.tasks.warehouse.tests.test_run_vertica_sql_scripts

test: test-requirements develop test-local

test-acceptance: test-requirements
@@ -98,7 +126,7 @@ quality-docker-local:
docker run --rm -u root -v `(pwd)`:/edx/app/analytics_pipeline/analytics_pipeline -it edxops/analytics_pipeline:latest make develop-local quality-local

quality-docker:
docker run --rm -u root -v `(pwd)`:/edx/app/analytics_pipeline/analytics_pipeline -it edxops/analytics_pipeline:latest make reset-virtualenv test-requirements develop-local quality-local
docker run --rm -u root -v `(pwd)`:/edx/app/analytics_pipeline/analytics_pipeline -it edxops/analytics_pipeline:latest make system-requirements reset-virtualenv test-requirements develop-local quality-local

coverage-docker:
docker run --rm -u root -v `(pwd)`:/edx/app/analytics_pipeline/analytics_pipeline -it edxops/analytics_pipeline:latest coverage xml
10 changes: 7 additions & 3 deletions edx/analytics/tasks/common/bigquery_load.py
@@ -1,10 +1,14 @@
from __future__ import absolute_import
from future.standard_library import install_aliases
install_aliases()

import json
import logging
import os
import subprocess
import tempfile
import time
import urlparse
from urllib.parse import urlparse

import luigi

@@ -216,7 +220,7 @@ def field_delimiter(self):

@property
def null_marker(self):
return '\N'
return r'\N'

@property
def quote_character(self):
@@ -262,7 +266,7 @@ def init_copy(self, client):
self.output().clear_marker_table()

def _get_destination_from_source(self, source_path):
parsed_url = urlparse.urlparse(source_path)
parsed_url = urlparse(source_path)
destination_path = url_path_join('gs://{}'.format(parsed_url.netloc), parsed_url.path)
return destination_path

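A note on the bigquery_load.py imports above (illustrative, not part of the diff): future's install_aliases() makes the Python 3 urllib.parse layout importable under Python 2 as well, which is why the same import line works on both interpreters, and why the call site changes from urlparse.urlparse(...) to urlparse(...). A small sketch, with a made-up bucket URL:

from future.standard_library import install_aliases
install_aliases()

from urllib.parse import urlparse  # resolves on both Python 2 and 3 once aliases are installed

parsed = urlparse('gs://example-bucket/path/to/file')
print(parsed.netloc)  # example-bucket
print(parsed.path)    # /path/to/file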
8 changes: 4 additions & 4 deletions edx/analytics/tasks/common/mapreduce.py
@@ -7,7 +7,7 @@
import logging
import logging.config
import os
import StringIO
from io import StringIO
from hashlib import md5

import luigi
@@ -183,7 +183,7 @@ class EmulatedMapReduceJobRunner(luigi.contrib.hadoop.JobRunner):
"""

def group(self, input):
output = StringIO.StringIO()
output = StringIO()
lines = []
for i, line in enumerate(input):
parts = line.rstrip('\n').split('\t')
@@ -197,7 +197,7 @@ def group(self, input):
def run_job(self, job):
job.init_hadoop()
job.init_mapper()
map_output = StringIO.StringIO()
map_output = StringIO()
input_targets = luigi.task.flatten(job.input_hadoop())
for input_target in input_targets:
# if file is a directory, then assume that it's Hadoop output,
@@ -232,7 +232,7 @@ def run_job(self, job):
try:
reduce_output = job.output().open('w')
except Exception:
reduce_output = StringIO.StringIO()
reduce_output = StringIO()

try:
job._run_reducer(reduce_input, reduce_output)
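One caveat worth noting about the StringIO change above (an observation, not something this diff enforces): io.StringIO exists on both Python 2 and 3, but unlike the old StringIO.StringIO it accepts only text, so anything written to it must be unicode under Python 2. A minimal illustration:

from io import StringIO

buf = StringIO()
buf.write(u'key\tvalue\n')  # must be text; a byte string would raise TypeError on Python 2
print(buf.getvalue())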
2 changes: 1 addition & 1 deletion edx/analytics/tasks/common/mysql_load.py
@@ -422,7 +422,7 @@ def coerce_for_mysql_connect(input):
return input
# Hive indicates a null value with the string "\N"
# We represent an infinite value with the string "inf", MySQL has no such representation so we use NULL
if input in ('None', '\\N', 'inf', '-inf'):
if input in ('None', r'\N', 'inf', '-inf'):
return None
if isinstance(input, str):
return input.decode('utf-8')
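The raw-string changes here and in bigquery_load.py are equivalent rewrites rather than behaviour changes: r'\N' is the same two-character string as '\\N', and the raw form also sidesteps the SyntaxError that a bare '\N' literal raises under Python 3, where \N begins a named Unicode escape. A tiny worked check:

# r'\N' and '\\N' are the same string: a backslash followed by N.
assert r'\N' == '\\N'
assert len(r'\N') == 2
# Under Python 3, the un-escaped literal '\N' is rejected at compile time, e.g.
# compile(r"s = '\N'", '<test>', 'exec') raises SyntaxError.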
8 changes: 4 additions & 4 deletions edx/analytics/tasks/common/pathutil.py
@@ -268,12 +268,12 @@ def get_event_and_date_string(self, line):
"""Default mapper implementation, that always outputs the log line, but with a configurable key."""
event = eventlog.parse_json_event(line)
if event is None:
self.incr_counter('Event', 'Discard Unparseable Event', 1)
self.incr_counter(u'Event', u'Discard Unparseable Event', 1)
return None

event_time = self.get_event_time(event)
if not event_time:
self.incr_counter('Event', 'Discard Missing Time Field', 1)
self.incr_counter(u'Event', u'Discard Missing Time Field', 1)
return None

# Don't use strptime to parse the date, it is extremely slow
@@ -283,7 +283,7 @@ def get_event_and_date_string(self, line):
date_string = event_time.split("T")[0]

if date_string < self.lower_bound_date_string or date_string >= self.upper_bound_date_string:
# Slow: self.incr_counter('Event', 'Discard Outside Date Interval', 1)
# Slow: self.incr_counter(u'Event', u'Discard Outside Date Interval', 1)
return None

return event, date_string
@@ -307,5 +307,5 @@ def get_map_input_file(self):
return os.environ['map_input_file']
except KeyError:
log.warn('mapreduce_map_input_file not defined in os.environ, unable to determine input file path')
self.incr_counter('Event', 'Missing map_input_file', 1)
self.incr_counter(u'Event', u'Missing map_input_file', 1)
return ''
9 changes: 8 additions & 1 deletion edx/analytics/tasks/common/sqoop.py
@@ -1,6 +1,8 @@
"""
Gather data using Sqoop table dumps run on RDBMS databases.
"""
from __future__ import absolute_import

import datetime
import json
import logging
@@ -296,7 +298,12 @@ def run_job(self, job):
metadata['end_time'] = datetime.datetime.utcnow().isoformat()
try:
with job.metadata_output().open('w') as metadata_file:
json.dump(metadata, metadata_file)
# Under python 2, json.dumps() will return ascii-only bytes, so .encode('utf-8')
# is a no-op. Under python 3, json.dumps() will return ascii-only unicode, so
# .encode('utf-8') will return bytes, thus normalizing the output to bytes
# across all python versions.
metadata_file.write(json.dumps(metadata).encode('utf-8'))
metadata_file.flush()
except Exception:
log.exception("Unable to dump metadata information.")
pass
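The comment in the diff already explains the intent; for completeness, a small sketch of the normalization with a made-up metadata dict:

import json

metadata = {'end_time': '2019-01-01T00:00:00'}
payload = json.dumps(metadata).encode('utf-8')
# json.dumps() returns ASCII-only output on both interpreters, so .encode('utf-8') is
# effectively a no-op on Python 2 and converts the text to bytes on Python 3; the file
# therefore always receives bytes.
assert isinstance(payload, bytes)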
6 changes: 3 additions & 3 deletions edx/analytics/tasks/common/tests/test_sqoop.py
@@ -197,12 +197,12 @@ def test_connect_with_columns(self):
self.assertEquals(arglist[-3], 'column1,column2')

def test_connect_with_null_string(self):
self.create_and_run_mysql_task(null_string='\\\\N')
self.create_and_run_mysql_task(null_string=r'\\N')
arglist = self.get_call_args_after_run()
self.assertEquals(arglist[-6], '--null-string')
self.assertEquals(arglist[-5], '\\\\N')
self.assertEquals(arglist[-5], r'\\N')
self.assertEquals(arglist[-4], '--null-non-string')
self.assertEquals(arglist[-3], '\\\\N')
self.assertEquals(arglist[-3], r'\\N')

def test_connect_with_fields_terminations(self):
self.create_and_run_mysql_task(fields_terminated_by='\x01')
6 changes: 4 additions & 2 deletions edx/analytics/tasks/common/vertica_load.py
@@ -1,6 +1,7 @@
"""
Support for loading data into an HP Vertica database.
"""
from __future__ import absolute_import

import logging
import traceback
@@ -12,6 +13,7 @@
from edx.analytics.tasks.util.overwrite import OverwriteOutputMixin
from edx.analytics.tasks.util.url import ExternalURL
from edx.analytics.tasks.util.vertica_target import CredentialFileVerticaTarget
import six

log = logging.getLogger(__name__)

@@ -416,7 +418,7 @@ def copy_delimiter(self):
@property
def copy_null_sequence(self):
"""The null sequence in the data to be copied. Default is Hive NULL (\\N)"""
return "'\\N'"
return r"'\N'"

@property
def copy_enclosed_by(self):
@@ -437,7 +439,7 @@ def copy_escape_spec(self):

def copy_data_table_from_target(self, cursor):
"""Performs the copy query from the insert source."""
if isinstance(self.columns[0], basestring):
if isinstance(self.columns[0], six.string_types):
column_names = ','.join([name for name in self.columns])
elif len(self.columns[0]) == 2:
column_names = ','.join([name for name, _type in self.columns])
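The six.string_types check is the usual portable replacement for basestring: it is (str, unicode) on Python 2 and (str,) on Python 3, so the column-name branch behaves the same on both. A hedged sketch of the pattern (the helper name is illustrative, not from the PR):

import six

def column_names(columns):
    # columns is either a flat list of names or a list of (name, type) pairs,
    # mirroring the branch in copy_data_table_from_target() above.
    if isinstance(columns[0], six.string_types):
        return ','.join(columns)
    return ','.join(name for name, _type in columns)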
6 changes: 3 additions & 3 deletions edx/analytics/tasks/export/data_obfuscation.py
@@ -8,11 +8,11 @@
import tempfile
import xml.etree.ElementTree

import cjson
import luigi
import yaml

import edx.analytics.tasks.util.opaque_key_util as opaque_key_util
from edx.analytics.tasks.util.fast_json import FastJson
from edx.analytics.tasks.common.pathutil import PathSetTask
from edx.analytics.tasks.util.file_util import copy_file_to_file, read_config_file
from edx.analytics.tasks.util.obfuscate_util import (
@@ -194,7 +194,7 @@ def filter_row(self, row):
if state_str == 'NULL':
updated_state_dict = {}
else:
state_dict = cjson.decode(state_str, all_unicode=True)
state_dict = FastJson.loads(state_str)
# Traverse the dictionary, looking for entries that need to be scrubbed.
updated_state_dict = self.obfuscator.obfuscate_structure(state_dict, u"state", user_info)
except Exception: # pylint: disable=broad-except
@@ -204,7 +204,7 @@ def filter_row(self, row):

if updated_state_dict is not None:
# Can't reset values, so update original fields.
updated_state = cjson.encode(updated_state_dict).replace('\\', '\\\\')
updated_state = FastJson.dumps(updated_state_dict).replace('\\', '\\\\')
row[4] = updated_state
if self.obfuscator.is_logging_enabled():
log.info(u"Obfuscated state for user_id '%s' module_id '%s'", user_id, row[2])
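FastJson itself is not shown in this diff; the assumption here is that it is a small wrapper added elsewhere in the PR exposing loads()/dumps() over a fast JSON backend, replacing the Python 2-only cjson module. A purely illustrative sketch of such a wrapper (the backend choice is a guess, not the PR's actual implementation):

import json

try:
    import ujson as _impl  # hypothetical fast backend; fall back to the stdlib if absent
except ImportError:
    _impl = json


class FastJson(object):
    """Minimal cjson-style facade: FastJson.loads(...) / FastJson.dumps(...)."""

    @staticmethod
    def loads(text):
        return _impl.loads(text)

    @staticmethod
    def dumps(obj):
        return _impl.dumps(obj)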
4 changes: 2 additions & 2 deletions edx/analytics/tasks/export/events_obfuscation.py
@@ -6,10 +6,10 @@
import re
from collections import defaultdict, namedtuple

import cjson
import luigi.date_interval

import edx.analytics.tasks.util.opaque_key_util as opaque_key_util
from edx.analytics.tasks.util.fast_json import FastJson
from edx.analytics.tasks.common.mapreduce import MapReduceJobTaskMixin, MultiOutputMapReduceJobTask
from edx.analytics.tasks.common.pathutil import PathSetTask
from edx.analytics.tasks.util import eventlog
@@ -328,7 +328,7 @@ def _obfuscate_event(self, event):
# Re-encode payload as a json string if it originally was one.
# (This test works because we throw away string values that didn't parse as JSON.)
if isinstance(event.get('event'), basestring):
event['event'] = cjson.encode(event_data)
event['event'] = FastJson.dumps(event_data)
else:
event['event'] = event_data

4 changes: 3 additions & 1 deletion edx/analytics/tasks/export/obfuscation.py
@@ -1,11 +1,13 @@
"""Tasks to obfuscate course data for RDX."""
from future.standard_library import install_aliases
install_aliases()

import errno
import json
import logging
import os
import tarfile
import urlparse
from urllib.parse import urlparse

import luigi
