Python 3 Compatibility (#10)

* Python 2+3 compatibility
lukalabs · Jun 15, 2018 · ec68708
1 parent d46c3ef
commit ec68708
Show file tree

Hide file tree

Showing 35 changed files with 248 additions and 52 deletions.
diff --git a/cakechat/api/response.py b/cakechat/api/response.py
@@ -1,5 +1,7 @@
 import random
 
+from six.moves import xrange, map
+
 from cakechat.api.config import PREDICTION_MODE, NUM_BEST_CANDIDATES_TO_PICK_FROM, SAMPLING_ATTEMPTS_NUM, \
     DEFAULT_RESPONSE
 from cakechat.config import INPUT_CONTEXT_SIZE, INPUT_SEQUENCE_LENGTH, PREDICTION_MODES
@@ -53,7 +55,7 @@ def get_response(dialog_context, emotion):
     :param emotion: emotion to condition response
     :return: dialog response conditioned on input emotion
     """
-    tokenized_dialog_context = map(get_tokens_sequence, dialog_context)
+    tokenized_dialog_context = list(map(get_tokens_sequence, dialog_context))
     tokenized_dialog_contexts = [tokenized_dialog_context]
     context_tokens_ids = transform_contexts_to_token_ids(tokenized_dialog_contexts, _cakechat_model.token_to_index,
                                                          INPUT_SEQUENCE_LENGTH, INPUT_CONTEXT_SIZE)

diff --git a/cakechat/api/utils.py b/cakechat/api/utils.py
@@ -1,4 +1,5 @@
 from flask import jsonify
+from six import text_type
 
 
 def get_api_error_response(message, code, logger):
@@ -7,8 +8,8 @@ def get_api_error_response(message, code, logger):
 
 
 def _is_list_of_unicode_strings(data):
-    return (isinstance(data, list) or isinstance(data, tuple)) and len(data) > 0 \
-           and all(isinstance(s, unicode) for s in data)
+    return bool(data and isinstance(data, (list, tuple)) and
+                all(isinstance(s, text_type) for s in data))
 
 
 def parse_dataset_param(params, param_name, required=True):

diff --git a/cakechat/api/v1/server.py b/cakechat/api/v1/server.py
@@ -20,9 +20,9 @@ def get_model_response():
     try:
         dialog_context = parse_dataset_param(params, param_name='context')
     except KeyError as e:
-        return get_api_error_response('Malformed request, no "%s" param was found' % e.message, 400, _logger)
+        return get_api_error_response('Malformed request, no "%s" param was found' % str(e), 400, _logger)
     except ValueError as e:
-        return get_api_error_response('Malformed request: %s' % e.message, 400, _logger)
+        return get_api_error_response('Malformed request: %s' % str(e), 400, _logger)
 
     emotion = params.get('emotion', DEFAULT_CONDITION)
     if emotion not in EMOTIONS_TYPES:

diff --git a/cakechat/dialog_model/inference/candidates/beamsearch.py b/cakechat/dialog_model/inference/candidates/beamsearch.py
@@ -1,6 +1,7 @@
-from itertools import izip_longest
+from six.moves import zip_longest
 
 import numpy as np
+from six.moves import xrange
 import theano
 
 from cakechat.dialog_model.inference.candidates.abstract_generator import AbstractCandidatesGenerator
@@ -248,7 +249,7 @@ def _generate_candidates_for_one_context(self, condition_id, output_seq_len):
 
     @timer
     def generate_candidates(self, context_token_ids, condition_ids, output_seq_len):
-        x_with_conditions_batch = izip_longest(context_token_ids, condition_ids if condition_ids is not None else [])
+        x_with_conditions_batch = zip_longest(context_token_ids, condition_ids if condition_ids is not None else [])
         result = []
         for x, condition_id in x_with_conditions_batch:
             self._compute_thought_vectors(x)

diff --git a/cakechat/dialog_model/inference/candidates/sampling.py b/cakechat/dialog_model/inference/candidates/sampling.py
@@ -1,4 +1,5 @@
 import numpy as np
+from six.moves import xrange
 import theano
 
 from cakechat.dialog_model.inference.candidates.abstract_generator import AbstractCandidatesGenerator

diff --git a/cakechat/dialog_model/inference/predict.py b/cakechat/dialog_model/inference/predict.py
@@ -1,4 +1,5 @@
 import numpy as np
+from six.moves import xrange
 
 from cakechat.config import MAX_PREDICTIONS_LENGTH, BEAM_SIZE, MMI_REVERSE_MODEL_SCORE_WEIGHT, DEFAULT_TEMPERATURE, \
     SAMPLES_NUM_FOR_RERANKING, PREDICTION_MODES, REPETITION_PENALIZE_COEFFICIENT
@@ -98,7 +99,7 @@ def get_nn_responses(context_token_ids,
     response_tokens_ids = np.reshape(response_tokens_ids, (-1, output_seq_len))
     response_tokens = transform_token_ids_to_sentences(response_tokens_ids, nn_model.index_to_token)
 
-    lines_num = len(response_tokens) / output_candidates_num
+    lines_num = len(response_tokens) // output_candidates_num
     responses = [response_tokens[i * output_candidates_num:(i + 1) * output_candidates_num] for i in xrange(lines_num)]
 
     return responses
diff --git a/cakechat/dialog_model/inference/predictor.py b/cakechat/dialog_model/inference/predictor.py
@@ -1,4 +1,5 @@
 import numpy as np
+from six.moves import xrange
 
 
 class Predictor(object):

diff --git a/cakechat/dialog_model/inference/reranking.py b/cakechat/dialog_model/inference/reranking.py
@@ -1,7 +1,8 @@
 from abc import ABCMeta, abstractmethod
-from itertools import izip_longest
+from six.moves import zip_longest
 
 import numpy as np
+from six.moves import xrange
 
 from cakechat.dialog_model.inference.service_tokens import ServiceTokensIDs
 from cakechat.dialog_model.inference.utils import get_sequence_score_by_thought_vector, get_sequence_score, \
@@ -103,7 +104,7 @@ def rerank_candidates(self, contexts, all_candidates, condition_ids):
         condition_ids = [] if condition_ids is None else condition_ids  # For izip_lingest
         candidates_scores = [
             self._compute_candidates_scores(context, candidates, condition_id)
-            for context, candidates, condition_id in izip_longest(contexts, all_candidates, condition_ids)
+            for context, candidates, condition_id in zip_longest(contexts, all_candidates, condition_ids)
         ]
         scores_order = [np.argsort(-np.array(scores)) for scores in candidates_scores]
         batch_size = len(contexts)

diff --git a/cakechat/dialog_model/inference/tests/predict.py b/cakechat/dialog_model/inference/tests/predict.py
@@ -2,6 +2,7 @@
 import sys
 import unittest
 import numpy as np
+from six.moves import xrange
 
 sys.path.append(
     os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))))

diff --git a/cakechat/dialog_model/inference/tests/sampling.py b/cakechat/dialog_model/inference/tests/sampling.py
@@ -4,6 +4,7 @@
 
 import numpy as np
 from scipy.stats import binom
+from six.moves import xrange
 
 sys.path.append(
     os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))))

diff --git a/cakechat/dialog_model/inference/utils.py b/cakechat/dialog_model/inference/utils.py
@@ -1,4 +1,5 @@
 import numpy as np
+from six.moves import xrange
 
 from cakechat.config import BATCH_SIZE, DEFAULT_CONDITION
 from cakechat.dialog_model.model_utils import get_training_batch

diff --git a/cakechat/dialog_model/layers.py b/cakechat/dialog_model/layers.py
@@ -1,5 +1,6 @@
 import theano.tensor as T
 from lasagne.layers.base import MergeLayer, Layer
+from six.moves import xrange
 
 
 class RepeatLayer(Layer):

diff --git a/cakechat/dialog_model/model.py b/cakechat/dialog_model/model.py
@@ -3,6 +3,7 @@
 
 import lasagne
 import numpy as np
+from six.moves import xrange
 import theano
 import theano.tensor as T
 from lasagne.init import Normal
@@ -583,8 +584,10 @@ def print_matrices_weights(self):
         total_network_size = 0
         for p, v in zip(params, values):
             param_size = float(v.nbytes) / 1024 / 1024
+            # Work around numpy/python 3 regression: 
+            # http://www.markhneedham.com/blog/2017/11/19/python-3-typeerror-unsupported-format-string-passed-to-numpy-ndarray-__format__/
             laconic_logger.info('\t{0:<40} dtype: {1:<10} shape: {2:<12} size: {3:<.2f}M'.format(
-                p.name, v.dtype, v.shape, param_size))
+                p.name, repr(v.dtype), repr(v.shape), param_size))
             total_network_size += param_size
         laconic_logger.info('Total network size: {0:.1f} Mb'.format(total_network_size))
 

diff --git a/cakechat/dialog_model/model_utils.py b/cakechat/dialog_model/model_utils.py
@@ -1,8 +1,10 @@
 import os
 from collections import namedtuple
-from itertools import imap, islice, izip
+from itertools import islice
 
 import numpy as np
+from six import text_type
+from six.moves import xrange, map, zip
 
 from cakechat.config import BASE_CORPUS_NAME, TRAIN_CORPUS_NAME, WORD_EMBEDDING_DIMENSION, INPUT_CONTEXT_SIZE, \
     HIDDEN_LAYER_DIMENSION, ENCODER_DEPTH, DECODER_DEPTH, INPUT_SEQUENCE_LENGTH, \
@@ -21,7 +23,7 @@
 
 
 def transform_conditions_to_ids(conditions, condition_to_index, n_dialogs):
-    condition_ids_iterator = imap(
+    condition_ids_iterator = map(
         lambda condition: condition_to_index.get(condition, condition_to_index[DEFAULT_CONDITION]), conditions)
     condition_ids = np.full(n_dialogs, condition_to_index[DEFAULT_CONDITION], dtype=np.int32)
     for sample_idx, condition_id in enumerate(condition_ids_iterator):
@@ -143,7 +145,7 @@ def transform_token_ids_to_sentences(y_ids, index_to_token):
             response_tokens.append(token_to_add)
 
         response_str = ' '.join(response_tokens)
-        if not isinstance(response_str, unicode):
+        if not isinstance(response_str, text_type):
             response_str = response_str.decode('utf-8')
 
         responses.append(response_str)
@@ -178,7 +180,7 @@ def transform_context_token_ids_to_sentences(x_ids, index_to_token):
                 sample_tokens.append(token_to_add)
 
             sample_str = ' '.join(sample_tokens)
-            if not isinstance(sample_str, unicode):
+            if not isinstance(sample_str, text_type):
                 sample_str = sample_str.decode('utf-8')
 
             context_samples.append(sample_str)
@@ -235,7 +237,7 @@ def get_w2v_embedding_matrix(tokenized_dialog_lines, index_to_token, add_start_e
 
 def get_training_batch(inputs, batch_size, random_permute=False):
     n_samples = inputs[0].shape[0]
-    n_batches = n_samples / batch_size
+    n_batches = n_samples // batch_size
     batches_seq = np.arange(n_batches)
     samples_seq = np.arange(n_samples)
 
@@ -312,7 +314,7 @@ def reverse_nn_input(dataset, service_tokens):
     """
     # Swap last utterance of x with y, while padding with start- and eos-tokens
     y_output = np.full(dataset.y.shape, service_tokens.pad_token_id, dtype=dataset.y.dtype)
-    for y_output_sample, x_input_sample in izip(y_output, dataset.x[:, -1]):
+    for y_output_sample, x_input_sample in zip(y_output, dataset.x[:, -1]):
         # Write start token at the first index
         y_output_sample[0] = service_tokens.start_token_id
         y_output_token_index = 1
@@ -330,7 +332,7 @@ def reverse_nn_input(dataset, service_tokens):
 
     # Use utterances from y in x while truncating start- and eos-tokens
     x_output = np.full(dataset.x.shape, service_tokens.pad_token_id, dtype=dataset.x.dtype)
-    for x_output_sample, x_input_sample, y_input_sample in izip(x_output, dataset.x[:, :-1], dataset.y):
+    for x_output_sample, x_input_sample, y_input_sample in zip(x_output, dataset.x[:, :-1], dataset.y):
         # Copy all the context utterances except the last one right to the output
         x_output_sample[:-1] = x_input_sample
         x_output_token_index = 0
@@ -359,7 +361,7 @@ def _get_x_data_iterator_with_context(x_data_iterator, y_data_iterator, context_
     context = []
 
     last_y_line = None
-    for x_line, y_line in izip(x_data_iterator, y_data_iterator):
+    for x_line, y_line in zip(x_data_iterator, y_data_iterator):
         if x_line != last_y_line:
             context = []  # clear context if last response != current dialog context (new dialog)
 
@@ -380,7 +382,7 @@ def transform_lines_to_nn_input(tokenized_dialog_lines, token_to_index):
 
     x_data_iterator = islice(x_data_iterator, 0, None, 2)
     y_data_iterator = islice(y_data_iterator, 1, None, 2)
-    n_dialogs /= 2
+    n_dialogs //= 2
 
     y_data_iterator, y_data_iterator_for_context = file_buffered_tee(y_data_iterator)
     x_data_iterator = _get_x_data_iterator_with_context(x_data_iterator, y_data_iterator_for_context)

diff --git a/cakechat/dialog_model/quality/logging.py b/cakechat/dialog_model/quality/logging.py
@@ -1,10 +1,18 @@
 import os
 import subprocess
+import sys
 import time
 from collections import namedtuple
 from datetime import datetime
 
-import unicodecsv as csv
+from six.moves import xrange
+
+# UnicodeCSV requires files to be opened as binary on Python3 by design.
+# https://github.com/jdunck/python-unicodecsv/issues/65
+if sys.version_info[0] == 2:
+    import unicodecsv as csv
+else:
+    import csv
 
 from cakechat.config import DATA_DIR, PREDICTION_MODE_FOR_TESTS, LOG_CANDIDATES_NUM, MAX_PREDICTIONS_LENGTH
 from cakechat.dialog_model.inference import get_nn_responses
@@ -54,7 +62,7 @@ def _get_iteration_stats(stats_info):
 
 
 def init_csv_writer(fh, mode, output_seq_len):
-    csv_writer = csv.writer(fh, encoding='utf-8', delimiter='\t')
+    csv_writer = csv.writer(fh, delimiter='\t')
     csv_writer.writerow([''])  # empty row for better readability
     csv_writer.writerow([_NN_MODEL_PARAMS_STR])
     csv_writer.writerow(['commit hash: %s' % _get_git_revision_short_hash()])

diff --git a/cakechat/dialog_model/quality/metrics/distinctness.py b/cakechat/dialog_model/quality/metrics/distinctness.py
@@ -1,4 +1,6 @@
 import numpy as np
+from six.moves import xrange
+
 
 from cakechat.config import PREDICTION_MODE_FOR_TESTS, DEFAULT_TEMPERATURE, BEAM_SIZE, \
     PREDICTION_DISTINCTNESS_NUM_TOKENS

diff --git a/cakechat/dialog_model/quality/metrics/ranking.py b/cakechat/dialog_model/quality/metrics/ranking.py
@@ -5,7 +5,7 @@
 def compute_average_precision(expected_answers, weighted_actual_answers, top):
     actual_responses, actual_weights = zip(*weighted_actual_answers.items())
 
-    expected_labels = map(lambda response: int(response in expected_answers), actual_responses)[:top]
+    expected_labels = [int(response in expected_answers) for response in actual_responses][:top]
     actual_weights = actual_weights[:top]
 
     if any(expected_labels):

diff --git a/cakechat/dialog_model/train.py b/cakechat/dialog_model/train.py
@@ -1,6 +1,8 @@
 import os
 import time
 
+from six.moves import xrange
+
 from cakechat.config import MAX_PREDICTIONS_LENGTH, BATCH_SIZE, EPOCHES_NUM, LOG_FREQUENCY_PER_BATCHES, \
     SCREEN_LOG_FREQUENCY_PER_BATCHES, SCREEN_LOG_NUM_TEST_LINES, SHUFFLE_TRAINING_BATCHES, PREDICTION_MODE_FOR_TESTS, \
     PREDICTION_MODES, LOG_CANDIDATES_NUM, VAL_SUBSET_SIZE, LOG_LOSS_DECAY

diff --git a/cakechat/utils/files_utils.py b/cakechat/utils/files_utils.py
@@ -1,7 +1,7 @@
 import os
 import codecs
 from abc import abstractmethod, ABCMeta
-import cPickle as pickle
+from six.moves import cPickle as pickle
 
 from cakechat.utils.logger import get_logger
 
@@ -50,7 +50,7 @@ def load_file(file_path, filter_empty_lines=True):
     with codecs.open(file_path, 'r', 'utf-8') as fh:
         lines = [line.strip() for line in fh.readlines()]
         if filter_empty_lines:
-            lines = filter(None, lines)
+            lines = list(filter(None, lines))
 
         return lines
 

diff --git a/cakechat/utils/offense_detector/detector.py b/cakechat/utils/offense_detector/detector.py
@@ -1,4 +1,7 @@
 import nltk
+from six import string_types
+from six.moves import xrange
+
 
 from cakechat.utils.files_utils import load_file
 from cakechat.utils.text_processing import get_tokens_sequence
@@ -25,7 +28,7 @@ def _get_ngrams(self, tokenized_line):
         return flatten(ngrams, constructor=set)
 
     def has_offensive_ngrams(self, text_or_tokenized_text):
-        if isinstance(text_or_tokenized_text, basestring):
+        if isinstance(text_or_tokenized_text, string_types):
             tokenized_text = get_tokens_sequence(text_or_tokenized_text)
         elif isinstance(text_or_tokenized_text, list):
             tokenized_text = text_or_tokenized_text

diff --git a/cakechat/utils/s3/resolver.py b/cakechat/utils/s3/resolver.py
@@ -40,6 +40,6 @@ def _resolve(self):
             bucket.download(remote_path, self._file_path)
             return True
         except Exception as e:
-            self._logger.warn('File can not be downloaded from AWS S3 because: %s' % e.message)
+            self._logger.warn('File can not be downloaded from AWS S3 because: %s' % str(e))
 
         return False
diff --git a/cakechat/utils/tee_file.py b/cakechat/utils/tee_file.py
@@ -1,11 +1,15 @@
-import cPickle as pickle
 import os
 import tempfile
 
+from six.moves import xrange
+from six.moves import cPickle as pickle
+
+# Pickle on HIGHEST_PROTOCOL breaks on Python 3.6.5
+_PICKLE_PROTOCOL = 2
 
 def _pickle_iterable(filename, iterable):
     with open(filename, 'wb') as pickle_fh:
-        pklr = pickle.Pickler(pickle_fh, pickle.HIGHEST_PROTOCOL)
+        pklr = pickle.Pickler(pickle_fh, _PICKLE_PROTOCOL)
         for entry in iterable:
             pklr.dump(entry)
             pklr.clear_memo()

diff --git a/cakechat/utils/telegram_bot_client.py b/cakechat/utils/telegram_bot_client.py
@@ -1,4 +1,5 @@
 from abc import ABCMeta, abstractmethod
+from six import iteritems
 
 import telepot
 import telepot.loop
@@ -74,7 +75,7 @@ def _bot_info():
 
     def _send_bot_help(self, _):
         help_lines = [self._bot_info(), '', 'List of available commands:']
-        for command, (_, description) in self._command_to_handler.iteritems():
+        for command, (_, description) in iteritems(self._command_to_handler):
             help_lines.append('/{} - {}'.format(command, description))
 
         return self._send_text('\n'.join(help_lines))

diff --git a/cakechat/utils/text_processing/dialog.py b/cakechat/utils/text_processing/dialog.py
@@ -1,4 +1,4 @@
-from itertools import imap
+from six.moves import map
 from operator import itemgetter
 
 from cakechat.utils.tee_file import file_buffered_tee
@@ -24,9 +24,9 @@ def get_dialog_lines_and_conditions(dialog_lines, text_field_name, condition_fie
     Splits one dialog_lines generator into two generators - one for conditions and one for dialog lines
     """
     conditions_iter, dialog_lines_iter = file_buffered_tee(
-        imap(lambda line: [line[condition_field_name], line[text_field_name]], dialog_lines))
-    conditions_iter = imap(itemgetter(0), conditions_iter)
-    dialog_lines_iter = imap(itemgetter(1), dialog_lines_iter)
+        map(lambda line: [line[condition_field_name], line[text_field_name]], dialog_lines))
+    conditions_iter = map(itemgetter(0), conditions_iter)
+    dialog_lines_iter = map(itemgetter(1), dialog_lines_iter)
     return dialog_lines_iter, conditions_iter