Skip to content

Commit

Permalink
Minor bug fixed and major change to commenting structure to match PEP 8
Browse files Browse the repository at this point in the history
  • Loading branch information
princelySid committed May 23, 2017
1 parent 06678c9 commit 51377c1
Show file tree
Hide file tree
Showing 5 changed files with 249 additions and 106 deletions.
Empty file added src/__init__.py
Empty file.
29 changes: 19 additions & 10 deletions src/active.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,25 @@

from logistic import predict_prob


def entropy(p):
    """
    Calculate the entropy of probabilities of binary outcomes
    (the closer the probability is to 1/2, the higher the entropy)
    :param p: numpy array of probabilities, each in the range [0, 1]
    :return: numpy array of entropies (non-negative, 0 at p = 0 or p = 1,
        maximal at p = 1/2)
    """
    q = 1 - p
    # Suppress log(0) warnings; the nan terms they produce correspond to
    # p in {0, 1}, where entropy is 0 by the convention 0 * log(0) = 0.
    with np.errstate(divide='ignore', invalid='ignore'):
        raw = p * np.log(p) + q * np.log(q)
    # raw is non-positive, so negate it: entropy must be non-negative and
    # highest near p = 1/2, as the docstring states.
    return -np.nan_to_num(raw)


def score_by_uncertainty(data, classifiers):
"""
Score datapoints by how uncertain a classifier is
:param data: array of vectors
:param classifiers: one or more probabilistic classifiers for a binary decisions
:param classifiers: one or more probabilistic classifiers for a binary
decisions
:return: entropy of each datapoint
"""
# Get the classifier's prediction probabilities
Expand All @@ -36,13 +40,16 @@ def score_by_uncertainty(data, classifiers):
# If the overfitting model makes a confident prediction, while the underfitting model does not,
# this suggests that this data point uses features that we are marginally sure about.


def score_by_relative_uncertainty(data, over, under):
"""
Score datapoints by the difference in uncertainty between two classifiers
:param data: array of vectors
:param over: one or more probabilistic classifiers for a binary decisions
:param under: one or more probabilistic classifiers for a binary decisions
:return: "under" entropy minus "over" entropy
:return: "under" entropy minus "over" entropy
"""
# Get the classifiers' prediction probabilities
over_prob = predict_prob(over, data)
Expand All @@ -55,6 +62,7 @@ def score_by_relative_uncertainty(data, over, under):
# Below, we use reweighted range voting - http://rangevoting.org/RRV.html
# This is so that we choose some examples from each classifier


def top_N(scores, N=None, weights=None, R=2, normalise=False):
"""
Find the most highly scored datapoints
Expand All @@ -64,7 +72,7 @@ def top_N(scores, N=None, weights=None, R=2, normalise=False):
how much weight to place on each column of scores (default equal weight)
:param R: factor to use in reweighting
R=1 corresponds to Jefferson/D'Hondt
R=2 (default) corresponds to Webster/Sainte-Laguë
R=2 (default) corresponds to Webster/Sainte-Laguë
:param normalise: whether to normalise each column of scores (default no)
:return: indices of the top N datapoints, sorted from highest to lowest
"""
Expand All @@ -73,10 +81,10 @@ def top_N(scores, N=None, weights=None, R=2, normalise=False):
N = len(scores)
# If there is just one set of scores, return the highest
if scores.ndim == 1:
return scores.argsort()[:-N-1:-1]
return scores.argsort()[:-N - 1:-1]

# Apply reweighted range voting

# If weights are not given, weight classifiers evenly
if weights is None:
weights = np.ones(scores.shape[1])
Expand All @@ -91,13 +99,14 @@ def top_N(scores, N=None, weights=None, R=2, normalise=False):
# Iteratively find the highest scoring datapoint
for _ in range(N):
# Reweight, then find the range voting winner
# Downweight each classifier, according to the sum of its scores for the datapoints already chosen
# Downweight each classifier, according to the sum of its scores for
# the datapoints already chosen
cur_weights = weights / (1 + R * scores[top].sum(0))
# Find the total reweighted score
weighted_scores = (scores * cur_weights).sum(1)
# Ignore datapoints that have already been chosen
weighted_scores[top] = 0
# Record the highest
# Record the highest
top.append(weighted_scores.argmax())

return np.array(top)
110 changes: 82 additions & 28 deletions src/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,61 +2,79 @@
from collections import Counter
from abc import ABC, abstractmethod

### Functions mapping messages to bags of features
# Functions mapping messages to bags of features


def bag_of_words(msg):
    """
    Extract a bag of words from a message, based on whitespace
    :param msg: input string
    :return: bag of features, as a Counter
    """
    # Tag each whitespace-separated token with the 'word' feature type
    features = (('word', token) for token in msg.split())
    return Counter(features)


def bag_of_ngrams(msg, n):
    """
    Extract a bag of word ngrams from a message, with fixed n
    :param msg: input string
    :param n: size of ngram (must be a positive integer)
    :return: bag of features, as a Counter (empty if the message has
        fewer than n words)
    :raises ValueError: if n is not positive
    """
    if n <= 0:
        raise ValueError('n must be a positive integer')
    words = msg.split()
    # zip over n staggered views of the word list yields every window of
    # n consecutive words; it is empty when len(words) < n.
    grams = zip(*(words[i:] for i in range(n)))
    return Counter(('ngram', gram) for gram in grams)


def bag_of_character_ngrams(msg, n):
    """
    Extract a bag of character ngrams from a message (including whitespace),
    with fixed n
    :param msg: input string
    :param n: size of ngram (must be a positive integer)
    :return: bag of features, as a Counter (empty if the message is
        shorter than n characters)
    :raises ValueError: if n is not positive
    """
    if n <= 0:
        raise ValueError('n must be a positive integer')
    bag = Counter()
    # range is empty when n > len(msg), so the bag stays empty
    for start in range(len(msg) - n + 1):
        bag[('char', msg[start:start + n])] += 1
    return bag


def bag_of_variable_character_ngrams(msg, min_n, max_n):
    """
    Extract a bag of ngrams from a message (including whitespace), with
    variable n
    :param msg: input string
    :param min_n: minimum size of ngram (inclusive)
    :param max_n: maximum size of ngram (inclusive)
    :return: bag of features, as a Counter
    :raises ValueError: if min_n is not positive, or max_n < min_n
    """
    if min_n <= 0:
        raise ValueError('min_n must be a positive integer')
    if max_n < min_n:
        raise ValueError('max_n must be more than or equal to min_n')
    bag = Counter()
    # Accumulate the fixed-n bags for every ngram size in the range
    for n in range(min_n, max_n + 1):
        bag += bag_of_character_ngrams(msg, n)
    return bag

### Functions for combining types of feature
# Functions for combining types of feature


class Extractor(ABC):
"""
Expand All @@ -79,10 +97,15 @@ class combine(Extractor):
def __init__(self, functions, arg_params=None, kwarg_params=None):
"""
Wrap many feature extractors in a single function
:param functions: iterable of functions mapping from a string to a Counter
:param functions: iterable of functions mapping from a string to a
Counter
- Counters should have distinct keys, to avoid collisions
:param arg_params: iterable of additional arguments for the feature extractors
:param kwarg_params: iterable of additional keyword arguments for the feature extractors
:param arg_params: iterable of additional arguments for the feature
extractors
:param kwarg_params: iterable of additional keyword arguments for the
feature extractors
:return: combined feature extractor
"""
# If parameters for functions are not given, set empty parameters
Expand All @@ -91,8 +114,9 @@ def __init__(self, functions, arg_params=None, kwarg_params=None):
if kwarg_params is None:
kwarg_params = [{} for _ in functions]
# Save functions and additional arguments, to be used in __call__
self.functions_with_params = list(zip(functions, arg_params, kwarg_params))

self.functions_with_params = list(zip(functions, arg_params,
kwarg_params))

def __call__(self, msg):
"""
Convert a message to a bag of features
Expand All @@ -108,13 +132,17 @@ def __call__(self, msg):

class apply_to_parts(Extractor):
"""
Wrap a feature extractor, so it applies to several messages concatenated together
Wrap a feature extractor, so it applies to several messages concatenated
together
"""
def __init__(self, function, sep):
"""
Wrap a feature extractor, so it applies to several messages concatenated together
Wrap a feature extractor, so it applies to several messages
concatenated together
:param function: function mapping from a string to a Counter
:param sep: substring separating the individual messages
:return: new feature extractor
"""
self.function = function
Expand All @@ -133,41 +161,50 @@ def __call__(self, msg):
return bag


### Functions for producing vectors of features
# Functions for producing vectors of features

def get_global_set(bags_of_features):
    """
    Find all the distinct features in many bags of features
    :param bags_of_features: iterable of dict-like or set-like
    :return: set of features
    """
    features = set()
    # Iterating a dict-like bag yields its keys, so update() collects
    # the feature names regardless of the bag's concrete type
    for bag in bags_of_features:
        features.update(bag)
    return features


def document_frequency(bags_of_features):
    """
    Find all the distinct features in many bags of features,
    and how often each occurs (in how many bags each occurs)
    :param bags_of_features: iterable of Counters
    :return: Counter mapping features to their document frequencies
    """
    # Each bag contributes each of its keys exactly once, so counting key
    # occurrences across all bags gives the document frequency directly
    return Counter(feat for bag in bags_of_features for feat in bag.keys())


def feature_list_and_dict(features):
    """
    Assign numerical indices to a global list of features
    :param features: iterable of feature names
    :return: sorted list of features, dict mapping features to their indices
    """
    # Sorting fixes a deterministic order, so indices are reproducible
    feature_list = sorted(features)
    feature_dict = {feat: i for i, feat in enumerate(feature_list)}
    return feature_list, feature_dict


def vectorise_one(bag, feature_dict):
"""
Convert a bag of features to a numpy array
Expand All @@ -178,32 +215,41 @@ def vectorise_one(bag, feature_dict):
N = len(feature_dict)
vec = np.zeros(N)
for feat, value in bag.items():
if feat in feature_dict: # Ignore features that are not in the dictionary
# Ignore features that are not in the dictionary
if feat in feature_dict:
vec[feature_dict[feat]] = value
return vec


def vectorise(bags, feature_dict):
"""
Convert bags of features to numpy arrays
:param bags: Counters of features
:param feature_dict: dict mapping feature names to indices
:return: feature vectors as a matrix
"""
N = len(feature_dict)
vecs = np.zeros((len(bags), N))
for i, b in enumerate(bags):
for feat, value in b.items():
if feat in feature_dict: # Ignore features that are not in the dictionary
# Ignore features that are not in the dictionary
if feat in feature_dict:
vecs[i, feature_dict[feat]] = value
return vecs


def get_vectors(msgs, extractor, feature_dict, weights=None):
"""
Get feature vectors for many messages
:param msgs: input strings
:param extractor: feature extractor, mapping from a string to a bag of features
:param extractor: feature extractor, mapping from a string to a bag of
features
:param feature_dict: dict mapping from features names to indices
:param weights: array of weights, to be multiplied with extracted vectors
:return: feature vectors as a matrix
"""
bags = [extractor(m) for m in msgs]
Expand All @@ -212,15 +258,18 @@ def get_vectors(msgs, extractor, feature_dict, weights=None):
vectors *= weights
return vectors


class Vectoriser:
"""
Class for converting messages to feature vectors
"""
def __init__(self, extractor, feature_dict, weights=None):
"""
:param extractor: feature extractor, mapping from a string to a bag of features
:param extractor: feature extractor, mapping from a string to a bag
of features
:param feature_dict: dict mapping from features names to indices
:param weights: array of weights, to be multiplied with extracted vectors
:param weights: array of weights, to be multiplied with extracted
vectors
"""
self.extractor = extractor
self.feature_dict = feature_dict
Expand All @@ -235,28 +284,33 @@ def __call__(self, msgs):
# If only one message was given, convert to a list
if isinstance(msgs, str):
msgs = [msgs]
return get_vectors(msgs, self.extractor, self.feature_dict, self.weights)

return get_vectors(msgs, self.extractor, self.feature_dict,
self.weights)

### For human readability

# For human readability
def bagify_one(vector, feature_list):
    """
    Convert a feature vector to a bag of features
    :param vector: 1-D numpy array
    :param feature_list: global list of feature names
    :return: bag of features, as a Counter
    """
    bag = Counter()
    # nonzero() returns a TUPLE of index arrays (one per dimension);
    # iterating the tuple itself yields whole arrays, which cannot index
    # a Python list. Take the first element for the 1-D case.
    for i in vector.nonzero()[0]:
        bag[feature_list[i]] = vector[i]
    return bag


def bagify(vectors, feature_list):
    """
    Convert feature vectors to bags of features
    :param vectors: numpy array (matrix), one row per datapoint
    :param feature_list: global list of feature names
    :return: list of bags of features
    """
    # Delegate each row to bagify_one
    return [bagify_one(v, feature_list) for v in vectors]
Loading

0 comments on commit 51377c1

Please sign in to comment.