Skip to content

Commit

Permalink
Minor bug fixed and major change to commenting structure to match PEP 8
Browse files Browse the repository at this point in the history
  • Loading branch information
princelySid committed May 23, 2017
1 parent 06678c9 commit 51377c1
Show file tree
Hide file tree
Showing 5 changed files with 249 additions and 106 deletions.
Empty file added src/__init__.py
Empty file.
29 changes: 19 additions & 10 deletions src/active.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,25 @@

from logistic import predict_prob


def entropy(p):
    """
    Calculate the entropy of probabilities of binary outcomes
    (the closer the probability is to 1/2, the higher the entropy)
    :param p: numpy array of probabilities, each in the range [0, 1]
    :return: numpy array of entropies (non-negative, 0 at p = 0 or p = 1,
        maximal at p = 1/2)
    """
    q = 1 - p
    # Suppress log(0) warnings; the nan terms they produce correspond to
    # p in {0, 1}, where entropy is 0 by the convention 0 * log(0) = 0.
    with np.errstate(divide='ignore', invalid='ignore'):
        raw = p * np.log(p) + q * np.log(q)
    # raw is non-positive, so negate it: entropy must be non-negative and
    # highest near p = 1/2, as the docstring states.
    return -np.nan_to_num(raw)


def score_by_uncertainty(data, classifiers):
"""
Score datapoints by how uncertain a classifier is
:param data: array of vectors
:param classifiers: one or more probabilistic classifiers for a binary decisions
:param classifiers: one or more probabilistic classifiers for a binary
decisions
:return: entropy of each datapoint
"""
# Get the classifier's prediction probabilities
Expand All @@ -36,13 +40,16 @@ def score_by_uncertainty(data, classifiers):
# If the overfitting model makes a confident prediction, while the underfitting model does not,
# this suggests that this data point uses features that we are marginally sure about.


def score_by_relative_uncertainty(data, over, under):
"""
Score datapoints by the difference in uncertainty between two classifiers
:param data: array of vectors
:param over: one or more probabilistic classifiers for a binary decisions
:param under: one or more probabilistic classifiers for a binary decisions
:return: "under" entropy minus "over" entropy
:return: "under" entropy minus "over" entropy
"""
# Get the classifiers' prediction probabilities
over_prob = predict_prob(over, data)
Expand All @@ -55,6 +62,7 @@ def score_by_relative_uncertainty(data, over, under):
# Below, we use reweighted range voting - http://rangevoting.org/RRV.html
# This is so that we choose some examples from each classifier


def top_N(scores, N=None, weights=None, R=2, normalise=False):
"""
Find the most highly scored datapoints
Expand All @@ -64,7 +72,7 @@ def top_N(scores, N=None, weights=None, R=2, normalise=False):
how much weight to place on each column of scores (default equal weight)
:param R: factor to use in reweighting
R=1 corresponds to Jefferson/D'Hondt
R=2 (default) corresponds to Webster/Sainte-Laguë
R=2 (default) corresponds to Webster/Sainte-Laguë
:param normalise: whether to normalise each column of scores (default no)
:return: indices of the top N datapoints, sorted from highest to lowest
"""
Expand All @@ -73,10 +81,10 @@ def top_N(scores, N=None, weights=None, R=2, normalise=False):
N = len(scores)
# If there is just one set of scores, return the highest
if scores.ndim == 1:
return scores.argsort()[:-N-1:-1]
return scores.argsort()[:-N - 1:-1]

# Apply reweighted range voting

# If weights are not given, weight classifiers evenly
if weights is None:
weights = np.ones(scores.shape[1])
Expand All @@ -91,13 +99,14 @@ def top_N(scores, N=None, weights=None, R=2, normalise=False):
# Iteratively find the highest scoring datapoint
for _ in range(N):
# Reweight, then find the range voting winner
# Downweight each classifier, according to the sum of its scores for the datapoints already chosen
# Downweight each classifier, according to the sum of its scores for
# the datapoints already chosen
cur_weights = weights / (1 + R * scores[top].sum(0))
# Find the total reweighted score
weighted_scores = (scores * cur_weights).sum(1)
# Ignore datapoints that have already been chosen
weighted_scores[top] = 0
# Record the highest
# Record the highest
top.append(weighted_scores.argmax())

return np.array(top)
110 changes: 82 additions & 28 deletions src/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,61 +2,79 @@
from collections import Counter
from abc import ABC, abstractmethod

### Functions mapping messages to bags of features
# Functions mapping messages to bags of features


def bag_of_words(msg):
    """
    Extract a bag of words from a message, based on whitespace
    :param msg: input string
    :return: bag of features, as a Counter
    """
    # Tag each whitespace-separated token with the 'word' feature type
    features = (('word', token) for token in msg.split())
    return Counter(features)


def bag_of_ngrams(msg, n):
    """
    Extract a bag of word ngrams from a message, with fixed n
    :param msg: input string
    :param n: size of ngram (must be a positive integer)
    :return: bag of features, as a Counter (empty if the message has
        fewer than n words)
    :raises ValueError: if n is not positive
    """
    if n <= 0:
        raise ValueError('n must be a positive integer')
    words = msg.split()
    # zip over n staggered views of the word list yields every window of
    # n consecutive words; it is empty when len(words) < n.
    grams = zip(*(words[i:] for i in range(n)))
    return Counter(('ngram', gram) for gram in grams)


def bag_of_character_ngrams(msg, n):
    """
    Extract a bag of character ngrams from a message (including whitespace),
    with fixed n
    :param msg: input string
    :param n: size of ngram (must be a positive integer)
    :return: bag of features, as a Counter (empty if the message is
        shorter than n characters)
    :raises ValueError: if n is not positive
    """
    if n <= 0:
        raise ValueError('n must be a positive integer')
    bag = Counter()
    # range is empty when n > len(msg), so the bag stays empty
    for start in range(len(msg) - n + 1):
        bag[('char', msg[start:start + n])] += 1
    return bag


def bag_of_variable_character_ngrams(msg, min_n, max_n):
    """
    Extract a bag of ngrams from a message (including whitespace), with
    variable n
    :param msg: input string
    :param min_n: minimum size of ngram (inclusive)
    :param max_n: maximum size of ngram (inclusive)
    :return: bag of features, as a Counter
    :raises ValueError: if min_n is not positive, or max_n < min_n
    """
    if min_n <= 0:
        raise ValueError('min_n must be a positive integer')
    if max_n < min_n:
        raise ValueError('max_n must be more than or equal to min_n')
    bag = Counter()
    # Accumulate the fixed-n bags for every ngram size in the range
    for n in range(min_n, max_n + 1):
        bag += bag_of_character_ngrams(msg, n)
    return bag

### Functions for combining types of feature
# Functions for combining types of feature


class Extractor(ABC):
"""
Expand All @@ -79,10 +97,15 @@ class combine(Extractor):
def __init__(self, functions, arg_params=None, kwarg_params=None):
"""
Wrap many feature extractors in a single function
:param functions: iterable of functions mapping from a string to a Counter
:param functions: iterable of functions mapping from a string to a
Counter
- Counters should have distinct keys, to avoid collisions
:param arg_params: iterable of additional arguments for the feature extractors
:param kwarg_params: iterable of additional keyword arguments for the feature extractors
:param arg_params: iterable of additional arguments for the feature
extractors
:param kwarg_params: iterable of additional keyword arguments for the
feature extractors
:return: combined feature extractor
"""
# If parameters for functions are not given, set empty parameters
Expand All @@ -91,8 +114,9 @@ def __init__(self, functions, arg_params=None, kwarg_params=None):
if kwarg_params is None:
kwarg_params = [{} for _ in functions]
# Save functions and additional arguments, to be used in __call__
self.functions_with_params = list(zip(functions, arg_params, kwarg_params))

self.functions_with_params = list(zip(functions, arg_params,
kwarg_params))

def __call__(self, msg):
"""
Convert a message to a bag of features
Expand All @@ -108,13 +132,17 @@ def __call__(self, msg):

class apply_to_parts(Extractor):
"""
Wrap a feature extractor, so it applies to several messages concatenated together
Wrap a feature extractor, so it applies to several messages concatenated
together
"""
def __init__(self, function, sep):
"""
Wrap a feature extractor, so it applies to several messages concatenated together
Wrap a feature extractor, so it applies to several messages
concatenated together
:param function: function mapping from a string to a Counter
:param sep: substring separating the individual messages
:return: new feature extractor
"""
self.function = function
Expand All @@ -133,41 +161,50 @@ def __call__(self, msg):
return bag


### Functions for producing vectors of features
# Functions for producing vectors of features

def get_global_set(bags_of_features):
    """
    Find all the distinct features in many bags of features
    :param bags_of_features: iterable of dict-like or set-like
    :return: set of features
    """
    features = set()
    # Iterating a dict-like bag yields its keys, so update() collects
    # the feature names regardless of the bag's concrete type
    for bag in bags_of_features:
        features.update(bag)
    return features


def document_frequency(bags_of_features):
    """
    Find all the distinct features in many bags of features,
    and how often each occurs (in how many bags each occurs)
    :param bags_of_features: iterable of Counters
    :return: Counter mapping features to their document frequencies
    """
    # Each bag contributes each of its keys exactly once, so counting key
    # occurrences across all bags gives the document frequency directly
    return Counter(feat for bag in bags_of_features for feat in bag.keys())


def feature_list_and_dict(features):
    """
    Assign numerical indices to a global list of features
    :param features: iterable of feature names
    :return: sorted list of features, dict mapping features to their indices
    """
    # Sorting fixes a deterministic order, so indices are reproducible
    feature_list = sorted(features)
    feature_dict = {feat: i for i, feat in enumerate(feature_list)}
    return feature_list, feature_dict


def vectorise_one(bag, feature_dict):
"""
Convert a bag of features to a numpy array
Expand All @@ -178,32 +215,41 @@ def vectorise_one(bag, feature_dict):
N = len(feature_dict)
vec = np.zeros(N)
for feat, value in bag.items():
if feat in feature_dict: # Ignore features that are not in the dictionary
# Ignore features that are not in the dictionary
if feat in feature_dict:
vec[feature_dict[feat]] = value
return vec


def vectorise(bags, feature_dict):
"""
Convert bags of features to numpy arrays
:param bags: Counters of features
:param feature_dict: dict mapping feature names to indices
:return: feature vectors as a matrix
"""
N = len(feature_dict)
vecs = np.zeros((len(bags), N))
for i, b in enumerate(bags):
for feat, value in b.items():
if feat in feature_dict: # Ignore features that are not in the dictionary
# Ignore features that are not in the dictionary
if feat in feature_dict:
vecs[i, feature_dict[feat]] = value
return vecs


def get_vectors(msgs, extractor, feature_dict, weights=None):
"""
Get feature vectors for many messages
:param msgs: input strings
:param extractor: feature extractor, mapping from a string to a bag of features
:param extractor: feature extractor, mapping from a string to a bag of
features
:param feature_dict: dict mapping from features names to indices
:param weights: array of weights, to be multiplied with extracted vectors
:return: feature vectors as a matrix
"""
bags = [extractor(m) for m in msgs]
Expand All @@ -212,15 +258,18 @@ def get_vectors(msgs, extractor, feature_dict, weights=None):
vectors *= weights
return vectors


class Vectoriser:
"""
Class for converting messages to feature vectors
"""
def __init__(self, extractor, feature_dict, weights=None):
"""
:param extractor: feature extractor, mapping from a string to a bag of features
:param extractor: feature extractor, mapping from a string to a bag
of features
:param feature_dict: dict mapping from features names to indices
:param weights: array of weights, to be multiplied with extracted vectors
:param weights: array of weights, to be multiplied with extracted
vectors
"""
self.extractor = extractor
self.feature_dict = feature_dict
Expand All @@ -235,28 +284,33 @@ def __call__(self, msgs):
# If only one message was given, convert to a list
if isinstance(msgs, str):
msgs = [msgs]
return get_vectors(msgs, self.extractor, self.feature_dict, self.weights)

return get_vectors(msgs, self.extractor, self.feature_dict,
self.weights)

### For human readability

# For human readability
def bagify_one(vector, feature_list):
    """
    Convert a feature vector to a bag of features
    :param vector: 1-D numpy array
    :param feature_list: global list of feature names
    :return: bag of features, as a Counter
    """
    bag = Counter()
    # nonzero() returns a TUPLE of index arrays (one per dimension);
    # iterating the tuple itself yields whole arrays, which cannot index
    # a Python list. Take the first element for the 1-D case.
    for i in vector.nonzero()[0]:
        bag[feature_list[i]] = vector[i]
    return bag


def bagify(vectors, feature_list):
    """
    Convert feature vectors to bags of features
    :param vectors: numpy array (matrix), one row per datapoint
    :param feature_list: global list of feature names
    :return: list of bags of features
    """
    # Delegate each row to bagify_one
    return [bagify_one(v, feature_list) for v in vectors]
Loading

0 comments on commit 51377c1

Please sign in to comment.