# linear_regression.py
import math

import numpy as np


class Regression:
    '''
    Implements Linear Regression with Weight Decay Regularization.

    Fields:
        int      dim      Dimensionality of the data
        float    lam      Regularization factor (weight decay strength)
        ndarray  weights  numpy array of length dim+1 holding the weights (bias term first)
        list     data     List of N tuples (x, y), with x a list of length dim and y = f(x)

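    Usage sketch (illustrative toy 1-D data, not taken from the original module):
        data = [([0.0], 1.0), ([1.0], 3.0), ([2.0], 5.0)]
        reg = Regression(dim=1, data=data, lam=0.0)
        reg.solve()
        reg.hypothesis([3.0])   # close to 7.0 for this noiseless line y = 2x + 1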
    '''

    def __init__(self, dim, data=None, lam=0):
        self.dim = dim
        self.reset(data if data is not None else [], lam)

    def reset(self, data, lam=0):
        '''
        Reset the weights and lambda and feed a data sample
        '''
        self.lam = lam
        self.weights = np.zeros(self.dim + 1)
        for t in data:
            if len(t[0]) != self.dim:
                raise ValueError('Wrong data dimensionality')
        self.data = data

    def hypothesis(self, x):
        '''
        Takes a d-dimensional data vector x and computes h(x) using the current weights
        '''
        x_adj = [1.0] + x  # prepend a constant 1 for the bias weight
        return np.dot(self.weights, x_adj)  # dot product of w and the adjusted x

    def quadratic_error(self, point):
        '''
        Takes as "point" a tuple (x, y) with x a vector and y = f(x)
        and returns the quadratic error (h(x) - f(x))**2
        '''
        h = self.hypothesis(point[0])
        return (h - point[1]) ** 2

    def sample_error(self, data):
        '''
        Computes the in-sample error Ein when given self.data as data,
        and the out-of-sample error Eout when given a dataset generated by f(x)
        '''
        total_error = 0.0
        for point in data:
            total_error += self.quadratic_error(point)
        return total_error / len(data)

    def solve(self):
        '''
        Computes the weights with the regularized least-squares closed form
        w = (X^T X + lam * I)^(-1) X^T y
        '''
        # Input data matrix X with one row [1, x_1, ..., x_dim] per data point
        data_matrix = np.array([[1.0] + point[0] for point in self.data])
        # Target vector y of f(x) values for the x inputs
        target_vector = np.array([point[1] for point in self.data])
        # Regularized pseudo-inverse (X^T X + lam * I)^(-1) X^T
        reg_pinv = np.linalg.inv(data_matrix.T.dot(data_matrix)
                                 + self.lam * np.identity(self.dim + 1)).dot(data_matrix.T)
        self.weights = reg_pinv.dot(target_vector)  # 1-D weight vector of length dim + 1

    def classify(self, point):
        '''
        For classification problems:
        Takes as "point" a tuple (x, y) with x a vector and y = f(x)
        and returns True if sign(h(x)) = f(x) and False if not
        '''
        h = self.hypothesis(point[0])
        return math.copysign(1.0, h) == point[1]

    def classification_error(self, data):
        '''
        For classification problems:
        Computes the in-sample error Ein when given self.data as data,
        and the out-of-sample error Eout when given a dataset generated by f(x)
        '''
        g_misclass_points = 0  # counter of data points misclassified by the hypothesis g
        for point in data:
            if not self.classify(point):
                g_misclass_points += 1
        # Return the fraction of misclassified points
        return g_misclass_points / len(data)
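

if __name__ == '__main__':
    # Minimal usage sketch (not part of the original module): fit the model on
    # made-up, noiseless 1-D data drawn from y = 2x + 1. The data values and
    # lam = 0.0 below are illustrative assumptions, not taken from the source.
    toy_data = [([float(x)], 2.0 * x + 1.0) for x in range(10)]
    model = Regression(dim=1, data=toy_data, lam=0.0)
    model.solve()
    print('weights:', model.weights)                            # roughly [1.0, 2.0]
    print('in-sample error:', model.sample_error(model.data))   # roughly 0 for noiseless data
    print('h(11):', model.hypothesis([11.0]))                   # roughly 23.0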