Skip to content

Commit

Permalink
new comments
Browse files Browse the repository at this point in the history
  • Loading branch information
AmazaspShumik committed Jun 7, 2015
1 parent 42613a2 commit 4981bdf
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 14 deletions.
Binary file added Linear Regression MapReduce/.DS_Store
Binary file not shown.
26 changes: 12 additions & 14 deletions Linear Regression MapReduce/LinearRegressionTS.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,9 @@ def __str__(self):
return err



############################## Map Reduce Job #################################



class LinearRegressionTS(MRJob):
'''
Calculates sample covariance matrix of explanatory variables (x_t_x) and
Expand Down Expand Up @@ -95,8 +93,8 @@ class LinearRegressionTS(MRJob):
def __init__(self,*args, **kwargs):
super(LinearRegressionTS, self).__init__(*args, **kwargs)
n = self.options.dimension
self.x_t_x = np.zeros([n,n])
self.x_t_y = np.zeros(n)
self.x_t_x = np.zeros([n,n])
self.x_t_y = np.zeros(n)
self.counts = 0

#--------------------------- feature extraction --------------------------#
Expand Down Expand Up @@ -146,17 +144,17 @@ def mapper_lr(self,_,line):
if self.options.bias is "True":
features.append(1.0)
x = np.array(features)
self.x_t_x+=np.outer(x, x)
self.x_t_y+=y*x
self.counts+=1
self.x_t_x += np.outer(x, x)
self.x_t_y += y*x
self.counts += 1

def mapper_lr_final(self):
'''
Transforms numpy arrays x_t_x and x_t_y into json-encodable list format
and sends to reducer
'''
yield 1,("x_t_x", [list(row) for row in self.x_t_x])
yield 1,("x_t_y", [xy for xy in self.x_t_y])
yield 1,("x_t_x", [list(row) for row in self.x_t_x])
yield 1,("x_t_y", [xy for xy in self.x_t_y])
yield 1,("counts", self.counts)

def reducer_lr(self,key,values):
Expand All @@ -170,19 +168,19 @@ def reducer_lr(self,key,values):
x_t_x = np.zeros([n,n]); x_t_y = np.zeros(n)
for val in values:
if val[0]=="x_t_x":
x_t_x+=np.array(val[1])
x_t_x += np.array(val[1])
elif val[0]=="x_t_y":
x_t_y+=np.array(val[1])
x_t_y += np.array(val[1])
elif val[0]=="counts":
observations+=val[1]
observations += val[1]
betas = cholesky_solution_linear_regression(x_t_x,x_t_y)
yield None,[e for e in betas]

def steps(self):
'''Defines map-reduce steps '''
return [MRStep(mapper = self.mapper_lr,
return [MRStep(mapper = self.mapper_lr,
mapper_final = self.mapper_lr_final,
reducer = self.reducer_lr)]
reducer = self.reducer_lr)]

if __name__=="__main__":
LinearRegressionTS.run()
Expand Down

1 comment on commit 4981bdf

@KarlHakanNordgren
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was under the impression that if the key -- 1 in the case of 'mapper_lr_final' -- is the same then the mapper will not divide the datasets among the nodes. So in this case all the data will be processed by the same node. I could well be wrong because until yesterday I didn't know what Hadoop was...

Please sign in to comment.