Now including feature selection via forests for interconnectors.

luke-marshall · May 22, 2017 · 437a182 · 437a182
1 parent 7349072
commit 437a182
Show file tree

Hide file tree

Showing 4 changed files with 165 additions and 82 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,4 @@
 
 *.png
+
+*.pyc
diff --git a/featureselection.py b/featureselection.py
@@ -0,0 +1,90 @@
+# Feature selection trial
+# Using forests
+# Adapted from sklearn tutorials. 
+# Original code here http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html#sphx-glr-auto-examples-ensemble-plot-forest-importances-py
+
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+from sklearn.datasets import make_classification
+from sklearn.ensemble import ExtraTreesClassifier
+
+import marketUtils
+
+
+
+nem = marketUtils.getNem()
+interconnectors = marketUtils.getInterconnectorFlows()
+X = []
+# xlabels = []
+y = []
+# Disabled earlier attempt that started building X one attribute at a time:
+# for attribute in list(interconnectors.itervalues().next()):
+# 	X.append([])
+# 	xlabels.append(attribute)
+
+
+# Prepare the data for the classifier: one row of X and one label in y per key of nem, in sorted key order.
+times = list(nem)
+times.sort()
+xLabels = list(interconnectors.itervalues().next())
+xLabels.sort()
+# NOTE(review): assumes every key of nem is also a key of interconnectors - TODO confirm.
+for time in times:
+	# Label 1 when the NSW price is at least 300, otherwise 0.
+	if float(nem[time]['nsw']['price']) >= 300:
+		classification = 1
+	else: 
+		classification = 0
+	y.append(classification)
+	row = []
+	for attribute in xLabels:
+		row.append(interconnectors[time][attribute])	
+	X.append(row)
+
+X = np.array(X)
+y = np.array(y)
+# Debug output: the feature matrix, then the number of samples labelled 1.
+print X
+print np.sum(y)
+
+
+
+
+# Disabled synthetic alternative from the sklearn example: a classification task using 3 informative features
+# X, y = make_classification(n_samples=1000,
+# 							n_features=10,
+# 							n_informative=3,
+# 							n_redundant=0,
+# 							n_repeated=0,
+# 							n_classes=2,
+# 							random_state=0,
+# 							shuffle=False)
+
+# print X
+
+# Build an ExtraTreesClassifier with 250 estimators and a fixed random_state, then compute the feature importances
+forest = ExtraTreesClassifier(n_estimators=250,random_state=0)
+
+
+# std is the per-feature spread of importances across the individual trees; indices orders features by descending importance.
+forest.fit(X, y)
+importances = forest.feature_importances_
+std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
+indices = np.argsort(importances)[::-1]
+
+# Print the feature ranking: rank, column index, importance, and the matching name from xLabels
+print("Feature ranking:")
+
+for f in range(X.shape[1]):
+
+	print("%d. feature %d (%f) %s" % (f + 1, indices[f], importances[indices[f]], str(xLabels[indices[f]])))
+
+# Plot the importances as red bars with std as error bars; x ticks show the column index, not the feature name
+plt.figure()
+plt.title("Feature importances")
+plt.bar(range(X.shape[1]), importances[indices], color="r", yerr=std[indices], align="center")
+plt.xticks(range(X.shape[1]), indices)
+plt.xlim([-1, X.shape[1]])
+plt.show()
diff --git a/interconnector.py b/interconnector.py
@@ -7,91 +7,14 @@
 import matplotlib.pyplot as plt
 import pandas as pd
 
-from matplotlib import colors as mcolors
-
-
-# from bokeh.charts import Scatter, output_file, show
-from bokeh.plotting import figure, output_file, show
-from bokeh.sampledata.autompg import autompg as df
-
-
-
-
-
-
-def saveToPickle(my_object, fileName):
-    print("Pickling my_object to file: "+str(fileName)+"...")
-    pickle.dump(my_object, open(fileName, "wb"))
-    print ("Saved.")
-
-def getFromPickle(fileName):
-    if os.path.isfile(fileName):
-        my_object = pickle.load(open(fileName, "rb"))
-        return my_object
-    else:
-        return None
-
-def getNem():
-    myFile = open('nem_allstates.csv')
-    nemData = csv.DictReader(myFile)
-    nem = {}
-
-    for timePeriod in nemData:
-        timeString = timePeriod['Time-ending']
-
-        nem[timeString] = {
-			 'nsw': {
-				'price': timePeriod['NSW1 Price'],
-				'demand':float(timePeriod['NSW1 Scheduled Demand']),
-				'nonScheduled':float(timePeriod['NSW1 Non-scheduled']),
-				'generation': float(timePeriod['NSW1 Generation']),
-				'availability':float(timePeriod['NSW1 Availability']),
-				},
-			'vic': {
-				'price': timePeriod['VIC1 Price'],
-				'demand':float(timePeriod['VIC1 Scheduled Demand']),
-				'nonScheduled':float(timePeriod['VIC1 Non-scheduled']),
-				'generation': float(timePeriod['VIC1 Generation']),
-				'availability':float(timePeriod['VIC1 Availability']),
-				},
-			'qld': {
-				'price': timePeriod['QLD1 Price'],
-				'demand':float(timePeriod['QLD1 Scheduled Demand']),
-				'nonScheduled':float(timePeriod['QLD1 Non-scheduled']),
-				'generation': float(timePeriod['QLD1 Generation']),
-				'availability':float(timePeriod['QLD1 Availability']),
-				},
-			'sa': {
-				'price': timePeriod['SA1 Price'],
-				'demand':float(timePeriod['SA1 Scheduled Demand']),
-				'nonScheduled':float(timePeriod['SA1 Non-scheduled']),
-				'generation': float(timePeriod['SA1 Generation']),
-				'availability':float(timePeriod['SA1 Availability']),
-				},
-			'tas': {
-				'price': timePeriod['TAS1 Price'],
-				'demand':float(timePeriod['TAS1 Scheduled Demand']),
-				'nonScheduled':float(timePeriod['TAS1 Non-scheduled']),
-				'generation': float(timePeriod['TAS1 Generation']),
-				'availability':float(timePeriod['TAS1 Availability']),
-				},
-		}
-    return nem
-
-def getInterconnectorFlows():
-	filename = 'interconnectorflows.csv'
-	# Reading the file
-	df = pd.read_csv(filename, index_col=0)
-	# Creating the dict
-	flows = df.transpose().to_dict()
-	return flows
-
 
 
+import marketUtils
 
 
 
 
+# Builds the data for every plot up front and returns it as a list of per-plot dicts; key_event then draws the currently selected plot.
 
 def chartFlowVsPrice(nem, flows):
 	plots = []
@@ -153,11 +76,13 @@ def key_event(e):
 	plt.title(plots[curr_plt_index]['title'])
 	fig.canvas.draw()
 
-nem = getNem()
-flows = getInterconnectorFlows()
-plots = chartFlowVsPrice(nem, flows)
 
 
+
+nem = marketUtils.getNem()
+flows = marketUtils.getInterconnectorFlows()
+plots = chartFlowVsPrice(nem, flows)
+
 fig = plt.figure()
 fig.canvas.mpl_connect('key_press_event', key_event)
 ax = fig.add_subplot(111)

diff --git a/marketUtils.py b/marketUtils.py
@@ -0,0 +1,66 @@
+import os
+import csv
+import numpy as np
+import pickle
+import pandas as pd
+
+
+def getNem():
+    """Return {time: {region: {field: float}}} parsed from nem_allstates.csv."""
+    nem = {}
+    # The with-block guarantees the CSV handle is closed once every row is read.
+    with open('nem_allstates.csv') as myFile:
+        for timePeriod in csv.DictReader(myFile):
+            # Rows are keyed by 'Time-ending'; float() raises ValueError on a non-numeric cell.
+            timeString = timePeriod['Time-ending']
+            nem[timeString] = {
+                'nsw': {
+                    'price': float(timePeriod['NSW1 Price']),
+                    'demand': float(timePeriod['NSW1 Scheduled Demand']),
+                    'nonScheduled': float(timePeriod['NSW1 Non-scheduled']),
+                    'generation': float(timePeriod['NSW1 Generation']),
+                    'availability': float(timePeriod['NSW1 Availability']),
+                },
+                'vic': {
+                    'price': float(timePeriod['VIC1 Price']),
+                    'demand': float(timePeriod['VIC1 Scheduled Demand']),
+                    'nonScheduled': float(timePeriod['VIC1 Non-scheduled']),
+                    'generation': float(timePeriod['VIC1 Generation']),
+                    'availability': float(timePeriod['VIC1 Availability']),
+                },
+                'qld': {
+                    'price': float(timePeriod['QLD1 Price']),
+                    'demand': float(timePeriod['QLD1 Scheduled Demand']),
+                    'nonScheduled': float(timePeriod['QLD1 Non-scheduled']),
+                    'generation': float(timePeriod['QLD1 Generation']),
+                    'availability': float(timePeriod['QLD1 Availability']),
+                },
+                'sa': {
+                    'price': float(timePeriod['SA1 Price']),
+                    'demand': float(timePeriod['SA1 Scheduled Demand']),
+                    'nonScheduled': float(timePeriod['SA1 Non-scheduled']),
+                    'generation': float(timePeriod['SA1 Generation']),
+                    'availability': float(timePeriod['SA1 Availability']),
+                },
+                'tas': {
+                    'price': float(timePeriod['TAS1 Price']),
+                    'demand': float(timePeriod['TAS1 Scheduled Demand']),
+                    'nonScheduled': float(timePeriod['TAS1 Non-scheduled']),
+                    'generation': float(timePeriod['TAS1 Generation']),
+                    'availability': float(timePeriod['TAS1 Availability']),
+                },
+            }
+    return nem
+
+def getInterconnectorFlows():
+	"""Return interconnectorflows.csv as a dict keyed by the values of its first column."""
+	source = 'interconnectorflows.csv'
+	# index_col=0 makes the first CSV column the row index.
+	frame = pd.read_csv(source, index_col=0)
+	# After transposing, to_dict() maps each row-index value to a {column: value} dict.
+	return frame.transpose().to_dict()
+
+
+
+
+