diff --git a/.ipynb_checkpoints/CS 4780 Final Project Student Template-checkpoint.ipynb b/.ipynb_checkpoints/CS 4780 Final Project Student Template-checkpoint.ipynb
index 4223a52..5c459fd 100755
--- a/.ipynb_checkpoints/CS 4780 Final Project Student Template-checkpoint.ipynb
+++ b/.ipynb_checkpoints/CS 4780 Final Project Student Template-checkpoint.ipynb
@@ -74,6 +74,7 @@
"import numpy as np\n",
"#import sklearn as sk\n",
"from sklearn.preprocessing import StandardScaler\n",
+ "from sklearn.preprocessing import KBinsDiscretizer\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn import svm\n",
"from sklearn.ensemble import AdaBoostClassifier as ABC\n",
@@ -84,7 +85,6 @@
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.model_selection import KFold\n",
"from sklearn.metrics import balanced_accuracy_score\n",
- "from sklearn.preprocessing import KBinsDiscretizer\n",
"from sklearn.compose import ColumnTransformer"
]
},
@@ -159,7 +159,6 @@
"\n",
"#Trim the data of irrelevant fields and convert to numpy array\n",
"data = data[:,4:]\n",
- "#print(data.dtypes)\n",
"\n",
"#Load and format the test_2016_no_label.csv file\n",
"test = pd.read_csv(\"test_2016_no_label.csv\", sep=',', header=0, encoding='unicode_escape', thousands=\",\")\n",
@@ -248,16 +247,16 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "knn avg weighted accuracy: 0.6203334398310165\n",
- "dtc avg weighted accuracy: 0.7070879048578987\n",
- "dtc_t avg weighted accuracy: 0.6931251192737173\n",
- "adb_dtc avg weighted accuracy: 0.6961593601061467\n",
- "svc avg weighted accuracy: 0.7004446186230044\n",
- "lr avg weighted accuracy: 0.6590264805800705\n",
- "adb_lr avg weighted accuracy: 0.6497330664169988\n",
- "nbc avg weighted accuracy: 0.538413031775743\n",
- "adb_nbc avg weighted accuracy: 0.5742813326608044\n",
- "219.0\n"
+ "knn avg weighted accuracy: 0.6237076582105193\n",
+ "dtc avg weighted accuracy: 0.6979205324376133\n",
+ "dtc_t avg weighted accuracy: 0.6855368083208002\n",
+ "adb_dtc avg weighted accuracy: 0.700633705568728\n",
+ "svc avg weighted accuracy: 0.6971740145905911\n",
+ "lr avg weighted accuracy: 0.6592688655540591\n",
+ "adb_lr avg weighted accuracy: 0.6488225086599588\n",
+ "nbc avg weighted accuracy: 0.5358837627311691\n",
+ "adb_nbc avg weighted accuracy: 0.5720794507211426\n",
+ "250.0\n"
]
}
],
@@ -350,69 +349,9 @@
"You may follow the steps in part 2 again but making innovative changes like creating new features, using new training algorithms, etc. Make sure you explain everything clearly in part 3.2. Note that reaching the 75% creative baseline is only a small portion of this part. Any creative ideas will receive most points as long as they are reasonable and clearly explained."
]
},
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Make sure you comment your code clearly and you may refer to these comments in the part 3.2\n",
- "# TODO\n",
- "\n",
- "dtc.fit(train, yTr)\n",
- "dtc_preds = dtc.predict(valid)\n",
- "print(\"dtc weighted accuracy: \", weighted_accuracy(dtc_preds,yV)) #does pretty well, even with no pruning\n",
- "\n",
- "dtc_g.fit(train, yTr)\n",
- "dtc_g_preds = dtc.predict(valid)\n",
- "print(\"dtc_g weighted accuracy: \", weighted_accuracy(dtc_g_preds,yV)) #does pretty well, even with no pruning\n",
- "\n",
- "adb_dtc.fit(train, yTr)\n",
- "adb_dtc_preds = adb_dtc.predict(valid)\n",
- "print(\"adb_dtc weighted accuracy: \", weighted_accuracy(adb_dtc_preds,yV))\n",
- "\n",
- "#best validation error so far\n",
- "svc.fit(train, yTr)\n",
- "svc_preds = svc.predict(valid)\n",
- "print(\"svc weighted accuracy: \", weighted_accuracy(svc_preds,yV))\n",
- "\n",
- "#does not work very well; not really better than random\n",
- "nbc.fit(train, yTr)\n",
- "nbc_preds = nbc.predict(valid)\n",
- "print(\"nbc weighted accuracy: \", weighted_accuracy(nbc_preds,yV))\n",
- "\n",
- "adb_nbc.fit(train, yTr)\n",
- "adb_nbc_preds = adb_nbc.predict(valid)\n",
- "print(\"adb_nbc weighted accuracy: \", weighted_accuracy(adb_nbc_preds,yV))\n",
- "\n",
- "#Split the data into a train and test set\n",
- "np.random.shuffle(data)\n",
- " \n",
- "results = np.array(df[:,-1],dtype='bool')\n",
- "data = df[:,:6]\n",
- "n_valid = int(len(data)/5)\n",
- "train = data[:4*n_valid]\n",
- "yTr = results[:4*n_valid]\n",
- "valid = data[4*n_valid:]\n",
- " yV = results[4*n_valid:]\n",
- "\n",
- "n_valid = int(len(data)/5)\n",
- "j = 100;\n",
- "\n",
- "#for k in range(5, 100):\n",
- "sum_err = 0;\n",
- "#knn = KNeighborsClassifier(n_neighbors = k)\n",
- "for i in range(j):\n",
- " #randomize training and validation set\n",
- " np.random.shuffle(data)\n",
- "\n",
- " #run algorithm\n",
- " knn.fit(train, yTr)\n",
- " knn_preds = knn.predict(valid)\n",
- " sum_err = sum_err + sum(knn_preds != yV)/n_valid"
- ]
- },
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
@@ -420,97 +359,48 @@
"\n",
"### (3.1) Preprocessing and Feature Extraction ###\n",
"\n",
- "###(3.1.a) Load and format the training data set(s)###\n",
- "data = pd.read_csv(\"train_2016.csv\", sep=',', header=0, encoding='unicode_escape', thousands=\",\")\n",
- "data_2012 = pd.read_csv(\"train_2012.csv\", sep=\",\", header=0, encoding='unicode_escape', thousands=',')\n",
- "graph = pd.read_csv(\"graph.csv\", sep=',')\n",
+ "###Helper functions to preprocess data###\n",
"\n",
"#Add binary features for the state that the county is in (where each binary feature represents one state)\n",
- "\n",
- "###2016###\n",
- "data[\"State\"] = [x.strip()[-2:] for x in data['County']] #Extract state abbreviations from County\n",
- "dummies = pd.get_dummies(data['State']) #Create state dummies from State\n",
- "data = pd.concat([data,dummies],axis=1)\n",
- "data = data.drop('State', axis=1)\n",
- "\n",
- "###2012###\n",
- "data_2012[\"State\"] = [x.strip()[-2:] for x in data_2012['County']] #Extract state abbreviations from County\n",
- "dummies_2012 = pd.get_dummies(data_2012['State']) #Create state dummies from State\n",
- "data_2012 = pd.concat([data_2012,dummies_2012],axis=1)\n",
- "data_2012 = data_2012.drop(\"State\", axis=1)\n",
- "\n",
- "#Define a function which will preprocess the data\n",
- "def preprocess(data):\n",
- " #Normalize features\n",
- " #Standardize the data with mean 0 and s.d. 1\n",
+ "def get_states(X):\n",
+ " X[\"State\"] = [x.strip()[-2:] for x in X['County']]\n",
+ " dummies_test = pd.get_dummies(X['State'])\n",
+ " X = pd.concat([X,dummies_test],axis=1)\n",
+ " X = X.drop('State',axis=1)\n",
+ " return X\n",
+ "\n",
+ "#Normalizes all variables from the data except for the binary state variables\n",
+ "def normalize(X):\n",
+ " #Normalize features Standardize the data with mean 0 and s.d. 1\n",
" scalar = StandardScaler()\n",
- " _,d = np.shape(data)\n",
- " data_norm = data[:,[0,1,2,3,4,5,(d-2),(d-1)]] #d-2 and d-1 store the graph score and number of known neighbors\n",
- " #data_raw_indexes = np.r_[6:(d-2)]\n",
- " data_bin = data[:,6:(d-2)]\n",
- " #print('untransformed data', data_bin)\n",
+ " _,d = np.shape(X)\n",
+ " data_norm = X[:,[0,1,2,3,4,5,(d-2),(d-1)]] #d-2 and d-1 store the graph score and number of known neighbors\n",
+ " data_bin = X[:,6:(d-2)]\n",
" scalar.fit(data_norm)\n",
" data_norm = scalar.transform(data_norm)\n",
- " #print('transformed data', data_norm)\n",
- " data = np.hstack((data_norm, data_bin))\n",
- " #print(data)\n",
- "\n",
- " #scalar = StandardScaler()\n",
- " #binning = ['MedianIncome']\n",
- " #standardization = ['MigraRate','BirthRate','DeathRate','BachelorRate','UnemploymentRate']\n",
- " #n,d = np.shape(counties)\n",
- " #print(d)\n",
- " #binning = [4]\n",
- " #standardization = [*range(5,d)]\n",
- " #print(standardization)\n",
- " #ct = ColumnTransformer([(\"standardize\", scalar, standardization), \n",
- " # (\"binning\", KBinsDiscretizer(n_bins=10), binning)])\n",
- " #try min maxing if binning doesn't \n",
- " #data = ct.fit_transform(counties)\n",
- " return data"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "###EXPERIMENTING DON'T RUN YET###\n",
- "\n",
- "#Add binary feature for whether a county is on a state border (which may indicate they receive other states' political ads)\n",
- "fips_allstates = data[[\"FIPS\",\"State\"]]\n",
- "tmp1 = graph.merge(fips_allstates, left_on=\"SRC\", right_on=\"FIPS\", how=\"left\")\n",
- "tmp1 = tmp1[[\"SRC\",\"DST\",\"State\"]]\n",
- "tmp1.rename(columns={'State':'SRC_State'}, inplace=True)\n",
- "print(tmp1)\n",
+ " X = np.hstack((data_norm, data_bin))\n",
+ " return X\n",
"\n",
- "tmp2 = tmp1.merge(fips_allstates, left_on=\"DST\", right_on=\"FIPS\", how=\"left\")\n",
- "tmp2 = tmp2[[\"SRC\",\"DST\",\"SRC_State\",\"State\"]]\n",
- "tmp2.rename(columns={'State':'DST_State'}, inplace=True)\n",
- "print(tmp2)\n",
- "\n",
- "tmp2[\"SRConBorder\"] = tmp2[\"SRC_State\"].eq(tmp2[\"DST_State\"])\n",
- "tmp2[\"SRConBorder\"] = tmp2[\"SRConBorder\"].astype(int)\n",
- "tmp2[\"SRConBorder\"] = tmp2[\"SRConBorder\"].replace({0:1,1:0}) #Make sure =1 iff on border\n",
- "print(tmp2)\n",
- "tmp2 = tmp2.dropna() #Because NaNs indicate counties that are not in training data - not sure how to deal with loss of data\n",
- "tmp2 = tmp2.groupby(['SRC'])['SRConBorder'].sum()\n",
- "print(tmp2)\n",
+ "###(3.1.a) Load and format the training data set(s)###\n",
+ "data = pd.read_csv(\"train_2016.csv\", sep=',', header=0, encoding='unicode_escape', thousands=\",\")\n",
+ "data_2012 = pd.read_csv(\"train_2012.csv\", sep=\",\", header=0, encoding='unicode_escape', thousands=',')\n",
+ "graph = pd.read_csv(\"graph.csv\", sep=',', header=0, encoding='unicode_escape', thousands=\",\")\n",
"\n",
- "data = data.merge(tmp2, left_on=\"FIPS\", right_on=\"SRC\", how=\"left\")\n",
- "print(data)\n",
+ "###2016###\n",
+ "data = get_states(data)\n",
"\n",
- "#Notes: Source counties can be found in test, destination in train (or vice versa - one occurence in either column(?))\n",
- "#Notes: Get list of neighbors that voted one way or other (check one per column)"
+ "###2012###\n",
+ "data_2012 = get_states(data_2012)\n",
+ "\n"
]
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"#Creates a lexicographically ordered list of all the county's neighbors\n",
- "graph = pd.read_csv(\"graph.csv\", sep=',', header=0, encoding='unicode_escape', thousands=\",\")\n",
"neighbors = {}\n",
"for i in range(len(graph)):\n",
" src = graph['SRC'][i]\n",
@@ -531,15 +421,12 @@
"dem = dict(zip(data['FIPS'], dem_perc))\n",
"votes = data[['DEM','GOP']]\n",
"votes = list(votes.itertuples(index=False, name=None))\n",
- "votes = dict(zip(data['FIPS'], votes))\n",
- "votes_2012 = data_2012[['DEM','GOP']]\n",
- "votes_2012 = list(votes_2012.itertuples(index=False, name=None))\n",
- "votes_2012 = dict(zip(data['FIPS'], votes_2012))"
+ "votes = dict(zip(data['FIPS'], votes))"
]
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
@@ -554,7 +441,7 @@
" score += dem[i]\n",
" return n, score/n if n != 0 else np.nan\n",
"\n",
- "#TODO - implement a function that calculates the percentage of all voters in the surrounding counties who voted dem\n",
+ "#Calculates the percentage of all voters from the neighboring counties who voted democratic\n",
"def aggregate_gscore(neighs, votes):\n",
" n = 0\n",
" dem = 0\n",
@@ -577,27 +464,37 @@
" n = 0\n",
" else:\n",
" n, score = aggregate_gscore(neighbors[key], votes)\n",
- " _, s2012 = aggregate_gscore(neighbors[key], votes_2012)\n",
+ " total_n = len(neighbors[key])\n",
" counties.loc[i, 'GraphScore'] = score\n",
" counties.loc[i, 'Neighbors'] = n\n",
- " mean = counties['GraphScore'].sum()/len(counties)\n",
- " counties['GraphScore'] = counties['GraphScore'].map(lambda x: mean if pd.isnull(x) else x)\n",
+ " smean = counties['GraphScore'].sum()/len(counties)\n",
+ " counties['GraphScore'] = counties['GraphScore'].map(lambda x: smean if pd.isnull(x) else x)\n",
" return counties"
]
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 61,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['FIPS', 'County', 'DEM', 'GOP', 'MedianIncome', 'MigraRate', 'BirthRate', 'DeathRate', 'BachelorRate', 'UnemploymentRate', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'GraphScore', 'Neighbors']\n",
+ "['FIPS', 'County', 'DEM', 'GOP', 'MedianIncome', 'MigraRate', 'BirthRate', 'DeathRate', 'BachelorRate', 'UnemploymentRate', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'GraphScore', 'Neighbors']\n"
+ ]
+ }
+ ],
"source": [
"###Prep for concatenating###\n",
"\n",
"###2016###\n",
- "#cols = data.columns.tolist()\n",
- "#cols = cols[0:2] + cols[4:] #Helps make sure columns in test align with those in data\n",
- "\n",
"data = add_gscore(data)\n",
+ "#Rearrange the columns to ensure that DC is in the same location of each dataset\n",
+ "cols = data.columns.tolist()\n",
+ "cols = cols[0:2] + cols[4:]\n",
+ "print(list(data.columns))\n",
"data = data.to_numpy()\n",
"\n",
"#These are the training labels for each county, 1 if the county voted Dem and 0 if it voted Rep\n",
@@ -606,11 +503,13 @@
"\n",
"#Trim the data of irrelevant fields and convert to numpy array\n",
"data = data[:,4:]\n",
- "#print(np.shape(data))\n",
- "#print(data)\n",
"\n",
"###2012###\n",
"data_2012 = add_gscore(data_2012)\n",
+ "#Rearrange the columns to ensure that DC is in the same location of each dataset\n",
+ "cols_2012 = data_2012.columns.tolist()\n",
+ "cols_2012 = cols_2012[0:2] + cols_2012[4:]\n",
+ "print(list(data_2012.columns))\n",
"data_2012 = data_2012.to_numpy()\n",
"\n",
"#These are the training labels for each county, 1 if the county voted Dem and 0 if it voted Rep\n",
@@ -622,9 +521,18 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 62,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['FIPS', 'County', 'MedianIncome', 'MigraRate', 'BirthRate', 'DeathRate', 'BachelorRate', 'UnemploymentRate', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'GraphScore', 'Neighbors']\n",
+ "['FIPS', 'County', 'MedianIncome', 'MigraRate', 'BirthRate', 'DeathRate', 'BachelorRate', 'UnemploymentRate', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'GraphScore', 'Neighbors']\n"
+ ]
+ }
+ ],
"source": [
"###(3.1.b) Load and format the testing data sets###\n",
"test = pd.read_csv(\"test_2016_no_label.csv\", sep=',', header=0, encoding='unicode_escape', thousands=\",\")\n",
@@ -633,15 +541,11 @@
"#Process the testing data set in the same manner as the training data set\n",
"\n",
"###2016###\n",
- "test[\"State\"] = [x.strip()[-2:] for x in test['County']]\n",
- "dummies_test = pd.get_dummies(test['State'])\n",
- "#print(dummies.columns.difference(dummies_test.columns)) #Check that DC is the only missing county/state\n",
- "test = pd.concat([test,dummies_test],axis=1)\n",
+ "test = get_states(test)\n",
"test['DC'] = 0 #Add in a zero vector for DC (because there is no DC in either test data set)\n",
- "test = test.drop('State',axis=1)\n",
- "#print(test)\n",
- "#test = test[cols] #Make sure columns in test align with those in data\n",
"test = add_gscore(test)\n",
+ "test = test[cols]\n",
+ "print(list(test.columns))\n",
"test = test.to_numpy()\n",
"\n",
"fips = test[:,0]\n",
@@ -651,21 +555,19 @@
"data = np.vstack((data, test))\n",
"\n",
"#Standardize the (non-binary) features to have mean 0 and s.d. 1\n",
- "data = preprocess(data)\n",
+ "data = normalize(data)\n",
"\n",
"test = data[-n_data:,:]\n",
"data = data[:n_data,:]\n",
"data = np.hstack((data, np.transpose(np.array([results]))))\n",
"\n",
"###2012###\n",
- "test_2012[\"State\"] = [x.strip()[-2:] for x in test_2012['County']]\n",
- "dummies_test_2012 = pd.get_dummies(test_2012['State'])\n",
- "#print(dummies_2012.columns.difference(dummies_test_2012.columns)) #Check that DC is the only missing county/state\n",
- "test_2012 = pd.concat([test_2012,dummies_test_2012],axis=1)\n",
+ "test_2012 = get_states(test_2012)\n",
"test_2012['DC'] = 0 #Add in a zero vector for DC (because there is no DC in either test data set)\n",
- "test_2012 = test_2012.drop('State',axis=1)\n",
+ "test_2012 = add_gscore(test_2012)\n",
+ "test_2012 = test_2012[cols_2012]\n",
+ "print(list(test_2012.columns))\n",
"\n",
- "add_gscore(test_2012)\n",
"test_2012 = test_2012.to_numpy()\n",
"\n",
"fips_2012 = test_2012[:,0]\n",
@@ -674,7 +576,7 @@
"n_data_2012 = len(data_2012)\n",
"data_2012 = np.vstack((data_2012, test_2012))\n",
"\n",
- "data_2012 = preprocess(data_2012)\n",
+ "data_2012 = normalize(data_2012)\n",
"test_2012 = data_2012[-n_data_2012:,:]\n",
"data_2012 = data_2012[:n_data_2012,:]\n",
"data_2012 = np.hstack((data_2012, np.transpose(np.array([results_2012]))))"
@@ -682,9 +584,20 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 67,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[ 1.25892541 1.49170935 1.76753662 2.09436625 2.48162892 2.94049911\n",
+ " 3.48421754 4.12847324 4.89185622 5.79639395 6.86818691 8.13816172\n",
+ " 9.64296358 11.42601361 13.5387618 16.04217161 19.00847905 22.52327705\n",
+ " 26.68798528 31.6227766 ]\n"
+ ]
+ }
+ ],
"source": [
"### (3.2) Testing algorithms ###\n",
"\n",
@@ -699,7 +612,9 @@
"adb_dtc_g = ABC(base_estimator = DTC(criterion = \"entropy\"), n_estimators = 100)\n",
"\n",
"#SVM classifier\n",
- "Cs = np.logspace(1.25,1.25,20)\n",
+ "Cs = np.logspace(.1,1.5,20)\n",
+ "print(Cs)\n",
+ "#Cs = np.logspace(1.25,1.25,20)\n",
"svc = svm.SVC(gamma='scale', kernel='rbf')\n",
"c_svc = GridSearchCV(estimator=svc,param_grid=dict(C=Cs))\n",
"\n",
@@ -716,27 +631,25 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "knn avg weighted accuracy: 0.6329961166249547\n",
- "knn avg weighted accuracy 2012: 0.6215636493214642\n",
- "dtc_t avg weighted accuracy: 0.7461445847623741\n",
- "dtc_t avg weighted accuracy 2012: 0.6661740263653098\n",
- "adb_dtc avg weighted accuracy: 0.708280844124877\n",
- "adb_dtc avg weighted accuracy 2012: 0.7105907368805827\n",
- "dtc_g avg weighted accuracy: 0.7506852491001083\n",
- "dtc_g avg weighted accuracy 2012: 0.683730011834672\n",
- "adb_dtc_g avg weighted accuracy: 0.760068618725555\n",
- "adb_dtc_g avg weighted accuracy 2012: 0.6846362764834062\n",
- "svc avg weighted accuracy: 0.7974214368589101\n",
- "svc avg weighted accuracy 2012: 0.7389995155649615\n",
- "lr avg weighted accuracy: 0.7449232757260804\n",
- "lr avg weighted accuracy 2012: 0.713245836578974\n"
+ "knn avg weighted accuracy: 0.6330706501088754\n",
+ "knn avg weighted accuracy 2012: 0.6287551049058671\n",
+ "dtc_t avg weighted accuracy: 0.7126628181827105\n",
+ "dtc_t avg weighted accuracy 2012: 0.6654573475411754\n",
+ "adb_dtc avg weighted accuracy: 0.7583268043136182\n",
+ "adb_dtc avg weighted accuracy 2012: 0.6876199289767525\n",
+ "dtc_g avg weighted accuracy: 0.7331652849631233\n",
+ "dtc_g avg weighted accuracy 2012: 0.6960158238672269\n",
+ "adb_dtc_g avg weighted accuracy: 0.7526903758034124\n",
+ "adb_dtc_g avg weighted accuracy 2012: 0.6782594899860952\n",
+ "svc avg weighted accuracy: 0.7850360498781626\n",
+ "svc avg weighted accuracy 2012: 0.7207312417562269\n"
]
}
],
@@ -832,42 +745,30 @@
"print(\"svc avg weighted accuracy: \", x)\n",
"print(\"svc avg weighted accuracy 2012: \", x_2012)\n",
"\n",
- "#print(\"lr avg weighted accuracy: \", kfold(lr, data))\n",
- "#print(\"lr avg weighted accuracy: \", kfold(c_lr, data, grid=\"Yes\", grid_model=\"lr\")) #.721 using Cs = np.logspace(-1,1,100) and solver = 'liblinear'\n",
- "#print(\"lr avg weighted accuracy 2012: \", kfold(c_lr, data_2012, grid=\"Yes\", grid_model=\"lr\"))\n",
"x_lr,y_lr,z_lr = kfold_lr(c_lr, data)\n",
"x_lr_2012,y_lr_2012,z_lr_2012 = kfold_lr(c_lr, data_2012)\n",
"print(\"lr avg weighted accuracy: \", x_lr)\n",
"print(\"lr avg weighted accuracy 2012: \", x_lr_2012)\n",
"\n",
- "#print(\"adb_lr avg weighted accuracy: \", kfold(adb_lr, data)) #.713 using solver = 'liblinear'\n",
- "#print(\"adb_lr avg weighted accuracy 2012: \", kfold(adb_lr, data_2012)) #.713 using solver = 'liblinear'\n",
- "\n",
- "#print(\"nbc avg weighted accuracy: \", kfold(nbc, data)) #.653\n",
- "\n",
- "#print(\"adb_nbc avg weighted accuracy: \", kfold(adb_nbc, data)) #.5 using n_estimators = 100\n",
- "\n",
"#Get final predictions\n",
- "preds = z.predict(test) #Uses 2016"
+ "preds_creative = z.predict(test) #Uses 2016"
]
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 65,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "223\n",
- "216\n"
+ "235\n"
]
}
],
"source": [
- "print(sum(preds))\n",
- "print(sum(z.predict(data[:,:-1])))"
+ "print(sum(preds_creative))"
]
},
{
@@ -888,6 +789,13 @@
"3.2.2 Please explain in detail how you achieved this and what you did specifically and why you tried this."
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -908,16 +816,27 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 56,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "228\n"
+ ]
+ }
+ ],
"source": [
"# TODO\n",
"output = np.array([fips, preds])\n",
"output = pd.DataFrame(np.transpose(output))\n",
"output[1] = output[1].astype(int)\n",
- "#output.to_csv(\"submission.csv\", header=[\"FIPS\",\"Result\"], index = False)\n",
- "output.to_csv(\"submission_creative.csv\", header=[\"FIPS\",\"Result\"], index = False)\n",
+ "output_creative = np.array([fips, preds_creative])\n",
+ "output_creative = pd.DataFrame(np.transpose(output_creative))\n",
+ "output_creative[1] = output_creative[1].astype(int)\n",
+ "output.to_csv(\"submission.csv\", header=[\"FIPS\",\"Result\"], index = False)\n",
+ "output_creative.to_csv(\"submission_creative.csv\", header=[\"FIPS\",\"Result\"], index = False)\n",
"\n",
"# You may use pandas to generate a dataframe with FIPS and your predictions first \n",
"# and then use to_csv to generate a CSV file."
diff --git a/CS 4780 Final Project Student Template.ipynb b/CS 4780 Final Project Student Template.ipynb
index c6aadc1..5c459fd 100755
--- a/CS 4780 Final Project Student Template.ipynb
+++ b/CS 4780 Final Project Student Template.ipynb
@@ -74,6 +74,7 @@
"import numpy as np\n",
"#import sklearn as sk\n",
"from sklearn.preprocessing import StandardScaler\n",
+ "from sklearn.preprocessing import KBinsDiscretizer\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn import svm\n",
"from sklearn.ensemble import AdaBoostClassifier as ABC\n",
@@ -84,7 +85,6 @@
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.model_selection import KFold\n",
"from sklearn.metrics import balanced_accuracy_score\n",
- "from sklearn.preprocessing import KBinsDiscretizer\n",
"from sklearn.compose import ColumnTransformer"
]
},
@@ -159,7 +159,6 @@
"\n",
"#Trim the data of irrelevant fields and convert to numpy array\n",
"data = data[:,4:]\n",
- "#print(data.dtypes)\n",
"\n",
"#Load and format the test_2016_no_label.csv file\n",
"test = pd.read_csv(\"test_2016_no_label.csv\", sep=',', header=0, encoding='unicode_escape', thousands=\",\")\n",
@@ -248,16 +247,16 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "knn avg weighted accuracy: 0.6203334398310165\n",
- "dtc avg weighted accuracy: 0.7070879048578987\n",
- "dtc_t avg weighted accuracy: 0.6931251192737173\n",
- "adb_dtc avg weighted accuracy: 0.6961593601061467\n",
- "svc avg weighted accuracy: 0.7004446186230044\n",
- "lr avg weighted accuracy: 0.6590264805800705\n",
- "adb_lr avg weighted accuracy: 0.6497330664169988\n",
- "nbc avg weighted accuracy: 0.538413031775743\n",
- "adb_nbc avg weighted accuracy: 0.5742813326608044\n",
- "219.0\n"
+ "knn avg weighted accuracy: 0.6237076582105193\n",
+ "dtc avg weighted accuracy: 0.6979205324376133\n",
+ "dtc_t avg weighted accuracy: 0.6855368083208002\n",
+ "adb_dtc avg weighted accuracy: 0.700633705568728\n",
+ "svc avg weighted accuracy: 0.6971740145905911\n",
+ "lr avg weighted accuracy: 0.6592688655540591\n",
+ "adb_lr avg weighted accuracy: 0.6488225086599588\n",
+ "nbc avg weighted accuracy: 0.5358837627311691\n",
+ "adb_nbc avg weighted accuracy: 0.5720794507211426\n",
+ "250.0\n"
]
}
],
@@ -350,69 +349,9 @@
"You may follow the steps in part 2 again but making innovative changes like creating new features, using new training algorithms, etc. Make sure you explain everything clearly in part 3.2. Note that reaching the 75% creative baseline is only a small portion of this part. Any creative ideas will receive most points as long as they are reasonable and clearly explained."
]
},
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Make sure you comment your code clearly and you may refer to these comments in the part 3.2\n",
- "# TODO\n",
- "\n",
- "dtc.fit(train, yTr)\n",
- "dtc_preds = dtc.predict(valid)\n",
- "print(\"dtc weighted accuracy: \", weighted_accuracy(dtc_preds,yV)) #does pretty well, even with no pruning\n",
- "\n",
- "dtc_g.fit(train, yTr)\n",
- "dtc_g_preds = dtc.predict(valid)\n",
- "print(\"dtc_g weighted accuracy: \", weighted_accuracy(dtc_g_preds,yV)) #does pretty well, even with no pruning\n",
- "\n",
- "adb_dtc.fit(train, yTr)\n",
- "adb_dtc_preds = adb_dtc.predict(valid)\n",
- "print(\"adb_dtc weighted accuracy: \", weighted_accuracy(adb_dtc_preds,yV))\n",
- "\n",
- "#best validation error so far\n",
- "svc.fit(train, yTr)\n",
- "svc_preds = svc.predict(valid)\n",
- "print(\"svc weighted accuracy: \", weighted_accuracy(svc_preds,yV))\n",
- "\n",
- "#does not work very well; not really better than random\n",
- "nbc.fit(train, yTr)\n",
- "nbc_preds = nbc.predict(valid)\n",
- "print(\"nbc weighted accuracy: \", weighted_accuracy(nbc_preds,yV))\n",
- "\n",
- "adb_nbc.fit(train, yTr)\n",
- "adb_nbc_preds = adb_nbc.predict(valid)\n",
- "print(\"adb_nbc weighted accuracy: \", weighted_accuracy(adb_nbc_preds,yV))\n",
- "\n",
- "#Split the data into a train and test set\n",
- "np.random.shuffle(data)\n",
- " \n",
- "results = np.array(df[:,-1],dtype='bool')\n",
- "data = df[:,:6]\n",
- "n_valid = int(len(data)/5)\n",
- "train = data[:4*n_valid]\n",
- "yTr = results[:4*n_valid]\n",
- "valid = data[4*n_valid:]\n",
- " yV = results[4*n_valid:]\n",
- "\n",
- "n_valid = int(len(data)/5)\n",
- "j = 100;\n",
- "\n",
- "#for k in range(5, 100):\n",
- "sum_err = 0;\n",
- "#knn = KNeighborsClassifier(n_neighbors = k)\n",
- "for i in range(j):\n",
- " #randomize training and validation set\n",
- " np.random.shuffle(data)\n",
- "\n",
- " #run algorithm\n",
- " knn.fit(train, yTr)\n",
- " knn_preds = knn.predict(valid)\n",
- " sum_err = sum_err + sum(knn_preds != yV)/n_valid"
- ]
- },
{
"cell_type": "code",
- "execution_count": 34,
+ "execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
@@ -420,97 +359,48 @@
"\n",
"### (3.1) Preprocessing and Feature Extraction ###\n",
"\n",
- "###(3.1.a) Load and format the training data set(s)###\n",
- "data = pd.read_csv(\"train_2016.csv\", sep=',', header=0, encoding='unicode_escape', thousands=\",\")\n",
- "data_2012 = pd.read_csv(\"train_2012.csv\", sep=\",\", header=0, encoding='unicode_escape', thousands=',')\n",
- "graph = pd.read_csv(\"graph.csv\", sep=',')\n",
+ "###Helper functions to preprocess data###\n",
"\n",
"#Add binary features for the state that the county is in (where each binary feature represents one state)\n",
- "\n",
- "###2016###\n",
- "data[\"State\"] = [x.strip()[-2:] for x in data['County']] #Extract state abbreviations from County\n",
- "dummies = pd.get_dummies(data['State']) #Create state dummies from State\n",
- "data = pd.concat([data,dummies],axis=1)\n",
- "data = data.drop('State', axis=1)\n",
- "\n",
- "###2012###\n",
- "data_2012[\"State\"] = [x.strip()[-2:] for x in data_2012['County']] #Extract state abbreviations from County\n",
- "dummies_2012 = pd.get_dummies(data_2012['State']) #Create state dummies from State\n",
- "data_2012 = pd.concat([data_2012,dummies_2012],axis=1)\n",
- "data_2012 = data_2012.drop(\"State\", axis=1)\n",
- "\n",
- "#Define a function which will preprocess the data\n",
- "def preprocess(data):\n",
- " #Normalize features\n",
- " #Standardize the data with mean 0 and s.d. 1\n",
+ "def get_states(X):\n",
+ " X[\"State\"] = [x.strip()[-2:] for x in X['County']]\n",
+ " dummies_test = pd.get_dummies(X['State'])\n",
+ " X = pd.concat([X,dummies_test],axis=1)\n",
+ " X = X.drop('State',axis=1)\n",
+ " return X\n",
+ "\n",
+ "#Normalizes all variables from the data except for the binary state variables\n",
+ "def normalize(X):\n",
+ " #Normalize features Standardize the data with mean 0 and s.d. 1\n",
" scalar = StandardScaler()\n",
- " _,d = np.shape(data)\n",
- " data_norm = data[:,[0,1,2,3,4,5,(d-3),(d-2),(d-1)]] #d-2 and d-1 store the graph score and number of known neighbors\n",
- " #data_raw_indexes = np.r_[6:(d-2)]\n",
- " data_bin = data[:,6:(d-3)]\n",
- " #print('untransformed data', data_bin)\n",
+ " _,d = np.shape(X)\n",
+ " data_norm = X[:,[0,1,2,3,4,5,(d-2),(d-1)]] #d-2 and d-1 store the graph score and number of known neighbors\n",
+ " data_bin = X[:,6:(d-2)]\n",
" scalar.fit(data_norm)\n",
" data_norm = scalar.transform(data_norm)\n",
- " #print('transformed data', data_norm)\n",
- " data = np.hstack((data_norm, data_bin))\n",
- " #print(data)\n",
- "\n",
- " #scalar = StandardScaler()\n",
- " #binning = ['MedianIncome']\n",
- " #standardization = ['MigraRate','BirthRate','DeathRate','BachelorRate','UnemploymentRate']\n",
- " #n,d = np.shape(counties)\n",
- " #print(d)\n",
- " #binning = [4]\n",
- " #standardization = [*range(5,d)]\n",
- " #print(standardization)\n",
- " #ct = ColumnTransformer([(\"standardize\", scalar, standardization), \n",
- " # (\"binning\", KBinsDiscretizer(n_bins=10), binning)])\n",
- " #try min maxing if binning doesn't \n",
- " #data = ct.fit_transform(counties)\n",
- " return data"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "###EXPERIMENTING DON'T RUN YET###\n",
- "\n",
- "#Add binary feature for whether a county is on a state border (which may indicate they receive other states' political ads)\n",
- "fips_allstates = data[[\"FIPS\",\"State\"]]\n",
- "tmp1 = graph.merge(fips_allstates, left_on=\"SRC\", right_on=\"FIPS\", how=\"left\")\n",
- "tmp1 = tmp1[[\"SRC\",\"DST\",\"State\"]]\n",
- "tmp1.rename(columns={'State':'SRC_State'}, inplace=True)\n",
- "print(tmp1)\n",
- "\n",
- "tmp2 = tmp1.merge(fips_allstates, left_on=\"DST\", right_on=\"FIPS\", how=\"left\")\n",
- "tmp2 = tmp2[[\"SRC\",\"DST\",\"SRC_State\",\"State\"]]\n",
- "tmp2.rename(columns={'State':'DST_State'}, inplace=True)\n",
- "print(tmp2)\n",
+ " X = np.hstack((data_norm, data_bin))\n",
+ " return X\n",
"\n",
- "tmp2[\"SRConBorder\"] = tmp2[\"SRC_State\"].eq(tmp2[\"DST_State\"])\n",
- "tmp2[\"SRConBorder\"] = tmp2[\"SRConBorder\"].astype(int)\n",
- "tmp2[\"SRConBorder\"] = tmp2[\"SRConBorder\"].replace({0:1,1:0}) #Make sure =1 iff on border\n",
- "print(tmp2)\n",
- "tmp2 = tmp2.dropna() #Because NaNs indicate counties that are not in training data - not sure how to deal with loss of data\n",
- "tmp2 = tmp2.groupby(['SRC'])['SRConBorder'].sum()\n",
- "print(tmp2)\n",
+ "###(3.1.a) Load and format the training data set(s)###\n",
+ "data = pd.read_csv(\"train_2016.csv\", sep=',', header=0, encoding='unicode_escape', thousands=\",\")\n",
+ "data_2012 = pd.read_csv(\"train_2012.csv\", sep=\",\", header=0, encoding='unicode_escape', thousands=',')\n",
+ "graph = pd.read_csv(\"graph.csv\", sep=',', header=0, encoding='unicode_escape', thousands=\",\")\n",
"\n",
- "data = data.merge(tmp2, left_on=\"FIPS\", right_on=\"SRC\", how=\"left\")\n",
- "print(data)\n",
+ "###2016###\n",
+ "data = get_states(data)\n",
"\n",
- "#Notes: Source counties can be found in test, destination in train (or vice versa - one occurence in either column(?))\n",
- "#Notes: Get list of neighbors that voted one way or other (check one per column)"
+ "###2012###\n",
+ "data_2012 = get_states(data_2012)\n",
+ "\n"
]
},
{
"cell_type": "code",
- "execution_count": 35,
+ "execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"#Creates a lexicographically ordered list of all the county's neighbors\n",
- "graph = pd.read_csv(\"graph.csv\", sep=',', header=0, encoding='unicode_escape', thousands=\",\")\n",
"neighbors = {}\n",
"for i in range(len(graph)):\n",
" src = graph['SRC'][i]\n",
@@ -531,15 +421,12 @@
"dem = dict(zip(data['FIPS'], dem_perc))\n",
"votes = data[['DEM','GOP']]\n",
"votes = list(votes.itertuples(index=False, name=None))\n",
- "votes = dict(zip(data['FIPS'], votes))\n",
- "votes_2012 = data_2012[['DEM','GOP']]\n",
- "votes_2012 = list(votes_2012.itertuples(index=False, name=None))\n",
- "votes_2012 = dict(zip(data['FIPS'], votes_2012))"
+ "votes = dict(zip(data['FIPS'], votes))"
]
},
{
"cell_type": "code",
- "execution_count": 36,
+ "execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
@@ -554,7 +441,7 @@
" score += dem[i]\n",
" return n, score/n if n != 0 else np.nan\n",
"\n",
- "#TODO - implement a function that calculates the percentage of all voters in the surrounding counties who voted dem\n",
+ "#Calculates the percentage of all voters from the neighboring counties who voted democratic\n",
"def aggregate_gscore(neighs, votes):\n",
" n = 0\n",
" dem = 0\n",
@@ -569,40 +456,45 @@
"#Function that takes a list of counties and appends their graph_score to the end of their vector:\n",
"def add_gscore(counties):\n",
" counties['GraphScore'] = 0\n",
- " counties['2012GScore'] = 0\n",
" counties['Neighbors'] = 0\n",
" for i in range(len(counties)):\n",
" key = int(counties['FIPS'][i])\n",
" if neighbors.get(key) == None:\n",
" score = np.nan\n",
- " s2012 = np.nan\n",
" n = 0\n",
" else:\n",
" n, score = aggregate_gscore(neighbors[key], votes)\n",
- " _, s2012 = aggregate_gscore(neighbors[key], votes_2012)\n",
+ " total_n = len(neighbors[key])\n",
" counties.loc[i, 'GraphScore'] = score\n",
- " counties.loc[i, '2012GScore'] = s2012\n",
" counties.loc[i, 'Neighbors'] = n\n",
" smean = counties['GraphScore'].sum()/len(counties)\n",
- " s2012_mean = counties['2012GScore'].sum()/len(counties)\n",
" counties['GraphScore'] = counties['GraphScore'].map(lambda x: smean if pd.isnull(x) else x)\n",
- " counties['2012GScore'] = counties['2012GScore'].map(lambda x: s2012_mean if pd.isnull(x) else x)\n",
" return counties"
]
},
{
"cell_type": "code",
- "execution_count": 37,
+ "execution_count": 61,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['FIPS', 'County', 'DEM', 'GOP', 'MedianIncome', 'MigraRate', 'BirthRate', 'DeathRate', 'BachelorRate', 'UnemploymentRate', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'GraphScore', 'Neighbors']\n",
+ "['FIPS', 'County', 'DEM', 'GOP', 'MedianIncome', 'MigraRate', 'BirthRate', 'DeathRate', 'BachelorRate', 'UnemploymentRate', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'GraphScore', 'Neighbors']\n"
+ ]
+ }
+ ],
"source": [
"###Prep for concatenating###\n",
"\n",
"###2016###\n",
- "#cols = data.columns.tolist()\n",
- "#cols = cols[0:2] + cols[4:] #Helps make sure columns in test align with those in data\n",
- "\n",
"data = add_gscore(data)\n",
+ "#Rearrange the columns to ensure that DC is in the same location of each dataset\n",
+ "cols = data.columns.tolist()\n",
+ "cols = cols[0:2] + cols[4:]\n",
+ "print(list(data.columns))\n",
"data = data.to_numpy()\n",
"\n",
"#These are the training labels for each county, 1 if the county voted Dem and 0 if it voted Rep\n",
@@ -611,11 +503,13 @@
"\n",
"#Trim the data of irrelevant fields and convert to numpy array\n",
"data = data[:,4:]\n",
- "#print(np.shape(data))\n",
- "#print(data)\n",
"\n",
"###2012###\n",
"data_2012 = add_gscore(data_2012)\n",
+ "#Rearrange the columns to ensure that DC is in the same location of each dataset\n",
+ "cols_2012 = data_2012.columns.tolist()\n",
+ "cols_2012 = cols_2012[0:2] + cols_2012[4:]\n",
+ "print(list(data_2012.columns))\n",
"data_2012 = data_2012.to_numpy()\n",
"\n",
"#These are the training labels for each county, 1 if the county voted Dem and 0 if it voted Rep\n",
@@ -627,9 +521,18 @@
},
{
"cell_type": "code",
- "execution_count": 38,
+ "execution_count": 62,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['FIPS', 'County', 'MedianIncome', 'MigraRate', 'BirthRate', 'DeathRate', 'BachelorRate', 'UnemploymentRate', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'GraphScore', 'Neighbors']\n",
+ "['FIPS', 'County', 'MedianIncome', 'MigraRate', 'BirthRate', 'DeathRate', 'BachelorRate', 'UnemploymentRate', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'GraphScore', 'Neighbors']\n"
+ ]
+ }
+ ],
"source": [
"###(3.1.b) Load and format the testing data sets###\n",
"test = pd.read_csv(\"test_2016_no_label.csv\", sep=',', header=0, encoding='unicode_escape', thousands=\",\")\n",
@@ -638,15 +541,11 @@
"#Process the testing data set in the same manner as the training data set\n",
"\n",
"###2016###\n",
- "test[\"State\"] = [x.strip()[-2:] for x in test['County']]\n",
- "dummies_test = pd.get_dummies(test['State'])\n",
- "#print(dummies.columns.difference(dummies_test.columns)) #Check that DC is the only missing county/state\n",
- "test = pd.concat([test,dummies_test],axis=1)\n",
+ "test = get_states(test)\n",
"test['DC'] = 0 #Add in a zero vector for DC (because there is no DC in either test data set)\n",
- "test = test.drop('State',axis=1)\n",
- "#print(test)\n",
- "#test = test[cols] #Make sure columns in test align with those in data\n",
"test = add_gscore(test)\n",
+ "test = test[cols]\n",
+ "print(list(test.columns))\n",
"test = test.to_numpy()\n",
"\n",
"fips = test[:,0]\n",
@@ -656,21 +555,19 @@
"data = np.vstack((data, test))\n",
"\n",
"#Standardize the (non-binary) features to have mean 0 and s.d. 1\n",
- "data = preprocess(data)\n",
+ "data = normalize(data)\n",
"\n",
"test = data[-n_data:,:]\n",
"data = data[:n_data,:]\n",
"data = np.hstack((data, np.transpose(np.array([results]))))\n",
"\n",
"###2012###\n",
- "test_2012[\"State\"] = [x.strip()[-2:] for x in test_2012['County']]\n",
- "dummies_test_2012 = pd.get_dummies(test_2012['State'])\n",
- "#print(dummies_2012.columns.difference(dummies_test_2012.columns)) #Check that DC is the only missing county/state\n",
- "test_2012 = pd.concat([test_2012,dummies_test_2012],axis=1)\n",
+ "test_2012 = get_states(test_2012)\n",
"test_2012['DC'] = 0 #Add in a zero vector for DC (because there is no DC in either test data set)\n",
- "test_2012 = test_2012.drop('State',axis=1)\n",
+ "test_2012 = add_gscore(test_2012)\n",
+ "test_2012 = test_2012[cols_2012]\n",
+ "print(list(test_2012.columns))\n",
"\n",
- "add_gscore(test_2012)\n",
"test_2012 = test_2012.to_numpy()\n",
"\n",
"fips_2012 = test_2012[:,0]\n",
@@ -679,7 +576,7 @@
"n_data_2012 = len(data_2012)\n",
"data_2012 = np.vstack((data_2012, test_2012))\n",
"\n",
- "data_2012 = preprocess(data_2012)\n",
+ "data_2012 = normalize(data_2012)\n",
"test_2012 = data_2012[-n_data_2012:,:]\n",
"data_2012 = data_2012[:n_data_2012,:]\n",
"data_2012 = np.hstack((data_2012, np.transpose(np.array([results_2012]))))"
@@ -687,9 +584,20 @@
},
{
"cell_type": "code",
- "execution_count": 39,
+ "execution_count": 67,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[ 1.25892541 1.49170935 1.76753662 2.09436625 2.48162892 2.94049911\n",
+ " 3.48421754 4.12847324 4.89185622 5.79639395 6.86818691 8.13816172\n",
+ " 9.64296358 11.42601361 13.5387618 16.04217161 19.00847905 22.52327705\n",
+ " 26.68798528 31.6227766 ]\n"
+ ]
+ }
+ ],
"source": [
"### (3.2) Testing algorithms ###\n",
"\n",
@@ -704,7 +612,9 @@
"adb_dtc_g = ABC(base_estimator = DTC(criterion = \"entropy\"), n_estimators = 100)\n",
"\n",
"#SVM classifier\n",
- "Cs = np.logspace(1.25,1.25,20)\n",
+ "Cs = np.logspace(.1,1.5,20)\n",
+ "print(Cs)\n",
+ "#Cs = np.logspace(1.25,1.25,20)\n",
"svc = svm.SVC(gamma='scale', kernel='rbf')\n",
"c_svc = GridSearchCV(estimator=svc,param_grid=dict(C=Cs))\n",
"\n",
@@ -728,18 +638,18 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "knn avg weighted accuracy: 0.6307393036738488\n",
- "knn avg weighted accuracy 2012: 0.6547367701142874\n",
- "dtc_t avg weighted accuracy: 0.745723590078064\n",
- "dtc_t avg weighted accuracy 2012: 0.6877354733446237\n",
- "adb_dtc avg weighted accuracy: 0.7285094502910128\n",
- "adb_dtc avg weighted accuracy 2012: 0.7070487212085304\n",
- "dtc_g avg weighted accuracy: 0.7504774822113874\n",
- "dtc_g avg weighted accuracy 2012: 0.6999027843035145\n",
- "adb_dtc_g avg weighted accuracy: 0.7373869781842327\n",
- "adb_dtc_g avg weighted accuracy 2012: 0.6992326684962192\n",
- "svc avg weighted accuracy: 0.7840850521161004\n",
- "svc avg weighted accuracy 2012: 0.7782245706696347\n"
+ "knn avg weighted accuracy: 0.6330706501088754\n",
+ "knn avg weighted accuracy 2012: 0.6287551049058671\n",
+ "dtc_t avg weighted accuracy: 0.7126628181827105\n",
+ "dtc_t avg weighted accuracy 2012: 0.6654573475411754\n",
+ "adb_dtc avg weighted accuracy: 0.7583268043136182\n",
+ "adb_dtc avg weighted accuracy 2012: 0.6876199289767525\n",
+ "dtc_g avg weighted accuracy: 0.7331652849631233\n",
+ "dtc_g avg weighted accuracy 2012: 0.6960158238672269\n",
+ "adb_dtc_g avg weighted accuracy: 0.7526903758034124\n",
+ "adb_dtc_g avg weighted accuracy 2012: 0.6782594899860952\n",
+ "svc avg weighted accuracy: 0.7850360498781626\n",
+ "svc avg weighted accuracy 2012: 0.7207312417562269\n"
]
}
],
@@ -835,33 +745,30 @@
"print(\"svc avg weighted accuracy: \", x)\n",
"print(\"svc avg weighted accuracy 2012: \", x_2012)\n",
"\n",
- "#print(\"lr avg weighted accuracy: \", kfold(lr, data))\n",
- "#print(\"lr avg weighted accuracy: \", kfold(c_lr, data, grid=\"Yes\", grid_model=\"lr\")) #.721 using Cs = np.logspace(-1,1,100) and solver = 'liblinear'\n",
- "#print(\"lr avg weighted accuracy 2012: \", kfold(c_lr, data_2012, grid=\"Yes\", grid_model=\"lr\"))\n",
"x_lr,y_lr,z_lr = kfold_lr(c_lr, data)\n",
"x_lr_2012,y_lr_2012,z_lr_2012 = kfold_lr(c_lr, data_2012)\n",
"print(\"lr avg weighted accuracy: \", x_lr)\n",
"print(\"lr avg weighted accuracy 2012: \", x_lr_2012)\n",
"\n",
- "#print(\"adb_lr avg weighted accuracy: \", kfold(adb_lr, data)) #.713 using solver = 'liblinear'\n",
- "#print(\"adb_lr avg weighted accuracy 2012: \", kfold(adb_lr, data_2012)) #.713 using solver = 'liblinear'\n",
- "\n",
- "#print(\"nbc avg weighted accuracy: \", kfold(nbc, data)) #.653\n",
- "\n",
- "#print(\"adb_nbc avg weighted accuracy: \", kfold(adb_nbc, data)) #.5 using n_estimators = 100\n",
- "\n",
"#Get final predictions\n",
- "preds = z.predict(test) #Uses 2016"
+ "preds_creative = z.predict(test) #Uses 2016"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 65,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "235\n"
+ ]
+ }
+ ],
"source": [
- "print(sum(preds))\n",
- "print(sum(z.predict(data[:,:-1])))"
+ "print(sum(preds_creative))"
]
},
{
@@ -882,6 +789,13 @@
"3.2.2 Please explain in detail how you achieved this and what you did specifically and why you tried this."
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -902,16 +816,27 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 56,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "228\n"
+ ]
+ }
+ ],
"source": [
"# TODO\n",
"output = np.array([fips, preds])\n",
"output = pd.DataFrame(np.transpose(output))\n",
"output[1] = output[1].astype(int)\n",
- "#output.to_csv(\"submission.csv\", header=[\"FIPS\",\"Result\"], index = False)\n",
- "output.to_csv(\"submission_creative.csv\", header=[\"FIPS\",\"Result\"], index = False)\n",
+ "output_creative = np.array([fips, preds_creative])\n",
+ "output_creative = pd.DataFrame(np.transpose(output_creative))\n",
+ "output_creative[1] = output_creative[1].astype(int)\n",
+ "output.to_csv(\"submission.csv\", header=[\"FIPS\",\"Result\"], index = False)\n",
+ "output_creative.to_csv(\"submission_creative.csv\", header=[\"FIPS\",\"Result\"], index = False)\n",
"\n",
"# You may use pandas to generate a dataframe with FIPS and your predictions first \n",
"# and then use to_csv to generate a CSV file."
diff --git a/submission.csv b/submission.csv
index a59fce4..e2da191 100644
--- a/submission.csv
+++ b/submission.csv
@@ -6,21 +6,21 @@ FIPS,Result
39039,0
30083,0
46047,0
-45013,1
+45013,0
18119,0
48333,0
-53005,1
+53005,0
5097,0
48297,0
51141,0
54107,0
-12103,1
+12103,0
51077,0
25015,1
47043,0
48363,0
31077,0
-21043,1
+21043,0
28047,0
19193,0
18165,0
@@ -30,33 +30,33 @@ FIPS,Result
21229,0
17097,1
29159,0
-28149,1
+28149,0
48467,0
5053,0
31093,0
19143,0
13181,0
47127,0
-21153,0
+21153,1
20143,0
22119,0
19119,0
47083,0
42095,0
-6081,1
+6081,0
13313,0
5123,0
28039,0
-51760,0
+51760,1
48245,0
-41053,1
+41053,0
8025,0
5079,0
49043,0
-48489,0
+48489,1
22057,0
1021,0
-29185,1
+29185,0
29211,0
54081,0
6003,0
@@ -67,7 +67,7 @@ FIPS,Result
37097,0
48283,0
46083,0
-31055,1
+31055,0
51580,0
37157,0
42033,0
@@ -76,7 +76,7 @@ FIPS,Result
13075,0
21123,0
48185,0
-13245,1
+13245,0
13001,0
31139,0
38025,0
@@ -90,25 +90,25 @@ FIPS,Result
13183,0
53035,1
48479,1
-21063,0
+21063,1
20101,0
48415,0
51185,0
-51043,1
-36013,1
+51043,0
+36013,0
55119,0
-45075,1
+45075,0
47101,0
26089,1
48367,0
-37131,0
-21205,1
+37131,1
+21205,0
47179,0
48093,0
-5101,1
+5101,0
1093,0
31101,0
-5069,1
+5069,0
8119,0
54037,0
13213,0
@@ -121,39 +121,39 @@ FIPS,Result
19045,0
21167,0
31127,0
-5015,1
+5015,0
51159,0
-28133,0
+28133,1
5045,0
39069,0
-48381,1
+48381,0
13129,0
37041,0
51057,0
25005,0
22059,0
-38017,1
+38017,0
26109,0
-49037,0
+49037,1
48265,0
39129,0
20055,0
-21121,1
+21121,0
31173,0
-22035,1
+22035,0
8011,0
39027,0
17107,0
-42021,1
+42021,0
37023,0
54003,0
31097,0
17141,0
48203,0
20011,0
-23031,1
+23031,0
16027,0
-48003,1
+48003,0
51600,0
13027,0
21021,0
@@ -168,7 +168,7 @@ FIPS,Result
36015,0
1063,1
13279,0
-42125,1
+42125,0
42105,0
20207,0
13285,0
@@ -196,7 +196,7 @@ FIPS,Result
46119,0
12045,0
40141,0
-41015,1
+41015,0
48129,0
29083,0
17117,0
@@ -205,7 +205,7 @@ FIPS,Result
27009,0
19195,0
19039,0
-18005,1
+18005,0
37083,0
39077,0
19147,0
@@ -227,12 +227,12 @@ FIPS,Result
26121,0
21203,0
21211,0
-44005,1
+44005,0
35005,0
30017,0
31107,0
20003,0
-22103,1
+22103,0
5117,0
47107,0
20047,0
@@ -241,15 +241,15 @@ FIPS,Result
54027,0
13147,0
30055,0
-20203,1
+20203,0
1051,0
18153,0
29043,0
-8065,0
+8065,1
29135,0
31083,0
17065,0
-12097,1
+12097,0
28111,0
1113,0
48501,0
@@ -261,7 +261,7 @@ FIPS,Result
12125,0
51073,0
47149,0
-46031,1
+46031,0
40085,0
40127,0
28015,0
@@ -274,8 +274,8 @@ FIPS,Result
13319,0
20135,0
48021,0
-28121,1
-51678,1
+28121,0
+51678,0
12027,0
30005,0
17113,1
@@ -287,26 +287,26 @@ FIPS,Result
27111,0
38039,0
20153,0
-48361,1
+48361,0
20195,0
40101,0
21199,0
55117,0
-34039,0
+34039,1
16019,0
30009,0
19155,0
29105,0
23009,0
30101,0
-8059,0
+8059,1
17069,0
29175,0
31051,0
56039,1
-26131,1
+26131,0
48177,0
-8027,1
+8027,0
23007,0
51145,0
51087,1
@@ -333,9 +333,9 @@ FIPS,Result
26081,0
26155,0
17143,1
-53067,1
+53067,0
21105,0
-24009,0
+24009,1
1025,0
27067,0
19075,0
@@ -345,21 +345,21 @@ FIPS,Result
1105,1
16071,0
17199,0
-48061,0
+48061,1
1039,0
8097,1
27091,0
42099,0
42117,0
-13077,1
+13077,0
51520,0
36051,0
54025,0
-30049,1
+30049,0
1079,0
37117,0
48133,0
-42041,1
+42041,0
49007,0
47143,0
55039,0
@@ -370,16 +370,16 @@ FIPS,Result
53041,0
31169,0
37185,0
-26099,1
+26099,0
47155,0
6111,1
29029,0
13081,0
5111,0
-51133,1
-6057,1
+51133,0
+6057,0
21225,0
-48141,0
+48141,1
28073,0
27135,0
37129,0
@@ -390,8 +390,8 @@ FIPS,Result
46129,0
28003,0
1123,0
-39017,1
-6097,0
+39017,0
+6097,1
4021,0
38031,0
48405,0
@@ -414,13 +414,13 @@ FIPS,Result
36117,0
48043,1
45063,0
-16057,0
+16057,1
48157,0
12073,1
-41067,0
+41067,1
29187,0
8049,0
-40133,1
+40133,0
37183,1
54005,0
18045,0
@@ -430,9 +430,9 @@ FIPS,Result
54057,0
21065,0
48341,0
-47077,1
-28135,0
-1013,0
+47077,0
+28135,1
+1013,1
40145,0
1111,0
37111,0
@@ -440,7 +440,7 @@ FIPS,Result
12129,0
17103,0
1077,0
-45089,1
+45089,0
51187,0
13013,0
31087,0
@@ -448,11 +448,11 @@ FIPS,Result
13293,0
48461,0
28107,0
-8041,1
+8041,0
5049,0
27073,0
-36055,0
-13259,1
+36055,1
+13259,0
51155,0
35001,1
47025,0
@@ -462,11 +462,11 @@ FIPS,Result
48199,0
19037,0
13311,0
-51165,1
-17077,0
+51165,0
+17077,1
29001,0
54011,0
-56035,1
+56035,0
51135,0
25001,0
21175,0
@@ -492,14 +492,14 @@ FIPS,Result
6089,0
13227,0
51011,0
-36113,1
+36113,0
54061,1
18135,0
21221,0
17011,0
-29169,1
+29169,0
29081,0
-12087,1
+12087,0
17001,0
21117,0
47035,0
@@ -508,7 +508,7 @@ FIPS,Result
28069,0
56015,0
21159,0
-17203,1
+17203,0
37135,1
5011,0
28151,1
@@ -521,12 +521,12 @@ FIPS,Result
20021,0
26129,0
48507,1
-21037,1
+21037,0
13083,0
17137,0
48053,0
18041,0
-17167,0
+17167,1
42023,0
42081,0
17041,0
@@ -546,9 +546,9 @@ FIPS,Result
20023,0
26107,0
13255,0
-5093,0
-53063,1
-45067,1
+5093,1
+53063,0
+45067,0
46123,0
38007,0
29117,0
@@ -556,9 +556,9 @@ FIPS,Result
37139,0
29127,0
13123,0
-13283,1
+13283,0
31073,0
-51103,1
+51103,0
36119,1
13229,0
48281,0
@@ -571,10 +571,10 @@ FIPS,Result
17179,0
1055,0
55141,0
-25007,0
+25007,1
24015,0
29227,0
-48031,1
+48031,0
20175,0
41059,0
27087,0
@@ -601,7 +601,7 @@ FIPS,Result
21049,0
54019,0
21135,0
-29223,1
+29223,0
24031,1
55109,0
55001,0
@@ -609,13 +609,13 @@ FIPS,Result
55053,0
47011,0
13009,0
-13003,1
+13003,0
48471,0
12085,0
21047,0
51685,0
5139,0
-48271,1
+48271,0
56007,0
54015,0
6115,0
@@ -658,36 +658,36 @@ FIPS,Result
31069,0
26037,0
23003,0
-13063,0
+13063,1
19109,0
42079,0
39107,0
19103,1
39047,0
-50005,0
+50005,1
47051,0
39105,0
38091,0
-54089,1
+54089,0
42051,0
51081,0
-22055,0
+22055,1
51029,0
51093,0
22025,0
51750,1
-35011,1
-49005,1
+35011,0
+49005,0
48197,0
31125,0
1107,0
45085,0
-51061,1
+51061,0
13091,0
5131,0
49045,0
39159,0
-19053,1
+19053,0
37193,0
13149,0
18105,1
@@ -695,17 +695,17 @@ FIPS,Result
20089,0
28099,0
48373,0
-51157,1
+51157,0
31081,0
27125,0
-1035,1
-13163,1
+1035,0
+13163,0
31057,0
13257,0
18109,0
28053,1
13211,0
-22017,1
+22017,0
12039,0
13043,0
18075,0
@@ -715,19 +715,19 @@ FIPS,Result
29033,0
24047,0
6085,1
-45079,0
-46011,1
+45079,1
+46011,0
48145,0
49039,0
45051,0
39113,0
-12086,0
+12086,1
21223,0
39057,0
18157,0
18133,0
28021,1
-53061,0
+53061,1
6019,1
45087,0
37085,0
@@ -738,21 +738,21 @@ FIPS,Result
5003,0
18029,0
48231,0
-24043,1
+24043,0
51017,0
-8021,1
+8021,0
27053,1
18145,0
37031,0
-18087,1
+18087,0
5063,0
31017,0
51710,1
45047,0
30039,0
31035,0
-13039,1
-12061,1
+13039,0
+12061,0
18123,0
39119,0
26071,0
@@ -763,7 +763,7 @@ FIPS,Result
25021,1
42133,0
8077,0
-13021,1
+13021,0
6075,1
37093,0
37055,0
@@ -785,21 +785,21 @@ FIPS,Result
51015,0
45071,0
19031,0
-31109,1
+31109,0
55137,0
39063,0
5017,0
54035,0
-26061,0
+26061,1
28101,0
8087,0
31043,0
21039,0
39009,0
-13269,1
+13269,0
30067,0
48445,0
-41017,1
+41017,0
41057,0
48473,0
51021,0
@@ -812,29 +812,29 @@ FIPS,Result
28093,0
48171,0
48139,0
-50017,0
+50017,1
38081,0
-35017,1
+35017,0
51117,0
-51161,1
+51161,0
19157,0
26157,0
47135,0
-13153,1
+13153,0
48431,0
20075,0
20069,0
38051,0
-13061,1
+13061,0
16053,0
-50025,0
+50025,1
47085,0
-9007,0
+9007,1
27061,0
27081,0
51153,1
-46071,1
-50015,0
+46071,0
+50015,1
39061,0
19035,0
1133,0
@@ -842,18 +842,18 @@ FIPS,Result
19145,0
36043,0
42039,0
-19113,1
-48247,0
+19113,0
+48247,1
21171,0
47133,0
46097,0
35013,1
-22071,0
+22071,1
22015,0
35019,0
37145,0
-37081,0
-41069,1
+37081,1
+41069,0
29189,1
4015,0
21227,0
@@ -864,14 +864,14 @@ FIPS,Result
37059,0
42111,0
36017,0
-35053,1
+35053,0
48067,0
18057,1
6101,0
48465,0
51740,0
20037,0
-17093,0
+17093,1
40055,0
26117,0
27071,0
@@ -889,25 +889,25 @@ FIPS,Result
54013,0
26073,0
48291,0
-51660,0
+51660,1
39007,0
37077,0
41003,1
26125,1
25017,1
-17099,1
+17099,0
37013,0
20013,0
27107,0
28077,0
-21235,1
+21235,0
31071,0
19027,0
28159,0
39109,0
6041,1
30077,0
-48419,1
+48419,0
20073,0
37067,0
55013,0
@@ -924,7 +924,7 @@ FIPS,Result
8121,0
19167,0
18049,0
-1125,1
+1125,0
20169,0
36003,0
20129,0
@@ -936,9 +936,9 @@ FIPS,Result
48039,0
20139,0
12095,0
-51650,1
+51650,0
1087,1
-54069,1
+54069,0
39133,0
21001,0
17073,0
@@ -948,7 +948,7 @@ FIPS,Result
40137,0
40047,0
48435,0
-49017,1
+49017,0
40105,0
30027,0
26025,0
@@ -957,13 +957,13 @@ FIPS,Result
17071,0
47041,0
21067,1
-12065,1
-42043,1
+12065,0
+42043,0
19187,0
27093,0
48045,0
21003,0
-45033,1
+45033,0
55055,0
8091,0
27063,0
@@ -974,27 +974,27 @@ FIPS,Result
22001,0
20017,0
13029,0
-48323,0
+48323,1
46085,0
54093,0
-36019,0
+36019,1
19165,0
45003,0
51810,1
-26053,1
+26053,0
22045,0
38003,0
-12099,1
+12099,0
40115,0
-1091,0
+1091,1
51023,0
-13085,1
+13085,0
36007,0
45041,0
22023,0
51053,0
40131,0
-6001,0
+6001,1
19091,0
19097,0
13131,0
@@ -1014,12 +1014,12 @@ FIPS,Result
20137,0
13189,0
6055,1
-22065,1
+22065,0
55033,0
-1081,0
+1081,1
31001,0
25009,1
-13215,0
+13215,1
20131,0
8073,0
17039,0
@@ -1033,7 +1033,7 @@ FIPS,Result
9015,0
4017,0
29145,0
-5099,1
+5099,0
39147,0
48261,0
28071,1
@@ -1049,8 +1049,8 @@ FIPS,Result
47073,0
13071,0
37091,0
-27045,1
-8013,0
+27045,0
+8013,1
13197,0
32005,0
17161,0
@@ -1068,7 +1068,7 @@ FIPS,Result
23011,0
17193,0
22005,0
-48249,0
+48249,1
34031,1
18183,0
51001,0
@@ -1095,17 +1095,17 @@ FIPS,Result
19169,1
49051,0
55125,0
-26145,1
+26145,0
4009,0
-47037,1
-36077,1
+47037,0
+36077,0
54105,0
27017,0
12053,0
39059,0
21095,0
32023,0
-13199,0
+13199,1
55069,0
47161,0
29015,0
@@ -1124,7 +1124,7 @@ FIPS,Result
46043,0
18003,0
17013,0
-51700,0
+51700,1
29097,0
54021,0
55093,0
@@ -1132,7 +1132,7 @@ FIPS,Result
5085,0
46015,0
6043,0
-13235,1
+13235,0
48025,0
18013,0
35006,0
@@ -1144,7 +1144,7 @@ FIPS,Result
48075,0
26147,0
26063,0
-53077,0
+53077,1
21077,0
21093,0
18065,0
@@ -1153,7 +1153,7 @@ FIPS,Result
27059,0
32003,0
21197,0
-49049,1
+49049,0
1115,0
40075,0
51143,0
@@ -1161,18 +1161,18 @@ FIPS,Result
16011,0
6017,0
37011,0
-29133,1
+29133,0
55075,0
36059,1
30097,0
-26065,0
+26065,1
40039,0
18043,0
-18127,0
+18127,1
48485,0
23001,0
-36021,1
-32510,1
+36021,0
+32510,0
51036,0
38103,0
18091,0
@@ -1180,23 +1180,23 @@ FIPS,Result
13117,0
47053,0
47023,0
-18081,1
+18081,0
20117,0
33007,0
40027,0
50009,0
48097,0
-48041,0
+48041,1
36053,0
16079,0
36005,1
-47097,1
+47097,0
37175,0
21191,0
51097,0
5027,0
21035,0
-17019,0
+17019,1
37009,0
48313,0
20059,0
@@ -1207,8 +1207,8 @@ FIPS,Result
41033,0
28049,1
19177,0
-28043,1
-13165,1
+28043,0
+13165,0
53057,0
51610,1
27139,0
@@ -1222,27 +1222,27 @@ FIPS,Result
1029,0
22085,0
30041,0
-1041,1
-46041,0
+1041,0
+46041,1
41051,1
-47131,1
+47131,0
51169,0
18071,0
51830,1
34023,1
8055,0
-28105,0
+28105,1
48309,0
-6105,0
+6105,1
21057,0
18073,0
28031,0
-4019,0
-56033,1
+4019,1
+56033,0
40107,0
48335,0
17025,0
-1065,1
+1065,0
51051,0
31145,0
28027,1
@@ -1292,10 +1292,10 @@ FIPS,Result
44003,0
48317,0
38089,0
-48505,0
+48505,1
13069,0
-28087,1
-29085,1
+28087,0
+29085,0
12105,0
54023,0
51101,0
@@ -1311,7 +1311,7 @@ FIPS,Result
1061,0
41063,0
40135,0
-13145,1
+13145,0
29077,0
30079,0
16035,0
@@ -1319,38 +1319,38 @@ FIPS,Result
48023,0
40021,0
48267,0
-54067,1
-5119,1
+54067,0
+5119,0
40083,0
48251,0
21183,0
27027,0
-23013,1
+23013,0
13139,0
28009,0
51025,0
-13095,0
-41027,0
+13095,1
+41027,1
39033,0
-35031,0
+35031,1
38047,0
36105,0
51690,0
30023,0
29047,0
12071,0
-48191,1
-21075,1
+48191,0
+21075,0
19057,0
18063,0
40113,0
-22101,1
+22101,0
38087,0
13289,0
37121,0
45017,0
36071,0
-46095,1
+46095,0
48441,0
47071,0
13109,0
@@ -1364,20 +1364,20 @@ FIPS,Result
45065,0
17021,0
47087,0
-36079,1
+36079,0
8115,0
10001,0
45039,0
22091,0
-24041,0
+24041,1
21165,0
25013,0
-36067,0
+36067,1
31089,0
38001,0
46075,0
29053,0
-27127,1
+27127,0
6073,1
8069,1
51175,0
@@ -1385,7 +1385,7 @@ FIPS,Result
13143,0
18051,0
54051,0
-6065,0
+6065,1
17183,0
35043,1
21089,0
@@ -1398,14 +1398,14 @@ FIPS,Result
40097,0
51127,0
1009,0
-37147,0
+37147,1
37103,0
47065,0
55101,0
29137,0
31151,0
-27109,1
-36027,1
+27109,0
+36027,0
6045,0
54039,0
8001,0
@@ -1431,11 +1431,11 @@ FIPS,Result
29037,0
16085,0
38075,0
-21189,1
+21189,0
29181,0
19085,0
37161,0
-46121,1
+46121,0
37195,0
48417,0
37113,0
@@ -1448,7 +1448,7 @@ FIPS,Result
55099,0
36041,0
31079,0
-44009,1
+44009,0
16049,0
39049,1
20209,0
@@ -1459,9 +1459,9 @@ FIPS,Result
5021,0
35045,0
21031,0
-21025,1
+21025,0
48163,0
-46017,1
+46017,0
48095,0
55123,0
12077,0
@@ -1477,7 +1477,7 @@ FIPS,Result
21041,0
48503,0
27173,0
-55063,1
+55063,0
29115,0
27005,0
29125,0
@@ -1494,7 +1494,7 @@ FIPS,Result
38027,0
5105,0
51033,0
-54047,1
+54047,0
28155,0
28097,0
13321,0
@@ -1508,7 +1508,7 @@ FIPS,Result
39127,0
5071,0
47125,0
-8037,0
+8037,1
38045,0
17153,0
29067,0
@@ -1525,9 +1525,9 @@ FIPS,Result
17095,0
18095,0
29027,0
-51540,0
+51540,1
37027,0
-54055,1
+54055,0
51059,1
48189,0
37179,0
@@ -1536,7 +1536,7 @@ FIPS,Result
21127,0
47045,0
51119,0
-1085,0
+1085,1
13037,0
47089,0
40067,0
diff --git a/submission_creative.csv b/submission_creative.csv
index 20a59f6..e4517d3 100644
--- a/submission_creative.csv
+++ b/submission_creative.csv
@@ -43,17 +43,17 @@ FIPS,Result
19119,0
47083,0
42095,0
-6081,1
+6081,0
13313,0
-5123,1
+5123,0
28039,0
51760,1
48245,0
-41053,0
+41053,1
8025,0
5079,0
49043,0
-48489,0
+48489,1
22057,0
1021,0
29185,0
@@ -61,13 +61,13 @@ FIPS,Result
54081,0
6003,0
29225,0
-33015,0
+33015,1
42071,0
20065,0
37097,0
48283,0
46083,0
-31055,0
+31055,1
51580,0
37157,0
42033,0
@@ -76,7 +76,7 @@ FIPS,Result
13075,0
21123,0
48185,0
-13245,1
+13245,0
13001,0
31139,0
38025,0
@@ -85,12 +85,12 @@ FIPS,Result
48101,0
42037,0
19079,0
-12049,0
+12049,1
21131,0
13183,0
53035,1
48479,1
-21063,1
+21063,0
20101,0
48415,0
51185,0
@@ -99,7 +99,7 @@ FIPS,Result
55119,0
45075,1
47101,0
-26089,1
+26089,0
48367,0
37131,1
21205,0
@@ -109,7 +109,7 @@ FIPS,Result
1093,0
31101,0
5069,0
-8119,0
+8119,1
54037,0
13213,0
29089,0
@@ -134,13 +134,13 @@ FIPS,Result
22059,0
38017,0
26109,0
-49037,0
+49037,1
48265,0
39129,0
20055,0
21121,0
31173,1
-22035,1
+22035,0
8011,0
39027,0
17107,0
@@ -159,7 +159,7 @@ FIPS,Result
21021,0
42123,0
32015,0
-15009,0
+15009,1
42057,0
53065,0
48211,0
@@ -191,7 +191,7 @@ FIPS,Result
48083,0
45031,0
4023,1
-47095,0
+47095,1
12081,0
46119,0
12045,0
@@ -222,12 +222,12 @@ FIPS,Result
28081,0
29093,0
46003,0
-4027,1
-53025,1
+4027,0
+53025,0
26121,0
21203,0
21211,0
-44005,1
+44005,0
35005,0
30017,0
31107,0
@@ -307,7 +307,7 @@ FIPS,Result
26131,0
48177,0
8027,0
-23007,0
+23007,1
51145,0
51087,1
20157,0
@@ -335,8 +335,8 @@ FIPS,Result
17143,1
53067,1
21105,0
-24009,0
-1025,1
+24009,1
+1025,0
27067,0
19075,0
12037,0
@@ -351,15 +351,15 @@ FIPS,Result
27091,0
42099,0
42117,0
-13077,1
-51520,1
+13077,0
+51520,0
36051,0
54025,0
30049,0
1079,0
-37117,0
+37117,1
48133,0
-42041,1
+42041,0
49007,0
47143,0
55039,0
@@ -376,11 +376,11 @@ FIPS,Result
29029,0
13081,0
5111,0
-51133,0
+51133,1
6057,1
21225,0
-48141,1
-28073,1
+48141,0
+28073,0
27135,0
37129,0
37025,0
@@ -407,19 +407,19 @@ FIPS,Result
53001,0
19019,0
39043,0
-34001,0
+34001,1
31113,0
27145,0
47105,0
36117,0
48043,1
-45063,1
+45063,0
16057,1
48157,0
12073,1
41067,1
29187,0
-8049,0
+8049,1
40133,0
37183,1
54005,0
@@ -432,7 +432,7 @@ FIPS,Result
48341,0
47077,0
28135,1
-1013,0
+1013,1
40145,0
1111,0
37111,0
@@ -446,9 +446,9 @@ FIPS,Result
31087,0
17091,0
13293,0
-48461,0
+48461,1
28107,1
-8041,0
+8041,1
5049,0
27073,0
36055,1
@@ -479,7 +479,7 @@ FIPS,Result
27033,0
22047,0
37171,0
-51003,1
+51003,0
54041,0
17155,0
5075,0
@@ -493,7 +493,7 @@ FIPS,Result
13227,0
51011,0
36113,0
-54061,0
+54061,1
18135,0
21221,0
17011,0
@@ -538,7 +538,7 @@ FIPS,Result
51071,0
26113,0
8015,0
-19061,0
+19061,1
27049,0
31011,0
48357,0
@@ -558,8 +558,8 @@ FIPS,Result
13123,0
13283,0
31073,0
-51103,0
-36119,1
+51103,1
+36119,0
13229,0
48281,0
6039,0
@@ -577,7 +577,7 @@ FIPS,Result
48031,0
20175,0
41059,0
-27087,1
+27087,0
41061,0
48307,0
56005,0
@@ -640,23 +640,23 @@ FIPS,Result
32027,0
19181,0
13267,0
-17007,1
+17007,0
48137,0
-8035,1
+8035,0
34025,1
18007,0
-4013,0
+4013,1
54073,0
26045,0
29011,0
-55135,0
+55135,1
30071,0
12093,0
6107,0
-1011,0
+1011,1
29035,0
31069,0
-26037,1
+26037,0
23003,0
13063,1
19109,0
@@ -668,20 +668,20 @@ FIPS,Result
47051,0
39105,0
38091,0
-54089,1
+54089,0
42051,0
51081,1
-22055,1
+22055,0
51029,0
51093,0
22025,0
51750,1
35011,0
-49005,1
+49005,0
48197,0
31125,0
1107,0
-45085,1
+45085,0
51061,0
13091,0
5131,0
@@ -698,7 +698,7 @@ FIPS,Result
51157,0
31081,0
27125,0
-1035,1
+1035,0
13163,0
31057,0
13257,0
@@ -716,14 +716,14 @@ FIPS,Result
24047,0
6085,1
45079,1
-46011,0
+46011,1
48145,0
49039,0
45051,0
39113,0
-12086,1
+12086,0
21223,0
-39057,1
+39057,0
18157,1
18133,0
28021,1
@@ -740,7 +740,7 @@ FIPS,Result
48231,0
24043,0
51017,0
-8021,1
+8021,0
27053,1
18145,0
37031,0
@@ -762,8 +762,8 @@ FIPS,Result
5037,0
25021,1
42133,0
-8077,0
-13021,1
+8077,1
+13021,0
6075,1
37093,1
37055,0
@@ -776,7 +776,7 @@ FIPS,Result
34029,0
20099,0
22079,0
-13239,1
+13239,0
51079,0
31105,0
13249,0
@@ -788,9 +788,9 @@ FIPS,Result
31109,0
55137,0
39063,0
-5017,1
+5017,0
54035,0
-26061,1
+26061,0
28101,0
8087,0
31043,0
@@ -815,12 +815,12 @@ FIPS,Result
50017,1
38081,0
35017,0
-51117,0
+51117,1
51161,0
19157,0
26157,0
47135,0
-13153,1
+13153,0
48431,0
20075,0
20069,0
@@ -832,7 +832,7 @@ FIPS,Result
9007,0
27061,0
27081,0
-51153,1
+51153,0
46071,0
50015,1
39061,1
@@ -848,7 +848,7 @@ FIPS,Result
47133,0
46097,0
35013,1
-22071,1
+22071,0
22015,0
35019,0
37145,0
@@ -866,11 +866,11 @@ FIPS,Result
36017,0
35053,0
48067,0
-18057,0
+18057,1
6101,0
48465,1
51740,1
-20037,1
+20037,0
17093,1
40055,0
26117,0
@@ -887,7 +887,7 @@ FIPS,Result
26051,0
29171,0
54013,0
-26073,1
+26073,0
48291,0
51660,1
39007,0
@@ -909,7 +909,7 @@ FIPS,Result
30077,0
48419,0
20073,0
-37067,0
+37067,1
55013,0
30013,0
42065,0
@@ -930,13 +930,13 @@ FIPS,Result
20129,0
18023,0
51031,0
-22097,1
+22097,0
16051,0
27147,0
48039,0
20139,0
12095,1
-51650,0
+51650,1
1087,1
54069,0
39133,0
@@ -977,11 +977,11 @@ FIPS,Result
48323,1
46085,0
54093,0
-36019,1
+36019,0
19165,0
45003,0
51810,0
-26053,0
+26053,1
22045,0
38003,0
12099,0
@@ -989,8 +989,8 @@ FIPS,Result
1091,1
51023,0
13085,0
-36007,0
-45041,1
+36007,1
+45041,0
22023,0
51053,0
40131,0
@@ -1014,7 +1014,7 @@ FIPS,Result
20137,0
13189,0
6055,1
-22065,1
+22065,0
55033,0
1081,1
31001,0
@@ -1025,12 +1025,12 @@ FIPS,Result
17039,0
1083,0
21213,0
-13219,1
+13219,0
17157,0
48153,0
54001,0
54083,0
-9015,0
+9015,1
4017,1
29145,0
5099,0
@@ -1041,7 +1041,7 @@ FIPS,Result
13187,0
40149,0
20171,0
-5035,1
+5035,0
13231,0
40009,0
38069,0
@@ -1051,9 +1051,9 @@ FIPS,Result
37091,0
27045,0
8013,1
-13197,0
+13197,1
32005,0
-17161,1
+17161,0
12023,0
40059,0
42127,0
@@ -1067,7 +1067,7 @@ FIPS,Result
13047,0
23011,0
17193,0
-22005,1
+22005,0
48249,1
34031,1
18183,0
@@ -1075,7 +1075,7 @@ FIPS,Result
19007,0
13295,0
34019,1
-55079,0
+55079,1
17027,0
41029,0
25027,1
@@ -1089,7 +1089,7 @@ FIPS,Result
27103,0
31033,0
47175,0
-1047,1
+1047,0
40095,0
55077,0
19169,1
@@ -1097,8 +1097,8 @@ FIPS,Result
55125,0
26145,0
4009,1
-47037,0
-36077,0
+47037,1
+36077,1
54105,0
27017,0
12053,0
@@ -1122,7 +1122,7 @@ FIPS,Result
22027,0
48047,1
46043,0
-18003,0
+18003,1
17013,0
51700,1
29097,0
@@ -1135,7 +1135,7 @@ FIPS,Result
13235,0
48025,0
18013,0
-35006,1
+35006,0
36037,0
27129,0
23017,0
@@ -1151,24 +1151,24 @@ FIPS,Result
27023,0
29139,0
27059,0
-32003,0
+32003,1
21197,0
49049,0
1115,0
40075,0
51143,0
-16039,1
+16039,0
16011,0
-6017,0
+6017,1
37011,0
29133,0
55075,0
-36059,1
+36059,0
30097,0
26065,1
40039,0
-18043,0
-18127,0
+18043,1
+18127,1
48485,0
23001,0
36021,0
@@ -1209,12 +1209,12 @@ FIPS,Result
19177,0
28043,0
13165,0
-53057,1
+53057,0
51610,1
27139,0
16009,0
22093,0
-38005,0
+38005,1
48147,0
48423,0
54099,0
@@ -1232,7 +1232,7 @@ FIPS,Result
34023,1
8055,1
28105,1
-48309,0
+48309,1
6105,1
21057,0
18073,0
@@ -1287,9 +1287,9 @@ FIPS,Result
49035,0
20063,0
18079,0
-13125,0
+13125,1
17191,0
-44003,1
+44003,0
48317,0
38089,0
48505,1
@@ -1328,7 +1328,7 @@ FIPS,Result
23013,0
13139,0
28009,0
-51025,0
+51025,1
13095,1
41027,1
39033,0
@@ -1367,11 +1367,11 @@ FIPS,Result
36079,0
8115,0
10001,0
-45039,0
+45039,1
22091,0
24041,0
21165,0
-25013,0
+25013,1
36067,1
31089,0
38001,0
@@ -1387,7 +1387,7 @@ FIPS,Result
54051,0
6065,1
17183,0
-35043,1
+35043,0
21089,0
16077,0
22043,0
@@ -1406,7 +1406,7 @@ FIPS,Result
31151,0
27109,0
36027,0
-6045,0
+6045,1
54039,0
8001,0
21027,0
@@ -1415,7 +1415,7 @@ FIPS,Result
29123,0
46079,0
31121,0
-5107,1
+5107,0
34015,1
18121,0
1099,0
@@ -1436,11 +1436,11 @@ FIPS,Result
19085,0
37161,0
46121,1
-37195,0
+37195,1
48417,0
37113,0
48007,0
-51179,1
+51179,0
12015,0
4012,0
29063,0
@@ -1448,20 +1448,20 @@ FIPS,Result
55099,0
36041,0
31079,0
-44009,1
+44009,0
16049,0
39049,1
-20209,1
-47069,0
+20209,0
+47069,1
29205,0
-5007,1
+5007,0
6029,1
5021,0
35045,1
21031,0
21025,0
-48163,0
-46017,0
+48163,1
+46017,1
48095,0
55123,0
12077,0
@@ -1477,7 +1477,7 @@ FIPS,Result
21041,0
48503,0
27173,0
-55063,0
+55063,1
29115,0
27005,0
29125,0
@@ -1531,7 +1531,7 @@ FIPS,Result
51059,1
48189,0
37179,0
-53055,1
+53055,0
48063,0
21127,0
47045,0
@@ -1541,7 +1541,7 @@ FIPS,Result
47089,0
40067,0
8101,0
-17119,1
+17119,0
26059,0
12047,0
18021,0