diff --git a/.ipynb_checkpoints/CS 4780 Final Project Student Template-checkpoint.ipynb b/.ipynb_checkpoints/CS 4780 Final Project Student Template-checkpoint.ipynb index 4223a52..5c459fd 100755 --- a/.ipynb_checkpoints/CS 4780 Final Project Student Template-checkpoint.ipynb +++ b/.ipynb_checkpoints/CS 4780 Final Project Student Template-checkpoint.ipynb @@ -74,6 +74,7 @@ "import numpy as np\n", "#import sklearn as sk\n", "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.preprocessing import KBinsDiscretizer\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn import svm\n", "from sklearn.ensemble import AdaBoostClassifier as ABC\n", @@ -84,7 +85,6 @@ "from sklearn.model_selection import GridSearchCV\n", "from sklearn.model_selection import KFold\n", "from sklearn.metrics import balanced_accuracy_score\n", - "from sklearn.preprocessing import KBinsDiscretizer\n", "from sklearn.compose import ColumnTransformer" ] }, @@ -159,7 +159,6 @@ "\n", "#Trim the data of irrelevant fields and convert to numpy array\n", "data = data[:,4:]\n", - "#print(data.dtypes)\n", "\n", "#Load and format the test_2016_no_label.csv file\n", "test = pd.read_csv(\"test_2016_no_label.csv\", sep=',', header=0, encoding='unicode_escape', thousands=\",\")\n", @@ -248,16 +247,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "knn avg weighted accuracy: 0.6203334398310165\n", - "dtc avg weighted accuracy: 0.7070879048578987\n", - "dtc_t avg weighted accuracy: 0.6931251192737173\n", - "adb_dtc avg weighted accuracy: 0.6961593601061467\n", - "svc avg weighted accuracy: 0.7004446186230044\n", - "lr avg weighted accuracy: 0.6590264805800705\n", - "adb_lr avg weighted accuracy: 0.6497330664169988\n", - "nbc avg weighted accuracy: 0.538413031775743\n", - "adb_nbc avg weighted accuracy: 0.5742813326608044\n", - "219.0\n" + "knn avg weighted accuracy: 0.6237076582105193\n", + "dtc avg weighted accuracy: 0.6979205324376133\n", + "dtc_t avg weighted accuracy: 0.6855368083208002\n", + "adb_dtc avg weighted accuracy: 0.700633705568728\n", + "svc avg weighted accuracy: 0.6971740145905911\n", + "lr avg weighted accuracy: 0.6592688655540591\n", + "adb_lr avg weighted accuracy: 0.6488225086599588\n", + "nbc avg weighted accuracy: 0.5358837627311691\n", + "adb_nbc avg weighted accuracy: 0.5720794507211426\n", + "250.0\n" ] } ], @@ -350,69 +349,9 @@ "You may follow the steps in part 2 again but making innovative changes like creating new features, using new training algorithms, etc. Make sure you explain everything clearly in part 3.2. Note that reaching the 75% creative baseline is only a small portion of this part. Any creative ideas will receive most points as long as they are reasonable and clearly explained." ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Make sure you comment your code clearly and you may refer to these comments in the part 3.2\n", - "# TODO\n", - "\n", - "dtc.fit(train, yTr)\n", - "dtc_preds = dtc.predict(valid)\n", - "print(\"dtc weighted accuracy: \", weighted_accuracy(dtc_preds,yV)) #does pretty well, even with no pruning\n", - "\n", - "dtc_g.fit(train, yTr)\n", - "dtc_g_preds = dtc.predict(valid)\n", - "print(\"dtc_g weighted accuracy: \", weighted_accuracy(dtc_g_preds,yV)) #does pretty well, even with no pruning\n", - "\n", - "adb_dtc.fit(train, yTr)\n", - "adb_dtc_preds = adb_dtc.predict(valid)\n", - "print(\"adb_dtc weighted accuracy: \", weighted_accuracy(adb_dtc_preds,yV))\n", - "\n", - "#best validation error so far\n", - "svc.fit(train, yTr)\n", - "svc_preds = svc.predict(valid)\n", - "print(\"svc weighted accuracy: \", weighted_accuracy(svc_preds,yV))\n", - "\n", - "#does not work very well; not really better than random\n", - "nbc.fit(train, yTr)\n", - "nbc_preds = nbc.predict(valid)\n", - "print(\"nbc weighted accuracy: \", weighted_accuracy(nbc_preds,yV))\n", - "\n", - "adb_nbc.fit(train, yTr)\n", - "adb_nbc_preds = adb_nbc.predict(valid)\n", - "print(\"adb_nbc weighted accuracy: \", weighted_accuracy(adb_nbc_preds,yV))\n", - "\n", - "#Split the data into a train and test set\n", - "np.random.shuffle(data)\n", - " \n", - "results = np.array(df[:,-1],dtype='bool')\n", - "data = df[:,:6]\n", - "n_valid = int(len(data)/5)\n", - "train = data[:4*n_valid]\n", - "yTr = results[:4*n_valid]\n", - "valid = data[4*n_valid:]\n", - " yV = results[4*n_valid:]\n", - "\n", - "n_valid = int(len(data)/5)\n", - "j = 100;\n", - "\n", - "#for k in range(5, 100):\n", - "sum_err = 0;\n", - "#knn = KNeighborsClassifier(n_neighbors = k)\n", - "for i in range(j):\n", - " #randomize training and validation set\n", - " np.random.shuffle(data)\n", - "\n", - " #run algorithm\n", - " knn.fit(train, yTr)\n", - " knn_preds = knn.predict(valid)\n", - " sum_err = sum_err + sum(knn_preds != yV)/n_valid" - ] - }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 58, "metadata": {}, "outputs": [], "source": [ @@ -420,97 +359,48 @@ "\n", "### (3.1) Preprocessing and Feature Extraction ###\n", "\n", - "###(3.1.a) Load and format the training data set(s)###\n", - "data = pd.read_csv(\"train_2016.csv\", sep=',', header=0, encoding='unicode_escape', thousands=\",\")\n", - "data_2012 = pd.read_csv(\"train_2012.csv\", sep=\",\", header=0, encoding='unicode_escape', thousands=',')\n", - "graph = pd.read_csv(\"graph.csv\", sep=',')\n", + "###Helper functions to preprocess data###\n", "\n", "#Add binary features for the state that the county is in (where each binary feature represents one state)\n", - "\n", - "###2016###\n", - "data[\"State\"] = [x.strip()[-2:] for x in data['County']] #Extract state abbreviations from County\n", - "dummies = pd.get_dummies(data['State']) #Create state dummies from State\n", - "data = pd.concat([data,dummies],axis=1)\n", - "data = data.drop('State', axis=1)\n", - "\n", - "###2012###\n", - "data_2012[\"State\"] = [x.strip()[-2:] for x in data_2012['County']] #Extract state abbreviations from County\n", - "dummies_2012 = pd.get_dummies(data_2012['State']) #Create state dummies from State\n", - "data_2012 = pd.concat([data_2012,dummies_2012],axis=1)\n", - "data_2012 = data_2012.drop(\"State\", axis=1)\n", - "\n", - "#Define a function which will preprocess the data\n", - "def preprocess(data):\n", - " #Normalize features\n", - " #Standardize the data with mean 0 and s.d. 1\n", + "def get_states(X):\n", + " X[\"State\"] = [x.strip()[-2:] for x in X['County']]\n", + " dummies_test = pd.get_dummies(X['State'])\n", + " X = pd.concat([X,dummies_test],axis=1)\n", + " X = X.drop('State',axis=1)\n", + " return X\n", + "\n", + "#Normalizes all variables from the data except for the binary state variables\n", + "def normalize(X):\n", + " #Normalize features Standardize the data with mean 0 and s.d. 1\n", " scalar = StandardScaler()\n", - " _,d = np.shape(data)\n", - " data_norm = data[:,[0,1,2,3,4,5,(d-2),(d-1)]] #d-2 and d-1 store the graph score and number of known neighbors\n", - " #data_raw_indexes = np.r_[6:(d-2)]\n", - " data_bin = data[:,6:(d-2)]\n", - " #print('untransformed data', data_bin)\n", + " _,d = np.shape(X)\n", + " data_norm = X[:,[0,1,2,3,4,5,(d-2),(d-1)]] #d-2 and d-1 store the graph score and number of known neighbors\n", + " data_bin = X[:,6:(d-2)]\n", " scalar.fit(data_norm)\n", " data_norm = scalar.transform(data_norm)\n", - " #print('transformed data', data_norm)\n", - " data = np.hstack((data_norm, data_bin))\n", - " #print(data)\n", - "\n", - " #scalar = StandardScaler()\n", - " #binning = ['MedianIncome']\n", - " #standardization = ['MigraRate','BirthRate','DeathRate','BachelorRate','UnemploymentRate']\n", - " #n,d = np.shape(counties)\n", - " #print(d)\n", - " #binning = [4]\n", - " #standardization = [*range(5,d)]\n", - " #print(standardization)\n", - " #ct = ColumnTransformer([(\"standardize\", scalar, standardization), \n", - " # (\"binning\", KBinsDiscretizer(n_bins=10), binning)])\n", - " #try min maxing if binning doesn't \n", - " #data = ct.fit_transform(counties)\n", - " return data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "###EXPERIMENTING DON'T RUN YET###\n", - "\n", - "#Add binary feature for whether a county is on a state border (which may indicate they receive other states' political ads)\n", - "fips_allstates = data[[\"FIPS\",\"State\"]]\n", - "tmp1 = graph.merge(fips_allstates, left_on=\"SRC\", right_on=\"FIPS\", how=\"left\")\n", - "tmp1 = tmp1[[\"SRC\",\"DST\",\"State\"]]\n", - "tmp1.rename(columns={'State':'SRC_State'}, inplace=True)\n", - "print(tmp1)\n", + " X = np.hstack((data_norm, data_bin))\n", + " return X\n", "\n", - "tmp2 = tmp1.merge(fips_allstates, left_on=\"DST\", right_on=\"FIPS\", how=\"left\")\n", - "tmp2 = tmp2[[\"SRC\",\"DST\",\"SRC_State\",\"State\"]]\n", - "tmp2.rename(columns={'State':'DST_State'}, inplace=True)\n", - "print(tmp2)\n", - "\n", - "tmp2[\"SRConBorder\"] = tmp2[\"SRC_State\"].eq(tmp2[\"DST_State\"])\n", - "tmp2[\"SRConBorder\"] = tmp2[\"SRConBorder\"].astype(int)\n", - "tmp2[\"SRConBorder\"] = tmp2[\"SRConBorder\"].replace({0:1,1:0}) #Make sure =1 iff on border\n", - "print(tmp2)\n", - "tmp2 = tmp2.dropna() #Because NaNs indicate counties that are not in training data - not sure how to deal with loss of data\n", - "tmp2 = tmp2.groupby(['SRC'])['SRConBorder'].sum()\n", - "print(tmp2)\n", + "###(3.1.a) Load and format the training data set(s)###\n", + "data = pd.read_csv(\"train_2016.csv\", sep=',', header=0, encoding='unicode_escape', thousands=\",\")\n", + "data_2012 = pd.read_csv(\"train_2012.csv\", sep=\",\", header=0, encoding='unicode_escape', thousands=',')\n", + "graph = pd.read_csv(\"graph.csv\", sep=',', header=0, encoding='unicode_escape', thousands=\",\")\n", "\n", - "data = data.merge(tmp2, left_on=\"FIPS\", right_on=\"SRC\", how=\"left\")\n", - "print(data)\n", + "###2016###\n", + "data = get_states(data)\n", "\n", - "#Notes: Source counties can be found in test, destination in train (or vice versa - one occurence in either column(?))\n", - "#Notes: Get list of neighbors that voted one way or other (check one per column)" + "###2012###\n", + "data_2012 = get_states(data_2012)\n", + "\n" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "#Creates a lexicographically ordered list of all the county's neighbors\n", - "graph = pd.read_csv(\"graph.csv\", sep=',', header=0, encoding='unicode_escape', thousands=\",\")\n", "neighbors = {}\n", "for i in range(len(graph)):\n", " src = graph['SRC'][i]\n", @@ -531,15 +421,12 @@ "dem = dict(zip(data['FIPS'], dem_perc))\n", "votes = data[['DEM','GOP']]\n", "votes = list(votes.itertuples(index=False, name=None))\n", - "votes = dict(zip(data['FIPS'], votes))\n", - "votes_2012 = data_2012[['DEM','GOP']]\n", - "votes_2012 = list(votes_2012.itertuples(index=False, name=None))\n", - "votes_2012 = dict(zip(data['FIPS'], votes_2012))" + "votes = dict(zip(data['FIPS'], votes))" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ @@ -554,7 +441,7 @@ " score += dem[i]\n", " return n, score/n if n != 0 else np.nan\n", "\n", - "#TODO - implement a function that calculates the percentage of all voters in the surrounding counties who voted dem\n", + "#Calculates the percentage of all voters from the neighboring counties who voted democratic\n", "def aggregate_gscore(neighs, votes):\n", " n = 0\n", " dem = 0\n", @@ -577,27 +464,37 @@ " n = 0\n", " else:\n", " n, score = aggregate_gscore(neighbors[key], votes)\n", - " _, s2012 = aggregate_gscore(neighbors[key], votes_2012)\n", + " total_n = len(neighbors[key])\n", " counties.loc[i, 'GraphScore'] = score\n", " counties.loc[i, 'Neighbors'] = n\n", - " mean = counties['GraphScore'].sum()/len(counties)\n", - " counties['GraphScore'] = counties['GraphScore'].map(lambda x: mean if pd.isnull(x) else x)\n", + " smean = counties['GraphScore'].sum()/len(counties)\n", + " counties['GraphScore'] = counties['GraphScore'].map(lambda x: smean if pd.isnull(x) else x)\n", " return counties" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 61, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['FIPS', 'County', 'DEM', 'GOP', 'MedianIncome', 'MigraRate', 'BirthRate', 'DeathRate', 'BachelorRate', 'UnemploymentRate', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'GraphScore', 'Neighbors']\n", + "['FIPS', 'County', 'DEM', 'GOP', 'MedianIncome', 'MigraRate', 'BirthRate', 'DeathRate', 'BachelorRate', 'UnemploymentRate', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'GraphScore', 'Neighbors']\n" + ] + } + ], "source": [ "###Prep for concatenating###\n", "\n", "###2016###\n", - "#cols = data.columns.tolist()\n", - "#cols = cols[0:2] + cols[4:] #Helps make sure columns in test align with those in data\n", - "\n", "data = add_gscore(data)\n", + "#Rearrange the columns to ensure that DC is in the same location of each dataset\n", + "cols = data.columns.tolist()\n", + "cols = cols[0:2] + cols[4:]\n", + "print(list(data.columns))\n", "data = data.to_numpy()\n", "\n", "#These are the training labels for each county, 1 if the county voted Dem and 0 if it voted Rep\n", @@ -606,11 +503,13 @@ "\n", "#Trim the data of irrelevant fields and convert to numpy array\n", "data = data[:,4:]\n", - "#print(np.shape(data))\n", - "#print(data)\n", "\n", "###2012###\n", "data_2012 = add_gscore(data_2012)\n", + "#Rearrange the columns to ensure that DC is in the same location of each dataset\n", + "cols_2012 = data_2012.columns.tolist()\n", + "cols_2012 = cols_2012[0:2] + cols_2012[4:]\n", + "print(list(data_2012.columns))\n", "data_2012 = data_2012.to_numpy()\n", "\n", "#These are the training labels for each county, 1 if the county voted Dem and 0 if it voted Rep\n", @@ -622,9 +521,18 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 62, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['FIPS', 'County', 'MedianIncome', 'MigraRate', 'BirthRate', 'DeathRate', 'BachelorRate', 'UnemploymentRate', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'GraphScore', 'Neighbors']\n", + "['FIPS', 'County', 'MedianIncome', 'MigraRate', 'BirthRate', 'DeathRate', 'BachelorRate', 'UnemploymentRate', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'GraphScore', 'Neighbors']\n" + ] + } + ], "source": [ "###(3.1.b) Load and format the testing data sets###\n", "test = pd.read_csv(\"test_2016_no_label.csv\", sep=',', header=0, encoding='unicode_escape', thousands=\",\")\n", @@ -633,15 +541,11 @@ "#Process the testing data set in the same manner as the training data set\n", "\n", "###2016###\n", - "test[\"State\"] = [x.strip()[-2:] for x in test['County']]\n", - "dummies_test = pd.get_dummies(test['State'])\n", - "#print(dummies.columns.difference(dummies_test.columns)) #Check that DC is the only missing county/state\n", - "test = pd.concat([test,dummies_test],axis=1)\n", + "test = get_states(test)\n", "test['DC'] = 0 #Add in a zero vector for DC (because there is no DC in either test data set)\n", - "test = test.drop('State',axis=1)\n", - "#print(test)\n", - "#test = test[cols] #Make sure columns in test align with those in data\n", "test = add_gscore(test)\n", + "test = test[cols]\n", + "print(list(test.columns))\n", "test = test.to_numpy()\n", "\n", "fips = test[:,0]\n", @@ -651,21 +555,19 @@ "data = np.vstack((data, test))\n", "\n", "#Standardize the (non-binary) features to have mean 0 and s.d. 1\n", - "data = preprocess(data)\n", + "data = normalize(data)\n", "\n", "test = data[-n_data:,:]\n", "data = data[:n_data,:]\n", "data = np.hstack((data, np.transpose(np.array([results]))))\n", "\n", "###2012###\n", - "test_2012[\"State\"] = [x.strip()[-2:] for x in test_2012['County']]\n", - "dummies_test_2012 = pd.get_dummies(test_2012['State'])\n", - "#print(dummies_2012.columns.difference(dummies_test_2012.columns)) #Check that DC is the only missing county/state\n", - "test_2012 = pd.concat([test_2012,dummies_test_2012],axis=1)\n", + "test_2012 = get_states(test_2012)\n", "test_2012['DC'] = 0 #Add in a zero vector for DC (because there is no DC in either test data set)\n", - "test_2012 = test_2012.drop('State',axis=1)\n", + "test_2012 = add_gscore(test_2012)\n", + "test_2012 = test_2012[cols_2012]\n", + "print(list(test_2012.columns))\n", "\n", - "add_gscore(test_2012)\n", "test_2012 = test_2012.to_numpy()\n", "\n", "fips_2012 = test_2012[:,0]\n", @@ -674,7 +576,7 @@ "n_data_2012 = len(data_2012)\n", "data_2012 = np.vstack((data_2012, test_2012))\n", "\n", - "data_2012 = preprocess(data_2012)\n", + "data_2012 = normalize(data_2012)\n", "test_2012 = data_2012[-n_data_2012:,:]\n", "data_2012 = data_2012[:n_data_2012,:]\n", "data_2012 = np.hstack((data_2012, np.transpose(np.array([results_2012]))))" @@ -682,9 +584,20 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 67, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 1.25892541 1.49170935 1.76753662 2.09436625 2.48162892 2.94049911\n", + " 3.48421754 4.12847324 4.89185622 5.79639395 6.86818691 8.13816172\n", + " 9.64296358 11.42601361 13.5387618 16.04217161 19.00847905 22.52327705\n", + " 26.68798528 31.6227766 ]\n" + ] + } + ], "source": [ "### (3.2) Testing algorithms ###\n", "\n", @@ -699,7 +612,9 @@ "adb_dtc_g = ABC(base_estimator = DTC(criterion = \"entropy\"), n_estimators = 100)\n", "\n", "#SVM classifier\n", - "Cs = np.logspace(1.25,1.25,20)\n", + "Cs = np.logspace(.1,1.5,20)\n", + "print(Cs)\n", + "#Cs = np.logspace(1.25,1.25,20)\n", "svc = svm.SVC(gamma='scale', kernel='rbf')\n", "c_svc = GridSearchCV(estimator=svc,param_grid=dict(C=Cs))\n", "\n", @@ -716,27 +631,25 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "knn avg weighted accuracy: 0.6329961166249547\n", - "knn avg weighted accuracy 2012: 0.6215636493214642\n", - "dtc_t avg weighted accuracy: 0.7461445847623741\n", - "dtc_t avg weighted accuracy 2012: 0.6661740263653098\n", - "adb_dtc avg weighted accuracy: 0.708280844124877\n", - "adb_dtc avg weighted accuracy 2012: 0.7105907368805827\n", - "dtc_g avg weighted accuracy: 0.7506852491001083\n", - "dtc_g avg weighted accuracy 2012: 0.683730011834672\n", - "adb_dtc_g avg weighted accuracy: 0.760068618725555\n", - "adb_dtc_g avg weighted accuracy 2012: 0.6846362764834062\n", - "svc avg weighted accuracy: 0.7974214368589101\n", - "svc avg weighted accuracy 2012: 0.7389995155649615\n", - "lr avg weighted accuracy: 0.7449232757260804\n", - "lr avg weighted accuracy 2012: 0.713245836578974\n" + "knn avg weighted accuracy: 0.6330706501088754\n", + "knn avg weighted accuracy 2012: 0.6287551049058671\n", + "dtc_t avg weighted accuracy: 0.7126628181827105\n", + "dtc_t avg weighted accuracy 2012: 0.6654573475411754\n", + "adb_dtc avg weighted accuracy: 0.7583268043136182\n", + "adb_dtc avg weighted accuracy 2012: 0.6876199289767525\n", + "dtc_g avg weighted accuracy: 0.7331652849631233\n", + "dtc_g avg weighted accuracy 2012: 0.6960158238672269\n", + "adb_dtc_g avg weighted accuracy: 0.7526903758034124\n", + "adb_dtc_g avg weighted accuracy 2012: 0.6782594899860952\n", + "svc avg weighted accuracy: 0.7850360498781626\n", + "svc avg weighted accuracy 2012: 0.7207312417562269\n" ] } ], @@ -832,42 +745,30 @@ "print(\"svc avg weighted accuracy: \", x)\n", "print(\"svc avg weighted accuracy 2012: \", x_2012)\n", "\n", - "#print(\"lr avg weighted accuracy: \", kfold(lr, data))\n", - "#print(\"lr avg weighted accuracy: \", kfold(c_lr, data, grid=\"Yes\", grid_model=\"lr\")) #.721 using Cs = np.logspace(-1,1,100) and solver = 'liblinear'\n", - "#print(\"lr avg weighted accuracy 2012: \", kfold(c_lr, data_2012, grid=\"Yes\", grid_model=\"lr\"))\n", "x_lr,y_lr,z_lr = kfold_lr(c_lr, data)\n", "x_lr_2012,y_lr_2012,z_lr_2012 = kfold_lr(c_lr, data_2012)\n", "print(\"lr avg weighted accuracy: \", x_lr)\n", "print(\"lr avg weighted accuracy 2012: \", x_lr_2012)\n", "\n", - "#print(\"adb_lr avg weighted accuracy: \", kfold(adb_lr, data)) #.713 using solver = 'liblinear'\n", - "#print(\"adb_lr avg weighted accuracy 2012: \", kfold(adb_lr, data_2012)) #.713 using solver = 'liblinear'\n", - "\n", - "#print(\"nbc avg weighted accuracy: \", kfold(nbc, data)) #.653\n", - "\n", - "#print(\"adb_nbc avg weighted accuracy: \", kfold(adb_nbc, data)) #.5 using n_estimators = 100\n", - "\n", "#Get final predictions\n", - "preds = z.predict(test) #Uses 2016" + "preds_creative = z.predict(test) #Uses 2016" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 65, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "223\n", - "216\n" + "235\n" ] } ], "source": [ - "print(sum(preds))\n", - "print(sum(z.predict(data[:,:-1])))" + "print(sum(preds_creative))" ] }, { @@ -888,6 +789,13 @@ "3.2.2 Please explain in detail how you achieved this and what you did specifically and why you tried this." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -908,16 +816,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 56, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "228\n" + ] + } + ], "source": [ "# TODO\n", "output = np.array([fips, preds])\n", "output = pd.DataFrame(np.transpose(output))\n", "output[1] = output[1].astype(int)\n", - "#output.to_csv(\"submission.csv\", header=[\"FIPS\",\"Result\"], index = False)\n", - "output.to_csv(\"submission_creative.csv\", header=[\"FIPS\",\"Result\"], index = False)\n", + "output_creative = np.array([fips, preds_creative])\n", + "output_creative = pd.DataFrame(np.transpose(output_creative))\n", + "output_creative[1] = output_creative[1].astype(int)\n", + "output.to_csv(\"submission.csv\", header=[\"FIPS\",\"Result\"], index = False)\n", + "output_creative.to_csv(\"submission_creative.csv\", header=[\"FIPS\",\"Result\"], index = False)\n", "\n", "# You may use pandas to generate a dataframe with FIPS and your predictions first \n", "# and then use to_csv to generate a CSV file." diff --git a/CS 4780 Final Project Student Template.ipynb b/CS 4780 Final Project Student Template.ipynb index c6aadc1..5c459fd 100755 --- a/CS 4780 Final Project Student Template.ipynb +++ b/CS 4780 Final Project Student Template.ipynb @@ -74,6 +74,7 @@ "import numpy as np\n", "#import sklearn as sk\n", "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.preprocessing import KBinsDiscretizer\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn import svm\n", "from sklearn.ensemble import AdaBoostClassifier as ABC\n", @@ -84,7 +85,6 @@ "from sklearn.model_selection import GridSearchCV\n", "from sklearn.model_selection import KFold\n", "from sklearn.metrics import balanced_accuracy_score\n", - "from sklearn.preprocessing import KBinsDiscretizer\n", "from sklearn.compose import ColumnTransformer" ] }, @@ -159,7 +159,6 @@ "\n", "#Trim the data of irrelevant fields and convert to numpy array\n", "data = data[:,4:]\n", - "#print(data.dtypes)\n", "\n", "#Load and format the test_2016_no_label.csv file\n", "test = pd.read_csv(\"test_2016_no_label.csv\", sep=',', header=0, encoding='unicode_escape', thousands=\",\")\n", @@ -248,16 +247,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "knn avg weighted accuracy: 0.6203334398310165\n", - "dtc avg weighted accuracy: 0.7070879048578987\n", - "dtc_t avg weighted accuracy: 0.6931251192737173\n", - "adb_dtc avg weighted accuracy: 0.6961593601061467\n", - "svc avg weighted accuracy: 0.7004446186230044\n", - "lr avg weighted accuracy: 0.6590264805800705\n", - "adb_lr avg weighted accuracy: 0.6497330664169988\n", - "nbc avg weighted accuracy: 0.538413031775743\n", - "adb_nbc avg weighted accuracy: 0.5742813326608044\n", - "219.0\n" + "knn avg weighted accuracy: 0.6237076582105193\n", + "dtc avg weighted accuracy: 0.6979205324376133\n", + "dtc_t avg weighted accuracy: 0.6855368083208002\n", + "adb_dtc avg weighted accuracy: 0.700633705568728\n", + "svc avg weighted accuracy: 0.6971740145905911\n", + "lr avg weighted accuracy: 0.6592688655540591\n", + "adb_lr avg weighted accuracy: 0.6488225086599588\n", + "nbc avg weighted accuracy: 0.5358837627311691\n", + "adb_nbc avg weighted accuracy: 0.5720794507211426\n", + "250.0\n" ] } ], @@ -350,69 +349,9 @@ "You may follow the steps in part 2 again but making innovative changes like creating new features, using new training algorithms, etc. Make sure you explain everything clearly in part 3.2. Note that reaching the 75% creative baseline is only a small portion of this part. Any creative ideas will receive most points as long as they are reasonable and clearly explained." ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Make sure you comment your code clearly and you may refer to these comments in the part 3.2\n", - "# TODO\n", - "\n", - "dtc.fit(train, yTr)\n", - "dtc_preds = dtc.predict(valid)\n", - "print(\"dtc weighted accuracy: \", weighted_accuracy(dtc_preds,yV)) #does pretty well, even with no pruning\n", - "\n", - "dtc_g.fit(train, yTr)\n", - "dtc_g_preds = dtc.predict(valid)\n", - "print(\"dtc_g weighted accuracy: \", weighted_accuracy(dtc_g_preds,yV)) #does pretty well, even with no pruning\n", - "\n", - "adb_dtc.fit(train, yTr)\n", - "adb_dtc_preds = adb_dtc.predict(valid)\n", - "print(\"adb_dtc weighted accuracy: \", weighted_accuracy(adb_dtc_preds,yV))\n", - "\n", - "#best validation error so far\n", - "svc.fit(train, yTr)\n", - "svc_preds = svc.predict(valid)\n", - "print(\"svc weighted accuracy: \", weighted_accuracy(svc_preds,yV))\n", - "\n", - "#does not work very well; not really better than random\n", - "nbc.fit(train, yTr)\n", - "nbc_preds = nbc.predict(valid)\n", - "print(\"nbc weighted accuracy: \", weighted_accuracy(nbc_preds,yV))\n", - "\n", - "adb_nbc.fit(train, yTr)\n", - "adb_nbc_preds = adb_nbc.predict(valid)\n", - "print(\"adb_nbc weighted accuracy: \", weighted_accuracy(adb_nbc_preds,yV))\n", - "\n", - "#Split the data into a train and test set\n", - "np.random.shuffle(data)\n", - " \n", - "results = np.array(df[:,-1],dtype='bool')\n", - "data = df[:,:6]\n", - "n_valid = int(len(data)/5)\n", - "train = data[:4*n_valid]\n", - "yTr = results[:4*n_valid]\n", - "valid = data[4*n_valid:]\n", - " yV = results[4*n_valid:]\n", - "\n", - "n_valid = int(len(data)/5)\n", - "j = 100;\n", - "\n", - "#for k in range(5, 100):\n", - "sum_err = 0;\n", - "#knn = KNeighborsClassifier(n_neighbors = k)\n", - "for i in range(j):\n", - " #randomize training and validation set\n", - " np.random.shuffle(data)\n", - "\n", - " #run algorithm\n", - " knn.fit(train, yTr)\n", - " knn_preds = knn.predict(valid)\n", - " sum_err = sum_err + sum(knn_preds != yV)/n_valid" - ] - }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 58, "metadata": {}, "outputs": [], "source": [ @@ -420,97 +359,48 @@ "\n", "### (3.1) Preprocessing and Feature Extraction ###\n", "\n", - "###(3.1.a) Load and format the training data set(s)###\n", - "data = pd.read_csv(\"train_2016.csv\", sep=',', header=0, encoding='unicode_escape', thousands=\",\")\n", - "data_2012 = pd.read_csv(\"train_2012.csv\", sep=\",\", header=0, encoding='unicode_escape', thousands=',')\n", - "graph = pd.read_csv(\"graph.csv\", sep=',')\n", + "###Helper functions to preprocess data###\n", "\n", "#Add binary features for the state that the county is in (where each binary feature represents one state)\n", - "\n", - "###2016###\n", - "data[\"State\"] = [x.strip()[-2:] for x in data['County']] #Extract state abbreviations from County\n", - "dummies = pd.get_dummies(data['State']) #Create state dummies from State\n", - "data = pd.concat([data,dummies],axis=1)\n", - "data = data.drop('State', axis=1)\n", - "\n", - "###2012###\n", - "data_2012[\"State\"] = [x.strip()[-2:] for x in data_2012['County']] #Extract state abbreviations from County\n", - "dummies_2012 = pd.get_dummies(data_2012['State']) #Create state dummies from State\n", - "data_2012 = pd.concat([data_2012,dummies_2012],axis=1)\n", - "data_2012 = data_2012.drop(\"State\", axis=1)\n", - "\n", - "#Define a function which will preprocess the data\n", - "def preprocess(data):\n", - " #Normalize features\n", - " #Standardize the data with mean 0 and s.d. 1\n", + "def get_states(X):\n", + " X[\"State\"] = [x.strip()[-2:] for x in X['County']]\n", + " dummies_test = pd.get_dummies(X['State'])\n", + " X = pd.concat([X,dummies_test],axis=1)\n", + " X = X.drop('State',axis=1)\n", + " return X\n", + "\n", + "#Normalizes all variables from the data except for the binary state variables\n", + "def normalize(X):\n", + " #Normalize features Standardize the data with mean 0 and s.d. 1\n", " scalar = StandardScaler()\n", - " _,d = np.shape(data)\n", - " data_norm = data[:,[0,1,2,3,4,5,(d-3),(d-2),(d-1)]] #d-2 and d-1 store the graph score and number of known neighbors\n", - " #data_raw_indexes = np.r_[6:(d-2)]\n", - " data_bin = data[:,6:(d-3)]\n", - " #print('untransformed data', data_bin)\n", + " _,d = np.shape(X)\n", + " data_norm = X[:,[0,1,2,3,4,5,(d-2),(d-1)]] #d-2 and d-1 store the graph score and number of known neighbors\n", + " data_bin = X[:,6:(d-2)]\n", " scalar.fit(data_norm)\n", " data_norm = scalar.transform(data_norm)\n", - " #print('transformed data', data_norm)\n", - " data = np.hstack((data_norm, data_bin))\n", - " #print(data)\n", - "\n", - " #scalar = StandardScaler()\n", - " #binning = ['MedianIncome']\n", - " #standardization = ['MigraRate','BirthRate','DeathRate','BachelorRate','UnemploymentRate']\n", - " #n,d = np.shape(counties)\n", - " #print(d)\n", - " #binning = [4]\n", - " #standardization = [*range(5,d)]\n", - " #print(standardization)\n", - " #ct = ColumnTransformer([(\"standardize\", scalar, standardization), \n", - " # (\"binning\", KBinsDiscretizer(n_bins=10), binning)])\n", - " #try min maxing if binning doesn't \n", - " #data = ct.fit_transform(counties)\n", - " return data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "###EXPERIMENTING DON'T RUN YET###\n", - "\n", - "#Add binary feature for whether a county is on a state border (which may indicate they receive other states' political ads)\n", - "fips_allstates = data[[\"FIPS\",\"State\"]]\n", - "tmp1 = graph.merge(fips_allstates, left_on=\"SRC\", right_on=\"FIPS\", how=\"left\")\n", - "tmp1 = tmp1[[\"SRC\",\"DST\",\"State\"]]\n", - "tmp1.rename(columns={'State':'SRC_State'}, inplace=True)\n", - "print(tmp1)\n", - "\n", - "tmp2 = tmp1.merge(fips_allstates, left_on=\"DST\", right_on=\"FIPS\", how=\"left\")\n", - "tmp2 = tmp2[[\"SRC\",\"DST\",\"SRC_State\",\"State\"]]\n", - "tmp2.rename(columns={'State':'DST_State'}, inplace=True)\n", - "print(tmp2)\n", + " X = np.hstack((data_norm, data_bin))\n", + " return X\n", "\n", - "tmp2[\"SRConBorder\"] = tmp2[\"SRC_State\"].eq(tmp2[\"DST_State\"])\n", - "tmp2[\"SRConBorder\"] = tmp2[\"SRConBorder\"].astype(int)\n", - "tmp2[\"SRConBorder\"] = tmp2[\"SRConBorder\"].replace({0:1,1:0}) #Make sure =1 iff on border\n", - "print(tmp2)\n", - "tmp2 = tmp2.dropna() #Because NaNs indicate counties that are not in training data - not sure how to deal with loss of data\n", - "tmp2 = tmp2.groupby(['SRC'])['SRConBorder'].sum()\n", - "print(tmp2)\n", + "###(3.1.a) Load and format the training data set(s)###\n", + "data = pd.read_csv(\"train_2016.csv\", sep=',', header=0, encoding='unicode_escape', thousands=\",\")\n", + "data_2012 = pd.read_csv(\"train_2012.csv\", sep=\",\", header=0, encoding='unicode_escape', thousands=',')\n", + "graph = pd.read_csv(\"graph.csv\", sep=',', header=0, encoding='unicode_escape', thousands=\",\")\n", "\n", - "data = data.merge(tmp2, left_on=\"FIPS\", right_on=\"SRC\", how=\"left\")\n", - "print(data)\n", + "###2016###\n", + "data = get_states(data)\n", "\n", - "#Notes: Source counties can be found in test, destination in train (or vice versa - one occurence in either column(?))\n", - "#Notes: Get list of neighbors that voted one way or other (check one per column)" + "###2012###\n", + "data_2012 = get_states(data_2012)\n", + "\n" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "#Creates a lexicographically ordered list of all the county's neighbors\n", - "graph = pd.read_csv(\"graph.csv\", sep=',', header=0, encoding='unicode_escape', thousands=\",\")\n", "neighbors = {}\n", "for i in range(len(graph)):\n", " src = graph['SRC'][i]\n", @@ -531,15 +421,12 @@ "dem = dict(zip(data['FIPS'], dem_perc))\n", "votes = data[['DEM','GOP']]\n", "votes = list(votes.itertuples(index=False, name=None))\n", - "votes = dict(zip(data['FIPS'], votes))\n", - "votes_2012 = data_2012[['DEM','GOP']]\n", - "votes_2012 = list(votes_2012.itertuples(index=False, name=None))\n", - "votes_2012 = dict(zip(data['FIPS'], votes_2012))" + "votes = dict(zip(data['FIPS'], votes))" ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ @@ -554,7 +441,7 @@ " score += dem[i]\n", " return n, score/n if n != 0 else np.nan\n", "\n", - "#TODO - implement a function that calculates the percentage of all voters in the surrounding counties who voted dem\n", + "#Calculates the percentage of all voters from the neighboring counties who voted democratic\n", "def aggregate_gscore(neighs, votes):\n", " n = 0\n", " dem = 0\n", @@ -569,40 +456,45 @@ "#Function that takes a list of counties and appends their graph_score to the end of their vector:\n", "def add_gscore(counties):\n", " counties['GraphScore'] = 0\n", - " counties['2012GScore'] = 0\n", " counties['Neighbors'] = 0\n", " for i in range(len(counties)):\n", " key = int(counties['FIPS'][i])\n", " if neighbors.get(key) == None:\n", " score = np.nan\n", - " s2012 = np.nan\n", " n = 0\n", " else:\n", " n, score = aggregate_gscore(neighbors[key], votes)\n", - " _, s2012 = aggregate_gscore(neighbors[key], votes_2012)\n", + " total_n = len(neighbors[key])\n", " counties.loc[i, 'GraphScore'] = score\n", - " counties.loc[i, '2012GScore'] = s2012\n", " counties.loc[i, 'Neighbors'] = n\n", " smean = counties['GraphScore'].sum()/len(counties)\n", - " s2012_mean = counties['2012GScore'].sum()/len(counties)\n", " counties['GraphScore'] = counties['GraphScore'].map(lambda x: smean if pd.isnull(x) else x)\n", - " counties['2012GScore'] = counties['2012GScore'].map(lambda x: s2012_mean if pd.isnull(x) else x)\n", " return counties" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 61, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['FIPS', 'County', 'DEM', 'GOP', 'MedianIncome', 'MigraRate', 'BirthRate', 'DeathRate', 'BachelorRate', 'UnemploymentRate', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'GraphScore', 'Neighbors']\n", + "['FIPS', 'County', 'DEM', 'GOP', 'MedianIncome', 'MigraRate', 'BirthRate', 'DeathRate', 'BachelorRate', 'UnemploymentRate', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'GraphScore', 'Neighbors']\n" + ] + } + ], "source": [ "###Prep for concatenating###\n", "\n", "###2016###\n", - "#cols = data.columns.tolist()\n", - "#cols = cols[0:2] + cols[4:] #Helps make sure columns in test align with those in data\n", - "\n", "data = add_gscore(data)\n", + "#Rearrange the columns to ensure that DC is in the same location of each dataset\n", + "cols = data.columns.tolist()\n", + "cols = cols[0:2] + cols[4:]\n", + "print(list(data.columns))\n", "data = data.to_numpy()\n", "\n", "#These are the training labels for each county, 1 if the county voted Dem and 0 if it voted Rep\n", @@ -611,11 +503,13 @@ "\n", "#Trim the data of irrelevant fields and convert to numpy array\n", "data = data[:,4:]\n", - "#print(np.shape(data))\n", - "#print(data)\n", "\n", "###2012###\n", "data_2012 = add_gscore(data_2012)\n", + "#Rearrange the columns to ensure that DC is in the same location of each dataset\n", + "cols_2012 = data_2012.columns.tolist()\n", + "cols_2012 = cols_2012[0:2] + cols_2012[4:]\n", + "print(list(data_2012.columns))\n", "data_2012 = data_2012.to_numpy()\n", "\n", "#These are the training labels for each county, 1 if the county voted Dem and 0 if it voted Rep\n", @@ -627,9 +521,18 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 62, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['FIPS', 'County', 'MedianIncome', 'MigraRate', 'BirthRate', 'DeathRate', 'BachelorRate', 'UnemploymentRate', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'GraphScore', 'Neighbors']\n", + "['FIPS', 'County', 'MedianIncome', 'MigraRate', 'BirthRate', 'DeathRate', 'BachelorRate', 'UnemploymentRate', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'GraphScore', 'Neighbors']\n" + ] + } + ], "source": [ "###(3.1.b) Load and format the testing data sets###\n", "test = pd.read_csv(\"test_2016_no_label.csv\", sep=',', header=0, encoding='unicode_escape', thousands=\",\")\n", @@ -638,15 +541,11 @@ "#Process the testing data set in the same manner as the training data set\n", "\n", "###2016###\n", - "test[\"State\"] = [x.strip()[-2:] for x in test['County']]\n", - "dummies_test = pd.get_dummies(test['State'])\n", - "#print(dummies.columns.difference(dummies_test.columns)) #Check that DC is the only missing county/state\n", - "test = pd.concat([test,dummies_test],axis=1)\n", + "test = get_states(test)\n", "test['DC'] = 0 #Add in a zero vector for DC (because there is no DC in either test data set)\n", - "test = test.drop('State',axis=1)\n", - "#print(test)\n", - "#test = test[cols] #Make sure columns in test align with those in data\n", "test = add_gscore(test)\n", + "test = test[cols]\n", + "print(list(test.columns))\n", "test = test.to_numpy()\n", "\n", "fips = test[:,0]\n", @@ -656,21 +555,19 @@ "data = np.vstack((data, test))\n", "\n", "#Standardize the (non-binary) features to have mean 0 and s.d. 1\n", - "data = preprocess(data)\n", + "data = normalize(data)\n", "\n", "test = data[-n_data:,:]\n", "data = data[:n_data,:]\n", "data = np.hstack((data, np.transpose(np.array([results]))))\n", "\n", "###2012###\n", - "test_2012[\"State\"] = [x.strip()[-2:] for x in test_2012['County']]\n", - "dummies_test_2012 = pd.get_dummies(test_2012['State'])\n", - "#print(dummies_2012.columns.difference(dummies_test_2012.columns)) #Check that DC is the only missing county/state\n", - "test_2012 = pd.concat([test_2012,dummies_test_2012],axis=1)\n", + "test_2012 = get_states(test_2012)\n", "test_2012['DC'] = 0 #Add in a zero vector for DC (because there is no DC in either test data set)\n", - "test_2012 = test_2012.drop('State',axis=1)\n", + "test_2012 = add_gscore(test_2012)\n", + "test_2012 = test_2012[cols_2012]\n", + "print(list(test_2012.columns))\n", "\n", - "add_gscore(test_2012)\n", "test_2012 = test_2012.to_numpy()\n", "\n", "fips_2012 = test_2012[:,0]\n", @@ -679,7 +576,7 @@ "n_data_2012 = len(data_2012)\n", "data_2012 = np.vstack((data_2012, test_2012))\n", "\n", - "data_2012 = preprocess(data_2012)\n", + "data_2012 = normalize(data_2012)\n", "test_2012 = data_2012[-n_data_2012:,:]\n", "data_2012 = data_2012[:n_data_2012,:]\n", "data_2012 = np.hstack((data_2012, np.transpose(np.array([results_2012]))))" @@ -687,9 +584,20 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 67, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 1.25892541 1.49170935 1.76753662 2.09436625 2.48162892 2.94049911\n", + " 3.48421754 4.12847324 4.89185622 5.79639395 6.86818691 8.13816172\n", + " 9.64296358 11.42601361 13.5387618 16.04217161 19.00847905 22.52327705\n", + " 26.68798528 31.6227766 ]\n" + ] + } + ], "source": [ "### (3.2) Testing algorithms ###\n", "\n", @@ -704,7 +612,9 @@ "adb_dtc_g = ABC(base_estimator = DTC(criterion = \"entropy\"), n_estimators = 100)\n", "\n", "#SVM classifier\n", - "Cs = np.logspace(1.25,1.25,20)\n", + "Cs = np.logspace(.1,1.5,20)\n", + "print(Cs)\n", + "#Cs = np.logspace(1.25,1.25,20)\n", "svc = svm.SVC(gamma='scale', kernel='rbf')\n", "c_svc = GridSearchCV(estimator=svc,param_grid=dict(C=Cs))\n", "\n", @@ -728,18 +638,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "knn avg weighted accuracy: 0.6307393036738488\n", - "knn avg weighted accuracy 2012: 0.6547367701142874\n", - "dtc_t avg weighted accuracy: 0.745723590078064\n", - "dtc_t avg weighted accuracy 2012: 0.6877354733446237\n", - "adb_dtc avg weighted accuracy: 0.7285094502910128\n", - "adb_dtc avg weighted accuracy 2012: 0.7070487212085304\n", - "dtc_g avg weighted accuracy: 0.7504774822113874\n", - "dtc_g avg weighted accuracy 2012: 0.6999027843035145\n", - "adb_dtc_g avg weighted accuracy: 0.7373869781842327\n", - "adb_dtc_g avg weighted accuracy 2012: 0.6992326684962192\n", - "svc avg weighted accuracy: 0.7840850521161004\n", - "svc avg weighted accuracy 2012: 0.7782245706696347\n" + "knn avg weighted accuracy: 0.6330706501088754\n", + "knn avg weighted accuracy 2012: 0.6287551049058671\n", + "dtc_t avg weighted accuracy: 0.7126628181827105\n", + "dtc_t avg weighted accuracy 2012: 0.6654573475411754\n", + "adb_dtc avg weighted accuracy: 0.7583268043136182\n", + "adb_dtc avg weighted accuracy 2012: 0.6876199289767525\n", + "dtc_g avg weighted accuracy: 0.7331652849631233\n", + "dtc_g avg weighted accuracy 2012: 0.6960158238672269\n", + "adb_dtc_g avg weighted accuracy: 0.7526903758034124\n", + "adb_dtc_g avg weighted accuracy 2012: 0.6782594899860952\n", + "svc avg weighted accuracy: 0.7850360498781626\n", + "svc avg weighted accuracy 2012: 0.7207312417562269\n" ] } ], @@ -835,33 +745,30 @@ "print(\"svc avg weighted accuracy: \", x)\n", "print(\"svc avg weighted accuracy 2012: \", x_2012)\n", "\n", - "#print(\"lr avg weighted accuracy: \", kfold(lr, data))\n", - "#print(\"lr avg weighted accuracy: \", kfold(c_lr, data, grid=\"Yes\", grid_model=\"lr\")) #.721 using Cs = np.logspace(-1,1,100) and solver = 'liblinear'\n", - "#print(\"lr avg weighted accuracy 2012: \", kfold(c_lr, data_2012, grid=\"Yes\", grid_model=\"lr\"))\n", "x_lr,y_lr,z_lr = kfold_lr(c_lr, data)\n", "x_lr_2012,y_lr_2012,z_lr_2012 = kfold_lr(c_lr, data_2012)\n", "print(\"lr avg weighted accuracy: \", x_lr)\n", "print(\"lr avg weighted accuracy 2012: \", x_lr_2012)\n", "\n", - "#print(\"adb_lr avg weighted accuracy: \", kfold(adb_lr, data)) #.713 using solver = 'liblinear'\n", - "#print(\"adb_lr avg weighted accuracy 2012: \", kfold(adb_lr, data_2012)) #.713 using solver = 'liblinear'\n", - "\n", - "#print(\"nbc avg weighted accuracy: \", kfold(nbc, data)) #.653\n", - "\n", - "#print(\"adb_nbc avg weighted accuracy: \", kfold(adb_nbc, data)) #.5 using n_estimators = 100\n", - "\n", "#Get final predictions\n", - "preds = z.predict(test) #Uses 2016" + "preds_creative = z.predict(test) #Uses 2016" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 65, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "235\n" + ] + } + ], "source": [ - "print(sum(preds))\n", - "print(sum(z.predict(data[:,:-1])))" + "print(sum(preds_creative))" ] }, { @@ -882,6 +789,13 @@ "3.2.2 Please explain in detail how you achieved this and what you did specifically and why you tried this." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -902,16 +816,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 56, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "228\n" + ] + } + ], "source": [ "# TODO\n", "output = np.array([fips, preds])\n", "output = pd.DataFrame(np.transpose(output))\n", "output[1] = output[1].astype(int)\n", - "#output.to_csv(\"submission.csv\", header=[\"FIPS\",\"Result\"], index = False)\n", - "output.to_csv(\"submission_creative.csv\", header=[\"FIPS\",\"Result\"], index = False)\n", + "output_creative = np.array([fips, preds_creative])\n", + "output_creative = pd.DataFrame(np.transpose(output_creative))\n", + "output_creative[1] = output_creative[1].astype(int)\n", + "output.to_csv(\"submission.csv\", header=[\"FIPS\",\"Result\"], index = False)\n", + "output_creative.to_csv(\"submission_creative.csv\", header=[\"FIPS\",\"Result\"], index = False)\n", "\n", "# You may use pandas to generate a dataframe with FIPS and your predictions first \n", "# and then use to_csv to generate a CSV file." diff --git a/submission.csv b/submission.csv index a59fce4..e2da191 100644 --- a/submission.csv +++ b/submission.csv @@ -6,21 +6,21 @@ FIPS,Result 39039,0 30083,0 46047,0 -45013,1 +45013,0 18119,0 48333,0 -53005,1 +53005,0 5097,0 48297,0 51141,0 54107,0 -12103,1 +12103,0 51077,0 25015,1 47043,0 48363,0 31077,0 -21043,1 +21043,0 28047,0 19193,0 18165,0 @@ -30,33 +30,33 @@ FIPS,Result 21229,0 17097,1 29159,0 -28149,1 +28149,0 48467,0 5053,0 31093,0 19143,0 13181,0 47127,0 -21153,0 +21153,1 20143,0 22119,0 19119,0 47083,0 42095,0 -6081,1 +6081,0 13313,0 5123,0 28039,0 -51760,0 +51760,1 48245,0 -41053,1 +41053,0 8025,0 5079,0 49043,0 -48489,0 +48489,1 22057,0 1021,0 -29185,1 +29185,0 29211,0 54081,0 6003,0 @@ -67,7 +67,7 @@ FIPS,Result 37097,0 48283,0 46083,0 -31055,1 +31055,0 51580,0 37157,0 42033,0 @@ -76,7 +76,7 @@ FIPS,Result 13075,0 21123,0 48185,0 -13245,1 +13245,0 13001,0 31139,0 38025,0 @@ -90,25 +90,25 @@ FIPS,Result 13183,0 53035,1 48479,1 -21063,0 +21063,1 20101,0 48415,0 51185,0 -51043,1 -36013,1 +51043,0 +36013,0 55119,0 -45075,1 +45075,0 47101,0 26089,1 48367,0 -37131,0 -21205,1 +37131,1 +21205,0 47179,0 48093,0 -5101,1 +5101,0 1093,0 31101,0 -5069,1 +5069,0 8119,0 54037,0 13213,0 @@ -121,39 +121,39 @@ FIPS,Result 19045,0 21167,0 31127,0 -5015,1 +5015,0 51159,0 -28133,0 +28133,1 5045,0 39069,0 -48381,1 +48381,0 13129,0 37041,0 51057,0 25005,0 22059,0 -38017,1 +38017,0 26109,0 -49037,0 +49037,1 48265,0 39129,0 20055,0 -21121,1 +21121,0 31173,0 -22035,1 +22035,0 8011,0 39027,0 17107,0 -42021,1 +42021,0 37023,0 54003,0 31097,0 17141,0 48203,0 20011,0 -23031,1 +23031,0 16027,0 -48003,1 +48003,0 51600,0 13027,0 21021,0 @@ -168,7 +168,7 @@ FIPS,Result 36015,0 1063,1 13279,0 -42125,1 +42125,0 42105,0 20207,0 13285,0 @@ -196,7 +196,7 @@ FIPS,Result 46119,0 12045,0 40141,0 -41015,1 +41015,0 48129,0 29083,0 17117,0 @@ -205,7 +205,7 @@ FIPS,Result 27009,0 19195,0 19039,0 -18005,1 +18005,0 37083,0 39077,0 19147,0 @@ -227,12 +227,12 @@ FIPS,Result 26121,0 21203,0 21211,0 -44005,1 +44005,0 35005,0 30017,0 31107,0 20003,0 -22103,1 +22103,0 5117,0 47107,0 20047,0 @@ -241,15 +241,15 @@ FIPS,Result 54027,0 13147,0 30055,0 -20203,1 +20203,0 1051,0 18153,0 29043,0 -8065,0 +8065,1 29135,0 31083,0 17065,0 -12097,1 +12097,0 28111,0 1113,0 48501,0 @@ -261,7 +261,7 @@ FIPS,Result 12125,0 51073,0 47149,0 -46031,1 +46031,0 40085,0 40127,0 28015,0 @@ -274,8 +274,8 @@ FIPS,Result 13319,0 20135,0 48021,0 -28121,1 -51678,1 +28121,0 +51678,0 12027,0 30005,0 17113,1 @@ -287,26 +287,26 @@ FIPS,Result 27111,0 38039,0 20153,0 -48361,1 +48361,0 20195,0 40101,0 21199,0 55117,0 -34039,0 +34039,1 16019,0 30009,0 19155,0 29105,0 23009,0 30101,0 -8059,0 +8059,1 17069,0 29175,0 31051,0 56039,1 -26131,1 +26131,0 48177,0 -8027,1 +8027,0 23007,0 51145,0 51087,1 @@ -333,9 +333,9 @@ FIPS,Result 26081,0 26155,0 17143,1 -53067,1 +53067,0 21105,0 -24009,0 +24009,1 1025,0 27067,0 19075,0 @@ -345,21 +345,21 @@ FIPS,Result 1105,1 16071,0 17199,0 -48061,0 +48061,1 1039,0 8097,1 27091,0 42099,0 42117,0 -13077,1 +13077,0 51520,0 36051,0 54025,0 -30049,1 +30049,0 1079,0 37117,0 48133,0 -42041,1 +42041,0 49007,0 47143,0 55039,0 @@ -370,16 +370,16 @@ FIPS,Result 53041,0 31169,0 37185,0 -26099,1 +26099,0 47155,0 6111,1 29029,0 13081,0 5111,0 -51133,1 -6057,1 +51133,0 +6057,0 21225,0 -48141,0 +48141,1 28073,0 27135,0 37129,0 @@ -390,8 +390,8 @@ FIPS,Result 46129,0 28003,0 1123,0 -39017,1 -6097,0 +39017,0 +6097,1 4021,0 38031,0 48405,0 @@ -414,13 +414,13 @@ FIPS,Result 36117,0 48043,1 45063,0 -16057,0 +16057,1 48157,0 12073,1 -41067,0 +41067,1 29187,0 8049,0 -40133,1 +40133,0 37183,1 54005,0 18045,0 @@ -430,9 +430,9 @@ FIPS,Result 54057,0 21065,0 48341,0 -47077,1 -28135,0 -1013,0 +47077,0 +28135,1 +1013,1 40145,0 1111,0 37111,0 @@ -440,7 +440,7 @@ FIPS,Result 12129,0 17103,0 1077,0 -45089,1 +45089,0 51187,0 13013,0 31087,0 @@ -448,11 +448,11 @@ FIPS,Result 13293,0 48461,0 28107,0 -8041,1 +8041,0 5049,0 27073,0 -36055,0 -13259,1 +36055,1 +13259,0 51155,0 35001,1 47025,0 @@ -462,11 +462,11 @@ FIPS,Result 48199,0 19037,0 13311,0 -51165,1 -17077,0 +51165,0 +17077,1 29001,0 54011,0 -56035,1 +56035,0 51135,0 25001,0 21175,0 @@ -492,14 +492,14 @@ FIPS,Result 6089,0 13227,0 51011,0 -36113,1 +36113,0 54061,1 18135,0 21221,0 17011,0 -29169,1 +29169,0 29081,0 -12087,1 +12087,0 17001,0 21117,0 47035,0 @@ -508,7 +508,7 @@ FIPS,Result 28069,0 56015,0 21159,0 -17203,1 +17203,0 37135,1 5011,0 28151,1 @@ -521,12 +521,12 @@ FIPS,Result 20021,0 26129,0 48507,1 -21037,1 +21037,0 13083,0 17137,0 48053,0 18041,0 -17167,0 +17167,1 42023,0 42081,0 17041,0 @@ -546,9 +546,9 @@ FIPS,Result 20023,0 26107,0 13255,0 -5093,0 -53063,1 -45067,1 +5093,1 +53063,0 +45067,0 46123,0 38007,0 29117,0 @@ -556,9 +556,9 @@ FIPS,Result 37139,0 29127,0 13123,0 -13283,1 +13283,0 31073,0 -51103,1 +51103,0 36119,1 13229,0 48281,0 @@ -571,10 +571,10 @@ FIPS,Result 17179,0 1055,0 55141,0 -25007,0 +25007,1 24015,0 29227,0 -48031,1 +48031,0 20175,0 41059,0 27087,0 @@ -601,7 +601,7 @@ FIPS,Result 21049,0 54019,0 21135,0 -29223,1 +29223,0 24031,1 55109,0 55001,0 @@ -609,13 +609,13 @@ FIPS,Result 55053,0 47011,0 13009,0 -13003,1 +13003,0 48471,0 12085,0 21047,0 51685,0 5139,0 -48271,1 +48271,0 56007,0 54015,0 6115,0 @@ -658,36 +658,36 @@ FIPS,Result 31069,0 26037,0 23003,0 -13063,0 +13063,1 19109,0 42079,0 39107,0 19103,1 39047,0 -50005,0 +50005,1 47051,0 39105,0 38091,0 -54089,1 +54089,0 42051,0 51081,0 -22055,0 +22055,1 51029,0 51093,0 22025,0 51750,1 -35011,1 -49005,1 +35011,0 +49005,0 48197,0 31125,0 1107,0 45085,0 -51061,1 +51061,0 13091,0 5131,0 49045,0 39159,0 -19053,1 +19053,0 37193,0 13149,0 18105,1 @@ -695,17 +695,17 @@ FIPS,Result 20089,0 28099,0 48373,0 -51157,1 +51157,0 31081,0 27125,0 -1035,1 -13163,1 +1035,0 +13163,0 31057,0 13257,0 18109,0 28053,1 13211,0 -22017,1 +22017,0 12039,0 13043,0 18075,0 @@ -715,19 +715,19 @@ FIPS,Result 29033,0 24047,0 6085,1 -45079,0 -46011,1 +45079,1 +46011,0 48145,0 49039,0 45051,0 39113,0 -12086,0 +12086,1 21223,0 39057,0 18157,0 18133,0 28021,1 -53061,0 +53061,1 6019,1 45087,0 37085,0 @@ -738,21 +738,21 @@ FIPS,Result 5003,0 18029,0 48231,0 -24043,1 +24043,0 51017,0 -8021,1 +8021,0 27053,1 18145,0 37031,0 -18087,1 +18087,0 5063,0 31017,0 51710,1 45047,0 30039,0 31035,0 -13039,1 -12061,1 +13039,0 +12061,0 18123,0 39119,0 26071,0 @@ -763,7 +763,7 @@ FIPS,Result 25021,1 42133,0 8077,0 -13021,1 +13021,0 6075,1 37093,0 37055,0 @@ -785,21 +785,21 @@ FIPS,Result 51015,0 45071,0 19031,0 -31109,1 +31109,0 55137,0 39063,0 5017,0 54035,0 -26061,0 +26061,1 28101,0 8087,0 31043,0 21039,0 39009,0 -13269,1 +13269,0 30067,0 48445,0 -41017,1 +41017,0 41057,0 48473,0 51021,0 @@ -812,29 +812,29 @@ FIPS,Result 28093,0 48171,0 48139,0 -50017,0 +50017,1 38081,0 -35017,1 +35017,0 51117,0 -51161,1 +51161,0 19157,0 26157,0 47135,0 -13153,1 +13153,0 48431,0 20075,0 20069,0 38051,0 -13061,1 +13061,0 16053,0 -50025,0 +50025,1 47085,0 -9007,0 +9007,1 27061,0 27081,0 51153,1 -46071,1 -50015,0 +46071,0 +50015,1 39061,0 19035,0 1133,0 @@ -842,18 +842,18 @@ FIPS,Result 19145,0 36043,0 42039,0 -19113,1 -48247,0 +19113,0 +48247,1 21171,0 47133,0 46097,0 35013,1 -22071,0 +22071,1 22015,0 35019,0 37145,0 -37081,0 -41069,1 +37081,1 +41069,0 29189,1 4015,0 21227,0 @@ -864,14 +864,14 @@ FIPS,Result 37059,0 42111,0 36017,0 -35053,1 +35053,0 48067,0 18057,1 6101,0 48465,0 51740,0 20037,0 -17093,0 +17093,1 40055,0 26117,0 27071,0 @@ -889,25 +889,25 @@ FIPS,Result 54013,0 26073,0 48291,0 -51660,0 +51660,1 39007,0 37077,0 41003,1 26125,1 25017,1 -17099,1 +17099,0 37013,0 20013,0 27107,0 28077,0 -21235,1 +21235,0 31071,0 19027,0 28159,0 39109,0 6041,1 30077,0 -48419,1 +48419,0 20073,0 37067,0 55013,0 @@ -924,7 +924,7 @@ FIPS,Result 8121,0 19167,0 18049,0 -1125,1 +1125,0 20169,0 36003,0 20129,0 @@ -936,9 +936,9 @@ FIPS,Result 48039,0 20139,0 12095,0 -51650,1 +51650,0 1087,1 -54069,1 +54069,0 39133,0 21001,0 17073,0 @@ -948,7 +948,7 @@ FIPS,Result 40137,0 40047,0 48435,0 -49017,1 +49017,0 40105,0 30027,0 26025,0 @@ -957,13 +957,13 @@ FIPS,Result 17071,0 47041,0 21067,1 -12065,1 -42043,1 +12065,0 +42043,0 19187,0 27093,0 48045,0 21003,0 -45033,1 +45033,0 55055,0 8091,0 27063,0 @@ -974,27 +974,27 @@ FIPS,Result 22001,0 20017,0 13029,0 -48323,0 +48323,1 46085,0 54093,0 -36019,0 +36019,1 19165,0 45003,0 51810,1 -26053,1 +26053,0 22045,0 38003,0 -12099,1 +12099,0 40115,0 -1091,0 +1091,1 51023,0 -13085,1 +13085,0 36007,0 45041,0 22023,0 51053,0 40131,0 -6001,0 +6001,1 19091,0 19097,0 13131,0 @@ -1014,12 +1014,12 @@ FIPS,Result 20137,0 13189,0 6055,1 -22065,1 +22065,0 55033,0 -1081,0 +1081,1 31001,0 25009,1 -13215,0 +13215,1 20131,0 8073,0 17039,0 @@ -1033,7 +1033,7 @@ FIPS,Result 9015,0 4017,0 29145,0 -5099,1 +5099,0 39147,0 48261,0 28071,1 @@ -1049,8 +1049,8 @@ FIPS,Result 47073,0 13071,0 37091,0 -27045,1 -8013,0 +27045,0 +8013,1 13197,0 32005,0 17161,0 @@ -1068,7 +1068,7 @@ FIPS,Result 23011,0 17193,0 22005,0 -48249,0 +48249,1 34031,1 18183,0 51001,0 @@ -1095,17 +1095,17 @@ FIPS,Result 19169,1 49051,0 55125,0 -26145,1 +26145,0 4009,0 -47037,1 -36077,1 +47037,0 +36077,0 54105,0 27017,0 12053,0 39059,0 21095,0 32023,0 -13199,0 +13199,1 55069,0 47161,0 29015,0 @@ -1124,7 +1124,7 @@ FIPS,Result 46043,0 18003,0 17013,0 -51700,0 +51700,1 29097,0 54021,0 55093,0 @@ -1132,7 +1132,7 @@ FIPS,Result 5085,0 46015,0 6043,0 -13235,1 +13235,0 48025,0 18013,0 35006,0 @@ -1144,7 +1144,7 @@ FIPS,Result 48075,0 26147,0 26063,0 -53077,0 +53077,1 21077,0 21093,0 18065,0 @@ -1153,7 +1153,7 @@ FIPS,Result 27059,0 32003,0 21197,0 -49049,1 +49049,0 1115,0 40075,0 51143,0 @@ -1161,18 +1161,18 @@ FIPS,Result 16011,0 6017,0 37011,0 -29133,1 +29133,0 55075,0 36059,1 30097,0 -26065,0 +26065,1 40039,0 18043,0 -18127,0 +18127,1 48485,0 23001,0 -36021,1 -32510,1 +36021,0 +32510,0 51036,0 38103,0 18091,0 @@ -1180,23 +1180,23 @@ FIPS,Result 13117,0 47053,0 47023,0 -18081,1 +18081,0 20117,0 33007,0 40027,0 50009,0 48097,0 -48041,0 +48041,1 36053,0 16079,0 36005,1 -47097,1 +47097,0 37175,0 21191,0 51097,0 5027,0 21035,0 -17019,0 +17019,1 37009,0 48313,0 20059,0 @@ -1207,8 +1207,8 @@ FIPS,Result 41033,0 28049,1 19177,0 -28043,1 -13165,1 +28043,0 +13165,0 53057,0 51610,1 27139,0 @@ -1222,27 +1222,27 @@ FIPS,Result 1029,0 22085,0 30041,0 -1041,1 -46041,0 +1041,0 +46041,1 41051,1 -47131,1 +47131,0 51169,0 18071,0 51830,1 34023,1 8055,0 -28105,0 +28105,1 48309,0 -6105,0 +6105,1 21057,0 18073,0 28031,0 -4019,0 -56033,1 +4019,1 +56033,0 40107,0 48335,0 17025,0 -1065,1 +1065,0 51051,0 31145,0 28027,1 @@ -1292,10 +1292,10 @@ FIPS,Result 44003,0 48317,0 38089,0 -48505,0 +48505,1 13069,0 -28087,1 -29085,1 +28087,0 +29085,0 12105,0 54023,0 51101,0 @@ -1311,7 +1311,7 @@ FIPS,Result 1061,0 41063,0 40135,0 -13145,1 +13145,0 29077,0 30079,0 16035,0 @@ -1319,38 +1319,38 @@ FIPS,Result 48023,0 40021,0 48267,0 -54067,1 -5119,1 +54067,0 +5119,0 40083,0 48251,0 21183,0 27027,0 -23013,1 +23013,0 13139,0 28009,0 51025,0 -13095,0 -41027,0 +13095,1 +41027,1 39033,0 -35031,0 +35031,1 38047,0 36105,0 51690,0 30023,0 29047,0 12071,0 -48191,1 -21075,1 +48191,0 +21075,0 19057,0 18063,0 40113,0 -22101,1 +22101,0 38087,0 13289,0 37121,0 45017,0 36071,0 -46095,1 +46095,0 48441,0 47071,0 13109,0 @@ -1364,20 +1364,20 @@ FIPS,Result 45065,0 17021,0 47087,0 -36079,1 +36079,0 8115,0 10001,0 45039,0 22091,0 -24041,0 +24041,1 21165,0 25013,0 -36067,0 +36067,1 31089,0 38001,0 46075,0 29053,0 -27127,1 +27127,0 6073,1 8069,1 51175,0 @@ -1385,7 +1385,7 @@ FIPS,Result 13143,0 18051,0 54051,0 -6065,0 +6065,1 17183,0 35043,1 21089,0 @@ -1398,14 +1398,14 @@ FIPS,Result 40097,0 51127,0 1009,0 -37147,0 +37147,1 37103,0 47065,0 55101,0 29137,0 31151,0 -27109,1 -36027,1 +27109,0 +36027,0 6045,0 54039,0 8001,0 @@ -1431,11 +1431,11 @@ FIPS,Result 29037,0 16085,0 38075,0 -21189,1 +21189,0 29181,0 19085,0 37161,0 -46121,1 +46121,0 37195,0 48417,0 37113,0 @@ -1448,7 +1448,7 @@ FIPS,Result 55099,0 36041,0 31079,0 -44009,1 +44009,0 16049,0 39049,1 20209,0 @@ -1459,9 +1459,9 @@ FIPS,Result 5021,0 35045,0 21031,0 -21025,1 +21025,0 48163,0 -46017,1 +46017,0 48095,0 55123,0 12077,0 @@ -1477,7 +1477,7 @@ FIPS,Result 21041,0 48503,0 27173,0 -55063,1 +55063,0 29115,0 27005,0 29125,0 @@ -1494,7 +1494,7 @@ FIPS,Result 38027,0 5105,0 51033,0 -54047,1 +54047,0 28155,0 28097,0 13321,0 @@ -1508,7 +1508,7 @@ FIPS,Result 39127,0 5071,0 47125,0 -8037,0 +8037,1 38045,0 17153,0 29067,0 @@ -1525,9 +1525,9 @@ FIPS,Result 17095,0 18095,0 29027,0 -51540,0 +51540,1 37027,0 -54055,1 +54055,0 51059,1 48189,0 37179,0 @@ -1536,7 +1536,7 @@ FIPS,Result 21127,0 47045,0 51119,0 -1085,0 +1085,1 13037,0 47089,0 40067,0 diff --git a/submission_creative.csv b/submission_creative.csv index 20a59f6..e4517d3 100644 --- a/submission_creative.csv +++ b/submission_creative.csv @@ -43,17 +43,17 @@ FIPS,Result 19119,0 47083,0 42095,0 -6081,1 +6081,0 13313,0 -5123,1 +5123,0 28039,0 51760,1 48245,0 -41053,0 +41053,1 8025,0 5079,0 49043,0 -48489,0 +48489,1 22057,0 1021,0 29185,0 @@ -61,13 +61,13 @@ FIPS,Result 54081,0 6003,0 29225,0 -33015,0 +33015,1 42071,0 20065,0 37097,0 48283,0 46083,0 -31055,0 +31055,1 51580,0 37157,0 42033,0 @@ -76,7 +76,7 @@ FIPS,Result 13075,0 21123,0 48185,0 -13245,1 +13245,0 13001,0 31139,0 38025,0 @@ -85,12 +85,12 @@ FIPS,Result 48101,0 42037,0 19079,0 -12049,0 +12049,1 21131,0 13183,0 53035,1 48479,1 -21063,1 +21063,0 20101,0 48415,0 51185,0 @@ -99,7 +99,7 @@ FIPS,Result 55119,0 45075,1 47101,0 -26089,1 +26089,0 48367,0 37131,1 21205,0 @@ -109,7 +109,7 @@ FIPS,Result 1093,0 31101,0 5069,0 -8119,0 +8119,1 54037,0 13213,0 29089,0 @@ -134,13 +134,13 @@ FIPS,Result 22059,0 38017,0 26109,0 -49037,0 +49037,1 48265,0 39129,0 20055,0 21121,0 31173,1 -22035,1 +22035,0 8011,0 39027,0 17107,0 @@ -159,7 +159,7 @@ FIPS,Result 21021,0 42123,0 32015,0 -15009,0 +15009,1 42057,0 53065,0 48211,0 @@ -191,7 +191,7 @@ FIPS,Result 48083,0 45031,0 4023,1 -47095,0 +47095,1 12081,0 46119,0 12045,0 @@ -222,12 +222,12 @@ FIPS,Result 28081,0 29093,0 46003,0 -4027,1 -53025,1 +4027,0 +53025,0 26121,0 21203,0 21211,0 -44005,1 +44005,0 35005,0 30017,0 31107,0 @@ -307,7 +307,7 @@ FIPS,Result 26131,0 48177,0 8027,0 -23007,0 +23007,1 51145,0 51087,1 20157,0 @@ -335,8 +335,8 @@ FIPS,Result 17143,1 53067,1 21105,0 -24009,0 -1025,1 +24009,1 +1025,0 27067,0 19075,0 12037,0 @@ -351,15 +351,15 @@ FIPS,Result 27091,0 42099,0 42117,0 -13077,1 -51520,1 +13077,0 +51520,0 36051,0 54025,0 30049,0 1079,0 -37117,0 +37117,1 48133,0 -42041,1 +42041,0 49007,0 47143,0 55039,0 @@ -376,11 +376,11 @@ FIPS,Result 29029,0 13081,0 5111,0 -51133,0 +51133,1 6057,1 21225,0 -48141,1 -28073,1 +48141,0 +28073,0 27135,0 37129,0 37025,0 @@ -407,19 +407,19 @@ FIPS,Result 53001,0 19019,0 39043,0 -34001,0 +34001,1 31113,0 27145,0 47105,0 36117,0 48043,1 -45063,1 +45063,0 16057,1 48157,0 12073,1 41067,1 29187,0 -8049,0 +8049,1 40133,0 37183,1 54005,0 @@ -432,7 +432,7 @@ FIPS,Result 48341,0 47077,0 28135,1 -1013,0 +1013,1 40145,0 1111,0 37111,0 @@ -446,9 +446,9 @@ FIPS,Result 31087,0 17091,0 13293,0 -48461,0 +48461,1 28107,1 -8041,0 +8041,1 5049,0 27073,0 36055,1 @@ -479,7 +479,7 @@ FIPS,Result 27033,0 22047,0 37171,0 -51003,1 +51003,0 54041,0 17155,0 5075,0 @@ -493,7 +493,7 @@ FIPS,Result 13227,0 51011,0 36113,0 -54061,0 +54061,1 18135,0 21221,0 17011,0 @@ -538,7 +538,7 @@ FIPS,Result 51071,0 26113,0 8015,0 -19061,0 +19061,1 27049,0 31011,0 48357,0 @@ -558,8 +558,8 @@ FIPS,Result 13123,0 13283,0 31073,0 -51103,0 -36119,1 +51103,1 +36119,0 13229,0 48281,0 6039,0 @@ -577,7 +577,7 @@ FIPS,Result 48031,0 20175,0 41059,0 -27087,1 +27087,0 41061,0 48307,0 56005,0 @@ -640,23 +640,23 @@ FIPS,Result 32027,0 19181,0 13267,0 -17007,1 +17007,0 48137,0 -8035,1 +8035,0 34025,1 18007,0 -4013,0 +4013,1 54073,0 26045,0 29011,0 -55135,0 +55135,1 30071,0 12093,0 6107,0 -1011,0 +1011,1 29035,0 31069,0 -26037,1 +26037,0 23003,0 13063,1 19109,0 @@ -668,20 +668,20 @@ FIPS,Result 47051,0 39105,0 38091,0 -54089,1 +54089,0 42051,0 51081,1 -22055,1 +22055,0 51029,0 51093,0 22025,0 51750,1 35011,0 -49005,1 +49005,0 48197,0 31125,0 1107,0 -45085,1 +45085,0 51061,0 13091,0 5131,0 @@ -698,7 +698,7 @@ FIPS,Result 51157,0 31081,0 27125,0 -1035,1 +1035,0 13163,0 31057,0 13257,0 @@ -716,14 +716,14 @@ FIPS,Result 24047,0 6085,1 45079,1 -46011,0 +46011,1 48145,0 49039,0 45051,0 39113,0 -12086,1 +12086,0 21223,0 -39057,1 +39057,0 18157,1 18133,0 28021,1 @@ -740,7 +740,7 @@ FIPS,Result 48231,0 24043,0 51017,0 -8021,1 +8021,0 27053,1 18145,0 37031,0 @@ -762,8 +762,8 @@ FIPS,Result 5037,0 25021,1 42133,0 -8077,0 -13021,1 +8077,1 +13021,0 6075,1 37093,1 37055,0 @@ -776,7 +776,7 @@ FIPS,Result 34029,0 20099,0 22079,0 -13239,1 +13239,0 51079,0 31105,0 13249,0 @@ -788,9 +788,9 @@ FIPS,Result 31109,0 55137,0 39063,0 -5017,1 +5017,0 54035,0 -26061,1 +26061,0 28101,0 8087,0 31043,0 @@ -815,12 +815,12 @@ FIPS,Result 50017,1 38081,0 35017,0 -51117,0 +51117,1 51161,0 19157,0 26157,0 47135,0 -13153,1 +13153,0 48431,0 20075,0 20069,0 @@ -832,7 +832,7 @@ FIPS,Result 9007,0 27061,0 27081,0 -51153,1 +51153,0 46071,0 50015,1 39061,1 @@ -848,7 +848,7 @@ FIPS,Result 47133,0 46097,0 35013,1 -22071,1 +22071,0 22015,0 35019,0 37145,0 @@ -866,11 +866,11 @@ FIPS,Result 36017,0 35053,0 48067,0 -18057,0 +18057,1 6101,0 48465,1 51740,1 -20037,1 +20037,0 17093,1 40055,0 26117,0 @@ -887,7 +887,7 @@ FIPS,Result 26051,0 29171,0 54013,0 -26073,1 +26073,0 48291,0 51660,1 39007,0 @@ -909,7 +909,7 @@ FIPS,Result 30077,0 48419,0 20073,0 -37067,0 +37067,1 55013,0 30013,0 42065,0 @@ -930,13 +930,13 @@ FIPS,Result 20129,0 18023,0 51031,0 -22097,1 +22097,0 16051,0 27147,0 48039,0 20139,0 12095,1 -51650,0 +51650,1 1087,1 54069,0 39133,0 @@ -977,11 +977,11 @@ FIPS,Result 48323,1 46085,0 54093,0 -36019,1 +36019,0 19165,0 45003,0 51810,0 -26053,0 +26053,1 22045,0 38003,0 12099,0 @@ -989,8 +989,8 @@ FIPS,Result 1091,1 51023,0 13085,0 -36007,0 -45041,1 +36007,1 +45041,0 22023,0 51053,0 40131,0 @@ -1014,7 +1014,7 @@ FIPS,Result 20137,0 13189,0 6055,1 -22065,1 +22065,0 55033,0 1081,1 31001,0 @@ -1025,12 +1025,12 @@ FIPS,Result 17039,0 1083,0 21213,0 -13219,1 +13219,0 17157,0 48153,0 54001,0 54083,0 -9015,0 +9015,1 4017,1 29145,0 5099,0 @@ -1041,7 +1041,7 @@ FIPS,Result 13187,0 40149,0 20171,0 -5035,1 +5035,0 13231,0 40009,0 38069,0 @@ -1051,9 +1051,9 @@ FIPS,Result 37091,0 27045,0 8013,1 -13197,0 +13197,1 32005,0 -17161,1 +17161,0 12023,0 40059,0 42127,0 @@ -1067,7 +1067,7 @@ FIPS,Result 13047,0 23011,0 17193,0 -22005,1 +22005,0 48249,1 34031,1 18183,0 @@ -1075,7 +1075,7 @@ FIPS,Result 19007,0 13295,0 34019,1 -55079,0 +55079,1 17027,0 41029,0 25027,1 @@ -1089,7 +1089,7 @@ FIPS,Result 27103,0 31033,0 47175,0 -1047,1 +1047,0 40095,0 55077,0 19169,1 @@ -1097,8 +1097,8 @@ FIPS,Result 55125,0 26145,0 4009,1 -47037,0 -36077,0 +47037,1 +36077,1 54105,0 27017,0 12053,0 @@ -1122,7 +1122,7 @@ FIPS,Result 22027,0 48047,1 46043,0 -18003,0 +18003,1 17013,0 51700,1 29097,0 @@ -1135,7 +1135,7 @@ FIPS,Result 13235,0 48025,0 18013,0 -35006,1 +35006,0 36037,0 27129,0 23017,0 @@ -1151,24 +1151,24 @@ FIPS,Result 27023,0 29139,0 27059,0 -32003,0 +32003,1 21197,0 49049,0 1115,0 40075,0 51143,0 -16039,1 +16039,0 16011,0 -6017,0 +6017,1 37011,0 29133,0 55075,0 -36059,1 +36059,0 30097,0 26065,1 40039,0 -18043,0 -18127,0 +18043,1 +18127,1 48485,0 23001,0 36021,0 @@ -1209,12 +1209,12 @@ FIPS,Result 19177,0 28043,0 13165,0 -53057,1 +53057,0 51610,1 27139,0 16009,0 22093,0 -38005,0 +38005,1 48147,0 48423,0 54099,0 @@ -1232,7 +1232,7 @@ FIPS,Result 34023,1 8055,1 28105,1 -48309,0 +48309,1 6105,1 21057,0 18073,0 @@ -1287,9 +1287,9 @@ FIPS,Result 49035,0 20063,0 18079,0 -13125,0 +13125,1 17191,0 -44003,1 +44003,0 48317,0 38089,0 48505,1 @@ -1328,7 +1328,7 @@ FIPS,Result 23013,0 13139,0 28009,0 -51025,0 +51025,1 13095,1 41027,1 39033,0 @@ -1367,11 +1367,11 @@ FIPS,Result 36079,0 8115,0 10001,0 -45039,0 +45039,1 22091,0 24041,0 21165,0 -25013,0 +25013,1 36067,1 31089,0 38001,0 @@ -1387,7 +1387,7 @@ FIPS,Result 54051,0 6065,1 17183,0 -35043,1 +35043,0 21089,0 16077,0 22043,0 @@ -1406,7 +1406,7 @@ FIPS,Result 31151,0 27109,0 36027,0 -6045,0 +6045,1 54039,0 8001,0 21027,0 @@ -1415,7 +1415,7 @@ FIPS,Result 29123,0 46079,0 31121,0 -5107,1 +5107,0 34015,1 18121,0 1099,0 @@ -1436,11 +1436,11 @@ FIPS,Result 19085,0 37161,0 46121,1 -37195,0 +37195,1 48417,0 37113,0 48007,0 -51179,1 +51179,0 12015,0 4012,0 29063,0 @@ -1448,20 +1448,20 @@ FIPS,Result 55099,0 36041,0 31079,0 -44009,1 +44009,0 16049,0 39049,1 -20209,1 -47069,0 +20209,0 +47069,1 29205,0 -5007,1 +5007,0 6029,1 5021,0 35045,1 21031,0 21025,0 -48163,0 -46017,0 +48163,1 +46017,1 48095,0 55123,0 12077,0 @@ -1477,7 +1477,7 @@ FIPS,Result 21041,0 48503,0 27173,0 -55063,0 +55063,1 29115,0 27005,0 29125,0 @@ -1531,7 +1531,7 @@ FIPS,Result 51059,1 48189,0 37179,0 -53055,1 +53055,0 48063,0 21127,0 47045,0 @@ -1541,7 +1541,7 @@ FIPS,Result 47089,0 40067,0 8101,0 -17119,1 +17119,0 26059,0 12047,0 18021,0