Scores are in each file, along with values

MLDERES · Aug 21, 2020 · ce7f75b · ce7f75b
1 parent 3398072
commit ce7f75b
Show file tree

Hide file tree

Showing 6 changed files with 136 additions and 31 deletions.
diff --git a/conf/base/scoring.yml b/conf/base/scoring.yml
@@ -38,4 +38,6 @@ full_ppr:
 half_ppr:
     <<: *standard_scoring
     rcv_rec: 0.5
-
+
+custom:
+    <<: *standard_scoring
diff --git a/notebooks/analysis.ipynb b/notebooks/analysis.ipynb
@@ -24,9 +24,7 @@
     "proj_path = current_dir.parent  # point back to the root of the project\n",
     "context = load_context(proj_path)\n",
     "catalog = context.catalog\n",
-    "\n",
-    "from phantasyfootballer.settings import *\n",
-    "from phantasyfootballer.common import *"
+    "\n"
    ]
   },
   {
@@ -37,8 +35,32 @@
    },
    "outputs": [],
    "source": [
+    "from phantasyfootballer.settings import *\n",
+    "from phantasyfootballer.common import Stats\n",
     "df_ppr = catalog.load('scoring.ppr')\n",
-    "df_ppr.head()"
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_ppr.sort_values(Stats.POS_RANK)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "\n",
+    "df_ppr[Stats.POS_RANK] = df_ppr.groupby(POSITION)[Stats.FANTASY_POINTS].rank(na_option=\"bottom\", ascending=False)\n",
+    "df_ppr.sort_values(Stats.POS_RANK, ascending=True)"
    ]
   },
   {

diff --git a/notebooks/sample.ipynb b/notebooks/sample.ipynb
@@ -76,6 +76,48 @@
     "df_right['NEW_COL'] = 'new_data'\n",
     "combine_data_horizontal(df_left,df_right)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from phantasyfootballer.settings import *\n",
+    "from phantasyfootballer.common import Stats\n",
+    "data = context.catalog.load('scoring.ppr')\n",
+    "#data.head()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "pos_data = data.groupby(POSITION)[Stats.FANTASY_POINTS].mean()\n",
+    "joined = data.join(pos_data,on=POSITION,rsuffix='avg')\n",
+    "data[Stats.PCT_AVERAGE] = joined[Stats.FANTASY_POINTS]/joined['fpavg']\n",
+    "joined.head()\n",
+    "data.head()"
+   ]
   }
  ],
  "metadata": {

diff --git a/src/phantasyfootballer/common.py b/src/phantasyfootballer/common.py
@@ -65,8 +65,15 @@ class Stats():
     FP_HALF = "fp_hppr"
     FP_FULL = "fp_ppr"
     FANTASY_POINTS = "fp"
-
-    ALL_STATS = [
+    RANK = 'overall_rank'
+    PCT_TYPICAL_POS = 'percent_typical_position'
+    PCT_MEAN_POS = 'percent_average_position'
+    PCT_MEDIAN_POS = 'percent_median_position'
+    PCT_TYPICAL_OVR = 'percent_typical_overall'
+    PCT_MEAN_OVR = 'percent_average_overall'
+    PCT_MEDIAN_OVR= 'percent_median_overall'
+
+    ALL_STATS= [
         PASS_ATT,
         PASS_COMP,
         PASS_YDS,
@@ -87,7 +94,15 @@ class Stats():
         DST_SAFE,
         DST_PA,
         MISC_FL,
-        FANTASY_POINTS
+        RANK,
+        POS_RANK,
+        FANTASY_POINTS,
+        PCT_TYPICAL_POS,
+        PCT_MEAN_POS,
+        PCT_MEDIAN_POS,
+        PCT_TYPICAL_OVR,
+        PCT_MEAN_OVR,
+        PCT_MEDIAN_OVR 
     ]
 
     @staticmethod

diff --git a/src/phantasyfootballer/pipelines/data_engineering/nodes.py b/src/phantasyfootballer/pipelines/data_engineering/nodes.py
@@ -5,25 +5,8 @@
 from kedro.config import ConfigLoader
 from functools import reduce, partial, update_wrapper
 
-
 String_or_List = Union[str, List[str]]
 
-
-def normalize_data_source(data: pd.DataFrame, stat_name: str, common_stats: Dict[str, any]) -> pd.DataFrame:
-    """
-    This node will take a data source that is provided and adjust the stats so that they have 
-    a common stat column name.  Additionally, if there is a stat that is common to the entire dataset
-    (e.g. NFL week, NFL year, all qbs) that isn't already part of the file then this will be set as well.
-
-    Say for instance, that the provider returns a file called 2019_passing_stats.  The column NFL Year is
-    not likley included, so you can have it included by specifying that in the common_stats dictionary.
-
-    The mapping from a provider column name and the common name are taking from conf/project/parameters.yml
-    """
-    pass
-
-
-
 def _craft_scoring_dict(scheme: str) -> Dict[str, Any]:
     """
     Look up the scoring system in the scoring.yml file 
@@ -57,7 +40,7 @@ def _calculate_projected_points(scoring: String_or_List, data: pd.DataFrame) ->
         for c in data.columns:
             if (m := score_map.get(c)) :
                 df_pts[c + "_pts"] = data[c] * m
-        data[FANTASY_POINTS] = round(df_pts.sum(axis=1), 2)
+        data[Stats.FANTASY_POINTS] = round(df_pts.sum(axis=1), 2)
 
     return data
 
@@ -99,24 +82,44 @@ def average_stats_by_player(*dataframes: Sequence[pd.DataFrame]) -> pd.DataFrame
     # Pull all the dataframes into a single one
     df_all = pd.concat(dataframes)
     # Get the mean keeping the columns that matter
-    df_all = df_all.groupby(["player", "team", "position"]).mean().fillna(0)
+    df_all = df_all.groupby([PLAYER_NAME, TEAM, POSITION]).mean().fillna(0)
     # Drop all the players where they have 0 projections
     df_all = df_all[df_all.sum(axis=1) > 0].reset_index()
+    # Drop positions we don't care about 
+    df_all = df_all.query('position in ["QB","RB","TE","WR","DST"]').reset_index(drop=True)
     return df_all
 
 def calculate_player_rank(data: pd.DataFrame) -> pd.DataFrame:
+    """
+    Calculate player ranking in a few different ways
+
+    Args:
+        data (pd.DataFrame): dataframe of player stats including projected fantasy points
+
+    Returns:
+        pd.DataFrame: same dataframe with additional columns for player rank (overall) and by position
+    """    
+    # Calculate overall rank by points
+    data[Stats.RANK] = data[Stats.FANTASY_POINTS].rank(na_option="bottom", ascending=False)
+    # Calculate rank by position
+    data[Stats.POS_RANK] = data.groupby(POSITION)[Stats.FANTASY_POINTS].rank(na_option="bottom", ascending=False)
     return data
 
 def percent_mean(data: pd.DataFrame) -> pd.DataFrame:
     """
-    Calculate the overall rank and position rank using the score provided
+    Calculate the mean points for each position, then express each player's points as a fraction of
+    the mean for their position and of the mean across all players
 
     Args:
         data (pd.DataFrame): the dataframe that has the players and a single scoring scheme
 
     Returns:
-        pd.DataFrame: updated dataframe with two new columns, rank and position rank
+        pd.DataFrame: updated with percentage
     """
+    pos_data = data.groupby(POSITION)[Stats.FANTASY_POINTS].mean()
+    joined = data.join(pos_data,on=POSITION,rsuffix='avg')
+    data[Stats.PCT_MEAN_POS] = joined[Stats.FANTASY_POINTS]/joined['fpavg']
+    data[Stats.PCT_MEAN_OVR] = data[Stats.FANTASY_POINTS]/(data[Stats.FANTASY_POINTS].mean())
     return data
 
 
@@ -131,6 +134,10 @@ def percent_typical(data: pd.DataFrame) -> pd.DataFrame:
         pd.DataFrame: updated dataframe with a column that has identified the value of a player 
         relative to the typical player in his position
     """
+    pos_data = data.groupby(POSITION)[Stats.FANTASY_POINTS].median()
+    joined = data.join(pos_data,on=POSITION,rsuffix='_med')
+    data[Stats.PCT_MEDIAN_POS] = joined[Stats.FANTASY_POINTS]/joined['fp_med']
+    data[Stats.PCT_MEDIAN_OVR] = data[Stats.FANTASY_POINTS]/(data[Stats.FANTASY_POINTS].median())
     return data
 
 

diff --git a/src/phantasyfootballer/pipelines/data_engineering/pipeline.py b/src/phantasyfootballer/pipelines/data_engineering/pipeline.py
@@ -58,6 +58,15 @@
         )
     ]
 )
+score_custom_pipeline = Pipeline(
+    [
+        node(
+            calculate_projected_points("custom"),
+            "average_stats_by_player_data",
+            "scored_custom_data",
+        )
+    ]
+)
 ranking_pipeline = Pipeline(
     [
         node(calculate_player_rank, "scored_data", "ranked_data", name="overall_rank_node"),
@@ -77,7 +86,8 @@
         ),
     ]
 )
-
+# Each of the following pipelines does the ranking for one
+# scoring type
 full_ppr_pipeline = pipeline(
     ranking_pipeline,
     inputs={"scored_data": "scored_ppr_data"},
@@ -98,15 +108,22 @@
     outputs={"final_score_data": "scoring.standard"},
     namespace="std",
 )
-
+full_custom_pipeline = pipeline(
+    ranking_pipeline,
+    inputs={"scored_data": "scored_custom_data"},
+    outputs={"final_score_data": "scoring.custom"},
+    namespace="custom",
+)
 
 final_scoring_ranking_pipeline = (
     score_ppr_pipeline
     + score_half_ppr_pipeline
     + score_std_pipeline
+    + score_custom_pipeline  # score the custom scheme so its ranking pipeline has an input
     + full_ppr_pipeline
     + full_half_ppr_pipeline
     + full_standard_pipeline
+    + full_custom_pipeline  # rank the custom scheme (repeating the standard pipeline adds nothing)
 )