diff --git a/conf/base/scoring.yml b/conf/base/scoring.yml index dad53c4..348cba3 100644 --- a/conf/base/scoring.yml +++ b/conf/base/scoring.yml @@ -38,4 +38,6 @@ full_ppr: half_ppr: <<: *standard_scoring rcv_rec: 0.5 - \ No newline at end of file + +custom: + <<: *standard_scoring \ No newline at end of file diff --git a/notebooks/analysis.ipynb b/notebooks/analysis.ipynb index df9eba7..04760cd 100644 --- a/notebooks/analysis.ipynb +++ b/notebooks/analysis.ipynb @@ -24,9 +24,7 @@ "proj_path = current_dir.parent # point back to the root of the project\n", "context = load_context(proj_path)\n", "catalog = context.catalog\n", - "\n", - "from phantasyfootballer.settings import *\n", - "from phantasyfootballer.common import *" + "\n" ] }, { @@ -37,8 +35,32 @@ }, "outputs": [], "source": [ + "from phantasyfootballer.settings import *\n", + "from phantasyfootballer.common import Stats\n", "df_ppr = catalog.load('scoring.ppr')\n", - "df_ppr.head()" + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_ppr.sort_values(Stats.POS_RANK)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "\n", + "df_ppr[Stats.POS_RANK] = df_ppr.groupby(POSITION)[Stats.FANTASY_POINTS].rank(na_option=\"bottom\", ascending=False)\n", + "df_ppr.sort_values(Stats.POS_RANK, ascending=True)" ] }, { diff --git a/notebooks/sample.ipynb b/notebooks/sample.ipynb index 231f4f9..f0eeeca 100644 --- a/notebooks/sample.ipynb +++ b/notebooks/sample.ipynb @@ -76,6 +76,48 @@ "df_right['NEW_COL'] = 'new_data'\n", "combine_data_horizontal(df_left,df_right)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from phantasyfootballer.settings import *\n", + "from phantasyfootballer.common import Stats\n", + "data = context.catalog.load('scoring.ppr')\n", + "#data.head()\n" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "\n", + "\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "pos_data = data.groupby(POSITION)[Stats.FANTASY_POINTS].mean()\n", + "joined = data.join(pos_data,on=POSITION,rsuffix='avg')\n", + "data[Stats.PCT_AVERAGE] = joined[Stats.FANTASY_POINTS]/joined['fpavg']\n", + "joined.head()\n", + "data.head()" + ] } ], "metadata": { diff --git a/src/phantasyfootballer/common.py b/src/phantasyfootballer/common.py index ed4d0bf..43bfdc2 100644 --- a/src/phantasyfootballer/common.py +++ b/src/phantasyfootballer/common.py @@ -65,8 +65,15 @@ class Stats(): FP_HALF = "fp_hppr" FP_FULL = "fp_ppr" FANTASY_POINTS = "fp" - - ALL_STATS = [ + RANK = 'overall_rank' + PCT_TYPICAL_POS = 'percent_typical_position' + PCT_MEAN_POS = 'percent_average_position' + PCT_MEDIAN_POS = 'percent_median_position' + PCT_TYPICAL_OVR = 'percent_typical_overall' + PCT_MEAN_OVR = 'percent_average_overall' + PCT_MEDIAN_OVR= 'percent_median_overall' + + ALL_STATS= [ PASS_ATT, PASS_COMP, PASS_YDS, @@ -87,7 +94,15 @@ class Stats(): DST_SAFE, DST_PA, MISC_FL, - FANTASY_POINTS + RANK, + POS_RANK, + FANTASY_POINTS, + PCT_TYPICAL_POS, + PCT_MEAN_POS, + PCT_MEDIAN_POS, + PCT_TYPICAL_OVR, + PCT_MEAN_OVR, + PCT_MEDIAN_OVR ] @staticmethod diff --git a/src/phantasyfootballer/pipelines/data_engineering/nodes.py b/src/phantasyfootballer/pipelines/data_engineering/nodes.py index b2a3ec5..60612c8 100644 --- a/src/phantasyfootballer/pipelines/data_engineering/nodes.py +++ b/src/phantasyfootballer/pipelines/data_engineering/nodes.py @@ -5,25 +5,8 @@ from kedro.config import ConfigLoader from functools import reduce, partial, update_wrapper - String_or_List = Union[str, List[str]] - -def normalize_data_source(data: pd.DataFrame, stat_name: str, common_stats: Dict[str, any]) -> pd.DataFrame: - """ - This node will take a 
data source that is provided and adjust the stats so that they have - a common stat column name. Additionally, if there is a stat that is common to the entire dataset - (e.g. NFL week, NFL year, all qbs) that isn't already part of the file then this will be set as well. - - Say for instance, that the provider returns a file called 2019_passing_stats. The column NFL Year is - not likley included, so you can have it included by specifying that in the common_stats dictionary. - - The mapping from a provider column name and the common name are taking from conf/project/parameters.yml - """ - pass - - - def _craft_scoring_dict(scheme: str) -> Dict[str, Any]: """ Look up the scoring system in the scoring.yml file @@ -57,7 +40,7 @@ def _calculate_projected_points(scoring: String_or_List, data: pd.DataFrame) -> for c in data.columns: if (m := score_map.get(c)) : df_pts[c + "_pts"] = data[c] * m - data[FANTASY_POINTS] = round(df_pts.sum(axis=1), 2) + data[Stats.FANTASY_POINTS] = round(df_pts.sum(axis=1), 2) return data @@ -99,24 +82,44 @@ def average_stats_by_player(*dataframes: Sequence[pd.DataFrame]) -> pd.DataFrame # Pull all the dataframes into a single one df_all = pd.concat(dataframes) # Get the mean keeping the columns that matter - df_all = df_all.groupby(["player", "team", "position"]).mean().fillna(0) + df_all = df_all.groupby([PLAYER_NAME, TEAM, POSITION]).mean().fillna(0) # Drop all the players where they have 0 projections df_all = df_all[df_all.sum(axis=1) > 0].reset_index() + # Drop positions we don't care about (filter on the POSITION constant, matching the groupby above) + df_all = df_all[df_all[POSITION].isin(["QB","RB","TE","WR","DST"])].reset_index(drop=True) return df_all def calculate_player_rank(data: pd.DataFrame) -> pd.DataFrame: + """ + Calculate player ranking in a few different ways + + Args: + data (pd.DataFrame): dataframe of player stats including projected fantasy points + + Returns: + pd.DataFrame: same dataframe with additional columns for player rank (overall) and by position + """ + # Calculate overall 
rank by points + data[Stats.RANK] = data[Stats.FANTASY_POINTS].rank(na_option="bottom", ascending=False) + # Calculate rank by position + data[Stats.POS_RANK] = data.groupby(POSITION)[Stats.FANTASY_POINTS].rank(na_option="bottom", ascending=False) return data def percent_mean(data: pd.DataFrame) -> pd.DataFrame: """ - Calculate the overall rank and position rank using the score provided + Calculate the mean points for the player position, then determine how much more value this player has than + other players in the same position Args: data (pd.DataFrame): the dataframe that has the players and a single scoring scheme Returns: - pd.DataFrame: updated dataframe with two new columns, rank and position rank + pd.DataFrame: updated with percentage of the position mean and of the overall mean """ + pos_data = data.groupby(POSITION)[Stats.FANTASY_POINTS].mean() + joined = data.join(pos_data,on=POSITION,rsuffix='avg') + data[Stats.PCT_MEAN_POS] = joined[Stats.FANTASY_POINTS]/joined[Stats.FANTASY_POINTS + 'avg'] + data[Stats.PCT_MEAN_OVR] = data[Stats.FANTASY_POINTS]/(data[Stats.FANTASY_POINTS].mean()) return data @@ -131,6 +134,10 @@ def percent_typical(data: pd.DataFrame) -> pd.DataFrame: pd.DataFrame: updated dataframe with a column that has identified the value of a player relative to the typical player in his position """ + pos_data = data.groupby(POSITION)[Stats.FANTASY_POINTS].median() + joined = data.join(pos_data,on=POSITION,rsuffix='_med') + data[Stats.PCT_MEDIAN_POS] = joined[Stats.FANTASY_POINTS]/joined[Stats.FANTASY_POINTS + '_med'] + data[Stats.PCT_MEDIAN_OVR] = data[Stats.FANTASY_POINTS]/(data[Stats.FANTASY_POINTS].median()) return data diff --git a/src/phantasyfootballer/pipelines/data_engineering/pipeline.py b/src/phantasyfootballer/pipelines/data_engineering/pipeline.py index 90bf071..464a18f 100644 --- a/src/phantasyfootballer/pipelines/data_engineering/pipeline.py +++ b/src/phantasyfootballer/pipelines/data_engineering/pipeline.py @@ -58,6 +58,15 @@ ) ] ) +score_custom_pipeline = Pipeline( + [ + node( + 
calculate_projected_points("custom"), + "average_stats_by_player_data", + "scored_custom_data", + ) + ] +) ranking_pipeline = Pipeline( [ node(calculate_player_rank, "scored_data", "ranked_data", name="overall_rank_node"), @@ -77,7 +86,8 @@ ), ] ) - +# Each of the following pipelines is here to do the ranking for one +# scoring type full_ppr_pipeline = pipeline( ranking_pipeline, inputs={"scored_data": "scored_ppr_data"}, @@ -98,15 +108,22 @@ outputs={"final_score_data": "scoring.standard"}, namespace="std", ) - +full_custom_pipeline = pipeline( + ranking_pipeline, + inputs={"scored_data": "scored_custom_data"}, + outputs={"final_score_data": "scoring.custom"}, + namespace="custom", +) final_scoring_ranking_pipeline = ( score_ppr_pipeline + score_half_ppr_pipeline + score_std_pipeline + + score_custom_pipeline + full_ppr_pipeline + full_half_ppr_pipeline + full_standard_pipeline + + full_custom_pipeline )