Skip to content

Commit

Permalink
fix(matchhistory): different encoding since 2024/25 (#788)
Browse files Browse the repository at this point in the history
  • Loading branch information
probberechts authored Jan 16, 2025
1 parent 1b81895 commit dada484
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 13 deletions.
1 change: 1 addition & 0 deletions soccerdata/_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@
},
}
logging.config.dictConfig(logging_config)
logging.captureWarnings(True)
logger = logging.getLogger("root")
logger.handlers[0] = RichHandler(markup=True)

Expand Down
28 changes: 22 additions & 6 deletions soccerdata/match_history.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,35 @@

import itertools
from pathlib import Path
from typing import Callable, Optional, Union
from typing import IO, Callable, Optional, Union

import pandas as pd

from ._common import BaseRequestsReader, make_game_id
from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS
from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS, logger

MATCH_HISTORY_DATA_DIR = DATA_DIR / "MatchHistory"
MATCH_HISTORY_API = "https://www.football-data.co.uk"


def _parse_csv(raw_data: IO[bytes], lkey: str, skey: str) -> pd.DataFrame:
logger.info("Parsing league=%s season=%s", lkey, skey)
if int(skey) >= 2425:
# Since 2024-25, the CSV files are encoded in UTF-8-SIG
df_games = pd.read_csv(
raw_data,
encoding="UTF-8-SIG",
on_bad_lines="warn",
)
else:
df_games = pd.read_csv(
raw_data,
encoding="latin-1",
on_bad_lines="warn",
)
return df_games


class MatchHistory(BaseRequestsReader):
"""Provides pd.DataFrames from CSV files available at http://www.football-data.co.uk/data.php.
Expand Down Expand Up @@ -92,12 +110,10 @@ def read_games(self) -> pd.DataFrame:
filepath = self.data_dir / filemask.format(lkey, skey)
url = urlmask.format(skey, lkey)
current_season = not self._is_complete(lkey, skey)

reader = self.get(url, filepath, no_cache=current_season)
df_games = _parse_csv(reader, lkey, skey).assign(season=skey)

df_games = pd.read_csv(
reader,
encoding="ISO-8859-1",
).assign(season=skey)
if "Time" not in df_games.columns:
df_games["Time"] = "12:00"
df_games["Time"] = df_games["Time"].fillna("12:00")
Expand Down
6 changes: 3 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ def elo() -> sd.ClubElo:


@pytest.fixture()
def match_epl_2y() -> sd.MatchHistory:
"""Return a MatchHistory instance for the last 2 years of the EPL."""
return sd.MatchHistory("ENG-Premier League", list(range(2018, 2020)))
def match_epl_5y() -> sd.MatchHistory:
"""Return a MatchHistory instance for the last 5 years of the EPL."""
return sd.MatchHistory("ENG-Premier League", list(range(2019, 2025)))


@pytest.fixture()
Expand Down
9 changes: 5 additions & 4 deletions tests/test_MatchHistory.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
from soccerdata.match_history import MatchHistory


def test_read_games(match_epl_2y: MatchHistory) -> None:
def test_read_games(match_epl_5y: MatchHistory) -> None:
"""It should return a DataFrame with all games from the selected leagues and seasons."""
df = match_epl_2y.read_games()
df = match_epl_5y.read_games()
assert isinstance(df, pd.DataFrame)
assert len(df.index.get_level_values("season").unique()) == 2
assert len(df) == 760
assert len(df.index.get_level_values("season").unique()) == 5
assert len(df) == 2107
assert not any("" in c for c in df.columns)

0 comments on commit dada484

Please sign in to comment.