Skip to content

Commit

Permalink
Merge pull request #9 from PySport/datasets
Browse files Browse the repository at this point in the history
Add datasets loader and bump version
  • Loading branch information
koenvo authored May 15, 2020
2 parents 584aa6b + 2a17efd commit 85cf2d2
Show file tree
Hide file tree
Showing 17 changed files with 203 additions and 11 deletions.
5 changes: 5 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,8 @@ v0.1.0, 2020-04-23 -- Initial release.
v0.2.0, 2020-05-05 -- Change interface of TrackingDataSerializer
Add Metrica Tracking Serializer including automated tests
Cleanup some import statements
v0.2.1, 2020-05-12 -- Add some helper functions to directly load a dataset by filename
v0.3.0, 2020-05-15 -- Add FIFA EPTS Tracking data loader
Add some examples
Add datasets loader to directly load dataset from your python code
Add limit argument to all loaders
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,13 @@ different tracking- and event data like a breeze. It aims to be the fundamental

## Main Features
Here are just a few of the things that kloppy does well:
- Directly load [**Public datasets**](#datasets) to get started right away.
- Understandable [**Standardized models**](#models) for tracking- and event datasets
- Out-of-the-box [**(De)serializing**](#serializing) tracking- and event data from different sources into standardized models and vice versa
- Flexible [**pitch dimensions**](#pitch-dimensions) transformer for changing a dataset's pitch dimensions from one to another (eg OPTA's 100x100 -> TRACAB meters)
- Intelligent [**orientation**](#orientation) transforming orientation of a dataset (eg from TRACAB fixed orientation to "Home Team" orientation)


## Where to get it
The source code is currently hosted on GitHub at:
https://github.com/PySport/kloppy
Expand Down Expand Up @@ -50,7 +52,15 @@ data_set = load_epts_tracking_data('meta.xml', 'raw_data.txt')

data_set = transform(data_set, pitch_dimensions=[[0, 108], [-34, 34]])
pandas_data_frame = to_pandas(data_set)
```

### <a name="datasets"></a>Public datasets / Very quick start
More and more companies are publishing (demo) datasets to get you started. Inspired by the `tensorflow_datasets` package,
we added a "dataset loader" which does all the heavy lifting for you: find urls, download files, organize and load them.
```python
from kloppy import datasets

data_set = datasets.load("metrica_tracking", options={'sample_rate': 1./12, 'limit': 10})
```

### <a name="models"></a>Standardized models
Expand Down
22 changes: 22 additions & 0 deletions examples/datasets/metrica.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from kloppy import datasets, to_pandas


def main():
    """
    Demonstrate loading the public Metrica tracking datasets and passing
    extra arguments through the dataset loader.
    """
    # 'metrica_tracking' loads the 'game1' dataset unless told otherwise.
    dataset = datasets.load("metrica_tracking", options={'sample_rate': 1./12, 'limit': 10})
    print(len(dataset.frames))

    # Extra keyword arguments are forwarded to the dataset builder;
    # here we ask for the second sample game instead.
    dataset = datasets.load("metrica_tracking", options={'limit': 1000}, game='game2')

    frame = to_pandas(dataset)
    print(frame)


if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions kloppy/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from .infra.serializers import *
from .helpers import *
from .infra import datasets
9 changes: 9 additions & 0 deletions kloppy/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,12 @@ def to_pandas(data_set: DataSet, _record_converter: Callable = None) -> 'DataFra
return pd.DataFrame.from_records(
map(_record_converter, data_set.records)
)


# Explicit public API for `from kloppy.helpers import *` (pulled into the
# package root by kloppy/__init__.py).
__all__ = [
    'load_tracab_tracking_data',
    'load_metrica_tracking_data',
    'load_epts_tracking_data',
    'to_pandas',
    'transform'
]
8 changes: 8 additions & 0 deletions kloppy/infra/datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# import for registration: importing the tracking sub-package creates its
# builder classes, whose metaclass fills the dataset registry as a side effect.
from . import tracking

from .core.loading import load

# `load` is the only public entry point of the datasets package.
__all__ = [
    'load'
]
1 change: 1 addition & 0 deletions kloppy/infra/datasets/core/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .builder import DatasetBuilder
15 changes: 15 additions & 0 deletions kloppy/infra/datasets/core/builder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from abc import abstractmethod
from typing import Dict, Type, Union

from ...serializers.tracking import TrackingDataSerializer
from .registered import RegisteredDataset


class DatasetBuilder(metaclass=RegisteredDataset):
    """
    Base class for dataset builders.

    Subclassing automatically registers the builder in the dataset registry
    under the snake-cased class name (see `RegisteredDataset`), making it
    loadable via `datasets.load(<name>)`.
    """

    @abstractmethod
    def get_data_set_files(self, **kwargs) -> Dict[str, str]:
        # Returns a mapping of serializer input key -> remote file url.
        # NOTE(review): annotation corrected from Dict[str, Dict[str, str]] —
        # concrete builders and `get_local_files` both use a flat str -> url map.
        raise NotImplementedError

    @abstractmethod
    def get_serializer_cls(self) -> Type[TrackingDataSerializer]:
        # Returns the serializer class used to deserialize the local files.
        raise NotImplementedError
67 changes: 67 additions & 0 deletions kloppy/infra/datasets/core/loading.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import os

import requests

from typing import Dict, Union

from kloppy.domain import DataSet, TrackingDataSet

from .registered import _DATASET_REGISTRY


def download_file(url, local_filename):
    """Stream the resource at `url` to `local_filename` in 8 KiB chunks."""
    with requests.get(url, stream=True) as response:
        # Fail fast on HTTP errors instead of writing an error page to disk.
        response.raise_for_status()
        with open(local_filename, 'wb') as output:
            for chunk in response.iter_content(chunk_size=8192):
                output.write(chunk)


def get_local_files(data_set_name: str, files: Dict[str, str]) -> Dict[str, str]:
    """
    Map remote dataset files to local cache paths, downloading missing ones.

    :param data_set_name: dataset name; used as the sub-directory below the
        cache base directory.
    :param files: mapping of file key -> remote url.
    :return: mapping of file key -> local file path.
    """
    # Cache location is overridable via the KLOPPY_BASE_DIR environment variable.
    datasets_base_dir = os.environ.get('KLOPPY_BASE_DIR', None)
    if not datasets_base_dir:
        datasets_base_dir = os.path.expanduser('~/kloppy_datasets')

    dataset_base_dir = f'{datasets_base_dir}/{data_set_name}'
    if not os.path.exists(dataset_base_dir):
        os.makedirs(dataset_base_dir)

    local_files = {}
    for file_key, file_url in files.items():
        # Keep the remote file name so re-runs find the cached copy. The
        # original body interpolated a garbled literal instead of {filename},
        # leaving `filename` unused and collapsing all files onto one path.
        filename = file_url.split('/')[-1]
        local_filename = f'{dataset_base_dir}/{filename}'
        if not os.path.exists(local_filename):
            print(f'Downloading {filename}...')
            download_file(file_url, local_filename)
            print('Done')
        local_files[file_key] = local_filename
    return local_files


def load(data_set_name: str, options=None, **dataset_kwargs) -> TrackingDataSet:
    """
    Load a public dataset by name, downloading its files on first use.

    :param data_set_name: registry key of the dataset (e.g. 'metrica_tracking').
    :param options: options dict forwarded to the serializer's `deserialize`.
    :param dataset_kwargs: extra keyword arguments forwarded to the dataset
        builder (e.g. `game='game2'` for the metrica dataset).
    :return: the deserialized data set.
    :raises ValueError: when no dataset is registered under `data_set_name`.
    """
    if data_set_name not in _DATASET_REGISTRY:
        raise ValueError(f"Dataset {data_set_name} not found")

    builder_cls = _DATASET_REGISTRY[data_set_name]
    builder = builder_cls()

    dataset_remote_files = builder.get_data_set_files(**dataset_kwargs)
    dataset_local_files = get_local_files(data_set_name, dataset_remote_files)

    # Open the files one by one inside the try-block so already-opened handles
    # are closed if a later `open` fails (a dict comprehension outside the
    # try would leak them).
    file_handlers = {}
    try:
        for local_file_key, local_file_name in dataset_local_files.items():
            file_handlers[local_file_key] = open(local_file_name, 'rb')

        serializer_cls = builder.get_serializer_cls()
        serializer = serializer_cls()
        data_set = serializer.deserialize(
            inputs=file_handlers,
            options=options
        )
    finally:
        for fp in file_handlers.values():
            fp.close()
    return data_set
27 changes: 27 additions & 0 deletions kloppy/infra/datasets/core/registered.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import inspect
import re
import abc
from typing import Type, Dict


# Patterns used for the CamelCase -> snake_case conversion below.
_first_cap_re = re.compile("(.)([A-Z][a-z0-9]+)")
_all_cap_re = re.compile("([a-z0-9])([A-Z])")

# 'DatasetBuilder' is a string forward reference: importing it here would be
# circular, since builder.py imports this module.
_DATASET_REGISTRY: Dict[str, Type['DatasetBuilder']] = {}


def camelcase_to_snakecase(name):
    """Convert a camel-case string to its snake-case equivalent."""
    with_boundaries = _first_cap_re.sub(r"\1_\2", name)
    return _all_cap_re.sub(r"\1_\2", with_boundaries).lower()


class RegisteredDataset(abc.ABCMeta):
    """
    Metaclass for dataset builders.

    Every concrete (non-abstract) class created with this metaclass is added
    to the module-level dataset registry under the snake-cased class name,
    which is also stored on the class as its `name` attribute.
    """

    def __new__(mcs, cls_name, bases, class_dict):
        dataset_name = camelcase_to_snakecase(cls_name)
        class_dict["name"] = dataset_name
        new_cls = super().__new__(mcs, cls_name, bases, class_dict)
        # Abstract classes (e.g. the DatasetBuilder base itself) are not
        # registered — only loadable, concrete builders.
        if not inspect.isabstract(new_cls):
            _DATASET_REGISTRY[dataset_name] = new_cls
        return new_cls
1 change: 1 addition & 0 deletions kloppy/infra/datasets/tracking/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .metrica import MetricaTracking
25 changes: 25 additions & 0 deletions kloppy/infra/datasets/tracking/metrica.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from typing import Dict, Type

from ..core.builder import DatasetBuilder
from ...serializers.tracking import TrackingDataSerializer, MetricaTrackingSerializer


# Remote locations of the public Metrica Sports sample games. Each entry maps
# a serializer input key ('raw_data_home'/'raw_data_away') to the raw CSV url.
_DATASET_URLS = {
    'game1': {
        'raw_data_home': 'https://raw.githubusercontent.com/metrica-sports/sample-data/master/data/Sample_Game_1/Sample_Game_1_RawTrackingData_Home_Team.csv',
        'raw_data_away': 'https://raw.githubusercontent.com/metrica-sports/sample-data/master/data/Sample_Game_1/Sample_Game_1_RawTrackingData_Away_Team.csv'
    },
    'game2': {
        'raw_data_home': 'https://raw.githubusercontent.com/metrica-sports/sample-data/master/data/Sample_Game_2/Sample_Game_2_RawTrackingData_Home_Team.csv',
        'raw_data_away': 'https://raw.githubusercontent.com/metrica-sports/sample-data/master/data/Sample_Game_2/Sample_Game_2_RawTrackingData_Away_Team.csv'
    }
}


class MetricaTracking(DatasetBuilder):
    """Dataset builder for the public Metrica Sports sample tracking data."""

    def get_data_set_files(self, **kwargs) -> Dict[str, str]:
        """
        Return the remote file urls for one sample game.

        :param kwargs: supports `game` ('game1' or 'game2'; default 'game1').
        :raises KeyError: for an unknown game, listing the available options.
        """
        game = kwargs.get('game', 'game1')
        if game not in _DATASET_URLS:
            # Same exception type as a plain lookup would raise, but with a
            # message that tells the caller what the valid options are.
            raise KeyError(
                f"Unknown game '{game}'; available: {list(_DATASET_URLS)}"
            )
        return _DATASET_URLS[game]

    def get_serializer_cls(self) -> Type[TrackingDataSerializer]:
        """Return the serializer class that can deserialize these files."""
        return MetricaTrackingSerializer
2 changes: 1 addition & 1 deletion kloppy/infra/serializers/tracking/epts/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def _set_current_data_spec(idx):
yield row

n += 1
if limit and n > limit:
if limit and n >= limit:
break

if frame_id >= end_frame_id:
Expand Down
2 changes: 1 addition & 1 deletion kloppy/infra/serializers/tracking/metrica.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ def deserialize(self, inputs: Dict[str, Readable], options: Dict = None) -> Trac
)

n += 1
if limit and n > limit:
if limit and n >= limit:
break

orientation = (
Expand Down
3 changes: 1 addition & 2 deletions kloppy/infra/serializers/tracking/tracab.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,8 +179,7 @@ def _iter():
attacking_direction=attacking_direction_from_frame(frame)
)

n += 1
if limit and n > limit:
if limit and n >= limit:
break

orientation = (
Expand Down
11 changes: 6 additions & 5 deletions kloppy/tests/test_helpers.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import os
from io import BytesIO

from pandas import DataFrame
from pandas.testing import assert_frame_equal

from kloppy import MetricaTrackingSerializer, to_pandas, load_metrica_tracking_data, load_tracab_tracking_data, \
TrackingDataSet, PitchDimensions, Dimension, Orientation, Frame, transform
from kloppy.domain import Period, DataSetFlag, Point, AttackingDirection
from kloppy.infra.utils import performance_logging
from kloppy import to_pandas, load_metrica_tracking_data, load_tracab_tracking_data, transform
from kloppy.domain import (
Period, DataSetFlag, Point, AttackingDirection,
TrackingDataSet, PitchDimensions, Dimension,
Orientation, Frame
)


class TestHelpers:
Expand Down
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

setup(
name='kloppy',
version='0.2.1',
version='0.3.0',
author='Koen Vossen',
author_email='[email protected]',
url="https://github.com/PySport/kloppy",
Expand All @@ -26,7 +26,8 @@
"Topic :: Scientific/Engineering"
],
install_requires=[
'lxml>=4.5.0'
'lxml>=4.5.0',
'requests>=2.0.0'
],
extras_require={
'test': [
Expand Down

0 comments on commit 85cf2d2

Please sign in to comment.