Skip to content

Commit

Permalink
Merge pull request #9 from PySport/datasets
Browse files Browse the repository at this point in the history
Add datasets loader and bump version
  • Loading branch information
koenvo authored May 15, 2020
2 parents 584aa6b + 2a17efd commit 85cf2d2
Show file tree
Hide file tree
Showing 17 changed files with 203 additions and 11 deletions.
5 changes: 5 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,8 @@ v0.1.0, 2020-04-23 -- Initial release.
v0.2.0, 2020-05-05 -- Change interface of TrackingDataSerializer
Add Metrica Tracking Serializer including automated tests
Cleanup some import statements
v0.2.1, 2020-05-12 -- Add some helper functions to directly load a dataset by filename
v0.3.0, 2020-05-15 -- Add FIFA EPTS Tracking data loader
Add some examples
Add datasets loader to directly load dataset from your python code
Add limit argument to all loaders
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,13 @@ different tracking- and event data like a breeze. It aims to be the fundamental

## Main Features
Here are just a few of the things that kloppy does well:
- Directly load [**Public datasets**](#datasets) to get started right away.
- Understandable [**Standardized models**](#models) for tracking- and event datasets
- Out-of-the-box [**(De)serializing**](#serializing) tracking- and event data from different sources into standardized models and vice versa
- Flexible [**pitch dimensions**](#pitch-dimensions) transformer for changing a dataset's pitch dimensions from one to another (eg OPTA's 100x100 -> TRACAB meters)
- Intelligent [**orientation**](#orientation) transforming orientation of a dataset (eg from TRACAB fixed orientation to "Home Team" orientation)


## Where to get it
The source code is currently hosted on GitHub at:
https://github.com/PySport/kloppy
Expand Down Expand Up @@ -50,7 +52,15 @@ data_set = load_epts_tracking_data('meta.xml', 'raw_data.txt')

data_set = transform(data_set, pitch_dimensions=[[0, 108], [-34, 34]])
pandas_data_frame = to_pandas(data_set)
```

### <a name="datasets"></a>Public datasets / Very quick start
More and more companies are publishing (demo) datasets to get you started. Inspired by the `tensorflow_datasets` package,
we added a "dataset loader" which does all the heavy lifting for you: find urls, download files, organize and load them.
```python
from kloppy import datasets

data_set = datasets.load("metrica_tracking", options={'sample_rate': 1./12, 'limit': 10})
```

### <a name="models"></a>Standardized models
Expand Down
22 changes: 22 additions & 0 deletions examples/datasets/metrica.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from kloppy import datasets, to_pandas


def main():
    """
    Demonstrate loading the public Metrica tracking datasets and passing
    extra arguments through the dataset loader.
    """
    # 'metrica_tracking' loads the 'game1' dataset unless told otherwise.
    dataset = datasets.load("metrica_tracking", options={'sample_rate': 1./12, 'limit': 10})
    print(len(dataset.frames))

    # Extra keyword arguments are forwarded to the dataset builder;
    # here we ask for the second sample game instead.
    dataset = datasets.load("metrica_tracking", options={'limit': 1000}, game='game2')

    frame = to_pandas(dataset)
    print(frame)


if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions kloppy/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from .infra.serializers import *
from .helpers import *
from .infra import datasets
9 changes: 9 additions & 0 deletions kloppy/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,12 @@ def to_pandas(data_set: DataSet, _record_converter: Callable = None) -> 'DataFra
return pd.DataFrame.from_records(
map(_record_converter, data_set.records)
)


# Explicit public API for `from kloppy.helpers import *` (pulled into the
# package root by kloppy/__init__.py).
__all__ = [
    'load_tracab_tracking_data',
    'load_metrica_tracking_data',
    'load_epts_tracking_data',
    'to_pandas',
    'transform'
]
8 changes: 8 additions & 0 deletions kloppy/infra/datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# import for registration: importing the tracking sub-package creates its
# builder classes, whose metaclass fills the dataset registry as a side effect.
from . import tracking

from .core.loading import load

# `load` is the only public entry point of the datasets package.
__all__ = [
    'load'
]
1 change: 1 addition & 0 deletions kloppy/infra/datasets/core/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .builder import DatasetBuilder
15 changes: 15 additions & 0 deletions kloppy/infra/datasets/core/builder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from abc import abstractmethod
from typing import Dict, Type, Union

from ...serializers.tracking import TrackingDataSerializer
from .registered import RegisteredDataset


class DatasetBuilder(metaclass=RegisteredDataset):
    """
    Base class for dataset builders.

    Subclassing automatically registers the builder in the dataset registry
    under the snake-cased class name (see `RegisteredDataset`), making it
    loadable via `datasets.load(<name>)`.
    """

    @abstractmethod
    def get_data_set_files(self, **kwargs) -> Dict[str, str]:
        # Returns a mapping of serializer input key -> remote file url.
        # NOTE(review): annotation corrected from Dict[str, Dict[str, str]] —
        # concrete builders and `get_local_files` both use a flat str -> url map.
        raise NotImplementedError

    @abstractmethod
    def get_serializer_cls(self) -> Type[TrackingDataSerializer]:
        # Returns the serializer class used to deserialize the local files.
        raise NotImplementedError
67 changes: 67 additions & 0 deletions kloppy/infra/datasets/core/loading.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import os

import requests

from typing import Dict, Union

from kloppy.domain import DataSet, TrackingDataSet

from .registered import _DATASET_REGISTRY


def download_file(url, local_filename):
    """Stream the resource at `url` to `local_filename` in 8 KiB chunks."""
    with requests.get(url, stream=True) as response:
        # Fail fast on HTTP errors instead of writing an error page to disk.
        response.raise_for_status()
        with open(local_filename, 'wb') as output:
            for chunk in response.iter_content(chunk_size=8192):
                output.write(chunk)


def get_local_files(data_set_name: str, files: Dict[str, str]) -> Dict[str, str]:
    """
    Map remote dataset files to local cache paths, downloading missing ones.

    :param data_set_name: dataset name; used as the sub-directory below the
        cache base directory.
    :param files: mapping of file key -> remote url.
    :return: mapping of file key -> local file path.
    """
    # Cache location is overridable via the KLOPPY_BASE_DIR environment variable.
    datasets_base_dir = os.environ.get('KLOPPY_BASE_DIR', None)
    if not datasets_base_dir:
        datasets_base_dir = os.path.expanduser('~/kloppy_datasets')

    dataset_base_dir = f'{datasets_base_dir}/{data_set_name}'
    if not os.path.exists(dataset_base_dir):
        os.makedirs(dataset_base_dir)

    local_files = {}
    for file_key, file_url in files.items():
        # Keep the remote file name so re-runs find the cached copy. The
        # original body interpolated a garbled literal instead of {filename},
        # leaving `filename` unused and collapsing all files onto one path.
        filename = file_url.split('/')[-1]
        local_filename = f'{dataset_base_dir}/{filename}'
        if not os.path.exists(local_filename):
            print(f'Downloading {filename}...')
            download_file(file_url, local_filename)
            print('Done')
        local_files[file_key] = local_filename
    return local_files


def load(data_set_name: str, options=None, **dataset_kwargs) -> TrackingDataSet:
    """
    Load a public dataset by name, downloading its files on first use.

    :param data_set_name: registry key of the dataset (e.g. 'metrica_tracking').
    :param options: options dict forwarded to the serializer's `deserialize`.
    :param dataset_kwargs: extra keyword arguments forwarded to the dataset
        builder (e.g. `game='game2'` for the metrica dataset).
    :return: the deserialized data set.
    :raises ValueError: when no dataset is registered under `data_set_name`.
    """
    if data_set_name not in _DATASET_REGISTRY:
        raise ValueError(f"Dataset {data_set_name} not found")

    builder_cls = _DATASET_REGISTRY[data_set_name]
    builder = builder_cls()

    dataset_remote_files = builder.get_data_set_files(**dataset_kwargs)
    dataset_local_files = get_local_files(data_set_name, dataset_remote_files)

    # Open the files one by one inside the try-block so already-opened handles
    # are closed if a later `open` fails (a dict comprehension outside the
    # try would leak them).
    file_handlers = {}
    try:
        for local_file_key, local_file_name in dataset_local_files.items():
            file_handlers[local_file_key] = open(local_file_name, 'rb')

        serializer_cls = builder.get_serializer_cls()
        serializer = serializer_cls()
        data_set = serializer.deserialize(
            inputs=file_handlers,
            options=options
        )
    finally:
        for fp in file_handlers.values():
            fp.close()
    return data_set
27 changes: 27 additions & 0 deletions kloppy/infra/datasets/core/registered.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import inspect
import re
import abc
from typing import Type, Dict


# Patterns used for the CamelCase -> snake_case conversion below.
_first_cap_re = re.compile("(.)([A-Z][a-z0-9]+)")
_all_cap_re = re.compile("([a-z0-9])([A-Z])")

# 'DatasetBuilder' is a string forward reference: importing it here would be
# circular, since builder.py imports this module.
_DATASET_REGISTRY: Dict[str, Type['DatasetBuilder']] = {}


def camelcase_to_snakecase(name):
    """Convert a camel-case string to its snake-case equivalent."""
    with_boundaries = _first_cap_re.sub(r"\1_\2", name)
    return _all_cap_re.sub(r"\1_\2", with_boundaries).lower()


class RegisteredDataset(abc.ABCMeta):
    """
    Metaclass for dataset builders.

    Every concrete (non-abstract) class created with this metaclass is added
    to the module-level dataset registry under the snake-cased class name,
    which is also stored on the class as its `name` attribute.
    """

    def __new__(mcs, cls_name, bases, class_dict):
        dataset_name = camelcase_to_snakecase(cls_name)
        class_dict["name"] = dataset_name
        new_cls = super().__new__(mcs, cls_name, bases, class_dict)
        # Abstract classes (e.g. the DatasetBuilder base itself) are not
        # registered — only loadable, concrete builders.
        if not inspect.isabstract(new_cls):
            _DATASET_REGISTRY[dataset_name] = new_cls
        return new_cls
1 change: 1 addition & 0 deletions kloppy/infra/datasets/tracking/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .metrica import MetricaTracking
25 changes: 25 additions & 0 deletions kloppy/infra/datasets/tracking/metrica.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from typing import Dict, Type

from ..core.builder import DatasetBuilder
from ...serializers.tracking import TrackingDataSerializer, MetricaTrackingSerializer


# Remote locations of the public Metrica Sports sample games. Each entry maps
# a serializer input key ('raw_data_home'/'raw_data_away') to the raw CSV url.
_DATASET_URLS = {
    'game1': {
        'raw_data_home': 'https://raw.githubusercontent.com/metrica-sports/sample-data/master/data/Sample_Game_1/Sample_Game_1_RawTrackingData_Home_Team.csv',
        'raw_data_away': 'https://raw.githubusercontent.com/metrica-sports/sample-data/master/data/Sample_Game_1/Sample_Game_1_RawTrackingData_Away_Team.csv'
    },
    'game2': {
        'raw_data_home': 'https://raw.githubusercontent.com/metrica-sports/sample-data/master/data/Sample_Game_2/Sample_Game_2_RawTrackingData_Home_Team.csv',
        'raw_data_away': 'https://raw.githubusercontent.com/metrica-sports/sample-data/master/data/Sample_Game_2/Sample_Game_2_RawTrackingData_Away_Team.csv'
    }
}


class MetricaTracking(DatasetBuilder):
    """Dataset builder for the public Metrica Sports sample tracking data."""

    def get_data_set_files(self, **kwargs) -> Dict[str, str]:
        """
        Return the remote file urls for one sample game.

        :param kwargs: supports `game` ('game1' or 'game2'; default 'game1').
        :raises KeyError: for an unknown game, listing the available options.
        """
        game = kwargs.get('game', 'game1')
        if game not in _DATASET_URLS:
            # Same exception type as a plain lookup would raise, but with a
            # message that tells the caller what the valid options are.
            raise KeyError(
                f"Unknown game '{game}'; available: {list(_DATASET_URLS)}"
            )
        return _DATASET_URLS[game]

    def get_serializer_cls(self) -> Type[TrackingDataSerializer]:
        """Return the serializer class that can deserialize these files."""
        return MetricaTrackingSerializer
2 changes: 1 addition & 1 deletion kloppy/infra/serializers/tracking/epts/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def _set_current_data_spec(idx):
yield row

n += 1
if limit and n > limit:
if limit and n >= limit:
break

if frame_id >= end_frame_id:
Expand Down
2 changes: 1 addition & 1 deletion kloppy/infra/serializers/tracking/metrica.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ def deserialize(self, inputs: Dict[str, Readable], options: Dict = None) -> Trac
)

n += 1
if limit and n > limit:
if limit and n >= limit:
break

orientation = (
Expand Down
3 changes: 1 addition & 2 deletions kloppy/infra/serializers/tracking/tracab.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,8 +179,7 @@ def _iter():
attacking_direction=attacking_direction_from_frame(frame)
)

n += 1
if limit and n > limit:
if limit and n >= limit:
break

orientation = (
Expand Down
11 changes: 6 additions & 5 deletions kloppy/tests/test_helpers.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import os
from io import BytesIO

from pandas import DataFrame
from pandas.testing import assert_frame_equal

from kloppy import MetricaTrackingSerializer, to_pandas, load_metrica_tracking_data, load_tracab_tracking_data, \
TrackingDataSet, PitchDimensions, Dimension, Orientation, Frame, transform
from kloppy.domain import Period, DataSetFlag, Point, AttackingDirection
from kloppy.infra.utils import performance_logging
from kloppy import to_pandas, load_metrica_tracking_data, load_tracab_tracking_data, transform
from kloppy.domain import (
Period, DataSetFlag, Point, AttackingDirection,
TrackingDataSet, PitchDimensions, Dimension,
Orientation, Frame
)


class TestHelpers:
Expand Down
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

setup(
name='kloppy',
version='0.2.1',
version='0.3.0',
author='Koen Vossen',
author_email='[email protected]',
url="https://github.com/PySport/kloppy",
Expand All @@ -26,7 +26,8 @@
"Topic :: Scientific/Engineering"
],
install_requires=[
'lxml>=4.5.0'
'lxml>=4.5.0',
'requests>=2.0.0'
],
extras_require={
'test': [
Expand Down

0 comments on commit 85cf2d2

Please sign in to comment.