Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Numericalizer abc #227

Open
wants to merge 28 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
f82aee7
WIP: DatasetABC
ivansmokovic Oct 26, 2020
7dde35a
WIP: DatasetABC
ivansmokovic Oct 27, 2020
eced615
updated Dataset
ivansmokovic Oct 27, 2020
09a7b5e
updated ArrowDataset
ivansmokovic Oct 27, 2020
4f9d055
updated iterator.py
ivansmokovic Oct 27, 2020
9884c06
flake8 fixes
ivansmokovic Oct 27, 2020
73f711a
getattr -> dict notation
ivansmokovic Oct 28, 2020
7395f93
fixed datasetABC __repr__
ivansmokovic Oct 29, 2020
ed11aad
_get -> __getitem__
ivansmokovic Oct 29, 2020
09d401d
Small changes
ivansmokovic Oct 29, 2020
44bfbcd
made ._fields a tuple
ivansmokovic Nov 3, 2020
3b9c4aa
Added shuffled
ivansmokovic Nov 3, 2020
15be987
removed as_dataset
ivansmokovic Nov 3, 2020
7fcec4d
Added type hint to shuffled
ivansmokovic Nov 3, 2020
cb0e447
Added type hint overloads to __getitem__
ivansmokovic Nov 3, 2020
aa192b3
black correction
ivansmokovic Nov 3, 2020
23234dd
WIP: NumericalizerABC prototype
ivansmokovic Nov 5, 2020
07ec682
style changes
ivansmokovic Nov 5, 2020
bca8b63
Merge remote-tracking branch 'origin/master' into numericalizer-abc
ivansmokovic Dec 9, 2020
65906ec
Merged master
ivansmokovic Dec 9, 2020
125a015
minor style changes
ivansmokovic Dec 9, 2020
7a26a04
TODO docs
ivansmokovic Dec 9, 2020
5668253
_finalize -> mark_finalized
ivansmokovic Dec 10, 2020
f110c5f
black
ivansmokovic Dec 10, 2020
e28a777
docs
ivansmokovic Dec 10, 2020
affa494
Merge remote-tracking branch 'origin/master' into numericalizer-abc
ivansmokovic Dec 10, 2020
dac186c
added error message
ivansmokovic Dec 10, 2020
56beaea
black compliance
ivansmokovic Dec 10, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
143 changes: 9 additions & 134 deletions podium/arrow/arrow_tabular_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
import shutil
import tempfile
from collections import defaultdict
from typing import Any, Callable, Dict, Iterable, Iterator, List, Tuple, Union
from typing import Any, Dict, Iterable, Iterator, List, Tuple, Union

from podium.datasets import Dataset
from podium.datasets import Dataset, DatasetABC
from podium.storage import ExampleFactory, Field, unpack_fields
from podium.storage.example_factory import Example

Expand Down Expand Up @@ -36,7 +36,7 @@ def _group(iterable, n):
yield chunk


class ArrowDataset:
class ArrowDataset(DatasetABC):
"""Podium dataset implementation which uses PyArrow as its data storage backend.
Examples are stored in a file which is then memory mapped for fast random access.
"""
Expand Down Expand Up @@ -78,11 +78,10 @@ def __init__(
inferred data type if possible.
"""
self.cache_path = cache_path
self.fields = unpack_fields(fields)
self.field_dict = {field.name: field for field in fields}
self.mmapped_file = mmapped_file
self.table = table
self.data_types = data_types
super().__init__(fields)

@staticmethod
def from_dataset(
Expand Down Expand Up @@ -562,36 +561,10 @@ def dump_cache(self, cache_path: str = None) -> str:

return cache_path

@property
def examples(self):
def _get_examples(self) -> List[Example]:
"""Loads this ArrowDataset into memory and returns a list containing
the loaded Examples."""
return self.as_dataset().examples

def as_dataset(self) -> Dataset:
    """Load this ArrowDataset fully into memory.

    Returns
    -------
    Dataset
        An in-memory Dataset holding every Example of this ArrowDataset.
    """
    loaded = ArrowDataset._recordbatch_to_examples(self.table, self.fields)
    return Dataset(list(loaded), self.fields)

def batch(self):
    """Create an input and a target batch covering the whole dataset.

    The batch format is identical to the batches produced by the
    Iterator class.

    Returns
    -------
    input_batch, target_batch
        Two objects containing the input and target batches computed
        over every example in this dataset.
    """
    # TODO custom batch method?
    in_memory = self.as_dataset()
    return in_memory.batch()
return list(ArrowDataset._recordbatch_to_examples(self.table, self.fields))

@staticmethod
def _field_values(
Expand Down Expand Up @@ -632,7 +605,9 @@ def _field_values(

return zip(raw_values, tokenized_values)

def __getitem__(self, item, deep_copy=False) -> Union[Example, "ArrowDataset"]:
def __getitem__(
self, item: Union[int, Iterable[int], slice]
) -> Union[Example, "ArrowDataset"]:
"""Returns an example or a new ArrowDataset containing the indexed examples.
If indexed with an int, only the example at that position will be returned.
If Indexed with a slice or iterable, all examples indexed by the object
Expand All @@ -658,9 +633,6 @@ def __getitem__(self, item, deep_copy=False) -> Union[Example, "ArrowDataset"]:
item: int or slice or iterable
Index used to index examples.

deep_copy: bool
Not used.

Returns
-------
Example or Dataset
Expand Down Expand Up @@ -712,69 +684,6 @@ def __iter__(self) -> Iterator[Example]:
"""
yield from self._recordbatch_to_examples(self.table, self.fields)

def __getattr__(self, fieldname) -> Iterator[Tuple[Any, Any]]:
    """Iterate over the raw and tokenized values of one field for all examples.

    Parameters
    ----------
    fieldname: str
        Name of the field whose values are to be iterated over.

    Returns
    -------
    Iterator[Tuple[raw, tokenized]]
        Iterator over (raw, tokenized) value pairs of every example
        in this dataset.

    Raises
    ------
    AttributeError
        If this dataset has no field with the given name.
    """
    # Guard clause: anything that is not a known field is a genuine
    # missing attribute.
    if fieldname not in self.field_dict:
        raise AttributeError(f"Dataset has no field {fieldname}.")
    return ArrowDataset._field_values(self.table, fieldname)

def filter(self, predicate: Callable[[Example], bool]) -> "ArrowDataset":
    """Create a new ArrowDataset keeping only Examples accepted by the predicate.

    Parameters
    ----------
    predicate : Callable[[Example], bool]
        Callable used as a filtering predicate. It takes an Example as a
        parameter and returns True if the Example is to be accepted, and
        False otherwise.

    Returns
    -------
    ArrowDataset
        New ArrowDataset containing only the accepted Examples.
    """
    kept_indices = []
    for position, example in enumerate(self):
        if predicate(example):
            kept_indices.append(position)
    return self[kept_indices]

def sorted(self, key: Callable[[Example], Any], reverse=False) -> "ArrowDataset":
    """Return a new ArrowDataset whose Examples are ordered by ``key``.

    Parameters
    ----------
    key: Callable[[Example], Any]
        Extracts a comparable value from an Example. That value
        determines the ordering of the Examples.

    reverse: bool
        If True, the returned dataset is sorted in descending order.

    Returns
    -------
    ArrowDataset
        An ArrowDataset containing this dataset's Examples in sorted order.
    """
    # Sort example indices by the key of the example they point to,
    # then index the dataset once with the ordered index list.
    ordered_indices = sorted(
        range(len(self)), key=lambda i: key(self[i]), reverse=reverse
    )
    return self[ordered_indices]

def close(self):
""" Closes resources held by the ArrowDataset."""
if self.mmapped_file is not None:
Expand All @@ -790,37 +699,3 @@ def delete_cache(self):
if self.mmapped_file is not None:
self.close()
shutil.rmtree(self.cache_path)

def finalize_fields(self, *datasets):
    """Build vocabularies of the non-eager fields, then finalize all fields.

    Vocabularies are built by iterating over the Dataset objects passed
    as positional arguments; if none are provided, this dataset itself
    is used as the data source.

    Parameters
    ----------
    *datasets
        A variable number of Dataset objects from which to build the
        vocabularies for non-eager fields. If none provided, the
        vocabularies are built from this Dataset (self).
    """
    # Only non-eager, vocab-backed fields still need their vocabularies built.
    pending_fields = [f for f in self.fields if not f.eager and f.use_vocab]
    if pending_fields:
        # Keep only the arguments that are actual Dataset instances.
        sources = [ds for ds in datasets if isinstance(ds, Dataset)]

        # Fall back to this dataset when no other source was given.
        if not sources:
            sources = [self]

        # Feed every example from every source into each pending
        # field's vocabulary.
        for source in sources:
            for example in source:
                for field in pending_fields:
                    field.update_vocab(*example[field.name])

    for field in self.fields:
        field.finalize()
2 changes: 2 additions & 0 deletions podium/datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Package contains datasets"""

from .dataset import Dataset, rationed_split, stratified_split
from .dataset_abc import DatasetABC
from .hierarhical_dataset import HierarchicalDataset
from .impl.catacx_dataset import CatacxDataset
from .impl.conllu_dataset import CoNLLUDataset
Expand All @@ -21,6 +22,7 @@

__all__ = [
"Dataset",
"DatasetABC",
"TabularDataset",
"HierarchicalDataset",
"stratified_split",
Expand Down
Loading