Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Pandas dynamically create StringDtype aliases #77

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion src/visions/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
"""Core functionality"""
import pandas as pd

from visions import types, typesets, utils
from visions.dtypes.boolean import BoolDtype

# Only import the bundled BoolDtype when running on pandas 0.x.
# `pd.__version__` is a string, so its major component has to be compared with
# the string "0"; comparing it with the integer 0 is always False, which would
# silently skip this import on every pandas version.
# NOTE(review): presumably pandas >= 1.0 ships its own nullable boolean dtype,
# which is why the bundled one is only wanted on 0.x — confirm.
if pd.__version__.split(".")[0] == "0":
    from visions.dtypes.boolean import BoolDtype

from visions.functional import (
cast_frame,
cast_series,
Expand Down
33 changes: 33 additions & 0 deletions src/visions/dtypes/stringdtype_alias.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from typing import Type

from pandas import StringDtype
from pandas.core.arrays import StringArray
from pandas.core.dtypes.dtypes import registry


def create_alias(name: str) -> Type[StringDtype]:
    """Create and register a ``StringDtype`` subclass that answers to ``name``.

    A dtype class and a matching ``StringArray`` subclass are built
    dynamically. Their class names are derived from ``name`` by turning each
    underscore-separated part into a capitalized word and appending ``Dtype``
    or ``Array`` (``"email"`` gives ``EmailDtype`` and ``EmailArray``).

    Because the generated dtype subclasses ``StringDtype``,
    ``isinstance(alias_dtype(), StringDtype)`` is ``True``.

    Args:
        name: The string alias for the new dtype, written in snake_case. It is
            stored as the dtype's ``name`` attribute.

    Returns:
        The newly created dtype class (the class itself, not an instance).
    """
    # Imported locally so this function relies only on pandas' public
    # extension API rather than on the private ``pandas.core`` registry object.
    from pandas.api.extensions import register_extension_dtype

    pascal_name = "".join(part.capitalize() for part in name.split("_"))
    dtype_name = f"{pascal_name}Dtype"
    array_name = f"{pascal_name}Array"

    alias_dtype = type(
        dtype_name,
        (StringDtype,),
        {"name": name, "__repr__": lambda self: dtype_name},
    )
    alias_array = type(array_name, (StringArray,), {})

    # NOTE(review): an earlier draft sketched a `_from_sequence` override for
    # the array class; it was never enabled — confirm whether it is needed.

    def constructor(self, values, copy=False):
        # Let StringArray do its normal setup, then swap in the alias dtype so
        # the array reports the alias rather than the plain string dtype.
        super(alias_array, self).__init__(values, copy)
        self._dtype = alias_dtype()

    # The two classes reference each other, so they are wired up after both
    # exist.
    alias_array.__init__ = constructor
    alias_dtype.construct_array_type = classmethod(lambda cls: alias_array)

    register_extension_dtype(alias_dtype)

    return alias_dtype
21 changes: 11 additions & 10 deletions src/visions/types/email_address.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,32 +6,35 @@
from visions.relations import IdentityRelation, InferenceRelation, TypeRelation
from visions.types.type import VisionsBaseType
from visions.utils.coercion import test_utils
from visions.utils.series_utils import nullable_series_contains
from visions.dtypes.stringdtype_alias import create_alias


EmailDtype = create_alias("email")


def str_to_email(s):
    """Coerce a single value to an ``FQDA``.

    An existing ``FQDA`` is returned unchanged. A string is split once on
    ``"@"`` and the resulting pieces are passed positionally to ``FQDA``.
    Any other kind of value yields ``None``.
    """
    if isinstance(s, FQDA):
        return s

    if not isinstance(s, str):
        return None

    # maxsplit=1: only the first "@" separates the two pieces.
    pieces = s.split("@", maxsplit=1)
    return FQDA(*pieces)


def to_email(series: pd.Series) -> pd.Series:
return series.apply(str_to_email)
return series.astype("email")


def _get_relations(cls) -> Sequence[TypeRelation]:
from visions.types import Object, String
from visions.types import String

relations = [
IdentityRelation(cls, Object),
IdentityRelation(cls, String),
InferenceRelation(
cls,
String,
relationship=test_utils.coercion_test(to_email),
relationship=test_utils.coercion_test(
lambda series: series.apply(str_to_email)
),
transformer=to_email,
),
]
Expand Down Expand Up @@ -67,7 +70,5 @@ def get_relations(cls) -> Sequence[TypeRelation]:
return _get_relations(cls)

@classmethod
@nullable_series_contains
def contains_op(cls, series: pd.Series) -> bool:
# TODO: x.local and x.fqdn for all, isinstance for a sample
return all(isinstance(x, FQDA) and all((x.local, x.fqdn)) for x in series)
return series.dtype == "email"
24 changes: 19 additions & 5 deletions src/visions/types/file.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,30 @@
import os

import pathlib
from typing import Sequence

import pandas as pd

from visions.relations import IdentityRelation, TypeRelation
from visions.relations import IdentityRelation, TypeRelation, InferenceRelation
from visions.types.type import VisionsBaseType
from visions.dtypes.stringdtype_alias import create_alias


def _get_relations(cls) -> Sequence[TypeRelation]:
from visions.types import Path
# Register the "file" string-dtype alias and keep a handle on the generated
# dtype class, the same way the email, image and url modules bind theirs
# (EmailDtype, ImageDtype, UrlDtype).
FileDtype = create_alias("file")

relations = [IdentityRelation(cls, Path)]

def _get_relations(cls) -> Sequence[TypeRelation]:
from visions.types import Path, String

relations = [
IdentityRelation(cls, String),
InferenceRelation(
cls,
Path,
relationship=lambda series: all(os.path.exists(p) for p in series),
transformer=lambda series: series.astype("file"),
),
]
return relations


Expand All @@ -30,4 +44,4 @@ def get_relations(cls) -> Sequence[TypeRelation]:

@classmethod
def contains_op(cls, series: pd.Series) -> bool:
return all(isinstance(p, pathlib.Path) and p.exists() for p in series)
return series.dtype == "file"
24 changes: 17 additions & 7 deletions src/visions/types/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,26 @@

import pandas as pd

from visions.relations import IdentityRelation, TypeRelation
from visions.relations import IdentityRelation, TypeRelation, InferenceRelation
from visions.types.type import VisionsBaseType
from visions.dtypes.stringdtype_alias import create_alias


def _get_relations(cls) -> Sequence[TypeRelation]:
from visions.types import File
ImageDtype = create_alias("image")


relations = [IdentityRelation(cls, File)]
def _get_relations(cls) -> Sequence[TypeRelation]:
from visions.types import File, String

relations = [
IdentityRelation(cls, String),
InferenceRelation(
cls,
File,
relationship=lambda series: all(imghdr.what(p) for p in series),
transformer=lambda series: series.astype("image"),
),
]
return relations


Expand All @@ -31,6 +43,4 @@ def get_relations(cls) -> Sequence[TypeRelation]:

@classmethod
def contains_op(cls, series: pd.Series) -> bool:
return all(
isinstance(p, Path) and p.exists() and imghdr.what(p) for p in series
)
return series.dtype == "image"
21 changes: 10 additions & 11 deletions src/visions/types/path.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import ntpath
import posixpath
import pathlib
from typing import Sequence

Expand All @@ -9,25 +11,22 @@

def string_is_path(series) -> bool:
try:
s = to_path(series.copy())
return s.apply(lambda x: x.is_absolute()).all()
return all(posixpath.isabs(x) for x in series) or all(
ntpath.isabs(x) for x in series
)
except TypeError:
return False


def to_path(series: pd.Series) -> pd.Series:
s = series.copy().apply(pathlib.PureWindowsPath)
if not s.apply(lambda x: x.is_absolute()).all():
return series.apply(pathlib.PurePosixPath)
else:
return s
def to_path(series):
return series.astype("path")


def _get_relations(cls) -> Sequence[TypeRelation]:
from visions.types import Object, String
from visions.types import String

relations = [
IdentityRelation(cls, Object),
IdentityRelation(cls, String),
InferenceRelation(
cls, String, relationship=string_is_path, transformer=to_path
),
Expand All @@ -51,4 +50,4 @@ def get_relations(cls) -> Sequence[TypeRelation]:

@classmethod
def contains_op(cls, series: pd.Series) -> bool:
return all(isinstance(x, pathlib.PurePath) and x.is_absolute() for x in series)
return series.dtype == "path"
18 changes: 9 additions & 9 deletions src/visions/types/url.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,28 @@

from visions.relations import IdentityRelation, InferenceRelation, TypeRelation
from visions.types.type import VisionsBaseType
from visions.utils.series_utils import nullable_series_contains
from visions.dtypes.stringdtype_alias import create_alias


UrlDtype = create_alias("url")


def test_url(series) -> bool:
try:
return to_url(series).apply(lambda x: x.netloc and x.scheme).all()
return all(urlparse(x) and x.netloc and x.scheme for x in series)
except AttributeError:
return False


def to_url(series: pd.Series) -> pd.Series:
return series.apply(urlparse)
return series.astype("url")


def _get_relations(cls) -> Sequence[TypeRelation]:
from visions.types import String, Object
from visions.types import String

relations = [
IdentityRelation(cls, Object),
IdentityRelation(cls, String),
InferenceRelation(cls, String, relationship=test_url, transformer=to_url),
]
return relations
Expand All @@ -46,8 +49,5 @@ def get_relations(cls) -> Sequence[TypeRelation]:
return _get_relations(cls)

@classmethod
@nullable_series_contains
def contains_op(cls, series: pd.Series) -> bool:
return all(
isinstance(x, ParseResult) and all((x.netloc, x.scheme)) for x in series
)
return series.dtype == "url"