Skip to content

Commit

Permalink
Merge pull request #5 from pwwang/dev
Browse files Browse the repository at this point in the history
0.0.3
  • Loading branch information
pwwang authored May 1, 2021
2 parents 8638216 + 5c241c6 commit b714ab2
Show file tree
Hide file tree
Showing 12 changed files with 108 additions and 65 deletions.
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ repos:
rev: 5df1a4bf6f04a1ed3a643167b38d502575e29aef
hooks:
- id: trailing-whitespace
exclude: 'docs/'
- id: end-of-file-fixer
- id: check-yaml
- repo: local
Expand Down
16 changes: 7 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,15 +46,15 @@ df >> mutate(z=if_else(f.x>1, 1, 0))
df >> filter(f.x>1)
"""# output:
x y
2 2 two
3 3 three
0 2 two
1 3 three
"""

df >> mutate(z=if_else(f.x>1, 1, 0)) >> filter(f.z==1)
"""# output:
x y z
2 2 two 1
3 3 three 1
0 2 two 1
1 3 three 1
"""
```

Expand All @@ -65,10 +65,9 @@ from datar.base import sin, pi
from plotnine import ggplot, aes, geom_line, theme_classic

df = tibble(x=numpy.linspace(0, 2*pi, 500))
(
df >>
mutate(y=sin(f.x), sign=if_else(f.y>=0, "positive", "negative")) >>
ggplot(aes(x='x', y='y')) + theme_classic()
(df >>
mutate(y=sin(f.x), sign=if_else(f.y>=0, "positive", "negative")) >>
ggplot(aes(x='x', y='y')) + theme_classic()
) + geom_line(aes(color='sign'), size=1.2)
```

Expand All @@ -79,7 +78,6 @@ df = tibble(x=numpy.linspace(0, 2*pi, 500))
# for example: klib
import klib
from pipda import register_verb
from datar.core.contexts import Context
from datar.datasets import iris
from datar.dplyr import pull

Expand Down
2 changes: 1 addition & 1 deletion datar/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
from .core import operator as _datar_operator
from .core.defaults import f

__version__ = '0.0.2'
__version__ = '0.0.3'
12 changes: 6 additions & 6 deletions datar/dplyr/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,24 +13,24 @@
from .group_data import group_vars

# n used directly in count
@register_func(context=Context.EVAL)
@register_func(context=Context.EVAL, summarise_prefers_input=True)
def n(series: Iterable[Any]) -> int:
    """Return the size of the current group.

    Args:
        series: The data of the current group.

    Returns:
        The number of elements in `series`, as reported by `len()`.
    """
    size = len(series)
    return size

@register_func(DataFrame, verb_arg_only=True)
@register_func(DataFrame, verb_arg_only=True, summarise_prefers_input=True)
def cur_data_all(_data: DataFrame) -> DataFrame:
    """Give the data of the current group, grouping variables included.

    Args:
        _data: The data frame of the current group.

    Returns:
        `_data` itself, unmodified.
    """
    current = _data
    return current

@register_func(DataFrame, verb_arg_only=True)
@register_func(DataFrame, verb_arg_only=True, summarise_prefers_input=True)
def cur_data(_data: DataFrame) -> DataFrame:
    """Give the data of the current group, grouping variables excluded.

    Args:
        _data: The data frame of the current group.

    Returns:
        The columns of `_data` that are not reported by `group_vars` as
        grouping variables.
    """
    # Keep only the columns that are not grouping variables.
    non_group_columns = setdiff(_data.columns, group_vars(_data))
    return _data[non_group_columns]

@register_func(DataFrame, verb_arg_only=True)
@register_func(DataFrame, verb_arg_only=True, summarise_prefers_input=True)
def cur_group(_data: DataFrame) -> DataFrame:
"""gives the group keys, a tibble with one row and one column for
each grouping variable."""
Expand All @@ -41,12 +41,12 @@ def cur_group(_data: DataFrame) -> DataFrame:
gdata = _data.attrs['group_data']
return gdata.iloc[[index], :-1]

@register_func(DataFrame, verb_arg_only=True)
@register_func(DataFrame, verb_arg_only=True, summarise_prefers_input=True)
def cur_group_id(_data: DataFrame) -> int:
    """Give the numeric identifier of the current group.

    Args:
        _data: The data frame of the current group.

    Returns:
        The value stored under 'group_index' in `_data.attrs`, or 1 when
        that key is absent.
    """
    attrs = _data.attrs
    if 'group_index' in attrs:
        return attrs['group_index']
    return 1

@register_func(DataFrame, verb_arg_only=True)
@register_func(DataFrame, verb_arg_only=True, summarise_prefers_input=True)
def cur_group_rows(_data: DataFrame) -> List[int]:
"""gives the row indices for the current group."""
index = _data.attrs.get('group_index', None)
Expand Down
30 changes: 26 additions & 4 deletions datar/dplyr/summarise.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import Any, Iterable, Mapping, Optional, Union
from pandas import DataFrame
from pipda import register_verb, evaluate_expr
from pipda.function import Function

from ..core.defaults import DEFAULT_COLUMN_PREFIX
from ..core.contexts import Context
Expand Down Expand Up @@ -35,6 +36,23 @@ def summarise(
>>> # 0 10 20
>>> df >> summarise(y=sum(f.x), z=f.x+f.y) # fail
But they should not be mixed in later arguments. For example:
>>> df = tibble(x=[1,2,3,4], g=list('aabb')) >> group_by(f.g)
>>> df >> summarise(n=n() + f.x)
>>> # as expected:
>>> g n
>>> # 0 a 3
>>> # 1 a 4
>>> # 2 b 5
>>> # 3 b 6
>>> # [Groups: ['g'] (n=2)]
>>> # However:
>>> df >> summarise(y=1, n=n() + f.y)
>>> # n() will recycle the output instead of the input
>>> # g y n
>>> # 0 a 1 2
>>> # 1 b 1 2
Args:
_groups: Grouping structure of the result.
- "drop_last": dropping the last level of grouping.
Expand Down Expand Up @@ -140,11 +158,15 @@ def summarise_build(

out = group_keys(_data)
for key, val in named.items():
envdata = out
if out.shape[1] == 0 or (
isinstance(val, Function) and
getattr(val.func, 'summarise_prefers_input', False)
):
envdata = _data

try:
if out.shape[1] == 0:
val = evaluate_expr(val, _data, context)
else:
val = evaluate_expr(val, out, context)
val = evaluate_expr(val, envdata, context)
except ColumnNotExistingError:
# also recycle input
val = evaluate_expr(val, _data, context)
Expand Down
2 changes: 1 addition & 1 deletion datar/stats/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""APIs for stats"""

from .funcs import rnorm, rpois, runif, quantile, sd
from .funcs import rnorm, rpois, runif, quantile, sd, weighted_mean
from .verbs import setNames
34 changes: 32 additions & 2 deletions datar/stats/funcs.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
"""Some functions ported from R-stats"""
from typing import Any, Iterable, List
from typing import Any, Iterable, List, Optional

import numpy
from pipda import register_func

from ..core.types import FloatOrIter, SeriesLikeType
from ..core.types import (
FloatOrIter, NumericOrIter, NumericType, SeriesLikeType, is_scalar
)
from ..core.contexts import Context

# pylint: disable=redefined-builtin, redefined-outer-name
Expand Down Expand Up @@ -86,3 +88,31 @@ def sd(
numpy.nanstd(series, ddof=ddof) if na_rm
else numpy.std(series, ddof=ddof)
)

@register_func(None, context=Context.EVAL)
def weighted_mean(
        # pylint: disable=invalid-name
        x: NumericOrIter,
        w: Optional[NumericOrIter] = None,
        na_rm: bool = False
) -> NumericType:
    """Compute the weighted arithmetic mean of `x`.

    Args:
        x: The value(s) to average. A value that `is_scalar` reports as
            scalar is wrapped into a one-element sequence.
        w: The weight(s), one per element of `x`; a scalar is wrapped the
            same way as `x`. When `None`, no weights are passed to
            `numpy.average`.
        na_rm: Whether to drop the NaN entries of `x` (and the weights at
            the same positions) before averaging.

    Returns:
        The result of `numpy.average`, or NaN when weights are given and
        they sum to zero.

    Raises:
        ValueError: If `x` and `w` have different lengths.
    """
    values = numpy.array([x] if is_scalar(x) else x)

    weights = w
    if weights is not None:
        weights = numpy.array([weights] if is_scalar(weights) else weights)
        if len(values) != len(weights):
            raise ValueError("'x' and 'w' must have the same length")

    if na_rm:
        # The mask comes from `x` only; weights are filtered to stay aligned
        keep = ~numpy.isnan(values)
        values = values[keep]
        if weights is not None:
            weights = weights[keep]

    if weights is None:
        return numpy.average(values, weights=None)
    # A zero total weight cannot be normalized; answer NaN instead
    if sum(weights) == 0:
        return numpy.nan
    return numpy.average(values, weights=weights)
4 changes: 4 additions & 0 deletions docs/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,6 @@
## 0.0.3
- Add stats.weighted_mean
- Allow functions to prefer recycling the input or the output in summarise

## 0.0.2
- Port verbs and functions from tidyverse/dplyr and test them with original cases
3 changes: 3 additions & 0 deletions docs/TODO.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@

- Add tests for tidyr from original tidyverse/tidyr cases
- Add more tests for base/core
- Port more functions from `r-base`, `r-stats`, etc
- Port more datasets from `r-datasets` namespace
- Add more detailed documentation
61 changes: 23 additions & 38 deletions docs/notebooks/readme.ipynb

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[tool.poetry]
name = "datar"
version = "0.0.2"
description = "Probably the closest port of tidyr, dplyr and tibble in python"
version = "0.0.3"
description = "Port of R data packages tidyr, dplyr, tibble and so on in python"
authors = ["pwwang <[email protected]>"]
license = "MIT"

Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
setup(
long_description=readme,
name='datar',
version='0.0.2',
description='Probably the closest port of tidyr, dplyr and tibble in python',
version='0.0.3',
description='Port of R data packages tidyr, dplyr, tibble and so on in python',
python_requires='==3.*,>=3.7.1',
author='pwwang',
author_email='[email protected]',
Expand Down

0 comments on commit b714ab2

Please sign in to comment.