Merge pull request #5 from pwwang/dev

0.0.3
pwwang · May 1, 2021 · b714ab2 · b714ab2
2 parents 8638216 + 5c241c6
commit b714ab2
Show file tree

Hide file tree

Showing 12 changed files with 108 additions and 65 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -4,6 +4,7 @@ repos:
     rev: 5df1a4bf6f04a1ed3a643167b38d502575e29aef
     hooks:
     -   id: trailing-whitespace
+        exclude: 'docs/'
     -   id: end-of-file-fixer
     -   id: check-yaml
 -   repo: local

diff --git a/README.md b/README.md
@@ -46,15 +46,15 @@ df >> mutate(z=if_else(f.x>1, 1, 0))
 df >> filter(f.x>1)
 """# output:
    x      y
-2  2    two
-3  3  three
+0  2    two
+1  3  three
 """
 
 df >> mutate(z=if_else(f.x>1, 1, 0)) >> filter(f.z==1)
 """# output:
    x      y  z
-2  2    two  1
-3  3  three  1
+0  2    two  1
+1  3  three  1
 """
 ```
 
@@ -65,10 +65,9 @@ from datar.base import sin, pi
 from plotnine import ggplot, aes, geom_line, theme_classic
 
 df = tibble(x=numpy.linspace(0, 2*pi, 500))
-(
-    df >>
-        mutate(y=sin(f.x), sign=if_else(f.y>=0, "positive", "negative")) >>
-        ggplot(aes(x='x', y='y')) + theme_classic()
+(df >>
+   mutate(y=sin(f.x), sign=if_else(f.y>=0, "positive", "negative")) >>
+   ggplot(aes(x='x', y='y')) + theme_classic()
 ) + geom_line(aes(color='sign'), size=1.2)
 ```
 
@@ -79,7 +78,6 @@ df = tibble(x=numpy.linspace(0, 2*pi, 500))
 # for example: klib
 import klib
 from pipda import register_verb
-from datar.core.contexts import Context
 from datar.datasets import iris
 from datar.dplyr import pull
 

diff --git a/datar/__init__.py b/datar/__init__.py
@@ -3,4 +3,4 @@
 from .core import operator as _datar_operator
 from .core.defaults import f
 
-__version__ = '0.0.2'
+__version__ = '0.0.3'
diff --git a/datar/dplyr/context.py b/datar/dplyr/context.py
@@ -13,24 +13,24 @@
 from .group_data import group_vars
 
 # n used directly in count
-@register_func(context=Context.EVAL)
+@register_func(context=Context.EVAL, summarise_prefers_input=True)
 def n(series: Iterable[Any]) -> int:
     """gives the current group size."""
     return len(series)
 
-@register_func(DataFrame, verb_arg_only=True)
+@register_func(DataFrame, verb_arg_only=True, summarise_prefers_input=True)
 def cur_data_all(_data: DataFrame) -> DataFrame:
     """gives the current data for the current group
     (including grouping variables)"""
     return _data
 
-@register_func(DataFrame, verb_arg_only=True)
+@register_func(DataFrame, verb_arg_only=True, summarise_prefers_input=True)
 def cur_data(_data: DataFrame) -> int:
     """gives the current data for the current group
     (excluding grouping variables)."""
     return _data[setdiff(_data.columns, group_vars(_data))]
 
-@register_func(DataFrame, verb_arg_only=True)
+@register_func(DataFrame, verb_arg_only=True, summarise_prefers_input=True)
 def cur_group(_data: DataFrame) -> DataFrame:
     """gives the group keys, a tibble with one row and one column for
     each grouping variable."""
@@ -41,12 +41,12 @@ def cur_group(_data: DataFrame) -> DataFrame:
     gdata = _data.attrs['group_data']
     return gdata.iloc[[index], :-1]
 
-@register_func(DataFrame, verb_arg_only=True)
+@register_func(DataFrame, verb_arg_only=True, summarise_prefers_input=True)
 def cur_group_id(_data: DataFrame) -> int:
     """gives a unique numeric identifier for the current group."""
     return _data.attrs.get('group_index', 1)
 
-@register_func(DataFrame, verb_arg_only=True)
+@register_func(DataFrame, verb_arg_only=True, summarise_prefers_input=True)
 def cur_group_rows(_data: DataFrame) -> List[int]:
     """gives the row indices for the current group."""
     index = _data.attrs.get('group_index', None)

diff --git a/datar/dplyr/summarise.py b/datar/dplyr/summarise.py
@@ -3,6 +3,7 @@
 from typing import Any, Iterable, Mapping, Optional, Union
 from pandas import DataFrame
 from pipda import register_verb, evaluate_expr
+from pipda.function import Function
 
 from ..core.defaults import DEFAULT_COLUMN_PREFIX
 from ..core.contexts import Context
@@ -35,6 +36,23 @@ def summarise(
         >>> # 0 10 20
         >>> df >> summarise(y=sum(f.x), z=f.x+f.y) # fail
 
+        But they should not be mixed in a later argument. For example:
+        >>> df = tibble(x=[1,2,3,4], g=list('aabb')) >> group_by(f.g)
+        >>> df >> summarise(n=n() + f.x)
+        >>> # as expected:
+        >>> #    g  n
+        >>> # 0  a  3
+        >>> # 1  a  4
+        >>> # 2  b  5
+        >>> # 3  b  6
+        >>> # [Groups: ['g'] (n=2)]
+        >>> # However:
+        >>> df >> summarise(y=1, n=n() + f.y)
+        >>> # n() will recycle the output instead of the input
+        >>> #    g  y  n
+        >>> # 0  a  1  2
+        >>> # 1  b  1  2
+
     Args:
         _groups: Grouping structure of the result.
             - "drop_last": dropping the last level of grouping.
@@ -140,11 +158,15 @@ def summarise_build(
 
     out = group_keys(_data)
     for key, val in named.items():
+        envdata = out
+        if out.shape[1] == 0 or (
+                isinstance(val, Function) and
+                getattr(val.func, 'summarise_prefers_input', False)
+        ):
+            envdata = _data
+
         try:
-            if out.shape[1] == 0:
-                val = evaluate_expr(val, _data, context)
-            else:
-                val = evaluate_expr(val, out, context)
+            val = evaluate_expr(val, envdata, context)
         except ColumnNotExistingError:
             # also recycle input
             val = evaluate_expr(val, _data, context)

diff --git a/datar/stats/__init__.py b/datar/stats/__init__.py
@@ -1,4 +1,4 @@
 """APIs for stats"""
 
-from .funcs import rnorm, rpois, runif, quantile, sd
+from .funcs import rnorm, rpois, runif, quantile, sd, weighted_mean
 from .verbs import setNames
diff --git a/datar/stats/funcs.py b/datar/stats/funcs.py
@@ -1,10 +1,12 @@
 """Some functions ported from R-stats"""
-from typing import Any, Iterable, List
+from typing import Any, Iterable, List, Optional
 
 import numpy
 from pipda import register_func
 
-from ..core.types import FloatOrIter, SeriesLikeType
+from ..core.types import (
+    FloatOrIter, NumericOrIter, NumericType, SeriesLikeType, is_scalar
+)
 from ..core.contexts import Context
 
 # pylint: disable=redefined-builtin, redefined-outer-name
@@ -86,3 +88,31 @@ def sd(
         numpy.nanstd(series, ddof=ddof) if na_rm
         else numpy.std(series, ddof=ddof)
     )
+
+@register_func(None, context=Context.EVAL)
+def weighted_mean(
+        # pylint: disable=invalid-name
+        x: NumericOrIter,
+        w: Optional[NumericOrIter] = None,
+        na_rm: bool = False
+) -> NumericType:
+    """Calculate weighted mean"""
+    if is_scalar(x):
+        x = [x]
+    if w is not None and is_scalar(w):
+        w = [w]
+    x = numpy.array(x)
+    if w is not None:
+        w = numpy.array(w)
+        if len(x) != len(w):
+            raise ValueError("'x' and 'w' must have the same length")
+
+    if na_rm:
+        notna = ~numpy.isnan(x)
+        x = x[notna]
+        if w is not None:
+            w = w[notna]
+
+    if w is not None and sum(w) == 0:
+        return numpy.nan
+    return numpy.average(x, weights=w)
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
@@ -1,2 +1,6 @@
+## 0.0.3
+- Add stats.weighted_mean
+- Allow functions to prefer recycling input or output for summarise
+
 ## 0.0.2
 - Port verbs and functions from tidyverse/dplyr and test them with original cases
diff --git a/docs/TODO.md b/docs/TODO.md
@@ -1,3 +1,6 @@
 
 - Add tests for tidyr from original tidyverse/tidyr cases
 - Add more tests for base/core
+- Port more functions from `r-base`, `r-stats`, etc
+- Port more datasets from `r-datasets` namespace
+- Add more detailed documentation
diff --git a/docs/notebooks/readme.ipynb b/docs/notebooks/readme.ipynb
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,7 +1,7 @@
 [tool.poetry]
 name = "datar"
-version = "0.0.2"
-description = "Probably the closest port of tidyr, dplyr and tibble in python"
+version = "0.0.3"
+description = "Port of R data packages tidyr, dplyr, tibble and so on in python"
 authors = ["pwwang <[email protected]>"]
 license = "MIT"
 

diff --git a/setup.py b/setup.py
@@ -15,8 +15,8 @@
 setup(
     long_description=readme,
     name='datar',
-    version='0.0.2',
-    description='Probably the closest port of tidyr, dplyr and tibble in python',
+    version='0.0.3',
+    description='Port of R data packages tidyr, dplyr, tibble and so on in python',
     python_requires='==3.*,>=3.7.1',
     author='pwwang',
     author_email='[email protected]',