From d22f6fcbec582ca2b8643ee22c76b9784e226482 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Thu, 3 Nov 2016 20:19:53 -0500
Subject: [PATCH 1/5] categoricals

---
 dask/benchmarks/binary_ops.py   | 265 ++++++++++++++++++++++++++++++++
 dask/benchmarks/categoricals.py |  31 ++++
 2 files changed, 296 insertions(+)
 create mode 100644 dask/benchmarks/binary_ops.py
 create mode 100644 dask/benchmarks/categoricals.py

diff --git a/dask/benchmarks/binary_ops.py b/dask/benchmarks/binary_ops.py
new file mode 100644
index 0000000..9e9e18e
--- /dev/null
+++ b/dask/benchmarks/binary_ops.py
@@ -0,0 +1,265 @@
+from pandas import DataFrame, Series, date_range
+import numpy as np
+import pandas.computation.expressions as expr
+import dask.dataframe as dd
+
+
+class frame_add(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = dd.from_pandas(DataFrame(np.random.randn(20000, 100)),
+                                 npartitions=10)
+        self.df2 = dd.from_pandas(DataFrame(np.random.randn(20000, 100)),
+                                  npartitions=10)
+
+    def time_frame_add(self):
+        (self.df + self.df2)
+
+
+class frame_add_no_ne(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randn(20000, 100))
+        self.df2 = DataFrame(np.random.randn(20000, 100))
+        expr.set_use_numexpr(False)
+
+    def time_frame_add_no_ne(self):
+        (self.df + self.df2)
+
+    def teardown(self):
+        expr.set_use_numexpr(True)
+
+
+class frame_add_st(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randn(20000, 100))
+        self.df2 = DataFrame(np.random.randn(20000, 100))
+        expr.set_numexpr_threads(1)
+
+    def time_frame_add_st(self):
+        (self.df + self.df2)
+
+    def teardown(self):
+        expr.set_numexpr_threads()
+
+
+class frame_float_div(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randn(1000, 1000))
+        self.df2 = DataFrame(np.random.randn(1000, 1000))
+
+    def time_frame_float_div(self):
+        (self.df / self.df2)
+
+
+class frame_float_div_by_zero(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randn(1000, 1000))
+
+    def time_frame_float_div_by_zero(self):
+        (self.df / 0)
+
+
+class frame_float_floor_by_zero(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randn(1000, 1000))
+
+    def time_frame_float_floor_by_zero(self):
+        (self.df // 0)
+
+
+class frame_float_mod(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randn(1000, 1000))
+        self.df2 = DataFrame(np.random.randn(1000, 1000))
+
+    def time_frame_float_mod(self):
+        (self.df % self.df2)
+
+
+class frame_int_div_by_zero(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.random_integers(np.iinfo(np.int16).min,
+                                                      np.iinfo(np.int16).max,
+                                                      size=(1000, 1000)))
+
+    def time_frame_int_div_by_zero(self):
+        (self.df / 0)
+
+
+class frame_int_mod(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.random_integers(np.iinfo(np.int16).min,
+                                                      np.iinfo(np.int16).max,
+                                                      size=(1000, 1000)))
+        self.df2 = DataFrame(np.random.random_integers(np.iinfo(np.int16).min,
+                                                       np.iinfo(np.int16).max,
+                                                       size=(1000, 1000)))
+
+    def time_frame_int_mod(self):
+        (self.df % self.df2)
+
+
+class frame_mult(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randn(20000, 100))
+        self.df2 = DataFrame(np.random.randn(20000, 100))
+
+    def time_frame_mult(self):
+        (self.df * self.df2)
+
+
+class frame_mult_no_ne(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randn(20000, 100))
+        self.df2 = DataFrame(np.random.randn(20000, 100))
+        expr.set_use_numexpr(False)
+
+    def time_frame_mult_no_ne(self):
+        (self.df * self.df2)
+
+    def teardown(self):
+        expr.set_use_numexpr(True)
+
+
+class frame_mult_st(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randn(20000, 100))
+        self.df2 = DataFrame(np.random.randn(20000, 100))
+        expr.set_numexpr_threads(1)
+
+    def time_frame_mult_st(self):
+        (self.df * self.df2)
+
+    def teardown(self):
+        expr.set_numexpr_threads()
+
+
+class frame_multi_and(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randn(20000, 100))
+        self.df2 = DataFrame(np.random.randn(20000, 100))
+
+    def time_frame_multi_and(self):
+        self.df[((self.df > 0) & (self.df2 > 0))]
+
+
+class frame_multi_and_no_ne(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randn(20000, 100))
+        self.df2 = DataFrame(np.random.randn(20000, 100))
+        expr.set_use_numexpr(False)
+
+    def time_frame_multi_and_no_ne(self):
+        self.df[((self.df > 0) & (self.df2 > 0))]
+
+    def teardown(self):
+        expr.set_use_numexpr(True)
+
+
+class frame_multi_and_st(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randn(20000, 100))
+        self.df2 = DataFrame(np.random.randn(20000, 100))
+        expr.set_numexpr_threads(1)
+
+    def time_frame_multi_and_st(self):
+        self.df[((self.df > 0) & (self.df2 > 0))]
+
+    def teardown(self):
+        expr.set_numexpr_threads()
+
+
+class series_timestamp_compare(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 1000000
+        self.halfway = ((self.N // 2) - 1)
+        self.s = Series(date_range('20010101', periods=self.N, freq='T'))
+        self.ts = self.s[self.halfway]
+
+    def time_series_timestamp_compare(self):
+        (self.s <= self.ts)
+
+
+class timestamp_ops_diff1(object):
+    goal_time = 0.2
+    N = 1000000
+
+    def setup(self):
+        self.s = self.create()
+
+    def create(self):
+        return Series(date_range('20010101', periods=self.N, freq='s'))
+
+    def time_timestamp_ops_diff1(self):
+        self.s.diff()
+
+
+class timestamp_tz_ops_diff1(timestamp_ops_diff1):
+    N = 10000
+
+    def create(self):
+        return Series(date_range('20010101', periods=self.N, freq='s',
+                                 tz='US/Eastern'))
+
+
+class timestamp_ops_diff2(object):
+    goal_time = 0.2
+    N = 1000000
+
+    def setup(self):
+        self.s = self.create()
+
+    def create(self):
+        return Series(date_range('20010101', periods=self.N, freq='s'))
+
+    def time_timestamp_ops_diff2(self):
+        (self.s - self.s.shift())
+
+
+class timestamp_tz_ops_diff2(timestamp_ops_diff2):
+    N = 10000
+
+    def create(self):
+        return Series(date_range('20010101', periods=self.N, freq='s',
+                                 tz='US/Eastern'))
+
+
+class timestamp_series_compare(object):
+    goal_time = 0.2
+    N = 1000000
+
+    def setup(self):
+        self.halfway = ((self.N // 2) - 1)
+        self.s = self.create()
+        self.ts = self.s[self.halfway]
+
+    def create(self):
+        return Series(date_range('20010101', periods=self.N, freq='T'))
+
+    def time_timestamp_series_compare(self):
+        (self.ts >= self.s)
+
+
+class timestamp_tz_series_compare(timestamp_series_compare):
+    N = 10000
+
+    def create(self):
+        return Series(date_range('20010101', periods=self.N, freq='T',
+                                 tz='US/Eastern'))
diff --git a/dask/benchmarks/categoricals.py b/dask/benchmarks/categoricals.py
new file mode 100644
index 0000000..55210de
--- /dev/null
+++ b/dask/benchmarks/categoricals.py
@@ -0,0 +1,31 @@
+# See https://github.com/pandas-dev/pandas/blob/master/asv_bench/benchmarks/categoricals.py
+import pandas as pd
+import numpy as np
+from pandas import Series
+import dask.dataframe as dd
+
+
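+# The classes below follow the airspeed velocity (asv) convention used
+# throughout this suite: asv calls ``setup`` before timing, then times every
+# method whose name starts with ``time_``.  A minimal sketch of the pattern,
+# kept commented out so asv does not collect it (the class name and data are
+# made up for illustration):
+#
+# class example_benchmark(object):
+#     goal_time = 0.2
+#
+#     def setup(self):
+#         # build the lazy dask collection once, outside the timed region
+#         self.s = dd.from_pandas(
+#             pd.Series(list('ab') * 1000).astype('category'),
+#             npartitions=2)
+#
+#     def time_example(self):
+#         # .compute() forces execution, so the measurement covers graph
+#         # construction, scheduling, and the work itself
+#         self.s.value_counts().compute()
+
+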
+class concat_categorical(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.s = dd.from_pandas(
+            pd.Series((list('aabbcd') * 1000000)).astype('category'),
+            npartitions=10)
+
+    def time_concat_categorical_interleave(self):
+        dd.concat([self.s, self.s], interleave_partitions=True).compute()
+
+
+class categorical_value_counts(object):
+    goal_time = 1
+
+    def setup(self):
+        n = 500000
+        np.random.seed(2718281)
+        arr = ['s%04d' % i for i in np.random.randint(0, n // 10, size=n)]
+        self.ts = dd.from_pandas(Series(arr).astype('category'),
+                                 npartitions=10)
+
+    def time_value_counts(self):
+        self.ts.value_counts().compute()

From c5bff7c7aab91a5fb94d30c21651ba9aeadb9591 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Sun, 6 Nov 2016 13:49:36 -0600
Subject: [PATCH 2/5] constructors

---
 dask/benchmarks/dataframe.py | 67 ++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/dask/benchmarks/dataframe.py b/dask/benchmarks/dataframe.py
index 3925394..981187b 100644
--- a/dask/benchmarks/dataframe.py
+++ b/dask/benchmarks/dataframe.py
@@ -1,3 +1,5 @@
+import string
+
 import dask
 import dask.dataframe as dd
 import numpy as np
@@ -41,3 +43,68 @@ def test_repartition(self):
 
     def test_quantile(self):
         self.data.quantile(.25).compute()
+
+
+class TimeFloatConstructors(object):
+
+    def setup(self):
+        self.floats = np.random.randn(10000, 10)
+        self.wide_floats = np.random.randn(10000, 1000)
+        self.floats_pandas = pd.DataFrame(self.floats)
+        self.wide_floats_pandas = pd.DataFrame(self.wide_floats)
+
+    def time_floats(self):
+        dd.from_array(self.floats)
+
+    def time_wide_floats(self):
+        dd.from_array(self.wide_floats)
+
+    def time_floats_pandas(self):
+        dd.from_pandas(self.floats_pandas, npartitions=2)
+
+    def time_wide_floats_pandas(self):
+        dd.from_pandas(self.wide_floats_pandas, npartitions=2)
+
+
+class TimeIntConstructors(object):
+
+    def setup(self):
+        self.ints = np.random.randint(0, 100, size=(10000, 10))
+        self.wide_ints = np.random.randint(0, 100, size=(10000, 1000))
+        self.ints_pandas = pd.DataFrame(self.ints)
+        self.wide_ints_pandas = pd.DataFrame(self.wide_ints)
+
+    def time_ints(self):
+        dd.from_array(self.ints)
+
+    def time_wide_ints(self):
+        dd.from_array(self.wide_ints)
+
+    def time_ints_pandas(self):
+        dd.from_pandas(self.ints_pandas, npartitions=2)
+
+    def time_wide_ints_pandas(self):
+        dd.from_pandas(self.wide_ints_pandas, npartitions=2)
+
+
+class TimeObjectConstructors(object):
+
+    def setup(self):
+        self.text = np.random.choice(list(string.ascii_letters),
+                                     size=(10000, 10))
+        self.wide_text = np.random.choice(list(string.ascii_letters),
+                                          size=(10000, 1000))
+        self.text_pandas = pd.DataFrame(self.text)
+        self.wide_text_pandas = pd.DataFrame(self.wide_text)
+
+    def time_text(self):
+        dd.from_array(self.text)
+
+    def time_wide_text(self):
+        dd.from_array(self.wide_text)
+
+    def time_text_pandas(self):
+        dd.from_pandas(self.text_pandas, npartitions=2)
+
+    def time_wide_text_pandas(self):
+        dd.from_pandas(self.wide_text_pandas, npartitions=2)

From 0523db9d010ecd8cd6ab5159077eba7ef44ac350 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Sun, 6 Nov 2016 13:49:42 -0600
Subject: [PATCH 3/5] frame_methods

---
 dask/benchmarks/frame_methods.py | 272 +++++++++++++++++++++++++++++++
 1 file changed, 272 insertions(+)
 create mode 100644 dask/benchmarks/frame_methods.py

diff --git a/dask/benchmarks/frame_methods.py b/dask/benchmarks/frame_methods.py
new file mode 100644
index 0000000..2a98be2
--- /dev/null
+++ b/dask/benchmarks/frame_methods.py
@@ -0,0 +1,272 @@
+import string
+
+import numpy as np
+from numpy.random import randn
+from pandas import DataFrame, Series, date_range, NaT
+import dask.dataframe as dd
+
+
+class frame_apply_axis_1(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = dd.from_pandas(DataFrame(np.random.randn(100000, 100)), 2)
+
+    def time_frame_apply_axis_1(self):
+        self.df.apply((lambda x: (x + 1)), axis=1)
+
+    def time_frame_apply_lambda_mean(self):
+        self.df.apply((lambda x: x.mean()), axis=1)
+
+    def time_frame_apply_np_mean(self):
+        self.df.apply(np.mean, axis=1)
+
+    def time_frame_apply_pass_thru(self):
+        self.df.apply((lambda x: x), axis=1)
+
+
+class frame_apply_ref_by_name(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = dd.from_pandas(DataFrame(np.random.randn(100000, 3),
+                                           columns=list('ABC')),
+                                 npartitions=2)
+
+    def time_frame_apply_ref_by_name(self):
+        self.df.apply((lambda x: (x['A'] + x['B'])), axis=1)
+
+
+class frame_apply_user_func(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.s = Series(np.arange(1028.0))
+        self.df = dd.from_pandas(DataFrame({i: self.s for i in range(1028)}),
+                                 npartitions=2)
+
+    def time_frame_apply_user_func(self):
+        self.df.apply((lambda x: np.corrcoef(x, self.s)[(0, 1)]), axis=1)
+
+
+# class frame_boolean_row_select(object):
+#     goal_time = 0.2
+
+#     def setup(self):
+#         self.df = dd.from_pandas(DataFrame(randn(10000, 100)),
+#                                  npartitions=2)
+#         self.bool_arr = np.zeros(10000, dtype=bool)
+#         self.bool_arr[:100] = True
+
+#     def time_frame_boolean_row_select(self):
+#         self.df.loc[self.bool_arr, :]
+
+
+class frame_dropna(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.data = np.random.randn(10000, 1000)
+        self.df = DataFrame(self.data)
+        self.df.loc[50:1000, 20:50] = np.nan
+        self.df.loc[2000:3000] = np.nan
+        self.df.loc[:, 60:70] = np.nan
+        self.df = dd.from_pandas(self.df, npartitions=2)
+
+    def time_frame_dropna_axis0_all(self):
+        self.df.dropna(how='all')
+
+    def time_frame_dropna_axis0_any(self):
+        self.df.dropna(how='any')
+
+
+class frame_dropna_axis0_mixed_dtypes(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.data = np.random.randn(10000, 1000)
+        self.df = DataFrame(self.data)
+        self.df.loc[50:1000, 20:50] = np.nan
+        self.df.loc[2000:3000] = np.nan
+        self.df.loc[:, 60:70] = np.nan
+        self.df['foo'] = 'bar'
+        self.df = dd.from_pandas(self.df, npartitions=2)
+
+    def time_frame_dropna_axis0_all_mixed_dtypes(self):
+        self.df.dropna(how='all')
+
+    def time_frame_dropna_axis0_any_mixed_dtypes(self):
+        self.df.dropna(how='any')
+
+
+class frame_dtypes(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = dd.from_pandas(DataFrame(np.random.randn(1000, 1000)),
+                                 npartitions=2)
+
+    def time_frame_dtypes(self):
+        self.df.dtypes
+
+
+class frame_duplicated(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.n = (1 << 20)
+        self.t = date_range('2015-01-01', freq='S', periods=(self.n // 64))
+        self.xs = np.random.randn((self.n // 64)).round(2)
+        self.df = dd.from_pandas(
+            DataFrame({'a': np.random.randint(((-1) << 8), (1 << 8), self.n),
+                       'b': np.random.choice(self.t, self.n),
+                       'c': np.random.choice(self.xs, self.n), }),
+            npartitions=2)
+
+    def time_frame_duplicated(self):
+        self.df.drop_duplicates()
+
+
+class frame_float_equal(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.float_df = DataFrame(np.random.randn(1000, 1000))
+        self.object_df = DataFrame(([(['foo'] * 1000)] * 1000))
+        self.nonunique_cols = self.object_df.copy()
+        self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns))
+        self.pairs = dict([(name, self.make_pair(frame))
+                           for (name, frame) in
+                           (('float_df', self.float_df),
+                            ('object_df', self.object_df),
+                            ('nonunique_cols', self.nonunique_cols))])
+
+    def time_frame_float_equal(self):
+        self.test_equal('float_df')
+
+    def make_pair(self, frame):
+        self.df = frame
+        self.df2 = self.df.copy()
+        self.df2.ix[((-1), (-1))] = np.nan
+        return (self.df, self.df2)
+
+    def test_equal(self, name):
+        (self.df, self.df2) = self.pairs[name]
+        return self.df.equals(self.df)
+
+    def test_unequal(self, name):
+        (self.df, self.df2) = self.pairs[name]
+        return self.df.equals(self.df2)
+
+
+class frame_float_unequal(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.float_df = DataFrame(np.random.randn(1000, 1000))
+        self.object_df = DataFrame(([(['foo'] * 1000)] * 1000))
+        self.nonunique_cols = self.object_df.copy()
+        self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns))
+        self.pairs = dict([(name, self.make_pair(frame))
+                           for (name, frame) in
+                           (('float_df', self.float_df),
+                            ('object_df', self.object_df),
+                            ('nonunique_cols', self.nonunique_cols))])
+
+    def time_frame_float_unequal(self):
+        self.test_unequal('float_df')
+
+    def make_pair(self, frame):
+        self.df = frame
+        self.df2 = self.df.copy()
+        self.df2.ix[((-1), (-1))] = np.nan
+        return (self.df, self.df2)
+
+    def test_equal(self, name):
+        (self.df, self.df2) = self.pairs[name]
+        return self.df.equals(self.df)
+
+    def test_unequal(self, name):
+        (self.df, self.df2) = self.pairs[name]
+        return self.df.equals(self.df2)
+
+
+class frame_isnull_floats_no_null(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.data = np.random.randn(100000, 1000)
+        self.df = dd.from_pandas(DataFrame(self.data), npartitions=4)
+
+    def time_frame_isnull(self):
+        self.df.isnull()
+
+
+class frame_isnull_floats(object):
+    goal_time = 0.2
+
+    def setup(self):
+        np.random.seed(1234)
+        self.sample = np.array([np.nan, 1.0])
+        self.data = np.random.choice(self.sample, (100000, 1000))
+        self.df = dd.from_pandas(DataFrame(self.data), npartitions=4)
+
+    def time_frame_isnull(self):
+        self.df.isnull()
+
+
+class frame_isnull_strings(object):
+    goal_time = 0.2
+
+    def setup(self):
+        np.random.seed(1234)
+        self.sample = np.array(list(string.ascii_lowercase) +
+                               list(string.ascii_uppercase) +
+                               list(string.whitespace))
+        self.data = np.random.choice(self.sample, (100000, 1000))
+        self.df = dd.from_pandas(DataFrame(self.data), npartitions=4)
+
+    def time_frame_isnull(self):
+        self.df.isnull()
+
+
+class frame_isnull_obj(object):
+    goal_time = 0.2
+
+    def setup(self):
+        np.random.seed(1234)
+        self.sample = np.array([NaT, np.nan, None, np.datetime64('NaT'),
+                                np.timedelta64('NaT'), 0, 1, 2.0, '', 'abcd'])
+        self.data = np.random.choice(self.sample, (10000, 100))
+        self.df = dd.from_pandas(DataFrame(self.data), npartitions=4)
+
+    def time_frame_isnull(self):
+        self.df.isnull()
+
+
+class frame_itertuples(object):
+
+    def setup(self):
+        self.df = dd.from_pandas(DataFrame(np.random.randn(50000, 10)),
+                                 npartitions=4)
+
+    def time_frame_itertuples(self):
+        for row in self.df.itertuples():
+            pass
+
+
+class series_string_vector_slice(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.s = dd.from_pandas(Series((['abcdefg', np.nan] * 500000)),
+                                npartitions=4)
+
+    def time_series_string_vector_slice(self):
+        self.s.str.slice(5)
+
+
+class frame_quantile_axis1(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = dd.from_pandas(DataFrame(np.random.randn(100000, 3),
+                                           columns=list('ABC')),
+                                 npartitions=4)
+
+    def time_frame_quantile_axis1(self):
+        self.df.quantile(0.1, axis=1)

From 456540dc943fc85e531f69ed30d01d97d2505d44 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Sun, 6 Nov 2016 14:20:45 -0600
Subject: [PATCH 4/5] gil

---
 dask/benchmarks/gil.py | 126 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 126 insertions(+)
 create mode 100644 dask/benchmarks/gil.py

diff --git a/dask/benchmarks/gil.py b/dask/benchmarks/gil.py
new file mode 100644
index 0000000..4aefef7
--- /dev/null
+++ b/dask/benchmarks/gil.py
@@ -0,0 +1,126 @@
+import numpy as np
+from pandas import DataFrame
+import pandas as pd
+import dask.dataframe as dd
+
+
+class nogil_groupby_base(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 1000000
+        self.ngroups = 1000
+        np.random.seed(1234)
+        self.df = DataFrame({'key': np.random.randint(0, self.ngroups,
+                                                      size=self.N),
+                             'data': np.random.randn(self.N)})
+        self.df = dd.from_pandas(self.df, npartitions=4)
+
+    def time_nogil_groupby_count_2(self):
+        self.df.groupby('key')['data'].count().compute()
+
+    # def time_nogil_groupby_last_2(self):
+    #     self.df.groupby('key')['data'].last().compute()
+
+    def time_nogil_groupby_max_2(self):
+        self.df.groupby('key')['data'].max().compute()
+
+    def time_nogil_groupby_mean_2(self):
+        self.df.groupby('key')['data'].mean().compute()
+
+    def time_nogil_groupby_min_2(self):
+        self.df.groupby('key')['data'].min().compute()
+
+    # def time_nogil_groupby_prod_2(self):
+    #     self.df.groupby('key')['data'].prod().compute()
+
+    def time_nogil_groupby_sum_2(self):
+        self.df.groupby('key')['data'].sum().compute()
+
+    def time_nogil_groupby_sum_4(self):
+        self.df.groupby('key')['data'].sum().compute()
+
+    def time_nogil_groupby_sum_8(self):
+        self.df.groupby('key')['data'].sum().compute()
+
+    def time_nogil_groupby_var_2(self):
+        self.df.groupby('key')['data'].var().compute()
+
+
+class nogil_n_largest(object):
+    def setup(self):
+        np.random.seed(1234)
+        self.N = 10000000
+        self.k = 500000
+        self.a = np.random.randn(self.N)
+        self.s = dd.from_array(self.a)
+
+    def time_nogil_n_largest(self):
+        self.s.nlargest(n=5).compute()
+
+
+class nogil_datetime_fields(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 1000000
+        self.dti = pd.date_range('1900-01-01', periods=self.N, freq='D')
+        # TODO: dt namespace on Periods
+        # self.period = self.dti.to_period('D')
+
+        self.dti = dd.from_pandas(pd.Series(self.dti), npartitions=4)
+        # self.period = dd.from_pandas(pd.Series(self.period), npartitions=4)
+
+    def time_datetime_field_year(self):
+        self.dti.dt.year.compute()
+
+    def time_datetime_field_day(self):
+        self.dti.dt.day.compute()
+
+    def time_datetime_field_daysinmonth(self):
+        self.dti.dt.days_in_month.compute()
+
+    def time_datetime_field_normalize(self):
+        self.dti.dt.normalize().compute()
+
+    def time_datetime_to_period(self):
+        self.dti.dt.to_period('S').compute()
+
+    # def time_period_to_datetime(self):
+    #     def run(period):
+    #         period.to_timestamp()
+    #     run(self.period)
+
+
+class nogil_rolling_algos_slow(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.win = 100
+        np.random.seed(1234)
+        self.arr = np.random.rand(100000)
+        self.arr = dd.from_array(self.arr)
+
+    def time_nogil_rolling_median(self):
+        self.arr.rolling(self.win).median().compute()
+
+    def time_nogil_rolling_mean(self):
+        self.arr.rolling(self.win).mean().compute()
+
+    def time_nogil_rolling_min(self):
+        self.arr.rolling(self.win).min().compute()
+
+    def time_nogil_rolling_max(self):
+        self.arr.rolling(self.win).max().compute()
+
+    def time_nogil_rolling_var(self):
+        self.arr.rolling(self.win).var().compute()
+
+    def time_nogil_rolling_skew(self):
+        self.arr.rolling(self.win).skew().compute()
+
+    def time_nogil_rolling_kurt(self):
+        self.arr.rolling(self.win).kurt().compute()
+
+    def time_nogil_rolling_std(self):
+        self.arr.rolling(self.win).std().compute()

From c1b0f5bd0ac621c009bb760c9733420050a2df04 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Sun, 6 Nov 2016 15:14:52 -0600
Subject: [PATCH 5/5] groupby

---
 dask/benchmarks/groupby.py | 883 +++++++++++++++++++++++++++++++++++++
 1 file changed, 883 insertions(+)
 create mode 100644 dask/benchmarks/groupby.py

diff --git a/dask/benchmarks/groupby.py b/dask/benchmarks/groupby.py
new file mode 100644
index 0000000..5c215dd
--- /dev/null
+++ b/dask/benchmarks/groupby.py
@@ -0,0 +1,883 @@
+import random
+from itertools import product
+from string import ascii_letters, digits
+
+import numpy as np
+from numpy.random import randn, randint
+import pandas as pd
+from pandas import DataFrame, Series, date_range
+import pandas.util.testing as tm
+import dask.dataframe as dd
+
+
+class groupby_agg_builtins(object):
+    goal_time = 0.2
+
+    def setup(self):
+        np.random.seed(27182)
+        self.n = 100000
+        self.df = DataFrame(np.random.randint(1, (self.n // 100),
+                                              (self.n, 3)),
+                            columns=['jim', 'joe', 'jolie'])
+        self.df = dd.from_pandas(self.df, npartitions=4)
+
+    def time_groupby_agg_builtins1(self):
+        self.df.groupby('jim').agg([sum, min, max]).compute()
+
+    def time_groupby_agg_builtins2(self):
+        self.df.groupby(['jim', 'joe']).agg([sum, min, max]).compute()
+
+# ----------------------------------------------------------------------
+# dict return values
+
+
+class groupby_apply_dict_return(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.labels = dd.from_array(np.arange(1000).repeat(10))
+        self.f = (lambda x: {'sum': x.values[0]})
+        self.data = dd.from_pandas(Series(randn(len(self.labels))),
+                                   chunksize=50000)
+
+    def time_groupby_apply_dict_return(self):
+        self.data.groupby(self.labels).sum().compute()
+
+
+# ----------------------------------------------------------------------
+# groups
+
+# NotImplemented
+
+
+# class groupby_groups(object):
+#     goal_time = 0.1
+
+#     def setup(self):
+#         size = 2**22
+#         self.data = dd.from_pandas(
+#             Series(np.random.randint(0, 100, size=size)),
+#             npartitions=2)
+#         self.data2 = dd.from_pandas(
+#             Series(np.random.randint(0, 10000, size=size)),
+#             npartitions=2)
+#         self.data3 = dd.from_pandas(
+#             Series(tm.makeStringIndex(100).take(
+#                 np.random.randint(0, 100, size=size))),
+#             npartitions=2)
+#         self.data4 = dd.from_pandas(
+#             Series(tm.makeStringIndex(10000).take(
+#                 np.random.randint(0, 10000, size=size))),
+#             npartitions=2)
+
+#     def time_groupby_groups_int64_small(self):
+#         self.data.groupby(self.data).groups.compute()
+
+#     def time_groupby_groups_int64_large(self):
+#         self.data2.groupby(self.data2).groups.compute()
+
+#     def time_groupby_groups_object_small(self):
+#         self.data3.groupby(self.data3).groups.compute()
+
+#     def time_groupby_groups_object_large(self):
+#         self.data4.groupby(self.data4).groups.compute()
+
+
+# ----------------------------------------------------------------------
+# First / last functions
+
+# class groupby_first_last(object):
+#     goal_time = 0.2
+
+#     def setup(self):
+#         self.labels = np.arange(10000).repeat(10)
+#         self.data = Series(randn(len(self.labels)))
+#         self.data[::3] = np.nan
+#         self.data[1::3] = np.nan
+#         self.data2 = Series(randn(len(self.labels)), dtype='float32')
+#         self.data2[::3] = np.nan
+#         self.data2[1::3] = np.nan
+#         self.labels = self.labels.take(np.random.permutation(len(self.labels)))
+#         self.data = dd.from_pandas(self.data, npartitions=4)
+#         self.data2 = dd.from_pandas(self.data2, npartitions=4)
+
+#     def time_groupby_first_float32(self):
+#         self.data2.groupby(self.labels).first().compute()
+
+#     def time_groupby_first_float64(self):
+#         self.data.groupby(self.labels).first().compute()
+
+    # def time_groupby_last_float32(self):
+    #     self.data2.groupby(self.labels).last()
+
+    # def time_groupby_last_float64(self):
+    #     self.data.groupby(self.labels).last()
+
+    # def time_groupby_nth_float32_any(self):
+    #     self.data2.groupby(self.labels).nth(0, dropna='all').compute()
+
+    # def time_groupby_nth_float32_none(self):
+    #     self.data2.groupby(self.labels).nth(0)
+
+    # def time_groupby_nth_float64_any(self):
+    #     self.data.groupby(self.labels).nth(0, dropna='all')
+
+    # def time_groupby_nth_float64_none(self):
+    #     self.data.groupby(self.labels).nth(0)
+
+
+# class groupby_first_last_datetimes(object):
+#     goal_time = 0.2
+
+#     def setup(self):
+#         self.df = DataFrame({'a': date_range('1/1/2011', periods=100000,
+#                                              freq='s'), 'b': range(100000)})
+#         self.df = dd.from_pandas(self.df, npartitions=2)
+
+#     def time_groupby_first_datetimes(self):
+#         self.df.groupby('b').first().compute()
+
+#     # def time_groupby_last_datetimes(self):
+#     #     self.df.groupby('b').last()
+
+#     # def time_groupby_nth_datetimes_any(self):
+#     #     self.df.groupby('b').nth(0, dropna='all')
+
+#     # def time_groupby_nth_datetimes_none(self):
+#     #     self.df.groupby('b').nth(0)
+
+
+# class groupby_first_last_object(object):
+#     goal_time = 0.2
+
+#     def setup(self):
+#         self.df = DataFrame({'a': (['foo'] * 100000), 'b': range(100000)})
+
+#     def time_groupby_first_object(self):
+#         self.df.groupby('b').first()
+
+#     def time_groupby_last_object(self):
+#         self.df.groupby('b').last()
+
+#     def time_groupby_nth_object_any(self):
+#         self.df.groupby('b').nth(0, dropna='any')
+
+#     def time_groupby_nth_object_none(self):
+#         self.df.groupby('b').nth(0)
+
+
+# ----------------------------------------------------------------------
+# DataFrame Apply overhead
+
+class groupby_frame_apply(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 10000
+        self.labels = np.random.randint(0, 2000, size=self.N)
+        self.labels2 = np.random.randint(0, 3, size=self.N)
+        self.df = DataFrame({
+            'key': self.labels, 'key2': self.labels2,
+            'value1': randn(self.N),
+            'value2': (['foo', 'bar', 'baz', 'qux'] * (self.N // 4)), })
+        self.df = dd.from_pandas(self.df, npartitions=2)
+
+    def f(self, g):
+        return 1
+
+    def time_groupby_frame_apply(self):
+        self.df.groupby(['key', 'key2']).apply(self.f).compute()
+
+    def time_groupby_frame_apply_overhead(self):
+        self.df.groupby('key').apply(self.f).compute()
+
+
+# ----------------------------------------------------------------------
+# 2d grouping, aggregate many columns
+
+class groupby_frame_cython_many_columns(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.labels = dd.from_array(
+            np.random.randint(0, 100, size=1000),
+            chunksize=500)
+        self.df = DataFrame(randn(1000, 1000))
+        self.df = dd.from_pandas(self.df, chunksize=500)
+
+    def time_sum(self):
+        self.df.groupby(self.labels).sum().compute()
+
+
+# ----------------------------------------------------------------------
+# single key, long, integer key
+
+class groupby_frame_singlekey_integer(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.data = np.random.randn(100000, 1)
+        self.labels = dd.from_pandas(
+            pd.Series(np.random.randint(0, 1000, size=100000)),
+            npartitions=2)
+        self.df = DataFrame(self.data)
+        self.df = dd.from_pandas(self.df, npartitions=2)
+
+    def time_sum(self):
+        self.df.groupby(self.labels).sum().compute()
+
+
+# ----------------------------------------------------------------------
+# median
+
+class groupby_frame(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.data = np.random.randn(100000, 2)
+        self.labels = dd.from_pandas(
+            pd.Series(np.random.randint(0, 1000, size=100000)),
+            npartitions=2)
+        self.df = DataFrame(self.data)
+        self.df = dd.from_pandas(self.df, npartitions=2)
+
+    # def time_groupby_frame_median(self):
+    #     self.df.groupby(self.labels).median().compute()
+
+    def time_groupby_simple_compress_timing(self):
+        self.df.groupby(self.labels).mean().compute()
+
+
+# ----------------------------------------------------------------------
+# DataFrame nth
+
+# class groupby_nth(object):
+#     goal_time = 0.2
+
+#     def setup(self):
+#         self.df = DataFrame(np.random.randint(1, 100, (10000, 2)))
+#         self.df = dd.from_pandas(self.df, npartitions=2)
+
+#     def time_groupby_frame_nth_any(self):
+#         self.df.groupby(0).nth(0, dropna='any')
+
+#     def time_groupby_frame_nth_none(self):
+#         self.df.groupby(0).nth(0)
+
+#     def time_groupby_series_nth_any(self):
+#         self.df[1].groupby(self.df[0]).nth(0, dropna='any')
+
+#     def time_groupby_series_nth_none(self):
+#         self.df[1].groupby(self.df[0]).nth(0)
+
+
+# ----------------------------------------------------------------------
+# groupby_indices replacement, chop up Series
+
+# class groupby_indices(object):
+#     goal_time = 0.2
+
+#     def setup(self):
+#         try:
+#             self.rng = date_range('1/1/2000', '12/31/2005', freq='H')
+#             (self.year, self.month, self.day) = (self.rng.year,
+#                                                  self.rng.month,
+#                                                  self.rng.day)
+#         except:
+#             self.rng = date_range('1/1/2000', '12/31/2000',
+#                                   offset=pd.tseries.offsets.Hour())
+#             self.year = self.rng.map((lambda x: x.year))
+#             self.month = self.rng.map((lambda x: x.month))
+#             self.day = self.rng.map((lambda x: x.day))
+#         self.ts = Series(np.random.randn(len(self.rng)), index=self.rng)
+
+#     def time_groupby_indices(self):
+#         len(self.ts.groupby([self.year, self.month, self.day]))
+
+
+class groupby_int64_overflow(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.arr = np.random.randint(((-1) << 12), (1 << 12), ((1 << 17), 5))
+        self.i = np.random.choice(len(self.arr), (len(self.arr) * 5))
+        self.arr = np.vstack((self.arr, self.arr[self.i]))
+        self.i = np.random.permutation(len(self.arr))
+        self.arr = self.arr[self.i]
+        self.df = DataFrame(self.arr, columns=list('abcde'))
+        (self.df['jim'], self.df['joe']) = (
+            np.random.randn(2, len(self.df)) * 10)
+        self.df = dd.from_pandas(self.df, npartitions=2)
+
+    def time_groupby_int64_overflow(self):
+        self.df.groupby(list('abcde')).max().compute()
+
+
+# ----------------------------------------------------------------------
+# count() speed
+
+class groupby_multi_count(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.n = 10000
+        self.offsets = np.random.randint(
+            self.n, size=self.n).astype('timedelta64[ns]')
+        self.dates = (np.datetime64('now') + self.offsets)
+        self.dates[(np.random.rand(self.n) > 0.5)] = np.datetime64('nat')
+        self.offsets[(np.random.rand(self.n) > 0.5)] = np.timedelta64('nat')
+        self.value2 = np.random.randn(self.n)
+        self.value2[(np.random.rand(self.n) > 0.5)] = np.nan
+        self.obj = np.random.choice(list('ab'), size=self.n).astype(object)
+        self.obj[(np.random.rand(self.n) > 0.5)] = np.nan
+        self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n),
+                             'key2': np.random.randint(0, 100, size=self.n),
+                             'dates': self.dates,
+                             'value2': self.value2,
+                             'value3': np.random.randn(self.n),
+                             'ints': np.random.randint(0, 1000, size=self.n),
+                             'obj': self.obj,
+                             'offsets': self.offsets, })
+
+        self.df = dd.from_pandas(self.df, npartitions=2)
+
+    def time_groupby_multi_count(self):
+        self.df.groupby(['key1', 'key2']).count().compute()
+
+
+class groupby_int_count(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.n = 10000
+        self.df = DataFrame({'key1': randint(0, 500, size=self.n),
+                             'key2': randint(0, 100, size=self.n),
+                             'ints': randint(0, 1000, size=self.n),
+                             'ints2': randint(0, 1000, size=self.n), })
+        self.df = dd.from_pandas(self.df, npartitions=2)
+
+    def time_groupby_int_count(self):
+        self.df.groupby(['key1', 'key2']).count().compute()
+
+
+# ----------------------------------------------------------------------
+# group with different functions per column
+
+class groupby_agg_multi(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.fac1 = np.array(['A', 'B', 'C'], dtype='O')
+        self.fac2 = np.array(['one', 'two'], dtype='O')
+        self.df = DataFrame({
+            'key1': self.fac1.take(np.random.randint(0, 3, size=100000)),
+            'key2': self.fac2.take(np.random.randint(0, 2, size=100000)),
+            'value1': np.random.randn(100000),
+            'value2': np.random.randn(100000),
+            'value3': np.random.randn(100000), })
+        self.df = dd.from_pandas(self.df, npartitions=2)
+
+    def time_groupby_multi_different_functions(self):
+        self.df.groupby(['key1', 'key2']).agg(
+            {'value1': 'mean', 'value2': 'var', 'value3': 'sum'}).compute()
+
+    def time_groupby_multi_different_numpy_functions(self):
+        self.df.groupby(['key1', 'key2']).agg(
+            {'value1': np.mean, 'value2': np.var, 'value3': np.sum}).compute()
+
+
+class groupby_multi_index(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.n = (((5 * 7) * 11) * (1 << 9))
+        self.alpha = list(map(''.join, product((ascii_letters + digits),
+                                               repeat=4)))
+        self.f = (lambda k: np.repeat(np.random.choice(self.alpha,
+                                                       (self.n // k)), k))
+        self.df = DataFrame({'a': self.f(11), 'b': self.f(7), 'c': self.f(5),
+                             'd': self.f(1), })
+        self.df['joe'] = (np.random.randn(len(self.df)) * 10).round(3)
+        self.i = np.random.permutation(len(self.df))
+        self.df = self.df.iloc[self.i].reset_index(drop=True).copy()
+        self.df = dd.from_pandas(self.df, npartitions=2)
+
+    def time_groupby_multi_index(self):
+        self.df.groupby(list('abcd')).max().compute()
+
+
+class groupby_multi(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 100000
+        self.ngroups = 100
+        self.df = DataFrame({'key1': self.get_test_data(ngroups=self.ngroups),
+                             'key2': self.get_test_data(ngroups=self.ngroups),
+                             'data1': np.random.randn(self.N),
+                             'data2': np.random.randn(self.N), })
+        self.simple_series = Series(np.random.randn(self.N))
+        self.df = dd.from_pandas(self.df, npartitions=2)
+        self.simple_series = dd.from_pandas(self.simple_series, npartitions=2)
+        self.key1 = self.df['key1']
+
+    def get_test_data(self, ngroups=100, n=100000):
+        self.unique_groups = range(self.ngroups)
+        self.arr = np.asarray(np.tile(self.unique_groups,
+                                      (n // self.ngroups)),
+                              dtype=object)
+        if (len(self.arr) < n):
+            self.arr = np.asarray(
+                (list(self.arr) +
+                 list(self.unique_groups[:(n - len(self.arr))])),
+                dtype=object)
+        random.shuffle(self.arr)
+        return self.arr
+
+    def f(self):
+        self.df.groupby(['key1', 'key2']).agg(
+            (lambda x: x.values.sum())).compute()
+
+    def time_groupby_multi_cython(self):
+        self.df.groupby(['key1', 'key2']).sum().compute()
+
+    # def time_groupby_multi_python(self):
+    #     self.df.groupby(['key1', 'key2'])['data1'].agg(
+    #         (lambda x: x.values.sum())).compute()
+
+    def time_groupby_multi_series_op(self):
+        self.df.groupby(['key1', 'key2'])['data1'].agg(np.std).compute()
+
+    def time_groupby_series_simple_cython(self):
+        self.simple_series.groupby(self.key1).sum().compute()
+
+    # def time_groupby_series_simple_rank(self):
+    #     self.df.groupby('key1').rank(pct=True).compute()
+
+
+# ----------------------------------------------------------------------
+# size() speed
+
+class groupby_size(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.n = 100000
+        self.offsets = np.random.randint(
+            self.n, size=self.n).astype('timedelta64[ns]')
+        self.dates = (np.datetime64('now') + self.offsets)
+        self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n),
+                             'key2': np.random.randint(0, 100, size=self.n),
+                             'value1': np.random.randn(self.n),
+                             'value2': np.random.randn(self.n),
+                             'value3': np.random.randn(self.n),
+                             'dates': self.dates, })
+        self.df = dd.from_pandas(self.df, npartitions=2)
+
+    def time_groupby_multi_size(self):
+        self.df.groupby(['key1', 'key2']).size().compute()
+
+    def time_groupby_dt_size(self):
+        self.df.groupby(['dates']).size().compute()
+
+    # def time_groupby_dt_timegrouper_size(self):
+    #     self.df.groupby(TimeGrouper(key='dates', freq='M')).size()
+
+
+# ----------------------------------------------------------------------
+# groupby with a variable value for ngroups
+
+class groupby_ngroups_int_10000(object):
+    goal_time = 0.2
+    dtype = 'int'
+    ngroups = 10000
+
+    def setup(self):
+        np.random.seed(1234)
+        size = self.ngroups * 2
+        rng = np.arange(self.ngroups)
+        ts = rng.take(np.random.randint(0, self.ngroups, size=size))
+        if self.dtype == 'int':
+            value = np.random.randint(0, size, size=size)
+        else:
+            value = np.concatenate([np.random.random(self.ngroups) * 0.1,
+                                    np.random.random(self.ngroups) * 10.0])
+
+        self.df = DataFrame({'timestamp': ts,
+                             'value': value})
+        self.df = dd.from_pandas(self.df, npartitions=2)
+
+    # def time_all(self):
+    #     self.df.groupby('value')['timestamp'].all().compute()
+
+    # def time_any(self):
+    #     self.df.groupby('value')['timestamp'].any().compute()
+
+    def time_count(self):
+        self.df.groupby('value')['timestamp'].count().compute()
+
+    # def time_cumcount(self):
+    #     self.df.groupby('value')['timestamp'].cumcount().compute()
+
+    # def time_cummax(self):
+    #     self.df.groupby('value')['timestamp'].cummax().compute()
+
+    # def time_cummin(self):
+    #     self.df.groupby('value')['timestamp'].cummin().compute()
+
+    # def time_cumprod(self):
+    #     self.df.groupby('value')['timestamp'].cumprod().compute()
+
+    # def time_cumsum(self):
+    #     self.df.groupby('value')['timestamp'].cumsum().compute()
+
+    # def time_describe(self):
+    #     self.df.groupby('value')['timestamp'].describe().compute()
+
+    # def time_diff(self):
+    #     self.df.groupby('value')['timestamp'].diff().compute()
+
+    # def time_first(self):
+    #     self.df.groupby('value')['timestamp'].first().compute()
+
+    # def time_head(self):
+    #     self.df.groupby('value')['timestamp'].head().compute()
+
+    # def time_last(self):
+    #     self.df.groupby('value')['timestamp'].last().compute()
+
+    # def time_mad(self):
+    #     self.df.groupby('value')['timestamp'].mad().compute()
+
+    def time_max(self):
+        self.df.groupby('value')['timestamp'].max().compute()
+
+    def time_mean(self):
+        self.df.groupby('value')['timestamp'].mean().compute()
+
+    # def time_median(self):
+    #     self.df.groupby('value')['timestamp'].median().compute()
+
+    def time_min(self):
+        self.df.groupby('value')['timestamp'].min().compute()
+
+    def time_nunique(self):
+        self.df.groupby('value')['timestamp'].nunique().compute()
+
+    # def time_pct_change(self):
+    #     self.df.groupby('value')['timestamp'].pct_change().compute()
+
+    # def time_prod(self):
+    #     self.df.groupby('value')['timestamp'].prod().compute()
+
+    # def time_rank(self):
+    #     self.df.groupby('value')['timestamp'].rank().compute()
+
+    # def time_sem(self):
+    #     self.df.groupby('value')['timestamp'].sem().compute()
+
+    def time_size(self):
+        self.df.groupby('value')['timestamp'].size().compute()
+
+    # def time_skew(self):
+    #     self.df.groupby('value')['timestamp'].skew().compute()
+
+    def time_std(self):
+        self.df.groupby('value')['timestamp'].std().compute()
+
+    def time_sum(self):
+        self.df.groupby('value')['timestamp'].sum().compute()
+
+    # def time_tail(self):
+    #     self.df.groupby('value')['timestamp'].tail().compute()
+
+    # def time_unique(self):
+    #     self.df.groupby('value')['timestamp'].unique().compute()
+
+    # def time_value_counts(self):
+    #     self.df.groupby('value')['timestamp'].value_counts().compute()
+
+    def time_var(self):
+        self.df.groupby('value')['timestamp'].var().compute()
+
+
+class groupby_ngroups_int_100(groupby_ngroups_int_10000):
+    goal_time = 0.2
+    dtype = 'int'
+    ngroups = 100
+
+
+class groupby_ngroups_float_100(groupby_ngroups_int_10000):
+    goal_time = 0.2
+    dtype = 'float'
+    ngroups = 100
+
+
+class groupby_ngroups_float_10000(groupby_ngroups_int_10000):
+    goal_time = 0.2
+    dtype = 'float'
+    ngroups = 10000
+
+
+class groupby_float32(object):
+    # GH 13335
+    goal_time = 0.2
+
+    def setup(self):
+        tmp1 = (np.random.random(10000) * 0.1).astype(np.float32)
+        tmp2 = (np.random.random(10000) * 10.0).astype(np.float32)
+        tmp = np.concatenate((tmp1, tmp2))
+        arr = np.repeat(tmp, 10)
+        self.df = DataFrame(dict(a=arr, b=arr))
+        self.df = dd.from_pandas(self.df, npartitions=2)
+
+    def time_groupby_sum(self):
+        self.df.groupby(['a'])['b'].sum().compute()
+
+
+class groupby_period(object):
+    # GH 14338
+    goal_time = 0.2
+
+    def make_grouper(self, N):
+        return dd.from_pandas(
+            pd.Series(pd.period_range('1900-01-01', freq='D', periods=N)),
+            chunksize=N // 2)
+
+    def setup(self):
+        N = 10000
+        self.grouper = self.make_grouper(N)
+        self.df = pd.DataFrame(np.random.randn(N, 2))
+        self.df = dd.from_pandas(self.df, chunksize=N // 2)
+
+    def time_groupby_sum(self):
+        self.df.groupby(self.grouper).sum().compute()
+
+
+class groupby_datetime(groupby_period):
+    def make_grouper(self, N):
+        return dd.from_pandas(
+            pd.Series(pd.date_range('1900-01-01', freq='D', periods=N)),
+            chunksize=N // 2)
+
+
+class groupby_datetimetz(groupby_period):
+    def make_grouper(self, N):
+        return dd.from_pandas(
+            pd.Series(pd.date_range('1900-01-01', freq='D', periods=N,
+                                    tz='US/Central')),
+            chunksize=N // 2)
+
+# ----------------------------------------------------------------------
+# Series.value_counts
+
+
+class series_value_counts(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.s = Series(np.random.randint(0, 1000, size=100000))
+        self.s2 = self.s.astype(float)
+
+        self.K = 1000
+        self.N = 100000
+        self.uniques = tm.makeStringIndex(self.K).values
+        self.s3 = Series(np.tile(self.uniques, (self.N // self.K)))
+        self.s = dd.from_pandas(self.s, npartitions=2)
+        self.s2 = dd.from_pandas(self.s2, npartitions=2)
+        self.s3 = dd.from_pandas(self.s3, npartitions=2)
+
+    def time_value_counts_int64(self):
+        self.s.value_counts().compute()
+
+    def time_value_counts_float64(self):
+        self.s2.value_counts().compute()
+
+    def time_value_counts_strings(self):
+        self.s3.value_counts().compute()
+
+
+# ----------------------------------------------------------------------
+# pivot_table
+
+class groupby_pivot_table(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.fac1 = np.array(['A', 'B', 'C'], dtype='O')
+        self.fac2 = np.array(['one', 'two'], dtype='O')
+        self.ind1 = np.random.randint(0, 3, size=100000)
+        self.ind2 = np.random.randint(0, 2, size=100000)
+        self.df = DataFrame({'key1': self.fac1.take(self.ind1),
+                             'key2': self.fac2.take(self.ind2),
+                             'key3': self.fac2.take(self.ind2),
+                             'value1': np.random.randn(100000),
+                             'value2': np.random.randn(100000),
+                             'value3': np.random.randn(100000), })
+        self.df = dd.from_pandas(self.df, npartitions=2)
+
+    def time_groupby_pivot_table(self):
+        self.df.pivot_table(index='key1', columns='key2').compute()
+
+
+# ----------------------------------------------------------------------
+# Sum booleans #2692
+
+class groupby_sum_booleans(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 500
+        self.df = DataFrame({'ii': range(self.N),
+                             'bb': [True for x in range(self.N)], })
+        self.df = dd.from_pandas(self.df, npartitions=2)
+
+    def time_groupby_sum_booleans(self):
+        self.df.groupby('ii').sum().compute()
+
+
+# ----------------------------------------------------------------------
+# multi-indexed group sum #9049
+
+# class groupby_sum_multiindex(object):
+#     goal_time = 0.2
+
+#     def setup(self):
+#         self.N = 50
+#         self.df = DataFrame({'A': (list(range(self.N)) * 2),
+#                              'B': list(range((self.N * 2))),
+#                              'C': 1, }).set_index(['A', 'B'])
+
+#     def time_groupby_sum_multiindex(self):
+#         self.df.groupby(level=[0, 1]).sum()
+
+
+# ------------------------------------------------------------------------------
+# Transform testing
+
+
+# class groupby_transform_multi_key(object):
+#     goal_time = 0.2
+
+#     def setup(self):
+#         np.random.seed(2718281)
+#         self.n = 20000
+#         self.df = DataFrame(np.random.randint(1, self.n, (self.n, 3)),
+#                             columns=['jim', 'joe', 'jolie'])
+#         self.df = dd.from_pandas(self.df, npartitions=2)
+
+#     def time_groupby_transform_multi_key1(self):
+#         self.df.groupby(['jim', 'joe'])['jolie'].transform('max').compute()
+
+
+# class groupby_transform_multi_key2(object):
+#     goal_time = 0.2
+
+#     def setup(self):
+#         np.random.seed(2718281)
+#         self.n = 20000
+#         self.df = DataFrame(np.random.randint(1, self.n, (self.n, 3)),
+#                             columns=['jim', 'joe', 'jolie'])
+#         self.df['jim'] = self.df['joe']
+#         self.df = dd.from_pandas(self.df, npartitions=2)
+
+#     def time_groupby_transform_multi_key2(self):
+#         self.df.groupby(['jim', 'joe'])['jolie'].transform('max').compute()
+
+
+# class groupby_transform_multi_key3(object):
+#     goal_time = 0.2
+
+#     def setup(self):
+#         np.random.seed(2718281)
+#         self.n = 200000
+#         self.df = DataFrame(np.random.randint(1, (self.n / 10), (self.n, 3)),
+#                             columns=['jim', 'joe', 'jolie'])
+#         self.df = dd.from_pandas(self.df, npartitions=2)
+
+#     def time_groupby_transform_multi_key3(self):
+#         self.df.groupby(['jim', 'joe'])['jolie'].transform('max').compute()
+
+
+# class groupby_transform_multi_key4(object):
+#     goal_time = 0.2
+
+#     def setup(self):
+#         np.random.seed(2718281)
+#         self.n = 200000
+#         self.df = DataFrame(np.random.randint(1, (self.n / 10), (self.n, 3)),
+#                             columns=['jim', 'joe', 'jolie'])
+#         self.df['jim'] = self.df['joe']
+#         self.df = dd.from_pandas(self.df, npartitions=2)
+
+#     def time_groupby_transform_multi_key4(self):
+#         self.df.groupby(['jim', 'joe'])['jolie'].transform('max').compute()
+
+
+# class groupby_transform_series(object):
+#     goal_time = 0.2
+
+#     def setup(self):
+#         np.random.seed(0)
+#         self.N = 120000
+#         self.N_TRANSITIONS = 1400
+#         self.transition_points = np.random.permutation(
+#             np.arange(self.N))[:self.N_TRANSITIONS]
+#         self.transition_points.sort()
+#         self.transitions = np.zeros((self.N,), dtype=np.bool)
+#         self.transitions[self.transition_points] = True
+#         self.g = self.transitions.cumsum()
+#         self.df = DataFrame({'signal': np.random.rand(self.N), })
+#         self.df = dd.from_pandas(self.df, npartitions=2)
+
+#     def time_groupby_transform_series(self):
+#         self.df['signal'].groupby(self.g).transform(np.mean).compute()
+
+
+# class groupby_transform_series2(object):
+#     goal_time = 0.2
+
+#     def setup(self):
+#         np.random.seed(0)
+#         self.df = DataFrame({'id': (np.arange(100000) / 3),
+#                              'val': np.random.randn(100000), })
+#         self.df = dd.from_pandas(self.df, npartitions=2)
+
+#     def time_groupby_transform_series2(self):
+#         self.df.groupby('id')['val'].transform(np.mean).compute()
+
+
+# class groupby_transform_dataframe(object):
+#     # GH 12737
+#     goal_time = 0.2
+
+#     def setup(self):
+#         self.df = pd.DataFrame({'group': np.repeat(np.arange(1000), 10),
+#                                 'B': np.nan,
+#                                 'C': np.nan})
+#         self.df.ix[4::10, 'B':'C'] = 5
+#         self.df = dd.from_pandas(self.df, npartitions=2)
+
+#     # def time_groupby_transform_dataframe(self):
+#     #     self.df.groupby('group').transform('first').compute()
+
+
+# class groupby_transform_cythonized(object):
+#     goal_time = 0.2
+
+#     def setup(self):
+#         np.random.seed(0)
+#         self.df = DataFrame({'id': (np.arange(100000) / 3),
+#                              'val': np.random.randn(100000), })
+#         self.df = dd.from_pandas(self.df, npartitions=2)
+
+#     def time_groupby_transform_cumprod(self):
+#         self.df.groupby('id').cumprod().compute()
+
+#     def time_groupby_transform_cumsum(self):
+#         self.df.groupby('id').cumsum().compute()
+
+#     def time_groupby_transform_shift(self):
+#         self.df.groupby('id').shift().compute()
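+
+
+# A closing note on the timing convention in this suite (editorial sketch,
+# not a collected benchmark): dask operations are lazy, so a ``time_*``
+# method that omits ``.compute()`` measures only task-graph construction,
+# while one that calls it also measures scheduling and execution.  The
+# method and column names below are made up for illustration:
+#
+# def time_graph_construction(self):
+#     self.df.groupby('id').sum()             # builds the task graph; no work runs
+#
+# def time_graph_and_execution(self):
+#     self.df.groupby('id').sum().compute()   # builds the graph and executes it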