Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

From pandas #4

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
265 changes: 265 additions & 0 deletions dask/benchmarks/binary_ops.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,265 @@
from pandas import DataFrame, Series, date_range
import numpy as np
import pandas.computation.expressions as expr
import dask.dataframe as dd


class frame_add(object):
    """Benchmark ``+`` between two dask DataFrames built from random data."""

    goal_time = 0.2

    def setup(self):
        """Create two 20000 x 100 random frames, each split into 10 partitions."""
        shape = (20000, 100)
        left = DataFrame(np.random.randn(*shape))
        self.df = dd.from_pandas(left, npartitions=10)
        right = DataFrame(np.random.randn(*shape))
        self.df2 = dd.from_pandas(right, npartitions=10)

    def time_frame_add(self):
        """Evaluate ``df + df2``; ``.compute()`` is not called on the result."""
        self.df + self.df2


class frame_add_no_ne(object):
    """Benchmark DataFrame addition after ``expr.set_use_numexpr(False)``."""

    goal_time = 0.2

    def setup(self):
        """Create two 20000 x 100 random frames, then switch numexpr use off."""
        rows, cols = 20000, 100
        self.df = DataFrame(np.random.randn(rows, cols))
        self.df2 = DataFrame(np.random.randn(rows, cols))
        expr.set_use_numexpr(False)

    def time_frame_add_no_ne(self):
        """Evaluate ``df + df2`` and discard the result."""
        self.df + self.df2

    def teardown(self):
        """Switch numexpr use back on."""
        expr.set_use_numexpr(True)


class frame_add_st(object):
    """Benchmark DataFrame addition after ``expr.set_numexpr_threads(1)``."""

    goal_time = 0.2

    def setup(self):
        """Create two 20000 x 100 random frames, then request one numexpr thread."""
        rows, cols = 20000, 100
        self.df = DataFrame(np.random.randn(rows, cols))
        self.df2 = DataFrame(np.random.randn(rows, cols))
        expr.set_numexpr_threads(1)

    def time_frame_add_st(self):
        """Evaluate ``df + df2`` and discard the result."""
        self.df + self.df2

    def teardown(self):
        """Call ``expr.set_numexpr_threads()`` with no argument."""
        expr.set_numexpr_threads()


class frame_float_div(object):
    """Benchmark ``//`` between two 1000 x 1000 random float DataFrames.

    NOTE(review): despite the "div" in the name, the timed operator is
    floor division (``//``).
    """

    goal_time = 0.2

    def setup(self):
        """Create the two square operands from ``np.random.randn``."""
        side = 1000
        self.df = DataFrame(np.random.randn(side, side))
        self.df2 = DataFrame(np.random.randn(side, side))

    def time_frame_float_div(self):
        """Evaluate ``df // df2`` and discard the result."""
        self.df // self.df2


class frame_float_div_by_zero(object):
    """Benchmark dividing a 1000 x 1000 random float DataFrame by scalar 0."""

    goal_time = 0.2

    def setup(self):
        """Create the square float operand from ``np.random.randn``."""
        side = 1000
        self.df = DataFrame(np.random.randn(side, side))

    def time_frame_float_div_by_zero(self):
        """Evaluate ``df / 0`` and discard the result."""
        self.df / 0


class frame_float_floor_by_zero(object):
    """Benchmark floor-dividing a 1000 x 1000 random float DataFrame by 0."""

    goal_time = 0.2

    def setup(self):
        """Create the square float operand from ``np.random.randn``."""
        side = 1000
        self.df = DataFrame(np.random.randn(side, side))

    def time_frame_float_floor_by_zero(self):
        """Evaluate ``df // 0`` and discard the result."""
        self.df // 0


class frame_float_mod(object):
    """Benchmark element-wise modulo between two random float DataFrames."""

    goal_time = 0.2

    def setup(self):
        """Create two 1000 x 1000 operands from ``np.random.randn``."""
        self.df = DataFrame(np.random.randn(1000, 1000))
        self.df2 = DataFrame(np.random.randn(1000, 1000))

    def time_frame_float_mod(self):
        """Evaluate ``df % df2`` and discard the result."""
        # This benchmark is named "mod"; the body previously used ``/``,
        # so the reported timing measured true division instead of modulo.
        self.df % self.df2


class frame_int_div_by_zero(object):
    """Benchmark dividing an integer DataFrame by the scalar zero."""

    goal_time = 0.2

    def setup(self):
        """Fill a 1000 x 1000 frame with random integers spanning the int16 range."""
        bounds = np.iinfo(np.int16)
        # ``np.random.random_integers`` is deprecated in NumPy. ``randint``
        # excludes its upper bound, so ``max + 1`` keeps the range inclusive
        # exactly as before.
        values = np.random.randint(bounds.min, bounds.max + 1,
                                   size=(1000, 1000))
        self.df = DataFrame(values)

    def time_frame_int_div_by_zero(self):
        """Evaluate ``df / 0`` and discard the result."""
        self.df / 0


class frame_int_mod(object):
    """Benchmark element-wise modulo between two random integer DataFrames."""

    goal_time = 0.2

    def setup(self):
        """Fill two 1000 x 1000 frames with random integers spanning the int16 range."""
        bounds = np.iinfo(np.int16)
        # ``np.random.random_integers`` is deprecated in NumPy. ``randint``
        # excludes its upper bound, so ``max + 1`` keeps the range inclusive
        # exactly as before.
        low, high = bounds.min, bounds.max + 1
        self.df = DataFrame(np.random.randint(low, high, size=(1000, 1000)))
        self.df2 = DataFrame(np.random.randint(low, high, size=(1000, 1000)))

    def time_frame_int_mod(self):
        """Evaluate ``df % df2`` and discard the result."""
        # This benchmark is named "mod"; the body previously used ``/``,
        # so the reported timing measured true division instead of modulo.
        self.df % self.df2


class frame_mult(object):
    """Benchmark ``*`` between two 20000 x 100 random float DataFrames."""

    goal_time = 0.2

    def setup(self):
        """Create both operands from ``np.random.randn``."""
        shape = (20000, 100)
        self.df = DataFrame(np.random.randn(*shape))
        self.df2 = DataFrame(np.random.randn(*shape))

    def time_frame_mult(self):
        """Evaluate ``df * df2`` and discard the result."""
        self.df * self.df2


class frame_mult_no_ne(object):
    """Benchmark DataFrame multiplication after ``expr.set_use_numexpr(False)``."""

    goal_time = 0.2

    def setup(self):
        """Create two 20000 x 100 random frames, then switch numexpr use off."""
        shape = (20000, 100)
        self.df = DataFrame(np.random.randn(*shape))
        self.df2 = DataFrame(np.random.randn(*shape))
        expr.set_use_numexpr(False)

    def time_frame_mult_no_ne(self):
        """Evaluate ``df * df2`` and discard the result."""
        self.df * self.df2

    def teardown(self):
        """Switch numexpr use back on."""
        expr.set_use_numexpr(True)


class frame_mult_st(object):
    """Benchmark DataFrame multiplication after ``expr.set_numexpr_threads(1)``."""

    goal_time = 0.2

    def setup(self):
        """Create two 20000 x 100 random frames, then request one numexpr thread."""
        shape = (20000, 100)
        self.df = DataFrame(np.random.randn(*shape))
        self.df2 = DataFrame(np.random.randn(*shape))
        expr.set_numexpr_threads(1)

    def time_frame_mult_st(self):
        """Evaluate ``df * df2`` and discard the result."""
        self.df * self.df2

    def teardown(self):
        """Call ``expr.set_numexpr_threads()`` with no argument."""
        expr.set_numexpr_threads()


class frame_multi_and(object):
    """Benchmark boolean indexing with the AND of two ``> 0`` comparisons."""

    goal_time = 0.2

    def setup(self):
        """Create two 20000 x 100 random float frames."""
        rows, cols = 20000, 100
        self.df = DataFrame(np.random.randn(rows, cols))
        self.df2 = DataFrame(np.random.randn(rows, cols))

    def time_frame_multi_and(self):
        """Index ``df`` with ``(df > 0) & (df2 > 0)`` and discard the result."""
        both_positive = (self.df > 0) & (self.df2 > 0)
        self.df[both_positive]


class frame_multi_and_no_ne(object):
    """Benchmark AND-mask indexing after ``expr.set_use_numexpr(False)``."""

    goal_time = 0.2

    def setup(self):
        """Create two 20000 x 100 random frames, then switch numexpr use off."""
        rows, cols = 20000, 100
        self.df = DataFrame(np.random.randn(rows, cols))
        self.df2 = DataFrame(np.random.randn(rows, cols))
        expr.set_use_numexpr(False)

    def time_frame_multi_and_no_ne(self):
        """Index ``df`` with ``(df > 0) & (df2 > 0)`` and discard the result."""
        both_positive = (self.df > 0) & (self.df2 > 0)
        self.df[both_positive]

    def teardown(self):
        """Switch numexpr use back on."""
        expr.set_use_numexpr(True)


class frame_multi_and_st(object):
    """Benchmark AND-mask indexing after ``expr.set_numexpr_threads(1)``."""

    goal_time = 0.2

    def setup(self):
        """Create two 20000 x 100 random frames, then request one numexpr thread."""
        rows, cols = 20000, 100
        self.df = DataFrame(np.random.randn(rows, cols))
        self.df2 = DataFrame(np.random.randn(rows, cols))
        expr.set_numexpr_threads(1)

    def time_frame_multi_and_st(self):
        """Index ``df`` with ``(df > 0) & (df2 > 0)`` and discard the result."""
        both_positive = (self.df > 0) & (self.df2 > 0)
        self.df[both_positive]

    def teardown(self):
        """Call ``expr.set_numexpr_threads()`` with no argument."""
        expr.set_numexpr_threads()


class series_timestamp_compare(object):
    """Benchmark ``Series <= Timestamp`` on a minute-frequency datetime Series."""

    goal_time = 0.2

    def setup(self):
        """Build a 1,000,000-element Series and pick the element at ``N // 2 - 1``."""
        self.N = 1000000
        self.halfway = (self.N // 2) - 1
        # 'min' is the supported spelling of the minute alias; the
        # single-letter 'T' is deprecated in recent pandas releases.
        self.s = Series(date_range('20010101', periods=self.N, freq='min'))
        self.ts = self.s[self.halfway]

    def time_series_timestamp_compare(self):
        """Evaluate ``s <= ts`` and discard the result."""
        self.s <= self.ts


class timestamp_ops_diff1(object):
    """Benchmark ``Series.diff`` on a second-frequency datetime Series."""

    goal_time = 0.2
    N = 1000000

    def create(self):
        """Return a Series of ``N`` timestamps from 2001-01-01, one second apart."""
        stamps = date_range('20010101', periods=self.N, freq='s')
        return Series(stamps)

    def setup(self):
        """Store the Series produced by ``create`` on ``self.s``."""
        self.s = self.create()

    def time_timestamp_ops_diff1(self):
        """Call ``s.diff()`` and discard the result."""
        self.s.diff()

class timestamp_tz_ops_diff1(timestamp_ops_diff1):
    """Variant of ``timestamp_ops_diff1`` using 10000 US/Eastern timestamps."""

    N = 10000

    def create(self):
        """Return ``N`` tz-aware (US/Eastern) timestamps, one second apart."""
        stamps = date_range('20010101', periods=self.N, freq='s',
                            tz='US/Eastern')
        return Series(stamps)

class timestamp_ops_diff2(object):
    """Benchmark ``s - s.shift()`` on a second-frequency datetime Series."""

    goal_time = 0.2
    N = 1000000

    def create(self):
        """Return a Series of ``N`` timestamps from 2001-01-01, one second apart."""
        stamps = date_range('20010101', periods=self.N, freq='s')
        return Series(stamps)

    def setup(self):
        """Store the Series produced by ``create`` on ``self.s``."""
        self.s = self.create()

    def time_timestamp_ops_diff2(self):
        """Subtract the shifted Series from the Series and discard the result."""
        shifted = self.s.shift()
        self.s - shifted

class timestamp_tz_ops_diff2(timestamp_ops_diff2):
    """Variant of ``timestamp_ops_diff2`` using 10000 US/Eastern timestamps."""

    N = 10000

    def create(self):
        """Return ``N`` tz-aware (US/Eastern) timestamps, one second apart."""
        stamps = date_range('20010101', periods=self.N, freq='s',
                            tz='US/Eastern')
        return Series(stamps)

class timestamp_series_compare(object):
    """Benchmark ``Timestamp >= Series`` on a minute-frequency datetime Series."""

    goal_time = 0.2
    N = 1000000

    def setup(self):
        """Build the Series via ``create`` and pick the element at ``N // 2 - 1``."""
        self.halfway = (self.N // 2) - 1
        self.s = self.create()
        self.ts = self.s[self.halfway]

    def create(self):
        """Return a Series of ``N`` timestamps from 2001-01-01, one minute apart."""
        # 'min' is the supported spelling of the minute alias; the
        # single-letter 'T' is deprecated in recent pandas releases.
        return Series(date_range('20010101', periods=self.N, freq='min'))

    def time_timestamp_series_compare(self):
        """Evaluate ``ts >= s`` and discard the result."""
        self.ts >= self.s

class timestamp_tz_series_compare(timestamp_series_compare):
    """Variant of ``timestamp_series_compare`` using 10000 US/Eastern timestamps."""

    N = 10000

    def create(self):
        """Return ``N`` tz-aware (US/Eastern) timestamps, one minute apart."""
        # 'min' is the supported spelling of the minute alias; the
        # single-letter 'T' is deprecated in recent pandas releases.
        return Series(date_range('20010101', periods=self.N, freq='min',
                                 tz='US/Eastern'))
31 changes: 31 additions & 0 deletions dask/benchmarks/categoricals.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# See https://github.com/pandas-dev/pandas/blob/master/asv_bench/benchmarks/categoricals.py
import pandas as pd
import numpy as np
from pandas import Series
import dask.dataframe as dd


class concat_categorical(object):
    """Benchmark ``dd.concat`` of a categorical dask Series with itself."""

    goal_time = 0.2

    def setup(self):
        """Build a categorical Series from ``'aabbcd'`` repeated 1,000,000 times, in 10 partitions."""
        labels = list('aabbcd') * 1000000
        categorical = pd.Series(labels).astype('category')
        self.s = dd.from_pandas(categorical, npartitions=10)

    def time_concat_categorical_interleave(self):
        """Concatenate with ``interleave_partitions=True`` and compute the result."""
        pieces = [self.s, self.s]
        dd.concat(pieces, interleave_partitions=True).compute()


class categorical_value_counts(object):
    """Benchmark ``value_counts`` on a categorical dask Series."""

    goal_time = 1

    def setup(self):
        """Build 500000 seeded random ``'sNNNN'`` labels as a 10-partition categorical."""
        n = 500000
        np.random.seed(2718281)
        draws = np.random.randint(0, n // 10, size=n)
        arr = ['s%04d' % i for i in draws]
        categorical = Series(arr).astype('category')
        self.ts = dd.from_pandas(categorical, npartitions=10)

    def time_value_counts(self):
        """Compute the value counts of the categorical Series."""
        counts = self.ts.value_counts()
        counts.compute()
67 changes: 67 additions & 0 deletions dask/benchmarks/dataframe.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import string

import dask
import dask.dataframe as dd
import numpy as np
Expand Down Expand Up @@ -41,3 +43,68 @@ def test_repartition(self):

def test_quantile(self):
    """Compute the 0.25 quantile of ``self.data``."""
    lower_quartile = self.data.quantile(.25)
    lower_quartile.compute()


class TimeFloatConstructors(object):
    """Benchmark building dask DataFrames from float arrays and pandas frames."""

    def setup(self):
        """Create narrow (10000 x 10) and wide (10000 x 1000) float inputs."""
        rows = 10000
        self.floats = np.random.randn(rows, 10)
        self.wide_floats = np.random.randn(rows, 1000)
        self.floats_pandas = pd.DataFrame(self.floats)
        self.wide_floats_pandas = pd.DataFrame(self.wide_floats)

    def time_floats(self):
        """Call ``dd.from_array`` on the narrow float array."""
        dd.from_array(self.floats)

    def time_wide_floats(self):
        """Call ``dd.from_array`` on the wide float array."""
        dd.from_array(self.wide_floats)

    def time_floats_pandas(self):
        """Call ``dd.from_pandas`` on the narrow frame with 2 partitions."""
        dd.from_pandas(self.floats_pandas, npartitions=2)

    def time_wide_floats_pandas(self):
        """Call ``dd.from_pandas`` on the wide frame with 2 partitions."""
        dd.from_pandas(self.wide_floats_pandas, npartitions=2)


class TimeIntConstructors(object):
    """Benchmark building dask DataFrames from integer arrays and pandas frames."""

    def setup(self):
        """Create narrow (10000 x 10) and wide (10000 x 1000) inputs of ints in [0, 100)."""
        rows = 10000
        self.ints = np.random.randint(0, 100, size=(rows, 10))
        self.wide_ints = np.random.randint(0, 100, size=(rows, 1000))
        self.ints_pandas = pd.DataFrame(self.ints)
        self.wide_ints_pandas = pd.DataFrame(self.wide_ints)

    def time_ints(self):
        """Call ``dd.from_array`` on the narrow integer array."""
        dd.from_array(self.ints)

    def time_wide_ints(self):
        """Call ``dd.from_array`` on the wide integer array."""
        dd.from_array(self.wide_ints)

    def time_ints_pandas(self):
        """Call ``dd.from_pandas`` on the narrow frame with 2 partitions."""
        dd.from_pandas(self.ints_pandas, npartitions=2)

    def time_wide_ints_pandas(self):
        """Call ``dd.from_pandas`` on the wide frame with 2 partitions."""
        dd.from_pandas(self.wide_ints_pandas, npartitions=2)


class TimeObjectConstructors(object):
    """Benchmark building dask DataFrames from single-letter text inputs."""

    def setup(self):
        """Create narrow (10000 x 10) and wide (10000 x 1000) arrays of random ASCII letters."""
        rows = 10000
        self.text = np.random.choice(list(string.ascii_letters),
                                     size=(rows, 10))
        self.wide_text = np.random.choice(list(string.ascii_letters),
                                          size=(rows, 1000))
        self.text_pandas = pd.DataFrame(self.text)
        self.wide_text_pandas = pd.DataFrame(self.wide_text)

    def time_text(self):
        """Call ``dd.from_array`` on the narrow text array."""
        dd.from_array(self.text)

    def time_wide_text(self):
        """Call ``dd.from_array`` on the wide text array."""
        dd.from_array(self.wide_text)

    def time_text_pandas(self):
        """Call ``dd.from_pandas`` on the narrow frame with 2 partitions."""
        dd.from_pandas(self.text_pandas, npartitions=2)

    def time_wide_text_pandas(self):
        """Call ``dd.from_pandas`` on the wide frame with 2 partitions."""
        dd.from_pandas(self.wide_text_pandas, npartitions=2)
Loading