Skip to content

Commit baa1601

Browse files
authored Aug 20, 2017
Start adding an asv benchmark suite (geopandas#497)
1 parent a3ee99b commit baa1601

File tree

6 files changed

+388
-0
lines changed

6 files changed

+388
-0
lines changed
 

‎asv.conf.json

+151
Original file line numberDiff line numberDiff line change
{
    // The version of the config file format. Do not change, unless
    // you know what you are doing.
    "version": 1,

    // The name of the project being benchmarked
    "project": "geopandas",

    // The project's homepage
    "project_url": "http://geopandas.org/",

    // The URL or local path of the source code repository for the
    // project being benchmarked
    "repo": ".",

    // List of branches to benchmark. If not provided, defaults to "master"
    // (for git) or "default" (for mercurial).
    // "branches": ["master"], // for git
    // "branches": ["default"], // for mercurial

    // The DVCS being used. If not set, it will be automatically
    // determined from "repo" by looking at the protocol in the URL
    // (if remote), or by looking for special directories, such as
    // ".git" (if local).
    // "dvcs": "git",

    // The tool to use to create environments. May be "conda",
    // "virtualenv" or other value depending on the plugins in use.
    // If missing or the empty string, the tool will be automatically
    // determined by looking for tools on the PATH environment
    // variable.
    "environment_type": "conda",

    // timeout in seconds for installing any dependencies in environment
    // defaults to 10 min
    //"install_timeout": 600,

    // the base URL to show a commit for the project.
    "show_commit_url": "http://github.com/geopandas/geopandas/commit/",

    // The Pythons you'd like to test against. If not provided, defaults
    // to the current version of Python used to run `asv`.
    // "pythons": ["2.7", "3.3"],

    // The matrix of dependencies to test. Each key is the name of a
    // package (in PyPI) and the values are version numbers. An empty
    // list or empty string indicates to just test against the default
    // (latest) version. null indicates that the package is to not be
    // installed. If the package to be tested is only available from
    // PyPi, and the 'environment_type' is conda, then you can preface
    // the package name by 'pip+', and the package will be installed via
    // pip (with all the conda available packages installed first,
    // followed by the pip installed packages).
    //
    "matrix": {
        "pandas": [],
        "shapely": [],
        "cython": [],
        "fiona": [],
        "pyproj": [],
        "six": [],
        "rtree": [],
        "matplotlib": [],
        "descartes": []
    },
    // "numpy": ["1.6", "1.7"],
    // "six": ["", null], // test with and without six installed
    // "pip+emcee": [""], // emcee is only available for install with pip.
    // },

    // Combinations of libraries/python versions can be excluded/included
    // from the set to test. Each entry is a dictionary containing additional
    // key-value pairs to include/exclude.
    //
    // An exclude entry excludes entries where all values match. The
    // values are regexps that should match the whole string.
    //
    // An include entry adds an environment. Only the packages listed
    // are installed. The 'python' key is required. The exclude rules
    // do not apply to includes.
    //
    // In addition to package names, the following keys are available:
    //
    // - python
    //     Python version, as in the *pythons* variable above.
    // - environment_type
    //     Environment type, as above.
    // - sys_platform
    //     Platform, as in sys.platform. Possible values for the common
    //     cases: 'linux2', 'win32', 'cygwin', 'darwin'.
    //
    // "exclude": [
    //     {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows
    //     {"environment_type": "conda", "six": null}, // don't run without six on conda
    // ],
    //
    // "include": [
    //     // additional env for python2.7
    //     {"python": "2.7", "numpy": "1.8"},
    //     // additional env if run on windows+conda
    //     {"platform": "win32", "environment_type": "conda", "python": "2.7", "libpython": ""},
    // ],

    // The directory (relative to the current directory) that benchmarks are
    // stored in. If not provided, defaults to "benchmarks"
    // "benchmark_dir": "benchmarks",

    // The directory (relative to the current directory) to cache the Python
    // environments in. If not provided, defaults to "env"
    "env_dir": ".asv/env",

    // The directory (relative to the current directory) that raw benchmark
    // results are stored in. If not provided, defaults to "results".
    "results_dir": ".asv/results",

    // The directory (relative to the current directory) that the html tree
    // should be written to. If not provided, defaults to "html".
    "html_dir": ".asv/html",

    // The number of characters to retain in the commit hashes.
    // "hash_length": 8,

    // `asv` will cache wheels of the recent builds in each
    // environment, making them faster to install next time. This is
    // number of builds to keep, per environment.
    // "wheel_cache_size": 0

    // The commits after which the regression search in `asv publish`
    // should start looking for regressions. Dictionary whose keys are
    // regexps matching to benchmark names, and values corresponding to
    // the commit (exclusive) after which to start looking for
    // regressions. The default is to start from the first commit
    // with results. If the commit is `null`, regression detection is
    // skipped for the matching benchmark.
    //
    // "regressions_first_commits": {
    //    "some_benchmark": "352cdf", // Consider regressions only after this commit
    //    "another_benchmark": null,  // Skip regression detection altogether
    // }

    // The thresholds for relative change in results, after which `asv
    // publish` starts reporting regressions. Dictionary of the same
    // form as in ``regressions_first_commits``, with values
    // indicating the thresholds. If multiple entries match, the
    // maximum is taken. If no entry matches, the default is 5%.
    //
    // "regressions_thresholds": {
    //    "some_benchmark": 0.01,   // Threshold of 1%
    //    "another_benchmark": 0.5, // Threshold of 50%
    // }
}

‎benchmarks/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+

‎benchmarks/geom_methods.py

+104
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
import random
2+
3+
import numpy as np
4+
from geopandas import GeoSeries
5+
from shapely.geometry import Point, LineString, Polygon
6+
7+
8+
def with_attributes(**attrs):
    """Return a decorator that copies ``attrs`` onto the decorated function.

    The benchmark classes use this to attach asv metadata
    (``param_names``/``params``) to individual benchmark methods.
    """
    def apply_attrs(fn):
        fn.__dict__.update(attrs)
        return fn
    return apply_attrs
15+
16+
class Bench:
    """Benchmarks for GeoSeries geometry methods and properties."""

    def setup(self, *args):
        """Build the shared fixtures: a large point series, two aligned
        1000-triangle series, a 10000-triangle series, and one scalar
        triangle."""
        self.points = GeoSeries([Point(i, i) for i in range(100000)])

        def rand_triangle():
            # A polygon from three uniform-random points in the unit square.
            return Polygon([(random.random(), random.random())
                            for _ in range(3)])

        small = GeoSeries([rand_triangle() for _ in range(1000)])
        # A reshuffled (with replacement) copy, used as the "other" operand
        # in the vectorised benchmarks.
        reshuffled = small.copy().iloc[np.random.choice(1000, 1000)]
        big = GeoSeries([rand_triangle() for _ in range(10000)])

        self.triangles = small
        self.triangles2 = reshuffled
        self.triangles_big = big
        self.triangle = rand_triangle()

    @with_attributes(param_names=['op'],
                     params=[('contains', 'crosses', 'disjoint', 'intersects',
                              'overlaps', 'touches', 'within', 'geom_equals',
                              'geom_almost_equals', 'geom_equals_exact')])
    def time_binary_predicate(self, op):
        """Boolean predicate: series vs a single scalar geometry."""
        predicate = getattr(self.triangles, op)
        predicate(self.triangle)

    @with_attributes(param_names=['op'],
                     params=[('contains', 'crosses', 'disjoint', 'intersects',
                              'overlaps', 'touches', 'within', 'geom_equals',
                              'geom_almost_equals')])  # 'geom_equals_exact')])
    def time_binary_predicate_vector(self, op):
        """Boolean predicate: series vs aligned series
        (geom_equals_exact is left out of this variant)."""
        predicate = getattr(self.triangles, op)
        predicate(self.triangles2)

    @with_attributes(param_names=['op'],
                     params=[('distance')])
    def time_binary_float(self, op):
        """Float-valued binary op: series vs scalar geometry."""
        method = getattr(self.triangles, op)
        method(self.triangle)

    @with_attributes(param_names=['op'],
                     params=[('distance')])
    def time_binary_float_vector(self, op):
        """Float-valued binary op: series vs aligned series."""
        method = getattr(self.triangles, op)
        method(self.triangles2)

    @with_attributes(param_names=['op'],
                     params=[('difference', 'symmetric_difference', 'union',
                              'intersection')])
    def time_binary_geo(self, op):
        """Geometry-valued set operation: series vs scalar geometry."""
        method = getattr(self.triangles, op)
        method(self.triangle)

    @with_attributes(param_names=['op'],
                     params=[('difference', 'symmetric_difference', 'union',
                              'intersection')])
    def time_binary_geo_vector(self, op):
        """Geometry-valued set operation: series vs aligned series."""
        method = getattr(self.triangles, op)
        method(self.triangles2)

    @with_attributes(param_names=['op'],
                     params=[('is_valid', 'is_empty', 'is_simple', 'is_ring')])
    def time_unary_predicate(self, op):
        """Element-wise boolean property access."""
        getattr(self.triangles, op)

    @with_attributes(param_names=['op'],
                     params=[('area', 'length')])
    def time_unary_float(self, op):
        """Element-wise float property access, on the larger series."""
        getattr(self.triangles_big, op)

    @with_attributes(param_names=['op'],
                     params=[('boundary', 'centroid', 'convex_hull',
                              'envelope', 'exterior', 'interiors')])
    def time_unary_geo(self, op):
        """Element-wise geometry-valued property access."""
        getattr(self.triangles, op)

    def time_unary_geo_representative_point(self, *args):
        """representative_point is a method, not a property, so it gets
        its own benchmark."""
        self.triangles.representative_point()

    def time_geom_type(self, *args):
        self.triangles_big.geom_type

    def time_bounds(self, *args):
        self.triangles.bounds

    def time_unary_union(self, *args):
        self.triangles.unary_union

    def time_buffer(self, *args):
        self.points.buffer(2)
101+
102+
# TODO
103+
# project, interpolate, translate, rotate, scale, skew, explode
104+
# cx indexer

‎benchmarks/overlay.py

+45
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
from geopandas import GeoDataFrame, GeoSeries, read_file, datasets, overlay
2+
from shapely.geometry import Polygon
3+
4+
5+
class Countries:
    """Benchmark ``overlay`` between country polygons and buffered capitals."""

    # asv parameters: each benchmark runs once per overlay set-operation.
    param_names = ['op']
    params = [('intersection', 'union', 'identity', 'symmetric_difference',
               'difference')]

    def setup(self, *args):
        # Load the example datasets bundled with geopandas.
        world = read_file(datasets.get_path('naturalearth_lowres'))
        capitals = read_file(datasets.get_path('naturalearth_cities'))
        countries = world[['geometry', 'name']]
        # Reproject to EPSG:3395 and drop Antarctica.
        # NOTE(review): '+init=epsg:3395' is the legacy proj4 init string,
        # deprecated in pyproj >= 2 in favour of 'EPSG:3395' — confirm which
        # pyproj versions must be supported before changing it.
        countries = countries.to_crs('+init=epsg:3395')[
            countries.name != "Antarctica"]
        capitals = capitals.to_crs('+init=epsg:3395')
        # Replace the capital points with 500000-unit buffers so the
        # overlays are non-trivial polygon/polygon operations.
        capitals['geometry'] = capitals.buffer(500000)

        self.countries = countries
        self.capitals = capitals

    def time_overlay(self, op):
        # Result is discarded; only the runtime is measured.
        overlay(self.countries, self.capitals, how=op)
25+
26+
27+
class Small:
    """Benchmark ``overlay`` on a tiny hand-built pair of two-square frames."""

    # asv parameters: each benchmark runs once per overlay set-operation.
    param_names = ['op']
    params = [('intersection', 'union', 'identity', 'symmetric_difference',
               'difference')]

    def setup(self, *args):
        def square(x0, y0):
            # Axis-aligned 2x2 square with lower-left corner at (x0, y0).
            return Polygon([(x0, y0), (x0 + 2, y0),
                            (x0 + 2, y0 + 2), (x0, y0 + 2)])

        left = GeoSeries([square(0, 0), square(2, 2)])
        right = GeoSeries([square(1, 1), square(3, 3)])

        self.df1 = GeoDataFrame({'geometry': left, 'df1': [1, 2]})
        self.df2 = GeoDataFrame({'geometry': right, 'df2': [1, 2]})

    def time_overlay(self, op):
        overlay(self.df1, self.df2, how=op)

‎benchmarks/plotting.py

+57
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import random
2+
3+
from geopandas import GeoDataFrame, GeoSeries
4+
from shapely.geometry import Point, LineString, Polygon, MultiPolygon
5+
import numpy as np
6+
7+
8+
class Bench:
    """Plotting benchmarks across several geometry types."""

    # asv parameters: one run per geometry type (plus a mixed series).
    param_names = ['geom_type']
    params = [('Point', 'LineString', 'Polygon', 'MultiPolygon', 'mixed')]

    def setup(self, geom_type):
        """Build a GeoSeries of the requested geometry type, plus a
        GeoDataFrame with a random ``values`` column for the column plot.
        """
        if geom_type == 'Point':
            geoms = GeoSeries([Point(i, i) for i in range(1000)])
        elif geom_type == 'LineString':
            geoms = GeoSeries([LineString([(random.random(), random.random())
                                           for _ in range(5)])
                               for _ in range(100)])
        elif geom_type == 'Polygon':
            geoms = GeoSeries([Polygon([(random.random(), random.random())
                                        for _ in range(3)])
                               for _ in range(100)])
        elif geom_type == 'MultiPolygon':
            geoms = GeoSeries(
                [MultiPolygon([Polygon([(random.random(), random.random())
                                        for _ in range(3)])
                               for _ in range(3)])
                 for _ in range(20)])
        elif geom_type == 'mixed':
            # Start from points and overwrite random positions with lines
            # and polygons to get a heterogeneous series.
            g1 = GeoSeries([Point(i, i) for i in range(100)])
            g2 = GeoSeries([LineString([(random.random(), random.random())
                                        for _ in range(5)])
                            for _ in range(100)])
            g3 = GeoSeries([Polygon([(random.random(), random.random())
                                     for _ in range(3)])
                            for _ in range(100)])

            geoms = g1
            geoms.iloc[np.random.randint(0, 100, 50)] = g2
            geoms.iloc[np.random.randint(0, 100, 33)] = g3

        # (removed leftover debug print of geoms.geom_type.value_counts();
        # it polluted the asv console output on every setup call)

        df = GeoDataFrame({'geometry': geoms,
                           'values': np.random.randn(len(geoms))})

        self.geoms = geoms
        self.df = df

    def time_plot_series(self, *args):
        """Plain geometry plot of the series."""
        self.geoms.plot()

    def time_plot_values(self, *args):
        """Choropleth-style plot coloured by the random 'values' column."""
        self.df.plot(column='values')
57+

‎benchmarks/sjoin.py

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
import random
2+
3+
from geopandas import GeoDataFrame, GeoSeries, sjoin
4+
from shapely.geometry import Point, LineString, Polygon
5+
import numpy as np
6+
7+
8+
class Bench:
    """Spatial-join benchmarks: 1000 random triangles vs 10000 random points."""

    # asv parameters: one run per binary predicate passed to sjoin.
    param_names = ['op']
    params = [('intersects', 'contains', 'within')]

    def setup(self, *args):
        tri_geoms = GeoSeries(
            [Polygon([(random.random(), random.random()) for _ in range(3)])
             for _ in range(1000)])

        xs = np.random.random(10000)
        ys = np.random.random(10000)
        pt_geoms = GeoSeries([Point(x, y) for x, y in zip(xs, ys)])

        self.df1 = GeoDataFrame({'val1': np.random.randn(len(tri_geoms)),
                                 'geometry': tri_geoms})
        self.df2 = GeoDataFrame({'val1': np.random.randn(len(pt_geoms)),
                                 'geometry': pt_geoms})

    def time_sjoin(self, op):
        sjoin(self.df1, self.df2, op=op)

0 commit comments

Comments
 (0)
Please sign in to comment.