Skip to content

Commit

Permalink
Vendor cloudpickle 2.2.1
Browse files Browse the repository at this point in the history
All code now does "from tiledb.cloud._vendor import cloudpickle".

Hard to test because it needs this package to be in production and
cloudpickle can't pickle itself :(
  • Loading branch information
sgillies committed Nov 28, 2024
1 parent 9dd8695 commit 771a293
Show file tree
Hide file tree
Showing 61 changed files with 6,325 additions and 23 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,4 @@ venv.bak/

.idea
src/tiledb/cloud/version.py
.DS_Store
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ tests = [
"pytest-explicit",
"pytest-split",
"tiledbsoma",
"psutil",
]

[project.urls]
Expand All @@ -47,11 +48,12 @@ repository = "https://github.com/TileDB-Inc/TileDB-Cloud-Py"
requires = ["setuptools>=42", "wheel", "setuptools_scm>=6"]

[tool.pytest.ini_options]
explicit-only = ["bigfiles", "geospatial", "vcf"]
explicit-only = ["bigfiles", "geospatial", "vcf", "cloudpickle"]
markers = [
"bigfiles: tests that create and upload really big files",
"geospatial: tests that require the geospatial libraries",
"vcf: VCF tests that run on TileDB Cloud",
"cloudpickle: tests of vendored cloudpickle",
]
norecursedirs = ["tiledb/cloud"]

Expand Down
23 changes: 12 additions & 11 deletions src/tiledb/cloud/_common/pickle_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,12 @@
import sys
import types

import cloudpickle.cloudpickle as cpcp
import importlib_metadata
import numpy
import packaging.version as pkgver

from tiledb.cloud._vendor import cloudpickle


def patch_cloudpickle() -> None:
"""Make older cloudpickle versions able to unpickle new function pickles."""
Expand Down Expand Up @@ -57,7 +58,7 @@ def patch_cloudpickle() -> None:
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

try:
empty_cell_value = cpcp._empty_cell_value
empty_cell_value = cloudpickle.cloudpickle._empty_cell_value
except AttributeError:
# https://github.com/cloudpipe/cloudpickle/blob/v2.2.1/cloudpickle/cloudpickle.py#L692-L698
class _empty_cell_value:
Expand All @@ -67,12 +68,12 @@ class _empty_cell_value:
def __reduce__(cls):
return cls.__name__

_empty_cell_value.__module__ = cpcp.__name__
_empty_cell_value.__module__ = cloudpickle.cloudpickle.__name__
empty_cell_value = _empty_cell_value()
cpcp._empty_cell_value = empty_cell_value
cloudpickle.cloudpickle._empty_cell_value = empty_cell_value

try:
make_empty_cell = cpcp._make_empty_cell
make_empty_cell = cloudpickle.cloudpickle._make_empty_cell
except AttributeError:
# https://github.com/cloudpipe/cloudpickle/blob/v2.2.1/cloudpickle/cloudpickle.py#L772-L778
def make_empty_cell():
Expand All @@ -86,16 +87,16 @@ def make_empty_cell():
if not hasattr(types, "CellType"):
types.CellType = type(make_empty_cell())

if not hasattr(cpcp, "_make_cell"):
if not hasattr(cloudpickle.cloudpickle, "_make_cell"):
try:
cell_set = cpcp.cell_set
cell_set = cloudpickle.cloudpickle.cell_set
except AttributeError:
#
def cell_set(cell, value):
# We only support 3.7+.
cell.cell_contents = value

cpcp.cell_set = cell_set
cloudpickle.cloudpickle.cell_set = cell_set

# https://github.com/cloudpipe/cloudpickle/blob/v2.2.1/cloudpickle/cloudpickle.py#L392-L450
def _make_cell(value=empty_cell_value):
Expand All @@ -104,16 +105,16 @@ def _make_cell(value=empty_cell_value):
cell_set(cell, value)
return cell

cpcp._make_cell = _make_cell
cloudpickle.cloudpickle._make_cell = _make_cell

if not hasattr(cpcp, "_make_function"):
if not hasattr(cloudpickle.cloudpickle, "_make_function"):

def _make_function(code, globals, name, argdefs, closure):
# Setting __builtins__ in globals is needed for nogil CPython.
globals["__builtins__"] = __builtins__
return types.FunctionType(code, globals, name, argdefs, closure)

cpcp._make_function = _make_function
cloudpickle.cloudpickle._make_function = _make_function


def patch_pandas() -> None:
Expand Down
2 changes: 1 addition & 1 deletion src/tiledb/cloud/_common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@
from enum import Enum
from typing import Any, Callable, Optional, Tuple, Type, TypeVar, Union

import cloudpickle
import urllib3

from tiledb.cloud._common import functions
from tiledb.cloud._vendor import cloudpickle

TILEDB_CLOUD_PROTOCOL = 4
PYTHON_VERSION = ".".join(map(str, sys.version_info[:3]))
Expand Down
3 changes: 2 additions & 1 deletion src/tiledb/cloud/_results/codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@
from typing import TYPE_CHECKING, Any, Generic, Tuple, Type, TypeVar

import attrs
import cloudpickle
import pyarrow
import urllib3
from typing_extensions import Self, TypeGuard

from tiledb.cloud._vendor import cloudpickle

# This is a circular dependency since we need to be able to decode `tiledb_json`
# format data.
from . import tiledb_json
Expand Down
Empty file.
32 changes: 32 additions & 0 deletions src/tiledb/cloud/_vendor/cloudpickle/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
This module was extracted from the `cloud` package, developed by
PiCloud, Inc.

Copyright (c) 2015, Cloudpickle contributors.
Copyright (c) 2012, Regents of the University of California.
Copyright (c) 2009 PiCloud, Inc. http://www.picloud.com.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of California, Berkeley nor the
names of its contributors may be used to endorse or promote
products derived from this software without specific prior written
permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
182 changes: 182 additions & 0 deletions src/tiledb/cloud/_vendor/cloudpickle/PKG-INFO
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
Metadata-Version: 2.1
Name: cloudpickle
Version: 2.2.1
Summary: Extended pickling support for Python objects
Home-page: https://github.com/cloudpipe/cloudpickle
Author: Cloudpipe
Author-email: cloudpipe@googlegroups.com
License: BSD 3-Clause License
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: BSD License
Classifier: Operating System :: POSIX
Classifier: Operating System :: Microsoft :: Windows
Classifier: Operating System :: MacOS :: MacOS X
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: Implementation :: CPython
Classifier: Programming Language :: Python :: Implementation :: PyPy
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Topic :: Scientific/Engineering
Classifier: Topic :: System :: Distributed Computing
Requires-Python: >=3.6
Description-Content-Type: text/markdown
License-File: LICENSE

# cloudpickle

[![Automated Tests](https://github.com/cloudpipe/cloudpickle/workflows/Automated%20Tests/badge.svg?branch=master&event=push)](https://github.com/cloudpipe/cloudpickle/actions)
[![codecov.io](https://codecov.io/github/cloudpipe/cloudpickle/coverage.svg?branch=master)](https://codecov.io/github/cloudpipe/cloudpickle?branch=master)

`cloudpickle` makes it possible to serialize Python constructs not supported
by the default `pickle` module from the Python standard library.

`cloudpickle` is especially useful for **cluster computing** where Python
code is shipped over the network to execute on remote hosts, possibly close
to the data.

Among other things, `cloudpickle` supports pickling for **lambda functions**
along with **functions and classes defined interactively** in the
`__main__` module (for instance in a script, a shell or a Jupyter notebook).

Cloudpickle can only be used to send objects between the **exact same version
of Python**.

Using `cloudpickle` for **long-term object storage is not supported and
strongly discouraged.**

**Security notice**: one should **only load pickle data from trusted sources** as
otherwise `pickle.load` can lead to arbitrary code execution resulting in a critical
security vulnerability.


Installation
------------

The latest release of `cloudpickle` is available from
[pypi](https://pypi.python.org/pypi/cloudpickle):

pip install cloudpickle


Examples
--------

Pickling a lambda expression:

```python
>>> import cloudpickle
>>> squared = lambda x: x ** 2
>>> pickled_lambda = cloudpickle.dumps(squared)

>>> import pickle
>>> new_squared = pickle.loads(pickled_lambda)
>>> new_squared(2)
4
```

Pickling a function interactively defined in a Python shell session
(in the `__main__` module):

```python
>>> CONSTANT = 42
>>> def my_function(data: int) -> int:
...     return data + CONSTANT
...
>>> pickled_function = cloudpickle.dumps(my_function)
>>> depickled_function = pickle.loads(pickled_function)
>>> depickled_function
<function __main__.my_function(data:int) -> int>
>>> depickled_function(43)
85
```


Overriding pickle's serialization mechanism for importable constructs:
----------------------------------------------------------------------

An important difference between `cloudpickle` and `pickle` is that
`cloudpickle` can serialize a function or class **by value**, whereas `pickle`
can only serialize it **by reference**. Serialization by reference treats
functions and classes as attributes of modules, and pickles them through
instructions that trigger the import of their module at load time.
Serialization by reference is thus limited in that it assumes that the module
containing the function or class is available/importable in the unpickling
environment. This assumption breaks when pickling constructs defined in an
interactive session, a case that is automatically detected by `cloudpickle`,
that pickles such constructs **by value**.

Another case where the importability assumption is expected to break is when
developing a module in a distributed execution environment: the worker
processes may not have access to the said module, for example if they live on a
different machine than the process in which the module is being developed.
By itself, `cloudpickle` cannot detect such "locally importable" modules and
switch to serialization by value; instead, it relies on its default mode,
which is serialization by reference. However, since `cloudpickle 2.0.0`, one
can explicitly specify modules for which serialization by value should be used,
using the `register_pickle_by_value(module)`/`unregister_pickle_by_value(module)` API:

```python
>>> import cloudpickle
>>> import my_module
>>> cloudpickle.register_pickle_by_value(my_module)
>>> cloudpickle.dumps(my_module.my_function) # my_function is pickled by value
>>> cloudpickle.unregister_pickle_by_value(my_module)
>>> cloudpickle.dumps(my_module.my_function) # my_function is pickled by reference
```

Using this API, there is no need to re-install the new version of the module on
all the worker nodes nor to restart the workers: restarting the client Python
process with the new source code is enough.

Note that this feature is still **experimental**, and may fail in the following
situations:

- If the body of a function/class pickled by value contains an `import` statement:
```python
>>> def f():
...     from another_module import g
...     # calling f in the unpickling environment may fail if another_module
...     # is unavailable
...     return g() + 1
```

- If a function pickled by reference uses a function pickled by value during its execution.


Running the tests
-----------------

- With `tox`, to run the tests for all the supported versions of
Python and PyPy:

pip install tox
tox

or alternatively for a specific environment:

tox -e py37


- With `py.test` to only run the tests for your current version of
Python:

pip install -r dev-requirements.txt
PYTHONPATH='.:tests' py.test

History
-------

`cloudpickle` was initially developed by [picloud.com](http://web.archive.org/web/20140721022102/http://blog.picloud.com/2013/11/17/picloud-has-joined-dropbox/) and shipped as part of
the client SDK.

A copy of `cloudpickle.py` was included as part of PySpark, the Python
interface to [Apache Spark](https://spark.apache.org/). Davies Liu, Josh
Rosen, Thom Neale and other Apache Spark developers improved it significantly,
most notably to add support for PyPy and Python 3.

The aim of the `cloudpickle` project is to make that work available to a wider
audience outside of the Spark ecosystem and to make it easier to improve it
further, notably with the help of a dedicated non-regression test suite.
Loading

0 comments on commit 771a293

Please sign in to comment.