diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml deleted file mode 100644 index 665e279..0000000 --- a/.github/workflows/docs.yml +++ /dev/null @@ -1,66 +0,0 @@ -# Simple workflow for deploying static content to GitHub Pages -name: docs - -on: - # Runs on pushes targeting the default branch - push: - tags: - - '**' - - # Allows you to run this workflow manually from the Actions tab - workflow_dispatch: - -# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages -permissions: - contents: read - pages: write - id-token: write - -# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. -# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. -concurrency: - group: "pages" - cancel-in-progress: false - -jobs: - deploy: - environment: - name: github-pages - url: ${{ steps.deployment.outputs.page_url }} - runs-on: ubuntu-latest - - steps: - - name: Checkout - uses: actions/checkout@v3 - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: 3.9 - - - name: Install dependencies - run: | - ls -la - python -m pip install -r requirements.txt - python -m pip install -r requirements_doc.txt - pip list - - - name: make the sphinx docs - run: | - cd ./docs - make html - ls -la _build/html - - - name: Upload artifact - uses: actions/upload-pages-artifact@v2 - with: - # Upload entire repository - path: 'docs/_build/html' - - - name: Deploy to GitHub Pages - uses: actions/deploy-pages@v2 - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - branch: gh-pages - force: true - directory: docs/_build/html diff --git a/.github/workflows/github-actions.yml b/.github/workflows/github-actions.yml index 2698151..1a762ba 100644 --- a/.github/workflows/github-actions.yml +++ b/.github/workflows/github-actions.yml @@ -66,6 +66,7 @@ jobs: black ./${{ env.PYTHON_PACKAGE_NAME }} flake8 ./${{ env.PYTHON_PACKAGE_NAME }} mypy ./${{ 
env.PYTHON_PACKAGE_NAME }} --install-types --non-interactive --config-file pyproject.toml + doc8 ./docs env: PIP_DISABLE_PIP_VERSION_CHECK: 1 @@ -210,6 +211,50 @@ jobs: with: path: dist/*.tar.gz + build_docs: + needs: setup + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Load virtual environment + uses: actions/cache@v2 + with: + path: venv + key: ${{ runner.os }}-venv-${{ env.PYTHON_IMAGE }}-${{ hashFiles('requirements.txt', 'requirements_dev.txt') }} + - name: Make Sphinx docs + run: | + source venv/bin/activate + cd ./docs + make html + ls -la _build/html + - name: Upload artifact + uses: actions/upload-pages-artifact@v2 + with: + path: docs/_build/html + + publish_docs: + needs: build_docs + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + runs-on: ubuntu-latest + permissions: + contents: read + pages: write + id-token: write + concurrency: + group: "pages" + cancel-in-progress: false + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + steps: + - name: Deploy to GitHub Pages + uses: actions/deploy-pages@v2 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + branch: gh-pages + force: true + directory: docs/_build/html + upload_pypi: needs: [build_wheels, build_sdist] runs-on: ubuntu-latest diff --git a/doc8.ini b/doc8.ini new file mode 100644 index 0000000..101ec30 --- /dev/null +++ b/doc8.ini @@ -0,0 +1,3 @@ +[doc8] + +max-line-length=99 diff --git a/docs/conf.py b/docs/conf.py index 7a48e40..9493b78 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -95,7 +95,7 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". 
-html_static_path = ["_static"] +# html_static_path = ["_static"] # html_css_files = ["custom.css"] # -- Options for HTMLHelp output ------------------------------------------ diff --git a/docs/deker/collection_schema.rst b/docs/deker/collection_schema.rst new file mode 100755 index 0000000..7cc9534 --- /dev/null +++ b/docs/deker/collection_schema.rst @@ -0,0 +1,736 @@ +***************** +Collection Schema +***************** + + +Introduction +============ + +In some aspects Deker is similar to other database management systems. It has *collections* which +are equivalent to tables in relational databases or collections in MongoDB. + +Collection stores one of two flavors of *arrays* supported by Deker. We will look into the difference +between them later in this tutorial, but for now it is important to understand that *array* is +defined by the *schema* associated with the *collection* where it is stored. + +Collection *schema* consists of several components: + + * *Dimensions* schema defining number of array dimensions, their size, step and labels or + scales that simplify addressing particular dimension + * *Primary attributes* schema defines mandatory attributes that constitute unique identifier + of particular *array* that could be used to locate it inside the collection + * *Custom attributes* schema defines optional attributes that could be stored along the + particular *array* but could not be used to locate it + +.. attention:: + *Dimensions* and both *primary* and *custom attributes* schemas are **immutable**. Once you + have created a collection, you will only be able to manage *arrays* in it and modify their *custom + attributes* value. + + +Understanding Array Flavors +=========================== + +Two flavors of *arrays* supported by Deker are ``Array`` and ``VArray``. Those objects represent +core concept of Deker storage. Hereafter we will describe their structure, differences and +commonalities and give overview of when either of them should be used. 
+ + +Array +----- + +``Array`` is a wrapper over physical file containing actual array data. + +.. |cell| image:: images/cell.png + :scale: 5% + +Each array consists of individual cells |cell| containing singular data values. + +Let's consider a simple 3D array containing current weather data bound to some grid: + +.. image:: images/array_0_axes.png + :scale: 30% + +.. image:: images/legend.png + :scale: 28% + :align: right + +Let's assume that ``X`` and ``Y`` axes represent geographical grid, and ``Z`` axis represents +layers with particular weather characteristic values, as shown in the legend. + +In the illustration above single ``Array`` has 4 cells in each dimension, in other words its +*shape* is ``(4, 4, 4)``. + +Deker will store each ``Array`` data in a separate file, and when we retrieve this ``Array`` object +from ``Collection`` and access its data, all operations will affect this file only. + + +VArray +------ + +From the developer point of view ``VArray`` is almost indistinguishable from ``Array``. + +Like ``Array`` it has *dimensions*, *primary* and *custom attributes*, it is stored in *collection* +and all operations that could be performed on ``Array`` could be performed on ``VArray`` as well. + +But there is a significant difference in its implementation. + +Imagine that instead of data bound to 4x4 grid you need to store a high-resolution image of +something really huge like whole Earth surface satellite image. Let's say that size of such image +would be 300000x200000 px. If stored in single file it will produce large filesystem objects +that will impose limitations on concurrent read-write access thus impeding storage scalability. + +To optimize this type of data storage, Deker uses tiling, i.e. it splits large ``VArray`` objects +into series of smaller ``Array`` objects and transparently joins them for user access as a +virtual array. 
It probably would still be impossible to access this huge array as a whole, but it +enables efficient access to digestible parts of it piece by piece. + +.. image:: images/vgrid.png + :scale: 35% + +``VArray`` is a wrapper over such a set of files. In the image above you can see how it is split +into separate *tiles* (``Array`` objects) with regular *grid*. + +If ``Collection`` is defined to contain ``VArray`` objects, you don't have to worry about tiling, +Deker would transparently manage this for you under the hood. + +When some slice of data is queried from the ``VArray``, it automatically calculates which files +need to be opened to retrieve it and what part of the requested slice data bounds belongs to +each file. + +For example, let's consider ``VArray`` with dimensions ``['X', 'Y', 'Z']`` and shape ``(4, 4, 4)``, +with its *zero-index* at the front-left-bottom corner. + +.. image:: images/varray.png + :scale: 30% + +Let's query the following slice of it: ``[:, 2:4, :]`` + +.. image:: images/varray_request.png + :scale: 30% + +Here you can see that all 4 tile files will be affected, but only the highlighted pieces of them +will be actually read or written. All reads or writes to different files could be done in parallel. +In case you are retrieving data, Deker will transparently combine each read piece into a subset with +the requested shape and return it to you. If you use these bounds to write data, Deker will +automatically split the slice you have provided into pieces and write them in parallel to +corresponding files. + + +Dimensions Order +================ + +It is important to remember that all array dimensions have strict order which is significant for +your data storage design. + +Let's have a look at array image: + +.. image:: images/array_0_axes.png + :scale: 30% + +.. image:: images/legend.png + :scale: 28% + :align: right + +As usual, every array has just one *entrance point*. 
You cannot get inner data without passing +through outer layers, but there is only one *gate* for each layer. + +When you decide on the dimensions positioning, you shall understand and keep in mind your usual +querying patterns. Correct positioning will make the querying faster, a wrong one will slow it down. + +Assume that our *gates* are always at the front face, as shown by the arrows, and the dimensions +are arranged as ``['X', 'Y', 'Z']``: + +.. image:: images/array_0_arrows.png + :scale: 30% + +.. image:: images/legend.png + :scale: 28% + :align: right + +It means that when we query our data, in the first place we capture ``X`` dimension, then ``Y`` +dimension and only after that we can get to our weather data. As long as weather layers are under +the geo grid, such a sequence perfectly fits for querying a pack of weather data for some +geo points. + +But what if we place these dimensions in a different manner? + +.. image:: images/array_1_arrows.png + :scale: 30% + +.. image:: images/array_2_arrows.png + :scale: 30% + :align: right + +Now each geo point contains only one sort of information. Moreover, you can place these dimensions +in such a way that the weather layers become the first dimension, for example like +``['Z', 'Y', 'X']``. + +It entails that each of its cells contains all the geo-grid, and the queries become much slower. + +So, before positioning the dimensions, you'd better decide how you are going to query your data and +what order is the most suitable for such queries. + + +Dimensions Schemas +================== + +Each dimension shall have its ``size`` - a precise non-zero positive quantity of its cells with a +constant scalar step ``1``. + +We believe that every piece of data shall be described, otherwise it is just a number or a +meaningless symbol. Each dimension, regardless of its type, shall have at least a unique ``name``. + +.. 
note:: + The final sequence of your dimensions schemas represents the exact shape of the future + ``Array`` or ``VArray``. + + +Dimension Schema +---------------- + +Here is an example of ``DimensionSchema`` declaration:: + + from deker import DimensionSchema + + dimensions = [ + DimensionSchema(name="height", size=255), + DimensionSchema(name="width", size=512), + ] + +Even if you need an array with only one dimension, it shall still be defined as a list (or a tuple) +of dimension schemas:: + + dimension = ( + DimensionSchema(name="total_daily_income", size=366), + ) + +.. note:: + ``DimensionSchema`` is kept in the ``Collection`` metadata and converted into ``Dimension`` + object for each ``Array`` or ``VArray`` of such ``Collection``. + +All right, now we have a list of two dimensions, named ``"height"`` and ``"width"``. They have some +size, but what are the units? Is there any regular scale for their values? Definitely, there should +be. + + +Scale +~~~~~ + +If a dimension has a real regular scale, we may indicate it:: + + from deker import DimensionSchema, Scale + + dimensions = [ + DimensionSchema( + name="height", + size=255, + scale=Scale(start_value=0.0, step=0.01, name="meters") + ), + DimensionSchema( + name="width", + size=512, + scale={"start_value": 1.0, "step": 0.5} + ), + ] + +As you can see, regular scale can be defined either with Python ``dict`` or with Deker ``Scale`` +named tuple. The keyword ``name`` is optional. Scale values shall be always defined as ``floats``. + +The parameters ``step`` and ``start_value`` may be negative as well. For example, ``era5`` weather +model has a geo grid shaped ``(ys=721, xs=1440)`` with step ``0.25`` degrees per cell. The +zero-point of the ``map`` is north-west or left-upper corner. In other words ``era5`` grid point +``(0, 0)`` is set to coordinates ``(lat=90.0, lon=-180.0)``. 
+ +Here is an example of how this grid can be bound to real geographical coordinates in Deker:: + + dimensions = [ + DimensionSchema( + name="y", + size=721, + scale=Scale(start_value=90.0, step=-0.25, name="lat") + ), + DimensionSchema( + name="x", + size=1440, + scale={"start_value": -180.0, "step": 0.25, "name": "lon"} + ), + ] + +Now you can be sure that ``dimensions[0][0], dimensions[1][0]`` are bound to +``lat=90.0, lon=-180.0`` and ``dimensions[0][-1], dimensions[1][-1]`` are bound to +``lat=-90.0, lon=179.75`` and ``lat=0.0, lon=0.0`` can be found at +``dimensions[0][360], dimensions[1][720]``. + + +Labels +~~~~~~ + +If a dimension has no real regular scale, but there is still a certain logic in its values order, +we may use ``labels`` to describe it:: + + dimensions = [ + DimensionSchema( + name="weather_layers", + size=4, + labels=["temperature", "pressure", "wind_speed", "humidity"], + ), + ] + +You can provide not only a list of ``strings``, but a list (or a tuple) of ``floats`` as well. + +Both ``labels`` and ``scale`` provide a mapping of some reasonable information onto your data +cells. If ``labels`` is always a full sequence kept in metadata and in memory, ``scale`` is +calculated dynamically. + +As for the example with ``labels``, we can definitely state that calling index ``[0]`` will provide +temperature data, and index ``[2]`` will give us wind speed and nothing else. The same works for +scaled dimensions. For example, height index ``[1]`` will keep data relative to height +``0.01 meters`` and index ``[-1]`` - to height ``2.54 meters``. + +If you set some ``scale`` or ``labels`` for your dimensions, it will allow you to slice these +dimensions not only with ``integer``, but also with ``float`` and ``string`` (we will dive into it +in the section about fancy slicing). + + +Time Dimension Schema +--------------------- + +If you need to describe some time series you shall use ``TimeDimensionSchema``. + +.. 
note:: + ``TimeDimensionSchema`` is kept in the ``Collection`` metadata and converted into + ``TimeDimension`` object for each ``Array`` or ``VArray`` of such ``Collection``. + +``TimeDimensionSchema`` is an object, which is completely described by default, so it needs no +additional description. Thus, it allows you to slice ``TimeDimension`` with ``datetime`` objects +or ``float`` timestamps or even ``string`` (ISO 8601 formatted). + +Like ``DimensionSchema`` it has ``name`` and ``size``, but also it has its special arguments. + + +Start Value +~~~~~~~~~~~ + +Consider the following ``TimeDimensionSchema``:: + + from datetime import datetime, timedelta, timezone + from deker import TimeDimensionSchema + + dimensions = [ + TimeDimensionSchema( + name="dt", + size=8760, + start_value=datetime(2023, 1, 1, tzinfo=timezone.utc), + step=timedelta(hours=1) + ), + ] + +It covers all the hours in the year 2023 starting from 2023-01-01 00:00 to 2023-12-31 23:00 +(inclusively). + +Direct setting of the ``start_value`` parameter will make this date and time a **common +start point** for all the ``Array`` or ``VArray``. Sometimes it makes sense, but usually we want +to distinguish our data by individual time. In this case, it should be defined as follows:: + + dimensions = [ + TimeDimensionSchema( + name="dt", + size=8760, + start_value="$some_attribute_name", + step=timedelta(hours=1) + ), + ] + +A bit later you will get acquainted with ``AttributesSchema``, but for now it is important to note, +that providing ``start_value`` schema parameter with an **attribute name** starting with ``$`` will +let you set an individual start point for each new ``Array`` or ``VArray`` at its creation. + +.. attention:: + For ``start_value`` you can pass a datetime value with any timezone (e.g. your local timezone), + but you should remember that Deker converts and stores it in the UTC timezone. 
+ + Before querying some data from ``TimeDimension``, you should convert your local time to UTC to + be sure that you get a pack of correct data. You can do it with ``get_utc()`` function from + ``deker_tools.time`` module. + + +Step +~~~~ + +Unlike ordinary dimensions ``TimeDimensionSchema`` shall be provided with ``step`` value, which +shall be described as a ``datetime.timedelta`` object. You may certainly set any scale for it, +starting with microseconds, ending with weeks, it will become a mapping for the dimension scalar +indexes onto a certain datetime, which will let you slice it in a fancy way. + +.. note:: + + **Why are integers inapplicable to timestamps and to scale and labels values?** + + Integers are reserved for native Python indexing. + + If your **timestamp** is an integer - convert it to float. + If your **scale** ``start_value`` and ``step`` are integers - define them as corresponding floats. + If your **labels** are integers for some reason - convert them to strings or floats. + + +Attributes Schema +================= + +All databases provide some additional obligatory and/or optional information concerning data. For +example, in SQL there are primary keys which indicate that data cannot be inserted without passing +them. + +For this purpose Deker provides **primary** and **custom attributes** which shall be defined as a +list (or a tuple) of ``AttributeSchema``:: + + from deker import AttributeSchema + + attributes = [ + AttributeSchema( + name="some_primary_attribute", + dtype=int, + primary=True + ), + AttributeSchema( + name="some_custom_attribute", + dtype=str, + primary=False + ), + ] + +Here we defined a pack of attributes, which will be applied to each ``Array`` or ``VArray`` in our +``Collection``. Both of them have a ``name`` and a ``dtype`` of the values you are going to pass +later. + +Regardless their ``primary`` flag value, their names **must be unique**. 
Valid ``dtypes`` are the +following: + + * ``int`` + * ``float`` + * ``complex`` + * ``str`` + * ``tuple`` + * ``datetime.datetime`` + +The last point is that one of the attributes is ``primary`` and another is ``custom``. What does it +mean? + + +Primary Attributes +------------------ + +.. note:: + Attribute for ``TimeDimension`` ``start_value`` indication shall be dtyped ``datetime.datetime`` + and may be **primary**. + +.. attention:: + It is highly recommended to define at least one **primary** attribute in every schema. + +Primary attributes are a strictly ordered sequence. They are used for ``Array`` or ``VArray`` +filtering. When Deker is building its file system, it creates symlinks for main data files using +primary attributes in the symlink path. If you need to get a certain ``Array`` or ``VArray`` from a +``Collection``, you have two options how to do it: + + * pass its ``id``, + * or indicate **all** its primary attributes' values. + +.. attention:: + Values for all the primary attributes **must be passed** at every ``Array`` or ``VArray`` + creation. + + +Custom Attributes +----------------- + +.. note:: + Attribute for ``TimeDimension`` ``start_value`` indication shall be dtyped ``datetime.datetime`` + and may be **custom** as well. + +No filtering by custom attributes is available at the moment. They just provide some optional +information about your data. You can put there anything, that is not very necessary, but may be +helpful for the data managing. + +Custom attributes are the only mutable objects of the schema. It does not mean that you can change +the schema, add new attributes or remove old ones. It means that you can change their values (with +respect to the specified ``dtype``) if needed. You can also set their values to ``None``, except +the attributes dtyped ``datetime.datetime``. + +.. attention:: + Values for custom attributes **are optional for passing** at every ``Array`` or ``VArray`` + creation. 
+ + If nothing is passed for some or all of them, they are set to ``None``. + + This rule concerns all the custom attributes **except custom attributes dtyped** + ``datetime.datetime``. Values for custom attributes dtyped ``datetime.datetime`` **must be + passed** at every ``Array`` or ``VArray`` creation and **cannot be set to** ``None``. + +.. note:: + Defining ``AttributeSchemas`` is optional: you **may not set** any primary or custom attribute + (**except** attribute for ``TimeDimension.start_value`` indication). + + +Array and VArray Schemas +======================== + +Since you are now well informed about the dimensions and attributes, we are ready to move to the +arrays' schemas. Both ``ArraySchema`` and ``VArraySchema`` must be provided with a list of +dimensions schemas and ``dtype``. You may optionally pass a list of attributes schemas and +``fill_value`` to both of them. + + +Data Type +--------- + +Deker has a strong data typing. All the values of all the ``Array`` or ``VArray`` objects in one +``Collection`` shall be of the same data type. Deker accepts numeric data of the following NumPy +data types: + + * ``numpy.int8`` + * ``numpy.int16`` + * ``numpy.int32`` + * ``numpy.int64`` + * ``numpy.float16`` + * ``numpy.float64`` + * ``numpy.float128`` + * ``numpy.longfloat`` + * ``numpy.double`` + * ``numpy.longdouble`` + * ``numpy.complex64`` + * ``numpy.complex128`` + * ``numpy.complex256`` + * ``numpy.longcomplex`` + * ``numpy.longlong`` + +Python ``int``, ``float`` and ``complex`` are also acceptable. They are correspondingly converted +to ``numpy.int32``, ``numpy.float64`` and ``numpy.complex128``. + + +Fill Value +---------- + +Sometimes it happens that we have no values for some cells or we want to clear our data out in full +or in some parts. Unfortunately, NumPy does not allow you to set python ``None`` to such cells. +That's why we need something that will fill them in. + +Rules are the following: + +1. 
``fill_value`` **shall not be significant** for your data. +2. ``fill_value`` **is optional** - you may not provide it. In this case Deker will choose it + automatically based on the provided ``dtype``. For ``integer`` data types it will be the lowest + value for the correspondent data type bit capacity. For example, it will be ``-128`` for + ``numpy.int8``. For ``float`` data types (including ``complex``) it will be ``numpy.nan`` as + this type is also ``floating``. +3. If you would like to set it manually, ``fill_value`` shall be of the same data type that was + passed to the ``dtype`` parameter. If all the values of the correspondent ``dtype`` are + significant for you, you shall choose a data type of a greater bit capacity. For example, if all + the values in the range ``[-128; 128]`` are valid for your dataset, you'd better choose + ``numpy.int16`` instead of ``numpy.int8`` and set ``-129`` as ``fill_value`` or let Deker set + it automatically. The other workaround is to choose any floating data type, e.g. + ``numpy.float16``, and have ``numpy.nan`` as a ``fill_value``. 
+ +Now, let's create once again some simple dimensions and attributes for both types of schemas:: + + from deker import DimensionSchema, AttributeSchema + + dimensions = [ + DimensionSchema(name="y", size=100), + DimensionSchema(name="x", size=200), + ] + + attributes = [ + AttributeSchema(name="attr", dtype=str, primary=False) + ] + + +Array Schema +------------ + +Let's define schema for ``Collection`` of ``Array``:: + + from deker import ArraySchema + + array_schema = ArraySchema( + dimensions=dimensions, + attributes=attributes, + dtype=float, # will be converted and saved as numpy.float64 + # fill_value is not passed - will be numpy.nan + ) + + +VArray Schema +-------------- + +And schema of ``Collection`` of ``VArray``:: + + from deker import VArraySchema + + varray_schema = VArraySchema( + dimensions=dimensions, + dtype=np.int64, + fill_value=-99999, + vgrid=(50, 20) + # attributes are not passed as they are optional + ) + + +VArray Grid +~~~~~~~~~~~ + +Perhaps it is one of the most obscure issues. ``VArray`` shall be split into files, but it cannot +decide itself how it shall be done. It's up to you, how you are going to split your data. + +``vgrid`` parameter shall be defined as a tuple of integers which quantity shall be exactly similar +to the quantity of the dimensions. Its values shall divide ``VArray`` shape without remainders. + +Our schema has two dimensions with sizes ``100`` and ``200`` correspondingly, what tells us that +the ``VArray`` shape will be ``(100, 200)``. And we set ``vgrid`` as ``(50, 20)``. What shall +happen? No magic, just a simple math:: + + (100, 200) / (50, 20) = (2.0, 10.0) + +``(2, 10)`` - that will be the shape of all the ``Array``, produced by the ``VArray``. 
+ +If we do not want to divide any dimension into pieces and want to keep it in full size in all the +``Array``, we shall pass ``1`` in ``vgrid`` for that dimension:: + + (100, 200) / (1, 20) = (100.0, 10.0) + +Thus, the first dimension will retain its initial size for all the arrays, and their shape will be +``(100, 10)``. + +OK! Now we are finally ready to create our first database and we need ``Client``. + + +Creating Collection +=================== + +``Client`` is responsible for creating connections and its internal context. + +As far as Deker is a file-based database, you need to provide some path to the storage, where your +collections will be kept. + + +URI +--- + +There is a universal way to provide paths and connection options: an URI. + +The scheme of URI string for embedded Deker databases, stored on your local drive, is ``file://``. +It shall be followed by a path to the directory where the storage will be located. If this +directory (or even full path to it) does not exist, Deker will create it at ``Client`` +initialization. + +.. note:: + Relative paths are also applicable, but it is recommended to use absolute paths. + + *Explicit is better than implicit.* *Zen of Python:2* + +In this documentation we will use a reference to a temporary directory ``/tmp/deker``:: + + uri = "file:///tmp/deker" + + +Client +------ + +Now open the ``Client`` for interacting with Deker:: + + from deker import Client + + client = Client(uri) + +You can use it as a context manager as well:: + + with Client(uri) as client: + # some client calls here + +``Client`` opens its connections and inner context at its instantiation. If you use context +manager, it will close them automatically on exit. Otherwise the connections and context will +remain opened until you call ``client.close()`` directly. 
+ +If for some reason you need to open and close ``Client`` in different parts of your code, you may +define it only once and reuse it by calling a context manager:: + + client = Client(uri) + # call client here + client.close() + + with client: + # do more client calls here + + with client: + # and call it here as well + + +Putting Everything Together +--------------------------- + +Great! Now let's assemble everything from the above scope and create an ``Array`` collection of +some world-wide weather data:: + + from datetime import datetime, timedelta + + from deker import ( + TimeDimensionSchema, + DimensionSchema, + Scale, + AttributeSchema, + ArraySchema, + Client, + Collection + ) + + dimensions = [ + TimeDimensionSchema( + name="day_hours", + size=24, + start_value="$dt", + step=timedelta(hours=1) + ), + DimensionSchema( + name="y", + size=181, + scale=Scale(start_value=90.0, step=-1.0, name="lat") + ), + DimensionSchema( + name="x", + size=360, + scale=Scale(start_value=-180.0, step=1.0, name="lon") + ), + DimensionSchema( + name="weather", + size=4, + labels=["temperature", "humidity", "pressure", "wind_speed"] + ), + ] + + attributes = [ + AttributeSchema(name="dt", dtype=datetime, primary=True), + AttributeSchema(name="tm", dtype=int, primary=False), + ] + + array_schema = ArraySchema( + dimensions=dimensions, + attributes=attributes, + dtype=float, # will be converted and saved as numpy.float64 + # fill_value is not passed - will be numpy.nan + ) + + with Client(uri="file:///tmp/deker") as client: + collection: Collection = client.create_collection("weather", array_schema) + + print(collection) + + # Will output: + # + # weather + +**We did it!** + +Now there is a new path ``/tmp/deker/collections/weather`` on your local drive where Deker will +store the data relative to the ``Collection`` named ``weather``. 
Each ``Array`` will contain a pack +of daily 24-hours weather data for each entire latitude and longitude degree: ``temperature``, +``humidity``, ``pressure`` and ``wind_speed``. + diff --git a/docs/deker/data_access.rst b/docs/deker/data_access.rst new file mode 100755 index 0000000..d3801d7 --- /dev/null +++ b/docs/deker/data_access.rst @@ -0,0 +1,601 @@ +*********** +Data Access +*********** + + +Collections +=========== + + +Retrieving Collections +---------------------- + +Retrieving collections is ``Client`` responsibility as well as their creation. In the previous +chapter we created ``Collection`` named ``"weather"``. Now we are going to get it:: + + from deker import Client, Collection + + with Client("file:///tmp/deker") as client: + collection: Collection = client.get_collection("weather") + + print(collection) # weather + +If you have several collections on the same storage, you can iterate them with the ``Client``:: + + with Client("file:///tmp/deker") as client: + for collection in client: + print(collection) + +``Collection`` object has several useful properties and methods for self-managing:: + + with Client("file:///tmp/deker") as client: + collection: Collection = client.get_collection("weather") + + print(collection.name) + print(collection.array_schema) # returns schema of Array + print(collection.varray_schema) # returns schema of VArray if applicable, else None + print(collection.path) # returns physical storage path of the Collection + print(collection.as_dict) # serializes main information about Collection into + # dictionary + collection.clear() # removes all the Array and/or VArray objects from the + # storage, but retains the Collection metadata + collection.delete() # removes all the Array and/or VArray and the Collection + # metadata from the storage + + +Managers +-------- + +``Collection`` object has 3 kinds of managers to work with its contents: + + 1. ``default`` (or ``DataManager``) is ``Collection`` itself + 2. 
``Collection.arrays`` (or ``ArraysManager``) is a manager responsible for ``Array`` + 3. ``Collection.varrays`` (or ``VArraysManager``) is a manager responsible for ``VArray`` + (unavailable in ``Array`` collections). + +These managers are mixed with ``FilteredManager`` object and are responsible for creation and +filtering of the correspondent contents. All of them have the same interface. The default manager +is a preferred one. Having information about the ``Collection`` main schema, the default manager +decides what to create or to filter. If you have a ``VArray`` collection, it will create or filter +``VArray`` objects, if your collection is made of ``Array`` it will create or filter ``Array``. The +two others are made for direct filtering of ``Array`` or ``VArray``. + +Normally, you need the default one, and although the two others are public, we will not describe +them in this documentation. + + +Array Creation +-------------- + +Let's create our first Array:: + + from datetime import datetime + from deker import Array, Client, Collection + + with Client("file:///tmp/deker") as client: + collection: Collection = client.get_collection("weather") + array: Array = collection.create({"dt": datetime(2023, 1, 1, 0)}) + print(array) + +.. note:: + Let's assume that hereinafter all the ``datetime`` objects, including timestamps and ISO 8601 + strings are in **UTC timezone**. + +As you remember, our schema contains a ``TimeDimensionSchema`` and a **primary** attribute schema. +``TimeDimensionSchema.start_value`` was indicated as a reference to the ``AttributeSchema.name``, +which allowed you to set an individual time start point for each Array. That's why we passed +``{"dt": datetime(2023, 1, 1, 0)}`` to the method of creation, regardless of whether the attribute was +defined as ``primary`` or ``custom``. Now our ``Array`` knows the day and the hour when its data +time series starts. 
+ +If some other primary attributes were defined, values for them should have been included in this +dictionary. + +If no attributes are defined in the schema, the method shall be called without parameters: +``collection.create()``. + +When an ``Array`` or a ``VArray`` is created, it has a unique ``id`` which is a UUID string. +``Array`` and ``VArray`` IDs are generated automatically by different algorithms. So the +probability to get two same IDs tends to zero. + +Fine, we have our first ``Array`` in the ``Collection``. Do we have any changes in our storage? +Yes, we do. If you list it with:: + + ls -lh /tmp/deker/collections/weather + +You will find out that there are two directories named ``array_data`` and ``array_symlinks`` and a +file with the ``Collection`` metadata ``weather.json``. + +Listing these inner directories will tell you that you have an ``.hdf5`` file with the ``Array`` +UUID in its name. At the moment this file is almost empty. It contains just the ``Array`` metadata, +as we have not yet inserted any data in it. But it is created and ready to be used. + +Thus, we can create all the ``Array`` objects in advance without filling them in with any data and +retrieve them when we need. Let's prepare our database for January 2023:: + + from datetime import datetime, timedelta + from deker import Array, Client, Collection + + with Client("file:///tmp/deker") as client: + collection: Collection = client.get_collection("weather") + + for day in range(30): + start_point = datetime(2023, 1, 2, 0) + timedelta(days=day) + collection.create({"dt": start_point}) + +``Collection`` is an iterator, so we can get all its contents item by item:: + + with Client("file:///tmp/deker") as client: + collection: Collection = client.get_collection("weather") + for array in collection: + print(array) + +.. 
note:: + Everything, mentioned above in this section, is applicable to ``VArray`` as well, except that + ``VArray`` collection path will contain two more directories: ``varray_data`` and + ``varray_symlinks``. + + +Arrays Filtering +---------------- + +If we need to get a certain ``Array`` from the collection, we shall filter it out. As previously +stated, **primary** attributes allow you to find a certain ``Array`` or ``VArray`` in the +``Collection``. If no primary attribute is defined, you need either to know its ``id`` or to +iterate the ``Collection`` in order to find a particular ``Array`` or ``VArray`` until you get the +right one. + +.. attention:: + It is highly recommended to define at least one **primary** attribute in every schema. + +So you have two options how to filter an ``Array`` or ``VArray`` in a ``Collection``: + + 1. By ``id`` + 2. By primary attributes + +For example, we saved an ``id`` of some ``Array`` to a variable, let's create a filter:: + + from deker import Array, Client, Collection + from deker.managers import FilteredManager + + id = "9d7b32ee-d51e-5a0b-b2d9-9a654cb1991d" + + with Client("file:///tmp/deker") as client: + collection: Collection = client.get_collection("weather") + filter: FilteredManager = collection.filter({"id": id}) + +This ``filter`` is an instance of ``FilteredManager`` object, which is also lazy. It keeps the +parameters for filtering, but no job has been done yet. + +.. attention:: + There is no any query language or conditional matching for now, only strict matching is + available. + +The ``FilteredManager`` provides final methods for invocation of the filtered objects: + + * ``first()`` + * ``last()`` + +Since only strict matching is available, both of them will return the same. They are stubs for +the query language development. 
+
+Now let's filter some ``Array`` by the primary attribute::
+
+    with Client("file:///tmp/deker") as client:
+        collection: Collection = client.get_collection("weather")
+
+        filter_1: FilteredManager = collection.filter({"dt": datetime(2023, 1, 3, 0)})
+        filter_2: FilteredManager = collection.filter({"dt": datetime(2023, 1, 15, 0).isoformat()})
+
+        array_1: Array = filter_1.first()
+        array_2: Array = filter_2.last()
+        print(array_1)
+        print(array_2)
+        assert array_1.id != array_2.id
+
+As you see, attributes of ``datetime.datetime`` type can be filtered both by ``datetime.datetime``
+object as well as by its representation as ISO 8601 string.
+
+.. attention::
+    If your collection schema has **several** primary attributes, you must pass filtering values for
+    **all** of them!
+
+.. note::
+    Everything, mentioned above in this section, is applicable to VArray as well.
+
+
+Array and VArray
+================
+
+As previously stated, both ``Array`` and ``VArray`` objects have the same interface. 
+
+Their common **properties** are:
+
+    * ``id``: returns ``Array`` or ``VArray`` ID
+    * ``dtype``: returns type of the ``Array`` or ``VArray`` data
+    * ``shape``: returns ``Array`` or ``VArray`` shape as a tuple of dimension sizes
+    * ``named_shape``: returns ``Array`` or ``VArray`` shape as a tuple of dimension names bound to
+      their sizes
+    * ``dimensions``: returns a tuple of ``Array`` or ``VArray`` dimensions as objects
+    * ``schema``: returns ``Array`` or ``VArray`` low-level schema
+    * ``collection``: returns the name of ``Collection`` to which the ``Array`` is bound
+    * ``as_dict``: serializes main information about array into dictionary, prepared for JSON
+    * ``primary_attributes``: returns an ``OrderedDict`` of ``Array`` or ``VArray`` **primary**
+      attributes
+    * ``custom_attributes``: returns a ``dict`` of ``Array`` or ``VArray`` **custom** attributes
+
+``VArray`` has two extra properties:
+
+    * ``arrays_shape``: returns common shape of all the ``Array`` bound to the ``VArray``
+    * ``vgrid``: returns virtual grid (a tuple of integers) by which ``VArray`` is split into
+      ``Array``
+
+Their common methods are:
+
+    * ``read_meta()``: reads the ``Array`` or ``VArray`` metadata from storage
+    * ``update_custom_attributes()``: updates ``Array`` or ``VArray`` custom attributes values
+    * ``delete()``: deletes ``Array`` or ``VArray`` from the storage with all its data and metadata
+    * ``__getitem__()``: creates ``Subset`` from ``Array`` or ``VSubset`` from ``VArray``
+
+
+Updating Custom Attributes
+--------------------------
+
+Updating custom attributes is quite simple. As you remember, our schema contains one named ``tm``
+(timestamp) with ``int`` data type, and we have never defined its value. It means that it is set
+to ``None`` in each ``Array``. 
Let's check it and update them everywhere::
+
+    from deker import Array, Client, Collection
+    from deker.managers import FilteredManager
+
+    with Client("file:///tmp/deker") as client:
+        collection: Collection = client.get_collection("weather")
+        for array in collection:
+            print(array.custom_attributes) # {'tm': None}
+
+            # type shall be `int`
+            custom_attribute_value = int(array.primary_attributes["dt"].timestamp())
+            array.update_custom_attributes({'tm': custom_attribute_value})
+
+            print(array.custom_attributes)
+
+If there are many custom attributes and you want to update just one or several of them - no
+problem. Just pass a dictionary with values for the attributes you need to update. All the others
+will not be harmed and will keep their values.
+
+
+Fancy Slicing
+-------------
+
+It is our privilege and pleasure to introduce the **fancy slicing** of your data!
+
+We consider the ``__getitem__()`` method to be one of our pearls.
+
+Usually, you use integers for native Python and NumPy indexing and ``start``, ``stop`` and ``step``
+slicing parameters::
+
+    import numpy as np
+
+    python_seq = range(10)
+    np_seq = np.random.random((3, 4, 5))
+
+    print(python_seq[1], python_seq[3:], python_seq[3:9:2])
+    print(np_seq[2, 3, 4], np_seq[1:,:, 2], np_seq[:2, :, 1:4:2])
+
+.. attention::
+    If you are new to NumPy indexing, please, refer to the `official documentation`_
+
+.. _`official documentation`: https://numpy.org/doc/stable/user/basics.indexing.html
+
+Deker allows you to index and slice its ``Array`` and ``VArray`` not only with integers, but with
+the ``types`` by which the dimensions are described.
+
+But let's start with a **constraint**.
+
+
+Step
+~~~~
+
+Since a ``VArray`` is split in separate files, and each file can contain an ``Array`` with more
+than one dimension, the calculation of their inner bounds is a non-trivial problem.
+
+That's why the ``step`` parameter **is limited** to ``1`` for both ``Array`` and ``VArray``
+dimensions. 
This constraint is introduced to keep consistent behavior, although there is no
+such problem for ``Array``.
+
+A workaround for ``VArray`` would be to read your data and slice it again with steps, if you need,
+as it will be a ``numpy.ndarray``.
+
+
+Start and Stop
+~~~~~~~~~~~~~~
+
+As earlier mentioned, if your ``Dimensions`` have an additional description with ``scale`` or
+``labels`` you can get rid of index calculations and provide your ``scale`` or ``labels`` values
+to ``start`` and ``stop`` parameters.
+
+If you have a ``TimeDimension``, you can slice it with ``datetime.datetime`` objects, its ISO 8601
+formatted string or timestamps in the type of ``float``.
+
+.. attention::
+    Remember, that you shall convert your local timezone to UTC for proper ``TimeDimension`` slicing.
+
+Let's have a closer look::
+
+    from datetime import datetime
+    from deker import Array, Client, Collection
+
+    with Client("file:///tmp/deker") as client:
+        collection: Collection = client.get_collection("weather")
+
+        array: Array = collection.filter({"dt": datetime(2023, 1, 3, 0)}).first()
+
+        start_dt = datetime(2023, 1, 3, 5)
+        end_dt = datetime(2023, 1, 3, 10)
+
+        fancy_subset = array[
+            start_dt:end_dt, # step is timedelta(hours=1)
+            -44.0:-45.0, # y-scale start point is 90.0 and step is -1.0 (90.0 ... -90.0)
+            -1.0:1.0, # x-scale start point is -180.0 and step is 1.0 (-180.0 ... 179.0)
+            :"pressure" # captures just "temperature" and "humidity"
+        ]
+
+        # which is equivalent of:
+        subset = array[
+            5:10,
+            134:135,
+            179:181,
+            :2
+        ]
+
+        assert fancy_subset.shape == subset.shape
+        assert fancy_subset.bounds == subset.bounds
+
+It is great, if you can keep in mind all the indexes and their mappings, but this feature is
+awesome, isn't it?! Yes, it is!!!
+
+The values, passed to each dimension's index or slice, are converted to integers, and after that
+they are set in the native Python ``slice`` object. 
A ``tuple`` of such ``slices`` is the final +representation of the bounds which will be applied to your data. + +.. attention:: + Fancy index values must **exactly** match your dimension time, ``Scale`` or ``label`` values, + otherwise, you will get ``IndexError``. + +You have not yet approached your data, but you are closer and closer. + +Now you have a new object - `Subset`. + + +Subset and VSubset +================== + +``Subset`` and ``VSubset`` are the final lazy objects for the access to your data. + +Once created, they contain no data and do not access the storage until you manually invoke one of +their correspondent methods. + +.. note:: + If you need to read or write all the data from ``Array`` or ``VArray`` you should create a + subset with ``[:]`` or ``[...]``. + +Both of them also have the same interface. As for the properties, they are: + + * ``shape``: returns shape of the ``Subset`` or ``VSubset`` + * ``bounds``: returns bounds that were applied to ``Array`` or ``VArray`` + * ``dtype``: returns type of queried data + * ``fill_value``: returns value for empty cells + +Let's dive deeper into the methods. + +.. note:: + The explanations below are based on the logic, implemented for the ``HDF5`` format. + + +Read +---- + +Method ``read()`` gets data from the storage and returns a ``numpy.ndarray`` of the corresponding +``shape`` and ``dtype``. Regarding ``VArray`` data reading, ``VSubset`` will capture the data from +the ``Array``, affected by the passed bounds, arrange it in a single ``numpy.ndarray`` of the +proper ``shape`` and ``dtype`` and return it to you. 
+
+If your ``Array`` or ``VArray`` is **empty** - a ``numpy.ndarray`` filled with ``fill_value`` will
+be returned for any called ``Subset`` or ``VSubset``::
+
+    import numpy as np
+    from datetime import datetime
+    from deker import Array, Client, Collection
+
+    with Client("file:///tmp/deker") as client:
+        collection: Collection = client.get_collection("weather")
+        array: Array = collection.filter({"dt": datetime(2023, 1, 15, 0)}).first()
+        subset = array[0, 0, 0] # get first hour and grid zero-point
+        print(subset.read()) # [nan, nan, nan, nan]
+
+
+Update
+------
+
+Method ``update()`` is an **upsert** method, which is responsible for new values **inserting** and
+old values **updating**.
+
+The shape of the data that you pass into this method shall match the shape of the ``Subset`` or
+``VSubset``. It is impossible to insert 10 values into 9 cells. It is also impossible to insert
+them into 11 cells, as there are no instructions how to arrange them properly. ::
+
+    import numpy as np
+    from datetime import datetime
+    from deker import Array, Client, Collection
+
+    with Client("file:///tmp/deker") as client:
+        collection: Collection = client.get_collection("weather")
+        array: Array = collection.filter({"dt": datetime(2023, 1, 1, 0)}).first()
+        subset = array[:] # captures full array shape
+
+        data = np.random.random(subset.shape)
+
+        subset.update(data)
+
+The provided data ``dtype`` shall match the dtype of ``Array`` or ``VArray`` set by the schema or
+shall have the correspondent Python type to be converted into such ``dtype``::
+
+    with Client("file:///tmp/deker") as client:
+        collection: Collection = client.get_collection("weather")
+        array: Array = collection.filter({"dt": datetime(2023, 1, 1, 0)}).first()
+        subset = array[:] # captures full array shape
+
+        data = np.random.random(subset.shape).tolist() # converts data into Python list of Python floats
+
+        subset.update(data) # data will be converted to array.dtype
+
+If your ``Array`` or ``VArray`` is 
utterly empty, ``Subset`` or ``VSubset`` will create a
+``numpy.ndarray`` of the ``Array`` shape filled with the ``fill_value`` from the ``Collection``
+schema and then, using the indicated bounds, it will insert the data provided by you in this array.
+Afterwards it will be dumped to the storage. In the scope of ``VArray`` it will work in the same
+manner, except that only the corresponding affected inner ``Array`` will be created.
+
+If there is some data in your ``Array`` or ``VArray`` and you provide some new values by this
+method, the old values in the affected bounds will be substituted with new ones::
+
+    with Client("file:///tmp/deker") as client:
+        collection: Collection = client.get_collection("weather")
+        array: Array = collection.filter({"dt": datetime(2023, 1, 1, 0)}).first()
+
+        data = np.random.random(array.shape)
+        array[:].update(data)
+
+        subset = array[0, 0, 0] # get first hour and grid zero-point
+
+        print(subset.read()) # a list of 4 random values
+
+        new_values = [0.1, 0.2, 0.3, 0.4]
+        subset.update(new_values) # data will be converted to array.dtype
+
+        print(subset.read()) # [0.1, 0.2, 0.3, 0.4]
+
+
+Clear
+-----
+
+Method ``clear()`` inserts the ``fill_value`` into the affected bounds. If all your ``Array`` or
+``VArray`` values are ``fill_value``, it will be considered empty and the data set will be deleted
+from the file. 
But the file still exists and retains ``Array`` or ``VArray`` metadata:: + + import numpy as np + from datetime import datetime + from deker import Array, Client, Collection + + with Client("file:///tmp/deker") as client: + collection: Collection = client.get_collection("weather") + array: Array = collection.filter({"dt": datetime(2023, 1, 1, 0)}).first() + + data = np.random.random(array.shape) + array[:].update(data) + + subset = array[0, 0, 0] # get first hour and grid zero-point + + print(subset.read()) # a list of 4 random values + + new_values = [0.1, 0.2, 0.3, 0.4] + subset.update(new_values) # data will be converted to array.dtype + print(subset.read()) # [0.1, 0.2, 0.3, 0.4] + + subset.clear() + print(subset.read()) # [nan, nan, nan, nan] + + array[:].clear() + print(array[:].read()) # a numpy.ndarray full of `nans` + + +Describe +-------- + +You may want to check, what part of data you are going to manage. + +With ``describe()`` you can get an ``OrderedDict`` with a description of the dimensions parts +affected by ``Subset`` or ``VSubset``. If you provided ``scale`` and/or ``labels`` for your +dimensions, you will get the human-readable description, otherwise you'll get indexes. 
+ +So it is highly recommended to describe your dimensions:: + + from datetime import datetime + from deker import Array, Client, Collection + from pprint import pprint + + with Client("file:///tmp/deker") as client: + collection: Collection = client.get_collection("weather") + array: Array = collection.filter({"dt": datetime(2023, 1, 1, 0)}).first() + + pprint(array[0, 0, 0].describe()) + + # OrderedDict([('day_hours', + # [datetime.datetime(2023, 1, 1, 0, 0, tzinfo=datetime.timezone.utc)]), + # ('y', [90.0]), + # ('x', [-180.0]), + # ('weather', ['temperature', 'humidity', 'pressure', 'wind_speed'])]) + + subset = array[datetime(2023, 1, 1, 5):datetime(2023, 1, 1, 10), + -44.0:-45.0, + -1.0:1.0, + :"pressure"] + + pprint(subset.describe()) + + # OrderedDict([('day_hours', + # [datetime.datetime(2023, 1, 1, 5, 0, tzinfo=datetime.timezone.utc), + # datetime.datetime(2023, 1, 1, 6, 0, tzinfo=datetime.timezone.utc), + # datetime.datetime(2023, 1, 1, 7, 0, tzinfo=datetime.timezone.utc), + # datetime.datetime(2023, 1, 1, 8, 0, tzinfo=datetime.timezone.utc), + # datetime.datetime(2023, 1, 1, 9, 0, tzinfo=datetime.timezone.utc)]), + # ('y', [-44.0]), + # ('x', [-1.0, 0.0]), + # ('weather', ['temperature', 'humidity'])]) + +.. attention:: + Description is an ``OrderedDict`` object, having in values full ranges of descriptive data for + ``Subset`` or ``VSubset``. If you keep this description in memory, your memory will be lowered + by its size. + + +Read Xarray +----------- + +.. warning:: + ``xarray`` package is not in the list of the Deker default dependencies. Please, refer to the + Installation_ chapter for more details + +Xarray_ is a wonderful project, which provides special objects for working with multidimensional +data. Its main principle is *the data shall be described*. We absolutely agree with that. + +Method ``read_xarray()`` describes a ``Subset`` or ``VSubset``, reads its contents and converts it +to ``xarray.DataArray`` object. 
+ +If you need to convert your data to ``pandas`` objects, or to ``netCDF``, or to ``ZARR`` - use this +method and after it use methods, provided by ``xarray.DataArray``:: + + import numpy as np + from datetime import datetime + from deker import Array, Client, Collection + + with Client("file:///tmp/deker") as client: + collection: Collection = client.get_collection("weather") + array: Array = collection.filter({"dt": datetime(2023, 1, 1, 0)}).first() + + data = np.random.random(array.shape) + array[:].update(data) + + subset = array[0, 0, 0] # get first hour and grid zero-point + + x_subset: xarray.DataArray = subset.read_xarray() + + print(dir(x_subset)) + print(type(x_subset.to_dataframe())) + print(type(x_subset.to_netcdf())) + print(type(x_subset.to_zarr())) + +It provides even more opportunities. Refer to ``xarray.DataArray`` API_ for details . + +.. _Installation: installation.html#extra-dependencies +.. _Xarray: https://docs.xarray.dev/en/stable/ +.. _API: https://docs.xarray.dev/en/stable/generated/xarray.DataArray.html diff --git a/docs/deker/data_management.rst b/docs/deker/data_management.rst deleted file mode 100755 index dc370ce..0000000 --- a/docs/deker/data_management.rst +++ /dev/null @@ -1,566 +0,0 @@ -.. currentmodule:: deker - -**************** -Data management -**************** - -Collections -============== - -Retrieving collections ------------------------- -Retrieving collections is ``Client's`` responsibility as well as their creation. -In the previous chapter we created ``Collection`` named ``weather``. 
Now we are going to get it:: - - from deker import Client, Collection - - with Client("file:///tmp/deker") as client: - collection: Collection = client.get_collection("weather") - print(collection) # weather - -If you have several collections on the same storage, you can iterate them with the ``Client``:: - - from deker import Client - - with Client("file:///tmp/deker") as client: - for collection in client: - print(collection) - -``Collection`` object has several useful properties and methods for self-managing:: - - from deker import Client, Collection - - with Client("file:///tmp/deker") as client: - collection: Collection = client.get_collection("weather") - - print(collection.name) - print(collection.array_schema) # returns schema of Arrays - print(collection.varray_schema) # returns schema of VArrays if applicable, else None - print(collection.path) # returns storage path to the Collection - print(collection.as_dict) # serializes main information about Collection into dictionary, prepared for JSON - - collection.clear() # removes all the Arrays or VArrays from the storage, but retains the collection metadata - collection.delete() # removes all the Arrays or VArrays and the collection metadata from the storage - - -Managers ---------- -``Collection`` object has 3 kinds of managers to work with its contents: - -1. ``default`` (or ``DataManager``) is ``Collection`` itself. -2. ``Collection.arrays`` (or ``ArraysManager``) is a manager responsible for ``Arrays`` -3. ``Collection.varrays`` (or ``VArraysManager``) is a manager responsible for ``VArrays`` (unavailable in - ``Arrays'`` collections). - -These managers are mixed with ``FilteredManager`` object and are responsible for creation and filtering -of the correspondent contents. All of them have the same interface. The default manager is a preferred -one. Having information about the ``Collection`` main schema, the default manager decides what to create or to filter. 
-If you have a ``VArrays`` collection, it will create or filter ``VArrays``, if your collection is made of ``Arrays`` -it will create or filter ``Arrays``. The two others are made for direct filtering of ``Arrays`` or ``VArrays``. - -Normally, you need the default one, and although the two others are public, we will not describe them in this -documentation. - -Arrays creation --------------------- -Let's create a first Array:: - - from datetime import datetime - from deker import Array, Client, Collection - - with Client("file:///tmp/deker") as client: - collection: Collection = client.get_collection("weather") - array: Array = collection.create({"dt": datetime(2023, 1, 1, 0)}) - print(array) - -.. note:: Let's assume that hereinafter all the ``datetime`` objects, including timestamps and iso-strings, - represent **UTC timezone**. - -As you remember, our schema contains a ``TimeDimensionSchema`` and a **primary** attribute schema. -``TimeDimensionSchema.start_value`` was indicated as a reference to the ``AttributeSchema.name``, what allowed you -to set an individual time start point for each Array. That's why we passed -``{"dt": datetime(2023, 1, 1, 0)}`` to the method of creation, nevertheless if the attribute was defined as -``primary`` or ``custom``. Now our ``Array`` knows the day and the hour when its data time series starts. - -If some other primary attributes were defined, values for them should have been included in this -dictionary. - -If no attributes are defined in the schema, the method shall be called without parameters: -``collection.create()`` - -When an Array or a VArray is created, it has a unique ``id`` which is a UUID-string. ``Array's`` and ``VArray's`` ids -are generated automatically by different algorithms. So the probability to get two similar ids tends to zero. - -Fine, we have our first ``Array`` in the ``Collection``. Do we have any changes in our storage? Yes, we do. 
-If you list it with - -:: - - ls -lh /tmp/deker/collections/weather - -you will find out that there are two directories named ``array_data`` and ``array_symlinks`` and a file with the -``Collection`` metadata ``weather.json``. - -Listing these inner directories will tell you that you have an ``.hdf5`` file with the ``Array's`` UUID in its -name. At the moment this file is almost empty. It contains just the ``Array's`` metadata, as we have not yet -inserted any data in it. But it is created and ready to be used. - -Thus, we can create all the ``Arrays`` in advance without filling them in with any data and retrieve them when we need. -Let's prepare our database for January, 2023:: - - from datetime import datetime, timedelta - from deker import Array, Client, Collection - - with Client("file:///tmp/deker") as client: - collection: Collection = client.get_collection("weather") - - for day in range(30): - start_point = datetime(2023, 1, 2, 0) + timedelta(days=day) - collection.create({"dt": start_point}) - -``Collection`` is an iterator, so we can get all its contents item by item:: - - from deker import Array, Client, Collection - - with Client("file:///tmp/deker") as client: - collection: Collection = client.get_collection("weather") - for array in collection: - print(array) - -.. note:: Everything, mentioned above in this section, is applicable to VArray as well, except that a ``VArray`` - collection path will contain two more directories: ``varray_data`` and ``varray_symlinks``. - -Arrays filtering --------------------- -If we need to get a certain Array from the collection, we shall filter it out. -As previously stated, **primary** attributes allow you to find a certain ``Array`` or ``VArray`` in the ``Collection``. -If no primary attribute is defined, you need either to know its ``id`` or to iterate the ``Collection`` in order -to find a particular ``Array`` or ``VArray`` until you get the right one. - -.. 
attention:: It is highly recommended to define at least one **primary** attribute in every schema. - -So you have two options how to filter a ``Array`` or ``VArray`` in a ``Collection``: - -1. by ``id`` -2. by ``primary`` attributes - -For example, we saved an ``id`` of some ``Array`` to a variable, let's create a filter:: - - from deker import Array, Client, Collection - from deker.managers import FilteredManager - - id = "9d7b32ee-d51e-5a0b-b2d9-9a654cb1991d" - - with Client("file:///tmp/deker") as client: - collection: Collection = client.get_collection("weather") - filter: FilteredManager = collection.filter({"id": id}) - -This ``filter`` is an instance of ``FilteredManager`` object, which is also lazy. It keeps the parameters for -filtering, but no job has been done yet. - -.. attention:: - | There is no any query language or conditional matching for now. - | Only straight matching is available. - - **But we are working on it.** - -The ``FilteredManager`` provides final methods for invocation of the filtered objects: - -- ``first`` -- ``last`` - -Since only straight matching is available, both of them will return the same. They are stubs for the query -language development. 
:: - - from deker import Array, Client, Collection - from deker.managers import FilteredManager - - id = "9d7b32ee-d51e-5a0b-b2d9-9a654cb1991d" - - with Client("file:///tmp/deker") as client: - collection: Collection = client.get_collection("weather") - filter: FilteredManager = collection.filter({"id": id}) - array: Array = filter.first() - print(array) - assert array.id == filter.last().id - -Now let's filter some Array by the primary attribute:: - - from deker import Array, Client, Collection - from deker.managers import FilteredManager - - - with Client("file:///tmp/deker") as client: - collection: Collection = client.get_collection("weather") - - filter_1: FilteredManager = collection.filter({"dt": datetime(2023, 1, 3, 0)}) - filter_2: FilteredManager = collection.filter({"dt": datetime(2023, 1, 15, 0).isoformat()}) - - array_1: Array = filter_1.first() - array_2: Array = filter_2.last() - print(array_1) - print(array_2) - assert array_1.id != array_1.id - -As you see, attributes, dtyped as ``datetime.datetime``, can be filtered both by ``datetime.datetime`` object as well -as by its native iso-string. - -.. attention:: If your collection schema has **several** schemas of the primary attributes, you shall pass filtering - values for **all** of them! - -.. note:: Everything, mentioned above in this section, is applicable to VArray as well. - -Arrays and VArrays -===================== -As previously stated, both ``Array`` and ``VArray`` objects have the same interface. 
- -Their common **properties** are: - -- ``id``: returns ``Array's`` or ``VArray's`` id -- ``dtype``: returns type of the ``Array's`` or ``VArray's`` data -- ``shape``: returns ``Array's`` or ``VArray's`` shape as a tuple of dimension sizes -- ``named_shape``: returns ``Array's`` or ``VArray's`` shape as a tuple of dimension names bound to their sizes -- ``dimensions``: returns a tuple of ``Array's`` or ``VArray's`` dimensions as objects -- ``schema``: returns ``Array's`` or ``VArray's`` low-level schema -- ``collection``: returns the name of ``Collection`` to which the ``Array`` is bound -- ``as_dict``: serializes main information about array into dictionary, prepared for JSON -- ``primary_attributes``: returns an ``OrderedDict`` of ``Array's`` or ``VArray's`` **primary** attributes -- ``custom_attributes``: returns a ``dict`` of ``Array's`` or ``VArray's`` **custom** attributes - -``VArray`` has two extra properties: - -- ``arrays_shape``: returns common shape of all the ``Arrays`` bound to the ``VArray`` -- ``vgrid``: returns virtual grid (a tuple of integers) by which ``VArray`` is split into ``Arrays`` - -Their common common methods are: - -- ``read_meta()``: reads the ``Array's`` or ``VArray's`` metadata from storage -- ``update_custom_attributes()``: updates ``Array's`` or ``VArray's`` custom attributes values -- ``delete()``: deletes ``Array`` or ``VArray`` from the storage with all its data and metadata -- ``__getitem__()``: creates ``Subset`` from ``Array`` or ``VSubset`` from ``VArray`` - -Updating custom attributes ----------------------------- -Updating custom attributes is quite simple. As you remember, our schema contains one named ``tm`` (timestamp) -with ``int`` dtype, and we have never defined its value. It means, that it is set to ``None`` in each ``Array``. 
-Let's check it and update them everywhere:: - - from deker import Array, Client, Collection - from deker.managers import FilteredManager - - - with Client("file:///tmp/deker") as client: - collection: Collection = client.get_collection("weather") - for array in collection: - print(array.custom_attributes) # {'tm': None} - - custom_attribute_value = int(array.primary_attributes["dt"].timestamp())) # type shall be `int` - array.update_custom_attributes({'tm': custom_attribute_value}) - - print(array.custom_attributes) - -If there are many custom attributes and you want to update just one or several of them - no problem. -Just pass a dictionary with values for the attributes you need to update. All the others will not be harmed and -will keep their values. - -Fancy slicing --------------- -| It is our privilege and pleasure to introduce the **fancy slicing** of your data! -| We consider the ``__getitem__()`` method to be one of our pearls. - -Usually, you use integers for native Python and Numpy indexing and ``start``, ``stop`` and ``step`` slicing -parameters:: - - import numpy as np - - python_seq = range(10) - np_seq = np.random.random((3, 4, 5)) - - print(python_seq[1], python_seq[3:], python_seq[3:9:2]) - print(np_seq[2, 3, 4], np_seq[1:,:, 2], np_seq[:2, :, 1:4:2]) - -.. attention:: If you are new to NumPy indexing, please, refer to the `official documentation`_ - -.. _`official documentation`: https://numpy.org/doc/stable/user/basics.indexing.html - -Deker allows you to index and slice its ``Arrays`` and ``VArrays`` not only with integers, but with the ``types`` -by which the dimensions are described. - -But let's start with a **constraint**. - -Step -~~~~~~ -Since a ``VArray`` is split in separate files, and each file can contain an array made of more than one dimension, -the calculation of their inner bounds is a non-trivial problem. - -That's why the ``step`` parameter **is limited** to ``1`` for both ``Arrays`` and ``VArrays`` dimensions. 
This -constraint is introduced to keep consistent behaviour, *although that there is no such a problem for Arrays*. - -Moreover, we doubt that such feature is necessary. You may read your data and slice it again with steps, -if you need, as it will be a ``numpy.ndarray``. - -.. note:: We are definitely open for any ideas of solving the problem of the ``VArray`` *inner bounds with different - steps* calculation. Please, open your PRs! - -Start and Stop -~~~~~~~~~~~~~~~ -As earlier mentioned, if your ``Dimensions`` have an additional description with ``scale`` or ``labels`` you can get -rid of indexes calculations and provide your ``scale`` or ``labels`` values to ``start`` and ``stop`` parameters. - -If you have a ``TimeDimension``, you can slice it with ``datetime.datetime`` objects, its native iso-string -representation or timestamps in the type of ``float``. - -.. attention:: Remember, that you shall convert your local timezone to UTC for proper ``TimeDimension`` slicing. - -Let's have a closer look:: - - from datetime import datetime - from deker import Array, Client, Collection - - with Client("file:///tmp/deker") as client: - collection: Collection = client.get_collection("weather") - array: Array = collection.filter({"dt": datetime(2023, 1, 3, 0)}).first() - - fancy_subset = array[ - datetime(2023, 1, 3, 5):datetime(2023, 1, 3, 10), # step is timedelta(hours=1) - -44.0:-45.0, # y-scale start point is 90.0 and step is -1.0 (90.0 ... -90.0) - -1.0:1.0, # x-scale start point is -180.0 and step is 1.0 (-180.0 ... 179.0) - :"pressure" # captures just "temperature" and "humidity" - ] - # it is absolutely equal to - subset = array[5:10, 134:135, 179:181, :2] - - assert fancy_subset.shape == subset.shape - assert fancy_subset.bounds == subset.bounds - -It is great, if you can keep in mind all the indexes and their mappings, but this feature awesome, isn't it?! -Yes, it is!!! 
- -The values, passed to each dimension's index or slice, are converted to integers, and after that they are set in -the native Python ``slice`` object. A ``tuple`` of such ``slices`` is the final representation of the bounds which will be -applied to your data. - -.. warning:: *Fancy* values shall **exactly** match your datetime and scaling parameters and ``labels`` - values! **Otherwise, you will get** ``IndexError``. - -You have not yet approached your data, but you are closer and closer. - -Now you have a new object - `Subset`. - -Subsets and VSubsets -===================== -``Subset`` and ``VSubset`` are the final lazy objects for the access to your data. - -Once created, they contain no data and do not access the storage until you manually invoke one of their -correspondent methods. - -.. note:: If you need to get and manage all the data from the ``Array`` or ``VArray`` you should create a - subset with ``[:]`` or ``[...]``. - -Both of them also have the same interface. As for the properties, they are: - -- ``shape``: returns shape of the Subset or VSubset -- ``bounds``: returns bounds that were applied to Array or VArray -- ``dtype``: returns type of queried data -- ``fill_value``: returns value for *empty* cells - -Let's dive deeper into the methods. - -.. note:: The explanations below are based on the logic, implemented for the ``HDF5`` format. - -Read ------- -Method ``read()`` gets data from the storage and returns a ``numpy.ndarray`` of the corresponding ``shape`` and -``dtype``. Regarding ``VArray`` data reading, ``VSubset`` will capture the data from the ``Arrays``, affected by -the passed bounds, arrange it in a single ``numpy.ndarray`` of the proper ``shape`` and ``dtype`` and return it to you. 
- -If your ``Array`` or ``VArray`` is **empty** - a ``numpy.ndarray`` filled with ``fill_value`` will be returned for -any called ``Subset`` or ``VSubset``:: - - import numpy as np - from datetime import datetime - from deker import Array, Client, Collection - - with Client("file:///tmp/deker") as client: - collection: Collection = client.get_collection("weather") - array: Array = collection.filter({"dt": datetime(2023, 1, 15, 0)}).first() - subset = array[0, 0, 0] # get first hour and grid zero-point - print(subset.read()) # [nan, nan, nan, nan] - -Update -------- -Method ``update()`` is an **upsert** method, which is responsible for new values **inserting** and old -values **updating**. - -The shape of the data, that you pass into this method, shall match the shape of the ``Subset`` or ``VSubset``. It is -impossible to insert 10 values into 9 cells. It is also impossible to insert them into 11 cells, as there are no -instructions how to arrange them properly. :: - - import numpy as np - from datetime import datetime - from deker import Array, Client, Collection - - with Client("file:///tmp/deker") as client: - collection: Collection = client.get_collection("weather") - array: Array = collection.filter({"dt": datetime(2023, 1, 1, 0)}).first() - subset = array[:] # captures full array shape - - data = np.random.random(subset.shape) - - subset.update(data) - -The provided data ``dtype`` shall match the dtype of ``Array`` or ``VArray`` set by the schema or shall have the -correspondent Python type to be converted into such dtype:: - - import numpy as np - from datetime import datetime - from deker import Array, Client, Collection - - with Client("file:///tmp/deker") as client: - collection: Collection = client.get_collection("weather") - array: Array = collection.filter({"dt": datetime(2023, 1, 1, 0)}).first() - subset = array[:] # captures full array shape - - data = np.random.random(subset.shape).tolist # converts data into Python list of Python floats - - 
subset.update(data) # data will be converted to array.dtype - -If your ``Array`` or ``VArray`` is utterly empty, ``Subset`` or ``VSubset`` will create a ``numpy.ndarray`` of the -``Array`` shape filled with the ``fill_value`` from the ``Collection`` schema and than, using the indicated bounds, -it will insert the data provided by you in this array. Afterwards it will be dumped to the storage. In the scope of -``VArrays`` it will work in the same manner, except that only corresponding affected inner ``Arrays`` will be created. - -If there is some data in your ``Array`` or ``VArray`` and you provide some new values by this method, the old values -in the affected bounds will be substituted with new ones:: - - import numpy as np - from datetime import datetime - from deker import Array, Client, Collection - - with Client("file:///tmp/deker") as client: - collection: Collection = client.get_collection("weather") - array: Array = collection.filter({"dt": datetime(2023, 1, 1, 0)}).first() - - data = np.random.random(array.shape) - array[:].update(data) - - subset = array[0, 0, 0] # get first hour and grid zero-point - - print(subset.read()) # a list of 4 random values - - new_values = [0.1, 0.2, 0.3, 0.4] - subset.update(new_values) # data will be converted to array.dtype - - print(subset.read()) # [0.1, 0.2, 0.3, 0.4] - -Clear ------- -Method ``clear()`` inserts the ``fill_value`` into the affected bounds. If all your ``Array's`` or ``VArray's`` values -are ``fill_value``, it will be concerned empty and the dataset will be deleted from the file. But the file still -exists and retains ``Array's`` or ``VArray's`` metadata. 
:: - - import numpy as np - from datetime import datetime - from deker import Array, Client, Collection - - with Client("file:///tmp/deker") as client: - collection: Collection = client.get_collection("weather") - array: Array = collection.filter({"dt": datetime(2023, 1, 1, 0)}).first() - - data = np.random.random(array.shape) - array[:].update(data) - - subset = array[0, 0, 0] # get first hour and grid zero-point - - print(subset.read()) # a list of 4 random values - - new_values = [0.1, 0.2, 0.3, 0.4] - subset.update(new_values) # data will be converted to array.dtype - print(subset.read()) # [0.1, 0.2, 0.3, 0.4] - - subset.clear() - print(subset.read()) # [nan, nan, nan, nan] - - array[:].clear() - print(array[:].read()) # a numpy.ndarray full of `nans` - -Describe ---------- -You may want to check, what part of data you are going to manage. - -With ``describe()`` you can get an ``OrderedDict`` with a description of the dimensions' parts affected -by ``Subset`` or ``VSubset``. If you provided ``scale`` and/or ``labels`` for your dimensions, you will get the -human-readable description, otherwise you'll get indexes. - -So it is highly recommended to describe your dimensions. 
:: - - from datetime import datetime - from deker import Array, Client, Collection - from pprint import pprint - - with Client("file:///tmp/deker") as client: - collection: Collection = client.get_collection("weather") - array: Array = collection.filter({"dt": datetime(2023, 1, 1, 0)}).first() - - pprint(array[0, 0, 0].describe()) # OrderedDict([('day_hours', - # [datetime.datetime(2023, 1, 1, 0, 0, tzinfo=datetime.timezone.utc)]), - # ('y', [90.0]), - # ('x', [-180.0]), - # ('weather', ['temperature', 'humidity', 'pressure', 'wind_speed'])]) - - subset = array[datetime(2023, 1, 1, 5):datetime(2023, 1, 1, 10), -44.0:-45.0, -1.0:1.0, :"pressure"] - pprint(subset.describe()) # OrderedDict([('day_hours', - # [datetime.datetime(2023, 1, 1, 5, 0, tzinfo=datetime.timezone.utc), - # datetime.datetime(2023, 1, 1, 6, 0, tzinfo=datetime.timezone.utc), - # datetime.datetime(2023, 1, 1, 7, 0, tzinfo=datetime.timezone.utc), - # datetime.datetime(2023, 1, 1, 8, 0, tzinfo=datetime.timezone.utc), - # datetime.datetime(2023, 1, 1, 9, 0, tzinfo=datetime.timezone.utc)]), - # ('y', [-44.0]), - # ('x', [-1.0, 0.0]), - # ('weather', ['temperature', 'humidity'])]) - -.. attention:: - Description is an ``OrderedDict`` object, having in values full ranges of descriptive data for ``Subset`` or - ``VSubset``. If you keep this description in memory, your memory will be lowered by its size. - -Read Xarray ------------- -.. _Xarray: https://docs.xarray.dev/en/stable/ -.. _Installation: installation.html#extra-dependencies - -.. warning:: ``xarray`` package is not in the list of the Deker default dependencies. - - Please, refer to the Installation_ chapter for more details - -Xarray_ is a wonderful project, which provides special objects for working with multidimensional data. -Its main principle is *the data shall be described*. We absolutely agree with that. - -Method ``read_xarray()`` describes a ``Subset`` or ``VSubset``, reads its contents and converts it to -``xarray.DataArray`` object. 
- -If you need to convert your data to ``pandas`` objects, or to ``netCDF``, or to ``ZARR`` - use this method and after it -use methods, provided by ``xarray.DataArray``:: - - import numpy as np - from datetime import datetime - from deker import Array, Client, Collection - - with Client("file:///tmp/deker") as client: - collection: Collection = client.get_collection("weather") - array: Array = collection.filter({"dt": datetime(2023, 1, 1, 0)}).first() - - data = np.random.random(array.shape) - array[:].update(data) - - subset = array[0, 0, 0] # get first hour and grid zero-point - - x_subset: xarray.DataArray = subset.read_xarray() - - print(dir(x_subset)) - print(type(x_subset.to_dataframe())) - print(type(x_subset.to_netcdf())) - print(type(x_subset.to_zarr())) - -It provides even more opportunities. Refer to ``xarray.DataArray`` API_ for details . - -.. _API: https://docs.xarray.dev/en/stable/generated/xarray.DataArray.html diff --git a/docs/deker/fine_tuning.rst b/docs/deker/fine_tuning.rst index 3367071..165ae2d 100755 --- a/docs/deker/fine_tuning.rst +++ b/docs/deker/fine_tuning.rst @@ -1,80 +1,85 @@ -.. currentmodule:: deker - -************* -Fine tuning -************* +*********** +Fine Tuning +*********** This chapter is dedicated to advanced settings and features provided by Deker. + Client -========= +====== + +In addition to the URI parameter ``Client`` accepts several options, that you may want or need to +tune. All of them shall be explicitly passed as keyword parameters, none of them is positional. -In addition to the URI parameter ``Client`` accepts several options, that you may want or need to tune. -All of them shall be explicitly passed as ``keyword`` parameters, none of them is positional. -executor ---------- -Deker creates its own ``ThreadPoolExecutor`` instance for working with ``VArrays``. By default, this parameter -is ``None``. 
You may want to use your own ``ThreadPoolExecutor`` (or some **custom** executor, based on -``ThreadPoolExecutor``) instance. In this case Deker will use the passed one:: +``executor`` +------------ + +Deker creates its own ``ThreadPoolExecutor`` instance for working with ``VArray``. By default, this +parameter is ``None``. You may want to use your own ``ThreadPoolExecutor`` (or some custom +executor, based on ``ThreadPoolExecutor``) instance. In this case Deker will use the passed one:: from deker import Client client = Client(uri, executor=) -.. note:: None executor is initialized and used if you work with a ``Collection`` of ``Arrays``. - The executor, passed by you, will be ignored. +.. note:: + No executor is initialized and used if you work with a ``Collection`` of ``Array``. The executor, + passed by you, will be ignored. -.. attention:: When Client is closed your executor will not be shutdown, you shall do it manually. +.. attention:: + When ``Client`` is closed your executor will not be shut down, you shall do it manually. -.. caution:: Be aware of the probable threads stuck when using this feature! It may happen because your - executor may simultaneously manage some outer tasks and may have insufficient number of free threads to resolve - all the stuff. -workers ---------- +``workers`` +----------- + This is a parameter for the native Deker executor mentioned above. -By default, it is ``None`` and in this case Deker uses the maximum number of threads from the formula, -provided by `Python 3.9 documentation`_ : ``cpu_count() + 4`` +By default, it is ``None`` and in this case Deker uses the maximum number of threads from the +formula, provided by `Python 3.9 documentation`_ : ``cpu_count() + 4``. You may increase or reduce it, if you need:: - from deker import Client client = Client(uri, workers=8) .. 
_Python 3.9 documentation: https://docs.python.org/3.9/library/concurrent.futures.html#concurrent.futures.ThreadPoolExecutor -write_lock_timeout --------------------- -Deker uses its own file-locks for different operations, one of which is **writing**. -With ``write_lock_timeout`` you can modify an amount of seconds during which a parallel writing process waits -for the release of the locked file:: + +``write_lock_timeout`` +---------------------- + +Deker uses its own file locking mechanisms for different operations, one of which is for writing. +With ``write_lock_timeout`` you can modify an amount of seconds during which a parallel writing +process waits for the release of the locked file:: from deker import Client client = Client(uri, write_lock_timeout=120) -The default is ``60`` seconds. The units are immutable and only ``integers`` are accepted. +The default is ``60`` seconds. The units are immutable and only ``int`` is accepted. -write_lock_check_interval --------------------------- -While the parallel writing process waits for the lock release, it sleeps for some time and then checks -the state of the lock. You can adjust its sleeping time in seconds:: +``write_lock_check_interval`` +----------------------------- + +While the parallel writing process waits for the lock release, it sleeps for some time and then +checks the state of the lock. You can adjust its sleeping time in seconds:: from deker import Client client = Client(uri, write_lock_check_interval=5) -The default is ``1`` second. The units are immutable and only ``integers`` are accepted. +The default is ``1`` second. The units are immutable and only ``int`` is accepted. + -loglevel ---------- -All the Deker objects (including private ones) have their own loggers. They are bound by the common logging -level, which defaults to ``"ERROR"``. If you need, you may change it at ``Client's`` start:: +``loglevel`` +------------ + +All the Deker objects (including private ones) have their own loggers. 
They are bound by the common +logging level, which defaults to ``"ERROR"``. If you need, you may change it at ``Client`` init:: from deker import Client @@ -84,18 +89,16 @@ If you need to change it on the fly, you may use the following function:: from deker.log import set_logging_level - <...your code...> - set_logging_level("INFO") # now Deker logs starting from "INFO" level - <...your code...> -memory_limit --------------- -This parameter is used for the early runtime break in case of potential memory overflow. +``memory_limit`` +---------------- -Deker operates big amounts of data, and you may be unaware that your machine will probably run out of memory. -For example, NumPy shall raise ``_ArrayMemoryError`` if you do something like this:: +This parameter is used for the early run time break in case of potential memory overflow. + +Deker operates big amounts of data, and you may be unaware that your machine will probably run out +of memory. For example, NumPy shall raise ``_ArrayMemoryError`` if you do something like this:: >>> import numpy as np @@ -103,70 +106,79 @@ For example, NumPy shall raise ``_ArrayMemoryError`` if you do something like th # numpy.core._exceptions._ArrayMemoryError: Unable to allocate 74.5 GiB # for an array with shape (100000, 100000) and data type float64 -Insofar as Deker is lazy, you shall be warned about such problems beforehand. For that purpose, Deker checks the memory -limits when it is creating: +As Deker is lazy, you shall be warned about such problems beforehand. For that purpose, Deker +checks the memory limits when it is creating: - - a ``Collection`` - - a ``Subset`` or ``VSubset`` - - an ``xarray.DataArray`` from a ``Subset`` or a ``VSubset`` + * ``Collection`` + * ``Subset`` or ``VSubset`` + * ``xarray.DataArray`` from a ``Subset`` or a ``VSubset`` -By default Deker is **limited** to your **total** ``RAM`` size and your **total** ``swap`` size. For example, you have -16 Gb of RAM and 2 Gb of swap. 
Thus, Deker is limited to 18 Gb of memory by default. But usually a machine is already -using some parts of these memories for other processes. So your current available free memory is always lower than the -total one. +By default Deker is limited to your **total virtual memory size** (i.e. total amount of RAM plus +swap size). For example, you have 16 GB of RAM and 2 GB of swap. Thus, Deker is limited to 18 GB of +memory by default. But usually a machine is already using some parts of this memory for other +processes. So your current available free memory is always lower than the total one. -Deker compares its limits with your current available free memory (RAM + swap) and chooses the minimal one of them. -Than it compares the result with the requested values' shape and dtype. In case if your request requires too much -memory or you are trying to create a ``Collection`` with a schema, which may cause a memory overflow in future, -``DekerMemoryError`` will be immediately raised. +Deker compares its limits with your current available free memory (RAM + swap) and chooses the +minimal one of them. Then it compares the result with the requested shape size. In case your +request requires too much memory or you are trying to create a ``Collection`` with a schema, which +may cause a memory overflow in future, ``DekerMemoryError`` will be immediately raised. -You can lower the default value by passing a certain number of ``bytes`` or by passing a human representation of -kilobytes, megabytes or gigabytes, for example: ``"1024K"``, ``"512M"``, ``"8G"``:: +You can lower the default value by passing a certain number of ``bytes`` or by passing a human +readable representation of kilobytes, megabytes or gigabytes, for example: ``"1024K"``, ``"512M"``, +``"8G"``:: from deker import Client client = Client(uri, memory_limit="4G") # 4 gigabytes client = Client(uri, memory_limit=4096) # 4096 bytes -Only integers are acceptable for both of bytes and human representation.
Capitalization of units postfix is ignored: -``"1024k"``, ``"512m"``, ``"8g"`` will work. +Only integers are acceptable for both of bytes and human representation. Capitalization of units +suffix is ignored: ``"1024k"``, ``"512m"``, ``"8g"`` will work. .. note:: You definitely may want to use it in **Docker**. - If you set a memory limit to your container, you'd better limit Deker to the same value. Otherwise your container - may be killed because of memory overflow. + If you set a memory limit to your container, you'd better limit Deker to the same value. + Otherwise your container may be killed because of memory overflow. -HDF5 options -============== -.. _`HDF5 official documentation`: https://portal.hdfgroup.org/display/HDF5/HDF5 -.. attention:: If you are new to ``HDF5``, please, refer to the `HDF5 official documentation`_ +HDF5 Options +============ -Very briefly, ``HDF5`` is a data model, library, and file format for storing and managing data. It supports an unlimited -variety of datatypes, and is designed for flexible and efficient I/O and for high volume and complex data. This format -offers a big number of special tuning options. We will talk about ``chunks`` and data ``compression``. +.. attention:: + If you are new to ``HDF5``, please, refer to the `HDF5 official documentation`_ + +.. _`HDF5 official documentation`: https://portal.hdfgroup.org/display/HDF5/HDF5 -Deker ``deker-local-adapters`` plugin has its default implementation of working with this format. It depends on two -packages: `h5py`_ and `hdf5plugin`_ which provide a Python interface for HDF5 binaries and a pack of compression -filters. +Very briefly, ``HDF5`` is a data model, library, and file format for storing and managing data. It +supports an unlimited variety of data types, and is designed for flexible and efficient I/O and for +high volume and complex data. This format offers a big number of special tuning options. We will +talk about ``chunks`` and data ``compression``. 
+ +Deker ``deker-local-adapters`` plugin has its default implementation of working with this format. +It depends on two packages: `h5py`_ and `hdf5plugin`_ which provide a Python interface for HDF5 +binaries and a pack of compression filters. .. _h5py: https://docs.h5py.org/en/stable/ .. _hdf5plugin: http://www.silx.org/doc/hdf5plugin/latest/ -Deker applies chunks and compression options to all of the files within one collection. As long as you do not interact -directly with the files and low-level interfaces, Deker provides special types for these options usage. Your -settings are stored in the collection metadata. When you invoke a ``Collection``, they are recovered and ready to be -applied to your data. But they have to make a trip from the collection metadata to the final data, that's why we need -``HDF5Options`` and ``HDF5CompressionOpts`` objects. +Deker applies chunks and compression options to all of the files within one collection. As long as +you do not interact directly with the files and low-level interfaces, Deker provides special types +for these options usage. Your settings are stored in the collection metadata. When you invoke a +``Collection``, they are recovered and ready to be applied to your data. But they have to make a +trip from the collection metadata to the final data, that's why we need ``HDF5Options`` and +``HDF5CompressionOpts`` objects. -.. note:: Chunks and compression options are applied to your dataset within HDF5-file when the data is inserted or - updated. When reading, HDF5-file already knows how to manage its chunked and/or - compressed contents properly. +.. note:: + Chunks and compression options are applied to your dataset within HDF5 file when the data is + inserted or updated. When reading, HDF5 file already knows how to manage its chunked and/or + compressed contents properly.
First of all, let's prepare a collection schema once again:: from datetime import datetime, timedelta - from deker import ( # in order of appearance + + from deker import ( TimeDimensionSchema, DimensionSchema, Scale, @@ -199,6 +211,7 @@ First of all, let's prepare a collection schema once again:: labels=["temperature", "humidity", "pressure", "wind_speed"] ), ] + attributes = [ AttributeSchema(name="dt", dtype=datetime, primary=True), AttributeSchema(name="tm", dtype=int, primary=False), @@ -211,16 +224,17 @@ First of all, let's prepare a collection schema once again:: # fill_value is not passed - will be numpy.nan ) + Chunks --------- -Correct data chunking may increase your performance. It makes your data split in smaller equal pieces. -When you read data from a chunk, HDF5-file opens and caches it. The next reading of the same pattern -will be much faster as it will be captured not from the storage, but from the cache. +------ -A HDF5-file may have ``no chunks`` options or be chunked either ``manually`` or ``automatically``. +Correct data chunking may increase your performance. It makes your data split in smaller equal +pieces. When you read data from a chunk, HDF5-file opens and caches it. The next reading of the +same pattern will be much faster as it will be captured not from the storage, but from the cache. -.. admonition:: Hint +A HDF5-file may have *no chunks* options or be chunked either *manually* or *automatically*. +.. hint:: Study `HDF5 chunking manual`_ to understand **chunks** better. .. _HDF5 chunking manual: https://portal.hdfgroup.org/display/HDF5/Chunking+in+HDF5 @@ -238,8 +252,8 @@ Chunks options are set to ``None`` by default. When you create an ``Array``, its file is one big chunk. 
-If you set chunks to ``True``, HDF5-file will automatically determine a chunk size with its own algorythm, basing -on the shape of your ``Array``:: +If you set chunks to ``True``, HDF5-file will automatically determine a chunk size with its own +algorithm, basing on the shape of your ``Array``:: from deker import Client, HDF5Options @@ -252,8 +266,9 @@ on the shape of your ``Array``:: You will never know the final chunk size, but be sure that your data is chunked now. -If you need to adjust it, you may set it manually. It shall be a tuple of integers. The size of the tuple shall -be equal to your ``Array`` shape. Its values shall divide your dimensions without remainders:: +If you need to adjust it, you may set it manually. It shall be a tuple of integers. The size of the +tuple shall be equal to your ``Array`` shape. Its values shall divide your dimensions without +remainders:: from deker import Client, HDF5Options @@ -269,47 +284,51 @@ be equal to your ``Array`` shape. Its values shall divide your dimensions withou HDF5Options(chunks=chunks) ) -Here we chunked our data into pieces, each of which will contain 1 hour, 181 ``y`` points *(because 181 -is a natural number and is divisible only by itself or 1)*, 36 ``x`` points and the full scope of weather layers. -If you need to read some data, which is kept in one or several chunks, the file will not affect other chunks, -but it will open and cache the correspondent ones. - -.. admonition:: Hint +Here we chunked our data into pieces, each of which will contain 1 hour, 181 ``y`` points (because +181 is a natural number and is divisible only by itself or 1), 36 ``x`` points and the full scope +of weather layers. If you need to read some data, which is kept in one or several chunks, the file +will not affect other chunks, but it will open and cache the correspondent ones. +.. hint:: The best way to decide on chunk size is your the most frequently used reading pattern. 
+ Compression --------------- -To prevent a lack of the disc space for your data, you can compress it with different filters, supported by HDF5 -and provided by ``h5py`` and ``hdf5plugin`` packages. +----------- + +To prevent a lack of the disc space for your data, you can compress it with different filters, +supported by HDF5 and provided by ``h5py`` and ``hdf5plugin`` packages. -There are several default filters, set in ``h5py`` and a pack of the most popular filters, brought by ``hdf5plugin``. +There are several default filters, set in ``h5py`` and a pack of the most popular filters, brought +by ``hdf5plugin``. Default filters: - - ``GZip`` - - ``Lzf`` - - ``SZip`` + + * ``GZip`` + * ``Lzf`` + * ``SZip`` Custom filters, brought by ``hdf5plugin``: - - ``Bitshuffle`` - - ``Blosc`` - - ``BZip2`` - - ``FciDecomp`` - - ``LZ4`` - - ``SZ`` - - ``SZ3`` - - ``Zfp`` - - ``Zstd`` - -.. attention:: The data is compressed chunk by chunk. If you use compression without indicating a chunk size, it will - be automatically set to `True` and calculated by the inner HDF5 algorythm. + + * ``Bitshuffle`` + * ``Blosc`` + * ``BZip2`` + * ``FciDecomp`` + * ``LZ4`` + * ``SZ`` + * ``SZ3`` + * ``Zfp`` + * ``Zstd`` + +.. attention:: + The data is compressed chunk by chunk. If you use compression without indicating a chunk size, + it will be automatically set to `True` and calculated by the inner HDF5 algorithm.
The default filters shall be used as follows:: from deker import Client, HDF5Options, HDF5CompressionOpts with Client("file:///tmp/deker") as client: - compression=HDF5CompressionOpts(compression="gzip", compression_opts=9), options = HDF5Options(compression_opts=compression) client.create_collection( @@ -320,11 +339,7 @@ The default filters shall be used as follows:: The custom filters shall be instantiated and passed to ``HDF5CompressionOpts`` as a mapping:: - import hdf5plugin - from deker import Client, HDF5Options, HDF5CompressionOpts - with Client("file:///tmp/deker") as client: - compression=HDF5CompressionOpts(**hdf5plugin.Zstd(6)), options = HDF5Options(chunks=(1, 181, 36, 4), compression_opts=compression) client.create_collection( @@ -333,9 +348,9 @@ The custom filters shall be instantiated and passed to ``HDF5CompressionOpts`` a collection_options=options ) -.. admonition:: Hint - - Dive into **compression options** at `h5py filter pipeline`_, `hdf5plugin docs`_ and `HDF5 compression manual`_. +.. hint:: + Dive into **compression options** at `h5py filter pipeline`_, `hdf5plugin docs`_ and + `HDF5 compression manual`_. .. _h5py filter pipeline: https://docs.h5py.org/en/stable/high/dataset.html#filter-pipeline .. _hdf5plugin docs: http://www.silx.org/doc/hdf5plugin/latest/ diff --git a/docs/deker/first_steps.rst b/docs/deker/first_steps.rst deleted file mode 100755 index e957882..0000000 --- a/docs/deker/first_steps.rst +++ /dev/null @@ -1,597 +0,0 @@ -.. currentmodule:: deker - -************* -First steps -************* - -Introduction -============== - -Insofar as Deker is a database, it cannot exist without schemas. - -Let's make some high-level parallels with SQL databases. ``Collection`` is an SQL table. It has rows - ``Arrays`` -or ``VArrays``. Table columns, their contents and quantity are set up by the table schema which is created as follows :: - - CREATE TABLE table_name ( - column1 datatype, - column2 datatype, - column3 datatype, - ... 
- ); - -Deker ``Collection`` can be a collection either of ``Arrays`` or of ``VArrays``. It's impossible to have the both types -in one ``Collection`` at the same time. - -That is why Deker provides two types of schemas: - -- ``ArraySchema`` -- ``VArraySchema`` - -And since Deker is a database for numeric data, which is represented as multidimensional arrays, the first thing -you shall do is to define a list of dimensions for your ``Arrays`` or ``VArrays``. For this purpose Deker provides -other two objects: - -- ``DimensionSchema`` -- ``TimeDimensionSchema`` - - -.. attention:: All schemas are immutable. Once you created a ``Collection`` you **cannot modify** its schema. - -Bringing to order -=================== - -| Obviously, all array's dimensions are placed in a strict order, and this order really matters. -| Let's return to our array image: - -.. image:: images/array_0_axes.png - :scale: 30% - -.. image:: images/legend.png - :scale: 28% - :align: right - -As usual, every array has just one *entrance point*. You cannot get inner data without passing through outer layers, -but there is only one *gate* for each layer. - -When you decide on the dimensions` positioning, you shall understand and keep in mind your usual querying patterns. -Correct positioning will make the querying faster, a wrong one will slow it. - -Assume, that our *gates* are always at the front face, as shown by the arrows, and the dimensions are arranged as -``['x', 'y', 'z']``: - -.. image:: images/array_0_arrows.png - :scale: 30% - -.. image:: images/legend.png - :scale: 28% - :align: right - -It means that when we query our data, in the first place we capture ``x`` dimension, then ``y`` dimension and only -after that we can get to our weather data. As long as weather layers are under the geo-grid, such a sequence perfectly -fits for querying a pack of weather data for some geo-point(-s). - -But what if we place these dimensions in a different manner? - -.. 
image:: images/array_1_arrows.png - :scale: 30% - -.. image:: images/array_2_arrows.png - :scale: 30% - :align: right - -| Now each geo-point contains only one sort of information. Moreover, you can place these dimensions in such a way, - when weather layers will become the first dimension, for example like -| ``['z', 'y', 'x']``. - -It entails that each its cell contains all the geo-grid, and the queries become much slower. - -So, before positioning the dimensions, you'd better decide how you are going to query your data and what order is the -most suitable for such queries. - - -Dimensions' schemas -==================== -Each dimension shall have its ``size`` - a precise non-zero positive quantity of its cells with a constant -scalar step ``1``. - -We believe that every piece of data shall be described, otherwise it is just a number or a meaningless symbol. -Each dimension, regardless of its type, shall have at least a unique ``name``. - -.. note:: The final sequence of your dimensions' schemas represents the exact shape of the future ``Array`` or - ``VArray``. - -DimensionSchema ------------------ -:: - - from deker import DimensionSchema - - dimensions = [ - DimensionSchema(name="height", size=255), - DimensionSchema(name="width", size=512), - ] - -*Even if you need an array with only one dimension, it shall still be defined as a list (or a tuple) of dimension -schemas*:: - - dimension = ( - DimensionSchema(name="total_daily_income", size=366), - ) - -.. note:: - ``DimensionSchema`` is kept in the ``Collection`` metadata and converted into ``Dimension`` object for - each ``Array`` or ``VArray`` of such ``Collection``. - -All right, now we have a list of two dimensions, named *"height"* and *"width"*. They have some size, -but what are the units? Is there any regular scale for their values? Definitely, there should be. 
- -Scale -~~~~~~ -If a dimension has a real regular scale, we may indicate it:: - - from deker import DimensionSchema, Scale - - dimensions = [ - DimensionSchema( - name="height", - size=255, - scale=Scale(start_value=0.0, step=0.01, name="meters") - ), - DimensionSchema( - name="width", - size=512, - scale={"start_value": 1.0, "step": 0.5} - ), - ] - - -As you can see, regular scale can be defined either with Python ``dict`` or with Deker ``Scale`` named tuple. -The keyword ``name`` is optional. Scale values shall be always defined as ``floats``. - -The parameters ``step`` and ``start_value`` may be negative as well. For example, ``era5`` weather model has a -geo-grid shaped ``(ys=721, xs=1440)`` with step ``0.25`` degrees per cell. The zero-point of the ``map`` is north-west -or left-upper corner. In other words ``era5`` grid point ``(0, 0)`` is set to coordinates ``(lat=90.0, lon=-180.0)``. - -Here is an example of how this grid can be bound to real geographical coordinates in Deker:: - - dimensions = [ - DimensionSchema( - name="y", - size=721, - scale=Scale(start_value=90.0, step=-0.25, name="lat") - ), - DimensionSchema( - name="x", - size=1440, - scale={"start_value": -180.0, "step": 0.25, "name": "lon"} - ), - ] - -Now you can be sure that ``dimensions[0][0], dimensions[1][0]`` are bound to ``lat=90.0, lon=-180.0`` and ``dimensions[0][-1], -dimensions[1][-1]`` are bound to ``lat=-90.0, lon=179.75`` and ``lat=0.0, lon=0.0`` can be found at ``dimensions[0][360], -dimensions[1][720]``. - -Labels -~~~~~~~ -If a dimension has no real regular scale, but there is still a certain logic in its values order, we may use ``labels`` -to describe it:: - - dimensions = [ - DimensionSchema( - name="weather_layers", - size=4, - labels=["temperature", "pressure", "wind_speed", "humidity"], - ), - ] - -You can provide not only a list of ``strings``, but a list (or a tuple) of ``floats`` as well. 
- -Both ``labels`` and ``scale`` provide a mapping of some reasonable information onto your data cells. If ``labels`` -is always a full sequence kept in metadata and in memory, ``scale`` is calculated dynamically. - -As for the example with ``labels``, we can definitely state that calling index ``[0]`` will provide temperature data, -and index ``[2]`` will give us wind speed and nothing else. The same works for scaled dimensions. For example, height: -index ``[1]`` will keep data relative to height ``0.01 meters`` and index ``[-1]`` - to height ``2.54 meters`` - -If you set some ``scale`` or ``labels`` for your dimensions, it will allow you to slice these dimensions not only with -``integers``, but also with ``floats`` and ``strings`` *(we will dive into it in the section, relative to fancy -slicing)*. - -TimeDimensionSchema ---------------------- -If you need to describe some time series you shall use ``TimeDimensionSchema``. - -.. note:: - ``TimeDimensionSchema`` is kept in the ``Collection`` metadata and converted into ``TimeDimension`` - object for each ``Array`` or ``VArray`` of such ``Collection``. - -``TimeDimensionSchema`` is an object, which is completely described by default, so it needs no additional description. -Thus, it allows you to slice ``TimeDimensions`` with ``datetime`` objects or ``floats`` (timestamps) or even -``strings`` (datetime native standard iso-formatted strings). - -Like ``DimensionSchema`` it has ``name`` and ``size``, but also it has its special arguments. - -Start value -~~~~~~~~~~~~ -:: - - from datetime import datetime, timedelta, timezone - from deker import TimeDimensionSchema - - dimensions = [ - TimeDimensionSchema( - name="dt", - size=8760, - start_value=datetime(2023, 1, 1, tzinfo=timezone.utc), - step=timedelta(hours=1) - ), - ] - -We have just defined a schema for a time dimension, which covers all the ``hours`` in the year 2023 starting from -2023-01-01 00:00 to 2023-12-31 23:00 (inclusively). 
- -Direct setting of the ``start_value`` parameter will make this date and time a **common start point** for all the -``Arrays`` or ``VArrays``. Sometimes it makes sense, but usually we want to distinguish our data by individual time. -In this case, it should be defined as follows:: - - dimensions = [ - TimeDimensionSchema( - name="dt", - size=8760, - start_value="$some_attribute_name", - step=timedelta(hours=1) - ), - ] - -A bit later you will get acquainted with ``AttributesSchema``, but for now it is important to note, that providing -``start_value`` schema parameter with an **attribute's name** starting with ``$`` will let you set an individual -start point for each new ``Array`` or ``VArray`` at its creation. - -.. attention:: - For ``start_value`` you can pass a datetime value with any timezone (e.g. your local timezone), but you should - remember that Deker converts and keeps it in the UTC timezone. - - Before querying some data from ``TimeDimension``, you should convert your local time to UTC to be sure that you get - a pack of correct data. You can do it with ``get_utc()`` function from ``deker_tools.time`` module. - -Step -~~~~ -Unlike ordinary dimensions ``TimeDimensionSchema`` shall be provided with ``step`` value, which shall be described as -a ``datetime.timedelta`` object. You may certainly set any scale for it, starting with microseconds, ending with -weeks, it will become a mapping for the dimension scalar indexes onto a certain datetime, which will let you slice -it in a fancy way. - -.. admonition:: Hint - - **Why are integers inapplicable to timestamps and to scale and labels values?** - - | Integers are reserved for native Python indexing. - - | If your **timestamp** is an integer - convert it to float. - | If your **scale** ``start_value`` and ``step`` are integers - define them as corresponding floats. - | If your **labels** are integers for some reason - convert them to strings or floats. 
- - -Attributes' schema -=================== -All databases provide some additional obligatory and/or optional information concerning data. For example, in SQL -there are *primary keys* which indicate that data cannot be inserted without passing them. - -For this purpose Deker provides **primary and custom attributes** which shall be defined as a list (or a tuple) of -``AttributeSchema``:: - - from deker import AttributeSchema - - attributes = [ - AttributeSchema( - name="some_primary_attribute", - dtype=int, - primary=True - ), - AttributeSchema( - name="some_custom_attribute", - dtype=str, - primary=False - ), - ] - -Here we defined a pack of attributes, which will be applied to each ``Array`` or ``VArray`` in our ``Collection``. -Both of them have a ``name`` and a ``dtype`` of the values you are going to pass later. - -Regardless their ``primary`` flag value, their names **must be unique**. Valid ``dtypes`` are the following: - -- ``int`` -- ``float`` -- ``complex`` -- ``str`` -- ``tuple`` -- ``datetime.datetime`` - -The last point is that one of the attributes is ``primary`` and another is ``custom``. What does it mean? - -Primary attributes -------------------- -.. note:: Attribute for ``TimeDimension`` ``start_value`` indication shall be dtyped ``datetime.datetime`` - and may be **primary**. - -.. attention:: It is highly recommended to define at least one **primary** attribute in every schema. - -Primary attributes are a strictly ordered sequence. They are used for ``Arrays`` or ``VArrays`` filtering. -When Deker is building its file system, it creates symlinks for main data files using primary attributes in the symlink -path. If you need to get a certain ``Array`` or ``VArray`` from a ``Collection``, you have two options how to do it: - -- to pass its ``id`` -- or to indicate **all** its primary attributes' values. - -.. attention:: Values for all the primary attributes **must be passed** at every ``Array`` or ``VArray`` creation. 
- -Custom attributes -------------------- -.. note:: Attribute for ``TimeDimension`` ``start_value`` indication shall be dtyped ``datetime.datetime`` - and may be **custom** as well. - -No filtering by custom attributes is available at the moment. They just provide some optional information about your -data. You can put there anything, that is not very necessary, but may be helpful for the data managing. - -Custom attributes are the only mutable objects of the schema. It does not mean that you can change the schema, -add new attributes or remove old ones. It means that you can change their values (with respect to the -specified ``dtype``) if needed. You can also set their values to ``None``, except the attributes dtyped -``datetime.datetime``. - -.. attention:: - | Values for custom attributes **are optional for passing** at every ``Array`` or ``VArray`` creation. - | If nothing is passed for some or all of them, they are set to ``None``. - - This rule concerns all the custom attributes **except custom attributes dtyped** ``datetime.datetime``. - - Values for custom attributes dtyped ``datetime.datetime`` **must be passed** at every ``Array`` or ``VArray`` - creation and **cannot be set to** ``None``. - -.. note:: Defining ``AttributeSchemas`` is optional: you **may not set** any primary or custom attribute - (**except** attribute for ``TimeDimension.start_value`` indication). - -Arrays' schemas -================= -Since you are now well informed about the dimensions and attributes, we are ready to move to the arrays' schemas. -Both ``ArraySchema`` and ``VArraySchema`` must be provided with a list of dimensions' schemas and ``dtype``. -You may optionally pass a list of attributes' schemas and ``fill_value`` to both of them. - -Dtype --------- -Deker has a strong data typing. All the values of all the ``Arrays`` or ``VArray`` in one ``Collection`` shall be -of the same data type. 
Deker accepts numeric data of the following NumPy dtypes: - -- ``numpy.int8`` -- ``numpy.int16`` -- ``numpy.int32`` -- ``numpy.int64`` -- ``numpy.float16`` -- ``numpy.float64`` -- ``numpy.float128`` -- ``numpy.longfloat`` -- ``numpy.double`` -- ``numpy.longdouble`` -- ``numpy.complex64`` -- ``numpy.complex128`` -- ``numpy.complex256`` -- ``numpy.longcomplex`` -- ``numpy.longlong`` - -Python ``int``, ``float`` and ``complex`` are also acceptable. They are correspondingly converted to ``numpy.int32``, -``numpy.float64`` and ``numpy.complex128``. - -Fill value ------------- -Sometimes it happens that we have no values for some cells or we want to clear our data out in full or in some -parts. Unfortunately, NumPy does not allow you to set python ``None`` to such cells. That's why we need something that -will fill them in. - -Rules are the following: - -1. ``fill_value`` **shall not be significant** for your data -2. ``fill_value`` **is optional** - you may not provide it. In this case Deker will choose it automatically basing - on the provided ``dtype``. For ``integer``-dtypes it will be the lowest value for the correspondent dtype bit - capacity. For example, it will be ``-128`` for ``numpy.int8``. For ``float``-dtypes (including ``complex``) it will - be ``numpy.nan`` as this type is also ``floating``. -3. if you'd like to set it manually - ``fill_value`` shall be of the same dtype, that was passed to the ``dtype`` - parameter. If all the values of the correspondent ``dtype`` are significant for you, you shall choose a dtype - of a greater bit capacity. For example, if all the values in the range ``[-128; 128]`` are valid for your dataset, - you'd better choose ``numpy.int16`` instead of ``numpy.int8`` and set ``-129`` as ``fill_value`` or let Deker - to set it automatically. The other workaround is to choose any floating dtype, e.g. ``numpy.float16``, and have - ``numpy.nan`` as a ``fill_value``. 
- -Now, let's create once again some simple dimensions and attributes for both types of schemas:: - - from deker import DimensionSchema, AttributeSchema - - dimensions = [ - DimensionSchema(name="y", size=100), - DimensionSchema(name="x", size=200), - ] - attributes = [ - AttributeSchema(name="attr", dtype=str, primary=False) - ] - -Array schema ----------------- -:: - - from deker import ArraySchema - - array_schema = ArraySchema( - dimensions=dimensions, - attributes=attributes, - dtype=float, # will be converted and saved as numpy.float64 - # fill_value is not passed - will be numpy.nan - ) - -Now we have a schema for ``Collection`` of ``Arrays``. - - -VArray schema --------------- -:: - - from deker import VArraySchema - - varray_schema = VArraySchema( - dimensions=dimensions, - dtype=np.int64, - fill_value=-99999, - vgrid=(50, 20) - # attributes are not passed as they are optional - ) - -vgrid -~~~~~~ -Perhaps it is one of the most obscure issues. ``VArray`` shall be split into files, but it cannot decide itself how -it shall be done. It's up to you, how you are going to split your data. - -``Vgrid`` parameter shall be defined as a tuple of integers which quantity shall be exactly similar to the quantity of -the dimensions. Its values shall divide ``VArray`` shape without remainders. - -Our schema has two dimensions with sizes ``100`` and ``200`` correspondingly, what tells us that the ``VArray`` -shape will be ``(100, 200)``. And we set ``vgrid`` as ``(50, 20)``. What shall happen? No magic, just a simple math:: - - (100, 200) / (50, 20) = (2.0, 10.0) - -``(2, 10)`` - that will be the shape of all the ``Arrays``, produced by the ``VArray``. 
- -If we do not want to divide any dimension into pieces and want to keep it in full size in all the ``Arrays``, we shall -pass ``1`` in ``vgrid`` for that dimension:: - - (100, 200) / (1, 20) = (100.0, 10.0) - -| Thus, the first dimension will retain its initial size for all the arrays, and their shape will be -| ``(100, 10)``. - -Ok! Now we are finally ready to create our first database and we need ``Client``. - -Creating a database -==================== -``Client`` is responsible for creating connections and its internal context. - -As far as Deker is a file-based database, you need to provide some path to the storage, where your collections will -be kept. - -.. note:: - | Deker was developed and tested on Unix platforms and perfectly works with Unix file systems. - | We would appreciate any feedback from MS Windows users. - -URI ----- -There is a universal way to provide paths and connection options: an URI. - -The scheme of URI-string for embedded Deker databases, stored on your local drive, is ``file://``. -It shall be followed by a path to the directory where the storage will be located. If this directory (or even full path -to it) does not exist, Deker will create it at ``Client`s`` opening. - -.. note:: Relative paths are also applicable, but it is recommended to use absolute paths. - - *Explicit is better than implicit.* *Zen of Python:2* - -In this documentation we will use a reference to a temporary directory ``/tmp/deker``:: - - uri = "file:///tmp/deker" - - -Client -------- -Now open the Client for interacting with Deker:: - - from deker import Client - - client = Client(uri) - -You can use it as a context manager as well:: - - from deker import Client - - with Client(uri) as client: - <...some client-job here...> - -``Client`` opens its connections and inner context at its instantiation. If you use context manager, it will close them -automatically on exit. 
Otherwise the connections and context will remain opened until you call ``client.close()`` -directly. - -If for some reason you need to open and close ``Client`` in different parts of your code, you may define it only once -and reuse it by calling a context manager:: - - from deker import Client - - client = Client(uri) - <...some client-job here...> - client.close() - - - - with client: - - - - - with client: - - -Putting together ----------------- -Great! Now let's assemble everything from the above scope and create an ``Array`` collection of some world-wide -weather data. - -:: - - from datetime import datetime, timedelta - from deker import ( # in order of appearance - TimeDimensionSchema, - DimensionSchema, - Scale, - AttributeSchema, - ArraySchema, - Client, - Collection - ) - - dimensions = [ - TimeDimensionSchema( - name="day_hours", - size=24, - start_value="$dt", - step=timedelta(hours=1) - ), - DimensionSchema( - name="y", - size=181, - scale=Scale(start_value=90.0, step=-1.0, name="lat") - ), - DimensionSchema( - name="x", - size=360, - scale=Scale(start_value=-180.0, step=1.0, name="lon") - ), - DimensionSchema( - name="weather", - size=4, - labels=["temperature", "humidity", "pressure", "wind_speed"] - ), - ] - attributes = [ - AttributeSchema(name="dt", dtype=datetime, primary=True), - AttributeSchema(name="tm", dtype=int, primary=False), - ] - - array_schema = ArraySchema( - dimensions=dimensions, - attributes=attributes, - dtype=float, # will be converted and saved as numpy.float64 - # fill_value is not passed - will be numpy.nan - ) - - with Client(uri="file:///tmp/deker") as client: - collection: Collection = client.create_collection("weather", array_schema) - print(collection) # "weather" - -**We did it!** - -Now there is a new path ``/tmp/deker/collections/weather`` on your local drive where Deker will store the data relative -to the ``Collection`` named ``weather``. 
Each ``Array`` will contain a pack of daily 24-hours weather data for each -entire latitude and longitude degree: ``temperature``, ``humidity``, ``pressure`` and ``wind_speed``. diff --git a/docs/deker/installation.rst b/docs/deker/installation.rst index 613eac9..dd1634c 100755 --- a/docs/deker/installation.rst +++ b/docs/deker/installation.rst @@ -1,202 +1,87 @@ -.. currentmodule:: deker - -************* +************ Installation -************* +************ -Deker installation -==================== -Deker was developed and tested on Linux (``Ubuntu 20.04``, ``Centos 8.7``) and MacOS (``12.6.3``, ``13.14.1`` ), -so these platforms are perfectly suitable for using Deker. +Deker +===== -.. note:: Minimal python version for Deker is ``3.9``. +Deker was developed and tested on x86_64 Linux and both x86_64 and Apple silicon macOS, and known +to be running in production environments on x86_64 Linux servers. -.. attention:: If you are a user of M1+ chip, please, refer to the `ARM architecture family`_ section first. +.. note:: + Minimal Python version for Deker is ``3.9``. -Required dependencies ---------------------- -Deker dependencies are external: +.. attention:: + Deker uses NumPy, and some NumPy types are unsupported on current NumPy arm64 version. So if you + want to use Deker library on Apple silicon (M series CPU), you have to install x86_64 version of + Python using Rosetta_ x86_64 to arm64 dynamic binary translator. -- numpy>=1.18 -- attrs>=23.1.0 -- tqdm>=4.64.1 -- psutil>=5.9.5 + You may use the following guide_ to install x86_64 version of Python and then switch to that + version in your Deker project using ``pyenv`` and install Deker package as usual. -and internal: +.. _Rosetta: https://developer.apple.com/documentation/apple-silicon/about-the-rosetta-translation-environment +.. 
_guide: https://sixty-north.com/blog/pyenv-apple-silicon.html -- deker-tools -- deker-local-adapters - * h5py>=3.8.0 - * hdf5plugin>=4.0.1 -Deker comes with the previously mentioned dependencies included:: +Dependencies +------------ - pip install deker +Deker depends on the following third-party packages: -or:: + * ``numpy`` >= 1.18 + * ``attrs`` >= 23.1.0 + * ``tqdm`` >= 4.64.1 + * ``psutil`` >= 5.9.5 + * ``h5py`` >= 3.8.0 + * ``hdf5plugin`` >= 4.0.1 - python -m pip install deker +Also please note that for flexibility a few internal Deker components are published as separate +packages: -Extra dependencies ------------------- -- xarray>=2023.5.0 + * ``deker-local-adapters`` + * ``deker-tools`` -.. _Xarray: https://docs.xarray.dev/en/stable/getting-started-guide/installing.html -.. _pandas: https://pandas.pydata.org/getting_started.html - -If you wish to convert your data into Xarray_ or pandas_ *(or even some other)* objects:: - - pip install deker[xarray] - -or :: - - python -m pip install deker[xarray] - -Or you can install them separately:: +To install Deker with all the previously mentioned dependencies, run:: pip install deker - pip install xarray - -or :: - - python -m pip install deker - python -m pip install xarray - -ARM architecture family ----------------------------- -| Deker uses NumPy, and some NumPy types are unsupported on current NumPy ARM version. -| If you want to run Deker library on your Mac with M1+ chip inside, you need to install ``python x86_64`` with Rosetta_. -.. _Rosetta: https://support.apple.com/en-us/HT211861 -Use this guide_ or follow next steps: +Optional Packages +----------------- -.. _guide: https://towardsdatascience.com/how-to-use-manage-multiple-python-versions-on-an-apple-silicon-m1-mac-d69ee6ed0250 +Deker also supports output of its data as pandas_ or Xarray_ via the following package: -1. 
Install Rosetta (ARM -> x86_64 translator):: + * ``xarray`` >= 2023.5.0 - softwareupdate --install-rosetta +To install it with ``xarray`` optional dependency:: -2. Create a Rosetta terminal: - - | 2.1. duplicate your terminal ``apps -> utilities -> right click -> duplicate`` or ``install new`` - | 2.2. click ``Get info`` on new terminal and set ``Open using Rosetta`` - -3. Install homebrew:: - - /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" - -4. Add an alias to your ``zsh`` config file:: - - alias rbrew="arch -x86_64 /usr/local/bin/brew" - -5. Install python:: + pip install deker[xarray] - rbrew install python@3.10 +.. _Xarray: https://docs.xarray.dev/en/stable/getting-started-guide/installing.html +.. _pandas: https://pandas.pydata.org/getting_started.html -**Hooray! Now you can install Deker with pip!** -Interactive shell -=================== -``Deker-shell`` is a MVP built on ``ptpython`` which provides a minimalistic interactive shell interface, -where you can manage your Deker database in real time. Requires ``deker`` package to be installed alongside manually. +Interactive Shell +================= -It comes with **code autocompletion**, **syntax highlighting** and session **actions history**. +``deker-shell`` is an interactive environment that enables you to manage and access Deker storage +in a convenient way. It requires ``deker`` package to be installed manually before use as described +above. 
-Installation --------------- -Deker-shell is not included as an out-of-box battery for Deker, so it should be installed manually:: +To install interactive shell package:: pip install deker deker-shell -or :: - - python -m pip install deker deker-shell - -Usage --------------- -Once installed, open your terminal and make :: - deker file:// +Deker Tools +=========== -You will be brought to the running Python REPL with: - - imported NumPy as ``np``, ``datetime`` library and Deker public classes - - predefined variables ``client`` and ``collections`` - - a running asyncio loop; thus, you can use ``async/await`` right in it +``deker-tools`` is an out-of-box battery which provides several useful tools and utilities to work +with Deker data. You may find this package useful in projects, even if they are not related to +Deker. -Deker tools -================ -``Deker-tools`` is an out-of-box battery and provides several tools and utilities. You may find this package useful -in projects, even those not related to Deker. - -Installation --------------- -:: +To install Deker tools package:: pip install deker-tools -or :: - - python -m pip install deker-tools - - -Usage --------------- -You will get a collection of utility functions and classes designed to assist in common data processing tasks. -It consists of modules that handle data conversion, path validation, and slice manipulation. 
- -data -+++++++++ - -This module provides ``convert_size_to_human`` method for converting bytes size into human readable representation:: - - >>> convert_size_to_human(1052810) - "1.0 MB" - -path -+++++++++ -This module provides functions to validate and handle filesystem ``paths``:: - - is_empty(path) - is_path_valid(path) - -slices -+++++++++ -Calculate ``shape`` of a subset from the index expression:: - - >>> shape = (361, 720, 4) - >>> index_exp = (slice(None, None, None), slice(None, None, None), 0) - >>> create_shape_from_slice(shape, index_exp) - (361, 720) - -Convert ``slice`` into a sequence and get its length:: - - >>> match_slice_size(10, slice(10)) - (0, 10, 1) - -Serialize ``slices`` to ``string`` and vice versa with ``slice_converter``:: - - >>> slice_converter[5] - '[5]' - - >>> slice_converter[datetime.datetime(2023,1,1):datetime.datetime(2023,2,1), 0.1:0.9:0.05] - '[`2023-01-01T00:00:00`:`2023-02-01T00:00:00`, 0.1:0.9:0.05]' - -time -+++++++++ - -This module provides ``get_utc`` function which returns timezone with UTC or current time by default:: - - >>> get_utc() - 2023-07-26 15:42:05.539317+00:00 - - >>> get_utc(datetime.now()) - 2023-07-26 15:42:05.539317+00:00 - -The contents of this package may be changed anytime. For details refer to the `deker-tools API`_ - -.. _deker-tools API: api/deker_tools/modules.html - - -.. note:: Please, don't hesitate to inform us about any installation or usage issues. diff --git a/docs/deker/shell.rst b/docs/deker/shell.rst index 1e15890..60c4513 100644 --- a/docs/deker/shell.rst +++ b/docs/deker/shell.rst @@ -1,86 +1,94 @@ -******************* -Interactive shell -******************* +***************** +Interactive Shell +***************** + +Interactive shell is a convenient Python REPL interface that allows you to manage, query and modify +data in your Deker storage. + +.. note:: Deker shell is based on amazing ptpython_ - a better Python REPL + +.. 
_ptpython: https://github.com/prompt-toolkit/ptpython + +Features +======== + +* Autocompletion +* Syntax highlighting +* ``client`` and ``collections`` variables initialized at start +* Shortcut ``use`` function to change current ``collection`` +* Imported at start: ``numpy`` as ``np``, ``datetime`` and all ``deker`` public classes +* Running ``asyncio`` loop (thus, enabling you to use ``async`` and ``await``) +* All the ``ptpython`` features -Deker-shell provides Python REPL interface for Deker, offering features like autocompletion, preset variables, -and enhanced REPL functionality through Ptpython. Start -========= -You need Deker and python>=3.9 installed. +===== -.. code-block:: bash +Once installed, open your terminal and run the shell providing path to Deker storage via command +line parameter (in this case it would be ``/tmp/deker-data``):: - pip install deker deker-shell - deker --version # 1.0.0 - deker file:///tmp/deker + deker file:///tmp/deker-data -Features -========= -- autocompletion -- syntax highlighting -- preset ``client`` and ``collections`` variable -- default ``collection`` variable with 'use' method -- Deker and Ptpython features Examples -========= -Using global collection variable +======== -.. image:: images/shell_collection.png +Using global collection variable: +.. image:: images/shell_collection.png -Creating a new collection +Creating a new collection: .. image:: images/shell_highlight.png - -Press ``F2`` for menu +REPL menu (called with ``F2``): .. 
image:: images/shell_menu.png :scale: 45% + Interface ========= -Imported Deker classes ------------------------ -management -++++++++++++ -- :class:`Client ` -- :class:`Collection ` -- :class:`Array ` -- :class:`VArray ` -- :class:`Subset ` -- :class:`VSubset ` - -schemas -++++++++++++ -- :class:`DimensionSchema ` -- :class:`TimeDimensionSchema ` -- :class:`ArraySchema ` -- :class:`VArraySchema ` -- :class:`AttributeSchema ` - -options -++++++++++++ -- :class:`HDF5Options ` -- :class:`HDF5CompressionOpts ` -- :class:`Scale ` - -Preset variables ------------------- -- ``client``: Client (registry of collections) instance, connected to the uri-database -- ``collections``: list of Client collections names -- ``collection``: global default collection variable, set by use("coll_name") method; -- ``np``: numpy library -- ``datetime``: datetime library - -Methods +Imported Deker Classes +---------------------- + +Basic storage access and management classes: + +* :class:`Client ` +* :class:`Collection ` +* :class:`Array ` +* :class:`VArray ` +* :class:`Subset ` +* :class:`VSubset ` + +Collection schema related: + +* :class:`DimensionSchema ` +* :class:`TimeDimensionSchema ` +* :class:`ArraySchema ` +* :class:`VArraySchema ` +* :class:`AttributeSchema ` +* :class:`Scale ` + +Physical storage (HDF5) level options: + +* :class:`HDF5Options ` +* :class:`HDF5CompressionOpts + ` + +Preset Variables +---------------- + +* ``client``: Client (registry of collections) instance, connected to the uri-database +* ``collections``: list of Client collections names +* ``collection``: global default collection variable, set by use("coll_name") method; +* ``np``: numpy library +* ``datetime``: datetime library + +Functions --------- -- ``use("collection_name")``: gets collection from client and saves it to ``collection`` variable -- ``get_global_coll_variable()``: returns ``collection`` global variable -.. 
admonition:: Special thanks to +* ``use("collection_name")``: gets collection from client and saves it to ``collection`` variable +* ``get_global_coll_variable()``: returns ``collection`` global variable - `Ptpython <https://github.com/prompt-toolkit/ptpython>`_ - advanced Python REPL diff --git a/docs/deker/tools.rst b/docs/deker/tools.rst new file mode 100644 index 0000000..7730bfd --- /dev/null +++ b/docs/deker/tools.rst @@ -0,0 +1,71 @@ +*********** +Deker Tools +*********** + +Deker Tools is a collection of utility functions and classes designed to assist in common data +processing tasks. It consists of modules that handle data conversion, path validation, and slice +manipulation. + +.. attention:: This package is not considered to be part of Deker public API and could be significantly + changed in future versions. + +For details please refer to the `Deker Tools`_ API documentation. + +.. _Deker Tools: api/deker_tools/modules.html + + +``data`` +======== + +This module provides ``convert_size_to_human`` method for converting bytes size into human readable +representation:: + + >>> convert_size_to_human(1052810) + "1.0 MB" + + +``path`` +======== + +This module provides functions to validate and handle filesystem ``paths``:: + + is_empty(path) + is_path_valid(path) + + +``slices`` +========== + +Calculate ``shape`` of a subset from the index expression:: + + >>> shape = (361, 720, 4) + >>> index_exp = (slice(None, None, None), slice(None, None, None), 0) + >>> create_shape_from_slice(shape, index_exp) + (361, 720) + +Convert ``slice`` into a sequence and get its length:: + + >>> match_slice_size(10, slice(10)) + (0, 10, 1) + +Serialize ``slices`` to ``string`` and vice versa with ``slice_converter``:: + + >>> slice_converter[5] + '[5]' + + >>> slice_converter[datetime.datetime(2023,1,1):datetime.datetime(2023,2,1), 0.1:0.9:0.05] + '[`2023-01-01T00:00:00`:`2023-02-01T00:00:00`, 0.1:0.9:0.05]' + + +``time`` +======== + +This module provides ``get_utc`` function which returns timezone with UTC or current time 
by +default:: + + >>> get_utc() + 2023-07-26 15:42:05.539317+00:00 + + >>> get_utc(datetime.now()) + 2023-07-26 15:42:05.539317+00:00 + diff --git a/docs/deker/why_deker.rst b/docs/deker/why_deker.rst deleted file mode 100755 index 74cba94..0000000 --- a/docs/deker/why_deker.rst +++ /dev/null @@ -1,152 +0,0 @@ -*********** -Why Deker? -*********** - -Deker was made with the aims: - - to easily write and read big amounts of data; - - to be thread and process safe; - - to be lazy as long as it can be. - -It means that Deker can operate the datasets limited only by the size of your available free RAM. -To some extent it resembles an ORM over a database, but it is still a database. - -How it works -================ -| In contrast with similar decisions Deker's interfaces are quite simple and user-friendly. -| It has a few high-level objects for data managing: - -- :class:`Client ` -- :class:`Collection ` -- :class:`Array ` or :class:`VArray ` -- :class:`Subset ` or :class:`VSubset ` - -``Client`` is the first object you start with. It is being used for creating and getting ``Collections`` of ``Arrays`` -or ``VArrays`` basing on the collection schema. - -The data has a representation of arrays and is stored in files with the help of ``deker-local-adapters`` plugin. -Deker provides two types of high-level objects for data managing with the same interface: ``Array`` and ``VArray``. - -What is the difference? ``Array`` is an abstract wrapper over final low-level arrays (or files) containing data. -``VArray`` *(or Virtual Array)* is an "array of ``Arrays``" or an "image of pixels". -If we consider VArray as an image, it is split by virtual grid into some tiles having the similar shape. -In this case, each tile is an ordinary ``Array``, and each ``Array`` is a file. But for a user there is no difference -between ``VArray`` and ``Array`` interfaces. 
- -To access the data you need just to create a slice from your ``Array`` or ``VArray``, thus you'll get a new object -called ``Subset`` or ``VSubset``. - -``Subset`` and ``VSubset`` also have the same interface and they are the final lazy objects, which possess methods -for direct data updating, reading and clearing. - -For creating a new ``Сollection`` you will need a few more objects: - -- :class:`DimensionSchema ` and/or - :class:`TimeDimensionSchema ` -- :class:`ArraySchema ` or :class:`VArraySchema ` -- :class:`AttributeSchema ` (optionally, but highly recommended) - -And optionally you may need: - -- :class:`HDF5Options ` and :class:`HDF5CompressionOpts ` -- :class:`Scale ` - -Features -========== - -1. ``VArrays`` -2. Own locks -3. Strong data typing. -4. FancySlicing: use ``datetime``, ``floats`` and ``strings`` instead of ``integers`` for ``Arrays`` and ``VArrays`` - slicing (available for ``Dimensions`` described with ``labels`` or ``scale`` parameters and for ``TimeDimensions``) -5. Reading your data as ``xarray.DataArray`` with further possibility of conversion to different formats - (refer to the DataArray_ ``to_...`` methods) -6. Data compression and chunking (available for ``HDF5``) - -.. _DataArray: https://docs.xarray.dev/en/stable/generated/xarray.DataArray.html - -Understanding Array and VArray -================================ - -Array ------- -As previously mentioned, ``Array`` is an abstract wrapper over files containing data. It does not have a direct access -to the data, but it knows everything about its properties and options. - -.. |cell| image:: images/cell.png - :scale: 5% - -| An array is made of cells - |cell| - containers for data pieces. -| Here is an example of a simple 3-dimensional array with some weather data: - -.. image:: images/array_0_axes.png - :scale: 30% - -.. 
image:: images/legend.png - :scale: 28% - :align: right - -Let's assume that ``x`` and ``y`` axes represent some geographical grid, and ``z`` axis represents layers -with weather data, as shown in the legend. - -It is a single ``Array`` having 4 cells in each dimension, in other words its shape is ``(4, 4, 4)``. - -Deker stores this data in a single file, and when we call this ``Array`` from the correspondent ``Collection``, all the -operations with its data will affect this file. - -So, it is quite simple: one ``Array`` - one file. - - -VArray -------- -And here comes ``VArray``! - -.. image:: images/array_0_axes.png - :scale: 30% - -No, it is not a mistake. Both ``Array`` and ``VArray`` have the same interface, so there is no visible -difference between them for a user. - -.. attention:: But there is a difference under the hood! - -Imagine that you need to create a photo of something really big with a very high resolution, for example a photo -of the Earth. Suppose the size of the image is 300.000 px * 200.000 px. It is really huge and requires incredibly -much space on a drive and a lot of RAM to be processed. Obviously, nowadays it is impossible to upload it promptly -to physical memory. Moreover, it may require several storage drives to be written down, as its final size depends -on the data type. - -How this problem can be solved? We can make a lot of small shots, place them in separated files and arrange them -in the correct order. We certainly will not be able to see the full picture, but we will be able to browse it piece -by piece. - -.. image:: images/vgrid.png - :scale: 35% - :align: center - -``VArray`` is a virtual wrapper over such a set of files. You can see how ``vgrid`` cuts it into separated pieces -in the above image. Each separate piece is an ``Array``, which lays under ``VArray``. And as previously stated, one -``Array`` is one file. 
If your ``Collection`` is a collection of ``VArrays``, you don't have to worry about ``Arrays``, -``VArray`` manages them for you. - -When we query some piece of data, ``VArray`` calculates which files to open and what bounds to impose on each -of these files. - -For example, we have the same VArray: its shape is ``(4, 4, 4)``, its dimensions are arranged as ``['x', 'y', 'z']`` -and its *zero-index* is at the front-left-bottom corner. - -.. image:: images/varray.png - :scale: 30% - -Let's query it in the following way: ``VArray[:, 2:4, :]`` - -.. image:: images/varray_request.png - :scale: 30% - -Here you can see, that all of 4 files will be affected, but only the highlighted parts of them will be captured. -If you use these bounds for **inserting or updating**, ``VArray`` will distribute your input data within the proper -files and in the correct order. If you use them for **reading**, ``VArray`` will aggregate all the captured parts into -one ``numpy.ndarray`` of the correspondent shape and in the correct order and return it to you. And, obviously, the -captured parts will be **cleared**, if you so wish. - -Pursuing the aim to be fast, ``VArray`` uses its own ``ThreadPoolExecutor`` to cope with all the tasks it needs to do. -In the interest of thread-safety Deker uses its own file locks for all sorts of file operations: creating, reading, -writing and deleting. diff --git a/docs/index.rst b/docs/index.rst index 4eefc38..7c889c5 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,74 +1,84 @@ +.. toctree:: + :hidden: + :caption: Tutorials + + Installation + Collection Schema + Data Access + Fine Tuning + Interactive Shell + Deker Tools + +.. toctree:: + :hidden: + :caption: API Reference + + Deker API + Deker Tools + +.. toctree:: + :hidden: + :caption: About Us + + OpenWeather + GitHub Projects + + +************** +What is Deker? +************** + .. 
image:: deker/images/logo.png :align: right :scale: 50% -Deker documentation -==================== -**Welcome to Deker!** +Deker is a pure Python implementation of a petabyte-scale highly parallel data storage engine for +multidimensional arrays. -Deker - is a pure-Python NoSQL database framework, which provides storing multidimensional spatial raster -numeric data and its further simple, fast and comfortable accessing and managing. +The Deker name comes from the term *dekeract*, the 10-cube_. -.. admonition:: What is Deker? +Deker was made with the following major goals in mind: - **Deker** is a short name for a **dekeract** or `10-cube (ten-dimensional hypercube)`_. + * provide an intuitive interface for storing and accessing **huge data arrays** + * support **arbitrary number of data dimensions** + * be **thread and process safe** and as **lean on RAM** use as possible -.. _10-cube (ten-dimensional hypercube): https://en.wikipedia.org/wiki/10-cube +Deker empowers users to store and access a wide range of data types, virtually anything that can be +represented as arrays, like **geospatial data**, **satellite images**, **machine learning models**, +**sensors data**, graphs, key-value pairs, tabular data, and more. -It perfectly fits for a vast variety of data: +Deker does not limit your data complexity and size: it supports virtually unlimited number of data +dimensions and provides under the hood mechanisms to **partition** huge amounts of data for +**scalability**. -- geospatial data (cartography, geodesy, meteorology, …, even outer space), -- images, -- video, -- audio, -- biomedicine, -- genomics, -- finance, -- ML, -- ... +.. _10-cube: https://en.wikipedia.org/wiki/10-cube -and many others – everything that may be represented and stored as a pack of numbers. -Deker is not really limited by a number of dimensions – it’s up to you to decide how complicated your structures -shall be and how many dimensions you use `(our current goal is 5, at the moment)`.
+Features +======== -| Actually, it is a scalable high-level wrapper over different file formats. -| At the moment Deker supports just ``HDF5``, but we’ll be glad to accept PRs with new storage adapters: - ``TIFF``, ``NetCDF``, ``ZARR``, … Any format you like and need, even ``JSON`` or ``TXT``. -Deker uses NumPy_ structures and provides an additional support for Xarray_, pandas_ and others. +* **Open source** under GPL 3.0 +* Scalable storage of huge virtual arrays via **tiling** +* **Parallel processing** of virtual array tiles +* Own **locking** mechanism enabling parallel array read and write +* Array level **metadata attributes** +* **Fancy data slicing** using timestamps and named labels +* Support for industry standard NumPy_, pandas_ and Xarray_ +* Storage level data **compression and chunking** (via HDF5) .. _NumPy: https://numpy.org/doc/stable/ -.. _Xarray: https://docs.xarray.dev/en/stable/ .. _pandas: https://pandas.pydata.org/docs/ +.. _Xarray: https://docs.xarray.dev/en/stable/ -Deker open source ecosystem: - - deker - - deker-local-adapters - - deker-shell - - deker-tools -.. toctree:: - :hidden: - :caption: Tutorials +Code and Documentation +====================== - Why Deker - Installation - First steps - Data management - Fine tuning - Interactive shell +The open source implementation of the Deker storage engine is published at -.. toctree:: - :hidden: - :caption: API Reference + * https://github.com/openweathermap/deker - Deker API - Deker-tools +API documentation and tutorials for the current release can be found at -..
toctree:: - :hidden: - :caption: About us + * https://docs.deker.io - GitHub - OpenWeather diff --git a/requirements_dev.txt b/requirements_dev.txt index 080746a..daadfe6 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -22,3 +22,8 @@ pytest-mock==3.10.0 xarray>=2023.5.0 bandit==1.7.5 tox==4.6.3 +m2r2 +sphinx-rtd-theme +enum_tools +sphinx_toolbox +doc8 diff --git a/requirements_doc.txt b/requirements_doc.txt deleted file mode 100644 index 87d0aa5..0000000 --- a/requirements_doc.txt +++ /dev/null @@ -1,4 +0,0 @@ -m2r2==0.3.2 -sphinx-rtd-theme==1.2.2 -enum_tools==0.10.0 -sphinx_toolbox==3.5.0