VER: Release 0.14.1

databento · Jun 16, 2023 · 444d68a · 444d68a
2 parents 4648cbf + 39035cd
commit 444d68a
Show file tree

Hide file tree

Showing 35 changed files with 427 additions and 337 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,9 @@
 # Changelog
 
+## 0.14.1 - 2023-06-16
+- Fixed issue where `DBNStore.to_df()` would raise an exception if no records were present
+- Fixed exception message when creating a DBNStore from an empty data source
+
 ## 0.14.0 - 2023-06-14
 - Added support for reusing a `Live` client to reconnect
 - Added `metadata` property to `Live`

diff --git a/README.md b/README.md
@@ -1,18 +1,11 @@
-<a href="https://databento.com">
-  <picture>
-    <source media="(prefers-color-scheme: dark)" srcset="https://dzv012k6yye9u.cloudfront.net/brand/logo-white.png">
-    <source media="(prefers-color-scheme: light)" srcset="https://dzv012k6yye9u.cloudfront.net/brand/logo.png">
-    <img alt="Databento" src="https://dzv012k6yye9u.cloudfront.net/brand/logo-white.png" width="560px">
-  </picture>
-</a>
-
-# Pay as you go for market data
+# databento-python
 
 [![test](https://github.com/databento/databento-python/actions/workflows/test.yml/badge.svg?branch=dev)](https://github.com/databento/databento-python/actions/workflows/test.yml)
 ![python](https://img.shields.io/badge/python-3.8+-blue.svg)
 [![pypi-version](https://img.shields.io/pypi/v/databento)](https://pypi.org/project/databento)
 [![license](https://img.shields.io/github/license/databento/databento-python?color=blue)](./LICENSE)
 [![code-style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
+[![Slack](https://img.shields.io/badge/join_Slack-community-darkblue.svg?logo=slack)](https://join.slack.com/t/databento-hq/shared_invite/zt-1xk498wxs-9fUs_xhz5ypaGD~mhI_hVQ)
 
 The official Python client library for [Databento](https://databento.com).
 

diff --git a/databento/common/bentologging.py b/databento/common/bentologging.py
@@ -5,9 +5,9 @@
 
 def enable_logging(level: int | str = logging.INFO) -> None:
     """
-    Enable logging for the Databento module.
-    This function should be used for simple applications and examples.
-    It is advisible to configure your own logging for serious applications.
+    Enable logging for the Databento module. This function should be used for
+    simple applications and examples. It is advisable to configure your own
+    logging for serious applications.
 
     Parameters
     ----------

diff --git a/databento/common/cram.py b/databento/common/cram.py
@@ -1,4 +1,6 @@
-"""Functions for handling challenge-response authentication"""
+"""
+Functions for handling challenge-response authentication.
+"""
 import argparse
 import hashlib
 import os
@@ -10,9 +12,8 @@
 
 def get_challenge_response(challenge: str, key: str) -> str:
     """
-    Return the response for a given challenge-response
-    authentication mechanism (CRAM) code provided by
-    a Databento service.
+    Return the response for a given challenge-response authentication mechanism
+    (CRAM) code provided by a Databento service.
 
     A valid API key is hashed with the challenge string.
 

diff --git a/databento/common/data.py b/databento/common/data.py
@@ -28,15 +28,6 @@ def get_deriv_ba_types(level: int) -> list[tuple[str, type | str]]:
     Schema.TRADES,
 )
 
-
-OHLCV_SCHEMAS = (
-    Schema.OHLCV_1S,
-    Schema.OHLCV_1M,
-    Schema.OHLCV_1H,
-    Schema.OHLCV_1D,
-)
-
-
 RECORD_HEADER: list[tuple[str, type | str]] = [
     ("length", np.uint8),
     ("rtype", np.uint8),
@@ -265,6 +256,7 @@ def get_deriv_ba_fields(level: int) -> list[str]:
 
 
 DERIV_HEADER_COLUMNS = [
+    "ts_recv",
     "ts_event",
     "ts_in_delta",
     "publisher_id",
@@ -279,6 +271,7 @@ def get_deriv_ba_fields(level: int) -> list[str]:
 ]
 
 OHLCV_HEADER_COLUMNS = [
+    "ts_event",
     "publisher_id",
     "instrument_id",
     "open",
@@ -289,7 +282,6 @@ def get_deriv_ba_fields(level: int) -> list[str]:
 ]
 
 DEFINITION_DROP_COLUMNS = [
-    "ts_recv",
     "length",
     "rtype",
     "reserved1",
@@ -299,14 +291,12 @@ def get_deriv_ba_fields(level: int) -> list[str]:
 ]
 
 IMBALANCE_DROP_COLUMNS = [
-    "ts_recv",
     "length",
     "rtype",
     "dummy",
 ]
 
 STATISTICS_DROP_COLUMNS = [
-    "ts_recv",
     "length",
     "rtype",
     "dummy",
@@ -330,6 +320,7 @@ def get_deriv_ba_fields(level: int) -> list[str]:
 
 COLUMNS = {
     Schema.MBO: [
+        "ts_recv",
         "ts_event",
         "ts_in_delta",
         "publisher_id",

diff --git a/databento/common/dbnstore.py b/databento/common/dbnstore.py
@@ -2,6 +2,7 @@
 
 import abc
 import datetime as dt
+import functools
 import logging
 from collections.abc import Generator
 from io import BytesIO
@@ -55,8 +56,7 @@
 
 def is_zstandard(reader: IO[bytes]) -> bool:
     """
-    Determine if an `IO[bytes]` reader contains zstandard compressed
-    data.
+    Determine if an `IO[bytes]` reader contains zstandard compressed data.
 
     Parameters
     ----------
@@ -96,7 +96,9 @@ def is_dbn(reader: IO[bytes]) -> bool:
 
 
 class DataSource(abc.ABC):
-    """Abstract base class for backing DBNStore instances with data."""
+    """
+    Abstract base class for backing DBNStore instances with data.
+    """
 
     def __init__(self, source: object) -> None:
         ...
@@ -137,6 +139,11 @@ def __init__(self, source: PathLike[str] | str):
         if not self._path.is_file() or not self._path.exists():
             raise FileNotFoundError(source)
 
+        if self._path.stat().st_size == 0:
+            raise ValueError(
+                f"Cannot create data source from empty file: {self._path.name}",
+            )
+
         self._name = self._path.name
         self.__buffer: IO[bytes] | None = None
 
@@ -244,8 +251,8 @@ def nbytes(self) -> int:
     @property
     def reader(self) -> IO[bytes]:
         """
-        Return a reader for this buffer.
-        The reader beings at the start of the buffer.
+        Return a reader for this buffer. The reader begins at the start of the
+        buffer.
 
         Returns
         -------
@@ -306,6 +313,11 @@ class DBNStore:
     to_ndarray : np.ndarray
         The data as a numpy `ndarray`.
 
+    Raises
+    ------
+    BentoError
+        When the data_source does not contain valid DBN data or is corrupted.
+
     See Also
     --------
     https://docs.databento.com/knowledge-base/new-users/dbn-encoding
@@ -328,7 +340,7 @@ def __init__(self, data_source: DataSource) -> None:
             buffer = data_source.reader
         else:
             # We don't know how to read this file
-            raise RuntimeError(
+            raise BentoError(
                 f"Could not determine compression format of {self._data_source.name}",
             )
 
@@ -452,10 +464,6 @@ def _prepare_dataframe(
         df: pd.DataFrame,
         schema: Schema,
     ) -> pd.DataFrame:
-        # Setup column ordering and index
-        df.set_index(self._get_index_column(schema), inplace=True)
-        df = df.reindex(columns=COLUMNS[schema])
-
         if schema == Schema.MBO or schema in DERIV_SCHEMAS:
             df["flags"] = df["flags"] & 0xFF  # Apply bitmask
             df["side"] = df["side"].str.decode("utf-8")
@@ -500,8 +508,8 @@ def _map_symbols(self, df: pd.DataFrame, pretty_ts: bool) -> pd.DataFrame:
     @property
     def compression(self) -> Compression:
         """
-        Return the data compression format (if any).
-        This is determined by inspecting the data.
+        Return the data compression format (if any). This is determined by
+        inspecting the data.
 
         Returns
         -------
@@ -525,8 +533,8 @@ def dataset(self) -> str:
     @property
     def end(self) -> pd.Timestamp | None:
         """
-        Return the query end for the data.
-        If None, the end time was not known when the data was generated.
+        Return the query end for the data. If None, the end time was not known
+        when the data was generated.
 
         Returns
         -------
@@ -632,8 +640,7 @@ def reader(self) -> IO[bytes]:
     @property
     def schema(self) -> Schema | None:
         """
-        Return the DBN record schema.
-        If None, may contain one or more schemas.
+        Return the DBN record schema. If None, may contain one or more schemas.
 
         Returns
         -------
@@ -664,8 +671,8 @@ def start(self) -> pd.Timestamp:
     @property
     def stype_in(self) -> SType | None:
         """
-        Return the query input symbology type for the data.
-        If None, the records may contain mixed STypes.
+        Return the query input symbology type for the data. If None, the
+        records may contain mixed STypes.
 
         Returns
         -------
@@ -739,7 +746,9 @@ def from_file(cls, path: PathLike[str] | str) -> DBNStore:
         Raises
         ------
         FileNotFoundError
-            If a empty or non-existant file is specified.
+            If a non-existent file is specified.
+        ValueError
+            If an empty file is specified.
 
         """
         return cls(FileDataSource(path))
@@ -760,8 +769,8 @@ def from_bytes(cls, data: BytesIO | bytes | IO[bytes]) -> DBNStore:
 
         Raises
         ------
-        FileNotFoundError
-            If a empty or non-existant file is specified.
+        ValueError
+            If an empty buffer is specified.
 
         """
         return cls(MemoryDataSource(data))
@@ -941,7 +950,12 @@ def to_df(
                 raise ValueError("a schema must be specified for mixed DBN data")
             schema = self.schema
 
-        df = pd.DataFrame(self.to_ndarray(schema=schema))
+        df = pd.DataFrame(
+            self.to_ndarray(schema),
+            columns=COLUMNS[schema],
+        )
+        df.set_index(self._get_index_column(schema), inplace=True)
+
         df = self._prepare_dataframe(df, schema)
 
         if pretty_ts:
@@ -1049,12 +1063,10 @@ def to_ndarray(
             self,
         )
 
-        result = []
-        for record in schema_records:
-            np_rec = np.frombuffer(
-                bytes(record),
-                dtype=STRUCT_MAP[schema],
-            )
-            result.append(np_rec[0])
+        decoder = functools.partial(np.frombuffer, dtype=STRUCT_MAP[schema])
+        result = tuple(map(decoder, map(bytes, schema_records)))
+
+        if not result:
+            return np.empty(shape=(0, 1), dtype=STRUCT_MAP[schema])
 
-        return np.asarray(result)
+        return np.ravel(result)