Skip to content

Commit 18f45c2

Browse files
authored
VER: Release 0.26.0
See release notes.
2 parents 32e6ea1 + 89abe36 commit 18f45c2

17 files changed

+225
-93
lines changed

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,13 @@
11
# Changelog
22

3+
## 0.26.0 - 2024-01-16
4+
5+
This release adds support for transcoding DBN data into Apache parquet.
6+
7+
#### Enhancements
8+
- Added `DBNStore.to_parquet` for transcoding DBN data into Apache parquet using `pyarrow`
9+
- Upgraded `databento-dbn` to 0.15.0
10+
311
## 0.25.0 - 2024-01-09
412

513
#### Breaking changes

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,10 @@ The library is fully compatible with the latest distribution of Anaconda 3.8 and
3232
The minimum dependencies as found in the `pyproject.toml` are also listed below:
3333
- python = "^3.8"
3434
- aiohttp = "^3.8.3"
35-
- databento-dbn = "0.14.2"
35+
- databento-dbn = "0.15.0"
3636
- numpy= ">=1.23.5"
3737
- pandas = ">=1.5.3"
38+
- pyarrow = ">=13.0.0"
3839
- requests = ">=2.24.0"
3940
- zstandard = ">=0.21.0"
4041

databento/common/dbnstore.py

Lines changed: 88 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
Any,
1616
BinaryIO,
1717
Callable,
18+
Final,
1819
Literal,
1920
Protocol,
2021
overload,
@@ -23,6 +24,8 @@
2324
import databento_dbn
2425
import numpy as np
2526
import pandas as pd
27+
import pyarrow as pa
28+
import pyarrow.parquet as pq
2629
import zstandard
2730
from databento_dbn import FIXED_PRICE_SCALE
2831
from databento_dbn import Compression
@@ -51,6 +54,8 @@
5154

5255
logger = logging.getLogger(__name__)
5356

57+
PARQUET_CHUNK_SIZE: Final = 2**16
58+
5459
if TYPE_CHECKING:
5560
from databento.historical.client import Historical
5661

@@ -791,18 +796,14 @@ def to_csv(
791796
compression : Compression or str, default `Compression.NONE`
792797
The output compression for writing.
793798
schema : Schema or str, optional
794-
The schema for the csv.
799+
The DBN schema for the csv.
795800
This is only required when reading a DBN stream with mixed record types.
796801
797802
Raises
798803
------
799804
ValueError
800805
If the schema for the array cannot be determined.
801806
802-
Notes
803-
-----
804-
Requires all the data to be brought up into memory to then be written.
805-
806807
"""
807808
compression = validate_enum(compression, Compression, "compression")
808809
schema = validate_maybe_enum(schema, Schema, "schema")
@@ -870,7 +871,7 @@ def to_df(
870871
a 'symbol' column, mapping the instrument ID to its requested symbol for
871872
every record.
872873
schema : Schema or str, optional
873-
The schema for the dataframe.
874+
The DBN schema for the dataframe.
874875
This is only required when reading a DBN stream with mixed record types.
875876
count : int, optional
876877
If set, instead of returning a single `DataFrame` a `DataFrameIterator`
@@ -887,7 +888,7 @@ def to_df(
887888
Raises
888889
------
889890
ValueError
890-
If the schema for the array cannot be determined.
891+
If the DBN schema is unspecified and cannot be determined.
891892
892893
"""
893894
schema = validate_maybe_enum(schema, Schema, "schema")
@@ -919,6 +920,81 @@ def to_df(
919920

920921
return df_iter
921922

923+
def to_parquet(
924+
self,
925+
path: Path | str,
926+
price_type: Literal["fixed", "float"] = "float",
927+
pretty_ts: bool = True,
928+
map_symbols: bool = True,
929+
schema: Schema | str | None = None,
930+
**kwargs: Any,
931+
) -> None:
932+
"""
933+
Write the data to a parquet file at the given path.
934+
935+
Parameters
936+
----------
937+
path : Path or str
    The file path to write the parquet data to.
price_type : str, default "float"
938+
The price type to use for price fields.
939+
If "fixed", prices will have a type of `int` in fixed decimal format; each unit representing 1e-9 or 0.000000001.
940+
If "float", prices will have a type of `float`.
941+
The "decimal" price type is not supported at this time.
942+
pretty_ts : bool, default True
943+
If all timestamp columns should be converted from UNIX nanosecond
944+
`int` to tz-aware UTC `pyarrow.TimestampType`.
945+
map_symbols : bool, default True
946+
If symbology mappings from the metadata should be used to create
947+
a 'symbol' column, mapping the instrument ID to its requested symbol for
948+
every record.
949+
schema : Schema or str, optional
950+
The DBN schema for the parquet file.
951+
This is only required when reading a DBN stream with mixed record types.
952+
953+
Raises
954+
------
955+
ValueError
956+
If an incorrect price type is specified.
957+
If the DBN schema is unspecified and cannot be determined.
958+
959+
"""
960+
if price_type == "decimal":
961+
raise ValueError("the 'decimal' price type is not currently supported")
962+
963+
schema = validate_maybe_enum(schema, Schema, "schema")
964+
if schema is None:
965+
if self.schema is None:
966+
raise ValueError("a schema must be specified for mixed DBN data")
967+
schema = self.schema
968+
969+
dataframe_iter = self.to_df(
970+
price_type=price_type,
971+
pretty_ts=pretty_ts,
972+
map_symbols=map_symbols,
973+
schema=schema,
974+
count=PARQUET_CHUNK_SIZE,
975+
)
976+
977+
writer = None
978+
try:
979+
for frame in dataframe_iter:
980+
if writer is None:
981+
# Initialize the writer using the first DataFrame
982+
parquet_schema = pa.Schema.from_pandas(frame)
983+
writer = pq.ParquetWriter(
984+
where=path,
985+
schema=parquet_schema,
986+
**kwargs,
987+
)
988+
writer.write_table(
989+
pa.Table.from_pandas(
990+
frame,
991+
schema=parquet_schema,
992+
),
993+
)
994+
finally:
995+
if writer is not None:
996+
writer.close()
997+
922998
def to_file(self, path: Path | str) -> None:
923999
"""
9241000
Write the data to a DBN file at the given path.
@@ -972,18 +1048,14 @@ def to_json(
9721048
compression : Compression or str, default `Compression.NONE`
9731049
The output compression for writing.
9741050
schema : Schema or str, optional
975-
The schema for the json.
1051+
The DBN schema for the json.
9761052
This is only required when reading a DBN stream with mixed record types.
9771053
9781054
Raises
9791055
------
9801056
ValueError
9811057
If the schema for the array cannot be determined.
9821058
983-
Notes
984-
-----
985-
Requires all the data to be brought up into memory to then be written.
986-
9871059
"""
9881060
compression = validate_enum(compression, Compression, "compression")
9891061
schema = validate_maybe_enum(schema, Schema, "schema")
@@ -1030,7 +1102,7 @@ def to_ndarray(
10301102
Parameters
10311103
----------
10321104
schema : Schema or str, optional
1033-
The schema for the array.
1105+
The DBN schema for the array.
10341106
This is only required when reading a DBN stream with mixed record types.
10351107
count : int, optional
10361108
If set, instead of returning a single `np.ndarray` a `NDArrayIterator`
@@ -1047,7 +1119,7 @@ def to_ndarray(
10471119
Raises
10481120
------
10491121
ValueError
1050-
If the schema for the array cannot be determined.
1122+
If the DBN schema is unspecified and cannot be determined.
10511123
10521124
"""
10531125
schema = validate_maybe_enum(schema, Schema, "schema")
@@ -1120,7 +1192,7 @@ def _transcode(
11201192
pretty_ts=pretty_ts,
11211193
has_metadata=True,
11221194
map_symbols=map_symbols,
1123-
symbol_interval_map=symbol_map, # type: ignore [arg-type]
1195+
symbol_interval_map=symbol_map,
11241196
schema=schema,
11251197
)
11261198

@@ -1329,8 +1401,7 @@ def _format_px(
13291401
if price_type == "decimal":
13301402
for field in px_fields:
13311403
df[field] = (
1332-
df[field].replace(INT64_NULL, np.nan).apply(decimal.Decimal)
1333-
/ FIXED_PRICE_SCALE
1404+
df[field].replace(INT64_NULL, np.nan).apply(decimal.Decimal) / FIXED_PRICE_SCALE
13341405
)
13351406
elif price_type == "float":
13361407
for field in px_fields:

databento/common/parsing.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
from datetime import date
55
from functools import partial
66
from functools import singledispatch
7-
from numbers import Number
7+
from numbers import Integral
8+
from typing import Any
89

910
import pandas as pd
1011
from databento_dbn import SType
@@ -59,7 +60,7 @@ def optional_values_list_to_string(
5960

6061
@singledispatch
6162
def optional_symbols_list_to_list(
62-
symbols: Iterable[str] | Iterable[Number] | str | Number | None,
63+
symbols: Iterable[str | int | Integral] | str | int | Integral | None,
6364
stype_in: SType,
6465
) -> list[str]:
6566
"""
@@ -68,7 +69,7 @@ def optional_symbols_list_to_list(
6869
6970
Parameters
7071
----------
71-
symbols : iterable of str, iterable of Number, str, or Number optional
72+
symbols : Iterable[str | int] or str or int, optional
7273
The symbols to concatenate.
7374
stype_in : SType
7475
The input symbology type for the request.
@@ -84,7 +85,7 @@ def optional_symbols_list_to_list(
8485
"""
8586
raise TypeError(
8687
f"`{symbols}` is not a valid type for symbol input; "
87-
"allowed types are Iterable[str], Iterable[int], str, int, and None.",
88+
"allowed types are Iterable[str | int], str, int, and None.",
8889
)
8990

9091

@@ -102,10 +103,10 @@ def _(_: None, __: SType) -> list[str]:
102103
return [ALL_SYMBOLS]
103104

104105

105-
@optional_symbols_list_to_list.register(cls=Number)
106-
def _(symbols: Number, stype_in: SType) -> list[str]:
106+
@optional_symbols_list_to_list.register(cls=Integral)
107+
def _(symbols: Integral, stype_in: SType) -> list[str]:
107108
"""
108-
Dispatch method for optional_symbols_list_to_list. Handles numerical types,
109+
Dispatch method for optional_symbols_list_to_list. Handles integral types,
109110
alerting when an integer is given for STypes that expect strings.
110111
111112
See Also
@@ -147,7 +148,7 @@ def _(symbols: str, stype_in: SType) -> list[str]:
147148

148149

149150
@optional_symbols_list_to_list.register(cls=Iterable)
150-
def _(symbols: Iterable[str] | Iterable[int], stype_in: SType) -> list[str]:
151+
def _(symbols: Iterable[Any], stype_in: SType) -> list[str]:
151152
"""
152153
Dispatch method for optional_symbols_list_to_list. Handles Iterables by
153154
dispatching the individual members.

databento/historical/api/batch.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import logging
44
import os
5+
from collections.abc import Iterable
56
from datetime import date
67
from os import PathLike
78
from pathlib import Path
@@ -48,7 +49,7 @@ def __init__(self, key: str, gateway: str) -> None:
4849
def submit_job(
4950
self,
5051
dataset: Dataset | str,
51-
symbols: list[str] | str,
52+
symbols: Iterable[str | int] | str | int,
5253
schema: Schema | str,
5354
start: pd.Timestamp | date | str | int,
5455
end: pd.Timestamp | date | str | int | None = None,
@@ -75,7 +76,7 @@ def submit_job(
7576
----------
7677
dataset : Dataset or str
7778
The dataset code (string identifier) for the request.
78-
symbols : list[str | int] or str
79+
symbols : Iterable[str | int] or str or int
7980
The instrument symbols to filter for. Takes up to 2,000 symbols per request.
8081
If more than 1 symbol is specified, the data is merged and sorted by time.
8182
If 'ALL_SYMBOLS' or `None` then will be for **all** symbols.

databento/historical/api/metadata.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
from collections.abc import Iterable
34
from datetime import date
45
from typing import Any
56

@@ -261,7 +262,7 @@ def get_record_count(
261262
dataset: Dataset | str,
262263
start: pd.Timestamp | date | str | int,
263264
end: pd.Timestamp | date | str | int | None = None,
264-
symbols: list[str] | str | None = None,
265+
symbols: Iterable[str | int] | str | int | None = None,
265266
schema: Schema | str = "trades",
266267
stype_in: SType | str = "raw_symbol",
267268
limit: int | None = None,
@@ -285,7 +286,7 @@ def get_record_count(
285286
If an integer is passed, then this represents nanoseconds since the UNIX epoch.
286287
Values are forward filled based on the resolution provided.
287288
Defaults to the same value as `start`.
288-
symbols : list[str | int] or str, optional
289+
symbols : Iterable[str | int] or str or int, optional
289290
The instrument symbols to filter for. Takes up to 2,000 symbols per request.
290291
If 'ALL_SYMBOLS' or `None` then will be for **all** symbols.
291292
schema : Schema or str {'mbo', 'mbp-1', 'mbp-10', 'trades', 'tbbo', 'ohlcv-1s', 'ohlcv-1m', 'ohlcv-1h', 'ohlcv-1d', 'definition', 'statistics', 'status'}, default 'trades' # noqa
@@ -329,7 +330,7 @@ def get_billable_size(
329330
dataset: Dataset | str,
330331
start: pd.Timestamp | date | str | int,
331332
end: pd.Timestamp | date | str | int | None = None,
332-
symbols: list[str] | str | None = None,
333+
symbols: Iterable[str | int] | str | int | None = None,
333334
schema: Schema | str = "trades",
334335
stype_in: SType | str = "raw_symbol",
335336
limit: int | None = None,
@@ -354,7 +355,7 @@ def get_billable_size(
354355
If an integer is passed, then this represents nanoseconds since the UNIX epoch.
355356
Values are forward filled based on the resolution provided.
356357
Defaults to the same value as `start`.
357-
symbols : list[str | int] or str, optional
358+
symbols : Iterable[str | int] or str or int, optional
358359
The instrument symbols to filter for. Takes up to 2,000 symbols per request.
359360
If 'ALL_SYMBOLS' or `None` then will be for **all** symbols.
360361
schema : Schema or str {'mbo', 'mbp-1', 'mbp-10', 'trades', 'tbbo', 'ohlcv-1s', 'ohlcv-1m', 'ohlcv-1h', 'ohlcv-1d', 'definition', 'statistics', 'status'}, default 'trades' # noqa
@@ -399,7 +400,7 @@ def get_cost(
399400
start: pd.Timestamp | date | str | int,
400401
end: pd.Timestamp | date | str | int | None = None,
401402
mode: FeedMode | str = "historical-streaming",
402-
symbols: list[str] | str | None = None,
403+
symbols: Iterable[str | int] | str | int | None = None,
403404
schema: Schema | str = "trades",
404405
stype_in: SType | str = "raw_symbol",
405406
limit: int | None = None,
@@ -426,7 +427,7 @@ def get_cost(
426427
Defaults to the same value as `start`.
427428
mode : FeedMode or str {'live', 'historical-streaming', 'historical'}, default 'historical-streaming'
428429
The data feed mode for the request.
429-
symbols : list[str | int] or str, optional
430+
symbols : Iterable[str | int] or str or int, optional
430431
The instrument symbols to filter for. Takes up to 2,000 symbols per request.
431432
If 'ALL_SYMBOLS' or `None` then will be for **all** symbols.
432433
schema : Schema or str {'mbo', 'mbp-1', 'mbp-10', 'trades', 'tbbo', 'ohlcv-1s', 'ohlcv-1m', 'ohlcv-1h', 'ohlcv-1d', 'definition', 'statistics', 'status'}, default 'trades' # noqa

0 commit comments

Comments
 (0)