15
15
Any ,
16
16
BinaryIO ,
17
17
Callable ,
18
+ Final ,
18
19
Literal ,
19
20
Protocol ,
20
21
overload ,
23
24
import databento_dbn
24
25
import numpy as np
25
26
import pandas as pd
27
+ import pyarrow as pa
28
+ import pyarrow .parquet as pq
26
29
import zstandard
27
30
from databento_dbn import FIXED_PRICE_SCALE
28
31
from databento_dbn import Compression
51
54
52
55
logger = logging .getLogger (__name__ )
53
56
57
+ PARQUET_CHUNK_SIZE : Final = 2 ** 16
58
+
54
59
if TYPE_CHECKING :
55
60
from databento .historical .client import Historical
56
61
@@ -791,18 +796,14 @@ def to_csv(
791
796
compression : Compression or str, default `Compression.NONE`
792
797
The output compression for writing.
793
798
schema : Schema or str, optional
794
- The schema for the csv.
799
+ The DBN schema for the csv.
795
800
This is only required when reading a DBN stream with mixed record types.
796
801
797
802
Raises
798
803
------
799
804
ValueError
800
805
If the schema for the array cannot be determined.
801
806
802
- Notes
803
- -----
804
- Requires all the data to be brought up into memory to then be written.
805
-
806
807
"""
807
808
compression = validate_enum (compression , Compression , "compression" )
808
809
schema = validate_maybe_enum (schema , Schema , "schema" )
@@ -870,7 +871,7 @@ def to_df(
870
871
a 'symbol' column, mapping the instrument ID to its requested symbol for
871
872
every record.
872
873
schema : Schema or str, optional
873
- The schema for the dataframe.
874
+ The DBN schema for the dataframe.
874
875
This is only required when reading a DBN stream with mixed record types.
875
876
count : int, optional
876
877
If set, instead of returning a single `DataFrame` a `DataFrameIterator`
@@ -887,7 +888,7 @@ def to_df(
887
888
Raises
888
889
------
889
890
ValueError
890
- If the schema for the array cannot be determined.
891
+ If the DBN schema is unspecified and cannot be determined.
891
892
892
893
"""
893
894
schema = validate_maybe_enum (schema , Schema , "schema" )
@@ -919,6 +920,81 @@ def to_df(
919
920
920
921
return df_iter
921
922
923
def to_parquet(
    self,
    path: Path | str,
    price_type: Literal["fixed", "float"] = "float",
    pretty_ts: bool = True,
    map_symbols: bool = True,
    schema: Schema | str | None = None,
    **kwargs: Any,
) -> None:
    """
    Write the data to a parquet file at the given path.

    Parameters
    ----------
    path : Path or str
        The file path to write the parquet file to.
    price_type : str, default "float"
        The price type to use for price fields.
        If "fixed", prices will have a type of `int` in fixed decimal format; each unit representing 1e-9 or 0.000000001.
        If "float", prices will have a type of `float`.
        The "decimal" price type is not supported at this time.
    pretty_ts : bool, default True
        If all timestamp columns should be converted from UNIX nanosecond
        `int` to tz-aware UTC `pyarrow.TimestampType`.
    map_symbols : bool, default True
        If symbology mappings from the metadata should be used to create
        a 'symbol' column, mapping the instrument ID to its requested symbol for
        every record.
    schema : Schema or str, optional
        The DBN schema for the parquet file.
        This is only required when reading a DBN stream with mixed record types.
    **kwargs : Any
        Additional keyword arguments forwarded to the
        `pyarrow.parquet.ParquetWriter` constructor (e.g. compression options).

    Raises
    ------
    ValueError
        If an incorrect price type is specified.
        If the DBN schema is unspecified and cannot be determined.

    """
    if price_type == "decimal":
        raise ValueError("the 'decimal' price type is not currently supported")

    schema = validate_maybe_enum(schema, Schema, "schema")
    if schema is None:
        if self.schema is None:
            raise ValueError("a schema must be specified for mixed DBN data")
        schema = self.schema

    # Stream the data in fixed-size DataFrame chunks so the whole dataset
    # is never materialized in memory at once.
    dataframe_iter = self.to_df(
        price_type=price_type,
        pretty_ts=pretty_ts,
        map_symbols=map_symbols,
        schema=schema,
        count=PARQUET_CHUNK_SIZE,
    )

    writer = None
    try:
        for frame in dataframe_iter:
            if writer is None:
                # Initialize the writer using the first DataFrame
                parquet_schema = pa.Schema.from_pandas(frame)
                writer = pq.ParquetWriter(
                    where=path,
                    schema=parquet_schema,
                    **kwargs,
                )
            writer.write_table(
                pa.Table.from_pandas(
                    frame,
                    schema=parquet_schema,
                ),
            )
    finally:
        # Always close the writer, even if iteration or writing raises, so
        # any buffered row groups are flushed and the file handle released.
        # Note: if the iterator yields no frames, no file is created.
        if writer is not None:
            writer.close()
922
998
def to_file (self , path : Path | str ) -> None :
923
999
"""
924
1000
Write the data to a DBN file at the given path.
@@ -972,18 +1048,14 @@ def to_json(
972
1048
compression : Compression or str, default `Compression.NONE`
973
1049
The output compression for writing.
974
1050
schema : Schema or str, optional
975
- The schema for the json.
1051
+ The DBN schema for the json.
976
1052
This is only required when reading a DBN stream with mixed record types.
977
1053
978
1054
Raises
979
1055
------
980
1056
ValueError
981
1057
If the schema for the array cannot be determined.
982
1058
983
- Notes
984
- -----
985
- Requires all the data to be brought up into memory to then be written.
986
-
987
1059
"""
988
1060
compression = validate_enum (compression , Compression , "compression" )
989
1061
schema = validate_maybe_enum (schema , Schema , "schema" )
@@ -1030,7 +1102,7 @@ def to_ndarray(
1030
1102
Parameters
1031
1103
----------
1032
1104
schema : Schema or str, optional
1033
- The schema for the array.
1105
+ The DBN schema for the array.
1034
1106
This is only required when reading a DBN stream with mixed record types.
1035
1107
count : int, optional
1036
1108
If set, instead of returning a single `np.ndarray` a `NDArrayIterator`
@@ -1047,7 +1119,7 @@ def to_ndarray(
1047
1119
Raises
1048
1120
------
1049
1121
ValueError
1050
- If the schema for the array cannot be determined.
1122
+ If the DBN schema is unspecified and cannot be determined.
1051
1123
1052
1124
"""
1053
1125
schema = validate_maybe_enum (schema , Schema , "schema" )
@@ -1120,7 +1192,7 @@ def _transcode(
1120
1192
pretty_ts = pretty_ts ,
1121
1193
has_metadata = True ,
1122
1194
map_symbols = map_symbols ,
1123
- symbol_interval_map = symbol_map , # type: ignore [arg-type]
1195
+ symbol_interval_map = symbol_map ,
1124
1196
schema = schema ,
1125
1197
)
1126
1198
@@ -1329,8 +1401,7 @@ def _format_px(
1329
1401
if price_type == "decimal" :
1330
1402
for field in px_fields :
1331
1403
df [field ] = (
1332
- df [field ].replace (INT64_NULL , np .nan ).apply (decimal .Decimal )
1333
- / FIXED_PRICE_SCALE
1404
+ df [field ].replace (INT64_NULL , np .nan ).apply (decimal .Decimal ) / FIXED_PRICE_SCALE
1334
1405
)
1335
1406
elif price_type == "float" :
1336
1407
for field in px_fields :
0 commit comments