Skip to content

Commit

Permalink
VER: Release 0.22.1
Browse files Browse the repository at this point in the history
See release notes.
  • Loading branch information
nmacholl authored Oct 24, 2023
2 parents 3247f49 + c2814b3 commit f4e2a13
Show file tree
Hide file tree
Showing 7 changed files with 313 additions and 121 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Changelog

## 0.22.1 - 2023-10-24

#### Bug fixes
- Fixed an issue where `DBNStore.to_csv` and `DBNStore.to_json` were mapping symbols even when `map_symbols` was set to `False`
- Fixed an issue where empty symbology mappings caused a `ValueError` when loading symbols into the `DBNStore` instrument map

## 0.22.0 - 2023-10-23

#### Enhancements
Expand Down
37 changes: 11 additions & 26 deletions databento/common/dbnstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
from databento.common.error import BentoError
from databento.common.iterator import chunk
from databento.common.symbology import InstrumentMap
from databento.common.symbology import SymbolInterval
from databento.common.validation import validate_enum
from databento.common.validation import validate_file_write_path
from databento.common.validation import validate_maybe_enum
Expand Down Expand Up @@ -812,23 +811,13 @@ def to_csv(
raise ValueError("a schema must be specified for mixed DBN data")
schema = self.schema

record_type = SCHEMA_STRUCT_MAP[schema]
record_iter = filter(lambda r: isinstance(r, record_type), self)

if map_symbols:
self._instrument_map.insert_metadata(self.metadata)
symbol_map = self._instrument_map._data
else:
symbol_map = None

with open(path, "xb") as output:
self._transcode(
output=output,
records_iter=record_iter,
encoding=Encoding.CSV,
pretty_px=pretty_px,
pretty_ts=pretty_ts,
symbol_map=symbol_map,
map_symbols=map_symbols,
compression=compression,
schema=schema,
)
Expand Down Expand Up @@ -1025,23 +1014,13 @@ def to_json(
raise ValueError("a schema must be specified for mixed DBN data")
schema = self.schema

record_type = SCHEMA_STRUCT_MAP[schema]
record_iter = filter(lambda r: isinstance(r, record_type), self)

if map_symbols:
self._instrument_map.insert_metadata(self.metadata)
symbol_map = self._instrument_map._data
else:
symbol_map = None

with open(path, "xb") as output:
self._transcode(
output=output,
records_iter=record_iter,
encoding=Encoding.JSON,
pretty_px=pretty_px,
pretty_ts=pretty_ts,
symbol_map=symbol_map,
map_symbols=map_symbols,
compression=compression,
schema=schema,
)
Expand Down Expand Up @@ -1114,27 +1093,33 @@ def to_ndarray(
def _transcode(
self,
output: BinaryIO,
records_iter: Iterator[DBNRecord],
encoding: Encoding,
pretty_px: bool,
pretty_ts: bool,
symbol_map: dict[int, list[SymbolInterval]] | None,
map_symbols: bool,
compression: Compression,
schema: Schema,
) -> None:
if map_symbols:
self._instrument_map.insert_metadata(self.metadata)
symbol_map = self._instrument_map._data
else:
symbol_map = None

transcoder = Transcoder(
file=output,
encoding=encoding,
compression=compression,
pretty_px=pretty_px,
pretty_ts=pretty_ts,
map_symbols=map_symbols,
has_metadata=True,
symbol_map=symbol_map, # type: ignore [arg-type]
schema=schema,
)

transcoder.write(bytes(self.metadata))
for records in chunk(records_iter, 2**16):
for records in chunk(self, 2**16):
for record in records:
transcoder.write(bytes(record))
transcoder.flush()
Expand Down
138 changes: 70 additions & 68 deletions databento/common/symbology.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,23 +19,23 @@
ALL_SYMBOLS = "ALL_SYMBOLS"


class SymbolInterval(NamedTuple):
class MappingInterval(NamedTuple):
"""
Interval inside which a symbol is defined.
Attributes
----------
start: dt.date
start_date: dt.date
The start time of the interval.
end: dt.date
end_date: dt.date
The end time of the interval (exclusive).
symbol: str
The string symbol.
"""

start: dt.date
end: dt.date
start_date: dt.date
end_date: dt.date
symbol: str


Expand All @@ -60,7 +60,7 @@ class InstrumentMap:
)

def __init__(self) -> None:
self._data: dict[int, list[SymbolInterval]] = defaultdict(list)
self._data: dict[int, list[MappingInterval]] = defaultdict(list)

def clear(self) -> None:
"""
Expand Down Expand Up @@ -96,7 +96,7 @@ def resolve(
"""
mappings = self._data[instrument_id]
for entry in mappings:
if entry.start <= date < entry.end:
if entry.start_date <= date < entry.end_date:
return entry.symbol
return None

Expand All @@ -119,17 +119,14 @@ def insert_metadata(self, metadata: Metadata) -> None:
# Nothing to do
return

if SType(metadata.stype_in) == SType.INSTRUMENT_ID:
inverse = True
elif SType(metadata.stype_out) == SType.INSTRUMENT_ID:
inverse = False
else:
raise ValueError(
"either `stype_out` or `stype_in` must be `instrument_id` to insert",
)
stype_in = SType(metadata.stype_in)
stype_out = SType(metadata.stype_out)

for in_symbol, entries in metadata.mappings.items():
for symbol_in, entries in metadata.mappings.items():
for entry in entries:
if not entry["symbol"]:
continue # skip empty symbol mapping

try:
start_date = pd.Timestamp(entry["start_date"], tz="utc").date()
end_date = pd.Timestamp(entry["end_date"], tz="utc").date()
Expand All @@ -138,28 +135,18 @@ def insert_metadata(self, metadata: Metadata) -> None:
f"failed to parse date range from start_date={entry['start_date']} end_date={entry['end_date']}",
)

if inverse:
try:
instrument_id = int(in_symbol)
except TypeError:
raise ValueError(
f"failed to parse `{in_symbol}` as an instrument_id",
)
symbol = entry["symbol"]
else:
try:
instrument_id = int(entry["symbol"])
except TypeError:
raise ValueError(
f"failed to parse `{entry['symbol']}` as an instrument_id",
)
symbol = in_symbol
symbol, instrument_id = _resolve_mapping_tuple(
symbol_in=symbol_in,
stype_in=stype_in,
symbol_out=entry["symbol"],
stype_out=stype_out,
)

self._insert_inverval(
instrument_id,
SymbolInterval(
start=start_date,
end=end_date,
MappingInterval(
start_date=start_date,
end_date=end_date,
symbol=symbol,
),
)
Expand Down Expand Up @@ -201,9 +188,9 @@ def insert_symbol_mapping_msg(

self._insert_inverval(
msg.hd.instrument_id,
SymbolInterval(
start=pd.Timestamp(start_ts, unit="ns", tz="utc").date(),
end=pd.Timestamp(end_ts, unit="ns", tz="utc").date(),
MappingInterval(
start_date=pd.Timestamp(start_ts, unit="ns", tz="utc").date(),
end_date=pd.Timestamp(end_ts, unit="ns", tz="utc").date(),
symbol=symbol,
),
)
Expand Down Expand Up @@ -243,25 +230,22 @@ def insert_json(
if not all(k in mapping for k in self.SYMBOLOGY_RESOLVE_KEYS):
raise ValueError("mapping must contain a complete symbology.resolve result")

if SType(mapping["stype_in"]) == SType.INSTRUMENT_ID:
inverse = True
elif SType(mapping["stype_out"]) == SType.INSTRUMENT_ID:
inverse = False
else:
raise ValueError(
"either `stype_out` or `stype_in` must be `instrument_id` to insert",
)

if not isinstance(mapping["result"], dict):
raise ValueError("`result` is not a valid symbology mapping")

for in_symbol, entries in mapping["result"].items():
stype_in = SType(mapping["stype_in"])
stype_out = SType(mapping["stype_out"])

for symbol_in, entries in mapping["result"].items():
for entry in entries:
if not all(k in entry for k in self.SYMBOLOGY_RESULT_KEYS):
raise ValueError(
"`result` contents must contain `d0`, `d1`, and `s` keys",
)

if not entry["s"]:
continue # skip empty symbol mapping

try:
start_date = pd.Timestamp(entry["d0"], tz="utc").date()
end_date = pd.Timestamp(entry["d1"], tz="utc").date()
Expand All @@ -270,33 +254,23 @@ def insert_json(
f"failed to parse date range from d0={entry['d0']} d1={entry['d1']}",
)

if inverse:
try:
instrument_id = int(in_symbol)
except TypeError:
raise ValueError(
f"failed to parse `{in_symbol}` as an instrument_id",
)
symbol = entry["s"]
else:
try:
instrument_id = int(entry["s"])
except TypeError:
raise ValueError(
f"failed to parse `{entry['s']}` as an instrument_id",
)
symbol = in_symbol
symbol, instrument_id = _resolve_mapping_tuple(
symbol_in=symbol_in,
stype_in=stype_in,
symbol_out=entry["s"],
stype_out=stype_out,
)

self._insert_inverval(
instrument_id,
SymbolInterval(
start=start_date,
end=end_date,
MappingInterval(
start_date=start_date,
end_date=end_date,
symbol=symbol,
),
)

def _insert_inverval(self, instrument_id: int, interval: SymbolInterval) -> None:
def _insert_inverval(self, instrument_id: int, interval: MappingInterval) -> None:
"""
    Insert a MappingInterval into the map.
Expand All @@ -314,3 +288,31 @@ def _insert_inverval(self, instrument_id: int, interval: SymbolInterval) -> None
return # this mapping is already present

mappings.insert(insert_position, interval)


def _resolve_mapping_tuple(
    symbol_in: str | int,
    stype_in: SType,
    symbol_out: str | int,
    stype_out: SType,
) -> tuple[str, int]:
    """
    Normalize one symbology mapping entry into a `(symbol, instrument_id)`
    pair, regardless of mapping direction.

    Exactly one side of the mapping must be an instrument ID: if
    `stype_in` is `SType.INSTRUMENT_ID`, the input symbol is the ID and
    the output symbol is the human-readable symbol; if `stype_out` is
    `SType.INSTRUMENT_ID`, the roles are reversed.

    Parameters
    ----------
    symbol_in : str | int
        The input-side symbol of the mapping entry.
    stype_in : SType
        The symbology type of the input side.
    symbol_out : str | int
        The output-side symbol of the mapping entry.
    stype_out : SType
        The symbology type of the output side.

    Returns
    -------
    tuple[str, int]
        The string symbol and the parsed integer instrument ID.

    Raises
    ------
    ValueError
        If neither side is `SType.INSTRUMENT_ID`, or if the ID side
        cannot be parsed as an integer.

    """
    # Decide which side carries the instrument ID and which the symbol.
    if stype_in == SType.INSTRUMENT_ID:
        raw_id, symbol = symbol_in, symbol_out
    elif stype_out == SType.INSTRUMENT_ID:
        raw_id, symbol = symbol_out, symbol_in
    else:
        raise ValueError(
            "either `stype_out` or `stype_in` must be `instrument_id` to insert",
        )

    try:
        instrument_id = int(raw_id)
    except (TypeError, ValueError):
        raise ValueError(
            f"failed to parse `{raw_id}` as an instrument_id",
        )

    return str(symbol), instrument_id
2 changes: 1 addition & 1 deletion databento/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.22.0"
__version__ = "0.22.1"
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "databento"
version = "0.22.0"
version = "0.22.1"
description = "Official Python client library for Databento"
authors = [
"Databento <[email protected]>",
Expand Down
Loading

0 comments on commit f4e2a13

Please sign in to comment.