Skip to content

Commit 253b7ec

Browse files
authored
feat(python): Add map type constructor (#687)
When working on fixing a problem with Arrow C++'s map type import, I realized there was no way to create map types or any of the canonical extensions. Extensions are slightly different and I'll tackle them later, but map types are relatively straightforward and follow the existing pattern of all the other constructors:

```python
import nanoarrow as na

na.map_(na.string(), na.int32())
#> <Schema> map<entries: struct<key: string, value: int32>>
```
1 parent 116cdad commit 253b7ec

File tree

4 files changed

+107
-11
lines changed

4 files changed

+107
-11
lines changed

python/src/nanoarrow/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252
list_,
5353
large_list,
5454
fixed_size_list,
55+
map_,
5556
dictionary,
5657
binary,
5758
large_binary,
@@ -115,6 +116,7 @@
115116
"large_string",
116117
"large_list",
117118
"list_",
119+
"map_",
118120
"null",
119121
"nulls_as_sentinel",
120122
"nulls_forbid",

python/src/nanoarrow/_schema.pyx

+8
Original file line numberDiff line numberDiff line change
@@ -878,6 +878,14 @@ cdef class CSchemaBuilder:
878878

879879
return self
880880

881+
def set_map_keys_sorted(self, map_keys_sorted) -> CSchemaBuilder:
    """Set or clear the ``ARROW_FLAG_MAP_KEYS_SORTED`` bit of the schema flags.

    A truthy ``map_keys_sorted`` sets the bit; a falsy value clears it.
    All other flag bits are left untouched. Returns ``self`` so that
    builder calls can be chained.
    """
    # Start from a known state (bit cleared), then set it only if requested
    self._ptr.flags &= ~ARROW_FLAG_MAP_KEYS_SORTED
    if map_keys_sorted:
        self._ptr.flags |= ARROW_FLAG_MAP_KEYS_SORTED

    return self
888+
881889
def validate(self) -> CSchemaView:
882890
return CSchemaView(self.c_schema)
883891

python/src/nanoarrow/schema.py

+87-11
Original file line numberDiff line numberDiff line change
@@ -401,12 +401,14 @@ def dictionary_ordered(self) -> Union[bool, None]:
401401
return self._c_schema_view.dictionary_ordered
402402

403403
@property
404-
def value_type(self):
405-
"""Dictionary or list value type
404+
def value_type(self) -> Union["Schema", None]:
405+
"""Dictionary, map, or list value type
406406
407407
>>> import nanoarrow as na
408408
>>> na.list_(na.int32()).value_type
409409
<Schema> 'item': int32
410+
>>> na.map_(na.int32(), na.string()).value_type
411+
<Schema> 'value': string
410412
>>> na.dictionary(na.int32(), na.string()).value_type
411413
<Schema> string
412414
"""
@@ -416,11 +418,33 @@ def value_type(self):
416418
_types.FIXED_SIZE_LIST,
417419
):
418420
return self.field(0)
421+
elif self._c_schema_view.type_id == _types.MAP:
422+
return Schema(self._c_schema.child(0).child(1))
419423
elif self._c_schema_view.type_id == _types.DICTIONARY:
420424
return Schema(self._c_schema.dictionary)
421425
else:
422426
return None
423427

428+
@property
def key_type(self) -> Union["Schema", None]:
    """Map key type, or ``None`` if this schema is not a map

    >>> import nanoarrow as na
    >>> na.map_(na.int32(), na.string()).key_type
    <Schema> 'key': non-nullable int32
    """
    if self._c_schema_view.type_id != _types.MAP:
        return None

    # A map has a single "entries" child; the key is that child's first child
    entries = self._c_schema.child(0)
    return Schema(entries.child(0))
440+
441+
@property
def keys_sorted(self) -> Union[bool, None]:
    """Map keys-sorted flag, or ``None`` if this schema is not a map

    >>> import nanoarrow as na
    >>> na.map_(na.int32(), na.string()).keys_sorted
    False
    >>> na.map_(na.int32(), na.string(), keys_sorted=True).keys_sorted
    True
    """
    is_map = self._c_schema_view.type_id == _types.MAP
    return self._c_schema_view.map_keys_sorted if is_map else None
447+
424448
@property
425449
def list_size(self) -> Union[int, None]:
426450
"""Fixed-size list element size
@@ -979,7 +1003,7 @@ def timestamp(
9791003
return Schema(Type.TIMESTAMP, timezone=timezone, unit=unit, nullable=nullable)
9801004

9811005

982-
def duration(unit, nullable: bool = True):
1006+
def duration(unit, nullable: bool = True) -> Schema:
9831007
"""Create an instance of a duration type.
9841008
9851009
Parameters
@@ -999,7 +1023,7 @@ def duration(unit, nullable: bool = True):
9991023
return Schema(Type.DURATION, unit=unit, nullable=nullable)
10001024

10011025

1002-
def interval_months(nullable: bool = True):
1026+
def interval_months(nullable: bool = True) -> Schema:
10031027
"""Create an instance of an interval type measured in months.
10041028
10051029
Parameters
@@ -1017,7 +1041,7 @@ def interval_months(nullable: bool = True):
10171041
return Schema(Type.INTERVAL_MONTHS, nullable=nullable)
10181042

10191043

1020-
def interval_day_time(nullable: bool = True):
1044+
def interval_day_time(nullable: bool = True) -> Schema:
10211045
"""Create an instance of an interval type measured as a day/time pair.
10221046
10231047
Parameters
@@ -1035,7 +1059,7 @@ def interval_day_time(nullable: bool = True):
10351059
return Schema(Type.INTERVAL_DAY_TIME, nullable=nullable)
10361060

10371061

1038-
def interval_month_day_nano(nullable: bool = True):
1062+
def interval_month_day_nano(nullable: bool = True) -> Schema:
10391063
"""Create an instance of an interval type measured as a month/day/nanosecond
10401064
tuple.
10411065
@@ -1100,7 +1124,7 @@ def decimal256(precision: int, scale: int, nullable: bool = True) -> Schema:
11001124
return Schema(Type.DECIMAL256, precision=precision, scale=scale, nullable=nullable)
11011125

11021126

1103-
def struct(fields, nullable=True) -> Schema:
1127+
def struct(fields, nullable: bool = True) -> Schema:
11041128
"""Create a type representing a named sequence of fields.
11051129
11061130
Parameters
@@ -1124,7 +1148,7 @@ def struct(fields, nullable=True) -> Schema:
11241148
return Schema(Type.STRUCT, fields=fields, nullable=nullable)
11251149

11261150

1127-
def list_(value_type, nullable=True) -> Schema:
1151+
def list_(value_type, nullable: bool = True) -> Schema:
11281152
"""Create a type representing a variable-size list of some other type.
11291153
11301154
Parameters
@@ -1144,7 +1168,7 @@ def list_(value_type, nullable=True) -> Schema:
11441168
return Schema(Type.LIST, value_type=value_type, nullable=nullable)
11451169

11461170

1147-
def large_list(value_type, nullable=True) -> Schema:
1171+
def large_list(value_type, nullable: bool = True) -> Schema:
11481172
"""Create a type representing a variable-size list of some other type.
11491173
11501174
Unlike :func:`list_`, the :func:`large_list` can accommodate arrays
@@ -1167,7 +1191,7 @@ def large_list(value_type, nullable=True) -> Schema:
11671191
return Schema(Type.LARGE_LIST, value_type=value_type, nullable=nullable)
11681192

11691193

1170-
def fixed_size_list(value_type, list_size, nullable=True) -> Schema:
1194+
def fixed_size_list(value_type, list_size: int, nullable: bool = True) -> Schema:
11711195
"""Create a type representing a fixed-size list of some other type.
11721196
11731197
Parameters
@@ -1194,7 +1218,40 @@ def fixed_size_list(value_type, list_size, nullable=True) -> Schema:
11941218
)
11951219

11961220

1197-
def dictionary(index_type, value_type, dictionary_ordered=False):
1221+
def map_(
    key_type, value_type, keys_sorted: bool = False, nullable: bool = True
) -> Schema:
    """Create a type representing a list of key/value mappings

    Note that each element in the list contains potentially many
    key/value pairs (and that a map array contains potentially
    many individual mappings).

    Parameters
    ----------
    key_type : schema-like
        The type of keys in each map element.
    value_type : schema-like
        The type of values in each map element.
    keys_sorted : bool, optional
        True if keys within each map element are sorted.
    nullable : bool, optional
        Use ``False`` to mark this field as non-nullable.

    Examples
    --------
    >>> import nanoarrow as na
    >>> na.map_(na.int32(), na.string())
    <Schema> map<entries: struct<key: int32, value: string>>
    """
    return Schema(
        Type.MAP,
        key_type=key_type,
        value_type=value_type,
        keys_sorted=keys_sorted,
        nullable=nullable,
    )
1252+
1253+
1254+
def dictionary(index_type, value_type, dictionary_ordered: bool = False) -> Schema:
11981255
"""Create a type representing dictionary-encoded values
11991256
12001257
Parameters
@@ -1290,6 +1347,25 @@ def _c_schema_from_type_and_params(type: Type, params: dict):
12901347
factory.allocate_children(1)
12911348
factory.set_child(0, "item", c_schema(params.pop("value_type")))
12921349

1350+
elif type == Type.MAP:
1351+
key_schema = c_schema(params.pop("key_type"))
1352+
value_schema = c_schema(params.pop("value_type"))
1353+
1354+
entries = CSchemaBuilder.allocate()
1355+
entries.set_format("+s")
1356+
entries.set_nullable(False)
1357+
entries.allocate_children(2)
1358+
entries.set_child(0, "key", key_schema.modify(nullable=False))
1359+
entries.set_child(1, "value", value_schema)
1360+
1361+
factory.set_format("+m")
1362+
factory.allocate_children(1)
1363+
factory.set_child(0, "entries", entries.finish())
1364+
factory.set_nullable(False)
1365+
1366+
if "keys_sorted" in params:
1367+
factory.set_map_keys_sorted(params.pop("keys_sorted"))
1368+
12931369
elif type == Type.DICTIONARY:
12941370
index_type = c_schema(params.pop("index_type"))
12951371
factory.set_format(index_type.format)

python/tests/test_schema.py

+10
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,16 @@ def test_schema_fixed_size_list():
198198
assert schema_obj.list_size == 123
199199

200200

201+
def test_schema_map():
    # Default construction: keys are not marked as sorted
    unsorted_map = na.map_(na.int32(), na.string())
    assert unsorted_map.type == na.Type.MAP
    assert unsorted_map.key_type.type == na.Type.INT32
    assert unsorted_map.value_type.type == na.Type.STRING
    assert unsorted_map.keys_sorted is False

    # Explicitly requesting sorted keys is reflected by the property
    sorted_map = na.map_(na.int32(), na.string(), keys_sorted=True)
    assert sorted_map.keys_sorted is True
209+
210+
201211
def test_schema_dictionary():
202212
schema_obj = na.dictionary(na.int8(), na.null())
203213
assert schema_obj.type == na.Type.DICTIONARY

0 commit comments

Comments
 (0)