feat: allow calling `as_folded_tensor` with no extra args, as a simple padding function

aphp · Oct 12, 2023 · eecb141 · eecb141
1 parent b240a3e
commit eecb141
Show file tree

Hide file tree

Showing 4 changed files with 102 additions and 98 deletions.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
diff --git a/changelog.md b/changelog.md
@@ -1,3 +1,7 @@
+# v0.3.2
+
+- Allow calling `as_folded_tensor` with no arguments other than the data, as a simple padding function
+
 # v0.3.1
 
 - Enable sharing FoldedTensor instances in a multiprocessing + cuda context by autocloning the indexer before fork-pickling an instance

diff --git a/foldedtensor/__init__.py b/foldedtensor/__init__.py
@@ -5,7 +5,7 @@
 import torch
 from torch.autograd import Function
 
-numpy_to_torch_dtype_dict = {
+np_to_torch_dtype = {
     torch.bool: bool,
     torch.uint8: np.uint8,
     torch.int8: np.int8,
@@ -89,10 +89,32 @@ def backward(ctx, grad_output):
         # )
 
 
+# Default torch dtype for each Python scalar type, used to pick a dtype from
+# the inferred leaf type of nested data when the caller does not provide one.
+# NOTE(review): the `None` key looks like the fallback for data whose leaf
+# type could not be inferred — confirm that the lookup key produced for such
+# data actually matches it.
+type_to_dtype_dict = {
+    int: torch.int64,
+    float: torch.float64,
+    bool: torch.bool,
+    None: torch.float64,
+}
+
+
+def get_metadata(nested_data):
+    """
+    Infer the nesting depth and the leaf type of nested lists/tuples.
+
+    The structure is walked depth-first and the walk stops at the first
+    element that is neither a list nor a tuple (the first "leaf").
+
+    Parameters
+    ----------
+    nested_data: Any
+        Arbitrarily nested lists/tuples of values
+
+    Returns
+    -------
+    Tuple[int, Optional[type]]
+        The number of list/tuple levels enclosing the first leaf, and the
+        Python type of that leaf. When the structure contains no leaf at all
+        (e.g. `[]` or `[[]]`), `(0, None)` is returned so that the caller can
+        look up its `None` fallback entry.
+    """
+
+    def first_leaf(node, depth):
+        # A non list/tuple value is a leaf: report where it was found
+        if not isinstance(node, (list, tuple)):
+            return depth, type(node)
+        # Otherwise, search the children in order and stop at the first hit,
+        # skipping over empty sub-sequences
+        for child in node:
+            found = first_leaf(child, depth + 1)
+            if found is not None:
+                return found
+        return None
+
+    found = first_leaf(nested_data, 0)
+    # No leaf anywhere: keep the historical depth of 0 and report no type
+    return found if found is not None else (0, None)
+
+
 def as_folded_tensor(
     data: Sequence,
-    data_dims: Sequence[Union[int, str]],
-    full_names: Sequence[str],
+    data_dims: Optional[Sequence[Union[int, str]]] = None,
+    full_names: Optional[Sequence[str]] = None,
     dtype: Optional[torch.dtype] = None,
     lengths: Optional[List[List[int]]] = None,
     device: Optional[Union[str, torch.device]] = None,
@@ -117,14 +139,18 @@ def as_folded_tensor(
     device: Optional[Union[str, torch.device]]
         The device of the output tensor
     """
-    data_dims = tuple(
-        dim if isinstance(dim, int) else full_names.index(dim) for dim in data_dims
-    )
-    if (data_dims[-1] + 1) != len(full_names):
-        raise ValueError(
-            "The last dimension of `data_dims` must be the last variable dimension."
+    if data_dims is not None:
+        data_dims = tuple(
+            dim if isinstance(dim, int) else full_names.index(dim) for dim in data_dims
         )
-    elif isinstance(data, torch.Tensor) and lengths is not None:
+        if (data_dims[-1] + 1) != len(full_names):
+            raise ValueError(
+                "The last dimension of `data_dims` must be the last variable dimension."
+            )
+    elif full_names is not None:
+        data_dims = tuple(range(len(full_names)))
+    if isinstance(data, torch.Tensor) and lengths is not None:
+        data_dims = data_dims or tuple(range(len(lengths)))
         np_indexer, shape = _C.make_refolding_indexer(lengths, data_dims)
         assert shape == list(data.shape[: len(data_dims)])
         result = FoldedTensor(
@@ -135,9 +161,15 @@ def as_folded_tensor(
             indexer=torch.from_numpy(np_indexer).to(data.device),
         )
     elif isinstance(data, Sequence):
+        # if dtype is None:
+        #     raise ValueError("dtype must be provided when `data` is a sequence")
+        if data_dims is None or dtype is None:
+            deepness, inferred_dtype = get_metadata(data)
+        if data_dims is None:
+            data_dims = tuple(range(deepness))
         if dtype is None:
-            raise ValueError("dtype must be provided when `data` is a sequence")
-        dtype = numpy_to_torch_dtype_dict.get(dtype, dtype)
+            dtype = type_to_dtype_dict.get(inferred_dtype)
+        dtype = np_to_torch_dtype.get(dtype, dtype)
         padded, indexer, lengths = _C.nested_py_list_to_padded_array(
             data,
             data_dims,
@@ -235,13 +267,17 @@ def as_tensor(self):
     def to(self, *args, **kwargs):
         with torch._C.DisableTorchFunction():
             result = super().to(*args, **kwargs)
+            copy = kwargs.get("copy", False)
+            non_blocking = kwargs.get("non_blocking", False)
             return FoldedTensor(
                 data=result,
                 lengths=self.lengths,
                 data_dims=self.data_dims,
                 full_names=self.full_names,
-                indexer=self.indexer.to(result.device, copy=kwargs.get("copy", False)),
-                mask=self._mask.to(result.device, copy=kwargs.get("copy", False))
+                indexer=self.indexer.to(
+                    result.device, copy=copy, non_blocking=non_blocking
+                ),
+                mask=self._mask.to(result.device, copy=copy, non_blocking=non_blocking)
                 if self._mask is not None
                 else None,
             )

diff --git a/tests/test_folded_tensor.py b/tests/test_folded_tensor.py
@@ -133,19 +133,6 @@ def test_as_folded_tensor_error():
 
     assert "as_folded_tensor expects:" in str(excinfo.value)
 
-    with pytest.raises(ValueError) as excinfo:
-        as_folded_tensor(
-            [
-                [[1], [], [], [], [2, 3]],
-                [[4, 3]],
-            ],
-            data_dims=("samples", "lines", "words"),
-            full_names=("samples", "lines", "words"),
-            lengths=[[1, 2, 3]],
-        )
-
-    assert "dtype must be provided" in str(excinfo.value)
-
 
 @pytest.fixture
 def ft():
@@ -312,6 +299,54 @@ def test_pad_embedding():
     ).all()
 
 
+def test_empty_args():
+    """Calling `as_folded_tensor` with only the data pads it and infers int64."""
+    nested = [
+        [0, 1, 2],
+        [3, 4],
+    ]
+    expected_data = torch.tensor(
+        [
+            [0, 1, 2],
+            [3, 4, 0],
+        ]
+    )
+    expected_mask = torch.tensor(
+        [
+            [1, 1, 1],
+            [1, 1, 0],
+        ]
+    ).bool()
+
+    folded = as_folded_tensor(nested)
+
+    assert (folded.data == expected_data).all()
+    assert folded.data.dtype == torch.int64
+    assert (folded.mask == expected_mask).all()
+
+
+def test_no_data_dims():
+    """Passing `full_names` and `dtype` without `data_dims` still pads the data."""
+    nested = [
+        [0, 1, 2],
+        [3, 4],
+    ]
+    expected = torch.tensor(
+        [
+            [0, 1, 2],
+            [3, 4, 0],
+        ]
+    )
+
+    folded = as_folded_tensor(nested, full_names=("token",), dtype=torch.long)
+
+    assert (folded.data == expected).all()
+
+
 def test_as_tensor(ft):
     tensor = ft.as_tensor()
     assert type(tensor) == torch.Tensor