[REVIEW] Implement Feature Request from #1077 on Left Padding #1126
Changes from 29 commits
TensorFlow dataloader (`KerasSequenceLoader`):
@@ -140,7 +140,7 @@ class KerasSequenceLoader(tf.keras.utils.Sequence, DataLoader):
     Iterator output is of the form `(dict(features), list(labels))`,
     where each element of the features dict is a
-    `feature_name: feature_tensor` and each elemtn of the labels
+    `feature_name: feature_tensor` and each element of the labels
     list is a tensor, and all tensors are of shape `(batch_size, 1)`.
     Note that this means vectorized continuous and multi-hot categorical
     features are not currently supported.
@@ -153,7 +153,7 @@ class KerasSequenceLoader(tf.keras.utils.Sequence, DataLoader):
     workflow.update_stats(dataset.data.to_iter(), record_stats=True)

     Parameters
-    -------------
+    ----------
     - paths_or_dataset: str or list(str)
         Either a string representing a file pattern (see `tf.glob` for
         pattern rules), a list of filenames to be iterated through, or
@@ -205,6 +205,10 @@ class KerasSequenceLoader(tf.keras.utils.Sequence, DataLoader):
         dictionary of key: column_name + value: integer representing max sequence length for column
     sparse_dense : bool
         bool value to activate transforming sparse tensors to dense
+    pad_left : bool
+        Boolean value to indicate whether to pad on the left. Use True to pad on the left,
+        False to pad on the right. Default: False
+
     """

     _use_nnz = True
@@ -230,6 +234,7 @@ def __init__(
         sparse_names=None,
         sparse_max=None,
         sparse_as_dense=False,
+        pad_left=False,
     ):
         dataset = _validate_dataset(
             paths_or_dataset, batch_size, buffer_size, engine, reader_kwargs
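For context, a hedged sketch of how the new flag would be passed once this change lands; the file pattern, batch size, and column names below are made up, and the import path assumes the usual `nvtabular.loader.tensorflow` module:

```python
from nvtabular.loader.tensorflow import KerasSequenceLoader

# Illustrative only: paths and column names are hypothetical.
train_loader = KerasSequenceLoader(
    "./train/*.parquet",               # file pattern or list of files
    batch_size=65536,
    label_names=["click"],
    cat_names=["item_id", "category"],
    cont_names=["price"],
    sparse_names=["item_id"],          # list-like columns emitted as sparse tensors
    sparse_max={"item_id": 20},        # max sequence length per sparse column
    pad_left=True,                     # new flag: pad variable-length features on the left
)
```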
@@ -238,7 +243,7 @@ def __init__(
             feature_columns, cat_names, cont_names, schema=dataset.schema
         )

-        # sort the ccolumns to avoid getting incorrect output
+        # Sort the columns to avoid getting incorrect output.
         # (https://github.com/NVIDIA/NVTabular/issues/412)
         cat_names = _get_embedding_order(cat_names)
         cont_names = _get_embedding_order(cont_names)
@@ -261,23 +266,23 @@ def __init__(
             sparse_names=sparse_names,
             sparse_max=sparse_max,
             sparse_as_dense=sparse_as_dense,
+            pad_left=pad_left,
         )
         self._map_fns = []

     def __len__(self):
-        """
-        recreating since otherwise Keras yells at you
-        """
+        """Recreating since otherwise Keras yells at you."""
         # TODO: what's a better way to do this inheritance
         # of the appropriate methods? A Metaclass?
         DataLoader.stop(self)
         return DataLoader.__len__(self)

     def __getitem__(self, idx):
         """
-        implemented exclusively for consistency
+        Implemented exclusively for consistency
         with Keras model.fit. Does not leverage
-        passed idx in any way
+        passed idx in any way.

         """
         return DataLoader.__next__(self)
@@ -286,6 +291,7 @@ def map(self, fn):
         Applying a function to each batch.

+        This can for instance be used to add `sample_weight` to the model.

         """
         self._map_fns.append(fn)
@@ -416,8 +422,30 @@ def _get_sparse_tensor(self, values, indices, num_rows, seq_limit):
         return sparse_tensor

     def _build_sparse_tensor(self, values, offsets, diff_offsets, num_rows, seq_limit):
+        """Builds sparse tensors in the TensorFlow dataloader.
+
+        Parameters
+        ----------
+        values :
+        offsets :
+        diff_offsets :
+        num_rows :
+        seq_limit :
+
+        Returns
+        -------
+        tf.sparse
+            Our built TensorFlow sparse tensor.
+
+        """
         ragged = tf.RaggedTensor.from_row_lengths(values=values, row_lengths=diff_offsets)
-        tensor = tf.RaggedTensor.from_tensor(ragged.to_tensor(shape=[None, seq_limit])).to_sparse()
+        if self.pad_left:
+            max_len = max(max(len(row) for row in ragged), seq_limit)
+            tensor = tf.stack([tf.pad(row, [[max_len - len(row), 0]]) for row in ragged], axis=0)
+        else:
+            tensor = ragged.to_tensor(shape=[None, seq_limit])
+
+        tensor = tf.RaggedTensor.from_tensor(tensor).to_sparse()
         if self.sparse_as_dense:
             tensor = tf.sparse.to_dense(tensor)
         return tensor

Review thread on the `tensor = tf.RaggedTensor.from_tensor(tensor).to_sparse()` line:

- @jperez999 Are you able to see this?
- I see it now... Can you time this compared to the example I gave... would like to see the timing on this execution.
- This is good to hear. Yes, one moment please. Is there a particular timing framework or method that you use, for consistency of the timings?
- For this current code block, using

  =========================== test session starts ===========================
  platform linux -- Python 3.8.10, pytest-6.2.5, py-1.10.0, pluggy-1.0.0 -- /usr/bin/python
  cachedir: .pytest_cache
  rootdir: /nvtabular, configfile: pyproject.toml
  collected 77 items / 75 deselected / 2 selected
  tests/unit/loader/test_tf_dataloader.py::test_sparse_tensor_left_padding[False] PASSED [ 50%]
  ============================ slowest durations ============================

- Do you have a commit with the
- Yeah, I just meant grabbing that same mini tensor I create in my example (called digits) and running it through your scenario; the idea is to get a comparison to see which is the fastest method, to ensure that is what we select.
- This is the code I reported the ~.002 seconds execution time
- I see, here are the timings I got.
- Yes, I see the change... Just need to update the torch version now...
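The thread above asks for timings of the per-row `tf.pad` loop against a loop-free alternative. For illustration, here is a minimal standalone sketch of one loop-free way to left-pad a ragged batch, assuming small made-up `values`/`diff_offsets` inputs and that `tf.reverse` dispatches on `RaggedTensor` (it does in recent TensorFlow releases). This is a sketch of the idea discussed in the review, not necessarily the code adopted in later commits:

```python
import tensorflow as tf

# Made-up stand-ins for the dataloader's values / diff_offsets.
values = tf.constant([1, 2, 3, 4, 5, 6], dtype=tf.int64)
diff_offsets = tf.constant([2, 1, 3], dtype=tf.int64)  # per-row lengths
seq_limit = 4

ragged = tf.RaggedTensor.from_row_lengths(values=values, row_lengths=diff_offsets)

# Loop-free left padding: reverse each row, right-pad with to_tensor, reverse back.
left_padded = tf.reverse(
    tf.reverse(ragged, axis=[1]).to_tensor(shape=[None, seq_limit]),
    axis=[1],
)
sparse = tf.RaggedTensor.from_tensor(left_padded).to_sparse()

print(left_padded.numpy())
# [[0 0 1 2]
#  [0 0 0 3]
#  [0 4 5 6]]
```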
PyTorch dataloader (`TorchAsyncItr`):
@@ -42,7 +42,7 @@ class TorchAsyncItr(torch.utils.data.IterableDataset, DataLoader):
     batches are the specified size until the final batch.

     Parameters
-    -----------
+    ----------
     dataset : NVTabular dataset
     cats : [str]
         the list of categorical columns in the dataset
@@ -64,6 +64,10 @@ class TorchAsyncItr(torch.utils.data.IterableDataset, DataLoader):
         dictionary of key: column_name + value: integer representing max sequence length for column
     sparse_dense : bool
         bool value to activate transforming sparse tensors to dense
+    pad_left : bool
+        Boolean value to indicate whether to pad on the left. Use True to pad on the left,
+        False to pad on the right. Default: False
+
     """

     def __init__(
@@ -83,6 +87,7 @@ def __init__(
         sparse_names=None,
         sparse_max=None,
         sparse_as_dense=False,
+        pad_left=False,
     ):
         DataLoader.__init__(
             self,
@@ -101,6 +106,7 @@ def __init__(
             sparse_names=sparse_names,
             sparse_max=sparse_max,
             sparse_as_dense=sparse_as_dense,
+            pad_left=pad_left,
         )

     def __iter__(self):
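As with the TensorFlow loader, a hedged sketch of how the flag would be passed on the PyTorch side; the dataset path and column names are hypothetical, and the import path assumes the usual `nvtabular.loader.torch` module:

```python
import nvtabular as nvt
from nvtabular.loader.torch import TorchAsyncItr

# Illustrative only: the dataset location and column names are made up.
dataset = nvt.Dataset("./train/", engine="parquet")
train_loader = TorchAsyncItr(
    dataset,
    batch_size=65536,
    cats=["item_id", "category"],
    conts=["price"],
    labels=["click"],
    sparse_names=["item_id"],
    sparse_max={"item_id": 20},
    pad_left=True,  # new flag: pad variable-length sparse features on the left
)
```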
@@ -174,8 +180,48 @@ def _get_sparse_tensor(self, values, indices, num_rows, seq_limit):
             sparse_tensor = sparse_tensor.to_dense()
         return sparse_tensor

-    def _build_sparse_tensor(self, values, offsets, diff_offsets, num_rows, seq_limit):
+    def _build_sparse_tensor_helper_process_column(self, col: torch.Tensor) -> torch.Tensor:
+        """Process column by increasing blocks for use in left padding."""
+        col = col.tolist()
+        prev, curr = 0, 0
+        while curr < len(col):
+            if col[curr] >= col[curr - 1]:
+                col[prev:curr] = col[prev:curr][::-1]
+                prev = curr
+            if curr == (len(col) - 1):
+                col[prev : curr + 1] = col[prev : curr + 1][::-1]
+            curr += 1
+        return torch.Tensor(col)
+
+    def _build_sparse_tensor(
+        self,
+        values,
+        offsets,
+        diff_offsets,
+        num_rows,
+        seq_limit,
+    ):
+        """Builds sparse tensors in our torch dataloader.
+
+        Parameters
+        ----------
+        values :
+        offsets :
+        diff_offsets :
+        num_rows :
+        seq_limit :
+
+        Returns
+        -------
+        torch.sparse
+            Our built torch sparse tensor.
+
+        """
         indices = self._get_indices(offsets, diff_offsets)
+        if self.pad_left:
+            indices[:, 1] = self._build_sparse_tensor_helper_process_column(
+                (seq_limit - 1) - indices[:, 1]
+            )
         return self._get_sparse_tensor(values, indices, num_rows, seq_limit)

Review thread on the `while` loop:

- This is the while loop I am talking about, @lesnikow.
- This is good to know, thank you. I have not implemented anything optimized over this yet. I would like to hear your feedback first on the

Review thread on the `if self.pad_left:` branch:

- Again, this iteration logic is not the best methodology for covering this: https://stackoverflow.com/questions/48686945/reshaping-a-tensor-with-padding-in-pytorch something like this would be more efficient. torch.nn.functional has a padding function you could use to your advantage.
- Likewise, this is good to know. I will look into how to do this outlined approach for the Torch implementation here.
- I have the same questions for your second comment as I wrote in reply to your first comment above. Would you have any guidance on this here?
- I read over this Stack Overflow question along with all the replies. These approaches will not directly work for this torch implementation. The main obstacle in these approaches is that methods like
- I still see the while loop here for the torch side; is that accurate?
- This has not been updated yet, since I would like to hear your feedback first on the
- Do you mean the
- Please see line 187.
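The reviewer's suggestion in the threads above is to compute all the pad shifts with array math rather than a per-row `while` loop. A minimal sketch of that idea, assuming `col_indices` holds the within-row positions that `_get_indices` produces and `row_lengths` plays the role of `diff_offsets`; the helper name and inputs are illustrative, not the PR's final code:

```python
import torch

def left_shift_column_indices(col_indices, row_lengths, seq_limit):
    """Shift within-row positions so each row is right-aligned (left-padded).

    col_indices: 1-D LongTensor of positions 0..len(row)-1, concatenated row by row.
    row_lengths: 1-D LongTensor with one entry per row (the role diff_offsets plays).
    """
    # Every element of row i moves right by (seq_limit - len(row_i)).
    shift = (seq_limit - row_lengths).repeat_interleave(row_lengths)
    return col_indices + shift

# Tiny example: rows of lengths 2, 1, 3 left-padded to seq_limit=4.
row_lengths = torch.tensor([2, 1, 3])
col_indices = torch.tensor([0, 1, 0, 0, 1, 2])
print(left_shift_column_indices(col_indices, row_lengths, 4))
# tensor([2, 3, 3, 1, 2, 3])
```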
Review thread on the left-padding implementation:

- Is iterating through each row the only way to do this? You can actually logically figure out all the padding amounts by doing array math, and then you can pass that entire list of "pad_length" entries at once; that way it's not doing each row, one at a time.
- This is good to know. Let me see how to implement this approach you outline here.
- I am having a hard time seeing currently a vectorized or further optimized approach to this code snippet. Figuring out the padding amounts by row is not that difficult, but what to do with that tensor of row lengths is very much unclear to me. For instance there is `tf.pad()`, but this only takes padding tensors of shape `[n, 2]`, where `n` is the rank of the original tensor, so that we may only do a constant amount of padding per dimension. For `torch` there is `torch.nn.functional.pad()`, which also only does a constant amount of left or right padding per dimension. There is `tf.ragged`'s `to_tensor()` method, which implicitly does padding or truncation according to a given shape, but this only does padding on the right. So it is not that clear how to do a further optimized approach using this variable padding tensor. Did you have some other `tensorflow` or `torch` methods in mind, or some other vectorized approach using the variable padding lengths that I am missing? I have tried some others, but to no success yet. In particular, composing `to_tensor()` with `tf.tensor.pad()` with the `pad` method using left padding will not work. I can modify the run-length encoding of the ragged tensor here, but I would guess that this would also not be a vectorized operation.
- @jperez999 How does this current updated implementation of this TensorFlow code snippet, without iterating through the rows of the ragged tensor, address your feedback?
- The for loops you are doing are the problem here, I think... That methodology is extremely slow. You have to go into each row and read the data, as opposed to doing the entire column at once. I think this gives you the padding behavior you want, and it's a tad bit faster: I am clocking this logic in at 0.005383729934692383 and I am clocking your logic at 0.026534557342529297.
- @jperez999 Did you see the updated TensorFlow implementation that I have here that does not use `for` loops? It is at commit 01749f9.
- That commit you linked there references conda environment files... no code changes... As the PR stands, that for-loop logic still exists for both torch and tensorflow.
- That commit is a final merge of main into this branch. The changes are in the predecessors of this commit. They should be viewable here in the web UI or by doing a checkout of that commit. The `for` loop implementation for TensorFlow is an outdated change. I also tagged you with a comment on the code section below. Are you able to see these changes now?
- In particular, in my web UI, the `tensorflow` code snippet you are referencing has a yellow "Outdated" box next to it.
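The ~0.0054 s vs ~0.0265 s numbers quoted above come from the reviewer's own harness, which is not reproduced in this thread. For reference, a rough sketch of how such a comparison could be run, using made-up input sizes and plain `time.perf_counter`; absolute numbers will of course differ by machine and input shape:

```python
import time
import tensorflow as tf

def avg_seconds(fn, n=50):
    """Average wall-clock seconds per call; a stand-in for the harness used in the thread."""
    start = time.perf_counter()
    for _ in range(n):
        fn()
    return (time.perf_counter() - start) / n

# Made-up ragged input: 300 rows with lengths cycling through 2, 1, 3.
row_lengths = tf.constant([2, 1, 3] * 100, dtype=tf.int64)
values = tf.range(tf.reduce_sum(row_lengths), dtype=tf.int64)
ragged = tf.RaggedTensor.from_row_lengths(values=values, row_lengths=row_lengths)
seq_limit = 4

def loop_pad():
    # Per-row Python loop, as in the earlier commit of this PR.
    max_len = max(max(len(row) for row in ragged), seq_limit)
    return tf.stack([tf.pad(row, [[max_len - len(row), 0]]) for row in ragged], axis=0)

def reverse_pad():
    # Loop-free alternative sketched earlier in this review.
    return tf.reverse(tf.reverse(ragged, axis=[1]).to_tensor(shape=[None, seq_limit]), axis=[1])

print("loop-based:", avg_seconds(loop_pad), "s/call")
print("loop-free :", avg_seconds(reverse_pad), "s/call")
```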