[REVIEW] Implement Feature Request from #1077 on Left Padding #1126

Closed
wants to merge 36 commits
Changes from 1 commit
Commits (36)
a52a90c
Update docstrings for issue #1077
Sep 14, 2021
ed61663
Merge branch 'main' into 1077-implement
Sep 16, 2021
ff1e396
Implementation of left padding for issue #1077
Sep 16, 2021
d9e457d
Update #1077 implementation
Sep 16, 2021
e25c6e8
Implement #1077 update with docstring and type hinting.
Sep 16, 2021
295d4e2
Merge branch 'main' into 1077-implement
lesnikow Sep 16, 2021
5166d57
Merge branch 'main' into 1077-implement
lesnikow Sep 17, 2021
e55336c
Merge branch 'main' into 1077-implement
lesnikow Sep 20, 2021
af8aa57
Merge branch 'main' of github.com:NVIDIA/NVTabular into 1077-implement
Sep 23, 2021
299d356
Update tensorflow module docstring for docs syntax
Sep 23, 2021
1285783
Expose pad_left to user
Sep 24, 2021
364bcf1
Skip test_distributed_multigpu()
Sep 24, 2021
071b8bf
Add unit test for torch dataloader and padding argument
Sep 24, 2021
3cce162
Update torch test for padding argument
Sep 24, 2021
cebb715
Update unit test for padding argument
Sep 25, 2021
5acd76a
Update dataloader torch to pass new tests
Sep 25, 2021
1684289
Clean up loader/torch module
Sep 25, 2021
a319501
Clean up test_torch_dataloader module
Sep 25, 2021
0be389e
Update tests
Sep 27, 2021
d93f9c5
Add tests for the TensorFlow runtime dataloader
Sep 28, 2021
0c0ce69
Implement pad_left in _build_sparse_tensor TF
Sep 28, 2021
941d2f3
Update torch loader documentation
Sep 28, 2021
7944b2a
Merge branch 'main' of 1077-implement
Sep 28, 2021
76c0024
Cleanup _build_sparese_tensor for TF dataloader
Sep 28, 2021
46847cb
Add docstring to _build_sparse_tensor() for tf
Sep 28, 2021
c7ae873
Update docstring
Sep 28, 2021
d86cec3
Refactor torch dataloader pad_left and _build_spar
Sep 28, 2021
d90e1df
Update pytest decorator
Sep 28, 2021
b21c57d
Cleanup torch loader
Sep 28, 2021
2150ede
Implement pad_left with TF ops
Sep 29, 2021
a51aa44
Implement pad_left with TF ops cleanup
Sep 29, 2021
01749f9
Merge branch 'main' into 1077-implement
lesnikow Sep 29, 2021
b305afa
Update tensorflow dataloader implementation
Sep 30, 2021
2febf1a
Merge branch '1077-implement' of https://github.com/NVIDIA/NVTabular …
Sep 30, 2021
587ef0c
Update pad_left TF unit tests
Sep 30, 2021
dd9927e
Update pad_left code for TF sparse tensors
Sep 30, 2021
Update docstrings for issue #1077
Update docstrings for issue #1077. This touches the tensorflow
and torch dataloader modules and the list_slice op module. The
motivation for this is to improve readability. This commit is
towards resolving issue #1077 on implementing left padding
for sparse sequential features.
Adam Lesnikowski committed Sep 14, 2021

commit a52a90cf03f742d9618cbb94ccd69b7fb98928f9
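
For context on the feature these commits build, "left padding" means that when variable-length (sparse/sequential) rows are packed into a fixed-width dense tensor, the fill values go in front of each row instead of after it. A minimal sketch of the idea in plain NumPy, with hypothetical names not taken from this diff:

    import numpy as np

    def pad_sequences(rows, max_len, pad_left=False):
        # Pad variable-length rows to max_len with zeros. With
        # pad_left=True the zeros go in front, so each row ends at the
        # right edge of the dense tensor.
        out = np.zeros((len(rows), max_len), dtype=np.int64)
        for i, row in enumerate(rows):
            row = row[-max_len:]  # keep at most the last max_len items
            if pad_left:
                out[i, max_len - len(row):] = row
            else:
                out[i, :len(row)] = row
        return out

    rows = [[1, 2, 3], [4, 5]]
    print(pad_sequences(rows, 4))                 # [[1 2 3 0] [4 5 0 0]]
    print(pad_sequences(rows, 4, pad_left=True))  # [[0 1 2 3] [0 0 4 5]]
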
19 changes: 10 additions & 9 deletions nvtabular/loader/tensorflow.py
@@ -140,20 +140,20 @@ class KerasSequenceLoader(tf.keras.utils.Sequence, DataLoader):

Iterator output is of the form `(dict(features), list(labels))`,
where each element of the features dict is a
`feature_name: feature_tensor` and each elemtn of the labels
`feature_name: feature_tensor` and each element of the labels
list is a tensor, and all tensors are of shape `(batch_size, 1)`.
Note that this means vectorized continuous and multi-hot categorical
features are not currently supported.
The underlying NVTabular `Dataset` object is stored in the `data`
attribute, and should be used for updating NVTabular `Workflow`
statistics::
statistics:

workflow = nvt.Workflow(...)
dataset = KerasSequenceLoader(...)
workflow.update_stats(dataset.data.to_iter(), record_stats=True)

Parameters
-------------
----------
- paths_or_dataset: str or list(str)
Either a string representing a file pattern (see `tf.glob` for
pattern rules), a list of filenames to be iterated through, or
@@ -205,6 +205,7 @@ class KerasSequenceLoader(tf.keras.utils.Sequence, DataLoader):
dictionary of key: column_name + value: integer representing max sequence length for column
sparse_dense : bool
bool value to activate transforming sparse tensors to dense

"""

_use_nnz = True
@@ -238,7 +239,7 @@ def __init__(
feature_columns, cat_names, cont_names, schema=dataset.schema
)

# sort the ccolumns to avoid getting incorrect output
# Sort the columns to avoid getting incorrect output.
# (https://github.com/NVIDIA/NVTabular/issues/412)
cat_names = _get_embedding_order(cat_names)
cont_names = _get_embedding_order(cont_names)
@@ -265,19 +266,18 @@ def __init__(
self._map_fns = []

def __len__(self):
"""
recreating since otherwise Keras yells at you
"""
"""Recreating since otherwise Keras yells at you."""
# TODO: what's a better way to do this inheritance
# of the appropriate methods? A Metaclass?
DataLoader.stop(self)
return DataLoader.__len__(self)

def __getitem__(self, idx):
"""
implemented exclusively for consistency
Implemented exclusively for consistency
with Keras model.fit. Does not leverage
passed idx in any way
passed idx in any way.

"""
return DataLoader.__next__(self)

@@ -286,6 +286,7 @@ def map(self, fn):
Applying a function to each batch.

This can for instance be used to add `sample_weight` to the model.

"""
self._map_fns.append(fn)

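To make the docstring above concrete, here is a hedged usage sketch of the loader it describes, expanding the workflow example shown in the excerpt. The keyword arguments and file paths are illustrative assumptions, not taken from this diff; consult the full parameter list for authoritative names and defaults.

    import nvtabular as nvt
    from nvtabular.loader.tensorflow import KerasSequenceLoader

    # paths_or_dataset may be a glob pattern, a list of files, or a Dataset.
    train_loader = KerasSequenceLoader(
        "train/*.parquet",               # hypothetical path
        batch_size=65536,
        label_names=["label"],
        cat_names=["item_id", "user_id"],
        cont_names=["price"],
    )

    # Iterator output is (dict(features), list(labels)), with tensors of
    # shape (batch_size, 1), as the docstring above describes.
    features, labels = next(iter(train_loader))
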
2 changes: 1 addition & 1 deletion nvtabular/loader/torch.py
@@ -42,7 +42,7 @@ class TorchAsyncItr(torch.utils.data.IterableDataset, DataLoader):
batches are the specified size until the final batch.

Parameters
-----------
----------
dataset : NVTabular dataset
cats : [str]
the list of categorical columns in the dataset
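
For orientation, a hedged construction sketch for the class documented above. Only `dataset` and `cats` appear in this excerpt; the remaining keyword arguments are assumptions about the rest of the signature, included purely for illustration.

    import nvtabular as nvt
    from nvtabular.loader.torch import TorchAsyncItr

    dataset = nvt.Dataset("train/*.parquet")   # hypothetical input data
    train_itr = TorchAsyncItr(
        dataset,
        cats=["item_id", "user_id"],   # categorical columns, per the docstring
        conts=["price"],               # assumed continuous-column argument
        labels=["label"],              # assumed label-column argument
        batch_size=65536,
    )

    # Batches are the specified size until the final batch.
    first_batch = next(iter(train_itr))
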
10 changes: 5 additions & 5 deletions nvtabular/ops/list_slice.py
@@ -61,7 +61,7 @@ def transform(self, col_selector: ColumnSelector, df: DataFrameType) -> DataFram
on_cpu = _is_cpu_object(df)
ret = type(df)()
for col in col_selector.names:
# handle CPU via normal python slicing (not very efficient)
# Handle CPU via normal python slicing (not very efficient).
if on_cpu:
ret[col] = [row[self.start : self.end] for row in df[col]]
else:
@@ -99,8 +99,8 @@ def output_tags(self):

@numba.cuda.jit
def _calculate_row_sizes(start, end, offsets, row_sizes):
"""given a slice (start/end) and existing offsets indicating row lengths, this
calculates the size for each new row after slicing"""
"""Given a slice (start/end) and existing offsets indicating row lengths, this
calculates the size for each new row after slicing."""
rowid = numba.cuda.grid(1)
if rowid < offsets.size - 1:
original_row_size = offsets[rowid + 1] - offsets[rowid]
@@ -120,9 +120,9 @@ def _calculate_row_sizes(start, end, offsets, row_sizes):

@numba.cuda.jit
def _slice_rows(start, offsets, elements, new_offsets, new_elements):
"""slices rows of a list column. requires the 'new_offsets' to
"""Slices rows of a list column. requires the 'new_offsets' to
be previously calculated (meaning that we don't need the 'end' slice index
since thats baked into the new_offsets"""
since thats baked into the new_offsets."""
rowid = numba.cuda.grid(1)
if rowid < (new_offsets.size - 1):
if start >= 0:
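
To illustrate what the two kernels above do, here is a hedged pure-Python equivalent of the offsets-based slicing they describe: first compute the new row sizes from the start/end slice and the existing offsets (the role of _calculate_row_sizes), then copy the surviving elements (the role of _slice_rows). The helper name below is hypothetical, not part of the module.

    def slice_list_column(offsets, elements, start, end):
        # offsets delimits each row inside the flat elements array:
        # row i spans elements[offsets[i]:offsets[i + 1]].
        # Step 1: new row sizes after slicing (cf. _calculate_row_sizes).
        new_offsets = [0]
        for i in range(len(offsets) - 1):
            row = elements[offsets[i]:offsets[i + 1]]
            new_offsets.append(new_offsets[-1] + len(row[start:end]))

        # Step 2: copy the elements that survive the slice (cf. _slice_rows).
        new_elements = []
        for i in range(len(offsets) - 1):
            row = elements[offsets[i]:offsets[i + 1]]
            new_elements.extend(row[start:end])

        return new_offsets, new_elements

    # Rows [1, 2, 3] and [4, 5], sliced with [0:2]:
    print(slice_list_column([0, 3, 5], [1, 2, 3, 4, 5], 0, 2))
    # -> ([0, 2, 4], [1, 2, 4, 5])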