diff --git a/torchvision/datasets/video_utils.py b/torchvision/datasets/video_utils.py
index 33a63cb22f8..e1e76877073 100644
--- a/torchvision/datasets/video_utils.py
+++ b/torchvision/datasets/video_utils.py
@@ -135,8 +135,8 @@ def __init__(
         self.compute_clips(clip_length_in_frames, frames_between_clips, frame_rate)
 
     def _compute_frame_pts(self) -> None:
-        self.video_pts = []
-        self.video_fps: List[int] = []
+        self.video_pts = []  # len = num_videos. Each entry is a tensor of shape (num_frames_in_video,)
+        self.video_fps: List[int] = []  # len = num_videos
 
         # strategy: use a DataLoader to parallelize read_video_timestamps
         # so need to create a dummy dataset first
@@ -152,13 +152,13 @@ def _compute_frame_pts(self) -> None:
         with tqdm(total=len(dl)) as pbar:
             for batch in dl:
                 pbar.update(1)
-                clips, fps = list(zip(*batch))
+                batch_pts, batch_fps = list(zip(*batch))
                 # we need to specify dtype=torch.long because for empty list,
                 # torch.as_tensor will use torch.float as default dtype. This
                 # happens when decoding fails and no pts is returned in the list.
-                clips = [torch.as_tensor(c, dtype=torch.long) for c in clips]
-                self.video_pts.extend(clips)
-                self.video_fps.extend(fps)
+                batch_pts = [torch.as_tensor(pts, dtype=torch.long) for pts in batch_pts]
+                self.video_pts.extend(batch_pts)
+                self.video_fps.extend(batch_fps)
 
     def _init_from_metadata(self, metadata: Dict[str, Any]) -> None:
         self.video_paths = metadata["video_paths"]
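
Not part of the patch: a minimal standalone sketch of the unzip-and-convert pattern the rename clarifies, assuming each DataLoader batch item is a (pts_list, fps) pair as returned by read_video_timestamps; the sample values below are made up for illustration.

import torch

# Simulated batch: one entry per video in the batch (illustrative values only).
batch = [
    ([0, 512, 1024], 30),  # a video that decoded normally
    ([], None),            # decoding failed: empty pts list, no fps
]

# Unzip the batch into per-video pts lists and per-video fps values.
batch_pts, batch_fps = list(zip(*batch))

# dtype=torch.long must be explicit: torch.as_tensor infers float32 for an empty list.
assert torch.as_tensor([]).dtype == torch.float32
assert torch.as_tensor([], dtype=torch.long).dtype == torch.int64

batch_pts = [torch.as_tensor(pts, dtype=torch.long) for pts in batch_pts]
print([t.dtype for t in batch_pts])  # [torch.int64, torch.int64]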