
Commit 70176d5

New demo for just loading single N-frame clips per video
1 parent d2b1040 commit 70176d5

3 files changed: 93 additions & 42 deletions


README.md

Lines changed: 15 additions & 8 deletions
@@ -54,9 +54,10 @@ for image in frames:
 - [1. Requirements](#1-requirements)
 - [2. Custom Dataset](#2-custom-dataset)
 - [3. Video Frame Sampling Method](#3-video-frame-sampling-method)
-- [4. Using VideoFrameDataset for Training](#4-using-videoframedataset-for-training)
-- [5. Conclusion](#5-conclusion)
-- [6. Acknowledgements](#6-acknowledgements)
+- [4. Alternate Video Frame Sampling Methods](#4-alternate-video-frame-sampling-methods)
+- [5. Using VideoFrameDataset for Training](#5-using-videoframedataset-for-training)
+- [6. Conclusion](#6-conclusion)
+- [7. Acknowledgements](#7-acknowledgements)

 ### 1. Requirements
 ```
@@ -118,20 +119,26 @@ When loading a video, only a number of its frames are loaded. They are chosen in
 1. The frame indices [1,N] are divided into NUM_SEGMENTS even segments. From each segment, FRAMES_PER_SEGMENT consecutive indices are chosen at random.
 This results in NUM_SEGMENTS*FRAMES_PER_SEGMENT chosen indices, whose frames are loaded as PIL images, put into a list, and returned when calling
 `dataset[i]`.
+
+### 4. Alternate Video Frame Sampling Methods
+If you do not want to use sparse temporal sampling and instead want to sample a single N-frame continuous
+clip from a video, this is possible. Set `NUM_SEGMENTS=1` and `FRAMES_PER_SEGMENT=N`. Because VideoFrameDataset
+chooses a random start index per segment and takes `FRAMES_PER_SEGMENT` continuous frames from each sampled start
+index, this results in a single N-frame continuous clip per video. An example of this is in `demo.py`.

-### 4. Using VideoFrameDataset for training
+### 5. Using VideoFrameDataset for Training
 As demonstrated in `demo.py`, we can use PyTorch's `torch.utils.data.DataLoader` class with VideoFrameDataset to take care of shuffling, batching, and more.
-To turn the lists of PIL images returned by VideoFrameDataset into tensors, the transform `video_dataset.imglist_totensor()` can be supplied
+To turn the lists of PIL images returned by VideoFrameDataset into tensors, the transform `video_dataset.ImglistToTensor()` can be supplied
 as the `transform` parameter to VideoFrameDataset. This turns a list of N PIL images into a batch of images/frames of shape `N x CHANNELS x HEIGHT x WIDTH`.
-We can further chain preprocessing and augmentation functions that act on batches of images onto the end of `imglist_totensor()`.
+We can further chain preprocessing and augmentation functions that act on batches of images onto the end of `ImglistToTensor()`.

 As of `torchvision 0.8.0`, all torchvision transforms can operate on batches of images, and they apply deterministic or random transformations
 identically on all images of the batch. Therefore, any torchvision transform can be used here to apply video-uniform preprocessing and augmentation.

-### 5. Conclusion
+### 6. Conclusion
 A proper code-based explanation of how to use VideoFrameDataset for training is provided in `demo.py`.

-### 6. Acknowledgements
+### 7. Acknowledgements
 We thank the authors of TSN for their [codebase](https://github.com/yjxiong/tsn-pytorch), from which we took VideoFrameDataset and adapted it.
 ```
 @InProceedings{wang2016_TemporalSegmentNetworks,
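
The new README section describes the sampling scheme only in prose. As a rough illustration, the logic reduces to the sketch below. This is a hypothetical reimplementation for clarity, not code from `video_dataset.py`; the function name `sample_frame_indices` and its exact arithmetic are assumptions.

```python
import random

def sample_frame_indices(num_frames, num_segments, frames_per_segment):
    """Hypothetical sketch of sparse temporal sampling (not the repo's exact code)."""
    # Split the usable index range into num_segments even segments, then take
    # frames_per_segment consecutive indices from a random start in each segment.
    segment_length = (num_frames - frames_per_segment + 1) // num_segments
    indices = []
    for segment in range(num_segments):
        start = segment * segment_length + random.randint(0, max(segment_length - 1, 0))
        indices.extend(range(start, start + frames_per_segment))
    return indices

# Sparse sampling: 3 segments x 1 frame each from a 30-frame video, e.g. [4, 13, 27]
print(sample_frame_indices(num_frames=30, num_segments=3, frames_per_segment=1))
# Section 4's alternate mode: NUM_SEGMENTS=1, FRAMES_PER_SEGMENT=9 yields one
# continuous 9-frame clip starting at a random index, e.g. [12, 13, ..., 20]
print(sample_frame_indices(num_frames=30, num_segments=1, frames_per_segment=9))
```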

demo.py

Lines changed: 61 additions & 19 deletions
@@ -1,18 +1,43 @@
-from video_dataset import VideoFrameDataset, imglist_totensor
+from video_dataset import VideoFrameDataset, ImglistToTensor
 from torchvision import transforms
 import torch
 import matplotlib.pyplot as plt
+from mpl_toolkits.axes_grid1 import ImageGrid
 import os

+"""
+Ignore this function and look at "main" below.
+"""
+def plot_video(rows, cols, frame_list, plot_width, plot_height):
+    fig = plt.figure(figsize=(plot_width, plot_height))
+    grid = ImageGrid(fig, 111,  # similar to subplot(111)
+                     nrows_ncols=(rows, cols),  # creates a rows x cols grid of Axes
+                     axes_pad=0.3,  # pad between Axes, in inches
+                     )
+
+    for index, (ax, im) in enumerate(zip(grid, frame_list)):
+        # Iterating over the grid returns the Axes.
+        ax.imshow(im)
+        ax.set_title(index)
+    plt.show()

 if __name__ == '__main__':
     """
     This demo uses the dummy dataset inside the folder "demo_dataset".
     It is structured just like a real dataset would need to be structured.
+
+    TABLE OF CODE CONTENTS:
+    1. Minimal demo without image transforms
+    2. Minimal demo without sparse temporal sampling, for single continuous frame clips, without image transforms
+    3. Demo with image transforms
+    4. Demo 3 continued with PyTorch dataloader
+
     """
     videos_root = os.path.join(os.getcwd(), 'demo_dataset')
     annotation_file = os.path.join(videos_root, 'annotations.txt')

+
     """ DEMO 1 WITHOUT IMAGE TRANSFORMS """
     dataset = VideoFrameDataset(
         root_path=videos_root,
@@ -29,20 +54,42 @@
     frames = sample[0]  # list of PIL images
     label = sample[1]  # integer label

-    for image in frames:
-        plt.imshow(image)
-        plt.title(label)
-        plt.show()
-        plt.pause(1)
+    plot_video(rows=1, cols=5, frame_list=frames, plot_width=15., plot_height=3.)
+
+
+    """ DEMO 2 SINGLE CONTINUOUS FRAME CLIP INSTEAD OF SAMPLED FRAMES, WITHOUT TRANSFORMS """
+    # If you do not want to use sparse temporal sampling and instead
+    # want to load N consecutive frames starting from a random
+    # start index, this is easy. Simply set NUM_SEGMENTS=1 and
+    # FRAMES_PER_SEGMENT=N. Each time a sample is loaded, N
+    # frames will be loaded from a new random start index.
+    dataset = VideoFrameDataset(
+        root_path=videos_root,
+        annotationfile_path=annotation_file,
+        num_segments=1,
+        frames_per_segment=9,
+        imagefile_template='img_{:05d}.jpg',
+        transform=None,
+        random_shift=True,
+        test_mode=False
+    )
+
+    sample = dataset[3]
+    frames = sample[0]  # list of PIL images
+    label = sample[1]  # integer label
+
+    plot_video(rows=3, cols=3, frame_list=frames, plot_width=10., plot_height=5.)


-    """ DEMO 2 WITH TRANSFORMS """
+    """ DEMO 3 WITH TRANSFORMS """
     # As of torchvision 0.8.0, torchvision transforms support batches of images
     # of size (BATCH x CHANNELS x HEIGHT x WIDTH) and apply deterministic or random
     # transformations identically on all images of the batch. Any torchvision
     # transform for image augmentation can thus also be used for video augmentation.
     preprocess = transforms.Compose([
-        transforms.Lambda(imglist_totensor),  # list of PIL images to (FRAMES x CHANNELS x HEIGHT x WIDTH) tensor
+        ImglistToTensor(),  # list of PIL images to (FRAMES x CHANNELS x HEIGHT x WIDTH) tensor
         transforms.Resize(299),  # image batch, resize smaller edge to 299
         transforms.CenterCrop(299),  # image batch, center crop to square 299x299
         transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
@@ -60,12 +107,10 @@
     )

     sample = dataset[2]
-    # tensor of shape (NUM_SEGMENTS*FRAMES_PER_SEGMENT) x CHANNELS x HEIGHT x WIDTH
-    frame_tensor = sample[0]
-    print('Video Tensor Size:', frame_tensor.size())
-    # integer label
-    label = sample[1]
+    frame_tensor = sample[0]  # tensor of shape (NUM_SEGMENTS*FRAMES_PER_SEGMENT) x CHANNELS x HEIGHT x WIDTH
+    label = sample[1]  # integer label

+    print('Video Tensor Size:', frame_tensor.size())

     def denormalize(video_tensor):
         """
@@ -82,14 +127,11 @@ def denormalize(video_tensor):


     frame_tensor = denormalize(frame_tensor)
-    for image in frame_tensor:
-        plt.imshow(image)
-        plt.title(label)
-        plt.show()
-        plt.pause(1)
+    plot_video(rows=1, cols=5, frame_list=frame_tensor.permute(0, 2, 3, 1), plot_width=15., plot_height=3.)  # permute CHW -> HWC for imshow


-    """ DEMO 2 CONTINUED: DATALOADER """
+    """ DEMO 3 CONTINUED: DATALOADER """
     dataloader = torch.utils.data.DataLoader(
         dataset=dataset,
         batch_size=2,
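
The diff view cuts the DataLoader demo off after `batch_size=2,`. For orientation, consuming the loader looks roughly like the sketch below; the `shuffle` and `num_workers` arguments and the loop body are illustrative assumptions, not lines from this commit.

```python
# Sketch only: argument values below batch_size are assumed, not from this commit.
dataloader = torch.utils.data.DataLoader(
    dataset=dataset,
    batch_size=2,
    shuffle=True,    # assumed
    num_workers=0,   # assumed
)

for video_batch, labels in dataloader:
    # video_batch: BATCH x (NUM_SEGMENTS*FRAMES_PER_SEGMENT) x CHANNELS x HEIGHT x WIDTH
    print(video_batch.size(), labels.size())
    break
```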

video_dataset.py

Lines changed: 17 additions & 15 deletions
@@ -43,7 +43,7 @@ class VideoFrameDataset(torch.utils.data.Dataset):
     loads x RGB frames of a video (sparse temporal sampling) and evenly
     chooses those frames from start to end of the video, returning
     a list of x PIL images or ``FRAMES x CHANNELS x HEIGHT x WIDTH``
-    tensors where FRAMES=x if the ``imglist_totensor()``
+    tensors where FRAMES=x if the ``ImglistToTensor()``
     transform is used.

     More specifically, the frame range [0,N] is divided into NUM_SEGMENTS
@@ -235,19 +235,21 @@ def _get(self, record, indices):
     def __len__(self):
         return len(self.video_list)

-def imglist_totensor(img_list):
+class ImglistToTensor(torch.nn.Module):
     """
-    Converts each PIL image in a list to
-    a torch Tensor and stacks them into
-    a single tensor. Can be used as first transform
-    for ``VideoFrameDataset``.
-    To use this with torchvision.transforms.Compose, wrap this
-    function in a torchvision lambda like
-    this ``torchvision.transforms.Lambda(imglist_totensor)``.
-
-    Args:
-        img_list: list of PIL images.
-    Returns:
-        tensor of size ``NUM_IMAGES x CHANNELS x HEIGHT x WIDTH``
+    Converts a list of PIL images in the range [0,255] to a torch.FloatTensor
+    of shape (NUM_IMAGES x CHANNELS x HEIGHT x WIDTH) in the range [0,1].
+    Can be used as first transform for ``VideoFrameDataset``.
     """
-    return torch.stack([transforms.functional.to_tensor(pic) for pic in img_list])
+    def forward(self, img_list):
+        """
+        Converts each PIL image in a list to
+        a torch Tensor and stacks them into
+        a single tensor.
+
+        Args:
+            img_list: list of PIL images.
+        Returns:
+            tensor of size ``NUM_IMAGES x CHANNELS x HEIGHT x WIDTH``
+        """
+        return torch.stack([transforms.functional.to_tensor(pic) for pic in img_list])
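
Turning `imglist_totensor` into the `ImglistToTensor` module means it can be dropped straight into `transforms.Compose` without a `transforms.Lambda` wrapper. A minimal usage sketch follows; the dummy frames and the crop size are made up for illustration.

```python
from PIL import Image
from torchvision import transforms
from video_dataset import ImglistToTensor

frames = [Image.new('RGB', (256, 256)) for _ in range(5)]  # dummy 5-frame clip
preprocess = transforms.Compose([
    ImglistToTensor(),           # list of PIL images -> FloatTensor in [0,1]
    transforms.CenterCrop(224),  # applied identically to every frame
])
print(preprocess(frames).shape)  # torch.Size([5, 3, 224, 224])
```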
