Skip to content

Commit

Permalink
add flow module
Browse files Browse the repository at this point in the history
  • Loading branch information
turingyizhu committed Jun 28, 2017
1 parent 6794bd5 commit 5ca01a1
Show file tree
Hide file tree
Showing 8 changed files with 14,012 additions and 75 deletions.
177 changes: 137 additions & 40 deletions datasets/ucf101.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,31 +6,20 @@
import numpy as np
import cv2

# Filename suffixes accepted as frame images. Both lower- and upper-case
# variants are listed because the endswith() match in is_image_file is
# case-sensitive.
IMG_EXTENSIONS = [
    '.jpg', '.JPG', '.jpeg', '.JPEG',
    '.png', '.PNG', '.ppm', '.PPM', '.bmp', '.BMP',
]

def is_image_file(filename):
    """Return True if `filename` ends with a known image extension.

    Matching is case-sensitive; IMG_EXTENSIONS lists both cases explicitly.
    """
    # str.endswith accepts a tuple of suffixes, replacing the per-extension
    # Python-level loop with a single C-level call.
    return filename.endswith(tuple(IMG_EXTENSIONS))

def find_classes(dir):
    """List the class subdirectories of `dir` and map each name to an index.

    Every immediate subdirectory of `dir` is treated as one class. Names are
    sorted alphabetically so the name -> index assignment is stable across
    runs and machines.

    Returns:
        (classes, class_to_idx): the sorted list of class names and a dict
        mapping each class name to its integer label.
    """
    classes = sorted(
        d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d))
    )
    # enumerate() is the idiomatic replacement for indexing with range(len()).
    class_to_idx = {name: idx for idx, name in enumerate(classes)}
    return classes, class_to_idx

def cv2_loader(path):
    """Read the image at `path` with OpenCV.

    Returns the image as a BGR ndarray, or None when the file cannot be
    read — cv2.imread signals failure by returning None, not by raising.
    """
    return cv2.imread(path)

def read_split_file(root, split_file):
def make_dataset(root, source):

if not os.path.exists(split_file):
print("Split file for ucf101 dataset doesn't exist.")
if not os.path.exists(source):
print("Setting file %s for ucf101 dataset doesn't exist." % (source))
sys.exit()
else:
clips = []
with open(split_file) as split_f:
with open(source) as split_f:
data = split_f.readlines()
for line in data:
line_info = line.split()
Expand All @@ -40,53 +29,160 @@ def read_split_file(root, split_file):
item = (clip_path, duration, target)
clips.append(item)
return clips


def ReadSegmentRGB(path, offsets, new_height, new_width, new_length, is_color, name_pattern):
    """Load `new_length` consecutive frames at each sampled offset.

    Args:
        path: directory holding the extracted frames of one video clip.
        offsets: 0-based starting frame offsets, one per segment.
        new_height, new_width: target size; no resize when either is <= 0.
        new_length: number of consecutive frames read per offset.
        is_color: read 3-channel color (converted BGR -> RGB) when True,
            single-channel grayscale when False.
        name_pattern: printf-style pattern mapping a 1-based frame index to a
            filename, e.g. "image_%04d.jpg".

    Returns:
        np.ndarray of shape (H, W, num_offsets * new_length * channels) —
        all frames stacked along the channel axis.

    Exits the process when a frame file cannot be read.
    """
    if is_color:
        cv_read_flag = cv2.IMREAD_COLOR  # > 0
    else:
        cv_read_flag = cv2.IMREAD_GRAYSCALE  # = 0
    interpolation = cv2.INTER_LINEAR

    sampled_list = []
    for offset in offsets:
        for length_id in range(1, new_length + 1):
            frame_name = name_pattern % (length_id + offset)
            frame_path = os.path.join(path, frame_name)
            cv_img_origin = cv2.imread(frame_path, cv_read_flag)
            if cv_img_origin is None:
                print("Could not load file %s" % (frame_path))
                sys.exit()
                # TODO: error handling here
            if new_width > 0 and new_height > 0:
                # BUG FIX: interpolation must be passed by keyword — the third
                # positional argument of cv2.resize is `dst`, not interpolation.
                cv_img = cv2.resize(cv_img_origin, (new_width, new_height),
                                    interpolation=interpolation)
            else:
                cv_img = cv_img_origin
            if is_color:
                # OpenCV decodes to BGR; downstream transforms expect RGB.
                cv_img = cv2.cvtColor(cv_img, cv2.COLOR_BGR2RGB)
            else:
                # BUG FIX: grayscale frames are 2-D; add a channel axis so the
                # concatenate along axis=2 below does not fail.
                cv_img = np.expand_dims(cv_img, 2)
            sampled_list.append(cv_img)
    clip_input = np.concatenate(sampled_list, axis=2)
    return clip_input

def ReadSegmentFlow(path, offsets, new_height, new_width, new_length, is_color, name_pattern):
    """Load `new_length` consecutive optical-flow (x, y) frame pairs per offset.

    Args:
        path: directory holding the extracted flow images of one video clip.
        offsets: 0-based starting frame offsets, one per segment.
        new_height, new_width: target size; no resize when either is <= 0.
        new_length: number of consecutive flow pairs read per offset.
        is_color: imread flag selector; flow images are normally grayscale,
            so this is expected to be False — TODO confirm with callers.
        name_pattern: printf-style pattern taking the axis ("x"/"y") and a
            1-based frame index, e.g. "flow_%s_%04d.jpg".

    Returns:
        np.ndarray of shape (H, W, num_offsets * new_length * 2) with the x
        and y flow channels interleaved along the channel axis.

    Exits the process when a flow file cannot be read.
    """
    if is_color:
        cv_read_flag = cv2.IMREAD_COLOR  # > 0
    else:
        cv_read_flag = cv2.IMREAD_GRAYSCALE  # = 0
    interpolation = cv2.INTER_LINEAR

    sampled_list = []
    for offset in offsets:
        for length_id in range(1, new_length + 1):
            frame_name_x = name_pattern % ("x", length_id + offset)
            frame_path_x = os.path.join(path, frame_name_x)
            cv_img_origin_x = cv2.imread(frame_path_x, cv_read_flag)
            frame_name_y = name_pattern % ("y", length_id + offset)
            frame_path_y = os.path.join(path, frame_name_y)
            cv_img_origin_y = cv2.imread(frame_path_y, cv_read_flag)
            if cv_img_origin_x is None or cv_img_origin_y is None:
                print("Could not load file %s or %s" % (frame_path_x, frame_path_y))
                sys.exit()
                # TODO: error handling here
            if new_width > 0 and new_height > 0:
                # BUG FIX: interpolation must be passed by keyword — the third
                # positional argument of cv2.resize is `dst`, not interpolation.
                cv_img_x = cv2.resize(cv_img_origin_x, (new_width, new_height),
                                      interpolation=interpolation)
                cv_img_y = cv2.resize(cv_img_origin_y, (new_width, new_height),
                                      interpolation=interpolation)
            else:
                cv_img_x = cv_img_origin_x
                cv_img_y = cv_img_origin_y
            # Grayscale images are 2-D; give each a channel axis for concatenate.
            sampled_list.append(np.expand_dims(cv_img_x, 2))
            sampled_list.append(np.expand_dims(cv_img_y, 2))

    clip_input = np.concatenate(sampled_list, axis=2)
    return clip_input


class ucf101(data.Dataset):

def __init__(self, root, split_file, phase, new_length=1, transform=None, target_transform=None,
video_transform=None, loader=cv2_loader):
def __init__(self,
root,
source,
phase,
modality,
name_pattern=None,
is_color=True,
num_segments=1,
new_length=1,
new_width=0,
new_height=0,
transform=None,
target_transform=None,
video_transform=None):

classes, class_to_idx = find_classes(root)
clips = read_split_file(root, split_file)
clips = make_dataset(root, source)

if len(clips) == 0:
raise(RuntimeError("Found 0 video clips in subfolders of: " + root + "\n"
"Check your data directory."))

self.root = root
self.split_file = split_file
self.source = source
self.phase = phase
self.clips = clips
self.modality = modality

self.classes = classes
self.class_to_idx = class_to_idx
self.clips = clips

if name_pattern:
self.name_pattern = name_pattern
else:
if self.modality == "rgb":
self.name_pattern = "image_%04d.jpg"
elif self.modality == "flow":
self.name_pattern = "flow_%s_%04d.jpg"

self.is_color = is_color
self.num_segments = num_segments
self.new_length = new_length
self.new_width = new_width
self.new_height = new_height

self.transform = transform
self.target_transform = target_transform
self.video_transform = video_transform
self.loader = loader

def __getitem__(self, index):
path, duration, target = self.clips[index]
frame_list = os.listdir(path)
frame_list.sort()
if self.phase == "train":
sampled_frameID = random.randint(0, duration-self.new_length)
elif self.phase == "val":
if duration >= self.new_length:
sampled_frameID = int((duration - self.new_length + 1)/2)
average_duration = int(duration / self.num_segments)
offsets = []
for seg_id in range(self.num_segments):
if self.phase == "train":
if average_duration >= self.new_length:
offset = random.randint(0, average_duration - self.new_length)
# No +1 because randint(a,b) return a random integer N such that a <= N <= b.
offsets.append(offset + seg_id * average_duration)
else:
offsets.append(0)
elif self.phase == "val":
if average_duration >= self.new_length:
offsets.append(int((average_duration - self.new_length + 1)/2 + seg_id * average_duration))
else:
offsets.append(0)
else:
sampled_frameID = 0
print("Only phase train and val are supported.")


if self.modality == "rgb":
clip_input = ReadSegmentRGB(path,
offsets,
self.new_height,
self.new_width,
self.new_length,
self.is_color,
self.name_pattern
)
elif self.modality == "flow":
clip_input = ReadSegmentFlow(path,
offsets,
self.new_height,
self.new_width,
self.new_length,
self.is_color,
self.name_pattern
)
else:
print("No such phase. Only train and val are supported.")

sampled_list = []
for frame_id in range(self.new_length):
fname = os.path.join(path, frame_list[sampled_frameID+frame_id])
if is_image_file(fname):
img = self.loader(fname)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
sampled_list.append(img)
clip_input = np.concatenate(sampled_list, axis=2)
print("No such modality %s" % (self.modality))

if self.transform is not None:
clip_input = self.transform(clip_input)
Expand All @@ -97,5 +193,6 @@ def __getitem__(self, index):

return clip_input, target


    def __len__(self):
        """Return the number of video clips listed in the source/split file."""
        return len(self.clips)
4 changes: 3 additions & 1 deletion main.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,10 @@
help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=50, type=int,
# BUG FIX: help texts said "(default: 50)" / "(default: 8)" while the actual
# defaults are 32 and 4 — keep the user-facing text in sync with the code.
parser.add_argument('-b', '--batch-size', default=32, type=int,
                    metavar='N', help='mini-batch size (default: 32)')
parser.add_argument('--iter-size', default=4, type=int,
                    metavar='I', help='iter size as in Caffe to reduce memory usage (default: 4)')
parser.add_argument('--new_length', default=1, type=int,
metavar='N', help='length of sampled video frames (default: 1)')
parser.add_argument('--lr', '--learning-rate', default=0.001, type=float,
Expand Down
Loading

0 comments on commit 5ca01a1

Please sign in to comment.