continued refactoring of feature extractor and classifier #38

Merged
merged 14 commits into from Dec 15, 2023
28 changes: 15 additions & 13 deletions app.py
@@ -10,21 +10,20 @@
import logging
from typing import Union

import yaml
from clams import ClamsApp, Restifier
from mmif import Mmif, View, Annotation, Document, AnnotationTypes, DocumentTypes

import classify
from mmif import Mmif, View, AnnotationTypes, DocumentTypes

from modeling import classify

logging.basicConfig(filename='swt.log', level=logging.DEBUG)


class SwtDetection(ClamsApp):

def __init__(self, config_file: str):
def __init__(self, configs):
super().__init__()
self.classifier = classify.Classifier(config_file)

self.classifier = classify.Classifier(**configs)

def _appmetadata(self):
# see https://sdk.clams.ai/autodoc/clams.app.html#clams.app.ClamsApp._load_appmetadata
@@ -41,13 +40,16 @@ def _annotate(self, mmif: Union[str, dict, Mmif], **parameters) -> Mmif:
return mmif
vd = vds[0]

# add the timeframes to a new view and return the updated Mmif object
new_view: View = mmif.new_view()
self.sign_view(new_view, parameters)
parameters = self.get_configuration(parameters)

# calculate the frame predictions and extract the timeframes
# use `parameters` as needed as runtime configuration
predictions = self.classifier.process_video(vd.location)
timeframes = self.classifier.extract_timeframes(predictions)

# add the timeframes to a new view and return the updated Mmif object
new_view: View = mmif.new_view()
self.sign_view(new_view, parameters)
new_view.new_contain(AnnotationTypes.TimeFrame, document=vd.id)
for tf in timeframes:
start, end, score, label = tf
@@ -62,14 +64,14 @@ def _annotate(self, mmif: Union[str, dict, Mmif], **parameters) -> Mmif:
if __name__ == "__main__":

parser = argparse.ArgumentParser()
parser.add_argument("-c", "--config", help="The YAML config file")
parser.add_argument("--port", action="store", default="5000", help="set port to listen" )
parser.add_argument("-c", "--config", help="The YAML config file", default='modeling/config/classifier.yaml')
parser.add_argument("--port", action="store", default="5000", help="set port to listen")
parser.add_argument("--production", action="store_true", help="run gunicorn server")

parsed_args = parser.parse_args()
CONFIGS = parsed_args.config
classifier_configs = yaml.safe_load(open(parsed_args.config))

app = SwtDetection(CONFIGS)
app = SwtDetection(classifier_configs)

http_app = Restifier(app, port=int(parsed_args.port))
# for running the application in production mode
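For context, the startup flow now parses the YAML once in __main__ and hands the resulting dict to the app, which forwards it to classify.Classifier(**configs). A minimal sketch of the new flow; note that the parser default above says modeling/config/classifier.yaml while the file added in this PR is modeling/config/classifier.yml, so the path here is an assumption:

import yaml
from app import SwtDetection

# mirrors the __main__ block above
classifier_configs = yaml.safe_load(open('modeling/config/classifier.yml'))
app = SwtDetection(classifier_configs)  # unpacked internally as classify.Classifier(**classifier_configs)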
25 changes: 0 additions & 25 deletions example-config.yml

This file was deleted.

1 change: 1 addition & 0 deletions example-config.yml
2 changes: 2 additions & 0 deletions modeling/__init__.py
@@ -0,0 +1,2 @@
negative_label = 'NEG'
positive_label = 'POS'
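These two module-level constants give the package a single place to name the negative and positive classes. The diff does not show how train or data_loader consume them, so the following is only an assumed usage sketch with hypothetical label names:

from modeling import negative_label, positive_label

# hypothetical pre-bin label set; the real label sets live in the model config YAML
labels = ['slate', 'chyron', 'credits', negative_label]  # negative_label == 'NEG'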
170 changes: 66 additions & 104 deletions classify.py → modeling/classify.py
@@ -7,106 +7,79 @@

"""

import os
import sys
import argparse
import json
import yaml
import logging
import argparse
import os
import sys
from operator import itemgetter

import torch
import numpy as np
import cv2
import numpy as np
import torch
import yaml
from PIL import Image

from mmif.utils import video_document_helper as vdh

from modeling import backbones


def get_net(in_dim, n_labels, num_layers, dropout=0.0):
# Copied from modeling.train
# TODO: use the one from the train module, requires creating a proper package
dropouts = [dropout] * (num_layers - 1) if isinstance(dropout, (int, float)) else dropout
if len(dropouts) + 1 != num_layers:
raise ValueError("length of dropout must be equal to num_layers - 1")
net = torch.nn.Sequential()
for i in range(1, num_layers):
neurons = max(128 // i, n_labels)
net.add_module(f"fc{i}", torch.nn.Linear(in_dim, neurons))
net.add_module(f"relu{i}", torch.nn.ReLU())
net.add_module(f"dropout{i}", torch.nn.Dropout(p=dropouts[i - 1]))
in_dim = neurons
net.add_module("fc_out", torch.nn.Linear(neurons, n_labels))
# no softmax here since we're using CE loss which includes it
# net.add_module(Softmax(dim=1))
return net
from modeling import train, data_loader


class Classifier:

def __init__(self, config_file: str):
with open(config_file) as f:
config = yaml.safe_load(f)
# model and model configuration
self.model_file = config["model_file"]
with open(config["model_config"]) as f:
self.model_config = yaml.safe_load(f)
# classifier parameters
def __init__(self, **config):
model_config = yaml.safe_load(open(config["model_config_file"]))
# the "labels" list from the config file should not include "negative" label from the beginning
self.labels = train.get_final_label_names(model_config)
self.featurizer = data_loader.FeatureExtractor(
img_enc_name=model_config["img_enc_name"],
pos_enc_name=model_config.get("pos_enc_name", None),
pos_enc_dim=model_config.get("pos_enc_dim", 0),
max_input_length=model_config.get("max_input_length", 0),
pos_unit=model_config.get("pos_unit", 0),
)
self.classifier = train.get_net(
in_dim=self.featurizer.feature_vector_dim(),
n_labels=len(model_config['bins']['pre'].keys()) + 1,
num_layers=model_config["num_layers"],
dropout=model_config["dropouts"],
)
self.classifier.load_state_dict(torch.load(config["model_file"]))
# TODO (krim @ 12/14/23): deal with post bin
# self.postbin = config.get("postbin", None)

# stitcher config
self.time_unit = config["time_unit"]
self.sample_rate = config["sample_rate"]
self.minimum_frame_score = config["minimum_frame_score"]
self.minimum_timeframe_score = config["minimum_timeframe_score"]
self.minimum_frame_count = config["minimum_frame_count"]
# not including the "other" label
self.labels = tuple(self.model_config["labels"][:-1])
# debugging parameters

# debugging
self.dribble = config.get("dribble", False)
self.load_model()

def load_model(self):
self.model = get_net(
self.model_config["in_dim"],
len(self.model_config["labels"]),
self.model_config["num_layers"],
self.model_config["dropout"])
self.model.load_state_dict(torch.load(self.model_file))
self.model_type = self.model_config["model_type"]
self.featurizer = backbones.model_map[self.model_type]()

def process_video(self, mp4_file: str):
"""Loops over the frames in a video and for each frame extract the features
and apply the model. Returns a list of predictions, where each prediction is
an instance of numpy.ndarray."""
print(f'Processing {mp4_file}...')
logging.info(f'processing {mp4_file}...')
basename = os.path.splitext(os.path.basename(mp4_file))[0]
all_predictions = []
for n, image in get_frames(mp4_file, self.sample_rate):
predictions = []
vidcap = cv2.VideoCapture(mp4_file)
fps = round(vidcap.get(cv2.CAP_PROP_FPS), 2)
fc = vidcap.get(cv2.CAP_PROP_FRAME_COUNT)
dur = round(fc / fps, 3) * 1000
for ms in range(0, sys.maxsize, self.sample_rate):
vidcap.set(cv2.CAP_PROP_POS_MSEC, ms)
success, image = vidcap.read()
if not success:
break
img = Image.fromarray(image[:,:,::-1])
features = self.extract_features(img, self.featurizer)
prediction = self.model(features)
prediction = Prediction(n, self.labels, prediction)
features = self.featurizer.get_full_feature_vectors(img, ms, dur)
prediction = self.classifier(features).detach()
prediction = Prediction(ms, self.labels, prediction)
if self.dribble:
print(f'{n:07d}', prediction)
all_predictions.append(prediction)
logging.info(f'number of predictions = {len(all_predictions)}')
return(all_predictions)

def extract_features(self, frame_vec: np.ndarray, model: torch.nn.Sequential) -> torch.Tensor:
"""Extract the features of a single frame. Based on, but not identical to, the
process_frame() method of the FeatureExtractor class in data_ingestion.py."""
frame_vec = model.preprocess(frame_vec)
frame_vec = frame_vec.unsqueeze(0)
if torch.cuda.is_available():
if self.dribble:
print('CUDA is available')
frame_vec = frame_vec.to('cuda')
model.model.to('cuda')
with torch.no_grad():
feature_vec = model.model(frame_vec)
return feature_vec.cpu()
print(prediction)
predictions.append(prediction)
return predictions

def extract_timeframes(self, predictions):
timeframes = self.collect_timeframes(predictions)
@@ -117,8 +90,8 @@ def extract_timeframes(self, predictions):

def collect_timeframes(self, predictions: list) -> dict:
"""Find sequences of frames for all labels where the score is not 0."""
timeframes = { label: [] for label in self.labels}
open_frames = { label: [] for label in self.labels}
timeframes = {label: [] for label in self.labels}
open_frames = {label: [] for label in self.labels}
for prediction in predictions:
timepoint = prediction.timepoint
bins = prediction.data[-1]
@@ -217,18 +190,6 @@ def experiment(self):
print(outputs)


def get_frames(mp4_file: str, step: int = 1000):
"""Generator to get frames from an mp4 file. The step parameter defines the number
of milliseconds between the frames."""
vidcap = cv2.VideoCapture(mp4_file)
for n in range(0, sys.maxsize, step):
vidcap.set(cv2.CAP_PROP_POS_MSEC, n)
success, image = vidcap.read()
if not success:
break
yield n, image


def softmax(x):
return(np.exp(x - np.max(x)) / np.exp(x - np.max(x)).sum())

@@ -251,16 +212,6 @@ def load_predictions(filename: str, labels: list) -> list:
return predictions


def print_predictions(predictions, filename=None):
fh = sys.stdout if filename is None else open(filename, 'w')
fh.write('\n slate chyron creds other\n')
for prediction in predictions:
milliseconds = prediction.timepoint
p1, p2, p3, p4 = prediction.data[:4]
fh.write(f'{milliseconds:7} {p1:.4f} {p2:.4f} {p3:.4f} {p4:.4f}\n')
fh.write(f'\nTOTAL PREDICTIONS: {len(predictions)}\n')


def print_timeframes(labels, timeframes):
if timeframes:
if type(timeframes) is dict:
@@ -280,6 +231,17 @@ def print_timeframes(labels, timeframes):
print(f'\nNumber of time frames is 0\n')


def print_predictions(predictions, filename=None):
# Debugging method
fh = sys.stdout if filename is None else open(filename, 'w')
fh.write('\n slate chyron creds other\n')
for prediction in predictions:
milliseconds = prediction.timepoint
p1, p2, p3, p4 = prediction.data[:4]
fh.write(f'{milliseconds:7} {p1:.4f} {p2:.4f} {p3:.4f} {p4:.4f}\n')
fh.write(f'\nTOTAL PREDICTIONS: {len(predictions)}\n')


class Prediction:

"""Class to store a prediction from the model. It is meant to simplify the rest
@@ -298,14 +260,15 @@ def __init__(self, timepoint: int, labels: list,
self.labels = labels
self.tensor = prediction
if data is None:
self.data = softmax(self.tensor.detach().numpy())[0].tolist()
# TODO: probably use torch.nn.Softmax()
self.data = softmax(self.tensor.detach().numpy()).tolist()
else:
self.data = data

def __str__(self):
label_scores = ' '.join(["%.4f" % d for d in self.data[:3]])
other_score = self.data[len(self.labels)]
return f'<Prediction {self.timepoint:6} {label_scores} {other_score:.4f}>'
neg_score = self.data[len(self.labels)]
return f'<Prediction {self.timepoint:6} {label_scores} {neg_score:.4f}>'

def score_for_label(self, label: str):
return self.data[self.labels.index(label)]
@@ -314,7 +277,6 @@ def as_json(self):
return [self.timepoint, self.tensor.detach().numpy().tolist(), self.data]



if __name__ == '__main__':

parser = argparse.ArgumentParser()
@@ -325,15 +287,15 @@ def as_json(self):
parser.add_argument("--use-predictions", action='store_true', help=pred_help)
args = parser.parse_args()

classifier = Classifier(args.config)
classifier = Classifier(**yaml.safe_load(open(args.config)))

input_basename, extension = os.path.splitext(args.input)
predictions_file = f'{input_basename}.json'
if args.use_predictions:
predictions = load_predictions(predictions_file, classifier.labels)
else:
predictions = classifier.process_video(args.input)
save_predictions(predictions, predictions_file)
#save_predictions(predictions, predictions_file)
#print_predictions(predictions, filename='predictions.txt')

timeframes = classifier.collect_timeframes(predictions)
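Taken together, the refactor narrows classify.py's public surface to roughly what its own __main__ block uses. A hedged end-to-end sketch, where the config path and video path are placeholders:

import yaml
from modeling.classify import Classifier

configs = yaml.safe_load(open('modeling/config/classifier.yml'))
classifier = Classifier(**configs)                        # builds the featurizer and torch net from the model config
predictions = classifier.process_video('example.mp4')     # placeholder path; one Prediction per sampled frame
timeframes = classifier.extract_timeframes(predictions)   # collect and filter frame sequences per the config thresholds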
22 changes: 22 additions & 0 deletions modeling/config/classifier.yml
@@ -0,0 +1,22 @@
model_file: "modeling/models/20231214-191952.convnext_tiny.kfold_000.pt"
model_config_file: "modeling/models/20231214-193543.convnext_tiny.kfold_config.yml"
minimum_score: 1.01
score_mapping: {0.01: 0, 0.25: 1, 0.50: 2, 0.75: 3, 1.01: 4}

# Time unit for output timeframe
time_unit: 'milliseconds'

# Milliseconds between sampled frames
sample_rate: 1000

# Minimum score for a frame to be included in a potential timeframe
minimum_frame_score: 0.01

# Minimum score for a timeframe to be selected
minimum_timeframe_score: 0.25

# Minimum number of sampled frames required for a timeframe to be included
minimum_frame_count: 2

# Debugging setting
dribble: False
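Most of these keys map directly onto attributes set in Classifier.__init__ above; minimum_score and score_mapping are not consumed there, so they are presumably read elsewhere. With sample_rate: 1000 and minimum_frame_count: 2, a timeframe must cover at least two sampled frames, i.e. roughly two seconds of video. A sketch of tweaking values at load time (assumed workflow, not part of the diff):

import yaml
from modeling.classify import Classifier

with open('modeling/config/classifier.yml') as f:
    configs = yaml.safe_load(f)
configs['sample_rate'] = 500  # sample every half second instead of every second
configs['dribble'] = True     # print per-frame predictions while debugging
classifier = Classifier(**configs)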