Merge pull request #120 from clamsproject/116-traing-with-challenging
accommodating image-level annotations as training/validation data
keighrim authored Nov 25, 2024
2 parents c0aa813 + 626171c commit 2f80099
Showing 38 changed files with 1,390 additions and 678 deletions.
5 changes: 1 addition & 4 deletions .gitignore
@@ -257,10 +257,7 @@ $RECYCLE.BIN/
 
 # Files created by data_ingestion or visualization
 modeling/features
-modeling/results*/*.pt
-modeling/results*/*.csv
-modeling/results*/*.txt
-modeling/results*/*.yml
+modeling/results*
 modeling/vectorized/*
 modeling/html
 
13 changes: 12 additions & 1 deletion app.py
@@ -79,8 +79,10 @@ def _annotate_timepoints(self, mmif: Mmif, **parameters) -> Mmif:
         all_positions = []
         t = time.perf_counter()
         # in the following, the .glob() should always return only one, otherwise we have a problem
+        ## naming convention from train.py + gridsearch.py = {timestamp}.{backbonename}.{prebinname}.pos{T/F}.pt
+        ## right now, `prebinname` is fixed to `nomap` as we don't use prebinning
         model_filestem = next(default_model_storage.glob(
-            f"*.{parameters['tpModelName']}.pos{'T' if parameters['tpUsePosModel'] else 'F'}.pt")).stem
+            f"*.{parameters['tpModelName']}.*.pos{'T' if parameters['tpUsePosModel'] else 'F'}.pt")).stem
         self.logger.info(f"Initiating classifier with {model_filestem}")
         classifier = classify.Classifier(default_model_storage / model_filestem,
                                          self.logger.name if self.logger.isEnabledFor(logging.DEBUG) else None)
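For illustration, a self-contained sketch of why the widened glob is needed: under the `{timestamp}.{backbonename}.{prebinname}.pos{T/F}.pt` convention noted in the comments, the old pattern had no wildcard for the `prebinname` segment. The file name below is fabricated.

```python
from pathlib import Path
import tempfile

# fabricate a model file following the naming convention (made-up timestamp)
storage = Path(tempfile.mkdtemp())
(storage / "20241125-101500.convnext_small.nomap.posT.pt").touch()

old_pattern = "*.convnext_small.posT.pt"      # no slot for `prebinname`
new_pattern = "*.convnext_small.*.posT.pt"    # `.*.` matches e.g. "nomap"
print(list(storage.glob(old_pattern)))        # [] -- no match
print(next(storage.glob(new_pattern)).stem)   # 20241125-101500.convnext_small.nomap.posT
```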
@@ -123,6 +125,8 @@ def _annotate_timepoints(self, mmif: Mmif, **parameters) -> Mmif:
             timepoint_annotation.add_property('classification', classification)
 
     def _annotate_timeframes(self, mmif: Mmif, **parameters) -> Mmif:
+        from modeling.config import bins
+
         TimeFrameTuple = namedtuple('TimeFrame',
                                     ['label', 'tf_score', 'targets', 'representatives'])
         tp_view = mmif.get_view_contains(AnnotationTypes.TimePoint)
@@ -152,6 +156,13 @@ def _annotate_timeframes(self, mmif: Mmif, **parameters) -> Mmif:
         src_labels = sqh.validate_labelset(tps)
 
         # TODO: fill in `tfLabelMap` parameter value if a preset is used by the user
+        # first fill in labelMap parameter value if a preset is used by the user
+        label_map = bins.binning_schemes.get(parameters['tfLabelMapPreset'])
+        if label_map is None:
+            label_map = parameters['tfLabelMap']
+        else:
+            label_map = {lbl: binname for binname, lbls in label_map.items() for lbl in lbls}
+        parameters['tfLabelMap'] = label_map
         self.logger.debug(f"Label map: {parameters['tfLabelMap']}")
         label_remapper = sqh.build_label_remapper(src_labels, parameters['tfLabelMap'])
 
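The preset-to-map inversion added above is easy to see with a toy example; the scheme below is made up, not necessarily one of the real presets in `modeling/config/bins.py`.

```python
# a made-up binning scheme in the {bin: [labels]} shape of
# modeling/config/bins.binning_schemes, inverted into the {label: bin} shape
# that `tfLabelMap` expects, exactly as the dict comprehension above does
binning_scheme = {
    'credits': ['C', 'R'],
    'chyron': ['I', 'N', 'Y'],
}
label_map = {lbl: binname for binname, lbls in binning_scheme.items() for lbl in lbls}
print(label_map)
# {'C': 'credits', 'R': 'credits', 'I': 'chyron', 'N': 'chyron', 'Y': 'chyron'}
```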
70 changes: 39 additions & 31 deletions metadata.py
@@ -10,6 +10,7 @@
 from mmif import DocumentTypes, AnnotationTypes
 
 from modeling import FRAME_TYPES
+import modeling.config.bins
 
 default_model_storage = Path(__file__).parent / 'modeling/models'
 
@@ -26,16 +27,6 @@ def appmetadata() -> AppMetadata:
 
     available_models = default_model_storage.glob('*.pt')
 
-    # This was the most frequent label mapping from the old configuration file,
-    # which had default mappings for each model.
-    labelMap = [
-        "B:bars",
-        "S:slate",
-        "I:chyron", "N:chyron", "Y:chyron",
-        "C:credits", "R:credits",
-        "W:other_opening", "L:other_opening", "O:other_opening", "M:other_opening",
-        "E:other_text", "K:other_text", "G:other_text", "T:other_text", "F:other_text"]
-
     metadata = AppMetadata(
         name="Scenes-with-text Detection",
         description="Detects scenes with text, like slates, chyrons and credits. "
@@ -57,36 +48,42 @@ def appmetadata() -> AppMetadata:
 
     metadata.add_parameter(
         name='useClassifier', type='boolean', default=True,
-        description='Use the image classifier model to generate TimePoint annotations')
+        description='Use the image classifier model to generate TimePoint annotations.')
     metadata.add_parameter(
         name='tpModelName', type='string',
-        default='convnext_lg',
+        default='convnext_small',
         choices=list(set(m.stem.split('.')[1] for m in available_models)),
-        description='model name to use for classification, only applies when `useClassifier=true`')
+        description='Model name to use for classification, only applies when `useClassifier=true`.')
     metadata.add_parameter(
         name='tpUsePosModel', type='boolean', default=True,
-        description='Use the model trained with positional features, only applies when `useClassifier=true`')
+        description='Use the model trained with positional features, only applies when `useClassifier=true`.')
     metadata.add_parameter(
         name='tpStartAt', type='integer', default=0,
-        description='Number of milliseconds into the video to start processing, only applies when `useClassifier=true`')
+        description='Number of milliseconds into the video to start processing, only applies when `useClassifier=true`.')
     metadata.add_parameter(
         name='tpStopAt', type='integer', default=sys.maxsize,
-        description='Number of milliseconds into the video to stop processing, only applies when `useClassifier=true`')
+        description='Number of milliseconds into the video to stop processing, only applies when `useClassifier=true`.')
     metadata.add_parameter(
         name='tpSampleRate', type='integer', default=1000,
-        description='Milliseconds between sampled frames, only applies when `useClassifier=true`')
+        description='Milliseconds between sampled frames, only applies when `useClassifier=true`.')
     metadata.add_parameter(
         name='useStitcher', type='boolean', default=True,
-        description='Use the stitcher after classifying the TimePoints')
+        description='Use the stitcher after classifying the TimePoints.')
     metadata.add_parameter(
         name='tfMinTPScore', type='number', default=0.5,
-        description='Minimum score for a TimePoint to be included in a TimeFrame, only applies when `useStitcher=true`')
+        description='Minimum score for a TimePoint to be included in a TimeFrame. '
+                    'A lower value will include more TimePoints in the TimeFrame '
+                    '(increasing recall in exchange for precision). '
+                    'Only applies when `useStitcher=true`.')
     metadata.add_parameter(
         name='tfMinTFScore', type='number', default=0.9,
-        description='Minimum score for a TimeFrame, only applies when `useStitcher=true`')
+        description='Minimum score for a TimeFrame. '
+                    'A lower value will include more TimeFrames in the output '
+                    '(increasing recall in exchange for precision). '
+                    'Only applies when `useStitcher=true`')
     metadata.add_parameter(
         name='tfMinTFDuration', type='integer', default=5000,
-        description='Minimum duration of a TimeFrame in milliseconds, only applies when `useStitcher=true`')
+        description='Minimum duration of a TimeFrame in milliseconds, only applies when `useStitcher=true`.')
     metadata.add_parameter(
         name='tfAllowOverlap', type='boolean', default=False,
         description='Allow overlapping time frames, only applies when `useStitcher=true`')
@@ -96,17 +93,28 @@ def appmetadata() -> AppMetadata:
                     'multiple representative points to follow any changes in the scene. '
                     'Only applies when `useStitcher=true`')
     metadata.add_parameter(
-        # TODO: do we want to use the old default labelMap from the configuration here or
-        # do we truly want an empty mapping and use the pass-through, as hinted at in the
-        # description (which is now not in sync with the code).
-        name='tfLabelMap', type='map', default=labelMap,
+        name='tfLabelMap', type='map', default=[],
         description=(
-            'Mapping of a label in the input annotations to a new label. Must be formatted as '
-            'IN_LABEL:OUT_LABEL (with a colon). To pass multiple mappings, use this parameter '
-            'multiple times. By default, all the input labels are passed as is, including any '
-            'negative labels (with default value being no remapping at all). However, when '
-            'at least one label is remapped, all the other "unset" labels are discarded as '
-            'a negative label. Only applies when `useStitcher=true`'))
+            '(See also `tfLabelMapPreset`, set `tfLabelMapPreset=nopreset` to make sure that a preset does not '
+            'override `tfLabelMap` when using this) Mapping of a label in the input TimePoint annotations to a new '
+            'label of the stitched TimeFrame annotations. Must be formatted as IN_LABEL:OUT_LABEL (with a colon). To '
+            'pass multiple mappings, use this parameter multiple times. When two+ TP labels are mapped to a TF '
+            'label, it essentially works as a "binning" operation. If no mapping is used, all the input labels are '
+            'passed-through, meaning no change in both TP & TF labelsets. However, when at least one label is mapped, '
+            'all the other "unset" labels are mapped to the negative label (`-`) and if `-` does not exist in the TF '
+            'labelset, it is added automatically. '
+            'Only applies when `useStitcher=true`.'))
+    labelMapPresetsReformat = {schname: str([f'`{lbl}`:`{binname}`'
+                                             for binname, lbls in scheme.items()
+                                             for lbl in lbls])
+                               for schname, scheme in modeling.config.bins.binning_schemes.items()}
+    labelMapPresetsMarkdown = '\n'.join([f"- `{k}`: {v}" for k, v in labelMapPresetsReformat.items()])
+    metadata.add_parameter(
+        name='tfLabelMapPreset', type='string', default='relaxed',
+        choices=list(modeling.config.bins.binning_schemes.keys()),
+        description=f'(See also `tfLabelMap`) Preset alias of a label mapping. If not `nopreset`, this parameter will '
+                    f'override the `tfLabelMap` parameter. Available presets are:\n{labelMapPresetsMarkdown}\n\n '
+                    f'Only applies when `useStitcher=true`.')
 
     return metadata
 
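A sketch of the remapping semantics the new `tfLabelMap` description promises; this is not the actual `sqh.build_label_remapper` implementation, only an illustration of the pass-through and negative-label behavior.

```python
# NOT the repo's build_label_remapper -- just the documented semantics:
# with an empty map, labels pass through unchanged; once any label is
# mapped, every unmapped source label falls to the negative label '-'
def remap(src_labels, label_map):
    if not label_map:
        return {lbl: lbl for lbl in src_labels}
    return {lbl: label_map.get(lbl, '-') for lbl in src_labels}

src = ['B', 'S', 'I', 'C', 'R']
print(remap(src, {}))              # {'B': 'B', 'S': 'S', 'I': 'I', 'C': 'C', 'R': 'R'}
print(remap(src, {'S': 'slate'}))  # {'B': '-', 'S': 'slate', 'I': '-', 'C': '-', 'R': '-'}
```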
12 changes: 8 additions & 4 deletions modeling/__init__.py
@@ -2,10 +2,14 @@
 positive_label = '+'
 
 # full typology from https://github.com/clamsproject/app-swt-detection/issues/1
-FRAME_TYPES = ["B", "S", "W", "L", "O",
-               "M", "I", "N", "E", "P", "Y", "K", "G", "T", "F", "C", "R"]
-FRAME_TYPES_WITH_SUBTYPES = ["B", "SH", "SC", "SD", "SB", "SG", "W", "L", "O",
-                             "M", "I", "N", "E", "P", "Y", "K", "G", "T", "F", "C", "R"]
+FRAME_TYPES = [
+    "B", "S", "I", "C", "R", "M", "O", "W",
+    "N", "Y", "U", "K",
+    "L", "G", "F", "E", "T",
+    "P",
+]
+FRAME_TYPES_WITH_SUBTYPES = FRAME_TYPES.copy() + ['SH', 'SC', 'SD', 'SB', 'SG']
+FRAME_TYPES_WITH_SUBTYPES.remove('S')
 
 # These are time frames that are typically static (that is, the text does not
 # move around or change as with rolling credits). These are frame names after
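A quick sanity check of the rewritten labelset construction, using only values visible in the diff: `S` is dropped from the subtype variant and replaced by its five slate subtypes.

```python
FRAME_TYPES = [
    "B", "S", "I", "C", "R", "M", "O", "W",
    "N", "Y", "U", "K",
    "L", "G", "F", "E", "T",
    "P",
]
FRAME_TYPES_WITH_SUBTYPES = FRAME_TYPES.copy() + ['SH', 'SC', 'SD', 'SB', 'SG']
FRAME_TYPES_WITH_SUBTYPES.remove('S')

assert len(FRAME_TYPES) == 18
assert len(FRAME_TYPES_WITH_SUBTYPES) == 18 - 1 + 5   # 'S' out, 5 subtypes in
assert 'S' not in FRAME_TYPES_WITH_SUBTYPES
```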
20 changes: 10 additions & 10 deletions modeling/backbones.py
@@ -46,16 +46,6 @@ class ExtractorModel:
 # TODO/REVIEW - do we want to be able to change the weight versions (IMAGENET1K_V1 etc)
 # ==========================================|
 # ConvNext Models
-class ConvnextBaseExtractor(ExtractorModel):
-    name = "convnext_base"
-    dim = 1024
-
-    def __init__(self):
-        self.model = convnext_base(weights=ConvNeXt_Base_Weights.IMAGENET1K_V1)
-        self.model.classifier[-1] = torch.nn.Identity()
-        self.preprocess = ConvNeXt_Base_Weights.IMAGENET1K_V1.transforms()
-
-
 class ConvnextTinyExtractor(ExtractorModel):
     name = "convnext_tiny"
     dim = 768
@@ -76,6 +66,16 @@ def __init__(self):
         self.preprocess = ConvNeXt_Small_Weights.IMAGENET1K_V1.transforms()
 
 
+class ConvnextBaseExtractor(ExtractorModel):
+    name = "convnext_base"
+    dim = 1024
+
+    def __init__(self):
+        self.model = convnext_base(weights=ConvNeXt_Base_Weights.IMAGENET1K_V1)
+        self.model.classifier[-1] = torch.nn.Identity()
+        self.preprocess = ConvNeXt_Base_Weights.IMAGENET1K_V1.transforms()
+
+
 class ConvnextLargeExtractor(ExtractorModel):
     name = "convnext_lg"
     dim = 1536
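For context, a rough usage sketch for one of these extractor classes (not code from the repo); the torchvision calls mirror the class bodies above, and `torch.rand` stands in for a real video frame.

```python
import torch
from torchvision.models import convnext_base, ConvNeXt_Base_Weights

# load the backbone, replace the 1000-way ImageNet head with Identity so the
# forward pass yields pooled features instead of class logits
weights = ConvNeXt_Base_Weights.IMAGENET1K_V1
model = convnext_base(weights=weights)
model.classifier[-1] = torch.nn.Identity()
model.eval()
preprocess = weights.transforms()            # resize/crop/normalize preset

frame = torch.rand(3, 480, 640)              # stand-in for a sampled frame
with torch.no_grad():
    feats = model(preprocess(frame).unsqueeze(0))
print(feats.shape)                           # torch.Size([1, 1024]) == dim
```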
2 changes: 1 addition & 1 deletion modeling/classify.py
@@ -20,7 +20,7 @@ def __init__(self, model_stem, logger_name=None):
         model_config_file = f"{model_stem}.yml"
         model_checkpoint = f"{model_stem}.pt"
         model_config = yaml.safe_load(open(model_config_file))
-        self.training_labels = train.pretraining_binned_label(model_config)
+        self.training_labels = train.get_prebinned_labelset(model_config)
         self.featurizer = data_loader.FeatureExtractor(**model_config)
         label_count = len(FRAME_TYPES) + 1
         if 'bins' in model_config:
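A hypothetical sketch of a parsed model config: the code above only implies that `yaml.safe_load` returns a dict that may contain a `bins` key; the other key and all values below are invented for illustration.

```python
model_config = {
    'img_enc_name': 'convnext_small',   # invented key name
    'bins': {                           # {bin_name: [raw labels]} shape,
        'bars': ['B'],                  # matching the binning schemes
        'slate': ['S'],
        'chyron': ['I', 'N', 'Y'],
        'credits': ['C', 'R'],
    },
}
if 'bins' in model_config:
    # presumably the classifier sizes its output for the binned labelset
    # (plus the negative label) instead of the full FRAME_TYPES labelset
    label_count = len(model_config['bins']) + 1
print(label_count)  # 5
```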
Empty file added modeling/config/__init__.py
146 changes: 146 additions & 0 deletions modeling/config/batches.py
@@ -0,0 +1,146 @@
# training batches see https://github.com/clamsproject/aapb-annotations/tree/main/batches for more details
unintersting_guids = ["cpb-aacip-254-75r7szdz"] # the most "uninteresting" video (88/882 frames annotated)
aapb_collaboration_27_a = [
"cpb-aacip-129-88qc000k",
"cpb-aacip-f2c34dd1cd4",
"cpb-aacip-191-40ksn47s",
"cpb-aacip-507-028pc2tp2z",
"cpb-aacip-507-0k26970f2d",
"cpb-aacip-507-0z70v8b17g",
"cpb-aacip-512-542j67b12n",
"cpb-aacip-394-149p8fcw",
"cpb-aacip-08fb0e1f287",
"cpb-aacip-512-t43hx1753b",
"cpb-aacip-d0f2569e145",
"cpb-aacip-d8ebafee30e",
"cpb-aacip-c72fd5cbadc",
"cpb-aacip-b6a2a39b7eb",
"cpb-aacip-512-4b2x34nv4t",
"cpb-aacip-512-416sx65d21",
"cpb-aacip-512-3f4kk95f7h",
"cpb-aacip-512-348gf0nn4f",
"cpb-aacip-516-cc0tq5s94c",
"cpb-aacip-516-8c9r20sq57",
]
aapb_collaboration_27_b = [
"cpb-aacip-254-75r7szdz",
"cpb-aacip-259-4j09zf95",
"cpb-aacip-526-hd7np1xn78",
"cpb-aacip-75-72b8h82x",
"cpb-aacip-fe9efa663c6",
"cpb-aacip-f5847a01db5",
"cpb-aacip-f2a88c88d9d",
"cpb-aacip-ec590a6761d",
"cpb-aacip-c7c64922fcd",
"cpb-aacip-f3fa7215348",
"cpb-aacip-f13ae523e20",
"cpb-aacip-e7a25f07d35",
"cpb-aacip-ce6d5e4bd7f",
"cpb-aacip-690722078b2",
"cpb-aacip-e649135e6ec",
"cpb-aacip-15-93gxdjk6",
"cpb-aacip-512-4f1mg7h078",
"cpb-aacip-512-4m9183583s",
"cpb-aacip-512-4b2x34nt7g",
"cpb-aacip-512-3n20c4tr34",
"cpb-aacip-512-3f4kk9534t",
]
aapb_collaboration_27_c = [
"cpb-aacip-0d338c39a45",
"cpb-aacip-0acac5e9db7",
"cpb-aacip-0bdc7c8ecc5",
"cpb-aacip-1032b1787b4",
"cpb-aacip-516-qf8jd4qq96",
"cpb-aacip-259-kh0dzd78",
"cpb-aacip-259-nc5sb374",
"cpb-aacip-259-mw28cq94",
"cpb-aacip-259-mc8rg22j",
"cpb-aacip-259-5717pw8g",
"cpb-aacip-259-pr7msz5c",
"cpb-aacip-259-g737390m",
"cpb-aacip-259-pc2t780t",
"cpb-aacip-259-q814r90k",
"cpb-aacip-259-cz325478",
"cpb-aacip-259-vh5cgj9t",
"cpb-aacip-259-gt5ff704",
"cpb-aacip-259-gx44t714",
"cpb-aacip-259-pr7msz3w",
"cpb-aacip-259-zg6g5589",
]
aapb_collaboration_27_d = [
"cpb-aacip-259-wh2dcb8p"
] # this is kept for evaluation set, should not be used for training!!!

# new image-level annotation added after v6.1
# "challenging images" from later annotation (`bm` set and `pbd` set, 60 videos, 2024 summer)
# recorded as `aapb-collaboration-27-e` in the annotation repo
guids_with_challenging_images_bm = [
"cpb-aacip-00a9ed7f2ba",
"cpb-aacip-0ace30f582d",
"cpb-aacip-0ae98c2c4b2",
"cpb-aacip-0b0c0afdb11",
"cpb-aacip-0bb992d2e7f",
"cpb-aacip-0c0374c6c55",
"cpb-aacip-0c727d4cac3",
"cpb-aacip-0c74795718b",
"cpb-aacip-0cb2aebaeba",
"cpb-aacip-0d74af419eb",
"cpb-aacip-0dbb0610457",
"cpb-aacip-0dfbaaec869",
"cpb-aacip-0e2dc840bc6",
"cpb-aacip-0ed7e315160",
"cpb-aacip-0f3879e2f22",
"cpb-aacip-0f80359ada5",
"cpb-aacip-0f80a4f5ed2",
"cpb-aacip-0fe3e4311e1",
"cpb-aacip-1a365705273",
"cpb-aacip-1b295839145",
]
guids_with_challenging_images_pbd = [
"cpb-aacip-110-16c2ftdq",
"cpb-aacip-120-1615dwkg",
"cpb-aacip-120-203xsm67",
"cpb-aacip-15-70msck27",
"cpb-aacip-16-19s1rw84",
"cpb-aacip-17-07tmq941",
"cpb-aacip-17-58bg87rx",
"cpb-aacip-17-65v6xv27",
"cpb-aacip-17-81jhbz0g",
"cpb-aacip-29-61djhjcx",
"cpb-aacip-29-8380gksn",
"cpb-aacip-41-322bvxmn",
"cpb-aacip-41-42n5tj3d",
"cpb-aacip-110-35gb5r94",
"cpb-aacip-111-655dvd99",
"cpb-aacip-120-19s1rrsp",
"cpb-aacip-120-31qfv097",
"cpb-aacip-120-73pvmn2q",
"cpb-aacip-120-80ht7h8d",
"cpb-aacip-120-8279d01c",
"cpb-aacip-120-83xsjcb2",
"cpb-aacip-17-88qc0md1",
"cpb-aacip-35-36tx99h9",
"cpb-aacip-42-78tb31b1",
"cpb-aacip-52-84zgn1wb",
"cpb-aacip-52-87pnw5t0",
"cpb-aacip-55-84mkmvwx",
"cpb-aacip-75-13905w9q",
"cpb-aacip-75-54xgxnzg",
"cpb-aacip-77-02q5807j",
"cpb-aacip-77-074tnfhr",
"cpb-aacip-77-1937qsxt",
"cpb-aacip-77-214mx491",
"cpb-aacip-77-24jm6zc8",
"cpb-aacip-77-35t77b2v",
"cpb-aacip-77-44bp0mdh",
"cpb-aacip-77-49t1h3fv",
"cpb-aacip-77-81jhbv89",
"cpb-aacip-83-074tmx7h",
"cpb-aacip-83-23612txx",
]
aapb_collaboration_27_e = guids_with_challenging_images_bm + guids_with_challenging_images_pbd

# this `pbd` subset contains 40 videos with 15328 (non-transitional) + 557 (transitional) = 15885 frames
# then updated with more annotations 19331 (non-transitional) + 801 (transitional) = 20132 frames
# we decided to use this subset for the fixed validation set (#116)
guids_for_fixed_validation_set = guids_with_challenging_images_pbd
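Illustrative only (this is not in the module): one plausible way the lists above could be combined downstream, with the `pbd` subset held out as the fixed validation set and batch `27_d` reserved for evaluation; the actual split logic lives in the training code, not in this config module.

```python
# hypothetical split assembly, runnable at the bottom of this module
train_guids = (aapb_collaboration_27_a
               + aapb_collaboration_27_b
               + aapb_collaboration_27_c
               + guids_with_challenging_images_bm)
validation_guids = guids_for_fixed_validation_set    # the fixed `pbd` subset (#116)
# aapb_collaboration_27_d stays held out for evaluation, never for training
assert not set(train_guids) & set(validation_guids)
```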