
Implement positional encoding #109

Merged: 9 commits, Jul 10, 2024
6 changes: 4 additions & 2 deletions app.py
@@ -116,8 +116,10 @@ def _extract_images(self, video):

def _classify(self, extracted: list, positions: list, total_ms: int):
t = time.perf_counter()
self.logger.info(f"Initiating classifier with {self.configs['modelName']}")
classifier = classify.Classifier(default_model_storage / self.configs['modelName'],
model_checkpoint_name = next(default_model_storage.glob(
f"*.{self.configs['modelName']}.pos{'T' if self.configs['usePosModel'] else 'F'}.pt"))
self.logger.info(f"Initiating classifier with {model_checkpoint_name.stem}")
classifier = classify.Classifier(model_checkpoint_name,
self.logger.name if self.logger.isEnabledFor(logging.DEBUG) else None)
if self.logger.isEnabledFor(logging.DEBUG):
self.logger.debug(f"Classifier initiation took {time.perf_counter() - t:.2f} seconds")
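For reviewers, a minimal sketch of the new checkpoint resolution, assuming checkpoints follow the <timestamp>.<modelName>.pos<T|F>.pt naming used by the model files in this PR (the path and config values below are illustrative):

    from pathlib import Path

    default_model_storage = Path("modeling/models")              # assumed location
    configs = {"modelName": "convnext_lg", "usePosModel": True}  # illustrative values

    # e.g. matches "20240709-141946.convnext_lg.posT.pt"
    pattern = f"*.{configs['modelName']}.pos{'T' if configs['usePosModel'] else 'F'}.pt"
    model_checkpoint_name = next(default_model_storage.glob(pattern))
    print(model_checkpoint_name.stem)  # -> 20240709-141946.convnext_lg.posT

Note that next() on an empty glob raises StopIteration, so a modelName/usePosModel combination with no shipped checkpoint fails here rather than falling back.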
7 changes: 5 additions & 2 deletions metadata.py
@@ -69,9 +69,12 @@ def appmetadata() -> AppMetadata:
description='Minimum number of sampled frames required for a TimeFrame')
metadata.add_parameter(
name='modelName', type='string',
default='20240626-205715.convnext_lg',
choices=[m.stem for m in available_models],
default='convnext_lg',
choices=[m.stem.split('.')[1] for m in available_models],
description='model name to use for classification')
metadata.add_parameter(
name='usePosModel', type='boolean', default=True,
description='Use the model trained with positional features')
metadata.add_parameter(
name='useStitcher', type='boolean', default=True,
description='Use the stitcher after classifying the TimePoints')
3 changes: 1 addition & 2 deletions modeling/classify.py
@@ -23,9 +23,8 @@ def __init__(self, model_stem, logger_name=None):
self.training_labels = train.pretraining_binned_label(model_config)
self.featurizer = data_loader.FeatureExtractor(
img_enc_name=model_config["img_enc_name"],
pos_enc_name=model_config.get("pos_enc_name", None),
pos_enc_dim=model_config.get("pos_enc_dim", 0),
max_input_length=model_config.get("input_length", 0),
pos_length=model_config.get("pos_length", 0),
pos_unit=model_config.get("pos_unit", 0))
label_count = len(FRAME_TYPES) + 1
if 'bins' in model_config:
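For context, the classifier now configures its FeatureExtractor entirely from the checkpoint's sidecar config. A sketch of the keys involved, with illustrative values taken from the .yml files added in this PR:

    model_config = {
        "img_enc_name": "convnext_lg",
        "pos_length": 6000000,  # replaces the old "input_length" / "max_input_length"
        "pos_unit": 60000,
        "pos_enc_dim": 256,
    }

The old pos_enc_name key is gone: the surviving scheme is sinusoidal-add, and the shipped configs toggle its effect through pos_vec_coeff (0 in the posF variants, 0.5 in the posT ones).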
3 changes: 1 addition & 2 deletions modeling/config/exmaple-trainer-config.yml
@@ -31,14 +31,13 @@ block_guids_valid: [
num_layers: 3
dropouts: 0.1

pos_enc_name: "sinusoidal-concat"
pos_unit: 60000
pos_enc_dim: 512
# for now, hard-coding the longest video length in the annotated dataset
# $ for m in /llc_data/clams/swt-gbh/**/*.mp4; do printf "%s %s\n" "$(basename $m .mp4)" "$(ffmpeg -i $m 2>&1 | grep Duration: )"; done | sort -k 3 -r | head -n 1
# cpb-aacip-259-4j09zf95 Duration: 01:33:59.57, start: 0.000000, bitrate: 852 kb/s
# 94 mins = 5640 secs = 5640000 ms
input_length: 6000000
pos_length: 6000000

# bins config is used to "prebin" the labels into a smaller set of classes
#bins:
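For orientation: with these values the positional lookup table has pos_length / pos_unit = 6000000 / 60000 = 100 positions, i.e. one embedding per minute up to 100 minutes, rounding up from the 94-minute maximum noted in the comment above.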
87 changes: 33 additions & 54 deletions modeling/data_loader.py
@@ -66,48 +66,42 @@ class FeatureExtractor(object):

img_encoder: backbones.ExtractorModel
pos_encoder: Optional[str]
input_length: int
pos_length: int
pos_dim: int
sinusoidal_embeddings: ClassVar[Dict[Tuple[int, int], torch.Tensor]] = {}

def __init__(self, img_enc_name: str,
pos_enc_name: str = None,
pos_enc_dim: int = 512,
input_length: int = 6000000, # 100 min
pos_length: int = 6000000,
pos_unit: int = 60000,
pos_abs_th_front: int = 5,
pos_abs_th_end: int = 5,
pos_abs_th_front: int = 3,
pos_abs_th_end: int = 10,
pos_vec_coeff: float = 0.5):
"""
Initializes the FeatureExtractor object.

@param: img_enc_name = a name of backbone model (e.g. CNN) to use for image vector extraction
@param: pos_enc_name = type of positional encoder to use, one of 'fractional', sinusoidal-add', 'sinusoidal-concat', when not given use no positional encoding
@param: pos_enc_dim = dimension of positional embedding, only relevant to 'sinusoidal-add' scheme, when not given use 512
@param: input_length = length of input video in milliseconds, used for padding positional encoding
@param: pos_enc_dim = dimension of positional embedding, when not given use 512
@param: pos_length = "width" of positional encoding matrix, actual number of matrix columns is calculated by
pos_length / pos_unit (with default values, that is 100 minutes)
@param: pos_unit = unit of positional encoding in milliseconds (e.g., 60000 for minutes, 1000 for seconds)
@param: pos_abs_th_front = the number of minutes to perform absolute lookup at the front of the video
@param: pos_abs_th_end = the number of minutes to perform absolute lookup at the end of the video
@param: pos_abs_th_front = the number of "units" to perform absolute lookup at the front of the video
@param: pos_abs_th_end = the number of "units" to perform absolute lookup at the end of the video
@param: pos_vec_coeff = a value used to regularize the impact of positional encoding
"""
if img_enc_name is None:
raise ValueError("A image vector model must be specified")
else:
self.img_encoder: backbones.ExtractorModel = backbones.model_map[img_enc_name]()
self.pos_encoder = pos_enc_name
self.pos_dim = pos_enc_dim
self.pos_unit = pos_unit
self.pos_abs_th_front = pos_abs_th_front
self.pos_abs_th_end = pos_abs_th_end
self.pos_vec_coeff = pos_vec_coeff
if pos_enc_name in ['sinusoidal-add', 'sinusoidal-concat']:
position_dim = int(input_length / self.pos_unit)
if position_dim % 2 == 1:
position_dim += 1
if pos_enc_name == 'sinusoidal-concat':
self.pos_vec_lookup = self.get_sinusoidal_embeddings(position_dim, pos_enc_dim)
elif pos_enc_name == 'sinusoidal-add':
self.pos_vec_lookup = self.get_sinusoidal_embeddings(position_dim, self.img_encoder.dim)
position_dim = int(pos_length / self.pos_unit)
if position_dim % 2 == 1:
position_dim += 1
self.pos_vec_lookup = self.get_sinusoidal_embeddings(position_dim, self.img_encoder.dim)

def get_sinusoidal_embeddings(self, n_pos, dim):
if (n_pos, dim) in self.__class__.sinusoidal_embeddings:
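The remainder of this method is collapsed in the diff; for reference, a standard sinusoidal table construction consistent with the cache lookup above would look roughly like this (a sketch, not necessarily the exact implementation; math and torch imported at module level):

    def get_sinusoidal_embeddings(self, n_pos, dim):
        if (n_pos, dim) in self.__class__.sinusoidal_embeddings:
            return self.__class__.sinusoidal_embeddings[(n_pos, dim)]
        matrix = torch.zeros(n_pos, dim)
        position = torch.arange(n_pos, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, dim, 2, dtype=torch.float)
                             * (-math.log(10000.0) / dim))
        matrix[:, 0::2] = torch.sin(position * div_term)  # even columns
        matrix[:, 1::2] = torch.cos(position * div_term)  # odd columns
        self.__class__.sinusoidal_embeddings[(n_pos, dim)] = matrix  # memoize
        return matrix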
@@ -132,39 +126,24 @@ def get_img_vector(self, raw_img, as_numpy=True):
return feature_vec.cpu().numpy()
else:
return feature_vec.cpu()

def convert_position(self, cur, tot):
if cur < self.pos_abs_th_front or tot - cur < self.pos_abs_th_end:
return cur
else:
return cur * self.pos_vec_lookup.shape[0] // tot

def encode_position(self, cur_time, tot_time, img_vec):
if isinstance(img_vec, np.ndarray):
img_vec = torch.from_numpy(img_vec)
img_vec = img_vec.squeeze(0)
if self.pos_encoder != 'none':
pos_lookup_col = self.convert_position(cur_time, tot_time)
pos_vec = self.pos_vec_lookup[pos_lookup_col] * self.pos_vec_coeff
if self.pos_encoder == 'fractional':
pos = cur_time / tot_time
pos_vec = torch.tensor([pos]).to(img_vec.dtype)
return torch.concat((img_vec, pos_vec))
elif self.pos_encoder == 'sinusoidal-add':
return torch.add(img_vec, pos_vec)
elif self.pos_encoder == 'sinusoidal-concat':
return torch.concat((img_vec, pos_vec))
else:
return img_vec
pos_lookup_col = self.convert_position(cur_time, tot_time)
pos_vec = self.pos_vec_lookup[pos_lookup_col] * self.pos_vec_coeff
return torch.add(img_vec, pos_vec)
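To make the new thresholding concrete: assuming cur and tot arrive in pos_unit ticks (minutes under the defaults) and the lookup table built above has 100 positions, the defaults pos_abs_th_front=3 / pos_abs_th_end=10 behave as follows (fe is a hypothetical FeatureExtractor instance):

    fe.convert_position(1, 90)   # -> 1   within the first 3 units: absolute lookup
    fe.convert_position(85, 90)  # -> 85  within the last 10 units: absolute lookup
    fe.convert_position(45, 90)  # -> 50  otherwise rescaled: 45 * 100 // 90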

def feature_vector_dim(self):
if self.pos_encoder == 'sinusoidal-concat':
return self.img_encoder.dim + self.pos_dim
elif self.pos_encoder == 'fractional':
return self.img_encoder.dim + 1
else:
return self.img_encoder.dim

return self.img_encoder.dim

def get_full_feature_vectors(self, raw_img, cur_time, tot_time):
img_vecs = self.get_img_vector(raw_img, as_numpy=False)
return self.encode_position(cur_time, tot_time, img_vecs)
@@ -184,15 +163,15 @@ def __init__(self, model_name: str):
raise ValueError("No valid model found")
print(f'using model(s): {[model.img_encoder.name for model in self.models]}')

def process_video(self,
                  vid_path: Union[os.PathLike, str],
csv_path: Union[os.PathLike, str],) -> Tuple[Dict, Dict[str, np.ndarray]]:
"""Extract the features for every annotated timepoint in a video.

@param: vid_path = filename of the video
@param: csv_path = filename of the csv containing timepoints
@returns: A list of metadata dictionaries and associated feature matrix"""

frame_metadata = {'frames': []}
frame_vecs = defaultdict(list)
print(f'processing video: {vid_path}')
@@ -201,7 +180,7 @@ def process_video(self,
frame_metadata['guid'] = frame.guid
if 'duration' not in frame_metadata:
frame_metadata['duration'] = frame.total_time

for extractor in self.models:
frame_vecs[extractor.img_encoder.name].append(extractor.get_img_vector(frame.image, as_numpy=True))
frame_dict = {k: v for k, v in frame.__dict__.items() if k != "image" and k != "guid" and k != "total_time"}
@@ -211,10 +190,10 @@
frame_mats = {k: np.vstack(v) for k, v in frame_vecs.items()}
return frame_metadata, frame_mats

def get_stills(self, vid_path: Union[os.PathLike, str],
csv_path: Union[os.PathLike, str]) -> List[AnnotatedImage]:
"""Extract stills at given timepoints from a video file

@param: vid_path = the filename of the video
@param: timepoints = a list of the video's annotated timepoints
@return: a list of Frame objects"""
@@ -227,22 +206,22 @@ def get_stills(self, vid_path: Union[os.PathLike, str],
subtype_label=row[3],
mod=row[4].lower() == 'true') for row in reader if row[1] == 'true']
# CSV rows with mod=True should be discarded (taken as "unseen")
# maybe we can throw away the video with the least (88) frames annotation from B2 to make 20/20 split on dense vs sparse annotation

# this part is doing the same thing as the get_stills function in getstills.py
# (copied from https://github.com/WGBH-MLA/keystrokelabeler/blob/df4d2bc936fa3a73cdf3004803a0c35c290caf93/getstills.py#L36 )

container = av.open(vid_path)
video_stream = next((s for s in container.streams if s.type == 'video'), None)
if video_stream is None:
raise Exception("No video stream found in {}".format(vid_path))
fps = video_stream.average_rate.numerator / video_stream.average_rate.denominator
cur_target_frame = 0
fcount = 0
for frame in container.decode(video=0):
if cur_target_frame == len(frame_list):
break
ftime = int(frame.time * 1000)
if ftime == frame_list[cur_target_frame].curr_time:
frame_list[cur_target_frame].image = frame.to_image()
yield frame_list[cur_target_frame]
@@ -257,7 +236,7 @@ def main(args):
print('extractor ready')
feat_metadata, feat_mats = featurizer.process_video(in_file, metadata_file)
print('extraction complete')

if not os.path.exists(args.outdir):
os.makedirs(args.outdir, exist_ok=True)
with open(f"{args.outdir}/{feat_metadata['guid']}.json", 'w', encoding='utf8') as f:
7 changes: 3 additions & 4 deletions modeling/gridsearch.py
@@ -6,18 +6,17 @@
num_splits = {1}
num_epochs = {10}
num_layers = {4}
pos_enc_name = {"sinusoidal-concat"}
pos_unit = {60000}
pos_enc_dim = {256}
dropouts = {0.1}
# img_enc_name = modeling.backbones.model_map.keys()
img_enc_name = {'convnext_lg', 'convnext_tiny'}

# new search space for next rounds of positional encoding experiments
max_input_length = {6000000}
pos_length = {6000000}
pos_abs_th_front = {0, 3, 5, 10}
pos_abs_th_end = {0, 3, 5, 10}
pos_enc_coeff = {1, 0.75, 0.5, 0.25}
pos_vec_coeff = {0, 1, 0.75, 0.5, 0.25} # when 0, positional encoding is not enabled
block_guids_train = [
["cpb-aacip-254-75r7szdz"], # always block this the most "uninteresting" video (88/882 frames annotated)
]
@@ -50,7 +49,7 @@
# we no longer use bins, keeping this just for historical reference
# bins = [{'pre': {'slate': ['S'], 'chyron': ['I', 'N', 'Y'], 'credit': ['C']}}]

param_keys = ['num_splits', 'num_epochs', 'num_layers', 'pos_enc_name', 'input_length', 'pos_unit', 'pos_enc_dim', 'dropouts', 'img_enc_name', 'pos_abs_th_front', 'pos_abs_th_end', 'pos_enc_coeff', 'block_guids_train', 'block_guids_valid']
param_keys = ['num_splits', 'num_epochs', 'num_layers', 'pos_length', 'pos_unit', 'pos_enc_dim', 'dropouts', 'img_enc_name', 'pos_abs_th_front', 'pos_abs_th_end', 'pos_vec_coeff', 'block_guids_train', 'block_guids_valid']
l = locals()
configs = []
for vals in itertools.product(*[l[key] for key in param_keys]):
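Assuming the collapsed block_guids_* lists each contribute a single candidate, the singleton sets above contribute a factor of 1 each, so the sweep size is driven by the four multi-valued sets:

    len(pos_abs_th_front) * len(pos_abs_th_end) * len(pos_vec_coeff) * len(img_enc_name)
    # 4 * 4 * 5 * 2 = 160 configurations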
Binary file removed modeling/models/20240626-205715.convnext_lg.pt
Binary file not shown.
Binary file removed modeling/models/20240626-205803.convnext_tiny.pt
Binary file not shown.
Binary file not shown.
@@ -1,12 +1,14 @@
num_splits: 1
num_epochs: 10
num_layers: 4
pos_enc_name: sinusoidal-concat
max_input_length: 5640000
pos_length: 6000000
pos_unit: 60000
pos_enc_dim: 256
dropouts: 0.1
img_enc_name: convnext_lg
pos_abs_th_front: 3
pos_abs_th_end: 10
pos_vec_coeff: 0
block_guids_train:
- cpb-aacip-254-75r7szdz
block_guids_valid:
Binary file not shown.
35 changes: 35 additions & 0 deletions modeling/models/20240709-141946.convnext_lg.posT.yml
@@ -0,0 +1,35 @@
num_splits: 1
num_epochs: 10
num_layers: 4
pos_length: 6000000
pos_unit: 60000
pos_enc_dim: 256
dropouts: 0.1
img_enc_name: convnext_lg
pos_abs_th_front: 3
pos_abs_th_end: 10
pos_vec_coeff: 0.5
block_guids_train:
- cpb-aacip-254-75r7szdz
block_guids_valid:
- cpb-aacip-254-75r7szdz
- cpb-aacip-259-4j09zf95
- cpb-aacip-526-hd7np1xn78
- cpb-aacip-75-72b8h82x
- cpb-aacip-fe9efa663c6
- cpb-aacip-f5847a01db5
- cpb-aacip-f2a88c88d9d
- cpb-aacip-ec590a6761d
- cpb-aacip-c7c64922fcd
- cpb-aacip-f3fa7215348
- cpb-aacip-f13ae523e20
- cpb-aacip-e7a25f07d35
- cpb-aacip-ce6d5e4bd7f
- cpb-aacip-690722078b2
- cpb-aacip-e649135e6ec
- cpb-aacip-15-93gxdjk6
- cpb-aacip-512-4f1mg7h078
- cpb-aacip-512-4m9183583s
- cpb-aacip-512-4b2x34nt7g
- cpb-aacip-512-3n20c4tr34
- cpb-aacip-512-3f4kk9534t
Binary file not shown.
@@ -1,12 +1,14 @@
num_splits: 1
num_epochs: 10
num_layers: 4
pos_enc_name: sinusoidal-concat
max_input_length: 5640000
pos_length: 6000000
pos_unit: 60000
pos_enc_dim: 256
dropouts: 0.1
img_enc_name: convnext_tiny
pos_abs_th_front: 3
pos_abs_th_end: 10
pos_vec_coeff: 0
block_guids_train:
- cpb-aacip-254-75r7szdz
block_guids_valid:
Binary file not shown.
35 changes: 35 additions & 0 deletions modeling/models/20240709-142139.convnext_tiny.posT.yml
@@ -0,0 +1,35 @@
num_splits: 1
num_epochs: 10
num_layers: 4
pos_length: 6000000
pos_unit: 60000
pos_enc_dim: 256
dropouts: 0.1
img_enc_name: convnext_tiny
pos_abs_th_front: 3
pos_abs_th_end: 10
pos_vec_coeff: 0.5
block_guids_train:
- cpb-aacip-254-75r7szdz
block_guids_valid:
- cpb-aacip-254-75r7szdz
- cpb-aacip-259-4j09zf95
- cpb-aacip-526-hd7np1xn78
- cpb-aacip-75-72b8h82x
- cpb-aacip-fe9efa663c6
- cpb-aacip-f5847a01db5
- cpb-aacip-f2a88c88d9d
- cpb-aacip-ec590a6761d
- cpb-aacip-c7c64922fcd
- cpb-aacip-f3fa7215348
- cpb-aacip-f13ae523e20
- cpb-aacip-e7a25f07d35
- cpb-aacip-ce6d5e4bd7f
- cpb-aacip-690722078b2
- cpb-aacip-e649135e6ec
- cpb-aacip-15-93gxdjk6
- cpb-aacip-512-4f1mg7h078
- cpb-aacip-512-4m9183583s
- cpb-aacip-512-4b2x34nt7g
- cpb-aacip-512-3n20c4tr34
- cpb-aacip-512-3f4kk9534t