remyxai · smellslikeml · Feb 24, 2025 · Dec 6, 2024 · Feb 18, 2025 · Feb 18, 2025
diff --git a/docker/location_refinement_stage/Dockerfile b/docker/location_refinement_stage/Dockerfile
@@ -1,6 +1,8 @@
 FROM vqasynth:base
 WORKDIR /app
 
+ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+
 COPY docker/location_refinement_stage/process_location_refinement.py /app
 COPY docker/location_refinement_stage/entrypoint.sh /app
 COPY config/config.yaml /app/config/config.yaml

diff --git a/docker/location_refinement_stage/process_location_refinement.py b/docker/location_refinement_stage/process_location_refinement.py
@@ -4,37 +4,46 @@
 import argparse
 import numpy as np
 import pandas as pd
+
 from vqasynth.datasets import Dataloader
 from vqasynth.localize import Localizer
 from vqasynth.utils import filter_null
 
 def main(output_dir, source_repo_id, images):
+    # 1) Instantiate the Dataloader
     dataloader = Dataloader(output_dir)
-    localizer = Localizer()
+
+    # 2) Create the Localizer with the Florence captioner + SAM2 segmenter.
+    #    Any SAM2 model variant can be passed as segmenter_model,
+    #    e.g. "facebook/sam2-hiera-small", "facebook/sam2-hiera-large", etc.
+    localizer = Localizer(
+        captioner_type="florence",
+        segmenter_model="facebook/sam2-hiera-small"
+    )
 
-    # Load dataset
+    # 3) Load the dataset
     dataset = dataloader.load_dataset(source_repo_id)
 
-    # Apply the localizer transformation with batching
+    # 4) Apply the localizer transformation in batched mode, one example per batch
     dataset = dataset.map(
         localizer.apply_transform,
         fn_kwargs={'images': images},
         batched=True,
-        batch_size=32
+        batch_size=1,
     )
 
-    # Filter out nulls with the updated filter_null function
+    # 5) Filter out nulls
     dataset = dataset.filter(filter_null, batched=True, batch_size=32)
 
-    # Save the processed dataset to disk
+    # 6) Save the processed dataset
     dataloader.save_to_disk(dataset)
 
     print("Localization complete")
 
-
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description="Localize and describe objects in images", add_help=True
+        description="Localize and describe objects in images", 
+        add_help=True
     )
     parser.add_argument(
         "--output_dir",
@@ -57,3 +66,4 @@ def main(output_dir, source_repo_id, images):
     args = parser.parse_args()
 
     main(args.output_dir, args.source_repo_id, args.images)
+
diff --git a/requirements.txt b/requirements.txt
@@ -21,3 +21,6 @@ pandas==2.2.3
 html5lib==1.1
 datasets==3.1.0
 spacy==3.7.5
+bitsandbytes
+peft
+tensorflow