From a23474ee69cbcac544dc3d698cf3b6828f9a50e5 Mon Sep 17 00:00:00 2001 From: Jianqi Pan Date: Thu, 20 Jun 2024 16:00:29 +0900 Subject: [PATCH] feat: add an option for ignoring images that already have a caption file --- finetune/make_captions.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/finetune/make_captions.py b/finetune/make_captions.py index 489bdbcce..bca85dd32 100644 --- a/finetune/make_captions.py +++ b/finetune/make_captions.py @@ -118,6 +118,22 @@ def run_batch(path_imgs): if args.debug: logger.info(f'{image_path} {caption}') + def filter_image_paths(image_paths, extension=".caption"): + """ + Filter out image paths that already have a .caption file with the same name + """ + filtered_paths = [] + for image_path in image_paths: + caption_path = os.path.splitext(image_path)[0] + extension + if not os.path.exists(caption_path): + filtered_paths.append(image_path) + return filtered_paths + + # すでにキャプションがある画像を除外 + if args.ignore_existing: + logger.info("ignoring images that already have a caption file") + image_paths = filter_image_paths(image_paths, args.caption_extension) + # 読み込みの高速化のためにDataLoaderを使うオプション if args.max_data_loader_n_workers is not None: dataset = ImageLoadingTransformDataset(image_paths) @@ -194,7 +210,7 @@ def setup_parser() -> argparse.ArgumentParser: parser.add_argument("--seed", default=42, type=int, help="seed for reproducibility / 再現性を確保するための乱数seed") parser.add_argument("--debug", action="store_true", help="debug mode") parser.add_argument("--recursive", action="store_true", help="search for images in subfolders recursively / サブフォルダを再帰的に検索する") - + parser.add_argument("--ignore_existing", action="store_true", help="ignore images that already have a caption file / すでにキャプションファイルがある画像を無視する") return parser