From 02ece9bf8ee46b8e684825762c5d6d55337a7f3d Mon Sep 17 00:00:00 2001 From: Theodore Vasiloudis Date: Thu, 9 Jan 2025 16:23:32 -0800 Subject: [PATCH] [GConstruct] Allow directory paths as input to GConstruct (#1130) *Issue #, if available:* *Description of changes:* * Allow directories as input to GConstruct, bringing it level with GSProcessing. By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice. --- python/graphstorm/gconstruct/file_io.py | 2 ++ .../gconstruct/test_gconstruct_utils.py | 18 ++++++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/python/graphstorm/gconstruct/file_io.py b/python/graphstorm/gconstruct/file_io.py index 31fce06945..baa5f5e27b 100644 --- a/python/graphstorm/gconstruct/file_io.py +++ b/python/graphstorm/gconstruct/file_io.py @@ -88,6 +88,8 @@ def expand_wildcard(data_files: List[str]) -> List[str]: """ expanded_files = [] + if len(data_files) == 1 and os.path.isdir(data_files[0]): + data_files = [os.path.join(data_files[0], "*")] for item in data_files: if '*' in item: matched_files = sorted(glob.glob(item)) diff --git a/tests/unit-tests/gconstruct/test_gconstruct_utils.py b/tests/unit-tests/gconstruct/test_gconstruct_utils.py index 3c3c811857..770445b5dd 100644 --- a/tests/unit-tests/gconstruct/test_gconstruct_utils.py +++ b/tests/unit-tests/gconstruct/test_gconstruct_utils.py @@ -39,7 +39,7 @@ read_data_hdf5, get_in_files, write_data_parquet) -from graphstorm.gconstruct.file_io import read_index, write_index_json +from graphstorm.gconstruct.file_io import read_index, write_index_json, expand_wildcard from graphstorm.gconstruct.file_io import (read_data_csv, read_data_json, read_data_parquet) @@ -502,7 +502,21 @@ def test_read_index(): assert train_content == [("p1", "p3"), ("p2", "p4")] assert test_content == [("p5", "p7"), ("p6", "p8")] - +def test_single_directory_expansion(): + # Create a temporary directory + with tempfile.TemporaryDirectory() as temp_dir: + # Create some test files in the directory + test_files = ['file1.txt', 'file2.txt'] + for file_name in test_files: + with open(os.path.join(temp_dir, file_name), 'w') as f: + f.write('test') + + # Test the function with a single directory + result = expand_wildcard([temp_dir]) + + # Sort both lists for comparison + expected_files = sorted([os.path.join(temp_dir, f) for f in test_files]) + assert sorted(result) == expected_files if __name__ == '__main__': test_shuffle_hard_nids()