Skip to content

Commit

Permalink
[GConstruct] Allow directory paths as input to GConstruct (#1130)
Browse files Browse the repository at this point in the history
*Issue #, if available:*

*Description of changes:*

* Allow a single directory path as input to GConstruct (it is expanded to
the files it contains), bringing GConstruct to parity with GSProcessing.

By submitting this pull request, I confirm that you can use, modify,
copy, and redistribute this contribution, under the terms of your
choice.
  • Loading branch information
thvasilo authored Jan 10, 2025
1 parent ddae1f3 commit 02ece9b
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 2 deletions.
2 changes: 2 additions & 0 deletions python/graphstorm/gconstruct/file_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@ def expand_wildcard(data_files: List[str]) -> List[str]:
"""
expanded_files = []
if len(data_files) == 1 and os.path.isdir(data_files[0]):
data_files = [os.path.join(data_files[0], "*")]
for item in data_files:
if '*' in item:
matched_files = sorted(glob.glob(item))
Expand Down
18 changes: 16 additions & 2 deletions tests/unit-tests/gconstruct/test_gconstruct_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
read_data_hdf5,
get_in_files,
write_data_parquet)
from graphstorm.gconstruct.file_io import read_index, write_index_json
from graphstorm.gconstruct.file_io import read_index, write_index_json, expand_wildcard
from graphstorm.gconstruct.file_io import (read_data_csv,
read_data_json,
read_data_parquet)
Expand Down Expand Up @@ -502,7 +502,21 @@ def test_read_index():
assert train_content == [("p1", "p3"), ("p2", "p4")]
assert test_content == [("p5", "p7"), ("p6", "p8")]


def test_single_directory_expansion():
    """Check that ``expand_wildcard`` returns the files of a lone directory input."""
    file_names = ('file1.txt', 'file2.txt')

    with tempfile.TemporaryDirectory() as workdir:
        # Populate the scratch directory, remembering every path we create.
        expected_paths = []
        for name in file_names:
            path = os.path.join(workdir, name)
            with open(path, 'w') as handle:
                handle.write('test')
            expected_paths.append(path)
        expected_paths.sort()

        # Pass the directory itself as the only entry of the input list.
        expanded = expand_wildcard([workdir])

        # Order is not part of the check, so compare sorted listings.
        assert sorted(expanded) == expected_paths

if __name__ == '__main__':
test_shuffle_hard_nids()
Expand Down

0 comments on commit 02ece9b

Please sign in to comment.