Skip to content

Commit 0bd5cc4

Browse files
committed
perf: refactor commit processing to use chunking for better memory efficiency
This change improves memory usage and scalability for repositories with large commit histories by:

- Replacing list(commits) with chunked processing to avoid loading the entire commit history into memory at once
- Using iterators directly to find the original commit author instead of loading all commits first
- Processing commits in manageable chunks based on the configured chunk_size, similar to how pull requests and issues are already handled
- Adding debug logging for commit processing progress

This approach ensures consistent memory usage regardless of repository size and prevents potential out-of-memory issues when analyzing repositories with extensive commit histories.

Signed-off-by: Zack Koppert <[email protected]>
1 parent 2d429f4 commit 0bd5cc4

File tree

2 files changed

+56
-23
lines changed

2 files changed

+56
-23
lines changed

measure_innersource.py

Lines changed: 51 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -164,11 +164,6 @@ def main(): # pragma: no cover
164164
innersource_contributors = []
165165
team_members_that_own_the_repo = []
166166

167-
# Get all commits for contribution counting (needed regardless of team determination method)
168-
logger.info("Fetching commits...")
169-
commits = repo_data.commits()
170-
commit_list = list(commits)
171-
172167
# Check if owning team is explicitly specified
173168
if owning_team:
174169
logger.info("Using explicitly specified owning team: %s", owning_team)
@@ -177,12 +172,26 @@ def main(): # pragma: no cover
177172
original_commit_author = None
178173
original_commit_author_manager = None
179174
else:
180-
logger.info("Analyzing first commit...")
181-
# Paginate to the last page to get the oldest commit
182-
# commits is a GitHubIterator, so you can use .count to get total,
183-
# then get the last one
184-
first_commit = commit_list[-1] # The last in the list is the oldest
185-
original_commit_author = first_commit.author.login
175+
logger.info("Finding original commit author...")
176+
# We need to find the oldest commit for team determination
177+
# Use GitHub's default chronological ordering (oldest first)
178+
commits_iterator = repo_data.commits()
179+
original_commit = None
180+
181+
# Process just enough commits to find the oldest one
182+
# Most repos will only need a single API call since GitHub sorts oldest first
183+
# For repositories with unusual commit ordering, we'll get the first commit from the first page
184+
try:
185+
original_commit = next(commits_iterator)
186+
original_commit_author = (
187+
original_commit.author.login
188+
if hasattr(original_commit.author, "login")
189+
else None
190+
)
191+
logger.info("Found original commit by %s", original_commit_author)
192+
except StopIteration:
193+
logger.warning("No commits found in repository")
194+
original_commit_author = None
186195

187196
# Check if original commit author exists in org chart
188197
if original_commit_author not in org_data:
@@ -267,13 +276,38 @@ def main(): # pragma: no cover
267276

268277
logger.info("Pre-processing contribution data...")
269278

270-
# Create mapping of commit authors to commit counts
271-
logger.info("Processing commits...")
279+
# Process commits in chunks
280+
logger.info("Processing commits in chunks...")
272281
commit_author_counts = {}
273-
for commit in commit_list:
274-
if hasattr(commit.author, "login"):
275-
author = commit.author.login
276-
commit_author_counts[author] = commit_author_counts.get(author, 0) + 1
282+
total_commits = 0
283+
284+
# GitHub API returns an iterator that internally handles pagination
285+
# We'll manually chunk it to avoid loading everything at once
286+
commits_iterator = repo_data.commits()
287+
while True:
288+
# Process a chunk of commits
289+
chunk = []
290+
for _ in range(chunk_size):
291+
try:
292+
chunk.append(next(commits_iterator))
293+
except StopIteration:
294+
break
295+
296+
if not chunk:
297+
break
298+
299+
# Update counts for this chunk
300+
for commit in chunk:
301+
if hasattr(commit.author, "login"):
302+
author = commit.author.login
303+
commit_author_counts[author] = (
304+
commit_author_counts.get(author, 0) + 1
305+
)
306+
307+
total_commits += len(chunk)
308+
logger.debug(" Processed %s commits so far...", total_commits)
309+
310+
logger.info("Found and processed %s commits", total_commits)
277311

278312
# Process pull requests in chunks
279313
logger.info("Processing pull requests in chunks...")

test_measure_innersource.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -147,13 +147,12 @@ def test_main_missing_user_in_org_chart(tmp_path, monkeypatch):
147147
call[0][0] for call in mock_logger.info.call_args_list if call[0]
148148
]
149149

150-
# Should have logged about reading org data and analyzing first
151-
# commit, but should NOT have logged about original commit author
152-
# with manager
150+
# Should have logged about reading org data and finding original commit author
151+
# but should NOT have logged about original commit author with manager
153152
assert "Reading in org data from org-data.json..." in info_calls
154-
assert "Analyzing first commit..." in info_calls
155-
156-
# Should NOT contain the log message about
153+
assert (
154+
"Finding original commit author..." in info_calls
155+
) # Should NOT contain the log message about
157156
# "Original commit author: X, with manager: Y"
158157
assert not any(
159158
isinstance(msg, str)

0 commit comments

Comments (0)