Skip to content

Commit 0bd5cc4

Browse files
committed
perf: refactor commit processing to use chunking for better memory efficiency
This change improves memory usage and scalability for repositories with large commit histories by:

- Replacing list(commits) with chunked processing to avoid loading the entire commit history into memory at once
- Using iterators directly to find the original commit author instead of loading all commits first
- Processing commits in manageable chunks based on the configured chunk_size, similar to how pull requests and issues are already handled
- Adding debug logging for commit processing progress

This approach ensures consistent memory usage regardless of repository size and prevents potential out-of-memory issues when analyzing repositories with extensive commit histories.

Signed-off-by: Zack Koppert <[email protected]>
1 parent 2d429f4 commit 0bd5cc4

File tree

2 files changed

+56
-23
lines changed

2 files changed

+56
-23
lines changed

measure_innersource.py

Lines changed: 51 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -164,11 +164,6 @@ def main(): # pragma: no cover
164164
innersource_contributors = []
165165
team_members_that_own_the_repo = []
166166

167-
# Get all commits for contribution counting (needed regardless of team determination method)
168-
logger.info("Fetching commits...")
169-
commits = repo_data.commits()
170-
commit_list = list(commits)
171-
172167
# Check if owning team is explicitly specified
173168
if owning_team:
174169
logger.info("Using explicitly specified owning team: %s", owning_team)
@@ -177,12 +172,26 @@ def main(): # pragma: no cover
177172
original_commit_author = None
178173
original_commit_author_manager = None
179174
else:
180-
logger.info("Analyzing first commit...")
181-
# Paginate to the last page to get the oldest commit
182-
# commits is a GitHubIterator, so you can use .count to get total,
183-
# then get the last one
184-
first_commit = commit_list[-1] # The last in the list is the oldest
185-
original_commit_author = first_commit.author.login
175+
logger.info("Finding original commit author...")
176+
# We need to find the oldest commit for team determination
177+
# Use GitHub's default chronological ordering (oldest first)
178+
commits_iterator = repo_data.commits()
179+
original_commit = None
180+
181+
# Process just enough commits to find the oldest one
182+
# Most repos will only need a single API call since GitHub sorts oldest first
183+
# For repositories with unusual commit ordering, we'll get the first commit from the first page
184+
try:
185+
original_commit = next(commits_iterator)
186+
original_commit_author = (
187+
original_commit.author.login
188+
if hasattr(original_commit.author, "login")
189+
else None
190+
)
191+
logger.info("Found original commit by %s", original_commit_author)
192+
except StopIteration:
193+
logger.warning("No commits found in repository")
194+
original_commit_author = None
186195

187196
# Check if original commit author exists in org chart
188197
if original_commit_author not in org_data:
@@ -267,13 +276,38 @@ def main(): # pragma: no cover
267276

268277
logger.info("Pre-processing contribution data...")
269278

270-
# Create mapping of commit authors to commit counts
271-
logger.info("Processing commits...")
279+
# Process commits in chunks
280+
logger.info("Processing commits in chunks...")
272281
commit_author_counts = {}
273-
for commit in commit_list:
274-
if hasattr(commit.author, "login"):
275-
author = commit.author.login
276-
commit_author_counts[author] = commit_author_counts.get(author, 0) + 1
282+
total_commits = 0
283+
284+
# GitHub API returns an iterator that internally handles pagination
285+
# We'll manually chunk it to avoid loading everything at once
286+
commits_iterator = repo_data.commits()
287+
while True:
288+
# Process a chunk of commits
289+
chunk = []
290+
for _ in range(chunk_size):
291+
try:
292+
chunk.append(next(commits_iterator))
293+
except StopIteration:
294+
break
295+
296+
if not chunk:
297+
break
298+
299+
# Update counts for this chunk
300+
for commit in chunk:
301+
if hasattr(commit.author, "login"):
302+
author = commit.author.login
303+
commit_author_counts[author] = (
304+
commit_author_counts.get(author, 0) + 1
305+
)
306+
307+
total_commits += len(chunk)
308+
logger.debug(" Processed %s commits so far...", total_commits)
309+
310+
logger.info("Found and processed %s commits", total_commits)
277311

278312
# Process pull requests in chunks
279313
logger.info("Processing pull requests in chunks...")

test_measure_innersource.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -147,13 +147,12 @@ def test_main_missing_user_in_org_chart(tmp_path, monkeypatch):
147147
call[0][0] for call in mock_logger.info.call_args_list if call[0]
148148
]
149149

150-
# Should have logged about reading org data and analyzing first
151-
# commit, but should NOT have logged about original commit author
152-
# with manager
150+
# Should have logged about reading org data and finding original commit author
151+
# but should NOT have logged about original commit author with manager
153152
assert "Reading in org data from org-data.json..." in info_calls
154-
assert "Analyzing first commit..." in info_calls
155-
156-
# Should NOT contain the log message about
153+
assert (
154+
"Finding original commit author..." in info_calls
155+
) # Should NOT contain the log message about
157156
# "Original commit author: X, with manager: Y"
158157
assert not any(
159158
isinstance(msg, str)

0 commit comments

Comments (0)