Merge pull request #55 from github/copilot/fix-d2184ad1-8c09-4dca-99dd-740c3fae4b09

zkoppert · web-flow · commit ef88aacdb7c7 · 2025-09-28T17:40:03.000-07:00
feat: Replace print() statements with proper logging framework
diff --git a/config.py b/config.py
@@ -13,6 +13,7 @@
 import os
 from os.path import dirname, join
 
+from constants import DEFAULT_CHUNK_SIZE, MIN_CHUNK_SIZE
 from dotenv import load_dotenv
 
 
@@ -233,14 +234,14 @@ def get_env_vars(test: bool = False) -> EnvVars:
     rate_limit_bypass = get_bool_env_var("RATE_LIMIT_BYPASS", False)
 
     # Get the chunk size for processing data in batches (for memory efficiency)
-    chunk_size_str = os.getenv("CHUNK_SIZE", "100")
+    chunk_size_str = os.getenv("CHUNK_SIZE", str(DEFAULT_CHUNK_SIZE))
     try:
         chunk_size = int(chunk_size_str)
         # Ensure a reasonable minimum chunk size
-        chunk_size = max(chunk_size, 10)
+        chunk_size = max(chunk_size, MIN_CHUNK_SIZE)
     except ValueError:
-        # Default to 100 if not a valid integer
-        chunk_size = 100
+        # Default to DEFAULT_CHUNK_SIZE if not a valid integer
+        chunk_size = DEFAULT_CHUNK_SIZE
 
     return EnvVars(
         gh_app_id,
diff --git a/constants.py b/constants.py
@@ -0,0 +1,14 @@
+"""Constants used throughout the InnerSource measurement tool.
+
+This module defines commonly used constants to avoid magic values
+and improve code maintainability.
+"""
+
+# GitHub issue body character limit; reports larger than this are split into parts
+GITHUB_ISSUE_BODY_MAX_CHARS = 65535
+
+# Chunk size for batch processing when CHUNK_SIZE is unset or not a valid integer
+DEFAULT_CHUNK_SIZE = 100
+
+# Minimum allowed chunk size; smaller configured values are raised to this
+MIN_CHUNK_SIZE = 10
diff --git a/logging_config.py b/logging_config.py
@@ -0,0 +1,58 @@
+"""Logging configuration and utilities for the InnerSource measurement tool.
+
+This module provides centralized logging configuration to replace
+print statements with proper logging levels.
+"""
+
+import logging
+import sys
+
+
+def setup_logging(level: str = "INFO") -> logging.Logger:
+    """Configure and return a logger for the InnerSource measurement tool.
+
+    Args:
+        level (str): Level name (DEBUG, INFO, WARNING, ERROR, CRITICAL), any case.
+                    Falls back to INFO; ignored if handlers are already attached.
+
+    Returns:
+        logging.Logger: Configured logger instance
+    """
+    # Create logger
+    logger = logging.getLogger("innersource_measure")
+
+    # Avoid adding multiple handlers if logger is already configured
+    if logger.handlers:
+        return logger
+
+    # Resolve the level name. getattr can also return non-level attributes of
+    # the logging module (e.g. the BASIC_FORMAT string), which setLevel would
+    # reject, so only integer levels are accepted.
+    numeric_level = getattr(logging, level.upper(), logging.INFO)
+    if not isinstance(numeric_level, int):
+        numeric_level = logging.INFO
+    logger.setLevel(numeric_level)
+
+    # Console handler writing to stdout with a timestamped format
+    console_handler = logging.StreamHandler(sys.stdout)
+    console_handler.setLevel(numeric_level)
+    formatter = logging.Formatter(
+        fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+    console_handler.setFormatter(formatter)
+    logger.addHandler(console_handler)
+
+    # Prevent propagation to avoid duplicate messages
+    logger.propagate = False
+
+    return logger
+
+
+def get_logger() -> logging.Logger:
+    """Return the application's shared logger.
+
+    The logger is looked up by its fixed name, "innersource_measure", which is
+    the same name setup_logging() configures.
+    """
+    return logging.getLogger(name="innersource_measure")
diff --git a/measure_innersource.py b/measure_innersource.py
@@ -12,6 +12,8 @@
 
 from auth import auth_to_github, get_github_app_installation_token
 from config import get_env_vars
+from constants import GITHUB_ISSUE_BODY_MAX_CHARS
+from logging_config import get_logger, setup_logging
 from markdown_helpers import markdown_too_large_for_issue_body, split_markdown_file
 from markdown_writer import write_to_markdown
 
@@ -38,15 +40,20 @@ def evaluate_markdown_file_size(output_file: str) -> None:
     """
     output_file_name = output_file if output_file else "innersource_report.md"
     file_name_without_extension = Path(output_file_name).stem
-    max_char_count = 65535
+    max_char_count = GITHUB_ISSUE_BODY_MAX_CHARS
+    logger = get_logger()
+
     if markdown_too_large_for_issue_body(output_file_name, max_char_count):
         split_markdown_file(output_file_name, max_char_count)
         shutil.move(output_file_name, f"{file_name_without_extension}_full.md")
         shutil.move(f"{file_name_without_extension}_0.md", output_file_name)
-        print(
-            f"The markdown file is too large for GitHub issue body and has been \
-split into multiple files. ie. {output_file_name}, {file_name_without_extension}_1.md, etc. \
-The full file is saved as {file_name_without_extension}_full.md\n"
+        logger.info(
+            "The markdown file is too large for GitHub issue body and has been "
+            "split into multiple files. ie. %s, %s_1.md, etc. "
+            "The full file is saved as %s_full.md\n",
+            output_file_name,
+            file_name_without_extension,
+            file_name_without_extension,
         )
 
 
@@ -82,7 +89,9 @@ def main():  # pragma: no cover
         - Requires org-data.json file to be present in the current directory
     """
 
-    print("Starting innersource-measure tool...")
+    # Initialize logging
+    logger = setup_logging()
+    logger.info("Starting innersource-measure tool...")
 
     # Get the environment variables for use in the script
     env_vars = get_env_vars()
@@ -117,50 +126,57 @@ def main():  # pragma: no cover
     # evaluate_markdown_file_size(output_file)
 
     if github_connection:
-        print("connection successful")
+        logger.info("Connection to GitHub successful")
 
         # fetch repository data
-        print(f"Fetching repository data for {owner}/{repo}...")
+        logger.info("Fetching repository data for %s/%s...", owner, repo)
         repo_data = github_connection.repository(owner, repo)
         if not repo_data:
-            print(f"Unable to fetch repository {owner}/{repo} specified. Exiting.")
+            logger.error(
+                "Unable to fetch repository %s/%s specified. Exiting.", owner, repo
+            )
             return
 
-        print(f"Repository {repo_data.full_name} found.")
+        logger.info("Repository %s found.", repo_data.full_name)
 
         # Read in the org data in org-data.json
         org_data = None
         org_data_path = Path("org-data.json")
         if org_data_path.exists():
-            print("Reading in org data from org-data.json...")
+            logger.info("Reading in org data from org-data.json...")
             with open(org_data_path, "r", encoding="utf-8") as org_file:
                 org_data = json.load(org_file)
-            print("Org data read successfully.")
+            logger.info("Org data read successfully.")
         else:
-            print("No org data found. InnerSource collaboration cannot be measured.")
+            logger.warning(
+                "No org data found. InnerSource collaboration cannot be measured."
+            )
 
         if org_data:
-            print("Org data found. Measuring InnerSource collaboration...")
+            logger.info("Org data found. Measuring InnerSource collaboration...")
         else:
-            print("No org data found. InnerSource collaboration cannot be measured.")
+            logger.error(
+                "No org data found. InnerSource collaboration cannot be measured."
+            )
             return
 
         # Initialize contributor lists and team members list
         all_contributors = []
         innersource_contributors = []
         team_members_that_own_the_repo = []
 
-        print("Analyzing first commit...")
+        logger.info("Analyzing first commit...")
         commits = repo_data.commits()
         # Paginate to the last page to get the oldest commit
         # commits is a GitHubIterator, so you can use .count to get total, then get the last one
         commit_list = list(commits)
         first_commit = commit_list[-1]  # The last in the list is the oldest
         original_commit_author = first_commit.author.login
         original_commit_author_manager = org_data[original_commit_author]["manager"]
-        print(
-            f"Original commit author: {original_commit_author}, \
-with manager: {original_commit_author_manager}"
+        logger.info(
+            "Original commit author: %s, with manager: %s",
+            original_commit_author,
+            original_commit_author_manager,
         )
         # Create a dictionary mapping users to their managers for faster lookups
         user_to_manager = {}
@@ -195,11 +211,13 @@ def main():  # pragma: no cover
 
         # Remove duplicates from the team members list
         team_members_that_own_the_repo = list(set(team_members_that_own_the_repo))
-        print(f"Team members that own the repo: {team_members_that_own_the_repo}")
+        logger.debug(
+            "Team members that own the repo: %s", team_members_that_own_the_repo
+        )
 
         # For each contributor, check if they are in the team that owns the repo list
         # and if not, add them to the innersource contributors list
-        print("Analyzing all contributors in the repository...")
+        logger.info("Analyzing all contributors in the repository...")
         for contributor in repo_data.contributors():
             all_contributors.append(contributor.login)
             if (
@@ -208,25 +226,25 @@ def main():  # pragma: no cover
             ):
                 innersource_contributors.append(contributor.login)
 
-        print(f"All contributors: {all_contributors}")
-        print(f"Innersource contributors: {innersource_contributors}")
+        logger.debug("All contributors: %s", all_contributors)
+        logger.debug("Innersource contributors: %s", innersource_contributors)
 
         # Process data in chunks to avoid memory issues while maintaining performance
         chunk_size = env_vars.chunk_size
-        print(f"Using chunk size of {chunk_size} for data processing")
+        logger.info("Using chunk size of %s for data processing", chunk_size)
 
-        print("Pre-processing contribution data...")
+        logger.info("Pre-processing contribution data...")
 
         # Create mapping of commit authors to commit counts
-        print("Processing commits...")
+        logger.info("Processing commits...")
         commit_author_counts = {}
         for commit in commit_list:
             if hasattr(commit.author, "login"):
                 author = commit.author.login
                 commit_author_counts[author] = commit_author_counts.get(author, 0) + 1
 
         # Process pull requests in chunks
-        print("Processing pull requests in chunks...")
+        logger.info("Processing pull requests in chunks...")
         pr_author_counts = {}
         total_prs = 0
 
@@ -252,12 +270,12 @@ def main():  # pragma: no cover
                     pr_author_counts[author] = pr_author_counts.get(author, 0) + 1
 
             total_prs += len(chunk)
-            print(f"  Processed {total_prs} pull requests so far...")
+            logger.debug("  Processed %s pull requests so far...", total_prs)
 
-        print(f"Found and processed {total_prs} pull requests")
+        logger.info("Found and processed %s pull requests", total_prs)
 
         # Process issues in chunks
-        print("Processing issues in chunks...")
+        logger.info("Processing issues in chunks...")
         issue_author_counts = {}
         total_issues = 0
 
@@ -283,13 +301,13 @@ def main():  # pragma: no cover
                     issue_author_counts[author] = issue_author_counts.get(author, 0) + 1
 
             total_issues += len(chunk)
-            print(f"  Processed {total_issues} issues so far...")
+            logger.debug("  Processed %s issues so far...", total_issues)
 
-        print(f"Found and processed {total_issues} issues")
+        logger.info("Found and processed %s issues", total_issues)
 
         # Count contributions for each innersource contributor using precompiled dictionaries
         innersource_contribution_counts = {}
-        print("Counting contributions for each innersource contributor...")
+        logger.info("Counting contributions for each innersource contributor...")
         for contributor in innersource_contributors:
             # Initialize counter for this contributor
             innersource_contribution_counts[contributor] = 0
@@ -309,13 +327,13 @@ def main():  # pragma: no cover
                 contributor, 0
             )
 
-        print("Innersource contribution counts:")
+        logger.debug("Innersource contribution counts:")
         for contributor, count in innersource_contribution_counts.items():
-            print(f"  {contributor}: {count} contributions")
+            logger.debug("  %s: %s contributions", contributor, count)
 
         # Count contributions for each team member using precompiled dictionaries
         team_member_contribution_counts = {}
-        print("Counting contributions for each team member that owns the repo...")
+        logger.info("Counting contributions for each team member that owns the repo...")
         for member in team_members_that_own_the_repo:
             # Initialize counter for this team member
             team_member_contribution_counts[member] = 0
@@ -333,10 +351,10 @@ def main():  # pragma: no cover
                 member, 0
             )
 
-        print("Team member contribution counts:")
+        logger.debug("Team member contribution counts:")
         for member, count in team_member_contribution_counts.items():
             if count > 0:
-                print(f"  {member}: {count} contributions")
+                logger.debug("  %s: %s contributions", member, count)
 
         # Calculate the ratio of innersource contributions to total contributions
         total_contributions = sum(innersource_contribution_counts.values()) + sum(
@@ -349,7 +367,7 @@ def main():  # pragma: no cover
         else:
             innersource_ratio = 0
 
-        print(f"Innersource contribution ratio: {innersource_ratio:.2%}")
+        logger.info("Innersource contribution ratio: %.2f%%", innersource_ratio * 100)
 
         # Write the results to a markdown file using report_title and output_file
         write_to_markdown(
@@ -367,10 +385,10 @@ def main():  # pragma: no cover
         )
 
         evaluate_markdown_file_size(output_file)
-        print(f"InnerSource report written to {output_file}")
+        logger.info("InnerSource report written to %s", output_file)
 
     else:
-        print("Failed to connect to GitHub. Exiting.")
+        logger.error("Failed to connect to GitHub. Exiting.")
 
 
 if __name__ == "__main__":
diff --git a/test_constants.py b/test_constants.py
@@ -0,0 +1,33 @@
+"""Tests for constants.py"""
+
+import unittest
+
+from constants import DEFAULT_CHUNK_SIZE, GITHUB_ISSUE_BODY_MAX_CHARS, MIN_CHUNK_SIZE
+
+
+class TestConstants(unittest.TestCase):
+    """Sanity checks for the values exported by constants.py"""
+
+    def test_github_issue_body_max_chars(self):
+        """Test that the GitHub issue body limit constant is correct"""
+        self.assertEqual(GITHUB_ISSUE_BODY_MAX_CHARS, 65535)
+
+    def test_default_chunk_size(self):
+        """Test that the default chunk size constant is correct"""
+        self.assertEqual(DEFAULT_CHUNK_SIZE, 100)
+
+    def test_min_chunk_size(self):
+        """Test that the minimum chunk size constant is correct"""
+        self.assertEqual(MIN_CHUNK_SIZE, 10)
+
+    def test_constants_are_integers(self):
+        """Test that all constants are integers"""
+        self.assertIsInstance(GITHUB_ISSUE_BODY_MAX_CHARS, int)
+        self.assertIsInstance(DEFAULT_CHUNK_SIZE, int)
+        self.assertIsInstance(MIN_CHUNK_SIZE, int)
+
+    def test_chunk_size_relationships(self):
+        """Test that the chunk size bounds are positive and correctly ordered"""
+        self.assertLessEqual(MIN_CHUNK_SIZE, DEFAULT_CHUNK_SIZE)
+        self.assertGreater(MIN_CHUNK_SIZE, 0)
+        self.assertGreater(DEFAULT_CHUNK_SIZE, 0)
diff --git a/test_logging_config.py b/test_logging_config.py
diff --git a/test_markdown_helpers.py b/test_markdown_helpers.py