Skip to content

Improve git history treemap visualizations and uncover pairwise changed files #352

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Apr 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// Copy the already existing "CHANGED_TOGETHER_WITH" relationships between git files to the file nodes they resolve to.
// Requires "Add_CHANGED_TOGETHER_WITH_relationships_to_git_files".

// The relationship is matched without direction, so every pair is found twice.
// Comparing the element ids keeps exactly one row per pair.
 MATCH (firstGitFile:Git&File&!Repository)-[gitChange:CHANGED_TOGETHER_WITH]-(secondGitFile:Git&File&!Repository)
 WHERE elementId(firstGitFile) < elementId(secondGitFile)
 MATCH (firstGitFile)-[:RESOLVES_TO]->(firstCodeFile:File&!Git&!Repository)
 MATCH (secondGitFile)-[:RESOLVES_TO]->(secondCodeFile:File&!Git&!Repository)
// Two different git files may resolve to the same file node.
// Skip those pairs so that no file gets a "CHANGED_TOGETHER_WITH" relationship to itself.
 WHERE firstCodeFile <> secondCodeFile
// Merge the relationship (regardless of its direction) and copy all properties of the git file relationship onto it.
// The writes are committed in batches by separate transactions.
  CALL (firstCodeFile, secondCodeFile, gitChange) {
      MERGE (firstCodeFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondCodeFile)
        SET pairwiseChange = properties(gitChange)
  } IN TRANSACTIONS
// Return the number of processed file pairs
RETURN count(*) AS pairCount
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// Connect git files that were changed together frequently with "CHANGED_TOGETHER_WITH"

// Count all commits first. The count is the reference for the "changed together often enough" threshold below.
 MATCH (global_git_commit:Git:Commit)
  WITH count(global_git_commit) AS globalCommitCount
// Find all files that were updated by a commit and that belong to a git repository
 MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change:Update)-[:UPDATES]->(git_file:Git:File)
 MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file)
 WHERE git_file.deletedAt IS NULL
// Order files to assure, that pairs of distinct files are grouped together (fileA, fileB) without (fileB, fileA)
 ORDER BY git_commit.sha, git_file.relativePath
  WITH globalCommitCount
      ,git_commit.sha            AS commitHash
      ,collect(DISTINCT git_file) AS filesInCommit
// Limit the file count to min. 2 (changed together) and
// max. 50 (reduce permutations, improve performance, filter out large refactorings that usually affect many files)
 WHERE size(filesInCommit) >= 2
   AND size(filesInCommit) <= 50
// Collect distinct pairwise (..., 2, 2) combinations of all files in the list
  WITH globalCommitCount
      ,commitHash
      ,apoc.coll.combinations(filesInCommit, 2, 2) AS fileCombinations
UNWIND fileCombinations AS fileCombination
// Group by file pair and aggregate the commits in which both files were changed
  WITH globalCommitCount
      ,fileCombination
      ,count(DISTINCT commitHash)   AS commitCount
      ,collect(DISTINCT commitHash) AS commitHashes
// Filter out file pairs that were not changed together very often
// In detail: More than 0.1 percent (= 1 per mille) of the overall commit count
 WHERE commitCount > globalCommitCount * 0.001
  WITH fileCombination[0] AS firstFile
      ,fileCombination[1] AS secondFile
      ,commitCount
      ,commitHashes
// Create the new relationship "CHANGED_TOGETHER_WITH" (if it doesn't exist yet, regardless of its direction)
// and set the properties "commitCount" and "commitHashes" on it.
// The writes are committed in batches by separate transactions.
  CALL (firstFile, secondFile, commitCount, commitHashes) {
      MERGE (firstFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile)
        SET pairwiseChange.commitCount  = commitCount
           ,pairwiseChange.commitHashes = commitHashes
  } IN TRANSACTIONS
// Return one row with some statistics about the found pairs and their commit counts
RETURN max(commitCount)                   AS maxCommitCount
      ,avg(commitCount)                   AS avgCommitCount
      ,percentileDisc(commitCount, 0.5)   AS percentile50CommitCount
      ,percentileDisc(commitCount, 0.9)   AS percentile90CommitCount
      ,percentileDisc(commitCount, 0.95)  AS percentile95CommitCount
      ,count(*)                           AS pairCount
10 changes: 10 additions & 0 deletions cypher/GitLog/List_git_files_that_were_changed_together.cypher
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// Lists pairs of git files of the same repository that were changed together frequently, most frequent pairs first.
// Requires "Add_CHANGED_TOGETHER_WITH_relationships_to_git_files".

 MATCH (fileA:Git&File&!Repository)-[togetherChange:CHANGED_TOGETHER_WITH]-(fileB:Git&File&!Repository)
// The relationship is matched in both directions. Keep only one row per pair.
 WHERE elementId(fileA) < elementId(fileB)
// Both files need to belong to the same git repository
 MATCH (fileA)<-[:HAS_FILE]-(repository:Git&Repository)-[:HAS_FILE]->(fileB)
  WITH repository.name + '/' AS repositoryPrefix
      ,fileA
      ,fileB
      ,togetherChange
RETURN repositoryPrefix + fileA.relativePath AS firstFile
      ,repositoryPrefix + fileB.relativePath AS secondFile
      ,togetherChange.commitCount            AS commitCount
 ORDER BY commitCount DESC
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// List pairs of git files that were changed together frequently, calculated directly from the commits

// Count all commits first. The count is the reference for the "changed together often enough" threshold below.
 MATCH (global_git_commit:Git:Commit)
  WITH count(global_git_commit) AS globalCommitCount
// Find all files that were updated by a commit and that belong to a git repository
 MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change:Update)-[:UPDATES]->(git_file:Git:File)
 MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file)
 WHERE git_file.deletedAt IS NULL
  WITH *, git_repository.name + '/' + git_file.relativePath AS filePath
  WITH globalCommitCount
      ,git_commit.sha            AS commitHash
      ,collect(DISTINCT filePath) AS filesInCommit
// Limit the file count to min. 2 (changed together) and max. 50 (fewer combinations, skips very large commits)
 WHERE size(filesInCommit) >= 2
   AND size(filesInCommit) <= 50
// Collect distinct pairwise (..., 2, 2) combinations of all file paths in the list
  WITH globalCommitCount
      ,commitHash
      ,apoc.coll.combinations(filesInCommit, 2, 2) AS fileCombinations
UNWIND fileCombinations AS fileCombination
// Sort the two paths of each pair so that (fileA, fileB) and (fileB, fileA) end up in the same group
  WITH globalCommitCount
      ,apoc.coll.sort(fileCombination) AS fileCombination
      ,count(DISTINCT commitHash)      AS commitCount
 WHERE commitCount > globalCommitCount * 0.001 // Filter out combinations that are too rare
RETURN fileCombination[0] AS firstFile
      ,fileCombination[1] AS secondFile
      ,commitCount
 ORDER BY commitCount DESC
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
// Lists every git file that was frequently changed together with at least one other file
// including the number of distinct commits in which that happened, most frequent files first.
// Requires "Add_CHANGED_TOGETHER_WITH_relationships_to_git_files".

// The relationship is matched in both directions on purpose, so that each file of a pair gets listed.
 MATCH (repository:Git&Repository)-[:HAS_FILE]->(changedFile:Git&File&!Repository)-[togetherChange:CHANGED_TOGETHER_WITH]-(:Git&File&!Repository)
  WITH repository.name + '/' + changedFile.relativePath AS filePath
      ,togetherChange.commitHashes                      AS pairCommitHashes
// A commit may be contained in more than one pair of the same file. Therefore, count distinct commit hashes.
UNWIND pairCommitHashes AS commitHash
RETURN filePath
      ,count(DISTINCT commitHash) AS commitCount
 ORDER BY commitCount DESC
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// List git files that were frequently changed together with at least one other file, calculated directly from the commits

// Count all commits first. The count is the reference for the "changed together often enough" threshold below.
 MATCH (global_git_commit:Git:Commit)
  WITH count(global_git_commit) AS globalCommitCount
// Find all files that were updated by a commit and that belong to a git repository
 MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change:Update)-[:UPDATES]->(git_file:Git:File)
 MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file)
 WHERE git_file.deletedAt IS NULL
  WITH *, git_repository.name + '/' + git_file.relativePath AS filePath
  WITH globalCommitCount
      ,git_commit.sha            AS commitHash
      ,collect(DISTINCT filePath) AS filesInCommit
// Limit the file count to min. 2 (changed together) and max. 50 (skips very large commits)
 WHERE size(filesInCommit) >= 2
   AND size(filesInCommit) <= 50
// Every file of a commit with at least two files is part of at least one file pair.
// Therefore, the files can be taken directly from the list without building all pairwise combinations first.
UNWIND filesInCommit AS filePath
  WITH globalCommitCount
      ,filePath
      ,count(DISTINCT commitHash) AS commitCount
 WHERE commitCount > globalCommitCount * 0.001 // Filter out files that are too rarely changed together with others
RETURN filePath
      ,commitCount
 ORDER BY commitCount DESC
174 changes: 172 additions & 2 deletions jupyter/GitHistoryGeneral.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -208,8 +208,8 @@
")\n",
"plotly_treemap_figure_show_settings = dict(\n",
" renderer=\"svg\" if is_command_line_execution() else None,\n",
" width=1000,\n",
" height=800\n",
" width=1080,\n",
" height=1080\n",
")\n",
"\n",
"plotly_treemap_marker_base_style = dict(\n",
Expand Down Expand Up @@ -766,6 +766,89 @@
"figure.show(**plotly_treemap_figure_show_settings)"
]
},
{
"cell_type": "markdown",
"id": "485b5194",
"metadata": {},
"source": [
"### Directories with very few different authors"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3175be23",
"metadata": {},
"outputs": [],
"source": [
"git_commit_authors_per_directory_low_focus = add_quantile_limited_column(git_files_with_commit_statistics, \"authorCount\", 0.33)\n",
"\n",
"author_count_top_limit = git_commit_authors_per_directory_low_focus['authorCount_limited'].max().astype(int).astype(str)\n",
"author_count_top_limit_label_alias = {author_count_top_limit: author_count_top_limit + ' or more'}\n",
"\n",
"figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n",
" create_treemap_commit_statistics_settings(git_commit_authors_per_directory_low_focus),\n",
" # Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n",
" # values = git_commit_authors_per_directory['fileCount'],\n",
" marker=dict(\n",
" **plotly_treemap_marker_base_colorscale,\n",
" colors=git_commit_authors_per_directory_low_focus['authorCount_limited'], \n",
" colorbar=dict(title=\"Authors\",\n",
" tickmode=\"auto\",\n",
" labelalias=author_count_top_limit_label_alias\n",
" ),\n",
" reversescale=True\n",
" ),\n",
"))\n",
"figure.update_layout(\n",
" **plotly_treemap_layout_base_settings,\n",
" title='Number of distinct commit authors (red/black = only one or very few authors)',\n",
")\n",
"\n",
"figure.show(**plotly_treemap_figure_show_settings)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e11947c5",
"metadata": {},
"outputs": [],
"source": [
"import plotly.graph_objects as go\n",
"\n",
"# Example data\n",
"labels = [\"A\", \"B\", \"C\", \"D\", \"E\"]\n",
"parents = [\"\", \"A\", \"A\", \"B\", \"B\"]\n",
"values = [10, 20, 30, 40, 50] # Color scale values\n",
"max_value = max(values)\n",
"\n",
"# Create treemap\n",
"fig = go.Figure(go.Treemap(\n",
" labels=labels,\n",
" parents=parents,\n",
" values=values,\n",
" marker=dict(\n",
" colors=values,\n",
" colorscale=\"Blues\",\n",
" colorbar=dict(\n",
" title=\"Value\",\n",
" tickmode=\"auto\", # Let Plotly auto-select ticks\n",
" ticklabelposition=\"outside top\",\n",
" tickformat=\",\", # Use default formatting\n",
" ticklabeloverflow=\"allow\", # Ensure long labels are displayed\n",
" ticklabelstep=1 # Show all labels\n",
" )\n",
" )\n",
"))\n",
"\n",
"# Add an alias for the highest tick value dynamically\n",
"fig.update_layout(coloraxis_colorbar_tickvals=[max_value])\n",
"fig.update_layout(coloraxis_colorbar_ticktext=[f\"{max_value} or more\"])\n",
"\n",
"fig.show()\n"
]
},
{
"cell_type": "markdown",
"id": "5dbceaef",
Expand Down Expand Up @@ -1045,6 +1128,93 @@
"figure.show(**plotly_treemap_figure_show_settings)"
]
},
{
"cell_type": "markdown",
"id": "80bd7c28",
"metadata": {},
"source": [
"### Files changed frequently with other files"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "24055998",
"metadata": {},
"outputs": [],
"source": [
"pairwise_changed_git_files = query_cypher_to_data_frame(\"../cypher/GitLog/List_git_files_that_were_changed_together_with_another_file.cypher\")\n",
"\n",
"# Debug\n",
"# display(\"1. pairwise changed files --------------\")\n",
"# display(pairwise_changed_git_files)\n",
"\n",
"# Add multiple rows for each file path containing all its directories paths in the new column 'directoryPath'\n",
"pairwise_changed_git_files = add_directory_column(pairwise_changed_git_files, 'filePath', 'directoryPath')\n",
"\n",
"# Debug\n",
"# display(\"2. added directories --------------\")\n",
"# display(pairwise_changed_git_files)\n",
"\n",
"# Group the git files by their directory, sum up their commit counts and count the number of files of each directory (across all levels).\n",
"pairwise_changed_git_files = pairwise_changed_git_files.groupby(['directoryPath']).aggregate(\n",
" pairwiseChangeCommitCount=pd.NamedAgg(column=\"commitCount\", aggfunc=\"sum\"),\n",
" pairwiseChangeFileCount=pd.NamedAgg(column=\"filePath\", aggfunc=\"count\"),\n",
")\n",
"pairwise_changed_git_files.reset_index(inplace=True)\n",
"\n",
"# Debug\n",
"# display(\"3. after grouping --------------\")\n",
"# display(pairwise_changed_git_files)\n",
"\n",
"pairwise_changed_git_files = pd.merge(\n",
" git_files_with_commit_statistics, \n",
" pairwise_changed_git_files, \n",
" left_on='directoryPath', \n",
" right_on=\"directoryPath\",\n",
" how=\"left\",\n",
" validate=\"m:1\"\n",
")\n",
"\n",
"# Debug\n",
"# display(\"4. after merging --------------\")\n",
"# display(pairwise_changed_git_files)\n",
"\n",
"pairwise_changed_git_files['pairwiseChangeCommitCount'] = pairwise_changed_git_files['pairwiseChangeCommitCount'].fillna(0).astype(int)\n",
"pairwise_changed_git_files['pairwiseChangeFileCount'] = pairwise_changed_git_files['pairwiseChangeFileCount'].fillna(0).astype(int)\n",
"pairwise_changed_git_files.reset_index(inplace=True)\n",
"\n",
"# Debug\n",
"# display(\"5. after NaN fill --------------\")\n",
"# display(pairwise_changed_git_files)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "19b5a98a",
"metadata": {},
"outputs": [],
"source": [
"pairwise_changed_git_files = add_quantile_limited_column(pairwise_changed_git_files, \"pairwiseChangeCommitCount\", 0.98)\n",
"\n",
"figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n",
" create_treemap_commit_statistics_settings(pairwise_changed_git_files),\n",
" # Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n",
" # values = pairwise_changed_git_files['fileCount'],\n",
" marker=dict(\n",
" **plotly_treemap_marker_base_colorscale,\n",
" colors=pairwise_changed_git_files['pairwiseChangeCommitCount_limited'], \n",
" colorbar=dict(title=\"Changes\"),\n",
" ),\n",
"))\n",
"figure.update_layout(\n",
" **plotly_treemap_layout_base_settings,\n",
" title='Pairwise file changes',\n",
")\n",
"figure.show(**plotly_treemap_figure_show_settings)"
]
},
{
"cell_type": "markdown",
"id": "d8c6ccee",
Expand Down
4 changes: 4 additions & 0 deletions scripts/importGit.sh
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,10 @@ commonPostGitImport() {
execute_cypher "${GIT_LOG_CYPHER_DIR}/Add_RESOLVES_TO_relationships_to_git_files_for_Java.cypher"
execute_cypher "${GIT_LOG_CYPHER_DIR}/Add_RESOLVES_TO_relationships_to_git_files_for_Typescript.cypher"

echo "importGit: Creating relationships to file nodes that where changed together..."
execute_cypher "${GIT_LOG_CYPHER_DIR}/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher"
execute_cypher "${GIT_LOG_CYPHER_DIR}/Add_CHANGED_TOGETHER_WITH_relationships_to_code_files.cypher"

# Since it's currently not possible to rule out ambiguity in git<->code file matching,
# the following verifications are only an additional info in the log rather than an error.
echo "importGit: Running verification queries for troubleshooting (non failing)..."
Expand Down
Loading