-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
7 changed files
with
571 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,268 @@ | ||
#!/usr/bin/env python3 | ||
|
||
import collections | ||
import subprocess | ||
|
||
def _parse_log_lines(lines):
    """Turn "<commit> [<parent> ...]" lines into a {commit: [parents]} dict."""
    graph = {}
    for line in lines:
        commits = line.split()
        if not commits:
            # Blank line: nothing to record (and commits[0] would raise).
            continue
        graph[commits[0]] = commits[1:]
    return graph


def get_graph_and_depths(log_lines=None):
    """
    Build the commit graph and compute the depth of every commit.

    A commit without parents has depth 1; any other commit has a depth one
    greater than the largest depth among its parents.

    Args:
        log_lines: Optional iterable of strings of the form
            "<commit> [<parent> ...]", i.e. the output format of
            `git log --format='%H %P'`.  When None (the default) the lines
            are read from `git log` run in the current working directory.

    Returns:
        A (graph, depth) tuple.  graph maps each commit to the list of its
        parents; depth maps each commit to its integer depth.  A commit with
        a parent that never appears as a commit in the input is present in
        graph but receives no entry in depth (neither do its descendants).

    Raises:
        subprocess.CalledProcessError: if `git log` was run and exited with
            a non-zero status.
    """
    if log_lines is None:
        cmd = ['git', 'log', '--format=%H %P']
        # The context manager closes the pipe and waits for the child, so
        # neither a file descriptor nor a zombie process is left behind.
        with subprocess.Popen(cmd, stdout=subprocess.PIPE,
                              universal_newlines=True) as proc:
            graph = _parse_log_lines(proc.stdout)
        if proc.returncode:
            # Do not pass off a failed git invocation as an empty history.
            raise subprocess.CalledProcessError(proc.returncode, cmd)
    else:
        graph = _parse_log_lines(log_lines)

    # children[c] holds the commits that name c as a parent; ready holds the
    # commits whose parents all have a depth already (initially the roots).
    children = collections.defaultdict(set)
    ready = set()
    for commit, parents in graph.items():
        if parents:
            for parent in parents:
                children[parent].add(commit)
        else:
            ready.add(commit)

    # Process commits parents-first, so that every parent's depth is known
    # by the time a commit is popped.
    depth = {}
    while ready:
        commit = ready.pop()
        depth[commit] = 1 + max((depth[p] for p in graph[commit]), default=0)
        for child in children.get(commit, ()):
            if all(p in depth for p in graph[child]):
                ready.add(child)

    return graph, depth
|
||
# Compute the depth of every commit reachable from HEAD and print one line
# per commit: the depth right-aligned in five columns, then the commit hash.
graph, depth = get_graph_and_depths()
for commit in depth:
    d = depth[commit]
    print("%5d %s" % (d, commit))
|
||
''' | ||
- Minimum number of unique commits to either side | ||
- Total number of diverging commits (or divide by 2 to get average) | ||
- What is the longest sequence of non-merge commits merged? (Use for rebase) | ||
''' | ||
|
||
class AncestryGraph(object):
    """
    A directed acyclic graph of commits, maintained so that we can tell
    whether one commit is an ancestor of another.
    """

    def __init__(self):
        # The most recently assigned integer id (ids therefore start at 1).
        self.cur_value = 0

        # Maps the external identifiers handed to us onto the small integers
        # used as keys in self.graph.
        self.value = {}

        # Maps an integer id to a (depth, list-of-parent-ids) tuple.  The
        # depth of a commit is one more than the largest depth among its
        # parents; a commit without parents has depth 1.
        self.graph = {}

    def record_external_commits(self, external_commits):
        """
        Register each commit in external_commits that is not known yet,
        treating it as a root commit (depth 1, no parents).
        """
        for external in external_commits:
            if external in self.value:
                continue
            self.cur_value += 1
            self.value[external] = self.cur_value
            self.graph[self.cur_value] = (1, [])

    def add_commit_and_parents(self, commit, parents):
        """
        Register commit as having the given parents.  Every parent _MUST_
        have been recorded beforehand; commit itself _MUST_ not have been.
        """
        assert all(p in self.value for p in parents)
        assert commit not in self.value

        # Allocate an id for the commit, then translate the parents to ids
        self.cur_value += 1
        commit_id = self.cur_value
        self.value[commit] = commit_id
        parent_ids = [self.value[p] for p in parents]

        # One deeper than the deepest parent; 1 when there are no parents
        if parents:
            depth = 1 + max(self.graph[pid][0] for pid in parent_ids)
        else:
            depth = 1
        self.graph[commit_id] = (depth, parent_ids)

    def is_ancestor(self, possible_ancestor, check):
        """
        Return True if possible_ancestor is reachable from check by following
        parent links (a commit is reachable from itself), False otherwise.
        """
        target = self.value[possible_ancestor]
        start = self.value[check]
        target_depth = self.graph[target][0]

        pending = [start]
        seen = set()
        while pending:
            node = pending.pop()
            if node in seen:
                continue
            seen.add(node)
            if node == target:
                return True
            node_depth, node_parents = self.graph[node]
            # Depth strictly decreases towards the roots, so a node that is
            # no deeper than the target cannot have the target above it.
            if node_depth > target_depth:
                pending.extend(node_parents)
        return False
|
||
class RepoAnalyze(object):
    """
    Static helpers that walk the history of a repository and accumulate
    per-path statistics (names, deletions, renames) in a `stats` dict.
    """

    @staticmethod
    def setup_or_update_rename_history(stats, commit, oldname, newname):
        """
        Note in stats['rename_history'] that oldname was renamed in commit.
        newname is accepted but not used here.
        """
        history = stats['rename_history']
        if oldname in history:
            rename_commits = history[oldname]
        else:
            rename_commits = set()
        rename_commits.add(commit)
        history[oldname] = rename_commits

    @staticmethod
    def handle_renames(stats, commit, change_types, filenames):
        """
        For every 'R' entry in change_types, record the rename from the
        filename at the same position to the last filename.
        """
        rename_code = ord(b'R')
        # Iterating over bytes yields integers, hence the ord() above
        for position, code in enumerate(change_types):
            if code != rename_code:
                continue
            oldname = filenames[position]
            newname = filenames[-1]
            RepoAnalyze.setup_equivalence_for_rename(stats, oldname, newname)
            RepoAnalyze.setup_or_update_rename_history(stats, commit,
                                                       oldname, newname)

    @staticmethod
    def handle_file(stats, graph, commit, modes, shas, filenames):
        """
        Update stats for a path that was added or modified in commit: record
        its name, clear any recorded deletion, and drop its rename
        equivalence class when a recorded rename of the path is an ancestor
        of commit.
        """
        mode, sha, filename = modes[-1], shas[-1], filenames[-1]

        # Choose which kind of deletion record to undo; for anything other
        # than a tree also update all-names-by-sha and all-filenames
        if mode == b'040000':
            delmode = 'tree_deletions'
        else:
            delmode = 'file_deletions'
            stats['names'][sha].add(filename)
            stats['allnames'].add(filename)

        # A path (or equivalence class of paths) that shows up again is
        # clearly not deleted anymore
        equiv = RepoAnalyze.equiv_class(stats, filename)
        for f in equiv:
            stats[delmode].pop(f, None)

        # A modify/add for a path that was renamed may require breaking the
        # equivalence class -- but only if the rename is in the history of
        # this commit; a branch without the rename leaves things intact.
        need_to_break_equivalence = False
        if equiv[-1] != filename:
            # Query every rename commit (no early exit)
            verdicts = [graph.is_ancestor(rename_commit, commit)
                        for rename_commit in stats['rename_history'][filename]]
            need_to_break_equivalence = any(verdicts)

        if need_to_break_equivalence:
            for f in equiv:
                stats['equivalence'].pop(f, None)

    @staticmethod
    def analyze_commit(stats, graph, commit, parents, date, file_changes):
        """
        Add commit to graph, then dispatch each of its file changes
        (modes, shas, change_types, filenames) to the matching handler.
        Exits via SystemExit on a combination of change types that is not
        handled.
        """
        graph.add_commit_and_parents(commit, parents)
        has_single_parent = (len(parents) == 1)
        for modes, shas, change_types, filenames in file_changes:
            if has_single_parent and change_types.startswith(b'R'):
                change_types = b'R'  # remove the rename score; we don't care

            final_mode = modes[-1]
            if final_mode == b'160000':
                continue

            if final_mode == b'000000':
                # Track when files/directories are deleted
                was_tree = any(x == b'040000' for x in modes[0:-1])
                kind = 'tree_deletions' if was_tree else 'file_deletions'
                for f in RepoAnalyze.equiv_class(stats, filenames[-1]):
                    stats[kind][f] = date
            elif not change_types.strip(b'AMT'):
                RepoAnalyze.handle_file(stats, graph, commit,
                                        modes, shas, filenames)
            elif not change_types.strip(b'RAM'):
                RepoAnalyze.handle_file(stats, graph, commit,
                                        modes, shas, filenames)
                # Renames are only tracked for non-tree entries
                if final_mode != b'040000':
                    RepoAnalyze.handle_renames(stats, commit,
                                               change_types, filenames)
            else:
                raise SystemExit(_("Unhandled change type(s): %(change_type)s "
                                   "(in commit %(commit)s)")
                                 % ({'change_type': change_types, 'commit': commit})
                                 )  # pragma: no cover

    @staticmethod
    def gather_data(args):
        """
        Run a `git rev-list | git diff-tree` pipeline over args.refs, feed
        every commit it reports to analyze_commit, and return the resulting
        stats dict.  Exits via SystemExit when the pipeline produces no
        output or finishes with a non-zero status.
        """
        unpacked_size, packed_size = GitUtils.get_blob_sizes()
        stats = {'names': collections.defaultdict(set),
                 'allnames' : set(),
                 'file_deletions': {},
                 'tree_deletions': {},
                 'equivalence': {},
                 'rename_history': collections.defaultdict(set),
                 'unpacked_size': unpacked_size,
                 'packed_size': packed_size,
                 'num_commits': 0}

        # Setup the rev-list/diff-tree process
        commit_parse_progress = ProgressWriter()
        num_commits = 0
        rev_list = 'git rev-list --topo-order --reverse {}'.format(' '.join(args.refs))
        diff_tree = (' | git diff-tree --stdin --always --root --format=%H%n%P%n%cd' +
                     ' --date=short -M -t -c --raw --combined-all-paths')
        # NOTE(review): `subproc` is not defined in the visible part of this
        # file -- confirm that it is provided elsewhere.
        dtp = subproc.Popen(rev_list + diff_tree, shell=True, bufsize=-1,
                            stdout=subprocess.PIPE)
        stream = dtp.stdout
        line = stream.readline()
        if not line:
            raise SystemExit(_("Nothing to analyze; repository is empty."))
        more_input = True
        graph = AncestryGraph()
        while more_input:
            # Header of a commit: hash, parents, commit date -- one per line
            commit = line.rstrip()
            parents = stream.readline().split()
            date = stream.readline().rstrip()

            # A blank line is expected next.  A non-blank line means this
            # commit modified no files and the line already belongs to the
            # next commit; no line at all means end-of-input.
            line = stream.readline()
            more_input = bool(line)
            line = line.rstrip()

            # Not at end of input and a blank line was seen: the commit has
            # modified files, so collect its file changes.
            file_changes = []
            if more_input and not line:
                more_input = False
                # One field per parent (at least one) plus one for the result
                n = 1+max(1, len(parents))
                for line in stream:
                    if not line.startswith(b':'):
                        # First line of the next commit; keep it in `line`
                        more_input = True
                        break
                    assert line.startswith(b':'*(n-1))
                    # Drop the leading colons and the trailing newline, then
                    # peel off n modes, n shas, and the tab-separated tail
                    fields = line[n-1:-1].split(None, n)
                    modes, remainder = fields[0:n], fields[n]
                    fields = remainder.split(None, n)
                    shas, remainder = fields[0:n], fields[n]
                    fields = remainder.split(b'\t')
                    change_types = fields[0]
                    filenames = [PathQuoting.dequote(x) for x in fields[1:]]
                    file_changes.append([modes, shas, change_types, filenames])

            # Analyze this commit and update progress
            RepoAnalyze.analyze_commit(stats, graph, commit, parents, date,
                                       file_changes)
            num_commits += 1
            commit_parse_progress.show(_("Processed %d commits") % num_commits)

        # Show the final commits processed message and record the number of
        # commits
        commit_parse_progress.finish()
        stats['num_commits'] = num_commits

        # Close the output, ensure rev-list|diff-tree pipeline completed
        # successfully
        dtp.stdout.close()
        if dtp.wait():
            raise SystemExit(_("Error: rev-list|diff-tree pipeline failed; see above."))  # pragma: no cover

        return stats
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
Day 1: ort, correct basic design (in memory resolve, then update index/worktree) | ||
Day 2: correctness, submodules | ||
Day 3: correctness, more conflict types | ||
Day 4: correctness, ambiguous renaming handling | ||
Day 5: perf, rename detection -- smarter implementation (3 old things; maybe split?) | ||
Day 6: perf, rename detection -- handle directories better (basename pref.)
Day 7: perf, smarter index update (not O(N^2)) | ||
Day 8: perf, partial index (for when no index/worktree update needed) | ||
Day 9: perf, in-memory rebase (avoiding intermediate disk serialization) | ||
Day 10: feature, -X mas | ||
Day 11: feature, modified diff3 | ||
Day 12: feature, remerge-diff |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
Rename similarities of two unrelated files of same size: | ||
* Both files are the same size | ||
* One file has chars from '0..9', other from 'a..z'; no whitespace | ||
* First column is size of files, second column is similarity from git | ||
4 R000 | ||
5 R000 | ||
6 R000 | ||
8 R000 | ||
9 R000 | ||
11 R000 | ||
13 R000 | ||
16 R000 | ||
19 R000 | ||
22 R000 | ||
26 R000 | ||
32 R000 | ||
38 R000 | ||
45 R000 | ||
53 R000 | ||
64 R000 | ||
76 R000 | ||
90 R000 | ||
107 R000 | ||
128 R000 | ||
152 R000 | ||
181 R000 | ||
215 R000 | ||
256 R000 | ||
304 R000 | ||
362 R000 | ||
430 R000 | ||
512 R000 | ||
608 R000 | ||
724 R000 | ||
861 R000 | ||
1024 R000 | ||
1217 R000 | ||
1448 R000 | ||
1722 R000 | ||
2048 R000 | ||
2435 R000 | ||
2896 R000 | ||
3444 R000 | ||
4096 R000 | ||
4870 R000 | ||
5792 R000 | ||
6888 R000 | ||
8192 R000 | ||
9741 R000 | ||
11585 R000 | ||
13777 R000 | ||
16384 R000 | ||
19483 R000 | ||
23170 R000 | ||
27554 R000 | ||
32768 R000 | ||
38967 R000 | ||
46340 R000 | ||
55108 R001 | ||
65536 R001 | ||
77935 R000 | ||
92681 R001 | ||
110217 R001 | ||
131072 R001 | ||
155871 R002 | ||
185363 R002 | ||
220435 R003 | ||
262144 R003 | ||
311743 R004 | ||
370727 R005 | ||
440871 R006 | ||
524288 R007 | ||
623487 R008 | ||
741455 R009 | ||
881743 R011 | ||
1048576 R013 | ||
1246974 R015 | ||
1482910 R017 | ||
1763487 R020 | ||
2097152 R022 | ||
2493948 R026 | ||
2965820 R029 | ||
3526975 R032 | ||
4194304 R036 | ||
4987896 R040 | ||
5931641 R044 | ||
7053950 R048 | ||
8388608 R051 | ||
9975792 R055 | ||
11863283 R058 | ||
14107900 R061 | ||
16777216 R064 | ||
19951584 R067 | ||
23726566 R070 | ||
28215801 R072 | ||
33554432 R074 | ||
39903169 R076 | ||
47453132 R078 | ||
56431603 R080 | ||
67108864 R082 | ||
79806338 R083 | ||
94906265 R084 | ||
112863206 R086 | ||
134217728 R087 | ||
159612677 R088 | ||
189812531 R089 | ||
225726412 R090 | ||
268435456 R091 | ||
319225354 R091 | ||
379625062 R092 | ||
451452825 R093 | ||
536870912 R093 | ||
638450708 R094 | ||
759250124 R094 | ||
902905650 R095 | ||
1073741824 R095 |
Oops, something went wrong.