Skip to content

Commit

Permalink
add ratio before the bar (#22)
Browse files Browse the repository at this point in the history
* refactor: load hash

* refactor and add ratio before the bar
  • Loading branch information
xyb authored Nov 27, 2022
1 parent d045111 commit c776a2b
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 67 deletions.
72 changes: 29 additions & 43 deletions chunkdup/chunkdiff.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import argparse
import sys
from difflib import SequenceMatcher
from itertools import groupby
from math import ceil

from .diff import find_diff
from .sums import Chunksums


Expand All @@ -18,25 +18,6 @@
END = "\033[0m"


def find_diff(chunks1, sizes1, chunks2, sizes2):
    """Compare two chunk-hash sequences and describe each differing span.

    Args:
        chunks1: Sequence of (hashable) chunk hashes of the first file.
        sizes1: Chunk sizes of the first file, parallel to ``chunks1``.
        chunks2: Sequence of (hashable) chunk hashes of the second file.
        sizes2: Chunk sizes of the second file, parallel to ``chunks2``.

    Returns:
        A ``(total, diff)`` tuple. ``diff`` has one
        ``[mark1, mark2, size1, size2]`` entry per ``SequenceMatcher``
        opcode, where the marks are the characters used for the first and
        second file and the sizes are the bytes covered on each side.
        ``total`` is the sum over all entries of ``max(size1, size2)``.

    >>> find_diff(['a', 'a'], [10, 10], ['a', 'b'], [10, 10])
    (20, [['=', '=', 10, 10], ['-', '+', 10, 10]])
    """
    # Marker pair (first file, second file) for every opcode tag.
    tag_map = {
        "equal": ["=", "="],
        "replace": ["-", "+"],
        "delete": ["-", " "],
        "insert": [" ", "+"],
    }
    matcher = SequenceMatcher(a=chunks1, b=chunks2)
    diff = []
    total = 0
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        # Sum the slices directly; no intermediate list copy is needed.
        size1 = sum(sizes1[i1:i2])
        size2 = sum(sizes2[j1:j2])
        total += max(size1, size2)
        # List concatenation builds a new list, so tag_map is never mutated.
        diff.append(tag_map[tag] + [size1, size2])

    return total, diff


def fill_line(bar_width, total, diff):
zoom = bar_width / total

Expand Down Expand Up @@ -66,12 +47,13 @@ def get_bar_layer(chunksums1, chunksums2, path1, path2, bar_width=40):
f1 = chunksums1.get_file(path1)
f2 = chunksums2.get_file(path2)

total, diff = find_diff(f1.hashes, f1.sizes, f2.hashes, f2.sizes)
total, ratio, diff = find_diff(f1.hashes, f2.hashes, f1.sizes, f2.sizes)
line1, line2 = fill_line(bar_width, total, diff)
return line1, line2, f1.size, f2.size
return ratio, line1, line2, f1.size, f2.size


def print_2lines_bar(
ratio,
line1,
line2,
filesize1,
Expand All @@ -83,13 +65,13 @@ def print_2lines_bar(
"""
>>> line1 = ['-----', '==', '-----', '===']
>>> line2 = ['++', ' ', '==', '+', ' ', '===']
>>> print_2lines_bar(line1, line2, 100, 70, color=False)
100 -----==-----===
70 ++ ==+ ===
>>> print_2lines_bar(line1, line2, 100, 70)
>>> print_2lines_bar(0.5, line1, line2, 100, 70, color=False)
50.00% 100 -----==-----===
70 ++ ==+ ===
>>> print_2lines_bar(0.5, line1, line2, 100, 70)
... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
100 ...
70 ...
50.00% 100 ...
70 ...
"""

def colorful(line):
Expand All @@ -105,15 +87,17 @@ def colorful(line):
line1 = colorful(line1)
line2 = colorful(line2)

for size, line in ((filesize1, line1), (filesize2, line2)):
percent = f"{ratio * 100:>6.2f}%"
for pre, size, line in ((percent, filesize1, line1), ("", filesize2, line2)):
print(
"{:>10} {}".format(size, "".join(line)),
"{:>7s} {:>6} {}".format(pre, size, "".join(line)),
file=output or sys.stdout,
flush=True,
)


def print_1line_bar(
ratio,
line1,
line2,
filesize1,
Expand All @@ -125,11 +109,11 @@ def print_1line_bar(
"""
>>> line1 = ['-----', '==', ' ', '===']
>>> line2 = ['++', ' ', '==', '+++++', '===']
>>> print_1line_bar(line1, line2, 100, 70, color=False)
▀100 ▄70 ██▀▀▀▒▒▄▄▄▄▄▒▒▒
>>> print_1line_bar(line1, line2, 100, 70, color=True)
>>> print_1line_bar(0.6, line1, line2, 100, 70, color=False)
60.00% ▀100 ▄70 ██▀▀▀▒▒▄▄▄▄▄▒▒▒
>>> print_1line_bar(0.6, line1, line2, 100, 70, color=True)
... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
▀100 ▄70 ...
60.00% ▀100 ▄70 ...
"""

pairs = list("".join(x) for x in zip("".join(line1), "".join(line2)))
Expand All @@ -156,7 +140,8 @@ def print_1line_bar(
bar.append(item)

print(
"▀{} ▄{} {}".format(
"{:>6.2f}% ▀{} ▄{} {}".format(
ratio * 100,
filesize1,
filesize2,
"".join(bar),
Expand Down Expand Up @@ -188,13 +173,13 @@ def print_diff(
>>> a = Chunksums.parse(open(f1.name))
>>> b = Chunksums.parse(open(f2.name))
>>> print_diff(a, b, './a', './b', color=False)
▀35 ▄35 ▀▀▀▀▀▀▀▀▀▒▒▒▒▒▒▒▒▒▄▄▄▄▄▒▒▒▒▒▄▄▄▄▄▒▒▒▒▒█████
57.14% ▀35 ▄35 ▀▀▀▀▀▀▀▀▀▒▒▒▒▒▒▒▒▒▄▄▄▄▄▒▒▒▒▒▄▄▄▄▄▒▒▒▒▒█████
>>> print_diff(a, b, './a', './b', color=False, oneline=False)
35 ---------========= ===== =====-----
35 =========+++++=====+++++=====+++++
57.14% 35 ---------========= ===== =====-----
35 =========+++++=====+++++=====+++++
"""

line1, line2, filesize1, filesize2 = get_bar_layer(
ratio, line1, line2, filesize1, filesize2 = get_bar_layer(
chunksums1,
chunksums2,
path1,
Expand All @@ -206,6 +191,7 @@ def print_diff(
else:
print_func = print_2lines_bar
print_func(
ratio,
line1,
line2,
filesize1,
Expand Down Expand Up @@ -250,14 +236,14 @@ def main():
>>> s = f.name
>>> sys.argv = ['chunkdiff', '-s', s, '-s', s, './a', './b', '--nocolor']
>>> main()
▀45 ▄45 ▀▀▀▀▀▀▀▀▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒████▄▄▄▄▄▄▄▒▒▒▒████
55.56% ▀45 ▄45 ▀▀▀▀▀▀▀▀▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒████▄▄▄▄▄▄▄▒▒▒▒████
>>> sys.argv = ['chunkdiff', '-s', s, './a', './b', '-n', '-w', '10']
>>> main()
▀45 ▄45 ▀▀▒▒▒▒█▄▄▒█
55.56% ▀45 ▄45 ▀▀▒▒▒▒█▄▄▒█
>>> sys.argv = ['chunkdiff', '-s', s, './a', './b', '-n', '-b', 'twolines']
>>> main()
45 --------===============---- ====----
45 ===============+++++++++++====++++
55.56% 45 --------===============---- ====----
45 ===============+++++++++++====++++
>>> sys.argv = ['chunkdiff', '-s', s, './bad', './beef']
>>> try:
Expand Down
26 changes: 10 additions & 16 deletions chunkdup/chunkdup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,31 +2,25 @@
import argparse
import signal
import sys
from difflib import SequenceMatcher

from .diff import find_diff
from .sums import Chunksums


def diff_ratio(a, b, sizes1, sizes2):
    """Return the size-weighted similarity ratio of two chunk sequences.

    This delegates to ``find_diff`` and keeps only the ratio it reports.

    Args:
        a: Sequence of chunk hashes of the first file.
        b: Sequence of chunk hashes of the second file.
        sizes1: Chunk sizes of the first file, parallel to ``a``.
        sizes2: Chunk sizes of the second file, parallel to ``b``.

    Returns:
        The ratio computed by ``find_diff``.

    >>> diff_ratio(['a', 'a', 'a', 'a'], ['a', 'a', 'a', 'a'],
    ...            [10, 10, 10, 10], [10, 10, 10, 10])
    1.0
    >>> diff_ratio(['a', 'a', 'a', 'a'], ['a', 'a', 'b', 'a'],
    ...            [10, 10, 10, 10], [10, 10, 10, 10])
    0.75
    >>> diff_ratio(['a', 'a', 'a', 'a'], ['a', 'c', 'a'],
    ...            [10, 10, 10, 10], [10, 20, 10])
    0.5
    """
    _, ratio, _ = find_diff(a, b, sizes1, sizes2)
    return ratio


Expand Down Expand Up @@ -60,8 +54,8 @@ def find_dup_files(chunksums1, chunksums2):
ratio = diff_ratio(
f1.hashes,
f2.hashes,
dict(f1.chunks),
dict(f2.chunks),
f1.sizes,
f2.sizes,
)
if f1.path == f2.path and ratio == 1.0:
continue
Expand Down
39 changes: 39 additions & 0 deletions chunkdup/diff.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from difflib import SequenceMatcher


# Marker pair (first-file mark, second-file mark) for each SequenceMatcher
# opcode tag.
DIFF_ASCII = {
    "equal": ["=", "="],
    "replace": ["-", "+"],
    "delete": ["-", " "],
    "insert": [" ", "+"],
}


def find_diff(chunks1, chunks2, sizes1, sizes2):
    """Compare two chunk-hash sequences, weighting every span by byte size.

    Args:
        chunks1: Sequence of (hashable) chunk hashes of the first file.
        chunks2: Sequence of (hashable) chunk hashes of the second file.
        sizes1: Chunk sizes of the first file, parallel to ``chunks1``.
        sizes2: Chunk sizes of the second file, parallel to ``chunks2``.

    Returns:
        A ``(total, ratio, diff)`` tuple:

        * ``total`` -- sum over all opcodes of ``max(size1, size2)``.
        * ``ratio`` -- ``2 * matched_bytes / (bytes1 + bytes2)``, where
          matched bytes are counted on the first file's side. When both
          inputs contain no bytes at all the ratio is ``1.0`` (the same
          convention ``SequenceMatcher.ratio`` uses for two empty
          sequences) instead of raising ``ZeroDivisionError``.
        * ``diff`` -- one ``[mark1, mark2, size1, size2]`` entry per opcode.

    >>> find_diff(['a', 'a', 'a', 'a'], ['a', 'a', 'a', 'a'],
    ...           [10, 10, 10, 10], [10, 10, 10, 10])
    (40, 1.0, [['=', '=', 40, 40]])
    >>> find_diff(['a', 'a', 'a', 'a'], ['a', 'a', 'a', 'b'],
    ...           [10, 10, 10, 10], [10, 10, 10, 10])
    (40, 0.75, [['=', '=', 30, 30], ['-', '+', 10, 10]])
    >>> find_diff(['a', 'a', 'a', 'a'], ['c', 'a', 'a'],
    ...           [10, 10, 10, 10], [10, 20, 10])
    (60, 0.5, [[' ', '+', 0, 10], ['=', '=', 20, 30], ['-', ' ', 20, 0]])
    >>> find_diff([], [], [], [])
    (0, 1.0, [])
    """
    diff = []
    total = 0
    matches = 0
    matcher = SequenceMatcher(a=chunks1, b=chunks2)
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        size1 = sum(sizes1[i1:i2])
        size2 = sum(sizes2[j1:j2])
        total += max(size1, size2)
        if tag == "equal":
            matches += size1
        # Concatenation creates a fresh list, leaving DIFF_ASCII untouched.
        diff.append(DIFF_ASCII[tag] + [size1, size2])

    combined = sum(sizes1) + sum(sizes2)
    # Two inputs without any bytes are identical; avoid dividing by zero.
    ratio = (2 * matches) / combined if combined else 1.0

    return total, ratio, diff
17 changes: 9 additions & 8 deletions chunkdup/sums.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
def load_hash(hash):
    """Return *hash* as ``bytes``, decoding it from hexadecimal text if needed.

    A ``bytes`` value is returned unchanged; any other value is passed to
    ``bytes.fromhex``.
    """
    return hash if isinstance(hash, bytes) else bytes.fromhex(hash)


class File:
def __init__(self, hash, path, alg_name, chunks):
if isinstance(hash, bytes):
self.hash = hash
else:
self.hash = bytes.fromhex(hash)
self.hash = load_hash(hash)
self.path = path
self.alg_name = alg_name
self._load_chunks(chunks)
Expand All @@ -14,10 +18,7 @@ def _load_chunks(self, chunks):
self.hashes, self.sizes = [], []
else:
chunks = list(
[
(hash if isinstance(hash, bytes) else bytes.fromhex(hash), size)
for hash, size in chunks
],
[(load_hash(hash), size) for hash, size in chunks],
)
self.hashes, self.sizes = list(zip(*chunks))

Expand Down

0 comments on commit c776a2b

Please sign in to comment.