-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathpyrsync2.py
143 lines (118 loc) · 5.04 KB
/
pyrsync2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
''' This is a pure Python implementation of the [rsync algorithm] [TM96].
### Example Use Case: ###
# On the system containing the file that needs to be patched
>>> unpatched = open('unpatched.file', 'rb')
>>> hashes = blockchecksums(unpatched)
# On the remote system after having received `hashes`
>>> patchedfile = open('patched.file', 'rb')
>>> delta = rsyncdelta(patchedfile, hashes)
# System with the unpatched file after receiving `delta`
>>> unpatched.seek(0)
>>> save_to = open('locally-patched.file', 'wb')
>>> patch_stream(unpatched, save_to, delta)
'''
import hashlib
__all__ = [
'checksum', 'rolling_checksum',
'blockchecksums', 'rsyncdelta',
'patch_stream',
]
def rsyncdelta(datastream, hashes, blocksize=4096, max_buffer=4096):
''' Return an iterator of binary patches when supplied with a readable
stream of the up-to-date data and a list of hash pair tuples from an
unpatched target.
'''
hashdict = {}
for index, (weak, strong) in enumerate(hashes):
if weak not in hashdict:
hashdict[weak] = {}
hashdict[weak][strong] = index
match = True
current_block = bytearray()
while True:
if match:
# Whenever there is a match or
# the loop is running for the first time,
# populate the window using checksum instead of rolling
# through every single byte which takes at least twice as long.
window = bytearray(datastream.read(blocksize))
if window:
window_offset = 0
weakkey, a, b = checksum(window)
else:
break
else:
# Roll one byte forward if not already at the EOF
if datastream is not None:
newbytearray = bytearray(datastream.read(1))
if newbytearray:
newbyte = newbytearray[0]
window.append(newbyte)
else:
# EOF; the window will slowly shrink.
# newbyte needs to be zero from here on to keep
# the checksum correct.
newbyte = 0
tailsize = datastream.tell() % blocksize
datastream = None
# Add the old byte the file delta. This is data that was not found
# inside of a matching block so it needs to be sent to the target.
oldbyte = window[window_offset]
current_block.append(oldbyte)
window_offset += 1
# Yank off the extra byte and calculate the new window checksum
weakkey, a, b = rolling_checksum(oldbyte, newbyte, a, b, blocksize)
strongkey = hashlib.md5(window[window_offset:]).digest() if (
weakkey in hashdict) else None
if weakkey in hashdict and strongkey in hashdict[weakkey]:
match = True
if current_block:
yield bytes(current_block)
current_block = bytearray()
yield hashdict[weakkey][strongkey]
if datastream is None:
break
else:
match = False
if len(current_block) == max_buffer:
yield bytes(current_block)
current_block = bytearray()
if datastream is None and len(window) - window_offset <= tailsize:
# The likelihood that any blocks will match after this is
# nearly nil so flush the current block and call it quits.
if current_block:
yield bytes(current_block)
current_block = bytearray()
yield bytes(window[window_offset:])
break
def blockchecksums(in_stream, blocksize=4096):
''' Return an iterator of the (weak hash (int), strong hash (bytes)) tuples
for each block of the given data stream.
'''
read = in_stream.read(blocksize)
while read:
yield (checksum(read)[0], hashlib.md5(read).digest())
read = in_stream.read(blocksize)
def patch_stream(in_stream, out_stream, delta, blocksize=4096):
''' Patch the in-stream data based on the binary patch delta and write the
resultantant data to the out-stream.
'''
for element in delta:
if isinstance(element, int):
in_stream.seek(element * blocksize)
element = in_stream.read(blocksize)
out_stream.write(element)
def rolling_checksum(old, new, a, b, blocksize=4096):
''' Return a new weak checksum when supplied with the internal state of the
checksum calculation from the previous window, the removed old byte,
and the added new byte.
'''
a = (a - old + new) % 65536
b = (b - old * blocksize + a) % 65536
return (b << 16) | a, a, b
def checksum(block):
''' Return a weak checksum from an iterable set of bytes. '''
blocksize = len(block)
a = sum(block[i] for i in range(blocksize)) % 65536
b = sum(((blocksize - i) * block[i]) for i in range(blocksize)) % 65536
return (b << 16) | a, a, b