#!/usr/bin/env python3
#
# Python screening exercise
#
# Node 1: hostname=n1 ipaddr=192.168.0.1/24
# Node 2: hostname=n2 ipaddr=192.168.0.2/24
# Node 3: hostname=n3 ipaddr=192.168.0.3/24
#
# Write code that will perform the following when executed on node 3
# 1. Retrieve a list of file objects in /mnt/share1/test on node 1 and node 2 in parallel.
# 2. Return success if contents match; error if there is a mismatch.
# Bonus points:
# Include validation of file metadata
# Include data integrity check of files
#
# Notes:
# Could use dircmp, but where would the fun be in that!
#
# The program walks each node in parallel (as instructed), which could result in a lot of data being generated.
# I would prefer to walk the file systems handing off each pair of directories to a thread pool to check the
# contents as they are discovered. This may also expose extra parallelism.
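#
# Example invocation (hypothetical mount points, assuming the shares from n1
# and n2 have been mounted locally on n3):
#   ./nodecheck.py -a SHA256 -p /mnt/n1/share1/test /mnt/n2/share1/test
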
import os
import sys
import time
import argparse
import hashlib
from multiprocessing import Pool


class NodeChecker:
    # constants
    HASHING_ALG = "SHA1"
    STAT_CHECKS = [
        # index, name, print function
        (0, "mode", lambda value: oct(value)[2:]),
        #(1, "inode", str),                            # ignore inode
        #(2, "device", lambda value: hex(value)[2:]),  # ignore device
        #(3, "link count", str),                       # ignore link count
        (4, "user id", str),
        (5, "group id", str),
        (6, "size", str),
        #(7, "access time", time.ctime),               # ignore access time
        (8, "modified time", time.ctime),
        #(9, "creation time", time.ctime),             # ignore creation time while testing
    ]
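    # The indices above are positions in the os.stat_result sequence:
    # st_mode=0, st_ino=1, st_dev=2, st_nlink=3, st_uid=4, st_gid=5,
    # st_size=6, st_atime=7, st_mtime=8, st_ctime=9.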

    def __init__(self, algorithm, progress):
        """
        Class to check that two file system nodes have identical contents.

        :param algorithm: algorithm to use for file contents hashing
        :type algorithm: string
        :param progress: display progress
        :type progress: bool
        """
        self.algorithm = algorithm
        self.progress = progress
        self.files = {}
        self.errors = 0

    def get_digest(self, filepath):
        """
        Calculate the hash of a file using the algorithm passed to the class.

        :param filepath: path to the file
        :type filepath: string
        :return: hash in the form of a hex digest
        :rtype: string
        """
        if self.algorithm == "MD5":
            h = hashlib.md5()
        elif self.algorithm == "SHA1":
            h = hashlib.sha1()
        elif self.algorithm == "SHA256":
            h = hashlib.sha256()
        else:
            raise ValueError("Invalid algorithm %s" % self.algorithm)
        with open(filepath, 'rb') as f:
            while True:
                # Reading is buffered, so we can read smaller chunks.
                chunk = f.read(h.block_size)
                if not chunk:
                    break
                h.update(chunk)
        return h.hexdigest()
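    # Note: the explicit if/elif chain above could equally be written as
    # hashlib.new(self.algorithm.lower()); it is kept explicit to whitelist
    # exactly the three supported algorithms.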

    def walk_node(self, node_path):
        """
        Discover all files under the given path, retrieving metadata from os.stat
        and a file hash using the algorithm passed to the class at initialisation.

        :param node_path: path to walk
        :type node_path: string
        :return: tuple of a dictionary of file metadata and digests keyed on the
                 relative file path, and an error count
        :rtype: (dict, int)
        """
        files = {}
        # only interested in the root of the path and the filenames,
        # subsequent iterations will walk the subdirectories
        for dirpath, _, filenames in os.walk(node_path, onerror=self._walk_error):
            if self.progress:
                print("%s" % dirpath)
            for filename in filenames:
                filepath = os.path.join(dirpath, filename)
                try:
                    metadata = os.stat(filepath)
                    digest = self.get_digest(filepath)
                except OSError as e:
                    # stat or read will fail if the file is inaccessible
                    metadata = None
                    digest = None
                    print(str(e), file=sys.stderr)
                    self.errors += 1
                # file key is relative to the path we are given
                files[os.path.relpath(filepath, node_path)] = {
                    "metadata": metadata,
                    "digest": digest,
                }
        return files, self.errors
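    # The returned mapping looks like this (illustrative values):
    #   {"sub/file.txt": {"metadata": os.stat_result(...), "digest": "3f78..."}}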

    def _walk_error(self, error):
        print(str(error), file=sys.stderr)
        self.errors += 1

    def check(self, paths):
        """
        Check that two file system paths have identical contents.

        :return: True if identical
        :rtype: bool
        """
        # Read file information for each node in parallel
        if self.progress:
            print("Reading directories:")
        with Pool(processes=len(paths)) as pool:
            node_data = pool.map(self.walk_node, paths)
        self.errors = sum(nd[1] for nd in node_data)
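        # Each pool worker runs in its own process with its own copy of this
        # object, so per-walk error counts come back via the return value and
        # are summed here rather than read from shared state.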
        if self.progress:
            print("Checking matching file names")
        # first quick check to ensure all files are present in both nodes
        file_sets = [set(nd[0].keys()) for nd in node_data]
        only_ons = [file_sets[0] - file_sets[1], file_sets[1] - file_sets[0]]
        for i, only_on in enumerate(only_ons):
            if only_on:
                print("Files only on %s:\n%s" % (paths[i], "\n".join(only_on)))
        # early termination as things are obviously different; could have an
        # option to continue at this point
        if any(only_ons):
            return False
        # flag to indicate everything matches until we know otherwise
        matching = True
        # second check of metadata and digests
        if self.progress:
            print("Checking file metadata and hashes")
        for filename in node_data[0][0]:
            # there should be no key errors as we currently terminate if the
            # file sets don't match, but handle any by continuing
            try:
                file0 = node_data[0][0][filename]
                file1 = node_data[1][0][filename]
            except KeyError:
                continue
            metadata0 = file0["metadata"]
            metadata1 = file1["metadata"]
            # skip the metadata comparison if either stat failed; the error
            # has already been reported and counted
            if metadata0 is not None and metadata1 is not None:
                # check each item of metadata, except device and inode ids,
                # and link count, which don't indicate a difference in contents
                for i, st_name, print_fn in self.STAT_CHECKS:
                    if metadata0[i] != metadata1[i]:
                        print("%s : %s different: %s != %s" %
                              (filename, st_name, print_fn(metadata0[i]), print_fn(metadata1[i])))
                        matching = False
            # check the file digests are identical
            # note both could be None if the files are inaccessible
            if file0["digest"] != file1["digest"]:
                print("%s : %s hashes are different" % (filename, self.algorithm))
                matching = False
        return matching


def main():
    parser = argparse.ArgumentParser(description="Check contents of two file system nodes are identical")
    parser.add_argument("-a", "--algorithm", default=NodeChecker.HASHING_ALG,
                        help="Hash algorithm (MD5, SHA1, SHA256), default %s" % NodeChecker.HASHING_ALG)
    parser.add_argument("-p", "--progress", action="store_true", help="Show progress")
    parser.add_argument("path", nargs=2, help="Two paths to check")
    args = parser.parse_args()
    errors = 0
    for path in args.path:
        if not os.path.isdir(path):
            print("'%s' is not a valid path, has it been mounted?" % path, file=sys.stderr)
            errors += 1
    if errors:
        sys.exit(1)
    nodechecker = NodeChecker(args.algorithm, args.progress)
    if nodechecker.check(args.path):
        if nodechecker.errors:
            print("Nodes appear to match, but %d errors were encountered" % nodechecker.errors)
        else:
            print("Nodes match")
    else:
        print("Nodes differ", file=sys.stderr)
        sys.exit(2)
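
# Exit status: 0 = nodes match, 1 = a supplied path is not a directory,
# 2 = nodes differ.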

if __name__ == "__main__":
    main()