forked from luci/luci-py
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrace_inputs.py
executable file
·3463 lines (3076 loc) · 121 KB
/
trace_inputs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
# coding=utf-8
# Copyright 2012 The LUCI Authors. All rights reserved.
# Use of this source code is governed under the Apache License, Version 2.0
# that can be found in the LICENSE file.
"""Traces an executable and its child processes and extract the files accessed
by them.
The implementation uses OS-specific API. The native Kernel logger and the ETL
interface is used on Windows. Dtrace is used on OSX. Strace is used otherwise.
The OS-specific implementation is hidden in an 'API' interface.
The results are embedded in a Results instance. The tracing is done in two
phases, the first is to do the actual trace and generate an
implementation-specific log file. Then the log file is parsed to extract the
information, including the individual child processes and the files accessed
from the log.
"""
import codecs
import csv
import errno
import getpass
import glob
import logging
import os
import re
import stat
import subprocess
import sys
import tempfile
import threading
import time
import weakref
from third_party import colorama
from third_party.depot_tools import fix_encoding
from third_party.depot_tools import subcommand
from utils import file_path
from utils import fs
from utils import logging_utils
from utils import subprocess42
from utils import tools
## OS-specific imports
if sys.platform == 'win32':
from ctypes.wintypes import byref, c_int, c_wchar_p
from ctypes.wintypes import windll # pylint: disable=E0611
__version__ = '0.2'
BASE_DIR = os.path.dirname(os.path.abspath(
__file__.decode(sys.getfilesystemencoding())))
ROOT_DIR = os.path.dirname(os.path.dirname(BASE_DIR))
class TracingFailure(Exception):
"""An exception occured during tracing."""
def __init__(self, description, pid, line_number, line, *args):
super(TracingFailure, self).__init__(
description, pid, line_number, line, *args)
self.description = description
self.pid = pid
self.line_number = line_number
self.line = line
self.extra = args
def __str__(self):
out = self.description
if self.pid:
out += '\npid: %d' % self.pid
if self.line_number:
out += '\nline: %d' % self.line_number
if self.line:
out += '\n%s' % self.line
if self.extra:
out += '\n' + ', '.join(map(str, filter(None, self.extra)))
return out
## OS-specific functions
if sys.platform == 'win32':
def get_current_encoding():
"""Returns the 'ANSI' code page associated to the process."""
return 'cp%d' % int(windll.kernel32.GetACP())
def CommandLineToArgvW(command_line):
"""Splits a commandline into argv using CommandLineToArgvW()."""
# http://msdn.microsoft.com/library/windows/desktop/bb776391.aspx
size = c_int()
assert isinstance(command_line, unicode)
ptr = windll.shell32.CommandLineToArgvW(command_line, byref(size))
try:
return [arg for arg in (c_wchar_p * size.value).from_address(ptr)]
finally:
windll.kernel32.LocalFree(ptr)
def can_trace():
"""Returns True if the user is an administrator on Windows.
It is required for tracing to work.
"""
if sys.platform != 'win32':
return True
return bool(windll.shell32.IsUserAnAdmin())
def create_subprocess_thunk():
"""Creates a small temporary script to start the child process.
This thunk doesn't block, its unique name is used to identify it as the
parent.
"""
handle, name = tempfile.mkstemp(prefix=u'trace_inputs_thunk', suffix='.py')
try:
os.write(
handle,
(
'import subprocess, sys\n'
'sys.exit(subprocess.call(sys.argv[2:]))\n'
))
finally:
os.close(handle)
return name
def create_exec_thunk():
"""Creates a small temporary script to start the child executable.
Reads from the file handle provided as the fisrt argument to block, then
execv() the command to be traced.
"""
handle, name = tempfile.mkstemp(prefix=u'trace_inputs_thunk', suffix='.py')
try:
os.write(
handle,
(
'import os, sys\n'
'fd = int(sys.argv[1])\n'
# This will block until the controlling process writes a byte on the
# pipe. It will do so once the tracing tool, e.g. strace, is ready to
# trace.
'os.read(fd, 1)\n'
'os.close(fd)\n'
'os.execv(sys.argv[2], sys.argv[2:])\n'
))
finally:
os.close(handle)
return name
def strace_process_quoted_arguments(text):
"""Extracts quoted arguments on a string and return the arguments as a list.
Implemented as an automaton. Supports incomplete strings in the form
'"foo"...'.
Example:
With text = '"foo", "bar"', the function will return ['foo', 'bar']
TODO(maruel): Implement escaping.
"""
# All the possible states of the DFA.
( NEED_QUOTE, # Begining of a new arguments.
INSIDE_STRING, # Inside an argument.
ESCAPED, # Found a '\' inside a quote. Treat the next char as-is.
NEED_COMMA_OR_DOT, # Right after the closing quote of an argument. Could be
# a serie of 3 dots or a comma.
NEED_SPACE, # Right after a comma
NEED_DOT_2, # Found a dot, need a second one.
NEED_DOT_3, # Found second dot, need a third one.
NEED_COMMA, # Found third dot, need a comma.
) = range(8)
state = NEED_QUOTE
out = []
for index, char in enumerate(text):
if char == '"':
if state == NEED_QUOTE:
state = INSIDE_STRING
# A new argument was found.
out.append('')
elif state == INSIDE_STRING:
# The argument is now closed.
state = NEED_COMMA_OR_DOT
elif state == ESCAPED:
out[-1] += char
state = INSIDE_STRING
else:
raise ValueError(
'Can\'t process char \'%s\' at column %d for: %r' % (
char, index, text),
index,
text)
elif char == ',':
if state in (NEED_COMMA_OR_DOT, NEED_COMMA):
state = NEED_SPACE
elif state == INSIDE_STRING:
out[-1] += char
elif state == ESCAPED:
out[-1] += char
state = INSIDE_STRING
else:
raise ValueError(
'Can\'t process char \'%s\' at column %d for: %r' % (
char, index, text),
index,
text)
elif char == ' ':
if state == NEED_SPACE:
state = NEED_QUOTE
elif state == INSIDE_STRING:
out[-1] += char
elif state == ESCAPED:
out[-1] += char
state = INSIDE_STRING
else:
raise ValueError(
'Can\'t process char \'%s\' at column %d for: %r' % (
char, index, text),
index,
text)
elif char == '.':
if state in (NEED_QUOTE, NEED_COMMA_OR_DOT):
# The string is incomplete, this mean the strace -s flag should be
# increased.
# For NEED_QUOTE, the input string would look like '"foo", ...'.
# For NEED_COMMA_OR_DOT, the input string would look like '"foo"...'
state = NEED_DOT_2
elif state == NEED_DOT_2:
state = NEED_DOT_3
elif state == NEED_DOT_3:
state = NEED_COMMA
elif state == INSIDE_STRING:
out[-1] += char
elif state == ESCAPED:
out[-1] += char
state = INSIDE_STRING
else:
raise ValueError(
'Can\'t process char \'%s\' at column %d for: %r' % (
char, index, text),
index,
text)
elif char == '\\':
if state == ESCAPED:
out[-1] += char
state = INSIDE_STRING
elif state == INSIDE_STRING:
state = ESCAPED
else:
raise ValueError(
'Can\'t process char \'%s\' at column %d for: %r' % (
char, index, text),
index,
text)
else:
if state == INSIDE_STRING:
out[-1] += char
else:
raise ValueError(
'Can\'t process char \'%s\' at column %d for: %r' % (
char, index, text),
index,
text)
if state not in (NEED_COMMA, NEED_COMMA_OR_DOT):
raise ValueError(
'String is incorrectly terminated: %r' % text,
text)
return out
def assert_is_renderable(pseudo_string):
"""Asserts the input is a valid object to be processed by render()."""
assert (
pseudo_string is None or
isinstance(pseudo_string, unicode) or
hasattr(pseudo_string, 'render')), repr(pseudo_string)
def render(pseudo_string):
"""Converts the pseudo-string to an unicode string."""
if pseudo_string is None or isinstance(pseudo_string, unicode):
return pseudo_string
return pseudo_string.render()
class Results(object):
"""Results of a trace session."""
class _TouchedObject(object):
"""Something, a file or a directory, that was accessed."""
def __init__(self, root, path, tainted, size, nb_files):
logging.debug(
'%s(%s, %s, %s, %s, %s)' %
(self.__class__.__name__, root, path, tainted, size, nb_files))
assert_is_renderable(root)
assert_is_renderable(path)
self.root = root
self.path = path
self.tainted = tainted
self.nb_files = nb_files
# Can be used as a cache or a default value, depending on context. In
# particular, once self.tainted is True, because the path was replaced
# with a variable, it is not possible to look up the file size.
self._size = size
# These are cache only.
self._real_path = None
# Check internal consistency.
assert path, path
assert tainted or bool(root) != bool(file_path.isabs(path)), (root, path)
assert tainted or (
not fs.exists(self.full_path) or
(self.full_path == file_path.get_native_path_case(self.full_path))), (
tainted,
self.full_path,
file_path.get_native_path_case(self.full_path))
@property
def existent(self):
return self.size != -1
@property
def full_path(self):
if self.root:
return os.path.join(self.root, self.path)
return self.path
@property
def real_path(self):
"""Returns the path with symlinks resolved."""
if not self._real_path:
self._real_path = os.path.realpath(self.full_path)
return self._real_path
@property
def size(self):
"""File's size. -1 is not existent.
Once tainted, it is not possible the retrieve the file size anymore since
the path is composed of variables.
"""
if self._size is None and not self.tainted:
try:
self._size = fs.stat(self.full_path).st_size
except OSError:
self._size = -1
return self._size
def flatten(self):
"""Returns a dict representing this object.
A 'size' of 0 means the file was only touched and not read.
"""
return {
'path': self.path,
'size': self.size,
}
def replace_variables(self, variables):
"""Replaces the root of this File with one of the variables if it matches.
If a variable replacement occurs, the cloned object becomes tainted.
"""
for variable, root_path in variables.iteritems():
if self.path.startswith(root_path):
return self._clone(
self.root, variable + self.path[len(root_path):], True)
# No need to clone, returns ourself.
return self
def strip_root(self, root):
"""Returns a clone of itself with 'root' stripped off.
Note that the file is kept if it is either accessible from a symlinked
path that was used to access the file or through the real path.
"""
# Check internal consistency.
assert (
self.tainted or
(file_path.isabs(root) and root.endswith(os.path.sep))), root
if not self.full_path.startswith(root):
# Now try to resolve the symlinks to see if it can be reached this way.
# Only try *after* trying without resolving symlink.
if not self.real_path.startswith(root):
return None
path = self.real_path
else:
path = self.full_path
return self._clone(root, path[len(root):], self.tainted)
def _clone(self, new_root, new_path, tainted):
raise NotImplementedError(self.__class__.__name__)
class File(_TouchedObject):
"""A file that was accessed. May not be present anymore.
If tainted is true, it means it is not a real path anymore as a variable
replacement occured.
|mode| can be one of None, TOUCHED, READ or WRITE.
"""
# Was probed for existence, and it is existent, but was never _opened_.
TOUCHED = 't'
# Opened for read only and guaranteed to not have been written to.
READ = 'r'
# Opened for write.
WRITE = 'w'
# They are listed in order of priority. E.g. if a file is traced as TOUCHED
# then as WRITE, only keep WRITE. None means no idea, which is a problem on
# Windows.
ACCEPTABLE_MODES = (None, TOUCHED, READ, WRITE)
def __init__(self, root, path, tainted, size, mode):
assert mode in self.ACCEPTABLE_MODES
super(Results.File, self).__init__(root, path, tainted, size, 1)
self.mode = mode
def _clone(self, new_root, new_path, tainted):
"""Clones itself keeping meta-data."""
# Keep the self.size and self._real_path caches for performance reason. It
# is also important when the file becomes tainted (with a variable instead
# of the real path) since self.path is not an on-disk path anymore so
# out._size cannot be updated.
out = self.__class__(new_root, new_path, tainted, self.size, self.mode)
out._real_path = self._real_path
return out
def flatten(self):
out = super(Results.File, self).flatten()
out['mode'] = self.mode
return out
class Directory(_TouchedObject):
"""A directory of files. Must exist.
For a Directory instance, self.size is not a cache, it's an actual value
that is never modified and represents the total size of the files contained
in this directory. It is possible that the directory is empty so that
size==0; this happens if there's only an invalid symlink in it.
"""
def __init__(self, root, path, tainted, size, nb_files):
"""path='.' is a valid value and must be handled appropriately."""
assert not path.endswith(os.path.sep), path
super(Results.Directory, self).__init__(
root, path + os.path.sep, tainted, size, nb_files)
def flatten(self):
out = super(Results.Directory, self).flatten()
out['nb_files'] = self.nb_files
return out
def _clone(self, new_root, new_path, tainted):
"""Clones itself keeping meta-data."""
out = self.__class__(
new_root,
new_path.rstrip(os.path.sep),
tainted,
self.size,
self.nb_files)
out._real_path = self._real_path
return out
class Process(object):
"""A process that was traced.
Contains references to the files accessed by this process and its children.
"""
def __init__(self, pid, files, executable, command, initial_cwd, children):
logging.debug('Process(%s, %d, ...)' % (pid, len(files)))
self.pid = pid
self.files = sorted(files, key=lambda x: x.path)
self.children = children
self.executable = executable
self.command = command
self.initial_cwd = initial_cwd
# Check internal consistency.
assert len(set(f.path for f in self.files)) == len(self.files), sorted(
f.path for f in self.files)
assert isinstance(self.children, list)
assert isinstance(self.files, list)
@property
def all(self):
for child in self.children:
for i in child.all:
yield i
yield self
def flatten(self):
return {
'children': [c.flatten() for c in self.children],
'command': self.command,
'executable': self.executable,
'files': [f.flatten() for f in self.files],
'initial_cwd': self.initial_cwd,
'pid': self.pid,
}
def strip_root(self, root):
assert file_path.isabs(root) and root.endswith(os.path.sep), root
# Loads the files after since they are constructed as objects.
out = self.__class__(
self.pid,
filter(None, (f.strip_root(root) for f in self.files)),
self.executable,
self.command,
self.initial_cwd,
[c.strip_root(root) for c in self.children])
logging.debug(
'strip_root(%s) %d -> %d' % (root, len(self.files), len(out.files)))
return out
def __init__(self, process):
self.process = process
# Cache.
self._files = None
def flatten(self):
return {
'root': self.process.flatten(),
}
@property
def files(self):
if self._files is None:
self._files = sorted(
sum((p.files for p in self.process.all), []),
key=lambda x: x.path)
return self._files
@property
def existent(self):
return [f for f in self.files if f.existent]
@property
def non_existent(self):
return [f for f in self.files if not f.existent]
def strip_root(self, root):
"""Returns a clone with all the files outside the directory |root| removed
and converts all the path to be relative paths.
It keeps files accessible through the |root| directory or that have been
accessed through any symlink which points to the same directory.
"""
# Resolve any symlink
root = os.path.realpath(root)
root = (
file_path.get_native_path_case(root).rstrip(os.path.sep) + os.path.sep)
logging.debug('strip_root(%s)' % root)
return Results(self.process.strip_root(root))
class ApiBase(object):
"""OS-agnostic API to trace a process and its children."""
class Context(object):
"""Processes one log line at a time and keeps the list of traced processes.
The parsing is complicated by the fact that logs are traced out of order for
strace but in-order for dtrace and logman. In addition, on Windows it is
very frequent that processids are reused so a flat list cannot be used. But
at the same time, it is impossible to faithfully construct a graph when the
logs are processed out of order. So both a tree and a flat mapping are used,
the tree is the real process tree, while the flat mapping stores the last
valid process for the corresponding processid. For the strace case, the
tree's head is guessed at the last moment.
"""
class Process(object):
"""Keeps context for one traced child process.
Logs all the files this process touched. Ignores directories.
"""
def __init__(self, blacklist, pid, initial_cwd):
# Check internal consistency.
assert isinstance(pid, int), repr(pid)
assert_is_renderable(initial_cwd)
self.pid = pid
# children are Process instances.
self.children = []
self.initial_cwd = initial_cwd
self.cwd = None
self.files = {}
self.executable = None
self.command = None
self._blacklist = blacklist
def to_results_process(self):
"""Resolves file case sensitivity and or late-bound strings."""
# When resolving files, it's normal to get dupe because a file could be
# opened multiple times with different case. Resolve the deduplication
# here.
def fix_path(x):
"""Returns the native file path case.
Converts late-bound strings.
"""
if not x:
# Do not convert None instance to 'None'.
return x
x = render(x)
if os.path.isabs(x):
# If the path is not absolute, which tends to happen occasionally on
# Windows, it is not possible to get the native path case so ignore
# that trace. It mostly happens for 'executable' value.
x = file_path.get_native_path_case(x)
return x
def fix_and_blacklist_path(x, m):
"""Receives a tuple (filepath, mode) and processes filepath."""
x = fix_path(x)
if not x:
return
# The blacklist needs to be reapplied, since path casing could
# influence blacklisting.
if self._blacklist(x):
return
# Filters out directories. Some may have passed through.
if fs.isdir(x):
return
return x, m
# Renders all the files as strings, as some could be RelativePath
# instances. It is important to do it first since there could still be
# multiple entries with the same path but different modes.
rendered = (
fix_and_blacklist_path(f, m) for f, m in self.files.iteritems())
files = sorted(
(f for f in rendered if f),
key=lambda x: (x[0], Results.File.ACCEPTABLE_MODES.index(x[1])))
# Then converting into a dict will automatically clean up lesser
# important values.
files = [
Results.File(None, f, False, None, m)
for f, m in dict(files).iteritems()
]
return Results.Process(
self.pid,
files,
fix_path(self.executable),
self.command,
fix_path(self.initial_cwd),
[c.to_results_process() for c in self.children])
def add_file(self, filepath, mode):
"""Adds a file if it passes the blacklist."""
if self._blacklist(render(filepath)):
return
logging.debug('add_file(%d, %s, %s)', self.pid, filepath, mode)
# Note that filepath and not render(filepath) is added. It is because
# filepath could be something else than a string, like a RelativePath
# instance for dtrace logs.
modes = Results.File.ACCEPTABLE_MODES
old_mode = self.files.setdefault(filepath, mode)
if old_mode != mode and modes.index(old_mode) < modes.index(mode):
# Take the highest value.
self.files[filepath] = mode
def __init__(self, blacklist):
self.blacklist = blacklist
# Initial process.
self.root_process = None
# dict to accelerate process lookup, to not have to lookup the whole graph
# each time.
self._process_lookup = {}
class Tracer(object):
"""During it's lifetime, the tracing subsystem is enabled."""
def __init__(self, logname):
self._logname = logname
self._lock = threading.RLock()
self._traces = []
self._initialized = True
self._scripts_to_cleanup = []
def trace(self, cmd, cwd, tracename, output):
"""Runs the OS-specific trace program on an executable.
Arguments:
- cmd: The command (a list) to run.
- cwd: Current directory to start the child process in.
- tracename: Name of the trace in the logname file.
- output: If False, redirects output to PIPEs.
Returns a tuple (resultcode, output) and updates the internal trace
entries.
"""
# The implementation adds an item to self._traces.
raise NotImplementedError(self.__class__.__name__)
def close(self, _timeout=None):
"""Saves the meta-data in the logname file.
For kernel-based tracing, stops the tracing subsystem.
Must not be used manually when using 'with' construct.
"""
with self._lock:
if not self._initialized:
raise TracingFailure(
'Called %s.close() on an unitialized object' %
self.__class__.__name__,
None, None, None)
try:
while self._scripts_to_cleanup:
try:
fs.remove(self._scripts_to_cleanup.pop())
except OSError as e:
logging.error('Failed to delete a temporary script: %s', e)
tools.write_json(self._logname, self._gen_logdata(), False)
finally:
self._initialized = False
def post_process_log(self):
"""Post-processes the log so it becomes faster to load afterward.
Must not be used manually when using 'with' construct.
"""
assert not self._initialized, 'Must stop tracing first.'
def _gen_logdata(self):
"""Returns the data to be saved in the trace file."""
return {
'traces': self._traces,
}
def __enter__(self):
"""Enables 'with' statement."""
return self
def __exit__(self, exc_type, exc_value, traceback):
"""Enables 'with' statement."""
self.close()
# If an exception was thrown, do not process logs.
if not exc_type:
self.post_process_log()
def get_tracer(self, logname):
"""Returns an ApiBase.Tracer instance.
Initializes the tracing subsystem, which is a requirement for kernel-based
tracers. Only one tracer instance should be live at a time!
logname is the filepath to the json file that will contain the meta-data
about the logs.
"""
return self.Tracer(logname)
@staticmethod
def clean_trace(logname):
"""Deletes an old log."""
raise NotImplementedError()
@classmethod
def parse_log(cls, logname, blacklist, trace_name):
"""Processes trace logs and returns the files opened and the files that do
not exist.
It does not track directories.
Arguments:
- logname: must be an absolute path.
- blacklist: must be a lambda.
- trace_name: optional trace to read, defaults to reading all traces.
Most of the time, files that do not exist are temporary test files that
should be put in /tmp instead. See http://crbug.com/116251.
Returns a list of dict with keys:
- results: A Results instance.
- trace: The corresponding tracename parameter provided to
get_tracer().trace().
- output: Output gathered during execution, if get_tracer().trace(...,
output=False) was used.
"""
raise NotImplementedError(cls.__class__.__name__)
class Strace(ApiBase):
"""strace implies linux."""
@staticmethod
def load_filename(filename):
"""Parses a filename in a log."""
# TODO(maruel): Be compatible with strace -x.
assert isinstance(filename, str)
out = ''
i = 0
while i < len(filename):
c = filename[i]
if c == '\\':
out += chr(int(filename[i+1:i+4], 8))
i += 4
else:
out += c
i += 1
# TODO(maruel): That's not necessarily true that the current code page is
# utf-8.
return out.decode('utf-8')
class Context(ApiBase.Context):
"""Processes a strace log line and keeps the list of existent and non
existent files accessed.
Ignores directories.
Uses late-binding to processes the cwd of each process. The problem is that
strace generates one log file per process it traced but doesn't give any
information about which process was started when and by who. So process the
logs out of order and use late binding with RelativePath to be able to
deduce the initial directory of each process once all the logs are parsed.
TODO(maruel): Use the script even in the non-sudo case, so log parsing can
be done in two phase: first find the root process, then process the child
processes in order. With that, it should be possible to not use RelativePath
anymore. This would significantly simplify the code!
"""
class Process(ApiBase.Context.Process):
"""Represents the state of a process.
Contains all the information retrieved from the pid-specific log.
"""
# Function names are using ([a-z_0-9]+)
# This is the most common format. function(args) = result
RE_HEADER = re.compile(r'^([a-z_0-9]+)\((.*?)\)\s+= (.+)$')
# An interrupted function call, only grab the minimal header.
RE_UNFINISHED = re.compile(r'^([^\(]+)(.*) \<unfinished \.\.\.\>$')
# A resumed function call.
RE_RESUMED = re.compile(r'^<\.\.\. ([^ ]+) resumed> (.+)$')
# A process received a signal.
RE_SIGNAL = re.compile(r'^--- SIG[A-Z]+ .+ ---')
# A process didn't handle a signal. Ignore any junk appearing before,
# because the process was forcibly killed so it won't open any new file.
RE_KILLED = re.compile(
r'^.*\+\+\+ killed by ([A-Z]+)( \(core dumped\))? \+\+\+$')
# The process has exited.
RE_PROCESS_EXITED = re.compile(r'^\+\+\+ exited with (\d+) \+\+\+')
# A call was canceled. Ignore any prefix.
RE_UNAVAILABLE = re.compile(r'^.*\)\s*= \? <unavailable>$')
# The process has exited due to the ptrace sandbox. RE_PROCESS_EXITED will
# follow on next line.
RE_PTRACE = re.compile(r'^.*<ptrace\(SYSCALL\):No such process>$')
# Happens when strace fails to even get the function name.
UNNAMED_FUNCTION = '????'
# Corner-case in python, a class member function decorator must not be
# @staticmethod.
def parse_args(regexp, expect_zero): # pylint: disable=E0213
"""Automatically convert the str 'args' into a list of processed
arguments.
Arguments:
- regexp is used to parse args.
- expect_zero: one of True, False or None.
- True: will check for result.startswith('0') first and will ignore
the trace line completely otherwise. This is important because for
many functions, the regexp will not process if the call failed.
- False: will check for not result.startswith(('?', '-1')) for the
same reason than with True.
- None: ignore result.
"""
def meta_hook(function):
assert function.__name__.startswith('handle_')
def hook(self, args, result):
if expect_zero is True and not result.startswith('0'):
return
if expect_zero is False and result.startswith(('?', '-1')):
return
match = re.match(regexp, args)
if not match:
raise TracingFailure(
'Failed to parse %s(%s) = %s' %
(function.__name__[len('handle_'):], args, result),
None, None, None)
return function(self, match.groups(), result)
return hook
return meta_hook
class RelativePath(object):
"""A late-bound relative path."""
def __init__(self, parent, value):
assert_is_renderable(parent)
self.parent = parent
assert (
value is None or
(isinstance(value, unicode) and not os.path.isabs(value)))
self.value = value
if self.value:
# TODO(maruel): On POSIX, '\\' is a valid character so remove this
# assert.
assert '\\' not in self.value, value
assert '\\' not in self.value, (repr(value), repr(self.value))
def render(self):
"""Returns the current directory this instance is representing.
This function is used to return the late-bound value.
"""
assert self.parent is not None
parent = render(self.parent)
if self.value:
return os.path.normpath(os.path.join(parent, self.value))
return parent
def __init__(self, root, pid):
"""Keeps enough information to be able to guess the original process
root.
strace doesn't store which process was the initial process. So more
information needs to be kept so the graph can be reconstructed from the
flat map.
"""
logging.info('%s(%d)' % (self.__class__.__name__, pid))
super(Strace.Context.Process, self).__init__(root.blacklist, pid, None)
assert isinstance(root, ApiBase.Context)
self._root = weakref.ref(root)
# The dict key is the function name of the pending call, like 'open'
# or 'execve'.
self._pending_calls = {}
self._line_number = 0
# Current directory when the process started.
if isinstance(self._root(), unicode):
self.initial_cwd = self._root()
else:
self.initial_cwd = self.RelativePath(self._root(), None)
self.parentid = None
self._done = False
def get_cwd(self):
"""Returns the best known value of cwd."""
return self.cwd or self.initial_cwd
def render(self):
"""Returns the string value of the RelativePath() object.
Used by RelativePath. Returns the initial directory and not the
current one since the current directory 'cwd' validity is time-limited.
The validity is only guaranteed once all the logs are processed.
"""
return self.initial_cwd.render()
def on_line(self, line):
assert isinstance(line, str)
self._line_number += 1
try:
if self._done:
raise TracingFailure(
'Found a trace for a terminated process or corrupted log',
None, None, None)
if self.RE_SIGNAL.match(line):
# Ignore signals.
return
match = self.RE_KILLED.match(line)
if match:
# Converts a '+++ killed by Foo +++' trace into an exit_group().
self.handle_exit_group(match.group(1), None)
return
match = self.RE_PROCESS_EXITED.match(line)
if match:
# Converts a '+++ exited with 1 +++' trace into an exit_group()
self.handle_exit_group(match.group(1), None)
return
match = self.RE_UNFINISHED.match(line)
if match:
if match.group(1) in self._pending_calls:
raise TracingFailure(
'Found two unfinished calls for the same function',
None, None, None,
self._pending_calls)
self._pending_calls[match.group(1)] = (
match.group(1) + match.group(2))
return
match = self.RE_UNAVAILABLE.match(line)
if match:
# This usually means a process was killed and a pending call was
# canceled.
# TODO(maruel): Look up the last exit_group() trace just above and
# make sure any self._pending_calls[anything] is properly flushed.
return
match = self.RE_PTRACE.match(line)
if match:
# Not sure what this means. Anyhow, the process died.
# TODO(maruel): Add note that only RE_PROCESS_EXITED is valid