forked from autotest/autotest
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrashcollect.py
199 lines (158 loc) · 7.46 KB
/
crashcollect.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
import os, time, logging, shutil, gzip
from autotest.client.shared import global_config
from autotest.server import utils
# import any site hooks for the crashdump and crashinfo collection.
# If the site-specific module (autotest.server.site_crashcollect) is not
# installed, each hook falls back to a no-op lambda with the same signature,
# so callers never need to check whether a site implementation exists.
get_site_crashdumps = utils.import_site_function(
    __file__, "autotest.server.site_crashcollect", "get_site_crashdumps",
    lambda host, test_start_time: None)
get_site_crashinfo = utils.import_site_function(
    __file__, "autotest.server.site_crashcollect", "get_site_crashinfo",
    lambda host, test_start_time: None)
def get_crashdumps(host, test_start_time):
    """Collect crashdumps from a host by delegating to the site hook.

    This is a thin wrapper: both arguments are forwarded verbatim to
    get_site_crashdumps (a no-op when no site module is installed).

    @param host: The RemoteHost to collect crashdumps from
    @param test_start_time: Opaque timestamp forwarded to the site hook
    """
    get_site_crashdumps(host, test_start_time)
def get_crashinfo(host, test_start_time):
    """Collect general crash information from a host.

    Crashdumps are always collected. The remaining collection steps
    (site-specific info, log files, dmesg, stranded client logs) run only
    if the host is, or comes back, up.

    @param host: The RemoteHost to collect from
    @param test_start_time: Opaque timestamp forwarded to the crashdump hooks
    """
    logging.info("Collecting crash information...")

    # include crashdumps as part of the general crashinfo
    get_crashdumps(host, test_start_time)

    # everything below needs a reachable host
    if not wait_for_machine_to_recover(host):
        return

    # run any site-specific collection
    get_site_crashinfo(host, test_start_time)

    info_dir = get_crashinfo_dir(host)
    collect_messages(host)
    collect_log_file(host, "/var/log/monitor-ssh-reboots", info_dir)
    collect_command(host, "dmesg", os.path.join(info_dir, "dmesg"))
    collect_uncollected_logs(host)
# Load default for number of hours to wait before giving up on crash collection.
# Read from the SERVER section of the global config; falls back to 4.0 hours
# when the 'crash_collection_hours_to_wait' option is absent.
HOURS_TO_WAIT = global_config.global_config.get_config_value(
    'SERVER', 'crash_collection_hours_to_wait', type=float, default=4.0)
def wait_for_machine_to_recover(host, hours_to_wait=HOURS_TO_WAIT):
    """Wait for a machine (possibly down) to become accessible again.

    @param host: A RemoteHost instance to wait on
    @param hours_to_wait: Number of hours to wait before giving up

    @returns: True if the machine comes back up, False otherwise
    """
    current_time = time.strftime("%b %d %H:%M:%S", time.localtime())

    # Fast path: nothing to wait for if the host is already reachable.
    if host.is_up():
        logging.info("%s already up, collecting crash info", host.hostname)
        return True

    # A non-positive wait budget means crash collection is disabled.
    if hours_to_wait <= 0:
        logging.info("Skipping crash info collection")
        return False

    logging.info("Waiting %s hours for %s to come up (%s)",
                 hours_to_wait, host.hostname, current_time)
    if not host.wait_up(timeout=hours_to_wait * 3600):
        logging.warning("%s down, unable to collect crash info",
                        host.hostname)
        return False

    logging.info("%s is back up, collecting crash info", host.hostname)
    return True
def get_crashinfo_dir(host):
    """Find and if necessary create a directory to store crashinfo in.

    @param host: The RemoteHost object that crashinfo will be collected from

    @returns: The path to an existing directory for writing crashinfo into
    """
    # Prefer the result directory of the job attached to this host; fall
    # back to the current working directory when no job/resultdir exists.
    job = getattr(host, "job", None)
    base_dir = getattr(job, "resultdir", None)
    if not base_dir:
        base_dir = os.path.abspath(os.getcwd())

    info_dir = os.path.join(base_dir, "crashinfo.%s" % host.hostname)
    if not os.path.exists(info_dir):
        os.mkdir(info_dir)
    return info_dir
def collect_log_file(host, log_path, dest_path):
    """Copy a single log file down from the remote machine.

    The remote file at log_path is fetched into dest_path. If dest_path is
    a directory, the copy is named after the basename of the remote path.

    @param host: The RemoteHost to collect logs from
    @param log_path: The remote path to collect the log file from
    @param dest_path: A path (file or directory) to write the copies logs into
    """
    logging.info("Collecting %s...", log_path)
    # Best-effort: a missing or unreadable remote file must not abort the
    # wider crash-collection pass, so any failure is logged and swallowed.
    try:
        host.get_file(log_path, dest_path, preserve_perm=False)
    except Exception:
        logging.warning("Collection of %s failed", log_path)
def collect_command(host, command, dest_path):
    """Collects the result of a command on the remote machine.

    The standard output of the command will be collected and written into the
    destination path. The destination path is assumed to be filename and
    not a directory.

    @param host: The RemoteHost to collect from
    @param command: A shell command to run on the remote machine and capture
                    the output from.
    @param dest_path: A file path to write the results of the log into
    """
    logging.info("Collecting '%s' ...", command)
    # os.devnull instead of a hard-coded "/dev/null" keeps this portable.
    devnull = open(os.devnull, "w")
    try:
        try:
            # stdout_tee=devnull suppresses local echoing of the output;
            # any failure is logged and swallowed so one bad command does
            # not abort the rest of the crash-collection pass.
            result = host.run(command, stdout_tee=devnull).stdout
            utils.open_write_close(dest_path, result)
        except Exception as e:
            logging.warning("Collection of '%s' failed:\n%s", command, e)
    finally:
        devnull.close()
def collect_uncollected_logs(host):
    """Collects any leftover uncollected logs from the client.

    Asks the host's job for the list of client logs and fetches the ones
    registered for this host's hostname. Any failure is logged and
    swallowed so crash collection can continue.

    @param host: The RemoteHost to collect from
    """
    if host.job:
        try:
            logs = host.job.get_client_logs()
            for hostname, remote_path, local_path in logs:
                if hostname == host.hostname:
                    logging.info("Retrieving logs from %s:%s into %s",
                                 hostname, remote_path, local_path)
                    # trailing slashes make get_file copy directory contents
                    host.get_file(remote_path + "/", local_path + "/")
        # "except Exception, e" is Python-3-incompatible; "as e" works on 2.6+
        except Exception as e:
            logging.warning("Error while trying to collect stranded "
                            "Autotest client logs: %s", e)
def collect_messages(host):
    """Collects the 'new' contents of /var/log/messages.

    If host.VAR_LOG_MESSAGE_COPY_PATH is on the remote machine, collects
    the contents of /var/log/messages excluding whatever initial contents
    are already present in host.VAR_LOG_MESSAGE_COPY_PATH. If it is not
    present, simply collects the entire contents of /var/log/messages.

    @param host: The RemoteHost to collect from
    """
    crashinfo_dir = get_crashinfo_dir(host)

    try:
        # paths to the messages files
        messages = os.path.join(crashinfo_dir, "messages")
        messages_raw = os.path.join(crashinfo_dir, "messages.raw")
        messages_at_start = os.path.join(crashinfo_dir, "messages.at_start")

        # grab the files from the remote host
        collect_log_file(host, host.VAR_LOG_MESSAGES_COPY_PATH,
                         messages_at_start)
        collect_log_file(host, "/var/log/messages", messages_raw)

        # figure out how much of messages.raw to skip
        if os.path.exists(messages_at_start):
            # the first lines of the messages at start should match the
            # first lines of the current messages; if they don't then messages
            # has been erased or rotated and we just grab all of it
            first_line_at_start = utils.read_one_line(messages_at_start)
            first_line_now = utils.read_one_line(messages_raw)
            if first_line_at_start != first_line_now:
                size_at_start = 0
            else:
                size_at_start = os.path.getsize(messages_at_start)
        else:
            size_at_start = 0

        # gzip the new tail of messages.raw into "messages.gz".
        # Nested try/finally guarantees both files are closed even when
        # GzipFile() or copyfileobj() raises (the original leaked
        # raw_messages_file on a copy failure).
        raw_messages_file = open(messages_raw)
        try:
            messages_file = gzip.GzipFile(messages + ".gz", "w")
            try:
                raw_messages_file.seek(size_at_start)
                shutil.copyfileobj(raw_messages_file, messages_file)
            finally:
                messages_file.close()
        finally:
            raw_messages_file.close()

        # get rid of the "raw" versions of messages
        os.remove(messages_raw)
        if os.path.exists(messages_at_start):
            os.remove(messages_at_start)
    # "except Exception, e" is Python-3-incompatible; "as e" works on 2.6+
    except Exception as e:
        logging.warning("Error while collecting /var/log/messages: %s", e)