#!/usr/bin/env python
# Copyright (c) 2020 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Cluster Telemetry Logs Processor
Cluster Telemetry log files contains detailed unaggregated result for each run,
which are useful for doing analysis like a variability study with high number
of page repeat. The produced csv file does not contain these individual
results; the results are aggregated together for each URL instead.
This file provides functions to parse these logs and produce a CSV file that
contains all the information, which can then be easily used in tools like R or
Sheets. The log file mixes various different log sources together, so this
script uses regex parsing to pull out the relevant information. The parsing
code may need to be updated as cluster telemetry log structure changes.
"""
from __future__ import print_function

import argparse
import ast
import csv
import os
import re
from collections import defaultdict
class TraceUrlMismatch(Exception):
  """Raised when histograms for the same url and run index disagree on
  their trace url."""

  def __init__(self, expected_url, found_url, metric_name, run_index):
    message = ("Expected trace url %r but found %r (metric: %s, "
               "storyset repeat: %s)" %
               (expected_url, found_url, metric_name, run_index))
    super(TraceUrlMismatch, self).__init__(message)
def string_to_list(string):
  try:
    # ast.literal_eval only parses static literals, so it is safer than eval.
    return ast.literal_eval(string)
  except:
    print("Encountered error while processing this:\n" + string)
    raise
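
# For example (illustrative input):
#   string_to_list("[{'avg': '1.5'}]") == [{'avg': '1.5'}]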
# Histograms are referred to as "rows" in CT logs.
def get_histograms(filename):
output = []
with open(filename) as f:
# Iterate until you find the merge log line.
while True:
line = f.readline()
if line == '':
return output # Reached EOF. No histograms in this file.
      if re.search(r'Merging \d+ csv files into \d+ columns', line):
break
# Read the rest of the file.
logs = f.read()
    # First, remove the "exec.go:83" prefix from some lines, because in the
    # logs they look like this:
    #   ["For rows: [{'productVersions': '', 'osVersi
    #   I0105 18:10:08.017015 26595 exec.go:83] exec.go:83 ons': 'M', ...
    # NOTE: The line number 83 here may change. Update it to the right number
    # if it does.
    # TODO: Automate finding this line number.
    logs = re.sub(r'\nI.*exec\.go:83] exec\.go:83 ', '\n', logs)
    # Then remove all lines that come from other log sources, for example
    # util.go, exec.go:223, etc.
    logs = re.sub(r'\nI.*?\.go:\d+] .*?\.go:\d+.*?(?=\n)', '\n', logs)
    # Then remove all newlines so each histogram list can be matched as one
    # string.
    logs = re.sub('\n', '', logs)
for histograms_str in re.findall("For rows: (.*?)Avg row is",
logs, re.DOTALL):
output.extend(string_to_list(histograms_str))
print("%d histograms processed." % len(output))
return output
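
# A parsed histogram is a dict of strings. The keys below are the ones this
# script reads; the values are purely illustrative:
#   {'name': 'timeToFirstPaint', 'avg': '123.4', 'count': '1',
#    'stories': 'https://google.com (#12)', 'storysetRepeats': '0',
#    'traceUrls': 'https://...'}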
# Returns url -> run_index -> dict of metrics
def get_run_results(ct_histograms):
url_to_run_index_to_rows = defaultdict(lambda: defaultdict(dict))
fieldnames = set(['page_name', 'run_index', 'trace_url'])
stats = {'more_than_one_value': defaultdict(int)}
for histogram in ct_histograms:
if histogram['avg'] == '': continue
if int(histogram['count']) < 1: continue
metric_name = histogram['name']
# Stories can be like "https://google.com (#12)".
# Strip the story number at the end.
url = histogram['stories'].split("(")[0].strip()
run_index = int(histogram['storysetRepeats'])
    if int(histogram['count']) > 1:
      # Track histograms that aggregate more than one sample.
      stats['more_than_one_value'][metric_name] += 1
    metric_value = float(histogram['avg'])
    metrics_dict = url_to_run_index_to_rows[url][run_index]
    metrics_dict['page_name'] = url
    # Record the run index so the run_index column declared in fieldnames is
    # actually populated.
    metrics_dict['run_index'] = run_index
    metrics_dict[metric_name] = metric_value
    trace_url = histogram['traceUrls']
    if 'trace_url' in metrics_dict:
      # All histograms for the same url and run index should have the same
      # trace url.
      if metrics_dict['trace_url'] != trace_url:
        raise TraceUrlMismatch(metrics_dict['trace_url'], trace_url,
                               metric_name, run_index)
    else:
      metrics_dict['trace_url'] = trace_url
fieldnames.add(metric_name)
return {'run_results': url_to_run_index_to_rows, 'fieldnames': fieldnames}
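
# For example (values illustrative), the returned dict may look like:
#   {'run_results': {'https://google.com': {0: {
#        'page_name': 'https://google.com', 'run_index': 0,
#        'trace_url': 'https://...', 'timeToFirstPaint': 123.4}}},
#    'fieldnames': {'page_name', 'run_index', 'trace_url',
#                   'timeToFirstPaint'}}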
def write_results_to_csv(out_filename, run_results, fieldnames):
rows = 0
with open(out_filename, 'w') as f:
    # fieldnames is a set; sort it for a deterministic column order.
    writer = csv.DictWriter(f, sorted(fieldnames))
writer.writeheader()
for run_index_to_metrics in run_results.values():
all_runs = run_index_to_metrics.values()
for run_result in all_runs:
writer.writerow(run_result)
rows += 1
print("Wrote %d rows to %s" % (rows, out_filename))
def transform_single_file(args):
if not os.path.exists(args.outdir):
os.makedirs(args.outdir, mode=0o755)
for input_file in args.input_files:
print("Processing " + input_file)
    # Use only the file name, so the output lands in outdir even when the
    # input path contains directory components.
    basename, _ = os.path.splitext(os.path.basename(input_file))
    output_file = os.path.join(args.outdir, basename + ".csv")
results = get_run_results(get_histograms(input_file))
write_results_to_csv(output_file, **results)
# Not using f-strings to keep compatibility with python2.
print("Transformed %d files to csv." % len(args.input_files))
def transform_and_merge(args):
  out_filename = args.merge
  all_results = []
  for input_file in args.input_files:
    print("Processing " + input_file)
    all_results.append(get_run_results(get_histograms(input_file)))
  # Take the union of the fieldnames from all inputs. Rows that lack some of
  # these fields are written with empty values by DictWriter.
  all_fieldnames = set()
  for fieldnames in [x['fieldnames'] for x in all_results]:
    all_fieldnames |= fieldnames
  rows = 0
  with open(out_filename, 'w') as f:
    writer = csv.DictWriter(f, sorted(all_fieldnames))
    writer.writeheader()
    for run_results in [x['run_results'] for x in all_results]:
      for run_index_to_metrics in run_results.values():
        for run_result in run_index_to_metrics.values():
          rows += 1
          writer.writerow(run_result)
  print("Wrote %d rows to %s" % (rows, out_filename))
def main():
argparser = argparse.ArgumentParser(
description="Transform a cluster telemetry log file to csv.")
argparser.add_argument("--outdir", help="path to output directory",
default=os.curdir)
argparser.add_argument('--merge', nargs='?', const="merged.csv",
metavar="MERGED_FILENAME",
help="merge all outputs into one csv")
argparser.add_argument("input_files", nargs="+",
help="""Path to one or more input files. Output
filenames are deduced by replacing the extension with
csv.""")
args = argparser.parse_args()
if args.merge:
transform_and_merge(args)
else:
transform_single_file(args)
if __name__ == "__main__":
main()