# drain_bigfile_demo.py
# SPDX-License-Identifier: MIT
import json
import logging
import os
import subprocess
import sys
import time
from os.path import dirname
from drain3 import TemplateMiner
from drain3.template_miner_config import TemplateMinerConfig
logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(message)s')


def _lines_per_sec(count, elapsed):
    """Return ``count / elapsed``, or ``inf`` when no measurable time elapsed.

    Two consecutive ``time.time()`` readings can be equal on a coarse clock
    (or for an empty input), so a plain division could raise
    ``ZeroDivisionError``.
    """
    return count / elapsed if elapsed > 0 else float("inf")


in_gz_file = "SSH.tar.gz"
in_log_file = "SSH.log"

# Download and unpack the sample log only when it is not already present in
# the current working directory.
if not os.path.isfile(in_log_file):
    logger.info(f"Downloading file {in_gz_file}")
    # Argument lists (no shell) avoid quoting pitfalls. check=True stops the
    # script right here if the download fails, instead of failing later with
    # a confusing error when the log file cannot be opened.
    # --fail: do not save an HTTP error page as the archive.
    # --location: follow redirects in case the file URL has moved.
    subprocess.run(
        [
            "curl",
            "--fail",
            "--location",
            f"https://zenodo.org/record/3227177/files/{in_gz_file}",
            "--output",
            in_gz_file,
        ],
        check=True,
    )
    logger.info(f"Extracting file {in_gz_file}")
    subprocess.run(["tar", "-xvzf", in_gz_file], check=True)

config = TemplateMinerConfig()
# Locate drain3.ini next to this script through an absolute path. With a bare
# relative __file__ (e.g. "drain_bigfile_demo.py"), dirname() returns "" and
# plain string formatting would produce "/drain3.ini".
config.load(os.path.join(dirname(os.path.abspath(__file__)), "drain3.ini"))
config.profiling_enabled = True
template_miner = TemplateMiner(config=config)

# The whole file is read before the timer starts, so the reported rates
# cover only the processing loop below, not the file read.
with open(in_log_file) as f:
    lines = f.readlines()

start_time = time.time()
batch_start_time = start_time
batch_size = 10000

# Stays 0 when the file is empty (the loop body never runs).
line_count = 0
for line_count, line in enumerate(lines, start=1):
    line = line.rstrip()
    # Keep only the text after the first ": " separator; if the separator is
    # absent, partition() yields an empty string here.
    line = line.partition(": ")[2]
    result = template_miner.add_log_message(line)

    # Progress report once per full batch.
    if line_count % batch_size == 0:
        time_took = time.time() - batch_start_time
        rate = _lines_per_sec(batch_size, time_took)
        logger.info(f"Processing line: {line_count}, rate {rate:.1f} lines/sec, "
                    f"{len(template_miner.drain.clusters)} clusters so far.")
        batch_start_time = time.time()

    # Log every message whose result reports a change type other than "none".
    if result["change_type"] != "none":
        result_json = json.dumps(result)
        logger.info(f"Input ({line_count}): {line}")
        logger.info(f"Result: {result_json}")

time_took = time.time() - start_time
rate = _lines_per_sec(line_count, time_took)
logger.info(f"--- Done processing file in {time_took:.2f} sec. Total of {line_count} lines, rate {rate:.1f} lines/sec, "
            f"{len(template_miner.drain.clusters)} clusters")

# List clusters from largest to smallest.
sorted_clusters = sorted(template_miner.drain.clusters, key=lambda it: it.size, reverse=True)
for cluster in sorted_clusters:
    logger.info(cluster)

print("Prefix Tree:")
template_miner.drain.print_tree()

template_miner.profiler.report(0)