forked from OpenTSDB/opentsdb
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge remote-tracking branch 'OFFICIAL/master'
- Loading branch information
Showing
6 changed files
with
234 additions
and
50 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,206 @@ | ||
#!/usr/bin/env python3 | ||
|
||
from subprocess import Popen, PIPE, TimeoutExpired, check_output | ||
from random import shuffle | ||
import time | ||
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter | ||
import logging | ||
import pprint | ||
|
||
log = logging.getLogger("repair-tsd") | ||
log.setLevel(logging.INFO) | ||
ch = logging.StreamHandler() | ||
logformat = '%(asctime)s %(name)s %(levelname)s %(message)s' | ||
formatter = logging.Formatter(logformat) | ||
ch.setFormatter(formatter) | ||
log.addHandler(ch) | ||
|
||
|
||
class TSDRepair(object): | ||
def __init__(self, args): | ||
self.time_chunk = args.get("time_chunk", 15) | ||
self.timeout = int(self.time_chunk * 60) | ||
self.retries = args.get("retries", 1) | ||
self.multiplier = int(60 / self.time_chunk) | ||
self.time_range = args.get("time_range", 48) | ||
self.chunk_count = self.time_range * self.multiplier | ||
self.tsd_path = args.get("tsd_path", "/usr/share/opentsdb/bin/tsdb") | ||
self.cfg_path = args.get("cfg_path", "/etc/opentsdb/opentsdb.conf") | ||
self.use_sudo = args.get("use_sudo", False) | ||
self.sudo_user = args.get("sudo_user", "opentsdb") | ||
self.log = logging.getLogger("repair-tsd") | ||
self.base = "{} fsck --config={}".format(self.tsd_path, self.cfg_path) | ||
self.check_cmd = "{} uid --config={} metrics".format(self.tsd_path, self.cfg_path) | ||
if self.use_sudo: | ||
self.base = "sudo -u {} {}".format(self.sudo_user, self.base) | ||
self.check_cmd = "sudo -u {} {}".format(self.sudo_user, self.check_cmd) | ||
|
||
def _get_metrics(self): | ||
""" | ||
Collect all metrics from OpenTSDB | ||
:returns: all metrics | ||
:rtype: list | ||
""" | ||
try: | ||
self.store_path = args.get('store_path', '/tmp/opentsdb.list') | ||
with open(self.store_path, 'r') as f_in: | ||
finished_metrics = [m for m in f_in.read().split('\n') if m] | ||
except Exception: | ||
finished_metrics = [] | ||
cmd = '{} uid --config={} grep metrics ".*"'.format(self.tsd_path, | ||
self.cfg_path) | ||
proc = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE) | ||
results = proc.communicate() | ||
metrics = [m.split(" ")[1].strip(":") | ||
for m in results[0].decode().split("\n") if m] | ||
metrics = [m for m in metrics if m and m != "\x00" and | ||
m not in finished_metrics] | ||
shuffle(metrics) | ||
self.log.info("There are {} metrics to process".format(len(metrics))) | ||
return metrics | ||
|
||
def _repair_metric_chunk(self, metric, chunk): | ||
""" | ||
Repair one 'chunk' of data for a metric | ||
""" | ||
self.log.debug("Running chunk {} for {}".format(chunk, metric)) | ||
if chunk < 2: | ||
timestr = "{}m-ago".format(self.time_chunk) | ||
else: | ||
timestr = "{}m-ago {}m-ago".format((chunk + 1) * self.time_chunk, | ||
chunk * self.time_chunk) | ||
cmd = "{} {} sum".format(self.base, timestr) | ||
""" | ||
Even though we're chunking, it's worth trying things more than once | ||
""" | ||
for x in range(1, self.retries + 2): | ||
self.log.debug("Repair try {} for {}".format(x, timestr)) | ||
fullcmd = "{} {} --fix-all --compact".format(cmd, metric) | ||
self.log.debug("Full command: {}".format(fullcmd)) | ||
metricproc = Popen(fullcmd, shell=True, stdout=PIPE, stderr=PIPE) | ||
try: | ||
results, err = metricproc.communicate(timeout=self.timeout) | ||
except TimeoutExpired: | ||
self.log.debug("{} failed to complete in window (run {})".format(metric, x)) | ||
continue | ||
except Exception as e: | ||
self.log.error("{} general exception :: {}".format(metric, | ||
e)) | ||
else: | ||
results = [r for r in results.decode().split("\n") if r][-26:] | ||
final_results = [] | ||
""" | ||
We'll only collect results that are non-0 | ||
since we're not super interested in stuff that didn't change. | ||
""" | ||
for r in results: | ||
# Strip the timestamp from the log line | ||
line = r.split(" ")[6:] | ||
try: | ||
if int(line[-1]) != 0: | ||
final_results.append(" ".join(line)) | ||
except Exception: | ||
final_results.append(" ".join(line)) | ||
result_str = "\n".join(final_results) | ||
self.log.debug("{} results:\n{}".format(metric, result_str)) | ||
if chunk % 20 == 0: | ||
self.log.info("Chunk {} of {} finished".format(chunk, self.chunk_count)) | ||
else: | ||
self.log.debug("Chunk {} of {} finished".format(chunk, self.chunk_count)) | ||
try: | ||
with open(self.store_path, 'a') as f_out: | ||
f_out.write("{}\n".format(metric)) | ||
except Exception: | ||
pass | ||
return None | ||
else: | ||
self.log.error("Failed to completely repair {}".format(metric)) | ||
return metric | ||
|
||
def process_metrics(self): | ||
""" | ||
Run fsck on a list of metrics over a time range | ||
""" | ||
failed_metrics = [] | ||
metrics = self._get_metrics() | ||
for index, metric in enumerate(metrics): | ||
try: | ||
check_output("{} {}".format(self.check_cmd, metric), | ||
shell=True) | ||
except Exception: | ||
log.warning("{} doesn't exist! Skipping...".format(metric)) | ||
continue | ||
logline = "{} ({} of {})".format(metric, index + 1, len(metrics)) | ||
logline += " ({} failed) in {} chunks".format(len(failed_metrics), | ||
self.chunk_count) | ||
self.log.info(logline) | ||
start_time = time.time() | ||
start_time_min = int(start_time//60 * 60) | ||
failed_metrics = [self._repair_metric_chunk(metric, x) | ||
for x in range(1, self.chunk_count + 1)] | ||
failed_metrics = [m for m in failed_metrics if m] | ||
runtime = time.time() - start_time | ||
self.log.info("{} repair took {} seconds".format(metric, | ||
int(runtime))) | ||
self.log.info("Failed metrics: {}".format(failed_metrics)) | ||
return failed_metrics | ||
|
||
|
||
def cli_opts(): | ||
parser = ArgumentParser(description="Repair all OpenTSDB metrics", | ||
formatter_class=ArgumentDefaultsHelpFormatter) | ||
parser.add_argument("--debug", action="store_true", default=False, | ||
help="Show debug information") | ||
parser.add_argument("--time-range", default="48", | ||
help="How many hours of time we collect to repair") | ||
parser.add_argument("--time-chunk", default="15", | ||
help="How many minutes of data to scan per chunk") | ||
parser.add_argument("--retries", default="1", | ||
help="How many times we should try failed metrics") | ||
parser.add_argument("--tsd-path", default="/usr/share/opentsdb/bin/tsdb", | ||
help="Path to the OpenTSDB CLI binary") | ||
parser.add_argument("--cfg-path", default="/etc/opentsdb/opentsdb.conf", | ||
help="Path to OpenTSDB config") | ||
parser.add_argument("--store-path", default="/opentsdb-fsck.list", | ||
help="Path to OpenTSDB config") | ||
parser.add_argument("--use-sudo", action="store_true", | ||
default=False, | ||
help="switch user when running repairs?") | ||
parser.add_argument("--sudo-user", default="opentsdb", | ||
help="User to switch to...") | ||
return parser.parse_args() | ||
|
||
|
||
def main(): | ||
args = cli_opts() | ||
if args.debug: | ||
log.setLevel(logging.DEBUG) | ||
try: | ||
time_range = int(args.time_range) | ||
except Exception as e: | ||
log.error("Invalid time range {} :: {}".format(args.time_range, e)) | ||
try: | ||
retries = int(args.retries) | ||
except Exception as e: | ||
log.error("Invalid retry number {} :: {}".format(args.retries, e)) | ||
try: | ||
time_chunk = int(args.time_chunk) | ||
if 60 % time_chunk != 0: | ||
raise ArithmeticError | ||
except Exception as e: | ||
log.error("Invalid time chunk {} :: {}".format(args.retries, e)) | ||
|
||
repair_tool = TSDRepair({"time_range": time_range, | ||
"use_sudo": args.use_sudo, | ||
"sudo_user": args.sudo_user, | ||
"time_chunk": time_chunk, | ||
"tsd_path": args.tsd_path, | ||
"cfg_path": args.cfg_path, | ||
"store_path": args.store_path, | ||
"retries": retries}) | ||
repair_tool.process_metrics() | ||
|
||
|
||
if __name__ == "__main__": | ||
main() |