Skip to content

Commit 7f1e8b1

Browse files
dylanbun and Dylan Bunarto authored
[AMA TST] Improve DCR Parsing and Troubleshooter -A Mode (#2114)
* patch
* redid parsing
* related changes
* removed logging from helpers.py
* more repairs
* cleanup
* more cleanup
* added more helpful messages to debug endpoint issues
* allowing for -A to go through all checks and then report them

---------

Co-authored-by: Dylan Bunarto <[email protected]>
1 parent 62751ae commit 7f1e8b1

File tree

8 files changed

+167
-68
lines changed

8 files changed

+167
-68
lines changed

AzureMonitorAgent/ama_tst/modules/connect/check_endpts.py

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import subprocess
2+
import traceback
23

34
from error_codes import *
45
from errors import error_info
@@ -14,6 +15,25 @@
1415
ME_REGION_URL = "{0}.monitoring.azure.com"
1516

1617

18+
def _log_ssl_error(context, exception, show_traceback=True):
    """
    Print a readable, multi-line report for an error hit during an SSL check.

    context        -- short label printed as the heading of the report
    exception      -- the exception instance being reported
    show_traceback -- when True, also print the traceback of the exception
                      that is currently being handled (nothing is printed
                      if this is called outside of an ``except`` block)

    Returns nothing; all output goes to stdout.
    """
    import sys  # local import: only needed for the active-exception check below

    print("{0}:".format(context))
    print(" Type: {0}".format(type(exception).__name__))
    print(" Message: {0}".format(str(exception)))

    # CalledProcessError carries the failing command, exit status and output
    if isinstance(exception, subprocess.CalledProcessError):
        print(" Command: {0}".format(getattr(exception, 'cmd', 'Unknown')))
        print(" Return code: {0}".format(getattr(exception, 'returncode', 'Unknown')))
        output = getattr(exception, 'output', None)
        if output:
            # On Python 3 the captured output is bytes unless the command was
            # run in text mode; decode so it is not printed as b'...'.
            # (On Python 2 bytes is str, so this branch is skipped.)
            if not isinstance(output, str):
                output = output.decode('utf-8', 'replace')
            print(" Output: {0}".format(output.strip()))

    # traceback.format_exc() only has something useful to show while an
    # exception is being handled; otherwise it would just print "None"
    if show_traceback and sys.exc_info()[0] is not None:
        print(" Traceback:")
        print(traceback.format_exc())
35+
36+
1737
def check_endpt_ssl(ssl_cmd, endpoint):
1838
"""
1939
openssl connect to specific endpoint
@@ -32,9 +52,31 @@ def check_endpt_ssl(ssl_cmd, endpoint):
3252
verified = True
3353
continue
3454

55+
# If connection established but no explicit verification status in brief mode,
56+
# try a verification check to determine if SSL cert is valid
57+
if connected and not verified:
58+
try:
59+
# Use verify_return_error flag to test certificate verification
60+
verify_cmd = ssl_cmd.replace('-brief', '-verify_return_error -brief')
61+
verify_output = subprocess.check_output(verify_cmd.format(endpoint), shell=True,\
62+
stderr=subprocess.STDOUT, universal_newlines=True)
63+
# If verify command succeeds (no exception), verification is OK
64+
if "CONNECTION ESTABLISHED" in verify_output:
65+
verified = True
66+
except subprocess.CalledProcessError as e:
67+
# Verification failed - certificate issues
68+
_log_ssl_error("SSL verification failed", e, show_traceback=False)
69+
verified = False
70+
except Exception as e:
71+
# Other error (e.g. -verify_return_error is not supported by this openssl build):
72+
# verification could not be confirmed, so report the endpoint as not verified
73+
_log_ssl_error("SSL verification exception", e, show_traceback=True)
74+
verified = False
75+
3576
return (connected, verified, ssl_output)
3677
except Exception as e:
37-
return (False, False, e)
78+
_log_ssl_error("SSL connection failed", e, show_traceback=True)
79+
return (False, False, str(e))
3880

3981

4082
def check_internet_connect():

AzureMonitorAgent/ama_tst/modules/connect/connect.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def check_parameters():
2929
return NO_ERROR
3030

3131
def check_workspace():
32-
wkspc_id, wkspc_region, e = find_dcr_workspace()
32+
wkspc_id, wkspc_region, agent_settings, e = find_dcr_workspace()
3333
if e != None:
3434
error_info.append((e,))
3535
return ERR_NO_DCR

AzureMonitorAgent/ama_tst/modules/custom_logs/check_clconf.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,11 @@ def check_customlog_input():
1111
if (cl_input == None or len(cl_input) == 0):
1212
error_info.append(("No custom logs file path",))
1313
return ERR_CL_INPUT
14+
# cl_input is a list, not a dictionary - iterate over the paths directly
1415
for path in cl_input:
16+
# Skip malformed entries that don't look like valid file paths
17+
if not path or not path.startswith('/'):
18+
continue
1519
try:
1620
check_path = run_cmd_output('ls {0}'.format(path)).strip()
1721
if check_path.endswith('No such file or directory'):
@@ -42,11 +46,20 @@ def check_customlog_conf():
4246
cl_log_file = cl_line.strip().split('log_file')[1]
4347
general_info['CL_LOG'] = cl_log_file
4448

45-
if (cl_line.strip().startswith('Path')):
46-
cl_input_path = cl_line.strip().split('Path')[1].strip()
47-
general_info['CL_INPUT'].append(cl_input_path)
49+
# Only match exact "Path" lines (not "Path_Key" or other variants)
50+
if (cl_line.strip().startswith('Path ') or cl_line.strip().startswith('Path\t')):
51+
# Extract the path value after the whitespace
52+
parts = cl_line.strip().split(None, 1) # Split on any whitespace, max 1 split
53+
if len(parts) > 1:
54+
cl_input_path = parts[1].strip()
55+
# Only add valid file paths (should start with /)
56+
if cl_input_path.startswith('/'):
57+
general_info['CL_INPUT'].append(cl_input_path)
58+
4859
except Exception as e:
4960
error_info.append((e,))
5061
return ERR_CL_CONF
51-
62+
63+
print('cl_input value: {0}'.format(general_info['CL_INPUT']))
64+
5265
return NO_ERROR

AzureMonitorAgent/ama_tst/modules/helpers.py

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,13 @@
1919
except NameError:
2020
FileNotFoundError = IOError
2121

22+
# backwards compatible JSONDecodeError for Python 2 vs 3
23+
try:
24+
json.JSONDecodeError
25+
except AttributeError:
26+
# Python 2 doesn't have json.JSONDecodeError, use ValueError instead
27+
json.JSONDecodeError = ValueError
28+
2229
# backwards compatible devnull variable for Python 3.3 vs earlier
2330
try:
2431
DEVNULL = subprocess.DEVNULL
@@ -220,19 +227,45 @@ def run_cmd_output(cmd):
220227

221228

222229
def find_dcr_workspace():
230+
"""
231+
Parse DCR configuration files to find workspace IDs and regions.
232+
"""
223233
global general_info
224234

225235
if 'DCR_WORKSPACE_ID' in general_info and 'DCR_REGION' in general_info:
226236
return (general_info['DCR_WORKSPACE_ID'], general_info['DCR_REGION'], {}, None)  # 4-tuple, matching the non-cached return; agent settings are not cached
227237
dcr_workspace = set()
228238
dcr_region = set()
229239
me_region = set()
240+
agent_settings = {}
230241
general_info['URL_SUFFIX'] = '.com'
231242
try:
232243
for file in os.listdir(CONFIG_DIR):
233244
file_path = CONFIG_DIR + "/" + file
234245
with open(file_path) as f:
235246
result = json.load(f)
247+
248+
# Check if this is an AgentSettings DCR - parse its settings
249+
if 'kind' in result and result['kind'] == 'AgentSettings' and 'channels' not in result:
250+
if 'settings' in result:
251+
settings_str = result['settings']
252+
try:
253+
# The settings field is a JSON string, so parse it
254+
if isinstance(settings_str, str):
255+
settings_list = json.loads(settings_str)
256+
else:
257+
settings_list = settings_str
258+
259+
# Process each setting
260+
for setting in settings_list:
261+
name = setting['name']
262+
value = setting['value']
263+
if name:
264+
agent_settings[name] = value
265+
except (json.JSONDecodeError, TypeError) as e:
266+
# If parsing fails, skip this AgentSettings DCR
267+
print("Error parsing settings key in AgentSettings DCR")
268+
continue
236269
channels = result['channels']
237270
for channel in channels:
238271
if channel['protocol'] == 'ods':
@@ -255,14 +288,17 @@ def find_dcr_workspace():
255288
region = endpoint_url.split('https://')[1].split('.monitoring')[0]
256289
me_region.add(region)
257290
except Exception as e:
258-
return (None, None, e)
291+
return (None, None, None, e)
259292

260293
general_info['DCR_WORKSPACE_ID'] = dcr_workspace
261294
general_info['DCR_REGION'] = dcr_region
262295
general_info['ME_REGION'] = me_region
263-
return (dcr_workspace, dcr_region, None)
296+
return (dcr_workspace, dcr_region, agent_settings, None)
264297

265298
def find_dce():
299+
"""
300+
Parse DCR configuration files to find Data Collection Endpoints (DCE).
301+
"""
266302
global general_info
267303

268304
dce = set()
@@ -271,6 +307,9 @@ def find_dce():
271307
file_path = CONFIG_DIR + "/" + file
272308
with open(file_path) as f:
273309
result = json.load(f)
310+
# Check if this is an AgentSettings DCR, if so skip it
311+
if 'kind' in result and result['kind'] == 'AgentSettings' and 'channels' not in result:
312+
continue
274313
channels = result['channels']
275314
for channel in channels:
276315
if channel['protocol'] == 'gig':

AzureMonitorAgent/ama_tst/modules/install/check_ama.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ def check_ama(interactive):
199199
if newer_ama_version is None:
200200
if e is None:
201201
# No error and no newer version found - current version is up to date
202-
print("AMA version is up to date (latest version)")
202+
print("AMA version is up to date (latest version)")
203203
return NO_ERROR
204204
else:
205205
# There was an error fetching the latest version

AzureMonitorAgent/ama_tst/modules/install/check_os.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
1+
from __future__ import absolute_import
12
import platform
23
from error_codes import *
34
from errors import error_info
45
from helpers import find_vm_bits, find_vm_distro
5-
import install.supported_distros as supported_distros
6+
from . import supported_distros
67

78

89
def format_alternate_versions(supported_dist, versions):

AzureMonitorAgent/ama_tst/modules/logcollector.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -290,12 +290,14 @@ def create_outfile(output_dirpath, logs_date, pkg_manager):
290290
outfile.write("--------------------------------------------------------------------------------\n")
291291

292292
# connection to endpoints
293-
wkspc_id, wkspc_region, e = helpers.find_dcr_workspace()
293+
wkspc_id, wkspc_region, agent_settings, e = helpers.find_dcr_workspace()
294294
if e == None:
295295
outfile.write("Workspace ID: {0}\n".format(str(wkspc_id)))
296296
outfile.write("Workspace region: {0}\n".format(str(wkspc_region)))
297297
outfile.write("--------------------------------------------------------------------------------\n")
298-
298+
if agent_settings != {}:
299+
outfile.write("AgentSettings file found: {0}\n".format(str(agent_settings)))
300+
299301
# AMA package info (dpkg/rpm)
300302
if (pkg_manager == "dpkg"):
301303
outfile.write("Output of command: {0}\n".format(DPKG_CMD))

AzureMonitorAgent/ama_tst/modules/main.py

Lines changed: 58 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -24,63 +24,65 @@ def check_sudo():
2424
return True
2525

2626
def _describe_check_result(check_name, result):
    """Return the one-line status message printed for a single check result."""
    # exceptions are recorded as "EXCEPTION: <message>" strings by check_all
    if isinstance(result, str) and result.startswith("EXCEPTION"):
        return "[EXCEPTION] {0}: {1}".format(check_name, result)
    if is_error(result):
        return "[ERROR] {0}: ERROR (code {1})".format(check_name, result)
    if result != NO_ERROR:
        return "[WARN] {0}: WARNING (code {1})".format(check_name, result)
    return "[OK] {0}: OK".format(check_name)


def _more_severe(current, candidate):
    """Return whichever of two status codes should be the overall result."""
    if is_error(candidate):
        # an error always outranks NO_ERROR or a warning, whatever the numeric
        # codes are; between two errors the higher code wins
        if not is_error(current) or candidate > current:
            return candidate
        return current
    # a warning is only reported when nothing has been recorded so far
    if current == NO_ERROR and candidate > current:
        return candidate
    return current


def check_all(interactive):
    """
    Run every troubleshooter check, continuing even if one of them fails.

    interactive -- passed through unchanged to each check function

    Each check's outcome is printed as soon as it finishes and again in a
    summary at the end. Returns the most severe status seen: an error code
    if any check returned one (ERR_FOUND if a check raised and no error code
    was recorded), otherwise the first warning code, otherwise NO_ERROR.
    """
    checks = [
        ("Installation", check_installation),
        ("Connection", check_connection),
        ("General Health", check_general_health),
        ("High CPU/Memory Usage", check_high_cpu_memory),
        ("Syslog", check_syslog),
        ("Custom logs", check_custom_logs),
        ("Metrics", run_metrics_troubleshooter),
    ]
    total = len(checks)

    results = []
    overall_status = NO_ERROR

    for i, (check_name, check_func) in enumerate(checks, 1):
        print("================================================================================")
        # derive the total from the list so the message stays right if checks change
        print("Running check {0}/{1}: {2}...".format(i, total, check_name))

        try:
            result = check_func(interactive)
        except Exception as e:
            # a crashing check must not stop the remaining checks from running
            print("[EXCEPTION] {0}: EXCEPTION - {1}".format(check_name, str(e)))
            results.append((check_name, "EXCEPTION: {0}".format(str(e))))
            # do not let the generic code hide a specific error found earlier
            if not is_error(overall_status):
                overall_status = ERR_FOUND
            continue

        results.append((check_name, result))
        overall_status = _more_severe(overall_status, result)

        # Print immediate result for this check
        print(_describe_check_result(check_name, result))

    # Summary of all results
    print("\n================================================================================")
    print("SUMMARY OF ALL CHECKS:")
    print("================================================================================")
    for check_name, result in results:
        print(_describe_check_result(check_name, result))

    return overall_status
8486

8587
def collect_logs():
8688
# get output directory for logs

0 commit comments

Comments
 (0)