[AMA TST] Improve DCR Parsing and Troubleshooter -A Mode (#2114)

dylanbun · Dylan Bunarto · web-flow · commit 7f1e8b101528 · 2025-10-23T10:58:54.000-07:00
* patch

* redid parsing

* related changes

* removed logging from helpers.py

* more repairs

* cleanup

* more cleanup

* added more helpful messages to debug endpoint issues

* allowing for -A to go through all checks and then report them

---------

Co-authored-by: Dylan Bunarto <dylanbunarto@microsoft.com>
diff --git a/AzureMonitorAgent/ama_tst/modules/connect/check_endpts.py b/AzureMonitorAgent/ama_tst/modules/connect/check_endpts.py
@@ -1,4 +1,5 @@
 import subprocess
+import traceback
 
 from error_codes import *
 from errors      import error_info
@@ -14,6 +15,25 @@
 ME_REGION_URL = "{0}.monitoring.azure.com"
 
 
+def _log_ssl_error(context, exception, show_traceback=True):
+    """Helper function to log SSL errors cleanly"""
+    print("{0}:".format(context))
+    print("  Type: {0}".format(type(exception).__name__))
+    print("  Message: {0}".format(str(exception)))
+    
+    # For CalledProcessError, show command details
+    if isinstance(exception, subprocess.CalledProcessError):
+        print("  Command: {0}".format(getattr(exception, 'cmd', 'Unknown')))
+        print("  Return code: {0}".format(getattr(exception, 'returncode', 'Unknown')))
+        if hasattr(exception, 'output') and exception.output:
+            print("  Output: {0}".format(exception.output.strip()))
+    
+    # Show traceback if requested
+    if show_traceback:
+        print("  Traceback:")
+        print(traceback.format_exc())
+
+
 def check_endpt_ssl(ssl_cmd, endpoint):
     """
     openssl connect to specific endpoint
@@ -32,9 +52,31 @@ def check_endpt_ssl(ssl_cmd, endpoint):
                 verified = True
                 continue
 
+        # If connection established but no explicit verification status in brief mode,
+        # try a verification check to determine if SSL cert is valid
+        if connected and not verified:
+            try:
+                # Use verify_return_error flag to test certificate verification
+                verify_cmd = ssl_cmd.replace('-brief', '-verify_return_error -brief')
+                verify_output = subprocess.check_output(verify_cmd.format(endpoint), shell=True,\
+                               stderr=subprocess.STDOUT, universal_newlines=True)
+                # If the verify command exits cleanly and reports an established connection, verification is OK
+                if "CONNECTION ESTABLISHED" in verify_output:
+                    verified = True
+            except subprocess.CalledProcessError as e:
+                # Verification failed - certificate issues
+                _log_ssl_error("SSL verification failed", e, show_traceback=False)
+                verified = False
+            except Exception as e:
+                # Other error - verification could not be completed, so treat as unverified
+                # This handles cases where verify_return_error isn't supported
+                _log_ssl_error("SSL verification exception", e, show_traceback=True)
+                verified = False
+
         return (connected, verified, ssl_output)
     except Exception as e:
-        return (False, False, e)
+        _log_ssl_error("SSL connection failed", e, show_traceback=True)
+        return (False, False, str(e))
 
 
 def check_internet_connect():
diff --git a/AzureMonitorAgent/ama_tst/modules/connect/connect.py b/AzureMonitorAgent/ama_tst/modules/connect/connect.py
@@ -29,7 +29,7 @@ def check_parameters():
     return NO_ERROR
    
 def check_workspace():
-    wkspc_id, wkspc_region, e = find_dcr_workspace()
+    wkspc_id, wkspc_region, agent_settings, e = find_dcr_workspace()
     if e != None:
         error_info.append((e,))
         return ERR_NO_DCR 
diff --git a/AzureMonitorAgent/ama_tst/modules/custom_logs/check_clconf.py b/AzureMonitorAgent/ama_tst/modules/custom_logs/check_clconf.py
@@ -11,7 +11,11 @@ def check_customlog_input():
     if (cl_input == None or len(cl_input) == 0):
         error_info.append(("No custom logs file path",))
         return ERR_CL_INPUT
+    # cl_input is a list, not a dictionary - iterate over the paths directly
     for path in cl_input:
+        # Skip malformed entries that don't look like valid file paths
+        if not path or not path.startswith('/'):
+            continue
         try: 
             check_path = run_cmd_output('ls {0}'.format(path)).strip()
             if check_path.endswith('No such file or directory'):
@@ -42,11 +46,20 @@ def check_customlog_conf():
                     cl_log_file = cl_line.strip().split('log_file')[1]
                     general_info['CL_LOG'] =  cl_log_file
                     
-                if (cl_line.strip().startswith('Path')):
-                    cl_input_path = cl_line.strip().split('Path')[1].strip()
-                    general_info['CL_INPUT'].append(cl_input_path)
+                # Only match exact "Path" lines (not "Path_Key" or other variants)
+                if (cl_line.strip().startswith('Path ') or cl_line.strip().startswith('Path\t')):
+                    # Extract the path value after the whitespace
+                    parts = cl_line.strip().split(None, 1)  # Split on any whitespace, max 1 split
+                    if len(parts) > 1:
+                        cl_input_path = parts[1].strip()
+                        # Only add valid file paths (should start with /)
+                        if cl_input_path.startswith('/'):
+                            general_info['CL_INPUT'].append(cl_input_path)
+
     except Exception as e:
         error_info.append((e,))
         return ERR_CL_CONF
-    
+
+    print('cl_input value: {0}'.format(general_info['CL_INPUT']))
+
     return NO_ERROR
diff --git a/AzureMonitorAgent/ama_tst/modules/helpers.py b/AzureMonitorAgent/ama_tst/modules/helpers.py
@@ -19,6 +19,13 @@
 except NameError:
     FileNotFoundError = IOError
 
+# backwards compatible JSONDecodeError for Python 2 vs 3
+try:
+    json.JSONDecodeError
+except AttributeError:
+    # Python 2 doesn't have json.JSONDecodeError, use ValueError instead
+    json.JSONDecodeError = ValueError
+
 # backwards compatible devnull variable for Python 3.3 vs earlier
 try:
     DEVNULL = subprocess.DEVNULL
@@ -220,19 +227,45 @@ def run_cmd_output(cmd):
 
 
 def find_dcr_workspace():
+    """
+    Parse DCR configuration files to find workspace IDs and regions.
+    """
     global general_info
     
     if 'DCR_WORKSPACE_ID' in general_info and 'DCR_REGION' in general_info:
         return (general_info['DCR_WORKSPACE_ID'], general_info['DCR_REGION'], None)
     dcr_workspace = set()
     dcr_region = set()
     me_region = set()
+    agent_settings = {}
     general_info['URL_SUFFIX'] = '.com'
     try:
         for file in os.listdir(CONFIG_DIR):
             file_path = CONFIG_DIR + "/" + file
             with open(file_path) as f:
                 result = json.load(f)
+                
+                # Check if this is an AgentSettings DCR - parse its settings
+                if 'kind' in result and result['kind'] == 'AgentSettings' and 'channels' not in result:
+                    if 'settings' in result:
+                        settings_str = result['settings']
+                        try:
+                            # The settings field is a JSON string, so parse it
+                            if isinstance(settings_str, str):
+                                settings_list = json.loads(settings_str)
+                            else:
+                                settings_list = settings_str
+                            
+                            # Process each setting
+                            for setting in settings_list:
+                                name = setting['name']
+                                value = setting['value']
+                                if name:
+                                    agent_settings[name] = value
+                        except (json.JSONDecodeError, TypeError) as e:
+                            # If parsing fails, skip this AgentSettings DCR
+                            print("Error parsing settings key in AgentSettings DCR")
+                    continue
                 channels = result['channels']
                 for channel in channels:
                     if channel['protocol'] == 'ods':
@@ -255,14 +288,17 @@ def find_dcr_workspace():
                         region = endpoint_url.split('https://')[1].split('.monitoring')[0]
                         me_region.add(region)
     except Exception as e:
-        return (None, None, e)
+        return (None, None, None, e)
 
     general_info['DCR_WORKSPACE_ID'] = dcr_workspace
     general_info['DCR_REGION'] = dcr_region
     general_info['ME_REGION'] = me_region
-    return (dcr_workspace, dcr_region, None)
+    return (dcr_workspace, dcr_region, agent_settings, None)
 
 def find_dce():
+    """
+    Parse DCR configuration files to find Data Collection Endpoints (DCE).
+    """
     global general_info
     
     dce = set()
@@ -271,6 +307,9 @@ def find_dce():
             file_path = CONFIG_DIR + "/" + file
             with open(file_path) as f:
                 result = json.load(f)
+                # Check if this is an AgentSettings DCR, if so skip it
+                if 'kind' in result and result['kind'] == 'AgentSettings' and 'channels' not in result:
+                    continue
                 channels = result['channels']
                 for channel in channels:
                     if channel['protocol'] == 'gig':
diff --git a/AzureMonitorAgent/ama_tst/modules/install/check_ama.py b/AzureMonitorAgent/ama_tst/modules/install/check_ama.py
@@ -199,7 +199,7 @@ def check_ama(interactive):
     if newer_ama_version is None:
         if e is None:
             # No error and no newer version found - current version is up to date
-            print("✓ AMA version is up to date (latest version)")
+            print("AMA version is up to date (latest version)")
             return NO_ERROR
         else:
             # There was an error fetching the latest version
diff --git a/AzureMonitorAgent/ama_tst/modules/install/check_os.py b/AzureMonitorAgent/ama_tst/modules/install/check_os.py
@@ -1,8 +1,9 @@
+from __future__ import absolute_import
 import platform
 from error_codes import *
 from errors      import error_info
 from helpers     import find_vm_bits, find_vm_distro
-import install.supported_distros as supported_distros
+from . import supported_distros
 
     
 def format_alternate_versions(supported_dist, versions):
diff --git a/AzureMonitorAgent/ama_tst/modules/logcollector.py b/AzureMonitorAgent/ama_tst/modules/logcollector.py
@@ -290,12 +290,14 @@ def create_outfile(output_dirpath, logs_date, pkg_manager):
         outfile.write("--------------------------------------------------------------------------------\n")
 
         # connection to endpoints
-        wkspc_id, wkspc_region, e = helpers.find_dcr_workspace()
+        wkspc_id, wkspc_region, agent_settings, e = helpers.find_dcr_workspace()
         if e == None:
             outfile.write("Workspace ID: {0}\n".format(str(wkspc_id)))
             outfile.write("Workspace region: {0}\n".format(str(wkspc_region)))
             outfile.write("--------------------------------------------------------------------------------\n")
-               
+            if agent_settings != {}:
+                outfile.write("AgentSettinsgs file found: {0}\n".format(str(agent_settings)))
+
         # AMA package info (dpkg/rpm)
         if (pkg_manager == "dpkg"):
             outfile.write("Output of command: {0}\n".format(DPKG_CMD))
diff --git a/AzureMonitorAgent/ama_tst/modules/main.py b/AzureMonitorAgent/ama_tst/modules/main.py
@@ -24,63 +24,65 @@ def check_sudo():
         return True
 
 def check_all(interactive):
-    all_success = NO_ERROR
-    # 1: Install
-    checked_install = check_installation(interactive)
-    if (is_error(checked_install)):
-        return checked_install
-    else:
-        all_success = checked_install
-
-    print("================================================================================")
-    # 2: Connection
-    checked_connection = check_connection(interactive)
-    if (is_error(checked_connection)):
-        return checked_connection
-    else:
-        all_success = checked_connection
-
-    print("================================================================================")
-    # 3: General Health
-    checked_general_health = check_general_health(interactive)
-    if (is_error(checked_general_health)):
-        return checked_general_health
-    else:
-        all_success = checked_general_health
-
-    print("================================================================================")
-    # 4: High CPU/Memory Usage
-    checked_highcpumem = check_high_cpu_memory(interactive)
-    if (is_error(checked_highcpumem)):
-        return checked_highcpumem
-    else:
-        all_success = checked_highcpumem
-
-    print("================================================================================")
-    # 5: Syslog
-    checked_syslog = check_syslog(interactive)
-    if (is_error(checked_syslog)):
-        return checked_syslog
-    else:
-        all_success = checked_syslog
-
-    print("================================================================================")
-    # 6: Custom logs
-    checked_custom_logs = check_custom_logs(interactive)
-    if (is_error(checked_custom_logs)):
-        return checked_custom_logs
-    else:
-        all_success = checked_custom_logs
-
+    """
+    Run all troubleshooter checks, continuing even if errors occur.
+    Collects all results and reports the most severe issue at the end.
+    """
+    checks = [
+        ("Installation", check_installation),
+        ("Connection", check_connection),
+        ("General Health", check_general_health),
+        ("High CPU/Memory Usage", check_high_cpu_memory),
+        ("Syslog", check_syslog),
+        ("Custom logs", check_custom_logs),
+        ("Metrics", run_metrics_troubleshooter),
+    ]
+    
+    results = []
+    overall_status = NO_ERROR
+    
+    for i, (check_name, check_func) in enumerate(checks, 1):
+        print("================================================================================")
+        print("Running check {0}/7: {1}...".format(i, check_name))
+        
+        try:
+            result = check_func(interactive)
+            results.append((check_name, result))
+            
+            # Track the most severe error (higher error codes are more severe)
+            if is_error(result) and result > overall_status:
+                overall_status = result
+            elif not is_error(result) and result > overall_status and overall_status == NO_ERROR:
+                overall_status = result
+                
+            # Print immediate result for this check
+            if is_error(result):
+                print("[ERROR] {0}: ERROR (code {1})".format(check_name, result))
+            elif result != NO_ERROR:
+                print("[WARN]  {0}: WARNING (code {1})".format(check_name, result))
+            else:
+                print("[OK]    {0}: OK".format(check_name))
+                
+        except Exception as e:
+            print("[EXCEPTION] {0}: EXCEPTION - {1}".format(check_name, str(e)))
+            results.append((check_name, "EXCEPTION: {0}".format(str(e))))
+            overall_status = ERR_FOUND  # Set a generic error code
+    
+    # Summary of all results
+    print("\n================================================================================")
+    print("SUMMARY OF ALL CHECKS:")
     print("================================================================================")
-    # 7: Metrics not flowing
-    check_data_collected = run_metrics_troubleshooter(interactive)
-    if (is_error(check_data_collected)):
-        return check_data_collected
-    else:
-        all_success = check_data_collected
-
-    return all_success
+    for check_name, result in results:
+        if isinstance(result, str) and result.startswith("EXCEPTION"):
+            print("[EXCEPTION] {0}: {1}".format(check_name, result))
+        elif is_error(result):
+            print("[ERROR] {0}: ERROR (code {1})".format(check_name, result))
+        elif result != NO_ERROR:
+            print("[WARN]  {0}: WARNING (code {1})".format(check_name, result))
+        else:
+            print("[OK]    {0}: OK".format(check_name))
+    
+    return overall_status
 
 def collect_logs():
     # get output directory for logs