From ef672bcf8782b6bd86f554f5fd3aad313f216ba4 Mon Sep 17 00:00:00 2001 From: Xincun Li Date: Tue, 8 Oct 2024 22:39:41 -0700 Subject: [PATCH 1/2] HLD for gNMI health diagnosis --- doc/mgmt/gnmi/gNMI_Health_Diag.md | 218 ++++++++++++++++++++++++++++++ 1 file changed, 218 insertions(+) create mode 100644 doc/mgmt/gnmi/gNMI_Health_Diag.md diff --git a/doc/mgmt/gnmi/gNMI_Health_Diag.md b/doc/mgmt/gnmi/gNMI_Health_Diag.md new file mode 100644 index 0000000000..dc980115cf --- /dev/null +++ b/doc/mgmt/gnmi/gNMI_Health_Diag.md @@ -0,0 +1,218 @@ + +# High-Level Design (HLD) for Health Diagnostic Module in SONiC gNMI Server + +## Overview +The purpose of this design is to add a health diagnostic module to the SONiC gNMI server. The module will provide system health metrics, such as CPU utilization, memory utilization, and network health, allowing monitoring through the gNMI interface. The design supports both **Dial-in** and **Dial-out** modes of gNMI operation. + +## Assumption + +- Phase1: Collecting gNMI server +- Phase2: Collection ? container +- ... + +## Requirements +- **Health Metrics to Collect:** + - CPU Utilization (%) + - Memory Utilization (%) + - Active gNMI sessions + - Network interface health (up/down status) + - To be enriched... + +- **YANG Model Support:** + - The gNMI server should expose health diagnostics through the YANG model. + +- **gNMI Integration:** + - The health data must be available as part of the gNMI `Get` RPC for **Dial-in** mode. + - For **Dial-out** mode, the gNMI server will push health diagnostics to a remote collector based on a periodic schedule. + +- **Performance Considerations:** + - The health checks should be lightweight and should not negatively impact gNMI server performance. + +## Modes of Operation + +### 1. Dial-in Mode +In Dial-in mode, the client makes a request to the gNMI server for health diagnostics. The server gathers the requested metrics and responds to the client with the data. + +### 2. Dial-out Mode +In Dial-out mode, the gNMI server is configured to automatically push health metrics to a remote system (collector) periodically or upon specific events. The push interval and collector addresses are configurable. + +## Design Components + +### 1. Health Diagnostic Module +The health diagnostic module is responsible for gathering metrics related to system resource usage and the health of network interfaces. + +#### Module Structure +- `GetHealthInfo()`: A function to gather health metrics including CPU and memory usage. +- `GetCPUUtilization()`: Function for calculating CPU utilization. +- `GetSessionUtilization()`: A function to gather session utilization. +- `GetServiceStatus()`: Function for collecting Service Status. +- `YANG Model`: Extends the existing gNMI YANG model to include a new path for the health diagnostics. + +```go +package health + +type HealthInfo struct { + CPUUtilization float64 + MemoryUtilization float64 +} + +type Status int + +const ( + Unknown Status = iota + Up + Down +) + +func (s Status) String() string { + switch s { + case Up: + return "Up" + case Down: + return "Down" + default: + return "Unknown" + } +} + +func GetStatus() Status { + return Pending +} + +func GetHealthInfo() HealthInfo { + // Memory stats collection + var memStats runtime.MemStats + runtime.ReadMemStats(&memStats) + + return HealthInfo{ + CPUUtilization: GetCPUUtilization(), + MemoryUtilization: float64(memStats.Alloc) / float64(memStats.TotalAlloc), + SessionUtilization: GetSessionStatistics(), + ServiceStatus: GetStatus(), + } +} +``` + +### 2. YANG Model Extension +The YANG model is extended to include paths for health diagnostics (`cpu-utilization`, `memory-utilization`, `session-utilization`, `service-status`). + +```yang +module sonic-health { + namespace "http://github.com/sonic-net/sonic-gnmi/health"; + prefix "sh"; + + container health-status { + leaf cpu-utilization { + type decimal64 { fraction-digits 2; } + description "Percentage of CPU utilization"; + } + leaf memory-utilization { + type decimal64 { fraction-digits 2; } + description "Percentage of memory utilization"; + } + leaf session-utilization { + type int32 { + range "0..35000"; + } + description "In house session utilization"; + } + leaf service-status { + type enumeration { + enum up; + enum down; + } + default up; + description "Service Status"; + } + } +} +``` + +### 3. gNMI Server Integration +#### 3.1 Dial-in Mode +The `Get` RPC is extended to support health diagnostic requests. When the `/health-status` path is requested, the health metrics are returned to the client. + +```go +func (s *Server) Get(ctx context.Context, req *gnmi.GetRequest) (*gnmi.GetResponse, error) { + // Existing Get functionality here + + for _, path := range req.Path { + if path == "/health-status" { + healthInfo := health.GetHealthInfo() + return &gnmi.GetResponse{ + Notification: []*gnmi.Notification{ + &gnmi.Notification{ + Update: []*gnmi.Update{ + &gnmi.Update{ + Path: &gnmi.Path{Element: []string{"cpu-utilization"}}, + Val: &gnmi.TypedValue{Value: &gnmi.TypedValue_DecimalVal{DecimalVal: healthInfo.CPUUtilization}}, + }, + &gnmi.Update{ + Path: &gnmi.Path{Element: []string{"memory-utilization"}}, + Val: &gnmi.TypedValue{Value: &gnmi.TypedValue_DecimalVal{DecimalVal: healthInfo.MemoryUtilization}}, + }, + }, + }, + }, + }, nil + } + } + + // Existing code + return &gnmi.GetResponse{}, nil +} +``` + +#### 3.2 Dial-out Mode +In Dial-out mode, the server periodically pushes health diagnostics to a remote collector. The push interval and destination are configurable. + +```go +func (s *Server) PushHealthData() { + for { + time.Sleep(time.Duration(s.pushInterval) * time.Second) + healthInfo := health.GetHealthInfo() + s.pushToCollector(healthInfo) + } +} + +func (s *Server) pushToCollector(healthInfo HealthInfo) { + // Logic to push health metrics to remote collector +} +``` + +### 4. Testing and Validation +Unit tests will be written to validate the functionality of the health diagnostics module in both **Dial-in** and **Dial-out** modes. These tests will ensure that health data is correctly returned in response to client requests in Dial-in mode and successfully pushed to remote collectors in Dial-out mode. + +## Flowchart +The following diagram illustrates the flow of how the health diagnostic module integrates with the gNMI server in both **Dial-in** and **Dial-out** modes: + +### Dial-in +```mermaid +graph TD + A[gNMI Client] -->|Dial-in Request /health-status| B[gNMI Server] + B --> C{Health Diagnostic Module} + C --> D[Collect CPU Utilization] + C --> E[Collect Memory Utilization] + C --> F[Collect Session Utilization] + C --> G[Collect Service Status] + D --> H[Return CPU Usage] + E --> I[Return Memory Usage] + F --> J[Return Session Usage] + G --> K[Return Service Status] + H --> B + I --> B + J --> B + K --> B + B -->|Send Response| A +``` + +### Dial-out +```mermaid +graph TD + H[gNMI Server] -->|Dial-out| I[Remote Collector] + H --> J[Push Health Data Periodically] + J --> I +``` + +## Conclusion +This high-level design outlines the addition of a health diagnostic module to the SONiC gNMI server, supporting both Dial-in and Dial-out modes. The design is lightweight, modular, and allows for easy extension to gather additional health metrics. It integrates seamlessly into the gNMI interface, enabling clients to monitor the health of the server in real time and allowing automatic push of health metrics to remote collectors in Dial-out mode. From 85ed13279a0520bbbe3ee48b6332beb69d1bb70f Mon Sep 17 00:00:00 2001 From: Xincun Li Date: Wed, 9 Oct 2024 12:10:02 -0700 Subject: [PATCH 2/2] Add two levels health --- doc/mgmt/gnmi/gNMI_Health_Diag.md | 214 ++++++++++++++---------------- 1 file changed, 99 insertions(+), 115 deletions(-) diff --git a/doc/mgmt/gnmi/gNMI_Health_Diag.md b/doc/mgmt/gnmi/gNMI_Health_Diag.md index dc980115cf..9bdae46fb3 100644 --- a/doc/mgmt/gnmi/gNMI_Health_Diag.md +++ b/doc/mgmt/gnmi/gNMI_Health_Diag.md @@ -12,18 +12,27 @@ The purpose of this design is to add a health diagnostic module to the SONiC gNM ## Requirements - **Health Metrics to Collect:** - - CPU Utilization (%) - - Memory Utilization (%) - - Active gNMI sessions - - Network interface health (up/down status) - - To be enriched... + - System-level: + - CPU Utilization (%) + - Memory Utilization (%) + - Active gNMI sessions + - Network interface health (up/down status) + + - Container-level: + - Container CPU utilization + - Container memory usage + - Disk I/O + - Network I/O + - Container status (running, exited, restarting, etc.) + - Uptime (how long the container has been running) - **YANG Model Support:** - The gNMI server should expose health diagnostics through the YANG model. + - YANG paths should support both system-level and container-level metrics. - **gNMI Integration:** - The health data must be available as part of the gNMI `Get` RPC for **Dial-in** mode. - - For **Dial-out** mode, the gNMI server will push health diagnostics to a remote collector based on a periodic schedule. + - For **Dial-out** mode, the gNMI server will push health diagnostics (both system and container) to a remote collector based on a periodic schedule. - **Performance Considerations:** - The health checks should be lightweight and should not negatively impact gNMI server performance. @@ -31,188 +40,163 @@ The purpose of this design is to add a health diagnostic module to the SONiC gNM ## Modes of Operation ### 1. Dial-in Mode -In Dial-in mode, the client makes a request to the gNMI server for health diagnostics. The server gathers the requested metrics and responds to the client with the data. +In Dial-in mode, the client makes a request to the gNMI server for health diagnostics. The server gathers the requested metrics (both system-level and container-level) and responds to the client with the data. ### 2. Dial-out Mode -In Dial-out mode, the gNMI server is configured to automatically push health metrics to a remote system (collector) periodically or upon specific events. The push interval and collector addresses are configurable. +In Dial-out mode, the gNMI server is configured to automatically push health metrics (system and container) to a remote system (collector) periodically or upon specific events. The push interval and collector addresses are configurable. ## Design Components ### 1. Health Diagnostic Module -The health diagnostic module is responsible for gathering metrics related to system resource usage and the health of network interfaces. +The health diagnostic module is responsible for gathering both system-level and container-specific metrics related to system resource usage and the health of network interfaces and containers. #### Module Structure -- `GetHealthInfo()`: A function to gather health metrics including CPU and memory usage. -- `GetCPUUtilization()`: Function for calculating CPU utilization. -- `GetSessionUtilization()`: A function to gather session utilization. -- `GetServiceStatus()`: Function for collecting Service Status. -- `YANG Model`: Extends the existing gNMI YANG model to include a new path for the health diagnostics. +- `GetHealthInfo()`: A function to gather system-level health metrics including CPU and memory usage. +- `GetContainerHealth()`: A function to gather container-level metrics (container CPU, memory, network I/O, disk I/O, status, and uptime). ```go package health -type HealthInfo struct { - CPUUtilization float64 - MemoryUtilization float64 +type ContainerHealthInfo struct { + ContainerID string + CPUUtilization float64 + MemoryUsage float64 + NetworkIO float64 + DiskIO float64 + Status string + Uptime int64 } -type Status int - -const ( - Unknown Status = iota - Up - Down -) - -func (s Status) String() string { - switch s { - case Up: - return "Up" - case Down: - return "Down" - default: - return "Unknown" - } -} - -func GetStatus() Status { - return Pending -} - -func GetHealthInfo() HealthInfo { - // Memory stats collection - var memStats runtime.MemStats - runtime.ReadMemStats(&memStats) - - return HealthInfo{ - CPUUtilization: GetCPUUtilization(), - MemoryUtilization: float64(memStats.Alloc) / float64(memStats.TotalAlloc), - SessionUtilization: GetSessionStatistics(), - ServiceStatus: GetStatus(), - } +func GetContainerHealth() ([]ContainerHealthInfo, error) { + // Example logic to gather container health metrics (using Docker or containerd APIs) } ``` ### 2. YANG Model Extension -The YANG model is extended to include paths for health diagnostics (`cpu-utilization`, `memory-utilization`, `session-utilization`, `service-status`). +We will extend the YANG model to support both system-level and container-level health metrics. ```yang module sonic-health { namespace "http://github.com/sonic-net/sonic-gnmi/health"; prefix "sh"; - container health-status { + container system-health-status { leaf cpu-utilization { type decimal64 { fraction-digits 2; } - description "Percentage of CPU utilization"; + description "System CPU utilization"; } + leaf memory-utilization { type decimal64 { fraction-digits 2; } - description "Percentage of memory utilization"; + description "System memory utilization"; } - leaf session-utilization { - type int32 { - range "0..35000"; + } + + container container-health-status { + list container { + key "container-id"; + + leaf container-id { + type string; + description "ID of the container"; } - description "In house session utilization"; - } - leaf service-status { - type enumeration { - enum up; - enum down; + + leaf cpu-utilization { + type decimal64 { fraction-digits 2; } + description "Percentage of CPU utilization for the container"; + } + + leaf memory-usage { + type decimal64 { fraction-digits 2; } + description "Memory usage for the container"; + } + + leaf network-io { + type decimal64 { fraction-digits 2; } + description "Network I/O for the container"; + } + + leaf disk-io { + type decimal64 { fraction-digits 2; } + description "Disk I/O for the container"; + } + + leaf status { + type string; + description "Container status (running, stopped, etc.)"; + } + + leaf uptime { + type int64; + description "Uptime of the container in seconds"; } - default up; - description "Service Status"; } } } ``` ### 3. gNMI Server Integration + #### 3.1 Dial-in Mode -The `Get` RPC is extended to support health diagnostic requests. When the `/health-status` path is requested, the health metrics are returned to the client. +The `Get` RPC will support paths to retrieve both system and container health diagnostics. ```go func (s *Server) Get(ctx context.Context, req *gnmi.GetRequest) (*gnmi.GetResponse, error) { - // Existing Get functionality here - for _, path := range req.Path { - if path == "/health-status" { - healthInfo := health.GetHealthInfo() - return &gnmi.GetResponse{ - Notification: []*gnmi.Notification{ - &gnmi.Notification{ - Update: []*gnmi.Update{ - &gnmi.Update{ - Path: &gnmi.Path{Element: []string{"cpu-utilization"}}, - Val: &gnmi.TypedValue{Value: &gnmi.TypedValue_DecimalVal{DecimalVal: healthInfo.CPUUtilization}}, - }, - &gnmi.Update{ - Path: &gnmi.Path{Element: []string{"memory-utilization"}}, - Val: &gnmi.TypedValue{Value: &gnmi.TypedValue_DecimalVal{DecimalVal: healthInfo.MemoryUtilization}}, - }, - }, - }, - }, - }, nil + if path == "/container-health-status" { + containerHealthInfo, err := health.GetContainerHealth() + if err != nil { + return nil, err + } + // Construct the GetResponse for container health data } } - // Existing code return &gnmi.GetResponse{}, nil } ``` #### 3.2 Dial-out Mode -In Dial-out mode, the server periodically pushes health diagnostics to a remote collector. The push interval and destination are configurable. +The gNMI server will periodically push both system and container health diagnostics to a remote collector. ```go func (s *Server) PushHealthData() { for { time.Sleep(time.Duration(s.pushInterval) * time.Second) - healthInfo := health.GetHealthInfo() + healthInfo, err := health.GetContainerHealth() + if err != nil { + continue + } s.pushToCollector(healthInfo) } } - -func (s *Server) pushToCollector(healthInfo HealthInfo) { - // Logic to push health metrics to remote collector -} ``` ### 4. Testing and Validation -Unit tests will be written to validate the functionality of the health diagnostics module in both **Dial-in** and **Dial-out** modes. These tests will ensure that health data is correctly returned in response to client requests in Dial-in mode and successfully pushed to remote collectors in Dial-out mode. +The extended testing will include: +- Validation of both system and container health data. +- Simulated container crashes or restarts to verify health diagnostics behavior. +- Testing resource limits and error conditions for container health monitoring. ## Flowchart -The following diagram illustrates the flow of how the health diagnostic module integrates with the gNMI server in both **Dial-in** and **Dial-out** modes: +The following diagram illustrates the extended flow for both system and container health diagnostics in **Dial-in** and **Dial-out** modes: -### Dial-in ```mermaid graph TD A[gNMI Client] -->|Dial-in Request /health-status| B[gNMI Server] B --> C{Health Diagnostic Module} - C --> D[Collect CPU Utilization] - C --> E[Collect Memory Utilization] - C --> F[Collect Session Utilization] - C --> G[Collect Service Status] - D --> H[Return CPU Usage] - E --> I[Return Memory Usage] - F --> J[Return Session Usage] - G --> K[Return Service Status] - H --> B - I --> B - J --> B - K --> B + C --> D[Collect System Metrics] + C --> E[Collect Container Metrics] + D --> F[Return System Health Data] + E --> G[Return Container Health Data] + F --> B + G --> B B -->|Send Response| A -``` -### Dial-out -```mermaid -graph TD H[gNMI Server] -->|Dial-out| I[Remote Collector] - H --> J[Push Health Data Periodically] + H --> J[Push System and Container Health Data Periodically] J --> I ``` ## Conclusion -This high-level design outlines the addition of a health diagnostic module to the SONiC gNMI server, supporting both Dial-in and Dial-out modes. The design is lightweight, modular, and allows for easy extension to gather additional health metrics. It integrates seamlessly into the gNMI interface, enabling clients to monitor the health of the server in real time and allowing automatic push of health metrics to remote collectors in Dial-out mode. +This high-level design extends the gNMI health diagnostic module to cover both system and container health diagnostics. The design ensures comprehensive health monitoring and supports both **Dial-in** and **Dial-out** modes, providing real-time health information to clients and remote collectors.