From e300b92bad501d7dc612685199c58147e48e4bbb Mon Sep 17 00:00:00 2001 From: xincunli-sonic Date: Mon, 28 Oct 2024 22:58:19 -0700 Subject: [PATCH 1/3] Add init health module. --- health/health.go | 118 +++++++++++++++++++++++++++++ sonic_data_client/non_db_client.go | 46 ++++++++++- 2 files changed, 161 insertions(+), 3 deletions(-) create mode 100644 health/health.go diff --git a/health/health.go b/health/health.go new file mode 100644 index 00000000..3790b96f --- /dev/null +++ b/health/health.go @@ -0,0 +1,118 @@ +package health + +import ( + "crypto/x509" + "encoding/pem" + "fmt" + "log/syslog" + "os/exec" + "strconv" + "strings" + "time" +) + +type ContainerHealthInfo struct { + ContainerID string + CPUUtilization float64 + MemoryUsage float64 + DiskOccupation float64 + CertExpiration int64 // days until expiration + Status string +} + +// GetHealthInfo gathers health information for the gNMI container +func GetHealthInfo() ([]ContainerHealthInfo, error) { + // Here we interact with Docker to get container stats + cmd := "docker stats --no-stream --format \"{{.Container}},{{.CPUPerc}},{{.MemPerc}},{{.Name}}\" | grep gnmi" + output, err := exec.Command("sh", "-c", cmd).Output() + if err != nil { + return nil, fmt.Errorf("failed to retrieve container stats: %v", err) + } + + var healthInfo []ContainerHealthInfo + lines := strings.Split(string(output), "\n") + for _, line := range lines { + if line == "" { + continue + } + parts := strings.Split(line, ",") + if len(parts) < 4 { + continue + } + + containerID := parts[0] + container := ContainerHealthInfo{ + ContainerID: containerID, + CPUUtilization: parsePercentage(parts[1]), + MemoryUsage: parsePercentage(parts[2]), + DiskOccupation: getDiskOccupation(containerID), + CertExpiration: getCertExpiration(containerID), + Status: parts[3], + } + + healthInfo = append(healthInfo, container) + } + + return healthInfo, nil +} + +// getDiskOccupation retrieves the disk usage for the container +func 
getDiskOccupation(containerID string) float64 { + // Run the command to get disk usage inside the container + cmd := fmt.Sprintf("docker exec %s df / | tail -1 | awk '{print $5}'", containerID) + output, err := exec.Command("sh", "-c", cmd).Output() + if err != nil { + fmt.Printf("failed to retrieve disk occupation for container %s: %v\n", containerID, err) + return 0.0 + } + return parsePercentage(strings.TrimSpace(string(output))) +} + +// getCertExpiration retrieves the certificate expiration for the container +func getCertExpiration(containerID string) int64 { + // Run the command to get the certificate from the container + cmd := fmt.Sprintf("docker exec %s cat /path/to/cert.pem", containerID) + output, err := exec.Command("sh", "-c", cmd).Output() + if err != nil { + fmt.Printf("failed to retrieve certificate for container %s: %v\n", containerID, err) + return 0 + } + + // Parse the certificate to get the expiration date + block, _ := pem.Decode(output) + if block == nil { + fmt.Printf("failed to parse certificate PEM for container %s\n", containerID) + return 0 + } + + cert, err := x509.ParseCertificate(block.Bytes) + if err != nil { + fmt.Printf("failed to parse certificate for container %s: %v\n", containerID, err) + return 0 + } + + // Calculate days until expiration + return int64(time.Until(cert.NotAfter).Hours() / 24) +} + +// LogHealthProofs logs container health information to syslog +func LogHealthProofs(container ContainerHealthInfo) { + logwriter, err := syslog.New(syslog.LOG_NOTICE, "container_health") + if err == nil { + logwriter.Info("Health check for container " + container.ContainerID + ": " + + "CPU=" + fmt.Sprintf("%.2f", container.CPUUtilization) + + ", Memory=" + fmt.Sprintf("%.2f", container.MemoryUsage) + + ", Disk=" + fmt.Sprintf("%.2f", container.DiskOccupation) + + ", CertExpiryDays=" + fmt.Sprintf("%d", container.CertExpiration)) + } +} + +// Helper function to parse percentages +func parsePercentage(value string) float64 { + value 
= strings.TrimSuffix(value, "%") + parsedValue, err := strconv.ParseFloat(value, 64) + if err != nil { + return 0.0 + } + return parsedValue +} \ No newline at end of file diff --git a/sonic_data_client/non_db_client.go b/sonic_data_client/non_db_client.go index a1c4a6c5..d8428da9 100644 --- a/sonic_data_client/non_db_client.go +++ b/sonic_data_client/non_db_client.go @@ -11,6 +11,7 @@ import ( spb "github.com/sonic-net/sonic-gnmi/proto" "github.com/Workiva/go-datastructures/queue" + "github.com/sonic-net/sonic-gnmi/health" linuxproc "github.com/c9s/goprocinfo/linux" log "github.com/golang/glog" gnmipb "github.com/openconfig/gnmi/proto/gnmi" @@ -39,6 +40,13 @@ type statsRing struct { mu sync.RWMutex // Mutex for data protection } +type healthInfoStash struct { + once sync.Once + healthInfo []ContainerHealthInfo + err error + isHealthy bool +} + // SonicVersionInfo is a data model to serialize '/etc/sonic/sonic_version.yml' type SonicVersionInfo struct { BuildVersion string `yaml:"build_version" json:"build_version"` @@ -101,6 +109,10 @@ var ( path: []string{"OTHERS", "osversion", "build"}, getFunc: dataGetFunc(getBuildVersion), }, + { // Container Health Status + path: []string{"OTHERS", "container-health-status", "gnmi"}, + getFunc: dataGetFunc(getContainerHealthStatus), + }, } ) @@ -137,7 +149,7 @@ func getCpuUtilPercents(cur, last *linuxproc.CPUStat) uint64 { idleTicks := cur.Idle - last.Idle totalTicks := curTotal - lastTotal if totalTicks == 0 { // No change in CPU Utilization - return 0 + return 0 } return 100 * (totalTicks - idleTicks) / totalTicks } @@ -335,6 +347,35 @@ func getBuildVersion() ([]byte, error) { return b, nil } +func getContainerHealthStatus() ([]byte, error) { + // Load and parse the container health status + healthInfoStash.once.Do(func() { + healthInfoStash.healthInfo, healthInfoStash.err = GetHealthInfo() + if healthInfoStash.err != nil { + log.Errorf("Failed to gather health metrics: %v", healthInfoStash.err) + return + } + + // Evaluate 
health info + healthInfoStash.isHealthy = true + for _, container := range healthInfoStash.healthInfo { + LogHealthProofs(container) + if container.CPUUtilization > 80.0 || container.MemoryUsage > 80.0 || container.DiskOccupation > 90.0 || container.CertExpiration <= 30 { + healthInfoStash.isHealthy = false + break + } + } + }) + + b, err := json.Marshal(healthInfoStash.healthInfo) + if err != nil { + log.V(2).Infof("%v", err) + return b, err + } + log.V(4).Infof("ReportHealthToKubeSonic, output %v", string(b)) + return b, nil +} + func WriteStatsToBuffer(stat *linuxproc.Stat) { statsR.mu.Lock() statsR.buff[statsR.writeIdx] = stat @@ -588,7 +629,7 @@ func (c *NonDbClient) Close() error { return nil } -func (c *NonDbClient) Set(delete []*gnmipb.Path, replace []*gnmipb.Update, update []*gnmipb.Update) error { +func (c *NonDbClient) Set(delete []*gnmipb.Path, replace []*gnmipb.Update, update []*gnmipb.Update) error { return nil } func (c *NonDbClient) Capabilities() []gnmipb.ModelData { @@ -599,4 +640,3 @@ func (c *NonDbClient) SentOne(val *Value) { func (c *NonDbClient) FailedSend() { } - From 4efda84f3a6643481474715853828e6d47d46e32 Mon Sep 17 00:00:00 2001 From: xincunli-sonic Date: Mon, 28 Oct 2024 23:28:43 -0700 Subject: [PATCH 2/3] fix security issue --- health/health.go | 9 ++++++--- sonic_data_client/non_db_client.go | 23 ++++++++++++----------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/health/health.go b/health/health.go index 3790b96f..92843cb5 100644 --- a/health/health.go +++ b/health/health.go @@ -24,7 +24,8 @@ type ContainerHealthInfo struct { func GetHealthInfo() ([]ContainerHealthInfo, error) { // Here we interact with Docker to get container stats cmd := "docker stats --no-stream --format \"{{.Container}},{{.CPUPerc}},{{.MemPerc}},{{.Name}}\" | grep gnmi" - output, err := exec.Command("sh", "-c", cmd).Output() + args := strings.Fields(cmd) + output, err := exec.Command(args[0], args[1:]...).Output() if err != nil { return nil, 
fmt.Errorf("failed to retrieve container stats: %v", err) } @@ -60,7 +61,8 @@ func GetHealthInfo() ([]ContainerHealthInfo, error) { func getDiskOccupation(containerID string) float64 { // Run the command to get disk usage inside the container cmd := fmt.Sprintf("docker exec %s df / | tail -1 | awk '{print $5}'", containerID) - output, err := exec.Command("sh", "-c", cmd).Output() + args := strings.Fields(cmd) + output, err := exec.Command(args[0], args[1:]...).Output() if err != nil { fmt.Printf("failed to retrieve disk occupation for container %s: %v\n", containerID, err) return 0.0 @@ -72,7 +74,8 @@ func getDiskOccupation(containerID string) float64 { func getCertExpiration(containerID string) int64 { // Run the command to get the certificate from the container cmd := fmt.Sprintf("docker exec %s cat /path/to/cert.pem", containerID) - output, err := exec.Command("sh", "-c", cmd).Output() + args := strings.Fields(cmd) + output, err := exec.Command(args[0], args[1:]...).Output() if err != nil { fmt.Printf("failed to retrieve certificate for container %s: %v\n", containerID, err) return 0 diff --git a/sonic_data_client/non_db_client.go b/sonic_data_client/non_db_client.go index d8428da9..59cadcf5 100644 --- a/sonic_data_client/non_db_client.go +++ b/sonic_data_client/non_db_client.go @@ -42,7 +42,7 @@ type statsRing struct { type healthInfoStash struct { once sync.Once - healthInfo []ContainerHealthInfo + healthInfo []health.ContainerHealthInfo err error isHealthy bool } @@ -349,30 +349,31 @@ func getBuildVersion() ([]byte, error) { func getContainerHealthStatus() ([]byte, error) { // Load and parse the container health status - healthInfoStash.once.Do(func() { - healthInfoStash.healthInfo, healthInfoStash.err = GetHealthInfo() - if healthInfoStash.err != nil { - log.Errorf("Failed to gather health metrics: %v", healthInfoStash.err) + var stash healthInfoStash + stash.once.Do(func() { + stash.healthInfo, stash.err = health.GetHealthInfo() // Assuming GetHealthInfo() 
returns ([]ContainerHealthInfo, error) + if stash.err != nil { + log.V(2).Infof("Failed to gather health metrics: %v", stash.err) return } // Evaluate health info - healthInfoStash.isHealthy = true - for _, container := range healthInfoStash.healthInfo { - LogHealthProofs(container) + stash.isHealthy = true + for _, container := range stash.healthInfo { + health.LogHealthProofs(container) if container.CPUUtilization > 80.0 || container.MemoryUsage > 80.0 || container.DiskOccupation > 90.0 || container.CertExpiration <= 30 { - healthInfoStash.isHealthy = false + stash.isHealthy = false break } } }) - b, err := json.Marshal(healthInfoStash.healthInfo) + b, err := json.Marshal(stash.healthInfo) if err != nil { log.V(2).Infof("%v", err) return b, err } - log.V(4).Infof("ReportHealthToKubeSonic, output %v", string(b)) + log.V(4).Infof("getContainerHealthStatus, output %v", string(b)) return b, nil } From 0c63a1888268584bcb1cfdd5eb836308f175bbea Mon Sep 17 00:00:00 2001 From: xincunli-sonic Date: Mon, 28 Oct 2024 23:37:20 -0700 Subject: [PATCH 3/3] Fix dynamic exec command --- health/health.go | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/health/health.go b/health/health.go index 92843cb5..aed3897e 100644 --- a/health/health.go +++ b/health/health.go @@ -23,9 +23,7 @@ type ContainerHealthInfo struct { // GetHealthInfo gathers health information for the gNMI container func GetHealthInfo() ([]ContainerHealthInfo, error) { // Here we interact with Docker to get container stats - cmd := "docker stats --no-stream --format \"{{.Container}},{{.CPUPerc}},{{.MemPerc}},{{.Name}}\" | grep gnmi" - args := strings.Fields(cmd) - output, err := exec.Command(args[0], args[1:]...).Output() + output, err := exec.Command("docker", "stats", "--no-stream", "--format", "\"{{.Container}},{{.CPUPerc}},{{.MemPerc}},{{.Name}}\"", "| grep gnmi").Output() if err != nil { return nil, fmt.Errorf("failed to retrieve container stats: %v", err) } @@ -60,9 +58,7 @@ func 
GetHealthInfo() ([]ContainerHealthInfo, error) { // getDiskOccupation retrieves the disk usage for the container func getDiskOccupation(containerID string) float64 { // Run the command to get disk usage inside the container - cmd := fmt.Sprintf("docker exec %s df / | tail -1 | awk '{print $5}'", containerID) - args := strings.Fields(cmd) - output, err := exec.Command(args[0], args[1:]...).Output() + output, err := exec.Command("docker", "exec", containerID, "df", "/").Output() if err != nil { fmt.Printf("failed to retrieve disk occupation for container %s: %v\n", containerID, err) return 0.0 @@ -73,9 +69,7 @@ func getDiskOccupation(containerID string) float64 { // getCertExpiration retrieves the certificate expiration for the container func getCertExpiration(containerID string) int64 { // Run the command to get the certificate from the container - cmd := fmt.Sprintf("docker exec %s cat /path/to/cert.pem", containerID) - args := strings.Fields(cmd) - output, err := exec.Command(args[0], args[1:]...).Output() + output, err := exec.Command("docker", "exec", containerID, "cat", "/path/to/cert.pem").Output() if err != nil { fmt.Printf("failed to retrieve certificate for container %s: %v\n", containerID, err) return 0