-
Notifications
You must be signed in to change notification settings - Fork 1
/
rasdaemon.go
148 lines (128 loc) · 3.79 KB
/
rasdaemon.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
// Copyright (c) 2017 mgIT GmbH. All rights reserved.
// Distributed under the Apache License. See LICENSE for details.
package main
import (
"database/sql"
"fmt"
"log"
"os"
"strconv"
"strings"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
_ "modernc.org/sqlite"
)
// RasDaemonOptions configures a RasdaemonChecker.
type RasDaemonOptions struct {
// Path is the location of the rasdaemon SQLite database file (JSON key
// "path"). NewRasdaemonChecker substitutes
// /var/lib/rasdaemon/ras-mc_event.db when it is empty.
Path string `json:"path"`
}
// RasdaemonChecker reads record counts from the rasdaemon SQLite database and
// reports them as Prometheus metrics through its Describe and Collect methods.
// Instances are created with NewRasdaemonChecker and released with Close.
type RasdaemonChecker struct {
// opts holds the options the checker was created with, after the default
// path has been filled in.
opts RasDaemonOptions
// db is the database handle opened by NewRasdaemonChecker with mode=ro.
db *sql.DB
// promRasdaemonMCERecordSize describes rasdaemon_mce_record_total
// (labels: bank, bank_name, action_required).
promRasdaemonMCERecordSize *prometheus.Desc
// promRasdaemonMCEventSize describes rasdaemon_mc_event_total
// (label: err_type).
promRasdaemonMCEventSize *prometheus.Desc
}
// NewRasdaemonChecker opens the rasdaemon SQLite database in read-only mode
// and returns a checker that reports its record counts as Prometheus metrics.
//
// When opts.Path is empty it defaults to /var/lib/rasdaemon/ras-mc_event.db.
// An error is returned when the file cannot be stat'ed, when the database
// handle cannot be created, or when the initial ping fails. On success the
// caller should call Close on the returned checker to release the database
// handle.
func NewRasdaemonChecker(opts RasDaemonOptions) (*RasdaemonChecker, error) {
	if opts.Path == "" {
		opts.Path = "/var/lib/rasdaemon/ras-mc_event.db"
	}

	if _, err := os.Stat(opts.Path); err != nil {
		if errors.Is(err, os.ErrNotExist) {
			return nil, errors.Wrapf(err, "file %q does not exist", opts.Path)
		}
		// Report other stat failures (for example permission problems) here
		// rather than letting them resurface as a less specific ping error.
		return nil, errors.Wrapf(err, "stat %q", opts.Path)
	}

	db, err := sql.Open("sqlite", fmt.Sprintf("file:%s?mode=ro", opts.Path))
	if err != nil {
		return nil, errors.Wrap(err, "sql.Open")
	}

	// sql.Open may only validate its arguments; Ping forces a real connection
	// so an unusable database is detected at construction time.
	if err := db.Ping(); err != nil {
		// Release the handle so it is not leaked. The ping failure is the
		// error worth reporting, so a secondary close error is dropped.
		_ = db.Close()
		return nil, errors.Wrap(err, "db.Ping")
	}

	return &RasdaemonChecker{
		opts: opts,
		db:   db,
		promRasdaemonMCERecordSize: prometheus.NewDesc(
			"rasdaemon_mce_record_total",
			"size of the rasdaemon mce_records",
			[]string{"bank", "bank_name", "action_required"}, nil),
		promRasdaemonMCEventSize: prometheus.NewDesc(
			"rasdaemon_mc_event_total",
			"size of the rasdaemon mc-event log events",
			[]string{"err_type"}, nil),
	}, nil
}
// Describe sends the descriptors of both rasdaemon metrics to ch: the
// mce_record descriptor first, then the mc_event descriptor.
func (c *RasdaemonChecker) Describe(ch chan<- *prometheus.Desc) {
	descs := [...]*prometheus.Desc{
		c.promRasdaemonMCERecordSize,
		c.promRasdaemonMCEventSize,
	}
	for _, d := range descs {
		ch <- d
	}
}
// CollectRasdaemonMCERecordSize counts the rows of the mce_record table
// grouped by bank, bank name and whether action is required, and sends one
// gauge sample per group to ch.
//
// Query, scan and iteration failures are logged rather than returned; a row
// that fails to scan is skipped.
func (c *RasdaemonChecker) CollectRasdaemonMCERecordSize(ch chan<- prometheus.Metric) {
	//nolint:godox
	// Todo: This could break when rasdaemon is updated, because
	// action_required is derived from the text of error_msg.
	// See: https://pagure.io/rasdaemon/blob/master/f/mce-amd.c
	const query = `
select bank, bank_name, (case when error_msg like '% no action required.' then 'no' else 'yes' end) as action_required, count(id)
from mce_record group by bank, bank_name, action_required;
`

	result, err := c.db.Query(query)
	if err != nil {
		log.Println("failed to query mce_record:", err)
		return
	}
	defer result.Close()

	var (
		bankID   int
		name     string
		required string
		count    int
	)
	for result.Next() {
		if scanErr := result.Scan(&bankID, &name, &required, &count); scanErr != nil {
			log.Println("sql.Scan:", scanErr)
			continue
		}

		bankLabel := strconv.Itoa(bankID)
		// The bank number already has its own label, so drop the redundant
		// suffix from the name, e.g. "Unified Memory Controller (bank=18)".
		name = strings.TrimSuffix(name, " (bank="+bankLabel+")")

		sample := prometheus.MustNewConstMetric(
			c.promRasdaemonMCERecordSize,
			prometheus.GaugeValue,
			float64(count),
			bankLabel, name, required,
		)
		ch <- sample
	}

	if iterErr := result.Err(); iterErr != nil {
		log.Println("sql.Next:", iterErr)
	}
}
// CollectRasdaemonMCEventSize counts the rows of the mc_event table grouped by
// err_type and sends one gauge sample per error type to ch.
//
// Query, scan and iteration failures are logged rather than returned; a row
// that fails to scan is skipped.
func (c *RasdaemonChecker) CollectRasdaemonMCEventSize(ch chan<- prometheus.Metric) {
	// There are exactly 4 error types in mc_events: Corrected, Uncorrected, Fatal and Info.
	// See: https://pagure.io/rasdaemon/blob/master/f/ras-mc-handler.c
	const query = "select err_type, count(id) from mc_event group by err_type"

	result, err := c.db.Query(query)
	if err != nil {
		log.Println("failed to query mc_event:", err)
		return
	}
	defer result.Close()

	var (
		kind  string
		count int
	)
	for result.Next() {
		if scanErr := result.Scan(&kind, &count); scanErr != nil {
			log.Println("sql.Scan:", scanErr)
			continue
		}

		sample := prometheus.MustNewConstMetric(
			c.promRasdaemonMCEventSize,
			prometheus.GaugeValue,
			float64(count),
			kind,
		)
		ch <- sample
	}

	if iterErr := result.Err(); iterErr != nil {
		log.Println("sql.Next:", iterErr)
	}
}
// Collect gathers every rasdaemon metric and sends the samples to ch:
// the mce_record counts first, followed by the mc_event counts.
func (c *RasdaemonChecker) Collect(ch chan<- prometheus.Metric) {
	collectors := [...]func(chan<- prometheus.Metric){
		c.CollectRasdaemonMCERecordSize,
		c.CollectRasdaemonMCEventSize,
	}
	for _, collect := range collectors {
		collect(ch)
	}
}
// Close closes the underlying database handle. A failure is returned wrapped
// with the "sql.Close" context; nil is returned on success.
func (c *RasdaemonChecker) Close() error {
	closeErr := c.db.Close()
	if closeErr == nil {
		return nil
	}
	return errors.Wrap(closeErr, "sql.Close")
}