-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmetrics.go
148 lines (142 loc) · 3.91 KB
/
metrics.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
package cage
import (
"github.com/apex/log"
"time"
"math"
"github.com/aws/aws-sdk-go/service/cloudwatch"
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/service/cloudwatch/cloudwatchiface"
"golang.org/x/sync/errgroup"
"errors"
)
// ServiceHealth carries the health figures computed for a service over a
// single observation window by AccumulatePeriodicServiceHealth.
type ServiceHealth struct {
	// availability is the success ratio, clamped to the range [0, 1].
	// responseTime is the averaged TargetResponseTime statistic.
	availability, responseTime float64
}
// NoDataPointsFoundError is the error returned by GetServiceMetricStatistics
// when CloudWatch answers with no data points for a metric that is required
// to have them ("RequestCount" or "TargetResponseTime").
type NoDataPointsFoundError struct {
// Input is the GetMetricStatistics request that produced the empty result.
Input *cloudwatch.GetMetricStatisticsInput
}
// Error implements the error interface.
//
// It returns a fixed description of the failure and, when the originating
// request is available, appends the name of the metric that had no data
// points. A nil receiver, nil Input or nil MetricName is tolerated so the
// method never panics while formatting an error.
func (v *NoDataPointsFoundError) Error() string {
	const msg = "no data points found on CloudWatch metrics"
	if v == nil || v.Input == nil || v.Input.MetricName == nil {
		return msg
	}
	return msg + ": metricName=" + *v.Input.MetricName
}
// GetServiceMetricStatistics queries CloudWatch for a single statistic of an
// "AWS/ApplicationELB" metric, scoped to the given load balancer and target
// group, and folds the returned data points into one number.
//
// Parameters:
//   - cw: CloudWatch client used to issue GetMetricStatistics.
//   - lbId, tgId: values for the "LoadBalancer" and "TargetGroup" dimensions.
//   - metricName: name of the metric to read.
//   - unit: the statistic to request; only "Sum" and "Average" are supported.
//   - startTime, endTime: the time window of the query.
//
// The query period is taken from envars.RollOutPeriod.
//
// Returns:
//   - for "Sum": the total of every data point's Sum;
//   - for "Average": the arithmetic mean of every data point's Average, or 0
//     when there are no data points (instead of NaN from a 0/0 division).
//
// Errors:
//   - the CloudWatch error, unchanged, when the API call fails;
//   - *NoDataPointsFoundError when metricName is "RequestCount" or
//     "TargetResponseTime" and no data points were returned;
//   - an "unsupported unit type" error for any other value of unit.
//
// The numeric result is always 0 when a non-nil error is returned.
func (envars *Envars) GetServiceMetricStatistics(
	cw cloudwatchiface.CloudWatchAPI,
	lbId string,
	tgId string,
	metricName string,
	unit string,
	startTime time.Time,
	endTime time.Time,
) (float64, error) {
	log.Infof(
		"getStatistics: LoadBalancer=%s, TargetGroup=%s, metricName=%s, unit=%s",
		lbId, tgId, metricName, unit,
	)
	input := &cloudwatch.GetMetricStatisticsInput{
		Namespace: aws.String("AWS/ApplicationELB"),
		Dimensions: []*cloudwatch.Dimension{
			{
				Name:  aws.String("LoadBalancer"),
				Value: aws.String(lbId),
			}, {
				Name:  aws.String("TargetGroup"),
				Value: aws.String(tgId),
			},
		},
		Statistics: []*string{&unit},
		MetricName: &metricName,
		StartTime:  &startTime,
		EndTime:    &endTime,
		Period:     envars.RollOutPeriod,
	}
	out, err := cw.GetMetricStatistics(input)
	if err != nil {
		log.Errorf("failed to get CloudWatch's '%s' metric statistics due to: %s", metricName, err.Error())
		return 0, err
	}
	// These two metrics must have data; the other metrics read by this
	// package legitimately come back empty and are treated as zero.
	if (metricName == "RequestCount" || metricName == "TargetResponseTime") && len(out.Datapoints) == 0 {
		return 0, &NoDataPointsFoundError{Input: input}
	}
	var total float64
	switch unit {
	case "Sum":
		for _, dp := range out.Datapoints {
			// Float64Value yields 0 for a nil pointer, so a data point that
			// lacks the requested statistic cannot cause a panic.
			total += aws.Float64Value(dp.Sum)
		}
		return total, nil
	case "Average":
		if len(out.Datapoints) == 0 {
			// Avoid 0/0, which would silently return NaN with a nil error.
			return 0, nil
		}
		for _, dp := range out.Datapoints {
			total += aws.Float64Value(dp.Average)
		}
		return total / float64(len(out.Datapoints)), nil
	default:
		return 0, NewErrorf("unsupported unit type: %s", unit)
	}
}
// AccumulatePeriodicServiceHealth reads the load balancer metrics for the
// window [startTime, endTime] and derives a ServiceHealth from them.
//
// Four statistics are fetched concurrently through
// GetServiceMetricStatistics: the Sum of "RequestCount",
// "HTTPCode_ELB_5XX_Count" and "HTTPCode_Target_5XX_Count", and the Average
// of "TargetResponseTime". Availability is computed as
//
//	(requestCnt - target5xxCnt) / (requestCnt + elb5xxCnt)
//
// and clamped to [0, 1].
//
// Parameters:
//   - cw: CloudWatch client used for every query.
//   - loadBalancerArn, targetGroupArn: ARNs from which the dimension values
//     are extracted with ExtractAlbId and ExtractTargetGroupId.
//   - startTime, endTime: the observation window.
//
// When a query fails with *NoDataPointsFoundError the whole fetch is retried
// every 15 seconds, up to 20 attempts in total. No wait is performed after
// the final attempt.
//
// Errors:
//   - an error when either ARN pointer is nil or cannot be parsed;
//   - any error from GetServiceMetricStatistics other than
//     *NoDataPointsFoundError, returned immediately;
//   - "failed to get precise metric data" when both the request count and the
//     ELB 5xx count are zero (the availability ratio would be undefined);
//   - "no data points found in 20 retries" when every attempt lacked data.
func (envars *Envars) AccumulatePeriodicServiceHealth(
	cw cloudwatchiface.CloudWatchAPI,
	loadBalancerArn *string,
	targetGroupArn *string,
	startTime time.Time,
	endTime time.Time,
) (*ServiceHealth, error) {
	const (
		maxRetryCnt      = 20 // 15sec x 20 = 5min
		retryIntervalSec = 15
	)
	var (
		lbId string
		tgId string
		err  error
	)
	if loadBalancerArn == nil || targetGroupArn == nil {
		return nil, NewErrorf("loadBalancerArn and targetGroupArn must not be nil")
	}
	if lbId, err = ExtractAlbId(*loadBalancerArn); err != nil {
		return nil, err
	}
	if tgId, err = ExtractTargetGroupId(*targetGroupArn); err != nil {
		return nil, err
	}
	for i := 0; i < maxRetryCnt; i++ {
		eg := errgroup.Group{}
		requestCnt := 0.0
		elb5xxCnt := 0.0
		target5xxCnt := 0.0
		responseTime := 0.0
		// accumulate builds a task that stores one statistic into dest. Each
		// task writes to its own variable, and the values are only read
		// after eg.Wait returns.
		accumulate := func(metricName string, unit string, dest *float64) func() error {
			return func() error {
				v, err := envars.GetServiceMetricStatistics(cw, lbId, tgId, metricName, unit, startTime, endTime)
				if err == nil {
					*dest = v
				}
				return err
			}
		}
		eg.Go(accumulate("RequestCount", "Sum", &requestCnt))
		eg.Go(accumulate("HTTPCode_ELB_5XX_Count", "Sum", &elb5xxCnt))
		eg.Go(accumulate("HTTPCode_Target_5XX_Count", "Sum", &target5xxCnt))
		eg.Go(accumulate("TargetResponseTime", "Average", &responseTime))
		waitErr := eg.Wait()
		if waitErr == nil {
			if requestCnt == 0 && elb5xxCnt == 0 {
				// The denominator of the availability ratio would be zero.
				return nil, errors.New("failed to get precise metric data")
			}
			avl := (requestCnt - target5xxCnt) / (requestCnt + elb5xxCnt)
			avl = math.Max(0, math.Min(1, avl))
			return &ServiceHealth{
				availability: avl,
				responseTime: responseTime,
			}, nil
		}
		// errors.As also recognizes the error when it has been wrapped.
		var noData *NoDataPointsFoundError
		if !errors.As(waitErr, &noData) {
			log.Errorf("failed to accumulate periodic service health due to: %s", waitErr.Error())
			return nil, waitErr
		}
		if i == maxRetryCnt-1 {
			// Last attempt: there is nothing left to wait for.
			break
		}
		// Depending on timing, the CloudWatch metric data points may not
		// exist yet.
		log.Warnf(
			"no data points found on CloudWatch Metrics between %s ~ %s. will retry after %d seconds",
			startTime.String(), endTime.String(), retryIntervalSec,
		)
		<-newTimer(time.Duration(retryIntervalSec) * time.Second).C
	}
	return nil, NewErrorf("no data points found in %d retries", maxRetryCnt)
}