Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add health checks #259

Merged
merged 4 commits into from
Dec 20, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 78 additions & 17 deletions cmd/fdsn-holdings-consumer/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,34 +10,76 @@
package main

import (
"context"
"database/sql"
"encoding/json"
"errors"
"fmt"
"log"
"log/slog"
"os"
"os/signal"
"strings"
"syscall"
"time"

"github.com/GeoNet/kit/aws/s3"
"github.com/GeoNet/kit/aws/sqs"
"github.com/GeoNet/kit/cfg"
"github.com/GeoNet/kit/health"
"github.com/GeoNet/kit/metrics"
)

const (
healthCheckAged = 5 * time.Minute //need to have a good heartbeat within this time
healthCheckStartup = 5 * time.Minute //ignore heartbeat messages for this time after starting
healthCheckTimeout = 30 * time.Second //health check timeout
healthCheckService = ":7777" //end point to listen to for SOH checks
healthCheckPath = "/soh"
)

var (
db *sql.DB
queueURL = os.Getenv("SQS_QUEUE_URL")
queueURL string
sqsClient sqs.SQS
s3Client *s3.S3
s3Client s3.S3
saveHoldings *sql.Stmt
)

type event struct {
s3.Event
}

// init and check aws variables
func initAwsClient() {
queueURL = os.Getenv("SQS_QUEUE_URL")
if queueURL == "" {
log.Fatal("SQS_QUEUE_URL is not set")
}

var err error
sqsClient, err = sqs.NewWithMaxRetries(100)
if err != nil {
log.Fatalf("error creating SQS client: %s", err)
}
if err = sqsClient.CheckQueue(queueURL); err != nil {
log.Fatalf("error checking queueURL %s: %s", queueURL, err.Error())
}

s3Client, err = s3.NewWithMaxRetries(3)
if err != nil {
log.Fatalf("error creating S3 client: %s", err)
}
}

func main() {
//check health
if health.RunningHealthCheck() {
healthCheck()
}

//run as normal service
initAwsClient()
p, err := cfg.PostgresEnv()
if err != nil {
log.Fatalf("error reading DB config from the environment vars: %s", err)
Expand Down Expand Up @@ -80,6 +122,9 @@ func main() {
db.SetMaxIdleConns(p.MaxIdle)
db.SetMaxOpenConns(p.MaxOpen)

// provide a soh heartbeat
health := health.New(healthCheckService, healthCheckAged, healthCheckStartup)

ping:
for {
err = db.Ping()
Expand All @@ -91,27 +136,29 @@ ping:
break ping
}

sqsClient, err = sqs.NewWithMaxRetries(100)
if err != nil {
log.Fatalf("error creating SQS client: %s", err)
}

s3c, err := s3.NewWithMaxRetries(3)
if err != nil {
log.Fatalf("error creating S3 client: %s", err)
}
s3Client = &s3c

log.Println("listening for messages")

var r sqs.Raw
var e event

ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
defer stop()

loop1:
for {
r, err = sqsClient.Receive(queueURL, 600)
health.Ok() // update soh
r, err = sqsClient.ReceiveWithContext(ctx, queueURL, 600)
if err != nil {
log.Printf("problem receiving message, backing off: %s", err)
time.Sleep(time.Second * 20)
switch {
case sqs.IsNoMessagesError(err):
continue
case sqs.Cancelled(err): //stoped
log.Println("##1 system stop... ")
break loop1
default:
slog.Warn("problem receiving message, backing off", "err", err)
time.Sleep(time.Second * 20)
}
continue
}

Expand All @@ -120,14 +167,28 @@ ping:
log.Printf("problem processing message, skipping deletion for redelivery: %s", err)
continue
}

err = sqsClient.Delete(queueURL, r.ReceiptHandle)
if err != nil {
log.Printf("problem deleting message, continuing: %s", err)
}
}
}

// check health by calling the http soh endpoint
// cmd: ./fdsn-holdings-consumer -check
func healthCheck() {
ctx, cancel := context.WithTimeout(context.Background(), healthCheckTimeout)
defer cancel()

msg, err := health.Check(ctx, healthCheckService+healthCheckPath, healthCheckTimeout)
if err != nil {
log.Printf("status: %v", err)
os.Exit(1)
}
log.Printf("status: %s", string(msg))
os.Exit(0)
}

// Process implements msg.Processor for event.
func (e *event) Process(msg []byte) error {
err := json.Unmarshal(msg, e)
Expand Down
91 changes: 78 additions & 13 deletions cmd/fdsn-quake-consumer/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,33 @@ package main

import (
"bytes"
"context"
"database/sql"
"encoding/json"
"errors"
"fmt"
"log"
"log/slog"
"os"
"os/signal"
"syscall"
"time"

"github.com/GeoNet/kit/aws/s3"
"github.com/GeoNet/kit/aws/sqs"
"github.com/GeoNet/kit/cfg"
"github.com/GeoNet/kit/health"
"github.com/GeoNet/kit/metrics"
)

const (
healthCheckAged = 5 * time.Minute //need to have a good heartbeat within this time
healthCheckStartup = 5 * time.Minute //ignore heartbeat messages for this time after starting
healthCheckTimeout = 30 * time.Second //health check timeout
healthCheckService = ":7777" //end point to listen to for SOH checks
healthCheckPath = "/soh"
)

var (
queueURL = os.Getenv("SQS_QUEUE_URL")
s3Client s3.S3
Expand All @@ -30,7 +43,37 @@ type notification struct {
s3.Event
}

// init and check aws clients
func initAwsClients() {
queueURL = os.Getenv("SQS_QUEUE_URL")
if queueURL == "" {
log.Fatal("SQS_QUEUE_URL is not set")
}

var err error
s3Client, err = s3.NewWithMaxRetries(100)
if err != nil {
log.Fatalf("creating S3 client: %s", err)
}

sqsClient, err = sqs.NewWithMaxRetries(100)
if err != nil {
log.Fatalf("creating SQS client: %s", err)
}
if err = sqsClient.CheckQueue(queueURL); err != nil {
log.Fatalf("error checking queueURL %s: %s", queueURL, err.Error())
}
}

func main() {
//check health
if health.RunningHealthCheck() {
healthCheck()
}

//run as normal service
initAwsClients()

p, err := cfg.PostgresEnv()
if err != nil {
log.Fatalf("error reading DB config from the environment vars: %s", err)
Expand All @@ -45,6 +88,9 @@ func main() {
db.SetMaxIdleConns(p.MaxIdle)
db.SetMaxOpenConns(p.MaxOpen)

// provide a soh heartbeat
health := health.New(healthCheckService, healthCheckAged, healthCheckStartup)

ping:
for {
err = db.Ping()
Expand All @@ -56,26 +102,30 @@ ping:
break ping
}

s3Client, err = s3.NewWithMaxRetries(100)
if err != nil {
log.Fatalf("creating S3 client: %s", err)
}

sqsClient, err = sqs.NewWithMaxRetries(100)
if err != nil {
log.Fatalf("creating SQS client: %s", err)
}

log.Println("listening for messages")

var r sqs.Raw
var n notification

ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
defer stop()

loop1:
for {
r, err = sqsClient.Receive(queueURL, 600)
health.Ok() // update soh

r, err = sqsClient.ReceiveWithContext(ctx, queueURL, 600)
if err != nil {
log.Printf("problem receiving message, backing off: %s", err)
time.Sleep(time.Second * 20)
switch {
case sqs.IsNoMessagesError(err):
continue
case sqs.Cancelled(err): //stoped
log.Println("##1 system stop... ")
break loop1
default:
slog.Warn("problem receiving message, backing off", "err", err)
time.Sleep(time.Second * 20)
}
continue
}

Expand All @@ -92,6 +142,21 @@ ping:
}
}

// check health by calling the http soh endpoint
// cmd: ./fdsn-quake-consumer -check
func healthCheck() {
ctx, cancel := context.WithTimeout(context.Background(), healthCheckTimeout)
defer cancel()

msg, err := health.Check(ctx, healthCheckService+healthCheckPath, healthCheckTimeout)
if err != nil {
log.Printf("status: %v", err)
os.Exit(1)
}
log.Printf("status: %s", string(msg))
os.Exit(0)
}

// Process implements msg.Processor for event.
func (n *notification) Process(msg []byte) error {
err := json.Unmarshal(msg, n)
Expand Down
4 changes: 4 additions & 0 deletions cmd/fdsn-ws/fdsn_dataselect.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@ func initDataselectTemplate() {
log.Fatalf("error creating S3 client: %s", err)
}
s3Client = &s3c

if err = s3Client.CheckBucket(S3_BUCKET); err != nil {
log.Fatalf("error checking S3_BUCKET %s: %s", S3_BUCKET, err.Error())
}
}

// fdsnDataMetricsV1Handler handles all datametrics queries.
Expand Down
29 changes: 28 additions & 1 deletion cmd/fdsn-ws/server.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
package main

import (
"context"
"database/sql"
"fmt"
"log"
"net/http"
"os"
Expand All @@ -10,10 +12,13 @@ import (
"time"

"github.com/GeoNet/kit/cfg"
"github.com/GeoNet/kit/health"
"github.com/gorilla/schema"
_ "github.com/lib/pq"
)

const servicePort = ":8080" //http service port

var (
db *sql.DB
decoder = newDecoder() // decoder for URL queries.
Expand All @@ -36,6 +41,12 @@ func newDecoder() *schema.Decoder {
}

func main() {
//check health if flagged in cmd
if health.RunningHealthCheck() {
healthCheck()
}

//run as normal service
var err error
if S3_BUCKET = os.Getenv("S3_BUCKET"); S3_BUCKET == "" {
log.Fatal("ERROR: S3_BUCKET environment variable is not set")
Expand Down Expand Up @@ -77,10 +88,26 @@ func main() {

log.Println("starting server")
server := &http.Server{
Addr: ":8080",
Addr: servicePort,
Handler: mux,
ReadTimeout: 1 * time.Minute,
WriteTimeout: 10 * time.Minute,
}
log.Fatal(server.ListenAndServe())
}

// check health by calling the http soh endpoint
// cmd: ./fdsn-ws -check
func healthCheck() {
timeout := 30 * time.Second
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()

msg, err := health.Check(ctx, fmt.Sprintf("%s/soh", servicePort), timeout)
if err != nil {
log.Printf("status: %v", err)
os.Exit(1)
}
log.Printf("status: %s", string(msg))
os.Exit(0)
}
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ module github.com/GeoNet/fdsn
go 1.21

require (
github.com/GeoNet/kit v0.0.0-20240512234353-4d4493144f60
github.com/GeoNet/kit v0.0.0-20241129025613-745247c4fb1c
github.com/gorilla/schema v1.4.1
github.com/joho/godotenv v1.5.1
github.com/lib/pq v1.10.3
Expand Down
Loading
Loading