Skip to content

Commit

Permalink
Feature/barman metrics (#81)
Browse files Browse the repository at this point in the history
* Add space requirements to RECOVERY.md

* Ensure correct permission for ssh keys even if daemon is not enabled

* Barman doesn't require running PG to start

* Add .swp to gitignore

* Add metrics for Barman

* Add prefix to metrics. Updated README.md
  • Loading branch information
S3RK authored and paunin committed Oct 19, 2017
1 parent 6313749 commit 47a2a71
Show file tree
Hide file tree
Showing 13 changed files with 493 additions and 21 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
.idea
*.swp
9 changes: 4 additions & 5 deletions Barman-2.3.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
FROM debian:jessie
FROM golang:1.8-jessie

ARG BARMAN_VERSION=2.3-2.pgdg80+1
ARG DOCKERIZE_VERSION=v0.2.0
# grab gosu for easy step-down from root
ARG GOSU_VERSION=1.7
RUN set -x \
Expand All @@ -20,9 +19,6 @@ RUN wget -q https://www.postgresql.org/media/keys/ACCC4CF8.asc -O - | apt-key a
apt-get update && \
apt-get install -y libffi-dev libssl-dev barman=$BARMAN_VERSION openssh-server

RUN wget https://github.com/jwilder/dockerize/releases/download/$DOCKERIZE_VERSION/dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz && \
tar -C /usr/local/bin -xzvf dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz

RUN apt-get -y install cron
ADD barman/crontab /etc/cron.d/barman
RUN rm -f /etc/cron.daily/*
Expand Down Expand Up @@ -58,6 +54,9 @@ COPY ./barman/configs/upstream.conf $UPSTREAM_CONFIG_FILE
COPY ./barman/bin /usr/local/bin/barman_docker
RUN chmod +x /usr/local/bin/barman_docker/* && ls /usr/local/bin/barman_docker

COPY ./barman/metrics /go
RUN cd /go && go build /go/main.go

VOLUME $BACKUP_DIR

CMD /usr/local/bin/barman_docker/entrypoint.sh
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,7 @@ Whole backup procedure is performed remotely, but for recovery SSH access is req

*For Disaster Recovery process see [RECOVERY.md](./RECOVERY.md)*

Barman exposes several metrics on `:8080/metrics`; for more information see [Barman docs](./barman/README.md).

## Health-checks

Expand Down
14 changes: 11 additions & 3 deletions RECOVERY.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
## Free space requirements:
1. For base backup itself, you can check size with `barman show-backup`

2. Space for WALs
* Take into account that barman stores WALs in compressed form. You can calculate the required space by checking `barman show-backup` once again; there you will find disk usage and compression rate.
* During recovery barman sets `restore_command = cp...`, which means that you need at least double the WAL size in free space.
Upon recovery completion the duplicates will be removed.

## Recovery steps:

1. Stop pg cluster, but retain containers. **IMPORTANT start with slaves**
Expand All @@ -8,15 +16,15 @@
```
1. Check sshd is running on master, if not start it:
```
/home/postgres/.ssh/entrypoint.sh
SSH_ENABLE=1 /home/postgres/.ssh/entrypoint.sh
```
1. Connect to barman container and select appropriate base backup
```
barman list-backup all
```
1. Recover to required point, for example:
```
barman recover pg_cluster 20170815T041301 /var/lib/postgresql/data/ --target-time "2017-08-15 04:11:26.5" --remote-ssh-command="ssh pgmaster"
barman recover pg_cluster 20170815T041301 /var/lib/postgresql/data/ --target-time "2017-08-15 04:11:26.5" --remote-ssh-command="ssh pgmaster" -j$(nproc)
```
1. Connect to master node, start recovery and check DB consistency, then finish recovery
```
Expand All @@ -28,7 +36,7 @@
```
rm /var/run/recovery.lock
```
1. Wait until master will fully functional and restart each standby node one by one
1. Wait until master becomes fully functional and restart every standby node
```
rm /var/run/recovery.lock
```
Expand Down
16 changes: 16 additions & 0 deletions barman/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
## Metrics

Metrics are exposed in Prometheus format on `:8080/metrics` via a small Go daemon which collects data from `fs`, `barman diagnose` and `barman check` commands.

Type of all metrics is GAUGE.
Available metrics are:
* `barman_check_is_ok` 0 or 1. 1 means ok and is returned when the exit code of `barman check` is 0
* `barman_backups_amount`
* `barman_last_backup_start_time_seconds`
* `barman_last_backup_end_time_seconds`
* `barman_last_backup_size_bytes`
* `barman_last_backup_duration_total_seconds`
* `barman_last_backup_duration_copy_seconds`
* `barman_oldest_backup_end_time_seconds`
* `barman_disk_free_bytes`
* `barman_disk_used_bytes`
14 changes: 4 additions & 10 deletions barman/bin/entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,6 @@ echo ">>> Checking all configurations"
[[ "$POSTGRES_PASSWORD" != "" ]] || ( echo 'Variable POSTGRES_PASSWORD is not set!' ;exit 3 )
[[ "$POSTGRES_DB" != "" ]] || ( echo 'Variable POSTGRES_DB is not set!' ;exit 4 )

echo ">>> Waiting for upstream DB"
dockerize -wait tcp://$REPLICATION_HOST:$REPLICATION_PORT -timeout "$WAIT_UPSTREAM_TIMEOUT"s

echo ">>> Configuring barman for streaming replication"
echo "
[$UPSTREAM_NAME]
Expand All @@ -27,13 +24,6 @@ backup_directory = $BACKUP_DIR
retention_policy = RECOVERY WINDOW OF $BACKUP_RETENTION_DAYS DAYS
" >> $UPSTREAM_CONFIG_FILE

SLOTS_COUNT=`barman show-server $UPSTREAM_NAME | grep "replication_slot: Record(slot_name='$REPLICATION_SLOT_NAME'" | wc -l`
if [ "$SLOTS_COUNT" -gt "0" ]; then
echo ">>>>>> Looks like replication slot already exists"
else
barman receive-wal --create-slot $UPSTREAM_NAME
fi

echo '>>> STARTING SSH (if required)...'
source /home/postgres/.ssh/entrypoint.sh

Expand All @@ -42,6 +32,10 @@ echo ">>>>>> Backup schedule is $BACKUP_SCHEDULE"
echo "$BACKUP_SCHEDULE root barman backup all > /proc/1/fd/1 2> /proc/1/fd/2" >> /etc/cron.d/barman
chmod 0644 /etc/cron.d/barman

echo '>>> STARTING METRICS SERVER'
/go/main &

echo '>>> STARTING CRON'
env >> /etc/environment
cron -f

12 changes: 12 additions & 0 deletions barman/bin/wal-receiver.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/usr/bin/env bash
# Makes sure the replication slot used by barman exists on the upstream server
# and then runs barman's periodic maintenance (`barman cron`).
#
# Reads from the environment:
#   UPSTREAM_NAME          - barman server name to inspect / create the slot for
#   REPLICATION_SLOT_NAME  - name of the replication slot to look for
set -e

# Count the lines of `barman show-server` that mention the slot. The trailing
# `wc -l` makes the pipeline exit with status 0 even when grep matches nothing,
# so `set -e` does not abort the script in the "slot is missing" case.
SLOTS_COUNT=$(barman show-server "$UPSTREAM_NAME" | grep "replication_slot: Record(slot_name='$REPLICATION_SLOT_NAME'" | wc -l)
if [ "$SLOTS_COUNT" -gt "0" ]; then
    echo "Looks like replication slot already exists"
else
    echo "Creating replication slot: $REPLICATION_SLOT_NAME"
    barman receive-wal --create-slot "$UPSTREAM_NAME"
fi

barman cron
2 changes: 1 addition & 1 deletion barman/crontab
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
* * * * * root barman cron > /proc/1/fd/1 2> /proc/1/fd/2
*/5 * * * * root cd /home/postgres && /usr/local/bin/barman_docker/wal-receiver.sh > /proc/1/fd/1 2> /proc/1/fd/2

Binary file added barman/metrics/main
Binary file not shown.
131 changes: 131 additions & 0 deletions barman/metrics/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
package main

import (
"fmt"
"net/http"
"os/exec"
"encoding/json"
// "github.com/davecgh/go-spew/spew"
"syscall"
"time"
"strings"
"sort"
)

// main registers the metrics endpoint and serves it on :8080.
//
// The startup message is printed before the listener is started because
// http.ListenAndServe blocks for the whole lifetime of the server and only
// returns once it has failed. A listener error (for example, the port is
// already in use) is reported with a panic instead of being silently dropped.
func main() {
	http.HandleFunc("/metrics", handler)

	fmt.Println("Server is listening on :8080. Metrics URL — /metrics")
	if err := http.ListenAndServe(":8080", nil); err != nil {
		panic(err)
	}
}

// handler writes every collected metric to the response, one
// "<name> <value>" pair per line.
//
// Metrics are emitted sorted by name: Go randomises map iteration order, so
// ranging over the map directly would produce a different line order on each
// request.
func handler(w http.ResponseWriter, r *http.Request) {
	metrics := collectMetrics()

	names := make([]string, 0, len(metrics))
	for name := range metrics {
		names = append(names, name)
	}
	sort.Strings(names)

	for _, name := range names {
		fmt.Fprintf(w, "%s %d\n", name, metrics[name])
	}
}

// collectMetrics builds the full metric set served by the handler.
//
// It combines three sources: the parsed output of `barman diagnose` (backup
// count and details of the latest/oldest backup), the result of
// `barman check`, and filesystem statistics for /var/backups.
//
// Backup-specific metrics are only present when at least one backup exists.
func collectMetrics() map[string]int64 {
	diag := barmanDiagnose()
	backups := diag.Servers.Pg_cluster.Backups

	result := map[string]int64{
		"barman_check_is_ok":    int64(barmanCheck()),
		"barman_backups_amount": int64(len(backups)),
	}

	if count := len(backups); count > 0 {
		// The map keys are sorted lexicographically; the first key is taken
		// as the oldest backup and the last one as the latest.
		ids := make([]string, 0, count)
		for id := range backups {
			ids = append(ids, id)
		}
		sort.Strings(ids)

		oldest, latest := backups[ids[0]], backups[ids[count-1]]
		result["barman_last_backup_start_time_seconds"] = latest.Begin_time.Unix()
		result["barman_last_backup_end_time_seconds"] = latest.End_time.Unix()
		result["barman_last_backup_size_bytes"] = latest.Size
		result["barman_last_backup_duration_total_seconds"] = int64(latest.Copy_stats["total_time"])
		result["barman_last_backup_duration_copy_seconds"] = int64(latest.Copy_stats["copy_time"])
		result["barman_oldest_backup_end_time_seconds"] = oldest.End_time.Unix()
	}

	// The Statfs return value is not checked: if the call fails the
	// zero-initialised struct is used and both disk gauges report 0.
	var fsStat syscall.Statfs_t
	syscall.Statfs("/var/backups", &fsStat)
	blockSize := uint64(fsStat.Bsize)
	result["barman_disk_free_bytes"] = int64(fsStat.Bavail * blockSize)
	result["barman_disk_used_bytes"] = int64((fsStat.Blocks - fsStat.Bfree) * blockSize)

	return result
}

// execCommand builds the *exec.Cmd for every external command this program
// runs. It is a package-level variable so that it can be replaced with a fake.
var execCommand = exec.Command

// barmanCheck runs `barman check` and returns 1 when the command completes
// without error, 0 otherwise (including when it cannot be started).
func barmanCheck() int {
	if err := execCommand("barman", "check").Run(); err != nil {
		return 0
	}
	return 1
}

// BarmanDiagnose is the subset of the `barman diagnose` JSON document that the
// metrics collector reads. It is filled by json.Unmarshal in barmanDiagnose.
type BarmanDiagnose struct {
// Servers holds the per-server sections of the diagnose output.
Servers struct {
// Pg_cluster is the only server section that is decoded; the server name is
// fixed by this field name.
// NOTE(review): presumably this has to match the configured barman server
// name (UPSTREAM_NAME) - confirm against the deployment.
Pg_cluster struct {
// Backups maps a map key (used by collectMetrics as a sortable backup
// identifier) to that backup's details.
Backups map[string]BarmanBackup
}
}
}

// BarmanBackup describes a single backup entry decoded from the
// `barman diagnose` output.
type BarmanBackup struct {
// Backup_id is the backup identifier as reported in the entry itself.
Backup_id string
// Begin_time and End_time are decoded through CustomTime's JSON handling.
Begin_time CustomTime
End_time CustomTime
// Copy_stats holds numeric copy statistics; collectMetrics reads the
// "total_time" and "copy_time" keys and exports them as seconds.
Copy_stats map[string]float64
// Size is exported by collectMetrics as barman_last_backup_size_bytes.
Size int64
}

// barmanDiagnose runs `barman diagnose` and decodes its JSON output.
//
// It panics when the command fails or when its output cannot be decoded.
// The command-failure panic message contains the underlying error, whatever
// the command wrote to stderr (available when it exited with a non-zero
// status) and its stdout, so the cause is visible; stdout alone is typically
// empty when the command fails.
func barmanDiagnose() BarmanDiagnose {
	output, err := execCommand("barman", "diagnose").Output()
	if err != nil {
		var stderr []byte
		// Output() captures stderr into ExitError when the process ran but
		// exited unsuccessfully.
		if exitErr, ok := err.(*exec.ExitError); ok {
			stderr = exitErr.Stderr
		}
		panic(fmt.Sprintf("barman diagnose: %v: %s%s", err, stderr, output))
	}

	var diag BarmanDiagnose
	if jsonErr := json.Unmarshal(output, &diag); jsonErr != nil {
		panic(jsonErr)
	}

	return diag
}

// CustomTime wraps time.Time so that timestamps written in the ctLayout
// format can be decoded from and encoded to JSON. JSON null maps to the zero
// time.
type CustomTime struct {
	time.Time
}

// ctLayout is the reference layout of the timestamps handled by CustomTime,
// e.g. "Fri Aug 25 10:29:05 2017".
const ctLayout = "Mon Jan 2 15:04:05 2006"

// UnmarshalJSON decodes a JSON value into ct. Surrounding double quotes are
// stripped; the value null resets ct to the zero time, anything else must
// parse with ctLayout. On a parse error ct is set to the zero time and the
// error is returned.
func (ct *CustomTime) UnmarshalJSON(b []byte) error {
	s := strings.Trim(string(b), "\"")
	if s == "null" {
		ct.Time = time.Time{}
		return nil
	}

	var err error
	ct.Time, err = time.Parse(ctLayout, s)
	return err
}

// MarshalJSON encodes ct as a quoted ctLayout timestamp, or as null when ct
// holds the zero time.
//
// The receiver is a value so that both CustomTime and *CustomTime satisfy
// json.Marshaler; with a pointer receiver, encoding/json would not call this
// method for non-addressable CustomTime values.
func (ct CustomTime) MarshalJSON() ([]byte, error) {
	if ct.Time.IsZero() {
		return []byte("null"), nil
	}
	return []byte(fmt.Sprintf("\"%s\"", ct.Time.Format(ctLayout))), nil
}

// nilTime is the UnixNano value of the zero time. It is kept only for
// backward compatibility: that value lies outside the range for which
// UnixNano is defined, so the methods of CustomTime use IsZero instead.
var nilTime = (time.Time{}).UnixNano()

// IsSet reports whether ct holds a value other than the zero time.
func (ct *CustomTime) IsSet() bool {
	return !ct.Time.IsZero()
}

114 changes: 114 additions & 0 deletions barman/metrics/main_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
package main

import "testing"
import "fmt"
import "os"
import "github.com/stretchr/testify/assert"
import "github.com/stretchr/testify/suite"
import "net/http"
import "net/http/httptest"
import "os/exec"
import "io/ioutil"
import "strconv"

// fakeExitCode is the exit status the fake `barman check` helper process is
// asked to return; fakeExecCommand passes it on via GO_FAKE_EXIT_CODE and
// SetupTest resets it to 0 before every test.
var fakeExitCode = 0

// MainTestSuite groups the handler tests and the per-test HTTP fixtures that
// SetupTest recreates before each test.
type MainTestSuite struct {
suite.Suite
// testHandler is the metrics handler under test.
testHandler http.HandlerFunc
// rr records the handler's response.
rr *httptest.ResponseRecorder
// req is the GET request sent to the handler.
req *http.Request
}

// TestAll runs MainTestSuite with external commands redirected to the fake
// helper process.
//
// The real execCommand is restored afterwards so that the override does not
// leak into any other test of the package.
func TestAll(t *testing.T) {
	realExecCommand := execCommand
	execCommand = fakeExecCommand
	defer func() { execCommand = realExecCommand }()

	suite.Run(t, new(MainTestSuite))
}

// SetupTest resets the fake exit code to success and builds a fresh request,
// response recorder and handler for the next test. It panics if the request
// cannot be constructed.
func (suite *MainTestSuite) SetupTest() {
	fakeExitCode = 0

	req, err := http.NewRequest("GET", "http://localhost:8080/metrics", nil)
	suite.req = req
	if err != nil {
		panic(err.Error())
	}

	suite.testHandler = http.HandlerFunc(handler)
	suite.rr = httptest.NewRecorder()
}

// TestBarmanCheckFailed makes the fake `barman check` exit with status 1 and
// verifies that the handler still answers 200 and reports
// barman_check_is_ok as 0.
func (suite *MainTestSuite) TestBarmanCheckFailed() {
	fakeExitCode = 1
	suite.testHandler.ServeHTTP(suite.rr, suite.req)

	if code := suite.rr.Code; code != http.StatusOK {
		suite.T().Errorf("handler returned wrong status code: got %v want %v",
			code, http.StatusOK)
	}

	body := suite.rr.Body.String()
	assert.Contains(suite.T(), body, "barman_check_is_ok 0")
}

// TestHandler exercises the handler with a succeeding `barman check` and the
// canned `barman diagnose` output, then verifies the status code and that
// every expected metric line (or, for the disk gauges, metric name) appears
// in the response body.
func (suite *MainTestSuite) TestHandler() {
	suite.testHandler.ServeHTTP(suite.rr, suite.req)

	if code := suite.rr.Code; code != http.StatusOK {
		suite.T().Errorf("handler returned wrong status code: got %v want %v",
			code, http.StatusOK)
	}

	body := suite.rr.Body.String()
	expected := []string{
		"barman_check_is_ok 1",
		"barman_backups_amount 2",
		"barman_last_backup_start_time_seconds 1503656945",
		"barman_last_backup_end_time_seconds 1503656955",
		"barman_last_backup_size_bytes 36304273",
		"barman_last_backup_duration_copy_seconds 5",
		"barman_last_backup_duration_total_seconds 10",
		"barman_oldest_backup_end_time_seconds 1503570545",
		// Disk values depend on the machine, so only the names are checked.
		"barman_disk_free_bytes ",
		"barman_disk_used_bytes ",
	}
	for _, fragment := range expected {
		assert.Contains(suite.T(), body, fragment)
	}
}

// fakeExecCommand is the test replacement for execCommand. Instead of the
// requested program it re-runs the current test binary, restricted to
// TestHelperProcess, and passes the requested command line after "--".
//
// The child's environment is replaced entirely: it only contains the marker
// that activates TestHelperProcess and the exit code to simulate.
func fakeExecCommand(command string, args ...string) *exec.Cmd {
	helperArgs := make([]string, 0, len(args)+3)
	helperArgs = append(helperArgs, "-test.run=TestHelperProcess", "--", command)
	helperArgs = append(helperArgs, args...)

	cmd := exec.Command(os.Args[0], helperArgs...)
	cmd.Env = []string{
		"GO_WANT_HELPER_PROCESS=1",
		"GO_FAKE_EXIT_CODE=" + strconv.Itoa(fakeExitCode),
	}
	return cmd
}

// TestHelperProcess is not a real test. When GO_WANT_HELPER_PROCESS=1 it acts
// as the fake external program started by fakeExecCommand; otherwise it
// returns immediately.
//
// The simulated command line follows the "--" argument, i.e. os.Args[3] is
// the command name and os.Args[4:] are its arguments:
//   - "barman diagnose" copies the contents of test.json to stdout verbatim;
//   - "barman check" exits with the status given in GO_FAKE_EXIT_CODE;
//   - any other barman sub-command (or none) exits with status 1;
//   - any other command (or none) exits with status 2.
func TestHelperProcess(t *testing.T) {
	if os.Getenv("GO_WANT_HELPER_PROCESS") != "1" {
		return
	}

	// Guard the indexing below: a missing command or sub-command is reported
	// as "unknown" instead of crashing with an index-out-of-range panic.
	var command, subcommand string
	if len(os.Args) > 3 {
		command = os.Args[3]
	}
	if len(os.Args) > 4 {
		subcommand = os.Args[4]
	}

	switch command {
	case "barman":
		switch subcommand {
		case "diagnose":
			testfile, err := ioutil.ReadFile("test.json")
			if err != nil {
				panic(err.Error())
			}
			// Write the bytes as-is; passing them to Fprintf as a format
			// string would mangle any '%' contained in the file.
			if _, err := os.Stdout.Write(testfile); err != nil {
				panic(err.Error())
			}
		case "check":
			code, err := strconv.Atoi(os.Getenv("GO_FAKE_EXIT_CODE"))
			if err != nil {
				panic(err.Error())
			}
			os.Exit(code)
		default:
			fmt.Fprintf(os.Stderr, "Unknown barman command call")
			os.Exit(1)
		}
	default:
		fmt.Fprintf(os.Stderr, "Unknown command call")
		os.Exit(2)
	}

	os.Exit(0)
}
Loading

0 comments on commit 47a2a71

Please sign in to comment.