Skip to content

Commit

Permalink
remove selfmon (#86)
Browse files Browse the repository at this point in the history
  • Loading branch information
DuodenumL authored Jan 5, 2022
1 parent c2f18f7 commit 1b75441
Show file tree
Hide file tree
Showing 21 changed files with 56 additions and 659 deletions.
1 change: 0 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ unit-test:
go test -race -count=1 -timeout 240s -cover ./logs/... \
./manager/node/... \
./manager/workload/... \
./selfmon/... \
./types/... \
./utils/...

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,4 +68,4 @@ Make sure you can clone code by ssh protocol because libgit2 ask for it. So you
<cli_execute_path> container deploy -pod <pod_name> --entry agent --network <network_name> --deploy-method fill --image <projecteru2/agent>|<your_own_image> --count 1 --file <agent_config_yaml>:/etc/eru/agent.yaml [--cpu 0.3 | --mem 1024000000] http://bit.ly/EruAgent
```

Now you will find agent was started in each node, and monitor containers status include itself.
Now you will find that the agent has been started on each node and monitors the status of all containers, including its own.
36 changes: 14 additions & 22 deletions agent.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package main

import (
"context"
"fmt"
"math/rand"
"os"
Expand All @@ -12,7 +13,6 @@ import (
"github.com/projecteru2/agent/api"
"github.com/projecteru2/agent/manager/node"
"github.com/projecteru2/agent/manager/workload"
"github.com/projecteru2/agent/selfmon"
"github.com/projecteru2/agent/types"
"github.com/projecteru2/agent/utils"
"github.com/projecteru2/agent/version"
Expand Down Expand Up @@ -56,17 +56,12 @@ func serve(c *cli.Context) error {
utils.WritePid(config.PidFile)
defer os.Remove(config.PidFile)

if c.Bool("selfmon") {
mon, err := selfmon.New(c.Context, config)
if err != nil {
return err
}
return mon.Run(c.Context)
}

ctx, cancel := signal.NotifyContext(c.Context, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
ctx, cancel := context.WithCancel(c.Context)
defer cancel()

signalChan := make(chan os.Signal, 1)
signal.Notify(signalChan, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT, syscall.SIGUSR1)

errChan := make(chan error, 2)
defer close(errChan)

Expand Down Expand Up @@ -103,10 +98,18 @@ func serve(c *cli.Context) error {
go func() {
select {
case <-ctx.Done():
log.Info("[agent] Agent caught system signal, exiting")
log.Info("[agent] Agent exiting")
case <-errChan:
log.Info("[agent] got err, exiting")
cancel()
case sig := <-signalChan:
log.Infof("[agent] Agent caught system signal %v", sig)
if sig != syscall.SIGUSR1 {
if err := nodeManager.Exit(); err != nil {
log.Errorf("[agent] node manager exits with err: %v", err)
}
}
cancel()
}
}()

Expand Down Expand Up @@ -234,17 +237,6 @@ func main() {
Usage: "change hostname",
EnvVars: []string{"ERU_HOSTNAME"},
},
&cli.BoolFlag{
Name: "selfmon",
Value: false,
Usage: "run this agent as a selfmon daemon",
},
&cli.StringFlag{
Name: "kv",
Value: "",
Usage: "kv type",
EnvVars: []string{"ERU_AGENT_KV"},
},
&cli.BoolFlag{
Name: "check-only-mine",
Value: false,
Expand Down
38 changes: 2 additions & 36 deletions agent.yaml.sample
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,6 @@ store: grpc
# This option is not required as the default value is "docker".
runtime: docker

# kv defines the type of kv store.
# This option is not required as the default value is "etcd".
kv: etcd

# core defines the address of eru-core component.
# This option is not required as the default value is "127.0.0.1:5001".
core:
Expand All @@ -24,10 +20,8 @@ core:

# heartbeat_interval defines the interval for eru-agent to
# report health status of the node to eru-core.
# This option is not required, and is only useful when enabling
# selfmon mode.
# If you don't want eru-agent to report this status, set it to 0.
# The default value of this option is 0.
# The default value of this option is 60.
heartbeat_interval: 120

# auth defines the authentication values for eru-core.
Expand Down Expand Up @@ -117,41 +111,13 @@ log:
#
# healthcheck.cache_ttl defines how long eru-agent will cache an unchanged status locally.
# This is only used when selfmon mode is switched on. The default value is 300 (in seconds).
#
# healthcheck.enable_selfmon defines whether selfmon is switched on.
# This should be true if there's at least one eru-agent is in mode selfmon.
# When this is true, healthcheck.enable_selfmon and heartbeat_interval is meaningless.
healthcheck:
interval: 120
timeout: 10
cache_ttl: 300
enable_selfmon: false

# global_connection_timeout defines the timeout for eru-agent other than healthcheck.
# E.g. the timeout for reporting action of eru-agent, or the timeout for eru-agent to
# connect to docker.
# The default value is "5s"; note the trailing "s".
global_connection_timeout: 15s

# ha_keepalive_interval defines the time interval for sending heartbeat
# when selfmon maintains its own active state.
# The default value is "16s", note that "s" in the end.
ha_keepalive_interval: 16s

# etcd defines the etcd configuration.
# This option is required and has no default value.
# If you don't plan to run this eru-agent in selfmon mode,
# you can give a mocked value e.g. 127.0.0.1:1111,
# this value won't be used to connect, it's only to pass
# the validation of this option (it's tricky).
# Will plan to improve this in next release.
#
# etcd.machines defines the addresses of etcd machines.
#
# etcd.prefix defines the prefix for eru-agents in selfmon mode.
# This prefix should be the same for all eru-agents in selfmon mode,
# and also distinguished for different ERU clusters.
etcd:
machines:
- 127.0.0.1:2379
prefix: /agent-selfmon
global_connection_timeout: 15s
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ require (
github.com/go-ole/go-ole v0.0.0-20180213002836-a1ec82a652eb // indirect
github.com/jinzhu/configor v1.2.1
github.com/patrickmn/go-cache v2.1.0+incompatible
github.com/pkg/errors v0.9.1
github.com/pkg/errors v0.9.1 // indirect
github.com/projecteru2/core v0.0.0-20211021040158-0be8dbadbc55
github.com/projecteru2/libyavirt v0.0.0-20211014062234-66e6f24ab6d1
github.com/prometheus/client_golang v1.11.0
Expand Down
14 changes: 8 additions & 6 deletions manager/node/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,22 +78,24 @@ func (m *Manager) Run(ctx context.Context) error {
log.Info("[NodeManager] start node status heartbeat")
go m.heartbeat(ctx)

// wait for signal
<-ctx.Done()
return m.exit()
log.Info("[NodeManager] exiting")
return nil
}

func (m *Manager) exit() error {
// Exit removes this node's status from the store, using a fresh context bounded by GlobalConnectionTimeout.
func (m *Manager) Exit() error {
log.Info("[NodeManager] exiting")
log.Infof("[NodeManager] mark node %s as down", m.config.HostName)
log.Infof("[NodeManager] remove node status of %s", m.config.HostName)

// ctx is now canceled. use a new context.
var err error
utils.WithTimeout(context.TODO(), m.config.GlobalConnectionTimeout, func(ctx context.Context) {
err = m.store.SetNode(ctx, m.config.HostName, false)
// remove node status
err = m.store.SetNodeStatus(ctx, -1)
})
if err != nil {
log.Errorf("[NodeManager] failed to mark the node %s as down, err: %s", m.config.HostName, err)
log.Errorf("[NodeManager] failed to remove node status of %v, err: %s", m.config.HostName, err)
return err
}
return nil
Expand Down
7 changes: 3 additions & 4 deletions manager/node/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,9 @@ func newMockNodeManager(t *testing.T) *Manager {
Stdout: true,
},
HealthCheck: types.HealthCheckConfig{
Interval: 10,
Timeout: 5,
CacheTTL: 300,
EnableSelfmon: true,
Interval: 10,
Timeout: 5,
CacheTTL: 300,
},
GlobalConnectionTimeout: 5 * time.Second,
}
Expand Down
7 changes: 3 additions & 4 deletions manager/workload/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,9 @@ func newMockWorkloadManager(t *testing.T) *Manager {
Stdout: true,
},
HealthCheck: types.HealthCheckConfig{
Interval: 10,
Timeout: 5,
CacheTTL: 300,
EnableSelfmon: true,
Interval: 10,
Timeout: 5,
CacheTTL: 300,
},
GlobalConnectionTimeout: 5 * time.Second,
}
Expand Down
2 changes: 1 addition & 1 deletion runtime/mocks/Runtime.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

104 changes: 0 additions & 104 deletions selfmon/node.go

This file was deleted.

Loading

0 comments on commit 1b75441

Please sign in to comment.