diff --git a/ais/test/ec_test.go b/ais/test/ec_test.go index 962b5578a8..bd3377be4c 100644 --- a/ais/test/ec_test.go +++ b/ais/test/ec_test.go @@ -2915,8 +2915,11 @@ func TestECBckEncodeRecover(t *testing.T) { } touched = damaged } - xid, err := api.ECEncodeBucket(tools.BaseAPIParams(proxyURL), bck, test.data, test.parity, true) + + bp := tools.BaseAPIParams(tools.RandomProxyURL()) + xid, err := api.ECEncodeBucket(bp, bck, test.data, test.parity, true /*recover missing ...*/) tassert.CheckFatal(t, err) + // First, wait for EC-encode xaction to complete reqArgs := xact.ArgsMsg{ID: xid, Kind: apc.ActECEncode, Bck: bck} api.WaitForXactionIdle(tools.BaseAPIParams(proxyURL), &reqArgs) diff --git a/ais/tgtec.go b/ais/tgtec.go index 20b884bc53..c4f69f1845 100644 --- a/ais/tgtec.go +++ b/ais/tgtec.go @@ -86,33 +86,30 @@ func (t *target) httpecpost(w http.ResponseWriter, r *http.Request) { switch action { case apc.ActEcRecover: query := r.URL.Query() - if isRecover := cos.IsParseBool(query.Get(apc.QparamECRecover)); !isRecover { - nlog.Errorf("EC: invalid request to recover an object. '%s' is not set", apc.QparamECRecover) + objName := query.Get(apc.QparamECObject) + if objName == "" { + err := fmt.Errorf("%s: invalid request to recover an object: name's empty", t) + t.writeErr(w, r, err) return } - objPath := query.Get(apc.QparamECObject) - if objPath == "" { - nlog.Errorf("EC: invalid request to recover an object. Object name(%s) is undefined", apc.QparamECRecover) - return - } - objName := objPath lom := core.AllocLOM(objName) - mbck, err := newBckFromQuname(r.URL.Query(), true) + + mbck, err := newBckFromQuname(query, true) if err != nil { - nlog.Errorln("Failed to recover object:", err) core.FreeLOM(lom) + err = fmt.Errorf("%s: %v", t, err) // FATAL/unlikely + t.writeErr(w, r, err) return } - bck := mbck.Clone() - if err := lom.InitBck(&bck); err != nil { - nlog.Errorln("LOM init failed:", err) + + bck := mbck.Bucket() + if err := lom.InitBck(bck); err != nil { core.FreeLOM(lom) + err = fmt.Errorf("%s: %v", t, err) + t.writeErr(w, r, err) return } - if err := ec.ECM.TryRecoverObj(lom); err != nil { - nlog.Errorln("Failed to recover object:", err) - core.FreeLOM(lom) - } + ec.ECM.TryRecoverObj(lom) // free LOM inside return case apc.ActEcOpen: hk.UnregIf(hkname, closeEc) // just in case, a no-op most of the time diff --git a/ais/tgtimpl.go b/ais/tgtimpl.go index 5cf256a37a..befaec8ff9 100644 --- a/ais/tgtimpl.go +++ b/ais/tgtimpl.go @@ -374,7 +374,6 @@ func (t *target) _promRemote(params *core.PromoteParams, lom *core.LOM, tsi *met func (t *target) ECRestoreReq(ct *core.CT, tsi *meta.Snode) error { q := ct.Bck().NewQuery() ct.Bck().AddUnameToQuery(q, apc.QparamBckTo) - q.Set(apc.QparamECRecover, "true") q.Set(apc.QparamECObject, ct.ObjectName()) cargs := allocCargs() { diff --git a/ais/tgttxn.go b/ais/tgttxn.go index bd795fa80b..1acbea7fcf 100644 --- a/ais/tgttxn.go +++ b/ais/tgttxn.go @@ -360,7 +360,9 @@ func (t *target) setBprops(c *txnSrv) (string, error) { if _, reec := _reEC(bprops, nprops, c.bck, nil /*smap*/); reec { flt := xreg.Flt{Kind: apc.ActECEncode, Bck: c.bck} xreg.DoAbort(flt, errors.New("re-ec")) - rns := xreg.RenewECEncode(c.bck, c.uuid, apc.ActCommit, false) + + // checkAndRecover always false (compare w/ ecEncode below) + rns := xreg.RenewECEncode(c.bck, c.uuid, apc.ActCommit, false /*check & recover missing/corrupted*/) if rns.Err != nil { return "", rns.Err } @@ -758,7 +760,8 @@ func (t *target) ecEncode(c *txnSrv) (string, error) { if err = t.transactions.wait(txn, c.timeout.netw, c.timeout.host); err != nil { return "", cmn.NewErrFailedTo(t, "commit", txn, err) } - rns := xreg.RenewECEncode(c.bck, c.uuid, apc.ActCommit, c.msg.Name == apc.ActEcRecover) + checkAndRecover := c.msg.Name == apc.ActEcRecover + rns := xreg.RenewECEncode(c.bck, c.uuid, apc.ActCommit, checkAndRecover /*missing/corrupted slices, etc.*/) if rns.Err != nil { nlog.Errorf("%s: %s %v", t, txn, rns.Err) return "", rns.Err diff --git a/api/apc/actmsg.go b/api/apc/actmsg.go index 2904a1328c..979b0d2e1e 100644 --- a/api/apc/actmsg.go +++ b/api/apc/actmsg.go @@ -146,7 +146,7 @@ const ( const ( ActEcOpen = "open-ec-streams" ActEcClose = "close-ec-streams" - ActEcRecover = "obj-recover" + ActEcRecover = "recover" // check and recover missing or corrupted EC metadata and/or slices, if any ) // ActMsg is a JSON-formatted control structures used in a majority of API calls diff --git a/api/apc/query.go b/api/apc/query.go index 972c8dceec..92c80856d6 100644 --- a/api/apc/query.go +++ b/api/apc/query.go @@ -137,8 +137,7 @@ const ( QparamMpathLabel = "mountpath_label" // Request to restore an object - QparamECRecover = "recover" - QparamECObject = "object" + QparamECObject = "object" ) // QparamFltPresence enum. diff --git a/api/bucket.go b/api/bucket.go index 2e9cca6405..3a02059358 100644 --- a/api/bucket.go +++ b/api/bucket.go @@ -267,7 +267,7 @@ func MakeNCopies(bp BaseParams, bck cmn.Bck, copies int) (xid string, err error) // Erasure-code entire `bck` bucket at a given `data`:`parity` redundancy. // The operation requires at least (`data + `parity` + 1) storage targets in the cluster. // Returns xaction ID if successful, an error otherwise. -func ECEncodeBucket(bp BaseParams, bck cmn.Bck, data, parity int, doRecover ...bool) (xid string, err error) { +func ECEncodeBucket(bp BaseParams, bck cmn.Bck, data, parity int, checkAndRecover bool) (xid string, err error) { bp.Method = http.MethodPost // Without `string` conversion it makes base64 from []byte in `Body`. ecConf := string(cos.MustMarshal(&cmn.ECConfToSet{ @@ -280,7 +280,7 @@ func ECEncodeBucket(bp BaseParams, bck cmn.Bck, data, parity int, doRecover ...b reqParams.BaseParams = bp reqParams.Path = apc.URLPathBuckets.Join(bck.Name) msg := apc.ActMsg{Action: apc.ActECEncode, Value: ecConf} - if len(doRecover) > 0 && doRecover[0] { + if checkAndRecover { msg.Name = apc.ActEcRecover } reqParams.Body = cos.MustMarshal(msg) diff --git a/cmd/cli/cli/bencodeway_hdlr.go b/cmd/cli/cli/bencodeway_hdlr.go index ae1892cea9..eaefd61920 100644 --- a/cmd/cli/cli/bencodeway_hdlr.go +++ b/cmd/cli/cli/bencodeway_hdlr.go @@ -32,6 +32,7 @@ var ( dataSlicesFlag, paritySlicesFlag, nonverboseFlag, + checkAndRecoverFlag, }, } @@ -147,7 +148,7 @@ func ecEncodeHandler(c *cli.Context) error { } func ecEncode(c *cli.Context, bck cmn.Bck, bprops *cmn.Bprops, data, parity int, warned bool) error { - xid, err := api.ECEncodeBucket(apiBP, bck, data, parity) + xid, err := api.ECEncodeBucket(apiBP, bck, data, parity, flagIsSet(c, checkAndRecoverFlag)) if err != nil { return err } diff --git a/cmd/cli/cli/const.go b/cmd/cli/cli/const.go index 32cac91d87..dc3dab2a15 100644 --- a/cmd/cli/cli/const.go +++ b/cmd/cli/cli/const.go @@ -595,6 +595,11 @@ var ( dataSlicesFlag = cli.IntFlag{Name: "data-slices,data,d", Value: 2, Usage: "number of data slices", Required: true} paritySlicesFlag = cli.IntFlag{Name: "parity-slices,parity,p", Value: 2, Usage: "number of parity slices", Required: true} + checkAndRecoverFlag = cli.BoolFlag{ + Name: "recover", + Usage: "check and recover missing or corrupted EC metadata and/or slices, if any", + } + compactPropFlag = cli.BoolFlag{Name: "compact,c", Usage: "display properties grouped in human-readable mode"} nameOnlyFlag = cli.BoolFlag{ diff --git a/cmd/cli/go.mod b/cmd/cli/go.mod index 048af24eb1..705965c735 100644 --- a/cmd/cli/go.mod +++ b/cmd/cli/go.mod @@ -3,7 +3,7 @@ module github.com/NVIDIA/aistore/cmd/cli go 1.23.2 require ( - github.com/NVIDIA/aistore v1.3.26-0.20241018191536-5a7b43c14253 + github.com/NVIDIA/aistore v1.3.26-0.20241021212206-e65b4c2efcf3 github.com/fatih/color v1.17.0 github.com/json-iterator/go v1.1.12 github.com/onsi/ginkgo/v2 v2.20.2 diff --git a/cmd/cli/go.sum b/cmd/cli/go.sum index 8124478c52..9cc2df043d 100644 --- a/cmd/cli/go.sum +++ b/cmd/cli/go.sum @@ -1,7 +1,7 @@ code.cloudfoundry.org/bytefmt v0.0.0-20190710193110-1eb035ffe2b6/go.mod h1:wN/zk7mhREp/oviagqUXY3EwuHhWyOvAdsn5Y4CzOrc= github.com/BurntSushi/toml v1.4.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho= -github.com/NVIDIA/aistore v1.3.26-0.20241018191536-5a7b43c14253 h1:4tyDXe2OssOS6Gi9dM6dX3n7Y6jHU2YQ+bguw/C05xM= -github.com/NVIDIA/aistore v1.3.26-0.20241018191536-5a7b43c14253/go.mod h1:Q6J3YIeiL4A6oWga3qCJ8+XI1CUvdde7Gua/HfueGlQ= +github.com/NVIDIA/aistore v1.3.26-0.20241021212206-e65b4c2efcf3 h1:apzFZIz80x8HYgFG1PFnNZ1JUVAfiaHhnyhSJjiqriw= +github.com/NVIDIA/aistore v1.3.26-0.20241021212206-e65b4c2efcf3/go.mod h1:Q6J3YIeiL4A6oWga3qCJ8+XI1CUvdde7Gua/HfueGlQ= github.com/OneOfOne/xxhash v1.2.8 h1:31czK/TI9sNkxIKfaUfGlU47BAxQ0ztGgd9vPyqimf8= github.com/OneOfOne/xxhash v1.2.8/go.mod h1:eZbhyaAYD41SGSSsnmcpxVoRiQ/MPUTjUdIIOT9Um7Q= github.com/VividCortex/ewma v1.1.1/go.mod h1:2Tkkvm3sRDVXaiyucHiACn4cqf7DpdyLvmxzcbUokwA= diff --git a/ec/bencodex.go b/ec/bencodex.go index 1779446eff..e597de8590 100644 --- a/ec/bencodex.go +++ b/ec/bencodex.go @@ -28,10 +28,10 @@ type ( } XactBckEncode struct { xact.Base - bck *meta.Bck - wg *sync.WaitGroup // to wait for EC finishes all objects - smap *meta.Smap - doRecover bool + bck *meta.Bck + wg *sync.WaitGroup // to wait for EC finishes all objects + smap *meta.Smap + checkAndRecover bool } ) @@ -75,8 +75,12 @@ func (p *encFactory) WhenPrevIsRunning(prevEntry xreg.Renewable) (wpr xreg.WPR, // XactBckEncode // /////////////////// -func newXactBckEncode(bck *meta.Bck, uuid string, doRecover bool) (r *XactBckEncode) { - r = &XactBckEncode{bck: bck, wg: &sync.WaitGroup{}, smap: core.T.Sowner().Get(), doRecover: doRecover} +func newXactBckEncode(bck *meta.Bck, uuid string, checkAndRecover bool) (r *XactBckEncode) { + r = &XactBckEncode{ + bck: bck, + wg: &sync.WaitGroup{}, + smap: core.T.Sowner().Get(), + checkAndRecover: checkAndRecover} r.InitBase(uuid, apc.ActECEncode, bck) return } @@ -98,15 +102,16 @@ func (r *XactBckEncode) Run(wg *sync.WaitGroup) { ECM.incActive(r) ctList := []string{fs.ObjectType} - if r.doRecover { - ctList = []string{fs.ObjectType, fs.ECMetaType, fs.ECSliceType} - } opts := &mpather.JgroupOpts{ CTs: ctList, VisitObj: r.bckEncode, - VisitCT: r.bckEncodeMD, DoLoad: mpather.LoadUnsafe, } + if r.checkAndRecover { + opts.CTs = []string{fs.ObjectType, fs.ECMetaType, fs.ECSliceType} + opts.VisitCT = r.bckEncodeMD + } + opts.Bck.Copy(r.bck.Bucket()) jg := mpather.NewJoggerGroup(opts, cmn.GCO.Get(), nil) jg.Run() diff --git a/ec/manager.go b/ec/manager.go index 02694dc369..6134096b5f 100644 --- a/ec/manager.go +++ b/ec/manager.go @@ -349,14 +349,12 @@ func (mgr *Manager) BMDChanged() error { return nil } -func (mgr *Manager) TryRecoverObj(lom *core.LOM) error { - // RestoreObject is blocking: it waits for the object is recovered +// TODO -- FIXME: joggers, etc. +func (mgr *Manager) TryRecoverObj(lom *core.LOM) { go func() { if err := mgr.RestoreObject(lom); err != nil { - nlog.Errorf("Failed to recover an object '%s': %v", lom, err) + nlog.Errorln(core.T.String(), "failed to recover", lom.Cname(), "err:", err) } core.FreeLOM(lom) }() - - return nil } diff --git a/xact/xreg/bucket.go b/xact/xreg/bucket.go index a9e4580ad5..5f7476ecdb 100644 --- a/xact/xreg/bucket.go +++ b/xact/xreg/bucket.go @@ -70,8 +70,9 @@ func RenewBucketXact(kind string, bck *meta.Bck, args Args, buckets ...*meta.Bck return dreg.renew(e, bck, buckets...) } -func RenewECEncode(bck *meta.Bck, uuid, phase string, doRecover bool) RenewRes { - return RenewBucketXact(apc.ActECEncode, bck, Args{Custom: &ECEncodeArgs{Phase: phase, Recover: doRecover}, UUID: uuid}) +func RenewECEncode(bck *meta.Bck, uuid, phase string, checkAndRecover bool) RenewRes { + args := Args{Custom: &ECEncodeArgs{Phase: phase, Recover: checkAndRecover}, UUID: uuid} + return RenewBucketXact(apc.ActECEncode, bck, args) } func RenewMakeNCopies(uuid, tag string) {