arcalot · jaredoconnell · Aug 16, 2024 · Jul 3, 2024 · Jul 9, 2024 · Jul 9, 2024
diff --git a/go.mod b/go.mod
@@ -4,14 +4,14 @@ go 1.21
 
 require (
 	go.arcalot.io/assert v1.8.0
-	go.arcalot.io/dgraph v1.4.1
+	go.arcalot.io/dgraph v1.5.0
 	go.arcalot.io/lang v1.1.0
 	go.arcalot.io/log/v2 v2.1.0
 	go.flow.arcalot.io/deployer v0.6.1
 	go.flow.arcalot.io/dockerdeployer v0.7.2
 	go.flow.arcalot.io/expressions v0.4.3
 	go.flow.arcalot.io/kubernetesdeployer v0.9.3
-	go.flow.arcalot.io/pluginsdk v0.12.1
+	go.flow.arcalot.io/pluginsdk v0.12.2
 	go.flow.arcalot.io/podmandeployer v0.11.2
 	go.flow.arcalot.io/pythondeployer v0.6.1
 	go.flow.arcalot.io/testdeployer v0.6.1
@@ -29,7 +29,7 @@ require (
 	github.com/docker/go-units v0.5.0 // indirect
 	github.com/emicklei/go-restful/v3 v3.12.0 // indirect
 	github.com/felixge/httpsnoop v1.0.4 // indirect
-	github.com/fxamacker/cbor/v2 v2.6.0 // indirect
+	github.com/fxamacker/cbor/v2 v2.7.0 // indirect
 	github.com/go-logr/logr v1.4.2 // indirect
 	github.com/go-logr/stdr v1.2.2 // indirect
 	github.com/go-openapi/jsonpointer v0.21.0 // indirect

diff --git a/go.sum b/go.sum
@@ -25,8 +25,8 @@ github.com/evanphx/json-patch v4.12.0+incompatible h1:4onqiflcdA9EOZ4RxV643DvftH
 github.com/evanphx/json-patch v4.12.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk=
 github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
 github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
-github.com/fxamacker/cbor/v2 v2.6.0 h1:sU6J2usfADwWlYDAFhZBQ6TnLFBHxgesMrQfQgk1tWA=
-github.com/fxamacker/cbor/v2 v2.6.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ=
+github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E=
+github.com/fxamacker/cbor/v2 v2.7.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ=
 github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
 github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY=
 github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
@@ -123,8 +123,8 @@ github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de
 github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
 go.arcalot.io/assert v1.8.0 h1:hGcHMPncQXwQvjj7MbyOu2gg8VIBB00crUJZpeQOjxs=
 go.arcalot.io/assert v1.8.0/go.mod h1:nNmWPoNUHFyrPkNrD2aASm5yPuAfiWdB/4X7Lw3ykHk=
-go.arcalot.io/dgraph v1.4.1 h1:y/lhJ68WzNUDR2BYSk7tZAZhVokts92svcrJLbK4Ebo=
-go.arcalot.io/dgraph v1.4.1/go.mod h1:+Kxc81utiihMSmC1/ttSPGLDlWPpvgOpNxSFmIDPxFM=
+go.arcalot.io/dgraph v1.5.0 h1:6cGlxLzmmehJoD/nj0Hkql7uh90EU0A0GtZhGYkr28M=
+go.arcalot.io/dgraph v1.5.0/go.mod h1:+Kxc81utiihMSmC1/ttSPGLDlWPpvgOpNxSFmIDPxFM=
 go.arcalot.io/exex v0.2.0 h1:u44pjwPwcH57TF8knhaqVZP/1V/KbnRe//pKzMwDpLw=
 go.arcalot.io/exex v0.2.0/go.mod h1:5zlFr+7vOQNZKYCNOEDdsad+z/dlvXKs2v4kG+v+bQo=
 go.arcalot.io/lang v1.1.0 h1:ugglRKpd3qIMkdghAjKJxsziIgHm8QpxrzZPSXoa08I=
@@ -139,8 +139,8 @@ go.flow.arcalot.io/expressions v0.4.3 h1:0BRRghutHp0sctsITHe/A1le0yYiJtKNTxm27T+
 go.flow.arcalot.io/expressions v0.4.3/go.mod h1:UORX78N4ep71wOzNXdIo/UY+6SdDD0id0mvuRNEQMeM=
 go.flow.arcalot.io/kubernetesdeployer v0.9.3 h1:XKiqmCqXb6ZLwP5IQTAKS/gJHpq0Ub/yEjCfgAwQF2A=
 go.flow.arcalot.io/kubernetesdeployer v0.9.3/go.mod h1:DtB6HR7HBt/HA1vME0faIpOQ/lhfBJjL6OAGgT3Bu/Q=
-go.flow.arcalot.io/pluginsdk v0.12.1 h1:HlWo1+Fn13u0EoeH9KpcWzdn1miqucNEvj0cirFhcA8=
-go.flow.arcalot.io/pluginsdk v0.12.1/go.mod h1:J0RYsfD6g1WKMVSbLGZR/ZJBVdcjt+bOuKoOvnNPpy4=
+go.flow.arcalot.io/pluginsdk v0.12.2 h1:ue+Ax1c3Bw0TmnuAEZ4r19D6ZA4dns9rP3NO4cvmTec=
+go.flow.arcalot.io/pluginsdk v0.12.2/go.mod h1:U+7mtXo4aT3YNn9fNWZ3b5RBvXttm/iocc2/P4MGa88=
 go.flow.arcalot.io/podmandeployer v0.11.2 h1:aqrHaNaCXYDREqDJpKhVeVIIZPiPld5SDvnUcc0Tjiw=
 go.flow.arcalot.io/podmandeployer v0.11.2/go.mod h1:70+9M6eVQa0EEynDZ720P3AEzXt1gZto2lvoMlyjuzo=
 go.flow.arcalot.io/pythondeployer v0.6.1 h1:IyaA9BVfHJ2fhC+fNfT6VicrtRGFlZOlSaAVGGPwo1E=

diff --git a/internal/step/dummy/provider_test.go b/internal/step/dummy/provider_test.go
@@ -14,6 +14,9 @@ type stageChangeHandler struct {
 	message chan string
 }
 
+func (s *stageChangeHandler) OnStepStageFailure(_ step.RunningStep, _ string, _ *sync.WaitGroup, _ error) {
+}
+
 func (s *stageChangeHandler) OnStageChange(_ step.RunningStep, _ *string, _ *string, _ *any, _ string, _ bool, _ *sync.WaitGroup) {
 
 }

diff --git a/internal/step/plugin/provider.go b/internal/step/plugin/provider.go
@@ -292,7 +292,7 @@ func (p *pluginProvider) LoadSchema(inputs map[string]any, _ map[string][]byte)
 			requestedDeploymentType, pluginSource, err)
 	}
 	// Set up the ATP connection
-	transport := atp.NewClientWithLogger(pluginConnector, p.logger)
+	transport := atp.NewClientWithLogger(pluginConnector, p.logger.WithLabel("source", "atp-client"))
 	// Read the schema information
 	s, err := transport.ReadSchema()
 	if err != nil {
@@ -688,6 +688,7 @@ func (r *runnableStep) Start(input map[string]any, runID string, stageChangeHand
 		runID:              runID,
 	}
 
+	s.wg.Add(1) // Wait for the run to finish before closing.
 	go s.run()
 
 	return s, nil
@@ -922,8 +923,8 @@ func (r *runningStep) closeComponents(closeATP bool) error {
 	return nil
 }
 
+// Note: Caller must add 1 to the waitgroup before calling.
 func (r *runningStep) run() {
-	r.wg.Add(1) // Wait for the run to finish before closing.
 	defer func() {
 		r.cancel()  // Close before WaitGroup done
 		r.wg.Done() // Done. Close may now exit.
@@ -958,6 +959,10 @@ func (r *runningStep) run() {
 		r.transitionToDisabled()
 		return
 	}
+
+	// It's enabled, so the disabled stage will not occur.
+	r.stageChangeHandler.OnStepStageFailure(r, string(StageIDDisabled), &r.wg, err)
+
 	if err := r.startStage(container); err != nil {
 		r.startFailed(err)
 		return
@@ -1055,7 +1060,7 @@ func (r *runningStep) enableStage() (bool, error) {
 
 func (r *runningStep) startStage(container deployer.Plugin) error {
 	r.logger.Debugf("Starting stage for step %s/%s", r.runID, r.pluginStepID)
-	atpClient := atp.NewClientWithLogger(container, r.logger)
+	atpClient := atp.NewClientWithLogger(container, r.logger.WithLabel("source", "atp-client"))
 	var inputReceivedEarly bool
 	r.lock.Lock()
 	r.atpClient = atpClient
@@ -1118,9 +1123,12 @@ func (r *runningStep) startStage(container deployer.Plugin) error {
 		return fmt.Errorf("schema mismatch between local and remote deployed plugin in step %s/%s, unserializing input failed (%w)", r.runID, r.pluginStepID, err)
 	}
 
+	r.wg.Add(1)
+
 	// Runs the ATP client in a goroutine in order to wait for it.
 	// On context done, the deployer has 30 seconds before it will error out.
 	go func() {
+		defer r.wg.Done()
 		result := r.atpClient.Execute(
 			schema.Input{RunID: r.runID, ID: r.pluginStepID, InputData: runInput},
 			r.signalToStep,
@@ -1169,6 +1177,27 @@ func (r *runningStep) runStage() error {
 	return nil
 }
 
+func (r *runningStep) markStageFailures(firstStage StageID, err error) {
+	switch firstStage {
+	case StageIDEnabling:
+		r.stageChangeHandler.OnStepStageFailure(r, string(StageIDEnabling), &r.wg, err)
+		fallthrough
+	case StageIDDisabled:
+		r.stageChangeHandler.OnStepStageFailure(r, string(StageIDDisabled), &r.wg, err)
+		fallthrough
+	case StageIDStarting:
+		r.stageChangeHandler.OnStepStageFailure(r, string(StageIDStarting), &r.wg, err)
+		fallthrough
+	case StageIDRunning:
+		r.stageChangeHandler.OnStepStageFailure(r, string(StageIDRunning), &r.wg, err)
+		fallthrough
+	case StageIDOutput:
+		r.stageChangeHandler.OnStepStageFailure(r, string(StageIDOutput), &r.wg, err)
+	default:
+		panic("unknown StageID")
+	}
+}
+
 func (r *runningStep) deployFailed(err error) {
 	r.logger.Debugf("Deploy failed stage for step %s/%s", r.runID, r.pluginStepID)
 	r.transitionStage(StageIDDeployFailed, step.RunningStepStateRunning)
@@ -1180,6 +1209,9 @@ func (r *runningStep) deployFailed(err error) {
 		Error: err.Error(),
 	})
 	r.completeStep(StageIDDeployFailed, step.RunningStepStateFinished, &outputID, &output)
+	// If deployment fails, enabling, disabled, starting, running, and output cannot occur.
+	err = fmt.Errorf("deployment failed for step %s/%s", r.runID, r.pluginStepID)
+	r.markStageFailures(StageIDEnabling, err)
 }
 
 func (r *runningStep) transitionToCancelled() {
@@ -1188,6 +1220,12 @@ func (r *runningStep) transitionToCancelled() {
 	r.transitionStage(StageIDCancelled, step.RunningStepStateRunning)
 	// Cancelled currently has no output.
 	r.transitionStage(StageIDCancelled, step.RunningStepStateFinished)
+
+	// This is called after deployment. So everything after deployment cannot occur.
+	err := fmt.Errorf("step %s/%s cancelled", r.runID, r.pluginStepID)
+	// Note: This function is only called if it's cancelled during the deployment phase.
+	// If that changes, the stage IDs marked as failed need to be changed.
+	r.markStageFailures(StageIDEnabling, err)
 }
 
 func (r *runningStep) transitionToDisabled() {
@@ -1207,6 +1245,10 @@ func (r *runningStep) transitionToDisabled() {
 		schema.PointerTo("output"),
 		&disabledOutput,
 	)
+
+	err := fmt.Errorf("step %s/%s disabled", r.runID, r.pluginStepID)
+	r.markStageFailures(StageIDStarting, err)
+
 }
 
 func (r *runningStep) startFailed(err error) {
@@ -1221,6 +1263,9 @@ func (r *runningStep) startFailed(err error) {
 	})
 
 	r.completeStep(StageIDCrashed, step.RunningStepStateFinished, &outputID, &output)
+
+	r.markStageFailures(StageIDRunning, err)
+
 }
 
 func (r *runningStep) runFailed(err error) {
@@ -1234,6 +1279,8 @@ func (r *runningStep) runFailed(err error) {
 		Output: err.Error(),
 	})
 	r.completeStep(StageIDCrashed, step.RunningStepStateFinished, &outputID, &output)
+
+	r.markStageFailures(StageIDOutput, err)
 }
 
 // TransitionStage transitions the stage to the specified stage, and the state to the specified state.

diff --git a/internal/step/plugin/provider_test.go b/internal/step/plugin/provider_test.go
@@ -46,6 +46,10 @@ func (s *deployFailStageChangeHandler) OnStepComplete(
 	s.message <- message
 }
 
+func (s *deployFailStageChangeHandler) OnStepStageFailure(_ step.RunningStep, _ string, _ *sync.WaitGroup, _ error) {
+
+}
+
 type startFailStageChangeHandler struct {
 	message chan string
 }
@@ -79,6 +83,10 @@ func (s *startFailStageChangeHandler) OnStepComplete(
 	s.message <- message
 }
 
+func (s *startFailStageChangeHandler) OnStepStageFailure(_ step.RunningStep, _ string, _ *sync.WaitGroup, _ error) {
+
+}
+
 type stageChangeHandler struct {
 	message chan string
 }
@@ -111,6 +119,10 @@ func (s *stageChangeHandler) OnStepComplete(
 	s.message <- message
 }
 
+func (s *stageChangeHandler) OnStepStageFailure(_ step.RunningStep, _ string, _ *sync.WaitGroup, _ error) {
+
+}
+
 func TestProvider_MissingDeployer(t *testing.T) {
 	logger := log.New(
 		log.Config{

diff --git a/internal/step/provider.go b/internal/step/provider.go
@@ -56,6 +56,15 @@ type StageChangeHandler interface {
 		previousStageOutput *any,
 		wg *sync.WaitGroup,
 	)
+
+	// OnStepStageFailure is called when it becomes known that the step's stage will not produce an output.
+	// The error is optional.
+	OnStepStageFailure(
+		step RunningStep,
+		stage string,
+		wg *sync.WaitGroup,
+		err error,
+	)
 }
 
 // RunnableStep is a step that already has a schema and can be run.

diff --git a/workflow/model.go b/workflow/model.go
@@ -2,12 +2,10 @@ package workflow
 
 import (
 	"fmt"
-	"regexp"
-	"strings"
-
 	"go.arcalot.io/dgraph"
 	"go.flow.arcalot.io/engine/internal/step"
 	"go.flow.arcalot.io/pluginsdk/schema"
+	"regexp"
 )
 
 // Workflow is the primary data structure describing workflows.
@@ -229,30 +227,23 @@ type ErrNoMorePossibleSteps struct {
 
 // Error returns an explanation on why the error happened.
 func (e ErrNoMorePossibleSteps) Error() string {
-	var outputsUnmetDependencies []string //nolint:prealloc
-	for _, node := range e.dag.ListNodes() {
-		if node.Item().Kind != DAGItemKindOutput {
-			continue
-		}
-		var unmetDependencies []string
-		inbound, err := node.ListInboundConnections()
-		if err != nil {
-			panic(fmt.Errorf("failed to fetch output node inbound dependencies (%w)", err))
-		}
-		for i := range inbound {
-			unmetDependencies = append(unmetDependencies, i)
-		}
-		outputsUnmetDependencies = append(
-			outputsUnmetDependencies,
-			fmt.Sprintf("%s: %s", node.Item().OutputID, strings.Join(unmetDependencies, ", ")),
-		)
-	}
 	return fmt.Sprintf(
-		"no steps running, no more executable steps, cannot construct any output (outputs have the following dependencies: %s)",
-		strings.Join(outputsUnmetDependencies, "; "),
+		"no steps running, no more executable steps; cannot construct any output." +
+			" this is the fallback system, indicating a failure of the output resolution system",
 	)
 }
 
+// ErrNoMorePossibleOutputs indicates that the workflow has terminated due to it being impossible to resolve an output.
+// This means that steps that the output(s) depended on did not have the required results.
+type ErrNoMorePossibleOutputs struct {
+	dag dgraph.DirectedGraph[*DAGItem]
+}
+
+// Error returns an explanation on why the error happened.
+func (e ErrNoMorePossibleOutputs) Error() string {
+	return "all outputs marked as unresolvable"
+}
+
 // ErrInvalidState indicates that the workflow failed due to an invalid state.
 type ErrInvalidState struct {
 	processingSteps int

diff --git a/workflow/model_test.go b/workflow/model_test.go
@@ -6,7 +6,7 @@ import (
 	"testing"
 )
 
-var versionExp = "v0.2.0"
+var testVersionExp = "v0.2.0"
 var inputExp = map[string]any{
 	"root": "RootObject",
 	"objects": map[string]any{
@@ -24,9 +24,9 @@ var stepsExp = map[string]any{
 		"input": map[string]any{
 			"wait_time_ms": 1}},
 }
-var outputID = "success"
+var testExpectedOutputID = "success"
 var outputsExp = map[string]any{
-	outputID: "!expr $.steps.long_wait.outputs",
+	testExpectedOutputID: "!expr $.steps.long_wait.outputs",
 }
 var outputSchemaRootID = "RootObjectOut"
 var stepOutputSchemaInput = map[string]any{
@@ -42,10 +42,10 @@ var stepOutputSchemaInput = map[string]any{
 						}}}}}},
 }
 var outputSchemaInput = map[string]any{
-	outputID: stepOutputSchemaInput,
+	testExpectedOutputID: stepOutputSchemaInput,
 }
 var workflowSchemaInput = map[string]any{
-	"version":      versionExp,
+	"version":      testVersionExp,
 	"input":        inputExp,
 	"steps":        stepsExp,
 	"outputs":      outputsExp,