Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

moved logs connected to unknown undeployable to Devel #631

Merged
merged 1 commit into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions core/environment/transition_deploy.go
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,7 @@ func (t DeployTransition) do(env *Environment) (err error) {
WithField("partition", env.Id().String()).
WithField("timeout", deploymentTimeout).
WithField("detector", detector).
WithField("level", infologger.IL_Devel).
Error("role failed to deploy within timeout")
undeployableTaskRoles = append(undeployableTaskRoles, role.GetPath())
}
Expand Down Expand Up @@ -275,6 +276,7 @@ func (t DeployTransition) do(env *Environment) (err error) {
WithField("partition", env.Id().String()).
WithField("timeout", deploymentTimeout).
WithField("detector", detector).
WithField("level", infologger.IL_Devel).
Error("role failed to deploy because of timeout")
undeployableTaskRoles = append(undeployableTaskRoles, role.GetPath())
} else if roleStatus != task.ACTIVE {
Expand All @@ -288,6 +290,7 @@ func (t DeployTransition) do(env *Environment) (err error) {
WithField("partition", env.Id().String()).
WithField("timeout", deploymentTimeout).
WithField("detector", detector).
WithField("level", infologger.IL_Devel).
Error("role failed to deploy because of timeout")
inactiveTaskRoles = append(inactiveTaskRoles, role.GetPath())
}
Expand Down
50 changes: 23 additions & 27 deletions core/task/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,8 @@ func (m *Manager) newTaskForMesosOffer(
offer *mesos.Offer,
descriptor *Descriptor,
localBindMap channel.BindMap,
executorId mesos.ExecutorID) (t *Task) {
executorId mesos.ExecutorID,
) (t *Task) {
newId := uid.New().String()
t = &Task{
name: fmt.Sprintf("%s#%s", descriptor.TaskClassName, newId),
Expand Down Expand Up @@ -197,8 +198,8 @@ func getTaskClassList(taskClassesRequired []string) (taskClassList []*taskclass.
if err != nil {
return
}
repo := repoManager.GetAllRepos()[tempRepo.GetIdentifier()] //get IRepo pointer from RepoManager
if repo == nil { //should never end up here
repo := repoManager.GetAllRepos()[tempRepo.GetIdentifier()] // get IRepo pointer from RepoManager
if repo == nil { // should never end up here
return nil, errors.New("getTaskClassList: repo not found for " + taskClass)
}

Expand All @@ -223,7 +224,6 @@ func getTaskClassList(taskClassesRequired []string) (taskClassList []*taskclass.
taskInfo := strings.Split(taskPath, "/tasks/")
if len(taskInfo) == 1 {
taskFilename = taskInfo[0]

} else {
taskFilename = taskInfo[1]
}
Expand Down Expand Up @@ -280,7 +280,7 @@ func (m *Manager) removeInactiveClasses() {
return
}

func (m *Manager) RemoveReposClasses(repoPath string) { //Currently unused
func (m *Manager) RemoveReposClasses(repoPath string) { // Currently unused
utils.EnsureTrailingSlash(&repoPath)

_ = m.classes.Do(func(classMap *map[string]*taskclass.Class) error {
Expand Down Expand Up @@ -335,7 +335,6 @@ func logDescriptors(prefix string, logFunc func(format string, args ...interface
}

func (m *Manager) acquireTasks(envId uid.ID, taskDescriptors Descriptors) (err error) {

/*
Here's what's gonna happen:
1) check if any tasks are already in Roster, whether they are already locked
Expand Down Expand Up @@ -561,7 +560,8 @@ func (m *Manager) acquireTasks(envId uid.ID, taskDescriptors Descriptors) (err e
// ↑ Not all roles could be deployed. If some were critical,
// we cannot proceed with running this environment. Either way,
// we keep the roles running since they might be useful in the future.
logWithId.Errorf("environment deployment failure: %d tasks requested for deployment, but %d deployed", len(tasksToRun), len(deployedTasks))
logWithId.WithField("level", infologger.IL_Devel).
Errorf("environment deployment failure: %d tasks requested for deployment, but %d deployed", len(tasksToRun), len(deployedTasks))

for _, desc := range undeployedDescriptors {
if desc.TaskRole.GetTaskTraits().Critical == true {
Expand Down Expand Up @@ -596,16 +596,21 @@ func (m *Manager) acquireTasks(envId uid.ID, taskDescriptors Descriptors) (err e
break DEPLOYMENT_ATTEMPTS_LOOP
}

log.WithField("partition", envId).Errorf("Deployment failed %d/%d attempts. Check messages in IL to figure out why. Retrying...", attemptCount+1, MAX_ATTEMPTS_PER_DEPLOY_REQUEST)
log.WithField("partition", envId).
WithField("level", infologger.IL_Devel).
Errorf("Deployment failed %d/%d attempts. Check messages in IL to figure out why. Retrying...", attemptCount+1, MAX_ATTEMPTS_PER_DEPLOY_REQUEST)
time.Sleep(time.Second * SLEEP_LENGTH_BETWEEN_PER_DEPLOY_REQUESTS)
}
}

logDescriptors("critical task deployment impossible: ", logWithId.Errorf, undeployableCriticalDescriptors)
logDescriptors("critical task deployment failure: ", logWithId.Errorf, undeployedCriticalDescriptors)
{
logWithIdDev := logWithId.WithField("level", infologger.IL_Devel)
logDescriptors("critical task deployment impossible: ", logWithIdDev.Errorf, undeployableCriticalDescriptors)
logDescriptors("critical task deployment failure: ", logWithIdDev.Errorf, undeployedCriticalDescriptors)

logDescriptors("non-critical task deployment failure: ", logWithId.Warningf, undeployedNonCriticalDescriptors)
logDescriptors("non-critical task deployment impossible: ", logWithId.Warningf, undeployableNonCriticalDescriptors)
logDescriptors("non-critical task deployment failure: ", logWithIdDev.Warningf, undeployedNonCriticalDescriptors)
logDescriptors("non-critical task deployment impossible: ", logWithIdDev.Warningf, undeployableNonCriticalDescriptors)
}

// After retries notify environment about failed critical tasks
for _, desc := range undeployableDescriptors {
Expand All @@ -617,11 +622,8 @@ func (m *Manager) acquireTasks(envId uid.ID, taskDescriptors Descriptors) (err e
m.deployMu.Unlock()

if !deploymentSuccess {
// While all the required roles are running, for some reason we
// can't lock some of them, so we must roll back and keep them
// unlocked in the roster.
var deployedTaskIds []string
for taskPtr, _ := range deployedTasks {
for taskPtr := range deployedTasks {
taskPtr.SetParent(nil)
deployedTaskIds = append(deployedTaskIds, taskPtr.taskId)
}
Expand All @@ -636,11 +638,11 @@ func (m *Manager) acquireTasks(envId uid.ID, taskDescriptors Descriptors) (err e
}

// Finally, we write to the roster. Point of no return!
for taskPtr, _ := range deployedTasks {
for taskPtr := range deployedTasks {
m.roster.append(taskPtr)
}
if deploymentSuccess {
for taskPtr, _ := range deployedTasks {
for taskPtr := range deployedTasks {
taskPtr.GetParent().SetTask(taskPtr)
}
for taskPtr, descriptor := range tasksAlreadyRunning {
Expand All @@ -653,7 +655,6 @@ func (m *Manager) acquireTasks(envId uid.ID, taskDescriptors Descriptors) (err e
}

func (m *Manager) releaseTasks(envId uid.ID, tasks Tasks) error {

taskReleaseErrors := make(map[string]error)
taskIdsReleased := make([]string, 0)

Expand Down Expand Up @@ -710,7 +711,7 @@ func (m *Manager) configureTasks(envId uid.ID, tasks Tasks) error {
taskPath := task.GetParentRolePath()
for inbChName, endpoint := range task.GetLocalBindMap() {
var bindMapKey string
if strings.HasPrefix(inbChName, "::") { //global channel alias
if strings.HasPrefix(inbChName, "::") { // global channel alias
bindMapKey = inbChName

// deduplication
Expand Down Expand Up @@ -809,7 +810,6 @@ func (m *Manager) configureTasks(envId uid.ID, tasks Tasks) error {
func (m *Manager) transitionTasks(envId uid.ID, tasks Tasks, src string, event string, dest string, commonArgs controlcommands.PropertyMap) error {
notify := make(chan controlcommands.MesosCommandResponse)
receivers, err := tasks.GetMesosCommandTargets()

if err != nil {
return err
}
Expand Down Expand Up @@ -894,7 +894,6 @@ func (m *Manager) TriggerHooks(envId uid.ID, tasks Tasks) error {

notify := make(chan controlcommands.MesosCommandResponse)
receivers, err := tasks.GetMesosCommandTargets()

if err != nil {
return err
}
Expand Down Expand Up @@ -959,7 +958,6 @@ func (m *Manager) GetTask(id string) *Task {
}

func (m *Manager) updateTaskState(taskId string, state string) {

taskPtr := m.roster.getByTaskId(taskId)
if taskPtr == nil {
log.WithField("taskId", taskId).
Expand Down Expand Up @@ -1013,7 +1011,7 @@ func (m *Manager) updateTaskStatus(status *mesos.TaskStatus) {
}
if ack, ok := m.ackKilledTasks.getValue(taskId); ok {
ack <- struct{}{}
//close(ack) // It can even be left open?
// close(ack) // It can even be left open?
}

return
Expand Down Expand Up @@ -1054,7 +1052,6 @@ func (m *Manager) updateTaskStatus(status *mesos.TaskStatus) {

// Kill all tasks outside an environment (all unlocked tasks)
func (m *Manager) Cleanup() (killed Tasks, running Tasks, err error) {

toKill := m.roster.filtered(func(t *Task) bool {
return !t.IsLocked()
})
Expand Down Expand Up @@ -1193,9 +1190,8 @@ func (m *Manager) handleMessage(tm *TaskmanMessage) error {
err := m.acquireTasks(tm.GetEnvironmentId(), tm.GetDescriptors())
if err != nil {
log.WithError(err).
WithField("level", infologger.IL_Devel).
WithField("partition", tm.GetEnvironmentId().String()).
Errorf("acquireTasks failed")
Errorf("Failed task creation and Mesos resources allocation during the deployment of the environment. For more details check Devel logs in Info Logger.")
}
}()
case taskop.ConfigureTasks:
Expand Down
1 change: 1 addition & 0 deletions core/task/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -592,6 +592,7 @@ func (state *schedulerState) resourceOffers(fidStore store.Singleton) events.Han
descriptorsUndeployable = append(descriptorsUndeployable, descriptor)
descriptorsStillToDeploy = append(descriptorsStillToDeploy[:i], descriptorsStillToDeploy[i+1:]...)
log.WithField("partition", envId.String()).
WithField("level", infologger.IL_Devel).
WithField("descriptor", descriptor.TaskClassName).
Errorf("no resource offer for required host %s, deployment will be aborted", requiredMachineId)
}
Expand Down
Loading