From 2b8f4236e1bfb20068ee06fba2e389baab123b44 Mon Sep 17 00:00:00 2001
From: Stefano Belforte <stefano.belforte@cern.ch>
Date: Tue, 29 Oct 2024 20:30:51 +0100
Subject: [PATCH] report failing reason to log and console. FIx #5342 (#5343)

* better reporting, log and doc. Fix #5342

* cleanup

* fix except order
---
 bin/crab.py                               |  15 ++--
 src/python/CRABClient/Commands/recover.py | 105 ++++++++++++++++------
 2 files changed, 87 insertions(+), 33 deletions(-)

diff --git a/bin/crab.py b/bin/crab.py
index b67a8757..790a6804 100755
--- a/bin/crab.py
+++ b/bin/crab.py
@@ -132,8 +132,13 @@ def log_exception(exc_type, exc_value, tback):
             print("'" + str(args[0]) + "' is not a valid command.")
             self.parser.print_help()
             sys.exit(-1)
-        self.cmd = sub_cmd(self.logger, args[1:])
+        self.cmd = sub_cmd(self.logger, args[1:])  # the crab command to be executed
 
+        # Every command returns a dictionary which MUST contain the "commandStatus" key.
+        # Any value other then "SUCCESS" for this key indicates command failure
+        #   and will result in 'crab' terminating with non-zero exit code
+        # Additional keys may be present in the dictionary, depending on the specific command,
+        #   which are used to pass information to caller when CRABAPI is used
         returnDict = self.cmd()
         if returnDict['commandStatus'] != 'SUCCESS':
             raise CommandFailedException("Command %s failed" % str(args[0]))
@@ -172,14 +177,14 @@ def log_exception(exc_type, exc_value, tback):
             errorId = re.search(r'(?<=X-Error-Id:\s)[^\n]*', err).group(0)
             client.logger.info('Error Id: %s', errorId)
         logging.getLogger('CRAB3').exception('Caught RESTInterfaceException exception')
-    except ClientException as ce:
-        client.logger.error(ce)
-        logging.getLogger('CRAB3').exception('Caught ClientException exception')
-        exitcode = ce.exitcode
     except CommandFailedException as ce:
         client.logger.warning(ce)
         logging.getLogger('CRAB3').exception('Caught CommandFailedException exception')
         exitcode = 1
+    except ClientException as ce:
+        client.logger.error(ce)
+        logging.getLogger('CRAB3').exception('Caught ClientException exception')
+        exitcode = ce.exitcode
     except KeyboardInterrupt:
         client.logger.error('Keyboard Interrupted')
         exitcode = 1
diff --git a/src/python/CRABClient/Commands/recover.py b/src/python/CRABClient/Commands/recover.py
index ebeb1ec3..22d76cfc 100644
--- a/src/python/CRABClient/Commands/recover.py
+++ b/src/python/CRABClient/Commands/recover.py
@@ -28,7 +28,6 @@
 # step submit
 from CRABClient.Commands.submit import submit
 from CRABClient.UserUtilities import getColumn
-from CRABClient.ClientUtilities import colors
 from ServerUtilities import SERVICE_INSTANCES
 
 SPLITTING_RECOVER_LUMIBASED = set(("LumiBased", "Automatic", "EventAwareLumiBased"))
@@ -44,6 +43,17 @@ class recover(SubCommand):
     shortnames = ["rec"]
 
     def __call__(self):
+        """
+        Code is organized as a series of steps, if any step fails, command exits
+        Each step returns a "retval" dictionary which always contains keys: 'commandStatus' and 'step'
+          'step' value is the name of the step
+          'commandStatus' can be: SUCCESS, FAILED, NothingToDo
+        Only the first two can be returned by this method to crab.py, the latter "NothingToDo"
+          is used as a "break" to exit the chain of steps early and will be converted to SUCCES before
+          this method exits
+        Other keys may be present as present in the return dict of subcommands used in here
+          if a 'msg' key is present, stepExit will log that message
+        """
 
         retval = self.stepInit()
         if retval["commandStatus"] != "SUCCESS": return self.stepExit(retval)
@@ -62,6 +72,7 @@ def __call__(self):
             self.logger.debug("no need to run crab remake - self.restHostCommonname %s", self.restHostCommonname)
             self.crabProjDir = self.requestarea
 
+
         retval = self.stepValidate()
         if retval["commandStatus"] != "SUCCESS": return self.stepExit(retval)
 
@@ -83,7 +94,7 @@ def __call__(self):
             if retval["commandStatus"] != "SUCCESS": return self.stepExit(retval)
 
             if "recoverLumimaskPath" not in retval:
-                return retval
+                return self.stepExit(retval)
 
             retval = self.stepSubmitLumiBased(retval["recoverLumimaskPath"])
             if retval["commandStatus"] != "SUCCESS": return self.stepExit(retval)
@@ -110,6 +121,11 @@ def stepExit(self, retval):
         > retval = self.stepYYY()
         > if retval["commandStatus"] != "SUCCESS": return self.stepExit(retval)
         """
+        if 'msg' in retval:
+            self.logger.info("recover process prematurely exited during %s step", retval['step'])
+            self.logger.info(retval['msg'])
+        if retval['commandStatus'] == 'NothingToDo':
+            retval['commandStatus'] = "SUCCESS"  # tell crab.py to exit cleanly with no error
         return retval
 
     def stepInit(self):
@@ -132,7 +148,7 @@ def stepInit(self):
         self.failingTaskInfo = {}
         self.failedJobs = []
 
-        return {"commandStatus": "SUCCESS", "init": None }
+        return {"commandStatus": "SUCCESS", "step": "init"}
 
     def stepRemake(self):
         """
@@ -169,6 +185,9 @@ def stepRemake(self):
             retval = remakeCmd.remakecache(self.failingTaskName)
         self.logger.debug("stepRemakeAndValidate() - remake, retval: %s", retval)
         self.logger.debug("stepRemakeAndValidate() - remake, after, self.configuration: %s", self.configuration)
+        retval['step'] = "remake"
+        if retval['commandStatus'] != "SUCCESS":
+            retval['msg'] = "Could not remake the task project directory"
         return retval
 
     def stepValidate(self):
@@ -202,6 +221,7 @@ def stepValidate(self):
         splitalgo = getColumn(self.failingCrabDBInfo, 'tm_split_algo')
         if not splitalgo in SPLITTING_RECOVER_LUMIBASED.union(SPLITTING_RECOVER_FILEBASED):
             msg = 'crab recover supports only tasks with LumiBased and FileBased splitting, you have {}'.format(splitalgo)
+            self.logger.info(msg)
             return {"commandStatus": "FAILED", "step": "RemakeAndValidate" , "msg": msg }
 
         self.failingTaskInfo["splitalgo"] = splitalgo
@@ -210,7 +230,7 @@ def stepValidate(self):
 
         self.logger.debug("stepRemakeAndValidate() - failingtaskinfo - %s", self.failingTaskInfo)
 
-        return {"commandStatus": "SUCCESS", "validate": None }
+        return {"commandStatus": "SUCCESS", "step": "validate"}
 
     def stepStatus(self):
         """
@@ -261,6 +281,9 @@ def stepStatus(self):
         # [1, 2, 4]
         self.failedJobs = [job[1] for job in retval["jobList"] if job[0] == "failed"]
         self.logger.debug("stepStatus() - status, failedJobs: %s", self.failedJobs)
+        retval['step'] = "status"
+        if retval['commandStatus'] != "SUCCESS":
+            retval['msg'] = "Could not retrieve task status"
 
         return retval
 
@@ -270,22 +293,23 @@ def stepKill(self):
         - kills the original failing task
         """
         ## step2: kill
+        retval = {"step": "kill"}
 
         # if the task is already killed or about to be killed, do not kill again
         if self.failingTaskStatus["dbStatus"] == "KILLED" or \
             (self.failingTaskStatus["dbStatus"] in ("NEW", "QUEUED") and self.failingTaskStatus["command"] == "KILL"):
-            returnDict = {'kill' : 'already killed', 'commandStatus': 'SUCCESS'}
-            self.logger.info("step kill - task already killed")
-            return returnDict
+            retval['commandStatus'] = "SUCCESS"
+            self.logger.debug("step kill - task already killed")
+            return retval
 
         # avoid that crab operators kill users tasks by mistake.
         # if the user who is running crab recover differs from the one who submitted the original task,
         # then kill the task only if the option "--forcekill" is used.
         username = getUsername(self.proxyfilename, logger=self.logger)
         if self.failingTaskInfo["username"] != username and not self.options.__dict__["forceKill"]:
-            returnDict = {'kill' : 'do not kill task submitted by another user', 'commandStatus': 'FAILED'}
-            self.logger.info("step kill - task submitted by another user, will not kill it")
-            return returnDict
+            retval['commandStatus'] = "FAILED"
+            retval['msg'] = "task submitted by another user, will not kill it"
+            return retval
 
         cmdargs = []
         cmdargs.append("-d")
@@ -301,7 +325,7 @@ def stepKill(self):
         self.logger.debug("stepKill() - cmdargs: %s", cmdargs)
         killCmd = kill(logger=self.logger, cmdargs=cmdargs)
         with SubcommandExecution(self.logger, "kill") as _:
-            retval = killCmd()
+            retval.update(killCmd())
 
         self.logger.debug("stepKill() - retval: %s", retval)
         self.logger.debug("stepKill() - after, self.configuration: %s", self.configuration)
@@ -369,23 +393,28 @@ def stepCheckKill(self):
         self.logger.debug("stepCheckKill() - dagStatus %s", self.failingTaskStatus["dagStatus"])
         self.logger.debug("stepCheckKill() - dbStatus %s", self.failingTaskStatus["dbStatus"])
 
+        retval = {'step': "checkKill"}
+
         # check the task status. 
         # it does not make sense to recover a task in COMPLETED
         if not self.failingTaskStatus["status"] in ("SUBMITTED", "FAILED", "FAILED (KILLED)"):
-            msg = "In order to recover a task, the combined status of the task needs can not be {}".format(self.failingTaskStatus["status"])
-            return {"commandStatus": "FAILED", "step": "checkKill" , "msg": msg }
+            msg = "Tasks in status {} can not be recovered".format(self.failingTaskStatus["status"])
+            retval.update({"commandStatus": "NothingToDo", "msg": msg})
+            return retval
 
         # the status on the db should be submitted or killed. or about to be killed
         if self.failingTaskStatus["dbStatus"] in ("NEW", "QUEUED"):
             if not self.failingTaskStatus["command"] in ("KILL"):
                 msg = "In order to recover a task, when the status of the task in the oracle DB is {}, the task command can not be {}"\
                     .format(self.failingTaskStatus["dbStatus"], self.failingTaskStatus["command"])
-                return {"commandStatus": "FAILED", "step": "checkKill" , "msg": msg }
+                retval.update({"commandStatus": "FAILED", "msg": msg})
+                return retval
         else:
             if not self.failingTaskStatus["dbStatus"] in ("SUBMITTED", "KILLED"):
                 msg = "In order to recover a task, the status of the task in the oracle DB can not be {}"\
                     .format(self.failingTaskStatus["dbStatus"])
-                return {"commandStatus": "FAILED", "step": "checkKill" , "msg": msg }
+                retval.update({"commandStatus": "FAILED", "msg": msg})
+                return retval
 
         # make sure that the jobs ad publications are in a final state.
         # - [x] make sure that there are no ongoing transfers
@@ -396,7 +425,8 @@ def stepCheckKill(self):
         if not set(self.failingTaskStatus["jobsPerStatus"].keys()).issubset(terminalStates):
             msg = "In order to recover a task, all the jobs need to be in a terminal state ({}). You have {}"\
                 .format(terminalStates, self.failingTaskStatus["jobsPerStatus"].keys())
-            return {"commandStatus": "FAILED", "step": "checkKill" , "msg": msg }
+            retval.update({"commandStatus": "FAILED", "msg": msg})
+            return retval
 
         # - [x] make sure that there are no ongoing publications
         self.logger.debug("stepCheckKill - publication %s", self.failingTaskStatus["publication"] )
@@ -404,15 +434,18 @@ def stepCheckKill(self):
         if not set(self.failingTaskStatus["publication"].keys()).issubset(terminalStatesPub):
             msg = "In order to recover a task, publication for all the jobs need to be in a terminal state ({}). You have {}"\
                 .format(terminalStatesPub, self.failingTaskStatus["publication"].keys())
-            return {"commandStatus": "FAILED", "step": "checkKill" , "msg": msg }
+            retval.update({"commandStatus": "FAILED", "msg": msg})
+            return retval
 
         # - [x] if all jobs failed, then exit. it is better to submit again the task than using crab recover :)
         #       check that "failed" is the only key of the jobsPerStatus dictionary
         if set(self.failingTaskStatus["jobsPerStatus"].keys()) == set(("failed",)):
-            msg = "All the jobs of the original task failed. better submitting it again from scratch than recovering it."
-            return {"commandStatus": "FAILED", "step": "checkKill" , "msg": msg }
+            msg = "All the jobs of the original task failed. Better investigate and submit it again than recover."
+            retval.update({"commandStatus": "FAILED", "msg": msg})
+            return retval
 
-        return {"commandStatus": "SUCCESS", "checkkill": "task can be recovered"}
+        retval.update({"commandStatus": "SUCCESS", "msg": "task can be recovered"})
+        return retval
 
     def stepReport(self):
         """
@@ -425,6 +458,8 @@ def stepReport(self):
           with the output of crab report
         """
 
+        retval = {"step": "report"}
+
         failingTaskPublish = getColumn(self.failingCrabDBInfo, 'tm_publication')
         self.logger.debug("stepReport() - tm_publication: %s %s", type(failingTaskPublish), failingTaskPublish)
         # - if the user specified --strategy=notPublished but the original failing task
@@ -459,7 +494,7 @@ def stepReport(self):
         reportCmd = report(logger=self.logger, cmdargs=cmdargs)
         with SubcommandExecution(self.logger, "report") as _:
             # FIXME - stays noisy because interference with getMutedStatusInfo()
-            retval = reportCmd()
+            retval.update(reportCmd())
         self.logger.debug("stepReport() - report, after, self.configuration: %s", self.configuration)
         self.logger.debug("stepReport() - report, retval: %s", retval)
 
@@ -486,17 +521,20 @@ def stepReport(self):
                 # we will likely never reach this if, because in this case the status on the schedd
                 # should be COMPLETED, which is not accepted by stepCheckKill
                 self.logger.info("stepReport() - all lumis have been processed by original task. crab recover will exit")
+                retval.update({'commandStatus' : 'SUCCESS'})
+                return retval
 
         self.logger.debug("crab report - recovery task will process lumis contained in file %s", recoverLumimaskPath)
 
 
         if os.path.exists(recoverLumimaskPath):
-            returnDict = {'commandStatus' : 'SUCCESS', 'recoverLumimaskPath': recoverLumimaskPath}
+            retval.update({'commandStatus' : 'SUCCESS', 'recoverLumimaskPath': recoverLumimaskPath})
         else:
-            msg = 'the file {} does not exist. crab report could not produce it, the task can not be recovered'.format(recoverLumimaskPath)
-            returnDict = {'commandStatus' : 'FAILED', 'msg': msg}
+            msg = 'File {} does not exist. crab report could not produce it, the task can not be recovered'.format(recoverLumimaskPath)
+            self.logger.info(msg)
+            retval.update({'commandStatus' : 'FAILED', 'msg': msg})
 
-        return returnDict
+        return retval
 
     def stepGetsandbox(self):
         """
@@ -504,6 +542,8 @@ def stepGetsandbox(self):
         - download the user_ and debug_sandbox from s3 or from the schedd
         """
 
+        retval = {"step": "getSandbox"}
+
         cmdargs = []
         cmdargs.append("-d")
         cmdargs.append(str(self.crabProjDir))
@@ -516,7 +556,7 @@ def stepGetsandbox(self):
         self.logger.debug("stepGetsandbox() - cmdargs: %s", cmdargs)
         getsandboxCmd = getsandbox(logger=self.logger, cmdargs=cmdargs)
         with SubcommandExecution(self.logger, "getsandbox") as _:
-            retval = getsandboxCmd()
+            retval.update(getsandboxCmd())
         self.logger.debug("stepGetsandbox() - retval: %s", retval)
         return retval
 
@@ -528,6 +568,9 @@ def stepExtractSandbox(self, sandbox_paths):
         - extracts the user_ and debug_sandbox, so that the files that they contain
           can be used by crab submit at a later step
         """
+
+        retval = {"step": "extractSandbox"}
+
         debug_sandbox = tarfile.open(sandbox_paths[0])
         debug_sandbox.extractall(path=os.path.join(self.crabProjDir, "user_sandbox"))
         debug_sandbox.close()
@@ -539,7 +582,8 @@ def stepExtractSandbox(self, sandbox_paths):
         self.recoverconfig = os.path.join(self.crabProjDir, "debug_sandbox", 
                                           "debug" , "crabConfig.py")
 
-        return {"commandStatus": "SUCCESS", }
+        retval.update({"commandStatus": "SUCCESS"})
+        return retval
 
     def stepSubmitLumiBased(self, notFinishedJsonPath):
         """
@@ -551,6 +595,8 @@ def stepSubmitLumiBased(self, notFinishedJsonPath):
         - submits a new task
         """
 
+        retval = {"step": "submitLumiBased"}
+
         cmdargs = []
         cmdargs.append("-c")
         cmdargs.append(self.recoverconfig)
@@ -588,7 +634,7 @@ def stepSubmitLumiBased(self, notFinishedJsonPath):
         submitCmd = submit(logger=self.logger, cmdargs=cmdargs)
 
         # with SubcommandExecution(self.logger, "submit") as _:
-        retval = submitCmd()
+        retval.update(submitCmd())
         self.logger.debug("stepSubmit() - retval %s", retval)
         return retval
 
@@ -602,8 +648,11 @@ def stepSubmitFileBased(self):
         - [ ] if the input is from DBS, then write info to runs_and_lumis.tar.gz
         """
 
+        retval = {"step": "submitFileBased"}
+
         # TODO
         # I will need to implement this!
+        raise NotImplementedError
         return {'commandStatus': 'FAILED', 'error': 'not implemented yet'}
 
     def setOptions(self):