Processing on OSG #67

Open · wants to merge 113 commits into base: master

Changes from 1 commit

113 commits
9da727a
Incorporate OSG Connect as a processing site, first steps:
Jun 24, 2016
b08ae8f
Merge with master v3.0.5
ershockley Jun 29, 2016
59e0457
prepare process.py for installation on OASIS
ershockley Jul 5, 2016
c546e55
Prepare for testing api/osg_dev on ci-connect machine. Python not wor…
ershockley Aug 30, 2016
529a72e
Revert back to old (semi-working) version of OSG_dev and add api.py
ershockley Aug 30, 2016
8aa8fca
Revert back to old (semi-working) version of OSG_dev and add api.py
ershockley Aug 30, 2016
b3d4600
Merge branch 'OSG_dev2' of https://github.com/XENON1T/cax into OSG_dev2
ershockley Aug 30, 2016
732517a
This version launches processing but throws a pickling error
Aug 31, 2016
fbe8a9c
Working version on midway
Sep 1, 2016
e49b71f
Update for login.ci-connect.uchicago.edu
ershockley Sep 1, 2016
17ff365
first attempt at resolving conflicts from merge with master
ershockley Sep 7, 2016
85984da
Still debugging. Pushing to be able to test on midway (ci running slo…
ershockley Sep 9, 2016
b3479b9
API calls work on OSG. Need to implement workaround for pax pymongo call
ershockley Sep 15, 2016
d14a5d0
modify process.py so that config dictionary for processing called usi…
ershockley Sep 20, 2016
c7942c8
Processing works on OSG, though bookmarking still needs polishing. La…
ershockley Nov 9, 2016
536de88
Adding first pass of OSG scripts
briedel Nov 9, 2016
2bea618
Improved version
briedel Nov 10, 2016
755d882
debugging, minor changes
briedel Nov 13, 2016
5babaea
Adding Friends of MWT2 special sauce
briedel Nov 13, 2016
f7f86d9
Modify cax to submit condor dagman jobs rather than regular jobs. Min…
ershockley Nov 14, 2016
47dfda8
Adding preliminary OSG guide
briedel Nov 15, 2016
71ee993
Merge branch 'OSG_dev2' of https://github.com/XENON1T/cax into OSG_dev2
briedel Nov 15, 2016
e261147
Adding more OSG docs
briedel Nov 15, 2016
0fe90a6
Small changes
briedel Nov 16, 2016
c59c515
Adding first draft of rucio docs
briedel Nov 22, 2016
518f8cd
Breaking out admin guide
briedel Nov 23, 2016
4724bde
Moving more things around
briedel Nov 23, 2016
9eb2b03
Add post script to dag and first implementation of database bookkeeping
ershockley Nov 24, 2016
cbaa0da
Adding tutorial
briedel Nov 30, 2016
2ea9df2
Update run_xenon.sh
tunnell Nov 30, 2016
62017c1
Adding more detail to tutorial
briedel Dec 1, 2016
d413397
Merge branch 'OSG_dev2' of https://github.com/XENON1T/cax into OSG_dev2
briedel Dec 1, 2016
759a6ae
Minor formatting
briedel Dec 1, 2016
90b4d63
Make several changes to optimize dag submission, post scripts, pre sc…
ershockley Dec 5, 2016
45ae1bf
Merge with Benedikts changes in osg_scripts
ershockley Dec 5, 2016
b285824
Make several changes including dag submission, post scripts, pre scri…
ershockley Dec 5, 2016
89eee80
Add flexible API queries. Some other small changes
ershockley Dec 6, 2016
fb7db43
More small changes
ershockley Dec 6, 2016
798ba77
Merge branches 'OSG_dev2' and 'master' of https://github.com/XENON1T/…
pdeperio Dec 7, 2016
16c6971
Missed additional merge from HEAD
pdeperio Dec 7, 2016
d247b3c
Exception to not purge Stash processing errors
pdeperio Dec 7, 2016
56be9f4
Fix conditional bug
pdeperio Dec 7, 2016
77d8081
Remove .idea stuff
pdeperio Dec 7, 2016
6d64324
Merge branch 'OSG_dev2' of https://github.com/XENON1T/cax into OSG_dev2
pdeperio Dec 7, 2016
bad0664
Make thishost and pax_version common member variables
pdeperio Dec 7, 2016
e921024
Fix qsub for massive-cax
pdeperio Dec 7, 2016
ea50970
Last commit before merging with master-merged OSG_dev2
ershockley Dec 12, 2016
1aa0af8
Merge branch 'OSG_dev2' of https://github.com/XENON1T/cax into OSG_dev2
ershockley Dec 12, 2016
ab8c572
Small fix in cax.json
ershockley Dec 12, 2016
9cd2325
Fix bug allowing for duplicate DB entries on midway. Also change oute…
ershockley Dec 16, 2016
9a8ba4b
Add some optimizations to OSG processing. Make cax-like processing wo…
ershockley Jan 3, 2017
beb30ad
Update run_xenon.sh for rucio downloads. Small change in process.py t…
ershockley Feb 18, 2017
524917c
add get_gains.py. Originally written to only try processing runs that…
ershockley Feb 18, 2017
48bc695
Add one-line fix for to corrections.py to add acquisition monitor cha…
ershockley Feb 18, 2017
8045b1a
Merge with master to implement rsync and ruciax
ershockley Feb 19, 2017
2f80f25
Update dag_writer.py
ershockley Feb 27, 2017
13c0886
Add midway gfal functionality to dag_writer.py
ershockley Feb 28, 2017
b3da3df
Optimizations on DAG side of things. Make sure rsync implemented
ershockley Mar 9, 2017
ac8000c
have dag_writer.py skips runs if raw data not in chicago (stash, midw…
ershockley Mar 14, 2017
ef82706
Add drift velocity check to dag_prescript.py. Few other small changes.
ershockley Mar 20, 2017
fb3f681
Implement Automatic OSG processing
ershockley Mar 28, 2017
29cb5ee
Implement MV processing on OSG. Other small changes including queries…
ershockley May 10, 2017
7673e68
Optimizations in run_xenon.sh for rucio transfers, few other small ch…
ershockley May 17, 2017
755ff45
Fix things that broke for some reason in DB handling
ershockley May 23, 2017
50f87a7
Fix DB handling which broke somehow
ershockley May 23, 2017
8f84294
Make daily processing work again. Not sure what happened...
ershockley May 23, 2017
99724dc
Make daily processing work again. Not sure what happened...
ershockley May 23, 2017
8206766
Merge branch 'OSG_dev2' of https://github.com/XENON1T/cax into OSG_dev2
ershockley May 23, 2017
d955f5a
Process NG data only
ershockley May 26, 2017
b28b706
Starting to make changes to api, but not finished
ershockley May 27, 2017
b697d8b
Add --api option so that we can circumvent pymongo calls
ershockley May 27, 2017
1b34fd9
Add --api option so that we can circumvent pymongo calls
ershockley May 27, 2017
7d057d5
remove hardcoded 'transferred' in data_mover.py. Other small changes
ershockley Jun 1, 2017
8b3ccb9
Remove NG specification in process.py
ershockley Jun 1, 2017
68154d5
fix conflicts in api.py and data_mover.py
ershockley Jun 1, 2017
2db8120
Some cleaning up. Add .gitignore and stage_data.py
ershockley Jun 1, 2017
f1a7cc0
Implement MV changes. Some other small changes.
ershockley Jun 6, 2017
f8707bd
Implement MV changes. Some other small changes.
ershockley Jun 6, 2017
2c3de99
Fix small bug in process.py
ershockley Jun 6, 2017
e7f218d
Small changes. Commit before dag_writer.py overhaul
ershockley Jun 9, 2017
bc08822
Add determine_rse.py so that jobs download from European rucio endpoi…
ershockley Jun 11, 2017
e5a670b
Remove ershockley dependence
ershockley Jun 11, 2017
2dbd1d0
More changes - hopefully to remove any ershockley dependence
ershockley Jun 16, 2017
d041356
Alfio's info
ershockley Jun 21, 2017
6b18a4a
Set GLIDEIN_Country to US if not set
ershockley Jun 21, 2017
bdde49a
Add dag_config
ershockley Jun 21, 2017
a8785f2
Merge branch 'any_user' of https://github.com/XENON1T/cax into any_user
ershockley Jun 21, 2017
c1744d1
More fixes to allow any user to run code
ershockley Jul 5, 2017
91b1b8b
dunno
ershockley Jul 9, 2017
c166b4b
Merge branch 'OSG_dev2' of https://github.com/XENON1T/cax into OSG_dev2
ershockley Jul 9, 2017
0030597
multiple retries for copy back to stash
ershockley Jul 10, 2017
948d14c
Change DB query to only process newest runs with v6.6.6
ershockley Jul 14, 2017
ea972a3
Add curl commands for job monitoring
ershockley Jul 14, 2017
90046dd
Fix bug in run_xenon.sh
ershockley Jul 14, 2017
f3878cc
Updates to make_runlist.py, some other small improvements
ershockley Jul 16, 2017
d5fab9d
Remove annoying print statement
ershockley Jul 17, 2017
4dfd0c3
Merge
ershockley Jul 17, 2017
68b6a60
Clean up finished
ershockley Aug 11, 2017
8cddb56
Commit before merging with benedikt cleanup
ershockley Aug 16, 2017
d092e2b
Merge with benedikt_cleanup
ershockley Aug 16, 2017
44d673e
Revert back to old commit
ershockley Aug 17, 2017
221317b
dumb commit
ershockley Aug 17, 2017
0e697eb
Merge branch 'revert' into OSG_dev2
ershockley Aug 17, 2017
889e932
Improvements with permissions setting, etc.
ershockley Aug 22, 2017
9caed6d
Hopefully fix for any user to use again
ershockley Aug 25, 2017
305e4ec
Add some hardcoded pieces but hopefully will result in less errors
ershockley Aug 29, 2017
b6000e3
Change settings for SR1 reprocessing
ershockley Sep 14, 2017
de24493
Update hadd_and_upload.sh
ershockley Sep 14, 2017
58c948f
Modify caxdir lines to use home directory
ershockley Sep 18, 2017
7ee5800
Try to remove gfal-debug stuff?
ershockley Sep 19, 2017
4e781c8
Shouldn't be any major changes - SR1 daily processing running normal
ershockley Oct 26, 2017
8d2a1e7
Small changes to dag_writer, adding SURFSARA
ershockley Oct 31, 2017
55fabbf
Remove gfal debug, add CNAF to some of the rucio scripts, other small…
ershockley Nov 6, 2017
Add --api option so that we can circumvent pymongo calls
ershockley committed May 27, 2017

commit 1b34fd935c64d3eb108288a0aecb5cf435285b91
cax/api.py (31 changes: 18 additions & 13 deletions)
@@ -15,7 +15,7 @@ def __init__(self):

# Runs DB Query Parameters
self.api_url = config.API_URL
self.api_schema = "https://xenon1t-daq.lngs.infn.it"
self.api_schema = "https://xenon1t-daq.lngs.infn.it" # needed for using self.next_run
self.get_params = {
"username": config.api_user(),
"api_key": config.api_key(),
@@ -31,7 +31,7 @@ def __init__(self):

self.logging = logging.getLogger(self.__class__.__name__)

def get_next_run(self, query):
def get_next_run(self, query, _id=None):
ret = None
if self.next_run == None:
return ret
@@ -45,11 +45,13 @@ def get_next_run(self, query):

params['limit']=1
params['offset']=0


url = self.api_url if _id is None else (self.api_url + str(_id) + "/")

api_try = 1
while api_try <= 3:
try:
db_request = requests.get(self.api_url, params = params).text
db_request = requests.get(url, params = params).text
break
except:
time.sleep(5)
@@ -65,12 +67,16 @@ def get_next_run(self, query):

# Keep track of the next run so we can iterate.
if ret is not None:
self.next_run = ret['meta']['next']
if len(ret['objects'])==0:
return None
if _id is None:
self.next_run = ret['meta']['next']
if len(ret['objects'])==0:
return None

return ret['objects'][0]['doc']
return ret['objects'][0]['doc']

else:
self.next_run = None # otherwise self.get_all_runs would be an infinite loop
return ret['doc']
return None

def add_location(self, uuid, parameters):
@@ -132,12 +138,11 @@ def verify_site(self, sitea, siteb):
(sitea['type'] == siteb['type']) and
(sitea['location'] == siteb['location']))

def get_all_runs(self, query, limit):
def get_all_runs(self, query, _id=None):
# return list of rundocs for all runs satisfying query
collection = []
counter = 0
while self.next_run is not None and counter < limit:
collection.append(self.get_next_run(query))
counter += 1
query = {'query' : dumps(query,default=json_util.default)}
while self.next_run is not None:
collection.append(self.get_next_run(query, _id))

return collection
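
For orientation, the reworked interface is meant to be used roughly as follows. This is a minimal sketch, not code from the PR; it assumes cax is installed with API credentials configured via cax.config, and the query fields shown are purely illustrative.

```python
# Minimal sketch of the reworked api helper (illustrative, not part of the diff).
from cax.api import api

API = api()

# Iterate over every run document matching a query (fields are hypothetical).
query = {'detector': 'tpc'}
run_docs = API.get_all_runs(query)

# Fetch a single run document by its database _id, mimicking collection.find_one;
# this is the pattern Task.get_rundoc() below relies on ('some_run_id' is a placeholder).
run_doc = API.get_all_runs({'_id': 'some_run_id'}, _id='some_run_id')[0]
```

Passing _id routes the request to api_url + str(_id) + "/" instead of the list endpoint, which is also why get_next_run only advances self.next_run when _id is None.
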
cax/dag_writer.py (2 changes: 1 addition & 1 deletion)
@@ -63,7 +63,7 @@ def get_run_doc(self, run_id):
detector = 'muon_veto'

query = {identifier : run_id}
API = api(detector=detector)
API = api()
doc = API.get_next_run(query)
time.sleep(0.1)
return doc
cax/main.py (10 changes: 8 additions & 2 deletions)
@@ -38,6 +38,7 @@ def main():
help="Select a single run using the run name")
parser.add_argument('--host', type=str,
help="Host to pretend to be")
parser.add_argument('--api', action='store_true', help='Uses API interface instead of pymongo')

args = parser.parse_args()

@@ -92,12 +93,17 @@ def main():
args.config_file)
config.set_json(args.config_file)

if args.api:
use_api = True
else:
use_api = False

tasks = [
corrections.AddElectronLifetime(), # Add electron lifetime to run, which is just a function of calendar time
corrections.AddGains(), # Adds gains to a run, where this is computed using slow control information
#corrections.AddSlowControlInformation(),
data_mover.CopyPull(), # Download data through e.g. scp to this location
data_mover.CopyPush(), # Upload data through e.g. scp or gridftp to this location where cax running
data_mover.CopyPush(use_api=use_api), # Upload data through e.g. scp or gridftp to this location where cax running
#tsm_mover.AddTSMChecksum(), # Add forgotten Checksum for runDB for TSM client.
checksum.CompareChecksums(), # See if local data corrupted
checksum.AddChecksum(), # Add checksum for data here so can know if corruption (useful for knowing when many good copies!)
@@ -109,7 +115,7 @@ def main():

filesystem.SetPermission(), # Set any permissions (primarily for Tegner) for new data to make sure analysts can access
clear.BufferPurger(), # Clear old data at some locations as specified in cax.json
process.ProcessBatchQueue(), # Process the data with pax
process.ProcessBatchQueue(use_api=use_api), # Process the data with pax
process_hax.ProcessBatchQueueHax() # Process the data with hax
]
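
The new flag only decides whether downstream tasks reach the runs DB through the HTTP API or through pymongo. A condensed sketch of the wiring is below; it is not a verbatim excerpt of main.py, and the cax.tasks import path is assumed from the file layout in this diff.

```python
# Condensed sketch of the new --api wiring (not a verbatim excerpt of main.py).
import argparse

from cax.tasks import data_mover, process

parser = argparse.ArgumentParser()
parser.add_argument('--api', action='store_true',
                    help='Uses API interface instead of pymongo')
args = parser.parse_args()

# action='store_true' already yields a boolean, so this single assignment is
# equivalent to the if/else block added above.
use_api = args.api

tasks = [
    data_mover.CopyPush(use_api=use_api),        # DB bookkeeping via the API when requested
    process.ProcessBatchQueue(use_api=use_api),  # run selection via the API when requested
    # ... remaining tasks unchanged
]
```
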

cax/task.py (68 changes: 54 additions & 14 deletions)
@@ -7,14 +7,19 @@

from cax import config
from cax.dag_prescript import clear_errors
from cax.api import api

class Task():
def __init__(self, query = {}):
def __init__(self, query = {}, use_api = False):
# Grab the Run DB so we can query it
self.collection = config.mongo_collection()

if not use_api:
self.collection = config.mongo_collection()

self.log = logging.getLogger(self.__class__.__name__)
self.run_doc = None
self.untriggered_data = None
self.use_api = use_api

self.query = query

@@ -26,8 +31,7 @@ def go(self, specify_run = None):
if specify_run is not None:
if isinstance(specify_run,int):
self.query['number'] = specify_run
#if 'data' in self.query:
#clear_errors(specify_run, self.query["data"]["$not"]["$elemMatch"]['pax_version'])

elif isinstance(specify_run,str):
self.query['name'] = specify_run

@@ -36,27 +40,28 @@ def go(self, specify_run = None):

# Collect all run document ids. This has to be turned into a list
# to avoid timeouts if a task takes too long.
try:
ids = [doc['_id'] for doc in self.collection.find(self.query,
projection=('_id'),
sort=(('start', -1),))]
except pymongo.errors.CursorNotFound:
self.log.info("Curson not found exception. Skipping")

ids = self.collect_ids()
if ids is None:
self.log.info("Can't get run ids for some reason. Skipping")
return


if len(ids) == 0:
self.log.info("Query matches no entry. Skipping.")
return

# Iterate over each run
for id in ids:
# Make sure up to date
try:
self.run_doc = self.collection.find_one({'_id': id})
except pymongo.errors.AutoReconnect:
self.log.error("pymongo.errors.AutoReconnect, skipping...")
self.run_doc = self.get_rundoc(id)

if self.run_doc is None:
self.log.info("Problems getting rundoc for id %s. Skipping" % id)
continue

if 'data' not in self.run_doc:
self.log.info('Data not in run_doc')
continue

# Operate on only user-specified datasets
@@ -114,3 +119,38 @@ def shutdown(self):
"""Runs at end and can be overloaded by subclasses
"""
pass

def collect_ids(self):
# if not using API interface, do normal pymongo query which is faster
if not self.use_api:
try:
ids = [doc['_id'] for doc in self.collection.find(self.query,
projection=('_id'),
sort=(('start', -1),))]
except pymongo.errors.CursorNotFound:
self.log.info("Cursor not found exception. Skipping")
return

else: # slower but uses API which can be useful
# initialize api instance
API = api()
ids = [doc['_id'] for doc in API.get_all_runs(self.query)]

return ids


def get_rundoc(self, id):
if not self.use_api:
try:
rundoc = self.collection.find_one({'_id': id})
except pymongo.errors.AutoReconnect:
self.log.error("pymongo.errors.AutoReconnect, skipping...")
return

else:
# initialize api
API = api()
# only want the first result, mimics collection.find_one
rundoc = API.get_all_runs({'_id' : id}, _id=id)[0]

return rundoc
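
In short, Task now has two interchangeable back ends for run lookup: the original pymongo path (faster, but needs a direct MongoDB connection) and the API path (slower, but avoids pymongo entirely, which is the point of the --api option for OSG jobs). A minimal sketch of the switch, with a hypothetical query:

```python
# Minimal sketch: the same Task query runs against either back end (illustrative only).
from cax.task import Task

query = {'number': 6731}                    # hypothetical run selection

t_mongo = Task(query=query)                 # default: pymongo via config.mongo_collection()
t_api   = Task(query=query, use_api=True)   # HTTP API via cax.api.api; no pymongo connection

# Inside go(), collect_ids() and get_rundoc() choose the matching back end, so
# subclasses see the same run documents in self.run_doc either way.
```
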
cax/tasks/data_mover.py (83 changes: 47 additions & 36 deletions)
@@ -18,6 +18,7 @@
from cax import config
from cax.task import Task
from cax import qsub
from cax.api import api

from cax.tasks.tsm_mover import TSMclient
from cax.tasks.rucio_mover import RucioBase, RucioRule
@@ -688,7 +689,6 @@ def copy_handshake(self, datum, destination, method, option_type):
return

if datum['type'] == 'processed':
self.log.info(datum)
base_dir = os.path.join(base_dir, 'pax_%s' % datum['pax_version'])

# Check directory existence on local host for download only
@@ -734,14 +734,19 @@ def copy_handshake(self, datum, destination, method, option_type):
datum_new['location'] = "n/a"

if config.DATABASE_LOG == True:
result = self.collection.update_one({'_id': self.run_doc['_id'],
},
{'$push': {'data': datum_new}})
if not self.use_api:
result = self.collection.update_one({'_id': self.run_doc['_id'],
},
{'$push': {'data': datum_new}})

if result.matched_count == 0:
self.log.error("Race condition! Could not copy because another "
"process seemed to already start.")
return
if result.matched_count == 0:
self.log.error("Race condition! Could not copy because another "
"process seemed to already start.")
return

else:
API = api()
API.add_location(self.run_doc['_id'], datum_new)

self.log.info('Starting '+method)

@@ -772,35 +777,41 @@ def copy_handshake(self, datum, destination, method, option_type):
self.log.debug(method+" done, telling run database")

if config.DATABASE_LOG:
if method == "rucio":
logging.info("following entries are added to the runDB:")
logging.info("Status: %s", self.rucio.get_rucio_info()['status'] )
logging.info("Location: %s", self.rucio.get_rucio_info()['location'] )
logging.info("Checksum: %s", self.rucio.get_rucio_info()['checksum'] )
logging.info("RSE: %s", self.rucio.get_rucio_info()['rse'] )

self.collection.update({'_id' : self.run_doc['_id'],
'data': {
'$elemMatch': datum_new}},
{'$set': {
'data.$.status': self.rucio.get_rucio_info()['status'],
'data.$.location': self.rucio.get_rucio_info()['location'],
'data.$.checksum': self.rucio.get_rucio_info()['checksum'],
'data.$.rse': self.rucio.get_rucio_info()['rse']
}
})

else:
#Fill the data if method is not rucio
if config.DATABASE_LOG:
self.collection.update({'_id' : self.run_doc['_id'],
'data': {
if method == "rucio":
logging.info("following entries are added to the runDB:")
logging.info("Status: %s", self.rucio.get_rucio_info()['status'] )
logging.info("Location: %s", self.rucio.get_rucio_info()['location'] )
logging.info("Checksum: %s", self.rucio.get_rucio_info()['checksum'] )
logging.info("RSE: %s", self.rucio.get_rucio_info()['rse'] )

self.collection.update({'_id' : self.run_doc['_id'],
'data': {
'$elemMatch': datum_new}},
{'$set': {
'data.$.status': status
}
})

{'$set': {
'data.$.status': self.rucio.get_rucio_info()['status'],
'data.$.location': self.rucio.get_rucio_info()['location'],
'data.$.checksum': self.rucio.get_rucio_info()['checksum'],
'data.$.rse': self.rucio.get_rucio_info()['rse']
}
})

#Fill the data if method is not rucio
#if config.DATABASE_LOG:
else:
if not self.use_api:
self.collection.update({'_id' : self.run_doc['_id'],
'data': {
'$elemMatch': datum_new}},
{'$set': {
'data.$.status': status
}
})

else:
API = api()
updatum = datum_new.copy()
updatum['status'] = 'transferred' #status
API.update_location(self.run_doc['_id'], datum_new, updatum)

if method == "rucio":
#Rucio 'side load' to set the transfer rules directly after the file upload
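
With --api set (and DATABASE_LOG enabled), the copy handshake now records the new data entry and its final status through the HTTP API instead of collection.update_one / collection.update. Roughly, the API path boils down to the calls below; the field values are made up for illustration.

```python
# Illustrative sketch of the API-based bookkeeping path (values are placeholders).
from cax.api import api

API = api()
run_id = 'some_run_id'                    # _id of the run document (placeholder)
datum_new = {'host': 'login',             # destination host (hypothetical)
             'type': 'raw',
             'status': 'transferring',
             'location': 'n/a'}

# Register the pending copy on the run document ...
API.add_location(run_id, datum_new)

# ... and update the same entry once the transfer has finished.
updatum = datum_new.copy()
updatum['status'] = 'transferred'
API.update_location(run_id, datum_new, updatum)
```
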
cax/tasks/process.py (10 changes: 2 additions & 8 deletions)
@@ -19,7 +19,6 @@

from cax import qsub, config
from cax.task import Task
from cax.api import api
from cax.dag_prescript import clear_errors

def verify():
@@ -187,9 +186,7 @@ def _process(name, in_location, host, pax_version,
class ProcessBatchQueue(Task):
"Create and submit job submission script."

def __init__(self):
self.API = api()

def __init__(self, use_api=False):
self.thishost = config.get_hostname()
self.pax_version = 'v%s' % pax.__version__

@@ -229,7 +226,7 @@ def __init__(self):
query["data"]["$elemMatch"] = {"host" : self.thishost,
"type" : "raw"}

Task.__init__(self, query = query)
Task.__init__(self, query=query, use_api=use_api)

def verify(self):
"""Verify processing worked"""
@@ -406,9 +403,6 @@ def each_run(self):

self.submit(out_location, ncpus, disable_updates, json_file)

#if config.DATABASE_LOG == True:
# self.API.add_location(self.run_doc['_id'], datum)

time.sleep(2)

def local_data_finder(self):
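
ProcessBatchQueue itself just accepts use_api and forwards it to Task.__init__, letting the base class decide which back end to query. Any other Task subclass that needs to run without pymongo could follow the same pattern; the class below is a hypothetical example, not part of this PR.

```python
# Hypothetical Task subclass following the same use_api pattern as ProcessBatchQueue.
from cax.task import Task

class ExampleOSGTask(Task):
    "Example task: log which matching runs have raw data available."

    def __init__(self, use_api=False):
        query = {'data': {'$elemMatch': {'type': 'raw'}}}   # illustrative query
        Task.__init__(self, query=query, use_api=use_api)

    def each_run(self):
        # self.run_doc was fetched via pymongo or the API, depending on use_api.
        self.log.info("run %s has raw data", self.run_doc.get('name'))
```
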