This repository has been archived by the owner on Dec 4, 2023. It is now read-only.

Bugs are fixed and a SQL database feature is added #30

Open · wants to merge 2 commits into base: master
30 changes: 27 additions & 3 deletions README.md
@@ -7,13 +7,25 @@ A simple crawler/parser for THREDDS catalogs
## Installation

```bash
git clone -b conn_sql https://github.com/ioos/thredds_crawler.git
cd thredds_crawler
python setup.py install
```

The `conn_sql` branch is installed from source; the released package remains available via `pip install thredds_crawler` or `conda install -c conda-forge thredds_crawler`.

PostgreSQL installation (optional):

```bash
brew install postgresql
```

To start PostgreSQL and create the database used by the crawler, run:

```bash
brew services start postgresql
export PGPORT=<port>
export PGHOST=<host>
export PGUSER=<username>
export PGDB=<name of database>
createdb -h $PGHOST -p $PGPORT -U $PGUSER $PGDB
```
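
To confirm the new database is reachable before running a crawl, a quick check like the one below can help. This is a minimal sketch, not part of thredds_crawler; it assumes the environment variables exported above and that local connections need no password.

```python
# Minimal connectivity check (illustrative only): connects with the
# PGHOST/PGPORT/PGUSER/PGDB values exported above, assuming passwordless local auth.
import os

from psycopg2 import connect

conn = connect(
    host=os.environ["PGHOST"],
    port=os.environ["PGPORT"],
    user=os.environ["PGUSER"],
    database=os.environ["PGDB"],
)
print("Connected to database:", conn.get_dsn_parameters()["dbname"])
conn.close()
```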

## Usage
@@ -166,6 +178,18 @@ c = Crawl(
)
```

### Connecting to a database

The output of the harvesting process can be imported directly into a PostgreSQL database. This makes it possible to compare two consecutive scans and identify duplicated metadata between them.

```python
from thredds_crawler.crawl import Crawl
c = Crawl(
'http://tds.maracoos.org/thredds/MODIS.xml',
    conn_database=["username", "host", "port", "database_name", "password", "web service type used for harvesting (e.g. 'iso' or 'opendap')"]
)
```
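
After a crawl has run with `conn_database`, the harvested endpoints can be read back from the `import_iso_address` table that the crawler creates. The snippet below is a sketch, reusing the placeholder connection values from the list above; the table and column names come from this branch's `crawl.py`.

```python
# Read back the rows written by the crawl above (illustrative only).
from psycopg2 import connect

conn = connect(user="username", password="password", host="host",
               port="port", database="database_name")
cur = conn.cursor()
cur.execute("SELECT dataset_id, dataset_url FROM import_iso_address ORDER BY dataset_id;")
for dataset_id, dataset_url in cur.fetchall():
    print(dataset_id, dataset_url)
cur.close()
conn.close()
```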


### Debugging

1 change: 1 addition & 0 deletions requirements.txt
@@ -2,3 +2,4 @@ requests
lxml
pytz
python-dateutil
psycopg2
67 changes: 62 additions & 5 deletions thredds_crawler/crawl.py
@@ -59,15 +59,37 @@ class Crawl(object):
'.*Constant Forecast Offset.*',
'.*Constant Forecast Date.*'
]

def __init__(self, catalog_url, select=None, skip=None, before=None, after=None, debug=None, workers=None, auth=None):
    def get_transaction_status(self, user, host, port, database, password):
        # Open a connection to PostgreSQL and report its transaction status.
        from psycopg2 import extensions, connect

        conn = connect(user=user,
                       password=password,
                       host=host,
                       port=port,
                       database=database)
        # Evaluate the status of the PostgreSQL connection
        if conn.status == extensions.STATUS_READY:
            print("psycopg2 status #1: Connection is ready for a transaction.")
        elif conn.status == extensions.STATUS_BEGIN:
            # STATUS_IN_TRANSACTION is an alias of STATUS_BEGIN in psycopg2
            print("psycopg2 status #2: An open transaction is in progress.")
        elif conn.status == extensions.STATUS_PREPARED:
            print("psycopg2 status #3: A transaction is in the second phase of a two-phase commit; "
                  "use tpc_commit() or tpc_rollback() to end it.")
        return conn

def __init__(self, catalog_url, select=None, skip=None, before=None, after=None, debug=None, workers=None, auth=None, conn_database=None):
"""
        :param list select: Dataset IDs. Python regex supported.
        :param list skip: Dataset names and/or catalogRef titles. Python regex supported.
        :param requests.auth.AuthBase auth: requests auth object to use
        :param list conn_database: [username, host, port, database_name, password, service type
            (e.g. "iso" or "opendap")] used to export harvested datasets to PostgreSQL (optional)
"""
workers = workers or 4
self.pool = mp.Pool(processes=workers)
self.pool = mp.get_context('fork').Pool(processes=workers)

if debug is True:
logger.setLevel(logging.DEBUG)
@@ -129,7 +151,40 @@ def __init__(self, catalog_url, select=None, skip=None, before=None, after=None,
d.metadata = etree.fromstring(d.metadata)

self.datasets = datasets

        if conn_database is not None:
            from psycopg2 import Error

            user, host, port, database, password, service = conn_database[:6]
            conn = self.get_transaction_status(user, host, port, database, password)
            cursor = conn.cursor()
            # Create the staging table on first use; UNIQUE dataset_url guards against duplicates
            cursor.execute(
                "CREATE TABLE IF NOT EXISTS import_iso_address ("
                "dataset_id VARCHAR(2500), "
                "dataset_name VARCHAR(2350), "
                "dataset_urlpath VARCHAR(2350), "
                "dataset_url VARCHAR(2500) UNIQUE NOT NULL);"
            )
            # Clear rows left over from a previous scan
            cursor.execute("DELETE FROM import_iso_address;")
            conn.commit()

            count = 0
            postgres_insert_query = (
                "INSERT INTO import_iso_address "
                "(dataset_id, dataset_name, dataset_urlpath, dataset_url) "
                "VALUES (%s, %s, %s, %s)"
            )
            for d in self.datasets:
                for s in d.services:
                    if s.get("service").lower() == service:
                        xml_url = s.get("url")
                        try:
                            cursor.execute(postgres_insert_query, (d.id, d.name, d.urlpath, xml_url))
                            conn.commit()
                            count += cursor.rowcount
                        except Error:
                            # Most likely a duplicate dataset_url; skip the row and keep going
                            conn.rollback()
            print(count, "records inserted into the import_iso_address table")
            cursor.close()
            conn.close()

def _get_catalog_url(self, url):
'''
Returns the appropriate catalog URL by replacing html with xml in some
@@ -253,6 +308,7 @@ def __init__(self, dataset_url, auth=None):
self.services = []
self.id = None
self.name = None
self.urlpath = None
self.catalog_url = None
self.data_size = None

@@ -267,6 +323,7 @@ def __init__(self, dataset_url, auth=None):
dataset = tree.find("{%s}dataset" % INV_NS)
self.id = dataset.get("ID")
self.name = dataset.get("name")
        urlpath = dataset.get("urlPath")
        # urlPath may be absent; strip the dataset name so urlpath keeps only the directory portion
        # (str.removesuffix requires Python 3.9+)
        self.urlpath = urlpath.removesuffix(self.name or "") if urlpath is not None else None
metadata = dataset.find("{%s}metadata" % INV_NS)
self.catalog_url = dataset_url.split("?")[0]

@@ -345,4 +402,4 @@ def size(self):
return None # We can't calculate

def __repr__(self):
return "<LeafDataset id: %s, name: %s, services: %s>" % (self.id, self.name, str([s.get("service") for s in self.services]))
return "<LeafDataset id: %s, name: %s, urlpath: %s, services: %s>" % (self.id, self.name, self.urlpath, str([s.get("service") for s in self.services]))