This repository has been archived by the owner on Dec 4, 2023. It is now read-only.

Bugs are fixed and a SQL database feature is added #30

Open · wants to merge 2 commits into base: master
30 changes: 27 additions & 3 deletions README.md
@@ -7,13 +7,25 @@ A simple crawler/parser for THREDDS catalogs
## Installation

```bash
git clone -b conn_sql https://github.com/ioos/thredds_crawler.git
cd thredds_crawler
python setup.py install
```

The `conn_sql` branch is installed from source; the released package remains available via `pip install thredds_crawler` or `conda install -c conda-forge thredds_crawler`.

PostgreSQL installation (optional):

```bash
brew install postgresql
```

To start PostgreSQL and create the database used by the crawler, run:

```bash
brew services start postgresql
export PGPORT=<port>
export PGHOST=<host>
export PGUSER=<username>
export PGDB=<name of database>
createdb -h $PGHOST -p $PGPORT -U $PGUSER $PGDB
```
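
To confirm the new database is reachable before running a crawl, a quick check like the one below can help. This is a minimal sketch, not part of thredds_crawler; it assumes the environment variables exported above and that local connections need no password.

```python
# Minimal connectivity check (illustrative only): connects with the
# PGHOST/PGPORT/PGUSER/PGDB values exported above, assuming passwordless local auth.
import os

from psycopg2 import connect

conn = connect(
    host=os.environ["PGHOST"],
    port=os.environ["PGPORT"],
    user=os.environ["PGUSER"],
    database=os.environ["PGDB"],
)
print("Connected to database:", conn.get_dsn_parameters()["dbname"])
conn.close()
```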

## Usage
@@ -166,6 +178,18 @@ c = Crawl(
)
```

### Connecting to a database

The output of the harvesting process can be imported directly into a PostgreSQL database. This makes it possible to compare two consecutive scans and identify duplicated metadata between them.

```python
from thredds_crawler.crawl import Crawl
c = Crawl(
'http://tds.maracoos.org/thredds/MODIS.xml',
    conn_database=["username", "host", "port", "database_name", "password", "web service type used for harvesting (e.g. 'iso' or 'opendap')"]
)
```
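
After a crawl has run with `conn_database`, the harvested endpoints can be read back from the `import_iso_address` table that the crawler creates. The snippet below is a sketch, reusing the placeholder connection values from the list above; the table and column names come from this branch's `crawl.py`.

```python
# Read back the rows written by the crawl above (illustrative only).
from psycopg2 import connect

conn = connect(user="username", password="password", host="host",
               port="port", database="database_name")
cur = conn.cursor()
cur.execute("SELECT dataset_id, dataset_url FROM import_iso_address ORDER BY dataset_id;")
for dataset_id, dataset_url in cur.fetchall():
    print(dataset_id, dataset_url)
cur.close()
conn.close()
```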


### Debugging

1 change: 1 addition & 0 deletions requirements.txt
@@ -2,3 +2,4 @@ requests
lxml
pytz
python-dateutil
psycopg2
67 changes: 62 additions & 5 deletions thredds_crawler/crawl.py
@@ -59,15 +59,37 @@ class Crawl(object):
'.*Constant Forecast Offset.*',
'.*Constant Forecast Date.*'
]

def __init__(self, catalog_url, select=None, skip=None, before=None, after=None, debug=None, workers=None, auth=None):
    def get_transaction_status(self, user, host, port, database, password):
        # Open a connection to PostgreSQL and report its transaction status.
        from psycopg2 import extensions, connect

        conn = connect(user=user,
                       password=password,
                       host=host,
                       port=port,
                       database=database)
        # Evaluate the status of the PostgreSQL connection
        if conn.status == extensions.STATUS_READY:
            print("psycopg2 status #1: Connection is ready for a transaction.")
        elif conn.status == extensions.STATUS_BEGIN:
            # STATUS_IN_TRANSACTION is an alias of STATUS_BEGIN in psycopg2
            print("psycopg2 status #2: An open transaction is in progress.")
        elif conn.status == extensions.STATUS_PREPARED:
            print("psycopg2 status #3: A transaction is in the second phase of a two-phase commit; "
                  "use tpc_commit() or tpc_rollback() to end it.")
        return conn

def __init__(self, catalog_url, select=None, skip=None, before=None, after=None, debug=None, workers=None, auth=None, conn_database=None):
"""
        :param list select: Dataset IDs. Python regex supported.
        :param list skip: Dataset names and/or catalogRef titles. Python regex supported.
        :param requests.auth.AuthBase auth: requests auth object to use
        :param list conn_database: [username, host, port, database_name, password, service type
            (e.g. "iso" or "opendap")] used to export harvested datasets to PostgreSQL (optional)
"""
workers = workers or 4
self.pool = mp.Pool(processes=workers)
self.pool = mp.get_context('fork').Pool(processes=workers)

if debug is True:
logger.setLevel(logging.DEBUG)
@@ -129,7 +151,40 @@ def __init__(self, catalog_url, select=None, skip=None, before=None, after=None,
d.metadata = etree.fromstring(d.metadata)

self.datasets = datasets

        if conn_database is not None:
            from psycopg2 import Error

            user, host, port, database, password, service = conn_database[:6]
            conn = self.get_transaction_status(user, host, port, database, password)
            cursor = conn.cursor()
            # Create the staging table on first use; UNIQUE dataset_url guards against duplicates
            cursor.execute(
                "CREATE TABLE IF NOT EXISTS import_iso_address ("
                "dataset_id VARCHAR(2500), "
                "dataset_name VARCHAR(2350), "
                "dataset_urlpath VARCHAR(2350), "
                "dataset_url VARCHAR(2500) UNIQUE NOT NULL);"
            )
            # Clear rows left over from a previous scan
            cursor.execute("DELETE FROM import_iso_address;")
            conn.commit()

            count = 0
            postgres_insert_query = (
                "INSERT INTO import_iso_address "
                "(dataset_id, dataset_name, dataset_urlpath, dataset_url) "
                "VALUES (%s, %s, %s, %s)"
            )
            for d in self.datasets:
                for s in d.services:
                    if s.get("service").lower() == service:
                        xml_url = s.get("url")
                        try:
                            cursor.execute(postgres_insert_query, (d.id, d.name, d.urlpath, xml_url))
                            conn.commit()
                            count += cursor.rowcount
                        except Error:
                            # Most likely a duplicate dataset_url; skip the row and keep going
                            conn.rollback()
            print(count, "records inserted into the import_iso_address table")
            cursor.close()
            conn.close()

def _get_catalog_url(self, url):
'''
Returns the appropriate catalog URL by replacing html with xml in some
@@ -253,6 +308,7 @@ def __init__(self, dataset_url, auth=None):
self.services = []
self.id = None
self.name = None
self.urlpath = None
self.catalog_url = None
self.data_size = None

@@ -267,6 +323,7 @@ def __init__(self, dataset_url, auth=None):
dataset = tree.find("{%s}dataset" % INV_NS)
self.id = dataset.get("ID")
self.name = dataset.get("name")
        urlpath = dataset.get("urlPath")
        # urlPath may be absent; strip the dataset name so urlpath keeps only the directory portion
        # (str.removesuffix requires Python 3.9+)
        self.urlpath = urlpath.removesuffix(self.name or "") if urlpath is not None else None
metadata = dataset.find("{%s}metadata" % INV_NS)
self.catalog_url = dataset_url.split("?")[0]

@@ -345,4 +402,4 @@ def size(self):
return None # We can't calculate

def __repr__(self):
return "<LeafDataset id: %s, name: %s, services: %s>" % (self.id, self.name, str([s.get("service") for s in self.services]))
return "<LeafDataset id: %s, name: %s, urlpath: %s, services: %s>" % (self.id, self.name, self.urlpath, str([s.get("service") for s in self.services]))