From 4f4e4c2e6e5c776a13ec93e1358757973aa20bf2 Mon Sep 17 00:00:00 2001 From: Kyle Wilcox Date: Mon, 3 Oct 2016 11:29:13 -0400 Subject: [PATCH 01/15] If we require lxml, remove the internal etree logic --- thredds_crawler/crawl.py | 2 +- thredds_crawler/etree.py | 20 -------------------- 2 files changed, 1 insertion(+), 21 deletions(-) delete mode 100644 thredds_crawler/etree.py diff --git a/thredds_crawler/crawl.py b/thredds_crawler/crawl.py index 7efc75e..806a5c4 100644 --- a/thredds_crawler/crawl.py +++ b/thredds_crawler/crawl.py @@ -1,4 +1,3 @@ -from thredds_crawler.etree import etree try: import urlparse from urllib import quote_plus @@ -11,6 +10,7 @@ import re from datetime import datetime import pytz +from lxml import etree from thredds_crawler.utils import construct_url from dateutil.parser import parse import multiprocessing as mp diff --git a/thredds_crawler/etree.py b/thredds_crawler/etree.py deleted file mode 100644 index 4dfea6d..0000000 --- a/thredds_crawler/etree.py +++ /dev/null @@ -1,20 +0,0 @@ -try: - from lxml import etree -except ImportError: - try: - # Python 2.5 - import xml.etree.cElementTree as etree - except ImportError: - try: - # Python 2.5 - import xml.etree.ElementTree as etree - except ImportError: - try: - # normal cElementTree install - import cElementTree as etree - except ImportError: - try: - # normal ElementTree install - import elementtree.ElementTree as etree - except ImportError: - raise RuntimeError('You need either lxml or ElementTree') From 969c374fe0c2e6f232e7dda819a0547c1bed868e Mon Sep 17 00:00:00 2001 From: Kyle Wilcox Date: Mon, 3 Oct 2016 11:29:21 -0400 Subject: [PATCH 02/15] Fix testing URL --- tests/test_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_crawler.py b/tests/test_crawler.py index 7ad1c08..431f65b 100644 --- a/tests/test_crawler.py +++ b/tests/test_crawler.py @@ -33,7 +33,7 @@ def test_regex_skips(self): assert len(c.datasets) == 0 def test_iso_links(self): - c = Crawl("http://thredds.axiomalaska.com/thredds/catalogs/global.html", debug=True) + c = Crawl("http://thredds.axiomdatascience.com/thredds/global.html", debug=True) isos = [s.get("url") for d in c.datasets for s in d.services if s.get("service").lower() == "iso"] assert "?dataset=" in isos[0] assert "&catalog=" in isos[0] From 8fe6d56bf7ae0dae00c29237fd7101931ab8ff2f Mon Sep 17 00:00:00 2001 From: Kyle Wilcox Date: Mon, 3 Oct 2016 11:29:35 -0400 Subject: [PATCH 03/15] Don't swallow logging during testing --- thredds_crawler/crawl.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/thredds_crawler/crawl.py b/thredds_crawler/crawl.py index 806a5c4..25be118 100644 --- a/thredds_crawler/crawl.py +++ b/thredds_crawler/crawl.py @@ -28,7 +28,6 @@ class NullHandler(logging.Handler): def emit(self, record): pass logger = logging.getLogger("thredds_crawler") -logger.addHandler(NullHandler()) def request_xml(url): @@ -66,6 +65,8 @@ def __init__(self, catalog_url, select=None, skip=None, before=None, after=None, formatter = logging.Formatter('%(asctime)s - [%(levelname)s] %(message)s') ch.setFormatter(formatter) logger.addHandler(ch) + else: + logger.addHandler(NullHandler()) # Only process these dataset IDs if select is not None: From 778b1f442fe7b35b8d155f7f8e1e9e285eee7fea Mon Sep 17 00:00:00 2001 From: Kyle Wilcox Date: Mon, 3 Oct 2016 11:30:33 -0400 Subject: [PATCH 04/15] Provide better error messages when a LeadDataset fails --- thredds_crawler/crawl.py | 85 +++++++++++++++++++++------------------- 1 file changed, 44 
insertions(+), 41 deletions(-) diff --git a/thredds_crawler/crawl.py b/thredds_crawler/crawl.py index 25be118..2e04e25 100644 --- a/thredds_crawler/crawl.py +++ b/thredds_crawler/crawl.py @@ -246,49 +246,52 @@ def __init__(self, dataset_url): except etree.XMLSyntaxError: logger.error("Error procesing %s, invalid XML" % dataset_url) else: - dataset = tree.find("{%s}dataset" % INV_NS) - self.id = dataset.get("ID") - self.name = dataset.get("name") - self.metadata = dataset.find("{%s}metadata" % INV_NS) - self.catalog_url = dataset_url.split("?")[0] - - # Data Size - http://www.unidata.ucar.edu/software/thredds/current/tds/catalog/InvCatalogSpec.html#dataSize - data_size = dataset.find("{%s}dataSize" % INV_NS) - if data_size is not None: - self.data_size = float(data_size.text) - data_units = data_size.get('units') - # Convert to MB - if data_units == "bytes": - self.data_size *= 1e-6 - elif data_units == "Kbytes": - self.data_size *= 0.001 - elif data_units == "Gbytes": - self.data_size /= 0.001 - elif data_units == "Tbytes": - self.data_size /= 1e-6 - - # Services - service_tag = dataset.find("{%s}serviceName" % INV_NS) - if service_tag is None: - service_tag = self.metadata.find("{%s}serviceName" % INV_NS) - service_name = service_tag.text - - for service in tree.findall(".//{%s}service[@name='%s']" % (INV_NS, service_name)): - if service.get("serviceType") == "Compound": - for s in service.findall("{%s}service" % INV_NS): - url = construct_url(dataset_url, s.get('base')) + dataset.get("urlPath") - if s.get("suffix") is not None: - url += s.get("suffix") + try: + dataset = tree.find("{%s}dataset" % INV_NS) + self.id = dataset.get("ID") + self.name = dataset.get("name") + self.metadata = dataset.find("{%s}metadata" % INV_NS) + self.catalog_url = dataset_url.split("?")[0] + + # Data Size - http://www.unidata.ucar.edu/software/thredds/current/tds/catalog/InvCatalogSpec.html#dataSize + data_size = dataset.find("{%s}dataSize" % INV_NS) + if data_size is not None: + self.data_size = float(data_size.text) + data_units = data_size.get('units') + # Convert to MB + if data_units == "bytes": + self.data_size *= 1e-6 + elif data_units == "Kbytes": + self.data_size *= 0.001 + elif data_units == "Gbytes": + self.data_size /= 0.001 + elif data_units == "Tbytes": + self.data_size /= 1e-6 + + # Services + service_tag = dataset.find("{%s}serviceName" % INV_NS) + if service_tag is None: + service_tag = self.metadata.find("{%s}serviceName" % INV_NS) + service_name = service_tag.text + + for service in tree.findall(".//{%s}service[@name='%s']" % (INV_NS, service_name)): + if service.get("serviceType") == "Compound": + for s in service.findall("{%s}service" % INV_NS): + url = construct_url(dataset_url, s.get('base')) + dataset.get("urlPath") + if s.get("suffix") is not None: + url += s.get("suffix") + # ISO like services need additional parameters + if s.get('name') in ["iso", "ncml", "uddc"]: + url += "?dataset=%s&catalog=%s" % (self.id, quote_plus(self.catalog_url)) + self.services.append( {'name' : s.get('name'), 'service' : s.get('serviceType'), 'url' : url } ) + else: + url = construct_url(dataset_url, service.get('base')) + dataset.get("urlPath") + service.get("suffix", "") # ISO like services need additional parameters - if s.get('name') in ["iso", "ncml", "uddc"]: + if service.get('name') in ["iso", "ncml", "uddc"]: url += "?dataset=%s&catalog=%s" % (self.id, quote_plus(self.catalog_url)) - self.services.append( {'name' : s.get('name'), 'service' : s.get('serviceType'), 'url' : url } ) - else: - url = 
construct_url(dataset_url, service.get('base')) + dataset.get("urlPath") + service.get("suffix", "") - # ISO like services need additional parameters - if service.get('name') in ["iso", "ncml", "uddc"]: - url += "?dataset=%s&catalog=%s" % (self.id, quote_plus(self.catalog_url)) - self.services.append( {'name' : service.get('name'), 'service' : service.get('serviceType'), 'url' : url } ) + self.services.append( {'name' : service.get('name'), 'service' : service.get('serviceType'), 'url' : url } ) + except BaseException as e: + logger.error('Could not process {}. {}.'.format(dataset_url, e)) @property def size(self): From 07dfb88e5dec06d6385f34b7088bc648d7f1210e Mon Sep 17 00:00:00 2001 From: Kyle Wilcox Date: Mon, 3 Oct 2016 11:30:52 -0400 Subject: [PATCH 05/15] Catch an error when there is no serviceName tag found on a LeafDataset --- thredds_crawler/crawl.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/thredds_crawler/crawl.py b/thredds_crawler/crawl.py index 2e04e25..b460c32 100644 --- a/thredds_crawler/crawl.py +++ b/thredds_crawler/crawl.py @@ -272,6 +272,8 @@ def __init__(self, dataset_url): service_tag = dataset.find("{%s}serviceName" % INV_NS) if service_tag is None: service_tag = self.metadata.find("{%s}serviceName" % INV_NS) + if service_tag is None: + raise ValueError("No serviceName definition found!") service_name = service_tag.text for service in tree.findall(".//{%s}service[@name='%s']" % (INV_NS, service_name)): From 80fd2d24ef06f2c06fd86dd6e76c885f51addf46 Mon Sep 17 00:00:00 2001 From: Kyle Wilcox Date: Mon, 3 Oct 2016 11:31:07 -0400 Subject: [PATCH 06/15] Add a test for Unidata's motherload catalog --- tests/test_crawler.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/test_crawler.py b/tests/test_crawler.py index 431f65b..5e692a9 100644 --- a/tests/test_crawler.py +++ b/tests/test_crawler.py @@ -71,3 +71,23 @@ def test_modified_time(self): def test_ssl(self): c = Crawl("https://opendap.co-ops.nos.noaa.gov/thredds/catalog/NOAA/DBOFS/MODELS/201501/catalog.xml", debug=True) assert len(c.datasets) > 0 + + def test_unidata_parse(self): + selects = [".*Best.*"] + skips = Crawl.SKIPS + [".*grib2", ".*grib1", ".*GrbF.*", ".*ncx2", + "Radar Data", "Station Data", + "Point Feature Collections", "Satellite Data", + "Unidata NEXRAD Composites \(GINI\)", + "Unidata case studies", + ".*Reflectivity-[0-9]{8}"] + c = Crawl( + 'http://thredds.ucar.edu/thredds/catalog.xml', + select=selects, + skip=skips, + debug=True + ) + + assert len(c.datasets) > 0 + + isos = [(d.id, s.get("url")) for d in c.datasets for s in d.services if s.get("service").lower() == "iso"] + assert len(isos) > 0 From 44d2a2bdb3ded6808e81aa0ed3f58ff6c3be1cc4 Mon Sep 17 00:00:00 2001 From: Kyle Wilcox Date: Mon, 3 Oct 2016 11:50:14 -0400 Subject: [PATCH 07/15] Add auto-tag-release plugin to travis --- .travis.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.travis.yml b/.travis.yml index 9fd187d..2452bfb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -30,3 +30,11 @@ install: - conda install -c conda-forge pytest script: py.test -rx -v + +deploy: + provider: releases + api_key: + secure: XAx2aeocMQWn2acXcQ5LevsO977glpvPKOnk/2yafHTMd+VROVy8jZjsVTTwOEhzag2xOYgTyDYbX5PRT2uG2Uz/RPwJA0PbB+9NIiT1gvHZ/sfFEm7AfOQ257I2IL72ZGUuSZoa0I1pZnIFaew84FZGQ/jsNtfWZzo1veXI6A0= + on: + tags: true + repo: ioos/thredds_crawler From 0bb582553976149751ce16aa47d5191b2e1a5ff6 Mon Sep 17 00:00:00 2001 From: Kyle Wilcox Date: Mon, 3 Oct 2016 11:52:05 -0400 Subject: [PATCH 08/15] Move tests folder 
under thredds_crawler to avoid a top level tests module --- {tests => thredds_crawler/tests}/__init__.py | 0 {tests => thredds_crawler/tests}/test_crawler.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename {tests => thredds_crawler/tests}/__init__.py (100%) rename {tests => thredds_crawler/tests}/test_crawler.py (100%) diff --git a/tests/__init__.py b/thredds_crawler/tests/__init__.py similarity index 100% rename from tests/__init__.py rename to thredds_crawler/tests/__init__.py diff --git a/tests/test_crawler.py b/thredds_crawler/tests/test_crawler.py similarity index 100% rename from tests/test_crawler.py rename to thredds_crawler/tests/test_crawler.py From b28b385aa6814ad529d29c9e24ce4a74ac9021e9 Mon Sep 17 00:00:00 2001 From: Kyle Wilcox Date: Mon, 3 Oct 2016 12:11:51 -0400 Subject: [PATCH 09/15] Don't duplicate logs when testing --- thredds_crawler/crawl.py | 4 ++-- thredds_crawler/tests/test_crawler.py | 23 +++++++++++++---------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/thredds_crawler/crawl.py b/thredds_crawler/crawl.py index b460c32..1bc0879 100644 --- a/thredds_crawler/crawl.py +++ b/thredds_crawler/crawl.py @@ -8,6 +8,7 @@ import os import sys import re +import logging from datetime import datetime import pytz from lxml import etree @@ -18,7 +19,6 @@ INV_NS = "http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0" XLINK_NS = "http://www.w3.org/1999/xlink" -import logging try: # Python >= 2.7 from logging import NullHandler @@ -27,7 +27,7 @@ class NullHandler(logging.Handler): def emit(self, record): pass -logger = logging.getLogger("thredds_crawler") +logger = logging.getLogger(__name__) def request_xml(url): diff --git a/thredds_crawler/tests/test_crawler.py b/thredds_crawler/tests/test_crawler.py index 5e692a9..f5e8530 100644 --- a/thredds_crawler/tests/test_crawler.py +++ b/thredds_crawler/tests/test_crawler.py @@ -2,8 +2,12 @@ from datetime import datetime, timedelta import pytz +import logging from thredds_crawler.crawl import Crawl +logger = logging.getLogger('thredds_crawler') +logger.setLevel(logging.DEBUG) +logger.handlers = [logging.StreamHandler()] class CrawlerTest(unittest.TestCase): @@ -33,43 +37,43 @@ def test_regex_skips(self): assert len(c.datasets) == 0 def test_iso_links(self): - c = Crawl("http://thredds.axiomdatascience.com/thredds/global.html", debug=True) + c = Crawl("http://thredds.axiomdatascience.com/thredds/global.html") isos = [s.get("url") for d in c.datasets for s in d.services if s.get("service").lower() == "iso"] assert "?dataset=" in isos[0] assert "&catalog=" in isos[0] def test_dataset_size_using_xml(self): - c = Crawl("http://tds.maracoos.org/thredds/catalog/MODIS-Composites-1Day/2014/catalog.xml", debug=True) + c = Crawl("http://tds.maracoos.org/thredds/catalog/MODIS-Composites-1Day/2014/catalog.xml") self.assertIsNotNone(c.datasets[0].size) def test_dataset_size_using_dap(self): - c = Crawl("http://tds.maracoos.org/thredds/MODIS.xml", select=["MODIS-One-Agg"], debug=True) + c = Crawl("http://tds.maracoos.org/thredds/MODIS.xml", select=["MODIS-One-Agg"]) self.assertIsNotNone(c.datasets[0].size) def test_modified_time(self): # after with timezone af = datetime(2015, 12, 30, 0, 0, tzinfo=pytz.utc) - c = Crawl("http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2015/catalog.xml", after=af, debug=True) + c = Crawl("http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2015/catalog.xml", after=af) assert len(c.datasets) == 3 # after without timezone af = datetime(2015, 
12, 30, 0, 0) - c = Crawl("http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2015/catalog.xml", after=af, debug=True) + c = Crawl("http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2015/catalog.xml", after=af) assert len(c.datasets) == 3 # before bf = datetime(2016, 1, 5, 0, 0) - c = Crawl("http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2016/catalog.xml", before=bf, debug=True) + c = Crawl("http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2016/catalog.xml", before=bf) assert len(c.datasets) == 3 # both af = datetime(2016, 1, 20, 0, 0) bf = datetime(2016, 2, 1, 0, 0) - c = Crawl("http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2016/catalog.xml", before=bf, after=af, debug=True) + c = Crawl("http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2016/catalog.xml", before=bf, after=af) assert len(c.datasets) == 11 def test_ssl(self): - c = Crawl("https://opendap.co-ops.nos.noaa.gov/thredds/catalog/NOAA/DBOFS/MODELS/201501/catalog.xml", debug=True) + c = Crawl("https://opendap.co-ops.nos.noaa.gov/thredds/catalog/NOAA/DBOFS/MODELS/201501/catalog.xml") assert len(c.datasets) > 0 def test_unidata_parse(self): @@ -83,8 +87,7 @@ def test_unidata_parse(self): c = Crawl( 'http://thredds.ucar.edu/thredds/catalog.xml', select=selects, - skip=skips, - debug=True + skip=skips ) assert len(c.datasets) > 0 From 5a36f00379d2fd01fe7639b6e7bef2b23a5dd505 Mon Sep 17 00:00:00 2001 From: Kyle Wilcox Date: Mon, 3 Oct 2016 12:12:09 -0400 Subject: [PATCH 10/15] Don't show SSL warnings --- thredds_crawler/crawl.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/thredds_crawler/crawl.py b/thredds_crawler/crawl.py index 1bc0879..8a8fd40 100644 --- a/thredds_crawler/crawl.py +++ b/thredds_crawler/crawl.py @@ -5,6 +5,7 @@ from urllib import parse as urlparse from urllib.parse import quote_plus import requests +from requests.packages.urllib3.exceptions import InsecureRequestWarning import os import sys import re @@ -18,6 +19,7 @@ INV_NS = "http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0" XLINK_NS = "http://www.w3.org/1999/xlink" +requests.packages.urllib3.disable_warnings(InsecureRequestWarning) try: # Python >= 2.7 From 71bb0fb656474d7d6c529b6861a4dc0e76744af9 Mon Sep 17 00:00:00 2001 From: Kyle Wilcox Date: Mon, 3 Oct 2016 12:12:54 -0400 Subject: [PATCH 11/15] Add a conda-recipe and test it in Travis --- .travis.yml | 6 +++++- conda-recipe/meta.yaml | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 conda-recipe/meta.yaml diff --git a/.travis.yml b/.travis.yml index 2452bfb..30fca7a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -29,7 +29,11 @@ install: - conda install -c conda-forge netCDF4 # For optional DAP file size calculation - conda install -c conda-forge pytest -script: py.test -rx -v +script: + - py.test -rx -v + - conda install -n root conda-build anaconda-client + - conda build conda-recipe --python $TRAVIS_PYTHON_VERSION + - conda install thredds_crawler --use-local deploy: provider: releases diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml new file mode 100644 index 0000000..36bf6e5 --- /dev/null +++ b/conda-recipe/meta.yaml @@ -0,0 +1,33 @@ +package: + name: thredds_crawler + version: "1.5.1" + +source: + path: ../ + +build: + number: 0 + script: python setup.py install --single-version-externally-managed --record=record.txt + +requirements: + build: + - python + - setuptools + - requests + - lxml + - 
pytz + run: + - python + - requests + - lxml + - netcdf4 + - pytz + +test: + imports: + - thredds_crawler + +about: + home: https://github.com/ioos/thredds_crawler + license: MIT License + summary: 'A Python library for crawling THREDDS servers' From a9813999a22f96a1e1802ecc5d106da8935704d2 Mon Sep 17 00:00:00 2001 From: Kyle Wilcox Date: Mon, 3 Oct 2016 12:15:32 -0400 Subject: [PATCH 12/15] Ignore pytest meta folders --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index c724ba7..99b69ee 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ dist build *.sublime* +.cache +__pycache__ From 103b28978961fe7b87dfef67382b6e1c08486f3a Mon Sep 17 00:00:00 2001 From: Kyle Wilcox Date: Mon, 3 Oct 2016 12:15:46 -0400 Subject: [PATCH 13/15] Bump to 1.5.2 --- VERSION | 2 +- conda-recipe/meta.yaml | 2 +- thredds_crawler/__init__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/VERSION b/VERSION index 26ca594..4cda8f1 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.5.1 +1.5.2 diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 36bf6e5..90dbcd6 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -1,6 +1,6 @@ package: name: thredds_crawler - version: "1.5.1" + version: "1.5.2" source: path: ../ diff --git a/thredds_crawler/__init__.py b/thredds_crawler/__init__.py index 77f1c8e..c3b3841 100644 --- a/thredds_crawler/__init__.py +++ b/thredds_crawler/__init__.py @@ -1 +1 @@ -__version__ = '1.5.0' +__version__ = '1.5.2' From aa767b4ad2d767aa1f79c3387cddd41b5e942d3f Mon Sep 17 00:00:00 2001 From: Kyle Wilcox Date: Mon, 3 Oct 2016 16:06:16 -0400 Subject: [PATCH 14/15] Imrove the readability of the README code samples --- README.md | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 0e47eb4..e0fdcb2 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,4 @@ -thredds_crawler -=============== +# thredds_crawler [![Build Status](https://travis-ci.org/ioos/thredds_crawler.svg?branch=master)](https://travis-ci.org/ioos/thredds_crawler) @@ -26,7 +25,7 @@ You can select datasets based on their THREDDS ID using the 'select' parameter. 
```python from thredds_crawler.crawl import Crawl -c = Crawl("http://tds.maracoos.org/thredds/MODIS.xml", select=[".*-Agg"]) +c = Crawl('http://tds.maracoos.org/thredds/MODIS.xml', select=[".*-Agg"]) print c.datasets [ , @@ -74,7 +73,11 @@ If you need to remove or add a new `skip`, it is **strongly** encouraged you use ```python from thredds_crawler.crawl import Crawl skips = Crawl.SKIPS + [".*-Day-Aggregation"] -c = Crawl("http://tds.maracoos.org/thredds/MODIS.xml", select=[".*-Agg"], skip=skips) +c = Crawl( + 'http://tds.maracoos.org/thredds/MODIS.xml', + select=[".*-Agg"], + skip=skips +) print c.datasets [ @@ -128,20 +131,22 @@ You can select data by the THREDDS `modified_time` by using a the `before` and ` import pytz from thredds_crawler.crawl import Crawl -# after +bf = datetime(2016, 1, 5, 0, 0) af = datetime(2015, 12, 30, 0, 0, tzinfo=pytz.utc) -c = Crawl("http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2015/catalog.xml", after=af) +url = 'http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2016/catalog.xml' + +# after +c = Crawl(url, after=af) assert len(c.datasets) == 3 # before -bf = datetime(2016, 1, 5, 0, 0) -c = Crawl("http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2016/catalog.xml", before=bf) +c = Crawl(url, before=bf) assert len(c.datasets) == 3 # both af = datetime(2016, 1, 20, 0, 0) bf = datetime(2016, 2, 1, 0, 0) -c = Crawl("http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2016/catalog.xml", before=bf, after=af) +c = Crawl(url, before=bf, after=af) assert len(c.datasets) == 11 ``` @@ -153,7 +158,12 @@ You can pass in a `debug=True` parameter to Crawl to log to STDOUT what is actua ```python from thredds_crawler.crawl import Crawl skips = Crawl.SKIPS + [".*-Day-Aggregation"] -c = Crawl("http://tds.maracoos.org/thredds/MODIS.xml", select=[".*-Agg"], skip=skips, debug=True) +c = Crawl( + 'http://tds.maracoos.org/thredds/MODIS.xml', + select=['.*-Agg'], + skip=skips, + debug=True +) Crawling: http://tds.maracoos.org/thredds/MODIS.xml Skipping catalogRef based on 'skips'. 
Title: MODIS Individual Files @@ -189,7 +199,7 @@ You can get some basic information about a LeafDataset, including the services a ```python from thredds_crawler.crawl import Crawl -c = Crawl("http://tds.maracoos.org/thredds/MODIS.xml", select=[".*-Agg"]) +c = Crawl('http://tds.maracoos.org/thredds/MODIS.xml', select=['.*-Agg']) dataset = c.datasets[0] print dataset.id MODIS-Agg @@ -214,7 +224,7 @@ If you have a list of datasets you can easily return all endpoints of a certain ```python from thredds_crawler.crawl import Crawl -c = Crawl("http://tds.maracoos.org/thredds/MODIS.xml", select=[".*-Agg"]) +c = Crawl('http://tds.maracoos.org/thredds/MODIS.xml', select=['.*-Agg']) urls = [s.get("url") for d in c.datasets for s in d.services if s.get("service").lower() == "opendap"] print urls [ @@ -236,7 +246,10 @@ This isn't necessarialy the size on disk, because it does not account for `missi ```python from thredds_crawler.crawl import Crawl -c = Crawl("http://thredds.axiomalaska.com/thredds/catalogs/cencoos.html", select=["MB_.*"]) +c = Crawl( + 'http://thredds.axiomalaska.com/thredds/catalogs/cencoos.html', + select=['MB_.*'] +) sizes = [d.size for d in c.datasets] print sizes [29247.410283999998, 72166.289680000002] @@ -249,7 +262,7 @@ The entire THREDDS catalog metadata record is saved along with the dataset objec ```python from thredds_crawler.crawl import Crawl -c = Crawl("http://tds.maracoos.org/thredds/MODIS.xml", select=[".*-Agg"]) +c = Crawl('http://tds.maracoos.org/thredds/MODIS.xml', select=['.*-Agg']) dataset = c.datasets[0] print dataset.metadata.find("{http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0}documentation").text Ocean Color data are provided as a service to the broader community, and can be From 6a6c67a8909a20e809e07476b036f6db50b71d1d Mon Sep 17 00:00:00 2001 From: Kyle Wilcox Date: Mon, 3 Oct 2016 16:06:44 -0400 Subject: [PATCH 15/15] Adds authentication options. fixes #11. --- README.md | 16 ++++++++++++++++ thredds_crawler/crawl.py | 29 ++++++++++++++++------------- 2 files changed, 32 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index e0fdcb2..7793932 100644 --- a/README.md +++ b/README.md @@ -151,6 +151,22 @@ assert len(c.datasets) == 11 ``` +### Authentication + +You can pass an auth parameter as needed. It needs to be a [requests compatible auth object](http://docs.python-requests.org/en/latest/user/authentication/). + +```python +from thredds_crawler.crawl import Crawl +auth = ('user', 'password') +c = Crawl( + 'http://tds.maracoos.org/thredds/MODIS.xml', + select=['.*-Agg'], + skip=Crawl.SKIPS, + auth=auth +) +``` + + ### Debugging You can pass in a `debug=True` parameter to Crawl to log to STDOUT what is actually happening. 
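
Editor's note on the `debug=True` flag mentioned above: PATCH 03/15 and PATCH 09/15 change the crawler to add a `StreamHandler` only when `debug=True` and to use a module-level logger, so logging can also be configured externally. A minimal sketch, mirroring the handler setup the test suite uses after these patches (the logger name comes from the package, everything else is ordinary stdlib `logging`):

```python
# Sketch: configure thredds_crawler logging yourself instead of passing debug=True.
# Mirrors the test-suite setup introduced in PATCH 09/15; not part of the patches themselves.
import logging

logger = logging.getLogger('thredds_crawler')   # parent of the module loggers used by crawl.py
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler())      # crawl progress messages propagate up to this handler
```
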
diff --git a/thredds_crawler/crawl.py b/thredds_crawler/crawl.py index 8a8fd40..c491ca4 100644 --- a/thredds_crawler/crawl.py +++ b/thredds_crawler/crawl.py @@ -32,32 +32,34 @@ def emit(self, record): logger = logging.getLogger(__name__) -def request_xml(url): +def request_xml(url, auth=None): ''' Returns an etree.XMLRoot object loaded from the url :param str url: URL for the resource to load as an XML ''' try: - r = requests.get(url, verify=False) + r = requests.get(url, auth=auth, verify=False) return r.text.encode('utf-8') except BaseException: logger.error("Skipping %s (error parsing the XML)" % url) return -def make_leaf(url): - return LeafDataset(url) +def make_leaf(url, auth): + return LeafDataset(url, auth=auth) class Crawl(object): SKIPS = [".*files.*", ".*Individual Files.*", ".*File_Access.*", ".*Forecast Model Run.*", ".*Constant Forecast Offset.*", ".*Constant Forecast Date.*"] - def __init__(self, catalog_url, select=None, skip=None, before=None, after=None, debug=None, workers=4): + def __init__(self, catalog_url, select=None, skip=None, before=None, after=None, debug=None, workers=None, auth=None): """ - select: a list of dataset IDs. Python regex supported. - skip: list of dataset names and/or a catalogRef titles. Python regex supported. + :param select list: Dataset IDs. Python regex supported. + :param list skip: Dataset names and/or a catalogRef titles. Python regex supported. + :param requests.auth.AuthBase auth: requets auth object to use """ + workers = workers or 4 self.pool = mp.Pool(processes=workers) if debug is True: @@ -105,9 +107,9 @@ def __init__(self, catalog_url, select=None, skip=None, before=None, after=None, self.visited = [] datasets = [] - urls = list(self._run(url=catalog_url)) + urls = list(self._run(url=catalog_url, auth=auth)) - jobs = [self.pool.apply_async(make_leaf, args=(url,)) for url in urls] + jobs = [self.pool.apply_async(make_leaf, args=(url, auth)) for url in urls] datasets = [j.get() for j in jobs] self.datasets = [ x for x in datasets if x.id is not None ] @@ -185,11 +187,12 @@ def _compile_references(self, url, tree): references.append(construct_url(url, ref.get("{%s}href" % XLINK_NS))) return references - def _run(self, url): + def _run(self, url, auth): ''' Performs a multiprocess depth-first-search of the catalog references and yields a URL for each leaf dataset found :param str url: URL for the current catalog + :param requests.auth.AuthBase auth: requets auth object to use ''' if url in self.visited: logger.debug("Skipping %s (already crawled)" % url) @@ -200,7 +203,7 @@ def _run(self, url): url = self._get_catalog_url(url) # Get an etree object - xml_content = request_xml(url) + xml_content = request_xml(url, auth) for ds in self._build_catalog(url, xml_content): yield ds @@ -232,7 +235,7 @@ def _build_catalog(self, url, xml_content): class LeafDataset(object): - def __init__(self, dataset_url): + def __init__(self, dataset_url, auth=None): self.services = [] self.id = None @@ -242,7 +245,7 @@ def __init__(self, dataset_url): self.data_size = None # Get an etree object - r = requests.get(dataset_url, verify=False) + r = requests.get(dataset_url, auth=auth, verify=False) try: tree = etree.XML(r.text.encode('utf-8')) except etree.XMLSyntaxError:
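
For reference, a usage sketch of the `auth` keyword introduced in PATCH 15/15. The credentials are placeholders and the catalog URL is taken from the README examples; any requests-compatible auth object (a plain `('user', 'password')` tuple, as in the README addition, or `HTTPBasicAuth`) is simply forwarded to the `requests.get()` calls inside the crawler:

```python
# Hedged usage sketch for the auth support added in PATCH 15/15.
# Credentials below are placeholders, not real accounts.
from requests.auth import HTTPBasicAuth
from thredds_crawler.crawl import Crawl

auth = HTTPBasicAuth('user', 'password')   # any requests-compatible auth object works
c = Crawl(
    'http://tds.maracoos.org/thredds/MODIS.xml',
    select=['.*-Agg'],
    skip=Crawl.SKIPS,
    auth=auth,                              # passed through to every requests.get() the crawler makes
)
print([d.id for d in c.datasets])
```
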