This repository has been archived by the owner on Dec 4, 2023. It is now read-only.

Commit

Merge pull request #22 from kwilcox/master
Better LeafDataset error messages from workers, etree cleanup, get tests passing
kwilcox authored Oct 6, 2016
2 parents c43d56d + 6a6c67a commit 4433a9f
Showing 10 changed files with 193 additions and 103 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -3,3 +3,5 @@
dist
build
*.sublime*
.cache
__pycache__
14 changes: 13 additions & 1 deletion .travis.yml
@@ -29,4 +29,16 @@ install:
  - conda install -c conda-forge netCDF4 # For optional DAP file size calculation
  - conda install -c conda-forge pytest

-script: py.test -rx -v
+script:
+  - py.test -rx -v
+  - conda install -n root conda-build anaconda-client
+  - conda build conda-recipe --python $TRAVIS_PYTHON_VERSION
+  - conda install thredds_crawler --use-local
+
+deploy:
+  provider: releases
+  api_key:
+    secure: XAx2aeocMQWn2acXcQ5LevsO977glpvPKOnk/2yafHTMd+VROVy8jZjsVTTwOEhzag2xOYgTyDYbX5PRT2uG2Uz/RPwJA0PbB+9NIiT1gvHZ/sfFEm7AfOQ257I2IL72ZGUuSZoa0I1pZnIFaew84FZGQ/jsNtfWZzo1veXI6A0=
+  on:
+    tags: true
+    repo: ioos/thredds_crawler
57 changes: 43 additions & 14 deletions README.md
@@ -1,5 +1,4 @@
-thredds_crawler
-===============
+# thredds_crawler

[![Build Status](https://travis-ci.org/ioos/thredds_crawler.svg?branch=master)](https://travis-ci.org/ioos/thredds_crawler)

@@ -26,7 +25,7 @@ You can select datasets based on their THREDDS ID using the 'select' parameter.

```python
from thredds_crawler.crawl import Crawl
c = Crawl("http://tds.maracoos.org/thredds/MODIS.xml", select=[".*-Agg"])
c = Crawl('http://tds.maracoos.org/thredds/MODIS.xml', select=[".*-Agg"])
print c.datasets
[
<LeafDataset id: MODIS-Agg, name: MODIS-Complete Aggregation, services: ['OPENDAP', 'ISO']>,
@@ -74,7 +73,11 @@ If you need to remove or add a new `skip`, it is **strongly** encouraged that you use the `Crawl.SKIPS` class variable as a starting point:
```python
from thredds_crawler.crawl import Crawl
skips = Crawl.SKIPS + [".*-Day-Aggregation"]
c = Crawl("http://tds.maracoos.org/thredds/MODIS.xml", select=[".*-Agg"], skip=skips)
c = Crawl(
'http://tds.maracoos.org/thredds/MODIS.xml',
select=[".*-Agg"],
skip=skips
)
print c.datasets

[
@@ -128,32 +131,55 @@ You can select data by the THREDDS `modified_time` by using the `before` and `after` parameters.
import pytz
from thredds_crawler.crawl import Crawl

-# after
+bf = datetime(2016, 1, 5, 0, 0)
af = datetime(2015, 12, 30, 0, 0, tzinfo=pytz.utc)
-c = Crawl("http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2015/catalog.xml", after=af)
+url = 'http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2016/catalog.xml'
+
+# after
+c = Crawl(url, after=af)
assert len(c.datasets) == 3

# before
-bf = datetime(2016, 1, 5, 0, 0)
-c = Crawl("http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2016/catalog.xml", before=bf)
+c = Crawl(url, before=bf)
assert len(c.datasets) == 3

# both
af = datetime(2016, 1, 20, 0, 0)
bf = datetime(2016, 2, 1, 0, 0)
-c = Crawl("http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2016/catalog.xml", before=bf, after=af)
+c = Crawl(url, before=bf, after=af)
assert len(c.datasets) == 11
```
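
As an extra illustration (a sketch, not part of the original README), the same `after` parameter can drive a rolling window, for example crawling only datasets modified within the last seven days:

```python
from datetime import datetime, timedelta

import pytz
from thredds_crawler.crawl import Crawl

url = 'http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2016/catalog.xml'

# A timezone-aware "seven days ago", mirroring the tz-aware `af` value above
af = datetime.now(pytz.utc) - timedelta(days=7)
c = Crawl(url, after=af)
print len(c.datasets)
```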


### Authentication

You can pass in an `auth` parameter as needed. It must be a [requests-compatible auth object](http://docs.python-requests.org/en/latest/user/authentication/).

```python
from thredds_crawler.crawl import Crawl
auth = ('user', 'password')
c = Crawl(
    'http://tds.maracoos.org/thredds/MODIS.xml',
    select=['.*-Agg'],
    skip=Crawl.SKIPS,
    auth=auth
)
```
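
The tuple above is the `requests` shorthand for HTTP Basic authentication; an equivalent sketch (illustrative, not from the original README) with an explicit auth object, which is also how you would swap in something like `HTTPDigestAuth`:

```python
from requests.auth import HTTPBasicAuth
from thredds_crawler.crawl import Crawl

# Equivalent to auth=('user', 'password') above
auth = HTTPBasicAuth('user', 'password')
c = Crawl(
    'http://tds.maracoos.org/thredds/MODIS.xml',
    select=['.*-Agg'],
    auth=auth
)
```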


### Debugging

You can pass in a `debug=True` parameter to `Crawl` to log to STDOUT what is actually happening.

```python
from thredds_crawler.crawl import Crawl
skips = Crawl.SKIPS + [".*-Day-Aggregation"]
c = Crawl("http://tds.maracoos.org/thredds/MODIS.xml", select=[".*-Agg"], skip=skips, debug=True)
c = Crawl(
'http://tds.maracoos.org/thredds/MODIS.xml',
select=['.*-Agg'],
skip=skips,
debug=True
)

Crawling: http://tds.maracoos.org/thredds/MODIS.xml
Skipping catalogRef based on 'skips'. Title: MODIS Individual Files
@@ -189,7 +215,7 @@ You can get some basic information about a LeafDataset, including the services available.

```python
from thredds_crawler.crawl import Crawl
c = Crawl("http://tds.maracoos.org/thredds/MODIS.xml", select=[".*-Agg"])
c = Crawl('http://tds.maracoos.org/thredds/MODIS.xml', select=['.*-Agg'])
dataset = c.datasets[0]
print dataset.id
MODIS-Agg
@@ -214,7 +240,7 @@ If you have a list of datasets you can easily return all endpoints of a certain type:

```python
from thredds_crawler.crawl import Crawl
c = Crawl("http://tds.maracoos.org/thredds/MODIS.xml", select=[".*-Agg"])
c = Crawl('http://tds.maracoos.org/thredds/MODIS.xml', select=['.*-Agg'])
urls = [s.get("url") for d in c.datasets for s in d.services if s.get("service").lower() == "opendap"]
print urls
[
@@ -236,7 +262,10 @@ This isn't necessarily the size on disk, because it does not account for `missing_value` and `_FillValue` space.

```python
from thredds_crawler.crawl import Crawl
c = Crawl("http://thredds.axiomalaska.com/thredds/catalogs/cencoos.html", select=["MB_.*"])
c = Crawl(
'http://thredds.axiomalaska.com/thredds/catalogs/cencoos.html',
select=['MB_.*']
)
sizes = [d.size for d in c.datasets]
print sizes
[29247.410283999998, 72166.289680000002]
@@ -249,7 +278,7 @@ The entire THREDDS catalog metadata record is saved along with the dataset object.

```python
from thredds_crawler.crawl import Crawl
c = Crawl("http://tds.maracoos.org/thredds/MODIS.xml", select=[".*-Agg"])
c = Crawl('http://tds.maracoos.org/thredds/MODIS.xml', select=['.*-Agg'])
dataset = c.datasets[0]
print dataset.metadata.find("{http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0}documentation").text
Ocean Color data are provided as a service to the broader community, and can be
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
-1.5.1
+1.5.2
33 changes: 33 additions & 0 deletions conda-recipe/meta.yaml
@@ -0,0 +1,33 @@
package:
  name: thredds_crawler
  version: "1.5.2"

source:
  path: ../

build:
  number: 0
  script: python setup.py install --single-version-externally-managed --record=record.txt

requirements:
  build:
    - python
    - setuptools
    - requests
    - lxml
    - pytz
  run:
    - python
    - requests
    - lxml
    - netcdf4
    - pytz

test:
  imports:
    - thredds_crawler

about:
  home: https://github.com/ioos/thredds_crawler
  license: MIT License
  summary: 'A Python library for crawling THREDDS servers'
2 changes: 1 addition & 1 deletion thredds_crawler/__init__.py
@@ -1 +1 @@
-__version__ = '1.5.0'
+__version__ = '1.5.2'