From 5502cc1153f3012e4b53a68238afaa34744ee610 Mon Sep 17 00:00:00 2001 From: Dustin Ingram Date: Fri, 31 Jul 2015 17:17:40 -0400 Subject: [PATCH] Add S3File input type --- README.rst | 106 +++++++++++++++++++++-------------- setup.py | 4 +- vladiate/inputs.py | 34 +++++++++++ vladiate/test/test_inputs.py | 23 ++++++++ vladiate/vlad.py | 23 ++++---- 5 files changed, 135 insertions(+), 55 deletions(-) create mode 100644 vladiate/test/test_inputs.py diff --git a/README.rst b/README.rst index 529f579..c18952c 100644 --- a/README.rst +++ b/README.rst @@ -10,18 +10,15 @@ file. Features -------- -- **Write validation schemas in plain-old Python** +**Write validation schemas in plain-old Python** + No UI, no XML, no JSON, just code. -No UI, no XML, no JSON, just code. +**Write your own validators** + Vladiate comes with a few by default, but there's no reason you can't write + your own. -- **Write your own validators** - -Vladiate comes with a few by default, but there's no reason you can't -write your own. - -- **Validate multiple files at once** - -Either with the same schema, or different ones. +**Validate multiple files at once** + Either with the same schema, or different ones. Documentation ------------- @@ -188,63 +185,88 @@ Built-in Validators Vladiate comes with a few common validators built-in: -- *class* ``Validator`` +*class* ``Validator`` + + Generic validator. Should be subclassed by any custom validators. Not to + be used directly. + +*class* ``CastValidator`` + + Generic "can-be-cast-to-x" validator. Should be subclassed by any + cast-test validator. Not to be used directly. + +*class* ``IntValidator`` + + Validates whether a field can be cast to an ``int`` type or not. + + :``empty_ok=False``: + Specify whether a field which is an empty string should be ignored. -Generic validator. Should be subclassed by any custom validators. Not to -be used directly. 
+*class* ``FloatValidator`` -- *class* ``CastValidator`` + Validates whether a field can be cast to a ``float`` type or not. -Generic "can-be-cast-to-x" validator. Should be subclassed by any -cast-test validator. Not to be used directly. + :``empty_ok=False``: + Specify whether a field which is an empty string should be ignored. -- *class* ``IntValidator`` +*class* ``SetValidator`` -Validates whether a field can be cast to an ``int`` type or not. + Validates whether a field is in the specified set of possible fields. -- ``empty_ok=False`` + :``valid_set=[]``: + List of valid possible fields + :``empty_ok=False``: + Implicitly adds the empty string to the specified set. - Specify whether a field which is an empty string should be ignored. +*class* ``UniqueValidator`` -- *class* ``FloatValidator`` + Ensures that a given field is not repeated in any other column. Can + optionally determine "uniqueness" with other fields in the row as well via + ``unique_with``. -Validates whether a field can be cast to an ``float`` type or not. + :``unique_with=[]``: + List of field names to make the primary field unique with. -- ``empty_ok=False`` +*class* ``EmptyValidator`` - Specify whether a field which is an empty string should be ignored. + Ensure that a field is always empty. Essentially the same as an empty + ``SetValidator``. This is used by default when a field has no + validators. -- *class* ``SetValidator`` +*class* ``Ignore`` -Validates whether a field is in the specified set of possible fields. + Always passes validation. Used to explicitly ignore a given column. -- ``valid_set=[]`` +Built-in Input Types +^^^^^^^^^^^^^^^^^^^^ - List of valid possible fields +Vladiate comes with the following input types: -- ``empty_ok=False`` +*class* ``VladInput`` - Implicity adds the empty string to the specified set. + Generic input. Should be subclassed by any custom inputs. Not to be used + directly. 
-- *class* ``UniqueValidator`` +*class* ``LocalFile`` -Ensures that a given field is not repeated in any other column. Can -optionally determine "uniqueness" with other fields in the row as well -via ``unique_with``. + Read from a file local to the filesystem. -- ``unique_with=[]`` + :``filename``: + Path to a local CSV file. - List of field names to make the primary field unique with. +*class* ``S3File`` -- *class* ``EmptyValidator`` + Read from a file in S3. Uses the `boto <https://github.com/boto/boto>`_ + library. Specify either a full path, or a bucket/key pair. -Ensure that a field is always empty. Essentially the same as an empty -``SetValidator``. This is used by default when a field has no -validators. + :``path=None``: + A full S3 filepath (e.g., ``s3://foo.bar/path/to/file.csv``) -- *class* ``Ignore`` + :``bucket=None``: + S3 bucket. Must be specified with a ``key``. -Always passes validation. Used to explicity ignore a given column. + :``key=None``: + S3 key. Must be specified with a ``bucket``. 
Testing ~~~~~~~ diff --git a/setup.py b/setup.py index 0b441dd..5f8d081 100644 --- a/setup.py +++ b/setup.py @@ -25,10 +25,12 @@ def run_tests(self): errno = pytest.main(self.pytest_args) sys.exit(errno) + def readme(): with open('README.rst') as f: return f.read() + setup( name='vladiate', version=version, @@ -60,7 +62,7 @@ def readme(): packages=find_packages(exclude=['examples', 'tests']), include_package_data=True, zip_safe=False, - install_requires=[], + install_requires=['boto'], tests_require=['pytest'], cmdclass={'test': PyTest}, entry_points={ diff --git a/vladiate/inputs.py b/vladiate/inputs.py index 9615e18..f7f2089 100644 --- a/vladiate/inputs.py +++ b/vladiate/inputs.py @@ -1,3 +1,8 @@ +import io +import boto +from urlparse import urlparse + + class VladInput(object): ''' A generic input class ''' @@ -22,3 +27,32 @@ def open(self): def __repr__(self): return "{}('{}')".format(self.__class__.__name__, self.filename) + + +class S3File(VladInput): + ''' Read from a file in S3, given a full path or a bucket/key pair ''' + + def __init__(self, path=None, bucket=None, key=None): + if path and not any((bucket, key)): + self.path = path + parse_result = urlparse(path) + self.bucket = parse_result.netloc + self.key = parse_result.path + elif all((bucket, key)): + self.bucket = bucket + self.key = key + self.path = "s3://{}{}".format(bucket, key) + else: + raise ValueError( + "Either 'path' argument or 'bucket' and 'key' argument must be set.") + + def open(self): + s3 = boto.connect_s3() + bucket = s3.get_bucket(self.bucket) + key = bucket.new_key(self.key) + contents = key.get_contents_as_string() + ret = io.BytesIO(bytes(contents)) + return ret + + def __repr__(self): + return "{}('{}')".format(self.__class__.__name__, self.path) diff --git a/vladiate/test/test_inputs.py b/vladiate/test/test_inputs.py new file mode 100644 index 0000000..a4206e8 --- /dev/null +++ b/vladiate/test/test_inputs.py @@ -0,0 +1,23 @@ +import pytest + +from ..inputs import * + + +@pytest.mark.parametrize('kwargs', [ + 
({'path':'s3://some.bucket/some/s3/key.csv'}), + ({'bucket':'some.bucket', 'key':'/some/s3/key.csv'}), +]) +def test_float_validator_works(kwargs): + S3File(**kwargs) + + +@pytest.mark.parametrize('kwargs', [ + ({}), + ({'path':'s3://some.bucket/some/s3/key.csv', 'bucket':'some.bucket'}), + ({'path':'s3://some.bucket/some/s3/key.csv', 'key':'/some/s3/key.csv'}), + ({'bucket':'some.bucket'}), + ({'key':'/some/s3/key.csv'}), +]) +def test_float_validator_fails(kwargs): + with pytest.raises(ValueError): + S3File(**kwargs) diff --git a/vladiate/vlad.py b/vladiate/vlad.py index 7d55da2..d860250 100644 --- a/vladiate/vlad.py +++ b/vladiate/vlad.py @@ -49,18 +49,17 @@ def validate(self): for field, value in self.validators.iteritems() if not value }) - with self.source.open() as csvfile: - reader = csv.DictReader(csvfile) - self.missing_fields = set(reader.fieldnames) - set(self.validators) - if not self.missing_fields: - for line, row in enumerate(reader): - for field_name, field in row.iteritems(): - for validator in self.validators[field_name]: - try: - validator.validate(field, row=row) - except ValidationException, e: - self.failures[field_name][line].append(e) - validator.fail_count += 1 + reader = csv.DictReader(self.source.open()) + self.missing_fields = set(reader.fieldnames) - set(self.validators) + if not self.missing_fields: + for line, row in enumerate(reader): + for field_name, field in row.iteritems(): + for validator in self.validators[field_name]: + try: + validator.validate(field, row=row) + except ValidationException, e: + self.failures[field_name][line].append(e) + validator.fail_count += 1 if self.missing_fields: self.logger.info("\033[1;33m" + "Missing..." + "\033[0m")