Skip to content
This repository has been archived by the owner on Jul 7, 2022. It is now read-only.

Add preliminary support for converting JSON Schema to Apache Parquet schema. #19

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -97,3 +97,6 @@ ENV/

# mypy
.mypy_cache/

# vscode
.vscode/
17 changes: 14 additions & 3 deletions aptos/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
from .parser import SchemaParser
from .primitive import Object
from .visitor import ValidationVisitor
from .schema.visitor import AvroSchemaVisitor
from .schema.visitor import AvroSchemaVisitor, ParquetSchemaVisitor
from .jinja_helper import JinjaHelper


def validate(arguments):
Expand All @@ -30,8 +31,18 @@ def convert(arguments):
sys.exit(colored('error', 'red') + ' cannot convert schema {!r} into {!r} format, schema must be of type "object"'.format(arguments.schema, arguments.format)) # noqa: E501
Visitor = {
'avro': AvroSchemaVisitor,
'parquet': ParquetSchemaVisitor
}[arguments.format]
print(json.dumps(component.accept(Visitor()), indent=2))
converted_schema = component.accept(Visitor())
print_schema(arguments.format, converted_schema)


def print_schema(schema_arg, converted_schema):
    """Print a converted schema to stdout in the layout of its target format.

    Args:
        schema_arg: Name of the target format, either 'avro' or 'parquet'.
        converted_schema: The value produced by the format's visitor. For
            'avro' it is serialised with ``json.dumps``; for 'parquet' it is
            handed to the 'inner_schema.jinja2' template as ``body``.

    Raises:
        ValueError: If ``schema_arg`` is not one of the supported formats.
    """
    if schema_arg == 'avro':
        rendered = json.dumps(converted_schema, indent=2)
    elif schema_arg == 'parquet':
        template = JinjaHelper.get_template('inner_schema.jinja2')
        rendered = template.render(body=converted_schema)
    else:
        # An unknown format used to fall through and print nothing at all,
        # which hides a wiring mistake between the CLI choices and this
        # function; fail loudly instead.
        raise ValueError(
            'unsupported schema format: {!r}'.format(schema_arg))
    print(rendered)


def main():
Expand All @@ -57,7 +68,7 @@ def main():
'convert', help='''
Convert a JSON Schema into a different data-interchange format''')
conversion.add_argument(
'-format', type=str, choices=['avro'], help='data-interchange format')
'-format', type=str, choices=['avro', 'parquet'], help='data-interchange format')
conversion.set_defaults(func=convert)

parser.add_argument(
Expand Down
16 changes: 16 additions & 0 deletions aptos/jinja_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from jinja2 import Environment, FileSystemLoader

class JinjaHelper:
    """Holder for one lazily created Jinja2 environment shared via the class."""

    # Shared Environment; stays None until the first template is requested.
    jinja_env = None

    @classmethod
    def create_jinja_env(cls):
        """Build the Jinja2 environment and store it on ``JinjaHelper``."""
        # Local import: this module's top-level imports only cover jinja2.
        import os

        # 'templates' (resolved against the current working directory) stays
        # first so existing invocations behave exactly as before. The second
        # entry points at the 'templates' directory that sits beside this
        # module's package directory, so lookups no longer depend on the
        # directory the command was started from.
        module_dir = os.path.dirname(os.path.abspath(__file__))
        checkout_templates = os.path.join(
            os.path.dirname(module_dir), 'templates')
        file_loader = FileSystemLoader(['templates', checkout_templates])
        JinjaHelper.jinja_env = Environment(loader=file_loader)

    @staticmethod
    def get_template(template_name):
        """Return the template called ``template_name``.

        The shared environment is created on first use and reused on every
        later call.
        """
        if JinjaHelper.jinja_env is None:
            JinjaHelper.create_jinja_env()
        return JinjaHelper.jinja_env.get_template(template_name)
31 changes: 31 additions & 0 deletions aptos/schema/visitor.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from ..primitive import Array, Object, Reference, Enumeration
from ..jinja_helper import JinjaHelper


class AvroSchemaVisitor:
Expand Down Expand Up @@ -54,3 +55,33 @@ def visit_reference(self, reference, *args):

def visit_union(self, union, *args):
    """Return a mapping whose 'type' entry is the union's ``type`` attribute.

    Extra positional arguments are accepted and ignored.
    """
    union_schema = {}
    union_schema['type'] = union.type
    return union_schema

##-------------------------------------##

class ParquetSchemaVisitor:
    """Visitor that renders a schema object tree as Parquet schema text.

    Scalar visit methods return a field template with ``{repetition}`` and
    ``{name}`` placeholders; the enclosing object fills them in, because only
    the object knows the property name and its ``required`` list.
    """

    def determine_repetition(self, name, required_fields):
        """Return 'required' if ``name`` is in ``required_fields``.

        Any other case, including a ``None`` or empty ``required_fields``,
        yields 'optional'.
        """
        return 'required' if name in (required_fields or ()) else 'optional'

    def visit_string(self, string, *args):
        """Return the unfilled field template for a string property."""
        return '{repetition} binary {name} (UTF8);\n'

    def visit_integer(self, integer, *args):
        """Return the unfilled field template for an integer property."""
        return '{repetition} int32 {name};\n'

    def visit_object(self, obj, *args):
        """Render ``obj`` and everything nested in it as a Parquet group.

        The object visited here has no enclosing property, so the group is
        named after ``obj.title``.
        """
        # NOTE(review): looking the title up in the object's own ``required``
        # list is kept from the original for the outermost object only; it
        # will normally yield 'optional'. Confirm what the root should use.
        repetition = self.determine_repetition(obj.title, obj.required)
        return self._render_group(obj, obj.title, repetition, *args)

    def _render_group(self, obj, group_name, repetition, *args):
        """Render ``obj`` as ``<repetition> group <group_name> { ... }``."""
        parts = []
        for name, member in obj.properties.items():
            member_repetition = self.determine_repetition(name, obj.required)
            if isinstance(member, Object):
                # A nested group takes its name and repetition from the
                # parent (property key and parent's ``required``), exactly
                # like scalar fields. Previously the nested object's own
                # title was checked against its own ``required`` list.
                # NOTE(review): this bypasses member.accept(); presumably
                # accept() only dispatches to visit_object - confirm.
                parts.append(self._render_group(
                    member, name, member_repetition, *args))
                parts.append('\n')
            elif isinstance(member, (Array, Reference, Enumeration)):
                # These results are appended as returned, without filling
                # in any placeholders.
                parts.append(member.accept(self, *args))
                parts.append('\n')
            else:
                field_template = member.accept(self, *args)
                parts.append(field_template.format(
                    repetition=member_repetition, name=name))
        header = '{repetition} group {name}'.format(
            repetition=repetition, name=group_name)
        template = JinjaHelper.get_template('object.jinja2')
        return template.render(obj_name=header, body=''.join(parts))
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
install_requires=[
'colorama',
'termcolor',
'jinja2'
],
name='aptos',
version='1.0.2',
Expand Down
5 changes: 5 additions & 0 deletions templates/inner_schema.jinja2
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{% extends "schema.jinja2" -%}
{% block schema_content -%}
{% macro fix_indent() %}{{ body }}{% endmacro -%}
{{ fix_indent()|indent(width=2,first=True) -}}
{% endblock -%}
1 change: 1 addition & 0 deletions templates/macros.jinja2
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{% macro fix_indent(body) %}{{ body }}{% endmacro -%}
5 changes: 5 additions & 0 deletions templates/object.jinja2
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{% import "macros.jinja2" as macros -%}
{{ obj_name }} {
{% block object_content -%}
{{ macros.fix_indent(body)|indent(width=2,first=True) -}}
{% endblock -%}}
4 changes: 4 additions & 0 deletions templates/schema.jinja2
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
message ParquetSchema {
{% block schema_content %}
{% endblock %}
}
28 changes: 28 additions & 0 deletions tests/schema/person
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"title": "Person",
"type": "object",
"properties": {
"firstName": {
"type": "string"
},
"lastName": {
"type": "string"
},
"age": {
"type" : "integer"
},
"address": {
"type" : "object",
"title": "Address",
"properties": {
"street_name": {
"type": "string"
},
"zipcode": {
"type": "integer"
}
}
}
},
"required": [ "firstName", "age" ]
}