Skip to content

Commit

Permalink
Add gsheets source (airbytehq#695)
Browse files Browse the repository at this point in the history
  • Loading branch information
sherifnada authored Oct 29, 2020
1 parent 6452b62 commit f408bce
Show file tree
Hide file tree
Showing 35 changed files with 1,121 additions and 8 deletions.
1 change: 1 addition & 0 deletions .github/workflows/gradle.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ jobs:
STRIPE_INTEGRATION_TEST_CREDS: ${{ secrets.STRIPE_INTEGRATION_TEST_CREDS }}
GH_INTEGRATION_TEST_CREDS: ${{ secrets.GH_INTEGRATION_TEST_CREDS }}
SALESFORCE_INTEGRATION_TESTS_CREDS: ${{ secrets.SALESFORCE_INTEGRATION_TESTS_CREDS }}
GSHEETS_INTEGRATION_TESTS_CREDS: ${{ secrets.GSHEETS_INTEGRATION_TESTS_CREDS }}

- name: Build
run: ./gradlew --no-daemon build --scan
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ __pycache__
.eggs
.venv
.mypy_cache

1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,4 +69,3 @@ Check out our [roadmap](https://github.com/airbytehq/airbyte/projects/1) to get
## License

Airbyte is licensed under the MIT license. See the [LICENSE](https://docs.airbyte.io/license) file for licensing information.

Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
apply from: rootProject.file('tools/gradle/commons/integrations/image.gradle')

Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def start(self, args):

with tempfile.TemporaryDirectory() as temp_dir:
if cmd == "spec":
message = AirbyteMessage(type="SPEC", spec=self.source.spec(logger))
message = AirbyteMessage(type=Type.SPEC, spec=self.source.spec(logger))
print(message.json(exclude_unset=True))
sys.exit(0)

Expand Down
1 change: 0 additions & 1 deletion airbyte-integrations/bases/base-singer/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,3 @@ apply from: rootProject.file('tools/gradle/commons/integrations/image.gradle')
build.dependsOn ':airbyte-integrations:bases:airbyte-protocol:build'
build.dependsOn ':airbyte-integrations:bases:base-python:build'
buildImage.dependsOn ':airbyte-integrations:bases:base-python:buildImage'

Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,6 @@ apply from: rootProject.file('tools/gradle/commons/integrations/image.gradle')
apply from: rootProject.file('tools/gradle/commons/integrations/integration-test.gradle')

integrationTest.dependsOn(buildImage)
build.dependsOn ':airbyte-integrations:bases:base-python:build'
build.dependsOn ':airbyte-integrations:bases:airbyte-protocol:build'
buildImage.dependsOn ':airbyte-integrations:bases:base-singer:buildImage'
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ COPY source_github_singer/*.json ./$CODE_PATH
COPY resourcesstandardtest/*.json ./$CODE_PATH

COPY setup.py ./
RUN pip install ".[standardtest]"
RUN pip install -v ".[integration_tests]"

WORKDIR /airbyte

Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ standardSourceTestPython {

standardSourceTestPython.dependsOn(buildTestImage)
build.dependsOn ':airbyte-integrations:bases:base-singer:build'
buildImage.dependsOn ':airbyte-integrations:bases:base-singer:buildImage'
build.dependsOn ':airbyte-integrations:bases:base-python-test:build'
buildImage.dependsOn ':airbyte-integrations:bases:base-singer:buildImage'
buildTestImage.dependsOn ':airbyte-integrations:bases:base-python-test:buildImage'
integrationTest.dependsOn(buildImage)
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
*
!Dockerfile
!Dockerfile.test
!google_sheets_source
!setup.py
!integration_tests
!secrets
15 changes: 15 additions & 0 deletions airbyte-integrations/connectors/source-google-sheets/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
FROM airbyte/integration-base-python:dev

RUN apt-get update && apt-get install -y jq curl bash && rm -rf /var/lib/apt/lists/*

ENV CODE_PATH="google_sheets_source"
ENV AIRBYTE_IMPL_MODULE="google_sheets_source"
ENV AIRBYTE_IMPL_PATH="GoogleSheetsSource"

WORKDIR /airbyte/integration_code
COPY $CODE_PATH ./$CODE_PATH
COPY setup.py ./
RUN pip install .

LABEL io.airbyte.version=0.1.0
LABEL io.airbyte.name=airbyte/source-google-sheets
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
FROM airbyte/base-python-test:dev

RUN apt-get update && apt-get install -y \
&& rm -rf /var/lib/apt/lists/*

ENV CODE_PATH="integration_tests"
ENV AIRBYTE_TEST_MODULE="integration_tests"
ENV AIRBYTE_TEST_PATH="GoogleSheetsSourceStandardTest"
ENV AIRBYTE_TEST_CASE=true

LABEL io.airbyte.version=0.1.0
LABEL io.airbyte.name=airbyte/source-google-sheets-standard-test

WORKDIR /airbyte/integration_code
COPY google_sheets_source google_sheets_source
COPY integration_tests integration_tests
COPY secrets/* integration_tests
COPY google_sheets_source/*.json integration_tests
COPY setup.py ./

RUN pip install ".[integration_tests]"

WORKDIR /airbyte
38 changes: 38 additions & 0 deletions airbyte-integrations/connectors/source-google-sheets/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Google Sheets Source

This is the repository for the Google Sheets source connector, written in Python.
For information about how to use this connector within Airbyte, see [the documentation](https://docs.airbyte.io/integrations/sources/googlesheets).

## Local development
### Build
First, build the module by running the following from the `airbyte` project root directory:
```
./gradlew :airbyte-integrations:connectors:source-google-sheets:build
```

This should generate a virtualenv for this module in `source-google-sheets/.venv`. Make sure this venv is active in your
development environment of choice. If you are on the terminal, run the following from the `source-google-sheets` directory:
```
source .venv/bin/activate
```
If you are in an IDE, follow your IDE's instructions to activate the virtualenv.

**All the instructions below assume you have correctly activated the virtualenv.**.

### Unit Tests
To run unit tests locally, from the connector root run:
```
python setup.py test
```

### Integration Tests
1. Configure credentials as appropriate, described below
1. From the airbyte project root, run `./gradlew :airbyte-integrations:connectors:source-google-sheets:standardSourceTestPython`

## Configure credentials
### Configuring credentials as a community contributor
Follow the instructions in the [Google Sheets documentation](https://docs.airbyte.io/integrations/sources/googlesheets) for generating credentials to access the Google API, then put those
in a file named `secrets/credentials.json`.

### Airbyte Employee
Credentials are available in RPass under the secret name `google sheets integration test creds`.
30 changes: 30 additions & 0 deletions airbyte-integrations/connectors/source-google-sheets/build.gradle
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
plugins {
id 'java'
}

project.ext.pyModule = 'google_sheets_source'
apply from: rootProject.file('tools/gradle/commons/integrations/python.gradle')
apply from: rootProject.file('tools/gradle/commons/integrations/image.gradle')
apply from: rootProject.file('tools/gradle/commons/integrations/test-image.gradle')
apply from: rootProject.file('tools/gradle/commons/integrations/integration-test.gradle')
apply from: rootProject.file('tools/gradle/commons/integrations/standard-source-test-python.gradle')


standardSourceTestPython {
ext {
imageName = "${extractImageName(project.file('Dockerfile'))}:dev"
pythonContainerName = "${extractImageName(project.file('Dockerfile.test'))}:dev"
}
}

task unitTest(type: PythonTask){
command = "setup.py test"
}

build.dependsOn(unitTest)
build.dependsOn ':airbyte-integrations:bases:base-python-test:build'
buildImage.dependsOn ':airbyte-integrations:bases:base-python:buildImage'
integrationTest.dependsOn(buildImage)

buildTestImage.dependsOn ':airbyte-integrations:bases:base-python-test:buildImage'
standardSourceTestPython.dependsOn(buildTestImage)
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""
MIT License
Copyright (c) 2020 Airbyte
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

from .google_sheets_source import GoogleSheetsSource

__all__ = ["GoogleSheetsSource"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
"""
MIT License
Copyright (c) 2020 Airbyte
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

import json
from typing import Generator

from airbyte_protocol import AirbyteCatalog, AirbyteConnectionStatus, AirbyteMessage, Status, Type
from apiclient import errors
from base_python import AirbyteLogger, Source

from .helpers import Helpers
from .models.spreadsheet import Spreadsheet
from .models.spreadsheet_values import SpreadsheetValues

ROW_BATCH_SIZE = 200


class GoogleSheetsSource(Source):
"""
Spreadsheets API Reference: https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets
"""

def __init__(self):
super().__init__()

def check(self, logger: AirbyteLogger, config_container) -> AirbyteConnectionStatus:
# Check involves verifying that the specified spreadsheet is reachable with our credentials.
config = config_container.rendered_config
client = Helpers.get_authenticated_sheets_client(json.loads(config["credentials_json"]))
spreadsheet_id = config["spreadsheet_id"]
try:
# Attempt to get first row of sheet
client.get(spreadsheetId=spreadsheet_id, includeGridData=False, ranges="1:1").execute()
except errors.HttpError as err:
reason = str(err)
# Give a clearer message if it's a common error like 404.
if err.resp.status == 404:
reason = "Requested spreadsheet was not found."

print(f"Formatted error: {reason}")
return AirbyteConnectionStatus(status=Status.FAILED, message=str(reason))

return AirbyteConnectionStatus(status=Status.SUCCEEDED)

def discover(self, logger: AirbyteLogger, config_container) -> AirbyteCatalog:
config = config_container.rendered_config
client = Helpers.get_authenticated_sheets_client(json.loads(config["credentials_json"]))
spreadsheet_id = config["spreadsheet_id"]
try:
logger.info(f"Running discovery on sheet {spreadsheet_id}")
spreadsheet_metadata = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=False).execute())
sheet_names = [sheet.properties.title for sheet in spreadsheet_metadata.sheets]
streams = []
for sheet_name in sheet_names:
header_row_data = Helpers.get_first_row(client, spreadsheet_id, sheet_name)
stream = Helpers.headers_to_airbyte_stream(sheet_name, header_row_data)
streams.append(stream)
return AirbyteCatalog(streams=streams)

except errors.HttpError as err:
reason = str(err)
if err.resp.status == 404:
reason = "Requested spreadsheet was not found."
raise Exception(f"Could not run discovery: {reason}")

def read(self, logger: AirbyteLogger, config_container, catalog_path, state=None) -> Generator[AirbyteMessage, None, None]:
config = config_container.rendered_config
client = Helpers.get_authenticated_sheets_client(json.loads(config["credentials_json"]))

catalog = AirbyteCatalog.parse_obj(self.read_config(catalog_path))

sheet_to_column_name = Helpers.parse_sheet_and_column_names_from_catalog(catalog)
spreadsheet_id = config["spreadsheet_id"]

logger.info(f"Starting syncing spreadsheet {spreadsheet_id}")
# For each sheet in the spreadsheet, get a batch of rows, and as long as there hasn't been
# a blank row, emit the row batch
sheet_to_column_index_to_name = Helpers.get_available_sheets_to_column_index_to_name(client, spreadsheet_id, sheet_to_column_name)
for sheet in sheet_to_column_index_to_name.keys():
logger.info(f"Syncing sheet {sheet}")
column_index_to_name = sheet_to_column_index_to_name[sheet]
row_cursor = 2 # we start syncing past the header row
encountered_blank_row = False
while not encountered_blank_row:
range = f"{sheet}!{row_cursor}:{row_cursor + ROW_BATCH_SIZE}"
logger.info(f"Fetching range {range}")
row_batch = SpreadsheetValues.parse_obj(
client.values().batchGet(spreadsheetId=spreadsheet_id, ranges=range, majorDimension="ROWS").execute()
)
row_cursor += ROW_BATCH_SIZE + 1
# there should always be one range since we requested only one
value_ranges = row_batch.valueRanges[0]

if not value_ranges.values:
break

row_values = value_ranges.values
if len(row_values) == 0:
break

for row in row_values:
if Helpers.is_row_empty(row):
encountered_blank_row = True
break
elif Helpers.row_contains_relevant_data(row, column_index_to_name.keys()):
yield AirbyteMessage(type=Type.RECORD, record=Helpers.row_data_to_record_message(sheet, row, column_index_to_name))
logger.info(f"Finished syncing spreadsheet {spreadsheet_id}")
Loading

0 comments on commit f408bce

Please sign in to comment.