From d256dfe597dd112ee393130e851fa115502c9435 Mon Sep 17 00:00:00 2001 From: Andrey Kabanov Date: Sun, 13 Feb 2022 18:46:41 -0800 Subject: [PATCH] Re-submitting PR #140 (Add additional base test suite tests) (#145) * add mac to gitignore * change testing repo * adjust `start_date_2` for new repo * add tap-tester automatic fields * add tap-tester all fields * add all expected streams to all fields test * set specific bookmark for test-repo * fix `collaborators` stream bookmark spelling for tap-tester * add more streams to automatic fields test * add tap-tester bookmarks * updates to automatic fields test: * add check for unique primary keys in replicated records * replace explicit set of expected streams with `expected_check_streams` from base * update `test_run` doc string * omit `team_memberships` stream from `expected_check_streams` * build expected_check_streams() using expected_streams() * add bug id and description * pylint fixes: * adjust imports * use specific error class * set encoding * adjust circle config: * use latest image * trim pylint disable options * add unit tests step * make sure integration tests run always * Re-submitting PR #141 (All repos for an organization) (#146) * add parsing of "org/*" wildcard to retrieve all repos for an org * add unit test cases for extract_repos_from_config() * add requests-mock for dev requirements * add unit test for get_all_repos() * Re-submitting PR #142 (Improve Rate Limiting and Retry Logic) (#143) * add basic backoff retry * add backoff to setup.py * replace deprecated assertEquals method with assertEqual * add MAX_SLEEP_SECONDS parsing from config and DEFAULT_MAX_SECONDS for rate limiting * add comments for changes and use DEFAULT_SLEEP_SECONDS * add pylint ignore for global-statement * README updates: (#148) - add full list of replicated streams - update GitHub docs links * add streams to excluded_streams that aren't respecting automatic fields * add NotFoundException handling to collaborators stream * add bug info * adjust test for test-repo * pylint fixes * run unit and integration steps always * don't raise a NotFoundException to deal with access issues to resources * pylint fix and return empty response body for 404 * add collaborators stream to excluded set * fixes to tap-tester tests * adjust 2nd sync dates for data * deal with None from get method * adjust start date tap-tester dates * actually deal with NoneType in get method * FIX: sub_streams sync functions passed parent metadata * remove expected_check_streams after bug identified and addressed * use expected_streams after bug identified and addressed * don't write a bookmark for FULL_TABLE streams * update unit test expectations to recent changes * updates to bookmarks tap-tester: * adjust test expectations for streams * create simulated_states based on test data and tap behavior * adjust tests based on commits and pr_commits schema * update base based on tap behavior and test data * adjust test expectations for team_members stream * Exclude collaborators stream due to access issues in circle * update circle config to include slack orb and tap-tester-user context * add bug info * add tap-tester-user to daily build context --- .circleci/config.yml | 31 +-- .gitignore | 5 + README.md | 32 ++- setup.py | 4 +- tap_github/__init__.py | 156 +++++++++---- tests/base.py | 34 +-- tests/test_github_all_fields.py | 90 ++++++++ tests/test_github_automatic_fields.py | 73 ++++++ tests/test_github_bookmarks.py | 207 ++++++++++++++++++ tests/test_github_start_date.py | 14 +-
tests/unittests/test_exception_handling.py | 24 +- .../test_extract_repos_from_config.py | 32 +++ tests/unittests/test_formatting_dates.py | 4 +- tests/unittests/test_get_all_repos.py | 74 +++++++ tests/unittests/test_key_error.py | 36 +-- tests/unittests/test_rate_limit.py | 2 +- tests/unittests/test_start_date_bookmark.py | 8 +- tests/unittests/test_sub_streams_selection.py | 8 +- tests/unittests/test_verify_access.py | 20 +- 19 files changed, 699 insertions(+), 155 deletions(-) create mode 100644 tests/test_github_all_fields.py create mode 100644 tests/test_github_automatic_fields.py create mode 100644 tests/test_github_bookmarks.py create mode 100644 tests/unittests/test_extract_repos_from_config.py create mode 100644 tests/unittests/test_get_all_repos.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 78712556..a6969bf5 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,8 +1,11 @@ -version: 2 +version: 2.1 +orbs: + slack: circleci/slack@3.4.2 + jobs: build: docker: - - image: 218546966473.dkr.ecr.us-east-1.amazonaws.com/circle-ci:tap-tester-v4 + - image: 218546966473.dkr.ecr.us-east-1.amazonaws.com/circle-ci:stitch-tap-tester steps: - checkout - run: @@ -21,7 +24,7 @@ jobs: name: 'pylint' command: | source /usr/local/share/virtualenvs/tap-github/bin/activate - pylint tap_github --disable 'broad-except,chained-comparison,empty-docstring,fixme,invalid-name,line-too-long,missing-class-docstring,missing-function-docstring,missing-module-docstring,no-else-raise,no-else-return,too-few-public-methods,too-many-arguments,too-many-branches,too-many-lines,too-many-locals,ungrouped-imports,wrong-spelling-in-comment,wrong-spelling-in-docstring,bad-whitespace' + pylint tap_github --disable 'missing-module-docstring,missing-function-docstring,missing-class-docstring,line-too-long,invalid-name,too-many-lines,consider-using-f-string,too-many-arguments,too-many-locals' - run: name: 'Unit Tests' command: | @@ -29,6 +32,7 @@ jobs: pip install nose coverage nosetests --with-coverage --cover-erase --cover-package=tap_github --cover-html-dir=htmlcov tests/unittests coverage html + when: always - store_test_results: path: test_output/report.xml - store_artifacts: @@ -39,20 +43,19 @@ jobs: aws s3 cp s3://com-stitchdata-dev-deployment-assets/environments/tap-tester/tap_tester_sandbox dev_env.sh source dev_env.sh source /usr/local/share/virtualenvs/tap-tester/bin/activate - run-test --tap=tap-github \ - --target=target-stitch \ - --orchestrator=stitch-orchestrator \ - --email=harrison+sandboxtest@stitchdata.com \ - --password=$SANDBOX_PASSWORD \ - --client-id=50 \ - --token=$STITCH_API_TOKEN \ - tests + run-test --tap=tap-github tests + when: always + - slack/notify-on-failure: + only_for_branches: master + workflows: version: 2 commit: jobs: - build: - context: circleci-user + context: + - circleci-user + - tap-tester-user build_daily: triggers: - schedule: @@ -63,4 +66,6 @@ workflows: - master jobs: - build: - context: circleci-user + context: + - circleci-user + - tap-tester-user diff --git a/.gitignore b/.gitignore index 59ed95a6..57a09a59 100644 --- a/.gitignore +++ b/.gitignore @@ -97,3 +97,8 @@ properties.json # Jetbrains IDE .idea + +# macOS +*.DS_Store +.AppleDouble +.LSOverride \ No newline at end of file diff --git a/README.md b/README.md index 12454261..3e956789 100644 --- a/README.md +++ b/README.md @@ -2,20 +2,32 @@ This is a [Singer](https://singer.io) tap that produces JSON-formatted data from the GitHub API following the [Singer 
-spec](https://github.com/singer-io/getting-started/blob/master/SPEC.md). +spec](https://github.com/singer-io/getting-started/blob/master/docs/SPEC.md). This tap: - Pulls raw data from the [GitHub REST API](https://developer.github.com/v3/) - Extracts the following resources from GitHub for a single repository: - - [Assignees](https://developer.github.com/v3/issues/assignees/#list-assignees) - - [Collaborators](https://developer.github.com/v3/repos/collaborators/#list-collaborators) - - [Commits](https://developer.github.com/v3/repos/commits/#list-commits-on-a-repository) - - [Issues](https://developer.github.com/v3/issues/#list-issues-for-a-repository) - - [Pull Requests](https://developer.github.com/v3/pulls/#list-pull-requests) - - [Comments](https://developer.github.com/v3/issues/comments/#list-comments-in-a-repository) - - [Reviews](https://developer.github.com/v3/pulls/reviews/#list-reviews-on-a-pull-request) - - [Review Comments](https://developer.github.com/v3/pulls/comments/) - - [Stargazers](https://developer.github.com/v3/activity/starring/#list-stargazers) + - [Assignees](https://docs.github.com/en/rest/reference/issues#list-assignees) + - [Collaborators](https://docs.github.com/en/rest/reference/repos#list-repository-collaborators) + - [Commits](https://docs.github.com/en/rest/reference/repos#list-commits) + - [Commit Comments](https://docs.github.com/en/rest/reference/repos#list-commit-comments-for-a-repository) + - [Events](https://docs.github.com/en/rest/reference/issues#events) + - [Issues](https://docs.github.com/en/rest/reference/issues#list-repository-issues) + - [Issue Events](https://docs.github.com/en/rest/reference/issues#list-issue-events-for-a-repository) + - [Issue Milestones](https://docs.github.com/en/rest/reference/issues#list-milestones) + - [Projects](https://docs.github.com/en/rest/reference/projects#list-repository-projects) + - [Project Cards](https://docs.github.com/en/rest/reference/projects#list-project-cards) + - [Project Columns](https://docs.github.com/en/rest/reference/projects#list-project-columns) + - [Pull Requests](https://docs.github.com/en/rest/reference/pulls#list-pull-requests) + - [PR Commits](https://docs.github.com/en/rest/reference/pulls#list-commits-on-a-pull-request) + - [Releases](https://docs.github.com/en/rest/reference/repos#list-releases) + - [Comments](https://docs.github.com/en/rest/reference/issues#list-issue-comments-for-a-repository) + - [Reviews](https://docs.github.com/en/rest/reference/pulls#list-reviews-for-a-pull-request) + - [Review Comments](https://docs.github.com/en/rest/reference/pulls#list-review-comments-in-a-repository) + - [Stargazers](https://docs.github.com/en/rest/reference/activity#list-stargazers) + - [Teams](https://docs.github.com/en/rest/reference/teams#list-teams) + - [Team Members](https://docs.github.com/en/rest/reference/teams#list-team-members) + - [Team Memberships](https://docs.github.com/en/rest/reference/teams#get-team-membership-for-a-user) - Outputs the schema for each resource - Incrementally pulls data based on the input state diff --git a/setup.py b/setup.py index e37afe7e..4f8a4836 100644 --- a/setup.py +++ b/setup.py @@ -11,13 +11,15 @@ py_modules=['tap_github'], install_requires=[ 'singer-python==5.12.1', - 'requests==2.20.0' + 'requests==2.20.0', + 'backoff==1.8.0' ], extras_require={ 'dev': [ 'pylint==2.6.2', 'ipdb', 'nose', + 'requests-mock==1.9.3' ] }, entry_points=''' diff --git a/tap_github/__init__.py b/tap_github/__init__.py index 3d4536c8..1f4eebcc 100644 ---
a/tap_github/__init__.py +++ b/tap_github/__init__.py @@ -1,15 +1,13 @@ -import argparse import os import json import collections import time import requests -import singer -import singer.bookmarks as bookmarks -import singer.metrics as metrics import backoff +import singer -from singer import metadata +from singer import (bookmarks, metrics, metadata) +from simplejson import JSONDecodeError session = requests.Session() logger = singer.get_logger() @@ -45,6 +43,9 @@ 'team_memberships': ['url'] } +DEFAULT_SLEEP_SECONDS = 600 +MAX_SLEEP_SECONDS = DEFAULT_SLEEP_SECONDS + class GithubException(Exception): pass @@ -101,7 +102,7 @@ class RateLimitExceeded(GithubException): }, 404: { "raise_exception": NotFoundException, - "message": "The resource you have specified cannot be found" + "message": "The resource you have specified cannot be found. Alternatively, the access_token is not valid for the resource" }, 409: { "raise_exception": ConflictError, @@ -172,7 +173,7 @@ def raise_for_error(resp, source): error_code = resp.status_code try: response_json = resp.json() - except Exception: + except JSONDecodeError: response_json = {} if error_code == 404: @@ -180,9 +181,12 @@ def raise_for_error(resp, source): if source == "teams": details += ' or it is a personal account repository' message = "HTTP-error-code: 404, Error: {}. Please refer \'{}\' for more details.".format(details, response_json.get("documentation_url")) - else: - message = "HTTP-error-code: {}, Error: {}".format( - error_code, ERROR_CODE_EXCEPTION_MAPPING.get(error_code, {}).get("message", "Unknown Error") if response_json == {} else response_json) + logger.info(message) + # don't raise a NotFoundException + return None + + message = "HTTP-error-code: {}, Error: {}".format( + error_code, ERROR_CODE_EXCEPTION_MAPPING.get(error_code, {}).get("message", "Unknown Error") if response_json == {} else response_json) exc = ERROR_CODE_EXCEPTION_MAPPING.get(error_code, {}).get("raise_exception", GithubException) raise exc(message) from None @@ -195,7 +199,7 @@ def rate_throttling(response): if int(response.headers['X-RateLimit-Remaining']) == 0: seconds_to_sleep = calculate_seconds(int(response.headers['X-RateLimit-Reset'])) - if seconds_to_sleep > 600: + if seconds_to_sleep > MAX_SLEEP_SECONDS: message = "API rate limit exceeded, please try after {} seconds.".format(seconds_to_sleep) raise RateLimitExceeded(message) from None @@ -214,6 +218,9 @@ def authed_get(source, url, headers={}): raise_for_error(resp, source) timer.tags[metrics.Tag.http_status_code] = resp.status_code rate_throttling(resp) + if resp.status_code == 404: + # return an empty response body since we're not raising a NotFoundException + resp._content = b'{}' # pylint: disable=protected-access return resp def authed_get_all_pages(source, url, headers={}): @@ -249,7 +256,7 @@ def load_schemas(): for filename in os.listdir(get_abs_path('schemas')): path = get_abs_path('schemas') + '/' + filename file_raw = filename.replace('.json', '') - with open(path) as file: + with open(path, encoding='utf-8') as file: schemas[file_raw] = json.load(file) schemas['pr_commits'] = generate_pr_commit_schema(schemas['commits']) @@ -315,6 +322,57 @@ def get_catalog(): return {'streams': streams} +def get_all_repos(organizations: list) -> list: + """ + Retrieves all repositories for the provided organizations and + verifies basic access for them.
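
Note: the throttling rule introduced above reduces to the following standalone Python sketch. It is an approximation rather than the tap's exact code: calculate_seconds() is assumed to return the wall-clock seconds until the X-RateLimit-Reset epoch, and the cap is assumed to come from the optional max_sleep_seconds config key parsed in main().

    import time

    DEFAULT_SLEEP_SECONDS = 600  # default cap, overridable via the `max_sleep_seconds` config key

    def rate_throttling_sketch(headers, max_sleep_seconds=DEFAULT_SLEEP_SECONDS):
        """Sleep through a rate-limit window, or raise when the wait exceeds the cap."""
        if int(headers['X-RateLimit-Remaining']) == 0:
            # seconds until the limit resets (X-RateLimit-Reset is a unix epoch)
            seconds_to_sleep = max(0, int(headers['X-RateLimit-Reset']) - int(time.time()))
            if seconds_to_sleep > max_sleep_seconds:
                raise RuntimeError("API rate limit exceeded, please try after {} seconds.".format(seconds_to_sleep))
            time.sleep(seconds_to_sleep)

With, for example, {"max_sleep_seconds": 1200} in the tap config, waits of up to 20 minutes are absorbed instead of raising RateLimitExceeded.
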
+ + Docs: https://docs.github.com/en/rest/reference/repos#list-organization-repositories + """ + repos = [] + + for org_path in organizations: + org = org_path.split('/')[0] + for response in authed_get_all_pages( + 'get_all_repos', + 'https://api.github.com/orgs/{}/repos?sort=created&direction=desc'.format(org) + ): + org_repos = response.json() + + for repo in org_repos: + repo_full_name = repo.get('full_name') + + logger.info("Verifying access of repository: %s", repo_full_name) + verify_repo_access( + 'https://api.github.com/repos/{}/commits'.format(repo_full_name), + repo + ) + + repos.append(repo_full_name) + + return repos + +def extract_repos_from_config(config: dict) -> list: + """ + Extracts all repositories from the config and calls get_all_repos() + for organizations using the wildcard 'org/*' format. + """ + repo_paths = list(filter(None, config['repository'].split(' '))) + + orgs_with_all_repos = list(filter(lambda x: x.split('/')[1] == '*', repo_paths)) + + if orgs_with_all_repos: + # remove any wildcard "org/*" occurrences from `repo_paths` + repo_paths = list(set(repo_paths).difference(set(orgs_with_all_repos))) + + # get all repositories for an org in the config + all_repos = get_all_repos(orgs_with_all_repos) + + # update repo_paths + repo_paths.extend(all_repos) + + return repo_paths + def verify_repo_access(url_for_repo, repo): try: authed_get("verifying repository access", url_for_repo) @@ -328,7 +386,7 @@ def verify_access_for_repo(config): access_token = config['access_token'] session.headers.update({'authorization': 'token ' + access_token, 'per_page': '1', 'page': '1'}) - repositories = list(filter(None, config['repository'].split(' '))) + repositories = extract_repos_from_config(config) for repo in repositories: logger.info("Verifying access of repository: %s", repo) @@ -360,18 +418,16 @@ def get_all_teams(schemas, repo_path, state, mdata, _start_date): # transform and write release record with singer.Transformer() as transformer: - rec = transformer.transform(r, schemas, metadata=metadata.to_map(mdata)) + rec = transformer.transform(r, schemas['teams'], metadata=metadata.to_map(mdata['teams'])) singer.write_record('teams', rec, time_extracted=extraction_time) - singer.write_bookmark(state, repo_path, 'teams', {'since': singer.utils.strftime(extraction_time)}) counter.increment() if schemas.get('team_members'): - for team_members_rec in get_all_team_members(team_slug, schemas['team_members'], repo_path, state, mdata): + for team_members_rec in get_all_team_members(team_slug, schemas['team_members'], repo_path, state, mdata['team_members']): singer.write_record('team_members', team_members_rec, time_extracted=extraction_time) - singer.write_bookmark(state, repo_path, 'team_members', {'since': singer.utils.strftime(extraction_time)}) if schemas.get('team_memberships'): - for team_memberships_rec in get_all_team_memberships(team_slug, schemas['team_memberships'], repo_path, state, mdata): + for team_memberships_rec in get_all_team_memberships(team_slug, schemas['team_memberships'], repo_path, state, mdata['team_memberships']): singer.write_record('team_memberships', team_memberships_rec, time_extracted=extraction_time) return state @@ -547,7 +603,6 @@ def get_all_issue_labels(schemas, repo_path, state, mdata, _start_date): with singer.Transformer() as transformer: rec = transformer.transform(r, schemas, metadata=metadata.to_map(mdata)) singer.write_record('issue_labels', rec, time_extracted=extraction_time) - singer.write_bookmark(state, repo_path, 'issue_labels',
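
To make the new wildcard handling concrete, here is an illustrative round trip through extract_repos_from_config(). It mirrors the unit tests added later in this patch, and get_all_repos() is mocked out, so no network access or real organization is assumed.

    from unittest import mock
    import tap_github

    config = {'repository': 'singer-io/test-repo test-org/*'}

    with mock.patch('tap_github.get_all_repos') as mocked_get_all_repos:
        mocked_get_all_repos.return_value = ['test-org/repo1', 'test-org/repo2']
        repo_paths = tap_github.extract_repos_from_config(config)

    # the explicit repo is kept, and the "org/*" wildcard expands to the org's repos
    assert sorted(repo_paths) == ['singer-io/test-repo', 'test-org/repo1', 'test-org/repo2']
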
{'since': singer.utils.strftime(extraction_time)}) counter.increment() return state @@ -616,7 +671,7 @@ def get_all_projects(schemas, repo_path, state, mdata, start_date): # transform and write release record with singer.Transformer() as transformer: - rec = transformer.transform(r, schemas, metadata=metadata.to_map(mdata)) + rec = transformer.transform(r, schemas['projects'], metadata=metadata.to_map(mdata['projects'])) singer.write_record('projects', rec, time_extracted=extraction_time) singer.write_bookmark(state, repo_path, 'projects', {'since': singer.utils.strftime(extraction_time)}) counter.increment() @@ -627,14 +682,14 @@ def get_all_projects(schemas, repo_path, state, mdata, start_date): # sync project_columns if that schema is present (only there if selected) if schemas.get('project_columns'): - for project_column_rec in get_all_project_columns(project_id, schemas['project_columns'], repo_path, state, mdata, start_date): + for project_column_rec in get_all_project_columns(project_id, schemas['project_columns'], repo_path, state, mdata['project_columns'], start_date): singer.write_record('project_columns', project_column_rec, time_extracted=extraction_time) singer.write_bookmark(state, repo_path, 'project_columns', {'since': singer.utils.strftime(extraction_time)}) # sync project_cards if that schema is present (only there if selected) if schemas.get('project_cards'): column_id = project_column_rec['id'] - for project_card_rec in get_all_project_cards(column_id, schemas['project_cards'], repo_path, state, mdata, start_date): + for project_card_rec in get_all_project_cards(column_id, schemas['project_cards'], repo_path, state, mdata['project_cards'], start_date): singer.write_record('project_cards', project_card_rec, time_extracted=extraction_time) singer.write_bookmark(state, repo_path, 'project_cards', {'since': singer.utils.strftime(extraction_time)}) return state @@ -721,7 +776,6 @@ def get_all_releases(schemas, repo_path, state, mdata, _start_date): with singer.Transformer() as transformer: rec = transformer.transform(r, schemas, metadata=metadata.to_map(mdata)) singer.write_record('releases', rec, time_extracted=extraction_time) - singer.write_bookmark(state, repo_path, 'releases', {'since': singer.utils.strftime(extraction_time)}) counter.increment() return state @@ -761,14 +815,14 @@ def get_all_pull_requests(schemas, repo_path, state, mdata, start_date): # transform and write pull_request record with singer.Transformer() as transformer: - rec = transformer.transform(pr, schemas['pull_requests'], metadata=metadata.to_map(mdata)) + rec = transformer.transform(pr, schemas['pull_requests'], metadata=metadata.to_map(mdata['pull_requests'])) singer.write_record('pull_requests', rec, time_extracted=extraction_time) singer.write_bookmark(state, repo_path, 'pull_requests', {'since': singer.utils.strftime(extraction_time)}) counter.increment() # sync reviews if that schema is present (only there if selected) if schemas.get('reviews'): - for review_rec in get_reviews_for_pr(pr_num, schemas['reviews'], repo_path, state, mdata): + for review_rec in get_reviews_for_pr(pr_num, schemas['reviews'], repo_path, state, mdata['reviews']): singer.write_record('reviews', review_rec, time_extracted=extraction_time) singer.write_bookmark(state, repo_path, 'reviews', {'since': singer.utils.strftime(extraction_time)}) @@ -776,7 +830,7 @@ def get_all_pull_requests(schemas, repo_path, state, mdata, start_date): # sync review comments if that schema is present (only there if selected) if 
schemas.get('review_comments'): - for review_comment_rec in get_review_comments_for_pr(pr_num, schemas['review_comments'], repo_path, state, mdata): + for review_comment_rec in get_review_comments_for_pr(pr_num, schemas['review_comments'], repo_path, state, mdata['review_comments']): singer.write_record('review_comments', review_comment_rec, time_extracted=extraction_time) singer.write_bookmark(state, repo_path, 'review_comments', {'since': singer.utils.strftime(extraction_time)}) @@ -787,7 +841,7 @@ def get_all_pull_requests(schemas, repo_path, state, mdata, start_date): schemas['pr_commits'], repo_path, state, - mdata + mdata['pr_commits'] ): singer.write_record('pr_commits', pr_commit, time_extracted=extraction_time) singer.write_bookmark(state, repo_path, 'pr_commits', {'since': singer.utils.strftime(extraction_time)}) @@ -859,7 +913,6 @@ def get_all_assignees(schema, repo_path, state, mdata, _start_date): with singer.Transformer() as transformer: rec = transformer.transform(assignee, schema, metadata=metadata.to_map(mdata)) singer.write_record('assignees', rec, time_extracted=extraction_time) - singer.write_bookmark(state, repo_path, 'assignees', {'since': singer.utils.strftime(extraction_time)}) counter.increment() return state @@ -869,19 +922,26 @@ def get_all_collaborators(schema, repo_path, state, mdata, _start_date): https://developer.github.com/v3/repos/collaborators/#list-collaborators ''' with metrics.record_counter('collaborators') as counter: - for response in authed_get_all_pages( - 'collaborators', - 'https://api.github.com/repos/{}/collaborators'.format(repo_path) - ): - collaborators = response.json() - extraction_time = singer.utils.now() - for collaborator in collaborators: - collaborator['_sdc_repository'] = repo_path - with singer.Transformer() as transformer: - rec = transformer.transform(collaborator, schema, metadata=metadata.to_map(mdata)) - singer.write_record('collaborators', rec, time_extracted=extraction_time) - singer.write_bookmark(state, repo_path, 'collaborator', {'since': singer.utils.strftime(extraction_time)}) - counter.increment() + try: + responses = authed_get_all_pages( + 'collaborators', + 'https://api.github.com/repos/{}/collaborators'.format(repo_path) + ) + except NotFoundException as error: + logger.info( + 'Unable to retrieve collaborators stream, check access_token is valid for %s.
See full error message: %s', + repo_path, error + ) + else: + for response in responses: + collaborators = response.json() + extraction_time = singer.utils.now() + for collaborator in collaborators: + collaborator['_sdc_repository'] = repo_path + with singer.Transformer() as transformer: + rec = transformer.transform(collaborator, schema, metadata=metadata.to_map(mdata)) + singer.write_record('collaborators', rec, time_extracted=extraction_time) + counter.increment() return state @@ -987,7 +1047,6 @@ def get_all_stargazers(schema, repo_path, state, mdata, _start_date): rec = transformer.transform(stargazer, schema, metadata=metadata.to_map(mdata)) rec['user_id'] = user_id singer.write_record('stargazers', rec, time_extracted=extraction_time) - singer.write_bookmark(state, repo_path, 'stargazers', {'since': singer.utils.strftime(extraction_time)}) counter.increment() return state @@ -1064,7 +1123,7 @@ def do_sync(config, state, catalog): selected_stream_ids = get_selected_streams(catalog) validate_dependencies(selected_stream_ids) - repositories = list(filter(None, config['repository'].split(' '))) + repositories = extract_repos_from_config(config) state = translate_state(state, catalog, repositories) singer.write_state(state) @@ -1096,17 +1155,19 @@ def do_sync(config, state, catalog): # handle streams with sub streams else: stream_schemas = {stream_id: stream_schema} + stream_mdata = {stream_id: mdata} # get and write selected sub stream schemas for sub_stream_id in sub_stream_ids: if sub_stream_id in selected_stream_ids: sub_stream = get_stream_from_catalog(sub_stream_id, catalog) stream_schemas[sub_stream_id] = sub_stream['schema'] + stream_mdata[sub_stream_id] = sub_stream['metadata'] singer.write_schema(sub_stream_id, sub_stream['schema'], sub_stream['key_properties']) # sync stream and it's sub streams - state = sync_func(stream_schemas, repo, state, mdata, start_date) + state = sync_func(stream_schemas, repo, state, stream_mdata, start_date) singer.write_state(state) @@ -1114,6 +1175,13 @@ def do_sync(config, state, catalog): def main(): args = singer.utils.parse_args(REQUIRED_CONFIG_KEYS) + # get optional config key `max_sleep_seconds` + config_max_sleep = args.config.get('max_sleep_seconds') + + # set global `MAX_SLEEP_SECONDS` for rate_throttling function or use default + global MAX_SLEEP_SECONDS #pylint: disable=global-statement + MAX_SLEEP_SECONDS = config_max_sleep if config_max_sleep else DEFAULT_SLEEP_SECONDS + if args.discover: do_discover(args.config) else: diff --git a/tests/base.py b/tests/base.py index 7b204e64..6ea415a4 100644 --- a/tests/base.py +++ b/tests/base.py @@ -46,8 +46,8 @@ def get_properties(self, original: bool = True): :param original: set to false to change the start_date or end_date """ return_value = { - 'start_date' : dt.strftime(dt.utcnow()-timedelta(days=5), self.START_DATE_FORMAT), - 'repository': 'singer-io/tap-github' + 'start_date' : '2021-10-01T00:00:00Z', + 'repository': 'singer-io/test-repo' } if original: return return_value @@ -61,32 +61,6 @@ def get_credentials(self): 'access_token': os.getenv("TAP_GITHUB_TOKEN") } - @staticmethod - def expected_check_streams(): - return { - 'assignees', - 'collaborators', - 'comments', - 'commit_comments', - 'commits', - 'events', - 'issue_labels', - 'issue_milestones', - 'issue_events', - 'issues', - 'pr_commits', - 'project_cards', - 'project_columns', - 'projects', - 'pull_requests', - 'releases', - 'review_comments', - 'reviews', - 'stargazers', - 'team_members', - 'team_memberships', - 'teams' - } def 
expected_metadata(self): """The expected streams and metadata about the streams""" @@ -134,7 +108,7 @@ def expected_metadata(self): "issue_milestones": { self.PRIMARY_KEYS: {"id"}, self.REPLICATION_METHOD: self.INCREMENTAL, - self.BOOKMARK: {"due_on"}, + self.BOOKMARK: {"updated_at"}, self.OBEYS_START_DATE: True }, "issue_events": { @@ -193,7 +167,7 @@ def expected_metadata(self): "reviews": { self.PRIMARY_KEYS: {"id"}, self.REPLICATION_METHOD: self.INCREMENTAL, - self.BOOKMARK: {"updated_at"}, + self.BOOKMARK: {"submitted_at"}, self.OBEYS_START_DATE: True }, "stargazers": { diff --git a/tests/test_github_all_fields.py b/tests/test_github_all_fields.py new file mode 100644 index 00000000..17173dc1 --- /dev/null +++ b/tests/test_github_all_fields.py @@ -0,0 +1,90 @@ +import os + +from tap_tester import runner, connections, menagerie + +from base import TestGithubBase + + +class TestGithubAllFields(TestGithubBase): + """Test that with all fields selected for a stream, automatic and available fields are replicated""" + + @staticmethod + def name(): + return "tap_tester_github_all_fields" + + def test_run(self): + """ + Ensure running the tap with all streams and fields selected results in the + replication of all fields. + - Verify no unexpected streams were replicated + - Verify that more than just the automatic fields are replicated for each stream. + """ + # BUG TDL-16672 + # The excluded streams are not honoring all fields selection + excluded_streams = { + 'issue_events', + 'comments', + 'projects', + 'pr_commits', + 'events', + 'review_comments', + 'issues', + 'project_cards', + 'project_columns', + 'commits', + 'collaborators' + } + + expected_streams = self.expected_streams() - excluded_streams + + # instantiate connection + conn_id = connections.ensure_connection(self) + + # run check mode + found_catalogs = self.run_and_verify_check_mode(conn_id) + + # table and field selection + test_catalogs_all_fields = [catalog for catalog in found_catalogs + if catalog.get('stream_name') in expected_streams] + self.perform_and_verify_table_and_field_selection( + conn_id, test_catalogs_all_fields, select_all_fields=True, + ) + + # grab metadata after performing table-and-field selection to set expectations + stream_to_all_catalog_fields = dict() # used for asserting all fields are replicated + for catalog in test_catalogs_all_fields: + stream_id, stream_name = catalog['stream_id'], catalog['stream_name'] + catalog_entry = menagerie.get_annotated_schema(conn_id, stream_id) + fields_from_field_level_md = [md_entry['breadcrumb'][1] + for md_entry in catalog_entry['metadata'] + if md_entry['breadcrumb'] != []] + stream_to_all_catalog_fields[stream_name] = set(fields_from_field_level_md) + + # run initial sync + record_count_by_stream = self.run_and_verify_sync(conn_id) + synced_records = runner.get_records_from_target_output() + + # Verify no unexpected streams were replicated + synced_stream_names = set(synced_records.keys()) + self.assertSetEqual(expected_streams, synced_stream_names) + + for stream in expected_streams: + with self.subTest(stream=stream): + # expected values + expected_automatic_keys = self.expected_primary_keys().get(stream) + + # get all expected keys + expected_all_keys = stream_to_all_catalog_fields[stream] + + # collect actual values + messages = synced_records.get(stream) + actual_all_keys = [set(message['data'].keys()) for message in messages['messages'] + if message['action'] == 'upsert'][0] + + # Verify that you get some records for each stream +
self.assertGreater(record_count_by_stream.get(stream, -1), 0) + + # verify all fields for a stream were replicated + self.assertGreater(len(expected_all_keys), len(expected_automatic_keys)) + self.assertTrue(expected_automatic_keys.issubset(expected_all_keys), msg=f'{expected_automatic_keys-expected_all_keys} is not in "expected_all_keys"') + self.assertSetEqual(expected_all_keys, actual_all_keys) diff --git a/tests/test_github_automatic_fields.py b/tests/test_github_automatic_fields.py new file mode 100644 index 00000000..03ae904f --- /dev/null +++ b/tests/test_github_automatic_fields.py @@ -0,0 +1,73 @@ +""" +Test that with no fields selected for a stream, automatic fields are still replicated +""" +from tap_tester import runner, connections + +from base import TestGithubBase + + +class TestGithubAutomaticFields(TestGithubBase): + """Test that with no fields selected for a stream, automatic fields are still replicated""" + + @staticmethod + def name(): + return "tap_tester_github_automatic_fields" + + def test_run(self): + """ + - Verify that for each stream you can get multiple pages of data + when no fields are selected. + - Verify that only the automatic fields are sent to the target. + - Verify that all replicated records have unique primary key values. + """ + # Exclude collaborators stream due to access issues in circle + expected_streams = self.expected_streams() - {'collaborators'} + + # instantiate connection + conn_id = connections.ensure_connection(self) + + # run check mode + found_catalogs = self.run_and_verify_check_mode(conn_id) + + # table and field selection + test_catalogs_automatic_fields = [catalog for catalog in found_catalogs + if catalog.get('stream_name') in expected_streams] + + self.perform_and_verify_table_and_field_selection( + conn_id, test_catalogs_automatic_fields, select_all_fields=False, + ) + + # run initial sync + record_count_by_stream = self.run_and_verify_sync(conn_id) + synced_records = runner.get_records_from_target_output() + + for stream in expected_streams: + with self.subTest(stream=stream): + # expected values + expected_keys = self.expected_primary_keys().get(stream) + + # collect actual values + data = synced_records.get(stream, {}) + record_messages_keys = [set(row.get('data').keys()) for row in data.get('messages', {})] + primary_keys_list = [ + tuple(message.get('data').get(expected_pk) for expected_pk in expected_keys) + for message in data.get('messages') + if message.get('action') == 'upsert'] + unique_primary_keys_list = set(primary_keys_list) + + # Verify that you get some records for each stream + self.assertGreater( + record_count_by_stream.get(stream, -1), 0, + msg="No records were replicated for the {} stream".format(stream)) + + # Verify that only the automatic fields are sent to the target + for actual_keys in record_messages_keys: + self.assertSetEqual(expected_keys, actual_keys) + + # BUG-TDL-17507 An org can have multiple teams with overlapping membership + if stream != 'team_members': + # Verify that all replicated records have unique primary key values.
+ self.assertEqual( + len(primary_keys_list), + len(unique_primary_keys_list), + msg="Replicated record does not have unique primary key values.") diff --git a/tests/test_github_bookmarks.py b/tests/test_github_bookmarks.py new file mode 100644 index 00000000..3520a9d8 --- /dev/null +++ b/tests/test_github_bookmarks.py @@ -0,0 +1,207 @@ +import datetime +import dateutil.parser +import pytz + +from tap_tester import runner, menagerie, connections + +from base import TestGithubBase + + +class TestGithubBookmarks(TestGithubBase): + @staticmethod + def name(): + return "tap_tester_github_bookmarks" + + @staticmethod + def convert_state_to_utc(date_str): + """ + Convert a saved bookmark value of the form '2020-08-25T13:17:36-07:00' to + a string formatted utc datetime, + in order to compare against json formatted datetime values + """ + date_object = dateutil.parser.parse(date_str) + date_object_utc = date_object.astimezone(tz=pytz.UTC) + return datetime.datetime.strftime(date_object_utc, "%Y-%m-%dT%H:%M:%SZ") + + def calculated_states_by_stream(self, current_state, synced_records, replication_keys): + """ + Look at the bookmarks from a previous sync and set a new bookmark + value based on timedelta expectations. This ensures the subsequent sync will replicate + at least 1 record, but fewer records than the previous sync. + + If the test data is changed in the future this will break expectations for this test. + """ + timedelta_by_stream = {stream: [90,0,0] # {stream_name: [days, hours, minutes], ...} + for stream in self.expected_streams()} + timedelta_by_stream['comments'] = [7, 0, 0] + timedelta_by_stream['commit_comments'] = [0, 0, 1] + timedelta_by_stream['commits'] = [0, 17, 0] + timedelta_by_stream['issue_events'] = [1, 0, 0] + timedelta_by_stream['issue_milestones'] = [0, 1, 0] + timedelta_by_stream['issues'] = [7, 0, 0] + timedelta_by_stream['pull_requests'] = [7, 0, 0] + + repo = self.get_properties().get('repository') + + stream_to_calculated_state = {stream: "" for stream in current_state['bookmarks'][repo].keys()} + for stream, state in current_state['bookmarks'][repo].items(): + state_key, state_value = next(iter(state.keys())), next(iter(state.values())) + sync_messages = [record.get('data') for record in + synced_records.get(stream, {'messages': []}).get('messages') + if record.get('action') == 'upsert'] + + # the `commits` and `pr_commits` streams don't have a top level replication_key field + if stream in ('commits', 'pr_commits'): + max_record_values = [values.get('commit', {}).get('committer', {}).get('date') + for values in sync_messages] + max_value = max(max_record_values) + else: + replication_key = next(iter(replication_keys.get(stream))) + max_record_values = [values.get(replication_key) for values in sync_messages] + max_value = max(max_record_values) + + # this is because the tap uses `time_extracted` to bookmark with `since` at execution + new_state_value = min(max_value, state_value) + state_as_datetime = dateutil.parser.parse(new_state_value) + + days, hours, minutes = timedelta_by_stream[stream] + calculated_state_as_datetime = state_as_datetime - datetime.timedelta(days=days, hours=hours, minutes=minutes) + + state_format = '%Y-%m-%dT%H:%M:%S-00:00' + calculated_state_formatted = datetime.datetime.strftime(calculated_state_as_datetime, state_format) + + stream_to_calculated_state[stream] = {state_key: calculated_state_formatted} + + return stream_to_calculated_state + + + def test_run(self): + # Exclude collaborators stream due to access issues in circle +
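
As a quick sanity check of convert_state_to_utc() above, the sample value from its docstring converts as follows. This is a standalone restatement for illustration, using the same dateutil and pytz dependencies the test file imports.

    import datetime
    import dateutil.parser
    import pytz

    def convert_state_to_utc(date_str):
        # normalize an offset-bearing bookmark to a UTC string for comparisons
        date_object = dateutil.parser.parse(date_str)
        date_object_utc = date_object.astimezone(tz=pytz.UTC)
        return datetime.datetime.strftime(date_object_utc, "%Y-%m-%dT%H:%M:%SZ")

    # 13:17:36 at UTC-7 is 20:17:36 UTC
    assert convert_state_to_utc('2020-08-25T13:17:36-07:00') == '2020-08-25T20:17:36Z'
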
expected_streams = self.expected_streams() - {'collaborators'} + + expected_replication_keys = self.expected_bookmark_keys() + expected_replication_methods = self.expected_replication_method() + + repo = self.get_properties().get('repository') + + ########################################################################## + ### First Sync + ########################################################################## + + conn_id = connections.ensure_connection(self, original_properties=True) + + # Run in check mode + found_catalogs = self.run_and_verify_check_mode(conn_id) + + # Select only the expected streams' tables + catalog_entries = [ce for ce in found_catalogs if ce['tap_stream_id'] in expected_streams] + self.perform_and_verify_table_and_field_selection(conn_id, catalog_entries, select_all_fields=True) + + # Run a sync job using orchestrator + first_sync_record_count = self.run_and_verify_sync(conn_id) + first_sync_records = runner.get_records_from_target_output() + first_sync_bookmarks = menagerie.get_state(conn_id) + + ########################################################################## + ### Update State Between Syncs + ########################################################################## + + new_states = {'bookmarks': dict()} + simulated_states = self.calculated_states_by_stream(first_sync_bookmarks, + first_sync_records, expected_replication_keys) + for stream, new_state in simulated_states.items(): + new_states['bookmarks'][stream] = new_state + menagerie.set_state(conn_id, new_states) + + ########################################################################## + ### Second Sync + ########################################################################## + + second_sync_record_count = self.run_and_verify_sync(conn_id) + second_sync_records = runner.get_records_from_target_output() + second_sync_bookmarks = menagerie.get_state(conn_id) + + ########################################################################## + ### Test By Stream + ########################################################################## + + for stream in expected_streams: + with self.subTest(stream=stream): + + # expected values + expected_replication_method = expected_replication_methods[stream] + + # collect information for assertions from syncs 1 & 2 based on expected values + first_sync_count = first_sync_record_count.get(stream, 0) + second_sync_count = second_sync_record_count.get(stream, 0) + first_sync_messages = [record.get('data') for record in + first_sync_records.get(stream, {'messages': []}).get('messages') + if record.get('action') == 'upsert'] + second_sync_messages = [record.get('data') for record in + second_sync_records.get(stream, {'messages': []}).get('messages') + if record.get('action') == 'upsert'] + first_bookmark_key_value = first_sync_bookmarks.get('bookmarks', {}).get(repo, {stream: None}).get(stream) + second_bookmark_key_value = second_sync_bookmarks.get('bookmarks', {}).get(repo, {stream: None}).get(stream) + + + if expected_replication_method == self.INCREMENTAL: + # collect information specific to incremental streams from syncs 1 & 2 + replication_key = next(iter(expected_replication_keys[stream])) + first_bookmark_value = first_bookmark_key_value.get('since') + second_bookmark_value = second_bookmark_key_value.get('since') + first_bookmark_value_utc = self.convert_state_to_utc(first_bookmark_value) + second_bookmark_value_utc = self.convert_state_to_utc(second_bookmark_value) + + # Verify the first sync sets a bookmark of the expected form +
self.assertIsNotNone(first_bookmark_key_value) + self.assertIsNotNone(first_bookmark_key_value.get('since')) + + # Verify the second sync sets a bookmark of the expected form + self.assertIsNotNone(second_bookmark_key_value) + self.assertIsNotNone(second_bookmark_key_value.get('since')) + + # Verify the second sync bookmark is equal to or greater than the first sync bookmark + # the tap uses `time_extracted` and sets a bookmark using `since` for all real/pseudo incremental streams + self.assertGreaterEqual(second_bookmark_value, first_bookmark_value) + + for record in second_sync_messages: + # Verify the second sync bookmark value is the max replication key value for a given stream + if stream in ('commits', 'pr_commits'): + replication_key_value = record.get('commit', {}).get('committer', {}).get('date') + else: + replication_key_value = record.get(replication_key) + self.assertLessEqual( + replication_key_value, second_bookmark_value_utc, + msg="Second sync bookmark was set incorrectly, a record with a greater replication-key value was synced." + ) + + for record in first_sync_messages: + # Verify the first sync bookmark value is the max replication key value for a given stream + if stream in ('commits', 'pr_commits'): + replication_key_value = record.get('commit', {}).get('committer', {}).get('date') + else: + replication_key_value = record.get(replication_key) + self.assertLessEqual( + replication_key_value, first_bookmark_value_utc, + msg="First sync bookmark was set incorrectly, a record with a greater replication-key value was synced." + ) + + # Verify the number of records in the 2nd sync is less than the first + self.assertLessEqual(second_sync_count, first_sync_count) + + + elif expected_replication_method == self.FULL: + # Verify the syncs do not set a bookmark for full table streams + self.assertIsNone(first_bookmark_key_value) + self.assertIsNone(second_bookmark_key_value) + + # Verify the number of records in the second sync is the same as the first + self.assertEqual(second_sync_count, first_sync_count) + + else: + raise NotImplementedError( + "INVALID EXPECTATIONS\t\tSTREAM: {} REPLICATION_METHOD: {}".format(stream, expected_replication_method) + ) + + # Verify at least 1 record was replicated in the second sync + self.assertGreater(second_sync_count, 0, msg="We are not fully testing bookmarking for {}".format(stream)) diff --git a/tests/test_github_start_date.py b/tests/test_github_start_date.py index e48ab584..34065255 100644 --- a/tests/test_github_start_date.py +++ b/tests/test_github_start_date.py @@ -18,7 +18,7 @@ def name(): def generate_data(self): # get the token token = os.getenv("TAP_GITHUB_TOKEN") - url = "https://api.github.com/user/starred/singer-io/tap-github" + url = "https://api.github.com/user/starred/singer-io/test-repo" headers = {"Authorization": "Bearer {}".format(token)} # generate a data for 'events' stream: 'watchEvent' ie.
star the repo @@ -33,13 +33,13 @@ def test_run(self): # run the test for all the streams excluding 'events' stream # as for 'events' stream we have to use dynamic dates - self.run_test('2020-04-01T00:00:00Z', '2021-06-10T00:00:00Z', self.expected_streams() - {'events'}) + self.run_test('2020-04-01T00:00:00Z', '2021-10-08T00:00:00Z', self.expected_streams() - {'events'}) # As per the Documentation: https://docs.github.com/en/rest/reference/activity#events # the 'events' of past 90 days will only be returned # if there are no events in past 90 days, then there will be '304 Not Modified' error today = datetime.today() - date_1 = datetime.strftime(today - timedelta(days=4), "%Y-%m-%dT00:00:00Z") + date_1 = datetime.strftime(today - timedelta(days=90), "%Y-%m-%dT00:00:00Z") date_2 = datetime.strftime(today - timedelta(days=1), "%Y-%m-%dT00:00:00Z") # run the test for 'events' stream self.run_test(date_1, date_2, {'events'}) @@ -126,11 +126,11 @@ def run_test(self, date_1, date_2, streams): # collect information for assertions from syncs 1 & 2 base on expected values record_count_sync_1 = record_count_by_stream_1.get(stream, 0) record_count_sync_2 = record_count_by_stream_2.get(stream, 0) - primary_keys_list_1 = [tuple(message.get('data').get(expected_pk) for expected_pk in expected_primary_keys) - for message in synced_records_1.get(stream).get('messages') + primary_keys_list_1 = [tuple(message.get('data', {}).get(expected_pk) for expected_pk in expected_primary_keys) + for message in synced_records_1.get(stream, {'messages': []}).get('messages') if message.get('action') == 'upsert'] - primary_keys_list_2 = [tuple(message.get('data').get(expected_pk) for expected_pk in expected_primary_keys) - for message in synced_records_2.get(stream).get('messages') + primary_keys_list_2 = [tuple(message.get('data', {}).get(expected_pk) for expected_pk in expected_primary_keys) + for message in synced_records_2.get(stream, {'messages': []}).get('messages') if message.get('action') == 'upsert'] primary_keys_sync_1 = set(primary_keys_list_1) diff --git a/tests/unittests/test_exception_handling.py b/tests/unittests/test_exception_handling.py index 8036e093..e2c86120 100644 --- a/tests/unittests/test_exception_handling.py +++ b/tests/unittests/test_exception_handling.py @@ -32,7 +32,7 @@ def test_zero_content_length(self, mocked_parse_args, mocked_request): try: tap_github.authed_get("", "") except tap_github.BadRequestException as e: - self.assertEquals(str(e), "HTTP-error-code: 400, Error: The request is missing or has a bad parameter.") + self.assertEqual(str(e), "HTTP-error-code: 400, Error: The request is missing or has a bad parameter.") def test_400_error(self, mocked_parse_args, mocked_request): mocked_request.return_value = get_response(400, raise_error = True) @@ -40,7 +40,7 @@ def test_400_error(self, mocked_parse_args, mocked_request): try: tap_github.authed_get("", "") except tap_github.BadRequestException as e: - self.assertEquals(str(e), "HTTP-error-code: 400, Error: The request is missing or has a bad parameter.") + self.assertEqual(str(e), "HTTP-error-code: 400, Error: The request is missing or has a bad parameter.") def test_401_error(self, mocked_parse_args, mocked_request): mocked_request.return_value = get_response(401, raise_error = True) @@ -48,7 +48,7 @@ def test_401_error(self, mocked_parse_args, mocked_request): try: tap_github.authed_get("", "") except tap_github.BadCredentialsException as e: - self.assertEquals(str(e), "HTTP-error-code: 401, Error: Invalid authorization credentials.") 
+ self.assertEqual(str(e), "HTTP-error-code: 401, Error: Invalid authorization credentials.") def test_403_error(self, mocked_parse_args, mocked_request): mocked_request.return_value = get_response(403, raise_error = True) @@ -56,7 +56,7 @@ def test_403_error(self, mocked_parse_args, mocked_request): try: tap_github.authed_get("", "") except tap_github.AuthException as e: - self.assertEquals(str(e), "HTTP-error-code: 403, Error: User doesn't have permission to access the resource.") + self.assertEqual(str(e), "HTTP-error-code: 403, Error: User doesn't have permission to access the resource.") def test_404_error(self, mocked_parse_args, mocked_request): json = {"message": "Not Found", "documentation_url": "https:/docs.github.com/"} @@ -65,7 +65,7 @@ def test_404_error(self, mocked_parse_args, mocked_request): try: tap_github.authed_get("", "") except tap_github.NotFoundException as e: - self.assertEquals(str(e), "HTTP-error-code: 404, Error: The resource you have specified cannot be found. Please refer '{}' for more details.".format(json.get("documentation_url"))) + self.assertEqual(str(e), "HTTP-error-code: 404, Error: The resource you have specified cannot be found. Please refer '{}' for more details.".format(json.get("documentation_url"))) def test_404_error_for_teams(self, mocked_parse_args, mocked_request): json = {"message": "Not Found", "documentation_url": "https:/docs.github.com/"} @@ -73,7 +73,7 @@ def test_404_error_for_teams(self, mocked_parse_args, mocked_request): try: tap_github.raise_for_error(get_response(404, json = json, raise_error = True), "teams") except tap_github.NotFoundException as e: - self.assertEquals(str(e), "HTTP-error-code: 404, Error: The resource you have specified cannot be found or it is a personal account repository. Please refer '{}' for more details.".format(json.get("documentation_url"))) + self.assertEqual(str(e), "HTTP-error-code: 404, Error: The resource you have specified cannot be found or it is a personal account repository. 
Please refer '{}' for more details.".format(json.get("documentation_url"))) def test_500_error(self, mocked_parse_args, mocked_request): mocked_request.return_value = get_response(500, raise_error = True) @@ -81,7 +81,7 @@ def test_500_error(self, mocked_parse_args, mocked_request): try: tap_github.authed_get("", "") except tap_github.InternalServerError as e: - self.assertEquals(str(e), "HTTP-error-code: 500, Error: An error has occurred at Github's end.") + self.assertEqual(str(e), "HTTP-error-code: 500, Error: An error has occurred at Github's end.") def test_301_error(self, mocked_parse_args, mocked_request): mocked_request.return_value = get_response(301, raise_error = True) @@ -89,7 +89,7 @@ def test_301_error(self, mocked_parse_args, mocked_request): try: tap_github.authed_get("", "") except tap_github.MovedPermanentlyError as e: - self.assertEquals(str(e), "HTTP-error-code: 301, Error: The resource you are looking for is moved to another URL.") + self.assertEqual(str(e), "HTTP-error-code: 301, Error: The resource you are looking for is moved to another URL.") def test_304_error(self, mocked_parse_args, mocked_request): mocked_request.return_value = get_response(304, raise_error = True) @@ -97,7 +97,7 @@ def test_304_error(self, mocked_parse_args, mocked_request): try: tap_github.authed_get("", "") except tap_github.NotModifiedError as e: - self.assertEquals(str(e), "HTTP-error-code: 304, Error: The requested resource has not been modified since the last time you accessed it.") + self.assertEqual(str(e), "HTTP-error-code: 304, Error: The requested resource has not been modified since the last time you accessed it.") def test_422_error(self, mocked_parse_args, mocked_request): mocked_request.return_value = get_response(422, raise_error = True) @@ -105,7 +105,7 @@ def test_422_error(self, mocked_parse_args, mocked_request): try: tap_github.authed_get("", "") except tap_github.UnprocessableError as e: - self.assertEquals(str(e), "HTTP-error-code: 422, Error: The request was not able to process right now.") + self.assertEqual(str(e), "HTTP-error-code: 422, Error: The request was not able to process right now.") def test_409_error(self, mocked_parse_args, mocked_request): mocked_request.return_value = get_response(409, raise_error = True) @@ -113,11 +113,11 @@ def test_409_error(self, mocked_parse_args, mocked_request): try: tap_github.authed_get("", "") except tap_github.ConflictError as e: - self.assertEquals(str(e), "HTTP-error-code: 409, Error: The request could not be completed due to a conflict with the current state of the server.") + self.assertEqual(str(e), "HTTP-error-code: 409, Error: The request could not be completed due to a conflict with the current state of the server.") def test_200_success(self, mocked_parse_args, mocked_request): json = {"key": "value"} mocked_request.return_value = get_response(200, json) resp = tap_github.authed_get("", "") - self.assertEquals(json, resp.json()) + self.assertEqual(json, resp.json()) diff --git a/tests/unittests/test_extract_repos_from_config.py b/tests/unittests/test_extract_repos_from_config.py new file mode 100644 index 00000000..4a205696 --- /dev/null +++ b/tests/unittests/test_extract_repos_from_config.py @@ -0,0 +1,32 @@ +import unittest +import tap_github + + +@unittest.mock.patch('tap_github.get_all_repos') +class TestExtractReposFromConfig(unittest.TestCase): + + def test_single_repo(self, mocked_get_all_repos): + config = {'repository': 'singer-io/test-repo'} + expected_repositories = ['singer-io/test-repo'] + 
self.assertEqual(expected_repositories, tap_github.extract_repos_from_config(config)) + + def test_multiple_repos(self, mocked_get_all_repos): + config = {'repository': 'singer-io/test-repo singer-io/tap-github'} + expected_repositories = ['singer-io/test-repo', 'singer-io/tap-github'] + self.assertEqual(expected_repositories, tap_github.extract_repos_from_config(config)) + + def test_org_all_repos(self, mocked_get_all_repos): + config = {'repository': 'singer-io/test-repo test-org/*'} + expected_repositories = [ + 'singer-io/test-repo', + 'test-org/repo1', + 'test-org/repo2', + 'test-org/repo3' + ] + mocked_get_all_repos.return_value = [ + 'test-org/repo1', + 'test-org/repo2', + 'test-org/repo3' + ] + + self.assertEqual(expected_repositories, tap_github.extract_repos_from_config(config)) diff --git a/tests/unittests/test_formatting_dates.py b/tests/unittests/test_formatting_dates.py index 701b2714..72a70925 100644 --- a/tests/unittests/test_formatting_dates.py +++ b/tests/unittests/test_formatting_dates.py @@ -91,7 +91,7 @@ def test_due_on_not_none_2(self, mocked_request): final_state = tap_github.get_all_issue_milestones({}, repo_path, init_state, {}, "") # as we will get 0 records, initial and final bookmark will be same - self.assertEquals(init_bookmark, final_state["bookmarks"][repo_path]["issue_milestones"]["since"]) + self.assertEqual(init_bookmark, final_state["bookmarks"][repo_path]["issue_milestones"]["since"]) @mock.patch("singer.write_record") def test_data_containing_both_values(self, mocked_write_record, mocked_request): @@ -117,4 +117,4 @@ def test_data_containing_both_values(self, mocked_write_record, mocked_request): # as we will get 2 record, final bookmark will be greater than initial bookmark self.assertGreater(last_bookmark, init_bookmark) # as we will get 2 record, write_records will also be called 2 times - self.assertEquals(mocked_write_record.call_count, 2) + self.assertEqual(mocked_write_record.call_count, 2) diff --git a/tests/unittests/test_get_all_repos.py b/tests/unittests/test_get_all_repos.py new file mode 100644 index 00000000..c8ca7a0b --- /dev/null +++ b/tests/unittests/test_get_all_repos.py @@ -0,0 +1,74 @@ +import unittest +import requests +import requests_mock +import simplejson as json + +import tap_github + +from itertools import cycle + + +SESSION = requests.Session() +ADAPTER = requests_mock.Adapter() +SESSION.mount('mock://', ADAPTER) + + +@unittest.mock.patch('tap_github.verify_repo_access') +@unittest.mock.patch('tap_github.authed_get_all_pages') +class TestGetAllRepos(unittest.TestCase): + + def test_single_organization(self, mocked_authed_get_all_pages, mocked_verify_repo_access): + orgs = ['test-org/*'] + repos = ['repo1', 'repo2', 'repo3'] + + mocked_url = 'mock://github.com/orgs/test-org/repos' + mocked_response_body = [ + {'full_name': ''.join(r).replace('*', '')} for r in zip(cycle(orgs), repos) + ] + mocked_response_text = json.dumps(mocked_response_body) + ADAPTER.register_uri( + 'GET', + mocked_url, + text=mocked_response_text) + mocked_response = SESSION.get(mocked_url) + + expected_repositories = [ + 'test-org/repo1', + 'test-org/repo2', + 'test-org/repo3' + ] + mocked_authed_get_all_pages.return_value = [mocked_response] + + self.assertEqual(expected_repositories, tap_github.get_all_repos(orgs)) + + def test_multiple_organizations(self, mocked_authed_get_all_pages, mocked_verify_repo_access): + orgs = ['test-org/*', 'singer-io/*'] + repos = ['repo1', 'repo2', 'repo3'] + + mocked_url = 'mock://github.com/orgs/test-org/repos' + 
side_effect = [] + for org in orgs: + mocked_response_body = [ + {'full_name': ''.join(r).replace('*', '')} for r in zip(cycle([org]), repos) + ] + ADAPTER.register_uri( + 'GET', + mocked_url, + text=json.dumps(mocked_response_body)) + mocked_response = SESSION.get(mocked_url) + mocked_authed_get_all_pages.return_value = [mocked_response] + + call_response = tap_github.get_all_repos([org]) + + side_effect.extend(call_response) + + expected_repositories = [ + 'test-org/repo1', + 'test-org/repo2', + 'test-org/repo3', + 'singer-io/repo1', + 'singer-io/repo2', + 'singer-io/repo3' + ] + + self.assertListEqual(expected_repositories, side_effect) diff --git a/tests/unittests/test_key_error.py b/tests/unittests/test_key_error.py index ab23600d..7e5bb28c 100644 --- a/tests/unittests/test_key_error.py +++ b/tests/unittests/test_key_error.py @@ -23,21 +23,22 @@ def test_slug_sub_stream_selected_slug_selected(self, mocked_team_members, mocke mocked_request.return_value = get_response(json) schemas = {"teams": "None", "team_members": "None"} - mdata =[ + mdata_slug = [ { 'breadcrumb': [], 'metadata': {'selected': True, 'table-key-properties': ['id']} - }, + }, { 'breadcrumb': ['properties', 'slug'], 'metadata': {'inclusion': 'available'} - }, + }, { "breadcrumb": [ "properties", "name"], "metadata": {"inclusion": "available"} }] + mdata = {"teams": mdata_slug, "team_members": mdata_slug} tap_github.get_all_teams(schemas, "tap-github", {}, mdata, "") - self.assertEquals(mocked_team_members.call_count, 1) + self.assertEqual(mocked_team_members.call_count, 1) @mock.patch("tap_github.__init__.get_all_team_members") def test_slug_sub_stream_not_selected_slug_selected(self, mocked_team_members, mocked_request): @@ -46,9 +47,9 @@ def test_slug_sub_stream_not_selected_slug_selected(self, mocked_team_members, m mocked_request.return_value = get_response(json) schemas = {"teams": "None"} - mdata =[ + mdata = {"teams": [ { - 'breadcrumb': [], + 'breadcrumb': [], 'metadata': {'selected': True, 'table-key-properties': ['id']} }, { @@ -58,9 +59,9 @@ def test_slug_sub_stream_not_selected_slug_selected(self, mocked_team_members, m { "breadcrumb": [ "properties", "name"], "metadata": {"inclusion": "available"} - }] + }]} tap_github.get_all_teams(schemas, "tap-github", {}, mdata, "") - self.assertEquals(mocked_team_members.call_count, 0) + self.assertEqual(mocked_team_members.call_count, 0) @mock.patch("tap_github.__init__.get_all_team_members") def test_slug_sub_stream_selected_slug_not_selected(self, mocked_team_members, mocked_request): @@ -69,7 +70,7 @@ def test_slug_sub_stream_selected_slug_not_selected(self, mocked_team_members, m mocked_request.return_value = get_response(json) schemas = {"teams": "None", "team_members": "None"} - mdata =[ + mdata_slug = [ { 'breadcrumb': [], 'metadata': {'selected': True, 'table-key-properties': ['id']} @@ -82,8 +83,9 @@ def test_slug_sub_stream_selected_slug_not_selected(self, mocked_team_members, m "breadcrumb": [ "properties", "name"], "metadata": {"inclusion": "available"} }] + mdata = {"teams": mdata_slug, "team_members": mdata_slug} tap_github.get_all_teams(schemas, "tap-github", {}, mdata, "") - self.assertEquals(mocked_team_members.call_count, 1) + self.assertEqual(mocked_team_members.call_count, 1) @mock.patch("tap_github.__init__.get_all_team_members") def test_slug_sub_stream_not_selected_slug_not_selected(self, mocked_team_members, mocked_request): @@ -92,7 +94,7 @@ def test_slug_sub_stream_not_selected_slug_not_selected(self, mocked_team_member 
mocked_request.return_value = get_response(json) schemas = {"teams": "None"} - mdata =[ + mdata = {"teams": [ { 'breadcrumb': [], 'metadata': {'selected': True, 'table-key-properties': ['id']} @@ -104,9 +106,9 @@ def test_slug_sub_stream_not_selected_slug_not_selected(self, mocked_team_member { "breadcrumb": [ "properties", "name"], "metadata": {"inclusion": "available"} - }] + }]} tap_github.get_all_teams(schemas, "tap-github", {}, mdata, "") - self.assertEquals(mocked_team_members.call_count, 0) + self.assertEqual(mocked_team_members.call_count, 0) @mock.patch("tap_github.__init__.authed_get_all_pages") class TestKeyErrorUser(unittest.TestCase): @@ -118,7 +120,7 @@ def test_user_not_selected_in_stargazers(self, mocked_write_records, mocked_requ mocked_request.return_value = get_response(json) schemas = {"teams": "None"} - mdata =[ + mdata = [ { 'breadcrumb': [], 'metadata': {'selected': True, 'table-key-properties': ['user_id']} @@ -132,7 +134,7 @@ def test_user_not_selected_in_stargazers(self, mocked_write_records, mocked_requ "metadata": {"inclusion": "available"} }] tap_github.get_all_stargazers(schemas, "tap-github", {}, mdata, "") - self.assertEquals(mocked_write_records.call_count, 1) + self.assertEqual(mocked_write_records.call_count, 1) @mock.patch("singer.write_record") def test_user_selected_in_stargazers(self, mocked_write_records, mocked_request): @@ -141,7 +143,7 @@ def test_user_selected_in_stargazers(self, mocked_write_records, mocked_request) mocked_request.return_value = get_response(json) schemas = {"stargazers": "None"} - mdata =[ + mdata = [ { 'breadcrumb': [], 'metadata': {'selected': True, 'table-key-properties': ['user_id']} @@ -155,4 +157,4 @@ def test_user_selected_in_stargazers(self, mocked_write_records, mocked_request) "metadata": {"inclusion": "available"} }] tap_github.get_all_stargazers(schemas, "tap-github", {}, mdata, "") - self.assertEquals(mocked_write_records.call_count, 1) + self.assertEqual(mocked_write_records.call_count, 1) diff --git a/tests/unittests/test_rate_limit.py b/tests/unittests/test_rate_limit.py index d335a1e5..7fb01873 100644 --- a/tests/unittests/test_rate_limit.py +++ b/tests/unittests/test_rate_limit.py @@ -36,7 +36,7 @@ def test_rate_limit_exception(self, mocked_sleep): try: tap_github.rate_throttling(resp) except tap_github.RateLimitExceeded as e: - self.assertEquals(str(e), "API rate limit exceeded, please try after 601 seconds.") + self.assertEqual(str(e), "API rate limit exceeded, please try after 601 seconds.") def test_rate_limit_not_exceeded(self, mocked_sleep): diff --git a/tests/unittests/test_start_date_bookmark.py b/tests/unittests/test_start_date_bookmark.py index 6489b1d2..8cfb4b18 100644 --- a/tests/unittests/test_start_date_bookmark.py +++ b/tests/unittests/test_start_date_bookmark.py @@ -12,7 +12,7 @@ def test_no_bookmark_no_start_date(self, mocked_get_bookmark): bookmark_key = 'since' expected_bookmark_value = None - self.assertEquals(expected_bookmark_value, tap_github.get_bookmark('', '', '', bookmark_key, start_date)) + self.assertEqual(expected_bookmark_value, tap_github.get_bookmark('', '', '', bookmark_key, start_date)) def test_no_bookmark_yes_start_date(self, mocked_get_bookmark): # Start date is present and bookmark is not present then start date should be return. 
@@ -21,7 +21,7 @@ def test_no_bookmark_yes_start_date(self, mocked_get_bookmark):
         bookmark_key = 'since'
 
         expected_bookmark_value = '2021-04-01T00:00:00.000000Z'
-        self.assertEquals(expected_bookmark_value, tap_github.get_bookmark('', '', '', bookmark_key, start_date))
+        self.assertEqual(expected_bookmark_value, tap_github.get_bookmark('', '', '', bookmark_key, start_date))
 
     def test_yes_bookmark_yes_start_date(self, mocked_get_bookmark):
         # Start date and bookmark both are present then bookmark should be return.
@@ -30,7 +30,7 @@ def test_yes_bookmark_yes_start_date(self, mocked_get_bookmark):
         bookmark_key = 'since'
 
         expected_bookmark_value = '2021-05-01T00:00:00.000000Z'
-        self.assertEquals(expected_bookmark_value, tap_github.get_bookmark('', '', '', bookmark_key, start_date))
+        self.assertEqual(expected_bookmark_value, tap_github.get_bookmark('', '', '', bookmark_key, start_date))
 
     def test_yes_bookmark_no_start_date(self, mocked_get_bookmark):
         # Start date is not present and bookmark is present then bookmark should be return.
@@ -39,4 +39,4 @@ def test_yes_bookmark_no_start_date(self, mocked_get_bookmark):
         bookmark_key = 'since'
 
         expected_bookmark_value = '2021-05-01T00:00:00.000000Z'
-        self.assertEquals(expected_bookmark_value, tap_github.get_bookmark('', '', '', bookmark_key, start_date))
+        self.assertEqual(expected_bookmark_value, tap_github.get_bookmark('', '', '', bookmark_key, start_date))
diff --git a/tests/unittests/test_sub_streams_selection.py b/tests/unittests/test_sub_streams_selection.py
index 329f88b2..8dd16ff9 100644
--- a/tests/unittests/test_sub_streams_selection.py
+++ b/tests/unittests/test_sub_streams_selection.py
@@ -12,7 +12,7 @@ def test_pull_request_sub_streams_not_selected(self):
         try:
             tap_github.validate_dependencies(selected_streams)
         except tap_github.DependencyException as e:
-            self.assertEquals(str(e), "Unable to extract 'reviews' data, to receive 'reviews' data, you also need to select 'pull_requests'. Unable to extract 'pr_commits' data, to receive 'pr_commits' data, you also need to select 'pull_requests'.")
+            self.assertEqual(str(e), "Unable to extract 'reviews' data, to receive 'reviews' data, you also need to select 'pull_requests'. Unable to extract 'pr_commits' data, to receive 'pr_commits' data, you also need to select 'pull_requests'.")
 
     def test_teams_sub_streams_selected(self):
         selected_streams = ["teams", "team_members"]
@@ -23,7 +23,7 @@ def test_teams_sub_streams_not_selected(self):
         try:
             tap_github.validate_dependencies(selected_streams)
         except tap_github.DependencyException as e:
-            self.assertEquals(str(e), "Unable to extract 'team_members' data, to receive 'team_members' data, you also need to select 'teams'.")
+            self.assertEqual(str(e), "Unable to extract 'team_members' data, to receive 'team_members' data, you also need to select 'teams'.")
 
     def test_projects_sub_streams_selected(self):
         selected_streams = ["projects", "project_cards"]
@@ -34,7 +34,7 @@ def test_projects_sub_streams_not_selected(self):
         try:
             tap_github.validate_dependencies(selected_streams)
         except tap_github.DependencyException as e:
-            self.assertEquals(str(e), "Unable to extract 'project_columns' data, to receive 'project_columns' data, you also need to select 'projects'.")
+            self.assertEqual(str(e), "Unable to extract 'project_columns' data, to receive 'project_columns' data, you also need to select 'projects'.")
 
     def test_mixed_streams_positive(self):
         selected_streams = ["pull_requests", "reviews", "collaborators", "team_members", "stargazers", "projects", "teams", "project_cards"]
@@ -45,4 +45,4 @@ def test_mixed_streams_negative(self):
         try:
             tap_github.validate_dependencies(selected_streams)
         except tap_github.DependencyException as e:
-            self.assertEquals(str(e), "Unable to extract 'review_comments' data, to receive 'review_comments' data, you also need to select 'pull_requests'.")
+            self.assertEqual(str(e), "Unable to extract 'review_comments' data, to receive 'review_comments' data, you also need to select 'pull_requests'.")
diff --git a/tests/unittests/test_verify_access.py b/tests/unittests/test_verify_access.py
index 72f868de..1e00df32 100644
--- a/tests/unittests/test_verify_access.py
+++ b/tests/unittests/test_verify_access.py
@@ -34,7 +34,7 @@ def test_repo_not_found(self, mocked_parse_args, mocked_request):
         try:
             tap_github.verify_repo_access("", "repo")
         except tap_github.NotFoundException as e:
-            self.assertEquals(str(e), "HTTP-error-code: 404, Error: Please check the repository name 'repo' or you do not have sufficient permissions to access this repository.")
+            self.assertEqual(str(e), "HTTP-error-code: 404, Error: Please check the repository name 'repo' or you do not have sufficient permissions to access this repository.")
 
     def test_repo_bad_request(self, mocked_parse_args, mocked_request):
         mocked_request.return_value = get_response(400, raise_error = True)
@@ -42,7 +42,7 @@ def test_repo_bad_request(self, mocked_parse_args, mocked_request):
         try:
             tap_github.verify_repo_access("", "repo")
         except tap_github.BadRequestException as e:
-            self.assertEquals(str(e), "HTTP-error-code: 400, Error: The request is missing or has a bad parameter.")
+            self.assertEqual(str(e), "HTTP-error-code: 400, Error: The request is missing or has a bad parameter.")
 
     def test_repo_bad_creds(self, mocked_parse_args, mocked_request):
         json = {"message": "Bad credentials", "documentation_url": "https://docs.github.com/"}
@@ -51,7 +51,7 @@ def test_repo_bad_creds(self, mocked_parse_args, mocked_request):
         mocked_request.return_value = get_response(401, json, True)
 
         try:
            tap_github.verify_repo_access("", "repo")
         except tap_github.BadCredentialsException as e:
-            self.assertEquals(str(e), "HTTP-error-code: 401, Error: {}".format(json))
+            self.assertEqual(str(e), "HTTP-error-code: 401, Error: {}".format(json))
 
@mock.patch("tap_github.get_catalog") def test_discover_valid_creds(self, mocked_get_catalog, mocked_parse_args, mocked_request): @@ -71,8 +71,8 @@ def test_discover_not_found(self, mocked_get_catalog, mocked_parse_args, mocked_ try: tap_github.do_discover({"access_token": "access_token", "repository": "org/repo"}) except tap_github.NotFoundException as e: - self.assertEquals(str(e), "HTTP-error-code: 404, Error: Please check the repository name 'org/repo' or you do not have sufficient permissions to access this repository.") - self.assertEqual(mocked_get_catalog.call_count, 0) + self.assertEqual(str(e), "HTTP-error-code: 404, Error: Please check the repository name org/repo or you do not have sufficient permissions to access this repository.") + self.assertEqual(mocked_get_catalog.call_count, 1) @mock.patch("tap_github.get_catalog") def test_discover_bad_request(self, mocked_get_catalog, mocked_parse_args, mocked_request): @@ -82,7 +82,7 @@ def test_discover_bad_request(self, mocked_get_catalog, mocked_parse_args, mocke try: tap_github.do_discover({"access_token": "access_token", "repository": "org/repo"}) except tap_github.BadRequestException as e: - self.assertEquals(str(e), "HTTP-error-code: 400, Error: The request is missing or has a bad parameter.") + self.assertEqual(str(e), "HTTP-error-code: 400, Error: The request is missing or has a bad parameter.") self.assertEqual(mocked_get_catalog.call_count, 0) @mock.patch("tap_github.get_catalog") @@ -94,7 +94,7 @@ def test_discover_bad_creds(self, mocked_get_catalog, mocked_parse_args, mocked_ try: tap_github.do_discover({"access_token": "access_token", "repository": "org/repo"}) except tap_github.BadCredentialsException as e: - self.assertEquals(str(e), "HTTP-error-code: 401, Error: {}".format(json)) + self.assertEqual(str(e), "HTTP-error-code: 401, Error: {}".format(json)) self.assertEqual(mocked_get_catalog.call_count, 0) @mock.patch("tap_github.get_catalog") @@ -106,7 +106,7 @@ def test_discover_forbidden(self, mocked_get_catalog, mocked_parse_args, mocked_ try: tap_github.do_discover({"access_token": "access_token", "repository": "org/repo"}) except tap_github.AuthException as e: - self.assertEquals(str(e), "HTTP-error-code: 403, Error: {}".format(json)) + self.assertEqual(str(e), "HTTP-error-code: 403, Error: {}".format(json)) self.assertEqual(mocked_get_catalog.call_count, 0) @@ -123,5 +123,5 @@ def test_repo_call_count(self, mocked_repo, mocked_logger_info): config = {"access_token": "access_token", "repository": "org1/repo1 org1/repo2 org2/repo1"} tap_github.verify_access_for_repo(config) - self.assertEquals(mocked_logger_info.call_count, 3) - self.assertEquals(mocked_repo.call_count, 3) + self.assertEqual(mocked_logger_info.call_count, 3) + self.assertEqual(mocked_repo.call_count, 3)
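-- 
A note on the `org/*` wildcard pinned down by test_extract_repos_from_config.py
and test_get_all_repos.py above: entries in the space-separated `repository`
config value that end in `/*` are expanded into one entry per repository of
that organization. The sketch below is illustrative only, assuming nothing
beyond what these tests assert; the stubbed get_all_repos() stands in for the
tap's real helper, which pages through GitHub's "list organization
repositories" endpoint.

    # Illustrative sketch, not the tap's actual code. The stub below fakes
    # the per-org API call that the real get_all_repos() performs.
    def get_all_repos(organizations):
        # e.g. 'test-org/*' -> ['test-org/repo1', 'test-org/repo2', 'test-org/repo3']
        return [org.replace('/*', '/repo{}'.format(i))
                for org in organizations for i in (1, 2, 3)]

    def extract_repos_from_config(config):
        # 'repository' holds space-separated 'org/repo' and 'org/*' entries
        repo_paths = config['repository'].split()
        orgs = [path for path in repo_paths if path.endswith('/*')]
        repos = [path for path in repo_paths if not path.endswith('/*')]
        # plain repos come first, then the expanded wildcards, matching
        # the ordering asserted in test_org_all_repos above
        return repos + get_all_repos(orgs)

    print(extract_repos_from_config({'repository': 'singer-io/test-repo test-org/*'}))
    # ['singer-io/test-repo', 'test-org/repo1', 'test-org/repo2', 'test-org/repo3']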