simplify upload all script
lmossman committed Sep 27, 2023
1 parent 95eb832 commit 18c208c
Showing 4 changed files with 1,077 additions and 347 deletions.
@@ -6,9 +6,10 @@

import click
from metadata_service.constants import METADATA_FILE_NAME
from metadata_service.gcs_upload import MetadataUploadInfo, upload_metadata_to_gcs, upload_all_docs_to_gcs
from metadata_service.gcs_upload import MetadataUploadInfo, upload_metadata_to_gcs
from metadata_service.validators.metadata_validator import PRE_UPLOAD_VALIDATORS, ValidatorOptions, validate_and_load
from pydantic import ValidationError
from connector_ops.utils import get_all_connectors_in_repo


def log_metadata_upload_info(metadata_upload_info: MetadataUploadInfo):
@@ -62,12 +63,15 @@ def upload(metadata_file_path: pathlib.Path, docs_path: pathlib.Path, bucket_nam
        exit(5)

@metadata_service.command(help="Upload docs for all connectors to a GCS bucket.")
@click.argument("connectors-dir", type=click.Path(exists=True, path_type=pathlib.Path))
@click.argument("airbyte-repo-path", type=click.Path(exists=True, path_type=pathlib.Path))
@click.argument("docs-dir", type=click.Path(exists=True, path_type=pathlib.Path))
@click.argument("bucket-name", type=click.STRING)
def upload_all_docs(connectors_dir: pathlib.Path, docs_dir: pathlib.Path, bucket_name: str):
    print("connectors_dir: ", connectors_dir)
    print("docs_dir: ", docs_dir)
    print("bucket_name: ", bucket_name)
    upload_all_docs_to_gcs(connectors_dir, docs_dir, bucket_name)
def upload_all_metadata(airbyte_repo_path: pathlib.Path, docs_dir: pathlib.Path, bucket_name: str):
    connectors = get_all_connectors_in_repo()
    for connector in connectors:
        print(f"~~~~~~ Uploading metadata for {connector}")
        metadata_file_path = airbyte_repo_path / connector.metadata_file_path
        upload_metadata_to_gcs(bucket_name, metadata_file_path, ValidatorOptions(docs_path=str(docs_dir)))
        # This break just makes the script upload docs for a single connector. Comment it out to upload docs for all connectors.
        break
    exit(0)
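
The new command reuses the per-connector upload path for every connector returned by get_all_connectors_in_repo(). Below is a minimal sketch of driving it with click's test runner; the metadata_service.commands import path, the placeholder paths, and the bucket name are assumptions for illustration, not taken from this commit.

from click.testing import CliRunner

from metadata_service.commands import metadata_service  # assumed import path for the click group used above

runner = CliRunner()
# click derives the command name "upload-all-metadata" from the function name.
# The repo path, docs dir, and bucket below are placeholders.
result = runner.invoke(
    metadata_service,
    ["upload-all-metadata", "/path/to/airbyte", "/path/to/airbyte/docs", "my-dev-bucket"],
)
print(result.exit_code)
print(result.output)

Since both path arguments are declared with click.Path(exists=True), the command fails fast if either placeholder path is missing, and the upload path still needs GCS credentials (the GCS_CREDENTIALS service-account JSON referenced elsewhere in this changeset) to be available in the environment.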
@@ -286,91 +286,3 @@ def upload_metadata_to_gcs(bucket_name: str, metadata_file_path: Path, validator
            ),
        ],
    )

def upload_all_docs_to_gcs(connectors_dir: Path, docs_dir: Path, bucket_name: str):
    service_account_info = json.loads(os.environ.get("GCS_CREDENTIALS"))
    credentials = service_account.Credentials.from_service_account_info(service_account_info)
    storage_client = storage.Client(credentials=credentials)
    bucket = storage_client.bucket(bucket_name)

    # A function to extract type and name from the folder name
    def parse_folder_name(folder_name: str) -> (str, str):
        if "scaffol" in folder_name:
            return None, None
        elif folder_name.startswith('source-'):
            return 'source', folder_name[len('source-'):]
        elif folder_name.startswith('destination-'):
            return 'destination', folder_name[len('destination-'):]
        else:
            return None, None

    def read_metadata_yaml(path: Path) -> ConnectorMetadataDefinitionV0:
        return ConnectorMetadataDefinitionV0.parse_obj(yaml.safe_load(path.read_text()))

    def get_doc_paths(metadata: ConnectorMetadataDefinitionV0, connector_name: str) -> (str, str):
        sub_dir = f"{metadata.data.connectorType}s"
        doc_file_name = metadata.data.documentationUrl.split('/')[-1]
        doc_path = docs_dir / sub_dir / f"{doc_file_name}.md"
        inapp_doc_path = docs_dir / sub_dir / f"{doc_file_name}.inapp.md"

        # some connectors like source-appstore-singer have an old documentationUrl, so we need to check with the connector name too
        alt_doc_path = docs_dir / sub_dir / f"{connector_name}.md"
        alt_inapp_doc_path = docs_dir / sub_dir / f"{connector_name}.inapp.md"

        if (doc_path.exists()):
            return doc_path, inapp_doc_path if inapp_doc_path.exists() else None
        elif (alt_doc_path.exists()):
            return alt_doc_path, alt_inapp_doc_path if alt_inapp_doc_path.exists() else None
        else:
            return None, None

    excluded_connectors = []
    connector_infos = []

    for connector_dir in connectors_dir.iterdir():
        if connector_dir.is_dir():
            connector_type, connector_name = parse_folder_name(connector_dir.name)
            if connector_type and connector_name: # Skip folders that don't match the pattern
                metadata_file_path = connector_dir / METADATA_FILE_NAME
                if metadata_file_path.exists():
                    metadata = read_metadata_yaml(metadata_file_path)
                    doc_path, inapp_doc_path = get_doc_paths(metadata, connector_name) # 'source' becomes 'sources', 'destination' becomes 'destinations'

                    if not doc_path:
                        raise FileNotFoundError(f"Expected to find connector doc file at {doc_path} for metadata file at {metadata_file_path}, but none was found.")

                    directory_info = {
                        'type': connector_type,
                        'name': connector_name,
                        'path': connector_dir,
                        'metadata': metadata,
                        'doc_path': doc_path
                    }
                    if inapp_doc_path:
                        directory_info['inapp_doc_path'] = inapp_doc_path

                    connector_infos.append(directory_info)
            else:
                excluded_connectors.append(connector_dir.name)

    print("excluded_connectors: ", excluded_connectors)
    print(f"Found docs for {len(connector_infos)} connectors")

    # Example to show uploading the docs for a single connector. Comment these lines out when uploading all docs.
    github_connector_info = [connector_info for connector_info in connector_infos if connector_info['name'] == "github"][0]
    # versioned uploads
    _doc_upload(github_connector_info['metadata'], bucket, github_connector_info['doc_path'], False, False)
    _doc_upload(github_connector_info['metadata'], bucket, github_connector_info['doc_path'], False, True)
    # latest uploads
    _doc_upload(github_connector_info['metadata'], bucket, github_connector_info['doc_path'], True, False)
    _doc_upload(github_connector_info['metadata'], bucket, github_connector_info['doc_path'], True, True)

    # Uncomment these lines to upload all docs
    # for connector_info in connector_infos:
    # print(f"Uploading docs for connector {connector_info['name']}")
    # # versioned uploads
    # _doc_upload(connector_info['metadata'], bucket, connector_info['doc_path'], False, False)
    # _doc_upload(connector_info['metadata'], bucket, connector_info['doc_path'], False, True)
    # # latest uploads
    # _doc_upload(connector_info['metadata'], bucket, connector_info['doc_path'], True, False)
    # _doc_upload(connector_info['metadata'], bucket, connector_info['doc_path'], True, True)
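
The removed example issues four _doc_upload calls per connector (versioned vs. latest, with and without the in-app variant). A compact equivalent of the commented-out all-connectors loop, assuming _doc_upload keeps the positional signature (metadata, bucket, doc_path, latest, inapp) implied by the calls above:

from itertools import product

# Sketch only: replays the four flag combinations used above for each connector.
# (latest, inapp) iterates (False, False), (False, True), (True, False), (True, True).
for connector_info in connector_infos:
    print(f"Uploading docs for connector {connector_info['name']}")
    for latest, inapp in product((False, True), repeat=2):
        _doc_upload(connector_info["metadata"], bucket, connector_info["doc_path"], latest, inapp)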