feat: dump postgres records #709

Merged · 8 commits · Nov 26, 2024

24 changes: 20 additions & 4 deletions Makefile
@@ -24,14 +24,30 @@ start-all-service:
docker compose up --wait -d db

# Insert all dump data before running the servers
@(cd lib/workload/stateless/stacks/metadata-manager && $(MAKE) s3-load)
@(cd lib/workload/stateless/stacks/sequence-run-manager && $(MAKE) s3-load)
@(cd lib/workload/stateless/stacks/workflow-manager && $(MAKE) s3-load)
@(cd lib/workload/stateless/stacks/filemanager && $(MAKE) s3-load)
@(cd lib/workload/stateless/stacks/metadata-manager && $(MAKE) reset-db)
@(cd lib/workload/stateless/stacks/sequence-run-manager && $(MAKE) reset-db)
@(cd lib/workload/stateless/stacks/workflow-manager && $(MAKE) reset-db)
@(cd lib/workload/stateless/stacks/filemanager && $(MAKE) reset-db)

# Run the rest of the µ-service servers
docker compose up --wait -d --build

# Commands for pg-dd
dump: PG_DD_COMMAND=dump
dump: pg-dd

upload: PG_DD_COMMAND=upload
upload: pg-dd

download: PG_DD_COMMAND=download
download: pg-dd

load: PG_DD_COMMAND=load
load: pg-dd

pg-dd:
@PG_DD_COMMAND=$(PG_DD_COMMAND) docker compose up pg-dd

stop-all-service:
docker compose down

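Together with the `PG_DD_COMMAND` passthrough in `compose.yml`, these targets make each operation a one-liner. A usage sketch (assuming the compose `db` service is already up, as `start-all-service` arranges):

```sh
# Dump the local databases to the pg-dd data directory, then push the
# dumps to S3.
make dump
make upload

# Or bypass make and set the command on the compose service directly.
PG_DD_COMMAND=download docker compose up pg-dd
```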
35 changes: 35 additions & 0 deletions compose.yml
@@ -89,3 +89,38 @@ services:
interval: 10s
timeout: 2s
retries: 5

# Load test data into the database.
pg-dd:
build:
context: ./lib/workload/stateless/stacks/pg-dd
dockerfile: Dockerfile
volumes:
# Store the dumps to the local filesystem.
- ./lib/workload/stateless/stacks/pg-dd/data:/app/data
depends_on:
# Depends on migrations from all services, so they must be started first.
- db
- metadata-manager
- workflow-manager
- sequence-run-manager
- filemanager
command: ${PG_DD_COMMAND:-load}
environment:
- PG_DD_URL=postgresql://orcabus:orcabus@db:5432
- PG_DD_DIR=data
- PG_DD_BUCKET=orcabus-test-data-843407916570-ap-southeast-2
- PG_DD_PREFIX=pg-dd

- PG_DD_DATABASE_METADATA_MANAGER=metadata_manager
- PG_DD_DATABASE_SEQUENCE_RUN_MANAGER=sequence_run_manager
- PG_DD_DATABASE_WORKFLOW_MANAGER=workflow_manager
- PG_DD_DATABASE_FILEMANAGER=filemanager
- PG_DD_DATABASE_FILEMANAGER_SQL_DUMP=select * from s3_object order by sequencer limit 10000
- PG_DD_DATABASE_FILEMANAGER_SQL_LOAD=s3_object

- AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID:-access_key_id}
- AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY:-secret_access_key}
- AWS_DEFAULT_REGION=${AWS_DEFAULT_REGION:-ap-southeast-2}
- AWS_SESSION_TOKEN=${AWS_SESSION_TOKEN:-session_token}
restart: "no"
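After a `load` completes, the restored data can be spot-checked against the compose `db` service. A quick sketch, assuming the `orcabus` credentials from `PG_DD_URL` above:

```sh
# Count the filemanager rows restored by the load step; the dump itself is
# capped at 10000 rows by PG_DD_DATABASE_FILEMANAGER_SQL_DUMP.
docker compose exec db psql -U orcabus -d filemanager \
  -c "select count(*) from s3_object;"
```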
2 changes: 2 additions & 0 deletions config/config.ts
@@ -63,6 +63,7 @@ import {
getOraCompressionIcav2PipelineTableStackProps,
} from './stacks/oraCompressionPipelineManager';
import { getOraDecompressionManagerStackProps } from './stacks/oraDecompressionPipelineManager';
import { getPgDDProps } from './stacks/pgDD';

interface EnvironmentConfig {
name: string;
@@ -130,6 +131,7 @@ export const getEnvironmentConfig = (stage: AppStage): EnvironmentConfig | null
workflowManagerStackProps: getWorkflowManagerStackProps(stage),
stackyMcStackFaceProps: getGlueStackProps(stage),
fmAnnotatorProps: getFmAnnotatorProps(),
pgDDProps: getPgDDProps(stage),
},
};

25 changes: 25 additions & 0 deletions config/stacks/pgDD.ts
@@ -0,0 +1,25 @@
import {
accountIdAlias,
AppStage,
computeSecurityGroupName,
rdsMasterSecretName,
region,
vpcProps,
} from '../constants';
import { PgDDStackProps } from '../../lib/workload/stateless/stacks/pg-dd/deploy/stack';
import { getDataBucketStackProps } from './dataBucket';

export const getPgDDProps = (stage: AppStage): PgDDStackProps | undefined => {
const bucket = getDataBucketStackProps(stage);
if (bucket.bucketName === undefined) {
return undefined;
} else {
return {
bucket: bucket.bucketName,
prefix: 'pg-dd',
secretArn: `arn:aws:secretsmanager:${region}:${accountIdAlias.beta}:secret:${rdsMasterSecretName}`, // pragma: allowlist secret
lambdaSecurityGroupName: computeSecurityGroupName,
vpcProps,
};
}
};
9 changes: 4 additions & 5 deletions lib/workload/stateless/stacks/filemanager/Makefile
@@ -73,18 +73,17 @@ restore:
@docker compose exec -T postgres pg_restore -U filemanager -d filemanager

## Targets related to top-level database management and S3.
apply-schema:
@for file in database/migrations/*; do \
docker exec -e PGPASSWORD=orcabus -it orcabus_db psql -h $(FILEMANAGER_DATABASE_HOST) -U orcabus -d filemanager -c "$$(cat $$file)"; \
done
reset-db:
@docker exec -e PGPASSWORD=orcabus -it orcabus_db psql -h $(FILEMANAGER_DATABASE_HOST) -U orcabus -d orcabus -c "DROP DATABASE IF EXISTS filemanager;" && \
docker exec -e PGPASSWORD=orcabus -it orcabus_db psql -h $(FILEMANAGER_DATABASE_HOST) -U orcabus -d orcabus -c "CREATE DATABASE filemanager;"
for file in database/migrations/*; do \
docker exec -e PGPASSWORD=orcabus -it orcabus_db psql -h $(FILEMANAGER_DATABASE_HOST) -U orcabus -d filemanager -c "$$(cat $$file)"; \
done
s3-dump-upload:
@aws s3 cp data/fm_s3_objects_100000.csv.gz s3://orcabus-test-data-843407916570-ap-southeast-2/file-manager/fm_s3_objects_100000.csv.gz
s3-dump-download:
@aws s3 cp s3://orcabus-test-data-843407916570-ap-southeast-2/file-manager/fm_s3_objects_100000.csv.gz data/fm_s3_objects_100000.csv.gz
db-load-data: reset-db apply-schema
db-load-data: reset-db
@gunzip -c data/fm_s3_objects_100000.csv.gz | \
docker exec -i orcabus_db psql -U orcabus -d filemanager -c "copy s3_object from stdin with (format csv, header);"
s3-dump-download-if-not-exists:
7 changes: 7 additions & 0 deletions lib/workload/stateless/stacks/pg-dd/.dockerignore
@@ -0,0 +1,7 @@
deploy
.env
.env.example
.gitignore
README.md
data
.ruff_cache
11 changes: 11 additions & 0 deletions lib/workload/stateless/stacks/pg-dd/.env.example
@@ -0,0 +1,11 @@
PG_DD_URL=postgresql://orcabus:orcabus@0.0.0.0:5432 # pragma: allowlist secret
PG_DD_DIR=data
PG_DD_BUCKET=orcabus-test-data-843407916570-ap-southeast-2
PG_DD_PREFIX=pg-dd

PG_DD_DATABASE_METADATA_MANAGER=metadata_manager
PG_DD_DATABASE_SEQUENCE_RUN_MANAGER=sequence_run_manager
PG_DD_DATABASE_WORKFLOW_MANAGER=workflow_manager
PG_DD_DATABASE_FILEMANAGER=filemanager
PG_DD_DATABASE_FILEMANAGER_SQL_DUMP='select * from s3_object order by sequencer limit 10000'
PG_DD_DATABASE_FILEMANAGER_SQL_LOAD=s3_object
4 changes: 4 additions & 0 deletions lib/workload/stateless/stacks/pg-dd/.gitignore
@@ -0,0 +1,4 @@
.env
data
.ruff_cache
response.json
14 changes: 14 additions & 0 deletions lib/workload/stateless/stacks/pg-dd/Dockerfile
@@ -0,0 +1,14 @@
# This Dockerfile is intended to be used as part of a Docker Compose setup.
# When running this microservice from the Docker Compose root, this Dockerfile
# will build the image, install dependencies, and start the server.

FROM public.ecr.aws/docker/library/python:3.13

RUN pip3 install poetry

WORKDIR /app

COPY . .
RUN poetry install --no-interaction --no-root

ENTRYPOINT ["poetry", "run", "cli"]
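Outside of compose, the image can also be built and run directly. A sketch, assuming a populated `.env` and the CLI subcommands described in the README; arguments after the image name are forwarded to the `poetry run cli` entrypoint:

```sh
# Build from the pg-dd directory and run a dump, mounting ./data so the
# CSV output survives the container.
docker build -t pg-dd .
docker run --rm --env-file .env -v "$PWD/data:/app/data" pg-dd dump
```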
18 changes: 18 additions & 0 deletions lib/workload/stateless/stacks/pg-dd/Makefile
@@ -0,0 +1,18 @@
.PHONY: *

COMMAND ?= "load --exists-ok"

install:
@poetry update

lint: install
@poetry run ruff format .

check: lint
@poetry run ruff check .

cli: install
@poetry run cli $(COMMAND)

clean:
rm -rf data && rm -rf .ruff_cache
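`COMMAND` is forwarded verbatim to the CLI, so any subcommand and flags can be passed through, for example:

```sh
# Dump a single database, then upload the result to S3.
make cli COMMAND="dump --database metadata_manager"
make cli COMMAND="upload"
```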
58 changes: 58 additions & 0 deletions lib/workload/stateless/stacks/pg-dd/README.md
@@ -0,0 +1,58 @@
# Postgres data dump

Postgres data dump - a service that dumps (like `dd`) orcabus postgres databases to S3.

## Usage

Call the deployed function to update the current dump:

```sh
aws lambda invoke --function-name orcabus-pg-dd response.json
```

This is set up to dump the metadata_manager, workflow_manager, and sequence_run_manager databases, along with the
10000 most recent rows of the filemanager database.

The CLI can be run locally using `make`, or by calling `poetry` directly:

```sh
make cli COMMAND="--help"
```

For example, to dump and upload a specific database to S3:

```sh
poetry run cli dump --database metadata_manager && poetry run cli upload
```

The `Dockerfile` is set up to launch with the top-level `Makefile`, which also contains commands for running the CLI.
By default, `start-all-service` runs a `load` on the local database, downloading any dumps from S3 along the way.
AWS credentials must be present in the shell for this to work.

## Configuration

This function can be configured by setting the following environment variables; see [.env.example][env-example] for an example:

| Name | Description | Type |
|------|-------------|------|
| `PG_DD_URL` | The database URL to dump databases from. | Postgres connection string |
| `PG_DD_SECRET` | The secret name or ARN to fetch the database URL from. This is only used in the Lambda function, and overrides `PG_DD_URL`. | `string` |
| `PG_DD_DATABASE_<DATABASE_NAME>` | The name of a database to dump records from, where `<DATABASE_NAME>` represents the target database. Specify this multiple times to dump from multiple databases. | `string` |
| `PG_DD_DATABASE_<DATABASE_NAME>_SQL_DUMP` | Custom SQL to execute when dumping records for `<DATABASE_NAME>`. This is optional; by default all records from all tables are dumped. Specify this as a list of SQL statements to generate a corresponding CSV file for each statement. | `string[]` or undefined |
| `PG_DD_DATABASE_<DATABASE_NAME>_SQL_LOAD` | The name of the table to load into for `<DATABASE_NAME>`. This is required when loading data after dumping with `PG_DD_DATABASE_<DATABASE_NAME>_SQL_DUMP`, to specify the table to load data into. | `string[]` or undefined |
| `PG_DD_BUCKET` | The bucket to dump data to. This is required when deploying the Lambda function. | `string` or undefined |
| `PG_DD_PREFIX` | The bucket prefix to use when writing to a bucket. This is optional. | `string` or undefined |
| `PG_DD_DIR` | The local filesystem directory to dump data to when running this command locally. This is not used in the deployed Lambda function. | filesystem directory or undefined |
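Putting the naming scheme together, a hypothetical `report` database with a custom dump query and load table would be configured as follows (a sketch; `report` and `summary` are illustrative names only):

```sh
PG_DD_DATABASE_REPORT=report
PG_DD_DATABASE_REPORT_SQL_DUMP='select * from summary order by id limit 1000'
PG_DD_DATABASE_REPORT_SQL_LOAD=summary
```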

## Local development

This project uses [poetry] to manage dependencies.

Run the linter and formatter:

```sh
make check
```

[poetry]: https://python-poetry.org/
[env-example]: .env.example
116 changes: 116 additions & 0 deletions lib/workload/stateless/stacks/pg-dd/deploy/stack.ts
@@ -0,0 +1,116 @@
import { Duration, Stack, StackProps } from 'aws-cdk-lib';
import { Construct } from 'constructs';
import { PythonFunction } from '@aws-cdk/aws-lambda-python-alpha';
import path from 'path';
import { Architecture, Runtime } from 'aws-cdk-lib/aws-lambda';
import {
ISecurityGroup,
IVpc,
SecurityGroup,
SubnetType,
Vpc,
VpcLookupOptions,
} from 'aws-cdk-lib/aws-ec2';
import { NamedLambdaRole } from '../../../../components/named-lambda-role';
import { ManagedPolicy, PolicyStatement, Role } from 'aws-cdk-lib/aws-iam';
import { readFileSync } from 'fs';

/**
* Props for the PgDD stack.
*/
export type PgDDStackProps = {
/**
* The bucket to dump data to.
*/
bucket: string;
/**
* Secret to connect to database with.
*/
secretArn: string;
/**
* The key prefix when writing data.
*/
prefix?: string;
/**
* Props to lookup the VPC with.
*/
vpcProps: VpcLookupOptions;
/**
* Existing security group name to be attached on lambda.
*/
lambdaSecurityGroupName: string;
};

/**
* Deploy the PgDD stack.
*/
export class PgDDStack extends Stack {
private readonly vpc: IVpc;
private readonly securityGroup: ISecurityGroup;
private readonly role: Role;

constructor(scope: Construct, id: string, props: StackProps & PgDDStackProps) {
super(scope, id, props);

this.vpc = Vpc.fromLookup(this, 'MainVpc', props.vpcProps);
this.securityGroup = SecurityGroup.fromLookupByName(
this,
'OrcaBusLambdaSecurityGroup',
props.lambdaSecurityGroupName,
this.vpc
);

this.role = new NamedLambdaRole(this, 'Role');
this.role.addManagedPolicy(
ManagedPolicy.fromAwsManagedPolicyName('service-role/AWSLambdaVPCAccessExecutionRole')
);
this.role.addToPolicy(
new PolicyStatement({
actions: ['s3:PutObject'],
resources: [`arn:aws:s3:::${props.bucket}`, `arn:aws:s3:::${props.bucket}/*`],
})
);
this.role.addToPolicy(
new PolicyStatement({
actions: ['secretsmanager:GetSecretValue'],
resources: [`${props.secretArn}-*`],
})
);

const securityGroup = new SecurityGroup(this, 'SecurityGroup', {
vpc: this.vpc,
allowAllOutbound: true,
description: 'Security group that allows the PgDD Lambda function to egress out.',
});

const entry = path.join(__dirname, '..');
new PythonFunction(this, 'function', {
entry,
functionName: 'orcabus-pg-dd',
index: 'pg_dd/handler.py',
runtime: Runtime.PYTHON_3_12,
architecture: Architecture.ARM_64,
timeout: Duration.minutes(5),
memorySize: 1024,
vpc: this.vpc,
vpcSubnets: {
subnetType: SubnetType.PRIVATE_WITH_EGRESS,
},
bundling: {
assetExcludes: [...readFileSync(path.join(entry, '.dockerignore'), 'utf-8').split('\n')],
},
role: this.role,
securityGroups: [securityGroup, this.securityGroup],
environment: {
PG_DD_SECRET: props.secretArn,
PG_DD_BUCKET: props.bucket,
PG_DD_DATABASE_METADATA_MANAGER: 'metadata_manager',
PG_DD_DATABASE_SEQUENCE_RUN_MANAGER: 'sequence_run_manager',
PG_DD_DATABASE_WORKFLOW_MANAGER: 'workflow_manager',
PG_DD_DATABASE_FILEMANAGER: 'filemanager',
PG_DD_DATABASE_FILEMANAGER_SQL_DUMP: 'select * from s3_object order by sequencer limit 10000',
...(props.prefix && { PG_DD_PREFIX: props.prefix }),
},
});
}
}
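Once deployed, the function can be exercised and observed from the CLI. A sketch using the `orcabus-pg-dd` function name set above, assuming AWS credentials for the target account are present:

```sh
# Trigger a fresh dump and tail the function's logs.
aws lambda invoke --function-name orcabus-pg-dd response.json
aws logs tail /aws/lambda/orcabus-pg-dd --follow
```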