Merge pull request #709 from umccr/feat/pg-dd

feat: dump postgres records

mmalenic authored Nov 26, 2024
2 parents 73695e8 + ae393eb commit 446eb0f

Showing 19 changed files with 1,140 additions and 9 deletions.
24 changes: 20 additions & 4 deletions Makefile
@@ -24,14 +24,30 @@ start-all-service:
 	docker compose up --wait -d db
 
 	# Insert all dump data before running the servers
-	@(cd lib/workload/stateless/stacks/metadata-manager && $(MAKE) s3-load)
-	@(cd lib/workload/stateless/stacks/sequence-run-manager && $(MAKE) s3-load)
-	@(cd lib/workload/stateless/stacks/workflow-manager && $(MAKE) s3-load)
-	@(cd lib/workload/stateless/stacks/filemanager && $(MAKE) s3-load)
+	@(cd lib/workload/stateless/stacks/metadata-manager && $(MAKE) reset-db)
+	@(cd lib/workload/stateless/stacks/sequence-run-manager && $(MAKE) reset-db)
+	@(cd lib/workload/stateless/stacks/workflow-manager && $(MAKE) reset-db)
+	@(cd lib/workload/stateless/stacks/filemanager && $(MAKE) reset-db)
 
 	# Run the rest of the microservice servers
 	docker compose up --wait -d --build
 
+# Commands for pg-dd
+dump: PG_DD_COMMAND=dump
+dump: pg-dd
+
+upload: PG_DD_COMMAND=upload
+upload: pg-dd
+
+download: PG_DD_COMMAND=download
+download: pg-dd
+
+load: PG_DD_COMMAND=load
+load: pg-dd
+
+pg-dd:
+	@PG_DD_COMMAND=$(PG_DD_COMMAND) docker compose up pg-dd
 
 stop-all-service:
 	docker compose down

35 changes: 35 additions & 0 deletions compose.yml
@@ -89,3 +89,38 @@ services:
      interval: 10s
      timeout: 2s
      retries: 5

  # Load test data into the database.
  pg-dd:
    build:
      context: ./lib/workload/stateless/stacks/pg-dd
      dockerfile: Dockerfile
    volumes:
      # Store the dumps on the local filesystem.
      - ./lib/workload/stateless/stacks/pg-dd/data:/app/data
    depends_on:
      # Depends on migrations from all services, so they must be started first.
      - db
      - metadata-manager
      - workflow-manager
      - sequence-run-manager
      - filemanager
    command: ${PG_DD_COMMAND:-load}
    environment:
      - PG_DD_URL=postgresql://orcabus:orcabus@db:5432
      - PG_DD_DIR=data
      - PG_DD_BUCKET=orcabus-test-data-843407916570-ap-southeast-2
      - PG_DD_PREFIX=pg-dd

      - PG_DD_DATABASE_METADATA_MANAGER=metadata_manager
      - PG_DD_DATABASE_SEQUENCE_RUN_MANAGER=sequence_run_manager
      - PG_DD_DATABASE_WORKFLOW_MANAGER=workflow_manager
      - PG_DD_DATABASE_FILEMANAGER=filemanager
      - PG_DD_DATABASE_FILEMANAGER_SQL_DUMP=select * from s3_object order by sequencer limit 10000
      - PG_DD_DATABASE_FILEMANAGER_SQL_LOAD=s3_object

      - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID:-access_key_id}
      - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY:-secret_access_key}
      - AWS_DEFAULT_REGION=${AWS_DEFAULT_REGION:-ap-southeast-2}
      - AWS_SESSION_TOKEN=${AWS_SESSION_TOKEN:-session_token}
    restart: "no"
2 changes: 2 additions & 0 deletions config/config.ts
@@ -63,6 +63,7 @@ import {
getOraCompressionIcav2PipelineTableStackProps,
} from './stacks/oraCompressionPipelineManager';
import { getOraDecompressionManagerStackProps } from './stacks/oraDecompressionPipelineManager';
import { getPgDDProps } from './stacks/pgDD';

interface EnvironmentConfig {
name: string;
@@ -130,6 +131,7 @@ export const getEnvironmentConfig = (stage: AppStage): EnvironmentConfig | null
      workflowManagerStackProps: getWorkflowManagerStackProps(stage),
      stackyMcStackFaceProps: getGlueStackProps(stage),
      fmAnnotatorProps: getFmAnnotatorProps(),
      pgDDProps: getPgDDProps(stage),
    },
  };

25 changes: 25 additions & 0 deletions config/stacks/pgDD.ts
@@ -0,0 +1,25 @@
import {
  accountIdAlias,
  AppStage,
  computeSecurityGroupName,
  rdsMasterSecretName,
  region,
  vpcProps,
} from '../constants';
import { PgDDStackProps } from '../../lib/workload/stateless/stacks/pg-dd/deploy/stack';
import { getDataBucketStackProps } from './dataBucket';

export const getPgDDProps = (stage: AppStage): PgDDStackProps | undefined => {
  const bucket = getDataBucketStackProps(stage);
  if (bucket.bucketName === undefined) {
    return undefined;
  } else {
    return {
      bucket: bucket.bucketName,
      prefix: 'pg-dd',
      secretArn: `arn:aws:secretsmanager:${region}:${accountIdAlias.beta}:secret:${rdsMasterSecretName}`, // pragma: allowlist secret
      lambdaSecurityGroupName: computeSecurityGroupName,
      vpcProps,
    };
  }
};
9 changes: 4 additions & 5 deletions lib/workload/stateless/stacks/filemanager/Makefile
@@ -73,18 +73,17 @@ restore:
 	@docker compose exec -T postgres pg_restore -U filemanager -d filemanager
 
 ## Targets related to top-level database management and S3.
-apply-schema:
-	@for file in database/migrations/*; do \
-		docker exec -e PGPASSWORD=orcabus -it orcabus_db psql -h $(FILEMANAGER_DATABASE_HOST) -U orcabus -d filemanager -c "$$(cat $$file)"; \
-	done
 reset-db:
 	@docker exec -e PGPASSWORD=orcabus -it orcabus_db psql -h $(FILEMANAGER_DATABASE_HOST) -U orcabus -d orcabus -c "DROP DATABASE IF EXISTS filemanager;" && \
 	docker exec -e PGPASSWORD=orcabus -it orcabus_db psql -h $(FILEMANAGER_DATABASE_HOST) -U orcabus -d orcabus -c "CREATE DATABASE filemanager;"
+	for file in database/migrations/*; do \
+		docker exec -e PGPASSWORD=orcabus -it orcabus_db psql -h $(FILEMANAGER_DATABASE_HOST) -U orcabus -d filemanager -c "$$(cat $$file)"; \
+	done
 s3-dump-upload:
 	@aws s3 cp data/fm_s3_objects_100000.csv.gz s3://orcabus-test-data-843407916570-ap-southeast-2/file-manager/fm_s3_objects_100000.csv.gz
 s3-dump-download:
 	@aws s3 cp s3://orcabus-test-data-843407916570-ap-southeast-2/file-manager/fm_s3_objects_100000.csv.gz data/fm_s3_objects_100000.csv.gz
-db-load-data: reset-db apply-schema
+db-load-data: reset-db
 	@gunzip -c data/fm_s3_objects_100000.csv.gz | \
 	docker exec -i orcabus_db psql -U orcabus -d filemanager -c "copy s3_object from stdin with (format csv, header);"
 s3-dump-download-if-not-exists:
7 changes: 7 additions & 0 deletions lib/workload/stateless/stacks/pg-dd/.dockerignore
@@ -0,0 +1,7 @@
deploy
.env
.env.example
.gitignore
README.md
data
.ruff_cache
11 changes: 11 additions & 0 deletions lib/workload/stateless/stacks/pg-dd/.env.example
@@ -0,0 +1,11 @@
PG_DD_URL=postgresql://orcabus:orcabus@localhost:5432 # pragma: allowlist secret
PG_DD_DIR=data
PG_DD_BUCKET=orcabus-test-data-843407916570-ap-southeast-2
PG_DD_PREFIX=pg-dd

PG_DD_DATABASE_METADATA_MANAGER=metadata_manager
PG_DD_DATABASE_SEQUENCE_RUN_MANAGER=sequence_run_manager
PG_DD_DATABASE_WORKFLOW_MANAGER=workflow_manager
PG_DD_DATABASE_FILEMANAGER=filemanager
PG_DD_DATABASE_FILEMANAGER_SQL_DUMP='select * from s3_object order by sequencer limit 10000'
PG_DD_DATABASE_FILEMANAGER_SQL_LOAD=s3_object
4 changes: 4 additions & 0 deletions lib/workload/stateless/stacks/pg-dd/.gitignore
@@ -0,0 +1,4 @@
.env
data
.ruff_cache
response.json
14 changes: 14 additions & 0 deletions lib/workload/stateless/stacks/pg-dd/Dockerfile
@@ -0,0 +1,14 @@
# This Dockerfile is intended to be used as part of a Docker Compose setup.
# When running this microservice from the Docker Compose root, this Dockerfile
# builds the image, installs dependencies, and runs the pg-dd CLI.

FROM public.ecr.aws/docker/library/python:3.13

RUN pip3 install poetry

WORKDIR /app

COPY . .
RUN poetry install --no-interaction --no-root

ENTRYPOINT ["poetry", "run", "cli"]
18 changes: 18 additions & 0 deletions lib/workload/stateless/stacks/pg-dd/Makefile
@@ -0,0 +1,18 @@
.PHONY: *

COMMAND ?= load --exists-ok

install:
	@poetry update

lint: install
	@poetry run ruff format .

check: lint
	@poetry run ruff check .

cli: install
	@poetry run cli $(COMMAND)

clean:
	rm -rf data && rm -rf .ruff_cache
58 changes: 58 additions & 0 deletions lib/workload/stateless/stacks/pg-dd/README.md
@@ -0,0 +1,58 @@
# Postgres data dump

Postgres data dump - a service that dumps (like `dd`) OrcaBus Postgres databases to S3.

## Usage

Call the deployed function to update the current dump:

```sh
aws lambda invoke --function-name orcabus-pg-dd response.json
```

This is set up to dump the `metadata_manager`, `workflow_manager` and `sequence_run_manager` databases in full, and the
10000 most recent rows of the `filemanager` database.

The CLI can also be run locally using `make`, or by running `poetry` directly:

```sh
make cli COMMAND="--help"
```

For example, to dump a specific database and upload it to S3:

```sh
poetry run cli dump --database metadata_manager && poetry run cli upload
```
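
The reverse direction works the same way. As a sketch (assuming dumps already exist under the configured bucket and prefix), to fetch the dumps and load them into the local database:

```sh
poetry run cli download && poetry run cli load
```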

The `Dockerfile` is set up to launch via the top-level `Makefile`, which also contains commands for running the CLI.
By default, `start-all-service` runs a `load` on the local database, downloading any dumps from S3 along the way.
AWS credentials must be present in the shell for this to work.
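
For example, a typical local workflow might look like this (a sketch, assuming AWS credentials are exported in the shell and using the top-level Makefile targets added in this PR):

```sh
make start-all-service   # start the services and load the latest dumps into the local databases
make dump                # dump the local databases to the data directory
make upload              # upload the dumps to the configured S3 bucket
```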

## Configuration

This function can be configured by setting the following environment variables; see [.env.example][env-example] for an example:

| Name | Description | Type |
|------|-------------|------|
| `PG_DD_URL` | The database URL to dump databases from. | Postgres connection string |
| `PG_DD_SECRET` | The secret name or ARN to fetch the database URL from. This is only used in the Lambda function, and overrides `PG_DD_URL`. | `string` |
| `PG_DD_DATABASE_<DATABASE_NAME>` | The name of a database to dump records from, where `<DATABASE_NAME>` represents the target database. Specify this multiple times to dump from multiple databases. | `string` |
| `PG_DD_DATABASE_<DATABASE_NAME>_SQL_DUMP` | Custom SQL to execute when dumping records for `<DATABASE_NAME>`. This is optional; by default all records from all tables are dumped. Specify this as a list of SQL statements, each of which generates a corresponding CSV file. | `string[]` or undefined |
| `PG_DD_DATABASE_<DATABASE_NAME>_SQL_LOAD` | The name of the table to load into for `<DATABASE_NAME>`. This is required when loading data that was dumped with `PG_DD_DATABASE_<DATABASE_NAME>_SQL_DUMP`, to specify the table the data is loaded into. | `string[]` or undefined |
| `PG_DD_BUCKET` | The bucket to dump data to. This is required when deploying the Lambda function. | `string` or undefined |
| `PG_DD_PREFIX` | The bucket prefix to use when writing to the bucket. This is optional. | `string` or undefined |
| `PG_DD_DIR` | The local filesystem directory to dump data to when running this command locally. This is not used by the deployed Lambda function. | filesystem directory or undefined |
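
For instance, a minimal local configuration that dumps one database in full and another with a custom query might look like the following (hypothetical values, modelled on [.env.example][env-example]):

```sh
export PG_DD_URL=postgresql://orcabus:orcabus@localhost:5432
export PG_DD_DIR=data
# Dump all tables of metadata_manager.
export PG_DD_DATABASE_METADATA_MANAGER=metadata_manager
# Dump only the 10000 most recent filemanager rows, and load them back into s3_object.
export PG_DD_DATABASE_FILEMANAGER=filemanager
export PG_DD_DATABASE_FILEMANAGER_SQL_DUMP='select * from s3_object order by sequencer limit 10000'
export PG_DD_DATABASE_FILEMANAGER_SQL_LOAD=s3_object

poetry run cli dump
```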

## Local development

This project uses [poetry] to manage dependencies.

Run the linter and formatter:

```sh
make check
```

[poetry]: https://python-poetry.org/
[env-example]: .env.example
116 changes: 116 additions & 0 deletions lib/workload/stateless/stacks/pg-dd/deploy/stack.ts
@@ -0,0 +1,116 @@
import { Duration, Stack, StackProps } from 'aws-cdk-lib';
import { Construct } from 'constructs';
import { PythonFunction } from '@aws-cdk/aws-lambda-python-alpha';
import path from 'path';
import { Architecture, Runtime } from 'aws-cdk-lib/aws-lambda';
import {
  ISecurityGroup,
  IVpc,
  SecurityGroup,
  SubnetType,
  Vpc,
  VpcLookupOptions,
} from 'aws-cdk-lib/aws-ec2';
import { NamedLambdaRole } from '../../../../components/named-lambda-role';
import { ManagedPolicy, PolicyStatement, Role } from 'aws-cdk-lib/aws-iam';
import { readFileSync } from 'fs';

/**
 * Props for the PgDD stack.
 */
export type PgDDStackProps = {
  /**
   * The bucket to dump data to.
   */
  bucket: string;
  /**
   * The secret to connect to the database with.
   */
  secretArn: string;
  /**
   * The key prefix to use when writing data.
   */
  prefix?: string;
  /**
   * Props to lookup the VPC with.
   */
  vpcProps: VpcLookupOptions;
  /**
   * The name of an existing security group to attach to the Lambda function.
   */
  lambdaSecurityGroupName: string;
};

/**
 * Deploy the PgDD stack.
 */
export class PgDDStack extends Stack {
  private readonly vpc: IVpc;
  private readonly securityGroup: ISecurityGroup;
  private readonly role: Role;

  constructor(scope: Construct, id: string, props: StackProps & PgDDStackProps) {
    super(scope, id, props);

    this.vpc = Vpc.fromLookup(this, 'MainVpc', props.vpcProps);
    this.securityGroup = SecurityGroup.fromLookupByName(
      this,
      'OrcaBusLambdaSecurityGroup',
      props.lambdaSecurityGroupName,
      this.vpc
    );

    this.role = new NamedLambdaRole(this, 'Role');
    this.role.addManagedPolicy(
      ManagedPolicy.fromAwsManagedPolicyName('service-role/AWSLambdaVPCAccessExecutionRole')
    );
    this.role.addToPolicy(
      new PolicyStatement({
        actions: ['s3:PutObject'],
        resources: [`arn:aws:s3:::${props.bucket}`, `arn:aws:s3:::${props.bucket}/*`],
      })
    );
    this.role.addToPolicy(
      new PolicyStatement({
        actions: ['secretsmanager:GetSecretValue'],
        resources: [`${props.secretArn}-*`],
      })
    );

    const securityGroup = new SecurityGroup(this, 'SecurityGroup', {
      vpc: this.vpc,
      allowAllOutbound: true,
      description: 'Security group that allows the PgDD Lambda function to egress out.',
    });

    const entry = path.join(__dirname, '..');
    new PythonFunction(this, 'function', {
      entry,
      functionName: 'orcabus-pg-dd',
      index: 'pg_dd/handler.py',
      runtime: Runtime.PYTHON_3_12,
      architecture: Architecture.ARM_64,
      timeout: Duration.minutes(5),
      memorySize: 1024,
      vpc: this.vpc,
      vpcSubnets: {
        subnetType: SubnetType.PRIVATE_WITH_EGRESS,
      },
      bundling: {
        assetExcludes: [...readFileSync(path.join(entry, '.dockerignore'), 'utf-8').split('\n')],
      },
      role: this.role,
      securityGroups: [securityGroup, this.securityGroup],
      environment: {
        PG_DD_SECRET: props.secretArn,
        PG_DD_BUCKET: props.bucket,
        PG_DD_DATABASE_METADATA_MANAGER: 'metadata_manager',
        PG_DD_DATABASE_SEQUENCE_RUN_MANAGER: 'sequence_run_manager',
        PG_DD_DATABASE_WORKFLOW_MANAGER: 'workflow_manager',
        PG_DD_DATABASE_FILEMANAGER: 'filemanager',
        PG_DD_DATABASE_FILEMANAGER_SQL_DUMP: 'select * from s3_object order by sequencer limit 10000',
        ...(props.prefix && { PG_DD_PREFIX: props.prefix }),
      },
    });
  }
}