Feat (MM): Custom CSV Importer for MM (#566)
williamputraintan authored Sep 23, 2024
1 parent 77e132d commit 8842621
Showing 12 changed files with 511 additions and 77 deletions.
27 changes: 27 additions & 0 deletions lib/workload/stateless/stacks/metadata-manager/README.md
@@ -100,6 +100,33 @@ Some important notes of the sync:

Please refer to the [tracking-sheet-service](proc/service/tracking_sheet_srv.py) implementation.

### Custom CSV File Loader

The application also supports loading data from a custom CSV file. The CSV file should have the following columns:

| Sheet Header | Table | Field Name |
|----------------------|--------------|--------------------|
| individual_id        | `Individual` | individual_id      |
| individual_id_source | `Individual` | source             |
| subject_id | `Subject` | subject_id |
| sample_id | `Sample` | sample_id |
| external_sample_id | `Sample` | external_sample_id |
| source | `Sample` | source |
| library_id | `Library` | library_id |
| phenotype | `Library` | phenotype |
| workflow | `Library` | workflow |
| quality | `Library` | quality |
| type | `Library` | type |
| coverage | `Library` | coverage |
| assay | `Library` | assay |
| project_name | `Project` | project_id |
| project_owner | `Contact` | contact_id |

The CSV file should be provided as a presigned URL, which the loader reads and inserts into the database.
To trigger the loader, see `./deploy/README.md` for more info.

Please refer to the [load-csv-service](proc/service/load_csv_srv.py) implementation.
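
For illustration, a minimal file with a single record might look like this (all values below are hypothetical):

```csv
individual_id,individual_id_source,subject_id,sample_id,external_sample_id,source,library_id,phenotype,workflow,quality,type,coverage,assay,project_name,project_owner
IDV0001,external,SBJ0001,SMP0001,EXTSMP001,blood,LIB0001,tumor,clinical,good,WGS,40.0,TsqNano,Alpha,DoeJ
```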

### Audit Data

The application is configured with [django-simple-history](https://django-simple-history.readthedocs.io/en/latest/)
@@ -0,0 +1,22 @@
import logging
from django.core.management import BaseCommand
from libumccr import libjson

from handler.load_custom_metadata_csv import handler

logger = logging.getLogger()
logger.setLevel(logging.INFO)


class Command(BaseCommand):
    help = "Trigger lambda handler to load metadata from a csv presigned url"

    def handle(self, *args, **options):
        event = {
            "url": "SOME_URL",  # placeholder: replace with the presigned csv url
        }

        print(f"Trigger lambda handler to load metadata from csv url. Event {libjson.dumps(event)}")
        result = handler(event, {})

        print(f"result: {libjson.dumps(result)}")
@@ -112,3 +112,16 @@ class Library(BaseModel):

    # history
    history = HistoricalRecords(m2m_fields=[project_set])


def sanitize_library_coverage(value: str):
    """
    Convert a coverage value that is valid in the tracking sheet into a value recognizable by the Django model
    """
    try:
        # ensure the coverage value is float-able
        lib_coverage = float(value)
        return f'{lib_coverage}'

    except (ValueError, TypeError):
        return None
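
A minimal sketch of the expected behaviour (the inputs below are illustrative, not from the test suite):

```python
# Hypothetical examples of sanitize_library_coverage
assert sanitize_library_coverage('40') == '40.0'  # numeric strings are normalised via float()
assert sanitize_library_coverage('80.5') == '80.5'
assert sanitize_library_coverage('N/A') is None   # ValueError is caught, returns None
assert sanitize_library_coverage(None) is None    # TypeError likewise
```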
32 changes: 32 additions & 0 deletions lib/workload/stateless/stacks/metadata-manager/deploy/README.md
@@ -58,3 +58,35 @@ aws lambda invoke \
--cli-binary-format raw-in-base64-out \
res.json
```

### CustomCsvLambda

- Loads tracking sheet data from a CSV presigned URL

To trigger the load manually, use the lambda whose ARN is stored in the SSM Parameter Store named
`/orcabus/metadata-manager/load-custom-csv-lambda-arn`.

To query it in a local terminal:

```sh
load_custom_csv_lambda_arn=$(aws ssm get-parameter --name '/orcabus/metadata-manager/load-custom-csv-lambda-arn' --with-decryption | jq -r .Parameter.Value)
```

The lambda handler accepts a JSON payload with a single key, `url`: the presigned URL of the CSV file.

```json
{
"url": "https://example.com/csv"
}
```

To invoke the lambda:

```sh
aws lambda invoke \
--function-name $load_custom_csv_lambda_arn \
--invocation-type Event \
--payload '{ "url": "https://the.url.csv" }' \
--cli-binary-format raw-in-base64-out \
res.json
```
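
The same flow can be sketched from Python with boto3; the bucket name and object key below are hypothetical, while the SSM parameter name is the one registered above:

```python
import json

import boto3

# Generate a presigned URL for the CSV object (hypothetical bucket/key)
presigned_url = boto3.client('s3').generate_presigned_url(
    'get_object',
    Params={'Bucket': 'my-metadata-bucket', 'Key': 'custom-metadata.csv'},
    ExpiresIn=3600,  # keep it valid long enough for the lambda to read it
)

# Resolve the loader lambda ARN from the SSM Parameter Store
lambda_arn = boto3.client('ssm').get_parameter(
    Name='/orcabus/metadata-manager/load-custom-csv-lambda-arn'
)['Parameter']['Value']

# Invoke asynchronously, mirroring the `--invocation-type Event` CLI example
boto3.client('lambda').invoke(
    FunctionName=lambda_arn,
    InvocationType='Event',
    Payload=json.dumps({'url': presigned_url}),
)
```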
@@ -0,0 +1,54 @@
import path from 'path';
import { Construct } from 'constructs';
import { Duration } from 'aws-cdk-lib';
import { ISecret } from 'aws-cdk-lib/aws-secretsmanager';
import { StringParameter } from 'aws-cdk-lib/aws-ssm';
import {
  DockerImageFunction,
  DockerImageFunctionProps,
  DockerImageCode,
} from 'aws-cdk-lib/aws-lambda';

type LambdaProps = {
  /**
   * The basic common lambda properties that it should inherit from
   */
  basicLambdaConfig: Partial<DockerImageFunctionProps>;
  /**
   * The secret for the db connection that the lambda needs access to
   */
  dbConnectionSecret: ISecret;
};

export class LambdaLoadCustomCSVConstruct extends Construct {
  private readonly lambda: DockerImageFunction;

  constructor(scope: Construct, id: string, lambdaProps: LambdaProps) {
    super(scope, id);

    this.lambda = new DockerImageFunction(this, 'LoadCustomCSVLambda', {
      environment: {
        ...lambdaProps.basicLambdaConfig.environment,
      },
      securityGroups: lambdaProps.basicLambdaConfig.securityGroups,
      vpc: lambdaProps.basicLambdaConfig.vpc,
      vpcSubnets: lambdaProps.basicLambdaConfig.vpcSubnets,
      architecture: lambdaProps.basicLambdaConfig.architecture,
      code: DockerImageCode.fromImageAsset(path.join(__dirname, '../../../'), {
        file: 'deploy/construct/lambda-load-custom-csv/lambda.Dockerfile',
      }),
      timeout: Duration.minutes(15),
      memorySize: 4096,
    });

    lambdaProps.dbConnectionSecret.grantRead(this.lambda);

    // Store the lambda ARN in SSM so it can be looked up when triggering the loader manually
    new StringParameter(this, 'LoadCustomCSVLambdaArnParameterStore', {
      parameterName: '/orcabus/metadata-manager/load-custom-csv-lambda-arn',
      description: 'The ARN of the lambda that loads metadata from a presigned URL CSV file',
      stringValue: this.lambda.functionArn,
    });
  }
}
@@ -0,0 +1,12 @@
FROM public.ecr.aws/lambda/python:3.12

WORKDIR ${LAMBDA_TASK_ROOT}

# COPY all files
COPY . .

# Install the specified packages
RUN pip install -r deps/requirements-full.txt

# Specify handler
CMD [ "handler.load_custom_metadata_csv.handler" ]
@@ -11,6 +11,7 @@ import { LambdaMigrationConstruct } from './construct/lambda-migration';
import { LambdaAPIConstruct } from './construct/lambda-api';
import { ApiGatewayConstructProps } from '../../../../components/api-gateway';
import { PostgresManagerStack } from '../../../../stateful/stacks/postgres-manager/deploy/stack';
import { LambdaLoadCustomCSVConstruct } from './construct/lambda-load-custom-csv';

export type MetadataManagerStackProps = {
/**
@@ -82,6 +83,7 @@ export class MetadataManagerStack extends Stack {
    // 1. To handle API calls
    // 2. To do migrations
    // 3. To sync db with external sources (e.g. metadata in gsheet)
    // 4. To load the db from an external csv presigned url file

    // (1)
    new LambdaAPIConstruct(this, 'APILambda', {
@@ -103,5 +105,11 @@
      dbConnectionSecret: dbSecret,
      isDailySync: props.isDailySync,
    });

    // (4)
    new LambdaLoadCustomCSVConstruct(this, 'CustomCsvLoaderLambda', {
      basicLambdaConfig: basicLambdaConfig,
      dbConnectionSecret: dbSecret,
    });
  }
}
@@ -0,0 +1,35 @@
import django
import os
import logging

from libumccr import libjson

from proc.service.utils import sanitize_lab_metadata_df, warn_drop_duplicated_library

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'app.settings.base')
django.setup()

from proc.service.load_csv_srv import load_metadata_csv, download_csv_to_pandas

logger = logging.getLogger()
logger.setLevel(logging.INFO)


def handler(event, _context):
    logger.info(f'event: {libjson.dumps(event)}')

    csv_url = event.get('url', None)
    if csv_url is None:
        raise ValueError("URL is required")

    csv_df = download_csv_to_pandas(csv_url)
    sanitize_df = sanitize_lab_metadata_df(csv_df)
    duplicate_clean_df = warn_drop_duplicated_library(sanitize_df)
    result = load_metadata_csv(duplicate_clean_df)

    logger.info(f'persist report: {libjson.dumps(result)}')
    return result


if __name__ == '__main__':
    handler({}, {})