From b01c982e24322f378fe2a1cc0a5ff3de6e1b8f1c Mon Sep 17 00:00:00 2001
From: YaphetKG <45075777+YaphetKG@users.noreply.github.com>
Date: Thu, 19 Sep 2024 14:55:22 -0400
Subject: [PATCH] Dev sync main (#108)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Update _version.py (#86)
* Update _version.py
* Rti merge (#84)
* roger cli prepped for Merge Deploy
* Update Makefile to work with python env
* Update redisgraph-bulk-loader to fix issue with loading MODULE LIST
* Revert "Update redisgraph-bulk-loader to fix issue with loading MODULE LIST"
  This reverts commit 7baf7efa725caac77e5501e948f545a0f4b20e3d.
* Finalized dev deployment of dug inside Catapult Merge: deployment yamls, code changes and configurations
* updated to reflect the Dug-Api updates to FastAPI
* adding multi-label redis by removing 'biolink:' on nodes; edges cannot be fixed after update, so they need to be solved either by changing TranQl AND Plater or by forking bulk-redisgraph to allow colons to be added in the edges
* Working multi-label redis nodes w/ no biolink label
* Latest code changes to deploy working Roger in Merge
* biolink data move to '.' separator
* updates to include new dug fixes, upgraded redis-bulk-loader, and made changes for biolink variables to specify their domain with a 'biolink.' prefix
* adding test roger code
* removed helm deployments
* change docker owner
* remove core.py
* remove dup dev config
* redis graph is not directly used; removing cruft
* remove print statement
* remove logging files
* update requirements
* update requirements
* add redis graph.py
* fix import error for logger
* adding es scheme and ca_path config
* adding es scheme and ca_path config
* adding debug code
* removing debug
* adding nodes args
* adding biolink.
* adding biolink.
* Update requirements.txt
* Update .gitignore
* Update dug_utils.py: handle error when curie not found in validate
* Update __init__.py
* Update config.yaml
* Update dev-config.yaml
* Update docker-compose.yaml
* fixed docker-compose
* adding back postgres volume to docker compose
* env correction, docker compose updates
---------
Co-authored-by: Nathan Braswell
Co-authored-by: esurface
Co-authored-by: braswent
* adding v5.0
* cde-links branch
* pin linkml
* Update config.yaml: collection_action to action
* pop total items before result
* print extracted elements
* Update requirements.txt
* Keep edge provenance (#94)
* Update kgx.py
* Update kgx.py
* Update kgx.py: can't delete edge keys while looping over them.
* just collect then update
* Update requirements.txt (#93)
* Pipeline parameterize restructure (#95)
* roger cli prepped for Merge Deploy
* Update Makefile to work with python env
* Update redisgraph-bulk-loader to fix issue with loading MODULE LIST
* Revert "Update redisgraph-bulk-loader to fix issue with loading MODULE LIST"
  This reverts commit 7baf7efa725caac77e5501e948f545a0f4b20e3d.
* Finalized dev deployment of dug inside Catapult Merge: deployment yamls, code changes and configurations
* updated to reflect the Dug-Api updates to FastAPI
* adding multi-label redis by removing 'biolink:' on nodes; edges cannot be fixed after update, so they need to be solved either by changing TranQl AND Plater or by forking bulk-redisgraph to allow colons to be added in the edges
* Working multi-label redis nodes w/ no biolink label
* Latest code changes to deploy working Roger in Merge
* biolink data move to '.' separator
* updates to include new dug fixes, upgraded redis-bulk-loader, and made changes for biolink variables to specify their domain with a 'biolink.' prefix
* adding test roger code
* removed helm deployments
* change docker owner
* remove core.py
* remove dup dev config
* redis graph is not directly used; removing cruft
* remove print statement
* remove logging files
* update requirements
* update requirements
* add redis graph.py
* fix import error for logger
* adding es scheme and ca_path config
* adding es scheme and ca_path config
* Parameterized annotate tasks with input_data_path and output_data_path
* adding debug code
* removing debug
* adding nodes args
* adding biolink.
* adding biolink.
* Parameterized annotate tasks with input_data_path and output_data_path (#85)
* adding lakefs changes to roger-2.0
* point avalon to vg1 branch
* change avalon dep
* update airflow
* fix avalon tag typo
* update jenkins to tag version on main branch only
* update jenkins to tag version
* update jenkins to tag version
* psycopg2 installation
* add cncf k8s req
* use airflow non-slim
* simplified for testing
* simplified for testing
* change dag name
* Erroneous parameter passed, should not be None
* adding pre-exec
* adding pre-exec
* adding pre-exec
* typo preexec
* typo preexec
* fix context
* get files from repo
* get files from repo
* get files from repo
* get files from repo
* First shot at moving pipeline into base class and implementing. Anvil pipeline not complete
* Syntax fix, docker image version bump to airflow 2.7.2-python3.11
* update storage dir
* update remove dir code
* update remove dir code
* remote path to *
* fix input dir for annotators
* fix input dir for annotators
* fix input dir for annotators
* kwargs to task
* kwargs to task
* kwargs to task
* kwargs to task
* kwargs to task
* kwargs to task
* kwargs to task
* kwargs to task
* kwargs to task
* kwargs to task
* kwargs to task
* adding branch info on lakefs config
* callback push to branch
* back to relative import
* reformat temp branch name based on unique task id
* add logging
* add logging
* convert posix path to str for avalon
* add extra / to root path
* New dag created using DugPipeline subclasses
* EmptyOperator imported from wrong place
* import and syntax fixes
* utterly silly syntax error
* Added anvil to default input data sets for testing purposes
* adding / to local path
* commit meta task args empty string
* add merge logic
* add merge logic
* upstream task dir pull for downstream task
* Switched from subdag to taskgroup because latest Airflow deprecated subdag
* Added BACPAC pipeline object
* Temporarily ignoring configuration variable for enabled datasets for testing
* Passed dag in to create task group to see if it helps dag errors
* Fixed silly syntax error
* adding input / output dir params for make kgx
* Trying different syntax to make taskgroups work.
* adding input / output dir params for make kgx
* Parsing, syntax, pylint fixes
* adding input / output dir params for make kgx
* Added pipeline name to task group name to ensure uniqueness
* oops, moved something out of scope. Fixed
* Filled out pipeline with methods from dug_utils. Needs data path changes
* Finished implementing input_data_path and output_data_path handling, pylint cleanup
* Update requirements.txt
* adding toggle to avoid sending config obj
* adding toggle to avoid sending config obj
* disable to string for test
* control pipelines for testing
* add self to anvil get files
* add log stream to make it available
* typo fix
* correcting branch id
* adding source repo
* adding source repo
* patch name-resolver response
* don't pass input repo and branch to pre-exec if not overridden
* don't pass input repo and branch to pre-exec if not overridden
* don't pass input repo and branch to pre-exec if not overridden
* dug pipeline edit
* recursively find recursively
* recursively find recursively
* setup output path for crawling
* all task functions should have input and output params
* adding annotation as upstream for validate index
* revamp create task and task wrapper
* add validate concepts index task
* adding concept validation
* add index_variables task as dependency for validate concepts
* add index_variables task as dependency for validate concepts
* await client exist
* await client exist
* concepts not getting picked up for indexing
* concepts not getting picked up for indexing
* fix search elements
* converting annotation output to json
* json format annotation outputs
* adding support for json format elements and concepts read
* json back to dug objects
* fixing index variables with json objects
* indentation and new line for better change detection
* indentation and new line for better change detection
* treat dictionary concepts as dictionary
* read concepts json as a dict
* concepts files are actually file paths
* debug message
* make output jsonable
* clear up dir after commit, and delete unmerged branch even if no changes
* don't clear indexes, parallel dataset processing will be taxed
* memory leak?
* memory leak?
* memory leak?
* dumping pickles to debug locally
* find out why concepts are being added to every other element
* find out why concepts are being added to every other element
* pointless shuffle 🤷‍♂️
* revert back in time
* back to sanitize dug
* output just json for annotation
* adding jsonpickle
* jsonpickle 🥒
* unpickle for index
* unpickle for validate index
* crawling fixes
* crawling fixes
* crawling validation fixes
* fix index concepts
* fix makekgx
* adding other bdc pipelines
* adding pipeline parameters to be able to configure per instance
* fix
* add input dataset for pipelines
* Adding README to document how to create data set-specific pipelines
* catchup on base.py
* Added dbgap and nida pipelines
* fix import errors
* annotator modules added by passing config val (#90)
* annotator modules added by passing config val
* fix merge conflict
* following same pattern as parsers, modify configs
* fix to dug config method
* fix old dug pipeline for backward compatibility
* correct default annotator type
* reflective changes
* typo extra quotes
* annotator type not being picked up from config
* remove annotate simple, log env value for lakefs enabled
* testing lakefs off
* add more logging
* add more logging
* post init for config to parse to boolean
* put back task calls
* revert some changes
* adding new pipeline
* lakefs io support for merge task
* fix name
* add io params for kg tasks
* wire up i/o paths for merge
* fix variable name
* print files
* few debug logs
* few debug logs
* treat path as path not str
* few debug logs
* some fixes
* logging edge files
* bug fix knowledge has edge
* re-org graph structure
* adding pathing for other tasks
* pagination logic fix for avalon
* update lakefs client code
* fix glob for get kgx files
* fix up get merged objects
* send down fake commit id for metadata
* working on edges schema
* bulk create nodes I/O
* find schema file
* bulk create edges I/O
* bulk create edges I/O
* bulk load io
* no outputs for final tasks
* add recursive glob
* fix globbing
* oops
* delete dags
* pin dug to latest release
* cruft cleanup
* re-org kgx config
* add support for multiple initial repos
* fix comma
* create dir to download to
* swap branch and repo
* clean up dirs
* fix up other pipeline 👌
---------
Co-authored-by: YaphetKG
* Add heal parsers (#96)
* annotator modules added by passing config val
* fix merge conflict
* following same pattern as parsers, modify configs
* fix to dug config method
* fix old dug pipeline for backward compatibility
* correct default annotator type
* reflective changes
* typo extra quotes
* annotator type not being picked up from config
* remove annotate simple, log env value for lakefs enabled
* testing lakefs off
* add more logging
* add more logging
* post init for config to parse to boolean
* put back task calls
* revert some changes
* adding new pipeline
* lakefs io support for merge task
* fix name
* add io params for kg tasks
* wire up i/o paths for merge
* fix variable name
* print files
* few debug logs
* few debug logs
* treat path as path not str
* few debug logs
* some fixes
* logging edge files
* bug fix knowledge has edge
* re-org graph structure
* adding pathing for other tasks
* pagination logic fix for avalon
* update lakefs client code
* fix glob for get kgx files
* fix up get merged objects
* send down fake commit id for metadata
* working on edges schema
* bulk create nodes I/O
* find schema file
* bulk create edges I/O
* bulk create edges I/O
* bulk load io
* no outputs for final tasks
* add recursive glob
* fix globbing
* oops
* delete dags
* pin dug to latest release
* cruft cleanup
* re-org kgx config
* add support for multiple initial repos
* fix comma
* create dir to download to
* swap branch and repo
* clean up dirs
* fix up other pipeline 👌
* add remaining pipelines
* adding ctn parser
* change merge strategy
* merge init fix
* debug dir
* fix topmed file read
* fix topmed file read
* return file names as strings
* topmed kgx builder custom
* topmed kgx builder custom
* add skip
* get files pattern recursive
* version pin avalon
* pin dug
---------
Co-authored-by: braswent
* Add heal parsers (#97)
* annotator modules added by passing config val
* fix merge conflict
* following same pattern as parsers, modify configs
* fix to dug config method
* fix old dug pipeline for backward compatibility
* correct default annotator type
* reflective changes
* typo extra quotes
* annotator type not being picked up from config
* remove annotate simple, log env value for lakefs enabled
* testing lakefs off
* add more logging
* add more logging
* post init for config to parse to boolean
* put back task calls
* revert some changes
* adding new pipeline
* lakefs io support for merge task
* fix name
* add io params for kg tasks
* wire up i/o paths for merge
* fix variable name
* print files
* few debug logs
* few debug logs
* treat path as path not str
* few debug logs
* some fixes
* logging edge files
* bug fix knowledge has edge
* re-org graph structure
* adding pathing for other tasks
* pagination logic fix for avalon
* update lakefs client code
* fix glob for get kgx files
* fix up get merged objects
* send down fake commit id for metadata
* working on edges schema
* bulk create nodes I/O
* find schema file
* bulk create edges I/O
* bulk create edges I/O
* bulk load io
* no outputs for final tasks
* add recursive glob
* fix globbing
* oops
* delete dags
* pin dug to latest release
* cruft cleanup
* re-org kgx config
* add support for multiple initial repos
* fix comma
* create dir to download to
* swap branch and repo
* clean up dirs
* fix up other pipeline 👌
* add remaining pipelines
* adding ctn parser
* change merge strategy
* merge init fix
* debug dir
* fix topmed file read
* fix topmed file read
* return file names as strings
* topmed kgx builder custom
* topmed kgx builder custom
* add skip
* get files pattern recursive
* version pin avalon
* pin dug
---------
Co-authored-by: braswent
* Radx pipeline (#99)
* point to large download
* fix schema path
* debug bulk input dir
* fix schema read
* fix schema read
* fix schema read
* commenting setup dir for test
* adding logs
* fix path stuff
* add commented stuff back in
* testing radx parser
* adding parser
* skip indexing vars with no id
* adding indexes as part of bulk loader parameters
* fix id index cli arg
* fix local cli
* dug latest
---------
Co-authored-by: Nathan Braswell
Co-authored-by: esurface
Co-authored-by: braswent
Co-authored-by: Michael T. Bacon
Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com>
* pin avalon
* deleted jenkins and added workflows
* unlinked helx-actions
* testing paths
* testing again
* d
* tests
* commented out pytest
* try again for bandit
* commented out bandit
* changed dag to dags
* Added fixes
* Bagel (#103)
* bump dug version
* adding bdc new pipelines
* adding curesc
* adding bagel config
* add test parser
* add score threshold
* point to dug develop
* Dbgap programs (#104)
* bump dug version
* adding bdc new pipelines
* adding curesc
* fix up merge conflict
* adding bagel config parse to bool
* remove jenkins file
* bump apache version
* revert airflow version
---------
Co-authored-by: Nathan Braswell
Co-authored-by: esurface
Co-authored-by: braswent
Co-authored-by: Howard Lander
Co-authored-by: Michael T. Bacon
Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com>
Co-authored-by: Patrick Hachicho
Co-authored-by: Patrick hachicho <105758539+pchachicho@users.noreply.github.com>
---
 .github/workflows/build-push-dev-image.yml |  86 ++++++++++++++
 .github/workflows/build-push-release.yml   | 131 +++++++++++++++++++++
 .github/workflows/code-checks.yml          | 129 ++++++++++++++++++++
 .github/workflows/trivy-pr-scan.yml        |  67 +++++++++++
 Dockerfile                                 |   2 +-
 Jenkinsfile                                |  84 -------------
 dags/roger/config/__init__.py              |  18 +++
 dags/roger/config/config.yaml              |  22 +++-
 dags/roger/pipelines/bdc_pipelines.py      |  48 ++++++++
 requirements.txt                           |   2 +-
 10 files changed, 498 insertions(+), 91 deletions(-)
 create mode 100644 .github/workflows/build-push-dev-image.yml
 create mode 100644 .github/workflows/build-push-release.yml
 create mode 100644 .github/workflows/code-checks.yml
 create mode 100644 .github/workflows/trivy-pr-scan.yml
 delete mode 100644 Jenkinsfile
 create mode 100644 dags/roger/pipelines/bdc_pipelines.py

diff --git a/.github/workflows/build-push-dev-image.yml b/.github/workflows/build-push-dev-image.yml
new file mode 100644
index 00000000..13f8cfb7
--- /dev/null
+++ b/.github/workflows/build-push-dev-image.yml
@@ -0,0 +1,86 @@
+# Workflow responsible for the
+# development release processes.
+#
+name: Build-Push-Dev-Image
+on:
+  push:
+    branches:
+      - develop
+    paths-ignore:
+      - README.md
+      - .old_cicd/*
+      - .github/*
+      - .github/workflows/*
+      - LICENSE
+      - .gitignore
+      - .dockerignore
+      - .githooks
+  # Do not build another image on a pull request.
+  # Any push to develop will trigger a new build however.
+  pull_request:
+    branches-ignore:
+      - '*'
+
+jobs:
+  build-push-dev-image:
+    runs-on: ubuntu-latest
+    steps:
+
+      - name: Checkout Code
+        uses: actions/checkout@v3
+        with:
+          ref: ${{ github.head_ref }}
+          # fetch-depth: 0 means, get all branches and commits
+          fetch-depth: 0
+
+      - name: Set short git commit SHA
+        id: vars
+        run: |
+          echo "short_sha=$(git rev-parse --short ${{ github.sha }})" >> $GITHUB_OUTPUT
+          # https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/
+
+      - name: Confirm git commit SHA output
+        run: echo ${{ steps.vars.outputs.short_sha }}
+
+      # Docker Buildx is important to caching in the Build And Push Container
+      # step
+      # https://github.com/marketplace/actions/build-and-push-docker-images
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+        with:
+          driver-opts: |
+            network=host
+
+      - name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+          logout: true
+
+      - name: Login to Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: containers.renci.org
+          username: ${{ secrets.CONTAINERHUB_USERNAME }}
+          password: ${{ secrets.CONTAINERHUB_TOKEN }}
+          logout: true
+
+
+      # Notes on Cache:
+      # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache
+      - name: Build Push Container
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          push: true
+          # Push to renci-registry and dockerhub here.
+          # cache comes from dockerhub.
+          tags: |
+            ${{ github.repository }}:develop
+            ${{ github.repository }}:${{ steps.vars.outputs.short_sha }}
+            containers.renci.org/${{ github.repository }}:develop
+            containers.renci.org/${{ github.repository }}:${{ steps.vars.outputs.short_sha }}
+          cache-from: type=registry,ref=${{ github.repository }}:buildcache-dev
+          cache-to: type=registry,ref=${{ github.repository }}:buildcache-dev,mode=max
\ No newline at end of file
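The "Set short git commit SHA" step above derives the image tag by shelling out to git and publishing the result via $GITHUB_OUTPUT. A minimal Python sketch of the same derivation, assuming only that git is on PATH (an illustration, not part of the workflow):

    import subprocess

    def short_sha(ref: str = "HEAD") -> str:
        # Abbreviated commit hash, as printed by `git rev-parse --short <ref>`.
        return subprocess.run(
            ["git", "rev-parse", "--short", ref],
            check=True, capture_output=True, text=True,
        ).stdout.strip()

    print(short_sha())  # e.g. "b01c982"; the workflow uses this as an image tag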
diff --git a/.github/workflows/build-push-release.yml b/.github/workflows/build-push-release.yml
new file mode 100644
index 00000000..07b22d21
--- /dev/null
+++ b/.github/workflows/build-push-release.yml
@@ -0,0 +1,131 @@
+# Workflow responsible for the
+# major release processes.
+#
+
+name: Build-Push-Release
+on:
+  push:
+    branches:
+      - master
+      - main
+    paths-ignore:
+      - README.md
+      - .old_cicd/*
+      - .github/*
+      - .github/workflows/*
+      - LICENSE
+      - .gitignore
+      - .dockerignore
+      - .githooks
+    tags-ignore:
+      - '*'
+jobs:
+  build-push-release:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v3
+        with:
+          ref: ${{ github.head_ref }}
+          fetch-depth: 0
+
+      - name: Set short git commit SHA
+        id: vars
+        run: |
+          echo "short_sha=$(git rev-parse --short ${{ github.sha }})" >> $GITHUB_OUTPUT
+          # https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/
+
+      - name: Confirm git commit SHA output
+        run: echo ${{ steps.vars.outputs.short_sha }}
+
+      # https://github.com/marketplace/actions/git-semantic-version
+      - name: Semver Check
+        uses: paulhatch/semantic-version@v5.0.3
+        id: version
+        with:
+          # The prefix to use to identify tags
+          tag_prefix: "v"
+          # A string which, if present in a git commit, indicates that a change represents a
+          # major (breaking) change, supports regular expressions wrapped with '/'
+          major_pattern: "/breaking:|major:/"
+          # A string which indicates the flags used by the `major_pattern` regular expression. Supported flags: idgs
+          major_regexp_flags: "ig"
+          # Same as above except indicating a minor change, supports regular expressions wrapped with '/'
+          minor_pattern: "/feat:|feature:|minor:/"
+          # A string which indicates the flags used by the `minor_pattern` regular expression. Supported flags: idgs
+          minor_regexp_flags: "ig"
+          # A string to determine the format of the version output
+          # version_format: "${major}.${minor}.${patch}-prerelease${increment}"
+          version_format: "${major}.${minor}.${patch}"
+          search_commit_body: false
+
+      # Docker Buildx is important to caching in the Build And Push Container
+      # step
+      # https://github.com/marketplace/actions/build-and-push-docker-images
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+        with:
+          driver-opts: |
+            network=host
+
+      - name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+          logout: true
+
+      - name: Login to Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: containers.renci.org
+          username: ${{ secrets.CONTAINERHUB_USERNAME }}
+          password: ${{ secrets.CONTAINERHUB_TOKEN }}
+          logout: true
+
+      # Notes on Cache:
+      # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache
+      - name: Build Push Container
+        uses: docker/build-push-action@v5
+        with:
+          push: true
+          # Push to renci-registry and dockerhub here.
+          # cache comes from dockerhub.
+          tags: |
+            containers.renci.org/${{ github.repository }}:v${{ steps.version.outputs.version }}
+            containers.renci.org/${{ github.repository }}:latest
+            containers.renci.org/${{ github.repository }}:${{ steps.vars.outputs.short_sha }}
+            ${{ github.repository }}:v${{ steps.version.outputs.version }}
+            ${{ github.repository }}:latest
+            ${{ github.repository }}:${{ steps.vars.outputs.short_sha }}
+          cache-from: type=registry,ref=${{ github.repository }}:buildcache-release
+          cache-to: type=registry,ref=${{ github.repository }}:buildcache-release,mode=max
+
+#==========================TAG & RELEASE W/ NOTES =========================
+
+      # Note: GITHUB_TOKEN is autogenerated feature of github app
+      # which is auto-enabled when using github actions.
+      # https://docs.github.com/en/actions/security-guides/automatic-token-authentication
+      # https://docs.github.com/en/rest/git/tags?apiVersion=2022-11-28#create-a-tag-object
+      # https://docs.github.com/en/rest/git/refs?apiVersion=2022-11-28#create-a-reference
+      # This creates a "lightweight" ref tag.
+      - name: Create Tag for Release
+        run: |
+          curl \
+            -s --fail -X POST \
+            -H "Accept: application/vnd.github+json" \
+            -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
+            -H "X-GitHub-Api-Version: 2022-11-28" \
+            https://api.github.com/repos/${{ github.repository }}/git/refs \
+            -d '{"ref":"refs/tags/v${{ steps.version.outputs.version }}","sha":"${{ github.sha }}"}'
+
+      # https://cli.github.com/manual/gh_release_create
+      - name: Create Release
+        env:
+          RELEASE_VERSION: ${{ steps.version.outputs.version }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          gh release create ${{ env.RELEASE_VERSION }} \
+            -t "${{ env.RELEASE_VERSION }}" \
+            --generate-notes \
+            --latest
\ No newline at end of file
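The Semver Check step classifies commits by matching `major_pattern` and `minor_pattern` against commit messages; the "ig" flags make the match case-insensitive and global. A rough Python sketch of that bump logic, offered only as an approximation for illustration (paulhatch/semantic-version's actual implementation also walks existing tags, tracks increments, and more):

    import re

    MAJOR = re.compile(r"breaking:|major:", re.IGNORECASE)
    MINOR = re.compile(r"feat:|feature:|minor:", re.IGNORECASE)

    def next_version(subjects: list[str], current=(0, 0, 0)) -> str:
        # Highest-priority match wins: major bump > minor bump > patch bump.
        major, minor, patch = current
        if any(MAJOR.search(s) for s in subjects):
            return f"{major + 1}.0.0"
        if any(MINOR.search(s) for s in subjects):
            return f"{major}.{minor + 1}.0"
        return f"{major}.{minor}.{patch + 1}"

    print(next_version(["feat: add bagel config"], (2, 4, 1)))  # -> "2.5.0"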
diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml
new file mode 100644
index 00000000..b7f3e6a5
--- /dev/null
+++ b/.github/workflows/code-checks.yml
@@ -0,0 +1,129 @@
+# Workflow responsible for core acceptance testing.
+# Tests Currently Run:
+# - flake8-linter
+# - PYTest
+# - Bandit
+# For PR Vulnerability Scanning a separate workflow will run.
+# The build-push-dev-image and build-push-release workflows
+# handle the develop and release image storage respectively.
+#
+#
+
+name: Code-Checks
+on:
+  push:
+    branches-ignore:
+      - master
+      - main
+      - develop
+  pull_request:
+    branches:
+      - develop
+      - master
+      - main
+    types: [opened, synchronize]
+    paths-ignore:
+      - README.md
+      - .old_cicd/*
+      - .github/*
+      - .github/workflows/*
+      - LICENSE
+      - .gitignore
+      - .dockerignore
+      - .githooks
+
+jobs:
+  ############################## flake8-linter ##############################
+  flake8-linter:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.12"
+
+      # Currently actions/setup-python supports caching
+      # but the cache is not as robust as cache action.
+      # Here we cache the entire python env which speeds subsequent builds up alot. (alot being scientific term)
+      # Ref: https://blog.allenai.org/python-caching-in-github-actions-e9452698e98d
+      - uses: actions/cache@v3
+        name: Cache Python
+        with:
+          path: ${{ env.pythonLocation }}
+          key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('pyproject.toml') }}
+
+      - name: Install Requirements
+        run: |
+          pip install -r requirements.txt
+
+      - name: Lint with flake8
+        run: |
+          pip install flake8
+          flake8 --ignore=E,W dags
+          # We continue on error here until the code is clean
+          # flake8 --ignore=E,W --exit-zero .
+        continue-on-error: true
+
+  ################################### PYTEST ###################################
+  # pytest:
+  #   runs-on: ubuntu-latest
+  #   steps:
+  #     - uses: actions/checkout@v3
+  #     - name: Set up Python
+  #       uses: actions/setup-python@v4
+  #       with:
+  #         python-version: '3.12'
+
+  #     - name: Install Requirements
+  #       run: |
+  #         pip install -r requirements.txt
+  #         pip install coverage
+  #         pip install ./tests
+
+  #     - name: Test with pytest
+  #       run: |
+  #         make test
+  ############################## test-image-build ##############################
+  test-image-build:
+    runs-on: ubuntu-latest
+    # if: ${{ github.actor == 'dependabot[bot]' }}
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Set short git commit SHA
+        id: vars
+        run: |
+          echo "short_sha=$(git rev-parse --short ${{ github.sha }})" >> $GITHUB_OUTPUT
+          # https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/
+      - name: Confirm git commit SHA output
+        run: echo ${{ steps.vars.outputs.short_sha }}
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+          logout: true
+
+      - name: Parse Github Reference Name
+        id: branch
+        run: |
+          REF=${{ github.ref_name }}
+          echo "GHR=${REF%/*}" >> $GITHUB_OUTPUT
+
+      # Notes on Cache:
+      # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache
+      - name: Build Container
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          push: true
+          tags: |
+            ${{ github.repository }}:test_${{ steps.branch.outputs.GHR }}
+          cache-from: type=registry,ref=${{ github.repository }}:buildcache
+          cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max
diff --git a/.github/workflows/trivy-pr-scan.yml b/.github/workflows/trivy-pr-scan.yml
new file mode 100644
index 00000000..1e7bc060
--- /dev/null
+++ b/.github/workflows/trivy-pr-scan.yml
@@ -0,0 +1,67 @@
+name: trivy-pr-scan
+on:
+  pull_request:
+    branches:
+      - develop
+      - master
+      - main
+    types: [ opened, synchronize ]
+    paths-ignore:
+      - README.md
+      - .old_cicd/*
+      - .github/*
+      - .github/workflows/*
+      - LICENSE
+      - .gitignore
+      - .dockerignore
+      - .githooks
+
+jobs:
+  trivy-pr-scan:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+        with:
+          driver-opts: |
+            network=host
+
+      - name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+          logout: true
+
+      # Notes on Cache:
+      # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache
+      - name: Build Container
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          push: false
+          load: true
+          tags: ${{ github.repository }}:vuln-test
+          cache-from: type=registry,ref=${{ github.repository }}:buildcache
+          cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max
+
+      # We will not be concerned with Medium and Low vulnerabilities
+      - name: Run Trivy vulnerability scanner
+        uses: aquasecurity/trivy-action@master
+        with:
+          image-ref: '${{ github.repository }}:vuln-test'
+          format: 'sarif'
+          severity: 'CRITICAL,HIGH'
+          ignore-unfixed: true
+          output: 'trivy-results.sarif'
+          exit-code: '1'
+      # Scan results should be viewable in GitHub Security Dashboard
+      # We still fail the job if results are found, so below will always run
+      # unless manually canceled.
+      - name: Upload Trivy scan results to GitHub Security tab
+        uses: github/codeql-action/upload-sarif@v2
+        if: '!cancelled()'
+        with:
+          sarif_file: 'trivy-results.sarif'
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index 49c1fd26..7760c983 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,7 +2,7 @@
 FROM apache/airflow:2.7.2-python3.11
 USER root
 RUN apt-get update && \
-    apt-get install -y git nano vim
+    apt-get install -y git nano vim gcc
 COPY requirements.txt requirements.txt
 USER airflow
 RUN pip install -r requirements.txt
diff --git a/Jenkinsfile b/Jenkinsfile
deleted file mode 100644
index a68ba92b..00000000
--- a/Jenkinsfile
+++ /dev/null
@@ -1,84 +0,0 @@
-library 'pipeline-utils@master'
-
-pipeline {
-    agent {
-        kubernetes {
-            label 'kaniko-build-agent'
-            yaml '''
-kind: Pod
-metadata:
-  name: kaniko
-spec:
-  containers:
-  - name: jnlp
-    workingDir: /home/jenkins/agent
-  - name: kaniko
-    workingDir: /home/jenkins/agent
-    image: gcr.io/kaniko-project/executor:debug
-    imagePullPolicy: Always
-    resources:
-      requests:
-        cpu: "512m"
-        memory: "1024Mi"
-        ephemeral-storage: "4Gi"
-      limits:
-        cpu: "1024m"
-        memory: "2048Mi"
-        ephemeral-storage: "8Gi"
-    command:
-    - /busybox/cat
-    tty: true
-    volumeMounts:
-      - name: jenkins-docker-cfg
-        mountPath: /kaniko/.docker
-  volumes:
-  - name: jenkins-docker-cfg
-    projected:
-      sources:
-      - secret:
-          name: rencibuild-imagepull-secret
-          items:
-            - key: .dockerconfigjson
-              path: config.json
-'''
-        }
-    }
-    environment {
-        PATH = "/busybox:/kaniko:/ko-app/:$PATH"
-        DOCKERHUB_CREDS = credentials("${env.CONTAINERS_REGISTRY_CREDS_ID_STR}")
-        REGISTRY = "${env.REGISTRY}"
-        REG_OWNER="helxplatform"
-        REG_APP="roger"
-        COMMIT_HASH="${sh(script:"git rev-parse --short HEAD", returnStdout: true).trim()}"
-        VERSION_FILE="./dags/_version.py"
-        VERSION="${sh(script:'awk \'{ print $3 }\' ./dags/_version.py | xargs', returnStdout: true).trim()}"
-        IMAGE_NAME="${REGISTRY}/${REG_OWNER}/${REG_APP}"
-        TAG1="$BRANCH_NAME"
-        TAG2="$COMMIT_HASH"
-        TAG3="$VERSION"
-        TAG4="latest"
-    }
-    stages {
-        stage('Test') {
-            steps {
-                sh '''
-                echo "Test stage"
-                '''
-            }
-        }
-        stage('Build') {
-            steps {
-                script {
-                    container(name: 'kaniko', shell: '/busybox/sh') {
-                        if (env.BRANCH_NAME == "main") {
-                            // Tag with latest and version iff when pushed to master
-                            kaniko.buildAndPush("./Dockerfile", ["$IMAGE_NAME:$TAG1", "$IMAGE_NAME:$TAG2", "$IMAGE_NAME:$TAG3", "$IMAGE_NAME:$TAG4"])
-                        } else {
-                            kaniko.buildAndPush("./Dockerfile", ["$IMAGE_NAME:$TAG1", "$IMAGE_NAME:$TAG2"])
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
diff --git a/dags/roger/config/__init__.py b/dags/roger/config/__init__.py
index ac9eb23a..71111f39 100644
--- a/dags/roger/config/__init__.py
+++ b/dags/roger/config/__init__.py
@@ -99,6 +99,21 @@ class AnnotationConfig(DictLike):
             "sapbert": {
                 "classification_url": "https://med-nemo.apps.renci.org/annotate/",
                 "annotator_url": "https://babel-sapbert.apps.renci.org/annotate/",
+                "score_threshold": 0.8,
+                "bagel": {
+                    "enabled": False,
+                    "url": "https://bagel.apps.renci.org/group_synonyms_openai",
+                    "prompt": "bagel/ask_classes",
+                    "llm_args": {
+                        "llm_model_name": "gpt-4o-2024-05-13",
+                        "organization": "",
+                        "access_key": "",
+                        "llm_model_args": {
+                            "top_p": 0,
+                            "temperature": 0.1
+                        }
+                    }
+                }
             },
         }
     )
@@ -119,6 +134,9 @@ class AnnotationConfig(DictLike):
         "PATO", "CHEBI", "MONDO", "UBERON", "HP", "MESH", "UMLS"
     ])
 
+    def __post_init__(self):
+        self.annotator_args["sapbert"]["bagel"]["enabled"] = self.annotator_args["sapbert"]["bagel"][
+            "enabled"].lower() == "true"
 
 @dataclass
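Note that the __post_init__ added above assumes the bagel "enabled" value always arrives as a string ("true"/"false", e.g. from a YAML or environment override); on the boolean default False, calling .lower() would raise AttributeError. A tolerant coercion, sketched here with a hypothetical helper that is not part of this patch:

    def as_bool(value) -> bool:
        # Hypothetical helper, not in this patch: pass booleans through and
        # coerce common string spellings ("true"/"1"/"yes") to True.
        if isinstance(value, bool):
            return value
        return str(value).strip().lower() in ("true", "1", "yes")

    assert as_bool(False) is False
    assert as_bool("True") and as_bool("1") and not as_bool("false")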
diff --git a/dags/roger/config/config.yaml b/dags/roger/config/config.yaml
index e9402ce4..c407555f 100644
--- a/dags/roger/config/config.yaml
+++ b/dags/roger/config/config.yaml
@@ -1,6 +1,6 @@
 redisgraph:
   username: ""
-  password: "12345"
+  password: "weak"
   host: localhost
   graph: test
   port: 6379
@@ -42,13 +42,25 @@ bulk_loader:
 annotation:
   clear_http_cache: false
-  annotator_type: monarch
+  annotator_type: sapbert
   annotator_args:
     monarch:
       url: "https://api.monarchinitiative.org/api/nlp/annotate/entities?min_length=4&longest_only=false&include_abbreviation=false&include_acronym=false&include_numbers=false&content="
     sapbert:
       classification_url: "https://med-nemo.apps.renci.org/annotate/"
-      annotator_url: "https://babel-sapbert.apps.renci.org/annotate/"
+      annotator_url: "https://sap-qdrant.apps.renci.org/annotate/"
+      score_threshold: 0.8
+      bagel:
+        enabled: false
+        url: "http://localhost:9099/group_synonyms_openai"
+        prompt: "bagel/ask_classes"
+        llm_args:
+          llm_model_name: "gpt-4o-2024-05-13"
+          organization:
+          access_key:
+          llm_model_args:
+            top_p: 0
+            temperature: 0.1
   normalizer: "https://nodenormalization-dev.apps.renci.org/get_normalized_nodes?conflate=false&description=true&curie="
   synonym_service: "https://name-resolution-sri.renci.org/reverse_lookup"
   ontology_metadata: "https://api.monarchinitiative.org/api/bioentity/"
@@ -93,9 +105,9 @@ indexing:
   action: "files"
 
 elasticsearch:
-  host: elasticsearch
+  host: localhost
   username: elastic
-  password: ""
+  password: "12345"
   nboost_host: ""
   scheme: "http"
   ca_path: ""
diff --git a/dags/roger/pipelines/bdc_pipelines.py b/dags/roger/pipelines/bdc_pipelines.py
new file mode 100644
index 00000000..5a945641
--- /dev/null
+++ b/dags/roger/pipelines/bdc_pipelines.py
@@ -0,0 +1,48 @@
+"Dug pipeline for dbGaP data set"
+
+from roger.pipelines import DugPipeline
+
+class BIOLINCCdbGaPPipeline(DugPipeline):
+    "Pipeline for the dbGaP data set"
+    pipeline_name = 'biolincc'
+    parser_name = 'biolincc'
+
+
+class covid19dbGaPPipeline(DugPipeline):
+    "Pipeline for the dbGaP data set"
+    pipeline_name = 'covid19-dbgap'
+    parser_name = 'covid19'
+
+class dirDbGaPPipeline(DugPipeline):
+    pipeline_name = "dir-dbgap"
+    parser_name = "dir"
+
+class LungMapDbGaPPipeline(DugPipeline):
+    pipeline_name = "lungmap-dbgap"
+    parser_name = "lungmap"
+
+class nsrrDbGaPPipeline(DugPipeline):
+    pipeline_name = "nsrr-dbgap"
+    parser_name = "nsrr"
+
+class ParentDbGaPPipeline(DugPipeline):
+    pipeline_name = "parent-dbgap"
+    parser_name = "parent"
+
+class PCGCDbGaPPipeline(DugPipeline):
+    pipeline_name = "pcgc-dbgap"
+    parser_name = "pcgc"
+
+class RecoverDbGaPPipeline(DugPipeline):
+    pipeline_name = "recover-dbgap"
+    parser_name = "recover"
+
+class TopmedDBGaPPipeline(DugPipeline):
+    pipeline_name = "topmed-gen3-dbgap"
+    parser_name = "topmeddbgap"
+
+class CureSCPipeline(DugPipeline):
+    pipeline_name = "curesc-dbgap"
+    parser_name = "curesc"
+
+
diff --git a/requirements.txt b/requirements.txt
index d1b1f68f..3a0ee223 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,7 +6,7 @@ jsonpickle
 redisgraph-bulk-loader==0.12.3
 pytest
 PyYAML
-git+https://github.com/helxplatform/dug@2.13.1
+git+https://github.com/helxplatform/dug@2.13.2
 orjson
 kg-utils==0.0.6
 bmt==1.1.0
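For reference, wiring in a further data set follows the pattern of dags/roger/pipelines/bdc_pipelines.py above: subclass DugPipeline and set the two class attributes. A sketch using hypothetical names ("example-dbgap" / "example"; the parser name must correspond to a parser Dug actually registers):

    from roger.pipelines import DugPipeline

    class ExampleDbGaPPipeline(DugPipeline):
        "Pipeline for a hypothetical dbGaP-hosted data set"
        pipeline_name = "example-dbgap"  # hypothetical pipeline id
        parser_name = "example"          # must match a registered Dug parser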