From 6974933377ab2025afce586f28d9441437dadb2c Mon Sep 17 00:00:00 2001 From: Kegan Maher Date: Fri, 25 Oct 2024 21:20:33 +0000 Subject: [PATCH 1/4] refactor(devcontainer): use Dockerfile.local as starting point - remove node install - add google-cloud-cli install - use non-root user --- .devcontainer/Dockerfile | 58 +++++++++++++++++++++++++++----------- warehouse/Dockerfile.local | 31 -------------------- 2 files changed, 41 insertions(+), 48 deletions(-) delete mode 100644 warehouse/Dockerfile.local diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 98adc52ef3..db02b4b529 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -1,22 +1,46 @@ -# See here for image contents: https://github.com/microsoft/vscode-dev-containers/tree/v0.177.0/containers/python-3/.devcontainer/base.Dockerfile +FROM python:3.9 -# [Choice] Python version: 3, 3.9, 3.8, 3.7, 3.6 -ARG VARIANT="3.9" -FROM mcr.microsoft.com/vscode/devcontainers/python:0-${VARIANT} +LABEL org.opencontainers.image.source=https://github.com/cal-itp/data-infra -# [Option] Install Node.js -ARG INSTALL_NODE="true" -ARG NODE_VERSION="lts/*" -RUN if [ "${INSTALL_NODE}" = "true" ]; then su vscode -c "umask 0002 && . /usr/local/share/nvm/nvm.sh && nvm install ${NODE_VERSION} 2>&1"; fi +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + USER=calitp -# [Optional] If your pip requirements rarely change, uncomment this section to add them to the image. 
-# COPY requirements.txt /tmp/pip-tmp/ -# RUN pip3 --disable-pip-version-check --no-cache-dir install -r /tmp/pip-tmp/requirements.txt \ -# && rm -rf /tmp/pip-tmp +# install gcloud CLI from Google's signed apt repo; each apt layer removes its package lists to keep the image small +RUN apt-get update && apt-get install -y apt-transport-https ca-certificates curl gnupg && rm -rf /var/lib/apt/lists/* +RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \ + curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg && \ + apt-get update -y && apt-get install -y google-cloud-cli && rm -rf /var/lib/apt/lists/* -# [Optional] Uncomment this section to install additional OS packages. -# RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \ -# && apt-get -y install --no-install-recommends +# install pygraphviz deps +RUN apt-get update && apt-get install -y libgdal-dev libgraphviz-dev graphviz-dev && rm -rf /var/lib/apt/lists/* -# [Optional] Uncomment this line to install global node packages. -# RUN su vscode -c "source /usr/local/share/nvm/nvm.sh && npm install -g " 2>&1 +# create and switch to non-root user for devcontainer +RUN useradd --create-home --shell /bin/bash $USER && \ + chown -R $USER:$USER /home/$USER +USER $USER + +# setup warehouse deps +WORKDIR /home/$USER/app/warehouse +# pip install location for non-root +ENV PATH="$PATH:/home/$USER/.local/bin" +# upgrade pip, install poetry (--no-cache-dir keeps pip's download cache out of the layer) +RUN python -m pip install --no-cache-dir --upgrade pip && pip install --no-cache-dir poetry + +# copy source files +COPY ./warehouse/pyproject.toml pyproject.toml +COPY ./warehouse/poetry.lock poetry.lock +COPY ./warehouse/dbt_project.yml dbt_project.yml +COPY ./warehouse/packages.yml packages.yml + +# install warehouse deps +RUN poetry install +RUN poetry run dbt deps + +# install dev deps (formatter, memory profiler, git hook runner) +RUN pip install --no-cache-dir black memray pre-commit + +# switch back to app root +WORKDIR /home/$USER/app +# CMD for devcontainers +CMD ["sleep", "infinity"] diff --git a/warehouse/Dockerfile.local b/warehouse/Dockerfile.local deleted file
mode 100644 index 0732f24541..0000000000 --- a/warehouse/Dockerfile.local +++ /dev/null @@ -1,31 +0,0 @@ -FROM python:3.9-buster - -LABEL org.opencontainers.image.source https://github.com/cal-itp/data-infra - -RUN apt-get update -RUN apt-get install -y ca-certificates curl gnupg -RUN mkdir -p /etc/apt/keyrings -RUN curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg -RUN echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" | tee /etc/apt/sources.list.d/nodesource.list -RUN apt-get update \ - && apt-get install -y nodejs libgdal-dev libgraphviz-dev graphviz-dev - -RUN npm install -g --unsafe-perm=true --allow-root netlify-cli - -RUN curl -sSL https://install.python-poetry.org | python3 - -ENV PATH="/root/.local/bin:${PATH}" - -RUN mkdir /app -WORKDIR /app - -COPY ./pyproject.toml /app/pyproject.toml -COPY ./poetry.lock /app/poetry.lock -RUN poetry export -f requirements.txt --without-hashes --output requirements.txt \ - && pip install -r requirements.txt -RUN pip install memray - -COPY ./dbt_project.yml /app/dbt_project.yml -COPY ./packages.yml /app/packages.yml -RUN dbt deps - -CMD ["dbt", "run", "--project-dir", "/app", "--profiles-dir", "/app"] From 5cb3ea30dacf1d842e6bbba2c74fd1b0a5495fe2 Mon Sep 17 00:00:00 2001 From: Kegan Maher Date: Fri, 25 Oct 2024 21:26:15 +0000 Subject: [PATCH 2/4] refactor(devcontainer): use compose.yml to launch/config - extra mounted volumes - env vars modernize devcontainer.json syntax use postAttach to ensure dbt and gcloud are configured --- .devcontainer/compose.yml | 14 +++++++ .devcontainer/devcontainer.json | 72 ++++++++++++--------------------- .devcontainer/postAttach.sh | 21 ++++++++++ 3 files changed, 60 insertions(+), 47 deletions(-) create mode 100644 .devcontainer/compose.yml create mode 100644 .devcontainer/postAttach.sh diff --git a/.devcontainer/compose.yml b/.devcontainer/compose.yml new file 
mode 100644 index 0000000000..40ecc29610 --- /dev/null +++ b/.devcontainer/compose.yml @@ -0,0 +1,14 @@ +services: + dev: + build: + context: .. + dockerfile: .devcontainer/Dockerfile + image: data_infra:dev + entrypoint: sleep infinity + environment: + - DBT_PROFILES_DIR=/home/calitp/.dbt + - GOOGLE_APPLICATION_CREDENTIALS=/home/calitp/.config/gcloud/application_default_credentials.json + volumes: + - ..:/home/calitp/app + - ~/.dbt:/home/calitp/.dbt + - ~/.config/gcloud:/home/calitp/.config/gcloud diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index f6f98754e1..82afbc1ddf 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,59 +1,37 @@ // For format details, see https://aka.ms/devcontainer.json. For config options, see the README at: // https://github.com/microsoft/vscode-dev-containers/tree/v0.177.0/containers/python-3 { - "name": "Python 3", - "build": { - "dockerfile": "Dockerfile", - "context": "..", - "args": { - // Update 'VARIANT' to pick a Python version: 3, 3.6, 3.7, 3.8, 3.9 - "VARIANT": "3.8", - // Options - "INSTALL_NODE": "false", - "NODE_VERSION": "lts/*" - } - }, - - // Set *default* container specific settings.json values on container create. 
- "settings": { - "terminal.integrated.shell.linux": "/bin/bash", - "python.pythonPath": "/usr/local/bin/python", - "python.languageServer": "Pylance", - "python.linting.enabled": true, - "python.linting.pylintEnabled": true, - "python.formatting.autopep8Path": "/usr/local/py-utils/bin/autopep8", - "python.formatting.blackPath": "/usr/local/py-utils/bin/black", - "python.formatting.yapfPath": "/usr/local/py-utils/bin/yapf", - "python.linting.banditPath": "/usr/local/py-utils/bin/bandit", - "python.linting.flake8Path": "/usr/local/py-utils/bin/flake8", - "python.linting.mypyPath": "/usr/local/py-utils/bin/mypy", - "python.linting.pycodestylePath": "/usr/local/py-utils/bin/pycodestyle", - "python.linting.pydocstylePath": "/usr/local/py-utils/bin/pydocstyle", - "python.linting.pylintPath": "/usr/local/py-utils/bin/pylint", + "name": "cal-itp/data-infra", + "dockerComposeFile": ["./compose.yml"], + "service": "dev", + "runServices": ["dev"], + "workspaceFolder": "/home/calitp/app", + "postAttachCommand": ["/bin/bash", ".devcontainer/postAttach.sh"], + "customizations": { + "vscode": { + "settings": { + "terminal.integrated.defaultProfile.linux": "bash", + "terminal.integrated.profiles.linux": { + "bash": { + "path": "/bin/bash" + } + }, + "editor.formatOnSave": true, "files.trimTrailingWhitespace": true, - "files.insertFinalNewline": true - }, + "files.insertFinalNewline": true, + "files.encoding": "utf8", + "files.eol": "\n", + "python.languageServer": "Pylance" + }, - // Add the IDs of extensions you want installed when the container is created. - "extensions": [ + // Add the IDs of extensions you want installed when the container is created. + "extensions": [ "ms-python.python", "ms-python.vscode-pylance", "davidanson.vscode-markdownlint", "bierner.markdown-mermaid", "mhutchie.git-graph" - ], - - // Use 'forwardPorts' to make a list of ports inside the container available locally. 
- // "forwardPorts": [], - - // Use 'postCreateCommand' to run commands after the container is created. - // "postCreateCommand": "pip3 install --user -r requirements.txt", - - // Comment out connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root. - "remoteUser": "vscode", - "portsAttributes": { - "8000": { - "label": "documentation website" - } + ] } + } } diff --git a/.devcontainer/postAttach.sh b/.devcontainer/postAttach.sh new file mode 100644 index 0000000000..1e81d2ade6 --- /dev/null +++ b/.devcontainer/postAttach.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +set -u + +# workaround VS Code devcontainer .git mounting issue; add the entry only once because this script runs on every attach +git config --global --get-all safe.directory | grep -qxF /home/calitp/app || git config --global --add safe.directory /home/calitp/app + +# initialize hook environments +pre-commit install --install-hooks --overwrite + +cd warehouse/ || exit 1 + +if [ ! -f ~/.dbt/profiles.yml ]; then + poetry run dbt init +fi + +poetry run dbt debug + +if [[ $? != 0 ]]; then + gcloud init + gcloud auth application-default login +fi From 47ca6a72166aec6513b504918380cf8d6d91d919 Mon Sep 17 00:00:00 2001 From: Kegan Maher Date: Fri, 25 Oct 2024 21:26:48 +0000 Subject: [PATCH 3/4] chore(warehouse): replace dbt script with compose service script seems unused --- .devcontainer/compose.yml | 15 +++++++++++++++ warehouse/dbt.sh | 3 --- 2 files changed, 15 insertions(+), 3 deletions(-) delete mode 100755 warehouse/dbt.sh diff --git a/.devcontainer/compose.yml b/.devcontainer/compose.yml index 40ecc29610..24dcac18a6 100644 --- a/.devcontainer/compose.yml +++ b/.devcontainer/compose.yml @@ -12,3 +12,18 @@ services: - ..:/home/calitp/app - ~/.dbt:/home/calitp/.dbt - ~/.config/gcloud:/home/calitp/.config/gcloud + + dbt: + build: + context: ..
+ dockerfile: .devcontainer/Dockerfile + image: data_infra:dev + entrypoint: ["poetry", "run", "dbt"] + environment: + - DBT_PROFILES_DIR=/home/calitp/.dbt + - GOOGLE_APPLICATION_CREDENTIALS=/home/calitp/.config/gcloud/application_default_credentials.json + volumes: + - ..:/home/calitp/app + - ~/.dbt:/home/calitp/.dbt + - ~/.config/gcloud:/home/calitp/.config/gcloud + working_dir: /home/calitp/app/warehouse diff --git a/warehouse/dbt.sh b/warehouse/dbt.sh deleted file mode 100755 index 089316bed9..0000000000 --- a/warehouse/dbt.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash -docker build -t local-dbt -f Dockerfile.local . -docker run --entrypoint dbt -e GOOGLE_APPLICATION_CREDENTIALS=/gcloud_config/application_default_credentials.json -v ~/.dbt:/local_dbt -v ~/.config/gcloud:/gcloud_config -v $(pwd):/app local-dbt "$@" --profiles-dir /local_dbt From 6263e2bdf13d210e6d62affd58a408448d0a8205 Mon Sep 17 00:00:00 2001 From: Kegan Maher Date: Fri, 25 Oct 2024 21:50:00 +0000 Subject: [PATCH 4/4] docs: add steps for running devcontainer --- warehouse/README.md | 81 +++++++++++++++++++++++++++++++++------------ 1 file changed, 60 insertions(+), 21 deletions(-) diff --git a/warehouse/README.md b/warehouse/README.md index ed4eb7389e..c5e043fe40 100644 --- a/warehouse/README.md +++ b/warehouse/README.md @@ -27,14 +27,14 @@ are already configured/installed. 3. Execute `poetry install` to create a virtual environment and install requirements. - > [!NOTE] - > If you run into an error complaining about graphviz (e.g. `fatal error: 'graphviz/cgraph.h' file not found`); see [pygraphviz#398](https://github.com/pygraphviz/pygraphviz/issues/398). - > - > ```bash - > export CFLAGS="-I $(brew --prefix graphviz)/include" - > export LDFLAGS="-L $(brew --prefix graphviz)/lib" - > poetry install - > ``` + > [!NOTE] + > If you run into an error complaining about graphviz (e.g. 
`fatal error: 'graphviz/cgraph.h' file not found`); see [pygraphviz#398](https://github.com/pygraphviz/pygraphviz/issues/398). + > + > ```bash + > export CFLAGS="-I $(brew --prefix graphviz)/include" + > export LDFLAGS="-L $(brew --prefix graphviz)/lib" + > poetry install + > ``` 4. Execute `poetry run dbt deps` to install the dbt dependencies defined in `packages.yml` (such as `dbt_utils`). @@ -59,15 +59,15 @@ are already configured/installed. See [the dbt docs on profiles.yml](https://docs.getdbt.com/dbt-cli/configure-your-profile) for more background on this file. - > [!NOTE] - > This default profile template will set a maximum bytes billed of 2 TB; no models should fail with the default lookbacks in our development environment, even with a full refresh. You can override this limit during the init, or change it later by calling init again and choosing to overwrite (or editing the profiles.yml directly). - > - > [!WARNING] - > If you receive a warning similar to the following, do **NOT** overwrite the file. This is a sign that you do not have a `DBT_PROFILES_DIR` variable available in your environment and need to address that first (see step 5). - > - > ```text - > The profile calitp_warehouse already exists in /data-infra/warehouse/profiles.yml. Continue and overwrite it? [y/N]: - > ``` + > [!NOTE] + > This default profile template will set a maximum bytes billed of 2 TB; no models should fail with the default lookbacks in our development environment, even with a full refresh. You can override this limit during the init, or change it later by calling init again and choosing to overwrite (or editing the profiles.yml directly). + > + > [!WARNING] + > If you receive a warning similar to the following, do **NOT** overwrite the file. This is a sign that you do not have a `DBT_PROFILES_DIR` variable available in your environment and need to address that first (see step 5). 
+ > + > ```text + > The profile calitp_warehouse already exists in /data-infra/warehouse/profiles.yml. Continue and overwrite it? [y/N]: + > ``` 7. Check whether `~/.dbt/profiles.yml` was successfully created, e.g. `cat ~/.dbt/profiles.yml`. If you encountered an error, you may create it by hand and fill it with the same content - this will point your models at BigQuery datasets (schemas) in the `cal-itp-data-infra-staging` project that are prefixed with your name, where operations on them will not impact production data: @@ -147,10 +147,10 @@ Once you have performed the setup above, you are good to go run 2. You will need to re-run seeds if new seeds are added, or existing ones are changed. 2. `poetry run dbt run` 1. Wll run all the models, i.e. execute SQL in the warehouse. - 2. In the future, you can specify [selections](https://docs.getdbt.com/reference/node-selection/syntax) (via the `-s` or `--select` flags) to run only a subset of models, otherwise this will run *all* the tables. + 2. In the future, you can specify [selections](https://docs.getdbt.com/reference/node-selection/syntax) (via the `-s` or `--select` flags) to run only a subset of models, otherwise this will run _all_ the tables. 3. By default, your very first `run` is a [full refresh](https://docs.getdbt.com/reference/commands/run#refresh-incremental-models) but you'll need to pass the `--full-refresh` flag in the future if you want to change the schema of incremental tables, or "backfill" existing rows with new logic. -> [!NOTE] +> [!NOTE] > In general, it's a good idea to run `seed` and `run --full-refresh` if you think your local environment is substantially outdated (for example, if you haven't worked on dbt models in a few weeks but want to create or modify a model). We have macros in the project that prevent a non-production "full refresh" from actually processing all possible data. 
Some additional helpful commands: @@ -177,10 +177,10 @@ If this is your first time using the terminal, we recommend reading "[Learning t You can enable [displaying hidden folders/files in macOS Finder](https://www.macworld.com/article/671158/how-to-show-hidden-files-on-a-mac.html) but generally, we recommend using the terminal when possible for editing these files. Generally, `nano ~/.dbt/profiles.yml` will be the easiest method for editing your personal profiles file. `nano` is a simple terminal-based text editor; you use the arrows keys to navigate and the hotkeys displayed at the bottom to save and exit. Reading an [online tutorial for using `nano`](https://www.howtogeek.com/42980/the-beginners-guide-to-nano-the-linux-command-line-text-editor/) may be useful if you haven't used a terminal-based editor before. -> [!NOTE] +> [!NOTE] > These instructions assume you are on macOS, but are largely similar for other operating systems. Most \*nix OSes will have a package manager that you should use instead of Homebrew. > -> [!NOTE] +> [!NOTE] > If you get `Operation not permitted` when attempting to use the terminal, you may need to [fix your terminal permissions](https://osxdaily.com/2018/10/09/fix-operation-not-permitted-terminal-error-macos/) ### Install Homebrew (if you haven't) @@ -303,6 +303,45 @@ and the cal-itp-data-infra-staging project's default service account (`473674835 since the buckets for compiled Python models (`gs://calitp-dbt-python-models` and `gs://test-calitp-dbt-python-models`) as well as external tables exist in the production project. +## Run with VS Code Dev Containers + +This repository comes with a [Dev Containers](https://containers.dev/) configuration that makes it possible to run everything +within VS Code with minimal dependencies, from any operating system. + +1. Ensure you have Docker and Docker Compose installed locally +1. Ensure you have the Dev Containers VS Code extension installed: `ms-vscode-remote.remote-containers` +1. 
If you have never run the DBT project before, create the following directories locally: + + ```console + mkdir ~/.dbt + mkdir -p ~/.config/gcloud + ``` + +1. Open this repository in VS Code +1. When prompted, choose `Reopen in Container` or use the Command Palette: `Ctrl/Cmd` + `Shift` + `P` and type `Dev Containers` +1. If you have never run the DBT project before, once the devcontainer has built and opens, you will be guided through the + initialization process for DBT and Google Cloud CLI. + +You can also run any DBT command from your local machine via Docker Compose. + +Change into the `.devcontainer/` directory: + +```console +cd .devcontainer/ +``` + +Then use `docker compose run` with a `dbt <command>`: + +```console +docker compose run dbt <command> +``` + +E.g. + +```console +docker compose run dbt debug +``` + ## Testing Warehouse Image Changes A person with Docker set up locally can build a development version of the underlying warehouse image at any time after making changes to the Dockerfile or its requirements. From the relevant subfolder, run