Skip to content

Commit

Permalink
Merge pull request #3032 from catalyst-cooperative/dev
Browse files Browse the repository at this point in the history
Dev -> Main sync
  • Loading branch information
jdangerx authored Nov 9, 2023
2 parents ce1b983 + 1bb33dd commit b783809
Show file tree
Hide file tree
Showing 57 changed files with 944 additions and 546 deletions.
1 change: 1 addition & 0 deletions .github/workflows/build-deploy-pudl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ jobs:
--container-env DAGSTER_PG_HOST="104.154.182.24" \
--container-env DAGSTER_PG_DB="dagster-storage" \
--container-env PUDL_SETTINGS_YML="/home/catalyst/src/pudl/package_data/settings/etl_full.yml" \
--container-env FLY_ACCESS_TOKEN=${{ secrets.FLY_ACCESS_TOKEN }} \
# Start the VM
- name: Start the deploy-pudl-vm
Expand Down
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,9 @@ notebooks/*.tgz
terraform/.terraform/*
.env
.hypothesis/

# generated by datasette/publish.py fresh for every deploy - we shouldn't track changes.
devtools/datasette/fly/Dockerfile
devtools/datasette/fly/inspect-data.json
devtools/datasette/fly/metadata.yml
devtools/datasette/fly/all_dbs.tar.zst
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ repos:
# Formatters: hooks that re-write Python & documentation files
####################################################################################
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.1.3
rev: v0.1.4
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
Expand Down
34 changes: 34 additions & 0 deletions devtools/datasette/fly/fly.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# fly.toml app configuration file generated for catalyst-coop-pudl on 2023-11-03T15:31:15-04:00
#
# See https://fly.io/docs/reference/configuration/ for information about how to use this file.
#
app = "catalyst-coop-pudl"
primary_region = "bos"

[[mounts]]
destination = "/data"
source = "datasette"

[[services]]
internal_port = 8080
protocol = "tcp"

[services.concurrency]
hard_limit = 25
soft_limit = 20

[[services.ports]]
handlers = ["http"]
port = 80

[[services.ports]]
handlers = ["tls", "http"]
port = 443

[[services.tcp_checks]]
grace_period = "1m"
interval = 10000
timeout = 2000

[deploy]
wait_timeout = "15m"
10 changes: 10 additions & 0 deletions devtools/datasette/fly/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#! /usr/bin/env bash
# Service entrypoint: unpack the compressed databases into /data, then serve
# them with Datasette.
# -e: exit on the first failing command, -u: treat unset variables as errors,
# -x: echo each command before running it.
set -eux

# Let a glob that matches nothing expand to nothing instead of to itself.
shopt -s nullglob

# Remove any .sqlite files already present under /data before unpacking.
find /data/ -name '*.sqlite' -delete
# Move the archive into /data, decompress it there (-f overwrites an existing
# output file), then extract the tarball into /data.
mv all_dbs.tar.zst /data
zstd -f -d /data/all_dbs.tar.zst -o /data/all_dbs.tar
tar -xf /data/all_dbs.tar --directory /data
# Serve every .sqlite file in /data on $PORT, using the inspect data and
# metadata files found in the working directory.
datasette serve --host 0.0.0.0 /data/*.sqlite --cors --inspect-file inspect-data.json --metadata metadata.yml --setting sql_time_limit_ms 5000 --port $PORT
122 changes: 122 additions & 0 deletions devtools/datasette/publish.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
"""Publish the datasette to fly.io.
We use custom logic here because the datasette-publish-fly plugin bakes the
uncompressed databases into the image, which makes the image too large.
We compress the databases before baking them into the image. Then we decompress
them at runtime to a Fly volume mounted at /data. This avoids a long download
at startup, and allows us to stay within the Fly.io 8GB image size limit.
The volume handling is done manually outside of this publish.py script - it
should be terraformed at some point.
Some static fly.io deployment-related files live in ./fly:
* fly.toml - service configuration
* run.sh - service entrypoint
Apart from that: the Dockerfile and dataset-specific
metadata.yml/inspect-data.json are generated by this script.
"""

import json
import logging
import secrets
from pathlib import Path
from subprocess import check_call, check_output

from pudl.metadata.classes import DatasetteMetadata
from pudl.workspace.setup import PudlPaths

# Configure root logging so every progress message is prefixed with a timestamp.
logging.basicConfig(format="%(asctime)s %(message)s", level=logging.INFO)

# Text of the Dockerfile written into the fly deployment directory. The
# ``{datasette_secret}`` placeholder is filled in by ``make_dockerfile`` via
# ``str.format``.
DOCKERFILE_TEMPLATE = """
FROM python:3.11.0-slim-bullseye
COPY . /app
WORKDIR /app
RUN apt-get update
RUN apt-get install -y zstd
ENV DATASETTE_SECRET '{datasette_secret}'
RUN pip install -U datasette datasette-cluster-map datasette-vega datasette-block-robots
ENV PORT 8080
EXPOSE 8080
CMD ["./run.sh"]
"""


def make_dockerfile():
    """Render the Dockerfile used by ``fly deploy`` and return it as a string.

    A fresh random Datasette secret is generated on every call and substituted
    into the template, so no secret has to be stored or managed elsewhere.
    """
    return DOCKERFILE_TEMPLATE.format(datasette_secret=secrets.token_hex(16))


def inspect_data(datasets, pudl_out):
    """Run ``datasette inspect`` on the databases and retarget their paths.

    datasette-publish-fly performs this inspection during the image build, but
    the databases are not available to our build, so the inspection has to
    happen up front, before the Docker image is built.

    Args:
        datasets: File names of the databases, relative to ``pudl_out``.
        pudl_out: Directory that contains the databases.

    Returns:
        The parsed JSON output of ``datasette inspect``, with each entry's
        ``"file"`` rewritten to the same file name under ``/data``.
    """
    db_paths = [str(pudl_out / ds) for ds in datasets]
    command = ["datasette", "inspect", *db_paths]
    inspected = json.loads(check_output(command))  # noqa: S603

    # Point each entry at /data instead of the local path it was inspected
    # from; only the file name is kept.
    for entry in inspected.values():
        entry["file"] = str(Path("/data") / Path(entry["file"]).name)
    return inspected


def metadata(pudl_out) -> str:
    """Build the human-readable Datasette metadata and return it as YAML."""
    datasette_metadata = DatasetteMetadata.from_data_source_ids(pudl_out)
    return datasette_metadata.to_yaml()


def main():
    """Generate the deployment files, bundle the databases, and deploy to Fly.

    Steps, in order: inspect the SQLite databases found in the PUDL output
    directory, write ``inspect-data.json``, ``metadata.yml`` and the
    ``Dockerfile`` into the ``fly`` directory next to this script, archive the
    databases into ``all_dbs.tar.zst`` in that same directory, and finally run
    ``flyctl deploy`` from there.
    """
    fly_dir = Path(__file__).parent.absolute() / "fly"
    inspect_path = fly_dir / "inspect-data.json"
    metadata_path = fly_dir / "metadata.yml"
    docker_path = fly_dir / "Dockerfile"
    archive_path = fly_dir / "all_dbs.tar.zst"

    pudl_out = PudlPaths().pudl_output
    datasets = [db.name for db in pudl_out.glob("*.sqlite")]

    logging.info(f"Inspecting DBs for datasette: {datasets}...")
    inspect_json = json.dumps(inspect_data(datasets, pudl_out))
    with inspect_path.open("w") as handle:
        handle.write(inspect_json)

    # The remaining generated files follow one pattern: announce, open the
    # target for writing, then render the content into it.
    generated_files = (
        ("Writing metadata...", metadata_path, lambda: metadata(pudl_out)),
        ("Writing Dockerfile...", docker_path, make_dockerfile),
    )
    for message, target, render in generated_files:
        logging.info(message)
        with target.open("w") as handle:
            handle.write(render())

    logging.info(f"Compressing {datasets} and putting into docker context...")
    # Run from the output directory so the archive holds bare file names.
    # NOTE(review): both -a (pick compression from the suffix) and -z (gzip)
    # are passed; confirm which one the tar in use applies to a .zst name.
    tar_command = ["tar", "-a", "-czvf", archive_path, *datasets]
    check_call(tar_command, cwd=pudl_out)  # noqa: S603

    logging.info("Running fly deploy...")
    check_call(["/usr/bin/env", "flyctl", "deploy"], cwd=fly_dir)  # noqa: S603
    logging.info("Deploy finished!")


if __name__ == "__main__":
main()
26 changes: 0 additions & 26 deletions devtools/datasette/publish.sh

This file was deleted.

9 changes: 9 additions & 0 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
FROM condaforge/mambaforge:23.3.1-1

SHELL [ "/bin/bash", "-exo", "pipefail", "-c" ]

# Install curl and js
# awscli requires unzip, less, groff and mandoc
# hadolint ignore=DL3008
Expand All @@ -24,6 +26,10 @@ ENV CONTAINER_HOME=/home/catalyst
USER catalyst
WORKDIR ${CONTAINER_HOME}

# Install flyctl
RUN curl -L https://fly.io/install.sh | sh
ENV PATH="${CONTAINER_HOME}/.fly/bin:$PATH"

ENV CONDA_PREFIX=${CONTAINER_HOME}/env
ENV PUDL_REPO=${CONTAINER_HOME}/pudl
ENV CONDA_RUN="conda run --no-capture-output --prefix ${CONDA_PREFIX}"
Expand All @@ -37,6 +43,9 @@ ENV DAGSTER_HOME=${CONTAINER_PUDL_WORKSPACE}/dagster_home
# Create data input/output directories
RUN mkdir -p ${PUDL_INPUT} ${PUDL_OUTPUT} ${DAGSTER_HOME}

# Copy dagster configuration file
COPY docker/dagster.yaml ${DAGSTER_HOME}/dagster.yaml

# Create a conda environment based on the specification in the repo
COPY test/test-environment.yml test/test-environment.yml
RUN mamba create --copy --prefix ${CONDA_PREFIX} --yes python=${PYTHON_VERSION} && \
Expand Down
12 changes: 12 additions & 0 deletions docker/dagster.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
storage:
postgres:
postgres_db:
username:
env: DAGSTER_PG_USERNAME
password:
env: DAGSTER_PG_PASSWORD
hostname:
env: DAGSTER_PG_HOST
db_name:
env: DAGSTER_PG_DB
port: 5432
1 change: 1 addition & 0 deletions docker/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ services:
environment:
- API_KEY_EIA
- GCP_BILLING_PROJECT
- FLY_ACCESS_TOKEN
env_file:
- .env
build:
Expand Down
15 changes: 9 additions & 6 deletions docker/gcp_pudl_etl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ function run_pudl_etl() {
$PUDL_SETTINGS_YML \
&& pudl_etl \
--loglevel DEBUG \
--max-concurrent 6 \
--gcs-cache-path gs://internal-zenodo-cache.catalyst.coop \
$PUDL_SETTINGS_YML \
&& pytest \
Expand Down Expand Up @@ -86,20 +85,24 @@ function notify_slack() {
# 2>&1 redirects stderr to stdout.
run_pudl_etl 2>&1 | tee $LOGFILE

# Notify slack if the etl succeeded.
# if pipeline is successful, distribute + publish datasette
if [[ ${PIPESTATUS[0]} == 0 ]]; then
notify_slack "success"

# Dump outputs to s3 bucket if branch is dev or build was triggered by a tag
if [ $GITHUB_ACTION_TRIGGER = "push" ] || [ $GITHUB_REF = "dev" ]; then
copy_outputs_to_distribution_bucket
fi

# Deploy the updated data to datasette
if [ $GITHUB_REF = "dev" ]; then
gcloud config set run/region us-central1
source ~/devtools/datasette/publish.sh
python ~/devtools/datasette/publish.py 2>&1 | tee -a $LOGFILE
fi
fi

# Notify slack about entire pipeline's success or failure;
# PIPESTATUS[0] either refers to the failed ETL run or the last distribution
# task that was run above
if [[ ${PIPESTATUS[0]} == 0 ]]; then
notify_slack "success"
else
notify_slack "failure"
fi
Expand Down
20 changes: 10 additions & 10 deletions docs/data_access.rst
Original file line number Diff line number Diff line change
Expand Up @@ -83,42 +83,42 @@ AWS CLI, or programmatically via the S3 API. They can also be downloaded directl
HTTPS using the following links:

* `PUDL SQLite DB <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/pudl.sqlite>`__
* `EPA CEMS Hourly Emissions Parquet (1995-2021) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/hourly_emissions_epacems.parquet>`__
* `EPA CEMS Hourly Emissions Parquet (1995-2022) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/hourly_emissions_epacems.parquet>`__
* `Census DP1 SQLite DB (2010) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/censusdp1tract.sqlite>`__

* Raw FERC Form 1:

* `FERC-1 SQLite derived from DBF (1994-2020) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc1.sqlite>`__
* `FERC-1 SQLite derived from XBRL (2021) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc1_xbrl.sqlite>`__
* `FERC-1 SQLite derived from XBRL (2021-2022) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc1_xbrl.sqlite>`__
* `FERC-1 Datapackage (JSON) describing SQLite derived from XBRL <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc1_xbrl_datapackage.json>`__
* `FERC-1 XBRL Taxonomy Metadata as JSON (2021) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc1_xbrl_taxonomy_metadata.json>`__
* `FERC-1 XBRL Taxonomy Metadata as JSON (2021-2022) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc1_xbrl_taxonomy_metadata.json>`__

* Raw FERC Form 2:

* `FERC-2 SQLite derived from DBF (1996-2020) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc2.sqlite>`__
* `FERC-2 SQLite derived from XBRL (2021) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc2_xbrl.sqlite>`__
* `FERC-2 SQLite derived from XBRL (2021-2022) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc2_xbrl.sqlite>`__
* `FERC-2 Datapackage (JSON) describing SQLite derived from XBRL <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc2_xbrl_datapackage.json>`__
* `FERC-2 XBRL Taxonomy Metadata as JSON (2021) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc2_xbrl_taxonomy_metadata.json>`__
* `FERC-2 XBRL Taxonomy Metadata as JSON (2021-2022) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc2_xbrl_taxonomy_metadata.json>`__

* Raw FERC Form 6:

* `FERC-6 SQLite derived from DBF (2000-2020) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc6.sqlite>`__
* `FERC-6 SQLite derived from XBRL (2021) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc6_xbrl.sqlite>`__
* `FERC-6 SQLite derived from XBRL (2021-2022) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc6_xbrl.sqlite>`__
* `FERC-6 Datapackage (JSON) describing SQLite derived from XBRL <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc6_xbrl_datapackage.json>`__
* `FERC-6 XBRL Taxonomy Metadata as JSON (2021) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc6_xbrl_taxonomy_metadata.json>`__
* `FERC-6 XBRL Taxonomy Metadata as JSON (2021-2022) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc6_xbrl_taxonomy_metadata.json>`__

* Raw FERC Form 60:

* `FERC-60 SQLite derived from DBF (2006-2020) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc60.sqlite>`__
* `FERC-60 SQLite derived from XBRL (2021) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc60_xbrl.sqlite>`__
* `FERC-60 SQLite derived from XBRL (2021-2022) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc60_xbrl.sqlite>`__
* `FERC-60 Datapackage (JSON) describing SQLite derived from XBRL <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc60_xbrl_datapackage.json>`__
* `FERC-60 XBRL Taxonomy Metadata as JSON (2021) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc60_xbrl_taxonomy_metadata.json>`__

* Raw FERC Form 714:

* `FERC-714 SQLite derived from XBRL (2021) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc714_xbrl.sqlite>`__
* `FERC-714 SQLite derived from XBRL (2021-2022) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc714_xbrl.sqlite>`__
* `FERC-714 Datapackage (JSON) describing SQLite derived from XBRL <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc714_xbrl_datapackage.json>`__
* `FERC-714 XBRL Taxonomy Metadata as JSON (2021) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc714_xbrl_taxonomy_metadata.json>`__
* `FERC-714 XBRL Taxonomy Metadata as JSON (2021-2022) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc714_xbrl_taxonomy_metadata.json>`__


.. _access-zenodo:
Expand Down
3 changes: 2 additions & 1 deletion docs/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,8 @@ Data Coverage
^^^^^^^^^^^^^

* Updated :doc:`data_sources/eia860` to include early release data from 2022.
* Updated :doc:`data_sources/eia923` to include early release data from 2022.
* Updated :doc:`data_sources/eia923` to include early release data from 2022 and
monthly YTD data as of April 2023.
* Updated :doc:`data_sources/epacems` to switch from the old FTP server to the new
CAMPD API, and to include 2022 data. Due to changes in the ETL, Alaska, Puerto Rico
and Hawaii are now included in CEMS processing. See issue :issue:`1264` & PRs
Expand Down
Loading

0 comments on commit b783809

Please sign in to comment.