Commit e80d83a

Version/aeolus (#13)
* Improvements to allow nodes to be added from measurements in lcs
* Moved to using orjson in the fetch process
  The old method was not keeping ascii characters as-is, and rather than updating the old json.dumps call we are shifting over to orjson, which the lcs method already uses and which has different defaults.
* Updated check to use orjson, added line check before load
* Added sensors_latest update on ingestion
* Updated to use new split_ingest_id method to parse ingest_id
* Testing out a new way to handle the timestamps on ingest
  Passing datetime strings to the ingest method causes it to run about 500-600x slower than passing numeric timestamps, primarily because of the try/catch method we use to determine string vs numeric. I modified the staging tables to accept a string for the datetime and to convert only the numeric timestamps, which puts the string method on par with the numeric one. Since we control the shape of the data coming into this part, I would recommend we settle on a standard format and use try/catch correctly, not as an if/then. (A short sketch follows the change summary below.)
* Fixed timestamp 'bug', added diagnostic data to ingest
* Added providers and timezones check on insert
* More updates to help with ingesting
* Updated ingester
* Adding batch methods for check
* Ingest improvements
* Added start/end date lookup to realtime fetcher
* Cleaning up
* Temporary fix to the airgradient duplication issue
* Updated the lcs class to support the newer data format
  The new class is meant to be more flexible and to work for all file formats.
* Cleaned up to work in a production setting
* Switched source of loader in the handler
* Adding git action
* Fix action branch name
* Updated deployment
* Cleaning up the settings
* Updated from 3.9 and added poetry
* Fixed deployment code
* Changed position of the python install in deployment
* Updated pydantic settings
* Fixing computed field issue
* Adding export plugin
* Updated cdk version
* Removed cdk version from deploy
* Updated the python version to 3.12
* Redeploy with ingesting turned off
* Clean up
* Resetting PAUSE_INGESTING to be False
* Updates to support the CAC data (#14)
* Cleaned up the ingester to work better for CAC data
* Added support for ingesting logging intervals
* Support for ingesting instrument and status
* Added flag ingest method
* Fixed bug in the flag ingest process
  Flags with null notes were not being matched.
* Added the start of some testing files
  The test_flags script is not an automated test yet, but the data files and process should still be helpful to have committed.
* Flagging updates and hourly data rollup fixes
* Some cleanup and bug fixes
  Added some methods that help in the testing/dev environments.
* Fixed bug with inserting new hourly data
* Updated the insterted_hours method
* Removed query
* Redirected the realtime hourly queue update to the new table
* Updated to support uuid
* Changed fake fetchlogs id
* Added backup sd value of zero
* Switched from select_object to get_object
  Some files are too large for select, and we were always selecting the whole file anyway, so there was no reason to use select.
* Added debugging method to the handler
* Changed the location of the downloaded files
* Added git commit hash
* Adding tests for lcsV2
* Added the deployment account to the deployment_id
1 parent a9e60e1 · commit e80d83a

32 files changed: +1183 −173 lines
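
The orjson and timestamp notes in the commit message are easier to see in a short sketch. The snippet below is illustrative only (hypothetical values, not the project's ingest code): json.dumps escapes non-ASCII characters by default while orjson keeps UTF-8 as-is, and an explicit type check on the timestamp avoids paying for a raised exception on every string row.

    import json
    from datetime import datetime, timezone

    import orjson

    payload = {"city": "São Paulo"}
    json.dumps(payload)             # '{"city": "S\u00e3o Paulo"}' -- ensure_ascii=True by default
    orjson.dumps(payload).decode()  # '{"city":"São Paulo"}' -- UTF-8 kept as-is

    def normalize_datetime(value):
        """Convert numeric (epoch) timestamps; pass strings through to the staging table.

        The explicit type check replaces the old try/except-as-if/then pattern,
        which raised and caught an exception for every string timestamp.
        """
        if isinstance(value, (int, float)):
            return datetime.fromtimestamp(value, tz=timezone.utc).isoformat()
        return value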

.github/workflows/deploy.yml  (new file, +76)

@@ -0,0 +1,76 @@
name: Deploy ingestor

on:
  push:
    branches:
      - main

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repo
        uses: actions/checkout@v4

      - name: Configure aws credentials
        uses: aws-actions/configure-aws-credentials@master
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_PROD }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_KEY_PROD }}
          aws-region: ${{ secrets.AWS_REGION }}

      - name: Get envionmental values
        uses: aws-actions/aws-secretsmanager-get-secrets@v2
        with:
          secret-ids: |
            AEOLUS, openaq-env/aeolus
          name-transformation: uppercase
          parse-json-secrets: true

      - uses: actions/setup-node@v4
        with:
          node-version: "20"

      - name: Install CDK
        run: |
          npm install -g aws-cdk

      - uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install Poetry
        uses: snok/install-poetry@v1

      - name: Deploy stack
        env:
          ENV: "aeolus"
          PROJECT: "openaq"

          ## deployment variables
          # CDK_ACCOUNT: ${{ secrets.CDK_ACCOUNT }}
          # CDK_REGION: ${{ secrets.CDK_REGION }}

          VPC_ID: ${{ env.AEOLUS_VPC_ID }}

          TOPIC_ARN: ${{ env.AEOLUS_FETCH_OBJECT_TOPIC_ARN }}

          ## application variables
          DATABASE_READ_USER: ${{ env.AEOLUS_DATABASE_READ_USER }}
          DATABASE_READ_PASSWORD: ${{ env.AEOLUS_DATABASE_READ_PASSWORD }}
          DATABASE_WRITE_USER: ${{ env.AEOLUS_DATABASE_WRITE_USER }}
          DATABASE_WRITE_PASSWORD: ${{ env.AEOLUS_DATABASE_WRITE_PASSWORD }}
          DATABASE_DB: ${{ env.AEOLUS_DATABASE_DB }}
          DATABASE_HOST: ${{ env.AEOLUS_DATABASE_HOST }}
          DATABASE_PORT: ${{ env.AEOLUS_DATABASE_PORT }}
          FETCH_BUCKET: ${{ env.AEOLUS_FETCH_BUCKET }}
          ETL_BUCKET: ${{ env.AEOLUS_FETCH_BUCKET }}
          PAUSE_INGESTING: False

        working-directory: ./cdk
        run: |
          poetry self add poetry-plugin-export
          poetry install
          cdk deploy openaq-ingest-aeolus --require-approval never

README.md  (+2)

@@ -2,3 +2,5 @@
 
 
 # Testing a realtime file
+
+# Testing files

cdk/app.py  (+3 −3)

@@ -32,9 +32,9 @@
     lambda_env=lambda_env,
     fetch_bucket=settings.FETCH_BUCKET,
     vpc_id=settings.VPC_ID,
-    ingest_lambda_timeout=settings.INGEST_LAMBDA_TIMEOUT,
-    ingest_lambda_memory_size=settings.INGEST_LAMBDA_MEMORY_SIZE,
-    ingest_rate_minutes=settings.INGEST_RATE_MINUTES,
+    lambda_timeout=settings.LAMBDA_TIMEOUT,
+    lambda_memory_size=settings.LAMBDA_MEMORY_SIZE,
+    rate_minutes=settings.RATE_MINUTES,
     topic_arn=settings.TOPIC_ARN,
     env=env,
 )

cdk/cdk.json  (+1 −1)

@@ -1,5 +1,5 @@
 {
-    "app": "python app.py",
+    "app": "poetry run python app.py",
     "context": {
         "aws-cdk:enableDiffNoFail": "true",
         "@aws-cdk/core:stackRelativeExports": "true",

cdk/config.py  (+11 −10)

@@ -1,5 +1,8 @@
 from typing import List
-from pydantic import BaseSettings
+from pydantic_settings import (
+    BaseSettings,
+    SettingsConfigDict,
+)
 from pathlib import Path
 from os import environ

@@ -8,19 +11,17 @@ class Settings(BaseSettings):
     FETCH_BUCKET: str
     ENV: str = "staging"
     PROJECT: str = "openaq"
-    INGEST_LAMBDA_TIMEOUT: int = 900
-    INGEST_LAMBDA_MEMORY_SIZE: int = 1536
-    INGEST_RATE_MINUTES: int = 15
+    LAMBDA_TIMEOUT: int = 900
+    LAMBDA_MEMORY_SIZE: int = 1536
+    RATE_MINUTES: int = 15
     LOG_LEVEL: str = 'INFO'
     TOPIC_ARN: str = None
     VPC_ID: str = None

-    class Config:
-        parent = Path(__file__).resolve().parent.parent
-        if 'DOTENV' in environ:
-            env_file = Path.joinpath(parent, environ['DOTENV'])
-        else:
-            env_file = Path.joinpath(parent, ".env")
+
+    model_config = SettingsConfigDict(
+        extra="ignore", env_file=f"../{environ.get('DOTENV', '.env')}", env_file_encoding="utf-8"
+    )


 settings = Settings()
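
One behavior worth noting with the new model_config: the env_file path is built from an f-string at class-definition time, so DOTENV has to be in the environment before config.py is imported. A minimal usage sketch, assuming the module is imported as config and a hypothetical .env.aeolus file sits one directory above cdk/:

    import os

    # Must be set before the import below; SettingsConfigDict's env_file
    # argument is evaluated when the Settings class body runs.
    os.environ["DOTENV"] = ".env.aeolus"   # hypothetical env file name

    from config import settings

    print(settings.LAMBDA_TIMEOUT)   # 900 unless overridden in ../.env.aeolus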

cdk/lambda_ingest_stack.py  (+8 −8)

@@ -30,9 +30,9 @@ def __init__(
         env_name: str,
         lambda_env: Dict,
         fetch_bucket: str,
-        ingest_lambda_timeout: int,
-        ingest_lambda_memory_size: int,
-        ingest_rate_minutes: int = 15,
+        lambda_timeout: int,
+        lambda_memory_size: int,
+        rate_minutes: int = 15,
         topic_arn: str = None,
         vpc_id: str = None,
         **kwargs,
@@ -66,11 +66,11 @@ def __init__(
             ),
             handler="ingest.handler.handler",
             vpc=vpc_id,
-            runtime=aws_lambda.Runtime.PYTHON_3_9,
+            runtime=aws_lambda.Runtime.PYTHON_3_12,
             allow_public_subnet=True,
-            memory_size=ingest_lambda_memory_size,
+            memory_size=lambda_memory_size,
             environment=stringify_settings(lambda_env),
-            timeout=Duration.seconds(ingest_lambda_timeout),
+            timeout=Duration.seconds(lambda_timeout),
             layers=[
                 create_dependencies_layer(
                     self,
@@ -89,12 +89,12 @@ def __init__(

         # Set how often the ingester will run
         # If 0 the ingester will not run automatically
-        if ingest_rate_minutes > 0:
+        if rate_minutes > 0:
             aws_events.Rule(
                 self,
                 f"{id}-ingest-event-rule",
                 schedule=aws_events.Schedule.cron(
-                    minute=f"0/{ingest_rate_minutes}"
+                    minute=f"0/{rate_minutes}"
                 ),
                 targets=[
                     aws_events_targets.LambdaFunction(ingest_function),

cdk/requirements.txt  (−4)

This file was deleted.

cdk/utils.py  (+7 −5)

@@ -19,14 +19,16 @@ def create_dependencies_layer(
         function_name: str,
         requirements_path: Path
 ) -> aws_lambda.LayerVersion:
-    requirements_file = str(requirements_path.resolve())
+    #requirements_file = str(requirements_path.resolve())
     output_dir = f'../.build/{function_name}'
     layer_id = f'openaq-{function_name}-{env_name}-dependencies'

-    if not environ.get('SKIP_PIP'):
-        print(f'Building {layer_id} from {requirements_file} into {output_dir}')
+    if not environ.get('SKIP_BUILD'):
+        print(f'Building {layer_id} into {output_dir}')
         subprocess.run(
-            f"""python -m pip install -qq -r {requirements_file} \
+            f"""
+            poetry export --without=cdk -o requirements.txt --without-hashes && \
+            poetry run python -m pip install -qq -r requirements.txt \
             -t {output_dir}/python && \
             cd {output_dir}/python && \
             find . -type f -name '*.pyc' | \
@@ -47,5 +49,5 @@ def create_dependencies_layer(
         self,
         layer_id,
         code=layer_code,
-        compatible_runtimes=[aws_lambda.Runtime.PYTHON_3_9]
+        compatible_runtimes=[aws_lambda.Runtime.PYTHON_3_12]
     )

check.py  (+7 −6)

@@ -5,8 +5,7 @@
 import orjson
 import psycopg2

-
-logger = logging.getLogger(__name__)
+logger = logging.getLogger('check.py')

 #os.chdir('/home/christian/git/caparker/openaq-ingestor/ingest')
 #print(os.getcwd())
@@ -77,14 +76,17 @@
 os.environ['USE_TEMP_TABLES'] = 'False'

 from botocore.exceptions import ClientError
-from ingest.handler import cronhandler, logger
+from ingest.handler import cronhandler
 from ingest.settings import settings

 from ingest.lcs import (
     load_metadata,
+    load_metadata_batch,
+)
+
+from ingest.lcsV2 import (
     load_measurements,
     load_measurements_batch,
-    load_metadata_batch,
 )

 from ingest.fetch import (
@@ -154,8 +156,6 @@ def check_realtime_key(key: str, fix: bool = False):
         mark_success(key=key, reset=True)


-logger.debug(settings)
-
 if args.file is not None:
     # check if the files exists
     # is it a realtime file or a lcs file?
@@ -172,6 +172,7 @@ def check_realtime_key(key: str, fix: bool = False):
     # get just the keys
     keys = [log[1] for log in logs]
     # loop through and check each
+    logger.info(f"Downloading {len(keys)} files")
     for idx, key in enumerate(keys):
         if args.download:
             # we may be using the new source pat

ingest/etl_process_measurements.sql  (+29 −8)

@@ -52,12 +52,16 @@ FROM staging_measurements;
 -- that duplicate sensors with the same ingest/source id are created
 -- this is a short term fix
 -- a long term fix would not allow duplicate source_id's
-WITH ranked_sensors AS (
+WITH staged_sensors AS (
+  -- this first part significantly speeds it up on slow machines
+  SELECT DISTINCT ingest_id
+  FROM staging_measurements
+), ranked_sensors AS (
   SELECT s.sensors_id
   , s.source_id
   , RANK() OVER (PARTITION BY s.source_id ORDER BY added_on ASC) as rnk
   FROM sensors s
-  JOIN staging_measurements m ON (s.source_id = m.ingest_id)
+  JOIN staged_sensors m ON (s.source_id = m.ingest_id)
 ), active_sensors AS (
   SELECT source_id
   , sensors_id
@@ -68,6 +72,7 @@ WITH ranked_sensors AS (
 FROM active_sensors s
 WHERE s.source_id=ingest_id;

+
 -- Now we have to fill in any missing information
 -- first add the nodes and systems that dont exist
 -- add just the bare minimum amount of data to the system
@@ -285,6 +290,7 @@ INSERT INTO sensors_rollup (
   , value_latest
   , value_count
   , value_avg
+  , value_sd
   , value_min
   , value_max
   , geom_latest
@@ -299,6 +305,7 @@ WITH numbered AS (
   , sum(1) OVER (PARTITION BY sensors_id) as value_count
   , min(datetime) OVER (PARTITION BY sensors_id) as datetime_min
   , avg(value) OVER (PARTITION BY sensors_id) as value_avg
+  , stddev(value) OVER (PARTITION BY sensors_id) as value_sd
   , row_number() OVER (PARTITION BY sensors_id ORDER BY datetime DESC) as rn
   FROM staging_inserted_measurements
 ), latest AS (
@@ -308,6 +315,7 @@ WITH numbered AS (
   , value
   , value_count
   , value_avg
+  , value_sd
   , datetime_min
   , lat
   , lon
@@ -320,6 +328,7 @@ SELECT l.sensors_id
   , l.value -- last value
   , l.value_count
   , l.value_avg
+  , COALESCE(l.value_sd, 0)
   , l.value -- min
   , l.value -- max
   , public.pt3857(lon, lat)
@@ -348,12 +357,24 @@ SET datetime_last = GREATEST(sensors_rollup.datetime_last, EXCLUDED.datetime_las


 -- Update the table that will help to track hourly rollups
-INSERT INTO hourly_stats (datetime)
-  SELECT date_trunc('hour', datetime)
-  FROM staging_inserted_measurements
-  GROUP BY 1
-ON CONFLICT (datetime) DO UPDATE
-SET modified_on = now();
+-- this is a replacement to the hourly stats table
+WITH inserted_hours AS (
+  -- first we group things, adding an hour to make it time-ending after truncating
+  SELECT datetime + '1h'::interval as datetime
+  , utc_offset(datetime + '1h'::interval, tz.tzid) as tz_offset
+  FROM staging_inserted_measurements m
+  JOIN sensors s ON (s.sensors_id = m.sensors_id)
+  JOIN sensor_systems sy ON (s.sensor_systems_id = sy.sensor_systems_id)
+  JOIN sensor_nodes sn ON (sy.sensor_nodes_id = sn.sensor_nodes_id)
+  JOIN timezones tz ON (sn.timezones_id = tz.timezones_id)
+  GROUP BY 1, 2
+)
+INSERT INTO hourly_data_queue (datetime, tz_offset)
+SELECT as_utc_hour(datetime, tz_offset), tz_offset
+FROM inserted_hours
+GROUP BY 1, 2
+ON CONFLICT (datetime, tz_offset) DO UPDATE
+SET modified_on = now();


 --Update the export queue/logs to export these records
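
The replacement queue keys each batch of inserted measurements on a time-ending hour plus the station's UTC offset. A rough Python sketch of that bucketing idea, assuming standard-library timezone handling rather than the project's utc_offset/as_utc_hour SQL helpers (which this only approximates):

    from datetime import datetime, timedelta, timezone
    from zoneinfo import ZoneInfo

    def hourly_queue_key(datetime_utc: datetime, tzid: str):
        """Approximate the (datetime, tz_offset) pair enqueued for one measurement.

        Time-ending convention: a reading at 10:15 local time belongs to the hour
        ending 11:00 local; the queue keys on that hour (expressed in UTC here)
        plus the station's offset so rollups can run per local hour.
        """
        local = datetime_utc.astimezone(ZoneInfo(tzid))
        tz_offset = local.utcoffset()
        hour_ending_local = (local + timedelta(hours=1)).replace(minute=0, second=0, microsecond=0)
        return hour_ending_local.astimezone(timezone.utc), tz_offset

    # e.g. a reading at 2024-05-01 14:15 UTC from a sensor in America/Denver (UTC-6)
    # is 08:15 local, so it lands in the hour ending 09:00 local (15:00 UTC),
    # queued with a -6 hour offset.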

0 comments