Skip to content

Commit

Permalink
First version of the datawarehouse (#859)
Browse files Browse the repository at this point in the history
* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* wip

Signed-off-by: Raphaël Courivaud <[email protected]>

* change and fix models

Signed-off-by: Raphaël Courivaud <[email protected]>

---------

Signed-off-by: Raphaël Courivaud <[email protected]>
  • Loading branch information
rcourivaud authored Sep 9, 2024
1 parent cd3e019 commit c524425
Show file tree
Hide file tree
Showing 140 changed files with 290,471 additions and 162 deletions.
2 changes: 1 addition & 1 deletion .docker/dagster/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ WORKDIR $DAGSTER_HOME

# Setup Nginx configuration
RUN htpasswd -cb /etc/nginx/.htpasswd zlv zlv
COPY docker/nginx/nginx.conf /etc/nginx/sites-available/default
COPY .docker/dagster/nginx/nginx.conf /etc/nginx/sites-available/default

EXPOSE 8080 3000

Expand Down
15 changes: 15 additions & 0 deletions .docker/dagster/nginx/nginx.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Password-protected reverse proxy: listens on 8080 and forwards every request
# to the upstream listening on 127.0.0.1:3000.
server {
    listen 8080;

    location / {
        # HTTP basic authentication against the htpasswd file.
        auth_basic "Restricted Access";
        auth_basic_user_file /etc/nginx/.htpasswd;

        # Forward the request and pass the original host/client details along.
        proxy_pass http://127.0.0.1:3000;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
    }
}

37 changes: 37 additions & 0 deletions .docker/metabase/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Use the OpenJDK image as the base
FROM openjdk:19-buster


# Build-time inputs: object-storage host, credentials, bucket and the name of
# the DuckDB file to fetch.
ARG CELLAR_ADDON_HOST
ARG CELLAR_ADDON_KEY_ID
ARG CELLAR_ADDON_KEY_SECRET
ARG CELLAR_ADDON_BUCKET
ARG FILENAME=metabase.duckdb

# NOTE(review): the credentials passed as build args are persisted in the image
# through ENV; consider injecting them at container runtime instead.
ENV MB_PLUGINS_DIR=/home/plugins/
ENV AWS_ACCESS_KEY_ID=$CELLAR_ADDON_KEY_ID
ENV AWS_SECRET_ACCESS_KEY=$CELLAR_ADDON_KEY_SECRET
ENV CELLAR_ADDON_BUCKET=$CELLAR_ADDON_BUCKET
ENV AWS_ENDPOINT_URL=https://$CELLAR_ADDON_HOST
ENV FILE_PATH=s3://$CELLAR_ADDON_BUCKET/$FILENAME
# ARG values do not exist at container runtime; persist FILENAME so the
# shell-form CMD below expands it to the intended target file name.
ENV FILENAME=$FILENAME

# Install pip (for the S3 clients) and cron, then drop the apt package lists to
# keep the layer small.
RUN apt-get update && apt-get install -y \
    python3-pip \
    python3-setuptools \
    cron \
    && rm -rf /var/lib/apt/lists/*

RUN pip3 install s3cmd awscli

ADD https://downloads.metabase.com/v0.50.22/metabase.jar /home
ADD https://github.com/MotherDuck-Open-Source/metabase_duckdb_driver/releases/download/0.2.9/duckdb.metabase-driver.jar $MB_PLUGINS_DIR

RUN mkdir -p /data/duckdb/
RUN chmod 744 /home/plugins/duckdb.metabase-driver.jar

# Set up a cron job that refreshes the DuckDB file every day at 03:00.
# The variables are expanded at build time, so the crontab line holds literal values.
# NOTE(review): cron jobs usually do not inherit the container environment, so the
# AWS_* credentials may be missing when this job runs - confirm and export them if needed.
RUN echo "0 3 * * * root aws s3 --endpoint-url $AWS_ENDPOINT_URL cp $FILE_PATH /data/duckdb/$FILENAME" >> /etc/crontab

# Startup command: download the DuckDB file, start cron, then run Metabase.
# `exec` replaces the shell with the JVM so it receives stop signals directly.
CMD aws s3 --endpoint-url $AWS_ENDPOINT_URL cp $FILE_PATH /data/duckdb/$FILENAME && \
    cron && \
    exec java -jar /home/metabase.jar
Empty file added .docker/metabase/metabase.jar
Empty file.
15 changes: 11 additions & 4 deletions .github/workflows/github-actions-data-stack.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,20 @@ name: Data Stack CI

on:
push:
paths:
- analytics/**

jobs:
deploy-dagster:
uses: ./.github/workflows/deploy.yml
with:
app: dagster-production
branch: main
app: Dagster (Production)
alias: dagster-production
branch: feat/data-*
secrets: inherit

deploy-metabase:
uses: ./.github/workflows/deploy.yml
with:
app: Production - Metabase
alias: metabase-production
branch: feat/data-*
secrets: inherit
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,10 @@ __pycache__
analytics/dagster/storage/
.duckdb
.jsonl
analytics/*.duckdb
analytics/dbt/*.duckdb
analytics/exploration/data/*
exploration/data/*
data/
data/*
analytics/plugins/
10 changes: 10 additions & 0 deletions analytics/Dockerfile.metabase
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Metabase image with the DuckDB driver installed as a plugin.
FROM openjdk:19-buster

# Directory Metabase is pointed at for plugins (the driver jar is added there).
ENV MB_PLUGINS_DIR=/home/plugins/

# Fetch the Metabase jar and the DuckDB driver jar.
ADD https://downloads.metabase.com/v0.46.2/metabase.jar /home
ADD https://github.com/MotherDuck-Open-Source/metabase_duckdb_driver/releases/download/0.2.9/duckdb.metabase-driver.jar $MB_PLUGINS_DIR

RUN chmod 744 "${MB_PLUGINS_DIR}duckdb.metabase-driver.jar"

CMD ["java", "-jar", "/home/metabase.jar"]
3 changes: 2 additions & 1 deletion analytics/dagster/docker/nginx/nginx.conf
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@ server {
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
}
}
}

5 changes: 4 additions & 1 deletion analytics/dagster/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
dagster-dbt
dbt-duckdb

duckdb
matplotlib
pandas
requests
dlt[duckdb]


9 changes: 5 additions & 4 deletions analytics/dbt/dbt_project.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ profile: 'duckdb_profile'

vars:
dateFormat: '%d/%m/%Y'
startDate: '2020-01-01'

seeds:
zlv_dbt_project:
Expand All @@ -34,11 +35,11 @@ clean-targets: # directories to be removed by `dbt clean`
models:
zlv_dbt_project:
staging:
+schema: stg
schema: stg
+materialized: view
intermediate:
+schema: int
+materialized: table
schema: int
+materialized: view
marts:
+schema: mart
schema: marts
+materialized: table
15 changes: 15 additions & 0 deletions analytics/dbt/macros/lovac/filters_lovac.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{#
  filter_lovac(ccthp=False, vacancy=True)

  Renders a complete WHERE clause meant to be appended after a FROM clause.
  Always applied: local_id present, aff = 'H', housing_kind is APPART or MAISON,
  and groupe is NULL or outside (1, 2, 3, 4, 5, 6, 9).
  ccthp   - when true, also require ff_ccthp = 'V'.
  vacancy - when true, also require vacancy_start_year < data_year - 2.
#}
{% macro filter_lovac(ccthp=False, vacancy=True) %}
WHERE
    local_id IS NOT NULL
    AND aff = 'H'
    AND housing_kind IN ('APPART', 'MAISON')
    AND (groupe IS NULL OR groupe NOT IN (1, 2, 3, 4, 5, 6, 9))
    {% if ccthp %}
    AND ff_ccthp = 'V'
    {% endif %}
    {% if vacancy %}
    -- vacancy started more than 2 years before the data year
    AND vacancy_start_year < data_year - 2
    {% endif %}
{% endmacro %}
25 changes: 25 additions & 0 deletions analytics/dbt/macros/lovac/handle_lovac_different_years.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{#
  handle_lovac_different_years()

  Renders a `cleaned_data` CTE followed by the final SELECT. The macro starts
  with a CTE name, so the caller is expected to have opened a WITH clause that
  defines a CTE called `source` and ends with a comma.

  The CTE renames the raw columns to common names (data_year, local_id,
  vacancy_start_year, geo_code, housing_kind, ...) and the final SELECT keeps
  one row per local_id.

  Each output column name is projected exactly once: `groupe` is the
  integer-cast value and `aff` is the raw value.
#}
{% macro handle_lovac_different_years() %}
cleaned_data AS (
    SELECT
        annee AS data_year,
        ff_millesime,
        invariant,
        ff_idlocal AS local_id,
        TRY_CAST(groupe AS INTEGER) AS groupe,
        -- NOTE(review): left uncast, unlike debutvacance below; confirm the source type.
        debutvacance AS vacancy_start_year,
        aff,
        'lovac-' || annee AS file_year,
        annee AS year,
        nature,
        ff_ccthp,
        TRY_CAST(debutvacance AS INTEGER) AS debutvacance,
        ccodep,
        -- 2-character department code followed by 3-character commune code
        lpad(ccodep, 2, '0') || lpad(commune, 3, '0') AS geo_code,
        nature AS housing_kind
    FROM
        source
)

SELECT *
FROM cleaned_data
-- Keep a single row per local_id.
-- NOTE(review): the window is ordered by the constant 1, so which duplicate
-- survives is presumably arbitrary; order by a real column to make it deterministic.
QUALIFY
    ROW_NUMBER() OVER (PARTITION BY local_id ORDER BY 1 DESC) = 1

{% endmacro %}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
-- Every geo_code of stg_common_article_232_1, flagged with is_in = TRUE.
SELECT
    geo_code,
    TRUE AS is_in
FROM {{ ref('stg_common_article_232_1') }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
-- Every geo_code of stg_common_article_232_2 (trailing whitespace removed),
-- flagged with is_in = TRUE.
SELECT
    RTRIM(geo_code) AS geo_code,
    TRUE AS is_in
FROM {{ ref('stg_common_article_232_2') }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
-- One row per insee_code from stg_common_cities, with insee_code also exposed
-- as geo_code.
-- NOTE(review): DISTINCT ON has no ORDER BY, so the row kept for a duplicated
-- insee_code is presumably arbitrary.
SELECT DISTINCT ON (insee_code)
    *,
    insee_code AS geo_code
FROM {{ ref('stg_common_cities') }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
-- Maps each geo_code to a city_code: the district codes listed for Paris,
-- Marseille and Lyon collapse to a single city code, every other geo_code
-- maps to itself.
SELECT
    geo_code,
    CASE
        WHEN geo_code IN (
            '75101', '75102', '75103', '75104', '75105', '75106', '75107',
            '75108', '75109', '75110', '75111', '75112', '75113', '75114',
            '75115', '75116', '75117', '75118', '75119', '75120'
        ) THEN '75056' -- Paris
        WHEN geo_code IN (
            '13201', '13202', '13203', '13204', '13205', '13206', '13207',
            '13208', '13209', '13210', '13211', '13212', '13213', '13214',
            '13215', '13216'
        ) THEN '13055' -- Marseille
        WHEN geo_code IN (
            '69381', '69382', '69383', '69384', '69385', '69386', '69387',
            '69388', '69389'
        ) THEN '69123' -- Lyon
        ELSE geo_code
    END AS city_code
FROM {{ ref('int_common_cities') }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
-- One boolean flag per programme identifier: a flag is TRUE when the
-- corresponding id_* column is filled in.
-- id_acv and id_acv2 get distinct output names so the model has no duplicate
-- column name.
SELECT
    insee_com AS geo_code,
    id_acv IS NOT NULL AS action_coeur_de_ville,
    id_acv2 IS NOT NULL AS action_coeur_de_ville_2,
    id_pvd IS NOT NULL AS petite_ville_de_demain,
    id_va IS NOT NULL AS village_davenir
FROM {{ ref('stg_common_ngeo_anct_cog_2023') }}
6 changes: 6 additions & 0 deletions analytics/dbt/models/intermediate/common/int_common_opah.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
-- Per geo_code: number of rows whose typeprogramme contains 'OPAH', the
-- distinct OPAH programme types concatenated, and the number of rows whose
-- typeprogramme contains 'PIG'.
SELECT
    insee_code AS geo_code,
    SUM(CASE WHEN typeprogramme LIKE '%OPAH%' THEN 1 ELSE 0 END) AS opah,
    STRING_AGG(
        DISTINCT (CASE WHEN typeprogramme LIKE '%OPAH%' THEN typeprogramme END)
    ) AS type_opah,
    SUM(CASE WHEN typeprogramme LIKE '%PIG%' THEN 1 ELSE 0 END) AS pig
FROM {{ ref('stg_common_opah') }}
GROUP BY geo_code
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
-- Daily date spine from 2020-01-01 to CURRENT_DATE, built with dbt_utils.date_spine.
{{
    dbt_utils.date_spine(
        datepart = "day",
        start_date = "cast('2020-01-01' as date)",
        end_date = "CURRENT_DATE"
    )
}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
-- Monthly date spine from 2020-01-01 to CURRENT_DATE, built with dbt_utils.date_spine.
{{
    dbt_utils.date_spine(
        datepart = "month",
        start_date = "cast('2020-01-01' as date)",
        end_date = "CURRENT_DATE"
    )
}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
-- Yearly date spine from 2020-01-01 to CURRENT_DATE, built with dbt_utils.date_spine.
{{
    dbt_utils.date_spine(
        datepart = "year",
        start_date = "cast('2020-01-01' as date)",
        end_date = "CURRENT_DATE"
    )
}}
12 changes: 12 additions & 0 deletions analytics/dbt/models/intermediate/common/schema.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Declares the intermediate "common" models to dbt.
# No descriptions, columns or tests are attached to them here.
version: 2

models:
- name: int_common_cities
- name: int_common_cities_mapping
- name: int_common_spine_days
- name: int_common_spine_months
- name: int_common_spine_years
- name: int_common_ngeo_anct_cog_2023
- name: int_common_opah
- name: int_common_article_232_2
- name: int_common_article_232_1
4 changes: 4 additions & 0 deletions analytics/dbt/models/intermediate/ff/int_ff_ext_2023.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
-- Rows of stg_ff_ext_2023 with ff_dteloc '1' or '2' whose ff_ccogrm is either
-- missing or not one of '1'-'6' / '9'.
SELECT *
FROM {{ ref('stg_ff_ext_2023') }}
WHERE
    ff_dteloc IN ('1', '2')
    AND (
        ff_ccogrm IS NULL
        OR ff_ccogrm NOT IN ('1', '2', '3', '4', '5', '6', '9')
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
-- Number of int_ff_ext_2023 rows per geo_code.
SELECT
    geo_code,
    COUNT(*) AS nombre_logements
FROM {{ ref('int_ff_ext_2023') }}
GROUP BY geo_code
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,7 @@ models:
- name: int_ff_owners_dedup
+enabled: false
description: "Description de ce que fait le modèle lovac, par exemple, analyse des locaux vacants."
- name: int_ff_ext_2023
- name: int_ff_ext_2023_geo_code


2 changes: 2 additions & 0 deletions analytics/dbt/models/intermediate/lovac/int_lovac_2019.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
-- Rows of stg_lovac_2019 kept by the shared filter_lovac WHERE clause,
-- with the ff_ccthp condition enabled.
SELECT *
FROM {{ ref('stg_lovac_2019') }}
{{ filter_lovac(ccthp=True) }}
2 changes: 2 additions & 0 deletions analytics/dbt/models/intermediate/lovac/int_lovac_2020.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
-- Rows of stg_lovac_2020 kept by the shared filter_lovac WHERE clause,
-- with the ff_ccthp condition enabled.
SELECT *
FROM {{ ref('stg_lovac_2020') }}
{{ filter_lovac(ccthp=True) }}
2 changes: 2 additions & 0 deletions analytics/dbt/models/intermediate/lovac/int_lovac_2021.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
-- Rows of stg_lovac_2021 kept by the shared filter_lovac WHERE clause,
-- with the ff_ccthp condition enabled.
SELECT *
FROM {{ ref('stg_lovac_2021') }}
{{ filter_lovac(ccthp=True) }}
2 changes: 2 additions & 0 deletions analytics/dbt/models/intermediate/lovac/int_lovac_2022.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
-- Rows of stg_lovac_2022 kept by the shared filter_lovac WHERE clause,
-- with the ff_ccthp condition enabled.
SELECT *
FROM {{ ref('stg_lovac_2022') }}
{{ filter_lovac(ccthp=True) }}
2 changes: 2 additions & 0 deletions analytics/dbt/models/intermediate/lovac/int_lovac_2023.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
-- Rows of stg_lovac_2023 kept by the shared filter_lovac WHERE clause,
-- with the ff_ccthp condition enabled.
SELECT *
FROM {{ ref('stg_lovac_2023') }}
{{ filter_lovac(ccthp=True) }}
2 changes: 2 additions & 0 deletions analytics/dbt/models/intermediate/lovac/int_lovac_2023_ex.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
-- Rows of stg_lovac_2023 kept by the shared filter_lovac WHERE clause, with the
-- ff_ccthp condition enabled and the vacancy-duration condition disabled.
SELECT *
FROM {{ ref('stg_lovac_2023') }}
{{ filter_lovac(ccthp=True, vacancy=False) }}
2 changes: 2 additions & 0 deletions analytics/dbt/models/intermediate/lovac/int_lovac_2024.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
-- Rows of stg_lovac_2024 kept by the shared filter_lovac WHERE clause
-- with its default arguments.
SELECT *
FROM {{ ref('stg_lovac_2024') }}
{{ filter_lovac() }}
2 changes: 2 additions & 0 deletions analytics/dbt/models/intermediate/lovac/int_lovac_2024_ex.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
-- Rows of stg_lovac_2024 kept by the shared filter_lovac WHERE clause,
-- with the vacancy-duration condition disabled.
SELECT *
FROM {{ ref('stg_lovac_2024') }}
{{ filter_lovac(vacancy=False) }}
45 changes: 45 additions & 0 deletions analytics/dbt/models/intermediate/lovac/int_lovac_geo_code.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
-- Vacancy counts per geo_code for the 2024 and 2023 data, bucketed by the year
-- the vacancy started. A geo_code present in only one of the two years is kept,
-- with 0 for the counts of the missing year.

-- Counts over int_lovac_2024_ex, one row per geo_code.
WITH vacancies_2024 AS (
    SELECT
        geo_code,
        COUNT(*) FILTER (WHERE data_year = 2024 AND vacancy_start_year = 2023) AS vac_2024_1an,
        COUNT(*) FILTER (WHERE data_year = 2024 AND vacancy_start_year = 2022) AS vac_2024_2ans,
        COUNT(*) FILTER (WHERE data_year = 2024 AND vacancy_start_year = 2021) AS vac_2024_3ans,
        COUNT(*) FILTER (WHERE data_year = 2024 AND vacancy_start_year <= 2020) AS vac_2024_4ans_plus,
        COUNT(*) FILTER (WHERE data_year = 2024 AND vacancy_start_year = 2024) AS vac_2024_moins_1an,
        COUNT(*) AS vac_2024_total
    FROM {{ ref('int_lovac_2024_ex') }}
    GROUP BY geo_code
),

-- Counts over int_lovac_2023_ex, one row per geo_code.
vacancies_2023 AS (
    SELECT
        geo_code,
        COUNT(*) FILTER (WHERE data_year = 2023 AND vacancy_start_year = 2022) AS vac_2023_1an,
        COUNT(*) FILTER (WHERE data_year = 2023 AND vacancy_start_year = 2021) AS vac_2023_2ans,
        COUNT(*) FILTER (WHERE data_year = 2023 AND vacancy_start_year = 2020) AS vac_2023_3ans,
        COUNT(*) FILTER (WHERE data_year = 2023 AND vacancy_start_year <= 2019) AS vac_2023_4ans_plus,
        COUNT(*) FILTER (WHERE data_year = 2023 AND vacancy_start_year = 2023) AS vac_2023_moins_1an,
        COUNT(*) AS vac_2023_total
    FROM {{ ref('int_lovac_2023_ex') }}
    GROUP BY geo_code
)

SELECT
    COALESCE(cur.geo_code, prev.geo_code) AS geo_code,
    COALESCE(cur.vac_2024_moins_1an, 0) AS vac_2024_moins_1an,
    COALESCE(cur.vac_2024_1an, 0) AS vac_2024_1an,
    COALESCE(cur.vac_2024_2ans, 0) AS vac_2024_2ans,
    COALESCE(cur.vac_2024_3ans, 0) AS vac_2024_3ans,
    COALESCE(cur.vac_2024_4ans_plus, 0) AS vac_2024_4ans_plus,
    COALESCE(prev.vac_2023_moins_1an, 0) AS vac_2023_moins_1an,
    COALESCE(prev.vac_2023_1an, 0) AS vac_2023_1an,
    COALESCE(prev.vac_2023_2ans, 0) AS vac_2023_2ans,
    COALESCE(prev.vac_2023_3ans, 0) AS vac_2023_3ans,
    COALESCE(prev.vac_2023_4ans_plus, 0) AS vac_2023_4ans_plus,
    COALESCE(prev.vac_2023_total, 0) AS vac_2023_total,
    COALESCE(cur.vac_2024_total, 0) AS vac_2024_total
FROM vacancies_2024 AS cur
FULL OUTER JOIN vacancies_2023 AS prev
    ON cur.geo_code = prev.geo_code
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
-- For every local_id: the list of yearly files it appears in (2019-2024) and
-- the geo_code from its most recent data_year.

-- All filtered yearly models stacked, one row per (local_id, year) occurrence.
WITH lovac_history AS (
    SELECT local_id, 'lovac-' || data_year AS file_year, geo_code, data_year
    FROM {{ ref('int_lovac_2024') }}
    UNION ALL
    SELECT local_id, 'lovac-' || data_year AS file_year, geo_code, data_year
    FROM {{ ref('int_lovac_2023') }}
    UNION ALL
    SELECT local_id, 'lovac-' || data_year AS file_year, geo_code, data_year
    FROM {{ ref('int_lovac_2022') }}
    UNION ALL
    SELECT local_id, 'lovac-' || data_year AS file_year, geo_code, data_year
    FROM {{ ref('int_lovac_2021') }}
    UNION ALL
    SELECT local_id, 'lovac-' || data_year AS file_year, geo_code, data_year
    FROM {{ ref('int_lovac_2020') }}
    UNION ALL
    SELECT local_id, 'lovac-' || data_year AS file_year, geo_code, data_year
    FROM {{ ref('int_lovac_2019') }}
)

SELECT
    local_id,
    -- Ordered inside the aggregate so the list is the same on every run.
    string_agg(file_year, ',' ORDER BY data_year) AS file_years,
    -- geo_code of the row with the highest data_year; this replaces a correlated
    -- subquery that re-scanned lovac_history for every group.
    -- NOTE(review): arg_max skips rows with a NULL geo_code, whereas the former
    -- subquery could return NULL when the latest row had none.
    arg_max(geo_code, data_year) AS geo_code
FROM lovac_history
GROUP BY local_id
Loading

0 comments on commit c524425

Please sign in to comment.