-- Databricks notebook source
-- MAGIC %md
-- MAGIC You may find this series of notebooks at https://github.com/databricks-industry-solutions/omop-cdm. For more information about this solution accelerator, visit https://www.databricks.com/blog/2021/07/19/unlocking-the-power-of-health-data-with-a-modern-data-lakehouse.html.
-- COMMAND ----------
-- MAGIC %md
-- MAGIC # OMOP Vocabulary Setup
-- MAGIC Construct the vocabulary tables from files downloaded from the [Athena](https://athena.ohdsi.org/search-terms/start) website, a copy of which is available at `s3://hls-eng-data-public/data/rwe/omop-vocabs/`.
-- MAGIC If you would like to use a different vocabulary bundle, download it from [Athena](https://athena.ohdsi.org/search-terms/start) and
-- MAGIC use the [Databricks DBFS API](https://docs.databricks.com/dev-tools/api/latest/dbfs.html#dbfs-api) to upload the downloaded files to `dbfs` under your `vocab_path`.
-- MAGIC
-- MAGIC <img align="right" width="700" src="https://drive.google.com/uc?export=view&id=16TU2l7XHjQLugmS_McXegBXKMglD--Fr">
-- COMMAND ----------
-- DBTITLE 1, config
-- MAGIC %python
-- MAGIC omop_version="OMOP531"
-- MAGIC project_name='omop-cdm-100K'
-- MAGIC
-- MAGIC
-- MAGIC vocab_s3_path = "s3://hls-eng-data-public/data/rwe/omop-vocabs/"
-- MAGIC
-- MAGIC print(f"Using OMOP version {omop_version}")
-- MAGIC print(f"Using vocabulary tables in {vocab_s3_path}")
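-- MAGIC # switch to the OMOP database (assumed to have been created by the earlier CDM setup notebook in this series)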
-- MAGIC spark.sql(f"USE {omop_version}")
-- MAGIC
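-- MAGIC # list the vocabulary files staged under vocab_s3_path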
-- MAGIC display(dbutils.fs.ls(vocab_s3_path))
-- COMMAND ----------
-- MAGIC %md
-- MAGIC ## Loading vocabularies as Delta tables
-- COMMAND ----------
-- MAGIC %python
-- MAGIC
-- MAGIC from pyspark.sql.functions import to_date
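-- MAGIC # fall back to Spark's legacy date parsing and Parquet datetime rebase behavior for the vocabulary date columns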
-- MAGIC spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
-- MAGIC spark.sql("set spark.sql.legacy.parquet.datetimeRebaseModeInWrite=LEGACY")
-- MAGIC
-- MAGIC tablelist = ["CONCEPT","VOCABULARY","CONCEPT_ANCESTOR","CONCEPT_RELATIONSHIP","RELATIONSHIP","CONCEPT_SYNONYM","DOMAIN","CONCEPT_CLASS","DRUG_STRENGTH"]
-- MAGIC for table in tablelist:
-- MAGIC   # read the gzipped CSV export for this vocabulary table
-- MAGIC   df = spark.read.csv(f'{vocab_s3_path}/{table}.csv.gz', inferSchema=True, header=True, dateFormat="yyyy-MM-dd")
-- MAGIC   if table in ["CONCEPT","CONCEPT_RELATIONSHIP","DRUG_STRENGTH"]:
-- MAGIC     # these tables carry validity columns; cast them to DateType
-- MAGIC     df = df.withColumn('valid_start_date', to_date(df.valid_start_date,'yyyy-MM-dd')).withColumn('valid_end_date', to_date(df.valid_end_date,'yyyy-MM-dd'))
-- MAGIC
-- MAGIC   # write each table out as a Delta table in the OMOP database
-- MAGIC   df.write.format('delta').mode('overwrite').option('overwriteSchema','true').saveAsTable(omop_version+'.'+table)
-- COMMAND ----------
-- DBTITLE 1,display tables and counts of records
-- MAGIC %python
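-- MAGIC # build a single UNION query that reports the row count of every vocabulary table loaded above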
-- MAGIC tablecount = "SELECT '-' AS table, 0 as recs"
-- MAGIC for table in tablelist:
-- MAGIC   tablecount += " UNION SELECT '"+table+"', COUNT(1) FROM "+omop_version+"."+table
-- MAGIC tablecount += " ORDER BY 2 DESC"
-- MAGIC
-- MAGIC display(spark.sql(tablecount))
-- COMMAND ----------
-- MAGIC %md
-- MAGIC ## Create vocab map tables
-- COMMAND ----------
-- MAGIC %md
-- MAGIC #### source_to_standard_vocab_map
-- MAGIC Maps each source code to its standard target concept, combining active `Maps to` relationships from `CONCEPT_RELATIONSHIP` with any custom mappings in `SOURCE_TO_CONCEPT_MAP`.
-- COMMAND ----------
DROP TABLE IF EXISTS source_to_standard_vocab_map;
CREATE TABLE source_to_standard_vocab_map AS WITH CTE_VOCAB_MAP AS (
  -- concepts that map to a standard concept through an active 'Maps to' relationship
  SELECT
    c.concept_code AS SOURCE_CODE,
    c.concept_id AS SOURCE_CONCEPT_ID,
    c.concept_name AS SOURCE_CODE_DESCRIPTION,
    c.vocabulary_id AS SOURCE_VOCABULARY_ID,
    c.domain_id AS SOURCE_DOMAIN_ID,
    c.CONCEPT_CLASS_ID AS SOURCE_CONCEPT_CLASS_ID,
    c.VALID_START_DATE AS SOURCE_VALID_START_DATE,
    c.VALID_END_DATE AS SOURCE_VALID_END_DATE,
    c.INVALID_REASON AS SOURCE_INVALID_REASON,
    c1.concept_id AS TARGET_CONCEPT_ID,
    c1.concept_name AS TARGET_CONCEPT_NAME,
    c1.VOCABULARY_ID AS TARGET_VOCABULARY_ID,
    c1.domain_id AS TARGET_DOMAIN_ID,
    c1.concept_class_id AS TARGET_CONCEPT_CLASS_ID,
    c1.INVALID_REASON AS TARGET_INVALID_REASON,
    c1.standard_concept AS TARGET_STANDARD_CONCEPT
  FROM
    CONCEPT C
    JOIN CONCEPT_RELATIONSHIP CR ON C.CONCEPT_ID = CR.CONCEPT_ID_1
    AND CR.invalid_reason IS NULL
    AND lower(cr.relationship_id) = 'maps to'
    JOIN CONCEPT C1 ON CR.CONCEPT_ID_2 = C1.CONCEPT_ID
    AND C1.INVALID_REASON IS NULL
  UNION
  -- custom mappings supplied through SOURCE_TO_CONCEPT_MAP
  SELECT
    source_code,
    SOURCE_CONCEPT_ID,
    SOURCE_CODE_DESCRIPTION,
    source_vocabulary_id,
    c1.domain_id AS SOURCE_DOMAIN_ID,
    c2.CONCEPT_CLASS_ID AS SOURCE_CONCEPT_CLASS_ID,
    c1.VALID_START_DATE AS SOURCE_VALID_START_DATE,
    c1.VALID_END_DATE AS SOURCE_VALID_END_DATE,
    stcm.INVALID_REASON AS SOURCE_INVALID_REASON,
    target_concept_id,
    c2.CONCEPT_NAME AS TARGET_CONCEPT_NAME,
    target_vocabulary_id,
    c2.domain_id AS TARGET_DOMAIN_ID,
    c2.concept_class_id AS TARGET_CONCEPT_CLASS_ID,
    c2.INVALID_REASON AS TARGET_INVALID_REASON,
    c2.standard_concept AS TARGET_STANDARD_CONCEPT
  FROM
    source_to_concept_map stcm
    LEFT OUTER JOIN CONCEPT c1 ON c1.concept_id = stcm.source_concept_id
    LEFT OUTER JOIN CONCEPT c2 ON c2.CONCEPT_ID = stcm.target_concept_id
  WHERE
    stcm.INVALID_REASON IS NULL
)
SELECT
  *
FROM
  CTE_VOCAB_MAP
;
SELECT * FROM source_to_standard_vocab_map LIMIT 100
;
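-- COMMAND ----------
-- MAGIC %md
-- MAGIC The query below is a minimal, illustrative sketch of how this map is typically used: translating source codes from one vocabulary into standard target concepts. The `ICD10CM` filter value is only an assumption about which vocabularies were downloaded from Athena; substitute any `SOURCE_VOCABULARY_ID` present in your bundle.
-- COMMAND ----------
-- DBTITLE 1,example standard-concept lookup (illustrative)
SELECT
  SOURCE_CODE,
  SOURCE_CODE_DESCRIPTION,
  SOURCE_VOCABULARY_ID,
  TARGET_CONCEPT_ID,
  TARGET_CONCEPT_NAME,
  TARGET_DOMAIN_ID
FROM
  source_to_standard_vocab_map
WHERE
  SOURCE_VOCABULARY_ID = 'ICD10CM' -- illustrative; use any vocabulary id present in your download
  AND TARGET_STANDARD_CONCEPT = 'S'
LIMIT 100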
-- COMMAND ----------
-- MAGIC %md
-- MAGIC #### source_to_source_vocab_map
-- MAGIC Maps each source code to its own (source) concept, combining all records in `CONCEPT` with any custom mappings in `SOURCE_TO_CONCEPT_MAP`.
-- COMMAND ----------
DROP TABLE IF EXISTS source_to_source_vocab_map
;
CREATE TABLE source_to_source_vocab_map AS WITH CTE_VOCAB_MAP AS (
  -- every concept mapped to itself, so any source code can be resolved to its own (source) concept
  SELECT
    c.concept_code AS SOURCE_CODE,
    c.concept_id AS SOURCE_CONCEPT_ID,
    c.CONCEPT_NAME AS SOURCE_CODE_DESCRIPTION,
    c.vocabulary_id AS SOURCE_VOCABULARY_ID,
    c.domain_id AS SOURCE_DOMAIN_ID,
    c.concept_class_id AS SOURCE_CONCEPT_CLASS_ID,
    c.VALID_START_DATE AS SOURCE_VALID_START_DATE,
    c.VALID_END_DATE AS SOURCE_VALID_END_DATE,
    c.invalid_reason AS SOURCE_INVALID_REASON,
    c.concept_id AS TARGET_CONCEPT_ID,
    c.concept_name AS TARGET_CONCEPT_NAME,
    c.vocabulary_id AS TARGET_VOCABULARY_ID,
    c.domain_id AS TARGET_DOMAIN_ID,
    c.concept_class_id AS TARGET_CONCEPT_CLASS_ID,
    c.INVALID_REASON AS TARGET_INVALID_REASON,
    c.STANDARD_CONCEPT AS TARGET_STANDARD_CONCEPT
  FROM
    CONCEPT c
  UNION
  -- custom mappings supplied through SOURCE_TO_CONCEPT_MAP
  SELECT
    source_code,
    SOURCE_CONCEPT_ID,
    SOURCE_CODE_DESCRIPTION,
    source_vocabulary_id,
    c1.domain_id AS SOURCE_DOMAIN_ID,
    c2.CONCEPT_CLASS_ID AS SOURCE_CONCEPT_CLASS_ID,
    c1.VALID_START_DATE AS SOURCE_VALID_START_DATE,
    c1.VALID_END_DATE AS SOURCE_VALID_END_DATE,
    stcm.INVALID_REASON AS SOURCE_INVALID_REASON,
    target_concept_id,
    c2.CONCEPT_NAME AS TARGET_CONCEPT_NAME,
    target_vocabulary_id,
    c2.domain_id AS TARGET_DOMAIN_ID,
    c2.concept_class_id AS TARGET_CONCEPT_CLASS_ID,
    c2.INVALID_REASON AS TARGET_INVALID_REASON,
    c2.standard_concept AS TARGET_STANDARD_CONCEPT
  FROM
    source_to_concept_map stcm
    LEFT OUTER JOIN CONCEPT c1 ON c1.concept_id = stcm.source_concept_id
    LEFT OUTER JOIN CONCEPT c2 ON c2.CONCEPT_ID = stcm.target_concept_id
  WHERE
    stcm.INVALID_REASON IS NULL
)
SELECT
  *
FROM
  CTE_VOCAB_MAP
;
SELECT * FROM source_to_source_vocab_map LIMIT 100
;
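-- COMMAND ----------
-- MAGIC %md
-- MAGIC A minimal, illustrative lookup against this map, of the kind an ETL would use to resolve a record's source concept (e.g. a `*_source_concept_id` field). As above, the `ICD10CM` filter is only an assumed example; use any `SOURCE_VOCABULARY_ID` available in your vocabulary download.
-- COMMAND ----------
-- DBTITLE 1,example source-concept lookup (illustrative)
SELECT
  SOURCE_CODE,
  SOURCE_CONCEPT_ID,
  SOURCE_CODE_DESCRIPTION,
  SOURCE_VOCABULARY_ID,
  SOURCE_DOMAIN_ID
FROM
  source_to_source_vocab_map
WHERE
  SOURCE_VOCABULARY_ID = 'ICD10CM' -- illustrative
LIMIT 100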
-- COMMAND ----------
-- MAGIC %md
-- MAGIC Copyright / License info of the notebook. Copyright Databricks, Inc. [2021]. The source in this notebook is provided subject to the [Databricks License](https://databricks.com/db-license-source). All included or referenced third party libraries are subject to the licenses set forth below.
-- MAGIC
-- MAGIC |Library Name|Library License|Library License URL|Library Source URL|
-- MAGIC | :-: | :-:| :-: | :-:|
-- MAGIC |Synthea|Apache License 2.0|https://github.com/synthetichealth/synthea/blob/master/LICENSE| https://github.com/synthetichealth/synthea|
-- MAGIC | OHDSI/CommonDataModel| Apache License 2.0 | https://github.com/OHDSI/CommonDataModel/blob/master/LICENSE | https://github.com/OHDSI/CommonDataModel |
-- MAGIC | OHDSI/ETL-Synthea| Apache License 2.0 | https://github.com/OHDSI/ETL-Synthea/blob/master/LICENSE | https://github.com/OHDSI/ETL-Synthea |
-- MAGIC |OHDSI/OMOP-Queries|||https://github.com/OHDSI/OMOP-Queries|
-- MAGIC |The Book of OHDSI | Creative Commons Zero v1.0 Universal license.|https://ohdsi.github.io/TheBookOfOhdsi/index.html#license|https://ohdsi.github.io/TheBookOfOhdsi/|
-- COMMAND ----------