-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconfig.py
445 lines (432 loc) · 14.8 KB
/
config.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
DEFAULT_MODEL_ID = "Meta-Llama-3-70B-Instruct"
DEFAULT_INTERFACE_MODEL_ID = "NumbersStation/nsql-llama-2-7B"
DEFAULT_KIND = "json"
DEFAULT_TEMPERATURE = 0.6
DEFAULT_TOP_P = 0.95
DEFAULT_FEW_SHOT_NUM = 3
DEFAULT_FEW_SHOT_SELECTION = "random"
DEFAULT_SAVE_INTERVAL = 3
DEFAULT_RES_DIR = "data/results"
DEFAULT_LOG_DIR = "logs"
DEFAULT_TABLES_DIR = "data/databases"
COOCCURRENCE_QUERY = """
WITH concept_pairs AS (
SELECT p1.concept AS concept1, p2.concept AS concept2, p1.paper_id, p1.tag_type
FROM predictions p1
JOIN predictions p2 ON p1.paper_id = p2.paper_id AND p1.concept < p2.concept
WHERE p1.tag_type = p2.tag_type
)
SELECT concept1, concept2, tag_type, COUNT(DISTINCT paper_id) AS co_occurrences
FROM concept_pairs
GROUP BY concept1, concept2, tag_type
HAVING co_occurrences > 5
ORDER BY co_occurrences DESC;
"""
canned_queries = [
(
"Modalities in Physics and Astronomy papers",
"""
SELECT DISTINCT LOWER(concept) AS concept
FROM predictions
JOIN (
SELECT paper_id, url
FROM papers
WHERE primary_category LIKE '%physics.space-ph%'
OR primary_category LIKE '%astro-ph.%'
) AS paper_ids
ON predictions.paper_id = paper_ids.paper_id
WHERE predictions.tag_type = 'modality'
""",
),
(
"Datasets in Evolutionary Biology that use PDEs",
"""
WITH pde_predictions AS (
SELECT paper_id, concept AS pde_concept, tag_type AS pde_tag_type
FROM predictions
WHERE tag_type IN ('method', 'model')
AND (
LOWER(concept) LIKE '%pde%'
OR LOWER(concept) LIKE '%partial differential equation%'
)
)
SELECT DISTINCT
papers.paper_id,
papers.url,
LOWER(p_dataset.concept) AS dataset,
pde_predictions.pde_concept AS pde_related_concept,
pde_predictions.pde_tag_type AS pde_related_type
FROM papers
JOIN pde_predictions ON papers.paper_id = pde_predictions.paper_id
LEFT JOIN predictions p_dataset ON papers.paper_id = p_dataset.paper_id
WHERE papers.primary_category LIKE '%q-bio.PE%'
AND (p_dataset.tag_type = 'dataset' OR p_dataset.tag_type IS NULL)
ORDER BY papers.paper_id, dataset, pde_related_concept;
""",
),
(
"Trends in objects of study in Cosmology since 2019",
"""
SELECT
substr(papers.updated_on, 2, 4) as year,
predictions.concept as object,
COUNT(DISTINCT papers.paper_id) as paper_count
FROM
papers
JOIN
predictions ON papers.paper_id = predictions.paper_id
WHERE
predictions.tag_type = 'object'
AND CAST(SUBSTR(papers.updated_on, 2, 4) AS INTEGER) >= 2019
GROUP BY
year, object
ORDER BY
year DESC, paper_count DESC;
""",
),
(
"New datasets in fluid dynamics since 2020",
"""
WITH ranked_datasets AS (
SELECT
p.paper_id,
p.url,
pred.concept AS dataset,
p.updated_on,
ROW_NUMBER() OVER (PARTITION BY pred.concept ORDER BY p.updated_on ASC) AS rn
FROM
papers p
JOIN
predictions pred ON p.paper_id = pred.paper_id
WHERE
pred.tag_type = 'dataset'
AND p.primary_category LIKE '%physics.flu-dyn%'
AND CAST(SUBSTR(p.updated_on, 2, 4) AS INTEGER) >= 2020
)
SELECT
paper_id,
url,
dataset,
updated_on
FROM
ranked_datasets
WHERE
rn = 1
ORDER BY
updated_on ASC
""",
),
(
"Evolutionary biology datasets that use spatiotemporal dynamics",
"""
WITH evo_bio_papers AS (
SELECT paper_id
FROM papers
WHERE primary_category LIKE '%q-bio.PE%'
),
spatiotemporal_keywords AS (
SELECT 'spatio-temporal' AS keyword
UNION SELECT 'spatiotemporal'
UNION SELECT 'spatio-temporal'
UNION SELECT 'spatial and temporal'
UNION SELECT 'space-time'
UNION SELECT 'geographic distribution'
UNION SELECT 'phylogeograph'
UNION SELECT 'biogeograph'
UNION SELECT 'dispersal'
UNION SELECT 'migration'
UNION SELECT 'range expansion'
UNION SELECT 'population dynamics'
)
SELECT DISTINCT
p.paper_id,
p.updated_on,
p.abstract,
d.concept AS dataset,
GROUP_CONCAT(DISTINCT stk.keyword) AS spatiotemporal_keywords_found
FROM
evo_bio_papers ebp
JOIN
papers p ON ebp.paper_id = p.paper_id
JOIN
predictions d ON p.paper_id = d.paper_id
JOIN
predictions st ON p.paper_id = st.paper_id
JOIN
spatiotemporal_keywords stk
WHERE
d.tag_type = 'dataset'
AND st.tag_type = 'modality'
AND LOWER(st.concept) LIKE '%' || stk.keyword || '%'
GROUP BY
p.paper_id, p.updated_on, p.abstract, d.concept
ORDER BY
p.updated_on DESC
""",
),
(
"What percentage of papers use only galaxy or spectra, or both or neither?",
"""
WITH paper_modalities AS (
SELECT
p.paper_id,
MAX(CASE WHEN LOWER(pred.concept) LIKE '%imag%' THEN 1 ELSE 0 END) AS uses_galaxy_images,
MAX(CASE WHEN LOWER(pred.concept) LIKE '%spectr%' THEN 1 ELSE 0 END) AS uses_spectra
FROM
papers p
LEFT JOIN
predictions pred ON p.paper_id = pred.paper_id
WHERE
p.primary_category LIKE '%astro-ph%'
AND pred.tag_type = 'modality'
GROUP BY
p.paper_id
),
categorized_papers AS (
SELECT
CASE
WHEN uses_galaxy_images = 1 AND uses_spectra = 1 THEN 'Both'
WHEN uses_galaxy_images = 1 THEN 'Only Galaxy Images'
WHEN uses_spectra = 1 THEN 'Only Spectra'
ELSE 'Neither'
END AS category,
COUNT(*) AS paper_count
FROM
paper_modalities
GROUP BY
CASE
WHEN uses_galaxy_images = 1 AND uses_spectra = 1 THEN 'Both'
WHEN uses_galaxy_images = 1 THEN 'Only Galaxy Images'
WHEN uses_spectra = 1 THEN 'Only Spectra'
ELSE 'Neither'
END
)
SELECT
category,
paper_count,
ROUND(CAST(paper_count AS FLOAT) / (SELECT SUM(paper_count) FROM categorized_papers) * 100, 2) AS percentage
FROM
categorized_papers
ORDER BY
paper_count DESC
""",
),
(
"What are all the next highest data modalities after images and spectra?",
"""
SELECT
LOWER(concept) AS modality,
COUNT(DISTINCT paper_id) AS usage_count
FROM
predictions
WHERE
tag_type = 'modality'
AND LOWER(concept) NOT LIKE '%imag%'
AND LOWER(concept) NOT LIKE '%spectr%'
GROUP BY
LOWER(concept)
ORDER BY
usage_count DESC
""",
),
(
"If we include the next biggest data modality, how much does coverage change?",
"""
WITH modality_counts AS (
SELECT
LOWER(concept) AS modality,
COUNT(DISTINCT paper_id) AS usage_count
FROM
predictions
WHERE
tag_type = 'modality'
AND LOWER(concept) NOT LIKE '%imag%'
AND LOWER(concept) NOT LIKE '%spectr%'
GROUP BY
LOWER(concept)
ORDER BY
usage_count DESC
LIMIT 1
),
paper_modalities AS (
SELECT
p.paper_id,
MAX(CASE WHEN LOWER(pred.concept) LIKE '%imag%' THEN 1 ELSE 0 END) AS uses_galaxy_images,
MAX(CASE WHEN LOWER(pred.concept) LIKE '%spectr%' THEN 1 ELSE 0 END) AS uses_spectra,
MAX(CASE WHEN LOWER(pred.concept) LIKE (SELECT '%' || modality || '%' FROM modality_counts) THEN 1 ELSE 0 END) AS uses_third_modality
FROM
papers p
LEFT JOIN
predictions pred ON p.paper_id = pred.paper_id
WHERE
p.primary_category LIKE '%astro-ph%'
AND pred.tag_type = 'modality'
GROUP BY
p.paper_id
),
coverage_before AS (
SELECT
SUM(CASE WHEN uses_galaxy_images = 1 OR uses_spectra = 1 THEN 1 ELSE 0 END) AS covered_papers,
COUNT(*) AS total_papers
FROM
paper_modalities
),
coverage_after AS (
SELECT
SUM(CASE WHEN uses_galaxy_images = 1 OR uses_spectra = 1 OR uses_third_modality = 1 THEN 1 ELSE 0 END) AS covered_papers,
COUNT(*) AS total_papers
FROM
paper_modalities
)
SELECT
(SELECT modality FROM modality_counts) AS third_modality,
ROUND(CAST(covered_papers AS FLOAT) / total_papers * 100, 2) AS coverage_before_percent,
ROUND(CAST((SELECT covered_papers FROM coverage_after) AS FLOAT) / total_papers * 100, 2) AS coverage_after_percent,
ROUND(CAST((SELECT covered_papers FROM coverage_after) AS FLOAT) / total_papers * 100, 2) -
ROUND(CAST(covered_papers AS FLOAT) / total_papers * 100, 2) AS coverage_increase_percent
FROM
coverage_before
""",
),
(
"Coverage if we select the next 5 highest modalities?",
"""
WITH ranked_modalities AS (
SELECT
LOWER(concept) AS modality,
COUNT(DISTINCT paper_id) AS usage_count,
ROW_NUMBER() OVER (ORDER BY COUNT(DISTINCT paper_id) DESC) AS rank
FROM
predictions
WHERE
tag_type = 'modality'
AND LOWER(concept) NOT LIKE '%imag%'
AND LOWER(concept) NOT LIKE '%spectr%'
GROUP BY
LOWER(concept)
),
paper_modalities AS (
SELECT
p.paper_id,
MAX(CASE WHEN LOWER(pred.concept) LIKE '%imag%' THEN 1 ELSE 0 END) AS uses_images,
MAX(CASE WHEN LOWER(pred.concept) LIKE '%spectr%' THEN 1 ELSE 0 END) AS uses_spectra,
MAX(CASE WHEN rm.rank = 1 THEN 1 ELSE 0 END) AS uses_modality_1,
MAX(CASE WHEN rm.rank = 2 THEN 1 ELSE 0 END) AS uses_modality_2,
MAX(CASE WHEN rm.rank = 3 THEN 1 ELSE 0 END) AS uses_modality_3,
MAX(CASE WHEN rm.rank = 4 THEN 1 ELSE 0 END) AS uses_modality_4,
MAX(CASE WHEN rm.rank = 5 THEN 1 ELSE 0 END) AS uses_modality_5
FROM
papers p
LEFT JOIN
predictions pred ON p.paper_id = pred.paper_id
LEFT JOIN
ranked_modalities rm ON LOWER(pred.concept) = rm.modality
WHERE
p.primary_category LIKE '%astro-ph%'
AND pred.tag_type = 'modality'
GROUP BY
p.paper_id
),
cumulative_coverage AS (
SELECT
'Images and Spectra' AS modalities,
0 AS added_modality_rank,
SUM(CASE WHEN uses_images = 1 OR uses_spectra = 1 THEN 1 ELSE 0 END) AS covered_papers,
COUNT(*) AS total_papers
FROM
paper_modalities
UNION ALL
SELECT
'Images, Spectra, and Modality 1' AS modalities,
1 AS added_modality_rank,
SUM(CASE WHEN uses_images = 1 OR uses_spectra = 1 OR uses_modality_1 = 1 THEN 1 ELSE 0 END) AS covered_papers,
COUNT(*) AS total_papers
FROM
paper_modalities
UNION ALL
SELECT
'Images, Spectra, Modality 1, and 2' AS modalities,
2 AS added_modality_rank,
SUM(CASE WHEN uses_images = 1 OR uses_spectra = 1 OR uses_modality_1 = 1 OR uses_modality_2 = 1 THEN 1 ELSE 0 END) AS covered_papers,
COUNT(*) AS total_papers
FROM
paper_modalities
UNION ALL
SELECT
'Images, Spectra, Modality 1, 2, and 3' AS modalities,
3 AS added_modality_rank,
SUM(CASE WHEN uses_images = 1 OR uses_spectra = 1 OR uses_modality_1 = 1 OR uses_modality_2 = 1 OR uses_modality_3 = 1 THEN 1 ELSE 0 END) AS covered_papers,
COUNT(*) AS total_papers
FROM
paper_modalities
UNION ALL
SELECT
'Images, Spectra, Modality 1, 2, 3, and 4' AS modalities,
4 AS added_modality_rank,
SUM(CASE WHEN uses_images = 1 OR uses_spectra = 1 OR uses_modality_1 = 1 OR uses_modality_2 = 1 OR uses_modality_3 = 1 OR uses_modality_4 = 1 THEN 1 ELSE 0 END) AS covered_papers,
COUNT(*) AS total_papers
FROM
paper_modalities
UNION ALL
SELECT
'Images, Spectra, Modality 1, 2, 3, 4, and 5' AS modalities,
5 AS added_modality_rank,
SUM(CASE WHEN uses_images = 1 OR uses_spectra = 1 OR uses_modality_1 = 1 OR uses_modality_2 = 1 OR uses_modality_3 = 1 OR uses_modality_4 = 1 OR uses_modality_5 = 1 THEN 1 ELSE 0 END) AS covered_papers,
COUNT(*) AS total_papers
FROM
paper_modalities
)
SELECT
cc.modalities,
COALESCE(rm.modality, 'N/A') AS added_modality,
rm.usage_count AS added_modality_usage,
ROUND(CAST(cc.covered_papers AS FLOAT) / cc.total_papers * 100, 2) AS coverage_percent,
ROUND(CAST(cc.covered_papers AS FLOAT) / cc.total_papers * 100, 2) -
LAG(ROUND(CAST(cc.covered_papers AS FLOAT) / cc.total_papers * 100, 2), 1, 0) OVER (ORDER BY cc.added_modality_rank) AS coverage_increase_percent
FROM
cumulative_coverage cc
LEFT JOIN
ranked_modalities rm ON cc.added_modality_rank = rm.rank
ORDER BY
cc.added_modality_rank
""",
),
(
"List all papers",
"SELECT paper_id, abstract AS abstract_preview, authors, primary_category FROM papers",
),
(
"Count papers by category",
"SELECT primary_category, COUNT(*) as paper_count FROM papers GROUP BY primary_category ORDER BY paper_count DESC",
),
(
"Top authors with most papers",
"""
WITH author_papers AS (
SELECT json_each.value AS author
FROM papers, json_each(papers.authors)
)
SELECT author, COUNT(*) as paper_count
FROM author_papers
GROUP BY author
ORDER BY paper_count DESC
""",
),
(
"Papers with 'quantum' in abstract",
"SELECT paper_id, abstract AS abstract_preview FROM papers WHERE abstract LIKE '%quantum%'",
),
(
"Most common concepts",
"SELECT concept, COUNT(*) as concept_count FROM predictions GROUP BY concept ORDER BY concept_count DESC",
),
(
"Papers with multiple authors",
"""
SELECT paper_id, json_array_length(authors) as author_count, authors
FROM papers
WHERE json_array_length(authors) > 1
ORDER BY author_count DESC
""",
),
]