-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpubpeer_git.sql
532 lines (401 loc) · 14.5 KB
/
pubpeer_git.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
/*******************************************/
/*******************************************/
/* ##### Recette #### */
/*******************************************/
/*******************************************/
-- ID des publications dans la base commentaires
select *
from public.data_commentaires
where publication ='1'
/*******************************************/
-- ID des publications dans la base commentaires
select y.*
from public.data_commentaires x,
public.data_sites_comm y
where x.publication = y.publication
and x.publication ='106541'
/* Vérification nombre ID dans les bases */
-- ID des publications dans la base publications
select count (distinct publication) as id,
count (distinct original_id) as original_id
⁄
from public.data_pub;
-- ID des publications dans la base commentaires
select count (distinct id) as id,
count (distinct publication) as id_pub
from public.data_commentaires
;
-- ID dans la table data.commentaire.annee
select count (distinct id_pub) from public.data_commentaires_annees ; --101272 -- ok
select count (id_comm) from public.data_commentaires_annees ; --189416 -- ok
-- 1,87 commentaire par publication en moyenne.
/* Vérification nombre de commentaires */ -- ATTENTION !!! 1 commentaire en plus dans la table des "Publications"
select *,
(x.nb_com_table_pub - y.nb_com_table_com_ann) as diff
from
(select sum (nb_comm) as nb_com_table_pub
from public.data_annee) x,
(select count (id_comm) as nb_com_table_com_ann
from public.data_commentaires_annees) y
;
/* Test fractionnement */
SELECT id, SUM(frac_geo) FROM public.data_pays_frac_geo GROUP BY id ORDER BY 1 desc; --ok
SELECT id, SUM(frac_geo) FROM public.data_pays_frac_geo GROUP BY id ORDER BY 1 asc; --ok
-- Test fractionnement disciplinaire.
SELECT id, SUM(frac_disc) FROM public.data_frac_disc GROUP BY id ORDER BY 1 desc; --ok
SELECT id, SUM(frac_disc) FROM public.data_frac_disc GROUP BY id ORDER BY 1 asc; --ok
-- Test fractionnement disciplinaire.
SELECT id, SUM(frac_disc) FROM public.data_frac_disc_jwc GROUP BY id ORDER BY 1 desc; --ok
SELECT id, SUM(frac_disc) FROM public.data_frac_disc_jwc GROUP BY id ORDER BY 1 asc; --ok
/*******************************************/
/*******************************************/
/* ##### Préparation des données #### */
/*******************************************/
/*******************************************/
/* Créer une table de libellés des pays */
drop table pays_lib;
create table pays_lib
(ordre VARCHAR(50),
num VARCHAR(50),
iso2 VARCHAR(50),
iso3 VARCHAR(50),
lib_fr VARCHAR(150),
lib_en VARCHAR(150),
PRIMARY KEY (ordre)
)
;
/* Les données sont importées directement sur le fichier dans le chemin ci-après (récupéré sur internet : code iso des pays) :
-- Commande utilisée : " "\\copy public.pays_lib (ordre, num, iso2, iso3, lib_fr, lib_en)
FROM '/Users/maddi/Documents/Pubpeer project/Pubpeer explo/Excel CSV/PAYS lib.csv' DELIMITER ';'
CSV HEADER ENCODING 'UTF8' QUOTE '\"' ESCAPE '''';""
*/
/* Calcul du fractionnelent géographique */
drop table data_pays_frac_geo;
create table data_pays_frac_geo as
SELECT X.id,
X.pays,
(X.nb_adress_pays/Y.nb_tot_adress::float) AS frac_geo
FROM
(select DISTINCT id,
pays,
COUNT (*) AS nb_adress_pays
FROM (SELECT id, TRIM(pays) AS pays
FROM public.data_pays) AA
GROUP BY id,
pays) X,
(select id,
COUNT(pays) AS nb_tot_adress
FROM public.data_pays
GROUP BY id) Y
WHERE X.id = Y.id
;
/* Calcul du fractionnement disciplinaire des grands domaines */
drop table data_frac_disc;
create table data_frac_disc as
SELECT X.id,
trim(X.jdw) as jdw,
(1/Y.nb_tot_jd::float) AS frac_disc
FROM
(select * FROM public.data_jdw) X,
(select id,
COUNT(trim(jdw)) AS nb_tot_jd
FROM public.data_jdw
GROUP BY id) Y
WHERE X.id = Y.id
--AND X.id='52676' -- juste pour le test
;
/* Calcul du fractionnement disciplinaire des WoS subject categories */
drop table data_frac_disc_jwc;
create table data_frac_disc_jwc as
SELECT X.id,
trim(X.jcw) as jcw,
(1/Y.nb_tot_jd::float) AS frac_disc
FROM
(select * FROM public.data_jcw) X,
(select id,
COUNT(trim(jcw)) AS nb_tot_jd
FROM public.data_jcw
GROUP BY id) Y
WHERE X.id = Y.id
--AND X.id='52676' -- juste pour le test
;
/*******************************************/
/*******************************************/
/* ##### Analyse des données #### */
/*******************************************/
/*******************************************/
/* Nombre total de publications commentées */
select count (distinct id) AS Nombre_de_publications_commentees from public.data_annee
;
-- Nombres par discipline
select jdw as jdw,
sum(frac_disc) nbr_pub
from public.data_pays_frac_disc
group by jdw
order by 2 desc
-- Sur les 101,271 publications, 90,434 ont au moins une discipline (WoS). Soit 89,3 %
;
/**********************************/
/* Nombre de commentaires par année, nombre de publications commentées et
nombre moyen de commentaires par publication (NB : c'est l'année de commentaire
qui est utilisée et non l'année de publication des papier commentés) */
select annee_comm,
nb_comm,
nb_pub,
ROUND(comm_par_pub,2) as comm_par_pub
from
(select annee_comm,
nb_comm,
nb_pub,
nb_comm/nb_pub::numeric as comm_par_pub
from
(select annee_comm,
count (id_comm) as nb_comm,
count (distinct id_pub) as nb_pub
from public.data_commentaires_annees
group by annee_comm) x) aa
order by 1
;
/**********************************/
-- Nombre de publications commentées par année de publication
select annee, count (distinct id) as nb_par_ann_pub
from public.data_annee
where annee between 2000 and 2020
group by annee
order by 1
;
/* Nombre de commentaires par année */
select annee_comm, count(id_comm) AS Nombre_de_commentaires
from public.data_commentaires_annees
WHERE annee_comm between 2010 and 2020-->'1984'
GROUP BY annee_comm order by 1;
/* Distribution disciplinaire */
select jdw,
SUM("frac_disc") nombre_publications_frac
from public.data_pays_frac_disc
group by jdw
order by 2 desc;
/* selectionner le top 20 des pays avec le plus de pub commentées */
select pays,
sum(frac_geo) AS Nombre_de_publications_pays
from public.data_pays_frac_geo
where pays <> 'None'
Group by pays
order by 2 desc;
/* sur les 101271, il y a 20069 publications sans information sur le pays (veleurs = 'None'),
et on a 80765 dont l'information sur la pays existe*/
/* Evolution pub commentées par pays *//* Année de commentaire utilisée */
select Y.annee_comm,
Z.lib_en,
sum(X."frac_geo") AS Nombre_de_publications_pays
from public.data_pays_frac_geo X,
public.data_commentaires_annees Y,
public.pays_lib Z
where X.id = Y.id_pub
and X.pays = Z.iso2
and Y.annee_comm between 2000 and 2021
and X.pays in ('BR','SE','TW','BE','CH','KR','IR','ES','IL','JP','NL',
'FR','AU','IT','DE','IN','CA','GB','CN','US')
Group by Y.annee_comm,
Z.lib_en
order by 1
;
/*******************************************/
/* Analyse des commentaires par discipline */
/*******************************************/
-- Evolution nombre de commentaires par année par discipline ## WoS domains
select jdw as discipline,
Y.annee_comm,
sum(X.frac_disc) AS Nombre_de_publications_disc
from public.data_frac_disc X,
public.data_commentaires_annees Y
where X.id = Y.id_pub
and Y.annee_comm between 2000 and 2021
Group by Y.annee_comm,
jdw
order by 1
;
-- Faire un zoom sur l'annee 2016 et surtout pour les Sciences sociales qui ont causé le pic
select discipline,
annee_comm,
Nombre_de_publications_disc,
Nombre_de_publications_disc/nb_tot::real as part_discipline
from
(select jcw as discipline,
Y.annee_comm,
sum(X.frac_disc) AS Nombre_de_publications_disc
from public.data_frac_disc_jwc X,
public.data_commentaires_annees Y
where X.id = Y.id_pub
and Y.annee_comm = 2016
and X.id in (select distinct id from public.data_jdw where jdw = 'Social Sciences')
Group by Y.annee_comm,
jcw) aa,
(select count (distinct id_pub) AS nb_tot
from public.data_commentaires_annees
where annee_comm = 2016
and id_pub in (select distinct id from public.data_jdw where jdw = 'Social Sciences')
) bb
order by 3 desc
;
/*******************************************/
/*******************************************/
/*# Extractions pour l'analyse textuelle #*/
/*******************************************/
/*******************************************/
/* Extraction données commentaires par discipline */
drop table commentaires_par_discipline;
create table commentaires_par_discipline as
select distinct inner_id, publication, markdown ,
'Arts Humanities' as discipline
from public.data_commentaires x,
public.data_jdw y
where x.publication = y.id
and jdw = 'Arts Humanities'
union
select distinct inner_id, publication, markdown ,
'Life Sciences Biomedicine' as discipline
from public.data_commentaires x,
public.data_jdw y
where x.publication = y.id
and jdw = 'Life Sciences Biomedicine'
union
select distinct inner_id, publication, markdown ,
'Multidisciplinary' as discipline
from public.data_commentaires x,
public.data_jdw y
where x.publication = y.id
and jdw = 'Multidisciplinary'
union
select distinct inner_id, publication, markdown ,
'Physical Sciences' as discipline
from public.data_commentaires x,
public.data_jdw y
where x.publication = y.id
and jdw = 'Physical Sciences'
union
select distinct inner_id, publication, markdown ,
'Social Sciences' as discipline
from public.data_commentaires x,
public.data_jdw y
where x.publication = y.id
and jdw = 'Social Sciences'
union
select distinct inner_id, publication, markdown ,
'Technology' as discipline
from public.data_commentaires x,
public.data_jdw y
where x.publication = y.id
and jdw = 'Technology'
;
/* Citations par annee */
select * from public."DATA_CITATIONS"
where "ANNEE_CITATION" <> 'None'
and id='106986'
ORDER BY "ANNEE_CITATION"
;
/* Préparation des données sur les liens qui figurent dans les commentaires pour les représenter sur VosViewer */
drop table liens_com_disc;
create table liens_com_disc as
select x.*,
jdw
from public.liens_com_nettoyes x,
public.data_jdw y
where id = publication
order by x.site
;
select distinct x.site_1,
y.site_2,
count (distinct x.publication)
from (select publication, site as site_1 from public.publication_sites_comm) x,
(select publication, site as site_2 from public.publication_sites_comm) y,
(select distinct publication from commentaires_par_discipline where discipline = 'Life Sciences Biomedicine') z
where x.publication = y.publication
and x.publication = z.publication
group by x.site_1,
y.site_2
order by 3 desc
;
select distinct x.site_1,
y.site_2,
count (distinct x.publication)
from (select publication, site as site_1 from public.publication_sites_comm) x,
(select publication, site as site_2 from public.publication_sites_comm) y,
(select distinct publication from commentaires_par_discipline where discipline = 'Social Sciences') z
where x.publication = y.publication
and x.publication = z.publication
group by x.site_1,
y.site_2
order by 3 desc
;
select distinct x.site_1,
y.site_2,
count (distinct x.publication)
from (select publication, site as site_1 from public.publication_sites_comm) x,
(select publication, site as site_2 from public.publication_sites_comm) y,
(select distinct publication from commentaires_par_discipline where discipline = 'Physical Sciences') z
where x.publication = y.publication
and x.publication = z.publication
group by x.site_1,
y.site_2
order by 3 desc
;
select distinct x.site_1,
y.site_2,
count (distinct x.publication)
from (select publication, site as site_1 from public.publication_sites_comm) x,
(select publication, site as site_2 from public.publication_sites_comm) y,
(select distinct publication from commentaires_par_discipline where discipline = 'Technology') z
where x.publication = y.publication
and x.publication = z.publication
group by x.site_1,
y.site_2
order by 3 desc
;
select distinct x.site_1,
y.site_2,
count (distinct x.publication)
from (select publication, type_sit as site_1 from public.data_type_sites) x,
(select publication, type_sit as site_2 from public.data_type_sites) y,
(select distinct publication from commentaires_par_discipline where discipline = 'Arts Humanities') z
where x.publication = y.publication
and x.publication = z.publication
group by x.site_1,
y.site_2
order by 3 desc
;
/* typologie des sites */
select distinct site as site_1
from public.publication_sites_comm
where site like '%edu%'
;
drop table coocurrences_typ_sites;
create table coocurrences_typ_sites as
select distinct x.site_1,
y.site_2,
count (distinct x.publication)
from (select publication, typo as site_1 from public.data_urls_comm) x,
(select publication, typo as site_2 from public.data_urls_comm) y
where x.publication = y.publication
group by x.site_1,
y.site_2
order by 3 desc
;
-- drop table coocu_typ_sites_media;
-- create table coocu_typ_sites_media as
select distinct x.site_1,
y.site_2,
count (distinct x.publication)
from (select publication, domain as site_1
from public.data_urls_comm
where typo = 'Médias'
) x,
(select publication, domain as site_2
from public.data_urls_comm
where typo = 'Médias'
) y
where x.publication = y.publication
group by x.site_1,
y.site_2
order by 3 desc
;