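"""Plot the provenance of VizieR catalogues as a sunburst chart.

This script queries the VizieR TAP service for catalogue metadata, counts
the catalogues per journal, resolves journal short codes to full names
through the ADS API (cached in journal_names.json), and writes the
resulting plotly sunburst chart to index.html.
"""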
import argparse
from datetime import date
import json

import numpy as np
import pandas as pd
import plotly.express as px
import pyvo
import requests

# column label used for the per-journal catalogue counts
number_catalogs = "number of catalogues"
# local cache mapping journal short codes to full journal names
journal_names_json_file = "journal_names.json"


def query_vizier_for_metadata_of_catalogs():
    """Query the metadata catalogue from VizieR.

    Returns
    -------
    pandas.DataFrame: one row per catalogue, with its name and bibcode
    """
    tap_vizier = pyvo.dal.TAPService("https://tapvizier.cds.unistra.fr/TAPVizieR/tap/")
    query_meta_catalogs = """
    SELECT name, bibcode FROM METAcat
    WHERE catid > 1 AND name NOT LIKE 'B/%'
    """
    meta_catalogs = tap_vizier.search(query_meta_catalogs)
    return meta_catalogs.to_table().to_pandas()
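
# Illustrative use of the query above; the columns follow the SELECT clause:
#   >>> metadata = query_vizier_for_metadata_of_catalogs()
#   >>> list(metadata.columns)
#   ['name', 'bibcode']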


def get_count_for_journals(metadata, cut: int):
    """Clean the metadata and count catalogues per journal.

    Arguments
    ---------
    metadata (pandas.DataFrame): catalogue metadata with columns
        ``name`` and ``bibcode``
    cut (int): minimal number of catalogues a journal needs in order to
        appear individually in the graph; smaller journals are grouped
        under "other"

    Returns
    -------
    pandas.DataFrame: number of catalogues per journal short code
    """
    # cleaning steps: extract the journal code from each bibcode
    journal_code = [
        bibcode[4:].split(".")[0] for bibcode in np.array(metadata["bibcode"])
    ]
    journal_code = [code.replace("+", "&") for code in journal_code]
    journal_code = [code.split("(")[0] for code in journal_code]
    # dirty fix for ApJL, whose "L" sits far from the journal code in the bibcode
    journal_code = ["ApJL" if (code == "ApJ" and name.split("/")[-1][0] == "L")
                    else code for code, name
                    in zip(journal_code, np.array(metadata["name"]))]
    # merge A&A and A&AS, as they became the same journal in 2001
    journal_code = ["A&A" if code == "A&AS" else code for code in journal_code]
    # add a column that the groupby below will count
    metadata[number_catalogs] = journal_code
    # count the catalogues per journal code
    count_journal = metadata.groupby(journal_code).count()
    count_journal.reset_index(inplace=True)
    count_journal = count_journal[["index", number_catalogs]]
    # relabel journals below the cut (and empty codes) as "other"
    count_journal.loc[
        (count_journal[number_catalogs] < cut) | (count_journal["index"] == ""), "index"
    ] = "other"
    # sum the counts again now that small journals share the "other" label
    count_per_journal = count_journal.groupby("index").sum(numeric_only=True)
    count_per_journal.reset_index(inplace=True)
    return count_per_journal
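
# Illustrative example of the journal-code extraction in
# get_count_for_journals above, using bibcode-shaped strings
# (ADS bibcodes start with the year, followed by the journal code):
#   >>> "2021A&A...650A..95P"[4:].split(".")[0]
#   'A&A'
#   >>> "2004PASP..116..133L"[4:].split(".")[0]
#   'PASP'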


def get_dict_of_journals_names(count_per_journal, ads_api_key: str):
    """Query the journal names corresponding to the journal short codes.

    This is done by scanning the `index` column of `count_per_journal`;
    if an entry is missing from `journal_names.json`, its full name is
    fetched through the ADS API and the file is updated.

    Args:
        count_per_journal (pandas.DataFrame): counts per journal short code
        ads_api_key (str): API key to connect to ADS
    """
    ads_api_address = "https://api.adsabs.harvard.edu/v1/journals/summary/"
    # open the journal names cache
    with open(journal_names_json_file) as json_data:
        current_list = json.load(json_data)
    updated = False
    # scan through the journal short names
    for index in count_per_journal["index"]:
        if (
            index not in current_list
            and isinstance(index, str)
            and index != "other"
        ):
            print("add index", index)
            # query the ADS API for the missing journal name
            result = requests.get(
                ads_api_address + index,
                headers={"Authorization": "Bearer " + ads_api_key},
            )
            if result.status_code == 200:
                journal_name = result.json()["summary"]["master"]["journal_name"]
                current_list.update({index: journal_name})
                updated = True
            else:
                raise RuntimeError(f"ADS request failed, status code: {result.status_code}")
    # save the updated cache if it changed
    if updated:
        with open(journal_names_json_file, "w") as file:
            json.dump(current_list, file, indent=2)
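
# journal_names.json maps journal short codes to full names; an
# illustrative entry (consistent with the principal_journal mapping below):
#   {"ApJ": "The Astrophysical Journal"}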


def plot_pie_chart(today: str, number_of_catalogs: int, count_per_journal, cut):
    """Plot a sunburst chart of the catalogues' provenance in VizieR.

    Args:
        today (str): today's date, ISO formatted
        number_of_catalogs (int): total number of catalogues, as of today
        count_per_journal (pandas.DataFrame): counts per journal short code
        cut (int): threshold below which journals are grouped as "other"
    """
    # import the data
    with open(journal_names_json_file) as json_data:
        journal_names = json.load(json_data)
    journal_names.update(
        {"other": f"Journals with fewer than {cut} catalogues and large datasets"}
    )
    # join the dataframe and the dictionary
    count_per_journal["sub journal name"] = count_per_journal["index"].map(journal_names)
    # create a column for journal families
    principal_journal = {
        "The Astronomical Journal": "American Astronomical Society Journals",
        "The Astrophysical Journal": "American Astronomical Society Journals",
        "The Astrophysical Journal Letters": "American Astronomical Society Journals",
        "The Astrophysical Journal Supplement Series": "American Astronomical Society Journals",
    }
    count_per_journal["journal name"] = count_per_journal["sub journal name"].replace(principal_journal)
    # journals that are not part of a family get no sub-journal entry
    count_per_journal.loc[
        count_per_journal["sub journal name"] == count_per_journal["journal name"],
        ["sub journal name"],
    ] = np.nan
    # and now plot time!
    fig = px.sunburst(
        count_per_journal,
        values=number_catalogs,
        path=["journal name", "sub journal name"],
        title=f"Provenance of the {number_of_catalogs} catalogues in VizieR ({today})",
        color_discrete_sequence=px.colors.qualitative.Pastel,
    )
    # map the full journal names back to short codes for the wedge labels
    invert_journal_names = {v: k for k, v in journal_names.items()}
    invert_journal_names.update({
        "American Astronomical Society Journals": "AAS Journals",
        "Astronomy & Astrophysics Journals": "A&A Journals",
    })
    fig.update_traces(hovertemplate="<b>%{label}</b><br>Number of catalogues: %{value}")
    short_label = np.array(
        [invert_journal_names.get(label, label) for label in fig.data[0]["labels"]],
        dtype=object,
    )
    fig.update_traces(texttemplate="<b>" + short_label + "</b><br>%{percentEntry}")
    fig.update_traces(legendrank=100, selector=dict(type="sunburst"))
    fig.update_layout(margin=dict(t=50, l=0, r=0, b=0))
    fig.write_html("index.html")


def main(ads_api_key):
    # query VizieR for its metadata catalogue
    metadata = query_vizier_for_metadata_of_catalogs()
    # metadata.to_pickle("metadata.pkl")
    # metadata = pd.read_pickle("metadata.pkl")
    # number of records today
    todays_number_of_catalogs = len(metadata)
    # threshold on the number of catalogues from the same journal:
    # only journals above the cut appear individually in the chart
    cut = 250
    # dataframe with the journal short codes and corresponding catalogue counts
    counts_per_journals = get_count_for_journals(metadata, cut)
    # update the list of journal names if needed by querying the ADS API
    get_dict_of_journals_names(counts_per_journals, ads_api_key)
    # save the plot as html
    plot_pie_chart(
        date.today().isoformat(), todays_number_of_catalogs, counts_per_journals, cut
    )


if __name__ == "__main__":
    # add the API argument
    parser = argparse.ArgumentParser()
    parser.add_argument("api", help="api key to connect to ADS", type=str)
    arguments = parser.parse_args()
    # call main
    main(arguments.api)
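
# Example invocation (file name taken from the repository):
#   python vizier_stats.py <ADS_API_KEY>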