forked from ScilifelabDataCentre/dynamic-img
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlivewordcloud.py
86 lines (71 loc) · 2.96 KB
/
livewordcloud.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
"""
Generate wordclouds
"""
# importing all necessary modules
import io
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image
import numpy as np
import requests
def gen_wordcloud(field="title"):
    """
    Generate a wordcloud image from the COVID-19 publications database.

    Downloads the publications list as JSON, joins the text found in the
    requested column and renders it as a wordcloud shaped by the SciLifeLab
    symbol mask.

    Args:
        field (str): column of the normalised publications table to take the
            words from (default "title").

    Returns:
        BytesIO: the image as bytes, rewound to position 0 so it can be
        read directly (``getvalue()`` works as before).

    Raises:
        requests.RequestException: if the download fails, times out or the
            server answers with an error status.
        KeyError: if the JSON has no "publications" entry or ``field`` is
            not a column of the resulting table.
    """
    # imports the .json from the publications library:
    # https://publications-covid19.scilifelab.se/label/Funder%3A%20KAW/SciLifeLab.json
    # (will give just scilifelab funded papers)
    # A timeout keeps a stalled server from blocking the caller forever.
    resp = requests.get(
        "https://publications-covid19.scilifelab.se/publications.json",
        timeout=30,
    )
    # Fail early on HTTP errors instead of trying to parse an error page.
    resp.raise_for_status()
    txt = resp.json()

    # the below level of normalisation will give access to abstract and the title -
    # authors requires further 'digging' in the .json
    df = pd.json_normalize(txt["publications"])

    # add whatever words you'd like to exclude
    stopwords = list(STOPWORDS) + ["None", "s"]

    # pick the column you want to import words from df.columnname
    # (whitespace inside each value is collapsed to single spaces)
    title_words = " ".join(" ".join(str(val).split()) for val in df[field])

    # the mask image defines the shape and the size of the wordcloud
    with Image.open("img/SciLifeLab_symbol_POS.png") as mask_img:
        mask = np.array(mask_img)

    # COVID portal visual identity
    # add font
    cloud_font = "font/IBMPlexSans-Bold.ttf"

    # give colours
    # The parameter names below form the callback signature expected by
    # WordCloud's color_func, so they are kept even though most are unused.
    # pylint: disable=unused-argument
    def multi_color_func(word=None,
                         font_size=None,
                         position=None,
                         orientation=None,
                         font_path=None,
                         random_state=None):
        """Return one of the four portal colours as an ``hsl(...)`` string."""
        colors = [[338, 73, 52], [211, 56, 41],
                  [206, 62, 50], [208, 7, 46]]
        # NOTE(review): random_state is presumably a random.Random instance,
        # whose randint() includes both end points - hence len(colors) - 1.
        rand = random_state.randint(0, len(colors) - 1)
        return f"hsl({colors[rand][0]}, {colors[rand][1]}%, {colors[rand][2]}%)"

    wordcloud = WordCloud(background_color="white",
                          stopwords=stopwords,
                          font_path=cloud_font,
                          mask=mask,
                          min_font_size=14,
                          width=mask.shape[1],
                          height=mask.shape[0],
                          # 50 threshold sufficient to exclude 'ill covid'
                          # which makes little sense as a bigram (pair of words).
                          collocation_threshold=50,
                          color_func=multi_color_func,
                          prefer_horizontal=1,
                          # This now includes hyphens in punctuation
                          regexp=r"\w(?:[-\w])*\w?",
                          # max word default is 200, can make more or less be in cloud
                          max_words=200).generate(title_words)

    # plot the WordCloud image
    fig = plt.figure(figsize=(10, 10), facecolor=None)
    try:
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.tight_layout(pad=0)
        img = io.BytesIO()
        # savefig will save the figure (at resolution 300dpi - good enough for print)
        plt.savefig(img, dpi=300)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly;
        # without this each call would leak one figure.
        plt.close(fig)

    # Rewind so that callers reading from the buffer start at the beginning.
    img.seek(0)
    return img