forked from lucasrodes/whatstk-webapp
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
209 lines (188 loc) · 7.08 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
"""
# WhatsApp chat parser
Streamlit app that parses an uploaded WhatsApp chat export with whatstk,
offers the result as a CSV download and shows visualisations of the chat.
"""
from pathlib import Path
import streamlit as st
import tempfile
from whatstk import df_from_txt_whatsapp
import zipfile
import os
from whatstk import FigureBuilder
# Page settings: browser-tab title and icon, centred layout, sidebar collapsed on load.
st.set_page_config(
    layout="centered",
    initial_sidebar_state="collapsed",
    menu_items=None,
    page_title="WhatsApp chat parser",
    page_icon="favicon.png",
)

# CSS snippet that hides the `#MainMenu` element and the page footer.
hide_default_format = "\n".join(
    [
        "",
        "<style>",
        "#MainMenu {visibility: hidden; }",
        "footer {visibility: hidden;}",
        "</style>",
        "",
    ]
)
# The snippet is raw HTML, so it has to be rendered with HTML allowed.
st.markdown(hide_default_format, unsafe_allow_html=True)

# APP title, followed by a link to the library that does the parsing.
st.title("WhatsApp chat parser")
st.markdown("**⚡ Powered by [whatstk](https://github.com/lucasrodes/whatstk)**")
# Side bar: optional parsing settings, both later passed to the chat parser.
# Header format of the chat; empty unless the user types one.
hformat = st.sidebar.text_input(
    label="Header format",
    help="More info at https://whatstk.readthedocs.io/en/stable/source/getting_started/hformat.html.",
)
# Text encoding of the chat file; pre-filled with "utf-8".
encoding = st.sidebar.text_input(
    label="Encoding",
    value="utf-8",
    help="Encoding of the chat.",
)
# Encoding applied to the CSV that is offered for download.
ENCODING_DEFAULT = "utf-8"

# Privacy notice (Markdown): a bold heading, a blank line, then the policy
# sentences separated by single spaces.
msg_privacy = "**Privacy policy**" + "\n\n" + " ".join(
    (
        "All your uploaded files are deleted once you leave the page.",
        "Your files are _only_ used to automatically generate your visualisations and a CSV file for you.",
        "Your files are never accessed by any human, and remain totally private.",
        "All the code used to run this site is [public](https://github.com/lucasrodes/whatstk-webapp/)",
    )
)

# The toast (pop-up) variant of the notice is currently disabled.
# st.toast(
#     msg_privacy,
#     icon="🔒"
# )
# Upload file box: accepts a plain-text chat (.txt) or a compressed one (.zip).
# The result is None until the user has provided a file.
uploaded_file = st.file_uploader(
    type=["txt", "zip"],
    label=(
        "Upload your WhatsApp chat file "
        "([learn more](https://whatstk.readthedocs.io/en/stable/source/getting_started/export_chat.html))."
    ),
    # label_visibility="collapsed",
)
# Regular expressions that must match a *whole* message for that message to be
# treated as a WhatsApp system notice rather than something a participant wrote.
SYSTEM_MESSAGE_PATTERNS = [
    r".?Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them.",
    r".?Group creator created this group",
    r".?\screated this group",
    r".?You were added",
]


class _ChatUploadError(Exception):
    """The uploaded file cannot be used; the message is meant to be shown to the user."""


def _load_chat(upload, hformat, encoding):
    """Parse the uploaded chat export into a dataframe.

    The chat is written to a temporary directory only for the duration of the
    parsing; the directory is removed when this function returns or raises.

    Args:
        upload: Object returned by ``st.file_uploader``. Its ``name`` decides
            whether it is handled as a zip archive or as a plain-text chat.
        hformat: Header format forwarded to ``df_from_txt_whatsapp``.
        encoding: Text encoding forwarded to ``df_from_txt_whatsapp``.

    Returns:
        The dataframe produced by ``df_from_txt_whatsapp``.

    Raises:
        _ChatUploadError: If a zip upload is not a readable archive, or does
            not unpack to exactly one entry.
    """
    with tempfile.TemporaryDirectory() as work_dir:
        work_dir = Path(work_dir)
        if upload.name.lower().endswith(".zip"):
            # Uncompress the file
            try:
                with zipfile.ZipFile(upload, 'r') as zip_ref:
                    zip_ref.extractall(work_dir)
            except zipfile.BadZipFile as err:
                raise _ChatUploadError(
                    "The uploaded file could not be opened as a zip archive."
                ) from err
            files = os.listdir(work_dir)
            # Explicit check instead of `assert`: asserts vanish under `-O` and
            # would otherwise surface to the user as a raw traceback.
            if len(files) != 1:
                raise _ChatUploadError(
                    "Unexpected number of files in the compressed file! "
                    "Only one is expected (the chat txt file)"
                )
            chat_path = work_dir / files[0]
        else:
            chat_path = work_dir / "chat"
            # getvalue() returns the full content regardless of the current
            # read position of the uploaded buffer.
            chat_path.write_bytes(upload.getvalue())
        # Read
        return df_from_txt_whatsapp(
            chat_path,
            hformat=hformat,
            encoding=encoding,
        )


def _drop_system_messages(df):
    """Return ``df`` without the rows whose message is a known system notice.

    A row is dropped when its ``message`` value fully matches any of
    ``SYSTEM_MESSAGE_PATTERNS``.
    """
    # NOTE(review): the previous code additionally tried, for chats with more
    # than two usernames, to drop every message written by the username that
    # authored a system notice. It searched for those notices only after they
    # had already been removed, so it never matched anything. That no-op is
    # left out here; if the feature is wanted, collect the usernames *before*
    # filtering and confirm that notices are never attributed to a real user.
    for pattern in SYSTEM_MESSAGE_PATTERNS:
        is_system = df['message'].str.fullmatch(pattern)
        df = df[~is_system]
    return df


def _show_results(df):
    """Render the CSV download button and the visualisation tabs for ``df``."""
    # Download option
    csv_bytes = df.to_csv().encode(ENCODING_DEFAULT)
    st.download_button(
        label="Download CSV",
        data=csv_bytes,
        file_name='chat.csv',
        mime='text/csv',
        help="Download the formatted chat as a CSV file",
    )

    # Visualisations
    st.header("Visualisations")
    tab_counts, tab_lengths, tab_interaction, tab_table = st.tabs(
        ["Number of messages sent", "Length of messages", "User interaction", "Table"]
    )
    # FigureBuilder
    fb = FigureBuilder(df=df)

    with tab_counts:
        # Counting mode
        count_mode = st.radio(
            "Counting mode",
            ("Number of messages sent", "Number of characters sent"),
            horizontal=True,
        )
        # Aggregate all users or disaggregated by user?
        aggregate_choice = st.radio(
            "Aggregate all users",
            ("Yes", "No"),
            horizontal=True,
            index=1,
        )
        all_users = aggregate_choice == "Yes"
        # Anything other than the message-count mode counts characters.
        msg_length = count_mode != "Number of messages sent"
        unit = "characters" if msg_length else "messages"
        figs = [
            fb.user_interventions_count_linechart(
                cumulative=True,
                title=f"Number of {unit} sent (cumulative)",
                msg_length=msg_length,
                all_users=all_users,
            ),
            fb.user_interventions_count_linechart(
                date_mode='hour',
                title=f"Number of {unit} sent (per hour in a day)",
                xlabel='Hour',
                msg_length=msg_length,
                all_users=all_users,
            ),
        ]
        for fig in figs:
            st.plotly_chart(fig)

    with tab_lengths:
        st.plotly_chart(fb.user_msg_length_boxplot())

    with tab_interaction:
        st.plotly_chart(fb.user_message_responses_heatmap())

    with tab_table:
        # Print chat as dataframe
        st.dataframe(df)


# Nothing below runs until the user has uploaded a file.
if uploaded_file is not None:
    # Load file as dataframe
    try:
        df = _load_chat(uploaded_file, hformat, encoding)
    except _ChatUploadError as err:
        st.error(str(err))
    except RuntimeError:
        st.error(
            "The chat could not be parsed automatically! You can try to set custom `hformat` "
            "value in the side bar config. "
            "Additionally, please report to https://github.com/lucasrodes/whatstk/issues. If possible, "
            "please provide a sample of your chat (feel free to replace the actual messages with dummy text)."
        )
    else:
        # Remove system messages, then show the download button and the charts.
        df = _drop_system_messages(df)
        _show_results(df)
# Footer: a horizontal rule, then the privacy notice prefixed with a lock emoji.
st.divider()
st.markdown(f"🔒 {msg_privacy}")