-
Notifications
You must be signed in to change notification settings - Fork 2
/
app.py
216 lines (164 loc) · 9.09 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
'''this streamlit app will be used to scrape webpage, extract the text and the create a wordcloud from it, the wordcloud can be also saved as an image file.
The app also will be used to create a sentiment analysis of the text and Name Entity Recognition.
The app will also be used to create a screenshot and download it.
the app is useful for content writers, bloggers, and anyone who wants to create a wordcloud from a webpage, to monitor the topics that are being discussed in the webpage.
Library used:
- streamlit
- requests
- selenium
- spacy
- wordcloud
- matplotlib
- PIL
- json
- pandas
'''
### Importing the libraries ###
import streamlit as st
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
import io
from wordcloud import STOPWORDS, ImageColorGenerator, WordCloud
import warnings
#### Main functions import from utils: ###
from utils.get_text import get_text
from utils.preprocess_text import preprocess_text
from utils.sentiment_analysis import sentiment_analysis
from utils.name_ent_recognition import name_ent_recognition
from utils.screenshot import screenshot
## Warnings ignore ###
warnings.filterwarnings(action='ignore')
st.set_option('deprecation.showfileUploaderEncoding', False)
st.set_option('deprecation.showPyplotGlobalUse', False)
### App layout ###
def main():
# create a wide layout
st.set_page_config(
page_title="Scrapify ¦ Web Page Analysis For Content Writers",
page_icon="📰",
initial_sidebar_state="expanded",
layout="wide")
# create a sidebar menu
option = st.sidebar.selectbox("Select an option", ["Home", "Webpage Text Scraper", "Word Cloud", "Sentiment Analysis", "Name Entity Recognition", "Screenshot"])
if option == "Home":
# add an image
display = image = Image.open("img/Sport+Performance+Analysis+-+Web+Scraping+21.png")
display = np.array(display)
st.image(display, width=500)
st.title("Scrapyfy\n")
st.success('**Web Page Text Analysys App** for Content Writers, Bloggers, to monitor the topics that are being discussed on the webpage.')
st.markdown('''This streamlit app will be used to scrape webpage, extract the text and create a wordcloud from it, the wordcloud can be also saved as an image file.
The app also will be used to create a sentiment analysis of the text and Name Entity Recognition. The multiple features are accessible from the sidebar menu on the left.\n
The app will also be used to create a screenshot and download it. The app is useful for content writers, bloggers, and anyone who wants to create a wordcloud from a webpage,
to monitor the topics that are being discussed in the webpage.\n''')
st.subheader('''**🧾 The features of the app you will find on the sidebar menu on the left are:**\n''')
st.success('''- **Webpage Text Scraper:** Scrape the text from the webpage and show the first ten lines of the text in a table.\n
- **Word Cloud:** Create a Wordcloud from the text scraped from the webpage.\n
- **Sentiment Analysis:** Create a sentiment analysis of the text scraped from the webpage.\n
- **Name Entity Recognition:** Create a name entity recognition of the text scraped from the webpage.\n
- **Screenshot:** Create a screenshot of the webpage and download it.\n''')
st.info('''👈👈Start playing selecting the option from the sidebar menu on the left.👈👈\n''')
# create a footer
st.caption('''App created by [**@marcellodichiera**](https://marcello-personal-website.netlify.app/)''')
## Webpage Text Scraper
elif option == "Webpage Text Scraper":
st.title("Webpage Scraper test")
st.markdown('##### This option lets you scrape a webpage, extract the text from it and show the first 10 lines in a table and download the text as a csv file.')
# create a text box to enter the URL
URL = st.text_input("Enter the URL of the webpage you want to scrape")
# create a button to scrape the text from the webpage
if URL is not None:
if st.button("Scrape"):
text = get_text(URL)
df = pd.DataFrame(text.splitlines(), columns=["Webpage_text"], index=None)
# show the text in the dataframe
st.markdown('## Showing the first ten lines of the text')
st.dataframe(df.head(10))
# download the text as a csv file
st.info('''Download the text as a csv file if you like.''')
st.download_button(label="Download the text as a csv file",
data=df.to_csv(index=False, encoding='utf-8'),
file_name='webpage_text.csv',
mime='text/csv')
else:
st.warning("Please enter a valid URL")
# Word Cloud
elif option == "Word Cloud":
st.title("Word Cloud")
st.markdown('##### This option lets you create a wordcloud text from a webpage')
# create a text box to enter the URL
URL = st.text_input("Enter the URL of the webpage you want to scrape")
# create a button to scrape the text from the webpage
if URL is not None:
if st.button("Scrape"):
text = get_text(URL)
# create a wordcloud
wordcloud = WordCloud(width=800, height=400, background_color="white", stopwords=STOPWORDS).generate(text)
# wordcloud title
st.info('#### Wordcloud of {}'.format(URL))
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
# show the wordcloud
st.pyplot()
else:
st.warning("Please enter a valid URL")
# Sentiment Analysis
elif option == "Sentiment Analysis":
st.title("Sentiment Analysis")
st.markdown('##### This option lets you create a sentiment analysis of the text from a webpage')
# create a text box to enter the URL
URL = st.text_input("Enter the URL of the webpage you want to scrape")
# create a button to scrape the text from the webpage
if URL is not None:
if st.button("Scrape"):
text = get_text(URL)
# preprocess the text
#text = preprocess_text(text)
#get the sentiment score and label
sentiment_score, sentiment_label = sentiment_analysis(text)
# show the sentiment score rounded to 2 decimals and label
st.markdown('#### Sentiment score: {}'.format(round(sentiment_score, 2)))
st.markdown('#### Sentiment of web page content is: {}'.format(sentiment_label))
else:
st.warning("Please enter a valid URL")
# Name Entity Recognition
elif option == "Name Entity Recognition":
st.title("Name Entity Recognition")
st.markdown('##### This option lets you create a Name Entity Recognition of the text from a webpage')
st.markdown('Name Entity Recognition is a process of extracting the entities from the text, such as person, location, organization, etc.')
# create a text box to enter the URL
URL = st.text_input("Enter the URL of the webpage you want to scrape")
# create a button to scrape the text from the webpage
if URL is not None:
if st.button("Scrape"):
text = get_text(URL)
# preprocess the text
text = preprocess_text(text)
#get the name entities
name_entities = name_ent_recognition(text)
# show the name entities
st.markdown('#### Name entities of web page content are: {}'.format(name_entities))
else:
st.warning("Please enter a valid URL")
# Screenshot
elif option == 'Screenshot':
st.title("Screenshot")
st.markdown('##### This option lets you create a screenshot of a webpage and download it')
# create a text box to enter the URL
URL = st.text_input("Enter the URL of the webpage you want to scrape")
# create a button to scrape the text from the webpage
if URL is not None:
if st.button("Scrape"):
# get the screenshot
take_screen = screenshot(URL)
# show the screenshot
img_screen = Image.open('page_screen.jpg')
st.image(img_screen, caption='Screenshot of the webpage', use_column_width=True)
else:
st.warning("Please enter a valid URL")
# run the app
if __name__ == "__main__":
main()