-
Notifications
You must be signed in to change notification settings - Fork 0
/
scanner.py
93 lines (61 loc) · 3.14 KB
/
scanner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import logging
import warnings
import wikipedia
import streamlit as st
from typing import List
from scanner_utils import *
from xgboost import XGBClassifier
from streamlit_searchbox import st_searchbox
from transformers import logging as hflogging
# Silence library chatter (stdlib logging, HF transformers, common warning
# categories) so the Streamlit UI stays clean.
logging.disable(logging.WARNING)
hflogging.set_verbosity_warning()
for _category in (UserWarning, FutureWarning, DeprecationWarning):
    warnings.simplefilter(action='ignore', category=_category)

# Page chrome and Wikipedia language (arz = Egyptian Arabic).
st.set_page_config(layout="centered", page_title="Egyptian Wikipedia Scanner", page_icon="🇪🇬")
wikipedia.set_lang("arz")

# Inject the app's custom CSS into the page.
with open('.streamlit/style.css') as f:
    st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)

# Title and subtitle banner.
st.markdown("""
<h1 style='text-align: center';>Egyptian Arabic Wikipedia Scanner</h1>
<h5 style='text-align: center';>Automatic Detection of Template-translated Articles in the Egyptian Wikipedia</h5>
""", unsafe_allow_html=True)
st.markdown("", unsafe_allow_html=True)
def search_wikipedia(searchterm: str) -> List[str]:
    """Return Egyptian Arabic Wikipedia title suggestions for *searchterm*.

    A falsy search term (empty string / None) short-circuits to an empty
    list without calling the Wikipedia API.
    """
    # Fix: the original annotated the return as List[any] — `any` is the
    # builtin function, not typing.Any. wikipedia.search returns page
    # titles, i.e. a list of strings.
    return wikipedia.search(searchterm) if searchterm else []
@st.cache_resource
def load_xgb_model(model):
    """Load a persisted XGBoost classifier from the *model* file path.

    Cached with st.cache_resource so the model is deserialized only once
    per Streamlit session, not on every rerun.
    """
    classifier = XGBClassifier()
    classifier.load_model(model)
    return classifier
# Searchbox driving the whole app: suggestions come from the Egyptian
# Arabic Wikipedia via search_wikipedia().
selected_title = st_searchbox(search_wikipedia, label="Search for an article in Egyptian Arabic Wikipedia:",
                              placeholder="Search for an article", rerun_on_update=True, clear_on_submit=False, key="wiki_searchbox")

# NOTE(review): the source scrape stripped indentation; the nesting below is
# reconstructed from statement order — confirm against the original file.
if selected_title:
    # Build the feature vector and metadata table (helper from scanner_utils).
    X, article, dataframe, selected_title = prepare_features(selected_title)
    st.write(f':black_small_square: Collected Metadata of **{selected_title}**')
    st.dataframe(dataframe, hide_index=True, use_container_width=True)

    # Classify the article with the pre-trained XGBoost model.
    loaded_xgb_classifier = load_xgb_model("XGBoost.model")
    id2label = {0: 'Human-generated Article', 1: 'Template-translated Article'}
    # Fix: predict() returns a 1-element array; index it explicitly instead of
    # relying on the deprecated size-1-array-to-int conversion (NumPy >= 1.25).
    result = id2label[int(loaded_xgb_classifier.predict(X)[0])]

    # Fix: this header was duplicated verbatim in both branches — hoisted.
    st.write(f":black_small_square: Automatic Classification of **{selected_title}**")
    if result == 'Human-generated Article':
        st.success(result, icon="✅")
    else:
        st.error(result, icon="🚨")

    st.write(f":black_small_square: Full Summary of **{selected_title}**")
    with st.expander(f'**{selected_title}**', expanded=True):
        st.markdown('<style>p {text-align: justify;}</style>', unsafe_allow_html=True)
        try:
            article_text = wikipedia.summary(selected_title)
        except wikipedia.exceptions.DisambiguationError as e:
            # Ambiguous title: fall back to the first suggested option.
            article_text = wikipedia.summary(e.options[0])
        st.write(article_text)
        st.write(f'> :globe_with_meridians: Read Full Text of **{selected_title}**: <br>{article.url}', unsafe_allow_html=True)
# Vertical spacer before the footer.
st.markdown('<br><br>', unsafe_allow_html=True)

# Page footer, rendered as raw HTML (styled by .streamlit/style.css).
st.markdown(
    """
<div class="footer"> <p class="p1">
Copyright © 2024 by Saied Alshahrani<br>
Hosted with Streamlit Community Cloud</p> </div>
""",
    unsafe_allow_html=True,
)