randomArticle.py
# Extracts random sentences (or paragraphs) from random Wikipedia articles.
from inscriptis import get_text
import urllib.request


def random_sent(n_iters=1000):
    '''
    Generates random sentences from random Wikipedia articles and stores the
    result both in the returned list and in a CSV file for later use.
    '''
    # number of articles to fetch
    n = n_iters
    random_sents = []
    with open('randomly.csv', 'w') as file:
        for _ in range(n):
            # "Special:Random" redirects to a random article
            url = "http://en.wikipedia.org/wiki/Special:Random"
            html = urllib.request.urlopen(url).read().decode('utf-8')
            # convert the article HTML to plain text and split it into lines
            text = get_text(html).split('\n')
            # extract sentences (or paragraphs), add them to random_sents,
            # and write them to the CSV file
            for line in text:
                if ' is ' in line:
                    # filter out some trivially repeated boilerplate lines
                    if 'Wikipedia' not in line and 'Wikidata' not in line:
                        if 'section is empty' not in line:
                            random_sents.append(line)
                            file.write(line.encode('ascii', 'ignore').decode('utf-8'))
                            file.write('\n')
    return random_sents
# Write the collected sentences into a second CSV file.
text = random_sent()
with open('rand.csv', 'w') as file:
    for line in text:
        file.write(line.encode('ascii', 'ignore').decode('utf-8'))
        file.write('\n')
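
# A minimal sketch (not part of the original script): the same fetch-and-extract
# step for a single random article, handy for checking that urllib and inscriptis
# work as expected before launching the full 1000-article run. The function name
# preview_random_article and the max_lines parameter are illustrative only.
def preview_random_article(max_lines=20):
    url = "http://en.wikipedia.org/wiki/Special:Random"
    html = urllib.request.urlopen(url).read().decode('utf-8')
    # keep only the first few non-empty lines of the extracted plain text
    lines = [l for l in get_text(html).split('\n') if l.strip()]
    return lines[:max_lines]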