from bs4 import BeautifulSoup
import requests
import csv
urldepart = "http://www.ff-voyance.com/"
visitedUrl = set()
urlToVisit = [urldepart]

with open('eggs.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',')

    def lit_page(url):
        """Fetch one page, queue its internal links and write the forum posts it contains to the CSV."""
        print(".")
        r = requests.get(url)
        visitedUrl.add(url)
        soup = BeautifulSoup(r.text, 'html.parser')

        # Queue links that stay on the same site and have not been visited yet.
        for link in soup.find_all('a'):
            urlHref = link.get('href')
            if urlHref is None or urlHref in visitedUrl:
                continue
            if urlHref.startswith('.') or urlHref.startswith('/'):
                urlToVisit.append(urldepart + urlHref.lstrip('./'))
            elif urlHref.startswith(urldepart):
                urlToVisit.append(urlHref)

        # A post lives in a <div class="post row panel"> containing a <div class="postbody">.
        for post in soup.find_all('div'):
            classe = post.get('class')
            if classe is None or not {'post', 'row', 'panel'}.issubset(classe):
                continue
            for body in post.find_all('div'):
                classe2 = body.get('class')
                if classe2 is None or 'postbody' not in classe2:
                    continue
                member = None
                content = None
                # The member name is the last <a> after the first one inside <p class="author">.
                for para in body.find_all('p'):
                    classe3 = para.get('class')
                    if classe3 is not None and 'author' in classe3:
                        authors = para.find_all('a')
                        if len(authors) > 1:
                            member = authors[-1].get_text()
                if member is not None:
                    # The post text sits in the <div class="content"> of the same postbody.
                    for div in body.find_all('div'):
                        classe3 = div.get('class')
                        if classe3 is not None and 'content' in classe3:
                            content = div.get_text()
                    spamwriter.writerow([url, member, content])

    # Crawl: the list grows while we iterate over it, so newly discovered pages are visited too.
    for url in urlToVisit:
        if url not in visitedUrl:
            lit_page(url)