-
Notifications
You must be signed in to change notification settings - Fork 0
/
wascrapper.py
89 lines (78 loc) · 3.54 KB
/
wascrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
from bs4 import BeautifulSoup
import re
from lxml import etree
import requests
def scrapperWa():
    """Search Google for WhatsApp group pages and record the result headings.

    Prompts for a search term, then requests eight Google result pages
    (``start`` offsets 0, 10, ..., 70) restricted to ``chat.whatsapp.com``.
    Each requested URL is printed, and the text of every ``<h3>`` element on
    the page is printed and appended to ``content/links.txt``. Every entry is
    written with a leading newline, so the file never ends with a newline.

    Note that what is stored is the heading text of each search result, not
    the URL of the result.

    Returns:
        None. Output goes to stdout and to ``content/links.txt``.

    Raises:
        requests.RequestException: propagated from ``requests.get`` when a
            search request fails.
        OSError: if ``content/links.txt`` cannot be opened for appending.
    """
    buscar = input("Enter the search term:")
    # The headers are identical for every page, so build them once.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
        'Accept-Language': 'pt-BR, pt;q=0.5',
    }
    for start in range(0, 71, 10):
        url = f'https://www.google.co.ve/search?q=+{buscar}+site:https://chat.whatsapp.com/*&newwindow=1&start={start}&newwindow=1&filter=0&biw=1920&bih=1009&dpr=1'
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, "html.parser")
        titles = [heading.getText() for heading in soup.find_all('h3')]
        print(url)
        if not titles:
            # Nothing to record for this page; do not touch the output file.
            continue
        # Open the file once per page and let the context manager close it,
        # instead of leaking one open handle per result.
        with open("content/links.txt", "a") as arquivo:
            for title in titles:
                print(title)
                arquivo.write(f"\n{title}")
def scrapperGo():
    """Search Google within one website for WhatsApp invite links and save them.

    Prompts for a search term and a website, then requests eleven Google
    result pages (``start`` offsets 0, 10, ..., 100) for
    ``site:<website> <term> https://chat.whatsapp.com``. Every
    ``https://chat.whatsapp.com/...`` invite link found in the returned HTML
    is printed and appended to ``content/links.txt``. Every entry is written
    with a leading newline, so the file never ends with a newline.

    Duplicate links are not filtered out.

    Returns:
        None. Output goes to stdout and to ``content/links.txt``.

    Raises:
        requests.RequestException: propagated from ``requests.get`` when a
            search request fails.
        OSError: if ``content/links.txt`` cannot be opened for appending.
    """
    buscar = input("Enter the search term:")
    site = input("Enter website to extract links: Example: facebook.com")
    # Headers and the compiled pattern are loop-invariant; build them once.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
        'Accept-Language': 'pt-BR, pt;q=0.5',
    }
    # Group 1 is the whole link; group 2 is the optional "invite/" segment.
    invite_pattern = re.compile(
        r"(https:\/\/chat\.whatsapp\.com\/(invite\/)?[a-zA-Z0-9]{22})"
    )
    for start in range(0, 101, 10):
        url = f'https://www.google.com/search?q=site:{site}+{buscar}+https://chat.whatsapp.com&rlz=1C1FCXM_pt-PTBR997BR997&ei=VOmhY8OoH6_e1sQP9_WpqAQ&start={start}'
        response = requests.get(url, headers=headers)
        # Search the parser's re-serialised markup, as the original did,
        # rather than the raw response text.
        page_html = str(BeautifulSoup(response.text, "html.parser"))
        found = [match.group(1) for match in invite_pattern.finditer(page_html)]
        if not found:
            # Nothing to record for this page; do not touch the output file.
            continue
        # Open the file once per page and let the context manager close it,
        # instead of leaking one open handle per link.
        with open("content/links.txt", "a") as arquivo:
            for link in found:
                print(link)
                arquivo.write(f"\n{link}")
def scrapperOpt():
    """Print every WhatsApp invite link found on a page chosen by the user.

    Prompts for a URL, downloads it with a browser-like User-Agent, and
    prints each ``https://chat.whatsapp.com/...`` invite link that appears in
    the parsed markup. Nothing is written to disk.

    Returns:
        None. Matching links are printed to stdout, one per line.
    """
    target = input("Enter the website to extract the links:")
    user_agent = (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'
    )
    response = requests.get(
        target,
        headers={'User-Agent': user_agent, 'Accept-Language': 'pt-BR, pt;q=0.5'},
    )
    markup = str(BeautifulSoup(response.text, "html.parser"))
    invite_regex = r"(https:\/\/chat\.whatsapp\.com\/(invite\/)?[a-zA-Z0-9]{22})"
    # findall yields (full_link, optional "invite/" segment) tuples because
    # the pattern has two groups; only the full link is shown.
    for full_link, _invite_segment in re.findall(invite_regex, markup):
        print(full_link)
def scrapperVal():
    """Validate the links in ``content/links.txt`` and record the working ones.

    Each non-blank line of ``content/links.txt`` is fetched. If the page
    contains an ``<h3>`` directly under the element with id ``main_block`` and
    that heading has text, the row ``<heading text>,<url>`` is appended to
    ``/content/links_valid.csv``. Entries that cannot be fetched, or whose
    page lacks that heading, are skipped so one bad entry does not abort the
    whole run.

    NOTE(review): the input path is relative while the output path is
    absolute (``/content/...``); confirm this difference is intended.

    Returns:
        None. Output goes to stdout and to ``/content/links_valid.csv``.

    Raises:
        OSError: if ``content/links.txt`` cannot be read or the CSV file
            cannot be opened for appending.
    """
    print("You can validate the links you extracted or you can validate links by pasting the links in the content/links.txt file.")
    # Read everything up front so the input handle is closed before the
    # (slow) network loop starts. strip() is used instead of slicing off the
    # last character: the final line of the file has no trailing newline, so
    # slicing would cut the last character off that link.
    with open('content/links.txt') as links:
        urls = [line.strip() for line in links]
    # The headers are identical for every request, so build them once.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
        'Accept-Language': 'pt-BR, pt;q=0.5',
    }
    for url in urls:
        if not url:
            # Blank lines (e.g. the leading newline written by the scrapers)
            # are not links.
            continue
        try:
            webpage = requests.get(url, headers=headers)
        except requests.RequestException as error:
            # Malformed or unreachable entries are simply not valid links.
            print(f"Skipping {url}: {error}")
            continue
        soup = BeautifulSoup(webpage.content, "html.parser")
        dom = etree.HTML(str(soup))
        # Guard against a document that could not be parsed into a tree.
        headings = dom.xpath('//*[@id="main_block"]/h3') if dom is not None else []
        if not headings:
            # No group heading on the page: treat the link as invalid.
            continue
        nome_grupo = headings[0].text
        if nome_grupo is not None:
            # Append and close immediately so each valid link is persisted
            # even if a later request fails, without leaking file handles.
            with open("/content/links_valid.csv", "a") as arquivo:
                arquivo.write(f"{nome_grupo},{url}\n")