-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparsher.py
32 lines (25 loc) · 963 Bytes
/
parsher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# importing the libraries
from bs4 import BeautifulSoup
import requests
import pickle
import os.path
# Collect the text of every mailto: link found in the files under the input
# directory tree and write the results, one per line, to a text file.

# Destination of the collected link texts.
directory = 'C:/way to output file'
filename = "vystup.txt"
file_path = os.path.join(directory, filename)
print(file_path)

# Text of every matching anchor, in the order os.walk visits the files.
my_list = []
for path, dirs, files in os.walk(r"C:/waytofiles in directory/"):
    for f in files:
        html_files = os.path.join(path, f)
        print(html_files)
        # Parse the html content. The handle is closed as soon as the parser
        # has consumed it, so walking a large tree does not accumulate open
        # file descriptors.
        with open(html_files, encoding="utf-8") as html_handle:
            soup = BeautifulSoup(html_handle, "lxml")
        # Select anchors whose href attribute starts with "mailto".
        for link in soup.select('a[href^=mailto]'):
            print("Inner Text: {}".format(link.text))
            my_list.append(link.text)

# Drop entries whose text is exactly the placeholder string below
# (presumably left by e-mail obfuscation on the scraped pages -- confirm).
newEmails = [email for email in my_list if email != '[email protected]']
print(newEmails)

# Write with the same encoding the inputs were read with, so non-ASCII link
# text cannot fail on a narrower locale default; `with` closes the file even
# if a write raises.
with open(file_path, 'w', encoding="utf-8") as file:
    file.writelines(line + "\n" for line in newEmails)