# CLscraper.py — executable file, 121 lines (95 loc), 5.47 KB
#!/usr/bin/python
"""Written By: Bobak Hashemi
Last Updated: 5/9/2018
This program takes in a list of craigslist search queries (given by the search links with GET data),
indexes the results for each query, and emails new posts to a set of specified email addresses.
This program uses the beautiful soup 4 package, to install run "sudo easy_install beautifulsoup4"
"""
import urllib
import datetime
from bs4 import BeautifulSoup
import time
import smtplib
import json
import sys
try: from urllib.request import urlopen
except ImportError: from urllib2 import urlopen
import random
from configparser import ConfigParser
CHECK_OLD_LISTINGS = True # If True, don't resend listings that have been reposted

# Load SMTP credentials, recipients, search URLs, and the sleep interval
# from config.ini (section [CLscraper]). List-valued settings are stored
# as JSON in the ini file.
config=ConfigParser()
config.read('config.ini')
smtp_server=config.get("CLscraper","smtp_server").strip()
smtp_username=config.get("CLscraper","smtp_username").strip()
smtp_password=config.get("CLscraper","smtp_password").strip()
fromaddr=config.get("CLscraper","fromaddr").strip()
toaddrs = json.loads(config.get("CLscraper","toaddrs"))
urls = json.loads(config.get("CLscraper","urls"))
sleepinterval = json.loads(config.get("CLscraper","sleeptime")) # [min_minutes, max_minutes]
if sleepinterval[1] < sleepinterval[0]:
    print("Sleep interval %s is not well formed, second number must be larger than the first. Exiting..." % str(sleepinterval))
    # sys.exit instead of the bare exit(): the site-module exit() helper is
    # not guaranteed to exist (e.g. under `python -S` or frozen builds).
    sys.exit(1)
SLEEPTIME = random.randint(60*sleepinterval[0],60*sleepinterval[1]) # Number of seconds between searches, drawn uniformly from the configured interval
old_listings = [] #Initialize list of old posting's unique craigslist ID
email = [] #Initialize list of posting's to be emailed after some run.
def constructMessage(msg, new_listings):
    """Build the full email text: a Subject header line, the given body
    preamble, then one "url : title" line per new listing."""
    parts = ["Subject: New Matches on Craigslist Search \n\n" + msg]
    # new_listings maps pid -> (url, title); emit one line per posting.
    for link, title in new_listings.values():
        parts.append(link + " : " + title + "\n")
    return "".join(parts)
def getListOfIdsAndUrls():
    """Scrape every configured search URL and return a dict mapping each
    not-yet-seen listing's PID to a (url, title) tuple.

    Side effect: appends every newly observed pid (and, when present, its
    "data-repost-of" pid) to the module-level old_listings list so the same
    posting is not reported twice across runs.
    """
    new_listings = {} #dictionary which holds the unique ID for each listing and the URL.
    for craigslistLinkAddress in urls:
        f = urlopen(craigslistLinkAddress) #Open Web Address
        try:
            page = f.read()
        finally:
            f.close() # fix: the response object was previously never closed (socket leak)
        soup = BeautifulSoup(page,"html.parser") #read in html into BS4 data structure
        content = str(soup.find_all("div", class_="content")[0]) #Get the body of the search, strips away all the sidebars and stuff.
        # Truncate at the "nearby results" banner so listings from far-away towns are dropped.
        soup = BeautifulSoup(content[:content.find("<h4 class=\"ban nearby\">")],"html.parser")
        for listing in soup.find_all("li", {"class": "result-row"}): #each result row carries a unique ID in its data-pid attribute
            pid = listing.attrs["data-pid"]
            old_pid = pid
            # Hoisted: the title anchor was previously looked up twice per listing.
            anchor = listing.find("a", {"class": "result-title"})
            url = anchor.attrs["href"]
            title = anchor.text
            if CHECK_OLD_LISTINGS:
                # Reposts carry the original posting's ID; track it so a
                # repost of something we already saw is not re-sent.
                if "data-repost-of" in listing.attrs:
                    old_pid = listing.attrs["data-repost-of"]
            if (pid not in old_listings) and (old_pid not in old_listings): #check if listing is in the old list
                new_listings[pid] = (url, title) #listing should be returned
                old_listings.append(pid) #add the new pid to list of ones we've seen
                old_listings.append(old_pid) #add the old pid to list of ones we've seen for future proofing
            elif pid not in old_listings:
                # Repost of a known listing under a fresh pid: remember the new
                # pid too, in case craigslist chains repost IDs.
                old_listings.append(pid)
    return new_listings
def doIteration(msg):
    """Run one scrape pass: fetch new listings and, if any were found,
    email them to the configured recipients.

    msg is the message preamble; per-listing lines are appended by
    constructMessage. Prints a status line either way.
    """
    new_listings = getListOfIdsAndUrls()
    if not new_listings:
        print("No new listings found")
        return
    msg = constructMessage(msg, new_listings)
    # Write encoded bytes directly so non-ASCII listing titles cannot crash
    # printing on an ASCII-configured console.
    sys.stdout.buffer.write(("Found new listings, about to send email: \n\n%s" % msg).encode('utf-8'))
    sys.stdout.buffer.flush()
    server = smtplib.SMTP(smtp_server)
    try:
        server.starttls()
        if smtp_username:
            server.login(smtp_username, smtp_password)
        server.sendmail(fromaddr, toaddrs, msg.encode('utf-8'))
    finally:
        # fix: previously the connection leaked if starttls/login/sendmail raised
        server.quit()
# ---- Start Initialization Run to get all posts already on craigslist
# The first pass emails everything currently listed, with a welcome preamble;
# later passes only report postings not seen before.
msg = "Hi! \n I will do your craigslist search between every %f to %f minutes and notify you whenever a new listing is posted that matches our search criteria. Here are all the intial positings that were up at the time your search was started... \n\n" % (sleepinterval[0], sleepinterval[1])
doIteration(msg)
# NOTE(review): `email` and `new` are assigned here (and again in the main
# loop) but never read anywhere in this file — they look vestigial; confirm
# before removing.
email = [] #re-initialize list of new posts and new post flag
new = False
time.sleep(SLEEPTIME) #wait for SLEEPTIME seconds before entering main loop
# ---- End Initialization Run
# ---- Start Main Loop
# Poll forever: scrape, email anything new, then sleep a freshly randomized
# interval (randomizing each pass makes the traffic pattern less regular).
while True:
    print("\n\n "+str(datetime.datetime.now())+": --Checking again!-- \n\n") #Print timestamp to terminal so you know it's working
    msg = "There are new postings: \n\n" #construct new message header
    doIteration(msg)
    #re-initialize list of new posts and new post flag and wait SLEEPTIME seconds before starting again
    # NOTE(review): `email` and `new` appear unused in this file — likely vestigial.
    email = []
    new = False
    time.sleep(SLEEPTIME)
    SLEEPTIME = random.randint(60*sleepinterval[0],60*sleepinterval[1]) # re-draw the delay for the next pass