Commit

improve first-time experience
saffronjam committed Jan 31, 2024
1 parent bce8616 commit 4ef6829
Showing 13 changed files with 348 additions and 139 deletions.
7 changes: 7 additions & 0 deletions api/db.py
@@ -38,8 +38,15 @@ def setup():

global c
c["listings-live-clean"] = db["listings-live-clean"]
c["listings-live-clean"].create_index("url", unique=True)

c["listings"] = db["listings"]
c["listings"].create_index("id", unique=True)
c["listings"].create_index("url", unique=True)

c["inflation"] = db["inflation"]
c["inflation"].create_index("id", unique=True)

c["predictions"] = db["predictions"]


5 changes: 5 additions & 0 deletions inference/db.py
@@ -38,8 +38,13 @@ def setup():

global c
c["listings"] = db["listings"]
c["listings"].create_index("id", unique=True)
c["listings"].create_index("url", unique=True)

c["predictions"] = db["predictions"]

c["inflation"] = db["inflation"]
c["inflation"].create_index("id", unique=True)


setup()
10 changes: 8 additions & 2 deletions model/db.py
@@ -39,13 +39,18 @@ def setup():

global c
c["listings"] = db["listings"]
c["listings"].create_index("id", unique=True)
c["listings"].create_index("url", unique=True)

c["inflation"] = db["inflation"]
c["inflation"].create_index("id", unique=True)


setup()

# Read



def get_listings(n: int = 0, page: int = 0):
res = (
c["listings"]
@@ -55,6 +60,7 @@ def get_listings(n: int = 0, page: int = 0):
)
return list(res)


def get_inflation(year: int, month: int):
if month < 10:
key = f"{year}M0{month}"
@@ -86,4 +92,4 @@ def get_cpi(date):
if latest is None:
return None

return float(latest["cpiDecided"])
return float(latest["cpiDecided"])
97 changes: 97 additions & 0 deletions scraper/README.md
@@ -0,0 +1,97 @@
# Getting started with the scraper


### 1. Ensure MongoDB is running and can be accessed by the scraper using the required environment variables
All the collections and required indexes will be created automatically.

If you are trying it out locally, you can use the following docker-compose file to start a MongoDB instance.

(Make sure you have a `data` folder in the same directory as the `docker-compose.yml` file)

```yaml
version: '3.1'

services:
mongodb:
image: mongo
environment:
MONGO_INITDB_ROOT_USERNAME: user
MONGO_INITDB_ROOT_PASSWORD: password
volumes:
- ./data:/data/db
ports:
- 27017:27017
```
And then run it using `docker-compose up -d`.

Then create a file called `.env` in the scraper directory with the following content:

```bash
MONGO_USER=user
MONGO_SECRET=password
MONGO_HOST=localhost:27017
```
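
To verify that these values work before running anything else, you can ping the database from Python. This is only a sketch, not part of the repository: it assumes `pymongo` is installed and uses `python-dotenv` to load the `.env` file (if the project does not use `python-dotenv`, export the variables in your shell instead).

```python
# check_mongo.py - quick connectivity check (illustrative sketch, not part of the repo)
import os

from dotenv import load_dotenv  # assumption: python-dotenv is installed
from pymongo import MongoClient

load_dotenv()  # reads MONGO_USER, MONGO_SECRET and MONGO_HOST from .env

uri = (
    f"mongodb://{os.environ['MONGO_USER']}:{os.environ['MONGO_SECRET']}"
    f"@{os.environ['MONGO_HOST']}"
)
client = MongoClient(uri, serverSelectionTimeoutMS=5000)
client.admin.command("ping")  # raises an exception if the server is unreachable
print("MongoDB is reachable")
```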

### 2. Add the SCB inflation data to the database
```bash
python scb.py
```
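
This step populates the `inflation` collection. Judging from `get_inflation` in `model/db.py`, months are keyed using SCB's period format, e.g. `2023M05`. A tiny helper for building such keys, shown purely as an illustration:

```python
def scb_month_key(year: int, month: int) -> str:
    # e.g. (2023, 5) -> "2023M05" and (2023, 12) -> "2023M12"
    return f"{year}M{month:02d}"
```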

### 3. Add the search-terms to the database
```bash
python generate_search_terms.py
```

### 4. Run the scraper

The scraper can be run in multiple ways, see below.

#### 4.1. Run the scraper manually

The data is fetched in a pipeline, with MongoDB keeping track of what needs to be done and what has been done. The scraper will run until all the search-terms have been processed.

The flow of the data is: Search term -> Location IDs -> URLs -> Raw Listing (unparsed) -> Parsed Listing -> Parsed Listing with geocoding (coordinates)

This means you will need to run all the separate scripts in order to get the data from the search terms to the parsed listings with coordinates.

1. Fetch the location IDs from the search terms
```bash
python get_ids.py
```

2. Fetch the URLs from the location IDs
```bash
python get_urls.py
```

3. Fetch the raw listings from the URLs
```bash
python get_listings_raw.py
```

4. Parse the raw listings
```bash
python get_listings_clean.py
```
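
At any point while running these steps you can check how far the pipeline has come by counting documents per collection. The sketch below is illustrative only: the collection names come from `scraper/db.py`, and search terms are written there with `status: "pending"`; whether the other stages use the same status value, and the database name `bostadspriser`, are assumptions — adjust them to your setup.

```python
# progress.py - rough pipeline overview (illustrative sketch, not part of the repo)
import os

from pymongo import MongoClient

client = MongoClient(
    f"mongodb://{os.environ['MONGO_USER']}:{os.environ['MONGO_SECRET']}"
    f"@{os.environ['MONGO_HOST']}"
)
db = client["bostadspriser"]  # assumed database name

# Collections in roughly the order the pipeline fills them.
for name in ["search-terms", "locations", "urls", "listings-raw", "listings"]:
    total = db[name].count_documents({})
    pending = db[name].count_documents({"status": "pending"})
    print(f"{name:<15} total={total:<8} pending={pending}")
```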

#### 4.2. Run the scraper using the bash script

The bash script will run all the parts of the scraper. Since every part will wait if there isn't anything to do, it is safe to run them in parallel.

```bash
./start-all.sh <number of processes>
```

#### 4.3. Run the scraper using systemd

The scraper takes a lot of time to go through the data, so it is convenient to run it as a service. The service can be installed on multiple computers to speed up the process.

The service is called `bostadspriser.service` and can be started with `systemctl start bostadspriser.service`. It runs the bash script `start-all.sh` with the number of processes specified in its `ExecStart` command, so you need to edit the `bostadspriser.service` file to specify the number of processes you want to run.

Then set up the service with the provided script:

```bash
sudo setup-service.sh
```
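
For reference, a minimal unit file could look like the following. This is a hypothetical example, not the `bostadspriser.service` file shipped in the repository; the paths and the process count are placeholders.

```ini
# Hypothetical example only - the real bostadspriser.service may differ.
[Unit]
Description=Bostadspriser scraper
After=network-online.target

[Service]
# Adjust the paths to your checkout; the argument to start-all.sh is the
# number of processes to run.
WorkingDirectory=/opt/bostadspriser/scraper
ExecStart=/opt/bostadspriser/scraper/start-all.sh 4
Restart=always

[Install]
WantedBy=multi-user.target
```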

29 changes: 29 additions & 0 deletions scraper/db.py
@@ -41,14 +41,39 @@ def setup():

global c
c["listings-raw"] = db["listings-raw"]
c["listings-raw"].create_index("coord")
c["listings-raw"].create_index("status")
c["listings-raw"].create_index("url", unique=True)

c["listings"] = db["listings"]
c["listings"].create_index("id", unique=True)
c["listings"].create_index("url", unique=True)

c["listings-live"] = db["listings-live"]
c["listings-live"].create_index("url", unique=True)

c["listings-live-clean"] = db["listings-live-clean"]
c["listings-live-clean"].create_index("url", unique=True)

c["urls"] = db["urls"]
c["urls"].create_index("url", unique=True)
c["urls"].create_index("status")

c["urls-live"] = db["urls-live"]
c["urls-live"].create_index("url", unique=True)
c["urls-live"].create_index("status")

c["locations"] = db["locations"]
c["locations"].create_index("id", unique=True)
c["locations"].create_index("status")

c["search-terms"] = db["search-terms"]
c["search-terms"].create_index("term", unique=True)
c["search-terms"].create_index("status")

c["inflation"] = db["inflation"]
c["inflation"].create_index("id", unique=True)

c["status"] = db["status"]


@@ -227,6 +252,10 @@ def mark_search_terms_as_done(terms: list):


def write_search_terms(terms: list):
# Add status
for term in terms:
term["status"] = "pending"

try:
c["search-terms"].insert_many(terms, ordered=False)
except mongo.errors.BulkWriteError as e:
21 changes: 17 additions & 4 deletions scraper/get_ids.py
@@ -1,14 +1,16 @@
import hemnet
import db
import time

while True:

def main():
print("Getting pending search terms...")
terms = db.get_pending_search_terms()

done = []
for item in terms:
try:
locs = hemnet.get_location_ids()
curr = item["term"]
locs = hemnet.get_location_ids(curr)

print(f"Found {len(locs)} locations for {curr}")
@@ -19,8 +21,19 @@
done.append(curr)

except Exception as e:
print(e)
print(f"Failed to get locations for {curr}")
if "rate limit" in str(e):
print("Rate limit, sleeping for 1 minute...")
time.sleep(60)
else:
print(f"Failed to get locations for {curr}")

print(f"Marking {len(done)} search terms as done...")
db.mark_search_terms_as_done(done)


if __name__ == "__main__":
try:
main()
except KeyboardInterrupt:
print("Exiting...")
exit()
44 changes: 27 additions & 17 deletions scraper/get_listings_raw.py
@@ -2,23 +2,33 @@
import db
import time

i = 0
while True:
urls = db.get_pending_urls(n=100, random=True)
if len(urls) == 0:
print("No more pending urls. Sleeping for 60 seconds...")
time.sleep(60)

print(f"Getting {len(urls)} listings...")
for item in urls:
raw_listing = hemnet.get_single_listing(item["url"])
if raw_listing is None:
print(f"Failed to get {item['url']}")
continue
def main():
i = 0
while True:
urls = db.get_pending_urls(n=100, random=True)
if len(urls) == 0:
print("No more pending urls. Sleeping for 60 seconds...")
time.sleep(60)

raw_listing["url"] = item["url"]
print(f"Done with {item['url']} - iteration {i}")
print(f"Getting {len(urls)} listings...")
for item in urls:
raw_listing = hemnet.get_single_listing(item["url"])
if raw_listing is None:
print(f"Failed to get {item['url']}")
continue

db.write_raw_listing(raw_listing)
db.marks_urls_as_done(urls=[item["url"]])
i += 1
raw_listing["url"] = item["url"]
print(f"Done with {item['url']} - iteration {i}")

db.write_raw_listing(raw_listing)
db.marks_urls_as_done(urls=[item["url"]])
i += 1


if __name__ == "__main__":
try:
main()
except KeyboardInterrupt:
print("Exiting...")
exit()
76 changes: 44 additions & 32 deletions scraper/get_live_clean.py
@@ -165,38 +165,50 @@ def clean_listing(listing_raw):
return output


while True:
raw_listings = db.get_pending_raw_listings(n=5000, random=False, live=True)
if len(raw_listings) == 0:
print("No more live listings to clean. Sleeping for 60 seconds...")
time.sleep(60)
continue

print(f"Cleaning {len(raw_listings)} listings...")

cleaned = []
err_due_to_missing_field = 0
for raw_listing in raw_listings:
try:
listing = clean_listing(raw_listing)
cleaned.append(listing)
except Exception as e:
if "Missing required field" in str(e):
err_due_to_missing_field += 1
db.mark_raw_listing_as_missing_fields(raw_listing["url"], live=True)
def main():
while True:
raw_listings = db.get_pending_raw_listings(
n=5000, random=False, live=True)
if len(raw_listings) == 0:
print("No more live listings to clean. Sleeping for 60 seconds...")
time.sleep(60)
continue

print(f"Cleaning {len(raw_listings)} listings...")

cleaned = []
err_due_to_missing_field = 0
for raw_listing in raw_listings:
try:
listing = clean_listing(raw_listing)
cleaned.append(listing)
except Exception as e:
if "Missing required field" in str(e):
err_due_to_missing_field += 1
db.mark_raw_listing_as_missing_fields(
raw_listing["url"], live=True)
continue

print(
"Failed to clean listing ("
+ str(raw_listing["url"])
+ "), details: "
+ str(e)
)
db.mark_raw_listing_as_failed(raw_listing["url"], live=True)
continue
db.write_listings(cleaned, live=True)
db.mark_raw_listings_as_done([listing["url"]
for listing in cleaned], live=True)

print(
f"Done cleaning {len(cleaned)} listings. {err_due_to_missing_field} listings failed due to missing fields."
)

print(
"Failed to clean listing ("
+ str(raw_listing["url"])
+ "), details: "
+ str(e)
)
db.mark_raw_listing_as_failed(raw_listing["url"], live=True)
continue
db.write_listings(cleaned, live=True)
db.mark_raw_listings_as_done([listing["url"] for listing in cleaned], live=True)

print(
f"Done cleaning {len(cleaned)} listings. {err_due_to_missing_field} listings failed due to missing fields."
)
if __name__ == "__main__":
try:
main()
except KeyboardInterrupt:
print("Exiting...")
exit()