Commit

improve first-time experience
saffronjam committed Jan 31, 2024
1 parent bce8616 commit 4ef6829
Showing 13 changed files with 348 additions and 139 deletions.
7 changes: 7 additions & 0 deletions api/db.py
@@ -38,8 +38,15 @@ def setup():

global c
c["listings-live-clean"] = db["listings-live-clean"]
c["listings-live-clean"].create_index("url", unique=True)

c["listings"] = db["listings"]
c["listings"].create_index("id", unique=True)
c["listings"].create_index("url", unique=True)

c["inflation"] = db["inflation"]
c["inflation"].create_index("id", unique=True)

c["predictions"] = db["predictions"]


5 changes: 5 additions & 0 deletions inference/db.py
@@ -38,8 +38,13 @@ def setup():

global c
c["listings"] = db["listings"]
c["listings"].create_index("id", unique=True)
c["listings"].create_index("url", unique=True)

c["predictions"] = db["predictions"]

c["inflation"] = db["inflation"]
c["inflation"].create_index("id", unique=True)


setup()
10 changes: 8 additions & 2 deletions model/db.py
@@ -39,13 +39,18 @@ def setup():

global c
c["listings"] = db["listings"]
c["listings"].create_index("id", unique=True)
c["listings"].create_index("url", unique=True)

c["inflation"] = db["inflation"]
c["inflation"].create_index("id", unique=True)


setup()

# Read



def get_listings(n: int = 0, page: int = 0):
res = (
c["listings"]
@@ -55,6 +60,7 @@ def get_listings(n: int = 0, page: int = 0):
)
return list(res)


def get_inflation(year: int, month: int):
if month < 10:
key = f"{year}M0{month}"
@@ -86,4 +92,4 @@ def get_cpi(date):
if latest is None:
return None

return float(latest["cpiDecided"])
return float(latest["cpiDecided"])
97 changes: 97 additions & 0 deletions scraper/README.md
@@ -0,0 +1,97 @@
# Getting started with the scraper


### 1. Ensure MongoDB is running and can be accessed by the scraper using the required environment variables
All the collections and required indexes will be created automatically.

If you are trying it out locally, you can use the following docker-compose file to start a MongoDB instance.

(Make sure you have a `data` folder in the same directory as the `docker-compose.yml` file)

```yaml
version: '3.1'

services:
mongodb:
image: mongo
environment:
MONGO_INITDB_ROOT_USERNAME: user
MONGO_INITDB_ROOT_PASSWORD: password
volumes:
- ./data:/data/db
ports:
- 27017:27017
```
And then run it using `docker-compose up -d`.

Then create a file called `.env` in the scraper directory with the following content:

```bash
MONGO_USER=user
MONGO_SECRET=password
MONGO_HOST=localhost:27017
```
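
To verify that these values work before running anything else, you can ping the database from Python. This is only a sketch, not part of the repository: it assumes `pymongo` is installed and uses `python-dotenv` to load the `.env` file (if the project does not use `python-dotenv`, export the variables in your shell instead).

```python
# check_mongo.py - quick connectivity check (illustrative sketch, not part of the repo)
import os

from dotenv import load_dotenv  # assumption: python-dotenv is installed
from pymongo import MongoClient

load_dotenv()  # reads MONGO_USER, MONGO_SECRET and MONGO_HOST from .env

uri = (
    f"mongodb://{os.environ['MONGO_USER']}:{os.environ['MONGO_SECRET']}"
    f"@{os.environ['MONGO_HOST']}"
)
client = MongoClient(uri, serverSelectionTimeoutMS=5000)
client.admin.command("ping")  # raises an exception if the server is unreachable
print("MongoDB is reachable")
```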

### 2. Add the SCB inflation data to the database
```bash
python scb.py
```
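
This step populates the `inflation` collection. Judging from `get_inflation` in `model/db.py`, months are keyed using SCB's period format, e.g. `2023M05`. A tiny helper for building such keys, shown purely as an illustration:

```python
def scb_month_key(year: int, month: int) -> str:
    # e.g. (2023, 5) -> "2023M05" and (2023, 12) -> "2023M12"
    return f"{year}M{month:02d}"
```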

### 3. Add the search-terms to the database
```bash
python generate_search_terms.py
```

### 4. Run the scraper

The scraper can be run in multiple ways, see below.

#### 4.1. Run the scraper manually

The data is fetched in a pipeline, with MongoDB keeping track of what needs to be done and what has been done. The scraper will run until all the search-terms have been processed.

The flow of the data is: Search term -> Location IDs -> URLs -> Raw Listing (unparsed) -> Parsed Listing -> Parsed Listing with geocoding (coordinates)

This means you will need to run all the separate scripts in order to get the data from the search terms to the parsed listings with coordinates.

1. Fetch the location IDs from the search terms
```bash
python get_ids.py
```

2. Fetch the URLs from the location IDs
```bash
python get_urls.py
```

3. Fetch the raw listings from the URLs
```bash
python get_listings_raw.py
```

4. Parse the raw listings
```bash
python get_listings_clean.py
```
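
At any point while running these steps you can check how far the pipeline has come by counting documents per collection. The sketch below is illustrative only: the collection names come from `scraper/db.py`, and search terms are written there with `status: "pending"`; whether the other stages use the same status value, and the database name `bostadspriser`, are assumptions — adjust them to your setup.

```python
# progress.py - rough pipeline overview (illustrative sketch, not part of the repo)
import os

from pymongo import MongoClient

client = MongoClient(
    f"mongodb://{os.environ['MONGO_USER']}:{os.environ['MONGO_SECRET']}"
    f"@{os.environ['MONGO_HOST']}"
)
db = client["bostadspriser"]  # assumed database name

# Collections in roughly the order the pipeline fills them.
for name in ["search-terms", "locations", "urls", "listings-raw", "listings"]:
    total = db[name].count_documents({})
    pending = db[name].count_documents({"status": "pending"})
    print(f"{name:<15} total={total:<8} pending={pending}")
```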

#### 4.2. Run the scraper using the bash script

The bash script will run all the parts of the scraper. Since every part will wait if there isn't anything to do, it is safe to run them in parallel.

```bash
./start-all.sh <number of processes>
```

#### 4.3. Run the scraper using systemd

The scraper takes a lot of time to go through the data, so it is convenient to run it as a service. The service can be installed on multiple computers to speed up the process.

The service is called `bostadspriser.service` and can be started with `systemctl start bostadspriser.service`. It runs the bash script `start-all.sh` with the number of processes specified in its `ExecStart` command, so you need to edit the `bostadspriser.service` file to specify the number of processes you want to run.

Then set up the service with the provided script:

```bash
sudo setup-service.sh
```
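
For reference, a minimal unit file could look like the following. This is a hypothetical example, not the `bostadspriser.service` file shipped in the repository; the paths and the process count are placeholders.

```ini
# Hypothetical example only - the real bostadspriser.service may differ.
[Unit]
Description=Bostadspriser scraper
After=network-online.target

[Service]
# Adjust the paths to your checkout; the argument to start-all.sh is the
# number of processes to run.
WorkingDirectory=/opt/bostadspriser/scraper
ExecStart=/opt/bostadspriser/scraper/start-all.sh 4
Restart=always

[Install]
WantedBy=multi-user.target
```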

29 changes: 29 additions & 0 deletions scraper/db.py
@@ -41,14 +41,39 @@ def setup():

global c
c["listings-raw"] = db["listings-raw"]
c["listings-raw"].create_index("coord")
c["listings-raw"].create_index("status")
c["listings-raw"].create_index("url", unique=True)

c["listings"] = db["listings"]
c["listings"].create_index("id", unique=True)
c["listings"].create_index("url", unique=True)

c["listings-live"] = db["listings-live"]
c["listings-live"].create_index("url", unique=True)

c["listings-live-clean"] = db["listings-live-clean"]
c["listings-live-clean"].create_index("url", unique=True)

c["urls"] = db["urls"]
c["urls"].create_index("url", unique=True)
c["urls"].create_index("status")

c["urls-live"] = db["urls-live"]
c["urls-live"].create_index("url", unique=True)
c["urls-live"].create_index("status")

c["locations"] = db["locations"]
c["locations"].create_index("id", unique=True)
c["locations"].create_index("status")

c["search-terms"] = db["search-terms"]
c["search-terms"].create_index("term", unique=True)
c["search-terms"].create_index("status")

c["inflation"] = db["inflation"]
c["inflation"].create_index("id", unique=True)

c["status"] = db["status"]


@@ -227,6 +252,10 @@ def mark_search_terms_as_done(terms: list):


def write_search_terms(terms: list):
# Add status
for term in terms:
term["status"] = "pending"

try:
c["search-terms"].insert_many(terms, ordered=False)
except mongo.errors.BulkWriteError as e:
21 changes: 17 additions & 4 deletions scraper/get_ids.py
@@ -1,14 +1,16 @@
import hemnet
import db
import time

while True:

def main():
print("Getting pending search terms...")
terms = db.get_pending_search_terms()

done = []
for item in terms:
try:
locs = hemnet.get_location_ids()
curr = item["term"]
locs = hemnet.get_location_ids(curr)

print(f"Found {len(locs)} locations for {curr}")
@@ -19,8 +21,19 @@
done.append(curr)

except Exception as e:
print(e)
print(f"Failed to get locations for {curr}")
if "rate limit" in str(e):
print("Rate limit, sleeping for 1 minute...")
time.sleep(60)
else:
print(f"Failed to get locations for {curr}")

print(f"Marking {len(done)} search terms as done...")
db.mark_search_terms_as_done(done)


if __name__ == "__main__":
try:
main()
except KeyboardInterrupt:
print("Exiting...")
exit()
44 changes: 27 additions & 17 deletions scraper/get_listings_raw.py
@@ -2,23 +2,33 @@
import db
import time

i = 0
while True:
urls = db.get_pending_urls(n=100, random=True)
if len(urls) == 0:
print("No more pending urls. Sleeping for 60 seconds...")
time.sleep(60)

print(f"Getting {len(urls)} listings...")
for item in urls:
raw_listing = hemnet.get_single_listing(item["url"])
if raw_listing is None:
print(f"Failed to get {item['url']}")
continue
def main():
i = 0
while True:
urls = db.get_pending_urls(n=100, random=True)
if len(urls) == 0:
print("No more pending urls. Sleeping for 60 seconds...")
time.sleep(60)

raw_listing["url"] = item["url"]
print(f"Done with {item['url']} - iteration {i}")
print(f"Getting {len(urls)} listings...")
for item in urls:
raw_listing = hemnet.get_single_listing(item["url"])
if raw_listing is None:
print(f"Failed to get {item['url']}")
continue

db.write_raw_listing(raw_listing)
db.marks_urls_as_done(urls=[item["url"]])
i += 1
raw_listing["url"] = item["url"]
print(f"Done with {item['url']} - iteration {i}")

db.write_raw_listing(raw_listing)
db.marks_urls_as_done(urls=[item["url"]])
i += 1


if __name__ == "__main__":
try:
main()
except KeyboardInterrupt:
print("Exiting...")
exit()
76 changes: 44 additions & 32 deletions scraper/get_live_clean.py
@@ -165,38 +165,50 @@ def clean_listing(listing_raw):
return output


while True:
raw_listings = db.get_pending_raw_listings(n=5000, random=False, live=True)
if len(raw_listings) == 0:
print("No more live listings to clean. Sleeping for 60 seconds...")
time.sleep(60)
continue

print(f"Cleaning {len(raw_listings)} listings...")

cleaned = []
err_due_to_missing_field = 0
for raw_listing in raw_listings:
try:
listing = clean_listing(raw_listing)
cleaned.append(listing)
except Exception as e:
if "Missing required field" in str(e):
err_due_to_missing_field += 1
db.mark_raw_listing_as_missing_fields(raw_listing["url"], live=True)
def main():
while True:
raw_listings = db.get_pending_raw_listings(
n=5000, random=False, live=True)
if len(raw_listings) == 0:
print("No more live listings to clean. Sleeping for 60 seconds...")
time.sleep(60)
continue

print(f"Cleaning {len(raw_listings)} listings...")

cleaned = []
err_due_to_missing_field = 0
for raw_listing in raw_listings:
try:
listing = clean_listing(raw_listing)
cleaned.append(listing)
except Exception as e:
if "Missing required field" in str(e):
err_due_to_missing_field += 1
db.mark_raw_listing_as_missing_fields(
raw_listing["url"], live=True)
continue

print(
"Failed to clean listing ("
+ str(raw_listing["url"])
+ "), details: "
+ str(e)
)
db.mark_raw_listing_as_failed(raw_listing["url"], live=True)
continue
db.write_listings(cleaned, live=True)
db.mark_raw_listings_as_done([listing["url"]
for listing in cleaned], live=True)

print(
f"Done cleaning {len(cleaned)} listings. {err_due_to_missing_field} listings failed due to missing fields."
)

print(
"Failed to clean listing ("
+ str(raw_listing["url"])
+ "), details: "
+ str(e)
)
db.mark_raw_listing_as_failed(raw_listing["url"], live=True)
continue
db.write_listings(cleaned, live=True)
db.mark_raw_listings_as_done([listing["url"] for listing in cleaned], live=True)

print(
f"Done cleaning {len(cleaned)} listings. {err_due_to_missing_field} listings failed due to missing fields."
)
if __name__ == "__main__":
try:
main()
except KeyboardInterrupt:
print("Exiting...")
exit()