Skip to content

Commit

Permalink
add base for data preparation and model training
Browse files Browse the repository at this point in the history
  • Loading branch information
saffronjam committed Jan 4, 2024
1 parent d992bfd commit bdb35e4
Show file tree
Hide file tree
Showing 11 changed files with 380 additions and 222 deletions.
Binary file added dataset/listings.parquet
Binary file not shown.
109 changes: 0 additions & 109 deletions model/clean.py

This file was deleted.

45 changes: 23 additions & 22 deletions model/db.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
import os
import pymongo as mongo
import datetime
from dotenv import load_dotenv
import os


# Collection dict: setup() stores each MongoDB collection handle here under
# its collection name, so the query helpers can look it up as c["<name>"].
c = {}

# Load variables from a .env file into the environment (python-dotenv);
# setup() reads its connection settings through os.getenv.
load_dotenv()

def setup():
load_dotenv()
print("Setting up database connection...")

def setup():
required_env_vars = [
"MONGO_USER",
"MONGO_SECRET",
Expand All @@ -31,36 +33,35 @@ def setup():
f"mongodb://{db_user}:{db_pass}@{os.getenv('MONGO_HOST')}"
)

client.server_info()

db = client["bostadspriser"]

global c
c["listings-raw"] = db["listings-raw"]
c["listings"] = db["listings"]
c["urls"] = db["urls"]
c["locations"] = db["locations"]
c["search-terms"] = db["search-terms"]
c["inflation"] = db["inflation"]


def get_pending_raw_listings(n: int = 0, page: int = 0, random: bool = False):
    """Return raw listings whose status is "pending".

    Args:
        n: Sample size when ``random`` is true, otherwise the page size.
        page: Zero-based page index; ignored when ``random`` is true.
        random: When true, draw ``n`` pending listings with ``$sample``
            instead of paging through them.

    Returns:
        A list of documents from the "listings-raw" collection. Without
        ``random`` they are ordered by "createdAt", newest first.
    """
    # The collection is registered by setup() as "listings-raw" (hyphen);
    # looking it up with an underscore raised KeyError.
    raw_listings = c["listings-raw"]

    if random:
        res = raw_listings.aggregate(
            [
                {"$match": {"status": "pending"}},
                {"$sample": {"size": n}},
            ]
        )
        return list(res)

    res = (
        raw_listings
        .find({"status": "pending"})
        .sort("createdAt", -1)
        .skip(n * page)
        .limit(n)
    )
    return list(res)


# Connect on import so the helpers below can use the collection dict directly.
setup()

# Read


def get_listings(n: int = 0, page: int = 0):
    """Return one page of documents from the "listings" collection.

    Args:
        n: Page size.
        page: Zero-based page index; ``n * page`` documents are skipped.

    Returns:
        A list with the documents of the requested page.
    """
    res = (
        c["listings"]
        .find({})
        # skip/limit paging needs a deterministic order; without a sort the
        # server may return documents in a different order on each query, so
        # consecutive pages could overlap or miss documents.
        .sort("_id", 1)
        .skip(n * page)
        .limit(n)
    )
    return list(res)

def get_inflation(year: int, month: int):
    """Look up the inflation document for a given year and month.

    The lookup value has the form "<year>M<month>", where a month below 10
    gets a leading zero (year 1980 and month 1 give "1980M01"). It is matched
    against the "id" field of the "inflation" collection.

    Returns:
        The result of ``find_one`` for that id.
    """
    padding = "0" if month < 10 else ""
    lookup_id = f"{year}M{padding}{month}"
    return c["inflation"].find_one({"id": lookup_id})


def get_inflation(key: str):
    """Return the inflation document whose "key" field equals ``key``.

    Args:
        key: Period identifier to look up.

    Returns:
        The result of ``find_one`` on the "inflation" collection.
    """
    # NOTE(review): a get_inflation(year, month) variant in this same diff
    # filters on "id" rather than "key" - confirm which field the collection
    # actually uses and which of the two definitions should remain.
    # The lookup result used to be discarded, so callers always got None.
    return c["inflation"].find_one({"key": key})
49 changes: 10 additions & 39 deletions model/eda.py
Original file line number Diff line number Diff line change
@@ -1,46 +1,17 @@
import db
import datetime
import json
import re
import clean
import pandas as pd

example_listing = {}


# Swedish month names as they appear in "Såld <day> <month> <year>" strings,
# mapped to the month number that datetime expects.
_SWEDISH_MONTHS = {
    "januari": 1,
    "februari": 2,
    "mars": 3,
    "april": 4,
    "maj": 5,
    "juni": 6,
    "juli": 7,
    "augusti": 8,
    "september": 9,
    "oktober": 10,
    "november": 11,
    "december": 12,
}


def parse_date(date: str) -> datetime.datetime:
    """Parse a sale date such as "Såld 19 november 2020".

    Args:
        date: Text starting with "Såld", followed by the day, the Swedish
            month name and the year.

    Returns:
        The parsed date at midnight, or None (after printing a message) when
        the text does not match, the month name is unknown, or the day is not
        valid for that month.
    """
    match = re.match(r"Såld (\d+) (\w+) (\d+)", date)
    # The month arrives as a name; datetime.datetime() needs its number, and
    # passing the name straight through raised TypeError.
    month = _SWEDISH_MONTHS.get(match.group(2).lower()) if match else None
    if month is None:
        print("Could not parse date: ", date)
        return None

    try:
        return datetime.datetime(int(match.group(3)), month, int(match.group(1)))
    except ValueError:
        # Day or year out of range for a real calendar date.
        print("Could not parse date: ", date)
        return None


def get_closest_inflation(date: datetime.datetime):
    """Fetch the inflation record for the year and month of ``date``.

    Args:
        date: The date whose year and month select the record.

    Returns:
        Whatever ``db.get_inflation`` returns for the key "<year>M<month>".
    """
    # Keys look like "1980M01", so the month must be zero-padded; the
    # unpadded form ("1980M1") does not match that format.
    key = f"{date.year}M{date.month:02d}"

    # NOTE(review): this diff also shows a db.get_inflation(year, month)
    # variant that builds the padded key itself - confirm which signature is
    # in effect before relying on this call.
    return db.get_inflation(key)
def load_dataset(name: str) -> pd.DataFrame:
    """Read ``../dataset/<name>.parquet`` into a DataFrame.

    The path is relative, so it is resolved against the current working
    directory of the process.
    """
    parquet_path = f"../dataset/{name}.parquet"
    frame = pd.read_parquet(parquet_path)
    return frame


def main():
    """Load the prepared listings dataset and print a quick summary of it.

    Prints the descriptive statistics, the column/dtype overview and the
    first rows, separated by blank lines.
    """
    # The earlier database-inspection statements are dropped: they called
    # exit() before the summary could run and used the `clean` module, which
    # no longer exists.
    listings = load_dataset("listings")

    print(listings.describe())
    print()
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would add a stray "None" line.
    listings.info()
    print()
    print(listings.head())

# Run the summary only when this file is executed as a script, not on import.
# A single guard is enough; the duplicated line nested one `if` in another.
if __name__ == '__main__':
    main()
50 changes: 50 additions & 0 deletions model/prepare.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import pandas as pd
import db
import transform


def main():
    """Export every transformable listing to ``../dataset/listings.parquet``.

    Listings are fetched from the 'listings' collection one page at a time
    through ``db.get_listings`` until an empty page comes back. Each listing
    is passed to ``transform.transform_listing``; falsy results are skipped.
    The collected rows are written to the parquet file in a single call once
    all pages have been processed.
    """
    print("Loading data from database...")

    print("Preparing the data...")
    transformed = []
    page = 0
    page_size = 10000

    while True:
        listings = db.get_listings(page_size, page)
        if not listings:
            print("No more listings to load")
            break
        print(f"Loaded {len(listings)} listings (page {page}), transforming...")
        page += 1

        for listing in listings:
            res = transform.transform_listing(listing)
            if res:
                transformed.append(res)

    if transformed:
        # Write once, after the last page. The previous in-loop write only
        # fired on the first row of each page, so rows transformed after the
        # final write never reached the file, and it relied on `append=True`,
        # which is an engine-specific keyword rather than a pandas option.
        print(f"Writing {len(transformed)} listings to parquet...")
        pd.DataFrame(transformed).to_parquet('../dataset/listings.parquet')
    else:
        print("No listings to write")

    print("Done preparing the data")


# Script entry point: run the export and treat Ctrl-C as a normal way to stop.
if __name__ == '__main__':
    import sys

    try:
        main()
    except KeyboardInterrupt:
        # Leave quietly instead of dumping a traceback on Ctrl-C.
        print("Exiting...")
        # sys.exit() rather than the site-provided exit(), which is meant for
        # interactive sessions and is not defined when Python runs with -S.
        sys.exit()
4 changes: 3 additions & 1 deletion model/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
pymongo
python_dotenv
scikit-learn
pandas
Loading

0 comments on commit bdb35e4

Please sign in to comment.