-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add base for data preparation and model training
- Loading branch information
1 parent
d992bfd
commit bdb35e4
Showing
11 changed files
with
380 additions
and
222 deletions.
There are no files selected for viewing
Binary file not shown.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,46 +1,17 @@ | ||
import db | ||
import datetime | ||
import json | ||
import re | ||
import clean | ||
import pandas as pd | ||
|
||
example_listing = {} | ||
|
||
|
||
# Swedish month names as they appear in "Såld <day> <month> <year>" strings,
# mapped to the month numbers that datetime.datetime() expects.
_SWEDISH_MONTHS = {
    "januari": 1,
    "februari": 2,
    "mars": 3,
    "april": 4,
    "maj": 5,
    "juni": 6,
    "juli": 7,
    "augusti": 8,
    "september": 9,
    "oktober": 10,
    "november": 11,
    "december": 12,
}


def parse_date(date: str) -> "datetime.datetime | None":
    """Parse a sold-date string such as ``"Såld 19 november 2020"``.

    Args:
        date: Text starting with ``Såld <day> <month name> <year>``.

    Returns:
        The corresponding ``datetime.datetime`` (midnight), or ``None`` when
        the text does not match the pattern, the month name is unknown, or
        the day/month/year combination is not a real calendar date. A message
        is printed in every ``None`` case.
    """
    # Såld 19 november 2020
    regex = r"Såld (\d+) (\w+) (\d+)"
    match = re.match(regex, date)

    if match:
        day = int(match.group(1))
        # The captured month is a name, not a number; translate it first.
        month = _SWEDISH_MONTHS.get(match.group(2).lower())
        year = int(match.group(3))

        if month is not None:
            try:
                return datetime.datetime(year, month, day)
            except ValueError:
                # e.g. "31 februari": fall through to the failure report.
                pass

    print("Could not parse date: ", date)
    return None
|
||
|
||
def get_closest_inflation(date: datetime.datetime):
    """Fetch the inflation entry for the year and month of *date*.

    Args:
        date: The point in time whose year and month select the entry.

    Returns:
        Whatever ``db.get_inflation`` returns for the generated key.
    """
    # Keys look like 1980M01, so the month must be zero-padded to two digits.
    period_id = f"{date.year}M{date.month:02d}"

    return db.get_inflation(period_id)
def load_dataset(name: str) -> pd.DataFrame:
    """Read ``../dataset/<name>.parquet`` into a DataFrame.

    Args:
        name: File name without the ``.parquet`` extension.

    Returns:
        The contents of the parquet file as a ``pandas.DataFrame``.
    """
    # The path is relative, so it resolves against the current working directory.
    parquet_path = f"../dataset/{name}.parquet"
    return pd.read_parquet(parquet_path)
|
||
|
||
def main():
    """Load the ``listings`` dataset and print a quick overview of it.

    Prints summary statistics, the column/dtype overview and the first rows,
    separated by blank lines.
    """
    listings = load_dataset("listings")

    print(listings.describe())
    print()
    # DataFrame.info() writes its report itself and returns None, so this
    # also prints a trailing "None" line (kept as in the original).
    print(listings.info())
    print()
    print(listings.head())
|
||
# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
import pandas as pd | ||
import db | ||
import transform | ||
|
||
|
||
def main():
    """Export all transformed listings to ``../dataset/listings.parquet``.

    Listings are fetched page by page with ``db.get_listings`` until an empty
    page comes back. Each listing goes through ``transform.transform_listing``
    and falsy results are skipped. The parquet file is written once, after
    every page has been processed, so it always holds the complete result.
    """
    # Load data from collection 'listings' and create a large parquet file from it
    print("Loading data from database...")

    print("Preparing the data...")
    transformed = []
    page = 0
    page_size = 10000

    while True:
        listings = db.get_listings(page_size, page)
        if len(listings) == 0:
            print("No more listings to load")
            break
        print(f"Loaded {len(listings)} listings (page {page}), transforming...")

        page += 1

        for listing in listings:
            res = transform.transform_listing(listing)
            if not res:
                continue

            transformed.append(res)

        print(f"Done with {len(transformed)} listings so far")

    if transformed:
        # Single write at the end: a per-page overwrite would drop rows, and
        # appending to an existing parquet file is not supported by every
        # pandas parquet engine.
        print(f"Writing {len(transformed)} listings to parquet...")
        pd.DataFrame(transformed).to_parquet('../dataset/listings.parquet')
    else:
        print("No listings were transformed, nothing written")

    print("Done preparing the data")
|
||
|
||
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl+C: leave quietly with a short notice instead of a traceback.
        print("Exiting...")
        # Raise SystemExit directly; the exit() helper is injected by the
        # site module and is intended for interactive sessions.
        raise SystemExit
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,4 @@ | ||
pymongo | ||
python_dotenv | ||
python_dotenv | ||
scikit-learn | ||
pandas |
Oops, something went wrong.