# Import dependencies
import os

import pandas as pd
import pymongo
# ## Extract Victorian Biodiversity Atlas (VBA) fauna data
# Read just the column names from the fauna data CSV
col_names = pd.read_csv("data/VBA_2015_2020.csv", nrows=0).columns
# Explicit data types for the non-string columns
dtypes_dict = {
    "TOTALCOUNT": int,
    "START_YEAR": int,
    "START_MTH": int,
    "END_YEAR": int,
    "END_MTH": int,
    "LONG_DD94": float,
    "LAT_DD94": float
}
# Read in the Victorian fauna CSV covering 2015 to 2020, applying the explicit
# dtypes above and reading every remaining non-date column as a string
fauna_data = pd.read_csv(
    "data/VBA_2015_2020.csv",
    parse_dates=["STARTDATE", "ENDDATE"],
    dtype={**dtypes_dict,
           **{col: str for col in col_names
              if col not in dtypes_dict and col not in ("STARTDATE", "ENDDATE")}})
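# Optional sanity check (illustrative, not part of the pipeline): confirm the
# counts parsed as integers and the survey dates as datetimes.
# print(fauna_data.dtypes[["TOTALCOUNT", "STARTDATE", "LONG_DD94"]])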
# ## Transform VBA fauna data
# Rename the columns to snake_case
fauna_df = fauna_data.rename(columns={
    "RECORD_ID": "record_id",
    "SITE_ID": "site_id",
    "SURVEY_ID": "survey_id",
    "PROJECT_ID": "project_id",
    "TAXON_ID": "taxon_id",
    "SCI_NAME": "sci_name",
    "COMM_NAME": "comm_name",
    "RECORDTYPE": "recordtype",
    "RELIABILTY": "reliability",  # key spelled as in the source data
    "TOTALCOUNT": "totalcount",
    "STARTDATE": "start_date",
    "START_YEAR": "start_year",
    "START_MTH": "start_mth",
    "ENDDATE": "end_date",
    "END_YEAR": "end_year",
    "END_MTH": "end_mth",
    "LOCN_DESC": "location_desc",
    "TAXON_TYPE": "taxon_type",
    "LONG_DD94": "long",
    "LAT_DD94": "lat"})
# Reorder the columns
fauna_df = fauna_df[["record_id", "survey_id", "site_id", "project_id", "taxon_id",
                     "taxon_type", "comm_name", "sci_name", "totalcount", "location_desc",
                     "long", "lat", "end_year", "end_mth", "end_date",
                     "start_year", "start_mth", "start_date", "recordtype", "reliability"]]
# ## Filter VBA fauna data against scraped data
# Read in the web-scraped animal image data
scraped_df = pd.read_csv("data/animal_image_to_merge.csv", dtype="str")
# Extract list of unique animals of interest
species = scraped_df["animal_name"].unique().tolist()
# Filter the fauna data with the species of interest
short_fauna_df = fauna_df[fauna_df["comm_name"].isin(species)]
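# Optional sanity check (illustrative only): report how many of the scraped
# species actually appear in the VBA records.
# print(f"Matched {short_fauna_df['comm_name'].nunique()} of {len(species)} species")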
# Keep only the records with a positive total count, and drop the id, location
# and survey-detail columns that are not needed downstream
final_fauna_df = short_fauna_df[short_fauna_df.totalcount > 0].drop([
    'site_id', 'project_id', 'location_desc', 'end_year', 'end_mth',
    'end_date', 'start_year', 'start_mth', 'recordtype', 'reliability'], axis=1)
# Sort the fauna dataframe by comm_name and start_date
final_fauna_df.sort_values(by=["comm_name", "start_date"], inplace=True)
# Add a new column year_month for aggregation purposes
final_fauna_df["year_month"] = final_fauna_df["start_date"].dt.strftime('%Y-%m')
# ## Filter web-scraped animal image data against VBA fauna data
final_animal_list = final_fauna_df["comm_name"].unique().tolist()
# Filter the web-scraped data down to just those animals
final_scraped_df = scraped_df[scraped_df["animal_name"].isin(final_animal_list)].copy()
# Replace NaN with None so the values serialize as JSON/BSON null rather than NaN
final_scraped_df = final_scraped_df.where(final_scraped_df.notnull(), None)
# ## Load
# Set up the connection string for MongoDB, either remote (MongoDB Atlas) or local
conn = os.environ.get('DATABASE_URL', '') or 'mongodb://localhost:27017'
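# When targeting MongoDB Atlas, DATABASE_URL is expected to hold the full
# connection string, e.g. (placeholder values only):
#   export DATABASE_URL="mongodb+srv://<user>:<password>@<cluster>.mongodb.net"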
# Initialize the PyMongo client
client = pymongo.MongoClient(conn)
# Define database and collections
db = client.animal_visual_db
vba_fauna = db.vba_fauna
scraped_fauna = db.scraped_fauna
# Drop the collections if they already exist so re-runs do not create
# duplicates (drop() is a no-op when the collection is absent)
vba_fauna.drop()
scraped_fauna.drop()
# Load the VBA fauna data into the vba_fauna collection
vba_fauna.insert_many(final_fauna_df.to_dict('records'))
# Load the scraped fauna images and info into the scraped_fauna collection
scraped_fauna.insert_many(final_scraped_df.to_dict('records'))
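# Optional post-load verification (a minimal sketch; the index fields are an
# assumption about likely queries, not something this script requires):
# print(vba_fauna.count_documents({}), scraped_fauna.count_documents({}))
# vba_fauna.create_index([("comm_name", pymongo.ASCENDING),
#                         ("year_month", pymongo.ASCENDING)])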