diff --git a/scripts/basic/bol_basic.py b/scripts/basic/bol_basic.py
deleted file mode 100644
index 4cdcba2..0000000
--- a/scripts/basic/bol_basic.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import geopandas as gpd
-import pandas as pd
-import os
-
-from utils import *
-
-
-iso = "BOL"
-bol_ef = pd.read_csv("/Users/heatherbaier/Documents/geo_git/files_for_db/ids/bol_ids.csv")
-bol_ef["adm0"] = iso
-bol_ef["address"] = None
-
-print(bol_ef.head())
-
-coords = pd.read_csv("/Users/heatherbaier/Documents/geo_git/files_for_db/coordinates/bol_coordinates.csv")
-bol_ef = pd.merge(bol_ef, coords, on = "geo_id")
-
-
-bol_ef = bol_ef[["geo_id", "deped_id", "school_name", "address", "adm0", "longitude", "latitude"]]
-
-print(bol_ef.head())
-
-
-longs = bol_ef["longitude"].values
-lats = bol_ef["latitude"].values
-
-# Geocode to ADM levels
-cols = ["geo_id", "deped_id", "school_name", "adm0", "address"]
-for adm in range(1, 4):
-
-    try:
-
-        cols += ["adm" + str(adm)]
-        downloadGB(iso, str(adm), "../../gb")
-        shp = gpd.read_file(getGBpath(iso, f"ADM{str(adm)}", "../../gb"))
-        bol_ef = gpd.GeoDataFrame(bol_ef, geometry = gpd.points_from_xy(bol_ef.longitude, bol_ef.latitude))
-        bol_ef = gpd.tools.sjoin(bol_ef, shp, how = "left").rename(columns = {"shapeName": "adm" + str(adm)})[cols]
-        bol_ef["longitude"] = longs
-        bol_ef["latitude"] = lats
-        print(bol_ef.head())
-
-
-    except Exception as e:
-
-        bol_ef["adm" + str(adm)] = None
-        print(e)
-
-bol_ef = bol_ef[["geo_id", "deped_id", "school_name", "address", "adm0", "adm1", "adm2", "adm3"]]
-
-# bol_ef = bol_ef[cols].drop(["longitude", "latitude"], axis = 1)
-
-bol_ef.to_csv("/Users/heatherbaier/Documents/geo_git/files_for_db/basic/bol_basic.csv", index = False)
-
diff --git a/scripts/basic/tan_basic.py b/scripts/basic/tan_basic.py
deleted file mode 100644
index ce460b5..0000000
--- a/scripts/basic/tan_basic.py
+++ /dev/null
@@ -1,50 +0,0 @@
-import geopandas as gpd
-import pandas as pd
-import os
-
-from utils import *
-
-
-iso = "TZA"
-tan_ef = pd.read_csv("/Users/heatherbaier/Documents/geo_git/files_for_db/ids/tan_ids.csv")
-tan_ef["adm0"] = iso
-tan_ef["address"] = None
-
-print(tan_ef.head())
-
-coords = pd.read_csv("/Users/heatherbaier/Documents/geo_git/files_for_db/coordinates/tan_coordinates.csv")
-tan_ef = pd.merge(tan_ef, coords, on = "geo_id")
-
-print(tan_ef.head())
-
-
-longs = tan_ef["longitude"].values
-lats = tan_ef["latitude"].values
-
-# Geocode to ADM levels
-cols = ["geo_id", "deped_id", "school_name", "adm0", "address"]
-for adm in range(1, 4):
-
-    try:
-
-        cols += ["adm" + str(adm)]
-        downloadGB(iso, str(adm), "../../gb")
-        shp = gpd.read_file(getGBpath(iso, f"ADM{str(adm)}", "../../gb"))
-        tan_ef = gpd.GeoDataFrame(tan_ef, geometry = gpd.points_from_xy(tan_ef.longitude, tan_ef.latitude))
-        tan_ef = gpd.tools.sjoin(tan_ef, shp, how = "left").rename(columns = {"shapeName": "adm" + str(adm)})[cols]
-        tan_ef["longitude"] = longs
-        tan_ef["latitude"] = lats
-        print(tan_ef.head())
-
-
-    except Exception as e:
-
-        tan_ef["adm" + str(adm)] = None
-        print(e)
-
-tan_ef = tan_ef[["geo_id", "deped_id", "school_name", "address", "adm0", "adm1", "adm2", "adm3"]]
-
-# tan_ef = tan_ef[cols].drop(["longitude", "latitude"], axis = 1)
-
-tan_ef.to_csv("/Users/heatherbaier/Documents/geo_git/files_for_db/basic/tan_basic.csv", index = False)
-
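Note: bol_basic.py and tan_basic.py repeat the same download-and-spatial-join loop verbatim. A minimal sketch of a helper that could absorb it (geocode_to_adm is a hypothetical name, not part of the repo; it assumes WGS84 coordinates and the downloadGB/getGBpath utilities below):

    import geopandas as gpd
    import pandas as pd

    from utils import downloadGB, getGBpath


    def geocode_to_adm(df, iso, gb_dir="../../gb", levels=(1, 2, 3)):
        """Tag each row of df (needs longitude/latitude columns) with the
        geoBoundaries shapeName at each requested ADM level."""
        points = gpd.GeoDataFrame(
            df.copy(),
            geometry=gpd.points_from_xy(df.longitude, df.latitude),
            crs="EPSG:4326",  # assumed WGS84; being explicit avoids sjoin CRS warnings
        )
        for adm in levels:
            col = f"adm{adm}"
            try:
                downloadGB(iso, str(adm), gb_dir)
                shp = gpd.read_file(getGBpath(iso, f"ADM{adm}", gb_dir))
                joined = gpd.sjoin(points, shp.to_crs(points.crs), how="left")
                # a point on a shared boundary can match two polygons; keep the first
                points[col] = joined.loc[~joined.index.duplicated(), "shapeName"]
            except Exception as e:
                points[col] = None
                print(e)
        return pd.DataFrame(points.drop(columns="geometry"))

Under those assumptions, each per-country script's loop would reduce to something like bol_ef = geocode_to_adm(bol_ef, "BOL").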
diff --git a/scripts/basic/utils.py b/scripts/basic/utils.py
deleted file mode 100644
index 5e6bb56..0000000
--- a/scripts/basic/utils.py
+++ /dev/null
@@ -1,75 +0,0 @@
-import requests
-import argparse
-import zipfile
-import shutil
-import json
-import os
-
-
-def makeGBdir(iso, base_dir):
-
-    # If the folder already exists, delete it
-    if os.path.isdir(os.path.join(base_dir, iso)):
-        shutil.rmtree(os.path.join(base_dir, iso))
-    else:
-        pass  # nothing to delete
-
-    # Create new folder
-    try:
-        os.mkdir(os.path.join(base_dir, iso))
-    except FileNotFoundError:
-        os.mkdir(base_dir)
-        os.mkdir(os.path.join(base_dir, iso))
-
-    # ...and return the path
-    return os.path.join(base_dir, iso)
-
-
-def downloadGB(iso, adm, base_dir):
-
-    # Create the request URL
-    url = "https://www.geoboundaries.org/api/current/gbOpen/" + iso + "/ADM" + adm
-    print("Making request to: ", url)
-
-    # Make the request to the URL
-    r = requests.get(url)
-    dlPath = r.json()['staticDownloadLink']
-    print("Downloading data from: ", dlPath)
-
-    # Download the zipfile itself
-    r = requests.get(dlPath, allow_redirects=True)
-
-    # Make directory for downloaded zipfolder
-    tmp_dir = makeGBdir(iso, base_dir)
-    print("Downloading data into: ", tmp_dir)
-
-    # Write the downloaded zipfolder to disk
-    open(os.path.join(tmp_dir, "temp.zip"), 'wb').write(r.content)
-
-    # Open the downloaded zipfolder
-    with zipfile.ZipFile(os.path.join(tmp_dir, "temp.zip"), 'r') as zip_ref:
-        zip_ref.extractall(tmp_dir)
-
-    # Grab the name of the second zipfolder
-    # to_open = [i for i in os.listdir(tmp_dir) if i.endswith(".zip") and i.startswith('geo')]
-
-    # # Extract the files from the second zipfolder
-    # with zipfile.ZipFile(os.path.join(tmp_dir, to_open[0]), 'r') as zip_ref:
-    #     zip_ref.extractall(tmp_dir)
-
-    # # Clean up directory
-    # to_delete = [i for i in os.listdir(tmp_dir) if i.endswith(".zip") or i.startswith('geo')]
-    # for i in to_delete:
-    #     os.remove(os.path.join(tmp_dir, i))
-
-    print("Done downloading boundary data.")
-
-
-def getGBpath(iso, adm, base_dir):
-
-    files = os.listdir(os.path.join(base_dir, iso))
-    shp = [_ for _ in files if _.endswith(".shp")][0]
-    fname = os.path.join(base_dir, iso, shp)
-
-    return fname
\ No newline at end of file
diff --git a/scripts/db.zip b/scripts/db.zip
deleted file mode 100644
index f9ddf9f..0000000
Binary files a/scripts/db.zip and /dev/null differ
diff --git a/scripts/ids/generate_arg_ids.py b/scripts/ids/generate_arg_ids.py
deleted file mode 100644
index 10efd5b..0000000
--- a/scripts/ids/generate_arg_ids.py
+++ /dev/null
@@ -1,27 +0,0 @@
-import pandas as pd
-
-#import data
-#the file at ../../data/ARG is a zip archive; download and unzip it first
-arg_table = pd.read_excel("Mae actualizado 2019-09-16_Envios.xls", header=11)
-
-#select only primary and secondary schools
-arg_table = arg_table[(arg_table["Primaria"] == "X") | (arg_table["Secundaria"] == "X") | (arg_table["Secundaria Técnica (INET)"] == "X")]
-arg_table.reset_index(inplace = True)
-
-#select and rename necessary columns
-arg_table = arg_table[["CUE Anexo", "Nombre", "Jurisdicción", "Departamento"]]
-arg_table.columns = ["country_id", "school_name", "adm1", "adm2"]
-
-#generate geo_ids
-arg_table.reset_index(inplace=True)
-arg_table["geo_id"] = arg_table["index"].apply(lambda x: 'ARG-{0:0>6}'.format(x))
-
-#create and reorder columns
-arg_table["address"] = None
-arg_table["adm0"] = "ARG"
-arg_table["adm3"] = None
-
-arg_table = arg_table[["geo_id", "country_id", "school_name", "address", "adm0", "adm1", "adm2", "adm3"]]
-
-#export as csv
-arg_table.to_csv("../../files_for_db/ids/arg_ids.csv", index=False)
\ No newline at end of file
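Note: a typical call sequence through the utils.py helpers deleted above, with a timeout and an explicit status check that the originals omit (illustrative paths, not repo code; geoBoundaries shapefiles carry a shapeName column):

    import geopandas as gpd
    import requests

    from utils import downloadGB, getGBpath

    iso, adm = "ARG", "1"

    # quick pre-check: fail loudly here instead of KeyError'ing inside downloadGB
    api = f"https://www.geoboundaries.org/api/current/gbOpen/{iso}/ADM{adm}"
    r = requests.get(api, timeout=30)
    r.raise_for_status()

    downloadGB(iso, adm, "../../gb")  # fetch and unzip the boundary archive
    shp = gpd.read_file(getGBpath(iso, f"ADM{adm}", "../../gb"))
    print(shp[["shapeName"]].head())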
diff --git a/scripts/ids/generate_bhr_ids.py b/scripts/ids/generate_bhr_ids.py
deleted file mode 100644
index 7c851ae..0000000
--- a/scripts/ids/generate_bhr_ids.py
+++ /dev/null
@@ -1,48 +0,0 @@
-from pandas.io.json import json_normalize
-import geopandas as gpd
-import pandas as pd
-
-from utils import *
-
-
-bhr_ef = pd.read_csv("../../data/BHR/bahrain_school_locations.csv")
-bhr_ef = bhr_ef[bhr_ef["SUBTYPE EN"].isin(["KINDERGARTEN", "PUBLIC SCHOOLS - BOYS", "PUBLIC SCHOOLS - GIRLS"])]
-bhr_ef = bhr_ef[["NAME", "#", "POINT_X_Longitude", "POINT_Y_Latitude"]]
-print(bhr_ef.shape)
-
-bhr_ef = bhr_ef.drop_duplicates(subset = ["POINT_X_Longitude", "POINT_Y_Latitude"])
-print(bhr_ef.shape)
-
-bhr_ef = bhr_ef.reset_index()
-bhr_ef['geo_id'] = bhr_ef['index'].apply(lambda x: 'BHR-{0:0>6}'.format(x))
-bhr_ef = bhr_ef.drop(["index"], axis = 1)
-bhr_ef = bhr_ef[["geo_id", "#", "NAME", "POINT_X_Longitude", "POINT_Y_Latitude"]].rename(columns = {"#": "deped_id", "NAME": "school_name", "POINT_X_Longitude": "longitude", "POINT_Y_Latitude": "latitude"})
-bhr_ef["address"] = None
-bhr_ef["adm0"] = "BHR"
-print(bhr_ef.head())
-
-
-
-# Geocode to ADM levels
-cols = ["geo_id", "deped_id", "school_name", "longitude", "latitude", "address", "adm0"]
-for adm in range(1, 4):
-
-    try:
-
-        cols += ["adm" + str(adm)]
-        downloadGB("BHR", str(adm), "../../gb")
-        shp = gpd.read_file(getGBpath("BHR", str(adm), "../../gb"))
-        bhr_ef = gpd.GeoDataFrame(bhr_ef, geometry = gpd.points_from_xy(bhr_ef.longitude, bhr_ef.latitude))
-        bhr_ef = gpd.tools.sjoin(bhr_ef, shp, how = "left").rename(columns = {"shapeName": "adm" + str(adm)})[cols]
-        print(bhr_ef)
-
-    except Exception as e:
-
-        bhr_ef["adm" + str(adm)] = None
-        print(e)
-
-bhr_ef = bhr_ef[cols].drop(["longitude", "latitude"], axis = 1)
-
-print(bhr_ef.head())
-
-bhr_ef.to_csv("../../files_for_db/ids/bhr_ids.csv", index = False)
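Note: the 'BHR-{0:0>6}'.format(x) pattern used throughout this directory zero-pads the row index to six digits; the f-string equivalent, shown on a toy frame:

    import pandas as pd

    df = pd.DataFrame({"school_name": ["A", "B"]}).reset_index()
    df["geo_id"] = df["index"].apply(lambda x: f"BHR-{x:06d}")
    print(df["geo_id"].tolist())  # ['BHR-000000', 'BHR-000001']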
diff --git a/scripts/ids/generate_bol_ids.py b/scripts/ids/generate_bol_ids.py
deleted file mode 100644
index bc02d68..0000000
--- a/scripts/ids/generate_bol_ids.py
+++ /dev/null
@@ -1,58 +0,0 @@
-import geopandas as gpd
-import pandas as pd
-
-from utils import *
-
-
-bol_ef = gpd.read_file("../../data/BOL/shp/EstabEducativos/EstabEducativos.shp")
-bol_ef = bol_ef[["gml_id", "POINT_X", "POINT_Y"]].rename(columns = {"POINT_Y": "latitude", "POINT_X": "longitude"})
-print(bol_ef.head())
-
-print(bol_ef.shape)
-bol_ef = bol_ef.drop_duplicates(subset = ["gml_id"])
-bol_ef = bol_ef.drop_duplicates(subset = ["latitude", "longitude"])
-
-print(bol_ef.shape)
-
-print(bol_ef.head())
-
-
-# bol_ef = bol_ef.reset_index()
-# bol_ef['geo_id'] = bol_ef['index'].apply(lambda x: 'BOL-{0:0>6}'.format(x))
-# bol_ef["school_name"] = None
-# bol_ef = bol_ef[["geo_id", "gml_id", "school_name", "latitude", "longitude"]].rename(columns = {"gml_id": "deped_id"})
-# bol_ef["address"] = None
-# bol_ef["adm0"] = "BOL"
-# print(bol_ef.head())
-
-# # print(bol_ef["geo_id"].value_counts())
-
-
-
-# # Geocode to ADM levels
-# cols = ["geo_id", "deped_id", "school_name", "longitude", "latitude", "address", "adm0"]
-# for adm in range(1, 4):
-
-#     try:
-
-#         cols += ["adm" + str(adm)]
-#         downloadGB("BOL", str(adm), "../../gb")
-#         shp = gpd.read_file(getGBpath("BOL", str(adm), "../../gb"))
-#         bol_ef = gpd.GeoDataFrame(bol_ef, geometry = gpd.points_from_xy(bol_ef.longitude, bol_ef.latitude))
-#         bol_ef = gpd.tools.sjoin(bol_ef, shp, how = "left").rename(columns = {"shapeName": "adm" + str(adm)})[cols]
-#         print(bol_ef)
-
-#     except Exception as e:
-
-#         bol_ef["adm" + str(adm)] = None
-#         print(e)
-
-# bol_ef = bol_ef[cols].drop(["longitude", "latitude"], axis = 1)
-
-# print(bol_ef.head())
-
-# print(bol_ef.shape)
-
-
-
-bol_ef.to_csv("../../files_for_db/ids/bol_ids.csv", index = False)
diff --git a/scripts/ids/generate_cri_ids.py b/scripts/ids/generate_cri_ids.py
deleted file mode 100644
index 0f4363a..0000000
--- a/scripts/ids/generate_cri_ids.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import pandas as pd
-
-#import data from github
-cri_table1 = pd.read_excel("../../data/CRI/MATRICULA_INICIAL_COLEGIOS_2014-2021_POR_AÑO_CURSADO_Y_SEXO.xlsx", header=2)
-cri_table2 = pd.read_excel("../../data/CRI/MATRICULA_INICIAL_ESCUELAS_DIURNAS_2014-2021_POR_AÑO_CURSADO_Y_SEXO.xlsx", header=2)
-
-#concatenate the two tables
-cri_table1 = cri_table1[["NOMBRE", "PROVINCIA", "CANTON", "DISTRITO"]]
-cri_table2 = cri_table2[["NOMBRE", "PROVINCIA", "CANTON", "DISTRITO"]]
-cri_table = pd.concat([cri_table1, cri_table2])
-
-#drop duplicate entries
-#duplicates are common because this data covers multiple school years
-cri_table.drop_duplicates(inplace=True)
-cri_table.reset_index(inplace=True, drop=True)
-
-#rename columns
-cri_table.columns = ["school_name", "adm1", "adm2", "adm3"]
-
-#create geo_ids
-cri_table.reset_index(inplace=True)
-cri_table["geo_id"] = cri_table["index"].apply(lambda x: 'CRI-{0:0>6}'.format(x))
-
-#final additions and cleaning
-cri_table["country_id"] = None
-cri_table["address"] = None
-cri_table["adm0"] = "CRI"
-cri_table = cri_table[["geo_id", "country_id", "school_name", "address", "adm0", "adm1", "adm2", "adm3"]]
-
-#export as csv
-cri_table.to_csv("../../files_for_db/ids/cri_ids.csv", index=False)
\ No newline at end of file
diff --git a/scripts/ids/generate_ecu_ids.py b/scripts/ids/generate_ecu_ids.py
deleted file mode 100644
index 7443b2f..0000000
--- a/scripts/ids/generate_ecu_ids.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import pandas as pd
-from utils import *
-
-ecu_table = pd.read_csv("MINEDUC_RegistrosAdministrativos_2021-2022-Fin.csv", header=None) #import data
-ecu_table.columns = ecu_table.iloc[11] #add column names
-ecu_table.drop(ecu_table.index[:12], inplace=True) #drop the rows at the beginning of the dataset that just contain metadata
-#reset index and create an index column for use in geo_id
-ecu_table.reset_index(inplace=True)
-ecu_table.drop(columns=["index"], inplace=True)
-ecu_table.reset_index(inplace=True)
-
-#create new dataframe with just the necessary information
-ecu_ids_table = ecu_table[["index","AMIE", "Nombre Institución", "Provincia", "Cantón", "Parroquia"]]
-ecu_ids_table = ecu_ids_table.rename_axis(None, axis=1) #drop the axis name left over from promoting the header row
-ecu_ids_table.columns = ["index", "deped_id", "school_name", "adm1", "adm2", "adm3"] #rename columns
-#change data from all caps to title case
-ecu_ids_table["school_name"] = ecu_ids_table["school_name"].str.title()
-ecu_ids_table["adm1"] = ecu_ids_table["adm1"].str.title()
-ecu_ids_table["adm2"] = ecu_ids_table["adm2"].str.title()
-ecu_ids_table["adm3"] = ecu_ids_table["adm3"].str.title()
-#add columns that are the same for all rows
-ecu_ids_table["address"] = None
-ecu_ids_table["adm0"] = "ECU"
-ecu_ids_table['geo_id'] = ecu_ids_table['index'].apply(lambda x: 'ECU-{0:0>6}'.format(x)) #add geo_id based on index
-ecu_ids_table = ecu_ids_table[["geo_id", "deped_id", "school_name", "address", "adm0", "adm1", "adm2", "adm3"]] #keep and order the final columns
-
-#export as csv
-ecu_ids_table.to_csv("../../files_for_db/ids/ecu_ids.csv", index = False)
\ No newline at end of file
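Note: generate_ecu_ids.py reads the CSV headerless, promotes row 11 to column names by hand, and then drops the metadata rows. pandas can do all of that at read time (a sketch, assuming the same 11 leading metadata rows):

    import pandas as pd

    # skip the 11 metadata rows; the next row becomes the header
    ecu_table = pd.read_csv(
        "MINEDUC_RegistrosAdministrativos_2021-2022-Fin.csv",
        skiprows=11,
        header=0,
    )
    print(ecu_table.columns[:3])

This also makes the rename_axis(None, axis=1) cleanup unnecessary, since no column-axis name is inherited from a data row.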
diff --git a/scripts/ids/generate_guy_ids.py b/scripts/ids/generate_guy_ids.py
deleted file mode 100644
index cc654de..0000000
--- a/scripts/ids/generate_guy_ids.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import pandas as pd
-from utils import *
-
-#import data
-guy_table = pd.read_excel("../../data/GUY/National list of Schools (1).xlsx")
-
-#add region name as adm1
-#GT stands for Georgetown, the capital of Guyana, and it is in region 4
-guy_table["adm1"] = guy_table["REGION"].str.replace("01", "Barima-Waini").replace("02", "Pomeroon-Supenaam").replace("03", "Essequibo Islands-West Demerara").replace(["04", "GT"], "Demerara-Mahaica").replace("05", "Mahaica-Berbice").replace("06", "East Berbice-Corentyne").replace("07", "Cuyuni-Mazaruni").replace("08", "Potaro-Siparuni").replace("09", "Upper Takutu-Upper Essequibo").replace("10", "Upper Demerara-Berbice")
-
-#format strings in columns
-guy_table["SCHOOL NAME"] = guy_table["SCHOOL NAME"].str.title()
-guy_table["ADDRESS"] = guy_table["ADDRESS"].str.title()
-
-#add geo_ids
-guy_table.reset_index(inplace=True)
-guy_table['geo_id'] = guy_table['index'].apply(lambda x: 'GUY-{0:0>6}'.format(x))
-
-#create table of only the useful columns from the original data
-guy_ids_table = guy_table[["geo_id", "SCHOOL NAME", "ADDRESS", "adm1"]].copy()
-
-#add, rename, and sort columns
-guy_ids_table["deped_id"] = None
-guy_ids_table["adm0"] = "GUY"
-guy_ids_table["adm2"] = None
-guy_ids_table["adm3"] = None
-guy_ids_table.columns = ["geo_id", "school_name", "address", "adm1", "deped_id", "adm0", "adm2", "adm3"]
-guy_ids_table = guy_ids_table[["geo_id", "deped_id", "school_name", "address", "adm0", "adm1", "adm2", "adm3"]]
-
-#export as csv
-guy_ids_table.to_csv("../../files_for_db/ids/guy_ids.csv", index = False)
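Note: the chained .replace() calls above are easy to get out of sync; a dictionary plus Series.map is a more auditable way to expand the region codes (sketch on a toy frame; GT maps to region 4 as the original comment states):

    import pandas as pd

    REGION_NAMES = {
        "01": "Barima-Waini",
        "02": "Pomeroon-Supenaam",
        "03": "Essequibo Islands-West Demerara",
        "04": "Demerara-Mahaica",
        "GT": "Demerara-Mahaica",  # Georgetown sits in region 4
        "05": "Mahaica-Berbice",
        "06": "East Berbice-Corentyne",
        "07": "Cuyuni-Mazaruni",
        "08": "Potaro-Siparuni",
        "09": "Upper Takutu-Upper Essequibo",
        "10": "Upper Demerara-Berbice",
    }

    guy_table = pd.DataFrame({"REGION": ["01", "GT", "10"]})
    guy_table["adm1"] = guy_table["REGION"].map(REGION_NAMES)
    print(guy_table)

Unmapped codes come back as NaN, which doubles as a data-quality check.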
diff --git a/scripts/ids/generate_ken_ids.py b/scripts/ids/generate_ken_ids.py
deleted file mode 100644
index 3442169..0000000
--- a/scripts/ids/generate_ken_ids.py
+++ /dev/null
@@ -1,78 +0,0 @@
-import pandas as pd
-from utils import *
-
-#import data and trim the wrapper so the json file reads cleanly into a dataframe
-with open("../../data/KEN/schools.json") as schools_raw:
-    schools_raw = schools_raw.read()[96:-1]
-ken_table = pd.read_json(schools_raw)
-
-#extract info from the property dictionaries to create columns
-ken_table["adm1"] = None
-ken_table["adm2"] = None
-ken_table["adm3"] = None
-ken_table["school_name"] = None
-
-for i in range(len(ken_table)):
-    ken_table.loc[i, "adm1"] = ken_table["properties"].iloc[i]["County"].title()
-    ken_table.loc[i, "adm2"] = ken_table["properties"].iloc[i]["SUB_COUNTY"]
-    ken_table.loc[i, "adm3"] = ken_table["properties"].iloc[i]["Ward"].title()
-    if ken_table.loc[i, "adm3"] == " ":
-        ken_table.loc[i, "adm3"] = None
-    ken_table.loc[i, "school_name"] = ken_table["properties"].iloc[i]["SCHOOL_NAM"].title()
-
-#clean adm3 inconsistencies
-ken_table["adm3"] = ken_table["adm3"].str.replace('   ', ' ')
-ken_table["adm3"] = ken_table["adm3"].str.replace('  ', ' ')
-ken_table["adm3"] = ken_table["adm3"].str.replace('\\', '/', regex=False)
-ken_table["adm3"] = ken_table["adm3"].str.replace('-', '/')
-ken_table["adm3"] = ken_table["adm3"].str.replace(' / ', '/')
-ken_table["adm3"] = ken_table["adm3"].str.replace('/ ', '/')
-ken_table["adm3"] = ken_table["adm3"].str.replace(' /', '/')
-ken_table["adm3"] = ken_table["adm3"].str.replace('\n', '')
-ken_table["adm3"] = ken_table["adm3"].str.replace("’", "'")
-#these are inconsistencies I found by hand based on geoBoundaries adm3 names
-ken_table["adm3"] = ken_table["adm3"].str.replace('Baba Dogo', 'Babadogo')
-ken_table["adm3"] = ken_table["adm3"].str.replace('Bassi', 'Bobasi')
-ken_table["adm3"] = ken_table["adm3"].str.replace('Basi', 'Bobasi')
-ken_table["adm3"] = ken_table["adm3"].str.replace('Bobasi/Boitangare', 'Bobasi Boitangare')
-ken_table["adm3"] = ken_table["adm3"].str.replace('Walatsi', 'Waltsi')
-ken_table["adm3"] = ken_table["adm3"].str.replace('Centrl', 'Central')
-ken_table["adm3"] = ken_table["adm3"].str.replace('Kimathi', 'Kimanthi')
-ken_table["adm3"] = ken_table["adm3"].str.replace('Lenkism', 'Lenkisim')
-ken_table["adm3"] = ken_table["adm3"].str.replace('Oo Nkidong', 'Oonkidong')
-ken_table["adm3"] = ken_table["adm3"].str.replace("Kachieng'", "Kachien'G")
-ken_table["adm3"] = ken_table["adm3"].str.replace('Lakezone', 'Lake Zone')
-ken_table["adm3"] = ken_table["adm3"].str.replace('Loiyamorok', 'Loiyamorock')
-ken_table["adm3"] = ken_table["adm3"].str.replace('Mackinon Road', 'Mackinnon Road')
-ken_table["adm3"] = ken_table["adm3"].str.replace('Maji Moto', 'Majimoto')
-ken_table["adm3"] = ken_table["adm3"].str.replace('Malaha/Isongo/Makun Ga', 'Isongo/Makunga/Malaha')
-ken_table["adm3"] = ken_table["adm3"].str.replace('Malaha/Isongo/Makunga', 'Isongo/Makunga/Malaha')
-ken_table["adm3"] = ken_table["adm3"].str.replace("Manyatta B", "Manyatta 'B'")
-ken_table["adm3"] = ken_table["adm3"].str.replace('Muhoroni Koru', 'Muhoroni/Koru')
-ken_table["adm3"] = ken_table["adm3"].str.replace('Mutitu', 'Mutito')
-ken_table["adm3"] = ken_table["adm3"].str.replace('Muvau/Kikumini', 'Muvau/Kikuumini')
-ken_table["adm3"] = ken_table["adm3"].str.replace('Namboboto/Nambuku', 'Namboboto Nambuku')
-ken_table["adm3"] = ken_table["adm3"].str.replace('Naromoru/Kiamathaga', 'Naromoru Kiamathaga')
-ken_table["adm3"] = ken_table["adm3"].str.replace("Nyalenda A", "Nyalenda 'A'")
-ken_table["adm3"] = ken_table["adm3"].str.replace("Sarang'Ombe", "Sarangombe")
-ken_table["adm3"] = ken_table["adm3"].str.replace('Shauri Moyo', 'Shaurimoyo')
-ken_table["adm3"] = ken_table["adm3"].str.replace('Tulwet/Chiyat', 'Tulwet/Chuiyat')
-ken_table["adm3"] = ken_table["adm3"].str.replace("Wang'Chieng", "Wangchieng")
-ken_table["adm3"] = ken_table["adm3"].str.replace('Ingostse/Mathia', 'Ingostse-Mathia')
-ken_table["adm3"] = ken_table["adm3"].str.replace('Ingotse/Matiha', 'Ingostse-Mathia')
-
-#create specific table for ids with only necessary columns
-ken_table_ids = ken_table[["id", "adm1", "adm2", "adm3", "school_name"]].copy()
-
-#create geoIDs
-ken_table_ids.reset_index(inplace=True)
-ken_table_ids["geo_id"] = ken_table_ids["index"].apply(lambda x: 'KEN-{0:0>6}'.format(x))
-
-#fill in remaining columns and reorder
-ken_table_ids["address"] = None
-ken_table_ids["adm0"] = "KEN"
-ken_table_ids["country_id"] = ken_table_ids["id"]
-ken_table_ids = ken_table_ids[["geo_id", "country_id", "school_name", "address", "adm0", "adm1", "adm2", "adm3"]]
-
-#export as csv
-ken_table_ids.to_csv("../../files_for_db/ids/ken_ids.csv", index=False)
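Note: the row-by-row loop in generate_ken_ids.py can be replaced by flattening the property dicts in one pass with pandas.json_normalize (the modern import path, rather than the deprecated pandas.io.json one imported in generate_bhr_ids.py). A sketch on toy records:

    import pandas as pd

    # toy stand-in for the GeoJSON-style records in ../../data/KEN/schools.json
    ken_table = pd.DataFrame({"properties": [
        {"County": "NAIROBI", "SUB_COUNTY": "Westlands", "Ward": "KITISURU", "SCHOOL_NAM": "SCHOOL A"},
        {"County": "KISUMU", "SUB_COUNTY": "Nyando", "Ward": " ", "SCHOOL_NAM": "SCHOOL B"},
    ]})

    props = pd.json_normalize(ken_table["properties"])
    ken_table["adm1"] = props["County"].str.title()
    ken_table["adm2"] = props["SUB_COUNTY"]
    ken_table["adm3"] = props["Ward"].str.title().replace({" ": None})  # blank wards -> None
    ken_table["school_name"] = props["SCHOOL_NAM"].str.title()
    print(ken_table[["adm1", "adm2", "adm3", "school_name"]])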
"longitude", "latitude"]] -nig_ef = nig_ef.rename(columns = {"facility_name":"school_name"}) -nig_ef["address"] = None -nig_ef["adm0"] = "NGA" - -print(nig_ef.head()) - -print(nig_ef.shape) - - - -# agfdag - -# Geocode to ADM levels -cols = ["geo_id", "deped_id", "school_name", "longitude", "latitude", "address", "adm0"] -for adm in range(1, 4): - - try: - - cols += ["adm" + str(adm)] - downloadGB("NGA", str(adm), "../../gb") - shp = gpd.read_file(getGBpath("NGA", str(adm), "../../gb")) - nig_ef = gpd.GeoDataFrame(nig_ef, geometry = gpd.points_from_xy(nig_ef.longitude, nig_ef.latitude)) - nig_ef = gpd.tools.sjoin(nig_ef, shp, how = "left").rename(columns = {"shapeName": "adm" + str(adm)})[cols] - print(nig_ef) - - except Exception as e: - - nig_ef["adm" + str(adm)] = None - print(e) - - -nig_ef = nig_ef[cols].drop(["longitude", "latitude"], axis = 1) - -print(nig_ef) - -print(nig_ef.shape) - -nig_ef.to_csv("../../files_for_db/ids/nga_ids.csv", index = False) diff --git a/scripts/ids/generate_npl_ids.py b/scripts/ids/generate_npl_ids.py deleted file mode 100644 index c3fbd00..0000000 --- a/scripts/ids/generate_npl_ids.py +++ /dev/null @@ -1,29 +0,0 @@ -import pandas as pd -import numpy as np -from utils import * - -#import data and combine into one file -npl_table_raw1 = pd.read_csv("../../data/NPL/School Performance for the year 2062 BS.csv") -npl_table_raw2 = pd.read_csv("../../data/NPL/School Performance for the year 2063 BS.csv") - -npl_table_raw1 = npl_table_raw1[["District", "Zone", "School Code", "Name of School"]] -npl_table_raw2 = npl_table_raw2[["District", "Zone", "School Code", "Name of School"]] - -npl_table_raw = pd.concat([npl_table_raw1, npl_table_raw2]) - -npl_table = npl_table_raw.drop_duplicates().reset_index() - -#create new index column -npl_table.drop("index", axis=1, inplace=True) -npl_table.reset_index(inplace=True) - -#rename, delete, and add columns -npl_table.rename(columns = {"Zone":"adm1", "District":"adm2", "School Code":"deped_id", "Name of School":"school_name"}, inplace=True) -npl_table['geo_id'] = npl_table['index'].apply(lambda x: 'NPL-{0:0>6}'.format(x)) -npl_table["address"] = None -npl_table["adm0"] = "NGL" -npl_table["adm3"] = None -npl_table = npl_table[["geo_id", "deped_id", "school_name", "address", "adm0", "adm1", "adm2", "adm3"]] - -#export as csv -npl_table.to_csv("../../files_for_db/ids/npl_ids.csv", index = False) diff --git a/scripts/ids/generate_per_ids.py b/scripts/ids/generate_per_ids.py deleted file mode 100644 index 3f3891f..0000000 --- a/scripts/ids/generate_per_ids.py +++ /dev/null @@ -1,41 +0,0 @@ -import geopandas as gpd -import pandas as pd - -from utils import * - -per_ef = pd.read_csv("../../data/PER/Relación de instituciones y programas educativos.csv") -print(per_ef.columns) -per_ef = per_ef.drop_duplicates(subset = ["cod_mod"]) -per_ef['cen_edu'] = per_ef['cen_edu'].str.replace('\d+', '') -per_ef = per_ef[["cod_mod", "cen_edu", "nlong_ie", "nlat_ie"]] -per_ef = per_ef.reset_index() -per_ef['geo_id'] = per_ef['index'].apply(lambda x: 'PER-{0:0>6}'.format(x)) -per_ef = per_ef.drop(["index"], axis = 1) -per_ef = per_ef[["geo_id", "cod_mod", "cen_edu", "nlong_ie", "nlat_ie"]].rename(columns = {"cod_mod": "deped_id", "cen_edu": "school_name", "nlong_ie": "longitude", "nlat_ie": "latitude"}) -per_ef["address"] = None -per_ef["adm0"] = "PER" -print(per_ef.head()) - -# Geocode to ADM levels -cols = ["geo_id", "deped_id", "school_name", "longitude", "latitude", "address", "adm0"] -for adm in range(1, 4): - - try: - - cols += ["adm" + 
diff --git a/scripts/ids/generate_per_ids.py b/scripts/ids/generate_per_ids.py
deleted file mode 100644
index 3f3891f..0000000
--- a/scripts/ids/generate_per_ids.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import geopandas as gpd
-import pandas as pd
-
-from utils import *
-
-per_ef = pd.read_csv("../../data/PER/Relación de instituciones y programas educativos.csv")
-print(per_ef.columns)
-per_ef = per_ef.drop_duplicates(subset = ["cod_mod"])
-per_ef['cen_edu'] = per_ef['cen_edu'].str.replace(r'\d+', '', regex=True)
-per_ef = per_ef[["cod_mod", "cen_edu", "nlong_ie", "nlat_ie"]]
-per_ef = per_ef.reset_index()
-per_ef['geo_id'] = per_ef['index'].apply(lambda x: 'PER-{0:0>6}'.format(x))
-per_ef = per_ef.drop(["index"], axis = 1)
-per_ef = per_ef[["geo_id", "cod_mod", "cen_edu", "nlong_ie", "nlat_ie"]].rename(columns = {"cod_mod": "deped_id", "cen_edu": "school_name", "nlong_ie": "longitude", "nlat_ie": "latitude"})
-per_ef["address"] = None
-per_ef["adm0"] = "PER"
-print(per_ef.head())
-
-# Geocode to ADM levels
-cols = ["geo_id", "deped_id", "school_name", "longitude", "latitude", "address", "adm0"]
-for adm in range(1, 4):
-
-    try:
-
-        cols += ["adm" + str(adm)]
-        downloadGB("PER", str(adm), "../../gb")
-        shp = gpd.read_file(getGBpath("PER", str(adm), "../../gb"))
-        per_ef = gpd.GeoDataFrame(per_ef, geometry = gpd.points_from_xy(per_ef.longitude, per_ef.latitude))
-        per_ef = gpd.tools.sjoin(per_ef, shp, how = "left").rename(columns = {"shapeName": "adm" + str(adm)})[cols]
-        print(per_ef)
-
-    except Exception as e:
-
-        per_ef["adm" + str(adm)] = None
-        print(e)
-
-per_ef = per_ef[cols].drop(["longitude", "latitude"], axis = 1)
-
-print(per_ef.head())
-
-per_ef.to_csv("../../files_for_db/ids/per_ids.csv", index = False)
diff --git a/scripts/ids/generate_phl_ids.py b/scripts/ids/generate_phl_ids.py
deleted file mode 100644
index 1c4dcca..0000000
--- a/scripts/ids/generate_phl_ids.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import geopandas as gpd
-import pandas as pd
-
-from utils import *
-
-
-phl_ef = pd.read_csv("../../data/PHL/this_one.csv")
-print(phl_ef.columns)
-phl_ef = phl_ef[["school_id", "school_name", "longitude", "latitude", "region", "division", "province"]]
-phl_ef = phl_ef.drop_duplicates(subset = ["school_id"])
-phl_ef = phl_ef.reset_index()
-phl_ef['geo_id'] = phl_ef['index'].apply(lambda x: 'PHL-{0:0>6}'.format(x))
-phl_ef["adm0"] = "PHL"
-phl_ef["address"] = None
-phl_ef = phl_ef[["geo_id", "school_id", "school_name", "address", "adm0", "region", "division", "province"]].rename(columns = {"school_id": "deped_id", "region": "adm1", "division":"adm2", "province": "adm3"})
-phl_ef = phl_ef[["geo_id","deped_id","school_name","address","adm0","adm1","adm2","adm3"]]
-
-print(phl_ef.head())
-
-phl_ef.to_csv("../../files_for_db/ids/phl_ids.csv", index = False)
\ No newline at end of file
diff --git a/scripts/ids/generate_pry_ids.py b/scripts/ids/generate_pry_ids.py
deleted file mode 100644
index b4e741a..0000000
--- a/scripts/ids/generate_pry_ids.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import pandas as pd
-from utils import *
-
-#import data
-pry_ids = pd.read_csv("../../data/PRY/pry_coords.csv")
-
-#select and rename columns
-pry_ids = pry_ids[["codigo_est", "nombre_dep", "nombre_dis", "direccion"]]
-pry_ids.rename(columns = {"codigo_est":"deped_id", "nombre_dep":"adm1", "nombre_dis":"adm2", "direccion":"school_name"}, inplace=True)
-
-#create geo_ids
-pry_ids.reset_index(inplace=True)
-pry_ids['geo_id'] = pry_ids['index'].apply(lambda x: 'PRY-{0:0>6}'.format(x))
-
-#create and reorder columns
-pry_ids["address"] = None
-pry_ids["adm0"] = "PRY"
-pry_ids["adm3"] = None
-
-pry_ids = pry_ids[["geo_id", "deped_id", "school_name", "address", "adm0", "adm1", "adm2", "adm3"]]
-
-#export as csv
-pry_ids.to_csv("../../files_for_db/ids/pry_ids.csv", index = False)
\ No newline at end of file
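Note: generate_per_ids.py strips digits out of school names with str.replace. In recent pandas versions the pattern is treated as a literal string unless regex=True is passed, so the intent is safest spelled out explicitly (toy sketch):

    import pandas as pd

    names = pd.Series(["ESCUELA 123", "COLEGIO 45B"])

    # regex=True makes the digit-stripping explicit; without it, recent pandas
    # would look for the literal characters "\d+" and change nothing
    cleaned = names.str.replace(r"\d+", "", regex=True).str.strip()
    print(cleaned.tolist())  # ['ESCUELA', 'COLEGIO B']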
-sle_ef.to_csv("../../files_for_db/ids/sle_ids.csv", index = False)
-
-
diff --git a/scripts/ids/generate_slv_ids.py b/scripts/ids/generate_slv_ids.py
deleted file mode 100644
index 43326b9..0000000
--- a/scripts/ids/generate_slv_ids.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import pandas as pd
-
-#import data from github
-slv_table = pd.read_csv("../../data/SLV/datos_sedes_educativas.csv")
-
-#select and rename necessary columns
-slv_table = slv_table[["Departamento","Municipio", "Dirección", "Código sede", "Nombre sede"]]
-slv_table.columns = ["adm1", "adm2", "address", "country_id", "school_name"]
-
-#create geo_ids
-slv_table.reset_index(inplace=True)
-slv_table["geo_id"] = slv_table["index"].apply(lambda x: 'SLV-{0:0>6}'.format(x))
-
-#final formatting
-slv_table["adm0"] = "SLV"
-slv_table["adm3"] = None
-slv_table = slv_table[["geo_id", "country_id", "school_name", "address", "adm0", "adm1", "adm2", "adm3"]]
-
-#export as csv
-slv_table.to_csv("../../files_for_db/ids/slv_ids.csv", index=False)
\ No newline at end of file
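Note: the geo_id boilerplate is duplicated across every script in this directory, which makes copy-paste slips easy to introduce; a tiny shared helper would centralize it (add_geo_id is a hypothetical name, not part of the repo):

    import pandas as pd


    def add_geo_id(df, iso):
        """Return df with a zero-padded '<ISO>-NNNNNN' geo_id column."""
        out = df.reset_index(drop=True).copy()
        out["geo_id"] = [f"{iso}-{i:06d}" for i in range(len(out))]
        return out


    slv = add_geo_id(pd.DataFrame({"school_name": ["A", "B"]}), "SLV")
    print(slv["geo_id"].tolist())  # ['SLV-000000', 'SLV-000001']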
diff --git a/scripts/ids/generate_tan_ids.py b/scripts/ids/generate_tan_ids.py
deleted file mode 100644
index f270a31..0000000
--- a/scripts/ids/generate_tan_ids.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import pandas as pd
-
-tan_ef = pd.read_csv("../../data/TAN/pri-performing-all.csv")
-tan_ef = tan_ef[["CODE", "NAME", "REGION", "DISTRICT", "WARD", "OWNERSHIP", "LONGITUDE", "LATITUDE"]]
-tan_ef = tan_ef.drop_duplicates(subset = ["CODE"])
-tan_ef = tan_ef.reset_index()
-tan_ef['geo_id'] = tan_ef['index'].apply(lambda x: 'TAN-{0:0>6}'.format(x))
-tan_ef = tan_ef.drop(["index"], axis = 1)
-tan_ef = tan_ef[["geo_id", "CODE", "NAME"]].rename(columns = {"CODE": "deped_id", "NAME": "school_name"})
-print(tan_ef.head())
-tan_ef.to_csv("../../files_for_db/ids/tan_ids.csv", index = False)
-
-
-
-# tan_ef_prim = pd.read_csv("../../data/TAN/Consolidated_Primary_EnrolmentbyGrade_PTR_2022_PSLE2021.csv")
-# tan_ef_prim = tan_ef_prim[tan_ef_prim["SCHOOL OWNERSHIP"] == "Government"]
-# tan_ef_prim = tan_ef_prim[['REGION', 'COUNCIL', 'WARD', 'SCHOOL NAME', 'SCHOOL REG. NUMBER', 'LATITUTE', 'LONGITUDE']]
-# tan_ef_prim.columns = [_.title() for _ in tan_ef_prim.columns]
-# tan_ef_prim = tan_ef_prim.rename(columns = {"Latitute": "Latitude", "School Reg. Number": "deped_id"})
-
-# tan_ef_sec = pd.read_csv("../../data/TAN/Consolidated_Secondary_EnrolmentbyGrade_PTR_CSEE2021, 2022.csv")
-# tan_ef_sec = tan_ef_sec[['Region', 'Council', 'Ward', 'School', 'Reg.No.', 'Latitude', 'Longitude']]
-# tan_ef_sec = tan_ef_sec.rename(columns = {"School": "School Name", "Reg.No.":"deped_id"})
-
-# tan_ef = tan_ef_prim.append(tan_ef_sec)
-
-# tan_ef = tan_ef.reset_index()
-# tan_ef['geo_id'] = tan_ef['index'].apply(lambda x: 'TAN-{0:0>6}'.format(x))
-
-# tan_ef["address"] = None
-# tan_ef["adm0"] = "TAN"
-# tan_ef = tan_ef[["geo_id", "deped_id", "School Name", "address", "adm0", "Region", "Council", "Ward"]].rename(columns = {"School Name": "school_name"})
-# tan_ef.columns = ["geo_id", "deped_id", "School_Name", "address", "adm0", "adm1", "adm2", "adm3"]
-# tan_ef.columns = [_.lower() for _ in tan_ef.columns]
-
-# print(tan_ef.head())
-
-# tan_ef.to_csv("../../files_for_db/ids/tan_ids.csv", index = False)
diff --git a/scripts/ids/generate_usa_ids.py b/scripts/ids/generate_usa_ids.py
deleted file mode 100644
index f379395..0000000
--- a/scripts/ids/generate_usa_ids.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import pandas as pd
-
-#get data from github and choose necessary columns
-usa_table = pd.read_csv("../../data/USA/ccd_sch_029_2122_w_1a_071722.csv", dtype={"LZIP":str})
-usa_table = usa_table[["SCH_NAME", "SCHID", "LSTREET1", "LCITY", "LSTATE","LZIP", "STATENAME"]]
-
-#create necessary columns
-usa_table["address"] = usa_table["LSTREET1"] + ", " + usa_table["LCITY"] + ", " + usa_table["LSTATE"] + " " + usa_table["LZIP"]
-
-usa_table.reset_index(inplace=True)
-usa_table['geo_id'] = usa_table['index'].apply(lambda x: 'USA-{0:0>6}'.format(x))
-
-usa_table["adm0"] = "USA"
-usa_table["adm3"] = None
-
-#choose final columns and rename
-usa_table = usa_table[["geo_id", "SCHID", "SCH_NAME", "address", "adm0", "STATENAME", "LCITY", "adm3"]]
-usa_table.columns = ["geo_id", "country_id", "school_name", "address", "adm0", "adm1", "adm2", "adm3"]
-
-#export as csv
-usa_table.to_csv("../../files_for_db/ids/usa_ids.csv", index=False)
\ No newline at end of file
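Note: in generate_usa_ids.py, concatenating address parts with + yields NaN for the whole address whenever any single component is missing. If partial addresses are preferable, joining only the present parts keeps what exists (sketch with toy data):

    import numpy as np
    import pandas as pd

    usa = pd.DataFrame({
        "LSTREET1": ["1 Main St", np.nan],
        "LCITY": ["Springfield", "Shelbyville"],
        "LSTATE": ["IL", "IL"],
        "LZIP": ["62701", "62565"],
    })

    # '+'-concatenation would make the whole second address NaN because one
    # component is missing; join only the non-null parts instead
    usa["address"] = (
        usa[["LSTREET1", "LCITY", "LSTATE"]]
        .apply(lambda r: ", ".join(p for p in r if pd.notna(p)), axis=1)
        + " " + usa["LZIP"].fillna("")
    ).str.strip()
    print(usa["address"].tolist())
    # ['1 Main St, Springfield, IL 62701', 'Shelbyville, IL 62565']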
diff --git a/scripts/ids/generate_zaf_ids.py b/scripts/ids/generate_zaf_ids.py
deleted file mode 100644
index 3fc0595..0000000
--- a/scripts/ids/generate_zaf_ids.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import pandas as pd
-import numpy as np
-
-#import necessary data from github
-zaf_table = pd.read_excel("../../data/ZAF/National.xlsx")
-
-#select relevant columns
-zaf_table = zaf_table[["NatEmis", "Province", "Official_Institution_Name", "EIDistrict", "LMunName", "StreetAddress"]]
-
-#expand the adm1 (province) abbreviations
-zaf_table["Province"] = zaf_table["Province"].str.replace("FS", "Free State").replace("ES", "Eastern Cape").replace("GT", "Gauteng").replace("KZN", "KwaZulu-Natal").replace("LP", "Limpopo").replace("MP", "Mpumalanga").replace("NC", "Northern Cape").replace("NW", "North West").replace("WC", "Western Cape").replace("Province", np.nan).replace("", np.nan).replace(" ", np.nan)
-
-#rename columns
-zaf_table.columns = ["country_id", "adm1", "school_name", "adm2", "adm3", "address"]
-
-#create geo_ids and adm0 column
-zaf_table.reset_index(inplace=True)
-zaf_table["geo_id"] = zaf_table["index"].apply(lambda x: "ZAF-{0:0>6}".format(x))
-zaf_table["adm0"] = "ZAF"
-
-#final formatting and cleaning
-zaf_table = zaf_table[["geo_id", "country_id", "school_name", "address", "adm0", "adm1", "adm2", "adm3"]]
-zaf_table["address"] = zaf_table["address"].replace("", np.nan).replace(" ", np.nan)
-zaf_table["adm2"] = zaf_table["adm2"].replace("", np.nan).replace(" ", np.nan)
-zaf_table["adm3"] = zaf_table["adm3"].replace("", np.nan).replace(" ", np.nan)
-
-#export as csv
-zaf_table.to_csv("../../files_for_db/ids/zaf_ids.csv", index=False)
\ No newline at end of file
diff --git a/scripts/ids/utils.py b/scripts/ids/utils.py
deleted file mode 100644
index 8de1350..0000000
--- a/scripts/ids/utils.py
+++ /dev/null
@@ -1,75 +0,0 @@
-import requests
-import argparse
-import zipfile
-import shutil
-import json
-import os
-
-
-def makeGBdir(iso, base_dir):
-
-    # If the folder already exists, delete it
-    if os.path.isdir(os.path.join(base_dir, iso)):
-        shutil.rmtree(os.path.join(base_dir, iso))
-    else:
-        pass  # nothing to delete
-
-    # Create new folder
-    try:
-        os.mkdir(os.path.join(base_dir, iso))
-    except FileNotFoundError:
-        os.mkdir(base_dir)
-        os.mkdir(os.path.join(base_dir, iso))
-
-    # ...and return the path
-    return os.path.join(base_dir, iso)
-
-
-def downloadGB(iso, adm, base_dir):
-
-    # Create the request URL
-    url = "https://www.geoboundaries.org/api/current/gbOpen/" + iso + "/ADM" + adm
-    print("Making request to: ", url)
-
-    # Make the request to the URL
-    r = requests.get(url)
-    dlPath = r.json()['staticDownloadLink']
-    print("Downloading data from: ", dlPath)
-
-    # Download the zipfile itself
-    r = requests.get(dlPath, allow_redirects=True)
-
-    # Make directory for downloaded zipfolder
-    tmp_dir = makeGBdir(iso, base_dir)
-    print("Downloading data into: ", tmp_dir)
-
-    # Write the downloaded zipfolder to disk
-    open(os.path.join(tmp_dir, "temp.zip"), 'wb').write(r.content)
-
-    # Open the downloaded zipfolder
-    with zipfile.ZipFile(os.path.join(tmp_dir, "temp.zip"), 'r') as zip_ref:
-        zip_ref.extractall(tmp_dir)
-
-    # Grab the name of the second zipfolder
-    to_open = [i for i in os.listdir(tmp_dir) if i.endswith(".zip") and i.startswith('geo')]
-
-    # Extract the files from the second zipfolder
-    with zipfile.ZipFile(os.path.join(tmp_dir, to_open[0]), 'r') as zip_ref:
-        zip_ref.extractall(tmp_dir)
-
-    # Clean up directory
-    to_delete = [i for i in os.listdir(tmp_dir) if i.endswith(".zip") or i.startswith('geo')]
-    for i in to_delete:
-        os.remove(os.path.join(tmp_dir, i))
-
-    print("Done downloading boundary data.")
-
-
-def getGBpath(iso, adm, base_dir):
-
-    files = os.listdir(os.path.join(base_dir, iso))
-    shp = [_ for _ in files if _.endswith(".shp")][0]
-    fname = os.path.join(base_dir, iso, shp)
-
-    return fname
\ No newline at end of file
diff --git a/scripts/personnel.zip b/scripts/personnel.zip
deleted file mode 100644
index 98aef94..0000000
Binary files a/scripts/personnel.zip and /dev/null differ
diff --git a/scripts/resources.zip b/scripts/resources.zip
deleted file mode 100644
index 5d68e27..0000000
Binary files a/scripts/resources.zip and /dev/null differ
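Note: both copies of utils.py recreate the per-country folder with an rmtree/mkdir dance whose except branch papers over a missing parent directory; shutil.rmtree(..., ignore_errors=True) plus os.makedirs expresses the same intent directly (a sketch of an alternative, not the repo's code):

    import os
    import shutil


    def makeGBdir(iso, base_dir):
        """Return a fresh, empty base_dir/iso folder."""
        path = os.path.join(base_dir, iso)
        shutil.rmtree(path, ignore_errors=True)  # drop any previous download
        os.makedirs(path)                        # creates base_dir too if needed
        return path


    print(makeGBdir("BOL", "/tmp/gb_demo"))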