From 2ba547a6e7a1b97084a6a8f0106b0973d51e2ad1 Mon Sep 17 00:00:00 2001 From: vontell Date: Tue, 30 Jan 2024 16:38:24 -0500 Subject: [PATCH] Notebook with analysis --- data_analysis/2024DataAnalysis.ipynb | 843 +++++++++++++++++++++++++++ 1 file changed, 843 insertions(+) create mode 100644 data_analysis/2024DataAnalysis.ipynb diff --git a/data_analysis/2024DataAnalysis.ipynb b/data_analysis/2024DataAnalysis.ipynb new file mode 100644 index 0000000..0fcddc0 --- /dev/null +++ b/data_analysis/2024DataAnalysis.ipynb @@ -0,0 +1,843 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Analysis of PHLASK data in early 2024\n", + "\n", + "This notebook contains the analysis of the PHLASK data that exists in early 2024. It uses the \n", + "PHLASK Firebase DB to analyze the schema and presence of the data.\n", + "\n", + "**The goal of this analysis is to begin designing an official schema for the data, which we can then\n", + "use to go through the data and normalize or remove invalid resources.**\n", + "\n", + "You can find the Firebase databases and dashboard [here.](https://console.firebase.google.com/u/1/project/phlask-web-map/overview)\n", + "\n", + "## Getting Started Hints\n", + "\n", + "- If you are new to Firebase, check out [this link](https://www.geeksforgeeks.org/firebase-introduction/).\n", + "- I found [this tutorial](https://www.freecodecamp.org/news/how-to-get-started-with-firebase-using-python/) to be useful for learning how to connect to Firebase using Python\n", + "\n", + "## Table of Contents:\n", + "\n", + "- Getting started - Taking a look at the data\n", + "- Analyzing specific aspects of PHLASK data\n", + " - Analysis of \"Hours\" Data in PHLASK\n", + " - Analysis of address fields\n", + "- Coming up with a schema for the data" + ], + "metadata": { + "collapsed": false + }, + "id": "9ebea0f7553adaa8" + }, + { + "cell_type": "markdown", + "source": [ + "# Getting started - Taking a look at the data\n", + "\n", + "First, we need to 
install the required dependencies and get some initial configuration setup to access the Firebase DB.\n", + "\n", + "**DO NOT skip this step! Without it, none of the code will run!**\n", + "\n", + "First, we will install some Python dependencies from pip.\n" + ], + "metadata": { + "collapsed": false + }, + "id": "f2e6b74589e51490" + }, + { + "cell_type": "code", + "execution_count": 1, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: firebase-admin in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (6.3.0)\r\n", + "Requirement already satisfied: cachecontrol>=0.12.6 in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from firebase-admin) (0.13.1)\r\n", + "Requirement already satisfied: google-api-python-client>=1.7.8 in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from firebase-admin) (2.114.0)\r\n", + "Requirement already satisfied: google-cloud-storage>=1.37.1 in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from firebase-admin) (2.14.0)\r\n", + "Requirement already satisfied: pyjwt>=2.5.0 in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from pyjwt[crypto]>=2.5.0->firebase-admin) (2.8.0)\r\n", + "Requirement already satisfied: google-api-core<3.0.0dev,>=1.22.1 in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from google-api-core[grpc]<3.0.0dev,>=1.22.1; platform_python_implementation != \"PyPy\"->firebase-admin) (2.15.0)\r\n", + "Requirement already satisfied: google-cloud-firestore>=2.9.1 in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from firebase-admin) (2.14.0)\r\n", + "Requirement already satisfied: requests>=2.16.0 in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from cachecontrol>=0.12.6->firebase-admin) (2.31.0)\r\n", + "Requirement already satisfied: msgpack>=0.5.2 in 
/Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from cachecontrol>=0.12.6->firebase-admin) (1.0.7)\r\n", + "Requirement already satisfied: googleapis-common-protos<2.0.dev0,>=1.56.2 in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from google-api-core<3.0.0dev,>=1.22.1->google-api-core[grpc]<3.0.0dev,>=1.22.1; platform_python_implementation != \"PyPy\"->firebase-admin) (1.62.0)\r\n", + "Requirement already satisfied: protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0.dev0,>=3.19.5 in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from google-api-core<3.0.0dev,>=1.22.1->google-api-core[grpc]<3.0.0dev,>=1.22.1; platform_python_implementation != \"PyPy\"->firebase-admin) (4.25.2)\r\n", + "Requirement already satisfied: google-auth<3.0.dev0,>=2.14.1 in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from google-api-core<3.0.0dev,>=1.22.1->google-api-core[grpc]<3.0.0dev,>=1.22.1; platform_python_implementation != \"PyPy\"->firebase-admin) (2.26.2)\r\n", + "Requirement already satisfied: grpcio<2.0dev,>=1.33.2 in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from google-api-core[grpc]<3.0.0dev,>=1.22.1; platform_python_implementation != \"PyPy\"->firebase-admin) (1.60.0)\r\n", + "Requirement already satisfied: grpcio-status<2.0.dev0,>=1.33.2 in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from google-api-core[grpc]<3.0.0dev,>=1.22.1; platform_python_implementation != \"PyPy\"->firebase-admin) (1.60.0)\r\n", + "Requirement already satisfied: httplib2<1.dev0,>=0.15.0 in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from google-api-python-client>=1.7.8->firebase-admin) (0.22.0)\r\n", + "Requirement already satisfied: google-auth-httplib2>=0.1.0 in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from 
google-api-python-client>=1.7.8->firebase-admin) (0.2.0)\r\n", + "Requirement already satisfied: uritemplate<5,>=3.0.1 in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from google-api-python-client>=1.7.8->firebase-admin) (4.1.1)\r\n", + "Requirement already satisfied: google-cloud-core<3.0.0dev,>=1.4.1 in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from google-cloud-firestore>=2.9.1->firebase-admin) (2.4.1)\r\n", + "Requirement already satisfied: proto-plus<2.0.0dev,>=1.22.0 in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from google-cloud-firestore>=2.9.1->firebase-admin) (1.23.0)\r\n", + "Requirement already satisfied: google-resumable-media>=2.6.0 in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from google-cloud-storage>=1.37.1->firebase-admin) (2.7.0)\r\n", + "Requirement already satisfied: google-crc32c<2.0dev,>=1.0 in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from google-cloud-storage>=1.37.1->firebase-admin) (1.5.0)\r\n", + "Requirement already satisfied: cryptography>=3.4.0 in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from pyjwt[crypto]>=2.5.0->firebase-admin) (41.0.3)\r\n", + "Requirement already satisfied: cffi>=1.12 in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from cryptography>=3.4.0->pyjwt[crypto]>=2.5.0->firebase-admin) (1.16.0)\r\n", + "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from google-auth<3.0.dev0,>=2.14.1->google-api-core<3.0.0dev,>=1.22.1->google-api-core[grpc]<3.0.0dev,>=1.22.1; platform_python_implementation != \"PyPy\"->firebase-admin) (5.3.2)\r\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from 
google-auth<3.0.dev0,>=2.14.1->google-api-core<3.0.0dev,>=1.22.1->google-api-core[grpc]<3.0.0dev,>=1.22.1; platform_python_implementation != \"PyPy\"->firebase-admin) (0.3.0)\r\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from google-auth<3.0.dev0,>=2.14.1->google-api-core<3.0.0dev,>=1.22.1->google-api-core[grpc]<3.0.0dev,>=1.22.1; platform_python_implementation != \"PyPy\"->firebase-admin) (4.9)\r\n", + "Requirement already satisfied: pyparsing!=3.0.0,!=3.0.1,!=3.0.2,!=3.0.3,<4,>=2.4.2 in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from httplib2<1.dev0,>=0.15.0->google-api-python-client>=1.7.8->firebase-admin) (3.1.1)\r\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from requests>=2.16.0->cachecontrol>=0.12.6->firebase-admin) (2.0.4)\r\n", + "Requirement already satisfied: idna<4,>=2.5 in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from requests>=2.16.0->cachecontrol>=0.12.6->firebase-admin) (3.4)\r\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from requests>=2.16.0->cachecontrol>=0.12.6->firebase-admin) (1.26.18)\r\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from requests>=2.16.0->cachecontrol>=0.12.6->firebase-admin) (2023.11.17)\r\n", + "Requirement already satisfied: pycparser in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from cffi>=1.12->cryptography>=3.4.0->pyjwt[crypto]>=2.5.0->firebase-admin) (2.21)\r\n", + "Requirement already satisfied: pyasn1<0.6.0,>=0.4.6 in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (from 
pyasn1-modules>=0.2.1->google-auth<3.0.dev0,>=2.14.1->google-api-core<3.0.0dev,>=1.22.1->google-api-core[grpc]<3.0.0dev,>=1.22.1; platform_python_implementation != \"PyPy\"->firebase-admin) (0.5.1)\r\n", + "Requirement already satisfied: cerberus in /Users/aaronvontell/anaconda3/envs/autobots/lib/python3.8/site-packages (1.3.5)\r\n" + ] + } + ], + "source": [ + "!pip install firebase-admin # used for accessing Firebase DB\n", + "!pip install cerberus # used for validating the schema of the data" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-30T21:30:32.818311Z", + "start_time": "2024-01-30T21:30:28.652258Z" + } + }, + "id": "6e12a07bc528e62f" + }, + { + "cell_type": "markdown", + "source": [ + "Next, we will need to configure our environment to properly access the Firebase DB. This will use a Firebase Cert that\n", + "you must request from a PHLASK admin. Ask in the #phlask_data channel for a `firebase_cert.json` file, and then place it\n", + "in the same directory as this notebook. Then, run the following code to configure your environment." + ], + "metadata": { + "collapsed": false + }, + "id": "d3664c2e2a231923" + }, + { + "cell_type": "code", + "execution_count": 1, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Set the cert path to /Users/aaronvontell/Projects/phlask-data-handlers/data_analysis/firebase_cert.json\n" + ] + } + ], + "source": [ + "import os\n", + "cert_path = os.path.abspath(\"firebase_cert.json\")\n", + "os.environ[\"GOOGLE_APPLICATION_CREDENTIALS\"] = cert_path\n", + "print(\"Set the cert path to \" + cert_path)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-30T21:34:08.587615Z", + "start_time": "2024-01-30T21:34:08.580785Z" + } + }, + "id": "1610f8a35ef4c3a1" + }, + { + "cell_type": "markdown", + "source": [ + "Now, we will startup the client to access the DB, and take a look at the data of a specific database. 
This constant,\n", + "`DB_URL`, is also used in the following sections as the main DB to analyze. You can find the whole list of databases [here](https://console.firebase.google.com/u/1/project/phlask-web-map/database/phlask-web-map/data)." + ], + "metadata": { + "collapsed": false + }, + "id": "f91f63b38c7fb280" + }, + { + "cell_type": "code", + "execution_count": 2, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded PHLASK DB reference with 274 resources\n" + ] + } + ], + "source": [ + "DB_URL = 'https://phlask-web-map-beta-water-live.firebaseio.com/'\n", + "\n", + "from firebase_admin import initialize_app, db\n", + "default_app = initialize_app()\n", + "ref = db.reference(url=DB_URL)\n", + "all_entries = [e for e in ref.get() if e is not None]\n", + "print(f\"Loaded PHLASK DB reference with {len(all_entries)} resources\")\n", + "\n", + "# If you get an error about the Firebase app already existing, restart your notebook kernel." + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-30T21:34:10.162433Z", + "start_time": "2024-01-30T21:34:09.597810Z" + } + }, + "id": "5cd58c9cc70e0b2b" + }, + { + "cell_type": "markdown", + "source": [ + "Now let's look at some of the data from these resources." + ], + "metadata": { + "collapsed": false + }, + "id": "cdae0710cd165307" + }, + { + "cell_type": "code", + "execution_count": 3, + "outputs": [ + { + "data": { + "text/plain": "{'access': 'Public',\n 'address': '2501 Walnut St.',\n 'city': 'Philadelphia',\n 'description': 'Drinking fountain along Schuykill Banks path near Locust St. 
entrance',\n 'filtration': 'No',\n 'gp_id': 'ChIJ1QhLoknGxokRY1BxIaMBmEY',\n 'handicap': 'Unsure',\n 'hours': [{'close': {'day': 1, 'time': '0000'},\n 'open': {'day': 0, 'time': '0600'}},\n {'close': {'day': 2, 'time': '0000'}, 'open': {'day': 1, 'time': '0600'}},\n {'close': {'day': 3, 'time': '0000'}, 'open': {'day': 2, 'time': '0600'}},\n {'close': {'day': 4, 'time': '0000'}, 'open': {'day': 3, 'time': '0600'}},\n {'close': {'day': 5, 'time': '0000'}, 'open': {'day': 4, 'time': '0600'}},\n {'close': {'day': 6, 'time': '0000'}, 'open': {'day': 5, 'time': '0600'}},\n {'close': {'day': 0, 'time': '0000'}, 'open': {'day': 6, 'time': '0600'}}],\n 'images': ['https://i.imgur.com/TwWKydJ.jpg'],\n 'lat': 39.952195,\n 'lon': -75.180653,\n 'norms_rules': '',\n 'organization': 'Schuylkill Banks',\n 'permanently_closed': False,\n 'phone': '(215) 309-5523',\n 'quality': '1-4 Missing - Good',\n 'service': 'Self-serve',\n 'statement': '',\n 'status': 'OPERATIONAL',\n 'tap_type': 'Drinking Fountain',\n 'tapnum': 6,\n 'vessel': 'No',\n 'zip_code': ''}" + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_entries[5]" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-30T21:34:14.146047Z", + "start_time": "2024-01-30T21:34:14.142200Z" + } + }, + "id": "d51a1621bf08591c" + }, + { + "cell_type": "code", + "execution_count": 4, + "outputs": [ + { + "data": { + "text/plain": "{'access': 'Public',\n 'address': '',\n 'city': 'Philadelphia',\n 'description': 'Dawn - Dusk',\n 'filtration': '',\n 'gp_id': 'ChIJBb3JfPK4xokRajvM6KUr75c',\n 'handicap': '',\n 'lat': 40.0290751,\n 'lon': -75.211313,\n 'norms_rules': '',\n 'organization': 'Kendrick Playground & Recreation Center',\n 'permanently_closed': False,\n 'quality': '5-7 Missing - Needs Work',\n 'service': 'Self-serve',\n 'statement': '',\n 'tap_type': 'Drinking Fountain',\n 'tapnum': 151,\n 'vessel': '',\n 'zip_code': 19128}" + }, + 
"execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_entries[150]" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-30T21:34:15.225080Z", + "start_time": "2024-01-30T21:34:15.221396Z" + } + }, + "id": "52f2c5561e922324" + }, + { + "cell_type": "markdown", + "source": [ + "# Analyzing specific aspects of PHLASK data\n", + "\n", + "## Analysis of \"Hours\" Data in PHLASK\n", + "\n", + "Let's start with an analysis of the hour format in these databases. We want to answer the following questions:\n", + "- How many entries have hours included?\n", + "- What is the format of the hours?" + ], + "metadata": { + "collapsed": false + }, + "id": "8f47894dd6fe0789" + }, + { + "cell_type": "code", + "execution_count": 5, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 274 resources in this PHLASK db.\n" + ] + } + ], + "source": [ + "print(f\"There are {len(all_entries)} resources in this PHLASK db.\")" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-30T21:34:23.200898Z", + "start_time": "2024-01-30T21:34:23.195595Z" + } + }, + "id": "3a8427dca8554ba0" + }, + { + "cell_type": "code", + "execution_count": 6, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 187 entries with hours (68.25%).\n" + ] + } + ], + "source": [ + "# Now, let's see how many entries have hours\n", + "entries_with_hours = [entry for entry in all_entries if entry.get('hours') is not None]\n", + "print(f\"There are {len(entries_with_hours)} entries with hours ({len(entries_with_hours)*100/len(all_entries):.2f}%).\")" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-30T21:34:25.494888Z", + "start_time": "2024-01-30T21:34:25.490602Z" + } + }, + "id": "601e4a34c9333048" + }, + { + "cell_type": "markdown", + "source": [ + "Now, let's take a look at a few of these hour 
entries." + ], + "metadata": { + "collapsed": false + }, + "id": "a34b89b097d4c8eb" + }, + { + "cell_type": "code", + "execution_count": 7, + "outputs": [ + { + "data": { + "text/plain": "[{'close': {'day': 0, 'time': '2100'}, 'open': {'day': 0, 'time': '0700'}},\n {'close': {'day': 1, 'time': '2200'}, 'open': {'day': 1, 'time': '0700'}},\n {'close': {'day': 2, 'time': '2200'}, 'open': {'day': 2, 'time': '0700'}},\n {'close': {'day': 3, 'time': '2200'}, 'open': {'day': 3, 'time': '0700'}},\n {'close': {'day': 4, 'time': '2200'}, 'open': {'day': 4, 'time': '0700'}},\n {'close': {'day': 5, 'time': '2200'}, 'open': {'day': 5, 'time': '0700'}},\n {'close': {'day': 6, 'time': '2100'}, 'open': {'day': 6, 'time': '0700'}}]" + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "entries_with_hours[0].get('hours')" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-30T21:34:40.518751Z", + "start_time": "2024-01-30T21:34:40.511298Z" + } + }, + "id": "641a577677bf5351" + }, + { + "cell_type": "code", + "execution_count": 8, + "outputs": [ + { + "data": { + "text/plain": "[{'close': {'day': 1, 'time': '2130'}, 'open': {'day': 1, 'time': '1300'}},\n {'close': {'day': 2, 'time': '2130'}, 'open': {'day': 2, 'time': '1300'}},\n {'close': {'day': 3, 'time': '2130'}, 'open': {'day': 3, 'time': '1300'}},\n {'close': {'day': 4, 'time': '2130'}, 'open': {'day': 4, 'time': '1300'}},\n {'close': {'day': 5, 'time': '2130'}, 'open': {'day': 5, 'time': '1300'}}]" + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "entries_with_hours[35].get('hours')" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-30T21:34:41.605008Z", + "start_time": "2024-01-30T21:34:41.598888Z" + } + }, + "id": "c55c8e241dff5c32" + }, + { + "cell_type": "code", + "execution_count": 9, + "outputs": [ + { + "data": { + "text/plain": "[{'close': 
{'day': 0, 'time': '1930'}, 'open': {'day': 0, 'time': '0600'}},\n {'close': {'day': 1, 'time': '1930'}, 'open': {'day': 1, 'time': '0530'}},\n {'close': {'day': 2, 'time': '1930'}, 'open': {'day': 2, 'time': '0530'}},\n {'close': {'day': 3, 'time': '1930'}, 'open': {'day': 3, 'time': '0530'}},\n {'close': {'day': 4, 'time': '1930'}, 'open': {'day': 4, 'time': '0530'}},\n {'close': {'day': 5, 'time': '1930'}, 'open': {'day': 5, 'time': '0530'}},\n {'close': {'day': 6, 'time': '1930'}, 'open': {'day': 6, 'time': '0600'}}]" + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "entries_with_hours[150].get('hours')" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-30T21:34:42.655394Z", + "start_time": "2024-01-30T21:34:42.648550Z" + } + }, + "id": "8b9dba7252879ec7" + }, + { + "cell_type": "markdown", + "source": [ + "We can see that these hours entries can differ, but they have some commonality. Let's do an analysis to see how many resources have this format." + ], + "metadata": { + "collapsed": false + }, + "id": "8030a406d3b140d0" + }, + { + "cell_type": "code", + "execution_count": 10, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Distribution of number of days in a resource: Counter({7: 112, 6: 34, 5: 29, 1: 8, 2: 2, 3: 2})\n", + "There are 6 bad resources (3.21%) out of 187 resources.\n" + ] + } + ], + "source": [ + "from collections import Counter\n", + "\n", + "def validate_hours_format(resources: list):\n", + " \"\"\"\n", + " This function does the following:\n", + " - Counts the number of hour entries each resource has\n", + " - Counts the number of resources that follow the format of 'close' and 'open' keys with a day and time\n", + " - Prints the results\n", + " :param resources: The list of resources. 
Must all have hours (so make sure to pre-filter!)\n", + " :return: None\n", + " \"\"\"\n", + " \n", + " # Count the distribution of hours\n", + " hour_counts = map(lambda r: len(r.get('hours')), resources)\n", + " hour_distribution = Counter(hour_counts)\n", + " print(\"Distribution of number of days in a resource: \" + str(hour_distribution))\n", + " \n", + " bad_entries = []\n", + " for resource in resources:\n", + " days = resource.get('hours')\n", + " for day in days:\n", + " close_time = day.get('close')\n", + " open_time = day.get('open')\n", + " if close_time is None or open_time is None:\n", + " bad_entries.append(resource)\n", + " break\n", + " validate_time = lambda t: 0 <= t.get('day') <= 6 and 0 <= int(t.get('time')) <= 2400\n", + " \n", + " try:\n", + " if not validate_time(close_time) or not validate_time(open_time):\n", + " bad_entries.append(resource)\n", + " break\n", + " except:\n", + " bad_entries.append(resource)\n", + " break\n", + " \n", + " print(f\"There are {len(bad_entries)} bad resources ({len(bad_entries)*100/len(resources):.2f}%) out of {len(resources)} resources.\")\n", + " \n", + "validate_hours_format(entries_with_hours)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-30T21:34:55.970025Z", + "start_time": "2024-01-30T21:34:55.961116Z" + } + }, + "id": "3d63a1a69a42155" + }, + { + "cell_type": "markdown", + "source": [ + "### Some observations from this analysis\n", + "\n", + "- There are some resources that have no hours attached at all\n", + "- If they do have hours, they are organized as lists of days, but this ranges from 1 to 7 day entries\n", + "- Some of the entries are missing close times, and only have an open time\n", + "- All close/open entries have a day (0-6) which represents the day of the week (assuming Sunday = 0?)\n", + "\n", + "## Analysis of address fields\n", + "\n", + "Within the data, we can see that there are a few fields related to addresses:\n", + "\n", + "- `address` - A 
street address for the resource\n", + "- `city` - The city of the resource\n", + "- `zip_code` - The zip code of the resource\n", + "- `lat` - The latitude of the resource\n", + "- `lon` - The longitude of the resource\n", + "- `organization` - The organization that owns the resource\n", + "- `gp_id` - The Google Places ID of the resource (you can do a reverse lookup [here](https://developers.google.com/maps/documentation/javascript/examples/geocoding-place-id))\n", + "\n", + "Let's start by seeing how many of these resources have these fields." + ], + "metadata": { + "collapsed": false + }, + "id": "84dab1cb2aac957f" + }, + { + "cell_type": "code", + "execution_count": 11, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 274 resources in this PHLASK db.\n", + "There are 189 entries with addresses (68.98%).\n", + "There are 248 entries with cities (90.51%).\n", + "There are 232 entries with zip codes (84.67%).\n", + "There are 274 entries with latitudes (100.00%).\n", + "There are 274 entries with longitudes (100.00%).\n", + "There are 246 entries with organizations (89.78%).\n", + "There are 257 entries with Google Places IDs (93.80%).\n" + ] + } + ], + "source": [ + "ref = db.reference(url='https://phlask-web-map-beta-water-live.firebaseio.com/')\n", + "all_entries = [e for e in ref.get() if e is not None]\n", + "print(f\"There are {len(all_entries)} resources in this PHLASK db.\")\n", + "\n", + "entries_with_address = [entry for entry in all_entries if entry.get('address') is not None and entry.get('address').strip() != \"\"]\n", + "entries_with_city = [entry for entry in all_entries if entry.get('city') is not None and entry.get('city').strip() != \"\"]\n", + "entries_with_zip_code = [entry for entry in all_entries if entry.get('zip_code') is not None and str(entry.get('zip_code')) != \"\"]\n", + "entries_with_lat = [entry for entry in all_entries if entry.get('lat') is not None and entry.get('lat') != 0]\n", + 
"entries_with_lon = [entry for entry in all_entries if entry.get('lon') is not None and entry.get('lon') != 0]\n", + "entries_with_organization = [entry for entry in all_entries if entry.get('organization') is not None and entry.get('organization').strip() != \"\"]\n", + "entries_with_gp_id = [entry for entry in all_entries if entry.get('gp_id') is not None and entry.get('gp_id').strip() != \"\"]\n", + "\n", + "print(f\"There are {len(entries_with_address)} entries with addresses ({len(entries_with_address)*100/len(all_entries):.2f}%).\")\n", + "print(f\"There are {len(entries_with_city)} entries with cities ({len(entries_with_city)*100/len(all_entries):.2f}%).\")\n", + "print(f\"There are {len(entries_with_zip_code)} entries with zip codes ({len(entries_with_zip_code)*100/len(all_entries):.2f}%).\")\n", + "print(f\"There are {len(entries_with_lat)} entries with latitudes ({len(entries_with_lat)*100/len(all_entries):.2f}%).\")\n", + "print(f\"There are {len(entries_with_lon)} entries with longitudes ({len(entries_with_lon)*100/len(all_entries):.2f}%).\")\n", + "print(f\"There are {len(entries_with_organization)} entries with organizations ({len(entries_with_organization)*100/len(all_entries):.2f}%).\")\n", + "print(f\"There are {len(entries_with_gp_id)} entries with Google Places IDs ({len(entries_with_gp_id)*100/len(all_entries):.2f}%).\")" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-30T21:35:09.549508Z", + "start_time": "2024-01-30T21:35:09.293388Z" + } + }, + "id": "b461853bb0274eb7" + }, + { + "cell_type": "markdown", + "source": [ + "We can also take a quick look and see what these different entries look like" + ], + "metadata": { + "collapsed": false + }, + "id": "5ebdde2cb4d2a64a" + }, + { + "cell_type": "code", + "execution_count": 12, + "outputs": [ + { + "data": { + "text/plain": "['1020 Lombard St.',\n 'Market St. between 10th & 12th Sts.',\n '1500 Chestnut St.',\n '16th St. 
& JFK Blvd.',\n '1901 Vine St.',\n '2501 Walnut St.',\n '23 S Christopher Columbus Blvd',\n '2955 Market St.',\n '3601 Walnut St.',\n '1500 Spring Garden St',\n '1500 Spring Garden St',\n '1500 Spring Garden St',\n '1700 S. Broad Street, Unit 201',\n '555 S. 43rd Street',\n '1900 N. 20th Street',\n '321 W. Girard Avenue',\n '131 E. Chelten Avenue',\n '2840 W. Dauphin Street',\n '705 S. 5th St.',\n '1710 E Passyunk Ave',\n '1602 Spruce St.',\n '131 Old Lancaster Road',\n '321 University Avenue',\n '4400 Haverford Avenue',\n '500 S. Broad Street',\n '23 S Christopher Columbus Blvd',\n '2106 S Christopher Columbus Blvd',\n '2110 S Christopher Columbus Blvd',\n '2206 S Christopher Columbus Blvd',\n '23 S Christopher Columbus Blvd',\n '2300 S Christopher Columbus Blvd',\n '29 Snyder Ave',\n '1 Mifflin St',\n '2000 S Swanson St',\n '51 N 12th St',\n '1500 N 50th St',\n '6666 Ridge Ave',\n '3900 Lancaster Ave',\n '4221-29 Market St',\n '12 Cobbs Creek Pkwy',\n '7401 Lansdowne Ave',\n '236-48 Spring Garden St',\n '1430 E Passyunk Ave',\n '5510 Pine St',\n '2600 Morris St',\n '1900 Wakeling St',\n '1517-19 Belmont Ave',\n '2501-59 Diamond St',\n '1100 E Chelten Ave',\n '9550 Frankford Ave',\n '3325 Red Lion Rd',\n '205 Race Street',\n '8 E. Lancaster Ave.',\n '4131 Unruh Ave.',\n '4131 Unruh Ave.',\n '1431 N. 6th Street',\n '5200 Penn St.',\n '231 Wyoming Ave.',\n '231 Wyoming Ave.',\n '201 Fountain St.',\n '2501 W. Diamond St.',\n '2501 W. Diamond St.',\n '2551 N. 22nd Street',\n '2551 N. 22nd Street',\n '1101 W. Susquehanna Ave.',\n '2140 N. 33rd St.',\n '2140 N. 33rd St.',\n '820 N. 
8th Street',\n '1800 Washington Ave.',\n '5700 Haverford Ave.',\n '5700 Haverford Ave.',\n '5700 Haverford Ave.',\n '3320 Haverford Ave.',\n '3321 Haverford Ave.',\n '5800 Chester Ave.',\n '5800 Chester Ave.',\n '4328 Haverford Ave.',\n '2889 Cedar St.',\n '6455 Walker Street',\n '6455 Walker Street',\n '1900 Wakeling St.',\n '3267 Almond St.',\n '3267 Almond St.',\n '400 Ontario St.',\n '400 Ontario St.',\n '3201 North 5th Street',\n '3201 North 5th Street',\n '100 E. Godfrey Ave.',\n '1100 E Chelten Ave.',\n '1610 W. Chelten Ave.',\n '2100 S. 24th St.',\n '4901 Kingsessing Ave.',\n '728 S. 55th St.',\n '728 S. 55th St.',\n '6839 Lansdowne Ave.',\n '5901 W. Columbia Ave.',\n '6201 Torresdale Ave.',\n '2109 W. Chew Ave.',\n '2109 W. Chew Ave.',\n '1101 E. Cayuga St.',\n '1101 E. Cayuga St.',\n '4800 Wayne Ave.',\n '6801 Grovers Ave.',\n '3001 Robbins Ave.',\n '1832 Howard Street',\n '217 E Butler Ave',\n '8500 Pickering St',\n '1500 Market Street, Suite 465',\n '1600 Arch Street',\n '101 N Broad St.',\n '1301 Chestnut Street',\n '1528 Walnut Street',\n '1201 Market Street',\n '200 S Broad St',\n '1701 John F Kennedy Blvd',\n '1801 Market St.',\n '254 South 15th Street',\n '1201 Walnut St.',\n '1122 Chestnut St',\n '1839 Chestnut Street',\n '1900 Market Street',\n '1001-1005 Chestnut St.',\n '1801 Spruce Street',\n '337-341 South Broad Street',\n '1945 Callowhill Street',\n '200 Washington Square West',\n '2001 Pennsylvania Ave',\n '600 9th Street',\n '525 ARCH ST',\n '2201 South Street',\n '1002 S. Broad St.',\n '456 N 5th St',\n '57-63 North Third Street',\n '3141 Chestnut St',\n '3400 Lancaster Ave',\n '3401 WALNUT ST',\n '3400 Spruce Street',\n '3400 Civic Center Boulevard',\n '3601 Walnut St.',\n '1018 N. 2nd St',\n '1601 N. Broad St.',\n '180 W. Girard Ave',\n '3800 LOCUST WALK, 1920 COMMONS BLDG',\n '3901 Walnut St',\n '1755 N. 
13TH ST.',\n '2201-2219 South Broad Street',\n '1 Mifflin St.',\n '4140 Woodland Ave.',\n '326 Penn St',\n '29 Snyder Avenue',\n '3340 N Broad St',\n '4000 Monument Rd',\n '4600 City Line Avenue',\n '2461 North 54th St',\n '2701 Castor Ave',\n '138 Montogomery Ave',\n '1900 W Olney Dr',\n '1401 John F Kennedy Blvd',\n '1315 Spruce Street',\n 'Spring Garden St SS 20ft E/O Front St F/E - 1',\n 'Oregon Av SS 0.3mi E/O Front St F/E - 1',\n 'I-76 Schuylkill Xwy SS 25ft E/O 28th St F/NW - 1',\n 'Belmont Av ES 30ft N/O Girard Av F/S - 1',\n '52nd St WS 25ft N/O Spruce St F/S - 3',\n 'Lancaster Av NS 25ft E/O 60th St F/E - 1',\n 'Stenton Av NS 30ft S/O Ellett F/E - 1',\n 'Haines St SS 25ft W/O Limekiln Pk F/E - 2',\n '5th St WS 25ft N/O Delphine St F/S - 1',\n '5th St WS 25ft N/O Delphine St F/S - 1',\n 'Castor Av WS 25ft E/O Oxford Av F/S - 2',\n 'Dungan Rd WS 200ft S/O Rhawn St F/N - 2',\n 'Grant Av NS 100ft E/O Krewstown Rd F/E - 1',\n 'Southampton Rd SS 0.5mi W/O US 1 Roosevelt Blvd F/W - 1',\n 'Bridge St SS 25ft E/O Harbison Av F/E - 2',\n 'Byberry Rd NS 100ft W/O Evans St F/E - 1',\n 'Wyoming Av NS 25ft W/O G St F/W - 1',\n 'N 2nd St WS 0.1mi N/O Erie Av F/N - 1',\n 'Germantown Av WS 25ft N/O Glenwood Av F/S - 1',\n 'State Rd ES 25ft N/O Unruh St F/N - 1',\n 'Allegheny Av SS 25ft W/O Rosehill St F/E - 2',\n 'Delaware Av ES 25ft S/O Poplar St F/N - 2',\n 'Summerdale Av WS 25ft N/O Foulkrod St F/S - 2',\n '2nd St ES 25ft S/O Norris St F/N - 1',\n 'Oxford Av WS 25ft S/O Langdon St F/N - 1',\n 'Linden Av NS 25ft W/O Keystone St F/W - 2',\n '1335 Frankford Ave',\n '1414 South Penn Square',\n '130 South 19th Street',\n '100 S Independence Mall W (SW Corner of 6th & Market)']" + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[e.get('address') for e in entries_with_address]" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-30T21:35:12.214530Z", + "start_time": 
"2024-01-30T21:35:12.210892Z" + } + }, + "id": "c3dfce37ce039340" + }, + { + "cell_type": "code", + "execution_count": 13, + "outputs": [ + { + "data": { + "text/plain": "['Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Bala Cynwyd',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Ardmore',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 
'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Ambler',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'PHILADELPHIA',\n 
'Philadelphia',\n 'PHILADELPHIA',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'PHILADELPHIA',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Camden',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Bala Cynwyd',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia',\n 'Philadelphia']" + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[e.get('city') for e in entries_with_city]" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-30T21:35:13.226797Z", + "start_time": "2024-01-30T21:35:13.223073Z" + } + }, + "id": "57c967b3885a924f" + }, + { + "cell_type": "code", + "execution_count": 14, + "outputs": [ + { + "data": { + "text/plain": "[19145,\n 19104,\n 19120,\n 19123,\n 19144,\n 19132,\n 19147,\n 19148,\n 19103,\n 19004,\n 10194,\n '19104',\n 19146,\n '19107',\n '19131',\n 19143,\n 19106,\n '19003',\n 19135,\n 19135,\n 19122,\n 19124,\n 19120,\n 19120,\n 19128,\n 19121,\n 19121,\n 19121,\n 19121,\n 19122,\n 19121,\n 19121,\n 19123,\n 19146,\n 19131,\n 19131,\n 19131,\n 19104,\n 19104,\n 19143,\n 19143,\n 19104,\n 19134,\n 19135,\n 19135,\n 19124,\n 19134,\n 19134,\n 19134,\n 19134,\n 19140,\n 19140,\n 19120,\n 19138,\n 19126,\n 19145,\n 19143,\n 19143,\n 19143,\n 19151,\n 19151,\n 19135,\n 19138,\n 19138,\n 19124,\n 19124,\n '19144',\n 19142,\n 19149,\n 19125,\n 19002,\n 19140,\n 19143,\n 19138,\n 19111,\n 19143,\n 19111,\n 19148,\n 19147,\n 19124,\n 19131,\n 19131,\n 19132,\n 19154,\n 19104,\n 19130,\n 19143,\n 19140,\n 19145,\n 19150,\n 19125,\n 19103,\n 19111,\n 19130,\n 19145,\n 19106,\n 19145,\n 19115,\n 19147,\n 19114,\n 
19136,\n 19140,\n 19132,\n 19136,\n 19111,\n 19153,\n 19154,\n 19131,\n 19128,\n 19115,\n 19123,\n 19103,\n 19107,\n 19146,\n 19149,\n 19152,\n 19129,\n 19104,\n 19114,\n 19134,\n 19118,\n 19151,\n 19133,\n 19123,\n 19132,\n 19118,\n 19152,\n 19124,\n 19114,\n 19119,\n 19103,\n 19144,\n 19128,\n 19139,\n 19103,\n 19121,\n 19134,\n 19120,\n 19131,\n 19130,\n 19118,\n 19130,\n 19144,\n 19131,\n 19139,\n 19146,\n 19128,\n 19128,\n 19144,\n 19121,\n 191022148,\n 191032028,\n 19107,\n 191073521,\n 191023604,\n 191072817,\n 191023803,\n 19102,\n 191031628,\n 191023812,\n 191074914,\n 19107,\n 191033711,\n 191033527,\n 191074219,\n 191036102,\n 191075839,\n 191303841,\n 191063513,\n 19130,\n 191472018,\n 19106,\n 19146,\n 19146,\n 19123,\n 191064508,\n 191042807,\n 19104,\n 191046228,\n 191044229,\n 19104,\n 19104,\n 19123,\n 19122,\n 19123,\n 191046139,\n 19104,\n 19122,\n 191482916,\n 19148,\n 191044546,\n 81021410,\n 19148,\n 191405102,\n 191311600,\n 19131,\n 19131,\n 191345599,\n 190042984,\n 191411108,\n 19102,\n 19107,\n 19123,\n 19148,\n 19145,\n 19104,\n 19139,\n 19131,\n 19150,\n 19138,\n 19120,\n 19120,\n 19149,\n 19111,\n 19115,\n 19116,\n 19124,\n 19116,\n 19124,\n 19140,\n 19133,\n 19135,\n 19134,\n 19123,\n 19124,\n 19122,\n 19149,\n 19114,\n 19125,\n 19102,\n 19103,\n 19106]" + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[e.get('zip_code') for e in entries_with_zip_code]" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-30T21:35:13.789679Z", + "start_time": "2024-01-30T21:35:13.786742Z" + } + }, + "id": "ceac350653528952" + }, + { + "cell_type": "code", + "execution_count": 15, + "outputs": [ + { + "data": { + "text/plain": "[39.943787,\n 39.9525,\n 39.950861,\n 39.954101,\n 39.959269,\n 39.952195,\n 39.947466,\n 39.95578,\n 39.953382,\n 39.9622221,\n 39.962293,\n 39.9623835,\n 39.929181,\n 39.948857,\n 39.983632,\n 39.970143,\n 40.037962,\n 39.990727,\n 
39.94005,\n 39.928485,\n 39.947317,\n 40.001739,\n 39.947333,\n 39.96225,\n 39.944297,\n 39.902329176510136,\n 39.91981266450526,\n 39.91885839345153,\n 39.917115817628755,\n 39.91563704822793,\n 39.91596279208233,\n 39.923320372467465,\n 39.92411526481998,\n 39.92254648240829,\n 39.95362880647623,\n 39.97776846261271,\n 40.03993018,\n 39.96164004,\n 39.95820598,\n 39.95813417,\n 39.97380358,\n 39.96042156,\n 39.93146051,\n 39.95414737,\n 39.93124389,\n 40.01449636,\n 39.986008408122665,\n 39.97676301,\n 39.98726035,\n 40.04955653,\n 40.05905883,\n 40.08059935,\n 39.953902,\n 40.007543,\n 39.992697,\n 39.967240055434665,\n 39.9804902975695,\n 39.97919520469946,\n 39.971695729140244,\n 39.96960035510774,\n 40.027686,\n 40.02779,\n 39.973224,\n 40.023873,\n 40.021384,\n 40.021378,\n 40.034027,\n 39.987278,\n 39.987266,\n 39.993714,\n 39.993642,\n 39.986026,\n 39.98906,\n 39.989247,\n 39.966136,\n 39.937967,\n 39.967301,\n 39.967155,\n 39.96713,\n 39.963893,\n 39.963921,\n 39.934521,\n 39.934661,\n 39.96179,\n 39.984056,\n 40.026326,\n 40.0262,\n 40.014491,\n 39.986371,\n 39.986428,\n 40.000414,\n 40.000202,\n 40.000549,\n 40.00032,\n 40.042737,\n 40.049571,\n 40.050461,\n 39.925553,\n 39.942749,\n 39.948831,\n 39.948662,\n 39.974332,\n 39.981794,\n 40.018475,\n 40.041747,\n 40.041628,\n 40.014857,\n 40.014777,\n 40.024242,\n 39.915266,\n 40.029044,\n 39.978564,\n 40.156234,\n 40.0078168,\n 39.9421216,\n 40.0534563,\n 40.0765476,\n 39.931814,\n 40.0686077,\n 39.9215883,\n 39.93379,\n 40.0119263,\n 39.9786489,\n 39.9709672,\n 39.9934968,\n 40.0798456,\n 39.9734259,\n 39.9657416,\n 39.9519958,\n 39.9997027,\n 39.9291811,\n 40.0816801,\n 39.9718684,\n 39.9476121,\n 40.0728519,\n 39.9686968,\n 39.9067035,\n 39.955527,\n 39.9229622,\n 40.0881324,\n 39.9308637,\n 40.0560014,\n 40.0368799,\n 40.0174374,\n 40.0020045,\n 40.048362,\n 40.0562433,\n 39.891587,\n 40.1099437,\n 39.9810775,\n 40.0290751,\n 40.1009083,\n 39.965805,\n 39.9579313,\n 39.945175,\n 39.9420729,\n 
40.0383522,\n 40.060815,\n 40.0184144,\n 39.9672126,\n 40.0681671,\n 39.9831361,\n 40.0881211,\n 39.9743255,\n 39.9893396,\n 39.962725,\n 40.002163,\n 40.0700182,\n 40.0621995,\n 40.0124465,\n 40.042546,\n 40.057919,\n 39.949994,\n 40.0472231,\n 40.0557229,\n 39.9679903,\n 39.956809,\n 39.9817254,\n 39.982155,\n 40.0380778,\n 39.982961,\n 39.9644879,\n 40.0745988,\n 39.9660769,\n 40.0452759,\n 39.9816024,\n 39.968379,\n 39.9355369,\n 40.0796772,\n 40.021622,\n 40.0325662,\n 39.9824902,\n 39.95252,\n 39.954728,\n 39.954677,\n 39.950825,\n 39.949641,\n 39.95224,\n 39.949131,\n 39.954412,\n 39.953417,\n 39.947776,\n 39.949026,\n 39.950265,\n 39.951915,\n 39.953446,\n 39.950214,\n 39.947885,\n 39.945649,\n 39.960928,\n 39.947861,\n 39.961705,\n 39.942521,\n 39.952661,\n 39.945427,\n 39.938096,\n 39.959975,\n 39.952086,\n 39.954823,\n 39.957007,\n 39.953065,\n 39.950432,\n 39.947649,\n 39.953154,\n 39.966445,\n 39.977533,\n 39.968983,\n 39.952479,\n 39.95394,\n 39.979406,\n 39.922503,\n 39.924189,\n 39.946772,\n 39.948118,\n 39.921477,\n 40.004543,\n 40.006494,\n 40.002691,\n 39.996388,\n 39.991031,\n 40.005212,\n 40.038155,\n 39.952384,\n 39.946958,\n 39.9602,\n 39.9132,\n 39.9237,\n 39.9737,\n 39.9548,\n 39.9829,\n 40.0673,\n 40.0581,\n 40.0328,\n 40.0328,\n 40.0327,\n 40.0672,\n 40.0875,\n 40.1212,\n 40.0109,\n 40.1122,\n 40.0201,\n 40.0084,\n 39.9998,\n 40.02,\n 39.9977,\n 39.9618,\n 40.0286,\n 39.9805,\n 40.0374,\n 40.0479,\n 39.971243,\n 39.951652,\n 39.950769,\n 39.950603]" + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[e.get('lat') for e in entries_with_lat]" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-30T21:35:14.446611Z", + "start_time": "2024-01-30T21:35:14.439106Z" + } + }, + "id": "4d3ecee6ea9b07a3" + }, + { + "cell_type": "code", + "execution_count": 16, + "outputs": [ + { + "data": { + "text/plain": "[-75.159048,\n -75.158056,\n -75.165866,\n 
-75.166818,\n -75.170716,\n -75.180653,\n -75.183711,\n -75.181968,\n -75.195194,\n -75.1643174,\n -75.1641745,\n -75.1635925,\n -75.169272,\n -75.208927,\n -75.167033,\n -75.14186,\n -75.173098,\n -75.179025,\n -75.150955,\n -75.165337,\n -75.168257,\n -75.239045,\n -75.198113,\n -75.21102,\n -75.165706,\n -75.1485013961792,\n -75.143228305418,\n -75.14283707270079,\n -75.14127848082069,\n -75.13942835053376,\n -75.13871427120705,\n -75.14519768090526,\n -75.14611676864189,\n -75.14690828504962,\n -75.15868786689634,\n -75.22195019461691,\n -75.22466851,\n -75.20043271,\n -75.20749114,\n -75.24818842,\n -75.25908428,\n -75.143069,\n -75.1623555,\n -75.23283566,\n -75.18872768,\n -75.07590834,\n -75.20183763523539,\n -75.21274045,\n -75.17470372,\n -75.16003691,\n -74.99839391,\n -74.98926787,\n -75.142927,\n -75.290701,\n -75.194197,\n -75.18336724332909,\n -75.19882121259286,\n -75.19633446198884,\n -75.19019653264229,\n -75.18733461599174,\n -75.049327,\n -75.049119,\n -75.145755,\n -75.080084,\n -75.121258,\n -75.121088,\n -75.228731,\n -75.174874,\n -75.174685,\n -75.167717,\n -75.16774,\n -75.151924,\n -75.187629,\n -75.187542,\n -75.1513,\n -75.174387,\n -75.233976,\n -75.234063,\n -75.233895,\n -75.191304,\n -75.191031,\n -75.228125,\n -75.227914,\n -75.210291,\n -75.113111,\n -75.053644,\n -75.053544,\n -75.075894,\n -75.101233,\n -75.101149,\n -75.121036,\n -75.121078,\n -75.138055,\n -75.137815,\n -75.11722,\n -75.160024,\n -75.146019,\n -75.187203,\n -75.217173,\n -75.234601,\n -75.234641,\n -75.256512,\n -75.24097,\n -75.054485,\n -75.156342,\n -75.156336,\n -75.105145,\n -75.10528,\n -75.165956,\n -75.232329,\n -75.061534,\n -75.134883,\n -75.134883,\n -75.1618759,\n -75.2096958,\n -75.165298,\n -75.0854155,\n -75.211663,\n -75.0886693,\n -75.1503318,\n -75.159799,\n -75.0730715,\n -75.2136967,\n -75.2364409,\n -75.1677678,\n -74.9738251,\n -75.1989143,\n -75.1674813,\n -75.2506083,\n -75.1305746,\n -75.1692716,\n -75.1752912,\n -75.1280311,\n 
-75.1797949,\n -75.0801766,\n -75.1668711,\n -75.1738724,\n -75.15018,\n -75.1728246,\n -75.0220411,\n -75.1479365,\n -75.0098985,\n -75.0261585,\n -75.1438209,\n -75.1758621,\n -75.0245657,\n -75.0757494,\n -75.2571052,\n -74.9656197,\n -75.2130798,\n -75.211313,\n -75.0341217,\n -75.141906,\n -75.169182,\n -75.160115,\n -75.1708111,\n -75.0702814,\n -75.0268042,\n -75.186208,\n -75.1953204,\n -74.9916794,\n -75.0997377,\n -75.221963,\n -75.2564537,\n -75.1378648,\n -75.143572,\n -75.165745,\n -75.2059126,\n -75.0509334,\n -75.096592,\n -74.9909842,\n -75.181221,\n -75.1706837,\n -75.1808554,\n -75.2521911,\n -75.2337491,\n -75.1691723,\n -75.1956208,\n -75.1071604,\n -75.1136721,\n -75.231605,\n -75.1765844,\n -75.2012574,\n -75.1831974,\n -75.1684012,\n -75.2238642,\n -75.2228284,\n -75.183212,\n -75.236996,\n -75.213135,\n -75.1603179,\n -75.1919113,\n -75.166682,\n -75.166651,\n -75.163067,\n -75.161965,\n -75.167423,\n -75.160791,\n -75.164587,\n -75.169217,\n -75.170185,\n -75.166489,\n -75.160648,\n -75.159397,\n -75.171801,\n -75.17305,\n -75.157194,\n -75.171287,\n -75.164787,\n -75.171445,\n -75.154007,\n -75.172957,\n -75.157215,\n -75.148943,\n -75.178659,\n -75.166885,\n -75.148006,\n -75.145146,\n -75.185836,\n -75.191415,\n -75.192219,\n -75.192721,\n -75.193383,\n -75.194788,\n -75.140361,\n -75.158067,\n -75.139116,\n -75.199367,\n -75.200323,\n -75.155468,\n -75.169813,\n -75.146318,\n -75.206819,\n -75.122542,\n -75.146812,\n -75.152396,\n -75.212449,\n -75.222654,\n -75.23442,\n -75.089284,\n -75.239931,\n -75.153823,\n -75.163595,\n -75.163365,\n -75.1388,\n -75.1447,\n -75.1931,\n -75.2121,\n -75.2262,\n -75.2414,\n -75.1787,\n -75.1535,\n -75.1316,\n -75.1316,\n -75.0842,\n -75.0691,\n -75.0414,\n -74.9973,\n -75.0697,\n -75.004,\n -75.1112,\n -75.1315,\n -75.1478,\n -75.0427,\n -75.1241,\n -75.1362,\n -75.0955,\n -75.137,\n -75.0886,\n -74.9969,\n -75.134243,\n -75.165309,\n -75.172275,\n -75.150761]" + }, + "execution_count": 16, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[e.get('lon') for e in entries_with_lon]" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-30T21:35:15.021976Z", + "start_time": "2024-01-30T21:35:15.017753Z" + } + }, + "id": "436a72d15afc5e24" + }, + { + "cell_type": "code", + "execution_count": 17, + "outputs": [ + { + "data": { + "text/plain": "['Seger Recreation Center',\n 'Jefferson Station',\n 'Staples',\n 'Suburban Station',\n 'Parkway Central Library',\n 'Schuylkill Banks',\n 'Southeast Pollution Control Plant',\n '30th Street Station',\n 'University of Pennsylvania Book Store',\n 'Harrisburg University Philadelphia Location',\n 'Health Center #2',\n 'Health Center #3',\n 'Health Center #5',\n 'Health Center #6',\n 'Health Center #9',\n 'Health Center #12 (Strawberry Mansion)',\n 'Plenty Cafe - Queen Village',\n 'Plenty Cafe - East Passyunk',\n 'Plenty Cafe - Rittenhouse',\n 'Bala Cynwyd Library',\n \"Medical Examiner's Office\",\n 'Health Center #4',\n 'Health Administration Building',\n 'Southeast Pollution Control Plant',\n \"Lowe's\",\n 'Raymour & Flannigan',\n 'Ikea',\n 'Southeast Pollution Control Plant',\n 'Best Buy',\n 'Acme',\n 'Target',\n \"Marshall's\",\n 'Reading Terminal Market',\n \"Lowe's - West Philly\",\n 'Police 5th District & Fire Ladder 30',\n 'Police 16th District',\n 'Fire Engine 5 / Ladder 6 & L&I West District Office & Fuel Site 225',\n 'Cobbs Creek Park & Rec Center',\n 'Rose Playground',\n 'Fire Administration Building & Fuel Site 039',\n 'South Philadelphia Older Adult Center',\n 'Philadelphia Police 18th District',\n 'Vare Recreation Center',\n 'Gambrell Recreation Center',\n 'PWD Water Station',\n 'Fire Engine 16',\n 'Hank Gathers YAC',\n 'Lonnie Young Recreation Center',\n 'Torresdale Playground',\n 'Picariello Playground',\n 'United By Blue',\n 'JPM Market',\n 'PWD Water Station',\n 'PWD Water Station',\n 'Water Monster',\n 'HOSR Water Monster',\n 'HOSR Water 
Monster',\n 'HOSR Water Monster',\n 'Vogt Recreation Center',\n 'Vogt Recreation Center',\n 'Cruz Recreation Center',\n 'Mc Ilvain Playground',\n 'Feltonville Recreation Center',\n 'Feltonville Recreation Center',\n 'Hillside Recreation Center',\n 'Hank Gathers Youth Access Center',\n 'Hank Gathers Youth Access Center',\n 'C.B. Moore Recreation Center',\n 'C.B. Moore Recreation Center',\n 'Penrose Recreation Center',\n 'Mander Playground',\n 'Mander Playground',\n 'East Poplar Playground',\n 'Chew Playground',\n 'Shepard',\n 'Shepard',\n 'Shepard',\n 'James L. Wright Recreation Center',\n 'James L. Wright Recreation Center',\n 'Francis Myers Recreation Center',\n 'Francis Myers Recreation Center',\n 'Lee Cultural Center',\n 'Cohocksink Recreation Center',\n 'Roosevelt Playground',\n 'Roosevelt Playground',\n 'Gambrel Playground',\n 'Frank Glavin Memorial Playground',\n 'Frank Glavin Memorial Playground',\n 'McVeigh Recreation Center',\n 'McVeigh Recreation Center',\n 'Rivera Recreation Center',\n 'Rivera Recreation Center',\n 'Olney Recreation Center',\n 'Lonnie Young Recreation Center',\n 'Morris Estate Recreation Center',\n 'Smith Playground',\n 'Kingsessing Rec Center',\n 'Christy Park',\n 'Christy Park',\n 'Papa Playground',\n 'Tustin Playground',\n 'American Legion Playground',\n 'Belfield Recreation Center',\n 'Belfield Recreation Center',\n 'Ferko Playground',\n 'Ferko Playground',\n 'Happy Hollow Recreation Center',\n 'James Finnegan Playground',\n 'Lower Mayfair School',\n 'Towey Playground',\n 'Weavers Way Co-op',\n '20th & Tioga Ballfield',\n '48th & Woodland',\n 'Awbury Park & Recreation Center',\n 'Barnes & Loney Park',\n \"Bartram's Garden\",\n 'Burholme Park',\n 'Burke Playground',\n 'Capitolo Playground',\n 'Carmella Playground',\n 'Carousel House Recreation Center',\n 'Carroll Park',\n 'Cecil B. 
Moore Recreation Center',\n 'Chalfont Playground',\n 'Clayborn Lewis Community Center Playground',\n 'Clemente Playground',\n 'Cobbs Creek Environmental Education Center',\n 'Collazo Playground',\n 'DiSilvestro Playground',\n 'Dorothy Emanuel Playground & Recreation Center',\n 'Fishtown Recreation Center',\n 'Fitler Square',\n 'Fox Chase Playground',\n 'Francisville Playground',\n 'Franklin D. Roosevelt Park',\n 'Franklin Square',\n 'Guerin Recreation Center',\n 'Hayes Woods',\n 'Herron Playground',\n 'Holme Playground',\n 'Holmesburg Playground',\n 'Hunting Park Recreation Center',\n 'James Allen Shuler Playground',\n 'James Ramp Playground',\n 'Jardel Recreation Center',\n 'John Heinz National Wildlife Refuge at Tinicum',\n 'Junod Playground',\n 'Kelly Pool',\n 'Kendrick Playground & Recreation Center',\n 'Lackman Playground',\n 'Liberty Lands Park',\n 'Logan Square',\n 'Louis Kahn Park',\n 'Marian Anderson Recreation Center',\n 'Max Myers Playground',\n 'McArdle Recreation Center',\n 'McMichael Park',\n 'Miles Mack Playground',\n 'Mitchell Playground',\n 'Monkiewicz Playground',\n 'Morris Arboretum',\n 'Morris Park/Papa Playground',\n 'Nelson Playground',\n 'Northern Liberties Recreation Center',\n 'Panati Playground',\n 'Pastorius Park',\n 'Pelbano Playground',\n 'Piccoli Playground',\n 'Pleasant Hill Park',\n 'Pleasant Playground',\n 'Rittenhouse Square',\n 'Rumph Playground',\n 'Schuylkill Center for Environmental Education',\n 'Shepard Recreation Center',\n 'Sister Cities Park',\n 'Smith Memorial Playground',\n 'Stokely Playground',\n 'Tacony Creek Park',\n 'Triangle Park',\n 'Von Colln Field',\n 'Water Tower Recreation Center',\n 'Water Works',\n 'Waterview Recreation Center',\n 'West Fairmount Park-Parkside Evans Playground',\n 'West Mill Creek Park',\n 'Wharton Square Playground',\n 'Wissahickon Environmental Center',\n 'Wissahickon Neighbors Park',\n 'Wister Playground & Recreation Center',\n 'The Discovery Center',\n 'Starbucks',\n 'Starbucks',\n 
'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Starbucks',\n 'Philly Water Bar',\n 'William Way LGBT Community Center',\n 'La Colombe',\n 'La Colombe',\n 'La Colombe',\n 'La Colombe']" + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[e.get('organization') for e in entries_with_organization]" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-30T21:35:15.492581Z", + "start_time": "2024-01-30T21:35:15.489767Z" + } + }, + "id": "e194bdbaf5c174c4" + }, + { + "cell_type": "code", + "execution_count": 18, + "outputs": [ + { + "data": { + "text/plain": "['ChIJocPgsybGxokR3QyYHHmQ118',\n 'ChIJSVdgvSnGxokRWJqLCbC4xzQ',\n 'ChIJCSet3i_GxokRecQeyoGrAl0',\n 'ChIJ9Sdt-zHGxokRad5acsk-ifo',\n 'ChIJ_x-wODPGxokREDE-Lq4X9dE',\n 'ChIJ1QhLoknGxokRY1BxIaMBmEY',\n 'ChIJK-2pcVrPxokRsUB6CwKx3lk',\n 'ChIJKxHG307GxokRP23dRyxLy64',\n 'ChIJP1z1qVDGxokRvUMnTTzBxdM',\n 'ChIJcZR_YNLHxokRDNZxCexFgYk',\n 'ChIJ87U4TdLHxokRoghtMn8iA2o',\n 'ChIJ87U4TdLHxokRoghtMn8iA2o',\n 'ChIJvw4u5v7JxokRVlUweG4Qhx0',\n 'ChIJubVbRAXGxokR6Cu3BPxfIw8',\n 'ChIJVSBvHTvGxokRz0C0_VaUxyM',\n 'ChIJB0P3_1DHxokRepHAZTGyjfY',\n 'ChIJmSU991jGxokRGMSyRlN5gLA',\n 'ChIJVaMsViPGxokRoYdepfdi_5U',\n 'ChIJK-2pcVrPxokRsUB6CwKx3lk',\n 'ChIJO95SFrLIxokRv38358hdaaI',\n 'ChIJ3x7cPLLIxokRERcNvsd1j9I',\n 
'ChIJXatc2LLIxokRh-hZFBX8EWM',\n 'ChIJK-2pcVrPxokRsUB6CwKx3lk',\n 'ChIJY74fK7PIxokRfcHUUcHx5UE',\n 'ChIJY_vkQq7IxokRYLyz37r5W8A',\n 'ChIJiVZ4I6nIxokR2uGzPMAI5uA',\n 'ChIJuWWRaDfGxokR0mYy349nqGU',\n 'ChIJCQH7WCnGxokRAWsd3AfQj80',\n 'ChIJ3dwlnhfHxokRGAtTKSmcm5Q',\n 'ChIJ07J1fue4xokRQbv_61o6Yhk',\n 'ChIJh7WdcVXGxokRaAJQjqmiF7A',\n 'ChIJb0YntDDIxokRLOvAezkHf9Y',\n 'ChIJrUwqs7HGxokRG3GEi-C6Jms',\n 'ChIJqwjIidjAxokR4ymb_nk7JZs',\n 'ChIJyatX0HHIxokRl_gNAAccv3Q',\n 'ChIJl3sICxvGxokRsA1OIwCa5RI',\n 'ChIJgz5eOMTGxokR97VFNDNMBI0',\n 'ChIJeasNC3DGxokRq1D_oG_7bds',\n 'ChIJ8ehFgm62xokRxOkgxOE_sgE',\n 'ChIJueACQ_XHxokRNiv0wU_1Vuw',\n 'ChIJvy2Y4Q_HxokRtQIR72pqQb8',\n 'ChIJr-KwXezHxokRrui-1rWdlW8',\n 'ChIJCZry69m5xokRMDcC8ROr7uM',\n 'ChIJQ61RD2KzxokRYbNDLXbw_A8',\n 'ChIJ9ZuWPD6zxokR40dYUCHFgDA',\n 'ChIJ7ROCztTHxokR7CTttFgnbGk',\n 'ChIJ79mgRobAxokRP6EPssmDe-o',\n 'ChIJueACQ_XHxokRNiv0wU_1Vuw',\n 'ChIJueACQ_XHxokRNiv0wU_1Vuw',\n 'ChIJDSjGFinGxokRhrsVnoGZyJE',\n 'ChIJRYODDi60xokRydntYdJh5cY',\n 'ChIJRYODDi60xokRydntYdJh5cY',\n 'ChIJzZeQRnLIxokR19wsCerVRUU',\n 'ChIJb0Q-tWa2xokRpovKFQtVyr4',\n 'ChIJdVSEqL23xokR260B2MiD9Hk',\n 'ChIJdVSEqL23xokR260B2MiD9Hk',\n 'ChIJkUXfvsO4xokRBF2rpPNwdBY',\n 'ChIJr-KwXezHxokRrui-1rWdlW8',\n 'ChIJr-KwXezHxokRrui-1rWdlW8',\n 'ChIJ0RPa6_DHxokRyFRHGF2Pw5s',\n 'ChIJ0RPa6_DHxokRyFRHGF2Pw5s',\n 'ChIJEbYeyhfIxokRGddU7NdSGlI',\n 'ChIJ5evvQRbIxokRR70LHaWHwRA',\n 'ChIJ5evvQRbIxokRR70LHaWHwRA',\n 'ChIJadAfr3nIxokRjWz3KA0rOOQ',\n 'ChIJkWQsshXGxokRQb2G7KoBAZ0',\n 'ChIJj2-QQNjGxokRxlWAjmAwhH0',\n 'ChIJj2-QQNjGxokRxlWAjmAwhH0',\n 'ChIJj2-QQNjGxokRxlWAjmAwhH0',\n 'ChIJndp0P63HxokRYmHjHG0LTyw',\n 'ChIJndp0P63HxokRYmHjHG0LTyw',\n 'ChIJO636q6PGxokRpc1-GXJ4nQY',\n 'ChIJO636q6PGxokRpc1-GXJ4nQY',\n 'ChIJOyPVVfzGxokRhgvu59GGImA',\n 'ChIJhbsm3i3IxokRIb6WNt-wQZo',\n 'ChIJKwmVtiy0xokRr6p1FP_wuGc',\n 'ChIJKwmVtiy0xokRr6p1FP_wuGc',\n 'ChIJ8ehFgm62xokRxOkgxOE_sgE',\n 'ChIJrczer9bJxokRz2XfSTZPK3I',\n 'ChIJrczer9bJxokRz2XfSTZPK3I',\n 'ChIJI6lgH9q3xokRopUlvJGgPfA',\n 
'ChIJI6lgH9q3xokRopUlvJGgPfA',\n 'ChIJ7xCBv-O3xokRMYtgBmFLUdY',\n 'ChIJ7xCBv-O3xokRMYtgBmFLUdY',\n 'ChIJizZgdQW3xokRrZfEC6AneRw',\n 'ChIJCZry69m5xokRMDcC8ROr7uM',\n 'ChIJbxr9yGW3xokRv1a2S0bW_6c',\n 'ChIJq3lfInfGxokR6xTloExtwhs',\n 'ChIJD-IMCZTGxokRIKnfLuXkuhY',\n 'ChIJGTVji8fGxokRi9toRxEB7U8',\n 'ChIJGTVji8fGxokRi9toRxEB7U8',\n 'ChIJk_-I8NbAxokR4xq-tUpiZi4',\n 'ChIJAR2mqzHHxokR-zTRHr575NY',\n 'ChIJp1FSsda1xokRLCB8RgsddR8',\n 'ChIJwxJIddW5xokR1hXDOHwYDe0',\n 'ChIJwxJIddW5xokR1hXDOHwYDe0',\n 'ChIJjduF_Uq2xokRdUUBNByK16o',\n 'ChIJjduF_Uq2xokRdUUBNByK16o',\n 'ChIJVSg4tDy4xokRtnxJExNkPSo',\n 'ChIJry0GnRHExokRm0fEkPSfrXQ',\n 'ChIJrzKbbiS0xokRs4eYr835Nj8',\n 'ChIJdfbbWBTIxokRIZZfzVI6ubU',\n 'ChIJdVcm9jm7xokR1tSUC2UyVOU',\n 'ChIJF2izQn3HxokR6CFYWCnUHec',\n 'ChIJt13Iwe3GxokRMqEAgqDn_M4',\n 'ChIJ-0wDqtu5xokRikDWwTTD8XI',\n 'ChIJyST8CTrGxokRXPobOd5Irz4',\n 'ChIJmTX_74TGxokRtJ5tELv_3ms',\n 'ChIJa8W67cy2xokROgXq8Zpx3cM',\n 'ChIJGSFMv6vIxokRDIGvvoScCb8',\n 'ChIJHzbqTBnGxokReQFH06YUKi0',\n 'ChIJ5edQ_G22xokRkOtMD3TP7xM',\n 'ChIJ4dyz_C3GxokRdVk8QsPV6I8',\n 'ChIJkX7BUy_HxokRWFOqn_ym0RY',\n 'ChIJ01Z04OnHxokRbcbcGSBaWgs',\n 'ChIJAZ8dVTGzxokRP1vYaSnTBsM',\n 'ChIJ6_nQKGTIxokRH6j6gv5DK9A',\n 'ChIJMYxzDM7HxokRtwdT6XvnskA',\n 'ChIJmz-wIqXGxokRL4-v5ARdZk8',\n 'ChIJ5_ECBd63xokRpEo1lYE45x8',\n 'ChIJEwQxPQ7GxokRm8LBicqCFE0',\n 'ChIJrW5f0R-6xokR-aG4EKKqLu0',\n 'ChIJ759TF0HIxokRebK_P9XwLJo',\n 'ChIJIQ0RF0fGxokRRBPTCYK7s-o',\n 'ChIJdR2Zz0uxxokRIL9bQyOo1a8',\n 'ChIJj7cWws_HxokRs87mBqMwm0k',\n 'ChIJie9v2DbBxokRHylCIW7ceSA',\n 'ChIJ1U-L54DIxokRcoW6JtzjcDM',\n 'ChIJG4ACxgvGxokRim_o1O9KtmE',\n 'ChIJs0QN7M2zxokRkZ_aS6ouuQ8',\n 'ChIJmwSdmqbIxokRguhagWvwpvk',\n 'ChIJv8sZk4C0xokRKATinleodZg',\n 'ChIJt_G9VEO0xokR2oPLc6IZ6PI',\n 'ChIJr9-gOe23xokRbPM2mBanERo',\n 'ChIJx2CWewq4xokRKioxVfK7FYQ',\n 'ChIJ012lomW0xokRie5Ya3JbocA',\n 'ChIJ38ov8Lq2xokR28IHoFLiILQ',\n 'ChIJ2SMy_xLDxokR8mn3ZUlRNEI',\n 'ChIJG5ZhgdKyxokRsk7N2OIGBjY',\n 'ChIJ__-PuQ7HxokRH7xg8gWDVdg',\n 'ChIJBb3JfPK4xokRajvM6KUr75c',\n 
'ChIJU9aH47SzxokR8ZNb3vfj-So',\n 'ChIJpWVFSGXIxokRAMCRkCDLIXE',\n 'ChIJlTy8vzPGxokRRqwERk5mYOw',\n 'ChIJC-YRICTGxokRzSqdkj3Y6JI',\n 'ChIJ310X2j3GxokRQ37R5YqiMtA',\n 'ChIJu0ZX2Zq2xokRYcsxjXtgkec',\n 'ChIJBSko8YizxokRKUuC7pL7UKQ',\n 'ChIJAf0WTG-4xokR3augPONwaTk',\n 'ChIJVWC-CqzHxokRanm3lTp6amY',\n 'ChIJGXYzwEKzxokR6I1YEHhBrAI',\n 'ChIJW4FSjtHJxokRygXo7ONtLDw',\n 'ChIJE0IUmt-7xokRQi32DdBL4kk',\n 'ChIJk_-I8NbAxokR4xq-tUpiZi4',\n 'ChIJy4iUzBvIxokR4w-uDXtxLLg',\n 'ChIJgy4NT3vIxokRPB3oQzXwe04',\n 'ChIJ64olvGS5xokRihKhLENJU3s',\n 'ChIJQxf7F6K5xokRdOcuY4_fkbs',\n 'ChIJh1f3rwe0xokRqPEtNECS2p0',\n 'ChIJS2p_Pki2xokRzeQ2mv_ONYI',\n 'ChIJ1wCpc5a0xokRSNN2kgHmBC8',\n 'ChIJyaSOyb65xokRFsgIDhMRQyE',\n 'ChIJIQGrHTrGxokRofc6nBBJrRU',\n 'ChIJo7ZdZra5xokR4fiYyMey6Wo',\n 'ChIJY7BmfzK5xokRo5s5Op2gKCo',\n 'ChIJj2-QQNjGxokRxlWAjmAwhH0',\n 'ChIJK6SVAzPGxokRaNqldb6kA9I',\n 'ChIJr6puOL3HxokRg8MLy4Dz_x0',\n 'ChIJeYX2q9LJxokRQI8kagDo9I0',\n 'ChIJ91aSfKq3xokRGK8HYl58E_g',\n 'ChIJ0-xS6TnHxokRE39aZqmQF7Y',\n 'ChIJqb_DeebHxokRHF7lLiNqGW8',\n 'ChIJsUiw_3e5xokR5lI3EGSAVMU',\n 'ChIJUWi_rLbHxokRjLQnQDmpamw',\n 'ChIJhZmy1s25xokRIV27W0vTLCs',\n 'ChIJu4zGeD7HxokRETghEz74Ups',\n 'ChIJNUE0H2a_xokRXHH32GjaZ7c',\n 'ChIJ0Q_lGGzGxokRi4pR3nar2l4',\n 'ChIJ_3fw01u5xokRV472QWVFLDE',\n 'ChIJvZkpBpK4xokR90Nk45WjYec',\n 'ChIJefbr0S64xokRumI2TKUVDAE',\n 'ChIJ1ZcLhaLHxokReC6w0L6UbII',\n 'ChIJXU3hLy7GxokR7N8tEJYJMrs',\n 'ChIJWZU5AjLGxokR2k1bCiIvS1A',\n 'ChIJ8aKQiuLHxokRAVbsZY4j-X4',\n 'ChIJb6KLPS_GxokReHhu-PQijZo',\n 'ChIJXSiRoDrGxokR9t_tWa_CTbI',\n 'ChIJJcgOqyjGxokRxAv9qOOqbas',\n 'ChIJ8-z8YCXGxokRy3gUqFOdm-s',\n 'ChIJ94L3jzHGxokRGT8L30stCKg',\n 'ChIJg-WoGjHGxokRrPlFhnzUY8o',\n 'ChIJQ8PfzTrGxokRLJUthdiLgMY',\n 'ChIJT6ZAVC_GxokRP5u4fZOWI0E',\n 'ChIJYXzkWSjGxokRpgtXrrzPOY4',\n 'ChIJb6KLPS_GxokReHhu-PQijZo',\n 'ChIJ8VJoMjHGxokRJRQGfU3J4Ok',\n 'ChIJYXzkWSjGxokRpgtXrrzPOY4',\n 'ChIJXSiRoDrGxokRzhzf0p4Zg2o',\n 'ChIJ8-z8YCXGxokRy3gUqFOdm-s',\n 'ChIJ7Xb5xszHxokRU4t_ryJJc1E',\n 'ChIJ8-z8YCXGxokR-xoVhEzK2M4',\n 
'ChIJ06GAQsvHxokRB1dn0qAcfCs',\n 'ChIJ5_oO2SDGxokRLX-XOBPgS8s',\n 'ChIJoxVzqiXJxokRMrG6Ent-qvU',\n 'ChIJ_w-4ukDGxokRDkpQkydNkag',\n 'ChIJVbE8LaXHxokRy9Q-wBpnpic',\n 'ChIJ6U74wSzJxokRkWw2Th0EoOQ',\n 'ChIJlVCNeYbIxokRBeF0EsSs2Bw',\n 'ChIJY5U-407GxokR16MrVB5fRog',\n 'ChIJRajzkFHGxokRztc9D6M_POU',\n 'ChIJr_Ief1DGxokRUUIs2M-ZNAo',\n 'ChIJjfGor1vGxokRmPw1Olmr0zA',\n 'ChIJQ5CCIksVkFQRzMX643Gvwps',\n 'ChIJ_bb7S8DHxokRLV5TrAYARSE',\n 'ChIJ3zOdcojJxokRNcCxdwFMrpI',\n 'ChIJ7d7aLAbHxokRt_IZD79XiVI',\n 'ChIJV4Cl5G7IxokRLzXtXvKl4GM',\n 'ChIJhzswR1jGxokRMKqmLJQjDBk',\n 'ChIJqSS691fGxokRKcXzFtvXsAo',\n 'ChIJG1k9I-DHxokRDGMsK-nBl70',\n 'ChIJ_SVXBAnGxokRH5tbYX2EWtE',\n 'ChIJCewgAq_IxokR3wtGo1Ksssc',\n 'ChIJTfOdffTGxokRNsU8e95e3w8',\n 'ChIJEzabuvDIxokR7WKuJjUrUtQ',\n 'ChIJodOfQ67IxokRDl_I9qcnpEk',\n 'ChIJx1BLwvi3xokRAiOWeJ_8al0',\n 'ChIJNbXqsYK4xokRmJ0-r_C7y-E',\n 'ChIJC-5FzmDHxokRrWV3KtPq6rM',\n 'ChIJMyhYCUXHxokRDTjikF_Ls3E',\n 'ChIJUdxGxyC2xokRwKdL-burv4U',\n 'ChIJ8d65xFbHxokRRC5uZH0ixd4',\n 'ChIJOfABI4C3xokRdFacV3mEqKs',\n 'ChIJDSjGFinGxokRLKv6W0li864',\n 'ChIJt1jgACXGxokRzUjccKj9Tv4',\n 'ChIJXTdKgszHxokRge2AheHDKio',\n 'ChIJ-Tx6ikKay4kRM3QCT-WRLgY',\n 'ChIJp04LKAXHxokRRXiXrf5Ep9s',\n 'ChIJ-6w55RbHxokRBnuGweFgyUs',\n 'ChIJ3ZdU2svGxokRaU8LY9zEfAE',\n 'ChIJcXuBRZO5xokR__OL0Qzss0o',\n 'ChIJo4CZGGC3xokRRdCHDW4B8Uo',\n 'ChIJHetR_oe2xokRVzW_JyvgYTk',\n 'ChIJb0XPY1OxxokRp0dH2nqmVGg',\n 'ChIJIbfxKYOzxokRBD6PXrqLcms',\n 'ChIJRV8ayXJoo4kR9y4B322HuyY',\n 'ChIJKUlBLXK2xokRjnmLHCsEzEI',\n 'ChIJFdJUggayxokRpXXgT1cIJyU',\n 'EiVHZXJtYW50b3duIEF2ZSwgUGhpbGFkZWxwaGlhLCBQQSwgVVNBIi4qLAoUChIJt1ADr0O4xokRpZgXCsIcYQQSFAoSCestArAAyMaJEc2larvr74Ri',\n 'ChIJ54OTQ6lMwYkRdXHWATzaHTY',\n 'ChIJ5TRT-ivIxokRpx3t0cMO4mA',\n 'ChIJ-z6t4by2xokRfJd27hDko0s',\n 'ChIJh7m16Oq2xokR815s72XwTZk',\n 'ChIJpaZXS8y1xokRFi9TMwYNA8M',\n 'ChIJWXerz2vIxokRkue1--XSK5M',\n 'ChIJL4OUUjfGxokRLgek7FLfoko',\n 'ChIJL4OUUjfGxokRWbdDe02buok',\n 'ChIJwePkqIPIxokRGVmYB9X6zjw']" + }, + "execution_count": 18, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "[e.get('gp_id') for e in entries_with_gp_id]" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-30T21:35:15.969740Z", + "start_time": "2024-01-30T21:35:15.967058Z" + } + }, + "id": "a17a2653d486bf32" + }, + { + "cell_type": "markdown", + "source": [ + "### Some observations from this analysis\n", + "\n", + "- All of the resources seem to have valid latitude and longitude entries (at least from a data-format perspective)\n", + "- Zip codes have some invalid entries, such as malformed numbers (e.g. 9-digit ZIP+4 values stored without a hyphen like 191022148, or values like 81021410) or empty strings, and they are stored as a mix of numbers and strings (data inconsistency)\n", + "- Most of the resources have Google Places IDs, which could be helpful with data sanitization\n", + "- Sometimes a resource will have an address, and sometimes it will have an organization\n", + "- Some addresses are not real street addresses, but descriptions of approximate locations relative to nearby streets (e.g. \"Spring Garden St SS 20ft E/O Front St F/E - 1\")\n", + "\n", + "# Coming up with a schema\n", + "\n", + "This section is a work in progress (WIP). After analyzing the data, we can come up with an official schema and then validate each resource against it."
+ ], + "metadata": { + "collapsed": false + }, + "id": "c804367ce188947d" + }, + { + "cell_type": "code", + "execution_count": 19, + "outputs": [ + { + "data": { + "text/plain": "{'hours': ['unknown field']}" + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from cerberus import Validator\n", + "\n", + "schema = {  # Draft (WIP) Cerberus schema used to validate a single resource entry\n", + " 'access': {'type': 'string', 'allowed': ['Restricted', 'Public']},\n", + " 'address': {'type': 'string'},\n", + " 'city': {'type': 'string', 'allowed': ['Philadelphia']},\n", + " 'zip_code': {'type': 'string'}, # Needs more validation: the data has empty strings, odd numbers, and non-string (numeric) values\n", + " 'description': {'type': 'string'},\n", + " 'filtration': {'type': 'string', 'allowed': ['Unsure', 'No']},\n", + " 'gp_id': {'type': 'string'},\n", + " 'handicap': {'type': 'string', 'allowed': ['Unsure']},\n", + " 'images': {'type': 'list', 'schema': {'type': 'string'}},\n", + " 'lat': {'type': 'float'},\n", + " 'lon': {'type': 'float'},\n", + " 'norms_rules': {'type': 'string'},\n", + " 'organization': {'type': 'string'},\n", + " 'permanently_closed': {'type': 'boolean'},\n", + " 'phone': {'type': 'string'}, # Add stricter validation here\n", + " 'quality': {'type': 'string'}, # Add stricter validation here\n", + " 'service': {'type': 'string', 'allowed': ['Self-serve']},\n", + " 'statement': {'type': 'string'},\n", + " 'status': {'type': 'string', 'allowed': ['OPERATIONAL']},\n", + " 'tap_type': {'type': 'string', 'allowed': ['Drinking fountain and water dispenser', 'Drinking Fountain']},\n", + " 'tapnum': {'type': 'number'},\n", + " 'vessel': {'type': 'string', 'allowed': ['No']},\n", + "}  # TODO: no rule for 'hours' yet; all_entries[5] fails with 'unknown field' for it (see cell output)\n", + "\n", + "v = Validator(schema, require_all=True)  # require_all=True: every key listed in the schema is mandatory\n", + "v.validate(all_entries[5])  # Spot-check one entry; all_entries is expected to come from an earlier cell\n", + "v.errors  # Last expression, so the error dict is displayed as the cell output" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-30T21:36:34.740583Z", + "start_time": "2024-01-30T21:36:34.704860Z" + } + }, + "id": "9fd5b814c9d7d62e" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs":
[], + "source": [], + "metadata": { + "collapsed": false + }, + "id": "a7d16d0f45caba51" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}