From 85b25efd05d2f7bfa4a94e7aa51b384398bc9e29 Mon Sep 17 00:00:00 2001 From: Carlos Gomez Date: Mon, 6 Apr 2020 21:12:30 -0400 Subject: [PATCH 1/3] South Korea dataset module --- .gitignore | 100 ++++++++++++++++++ .../covid/south_korea/__init__.py | 3 + .../covid/south_korea/__main__.py | 25 +++++ .../covid/south_korea/south_korea_patients.py | 55 ++++++++++ 4 files changed, 183 insertions(+) create mode 100644 task_geo/data_sources/covid/south_korea/__init__.py create mode 100644 task_geo/data_sources/covid/south_korea/__main__.py create mode 100644 task_geo/data_sources/covid/south_korea/south_korea_patients.py diff --git a/.gitignore b/.gitignore index 9286729..57df3c1 100644 --- a/.gitignore +++ b/.gitignore @@ -108,3 +108,103 @@ ENV/ notebooks/data/ docs/notebooks + + Created by https://www.gitignore.io/api/pycharm +# Edit at https://www.gitignore.io/?templates=pycharm + +### PyCharm ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. 
+# .idea/modules.xml +# .idea/*.iml +# .idea/modules +# *.iml +# *.ipr + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + +### PyCharm Patch ### +# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 + +# *.iml +# modules.xml +# .idea/misc.xml +# *.ipr + +# Sonarlint plugin +.idea/**/sonarlint/ + +# SonarQube Plugin +.idea/**/sonarIssues.xml + +# Markdown Navigator plugin +.idea/**/markdown-navigator.xml +.idea/**/markdown-navigator/ + +/.idea/.gitignore +/.idea/misc.xml +/.idea/modules.xml +/.idea/inspectionProfiles/profiles_settings.xml +/.idea/rSettings.xml +/.idea/task-geo.iml +/.idea/vcs.xml +# End of https://www.gitignore.io/api/pycharm diff --git a/task_geo/data_sources/covid/south_korea/__init__.py b/task_geo/data_sources/covid/south_korea/__init__.py new file mode 100644 index 0000000..ae1b459 --- /dev/null +++ b/task_geo/data_sources/covid/south_korea/__init__.py @@ -0,0 +1,3 @@ +from task_geo.data_sources.covid.south_korea.south_korea_patients import south_korea_patients + +__all__ = ['south_korea_patients'] diff --git a/task_geo/data_sources/covid/south_korea/__main__.py b/task_geo/data_sources/covid/south_korea/__main__.py new file mode 100644 index 0000000..26918dd --- /dev/null +++ b/task_geo/data_sources/covid/south_korea/__main__.py @@ -0,0 +1,25 @@ +import argparse + +from south_korea_patients import south_korea_patients + + +def get_argparser(): + parser = argparse.ArgumentParser() + + 
parser.add_argument( + '-o', '--output', required=True, + help='Destination file to store the processed dataset.') + + return parser + + +def main(): + parser = get_argparser() + args = parser.parse_args() + + dataset = south_korea_patients() + dataset.to_csv(args.output, index=False, header=True) + + +if __name__ == '__main__': + main() diff --git a/task_geo/data_sources/covid/south_korea/south_korea_patients.py b/task_geo/data_sources/covid/south_korea/south_korea_patients.py new file mode 100644 index 0000000..fc7d714 --- /dev/null +++ b/task_geo/data_sources/covid/south_korea/south_korea_patients.py @@ -0,0 +1,55 @@ +import io + +import pandas as pd +import requests + + +def south_korea_patients_connector(*args, **kwargs): + """Retrieves data from south_korea_patients. + + Arguments: + url(string): Dataset url + Returns: + pandas.DataFrame + """ + csv = requests.get(kwargs['url']).content + return pd.read_csv(io.StringIO(csv.decode('utf-8'))) + + +def south_korea_patients_formatter(df): + """Formats data retrieved from south_korea_patients. + + Arguments: + df(pandas.DataFrame): + + Returns: + pandas.DataFrame + """ + cols_ordered = [ + 'country', 'state', 'province', 'confirmed_date', + 'released_date', 'deceased_date', 'exposure_start', + 'exposure_end', 'global_id', 'birth_year', + 'local_id', 'sex', 'disease', + 'group', 'infection_reason', 'infection_order', + 'infected_by', 'contact_number' + ] + df = df.reindex(columns=cols_ordered) + df['confirmed_date'] = pd.to_datetime(df.confirmed_date) + df['released_date'] = pd.to_datetime(df.released_date) + df['deceased_date'] = pd.to_datetime(df.deceased_date) + df['exposure_start'] = pd.to_datetime(df.exposure_start) + df['exposure_end'] = pd.to_datetime(df.exposure_end) + return df + + +def south_korea_patients(*args, **kwargs): + """Data Source for south_korea_patients. 
+ + Arguments: + url(string): Dataset url + + Returns: + pandas.DataFrame + """ + data = south_korea_patients_connector(*args, **kwargs) + return south_korea_patients_formatter(data) From 4274adb3dbc31a9906e11bc579cd10a031a2495e Mon Sep 17 00:00:00 2001 From: KrSuma Date: Tue, 14 Apr 2020 10:08:31 +0900 Subject: [PATCH 2/3] without audit.md and datapackage.json --- .../covid/south_korea/__init__.py | 4 +-- .../covid/south_korea/__main__.py | 5 ++-- .../{south_korea_patients.py => kr_covid.py} | 28 +++++++++++-------- 3 files changed, 20 insertions(+), 17 deletions(-) rename task_geo/data_sources/covid/south_korea/{south_korea_patients.py => kr_covid.py} (54%) diff --git a/task_geo/data_sources/covid/south_korea/__init__.py b/task_geo/data_sources/covid/south_korea/__init__.py index ae1b459..6c2ae1b 100644 --- a/task_geo/data_sources/covid/south_korea/__init__.py +++ b/task_geo/data_sources/covid/south_korea/__init__.py @@ -1,3 +1,3 @@ -from task_geo.data_sources.covid.south_korea.south_korea_patients import south_korea_patients +from task_geo.data_sources.covid.south_korea.kr_covid import kr_covid -__all__ = ['south_korea_patients'] +__all__ = ['kr_covid'] diff --git a/task_geo/data_sources/covid/south_korea/__main__.py b/task_geo/data_sources/covid/south_korea/__main__.py index 26918dd..42a957c 100644 --- a/task_geo/data_sources/covid/south_korea/__main__.py +++ b/task_geo/data_sources/covid/south_korea/__main__.py @@ -1,7 +1,6 @@ import argparse -from south_korea_patients import south_korea_patients - +from kr_covid import kr_covid def get_argparser(): parser = argparse.ArgumentParser() @@ -17,7 +16,7 @@ def main(): parser = get_argparser() args = parser.parse_args() - dataset = south_korea_patients() + dataset = kr_covid() dataset.to_csv(args.output, index=False, header=True) diff --git a/task_geo/data_sources/covid/south_korea/south_korea_patients.py b/task_geo/data_sources/covid/south_korea/kr_covid.py similarity index 54% rename from 
task_geo/data_sources/covid/south_korea/south_korea_patients.py rename to task_geo/data_sources/covid/south_korea/kr_covid.py index fc7d714..66a8909 100644 --- a/task_geo/data_sources/covid/south_korea/south_korea_patients.py +++ b/task_geo/data_sources/covid/south_korea/kr_covid.py @@ -1,10 +1,9 @@ import io - import pandas as pd import requests -def south_korea_patients_connector(*args, **kwargs): +def kr_covid_connector(): """Retrieves data from south_korea_patients. Arguments: @@ -12,11 +11,12 @@ def south_korea_patients_connector(*args, **kwargs): Returns: pandas.DataFrame """ - csv = requests.get(kwargs['url']).content + url = 'https://raw.githubusercontent.com/KrSuma/COVID19_Kr/master/Datasets/PatientInfo.csv' + csv = requests.get(url).content return pd.read_csv(io.StringIO(csv.decode('utf-8'))) -def south_korea_patients_formatter(df): +def kr_covid_formatter(df): """Formats data retrieved from south_korea_patients. Arguments: @@ -34,15 +34,19 @@ def south_korea_patients_formatter(df): 'infected_by', 'contact_number' ] df = df.reindex(columns=cols_ordered) - df['confirmed_date'] = pd.to_datetime(df.confirmed_date) - df['released_date'] = pd.to_datetime(df.released_date) - df['deceased_date'] = pd.to_datetime(df.deceased_date) - df['exposure_start'] = pd.to_datetime(df.exposure_start) - df['exposure_end'] = pd.to_datetime(df.exposure_end) + date_columns = ['confirmed_date', 'released_date', 'deceased_date', 'exposure_start', + 'exposure_end'] + df[date_columns] = df[date_columns].apply(pd.to_datetime) + + # df['confirmed_date'] = pd.to_datetime(df.confirmed_date) + # df['released_date'] = pd.to_datetime(df.released_date) + # df['deceased_date'] = pd.to_datetime(df.deceased_date) + # df['exposure_start'] = pd.to_datetime(df.exposure_start) + # df['exposure_end'] = pd.to_datetime(df.exposure_end) return df -def south_korea_patients(*args, **kwargs): +def kr_covid(): """Data Source for south_korea_patients.
Arguments: @@ -51,5 +55,5 @@ def south_korea_patients(*args, **kwargs): Returns: pandas.DataFrame """ - data = south_korea_patients_connector(*args, **kwargs) - return south_korea_patients_formatter(data) + data = kr_covid_connector() + return kr_covid_formatter(data) From cdc16defaa66e20b7d2994fa987a46cb87edf969 Mon Sep 17 00:00:00 2001 From: KrSuma Date: Tue, 14 Apr 2020 12:48:50 +0900 Subject: [PATCH 3/3] added audit(unfinished), datapackage left --- task_geo/data_sources/covid/south_korea/audit.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 task_geo/data_sources/covid/south_korea/audit.md diff --git a/task_geo/data_sources/covid/south_korea/audit.md b/task_geo/data_sources/covid/south_korea/audit.md new file mode 100644 index 0000000..e69de29