From e05e404e58f3696aa4bfa19ba739db51743e8c09 Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Sun, 23 Aug 2020 13:33:14 -0700 Subject: [PATCH 01/35] Add pandas wrapper function for time series data frame. --- datacommons/__init__.py | 2 +- datacommons/examples/stat_vars.py | 9 ++++ datacommons/stat_vars.py | 55 +++++++++++++++++++++--- datacommons/test/stat_vars_test.py | 69 ++++++++++++++++++++++++++++++ 4 files changed, 128 insertions(+), 7 deletions(-) diff --git a/datacommons/__init__.py b/datacommons/__init__.py index da2f9fc4..37219cca 100644 --- a/datacommons/__init__.py +++ b/datacommons/__init__.py @@ -19,7 +19,7 @@ from datacommons.core import get_property_labels, get_property_values, get_triples from datacommons.places import get_places_in, get_related_places, get_stats from datacommons.populations import get_populations, get_observations, get_pop_obs, get_place_obs -from datacommons.stat_vars import get_stat_value, get_stat_series, get_stat_all +from datacommons.stat_vars import get_stat_value, get_stat_series, get_stat_all, records_place_by_time # Other utilities from .utils import set_api_key diff --git a/datacommons/examples/stat_vars.py b/datacommons/examples/stat_vars.py index b5e16f08..6ea6ca50 100644 --- a/datacommons/examples/stat_vars.py +++ b/datacommons/examples/stat_vars.py @@ -126,6 +126,15 @@ def call_str(pvs): dc.get_stat_all(['badPlaceId', 'country/FRA'], ['Median_Age_Person', 'Count_Person'])) + print( + 'dc.records_place_by_time(["geoId/29", "geoId/33"], "CumulativeCount_MedicalConditionIncident_COVID_19_PatientDeceased")' + ) + print('>>> ') + pp.pprint( + dc.records_place_by_time([ + "geoId/29", "geoId/33" + ], "CumulativeCount_MedicalConditionIncident_COVID_19_PatientDeceased")) + if __name__ == '__main__': main() diff --git a/datacommons/stat_vars.py b/datacommons/stat_vars.py index ec07569e..306c4019 100644 --- a/datacommons/stat_vars.py +++ b/datacommons/stat_vars.py @@ -20,13 +20,8 @@ from 
__future__ import division from __future__ import print_function -from datacommons.utils import _API_ROOT, _API_ENDPOINTS, _ENV_VAR_API_KEY - import collections -import json -import os -import six.moves.urllib.error -import six.moves.urllib.request +import six import datacommons.utils as utils @@ -215,3 +210,51 @@ def get_stat_all(places, stat_vars): for stat_var_dcid, stat_var in place['statVarData'].items(): place_statvar_series[place_dcid][stat_var_dcid] = stat_var return dict(place_statvar_series) + + +# Pandas Helpers +# These functions are wrapper functions that create Python data structures +# that are easily converted to Pandas DataFrames (and Series). + + +def records_place_by_time(places, stat_var): + """Returns a `list` of `dict` per element of `places` based on the `stat_var`. + + Args: + places (`str` or `iterable` of `str`): The dcid of Places to query for. + stat_var (`str`): The dcid of the StatisticalVariable. + Returns: + A `list` of `dict`, one per element of `places`. Each `dict` consists of + the time series and place identifier. + + Raises: + ValueError: If the payload returned by the Data Commons REST API is + malformed. + + Examples: + >>> records_place_by_time(["geoId/29", "geoId/33"], "Count_Person") + [ + {'2020-03-07': 20, '2020-03-08': 40, 'place': 'geoId/29'}, + {'2020-08-21': 428, '2020-08-22': 429, 'place': 'geoId/33'} + ] + """ + try: + if isinstance(places, six.string_types): + places = [places] + else: + places = list(places) + except: + raise ValueError( + 'Parameter `places` must a string object or list-like object.') + if not isinstance(stat_var, six.string_types): + raise ValueError('Parameter `stat_var` must be a string.') + + stat_all = get_stat_all(places, [stat_var]) + # Use the first time series result of each Place+StatVar pair. 
+ # Create a list of rows to be passed into pd.DataFrame.from_records + rows = [ + dict({'place': place}, + **data[next(iter(data))]['sourceSeries'][0]['val']) + for place, data in stat_all.items() + ] + return rows \ No newline at end of file diff --git a/datacommons/test/stat_vars_test.py b/datacommons/test/stat_vars_test.py index 16ae7693..aba31a95 100644 --- a/datacommons/test/stat_vars_test.py +++ b/datacommons/test/stat_vars_test.py @@ -217,6 +217,39 @@ def read(self): } return MockResponse(json.dumps(resp)) + if (data['places'] == ['geoId/06', 'nuts/HU22'] and + data['stat_vars'] == ['Count_Person']): + # Response returned when querying with above params. + resp = { + "placeData": { + "geoId/06": { + "statVarData": { + "Count_Person": CA_COUNT_PERSON, + } + }, + "nuts/HU22": { + "statVarData": { + "Count_Person": HU22_COUNT_PERSON, + } + } + } + } + return MockResponse(json.dumps(resp)) + + if (data['places'] == ['geoId/06'] and + data['stat_vars'] == ['Count_Person']): + # Response returned when querying with above params. + resp = { + "placeData": { + "geoId/06": { + "statVarData": { + "Count_Person": CA_COUNT_PERSON, + } + } + } + } + return MockResponse(json.dumps(resp)) + if (data['places'] == ['badPlaceId', 'nuts/HU22'] and data['stat_vars'] == ['Count_Person', 'badStatVarId']): # Response returned when querying with above params. 
@@ -342,5 +375,41 @@ def test_bad_dcids(self, urlopen): self.assertDictEqual(stats, exp) +class TestRecordsPlaceByTime(unittest.TestCase): + """Unit tests for records_place_by_time.""" + + @patch('six.moves.urllib.request.urlopen', side_effect=request_mock) + def test_basic(self, urlopen): + """Calling records_place_by_time with proper args.""" + # Expecting at least one TS per Place+StatVar + rows = dc.records_place_by_time(['geoId/06', 'nuts/HU22'], + 'Count_Person') + exp = [{ + "1990": 23640, + "1991": 24100, + "1992": 25090, + "place": "geoId/06" + }, { + "1990": 2360, + "1991": 2410, + "1992": 2500, + "place": "nuts/HU22" + }] + self.assertEqual(rows, exp) + + @patch('six.moves.urllib.request.urlopen', side_effect=request_mock) + def test_tolerate_place_string(self, urlopen): + """Calling records_place_by_time with proper args.""" + # Expecting at least one TS per Place+StatVar + rows = dc.records_place_by_time('geoId/06', 'Count_Person') + exp = [{ + "1990": 23640, + "1991": 24100, + "1992": 25090, + "place": "geoId/06" + }] + self.assertEqual(rows, exp) + + if __name__ == '__main__': unittest.main() From 2f070fdeb520f0ea9bdd493729782ad48d12cf6c Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Sun, 23 Aug 2020 17:41:27 -0700 Subject: [PATCH 02/35] Save work so far on pandas. 
--- datacommons/__init__.py | 2 +- datacommons/examples/stat_vars.py | 4 +- datacommons/stat_vars.py | 175 +++++++++++++++++++---------- datacommons/test/stat_vars_test.py | 30 ++--- 4 files changed, 133 insertions(+), 78 deletions(-) diff --git a/datacommons/__init__.py b/datacommons/__init__.py index 37219cca..e8059dc2 100644 --- a/datacommons/__init__.py +++ b/datacommons/__init__.py @@ -19,7 +19,7 @@ from datacommons.core import get_property_labels, get_property_values, get_triples from datacommons.places import get_places_in, get_related_places, get_stats from datacommons.populations import get_populations, get_observations, get_pop_obs, get_place_obs -from datacommons.stat_vars import get_stat_value, get_stat_series, get_stat_all, records_place_by_time +from datacommons.stat_vars import get_stat_value, get_stat_series, get_stat_all, time_series_pd_input # Other utilities from .utils import set_api_key diff --git a/datacommons/examples/stat_vars.py b/datacommons/examples/stat_vars.py index 6ea6ca50..2de03727 100644 --- a/datacommons/examples/stat_vars.py +++ b/datacommons/examples/stat_vars.py @@ -127,11 +127,11 @@ def call_str(pvs): ['Median_Age_Person', 'Count_Person'])) print( - 'dc.records_place_by_time(["geoId/29", "geoId/33"], "CumulativeCount_MedicalConditionIncident_COVID_19_PatientDeceased")' + 'dc.cohort_time_series(["geoId/29", "geoId/33"], "CumulativeCount_MedicalConditionIncident_COVID_19_PatientDeceased")' ) print('>>> ') pp.pprint( - dc.records_place_by_time([ + dc.cohort_time_series([ "geoId/29", "geoId/33" ], "CumulativeCount_MedicalConditionIncident_COVID_19_PatientDeceased")) diff --git a/datacommons/stat_vars.py b/datacommons/stat_vars.py index 306c4019..0f33ae9f 100644 --- a/datacommons/stat_vars.py +++ b/datacommons/stat_vars.py @@ -143,55 +143,62 @@ def get_stat_all(places, stat_vars): >>> get_stat_all(["geoId/05", "geoId/06"], ["Count_Person", "Count_Person_Male"]) { "geoId/05": { - "Count_Person": [ - { - "val": { - "2010": 1633, - 
"2011": 1509, - "2012": 1581, + "Count_Person": { + "sourceSeries": [ + { + "val": { + "2010": 1633, + "2011": 1509, + "2012": 1581, + }, + "observationPeriod": "P1Y", + "importName": "Wikidata", + "provenanceDomain": "wikidata.org" }, - "observationPeriod": "P1Y", - "importName": "Wikidata", - "provenanceDomain": "wikidata.org" - }, - { - "val": { - "2010": 1333, - "2011": 1309, - "2012": 131, - }, - "observationPeriod": "P1Y", - "importName": "CensusPEPSurvey", - "provenanceDomain": "census.gov" - } - ], - "Count_Person_Male": [ - { - "val": { - "2010": 1633, - "2011": 1509, - "2012": 1581, - }, - "observationPeriod": "P1Y", - "importName": "CensusPEPSurvey", - "provenanceDomain": "census.gov" + { + "val": { + "2010": 1333, + "2011": 1309, + "2012": 131, + }, + "observationPeriod": "P1Y", + "importName": "CensusPEPSurvey", + "provenanceDomain": "census.gov" + } + ], } - ], + }, + "Count_Person_Male": { + "sourceSeries": [ + { + "val": { + "2010": 1633, + "2011": 1509, + "2012": 1581, + }, + "observationPeriod": "P1Y", + "importName": "CensusPEPSurvey", + "provenanceDomain": "census.gov" + } + ], + } }, "geoId/02": { - "Count_Person": [], - "Count_Person_Male": [ - { - "val": { - "2010": 13, - "2011": 13, - "2012": 322, - }, - "observationPeriod": "P1Y", - "importName": "CensusPEPSurvey", - "provenanceDomain": "census.gov" + "Count_Person": {}, + "Count_Person_Male": { + "sourceSeries": [ + { + "val": { + "2010": 13, + "2011": 13, + "2012": 322, + }, + "observationPeriod": "P1Y", + "importName": "CensusPEPSurvey", + "provenanceDomain": "census.gov" + } + ] } - ], } } """ @@ -217,11 +224,44 @@ def get_stat_all(places, stat_vars): # that are easily converted to Pandas DataFrames (and Series). 
-def records_place_by_time(places, stat_var): +def _get_first_time_series(stat_var_data): + """Helper function to return one time series.""" + return stat_var_data['sourceSeries'][0]['val'] + + +def time_series_pd_input_options(places, stat_var): + """Returns a `dict` mapping StatVarObservation options to `list` of `dict` of time series for each Place. + """ + res = collections.defaultdict(list) + stat_all = get_stat_all(places, [stat_var]) + for place, place_data in stat_all.items(): + if not place_data: + continue + stat_var_data = place_data[stat_var] + if not stat_var_data: + continue + for source_series in stat_var_data['sourceSeries']: + time_series = source_series['val'] + # Hashable SVO options. + svo_options = (('measurementMethod', + source_series.get('measurementMethod')), + ('observationPeriod', + source_series.get('observationPeriod')), + ('unit', source_series.get('unit')), + ('scalingFactor', + source_series.get('scalingFactor'))) + res[svo_options].append(dict({'place': place}, **time_series)) + return dict(res) + + +def time_series_pd_input(places, stat_var): """Returns a `list` of `dict` per element of `places` based on the `stat_var`. + Data Commons will pick a set of StatVarObservation options that covers the + maximum number of queried places. + Args: - places (`str` or `iterable` of `str`): The dcid of Places to query for. + places (`str` or `iterable` of `str`): The dcids of Places to query for. stat_var (`str`): The dcid of the StatisticalVariable. Returns: A `list` of `dict`, one per element of `places`. Each `dict` consists of @@ -232,7 +272,7 @@ def records_place_by_time(places, stat_var): malformed. 
Examples: - >>> records_place_by_time(["geoId/29", "geoId/33"], "Count_Person") + >>> time_series_pd_input(["geoId/29", "geoId/33"], "Count_Person") [ {'2020-03-07': 20, '2020-03-08': 40, 'place': 'geoId/29'}, {'2020-08-21': 428, '2020-08-22': 429, 'place': 'geoId/33'} @@ -245,16 +285,37 @@ def records_place_by_time(places, stat_var): places = list(places) except: raise ValueError( - 'Parameter `places` must a string object or list-like object.') + 'Parameter `places` must be a string object or list-like object.') if not isinstance(stat_var, six.string_types): raise ValueError('Parameter `stat_var` must be a string.') - stat_all = get_stat_all(places, [stat_var]) - # Use the first time series result of each Place+StatVar pair. - # Create a list of rows to be passed into pd.DataFrame.from_records - rows = [ - dict({'place': place}, - **data[next(iter(data))]['sourceSeries'][0]['val']) - for place, data in stat_all.items() - ] - return rows \ No newline at end of file + rows_dict = time_series_pd_input_options(places, stat_var) + most_geos = [] + max_geos_so_far = 0 + latest_date = [] + max_date_so_far = '' + for svo, rows in rows_dict.items(): + current_geos = len(rows) + if current_geos > max_geos_so_far: + max_geos_so_far = current_geos + most_geos = [svo] + # Reset tiebreaker stats. Recompute after this if-else block. + latest_date = [] + max_date_so_far = '' + elif current_geos == max_geos_so_far: + most_geos.append(svo) + else: + # Do not compute tiebreaker stats if not in most_geos. 
+ continue + for row in rows: + dates = set(row.keys()) + dates.remove('place') + row_max_date = max(dates) + if row_max_date > max_date_so_far: + max_date_so_far = row_max_date + latest_date = [svo] + elif row_max_date == max_date_so_far: + latest_date.append(svo) + for svo in most_geos: + if svo in latest_date: + return rows_dict[svo] diff --git a/datacommons/test/stat_vars_test.py b/datacommons/test/stat_vars_test.py index aba31a95..72faed60 100644 --- a/datacommons/test/stat_vars_test.py +++ b/datacommons/test/stat_vars_test.py @@ -29,6 +29,7 @@ import datacommons.utils as utils import json import unittest +import six import six.moves.urllib as urllib # Reusable parts of REST API /stat/all response. @@ -40,7 +41,7 @@ "val": { "1990": 23640, "1991": 24100, - "1992": 25090, + "1993": 25090, }, "observationPeriod": "P1Y", "importName": "WorldDevelopmentIndicators", @@ -375,37 +376,30 @@ def test_bad_dcids(self, urlopen): self.assertDictEqual(stats, exp) -class TestRecordsPlaceByTime(unittest.TestCase): - """Unit tests for records_place_by_time.""" +class TestPdTimeSeries(unittest.TestCase): + """Unit tests for time_series_pd_input.""" @patch('six.moves.urllib.request.urlopen', side_effect=request_mock) def test_basic(self, urlopen): - """Calling records_place_by_time with proper args.""" - # Expecting at least one TS per Place+StatVar - rows = dc.records_place_by_time(['geoId/06', 'nuts/HU22'], - 'Count_Person') + """Calling time_series_pd_input with proper args.""" + rows = dc.time_series_pd_input(['geoId/06', 'nuts/HU22'], + 'Count_Person') exp = [{ "1990": 23640, "1991": 24100, - "1992": 25090, + "1993": 25090, "place": "geoId/06" - }, { - "1990": 2360, - "1991": 2410, - "1992": 2500, - "place": "nuts/HU22" }] - self.assertEqual(rows, exp) + six.assertCountEqual(self, rows, exp) @patch('six.moves.urllib.request.urlopen', side_effect=request_mock) def test_tolerate_place_string(self, urlopen): - """Calling records_place_by_time with proper args.""" - # Expecting 
at least one TS per Place+StatVar - rows = dc.records_place_by_time('geoId/06', 'Count_Person') + """Calling time_series_pd_input with single string place arg.""" + rows = dc.time_series_pd_input('geoId/06', 'Count_Person') exp = [{ "1990": 23640, "1991": 24100, - "1992": 25090, + "1993": 25090, "place": "geoId/06" }] self.assertEqual(rows, exp) From b1feeca25dd09056739ec6c3b7e97ac26766db58 Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Sun, 23 Aug 2020 19:49:49 -0700 Subject: [PATCH 03/35] Minor edits. --- datacommons/examples/stat_vars.py | 2 +- datacommons/stat_vars.py | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/datacommons/examples/stat_vars.py b/datacommons/examples/stat_vars.py index 2de03727..1128378e 100644 --- a/datacommons/examples/stat_vars.py +++ b/datacommons/examples/stat_vars.py @@ -131,7 +131,7 @@ def call_str(pvs): ) print('>>> ') pp.pprint( - dc.cohort_time_series([ + dc.time_series_pd_input([ "geoId/29", "geoId/33" ], "CumulativeCount_MedicalConditionIncident_COVID_19_PatientDeceased")) diff --git a/datacommons/stat_vars.py b/datacommons/stat_vars.py index 0f33ae9f..e56ac18c 100644 --- a/datacommons/stat_vars.py +++ b/datacommons/stat_vars.py @@ -223,13 +223,12 @@ def get_stat_all(places, stat_vars): # These functions are wrapper functions that create Python data structures # that are easily converted to Pandas DataFrames (and Series). +# def _get_first_time_series(stat_var_data): +# """Helper function to return one time series.""" +# return stat_var_data['sourceSeries'][0]['val'] -def _get_first_time_series(stat_var_data): - """Helper function to return one time series.""" - return stat_var_data['sourceSeries'][0]['val'] - -def time_series_pd_input_options(places, stat_var): +def _time_series_pd_input_options(places, stat_var): """Returns a `dict` mapping StatVarObservation options to `list` of `dict` of time series for each Place. 
""" res = collections.defaultdict(list) @@ -258,7 +257,8 @@ def time_series_pd_input(places, stat_var): """Returns a `list` of `dict` per element of `places` based on the `stat_var`. Data Commons will pick a set of StatVarObservation options that covers the - maximum number of queried places. + maximum number of queried places. Among ties, Data Commons selects an option + set with the latest Observation. Args: places (`str` or `iterable` of `str`): The dcids of Places to query for. @@ -289,11 +289,11 @@ def time_series_pd_input(places, stat_var): if not isinstance(stat_var, six.string_types): raise ValueError('Parameter `stat_var` must be a string.') - rows_dict = time_series_pd_input_options(places, stat_var) + rows_dict = _time_series_pd_input_options(places, stat_var) most_geos = [] max_geos_so_far = 0 latest_date = [] - max_date_so_far = '' + latest_date_so_far = '' for svo, rows in rows_dict.items(): current_geos = len(rows) if current_geos > max_geos_so_far: @@ -301,7 +301,7 @@ def time_series_pd_input(places, stat_var): most_geos = [svo] # Reset tiebreaker stats. Recompute after this if-else block. latest_date = [] - max_date_so_far = '' + latest_date_so_far = '' elif current_geos == max_geos_so_far: most_geos.append(svo) else: @@ -311,10 +311,10 @@ def time_series_pd_input(places, stat_var): dates = set(row.keys()) dates.remove('place') row_max_date = max(dates) - if row_max_date > max_date_so_far: - max_date_so_far = row_max_date + if row_max_date > latest_date_so_far: + latest_date_so_far = row_max_date latest_date = [svo] - elif row_max_date == max_date_so_far: + elif row_max_date == latest_date_so_far: latest_date.append(svo) for svo in most_geos: if svo in latest_date: From 516033b842a4b78e6ea825834c3d8f837ab65352 Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Sun, 23 Aug 2020 21:24:52 -0700 Subject: [PATCH 04/35] Add function for creating covariate pandas df. 
--- datacommons/stat_vars.py | 92 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/datacommons/stat_vars.py b/datacommons/stat_vars.py index e56ac18c..d6d8fb4d 100644 --- a/datacommons/stat_vars.py +++ b/datacommons/stat_vars.py @@ -319,3 +319,95 @@ def time_series_pd_input(places, stat_var): for svo in most_geos: if svo in latest_date: return rows_dict[svo] + + +def _covariate_pd_input_options(places, stat_vars): + """Returns a `dict` mapping each stat_var to a `dict` of StatVarObservation options to `list` of `dict` of latest Observations for each Place. Note that the `observationDate` may differ across + Places, even if + """ + res = collections.defaultdict(lambda: collections.defaultdict(list)) + stat_all = get_stat_all(places, stat_vars) + for place, place_data in stat_all.items(): + if not place_data: + continue + for stat_var, stat_var_data in place_data.items(): + if not stat_var_data: + continue + for source_series in stat_var_data['sourceSeries']: + time_series = source_series['val'] + latest_obs = time_series[max(time_series)] + # Hashable SVO options. + svo_options = (('measurementMethod', + source_series.get('measurementMethod')), + ('observationPeriod', + source_series.get('observationPeriod')), + ('unit', source_series.get('unit')), + ('scalingFactor', + source_series.get('scalingFactor'))) + res[stat_var][svo_options].append({ + 'place': place, + 'val': latest_obs + }) + return {k: dict(v) for k, v in res.items()} + + +def covariate_pd_input(places, stat_vars): + """Returns a `list` of `dict` per element of `places` based on the `stat_var`. + + Data Commons will pick a set of StatVarObservation options that covers the + maximum number of queried places. Among ties, Data Commons selects an option + set with the latest Observation. + + Args: + places (`str` or `iterable` of `str`): The dcids of Places to query for. + stat_var (`str`): The dcid of the StatisticalVariable. 
+ Returns: + A `list` of `dict`, one per element of `places`. Each `dict` consists of + the time series and place identifier. + + Raises: + ValueError: If the payload returned by the Data Commons REST API is + malformed. + + Examples: + >>> covariate_pd_input(["geoId/29", "geoId/33"], ["Count_Person", "Median_Income_Person"]) + [ + {'Count_Person': 20, 'Median_Income_Person': 40, 'place': 'geoId/29'}, + {'Count_Person': 428, 'Median_Income_Person': 429, 'place': 'geoId/33'} + ] + """ + + try: + if isinstance(places, six.string_types): + places = [places] + else: + places = list(places) + except: + raise ValueError( + 'Parameter `places` must be a string object or list-like object.') + try: + if isinstance(stat_vars, six.string_types): + stat_vars = [stat_vars] + else: + stat_vars = list(stat_vars) + except: + raise ValueError( + 'Parameter `stat_vars` must be a string object or list-like object.' + ) + + rows_dict = _covariate_pd_input_options(places, stat_vars) + place2cov = collections.defaultdict({}) # {geo: {var1: 3, var2: 33}} + for stat_var, candidates_dict in rows_dict.items(): + selected_rows = None + max_rows_so_far = 0 + for svo, rows in candidates_dict.items(): + current_geos = len(rows) + if current_geos > max_rows_so_far: + max_rows_so_far = current_geos + selected_rows = rows + for row in selected_rows: + place2cov[row['place']] = {stat_var: row['val']} + return [ + dict({'place': place}, **covariates) + for place, covariates in place2cov.items() + ] From 529aeb3eb205ff576faa089d487561f3e75c005e Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Sun, 23 Aug 2020 22:12:49 -0700 Subject: [PATCH 05/35] Add latest date sorting to covariate as well. Add test for covariate pd input function. 
--- datacommons/__init__.py | 2 +- datacommons/stat_vars.py | 39 ++++++++-- datacommons/test/stat_vars_test.py | 113 ++++++++++++++++++++++------- 3 files changed, 119 insertions(+), 35 deletions(-) diff --git a/datacommons/__init__.py b/datacommons/__init__.py index e8059dc2..b5c97e27 100644 --- a/datacommons/__init__.py +++ b/datacommons/__init__.py @@ -19,7 +19,7 @@ from datacommons.core import get_property_labels, get_property_values, get_triples from datacommons.places import get_places_in, get_related_places, get_stats from datacommons.populations import get_populations, get_observations, get_pop_obs, get_place_obs -from datacommons.stat_vars import get_stat_value, get_stat_series, get_stat_all, time_series_pd_input +from datacommons.stat_vars import get_stat_value, get_stat_series, get_stat_all, time_series_pd_input, covariate_pd_input # Other utilities from .utils import set_api_key diff --git a/datacommons/stat_vars.py b/datacommons/stat_vars.py index d6d8fb4d..aadeb52f 100644 --- a/datacommons/stat_vars.py +++ b/datacommons/stat_vars.py @@ -335,7 +335,8 @@ def _covariate_pd_input_options(places, stat_vars): continue for source_series in stat_var_data['sourceSeries']: time_series = source_series['val'] - latest_obs = time_series[max(time_series)] + latest_date = max(time_series) + latest_obs = time_series[latest_date] # Hashable SVO options. 
svo_options = (('measurementMethod', source_series.get('measurementMethod')), @@ -346,6 +347,7 @@ def _covariate_pd_input_options(places, stat_vars): source_series.get('scalingFactor'))) res[stat_var][svo_options].append({ 'place': place, + 'date': latest_date, 'val': latest_obs }) return {k: dict(v) for k, v in res.items()} @@ -396,17 +398,40 @@ def covariate_pd_input(places, stat_vars): ) rows_dict = _covariate_pd_input_options(places, stat_vars) - place2cov = collections.defaultdict({}) # {geo: {var1: 3, var2: 33}} + place2cov = collections.defaultdict(dict) # {geo: {var1: 3, var2: 33}} + for stat_var, candidates_dict in rows_dict.items(): selected_rows = None - max_rows_so_far = 0 + most_geos = [] + max_geos_so_far = 0 + latest_date = [] + latest_date_so_far = '' for svo, rows in candidates_dict.items(): current_geos = len(rows) - if current_geos > max_rows_so_far: - max_rows_so_far = current_geos - selected_rows = rows + if current_geos > max_geos_so_far: + max_geos_so_far = current_geos + most_geos = [svo] + # Reset tiebreaker stats. Recompute after this if-else block. + latest_date = [] + latest_date_so_far = '' + elif current_geos == max_geos_so_far: + most_geos.append(svo) + else: + # Do not compute tiebreaker stats if not in most_geos. 
+ continue + for row in rows: + row_date = row['date'] + if row_date > latest_date_so_far: + latest_date_so_far = row_date + latest_date = [svo] + elif row_date == latest_date_so_far: + latest_date.append(svo) + for svo in most_geos: + if svo in latest_date: + selected_rows = candidates_dict[svo] + for row in selected_rows: - place2cov[row['place']] = {stat_var: row['val']} + place2cov[row['place']][stat_var] = row['val'] return [ dict({'place': place}, **covariates) for place, covariates in place2cov.items() diff --git a/datacommons/test/stat_vars_test.py b/datacommons/test/stat_vars_test.py index 72faed60..ef7f65a4 100644 --- a/datacommons/test/stat_vars_test.py +++ b/datacommons/test/stat_vars_test.py @@ -36,28 +36,35 @@ CA_COUNT_PERSON = { "isDcAggregate": "true", - "sourceSeries": [ - { - "val": { - "1990": 23640, - "1991": 24100, - "1993": 25090, - }, - "observationPeriod": "P1Y", - "importName": "WorldDevelopmentIndicators", - "provenanceDomain": "worldbank.org" + "sourceSeries": [{ + "val": { + "1990": 23640, + "1991": 24100, + "1993": 25090, }, - { - "val": { - "1790": 3929214, - "1800": 5308483, - "1810": 7239881, - }, - "measurementMethod": "WikidataPopulation", - "importName": "WikidataPopulation", - "provenanceDomain": "wikidata.org" + "observationPeriod": "P1Y", + "importName": "WorldDevelopmentIndicators", + "provenanceDomain": "worldbank.org" + }, { + "val": { + "1790": 3929214, + "1800": 5308483, + "1810": 7239881, + }, + "measurementMethod": "WikidataPopulation", + "importName": "WikidataPopulation", + "provenanceDomain": "wikidata.org" + }, { + "val": { + "1890": 28360, + "1891": 24910, + "1892": 25070, }, - ] + "measurementMethod": "OECDRegionalStatistics", + "observationPeriod": "P1Y", + "importName": "OECDRegionalDemography", + "provenanceDomain": "oecd.org" + }] } CA_COUNT_PERSON_MALE = { @@ -101,7 +108,7 @@ }] } -HU22_MEDIAN_AGE_PERSON = { +CA_MEDIAN_AGE_PERSON = { "sourceSeries": [{ "val": { "1990": 12, @@ -205,7 +212,7 @@ def read(self): 
"geoId/06": { "statVarData": { "Count_Person": CA_COUNT_PERSON, - "Median_Age_Person": HU22_MEDIAN_AGE_PERSON + "Median_Age_Person": CA_MEDIAN_AGE_PERSON } }, "nuts/HU22": { @@ -273,6 +280,27 @@ def read(self): } return MockResponse(json.dumps(resp)) + if (data['places'] == ['geoId/06', 'nuts/HU22'] and + data['stat_vars'] == ['Count_Person', 'Median_Age_Person']): + # Response returned when querying with above params. + # Median Age missing for HU22. + resp = { + "placeData": { + "geoId/06": { + "statVarData": { + "Count_Person": CA_COUNT_PERSON, + "Median_Age_Person": CA_MEDIAN_AGE_PERSON + } + }, + "nuts/HU22": { + "statVarData": { + "Count_Person": HU22_COUNT_PERSON, + "Median_Age_Person": {} + } + } + } + } + return MockResponse(json.dumps(resp)) # Otherwise, return an empty response and a 404. return urllib.error.HTTPError @@ -350,7 +378,7 @@ def test_basic(self, urlopen): exp = { "geoId/06": { "Count_Person": CA_COUNT_PERSON, - "Median_Age_Person": HU22_MEDIAN_AGE_PERSON + "Median_Age_Person": CA_MEDIAN_AGE_PERSON }, "nuts/HU22": { "Count_Person": HU22_COUNT_PERSON, @@ -385,10 +413,15 @@ def test_basic(self, urlopen): rows = dc.time_series_pd_input(['geoId/06', 'nuts/HU22'], 'Count_Person') exp = [{ - "1990": 23640, - "1991": 24100, - "1993": 25090, - "place": "geoId/06" + '1890': 28360, + '1891': 24910, + '1892': 25070, + 'place': 'geoId/06' + }, { + '1991': 2410, + '1990': 2360, + '1992': 2500, + 'place': 'nuts/HU22' }] six.assertCountEqual(self, rows, exp) @@ -405,5 +438,31 @@ def test_tolerate_place_string(self, urlopen): self.assertEqual(rows, exp) +class TestPdCovariates(unittest.TestCase): + """Unit tests for covariate_pd_input.""" + + @patch('six.moves.urllib.request.urlopen', side_effect=request_mock) + def test_basic(self, urlopen): + """Calling covariate_pd_input with proper args.""" + rows = dc.covariate_pd_input(['geoId/06', 'nuts/HU22'], + ['Count_Person', 'Median_Age_Person']) + exp = [{ + "place": "geoId/06", + "Median_Age_Person": 24, + 
"Count_Person": 25070 + }, { + "place": "nuts/HU22", + "Count_Person": 2500 + }] + six.assertCountEqual(self, rows, exp) + + @patch('six.moves.urllib.request.urlopen', side_effect=request_mock) + def test_tolerate_place_string(self, urlopen): + """Calling covariate_pd_input with single string place arg.""" + rows = dc.covariate_pd_input(['geoId/06'], 'Count_Person') + exp = [{"place": "geoId/06", "Count_Person": 25090}] + self.assertEqual(rows, exp) + + if __name__ == '__main__': unittest.main() From a7868e24150d76566c858d1d26afb7ef3e3a9c0c Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Sun, 23 Aug 2020 22:20:43 -0700 Subject: [PATCH 06/35] stat_vars_test: make response and expected response strings consistently double quoted. --- datacommons/test/stat_vars_test.py | 32 +++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/datacommons/test/stat_vars_test.py b/datacommons/test/stat_vars_test.py index ef7f65a4..df02732f 100644 --- a/datacommons/test/stat_vars_test.py +++ b/datacommons/test/stat_vars_test.py @@ -146,37 +146,37 @@ def read(self): if req.get_full_url( ) == stat_value_url_base + '?place=geoId/06&stat_var=Count_Person': # Response returned when querying with basic args. - return MockResponse(json.dumps({'value': 123})) + return MockResponse(json.dumps({"value": 123})) if req.get_full_url( ) == stat_value_url_base + '?place=geoId/06&stat_var=Count_Person&date=2010': # Response returned when querying with observationDate. - return MockResponse(json.dumps({'value': 133})) + return MockResponse(json.dumps({"value": 133})) if (req.get_full_url() == stat_value_url_base + '?place=geoId/06&stat_var=Count_Person&' + 'date=2010&measurement_method=CensusPEPSurvey&' + 'observation_period=P1Y&unit=RealPeople&scaling_factor=100'): # Response returned when querying with above optional params. 
- return MockResponse(json.dumps({'value': 103})) + return MockResponse(json.dumps({"value": 103})) # Mock responses for urlopen requests to get_stat_series. if req.get_full_url( ) == stat_series_url_base + '?place=geoId/06&stat_var=Count_Person': # Response returned when querying with basic args. - return MockResponse(json.dumps({'series': {'2000': 1, '2001': 2}})) + return MockResponse(json.dumps({"series": {"2000": 1, "2001": 2}})) if (req.get_full_url() == stat_series_url_base + '?place=geoId/06&stat_var=Count_Person&' + 'measurement_method=CensusPEPSurvey&observation_period=P1Y&' + 'unit=RealPeople&scaling_factor=100'): # Response returned when querying with above optional params. - return MockResponse(json.dumps({'series': {'2000': 3, '2001': 42}})) + return MockResponse(json.dumps({"series": {"2000": 3, "2001": 42}})) if (req.get_full_url() == stat_series_url_base + '?place=geoId/06&stat_var=Count_Person&' + 'measurement_method=DNE'): # Response returned when data not available for optional parameters. # /stat/series?place=geoId/06&stat_var=Count_Person&measurement_method=DNE - return MockResponse(json.dumps({'series': {}})) + return MockResponse(json.dumps({"series": {}})) # Mock responses for urlopen requests to get_stat_all. 
if req.get_full_url() == stat_all_url_base: @@ -336,7 +336,7 @@ def test_basic(self, urlopen): """Calling get_stat_value with minimal and proper args.""" # Call get_stat_series stats = dc.get_stat_series('geoId/06', 'Count_Person') - self.assertEqual(stats, {'2000': 1, '2001': 2}) + self.assertEqual(stats, {"2000": 1, "2001": 2}) @patch('six.moves.urllib.request.urlopen', side_effect=request_mock) def test_opt_args(self, urlopen): @@ -345,7 +345,7 @@ def test_opt_args(self, urlopen): # Call get_stat_series with all optional args stats = dc.get_stat_series('geoId/06', 'Count_Person', 'CensusPEPSurvey', 'P1Y', 'RealPeople', 100) - self.assertEqual(stats, {'2000': 3, '2001': 42}) + self.assertEqual(stats, {"2000": 3, "2001": 42}) # Call get_stat_series with non-satisfiable optional args stats = dc.get_stat_series('geoId/06', 'Count_Person', 'DNE') @@ -413,15 +413,15 @@ def test_basic(self, urlopen): rows = dc.time_series_pd_input(['geoId/06', 'nuts/HU22'], 'Count_Person') exp = [{ - '1890': 28360, - '1891': 24910, - '1892': 25070, - 'place': 'geoId/06' + "1890": 28360, + "1891": 24910, + "1892": 25070, + "place": "geoId/06" }, { - '1991': 2410, - '1990': 2360, - '1992': 2500, - 'place': 'nuts/HU22' + "1991": 2410, + "1990": 2360, + "1992": 2500, + "place": "nuts/HU22" }] six.assertCountEqual(self, rows, exp) From ea3c2ff705ec4aeebd359a75b79c20003576fc23 Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Sun, 23 Aug 2020 22:24:02 -0700 Subject: [PATCH 07/35] Add an example for covariate_pd_input --- datacommons/examples/stat_vars.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/datacommons/examples/stat_vars.py b/datacommons/examples/stat_vars.py index 1128378e..6ee550f7 100644 --- a/datacommons/examples/stat_vars.py +++ b/datacommons/examples/stat_vars.py @@ -127,13 +127,20 @@ def call_str(pvs): ['Median_Age_Person', 'Count_Person'])) print( - 'dc.cohort_time_series(["geoId/29", "geoId/33"], 
"CumulativeCount_MedicalConditionIncident_COVID_19_PatientDeceased")' + 'dc.time_series_pd_input(["geoId/29", "geoId/33"], "CumulativeCount_MedicalConditionIncident_COVID_19_PatientDeceased")' ) print('>>> ') pp.pprint( - dc.time_series_pd_input([ - "geoId/29", "geoId/33" - ], "CumulativeCount_MedicalConditionIncident_COVID_19_PatientDeceased")) + dc.time_series_pd_input(["geoId/29", "geoId/33"], + "Median_Income_Person")) + + print( + "dc.covariate_pd_input(['geoId/06', 'country/FRA'], 'Median_Age_Person', 'Count_Person'])" + ) + print('>>> ') + pp.pprint( + dc.covariate_pd_input(['geoId/06', 'country/FRA'], + ['Median_Age_Person', 'Count_Person'])) if __name__ == '__main__': From ab3f755d50fb5a99451e712b81454029a8aaef5e Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Sun, 23 Aug 2020 22:31:22 -0700 Subject: [PATCH 08/35] Make stat_var examples quoting consistent. --- datacommons/examples/stat_vars.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/datacommons/examples/stat_vars.py b/datacommons/examples/stat_vars.py index 6ee550f7..974a7817 100644 --- a/datacommons/examples/stat_vars.py +++ b/datacommons/examples/stat_vars.py @@ -25,16 +25,16 @@ def main(): param_sets = [ { 'place': 'geoId/06085', - 'stat_var': 'Count_Person', + 'stat_var': "Count_Person", }, { 'place': 'geoId/06085', - 'stat_var': 'Count_Person', + 'stat_var': "Count_Person", 'date': '2018', }, { 'place': 'geoId/06085', - 'stat_var': 'Count_Person', + 'stat_var': "Count_Person", 'date': '2018', 'measurement_method': 'CensusACS5yrSurvey', }, @@ -111,20 +111,20 @@ def call_str(pvs): pp = pprint.PrettyPrinter(indent=4) print( - "\nget_stat_all(['geoId/06085', 'country/FRA'], ['Median_Age_Person', 'Count_Person'])" + '\nget_stat_all(["geoId/06085", "country/FRA"], ["Median_Age_Person", "Count_Person"])' ) print('>>> ') pp.pprint( - dc.get_stat_all(['geoId/06085', 'country/FRA'], - ['Median_Age_Person', 
'Count_Person'])) + dc.get_stat_all(["geoId/06085", "country/FRA"], + ["Median_Age_Person", "Count_Person"])) print( - "\nget_stat_all(['badPlaceId', 'country/FRA'], ['Median_Age_Person', 'Count_Person'])" + '\nget_stat_all(["badPlaceId", "country/FRA"], ["Median_Age_Person", "Count_Person"])' ) print('>>> ') pp.pprint( - dc.get_stat_all(['badPlaceId', 'country/FRA'], - ['Median_Age_Person', 'Count_Person'])) + dc.get_stat_all(["badPlaceId", "country/FRA"], + ["Median_Age_Person", "Count_Person"])) print( 'dc.time_series_pd_input(["geoId/29", "geoId/33"], "CumulativeCount_MedicalConditionIncident_COVID_19_PatientDeceased")' @@ -135,12 +135,12 @@ def call_str(pvs): "Median_Income_Person")) print( - "dc.covariate_pd_input(['geoId/06', 'country/FRA'], 'Median_Age_Person', 'Count_Person'])" + 'dc.covariate_pd_input(["geoId/06", "country/FRA"], ["Median_Age_Person", "Count_Person"])' ) print('>>> ') pp.pprint( - dc.covariate_pd_input(['geoId/06', 'country/FRA'], - ['Median_Age_Person', 'Count_Person'])) + dc.covariate_pd_input(["geoId/06", "country/FRA"], + ["Median_Age_Person", "Count_Person"])) if __name__ == '__main__': From e72ae4a17ef15829a9e9cdef69909cefd0f6987b Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Mon, 24 Aug 2020 02:10:46 -0700 Subject: [PATCH 09/35] Create dcpandas module that uses pandas natively. 
--- datacommons/__init__.py | 2 +- datacommons/examples/stat_vars.py | 2 +- datacommons/stat_vars.py | 219 --------------------- datacommons/test/stat_vars_test.py | 60 ------ dcpandas/CHANGELOG.md | 21 ++ dcpandas/README.md | 47 +++++ dcpandas/__init__.py | 16 ++ dcpandas/df_builder.py | 298 +++++++++++++++++++++++++++++ dcpandas/examples/__init__.py | 13 ++ dcpandas/examples/df_builder.py | 84 ++++++++ dcpandas/setup.py | 59 ++++++ dcpandas/stat_vars.py | 1 + dcpandas/test/__init__.py | 13 ++ dcpandas/test/df_builder_test.py | 286 +++++++++++++++++++++++++++ dcpandas/utils.py | 1 + requirements.txt | 1 + setup.py | 3 +- 17 files changed, 843 insertions(+), 283 deletions(-) create mode 100644 dcpandas/CHANGELOG.md create mode 100644 dcpandas/README.md create mode 100644 dcpandas/__init__.py create mode 100644 dcpandas/df_builder.py create mode 100644 dcpandas/examples/__init__.py create mode 100644 dcpandas/examples/df_builder.py create mode 100644 dcpandas/setup.py create mode 120000 dcpandas/stat_vars.py create mode 100644 dcpandas/test/__init__.py create mode 100644 dcpandas/test/df_builder_test.py create mode 120000 dcpandas/utils.py diff --git a/datacommons/__init__.py b/datacommons/__init__.py index b5c97e27..da2f9fc4 100644 --- a/datacommons/__init__.py +++ b/datacommons/__init__.py @@ -19,7 +19,7 @@ from datacommons.core import get_property_labels, get_property_values, get_triples from datacommons.places import get_places_in, get_related_places, get_stats from datacommons.populations import get_populations, get_observations, get_pop_obs, get_place_obs -from datacommons.stat_vars import get_stat_value, get_stat_series, get_stat_all, time_series_pd_input, covariate_pd_input +from datacommons.stat_vars import get_stat_value, get_stat_series, get_stat_all # Other utilities from .utils import set_api_key diff --git a/datacommons/examples/stat_vars.py b/datacommons/examples/stat_vars.py index 974a7817..9b7c29b5 100644 --- a/datacommons/examples/stat_vars.py +++ 
b/datacommons/examples/stat_vars.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Basic examples for StatisticalVariable-based param_set Commons API functions.""" +"""Basic examples for StatisticalVariable-based param_set Data Commons API functions.""" from __future__ import absolute_import from __future__ import division diff --git a/datacommons/stat_vars.py b/datacommons/stat_vars.py index aadeb52f..64cf1759 100644 --- a/datacommons/stat_vars.py +++ b/datacommons/stat_vars.py @@ -217,222 +217,3 @@ def get_stat_all(places, stat_vars): for stat_var_dcid, stat_var in place['statVarData'].items(): place_statvar_series[place_dcid][stat_var_dcid] = stat_var return dict(place_statvar_series) - - -# Pandas Helpers -# These functions are wrapper functions that create Python data structures -# that are easily converted to Pandas DataFrames (and Series). - -# def _get_first_time_series(stat_var_data): -# """Helper function to return one time series.""" -# return stat_var_data['sourceSeries'][0]['val'] - - -def _time_series_pd_input_options(places, stat_var): - """Returns a `dict` mapping StatVarObservation options to `list` of `dict` of time series for each Place. - """ - res = collections.defaultdict(list) - stat_all = get_stat_all(places, [stat_var]) - for place, place_data in stat_all.items(): - if not place_data: - continue - stat_var_data = place_data[stat_var] - if not stat_var_data: - continue - for source_series in stat_var_data['sourceSeries']: - time_series = source_series['val'] - # Hashable SVO options. 
- svo_options = (('measurementMethod', - source_series.get('measurementMethod')), - ('observationPeriod', - source_series.get('observationPeriod')), - ('unit', source_series.get('unit')), - ('scalingFactor', - source_series.get('scalingFactor'))) - res[svo_options].append(dict({'place': place}, **time_series)) - return dict(res) - - -def time_series_pd_input(places, stat_var): - """Returns a `list` of `dict` per element of `places` based on the `stat_var`. - - Data Commons will pick a set of StatVarObservation options that covers the - maximum number of queried places. Among ties, Data Commons selects an option - set with the latest Observation. - - Args: - places (`str` or `iterable` of `str`): The dcids of Places to query for. - stat_var (`str`): The dcid of the StatisticalVariable. - Returns: - A `list` of `dict`, one per element of `places`. Each `dict` consists of - the time series and place identifier. - - Raises: - ValueError: If the payload returned by the Data Commons REST API is - malformed. - - Examples: - >>> time_series_pd_input(["geoId/29", "geoId/33"], "Count_Person") - [ - {'2020-03-07': 20, '2020-03-08': 40, 'place': 'geoId/29'}, - {'2020-08-21': 428, '2020-08-22': 429, 'place': 'geoId/33'} - ] - """ - try: - if isinstance(places, six.string_types): - places = [places] - else: - places = list(places) - except: - raise ValueError( - 'Parameter `places` must be a string object or list-like object.') - if not isinstance(stat_var, six.string_types): - raise ValueError('Parameter `stat_var` must be a string.') - - rows_dict = _time_series_pd_input_options(places, stat_var) - most_geos = [] - max_geos_so_far = 0 - latest_date = [] - latest_date_so_far = '' - for svo, rows in rows_dict.items(): - current_geos = len(rows) - if current_geos > max_geos_so_far: - max_geos_so_far = current_geos - most_geos = [svo] - # Reset tiebreaker stats. Recompute after this if-else block. 
- latest_date = [] - latest_date_so_far = '' - elif current_geos == max_geos_so_far: - most_geos.append(svo) - else: - # Do not compute tiebreaker stats if not in most_geos. - continue - for row in rows: - dates = set(row.keys()) - dates.remove('place') - row_max_date = max(dates) - if row_max_date > latest_date_so_far: - latest_date_so_far = row_max_date - latest_date = [svo] - elif row_max_date == latest_date_so_far: - latest_date.append(svo) - for svo in most_geos: - if svo in latest_date: - return rows_dict[svo] - - -def _covariate_pd_input_options(places, stat_vars): - """Returns a `dict` mapping each stat_var to a `dict` of StatVarObservation options to `list` of `dict` of latest Observations for each Place. Note that the `observationDate` may differ across - Places, even if - """ - res = collections.defaultdict(lambda: collections.defaultdict(list)) - stat_all = get_stat_all(places, stat_vars) - for place, place_data in stat_all.items(): - if not place_data: - continue - for stat_var, stat_var_data in place_data.items(): - if not stat_var_data: - continue - for source_series in stat_var_data['sourceSeries']: - time_series = source_series['val'] - latest_date = max(time_series) - latest_obs = time_series[latest_date] - # Hashable SVO options. - svo_options = (('measurementMethod', - source_series.get('measurementMethod')), - ('observationPeriod', - source_series.get('observationPeriod')), - ('unit', source_series.get('unit')), - ('scalingFactor', - source_series.get('scalingFactor'))) - res[stat_var][svo_options].append({ - 'place': place, - 'date': latest_date, - 'val': latest_obs - }) - return {k: dict(v) for k, v in res.items()} - - -def covariate_pd_input(places, stat_vars): - """Returns a `list` of `dict` per element of `places` based on the `stat_var`. - - Data Commons will pick a set of StatVarObservation options that covers the - maximum number of queried places. Among ties, Data Commons selects an option - set with the latest Observation. 
- - Args: - places (`str` or `iterable` of `str`): The dcids of Places to query for. - stat_var (`str`): The dcid of the StatisticalVariable. - Returns: - A `list` of `dict`, one per element of `places`. Each `dict` consists of - the time series and place identifier. - - Raises: - ValueError: If the payload returned by the Data Commons REST API is - malformed. - - Examples: - >>> covariate_pd_input(["geoId/29", "geoId/33"], ["Count_Person", "Median_Income_Person"]) - [ - {'Count_Person': 20, 'Median_Income_Person': 40, 'place': 'geoId/29'}, - {'Count_Person': 428, 'Median_Income_Person': 429, 'place': 'geoId/33'} - ] - """ - - try: - if isinstance(places, six.string_types): - places = [places] - else: - places = list(places) - except: - raise ValueError( - 'Parameter `places` must be a string object or list-like object.') - try: - if isinstance(stat_vars, six.string_types): - stat_vars = [stat_vars] - else: - stat_vars = list(stat_vars) - except: - raise ValueError( - 'Parameter `stat_vars` must be a string object or list-like object.' - ) - - rows_dict = _covariate_pd_input_options(places, stat_vars) - place2cov = collections.defaultdict(dict) # {geo: {var1: 3, var2: 33}} - - for stat_var, candidates_dict in rows_dict.items(): - selected_rows = None - most_geos = [] - max_geos_so_far = 0 - latest_date = [] - latest_date_so_far = '' - for svo, rows in candidates_dict.items(): - current_geos = len(rows) - if current_geos > max_geos_so_far: - max_geos_so_far = current_geos - most_geos = [svo] - # Reset tiebreaker stats. Recompute after this if-else block. - latest_date = [] - latest_date_so_far = '' - elif current_geos == max_geos_so_far: - most_geos.append(svo) - else: - # Do not compute tiebreaker stats if not in most_geos. 
- continue - for row in rows: - row_date = row['date'] - if row_date > latest_date_so_far: - latest_date_so_far = row_date - latest_date = [svo] - elif row_date == latest_date_so_far: - latest_date.append(svo) - for svo in most_geos: - if svo in latest_date: - selected_rows = candidates_dict[svo] - - for row in selected_rows: - place2cov[row['place']][stat_var] = row['val'] - return [ - dict({'place': place}, **covariates) - for place, covariates in place2cov.items() - ] diff --git a/datacommons/test/stat_vars_test.py b/datacommons/test/stat_vars_test.py index df02732f..60ef037a 100644 --- a/datacommons/test/stat_vars_test.py +++ b/datacommons/test/stat_vars_test.py @@ -404,65 +404,5 @@ def test_bad_dcids(self, urlopen): self.assertDictEqual(stats, exp) -class TestPdTimeSeries(unittest.TestCase): - """Unit tests for time_series_pd_input.""" - - @patch('six.moves.urllib.request.urlopen', side_effect=request_mock) - def test_basic(self, urlopen): - """Calling time_series_pd_input with proper args.""" - rows = dc.time_series_pd_input(['geoId/06', 'nuts/HU22'], - 'Count_Person') - exp = [{ - "1890": 28360, - "1891": 24910, - "1892": 25070, - "place": "geoId/06" - }, { - "1991": 2410, - "1990": 2360, - "1992": 2500, - "place": "nuts/HU22" - }] - six.assertCountEqual(self, rows, exp) - - @patch('six.moves.urllib.request.urlopen', side_effect=request_mock) - def test_tolerate_place_string(self, urlopen): - """Calling time_series_pd_input with single string place arg.""" - rows = dc.time_series_pd_input('geoId/06', 'Count_Person') - exp = [{ - "1990": 23640, - "1991": 24100, - "1993": 25090, - "place": "geoId/06" - }] - self.assertEqual(rows, exp) - - -class TestPdCovariates(unittest.TestCase): - """Unit tests for covariate_pd_input.""" - - @patch('six.moves.urllib.request.urlopen', side_effect=request_mock) - def test_basic(self, urlopen): - """Calling covariate_pd_input with proper args.""" - rows = dc.covariate_pd_input(['geoId/06', 'nuts/HU22'], - ['Count_Person', 
'Median_Age_Person']) - exp = [{ - "place": "geoId/06", - "Median_Age_Person": 24, - "Count_Person": 25070 - }, { - "place": "nuts/HU22", - "Count_Person": 2500 - }] - six.assertCountEqual(self, rows, exp) - - @patch('six.moves.urllib.request.urlopen', side_effect=request_mock) - def test_tolerate_place_string(self, urlopen): - """Calling covariate_pd_input with single string place arg.""" - rows = dc.covariate_pd_input(['geoId/06'], 'Count_Person') - exp = [{"place": "geoId/06", "Count_Person": 25090}] - self.assertEqual(rows, exp) - - if __name__ == '__main__': unittest.main() diff --git a/dcpandas/CHANGELOG.md b/dcpandas/CHANGELOG.md new file mode 100644 index 00000000..5249b4f6 --- /dev/null +++ b/dcpandas/CHANGELOG.md @@ -0,0 +1,21 @@ +# Changelog + +## 0.0.1 + +**Date** - 08/24/2020 + +**Release Tag** - [pd0.0.1](https://github.com/datacommonsorg/api-python/releases/tag/pd0.0.1) + +**Release Status** - Current head of branch [`master`](https://github.com/datacommonsorg/api-python/tree/master) + +Added Pandas wrapper functions. + +- `build_time_series` will construct a pd.Series for a given StatisticalVariable and Place, where dates are the index for the time series. +- `build_time_series_dataframe` will construct a pd.DataFrame for a given StatisticalVariable and a set of Places: where Places are the index and dates are the columns. +- `build_covariate_dataframe` will construct a covariate pd.DataFrame for a set of StatisticalVariables and a set of Places: with Places as index and StatisticalVariables as the columns. The values are the most recent values for the chosen StatVarObservation options. + +For multi-place functions, when a StatisticalVariable has multiple StatVarObservation options, +Data Commons chooses a set of StatVarObservation options that covers the most geos. This +ensures that the data fetched for a StatisticalVariable is comparable across places.
+When there is a tie, we select the StatVarObservation options set with the latest date +for which data is available for any place. diff --git a/dcpandas/README.md b/dcpandas/README.md new file mode 100644 index 00000000..1b8c6158 --- /dev/null +++ b/dcpandas/README.md @@ -0,0 +1,47 @@ +# Data Commons Pandas API + +This is a Python library for creating Pandas objects with data in the +Data Commons Graph. +To get started, install this package from pip. + + pip install datacommons-pandas + +Once the package is installed, import `dcpandas`. + + import dcpandas as dcpd + +For more detail on getting started with the API, please visit our +[API Overview](http://docs.datacommons.org/api/). + +After you're ready to use the API, you can refer to `dcpandas/examples` for +examples on how to use this package to perform various tasks. More tutorials and +documentation can be found at [tutorials](https://datacommons.org/colab)! + +## About Data Commons + +[Data Commons](https://datacommons.org/) is an open knowledge repository that +provides a unified view across multiple public data sets and statistics. You can +view what [datasets](https://datacommons.org/datasets) are currently ingested +and browse the graph using our [browser](https://browser.datacommons.org/). + +## License + +Apache 2.0 + +## Development + +Please follow the Development instructions from the root directory. + +## Release to PyPI + +- Update "VERSION" in setup.py +- Update CHANGELOG.md for a new version +- Upload a new package using steps for [generating distribution archives](https://packaging.python.org/tutorials/packaging-projects/#generating-distribution-archives) and [uploading the distribution archives](https://packaging.python.org/tutorials/packaging-projects/#uploading-the-distribution-archives) + +## Support + +For general questions or issues about the API, please open an issue on our +[issues](https://github.com/datacommonsorg/api-python/issues) page.
For all other +questions, please send an email to `support@datacommons.org`. + +**Note** - This is not an officially supported Google product. diff --git a/dcpandas/__init__.py b/dcpandas/__init__.py new file mode 100644 index 00000000..baa42576 --- /dev/null +++ b/dcpandas/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dcpandas.stat_vars import get_stat_value, get_stat_series, get_stat_all +from dcpandas.df_builder import build_time_series, build_time_series_dataframe, build_covariate_dataframe \ No newline at end of file diff --git a/dcpandas/df_builder.py b/dcpandas/df_builder.py new file mode 100644 index 00000000..432b148c --- /dev/null +++ b/dcpandas/df_builder.py @@ -0,0 +1,298 @@ +# Copyright 2020 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Data Commons Python API Stat Module. + +Provides functions for getting data on StatisticalVariables from Data Commons Graph. 
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import pandas as pd +import six + +import dcpandas.stat_vars as dc + +# Pandas Helpers +# These functions are wrapper functions that create Python data structures +# that are easily converted to Pandas DataFrames (and Series). + + +def build_time_series(place, stat_var): + """Constructs a pandas Series with `dates` as the index and corresponding `stat_var` statistics as values. + + Args: + place (`str`): The dcid of Place to query for. + stat_var (`str`): The dcid of the StatisticalVariable. + Returns: + A pandas Series with Place IDs as the index, and Observed statistics as values. + """ + return pd.Series(dc.get_stat_series(place, stat_var)) + + +def _group_stat_all_by_obs_options(places, stat_vars, mode): + """Groups the result of `get_stat_all` by Observation options for time series. + + Args: + places (`str` or `iterable` of `str`): The dcids of Places to query for. + stat_vars (`Iterable` of `str`): The dcids of the StatisticalVariables. + mode (`str`): "series" to output time series grouped by Observation options, or + "covariates" to output latest Observations. + Returns: + A pandas Series with Place IDs as the index, and Observed statistics as values. + + Raises: + ValueError: If the payload returned by the Data Commons REST API is + malformed. + """ + kseries = "series" + kcov = "covariates" + + if mode == kseries: + if len(stat_vars) != 1: + raise ValueError( + 'When `mode=series`, only one StatisticalVariable for `stat_vars` is allowed.' 
+ ) + res = collections.defaultdict(list) + elif mode == kcov: + res = collections.defaultdict(lambda: collections.defaultdict(list)) + else: + raise ValueError( + 'Value of `mode` must be one of ("series", "covariates")') + + stat_all = dc.get_stat_all(places, stat_vars) + for place, place_data in stat_all.items(): + if not place_data: + continue + for stat_var, stat_var_data in place_data.items(): + if not stat_var_data: + continue + for source_series in stat_var_data['sourceSeries']: + time_series = source_series['val'] + # Create a hashable for Observation options. + obs_options = (('measurementMethod', + source_series.get('measurementMethod')), + ('observationPeriod', + source_series.get('observationPeriod')), + ('unit', source_series.get('unit')), + ('scalingFactor', + source_series.get('scalingFactor'))) + if mode == kseries: + res[obs_options].append( + dict({'place': place}, **time_series)) + elif mode == kcov: + date = max(time_series) + res[stat_var][obs_options].append({ + 'place': place, + 'date': date, + 'val': time_series[date] + }) + if mode == kseries: + return dict(res) + elif mode == kcov: + return {k: dict(v) for k, v in res.items()} + + +def _time_series_pd_input(places, stat_var): + """Returns a `list` of `dict` per element of `places` based on the `stat_var`. + + Data Commons will pick a set of Observation options that covers the + maximum number of queried places. Among ties, Data Commons selects an option + set with the latest Observation. + + Args: + places (`str` or `iterable` of `str`): The dcids of Places to query for. + stat_var (`str`): The dcid of the StatisticalVariable. + Returns: + A `list` of `dict`, one per element of `places`. Each `dict` consists of + the time series and place identifier. 
+ + Examples: + >>> _time_series_pd_input(["geoId/29", "geoId/33"], "Count_Person") + [ + {'2020-03-07': 20, '2020-03-08': 40, 'place': 'geoId/29'}, + {'2020-08-21': 428, '2020-08-22': 429, 'place': 'geoId/33'} + ] + """ + + rows_dict = _group_stat_all_by_obs_options(places, [stat_var], 'series') + most_geos = [] + max_geos_so_far = 0 + latest_date = [] + latest_date_so_far = '' + for options, rows in rows_dict.items(): + current_geos = len(rows) + if current_geos > max_geos_so_far: + max_geos_so_far = current_geos + most_geos = [options] + # Reset tiebreaker stats. Recompute after this if-else block. + latest_date = [] + latest_date_so_far = '' + elif current_geos == max_geos_so_far: + most_geos.append(options) + else: + # Do not compute tiebreaker stats if not in most_geos. + continue + for row in rows: + dates = set(row.keys()) + dates.remove('place') + row_max_date = max(dates) + if row_max_date > latest_date_so_far: + latest_date_so_far = row_max_date + latest_date = [options] + elif row_max_date == latest_date_so_far: + latest_date.append(options) + for options in most_geos: + if options in latest_date: + return rows_dict[options] + + +def build_time_series_dataframe(places, stat_var, desc_col=False): + """Constructs a pandas DataFrame with `places` as the index and dates of the time series as the columns. + + To ensure statistics are comparable across all Places, when multiple + StatVarObservations are available for Place and StatVar combos, Data + Commons selects the Observation options that covers the most Places, and breaks + ties using the Observation options that yield the latest Observation for any + Place. + + Args: + places (`str` or `iterable` of `str`): The dcids of Places to query for. + stat_var (`str`): The dcid of the StatisticalVariable. + desc_col: Whether to order columns in descending order. + Returns: + A pandas DataFrame with Place IDs as the index, and sorted dates as columns. 
+ """ + try: + if isinstance(places, six.string_types): + places = [places] + else: + places = list(places) + assert all(isinstance(place, six.string_types) for place in places) + except: + raise ValueError( + 'Parameter `places` must be a string object or list-like object of string.' + ) + if not isinstance(stat_var, six.string_types): + raise ValueError('Parameter `stat_var` must be a string.') + + df = pd.DataFrame.from_records(_time_series_pd_input(places, stat_var)) + df.set_index('place', inplace=True) + df.sort_index(inplace=True) + if desc_col: + return df[sorted(df.columns, reverse=desc_col)] + return df + + +def _covariate_pd_input(places, stat_vars): + """Returns a `list` of `dict` per element of `places` based on the `stat_var`. + + Data Commons will pick a set of StatVarObservation options that covers the + maximum number of queried places. Among ties, Data Commons selects an option + set with the latest Observation. + + Args: + places (`str` or `iterable` of `str`): The dcids of Places to query for. + stat_vars (`Iterable` of `str`): The dcids of the StatisticalVariables. + Returns: + A `list` of `dict`, one per element of `places`. Each `dict` consists of + the time series and place identifier. 
+ + Examples: + >>> _covariate_pd_input(["geoId/29", "geoId/33"], ["Count_Person", "Median_Income_Person"]) + [ + {'Count_Person': 20, 'Median_Income_Person': 40, 'place': 'geoId/29'}, + {'Count_Person': 428, 'Median_Income_Person': 429, 'place': 'geoId/33'} + ] + """ + + rows_dict = _group_stat_all_by_obs_options(places, stat_vars, 'covariates') + place2cov = collections.defaultdict(dict) # {geo: {var1: 3, var2: 33}} + + for stat_var, candidates_dict in rows_dict.items(): + selected_rows = None + most_geos = [] + max_geos_so_far = 0 + latest_date = [] + latest_date_so_far = '' + for options, rows in candidates_dict.items(): + current_geos = len(rows) + if current_geos > max_geos_so_far: + max_geos_so_far = current_geos + most_geos = [options] + # Reset tiebreaker stats. Recompute after this if-else block. + latest_date = [] + latest_date_so_far = '' + elif current_geos == max_geos_so_far: + most_geos.append(options) + else: + # Do not compute tiebreaker stats if not in most_geos. + continue + for row in rows: + row_date = row['date'] + if row_date > latest_date_so_far: + latest_date_so_far = row_date + latest_date = [options] + elif row_date == latest_date_so_far: + latest_date.append(options) + for options in most_geos: + if options in latest_date: + selected_rows = candidates_dict[options] + + for row in selected_rows: + place2cov[row['place']][stat_var] = row['val'] + return [ + dict({'place': place}, **covariates) + for place, covariates in place2cov.items() + ] + + +def build_covariate_dataframe(places, stat_vars): + """Constructs a pandas DataFrame with `places` as the index and `stat_vars` as the columns. + + To ensure statistics are comparable across all Places, when multiple + StatVarObservations are available for Place and StatVar combos, Data + Commons selects the Observation options that covers the most Places, and breaks + ties using the Observation options that yield the latest Observation for any + Place. 
+ + Args: + places (`str` or `iterable` of `str`): The dcids of Places to query for. + stat_vars (`Iterable` of `str`): The dcids of the StatisticalVariables. + Returns: + A pandas DataFrame with Place IDs as the index and `stat_vars` as columns. + """ + try: + if isinstance(places, six.string_types): + places = [places] + else: + places = list(places) + assert all(isinstance(place, six.string_types) for place in places) + if isinstance(stat_vars, six.string_types): + stat_vars = [stat_vars] + else: + stat_vars = list(stat_vars) + assert all( + isinstance(stat_var, six.string_types) + for stat_var in stat_vars) + except: + raise ValueError( + 'Parameter `places` and `stat_vars` must be string object or list-like object.' + ) + df = pd.DataFrame.from_records(_covariate_pd_input(places, stat_vars)) + df.set_index('place', inplace=True) + df.sort_index(inplace=True) + return df \ No newline at end of file diff --git a/dcpandas/examples/__init__.py b/dcpandas/examples/__init__.py new file mode 100644 index 00000000..2c79033c --- /dev/null +++ b/dcpandas/examples/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2020 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/dcpandas/examples/df_builder.py b/dcpandas/examples/df_builder.py new file mode 100644 index 00000000..cf70fd30 --- /dev/null +++ b/dcpandas/examples/df_builder.py @@ -0,0 +1,84 @@ +# Copyright 2020 Google Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Basic examples for building Pandas objects using the Data Commons Pandas API.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import dcpandas as dcpd + + +def main(): + + print('\nBuild a pd.Series of time series for one variable and one place.') + print('dcpd.build_time_series("country/CAN", "Count_WildlandFireEvent")') + print('>>> ') + print(dcpd.build_time_series("country/CAN", "Count_WildlandFireEvent")) + + print( + '\nBuild a DataFrame of time series for one variable in multiple places.' + ) + print( + 'dcpd.build_time_series_dataframe(["geoId/29", "geoId/33"], "Median_Income_Person")' + ) + print('>>> ') + print( + dcpd.build_time_series_dataframe( + ["geoId/33", "geoId/29", "country/USA"], "Median_Income_Person")) + print( + 'dcpd.build_time_series_dataframe(["geoId/29", "geoId/33"], "CumulativeCount_MedicalConditionIncident_COVID_19_PatientDeceased")' + ) + print('>>> ') + print( + dcpd.build_time_series_dataframe(["country/USA"], + "Median_Income_Person", + desc_col=True)) + print( + '\nBuild a DataFrame of latest observations for multiple variables in multiple places.' 
+ ) + + print( + 'dcpd.build_covariate_dataframe(["geoId/06", "country/FRA"], ["Median_Age_Person", "Count_Person"])' + ) + print('>>> ') + print( + dcpd.build_covariate_dataframe( + ["geoId/06", "country/FRA"], + ["Median_Age_Person", "Count_Person", "Count_Household"])) + + print('\n\nExpect 4 errors, starting HERE:') + try: + dcpd.build_time_series_dataframe( + ["geoId/33"], ["Median_Income_Person", "Count_Person"]) + except ValueError as e: + print("Successfully errored on: ", e) + try: + dcpd.build_time_series_dataframe(24, ["Median_Income_Person"]) + except ValueError as e: + print("Successfully errored on: ", e) + try: + dcpd.build_covariate_dataframe([3], + ["Median_Income_Person", "Count_Person"]) + except ValueError as e: + print("Successfully errored on: ", e) + try: + dcpd.build_covariate_dataframe("country/USA", True) + except ValueError as e: + print("Successfully errored on: ", e) + print('until HERE.') + + +if __name__ == '__main__': + main() diff --git a/dcpandas/setup.py b/dcpandas/setup.py new file mode 100644 index 00000000..53e94314 --- /dev/null +++ b/dcpandas/setup.py @@ -0,0 +1,59 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Install datacommons.""" +from setuptools import setup, find_packages + +with open('README.md', 'r') as fh: + long_description = fh.read() + +# Package metadata. +NAME = 'dcpandas' +DESCRIPTION = 'A library to create Pandas objects using the Data Commons Python API.' 
+URL = 'https://github.com/datacommonsorg/api-python' +EMAIL = 'support@datacommons.org' +AUTHOR = 'datacommons.org' +REQUIRES_PYTHON = '>=2.7' +VERSION = '0.0.1' + +REQUIRED = [ + 'six', + 'pandas', +] + +setup( + name=NAME, + version=VERSION, + description=DESCRIPTION, + long_description=long_description, + long_description_content_type='text/markdown', + author=AUTHOR, + author_email=EMAIL, + maintainer=AUTHOR, + maintainer_email=EMAIL, + python_requires=REQUIRES_PYTHON, + url=URL, + packages=find_packages(), + install_requires=REQUIRED, + include_package_data=True, + license='Apache 2.0', + classifiers=[ + 'Intended Audience :: Developers', + 'License :: OSI Approved :: Apache Software License', + 'Programming Language :: Python', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: Implementation :: CPython', + 'Topic :: Software Development', + ], +) diff --git a/dcpandas/stat_vars.py b/dcpandas/stat_vars.py new file mode 120000 index 00000000..ab7359b6 --- /dev/null +++ b/dcpandas/stat_vars.py @@ -0,0 +1 @@ +../datacommons/stat_vars.py \ No newline at end of file diff --git a/dcpandas/test/__init__.py b/dcpandas/test/__init__.py new file mode 100644 index 00000000..2c79033c --- /dev/null +++ b/dcpandas/test/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2020 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/dcpandas/test/df_builder_test.py b/dcpandas/test/df_builder_test.py new file mode 100644 index 00000000..a9fc9c65 --- /dev/null +++ b/dcpandas/test/df_builder_test.py @@ -0,0 +1,286 @@ +# Copyright 2020 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Data Commons Python API unit tests. + +Unit tests for StatVar methods in the Data Commons Pandas API. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +try: + from unittest.mock import patch +except ImportError: + from mock import patch + +import dcpandas.df_builder as dcpd +import dcpandas.utils as utils +import json +import unittest +import six +import six.moves.urllib as urllib + +# Reusable parts of REST API /stat/all response. 
+CA_COUNT_PERSON = { + "isDcAggregate": + "true", + "sourceSeries": [{ + "val": { + "1990": 23640, + "1991": 24100, + "1993": 25090, + }, + "observationPeriod": "P1Y", + "importName": "WorldDevelopmentIndicators", + "provenanceDomain": "worldbank.org" + }, { + "val": { + "1790": 3929214, + "1800": 5308483, + "1810": 7239881, + }, + "measurementMethod": "WikidataPopulation", + "importName": "WikidataPopulation", + "provenanceDomain": "wikidata.org" + }, { + "val": { + "1890": 28360, + "1891": 24910, + "1892": 25070, + }, + "measurementMethod": "OECDRegionalStatistics", + "observationPeriod": "P1Y", + "importName": "OECDRegionalDemography", + "provenanceDomain": "oecd.org" + }] +} + +HU22_COUNT_PERSON = { + "sourceSeries": [{ + "val": { + "1990": 2360, + "1991": 2410, + "1992": 2500, + }, + "measurementMethod": "OECDRegionalStatistics", + "observationPeriod": "P1Y", + "importName": "OECDRegionalDemography", + "provenanceDomain": "oecd.org" + }] +} + +CA_MEDIAN_AGE_PERSON = { + "sourceSeries": [{ + "val": { + "1990": 12, + "1991": 24, + "1992": 24, + }, + "measurementMethod": "WikidataPopulation", + "importName": "WikidataPopulation", + "provenanceDomain": "wikidata.org" + }] +} + + +def request_mock(*args, **kwargs): + """A mock urlopen requests sent in the requests package.""" + + # Create the mock response object. + class MockResponse: + + def __init__(self, json_data): + self.json_data = json_data + + def read(self): + return self.json_data + + req = args[0] + + stat_value_url_base = utils._API_ROOT + utils._API_ENDPOINTS[ + 'get_stat_value'] + stat_series_url_base = utils._API_ROOT + utils._API_ENDPOINTS[ + 'get_stat_series'] + stat_all_url_base = utils._API_ROOT + utils._API_ENDPOINTS['get_stat_all'] + + # Mock responses for urlopen requests to get_stat_series. + if req.get_full_url( + ) == stat_series_url_base + '?place=geoId/06&stat_var=Count_Person': + # Response returned when querying with basic args. 
+ return MockResponse(json.dumps({"series": {"2000": 1, "2001": 2}})) + if (req.get_full_url() == stat_series_url_base + + '?place=geoId/06&stat_var=Count_Person&' + + 'measurement_method=CensusPEPSurvey&observation_period=P1Y&' + + 'unit=RealPeople&scaling_factor=100'): + + # Response returned when querying with above optional params. + return MockResponse(json.dumps({"series": {"2000": 3, "2001": 42}})) + if (req.get_full_url() == stat_series_url_base + + '?place=geoId/06&stat_var=Count_Person&' + + 'measurement_method=DNE'): + + # Response returned when data not available for optional parameters. + # /stat/series?place=geoId/06&stat_var=Count_Person&measurement_method=DNE + return MockResponse(json.dumps({"series": {}})) + + # Mock responses for urlopen requests to get_stat_all. + if req.get_full_url() == stat_all_url_base: + data = json.loads(req.data) + + if (data['places'] == ['geoId/06', 'nuts/HU22'] and + data['stat_vars'] == ['Count_Person', 'Median_Age_Person']): + # Response returned when querying with above params. + # Median Age missing for HU22. + resp = { + "placeData": { + "geoId/06": { + "statVarData": { + "Count_Person": CA_COUNT_PERSON, + "Median_Age_Person": CA_MEDIAN_AGE_PERSON + } + }, + "nuts/HU22": { + "statVarData": { + "Count_Person": HU22_COUNT_PERSON, + "Median_Age_Person": {} + } + } + } + } + return MockResponse(json.dumps(resp)) + + if (data['places'] == ['geoId/06', 'nuts/HU22'] and + data['stat_vars'] == ['Count_Person']): + # Response returned when querying with above params. + resp = { + "placeData": { + "geoId/06": { + "statVarData": { + "Count_Person": CA_COUNT_PERSON, + } + }, + "nuts/HU22": { + "statVarData": { + "Count_Person": HU22_COUNT_PERSON, + } + } + } + } + return MockResponse(json.dumps(resp)) + print("??????????????????") + print(data) + print("??????????????????") + + if (data['places'] == ['geoId/06'] and + data['stat_vars'] == ['Count_Person']): + # Response returned when querying with above params. 
+ resp = { + "placeData": { + "geoId/06": { + "statVarData": { + "Count_Person": CA_COUNT_PERSON, + } + } + } + } + return MockResponse(json.dumps(resp)) + + if (data['places'] == ['geoId/06', 'nuts/HU22'] and + data['stat_vars'] == ['Count_Person', 'Median_Age_Person']): + # Response returned when querying with above params. + # Median Age missing for HU22. + resp = { + "placeData": { + "geoId/06": { + "statVarData": { + "Count_Person": CA_COUNT_PERSON, + "Median_Age_Person": CA_MEDIAN_AGE_PERSON + } + }, + "nuts/HU22": { + "statVarData": { + "Count_Person": HU22_COUNT_PERSON, + "Median_Age_Person": {} + } + } + } + } + return MockResponse(json.dumps(resp)) + # Otherwise, return an empty response and a 404. + return urllib.error.HTTPError + + +class TestPdTimeSeries(unittest.TestCase): + """Unit tests for _time_series_pd_input.""" + + @patch('six.moves.urllib.request.urlopen', side_effect=request_mock) + def test_basic(self, urlopen): + """Calling _time_series_pd_input with proper args.""" + rows = dcpd._time_series_pd_input(['geoId/06', 'nuts/HU22'], + 'Count_Person') + exp = [{ + "1890": 28360, + "1891": 24910, + "1892": 25070, + "place": "geoId/06" + }, { + "1991": 2410, + "1990": 2360, + "1992": 2500, + "place": "nuts/HU22" + }] + six.assertCountEqual(self, rows, exp) + + @patch('six.moves.urllib.request.urlopen', side_effect=request_mock) + def test_one_place(self, urlopen): + """Calling _time_series_pd_input with single place.""" + rows = dcpd._time_series_pd_input(['geoId/06'], 'Count_Person') + exp = [{ + "1990": 23640, + "1991": 24100, + "1993": 25090, + "place": "geoId/06" + }] + self.assertEqual(rows, exp) + + +class TestPdCovariates(unittest.TestCase): + """Unit tests for _covariate_pd_input.""" + + @patch('six.moves.urllib.request.urlopen', side_effect=request_mock) + def test_basic(self, urlopen): + """Calling _covariate_pd_input with proper args.""" + rows = dcpd._covariate_pd_input(['geoId/06', 'nuts/HU22'], + ['Count_Person', 'Median_Age_Person']) 
+ exp = [{ + "place": "geoId/06", + "Median_Age_Person": 24, + "Count_Person": 25070 + }, { + "place": "nuts/HU22", + "Count_Person": 2500 + }] + six.assertCountEqual(self, rows, exp) + + @patch('six.moves.urllib.request.urlopen', side_effect=request_mock) + def test_one_each(self, urlopen): + """Calling _covariate_pd_input with single place and var.""" + rows = dcpd._covariate_pd_input(['geoId/06'], ['Count_Person']) + exp = [{"place": "geoId/06", "Count_Person": 25090}] + self.assertEqual(rows, exp) + + +if __name__ == '__main__': + unittest.main() diff --git a/dcpandas/utils.py b/dcpandas/utils.py new file mode 120000 index 00000000..06c545f5 --- /dev/null +++ b/dcpandas/utils.py @@ -0,0 +1 @@ +../datacommons/utils.py \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 0141b5f1..35bbdfec 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ six pytest mock +pandas \ No newline at end of file diff --git a/setup.py b/setup.py index 7907027e..d8eaf315 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - """Install datacommons.""" from setuptools import setup @@ -25,7 +24,7 @@ EMAIL = 'support@datacommons.org' AUTHOR = 'datacommons.org' REQUIRES_PYTHON = '>=2.7' -VERSION = '1.3.0' +VERSION = '1.4.0' REQUIRED = [ 'six', From 32a028486c8b4b4d1388b96b16a41c8572a4a06c Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Mon, 24 Aug 2020 02:16:06 -0700 Subject: [PATCH 10/35] Do the python release in another PR. 
--- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d8eaf315..68d1016d 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ EMAIL = 'support@datacommons.org' AUTHOR = 'datacommons.org' REQUIRES_PYTHON = '>=2.7' -VERSION = '1.4.0' +VERSION = '1.3.0' REQUIRED = [ 'six', From 160eee6f346e2ea2d500b73787e748a02ea915a3 Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Mon, 24 Aug 2020 02:16:20 -0700 Subject: [PATCH 11/35] Remove stale refs in datacommons library to pandas features. --- datacommons/examples/stat_vars.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/datacommons/examples/stat_vars.py b/datacommons/examples/stat_vars.py index 9b7c29b5..8ec77a1b 100644 --- a/datacommons/examples/stat_vars.py +++ b/datacommons/examples/stat_vars.py @@ -126,22 +126,6 @@ def call_str(pvs): dc.get_stat_all(["badPlaceId", "country/FRA"], ["Median_Age_Person", "Count_Person"])) - print( - 'dc.time_series_pd_input(["geoId/29", "geoId/33"], "CumulativeCount_MedicalConditionIncident_COVID_19_PatientDeceased")' - ) - print('>>> ') - pp.pprint( - dc.time_series_pd_input(["geoId/29", "geoId/33"], - "Median_Income_Person")) - - print( - 'dc.covariate_pd_input(["geoId/06", "country/FRA"], ["Median_Age_Person", "Count_Person"])' - ) - print('>>> ') - pp.pprint( - dc.covariate_pd_input(["geoId/06", "country/FRA"], - ["Median_Age_Person", "Count_Person"])) - if __name__ == '__main__': main() From 771b0a5fb9f1bb98df7d3c12f9fbb3063d9cf1f0 Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Mon, 24 Aug 2020 02:18:07 -0700 Subject: [PATCH 12/35] Update pandas readme. 
--- dcpandas/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dcpandas/README.md b/dcpandas/README.md index 1b8c6158..fb994f97 100644 --- a/dcpandas/README.md +++ b/dcpandas/README.md @@ -4,16 +4,16 @@ This is a Python library for creating Pandas objects with data in the Data Commons Graph. To get started, install this package from pip. - pip install datacommons-pandas + pip install dcpandas -Once the package is installed, import `datacommons-pandas`. +Once the package is installed, import `dcpandas`. - import datacommons-pandas as dcpd + import dcpandas as dcpd For more detail on getting started with the API, please visit our [API Overview](http://docs.datacommons.org/api/). -After you're ready to use the API, you can refer to `datacommons-pandas/examples` for +After you're ready to use the API, you can refer to `dcpandas/examples` for examples on how to use this package to perform various tasks. More tutorials and documentation can be found at [tutorials](https://datacommons.org/colab)! From 5a86466e0a746d6ec3ea32eb03121e06e8decb6a Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Mon, 24 Aug 2020 02:19:45 -0700 Subject: [PATCH 13/35] Cleanup format. --- dcpandas/__init__.py | 2 +- dcpandas/test/df_builder_test.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/dcpandas/__init__.py b/dcpandas/__init__.py index baa42576..fe8f6bb8 100644 --- a/dcpandas/__init__.py +++ b/dcpandas/__init__.py @@ -13,4 +13,4 @@ # limitations under the License. 
from dcpandas.stat_vars import get_stat_value, get_stat_series, get_stat_all -from dcpandas.df_builder import build_time_series, build_time_series_dataframe, build_covariate_dataframe \ No newline at end of file +from dcpandas.df_builder import build_time_series, build_time_series_dataframe, build_covariate_dataframe diff --git a/dcpandas/test/df_builder_test.py b/dcpandas/test/df_builder_test.py index a9fc9c65..20e3ab76 100644 --- a/dcpandas/test/df_builder_test.py +++ b/dcpandas/test/df_builder_test.py @@ -179,9 +179,6 @@ def read(self): } } return MockResponse(json.dumps(resp)) - print("??????????????????") - print(data) - print("??????????????????") if (data['places'] == ['geoId/06'] and data['stat_vars'] == ['Count_Person']): From 5780970d7101477b55b476a57fdd86076359d8be Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Mon, 24 Aug 2020 02:47:00 -0700 Subject: [PATCH 14/35] Remove pd-related mocks from python testing. --- datacommons/test/stat_vars_test.py | 54 ------------------------------ 1 file changed, 54 deletions(-) diff --git a/datacommons/test/stat_vars_test.py b/datacommons/test/stat_vars_test.py index 60ef037a..613674a6 100644 --- a/datacommons/test/stat_vars_test.py +++ b/datacommons/test/stat_vars_test.py @@ -225,39 +225,6 @@ def read(self): } return MockResponse(json.dumps(resp)) - if (data['places'] == ['geoId/06', 'nuts/HU22'] and - data['stat_vars'] == ['Count_Person']): - # Response returned when querying with above params. - resp = { - "placeData": { - "geoId/06": { - "statVarData": { - "Count_Person": CA_COUNT_PERSON, - } - }, - "nuts/HU22": { - "statVarData": { - "Count_Person": HU22_COUNT_PERSON, - } - } - } - } - return MockResponse(json.dumps(resp)) - - if (data['places'] == ['geoId/06'] and - data['stat_vars'] == ['Count_Person']): - # Response returned when querying with above params. 
- resp = { - "placeData": { - "geoId/06": { - "statVarData": { - "Count_Person": CA_COUNT_PERSON, - } - } - } - } - return MockResponse(json.dumps(resp)) - if (data['places'] == ['badPlaceId', 'nuts/HU22'] and data['stat_vars'] == ['Count_Person', 'badStatVarId']): # Response returned when querying with above params. @@ -280,27 +247,6 @@ def read(self): } return MockResponse(json.dumps(resp)) - if (data['places'] == ['geoId/06', 'nuts/HU22'] and - data['stat_vars'] == ['Count_Person', 'Median_Age_Person']): - # Response returned when querying with above params. - # Median Age missing for HU22. - resp = { - "placeData": { - "geoId/06": { - "statVarData": { - "Count_Person": CA_COUNT_PERSON, - "Median_Age_Person": CA_MEDIAN_AGE_PERSON - } - }, - "nuts/HU22": { - "statVarData": { - "Count_Person": HU22_COUNT_PERSON, - "Median_Age_Person": {} - } - } - } - } - return MockResponse(json.dumps(resp)) # Otherwise, return an empty response and a 404. return urllib.error.HTTPError From cb83487bf73f2e992cac925414798c101b3a762f Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Mon, 24 Aug 2020 02:59:03 -0700 Subject: [PATCH 15/35] Cosmetics. --- dcpandas/CHANGELOG.md | 4 ++-- dcpandas/README.md | 2 +- dcpandas/__init__.py | 2 +- dcpandas/df_builder.py | 10 +++------- dcpandas/examples/df_builder.py | 2 +- dcpandas/setup.py | 2 +- 6 files changed, 9 insertions(+), 13 deletions(-) diff --git a/dcpandas/CHANGELOG.md b/dcpandas/CHANGELOG.md index 5249b4f6..750ab4b9 100644 --- a/dcpandas/CHANGELOG.md +++ b/dcpandas/CHANGELOG.md @@ -1,6 +1,6 @@ # Changelog -## 0.01 +## 0.0.1 **Date** - 08/24/2020 @@ -8,7 +8,7 @@ **Release Status** - Current head of branch [`master`](https://github.com/datacommonsorg/api-python/tree/master) -Added Pandas wrapper functions. +Added pandas wrapper functions. - `build_time_series` will construct a pd.Series for a given StatisticalVariable and Place, where dates are the index for the time series. 
- `build_time_series_dataframe` will construct a pd.DataFrame for a given StatisticalVariable and a set of Places: where Places are the index and date are the columns. diff --git a/dcpandas/README.md b/dcpandas/README.md index fb994f97..b3daafc3 100644 --- a/dcpandas/README.md +++ b/dcpandas/README.md @@ -1,6 +1,6 @@ # Data Commons Pandas API -This is a Python library for creating Pandas objects with data in the +This is a Python library for creating pandas objects with data in the Data Commons Graph. To get started, install this package from pip. diff --git a/dcpandas/__init__.py b/dcpandas/__init__.py index fe8f6bb8..4a34cabb 100644 --- a/dcpandas/__init__.py +++ b/dcpandas/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2017 Google Inc. +# Copyright 2020 Google Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/dcpandas/df_builder.py b/dcpandas/df_builder.py index 432b148c..fbe9ddd0 100644 --- a/dcpandas/df_builder.py +++ b/dcpandas/df_builder.py @@ -11,9 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Data Commons Python API Stat Module. +"""Data Commons Pandas API DataFrame Builder Module. -Provides functions for getting data on StatisticalVariables from Data Commons Graph. +Provides functions for building pandas DataFrames using the Data Commons Graph. """ from __future__ import absolute_import @@ -26,10 +26,6 @@ import dcpandas.stat_vars as dc -# Pandas Helpers -# These functions are wrapper functions that create Python data structures -# that are easily converted to Pandas DataFrames (and Series). - def build_time_series(place, stat_var): """Constructs a pandas Series with `dates` as the index and corresponding `stat_var` statistics as values. 
@@ -295,4 +291,4 @@ def build_covariate_dataframe(places, stat_vars): df = pd.DataFrame.from_records(_covariate_pd_input(places, stat_vars)) df.set_index('place', inplace=True) df.sort_index(inplace=True) - return df \ No newline at end of file + return df diff --git a/dcpandas/examples/df_builder.py b/dcpandas/examples/df_builder.py index cf70fd30..ab72ae73 100644 --- a/dcpandas/examples/df_builder.py +++ b/dcpandas/examples/df_builder.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Basic examples for building Pandas objects using the Data Commons Pandas API.""" +"""Basic examples for building pandas objects using the Data Commons Pandas API.""" from __future__ import absolute_import from __future__ import division diff --git a/dcpandas/setup.py b/dcpandas/setup.py index 53e94314..14e2dc70 100644 --- a/dcpandas/setup.py +++ b/dcpandas/setup.py @@ -19,7 +19,7 @@ # Package metadata. NAME = 'dcpandas' -DESCRIPTION = 'A library to create Pandas objects using the Data Commons Python API.' +DESCRIPTION = 'A library to create pandas objects using the Data Commons Python API.' URL = 'https://github.com/datacommonsorg/api-python' EMAIL = 'support@datacommons.org' AUTHOR = 'datacommons.org' From 85b3a9b83df67598cd3e75de7a159513b583fdfa Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Mon, 24 Aug 2020 03:28:20 -0700 Subject: [PATCH 16/35] Update docstring --- dcpandas/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dcpandas/setup.py b/dcpandas/setup.py index 14e2dc70..fd94ca0e 100644 --- a/dcpandas/setup.py +++ b/dcpandas/setup.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Install datacommons.""" +"""Install dcpandas.""" from setuptools import setup, find_packages with open('README.md', 'r') as fh: From 4044c0359dc5223ce888e1b965dd078afd2024ce Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Mon, 24 Aug 2020 04:19:04 -0700 Subject: [PATCH 17/35] Fix import statement for pip. Always sort time series df columns. --- dcpandas/df_builder.py | 6 ++-- setup_dcpandas.py | 62 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 4 deletions(-) create mode 100644 setup_dcpandas.py diff --git a/dcpandas/df_builder.py b/dcpandas/df_builder.py index fbe9ddd0..e0317f91 100644 --- a/dcpandas/df_builder.py +++ b/dcpandas/df_builder.py @@ -24,7 +24,7 @@ import pandas as pd import six -import dcpandas.stat_vars as dc +import datacommons.stat_vars as dc def build_time_series(place, stat_var): @@ -188,9 +188,7 @@ def build_time_series_dataframe(places, stat_var, desc_col=False): df = pd.DataFrame.from_records(_time_series_pd_input(places, stat_var)) df.set_index('place', inplace=True) df.sort_index(inplace=True) - if desc_col: - return df[sorted(df.columns, reverse=desc_col)] - return df + return df[sorted(df.columns, reverse=desc_col)] def _covariate_pd_input(places, stat_vars): diff --git a/setup_dcpandas.py b/setup_dcpandas.py new file mode 100644 index 00000000..a73fb3c8 --- /dev/null +++ b/setup_dcpandas.py @@ -0,0 +1,62 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Install dcpandas.""" +from setuptools import setup, find_packages + +with open('README.md', 'r') as fh: + long_description = fh.read() + +# Package metadata. +NAME = 'dcpandas_tjann' +DESCRIPTION = 'A library to create pandas objects using the Data Commons Python API.' +URL = 'https://github.com/datacommonsorg/api-python' +EMAIL = 'support@datacommons.org' +AUTHOR = 'datacommons.org' +REQUIRES_PYTHON = '>=2.7' +VERSION = '0.0.4' + +REQUIRED = [ + 'six', + 'pandas', +] + +PACKAGES = ['dcpandas'] +PACKAGE_DIR = {'dcpandas': 'dcpandas'} +setup( + name=NAME, + version=VERSION, + description=DESCRIPTION, + long_description=long_description, + long_description_content_type='text/markdown', + author=AUTHOR, + author_email=EMAIL, + maintainer=AUTHOR, + maintainer_email=EMAIL, + python_requires=REQUIRES_PYTHON, + url=URL, + packages=PACKAGES, + package_dir=PACKAGE_DIR, + install_requires=REQUIRED, + include_package_data=True, + license='Apache 2.0', + classifiers=[ + 'Intended Audience :: Developers', + 'License :: OSI Approved :: Apache Software License', + 'Programming Language :: Python', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: Implementation :: CPython', + 'Topic :: Software Development', + ], +) From d6290beabccf8b084749fbf964541c13cbfc1cae Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Mon, 24 Aug 2020 04:21:11 -0700 Subject: [PATCH 18/35] Restore pandas setup to prepare for release. --- setup_dcpandas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup_dcpandas.py b/setup_dcpandas.py index a73fb3c8..14dbbe33 100644 --- a/setup_dcpandas.py +++ b/setup_dcpandas.py @@ -18,13 +18,13 @@ long_description = fh.read() # Package metadata. 
-NAME = 'dcpandas_tjann' +NAME = 'dcpandas' DESCRIPTION = 'A library to create pandas objects using the Data Commons Python API.' URL = 'https://github.com/datacommonsorg/api-python' EMAIL = 'support@datacommons.org' AUTHOR = 'datacommons.org' REQUIRES_PYTHON = '>=2.7' -VERSION = '0.0.4' +VERSION = '0.0.1' REQUIRED = [ 'six', From 4bba808c4959d3548319fe28db176410b78834d4 Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Mon, 24 Aug 2020 13:40:19 -0700 Subject: [PATCH 19/35] change _group_stat_all_by_obs_options mode parameter to time_series boolean. --- dcpandas/df_builder.py | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/dcpandas/df_builder.py b/dcpandas/df_builder.py index e0317f91..64ef8395 100644 --- a/dcpandas/df_builder.py +++ b/dcpandas/df_builder.py @@ -39,14 +39,13 @@ def build_time_series(place, stat_var): return pd.Series(dc.get_stat_series(place, stat_var)) -def _group_stat_all_by_obs_options(places, stat_vars, mode): - """Groups the result of `get_stat_all` by Observation options for time series. +def _group_stat_all_by_obs_options(places, stat_vars, time_series=True): + """Groups the result of `get_stat_all` by Observation options for time series or covariates. Args: places (`str` or `iterable` of `str`): The dcids of Places to query for. stat_vars (`Iterable` of `str`): The dcids of the StatisticalVariables. - mode (`str`): "series" to output time series grouped by Observation options, or - "covariates" to output latest Observations. + mode (`boolean`): if True, output time series grouped by Observation options; if False, output latest Observation grouped by Observation options. Returns: A pandas Series with Place IDs as the index, and Observed statistics as values. @@ -54,20 +53,14 @@ def _group_stat_all_by_obs_options(places, stat_vars, mode): ValueError: If the payload returned by the Data Commons REST API is malformed. 
""" - kseries = "series" - kcov = "covariates" - - if mode == kseries: + if time_series: if len(stat_vars) != 1: raise ValueError( - 'When `mode=series`, only one StatisticalVariable for `stat_vars` is allowed.' + 'When `time_series` is set, only one StatisticalVariable for `stat_vars` is allowed.' ) res = collections.defaultdict(list) - elif mode == kcov: - res = collections.defaultdict(lambda: collections.defaultdict(list)) else: - raise ValueError( - 'Value of `mode` must be one of ("series", "covariates")') + res = collections.defaultdict(lambda: collections.defaultdict(list)) stat_all = dc.get_stat_all(places, stat_vars) for place, place_data in stat_all.items(): @@ -86,19 +79,19 @@ def _group_stat_all_by_obs_options(places, stat_vars, mode): ('unit', source_series.get('unit')), ('scalingFactor', source_series.get('scalingFactor'))) - if mode == kseries: + if time_series: res[obs_options].append( dict({'place': place}, **time_series)) - elif mode == kcov: + else: date = max(time_series) res[stat_var][obs_options].append({ 'place': place, 'date': date, 'val': time_series[date] }) - if mode == kseries: + if time_series: return dict(res) - elif mode == kcov: + else: return {k: dict(v) for k, v in res.items()} @@ -124,7 +117,8 @@ def _time_series_pd_input(places, stat_var): ] """ - rows_dict = _group_stat_all_by_obs_options(places, [stat_var], 'series') + rows_dict = _group_stat_all_by_obs_options(places, [stat_var], + time_series=True) most_geos = [] max_geos_so_far = 0 latest_date = [] @@ -213,7 +207,9 @@ def _covariate_pd_input(places, stat_vars): ] """ - rows_dict = _group_stat_all_by_obs_options(places, stat_vars, 'covariates') + rows_dict = _group_stat_all_by_obs_options(places, + stat_vars, + time_series=False) place2cov = collections.defaultdict(dict) # {geo: {var1: 3, var2: 33}} for stat_var, candidates_dict in rows_dict.items(): From c81eaa67fb6265c00c3793505777f3e5ce02fafa Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> 
Date: Mon, 24 Aug 2020 14:18:19 -0700 Subject: [PATCH 20/35] Address some documentation suggestions from cyin. --- README.md | 4 ++-- dcpandas/CHANGELOG.md | 4 ++-- dcpandas/README.md | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 8a93ce8e..ba65c8e6 100644 --- a/README.md +++ b/README.md @@ -21,9 +21,9 @@ understanding API usage. For more detail on getting started with the API, please visit our [API Overview](http://docs.datacommons.org/api/). -After you're ready to use the API, you can refer to `datacommons/examples` for +When you are ready to use the API, you can refer to `datacommons/examples` for examples on how to use this package to perform various tasks. More tutorials and -documentation can be found at [tutorials](https://datacommons.org/colab)! +documentation can be found on our [tutorials page](https://datacommons.org/colab)! ## About Data Commons diff --git a/dcpandas/CHANGELOG.md b/dcpandas/CHANGELOG.md index 750ab4b9..068f599a 100644 --- a/dcpandas/CHANGELOG.md +++ b/dcpandas/CHANGELOG.md @@ -11,8 +11,8 @@ Added pandas wrapper functions. - `build_time_series` will construct a pd.Series for a given StatisticalVariable and Place, where dates are the index for the time series. -- `build_time_series_dataframe` will construct a pd.DataFrame for a given StatisticalVariable and a set of Places: where Places are the index and date are the columns. -- `build_covariate_dataframe` will construct a covariate pd.DataFrame for a set of StatisticalVariables and a set of Places: with Places as index and StatisticalVariables as the columns. The values are the most recent values for the chosen StatVarObservation options. +- `build_time_series_dataframe` will construct a pd.DataFrame for a given StatisticalVariable and a set of Places. The DataFrame will have Places as the index and dates as the columns. 
+- `build_covariate_dataframe` will construct a covariate pd.DataFrame for a set of StatisticalVariables and a set of Places. The DataFrame will have Places as index and StatisticalVariables as the columns. The values are the most recent values for the chosen StatVarObservation options. For multi-place functions, when a StatisticalVariable has multiple StatVarObservation options, Data Commons chooses a set of StatVarObservation options that covers the most geos. This diff --git a/dcpandas/README.md b/dcpandas/README.md index b3daafc3..779e247f 100644 --- a/dcpandas/README.md +++ b/dcpandas/README.md @@ -13,9 +13,9 @@ Once the package is installed, import `dcpandas`. For more detail on getting started with the API, please visit our [API Overview](http://docs.datacommons.org/api/). -After you're ready to use the API, you can refer to `dcpandas/examples` for +When you are ready to use the API, you can refer to `dcpandas/examples` for examples on how to use this package to perform various tasks. More tutorials and -documentation can be found at [tutorials](https://datacommons.org/colab)! +documentation can be found on our [tutorials page](https://datacommons.org/colab)! ## About Data Commons From d3a618d53da1cb1a32b8e49eebd6c1855051e13e Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Mon, 24 Aug 2020 14:19:21 -0700 Subject: [PATCH 21/35] Fix bug from reassigning parameter time_series value in _group_stat_all_by_obs_options. 
--- dcpandas/df_builder.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dcpandas/df_builder.py b/dcpandas/df_builder.py index 64ef8395..dc035746 100644 --- a/dcpandas/df_builder.py +++ b/dcpandas/df_builder.py @@ -70,7 +70,7 @@ def _group_stat_all_by_obs_options(places, stat_vars, time_series=True): if not stat_var_data: continue for source_series in stat_var_data['sourceSeries']: - time_series = source_series['val'] + series = source_series['val'] # Create a hashable for Observation options. obs_options = (('measurementMethod', source_series.get('measurementMethod')), @@ -80,14 +80,13 @@ def _group_stat_all_by_obs_options(places, stat_vars, time_series=True): ('scalingFactor', source_series.get('scalingFactor'))) if time_series: - res[obs_options].append( - dict({'place': place}, **time_series)) + res[obs_options].append(dict({'place': place}, **series)) else: - date = max(time_series) + date = max(series) res[stat_var][obs_options].append({ 'place': place, 'date': date, - 'val': time_series[date] + 'val': series[date] }) if time_series: return dict(res) @@ -136,6 +135,7 @@ def _time_series_pd_input(places, stat_var): else: # Do not compute tiebreaker stats if not in most_geos. continue + for row in rows: dates = set(row.keys()) dates.remove('place') From a4bcf4efb9786f108f48aead227dd1b3c6ba0f8f Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Mon, 24 Aug 2020 14:19:47 -0700 Subject: [PATCH 22/35] Make df_builder examples more readable. 
--- dcpandas/examples/df_builder.py | 48 ++++++++++++++------------------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/dcpandas/examples/df_builder.py b/dcpandas/examples/df_builder.py index ab72ae73..aabe6519 100644 --- a/dcpandas/examples/df_builder.py +++ b/dcpandas/examples/df_builder.py @@ -22,41 +22,33 @@ def main(): - print('\nBuild a pd.Series of time series for one variable and one place.') - print('dcpd.build_time_series("country/CAN", "Count_WildlandFireEvent")') - print('>>> ') - print(dcpd.build_time_series("country/CAN", "Count_WildlandFireEvent")) + print(""" +# Build a pd.Series of time series for one variable and one place. +$ dcpd.build_time_series("country/CAN", "Count_WildlandFireEvent") +{}""".format(dcpd.build_time_series("country/CAN", "Count_WildlandFireEvent"))) - print( - '\nBuild a DataFrame of time series for one variable in multiple places.' - ) - print( - 'dcpd.build_time_series_dataframe(["geoId/29", "geoId/33"], "Median_Income_Person")' - ) - print('>>> ') - print( + print(""" +# Build a DataFrame of time series for one variable in multiple places. +$ dcpd.build_time_series_dataframe(["geoId/29", "geoId/33"], "Median_Income_Person") +{}""".format( dcpd.build_time_series_dataframe( - ["geoId/33", "geoId/29", "country/USA"], "Median_Income_Person")) - print( - 'dcpd.build_time_series_dataframe(["geoId/29", "geoId/33"], "CumulativeCount_MedicalConditionIncident_COVID_19_PatientDeceased")' - ) - print('>>> ') - print( + ["geoId/33", "geoId/29", "country/USA"], "Median_Income_Person"))) + + print(""" +# Build a DataFrame of time series with columns sorted in descending order. +$ dcpd.build_time_series_dataframe(["country/USA"], "Median_Income_Person", desc_col=True) +{}""".format( dcpd.build_time_series_dataframe(["country/USA"], "Median_Income_Person", - desc_col=True)) - print( - '\nBuild a DataFrame of latest observations for multiple variables in multiple places.' 
- ) + desc_col=True))) - print( - 'dcpd.build_covariate_dataframe(["geoId/06", "country/FRA"], ["Median_Age_Person", "Count_Person"])' - ) - print('>>> ') - print( + print(""" +# Build a DataFrame of latest observations for multiple variables in multiple places. +$ dcpd.build_covariate_dataframe(["geoId/06", "country/FRA"], ["Median_Age_Person", "Count_Person", "Count_Household"]) +{}""".format( dcpd.build_covariate_dataframe( ["geoId/06", "country/FRA"], - ["Median_Age_Person", "Count_Person", "Count_Household"])) + ["Median_Age_Person", "Count_Person", "Count_Household"]))) print('\n\nExpect 4 errors, starting HERE:') try: From a2202c08f20af03eddd6fc7e0e07c0ccd836b8df Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Mon, 24 Aug 2020 14:23:54 -0700 Subject: [PATCH 23/35] Update the docstrings for both PyPI release setup*.py files. Change dcpandas to datacommons_pandas. --- setup.py | 2 +- setup_dcpandas.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 68d1016d..f266ac52 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Install datacommons.""" +"""Build and distrubute the datacommons package to PyPI.""" from setuptools import setup with open('README.md', 'r') as fh: diff --git a/setup_dcpandas.py b/setup_dcpandas.py index 14dbbe33..79741bf3 100644 --- a/setup_dcpandas.py +++ b/setup_dcpandas.py @@ -1,4 +1,4 @@ -# Copyright 2017 Google Inc. +# Copyright 2020 Google Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,14 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -"""Install dcpandas.""" -from setuptools import setup, find_packages +"""Build and distrubute the datacommons_pandas package to PyPI.""" +from setuptools import setup with open('README.md', 'r') as fh: long_description = fh.read() # Package metadata. -NAME = 'dcpandas' +NAME = 'datacommons_pandas' DESCRIPTION = 'A library to create pandas objects using the Data Commons Python API.' URL = 'https://github.com/datacommonsorg/api-python' EMAIL = 'support@datacommons.org' From 0ebb20fd267600904aa94c1c4c4da17cf24bbb2a Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Mon, 24 Aug 2020 15:52:27 -0700 Subject: [PATCH 24/35] Rename time_series parameter to keep_series for _group_stat_all_by_obs_options. --- dcpandas/df_builder.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/dcpandas/df_builder.py b/dcpandas/df_builder.py index dc035746..74160755 100644 --- a/dcpandas/df_builder.py +++ b/dcpandas/df_builder.py @@ -39,13 +39,13 @@ def build_time_series(place, stat_var): return pd.Series(dc.get_stat_series(place, stat_var)) -def _group_stat_all_by_obs_options(places, stat_vars, time_series=True): +def _group_stat_all_by_obs_options(places, stat_vars, keep_series=True): """Groups the result of `get_stat_all` by Observation options for time series or covariates. Args: places (`str` or `iterable` of `str`): The dcids of Places to query for. stat_vars (`Iterable` of `str`): The dcids of the StatisticalVariables. - mode (`boolean`): if True, output time series grouped by Observation options; if False, output latest Observation grouped by Observation options. + keep_series (`boolean`): if True, output time series grouped by Observation options; if False, output latest Observation grouped by Observation options. Returns: A pandas Series with Place IDs as the index, and Observed statistics as values. 
@@ -53,10 +53,10 @@ def _group_stat_all_by_obs_options(places, stat_vars, time_series=True): ValueError: If the payload returned by the Data Commons REST API is malformed. """ - if time_series: + if keep_series: if len(stat_vars) != 1: raise ValueError( - 'When `time_series` is set, only one StatisticalVariable for `stat_vars` is allowed.' + 'When `keep_series` is set, only one StatisticalVariable for `stat_vars` is allowed.' ) res = collections.defaultdict(list) else: @@ -79,7 +79,7 @@ def _group_stat_all_by_obs_options(places, stat_vars, time_series=True): ('unit', source_series.get('unit')), ('scalingFactor', source_series.get('scalingFactor'))) - if time_series: + if keep_series: res[obs_options].append(dict({'place': place}, **series)) else: date = max(series) @@ -88,7 +88,7 @@ def _group_stat_all_by_obs_options(places, stat_vars, time_series=True): 'date': date, 'val': series[date] }) - if time_series: + if keep_series: return dict(res) else: return {k: dict(v) for k, v in res.items()} @@ -117,7 +117,7 @@ def _time_series_pd_input(places, stat_var): """ rows_dict = _group_stat_all_by_obs_options(places, [stat_var], - time_series=True) + keep_series=True) most_geos = [] max_geos_so_far = 0 latest_date = [] @@ -209,7 +209,7 @@ def _covariate_pd_input(places, stat_vars): rows_dict = _group_stat_all_by_obs_options(places, stat_vars, - time_series=False) + keep_series=False) place2cov = collections.defaultdict(dict) # {geo: {var1: 3, var2: 33}} for stat_var, candidates_dict in rows_dict.items(): From efd2e0c9834909a20425e035d0aa3e22724e1c53 Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Mon, 24 Aug 2020 19:33:00 -0700 Subject: [PATCH 25/35] dcpandas to datacommons_pandas, including all datacommons functions --- datacommons/__init__.py | 2 +- {dcpandas => datacommons_pandas}/CHANGELOG.md | 0 {dcpandas => datacommons_pandas}/README.md | 0 datacommons_pandas/__init__.py | 27 +++++++++ datacommons_pandas/core.py | 1 + 
.../df_builder.py | 2 +- .../examples/__init__.py | 0 .../examples/df_builder.py | 2 +- datacommons_pandas/places.py | 1 + datacommons_pandas/populations.py | 1 + datacommons_pandas/query.py | 1 + {dcpandas => datacommons_pandas}/stat_vars.py | 0 .../test/__init__.py | 0 .../test/df_builder_test.py | 4 +- {dcpandas => datacommons_pandas}/utils.py | 0 dcpandas/__init__.py | 16 ----- dcpandas/setup.py | 59 ------------------- setup.py => setup_datacommons.py | 0 ...dcpandas.py => setup_datacommons_pandas.py | 6 +- 19 files changed, 39 insertions(+), 83 deletions(-) rename {dcpandas => datacommons_pandas}/CHANGELOG.md (100%) rename {dcpandas => datacommons_pandas}/README.md (100%) create mode 100644 datacommons_pandas/__init__.py create mode 120000 datacommons_pandas/core.py rename {dcpandas => datacommons_pandas}/df_builder.py (99%) rename {dcpandas => datacommons_pandas}/examples/__init__.py (100%) rename {dcpandas => datacommons_pandas}/examples/df_builder.py (98%) create mode 120000 datacommons_pandas/places.py create mode 120000 datacommons_pandas/populations.py create mode 120000 datacommons_pandas/query.py rename {dcpandas => datacommons_pandas}/stat_vars.py (100%) rename {dcpandas => datacommons_pandas}/test/__init__.py (100%) rename {dcpandas => datacommons_pandas}/test/df_builder_test.py (99%) rename {dcpandas => datacommons_pandas}/utils.py (100%) delete mode 100644 dcpandas/__init__.py delete mode 100644 dcpandas/setup.py rename setup.py => setup_datacommons.py (100%) rename setup_dcpandas.py => setup_datacommons_pandas.py (92%) diff --git a/datacommons/__init__.py b/datacommons/__init__.py index da2f9fc4..3d7384d5 100644 --- a/datacommons/__init__.py +++ b/datacommons/__init__.py @@ -22,4 +22,4 @@ from datacommons.stat_vars import get_stat_value, get_stat_series, get_stat_all # Other utilities -from .utils import set_api_key +from datacommons.utils import set_api_key diff --git a/dcpandas/CHANGELOG.md b/datacommons_pandas/CHANGELOG.md similarity index 
100% rename from dcpandas/CHANGELOG.md rename to datacommons_pandas/CHANGELOG.md diff --git a/dcpandas/README.md b/datacommons_pandas/README.md similarity index 100% rename from dcpandas/README.md rename to datacommons_pandas/README.md diff --git a/datacommons_pandas/__init__.py b/datacommons_pandas/__init__.py new file mode 100644 index 00000000..2c0e7268 --- /dev/null +++ b/datacommons_pandas/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Data Commons SPARQL query support +from datacommons_pandas.query import query + +# Data Commons Python API +from datacommons_pandas.core import get_property_labels, get_property_values, get_triples +from datacommons_pandas.places import get_places_in, get_related_places, get_stats +from datacommons_pandas.populations import get_populations, get_observations, get_pop_obs, get_place_obs +from datacommons_pandas.stat_vars import get_stat_value, get_stat_series, get_stat_all + +# Other utilities +from datacommons_pandas.utils import set_api_key + +from datacommons_pandas.df_builder import build_time_series, build_time_series_dataframe, build_covariate_dataframe diff --git a/datacommons_pandas/core.py b/datacommons_pandas/core.py new file mode 120000 index 00000000..15f455cf --- /dev/null +++ b/datacommons_pandas/core.py @@ -0,0 +1 @@ +../datacommons/core.py \ No newline at end of file diff --git a/dcpandas/df_builder.py b/datacommons_pandas/df_builder.py similarity index 99% rename from dcpandas/df_builder.py rename to datacommons_pandas/df_builder.py index 74160755..c965a973 100644 --- a/dcpandas/df_builder.py +++ b/datacommons_pandas/df_builder.py @@ -24,7 +24,7 @@ import pandas as pd import six -import datacommons.stat_vars as dc +import datacommons_pandas.stat_vars as dc def build_time_series(place, stat_var): diff --git a/dcpandas/examples/__init__.py b/datacommons_pandas/examples/__init__.py similarity index 100% rename from dcpandas/examples/__init__.py rename to datacommons_pandas/examples/__init__.py diff --git a/dcpandas/examples/df_builder.py b/datacommons_pandas/examples/df_builder.py similarity index 98% rename from dcpandas/examples/df_builder.py rename to datacommons_pandas/examples/df_builder.py index aabe6519..2939e0e5 100644 --- a/dcpandas/examples/df_builder.py +++ b/datacommons_pandas/examples/df_builder.py @@ -17,7 +17,7 @@ from __future__ import division from __future__ import print_function -import dcpandas as dcpd +import datacommons_pandas 
as dcpd def main(): diff --git a/datacommons_pandas/places.py b/datacommons_pandas/places.py new file mode 120000 index 00000000..7206307a --- /dev/null +++ b/datacommons_pandas/places.py @@ -0,0 +1 @@ +../datacommons/places.py \ No newline at end of file diff --git a/datacommons_pandas/populations.py b/datacommons_pandas/populations.py new file mode 120000 index 00000000..3e74c37b --- /dev/null +++ b/datacommons_pandas/populations.py @@ -0,0 +1 @@ +../datacommons/populations.py \ No newline at end of file diff --git a/datacommons_pandas/query.py b/datacommons_pandas/query.py new file mode 120000 index 00000000..d7db3c39 --- /dev/null +++ b/datacommons_pandas/query.py @@ -0,0 +1 @@ +../datacommons/query.py \ No newline at end of file diff --git a/dcpandas/stat_vars.py b/datacommons_pandas/stat_vars.py similarity index 100% rename from dcpandas/stat_vars.py rename to datacommons_pandas/stat_vars.py diff --git a/dcpandas/test/__init__.py b/datacommons_pandas/test/__init__.py similarity index 100% rename from dcpandas/test/__init__.py rename to datacommons_pandas/test/__init__.py diff --git a/dcpandas/test/df_builder_test.py b/datacommons_pandas/test/df_builder_test.py similarity index 99% rename from dcpandas/test/df_builder_test.py rename to datacommons_pandas/test/df_builder_test.py index 20e3ab76..f917bb9b 100644 --- a/dcpandas/test/df_builder_test.py +++ b/datacommons_pandas/test/df_builder_test.py @@ -25,8 +25,8 @@ except ImportError: from mock import patch -import dcpandas.df_builder as dcpd -import dcpandas.utils as utils +import datacommons_pandas.df_builder as dcpd +import datacommons_pandas.utils as utils import json import unittest import six diff --git a/dcpandas/utils.py b/datacommons_pandas/utils.py similarity index 100% rename from dcpandas/utils.py rename to datacommons_pandas/utils.py diff --git a/dcpandas/__init__.py b/dcpandas/__init__.py deleted file mode 100644 index 4a34cabb..00000000 --- a/dcpandas/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# 
Copyright 2020 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dcpandas.stat_vars import get_stat_value, get_stat_series, get_stat_all -from dcpandas.df_builder import build_time_series, build_time_series_dataframe, build_covariate_dataframe diff --git a/dcpandas/setup.py b/dcpandas/setup.py deleted file mode 100644 index fd94ca0e..00000000 --- a/dcpandas/setup.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright 2017 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Install dcpandas.""" -from setuptools import setup, find_packages - -with open('README.md', 'r') as fh: - long_description = fh.read() - -# Package metadata. -NAME = 'dcpandas' -DESCRIPTION = 'A library to create pandas objects using the Data Commons Python API.' 
-URL = 'https://github.com/datacommonsorg/api-python' -EMAIL = 'support@datacommons.org' -AUTHOR = 'datacommons.org' -REQUIRES_PYTHON = '>=2.7' -VERSION = '0.0.1' - -REQUIRED = [ - 'six', - 'pandas', -] - -setup( - name=NAME, - version=VERSION, - description=DESCRIPTION, - long_description=long_description, - long_description_content_type='text/markdown', - author=AUTHOR, - author_email=EMAIL, - maintainer=AUTHOR, - maintainer_email=EMAIL, - python_requires=REQUIRES_PYTHON, - url=URL, - packages=find_packages(), - install_requires=REQUIRED, - include_package_data=True, - license='Apache 2.0', - classifiers=[ - 'Intended Audience :: Developers', - 'License :: OSI Approved :: Apache Software License', - 'Programming Language :: Python', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: Implementation :: CPython', - 'Topic :: Software Development', - ], -) diff --git a/setup.py b/setup_datacommons.py similarity index 100% rename from setup.py rename to setup_datacommons.py diff --git a/setup_dcpandas.py b/setup_datacommons_pandas.py similarity index 92% rename from setup_dcpandas.py rename to setup_datacommons_pandas.py index 79741bf3..f59a49d7 100644 --- a/setup_dcpandas.py +++ b/setup_datacommons_pandas.py @@ -14,7 +14,7 @@ """Build and distrubute the datacommons_pandas package to PyPI.""" from setuptools import setup -with open('README.md', 'r') as fh: +with open('datacommons_pandas/README.md', 'r') as fh: long_description = fh.read() # Package metadata. @@ -31,8 +31,8 @@ 'pandas', ] -PACKAGES = ['dcpandas'] -PACKAGE_DIR = {'dcpandas': 'dcpandas'} +PACKAGES = ['datacommons_pandas'] +PACKAGE_DIR = {'datacommons_pandas': 'datacommons_pandas'} setup( name=NAME, version=VERSION, From f645f3fefb30a0867175687fe8e312b5e12c3098 Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Mon, 24 Aug 2020 20:02:56 -0700 Subject: [PATCH 26/35] Fix various docstrings. 
--- datacommons/__init__.py | 8 ++++++++ datacommons/examples/stat_vars.py | 2 +- datacommons_pandas/README.md | 8 ++++---- datacommons_pandas/__init__.py | 9 +++++++-- datacommons_pandas/df_builder.py | 32 +++++++++++++++++-------------- setup_datacommons.py | 2 +- setup_datacommons_pandas.py | 2 +- 7 files changed, 40 insertions(+), 23 deletions(-) diff --git a/datacommons/__init__.py b/datacommons/__init__.py index 3d7384d5..58903a6b 100644 --- a/datacommons/__init__.py +++ b/datacommons/__init__.py @@ -12,6 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +################################## IMPORTANT ################################# +# All user-facing functions in this package must be symlinked to the # +# datacommons_pandas pkg. This is so that users do not need to import both # +# libraries for pd support. Please keep the below imports in sync with the # +# __init__.py in the datacommons_pandas/ dir, and add a symlink when # +# creating a new file. # +############################################################################### + # Data Commons SPARQL query support from datacommons.query import query diff --git a/datacommons/examples/stat_vars.py b/datacommons/examples/stat_vars.py index 8ec77a1b..73f353f6 100644 --- a/datacommons/examples/stat_vars.py +++ b/datacommons/examples/stat_vars.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
-"""Basic examples for StatisticalVariable-based param_set Data Commons API functions.""" +"""Basic examples for StatisticalVariable-based Data Commons API functions.""" from __future__ import absolute_import from __future__ import division diff --git a/datacommons_pandas/README.md b/datacommons_pandas/README.md index 779e247f..135e8224 100644 --- a/datacommons_pandas/README.md +++ b/datacommons_pandas/README.md @@ -4,16 +4,16 @@ This is a Python library for creating pandas objects with data in the Data Commons Graph. To get started, install this package from pip. - pip install dcpandas + pip install datacommons_pandas -Once the package is installed, import `dcpandas`. +Once the package is installed, import `datacommons_pandas`. - import dcpandas as dcpd + import datacommons_pandas as dcpd For more detail on getting started with the API, please visit our [API Overview](http://docs.datacommons.org/api/). -When you are ready to use the API, you can refer to `dcpandas/examples` for +When you are ready to use the API, you can refer to `datacommons_pandas/examples` for examples on how to use this package to perform various tasks. More tutorials and documentation can be found on our [tutorials page](https://datacommons.org/colab)! diff --git a/datacommons_pandas/__init__.py b/datacommons_pandas/__init__.py index 2c0e7268..0bc6d75d 100644 --- a/datacommons_pandas/__init__.py +++ b/datacommons_pandas/__init__.py @@ -12,6 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +from datacommons_pandas.df_builder import build_time_series, build_time_series_dataframe, build_covariate_dataframe + +################################ SIMLINK FILES ################################ +# We include simlinks to all user-facing functions from the datacommons pkg. # +# This is so that users do not need to import both libraries for pd support. 
# +# Please keep the below in sync with the __init__.py in the datacommons/ dir # +##############################################@################################ # Data Commons SPARQL query support from datacommons_pandas.query import query @@ -23,5 +30,3 @@ # Other utilities from datacommons_pandas.utils import set_api_key - -from datacommons_pandas.df_builder import build_time_series, build_time_series_dataframe, build_covariate_dataframe diff --git a/datacommons_pandas/df_builder.py b/datacommons_pandas/df_builder.py index c965a973..93ccbdef 100644 --- a/datacommons_pandas/df_builder.py +++ b/datacommons_pandas/df_builder.py @@ -34,20 +34,23 @@ def build_time_series(place, stat_var): place (`str`): The dcid of Place to query for. stat_var (`str`): The dcid of the StatisticalVariable. Returns: - A pandas Series with Place IDs as the index, and Observed statistics as values. + A pandas Series with Place IDs as the index, and observed statistics as values. """ return pd.Series(dc.get_stat_series(place, stat_var)) def _group_stat_all_by_obs_options(places, stat_vars, keep_series=True): - """Groups the result of `get_stat_all` by Observation options for time series or covariates. + """Groups the result of `get_stat_all` by StatVarObservation options for time series or covariates. Args: places (`str` or `iterable` of `str`): The dcids of Places to query for. stat_vars (`Iterable` of `str`): The dcids of the StatisticalVariables. - keep_series (`boolean`): if True, output time series grouped by Observation options; if False, output latest Observation grouped by Observation options. + keep_series (`boolean`): if True, output time series grouped by + StatVarObservation options; if False, output latest statistics grouped + by StatVarObservation options. Returns: - A pandas Series with Place IDs as the index, and Observed statistics as values. + A pandas Series with Place IDs as the index, and observed statistics as + values. 
Raises: ValueError: If the payload returned by the Data Commons REST API is @@ -97,7 +100,7 @@ def _group_stat_all_by_obs_options(places, stat_vars, keep_series=True): def _time_series_pd_input(places, stat_var): """Returns a `list` of `dict` per element of `places` based on the `stat_var`. - Data Commons will pick a set of Observation options that covers the + Data Commons will pick a set of StatVarObservation options that covers the maximum number of queried places. Among ties, Data Commons selects an option set with the latest Observation. @@ -154,10 +157,10 @@ def build_time_series_dataframe(places, stat_var, desc_col=False): """Constructs a pandas DataFrame with `places` as the index and dates of the time series as the columns. To ensure statistics are comparable across all Places, when multiple - StatVarObservations are available for Place and StatVar combos, Data - Commons selects the Observation options that covers the most Places, and breaks - ties using the Observation options that yield the latest Observation for any - Place. + StatVarObservation options are available for Place and StatVar combos, + Data Commons selects the StatVarObservation options that cover the most + Places, and breaks ties using the StatVarObservation options that yield + the latest Observation for any Place. Args: places (`str` or `iterable` of `str`): The dcids of Places to query for. @@ -200,7 +203,8 @@ def _covariate_pd_input(places, stat_vars): the time series and place identifier.
Examples: - >>> _covariate_pd_input(["geoId/29", "geoId/33"], ["Count_Person", "Median_Income_Person"]) + >>> _covariate_pd_input(["geoId/29", "geoId/33"], + ["Count_Person", "Median_Income_Person"]) [ {'Count_Person': 20, 'Median_Income_Person': 40, 'place': 'geoId/29'}, {'Count_Person': 428, 'Median_Income_Person': 429, 'place': 'geoId/33'} @@ -254,10 +258,10 @@ def build_covariate_dataframe(places, stat_vars): """Constructs a pandas DataFrame with `places` as the index and `stat_vars` as the columns. To ensure statistics are comparable across all Places, when multiple - StatVarObservations are available for Place and StatVar combos, Data - Commons selects the Observation options that covers the most Places, and breaks - ties using the Observation options that yield the latest Observation for any - Place. + StatVarObservation options are available for Place and StatVar combos, + Data Commons selects the StatVarObservation options that cover the most + Places, and breaks ties using the StatVarObservation options that yield + the latest Observation for any Place. Args: places (`str` or `iterable` of `str`): The dcids of Places to query for. diff --git a/setup_datacommons.py b/setup_datacommons.py index f266ac52..bf3efd64 100644 --- a/setup_datacommons.py +++ b/setup_datacommons.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Build and distrubute the datacommons package to PyPI.""" +"""Build and distribute the datacommons package to PyPI.""" from setuptools import setup with open('README.md', 'r') as fh: diff --git a/setup_datacommons_pandas.py b/setup_datacommons_pandas.py index f59a49d7..ec6034bd 100644 --- a/setup_datacommons_pandas.py +++ b/setup_datacommons_pandas.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # limitations under the License. -"""Build and distrubute the datacommons_pandas package to PyPI.""" +"""Build and distribute the datacommons_pandas package to PyPI.""" from setuptools import setup with open('datacommons_pandas/README.md', 'r') as fh: From 18cb93e0749a32a8a8d8051c86b48c81f1042d13 Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Tue, 25 Aug 2020 07:54:29 -0700 Subject: [PATCH 27/35] Add optional args to pandas lib build_time_series to pass onto python get_stat_series. --- datacommons_pandas/df_builder.py | 11 +++++++++-- datacommons_pandas/examples/df_builder.py | 5 +++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/datacommons_pandas/df_builder.py b/datacommons_pandas/df_builder.py index 93ccbdef..501ecb5a 100644 --- a/datacommons_pandas/df_builder.py +++ b/datacommons_pandas/df_builder.py @@ -27,7 +27,12 @@ import datacommons_pandas.stat_vars as dc -def build_time_series(place, stat_var): +def build_time_series(place, + stat_var, + measurement_method=None, + observation_period=None, + unit=None, + scaling_factor=None): """Constructs a pandas Series with `dates` as the index and corresponding `stat_var` statistics as values. Args: @@ -36,7 +41,9 @@ def build_time_series(place, stat_var): Returns: A pandas Series with Place IDs as the index, and observed statistics as values. 
""" - return pd.Series(dc.get_stat_series(place, stat_var)) + return pd.Series( + dc.get_stat_series(place, stat_var, measurement_method, + observation_period, unit, scaling_factor)) def _group_stat_all_by_obs_options(places, stat_vars, keep_series=True): diff --git a/datacommons_pandas/examples/df_builder.py b/datacommons_pandas/examples/df_builder.py index 2939e0e5..735b1d5a 100644 --- a/datacommons_pandas/examples/df_builder.py +++ b/datacommons_pandas/examples/df_builder.py @@ -25,6 +25,11 @@ def main(): print(""" # Build a pd.Series of time series for one variable and one place. $ dcpd.build_time_series("country/CAN", "Count_WildlandFireEvent") +{}""".format(dcpd.build_time_series("country/CAN", "Count_WildlandFireEvent"))) + + print(""" +# Build a pd.Series of time series for one variable and one place and optional args. +$ dcpd.build_time_series("country/USA", "Count_Person", "CensusPEPSurvey") {}""".format(dcpd.build_time_series("country/CAN", "Count_WildlandFireEvent"))) print(""" From 51582c83297fcc9f608a2cf2a7744b6d0264f3d4 Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Tue, 25 Aug 2020 07:57:35 -0700 Subject: [PATCH 28/35] Update docstrings for time series funcs. 
--- datacommons/stat_vars.py | 6 +++--- datacommons_pandas/df_builder.py | 9 ++++++++- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/datacommons/stat_vars.py b/datacommons/stat_vars.py index bd2ba72f..dd6c2cee 100644 --- a/datacommons/stat_vars.py +++ b/datacommons/stat_vars.py @@ -72,9 +72,9 @@ def get_stat_value(place, url += '&scaling_factor={}'.format(scaling_factor) try: - res_json = utils._send_request(url, post=False, use_payload=False) + res_json = utils._send_request(url, post=False, use_payload=False) except ValueError: - raise ValueError('No data in response.') + raise ValueError('No data in response.') return res_json['value'] @@ -97,7 +97,7 @@ def get_stat_series(place, scaling_factor (`int`): Optional, the preferred `scalingFactor` value. Returns: A `dict` mapping dates to value of `stat_var` for `place`, - filtered by optional args. + representing a time series that satisfies all input parameters. Raises: ValueError: If the payload returned by the Data Commons REST API is diff --git a/datacommons_pandas/df_builder.py b/datacommons_pandas/df_builder.py index 501ecb5a..72889afd 100644 --- a/datacommons_pandas/df_builder.py +++ b/datacommons_pandas/df_builder.py @@ -38,8 +38,15 @@ def build_time_series(place, Args: place (`str`): The dcid of Place to query for. stat_var (`str`): The dcid of the StatisticalVariable. + measurement_method (`str`): Optional, the dcid of the preferred + `measurementMethod` value. + observation_period (`str`): Optional, the preferred + `observationPeriod` value. + unit (`str`): Optional, the dcid of the preferred `unit` value. + scaling_factor (`int`): Optional, the preferred `scalingFactor` value. Returns: - A pandas Series with Place IDs as the index, and observed statistics as values. + A pandas Series with Place IDs as the index and observed statistics as + values, representing a time series satisfying all optional args. 
""" return pd.Series( dc.get_stat_series(place, stat_var, measurement_method, From a49a095d4d24cbc029c6aa5e04a381739e13ec88 Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Tue, 25 Aug 2020 09:33:04 -0700 Subject: [PATCH 29/35] Remove will from CHANGELOG. --- datacommons_pandas/CHANGELOG.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datacommons_pandas/CHANGELOG.md b/datacommons_pandas/CHANGELOG.md index 068f599a..c130193d 100644 --- a/datacommons_pandas/CHANGELOG.md +++ b/datacommons_pandas/CHANGELOG.md @@ -10,9 +10,9 @@ Added pandas wrapper functions. -- `build_time_series` will construct a pd.Series for a given StatisticalVariable and Place, where dates are the index for the time series. -- `build_time_series_dataframe` will construct a pd.DataFrame for a given StatisticalVariable and a set of Places. The DataFrame will have Places as the index and dates as the columns. -- `build_covariate_dataframe` will construct a covariate pd.DataFrame for a set of StatisticalVariables and a set of Places. The DataFrame will have Places as index and StatisticalVariables as the columns. The values are the most recent values for the chosen StatVarObservation options. +- `build_time_series` constructs a pd.Series for a given StatisticalVariable and Place, where dates are the index for the time series. +- `build_time_series_dataframe` constructs a pd.DataFrame for a given StatisticalVariable and a set of Places. The DataFrame will have Places as the index and dates as the columns. +- `build_covariate_dataframe` constructs a covariate pd.DataFrame for a set of StatisticalVariables and a set of Places. The DataFrame will have Places as index and StatisticalVariables as the columns. The values are the most recent values for the chosen StatVarObservation options. 
For multi-place functions, when a StatisticalVariable has multiple StatVarObservation options, Data Commons chooses a set of StatVarObservation options that covers the most geos. This From 7f46fdd2a3fe9210927a4ec499f196ff388f585a Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Tue, 25 Aug 2020 09:41:02 -0700 Subject: [PATCH 30/35] Reference TODO for cloudbuild pandas-python sync check. Update changelog. --- datacommons/__init__.py | 3 ++- datacommons_pandas/CHANGELOG.md | 2 +- datacommons_pandas/__init__.py | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/datacommons/__init__.py b/datacommons/__init__.py index 58903a6b..182ecc2e 100644 --- a/datacommons/__init__.py +++ b/datacommons/__init__.py @@ -18,7 +18,8 @@ # libraries for pd support. Please keep the below imports in sync with the # # __init__.py in the datacommons_pandas/ dir, and add a simlink when # # creating a new file. # -##############################################@################################ +# TODO: https://github.com/datacommonsorg/api-python/issues/149 # +############################################################################## # Data Commons SPARQL query support from datacommons.query import query diff --git a/datacommons_pandas/CHANGELOG.md b/datacommons_pandas/CHANGELOG.md index c130193d..77ea2b0a 100644 --- a/datacommons_pandas/CHANGELOG.md +++ b/datacommons_pandas/CHANGELOG.md @@ -10,7 +10,7 @@ Added pandas wrapper functions. -- `build_time_series` constructs a pd.Series for a given StatisticalVariable and Place, where dates are the index for the time series. +- `build_time_series` constructs a pd.Series for a given StatisticalVariable and Place, where the time series are indexed by date. - `build_time_series_dataframe` constructs a pd.DataFrame for a given StatisticalVariable and a set of Places. The DataFrame will have Places as the index and dates as the columns. 
- `build_covariate_dataframe` constructs a covariate pd.DataFrame for a set of StatisticalVariables and a set of Places. The DataFrame will have Places as index and StatisticalVariables as the columns. The values are the most recent values for the chosen StatVarObservation options. diff --git a/datacommons_pandas/__init__.py b/datacommons_pandas/__init__.py index 0bc6d75d..b68f5d65 100644 --- a/datacommons_pandas/__init__.py +++ b/datacommons_pandas/__init__.py @@ -18,6 +18,7 @@ # We include simlinks to all user-facing functions from the datacommons pkg. # # This is so that users do not need to import both libraries for pd support. # # Please keep the below in sync with the __init__.py in the datacommons/ dir # +# TODO: enforce this. https://github.com/datacommonsorg/api-python/issues/149 # ##############################################@################################ # Data Commons SPARQL query support from datacommons_pandas.query import query From 87b13ec2c8373a0beef1d7c086c83e935728e509 Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Tue, 25 Aug 2020 12:50:00 -0700 Subject: [PATCH 31/35] Rename covariate* to multivariate*, address cyin's comments on df_builder; add test for raising no data error; fix various tests that were returning the type HTTPError instead of an instance of HTTPError. 
--- datacommons/__init__.py | 4 +- datacommons/test/core_test.py | 2 +- datacommons/test/places_test.py | 2 +- datacommons/test/stat_vars_test.py | 2 +- datacommons/utils.py | 5 ++- datacommons_pandas/README.md | 1 + datacommons_pandas/__init__.py | 6 +-- datacommons_pandas/df_builder.py | 47 +++++++++++++--------- datacommons_pandas/examples/df_builder.py | 25 +++++++++--- datacommons_pandas/test/df_builder_test.py | 28 +++++++++---- 10 files changed, 80 insertions(+), 42 deletions(-) diff --git a/datacommons/__init__.py b/datacommons/__init__.py index 182ecc2e..93d61ab4 100644 --- a/datacommons/__init__.py +++ b/datacommons/__init__.py @@ -13,10 +13,10 @@ # limitations under the License. ################################## IMPORTANT ################################# -# All user-facing functions in this package must be simlinked to the # +# All user-facing functions in this package must be symlinked to the # # datacommons_pandas pkg. This is so that users do not need to import both # # libraries for pd support. Please keep the below imports in sync with the # -# __init__.py in the datacommons_pandas/ dir, and add a simlink when # +# __init__.py in the datacommons_pandas/ dir, and add a symlink when # # creating a new file. # # TODO: https://github.com/datacommonsorg/api-python/issues/149 # ############################################################################## diff --git a/datacommons/test/core_test.py b/datacommons/test/core_test.py index 8dae3443..e64064a6 100644 --- a/datacommons/test/core_test.py +++ b/datacommons/test/core_test.py @@ -311,7 +311,7 @@ def read(self): return MockResponse(json.dumps({'payload': res_json})) # Otherwise, return an empty response and a 404. 
- return urllib.error.HTTPError + return urllib.error.HTTPError(None, 404, None, None, None) class TestGetPropertyLabels(unittest.TestCase): diff --git a/datacommons/test/places_test.py b/datacommons/test/places_test.py index 6c57a8cd..d4147655 100644 --- a/datacommons/test/places_test.py +++ b/datacommons/test/places_test.py @@ -212,7 +212,7 @@ def read(self): return MockResponse(json.dumps({'payload': res_json})) # Otherwise, return an empty response and a 404. - return urllib.error.HTTPError + return urllib.error.HTTPError(None, 404, None, None, None) class TestGetPlacesIn(unittest.TestCase): """ Unit stests for get_places_in. """ diff --git a/datacommons/test/stat_vars_test.py b/datacommons/test/stat_vars_test.py index 613674a6..e7a5a44d 100644 --- a/datacommons/test/stat_vars_test.py +++ b/datacommons/test/stat_vars_test.py @@ -248,7 +248,7 @@ def read(self): return MockResponse(json.dumps(resp)) # Otherwise, return an empty response and a 404. - return urllib.error.HTTPError + return urllib.error.HTTPError(None, 404, None, None, None) class TestGetStatValue(unittest.TestCase): diff --git a/datacommons/utils.py b/datacommons/utils.py index b0f855e5..502205e9 100644 --- a/datacommons/utils.py +++ b/datacommons/utils.py @@ -113,7 +113,10 @@ def _send_request(req_url, req_json={}, compress=False, post=True, use_payload=T raise ValueError( 'Response error: An HTTP {} code was returned by the mixer. Printing ' 'response\n\n{}'.format(e.code, e.read())) - + if isinstance(res, six.moves.urllib.error.HTTPError): + raise ValueError( + 'Response error: An HTTP {} code was returned by the mixer. 
Printing ' + 'response\n\n{}'.format(res.code, res.msg)) # Get the JSON res_json = json.loads(res.read()) if not use_payload: diff --git a/datacommons_pandas/README.md b/datacommons_pandas/README.md index 135e8224..0386fec3 100644 --- a/datacommons_pandas/README.md +++ b/datacommons_pandas/README.md @@ -2,6 +2,7 @@ This is a Python library for creating pandas objects with data in the Data Commons Graph. + To get started, install this package from pip. pip install datacommons_pandas diff --git a/datacommons_pandas/__init__.py b/datacommons_pandas/__init__.py index b68f5d65..c19dca9a 100644 --- a/datacommons_pandas/__init__.py +++ b/datacommons_pandas/__init__.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from datacommons_pandas.df_builder import build_time_series, build_time_series_dataframe, build_covariate_dataframe +from datacommons_pandas.df_builder import build_time_series, build_time_series_dataframe, build_multivariate_dataframe -################################ SIMLINK FILES ################################ -# We include simlinks to all user-facing functions from the datacommons pkg. # +################################ SYMLINK FILES ################################ +# We include symlinks to all user-facing functions from the datacommons pkg. # # This is so that users do not need to import both libraries for pd support. # # Please keep the below in sync with the __init__.py in the datacommons/ dir # # TODO: enforce this. 
https://github.com/datacommonsorg/api-python/issues/149 # diff --git a/datacommons_pandas/df_builder.py b/datacommons_pandas/df_builder.py index 72889afd..635d7f31 100644 --- a/datacommons_pandas/df_builder.py +++ b/datacommons_pandas/df_builder.py @@ -54,7 +54,11 @@ def build_time_series(place, def _group_stat_all_by_obs_options(places, stat_vars, keep_series=True): - """Groups the result of `get_stat_all` by StatVarObservation options for time series or covariates. + """Groups the result of `get_stat_all` by StatVarObservation options for time series or multivariates. + + Note that this function does not preserve `(place, stat_var)` pairs that + yield no data `from get_stat_all`. In the extreme case, no data + for any pairs will return an empty dict. Args: places (`str` or `iterable` of `str`): The dcids of Places to query for. @@ -63,8 +67,9 @@ def _group_stat_all_by_obs_options(places, stat_vars, keep_series=True): StatVarObservation options; if False, output latest statistics grouped by StatVarObservation options. Returns: - A pandas Series with Place IDs as the index, and observed statistics as - values. + A nested dict mapping each StatisticalVariable in `stat_vars` to its + StatVarObservation options. In turn, each StatVarObservation option + maps to a list of rows, one per place, with the place id and stat data. Raises: ValueError: If the payload returned by the Data Commons REST API is @@ -88,7 +93,7 @@ def _group_stat_all_by_obs_options(places, stat_vars, keep_series=True): continue for source_series in stat_var_data['sourceSeries']: series = source_series['val'] - # Create a hashable for Observation options. + # Convert dict of SVO options into nested tuple (hashable key). 
obs_options = (('measurementMethod', source_series.get('measurementMethod')), ('observationPeriod', @@ -105,6 +110,8 @@ def _group_stat_all_by_obs_options(places, stat_vars, keep_series=True): 'date': date, 'val': series[date] }) + if not res: + raise ValueError('No data for any of specified places and stat_vars.') if keep_series: return dict(res) else: @@ -136,21 +143,22 @@ def _time_series_pd_input(places, stat_var): rows_dict = _group_stat_all_by_obs_options(places, [stat_var], keep_series=True) most_geos = [] - max_geos_so_far = 0 + max_geo_count_so_far = 0 latest_date = [] latest_date_so_far = '' for options, rows in rows_dict.items(): current_geos = len(rows) - if current_geos > max_geos_so_far: - max_geos_so_far = current_geos + if current_geos > max_geo_count_so_far: + max_geo_count_so_far = current_geos most_geos = [options] # Reset tiebreaker stats. Recompute after this if-else block. latest_date = [] latest_date_so_far = '' - elif current_geos == max_geos_so_far: + elif current_geos == max_geo_count_so_far: most_geos.append(options) else: - # Do not compute tiebreaker stats if not in most_geos. + # Do not compute tiebreaker stats if no change to most_geos. + # Skip to top of the for loop. continue for row in rows: @@ -202,7 +210,7 @@ def build_time_series_dataframe(places, stat_var, desc_col=False): return df[sorted(df.columns, reverse=desc_col)] -def _covariate_pd_input(places, stat_vars): +def _multivariate_pd_input(places, stat_vars): """Returns a `list` of `dict` per element of `places` based on the `stat_var`. Data Commons will pick a set of StatVarObservation options that covers the @@ -217,7 +225,7 @@ def _covariate_pd_input(places, stat_vars): the time series and place identifier. 
Examples: - >>> _covariate_pd_input(["geoId/29", "geoId/33"], + >>> _multivariate_pd_input(["geoId/29", "geoId/33"], ["Count_Person", "Median_Income_Person"]) [ {'Count_Person': 20, 'Median_Income_Person': 40, 'place': 'geoId/29'}, @@ -233,22 +241,23 @@ def _covariate_pd_input(places, stat_vars): for stat_var, candidates_dict in rows_dict.items(): selected_rows = None most_geos = [] - max_geos_so_far = 0 + max_geo_count_so_far = 0 latest_date = [] latest_date_so_far = '' for options, rows in candidates_dict.items(): current_geos = len(rows) - if current_geos > max_geos_so_far: - max_geos_so_far = current_geos + if current_geos > max_geo_count_so_far: + max_geo_count_so_far = current_geos most_geos = [options] # Reset tiebreaker stats. Recompute after this if-else block. latest_date = [] latest_date_so_far = '' - elif current_geos == max_geos_so_far: + elif current_geos == max_geo_count_so_far: most_geos.append(options) else: # Do not compute tiebreaker stats if not in most_geos. continue + for row in rows: row_date = row['date'] if row_date > latest_date_so_far: @@ -263,12 +272,12 @@ def _covariate_pd_input(places, stat_vars): for row in selected_rows: place2cov[row['place']][stat_var] = row['val'] return [ - dict({'place': place}, **covariates) - for place, covariates in place2cov.items() + dict({'place': place}, **multivariates) + for place, multivariates in place2cov.items() ] -def build_covariate_dataframe(places, stat_vars): +def build_multivariate_dataframe(places, stat_vars): """Constructs a pandas DataFrame with `places` as the index and `stat_vars` as the columns. To ensure statistics are comparable across all Places, when multiple @@ -300,7 +309,7 @@ def build_covariate_dataframe(places, stat_vars): raise ValueError( 'Parameter `places` and `stat_vars` must be string object or list-like object.' 
) - df = pd.DataFrame.from_records(_covariate_pd_input(places, stat_vars)) + df = pd.DataFrame.from_records(_multivariate_pd_input(places, stat_vars)) df.set_index('place', inplace=True) df.sort_index(inplace=True) return df diff --git a/datacommons_pandas/examples/df_builder.py b/datacommons_pandas/examples/df_builder.py index 735b1d5a..63fecfc9 100644 --- a/datacommons_pandas/examples/df_builder.py +++ b/datacommons_pandas/examples/df_builder.py @@ -49,13 +49,13 @@ def main(): print(""" # Build a DataFrame of latest observations for multiple variables in multiple places. -$ dcpd.build_covariate_dataframe(["geoId/06", "country/FRA"], ["Median_Age_Person", "Count_Person", "Count_Household"]) +$ dcpd.build_multivariate_dataframe(["geoId/06", "country/FRA"], ["Median_Age_Person", "Count_Person", "Count_Household"]) {}""".format( - dcpd.build_covariate_dataframe( + dcpd.build_multivariate_dataframe( ["geoId/06", "country/FRA"], ["Median_Age_Person", "Count_Person", "Count_Household"]))) - print('\n\nExpect 4 errors, starting HERE:') + print('\n\nExpect 6 errors, starting HERE:') try: dcpd.build_time_series_dataframe( ["geoId/33"], ["Median_Income_Person", "Count_Person"]) @@ -66,12 +66,25 @@ def main(): except ValueError as e: print("Successfully errored on: ", e) try: - dcpd.build_covariate_dataframe([3], - ["Median_Income_Person", "Count_Person"]) + dcpd.build_multivariate_dataframe( + [3], ["Median_Income_Person", "Count_Person"]) except ValueError as e: print("Successfully errored on: ", e) try: - dcpd.build_covariate_dataframe("country/USA", True) + dcpd.build_multivariate_dataframe("country/USA", True) + except ValueError as e: + print("Successfully errored on: ", e) + # If the following two do not error due to the addition of + # Median_Income_Person statistics for NUTS geos, then please + # replace either the places or the StatVar. 
+ try: + dcpd.build_time_series_dataframe(['nuts/HU2', 'nuts/HU22'], + 'Median_Income_Person') + except ValueError as e: + print("Successfully errored on: ", e) + try: + dcpd.build_multivariate_dataframe(['nuts/HU2', 'nuts/HU22'], + ['Median_Income_Person']) except ValueError as e: print("Successfully errored on: ", e) print('until HERE.') diff --git a/datacommons_pandas/test/df_builder_test.py b/datacommons_pandas/test/df_builder_test.py index f917bb9b..e686b16d 100644 --- a/datacommons_pandas/test/df_builder_test.py +++ b/datacommons_pandas/test/df_builder_test.py @@ -216,7 +216,7 @@ def read(self): } return MockResponse(json.dumps(resp)) # Otherwise, return an empty response and a 404. - return urllib.error.HTTPError + return urllib.error.HTTPError(None, 404, None, None, None) class TestPdTimeSeries(unittest.TestCase): @@ -253,14 +253,14 @@ def test_one_place(self, urlopen): self.assertEqual(rows, exp) -class TestPdCovariates(unittest.TestCase): - """Unit tests for _covariate_pd_input.""" +class TestPdMultivariates(unittest.TestCase): + """Unit tests for _multivariate_pd_input.""" @patch('six.moves.urllib.request.urlopen', side_effect=request_mock) def test_basic(self, urlopen): - """Calling _covariate_pd_input with proper args.""" - rows = dcpd._covariate_pd_input(['geoId/06', 'nuts/HU22'], - ['Count_Person', 'Median_Age_Person']) + """Calling _multivariate_pd_input with proper args.""" + rows = dcpd._multivariate_pd_input( + ['geoId/06', 'nuts/HU22'], ['Count_Person', 'Median_Age_Person']) exp = [{ "place": "geoId/06", "Median_Age_Person": 24, @@ -273,11 +273,23 @@ def test_basic(self, urlopen): @patch('six.moves.urllib.request.urlopen', side_effect=request_mock) def test_one_each(self, urlopen): - """Calling _covariate_pd_input with single place and var.""" - rows = dcpd._covariate_pd_input(['geoId/06'], ['Count_Person']) + """Calling _multivariate_pd_input with single place and var.""" + rows = dcpd._multivariate_pd_input(['geoId/06'], ['Count_Person']) exp 
= [{"place": "geoId/06", "Count_Person": 25090}] self.assertEqual(rows, exp) + @patch('six.moves.urllib.request.urlopen', side_effect=request_mock) + def test_no_data(self, urlopen): + """Error if there is no data.""" + with self.assertRaises(ValueError): + dcpd._group_stat_all_by_obs_options( + ['FOO/100'], ['Count_Person', 'Median_Age_Person']) + with self.assertRaises(ValueError): + dcpd._time_series_pd_input(['FOO/100', 'BAR/200'], ['Count_Person']) + with self.assertRaises(ValueError): + dcpd._multivariate_pd_input(['FOO/100', 'BAR/200'], + ['Count_Person', 'Median_Age_Person']) + if __name__ == '__main__': unittest.main() From f116e42b08fe1882b5b940dbea216fead6326272 Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Tue, 25 Aug 2020 12:55:28 -0700 Subject: [PATCH 32/35] Update docstring for _group_stat_all_by_obs_options. --- datacommons_pandas/df_builder.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/datacommons_pandas/df_builder.py b/datacommons_pandas/df_builder.py index 635d7f31..92f580c7 100644 --- a/datacommons_pandas/df_builder.py +++ b/datacommons_pandas/df_builder.py @@ -57,8 +57,8 @@ def _group_stat_all_by_obs_options(places, stat_vars, keep_series=True): """Groups the result of `get_stat_all` by StatVarObservation options for time series or multivariates. Note that this function does not preserve `(place, stat_var)` pairs that - yield no data `from get_stat_all`. In the extreme case, no data - for any pairs will return an empty dict. + yield no data `from get_stat_all`. In the extreme case that there is no + data for any pairs, raise a ValueError instead of returning an empty dict. Args: places (`str` or `iterable` of `str`): The dcids of Places to query for. @@ -73,7 +73,8 @@ def _group_stat_all_by_obs_options(places, stat_vars, keep_series=True): Raises: ValueError: If the payload returned by the Data Commons REST API is - malformed. 
+ malformed, or if there is no data for any (Place, StatisticalVariables) + pair. """ if keep_series: if len(stat_vars) != 1: From 975c956aacbfdec979cf6166150af190233c105c Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Tue, 25 Aug 2020 15:20:54 -0700 Subject: [PATCH 33/35] Make err msg for _group_stat_all_by_obs_options no data more general. --- datacommons_pandas/df_builder.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datacommons_pandas/df_builder.py b/datacommons_pandas/df_builder.py index 92f580c7..44dcb69f 100644 --- a/datacommons_pandas/df_builder.py +++ b/datacommons_pandas/df_builder.py @@ -112,7 +112,8 @@ def _group_stat_all_by_obs_options(places, stat_vars, keep_series=True): 'val': series[date] }) if not res: - raise ValueError('No data for any of specified places and stat_vars.') + raise ValueError( + 'No data for any of specified Places and StatisticalVariables.') if keep_series: return dict(res) else: From 9865e098225c88ba8140efe9b89b9bee0a22be36 Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Tue, 25 Aug 2020 17:02:48 -0700 Subject: [PATCH 34/35] Parameterize some pandas lib example functions. --- datacommons_pandas/examples/df_builder.py | 106 +++++++++++++++------- 1 file changed, 75 insertions(+), 31 deletions(-) diff --git a/datacommons_pandas/examples/df_builder.py b/datacommons_pandas/examples/df_builder.py index 63fecfc9..6dff26bf 100644 --- a/datacommons_pandas/examples/df_builder.py +++ b/datacommons_pandas/examples/df_builder.py @@ -20,58 +20,95 @@ import datacommons_pandas as dcpd -def main(): +def build_time_series_example(): print(""" # Build a pd.Series of time series for one variable and one place. 
-$ dcpd.build_time_series("country/CAN", "Count_WildlandFireEvent") -{}""".format(dcpd.build_time_series("country/CAN", "Count_WildlandFireEvent"))) +$ dcpd.build_time_series('country/CAN', 'Count_WildlandFireEvent') +{}""".format(dcpd.build_time_series('country/CAN', 'Count_WildlandFireEvent'))) print(""" # Build a pd.Series of time series for one variable and one place and optional args. -$ dcpd.build_time_series("country/USA", "Count_Person", "CensusPEPSurvey") -{}""".format(dcpd.build_time_series("country/CAN", "Count_WildlandFireEvent"))) - - print(""" -# Build a DataFrame of time series for one variable in multiple places. -$ dcpd.build_time_series_dataframe(["geoId/29", "geoId/33"], "Median_Income_Person") +$ dcpd.build_time_series('country/USA', 'Count_Person', 'CensusPEPSurvey') {}""".format( - dcpd.build_time_series_dataframe( - ["geoId/33", "geoId/29", "country/USA"], "Median_Income_Person"))) + dcpd.build_time_series('country/USA', 'Count_Person', + 'CensusPEPSurvey'))) - print(""" -# Build a DataFrame of time series with columns sorted in descending order. -$ dcpd.build_time_series_dataframe(["country/USA"], "Median_Income_Person", desc_col=True) -{}""".format( - dcpd.build_time_series_dataframe(["country/USA"], - "Median_Income_Person", - desc_col=True))) - print(""" -# Build a DataFrame of latest observations for multiple variables in multiple places. 
-$ dcpd.build_multivariate_dataframe(["geoId/06", "country/FRA"], ["Median_Age_Person", "Count_Person", "Count_Household"]) -{}""".format( - dcpd.build_multivariate_dataframe( - ["geoId/06", "country/FRA"], - ["Median_Age_Person", "Count_Person", "Count_Household"]))) +def build_time_series_dataframe_example(): + + def demonstrate_build_time_series_dataframe(intro_str, + places, + stat_var, + desc_col=False): + arg_str = "{}, '{}'".format(places, stat_var) + if desc_col: + arg_str += ", desc_col=True" + print(""" + # {} + $ dcpd.build_time_series_dataframe({}) + {}""".format(intro_str, arg_str, + dcpd.build_time_series_dataframe(places, stat_var, desc_col))) + + build_time_series_dataframe_params = [{ + 'intro_str': + 'Build a DataFrame of time series for one variable in multiple places.', + 'places': ['geoId/33', 'geoId/29', 'country/USA'], + 'stat_var': + 'Median_Income_Person' + }, { + 'intro_str': + 'Build a DataFrame of time series with columns sorted in descending order.', + 'places': ['country/USA'], + 'stat_var': + 'Median_Income_Person', + 'desc_col': + True + }] - print('\n\nExpect 6 errors, starting HERE:') + for param_set in build_time_series_dataframe_params: + demonstrate_build_time_series_dataframe(**param_set) + + +def build_multivariate_dataframe_example(): + + def demonstrate_build_multivariate_dataframe(intro_str, places, stat_vars): + print(""" + # {} + $ dcpd.build_multivariate_dataframe({}, {}) + {}""".format(intro_str, places, stat_vars, + dcpd.build_multivariate_dataframe(places, stat_vars))) + + build_multivariate_dataframe_params = [{ + 'intro_str': + 'Build a DataFrame of latest observations for multiple variables in multiple places.', + 'places': ['geoId/06', 'country/FRA'], + 'stat_vars': ['Median_Age_Person', 'Count_Person', 'Count_Household'] + }] + + for param_set in build_multivariate_dataframe_params: + demonstrate_build_multivariate_dataframe(**param_set) + + +def expect_err_examples(): + + print("\n\nExpect 6 errors, starting 
HERE:") try: dcpd.build_time_series_dataframe( - ["geoId/33"], ["Median_Income_Person", "Count_Person"]) + ['geoId/33'], ['Median_Income_Person', 'Count_Person']) except ValueError as e: print("Successfully errored on: ", e) try: - dcpd.build_time_series_dataframe(24, ["Median_Income_Person"]) + dcpd.build_time_series_dataframe(24, ['Median_Income_Person']) except ValueError as e: print("Successfully errored on: ", e) try: dcpd.build_multivariate_dataframe( - [3], ["Median_Income_Person", "Count_Person"]) + [3], ['Median_Income_Person', 'Count_Person']) except ValueError as e: print("Successfully errored on: ", e) try: - dcpd.build_multivariate_dataframe("country/USA", True) + dcpd.build_multivariate_dataframe('country/USA', True) except ValueError as e: print("Successfully errored on: ", e) # If the following two do not error due to the addition of @@ -87,7 +124,14 @@ def main(): ['Median_Income_Person']) except ValueError as e: print("Successfully errored on: ", e) - print('until HERE.') + print("until HERE.") + + +def main(): + build_time_series_example() + build_time_series_dataframe_example() + build_multivariate_dataframe_example() + expect_err_examples() if __name__ == '__main__': From 14dea408c8d7a268e73b5a6992f9795520a3722f Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Tue, 25 Aug 2020 18:52:59 -0700 Subject: [PATCH 35/35] Released pandas. 
--- .vscode/settings.json | 3 +++ datacommons_pandas/CHANGELOG.md | 6 +++--- datacommons_pandas/__init__.py | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..99b087e5 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.linting.enabled": true +} \ No newline at end of file diff --git a/datacommons_pandas/CHANGELOG.md b/datacommons_pandas/CHANGELOG.md index 77ea2b0a..aae73af0 100644 --- a/datacommons_pandas/CHANGELOG.md +++ b/datacommons_pandas/CHANGELOG.md @@ -2,7 +2,7 @@ ## 0.0.1 -**Date** - 08/24/2020 +**Date** - 08/25/2020 **Release Tag** - [pd.0.0.1](https://github.com/datacommonsorg/api-python/releases/tag/pd0.0.1) @@ -12,10 +12,10 @@ Added pandas wrapper functions. - `build_time_series` constructs a pd.Series for a given StatisticalVariable and Place, where the time series are indexed by date. - `build_time_series_dataframe` constructs a pd.DataFrame for a given StatisticalVariable and a set of Places. The DataFrame will have Places as the index and dates as the columns. -- `build_covariate_dataframe` constructs a covariate pd.DataFrame for a set of StatisticalVariables and a set of Places. The DataFrame will have Places as index and StatisticalVariables as the columns. The values are the most recent values for the chosen StatVarObservation options. +- `build_multivariate_dataframe` constructs a pd.DataFrame for a set of StatisticalVariables and a set of Places. The DataFrame will have Places as index and StatisticalVariables as the columns. The values are the most recent values for the chosen StatVarObservation options. For multi-place functions, when a StatisticalVariable has multiple StatVarObservation options, -Data Commons chooses a set of StatVarObservation options that covers the most geos. This +Data Commons chooses a set of StatVarObservation options that covers the most places. 
This ensures that the data fetched for a StatisticalVariable is comparable across places. When there is a tie, we select the StatVarObservation options set with the latest date data is available for any place. diff --git a/datacommons_pandas/__init__.py b/datacommons_pandas/__init__.py index c19dca9a..353c395d 100644 --- a/datacommons_pandas/__init__.py +++ b/datacommons_pandas/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2017 Google Inc. +# Copyright 2020 Google Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.