Skip to content

Commit

Permalink
update catalogs
Browse files Browse the repository at this point in the history
  • Loading branch information
Joseph Hamman committed Aug 18, 2020
1 parent b86fd6e commit 021156a
Show file tree
Hide file tree
Showing 5 changed files with 233 additions and 39 deletions.
40 changes: 30 additions & 10 deletions catalogs/fluxnet.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,30 +4,50 @@ plugins:

sources:

raw_table:
description: Raw fluxnet data
raw_aux:
description: Raw Aux FLUXNET data
metadata:
url:
url: https://fluxnet.org/
tags: [fluxnet, parquet]
parameters:
station:
description: fluxnet station code
type: str
default: at-neu
default: it-noe
kind:
description: fluxnet data stream
type: str
default: fullset
allowed: [auxmeteo, auxnee, erai, fullset]
default: auxmeteo
allowed: [auxmeteo, auxnee]
freq:
description: temporal frequency
type: str
default: dd
allowed: [dd, hh, mm, ww, yy]
suffix:
description: temporary keyword arg to make catalog work. (TODO --> FIXME!)
driver: parquet
args:
urlpath: "gs://carbonplan-data/raw/fluxnet/{{ station }}_{{ kind }}.parquet"

raw_fullset:
description: Raw Fullset FLUXNET data
metadata:
url: https://fluxnet.org/
tags: [fluxnet, parquet]
parameters:
station:
description: fluxnet station code
type: str
default: 2002-2012_1-4
default: it-noe
kind:
description: fluxnet data stream
type: str
default: fullset
allowed: [erai, fullset]
freq:
description: temporal frequency
type: str
default: dd
allowed: [dd, hh, mm, ww, yy]
driver: parquet
args:
urlpath: "gs://carbonplan-data/raw/fluxnet/flx_{{ station }}_fluxnet2015_{{ kind }}_{{ freq }}_{{ suffix }}.parquet"
urlpath: "gs://carbonplan-data/raw/fluxnet/{{ station }}_{{ kind }}_{{ freq }}.parquet"
17 changes: 12 additions & 5 deletions catalogs/gcp.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
sources:

raw_table:
description: CarbonPlan Reports Data
driver: csv
args:
urlpath: 'https://api.carbonplan.org/projects.csv'
description: Global Carbon Budget 2019 raw data in parquet format
metadata:
origin_url: 'https://api.carbonplan.org/docs'
url: https://www.globalcarbonproject.org/carbonbudget/19/data.htm
tags: [carbon cycle]
# `name` selects which Global Carbon Budget table to load. It is substituted
# into the urlpath (`gs://carbonplan-data/raw/gcp/{{ name }}.parquet`), so every
# allowed value must match the name of a staged Parquet dataset.
parameters:
  name:
    description: name of GCB dataset
    type: str
    default: global_carbon_budget
    allowed:
      - global_carbon_budget
      - fossil_emissions_by_fuel_type
      - land_use_change_emissions
      - ocean_sink
      - terrestrial_sink
      - historical_budget
      - consumption_emissions
      - territorial_emissions
      - transfer_emissions
driver: parquet
args:
urlpath: gs://carbonplan-data/raw/gcp/{{ name }}.parquet
48 changes: 24 additions & 24 deletions catalogs/master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,57 +2,57 @@ description: 'CarbonPlan Master Data Catalog'
sources:

fia:
args:
path: "{{CATALOG_DIR}}/fia.yaml"
description: 'Catalog for data from Forest Inventory Analysis (FIA) database'
driver: intake.catalog.local.YAMLFileCatalog
metadata: {}
driver: intake.catalog.local.YAMLFileCatalog
args:
path: "{{CATALOG_DIR}}/fia.yaml"

fluxnet:
args:
path: "{{CATALOG_DIR}}/fluxnet.yaml"
description: 'Catalog for data from the FLUXNET dataset'
driver: intake.catalog.local.YAMLFileCatalog
metadata: {}
driver: intake.catalog.local.YAMLFileCatalog
args:
path: "{{CATALOG_DIR}}/fluxnet.yaml"

gcb:
args:
path: "{{CATALOG_DIR}}/gcp.yaml"
description: 'Catalog for data from the Global Carbon Project'
driver: intake.catalog.local.YAMLFileCatalog
metadata: {}
driver: intake.catalog.local.YAMLFileCatalog
args:
path: "{{CATALOG_DIR}}/gcp.yaml"

mtbs:
args:
path: "{{CATALOG_DIR}}/mtbs.yaml"
description: 'Catalog for data from the Monitoring Trends in Burn Severity (MTBS) dataset'
driver: intake.catalog.local.YAMLFileCatalog
metadata: {}
driver: intake.catalog.local.YAMLFileCatalog
args:
path: "{{CATALOG_DIR}}/mtbs.yaml"

nftd:
args:
path: "{{CATALOG_DIR}}/nftd.yaml"
description: 'Catalog for data from the National Forest Type Database (NFTD)'
driver: intake.catalog.local.YAMLFileCatalog
metadata: {}
driver: intake.catalog.local.YAMLFileCatalog
args:
path: "{{CATALOG_DIR}}/nftd.yaml"

nlcd:
args:
path: "{{CATALOG_DIR}}/nlcd.yaml"
description: 'Catalog for data from the National Land Cover Database (NLCD)'
driver: intake.catalog.local.YAMLFileCatalog
metadata: {}
driver: intake.catalog.local.YAMLFileCatalog
args:
path: "{{CATALOG_DIR}}/nlcd.yaml"

projects:
args:
path: "{{CATALOG_DIR}}/projects.yaml"
description: 'CarbonPlan Projects Dataset Catalog'
driver: intake.catalog.local.YAMLFileCatalog
metadata: {}
driver: intake.catalog.local.YAMLFileCatalog
args:
path: "{{CATALOG_DIR}}/projects.yaml"

spawnetal2020:
args:
path: "{{CATALOG_DIR}}/nlcd.yaml"
description: 'Catalog for data from Global Aboveground and Belowground Biomass Carbon Density Maps for the Year 2010 from Spawn et al (2020)'
driver: intake.catalog.local.YAMLFileCatalog
metadata: {}
driver: intake.catalog.local.YAMLFileCatalog
# NOTE(review): this path is identical to the one used by the `nlcd` entry.
# It looks like a copy-paste leftover; confirm which catalog file the
# spawnetal2020 source is supposed to load.
args:
path: "{{CATALOG_DIR}}/nlcd.yaml"
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@ rasterio
requests
wget
xarray
xlrd
zarr
166 changes: 166 additions & 0 deletions scripts/gcp/01_raw_to_parquet.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<img width=\"50\" src=\"https://carbonplan-assets.s3.amazonaws.com/monogram/dark-small.png\" style=\"margin-left:0px;margin-top:20px\"/>\n",
"\n",
"# Global Carbon Project to Parquet\n",
"\n",
"_by Joe Hamman (CarbonPlan), August 17, 2020_\n",
"\n",
"This notebook converts raw Excel files from the Global Carbon Project to Parquet\n",
"format and stages them in a Google Cloud Storage bucket.\n",
"\n",
"**Inputs:**\n",
"\n",
"- `gcp` directory\n",
"\n",
"**Outputs:**\n",
"\n",
"- One Parquet dataset per Excel sheet:\n",
" `gs://carbonplan-data/raw/gcp/<name>.parquet`\n",
"\n",
"**Notes:**\n",
"\n",
"- No reprojection or processing of the data is done in this notebook.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import dask.dataframe as dd\n",
"import gcsfs\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# run `gcloud auth login` on the command line, or try switching token to `browser`\n",
"fs = gcsfs.GCSFileSystem(\n",
" project=\"carbonplan\",\n",
" token=\"/Users/jhamman/.config/gcloud/legacy_credentials/[email protected]/adc.json\",\n",
")\n",
"\n",
"storage_options = {\"token\": fs.session.credentials, \"project\": \"carbonplan\"}\n",
"\n",
"\n",
"def process(fname, target, **open_kwargs):\n",
" df = pd.read_excel(fname, **open_kwargs)\n",
" df = df.loc[:, ~df.columns.str.contains(\"^Unnamed\")]\n",
" df = dd.from_pandas(df, npartitions=1)\n",
" df.to_parquet(target, engine=\"fastparquet\", storage_options=storage_options)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## National Carbon Emissions\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fname = \"/Users/jhamman/workdir/carbonplan_data_downloads/gcp/National_Carbon_Emissions_2019v1.0.xlsx\"\n",
"\n",
"# Each sheet is written to the Parquet dataset named after that sheet.\n",
"\n",
"# Territorial Emissions\n",
"target = \"gs://carbonplan-data/raw/gcp/territorial_emissions.parquet\"\n",
"open_kwargs = dict(sheet_name=\"Territorial Emissions\", skiprows=16, index_col=0)\n",
"process(fname, target, **open_kwargs)\n",
"\n",
"# Consumption Emissions\n",
"target = \"gs://carbonplan-data/raw/gcp/consumption_emissions.parquet\"\n",
"open_kwargs = dict(sheet_name=\"Consumption Emissions\", skiprows=8, index_col=0)\n",
"process(fname, target, **open_kwargs)\n",
"\n",
"# Emissions Transfers\n",
"target = \"gs://carbonplan-data/raw/gcp/transfer_emissions.parquet\"\n",
"open_kwargs = dict(sheet_name=\"Emissions Transfers\", skiprows=8, index_col=0)\n",
"process(fname, target, **open_kwargs)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Global Carbon Budget\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fname = \"/Users/jhamman/workdir/carbonplan_data_downloads/gcp/raw_gcb_Global_Carbon_Budget_2019v1.0.xlsx\"\n",
"\n",
"# Global Carbon Budget\n",
"target = \"gs://carbonplan-data/raw/gcp/global_carbon_budget.parquet\"\n",
"open_kwargs = dict(sheet_name=\"Global Carbon Budget\", skiprows=18, index_col=0)\n",
"process(fname, target, **open_kwargs)\n",
"\n",
"# Fossil Emissions by Fuel Type\n",
"target = \"gs://carbonplan-data/raw/gcp/fossil_emissions_by_fuel_type.parquet\"\n",
"open_kwargs = dict(\n",
" sheet_name=\"Fossil Emissions by Fuel Type\", skiprows=12, index_col=0\n",
")\n",
"process(fname, target, **open_kwargs)\n",
"\n",
"# Land-Use Change Emissions\n",
"target = \"gs://carbonplan-data/raw/gcp/land_use_change_emissions.parquet\"\n",
"open_kwargs = dict(\n",
" sheet_name=\"Land-Use Change Emissions\", skiprows=25, index_col=0\n",
")\n",
"process(fname, target, **open_kwargs)\n",
"\n",
"# Ocean Sink\n",
"target = \"gs://carbonplan-data/raw/gcp/ocean_sink.parquet\"\n",
"open_kwargs = dict(sheet_name=\"Ocean Sink\", skiprows=22, index_col=0)\n",
"process(fname, target, **open_kwargs)\n",
"\n",
"# Terrestrial Sink\n",
"target = \"gs://carbonplan-data/raw/gcp/terrestrial_sink.parquet\"\n",
"open_kwargs = dict(sheet_name=\"Terrestrial Sink\", skiprows=23, index_col=0)\n",
"process(fname, target, **open_kwargs)\n",
"\n",
"# Historical Budget\n",
"target = \"gs://carbonplan-data/raw/gcp/historical_budget.parquet\"\n",
"open_kwargs = dict(sheet_name=\"Historical Budget\", skiprows=14, index_col=0)\n",
"process(fname, target, **open_kwargs)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

0 comments on commit 021156a

Please sign in to comment.