From 103755fa292aaaa71c0621411a1357413cc851c9 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Fri, 20 Dec 2024 16:37:38 +0000 Subject: [PATCH] differences for PR #15 --- md5sum.txt | 2 +- working-with-diverse-filetypes.md | 117 ++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+), 1 deletion(-) diff --git a/md5sum.txt b/md5sum.txt index ec87496..c6721ef 100644 --- a/md5sum.txt +++ b/md5sum.txt @@ -5,7 +5,7 @@ "index.md" "447e37bf2e1d5254609dbb0fa9409b16" "site/built/index.md" "2024-10-24" "links.md" "8184cf4149eafbf03ce8da8ff0778c14" "site/built/links.md" "2022-04-22" "episodes/introduction.md" "1cf3c7fc50b2f6ddaf448fbd9e5f90b1" "site/built/introduction.md" "2024-12-19" -"episodes/working-with-diverse-filetypes.md" "569a3ff1bf082d0409f6311d340cc418" "site/built/working-with-diverse-filetypes.md" "2024-10-24" +"episodes/working-with-diverse-filetypes.md" "d6777c1ab156a0ffaa93d6f3f26cb072" "site/built/working-with-diverse-filetypes.md" "2024-12-20" "episodes/downloading-files-programmatically.md" "6ccd49246dea3ea14f36411416e4031b" "site/built/downloading-files-programmatically.md" "2024-12-19" "episodes/scraping-data.md" "8dd50e6f762a4c0e6595399d026e93aa" "site/built/scraping-data.md" "2024-12-19" "episodes/plotting.md" "971edcc43024e78fdbac3a7ad8d7eb48" "site/built/plotting.md" "2024-12-13" diff --git a/working-with-diverse-filetypes.md b/working-with-diverse-filetypes.md index fff1e95..fabe7c3 100644 --- a/working-with-diverse-filetypes.md +++ b/working-with-diverse-filetypes.md @@ -4,6 +4,8 @@ teaching: 0 exercises: 0 --- +Expected duration: 45 min? + :::::::::::::::::::::::::::::::::::::: questions - How can I read in different tabular data types to a familiar format in Python? @@ -29,3 +31,118 @@ transform many kinds of data with similar functions in Python. :::::::::::::::::::::::::::::::::::::::::::::::: +# `pd.read_excel()` + +TODO: +recap pd.read methods and imports +set the scene +describe the EIA data we're working with. 
+pd.read_excel() with no input +using help() to get the docs. + +:::::::: challenge + +## Challenge 1: handling gnarly Excel files + +Using `pd.read_excel()`, read in the first sheet ("Page 1 Energy Storage") using the `skiprows` parameter to select the column header row. + +:::: solution + +```python +import pandas as pd + +excel_923 = pd.read_excel('data/eia923_2022.xlsx', sheet_name=0, skiprows=) +``` + +:::: + +:::::::: + +# `pd.read_json()` + +TODO: +What is a JSON file and when might you see it +Try to use straight pd.read_json +When might you need to load a JSON first - nested JSONs. +Practice with the warnings table. +FLAG as decision - teach json_normalize() or no? +Drill down through the dictionary, this is basically what the param is doing under the hood. + +:::::::: challenge + +## Challenge 2: handling nested JSONs + +Using `json.load()` and Pandas, read in the `data` from the `eia923_2022.json` file into a Pandas DataFrame. + +:::: solution + +```python +import pandas as pd +import json + +# First, read in the file +import json +with open('data/eia923_2022.json') as file: + eia923_json = json.load(file) + +eia923_json = pd.json_normalize(eia923_json, record_path = ['response', 'data']) + +# OR +eia923_json = pd.DataFrame(eia923_json['response']['data']) + +``` + +:::: + +:::::::: + +TODO: +talk a bit about deeper nesting + +# `pd.read_xml()` + +TODO: +What is an XML file and when might you see it +What's different from JSONs? + +:::::::: challenge + +## Challenge 3: unpacking XML files +### QUESTION - any way to make this more exciting? + +Using `pd.read_xml()`, read in the `data` from the `eia923_2022.xml` file into a Pandas DataFrame. + +:::: solution + +```python +import pandas as pd + +eia923_xml = pd.read_xml('data/eia923_2022.xml', xpath = '//response/data/row') + +``` +:::: + +:::::::: + +# `pd.read_parquet()` + +TODO: +What is a Parquet file and when might you see it +Just demo pd.read_parquet, no need for a challenge here. 
+ +`pd.read_parquet('data/eia923_2022.parquet')` + +:::::::: challenge + +Pick two datasets we've just read in, and compare them. How are they similar, and how are they different? Share your reflections with a peer. + +:::: hint + +* Inspect a column in a DataFrame `df` by using `df[column_name]`. +* To quickly see what values are contained in a column, you can use `df[column_name].unique()` to get a list of unique values in the column. +* Try using `df.iloc[0]` to get the values from the first row of the data. +* `df.head(n)` returns the first n rows of the data, and `df.tail(n)` returns the last n rows. +* to add - isin()?? +:::: + +:::::::: \ No newline at end of file