diff --git a/PyData/bookmarks.txt b/PyData/bookmarks.txt index 425a989..55e9bb9 100644 --- a/PyData/bookmarks.txt +++ b/PyData/bookmarks.txt @@ -11,6 +11,7 @@ https://hakibenita.com/postgresql-unknown-features # PySpark https://jacobcelestine.com/knowledge_repo/colab_and_pyspark/ https://sparkbyexamples.com/ +chaining: https://mungingdata.com/pyspark/chaining-dataframe-transformations/ https://phoenixnap.com/kb/install-spark-on-ubuntu https://www.how2shout.com/linux/installing-apache-spark-on-ubuntu-20-04-or-18-04/ installing: https://computingforgeeks.com/how-to-install-apache-spark-on-ubuntu-debian/ diff --git a/ibis-framework/Basic_Examples.ipynb b/ibis-framework/Basic_Examples.ipynb new file mode 100644 index 0000000..e602d94 --- /dev/null +++ b/ibis-framework/Basic_Examples.ipynb @@ -0,0 +1,610 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "bf4d09b7-468e-41b4-af86-b1fed4f474ec", + "metadata": {}, + "source": [ + "# ibis examples" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "9a13d6a6-c2c1-43c1-873e-9e8d870a85ca", + "metadata": {}, + "outputs": [], + "source": [ + "import ibis\n", + "import pandas as pd\n", + "pd.options.display.max_rows = 20\n", + "ibis.options.interactive = True" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c5fa1300-c01c-44ea-ae22-a9205134b8e4", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(\"https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "2748f73f-8d1d-4349-9d3a-05ba042db5aa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(244, 7)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2855fab5-e21e-42a7-addb-d8b3079dfeae", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
total_billtipsexsmokerdaytimesize
016.991.01FemaleNoSunDinner2
110.341.66MaleNoSunDinner3
221.013.50MaleNoSunDinner3
323.683.31MaleNoSunDinner2
424.593.61FemaleNoSunDinner4
\n", + "
" + ], + "text/plain": [ + " total_bill tip sex smoker day time size\n", + "0 16.99 1.01 Female No Sun Dinner 2\n", + "1 10.34 1.66 Male No Sun Dinner 3\n", + "2 21.01 3.50 Male No Sun Dinner 3\n", + "3 23.68 3.31 Male No Sun Dinner 2\n", + "4 24.59 3.61 Female No Sun Dinner 4" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "fec4a567-4ce8-4997-a030-389715f00bbc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
total_billtipsize
count244.000000244.000000244.000000
mean19.7859432.9982792.569672
std8.9024121.3836380.951100
min3.0700001.0000001.000000
25%13.3475002.0000002.000000
50%17.7950002.9000002.000000
75%24.1275003.5625003.000000
max50.81000010.0000006.000000
\n", + "
" + ], + "text/plain": [ + " total_bill tip size\n", + "count 244.000000 244.000000 244.000000\n", + "mean 19.785943 2.998279 2.569672\n", + "std 8.902412 1.383638 0.951100\n", + "min 3.070000 1.000000 1.000000\n", + "25% 13.347500 2.000000 2.000000\n", + "50% 17.795000 2.900000 2.000000\n", + "75% 24.127500 3.562500 3.000000\n", + "max 50.810000 10.000000 6.000000" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe()" + ] + }, + { + "cell_type": "markdown", + "id": "2dde52d4-31b8-4917-8dbe-f9f49aff236a", + "metadata": {}, + "source": [ + "## Convert Pandas DataFrame to ibis table" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "534a7253-df69-401d-853b-5e58f885c11b", + "metadata": {}, + "outputs": [], + "source": [ + "tips = ibis.pandas.connect(\n", + " {\n", + " 'tips': df,\n", + " }\n", + ").table('tips')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "7286bc75-0c8b-417e-820d-0ec1cbe07d44", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tips.columns" + ] + }, + { + "cell_type": "markdown", + "id": "c6b84479-669f-4ee3-bc7a-26f8ec8dfd4e", + "metadata": {}, + "source": [ + "## Look Up Schema / Data Type Info" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "05a3a457-b960-490f-a641-257930643789", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ibis.Schema { \n", + " total_bill float64\n", + " tip float64\n", + " sex string\n", + " smoker string\n", + " day string\n", + " time string\n", + " size int64\n", + "}" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tips.schema()" + ] + }, + { + "cell_type": "markdown", + "id": "8592cd3c-9354-469b-b1ec-f6cda3dc09d2", + "metadata": {}, + "source": [ + "## Selecting certain columns" + ] + }, + { + "cell_type": "markdown", + "id": "90b1900f-db27-4a1e-981f-daf3a937ed1d", + "metadata": {}, + "source": [ + "```\n", + "SELECT\n", + " total_bill,\n", + " tip,\n", + " day,\n", + " time\n", + "FROM\n", + " tips as tips\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "0247c328-eb05-4d82-b3ef-377df65df836", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + " total_bill tip day time\n", + "0 16.99 1.01 Sun Dinner\n", + "1 10.34 1.66 Sun Dinner\n", + "2 21.01 3.50 Sun Dinner\n", + "3 23.68 3.31 Sun Dinner\n", + "4 24.59 3.61 Sun Dinner\n", + ".. ... ... ... ...\n", + "239 29.03 5.92 Sat Dinner\n", + "240 27.18 2.00 Sat Dinner\n", + "241 22.67 2.00 Sat Dinner\n", + "242 17.82 1.75 Sat Dinner\n", + "243 18.78 3.00 Thur Dinner\n", + "\n", + "[244 rows x 4 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tips['total_bill','tip','day','time']" + ] + }, + { + "cell_type": "markdown", + "id": "1f22ef41-eea4-4373-9e64-852a99016480", + "metadata": {}, + "source": [ + "## Getting Row Count" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "4d1a97cf-e185-47af-8f7f-8d9bec37e182", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "244" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tips.count()" + ] + }, + { + "cell_type": "markdown", + "id": "d0ed6587-2250-4375-a254-2e2c9822bfdc", + "metadata": {}, + "source": [ + "## Limiting the Number of Rows" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "ed82bff2-5680-4cb8-8f7d-91317ca97966", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + " total_bill tip day time\n", + "0 16.99 1.01 Sun Dinner\n", + "1 10.34 1.66 Sun Dinner\n", + "2 21.01 3.50 Sun Dinner\n", + "3 23.68 3.31 Sun Dinner\n", + "4 24.59 3.61 Sun Dinner" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tips['total_bill','tip','day','time'].limit(5)" + ] + }, + { + "cell_type": "markdown", + "id": "0a71c523-054a-4375-a2cf-2f804fa0103d", + "metadata": {}, + "source": [ + "## Filtering" + ] + }, + { + "cell_type": "markdown", + "id": "599e1703-599a-4af8-9ec2-b6ddee77ca5b", + "metadata": {}, + "source": [ + "```\n", + "SELECT\n", + " total_bill,\n", + " tip,\n", + " day,\n", + " time\n", + "FROM\n", + " tips as tips\n", + "WHERE\n", + " tips.day = 'Sun'\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "47fd4d39-1efc-40a7-b0ad-d356a5cbe939", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + " total_bill tip day time\n", + "0 16.99 1.01 Sun Dinner\n", + "1 10.34 1.66 Sun Dinner\n", + "2 21.01 3.50 Sun Dinner\n", + "3 23.68 3.31 Sun Dinner\n", + "4 24.59 3.61 Sun Dinner\n", + ".. ... ... ... ...\n", + "71 20.90 3.50 Sun Dinner\n", + "72 30.46 2.00 Sun Dinner\n", + "73 18.15 3.50 Sun Dinner\n", + "74 23.10 4.00 Sun Dinner\n", + "75 15.69 1.50 Sun Dinner\n", + "\n", + "[76 rows x 4 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tips['total_bill','tip','day','time'].filter(\n", + " tips['day'] == 'Sun'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "2f7a38d4-b038-4bc5-b44d-99c46ed03e5e", + "metadata": {}, + "source": [ + "## Aggregation" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "c97b0188-5420-4d94-bdef-cbc1b067ebdc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + " day day_bill_avg\n", + "0 Fri 17.151579\n", + "1 Sat 20.441379\n", + "2 Sun 21.410000\n", + "3 Thur 17.682742" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tips.group_by('day').aggregate(tips['total_bill'].mean().name('day_bill_avg'))" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "6e0877e4-303d-4fbf-811d-dacfd37c34fb", + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'ibis.postgres'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/tmp/ipykernel_621/1418244849.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mibis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpostgres\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'ibis.postgres'" + ] + } + ], + "source": [ + "import ibis.postgres" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Py3.8 (data_dev)", + "language": "python", + "name": "data_dev" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pandas/Aggregate_Filter_Transform.ipynb b/pandas/Aggregate_Filter_Transform.ipynb index a8515cd..581e134 100644 --- a/pandas/Aggregate_Filter_Transform.ipynb +++ b/pandas/Aggregate_Filter_Transform.ipynb @@ -16,7 +16,7 @@ "outputs": [], "source": [ "import pandas as pd\n", - "pd.options.display.max_rows = 20" + "pd.options.display.max_rows = 20``" ] }, {