diff --git a/polars_v_pandas/DataFrame_Plots.ipynb b/polars_v_pandas/DataFrame_Plots.ipynb new file mode 100644 index 0000000000..711c9201a4 --- /dev/null +++ b/polars_v_pandas/DataFrame_Plots.ipynb @@ -0,0 +1,207 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8ced8243-d770-437e-a90d-f794ffa57fc0", + "metadata": {}, + "source": [ + "# Dataframe Plots\n", + "## (i) polars Plotting" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "1cf56fd3-605c-4449-8a5e-d0fd94b49080", + "metadata": {}, + "outputs": [], + "source": [ + "from data_generation import data_generation\n", + "\n", + "sales_data = data_generation(50)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "014e9e56-8fff-45ab-85ff-eb51840f2bc7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import polars as pl\n", + "\n", + "from data_generation import data_generation\n", + "\n", + "orders_polars = pl.DataFrame(sales_data)\n", + "\n", + "(\n", + " orders_polars.group_by(\"region\")\n", + " .agg(total_sales=pl.col(\"sales_income\").sum())\n", + " .plot.bar(x=\"region\", y=\"total_sales\")\n", + " .properties(width=200, height=200, title=\"Total Sales per Region ($)\")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "a62335eb-4763-4c46-adb2-d7386914f56b", + "metadata": {}, + "source": [ + "## (ii) Pandas Plotting" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "85929590-e514-4497-b396-58cfe26e59d3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "orders_pandas = pd.DataFrame(sales_data)\n", + "\n", + "(\n", + " orders_pandas.groupby(\n", + " [\n", + " \"region\",\n", + " ]\n", + " )[\"sales_income\"]\n", + " .sum()\n", + " .plot(kind=\"bar\", title=\"Total Sales per Region ($)\", ylabel=\"total_sales\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "344d9c42-8566-43c6-b3e2-c840aa8ab46d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/polars_v_pandas/README.md b/polars_v_pandas/README.md new file mode 100644 index 0000000000..60c7018f45 --- /dev/null +++ b/polars_v_pandas/README.md @@ -0,0 +1,26 @@ +The materials contained in this download are designed to complement the RealPython tutorial [Polars vs pandas - What's the Difference](https://realpython.com/polars-vs-pandas-difference/). + +You should create a new folder named marimo on your computer and place each of these files inside it. You may also consider creating a [Python virtual environment](https://realpython.com/python-virtual-environments-a-primer/) within this folder. + +Your download bundle contains the following files: + + + +Online\_Retail.parquet - This parquet file contains retail data used in some of the queries. + +data\_generation.py - This script contains the data\_generation() function used to generate different quantities of data. + +code\_speed\_test.py - This script performs time tests for pandas and Polars DataFrames. 
def data_generation(number_of_rows, seed=None):
    """Generate random sales-order columns for building a DataFrame.

    Args:
        number_of_rows: Number of values to generate for every column.
        seed: Optional seed forwarded to ``numpy.random.default_rng``.
            The default ``None`` keeps the original behaviour (a freshly
            seeded generator, so every call returns different data); pass
            an integer to get identical data on every call, which makes
            timing runs and tests repeatable.

    Returns:
        A dict mapping column names to equally long columns:
        ``order_id`` (a ``range`` starting at 1), ``region``,
        ``sales_person`` and ``product`` (NumPy arrays of names drawn from
        fixed lists) and ``sales_income`` (NumPy array of integers from
        1 to 5000 inclusive).
    """
    rng = np.random.default_rng(seed)

    # The draw order below is kept fixed so that a given seed always maps
    # to the same columns.
    return {
        "order_id": range(1, number_of_rows + 1),
        "region": rng.choice(
            ["North", "South", "East", "West"], size=number_of_rows
        ),
        "sales_person": rng.choice(
            ["Armstrong", "Aldrin", "Collins"], size=number_of_rows
        ),
        "product": rng.choice(
            ["Helmet", "Oxygen", "Boots", "Gloves"], size=number_of_rows
        ),
        # ``integers`` excludes the upper bound, hence 5001 for a 5000 max.
        "sales_income": rng.integers(1, 5001, size=number_of_rows),
    }
def analyze_pandas_dataframe(pandas_df):
    """Sum ``sales_income`` per region/product/sales_person with pandas.

    The result is discarded; the function exists only to be timed.
    """
    pandas_df.groupby(["region", "product", "sales_person"])[
        "sales_income"
    ].sum()


def analyze_polars_dataframe(polars_df):
    """Sum ``sales_income`` per region/product/sales_person with Polars.

    The result is discarded; the function exists only to be timed.
    """
    polars_df.group_by(["region", "product", "sales_person"]).agg(
        total_sales=pl.col("sales_income").sum()
    )


def analyze_polars_lazyframe(polars_lf):
    """Run the same grouped sum on a LazyFrame and collect the result.

    ``collect()`` is part of the timed work because a lazy query does
    nothing until it is collected.
    """
    polars_lf.group_by(["region", "product", "sales_person"]).agg(
        total_sales=pl.col("sales_income").sum()
    ).collect()


def parse_row_count(argv):
    """Return the row count given as the first command-line argument.

    Args:
        argv: Argument list in ``sys.argv`` layout (program name first).
            Arguments after the first one are ignored, as before.

    Raises:
        SystemExit: With a readable message when the argument is missing
            or is not an integer (previously a raw traceback).
    """
    if len(argv) < 2:
        sys.exit(
            "Usage: python dataframe_and_lazyframe_time_tests.py "
            "<number_of_rows>"
        )
    try:
        return int(argv[1])
    except ValueError:
        sys.exit(f"number_of_rows must be an integer, got {argv[1]!r}")


def time_call(func, argument, number=100):
    """Return the total seconds needed to call ``func(argument)`` ``number`` times."""
    return Timer(functools.partial(func, argument)).timeit(number)


def report_timing(title, func, argument, number_of_rows):
    """Print a heading line followed by the measured time."""
    print(f"{title} time for {number_of_rows} rows:")
    print(time_call(func, argument))


def main(argv=None):
    """Time creation and analysis of pandas and Polars frames.

    Args:
        argv: Argument list in ``sys.argv`` layout; defaults to ``sys.argv``.
    """
    # Parse the argument once instead of re-parsing it for every message.
    number_of_rows = parse_row_count(sys.argv if argv is None else argv)
    test_data = data_generation(number_of_rows)

    creators = {
        "Pandas dataframe": create_pandas_dataframe,
        "Polars dataframe": create_polars_dataframe,
        "Polars lazyframe": create_polars_lazyframe,
    }
    analyzers = {
        "Pandas dataframe": analyze_pandas_dataframe,
        "Polars dataframe": analyze_polars_dataframe,
        "Polars lazyframe": analyze_polars_lazyframe,
    }

    # Sections are separated by one blank line, with none before the first
    # or after the last, which reproduces the original output exactly.
    for position, (label, create) in enumerate(creators.items()):
        if position:
            print()
        report_timing(f"{label} creation", create, test_data, number_of_rows)

    # Build each frame once, outside the timed region, so the analysis
    # timings measure only the query.
    frames = {label: create(test_data) for label, create in creators.items()}

    for label, analyze in analyzers.items():
        print()
        report_timing(
            f"{label} analysis", analyze, frames[label], number_of_rows
        )


if __name__ == "__main__":
    main()
def agnostic_groupby(df):
    """Return the total ``sales_income`` per ``region``, sorted by region.

    The frame is wrapped with ``nw.from_native`` so the same Narwhals
    expression runs on whichever backend was passed in (the script calls
    it with both a pandas and a Polars DataFrame), and the result is
    converted back with ``to_native()``.

    Args:
        df: A native DataFrame with ``region`` and ``sales_income`` columns.

    Returns:
        The grouped and sorted result as returned by ``to_native()``.
    """
    return (
        nw.from_native(df)
        .group_by("region")
        .agg(nw.col("sales_income").sum())
        .sort("region")
        .to_native()
    )


# The function was first published under this misspelled name; keep it as an
# alias so existing calls keep working.
agnositic_groupby = agnostic_groupby
def create_polars_lazyframe(test_data):
    """Wrap the generated column data in a Polars LazyFrame."""
    return pl.LazyFrame(test_data)


def analyze_polars_lazyframe(polars_lf):
    """Sum ``sales_income`` per region/product/sales_person and collect.

    Uses ``collect()`` with its default engine. The result is discarded;
    the function exists only to be timed.
    """
    polars_lf.group_by(["region", "product", "sales_person"]).agg(
        total_sales=pl.col("sales_income").sum()
    ).collect()


def analyze_polars_streaming(polars_lf):
    """Run the same grouped sum, collected with ``engine="streaming"``.

    The result is discarded; the function exists only to be timed.
    """
    polars_lf.group_by(["region", "product", "sales_person"]).agg(
        total_sales=pl.col("sales_income").sum()
    ).collect(engine="streaming")


def parse_row_count(argv):
    """Return the row count given as the first command-line argument.

    Args:
        argv: Argument list in ``sys.argv`` layout (program name first).
            Arguments after the first one are ignored, as before.

    Raises:
        SystemExit: With a readable message when the argument is missing
            or is not an integer (previously a raw traceback).
    """
    if len(argv) < 2:
        sys.exit("Usage: python streaming_test.py <number_of_rows>")
    try:
        return int(argv[1])
    except ValueError:
        sys.exit(f"number_of_rows must be an integer, got {argv[1]!r}")


def time_call(func, argument, number=100):
    """Return the total seconds needed to call ``func(argument)`` ``number`` times."""
    return Timer(functools.partial(func, argument)).timeit(number)


def main(argv=None):
    """Time the default and streaming collection of one lazy query.

    Args:
        argv: Argument list in ``sys.argv`` layout; defaults to ``sys.argv``.
    """
    # Parse the argument once instead of re-parsing it for every message.
    number_of_rows = parse_row_count(sys.argv if argv is None else argv)
    polars_lf = create_polars_lazyframe(data_generation(number_of_rows))

    # Output layout (one leading blank line, no separator between the two
    # reports) is kept exactly as the script originally printed it.
    print()
    print(f"Polars lazyframe analysis time for {number_of_rows} rows:")
    print(time_call(analyze_polars_lazyframe, polars_lf))

    print(f"Polars streaming analysis time for {number_of_rows} rows:")
    print(time_call(analyze_polars_streaming, polars_lf))


if __name__ == "__main__":
    main()