realpython · eyrei123 · Jul 18, 2025 · Jul 18, 2025 · Jul 18, 2025 · Jul 18, 2025
diff --git a/polars_v_pandas/DataFrame_Plots.ipynb b/polars_v_pandas/DataFrame_Plots.ipynb
diff --git a/polars_v_pandas/README.md b/polars_v_pandas/README.md
@@ -0,0 +1,26 @@
+The materials contained in this download are designed to complement the Real Python tutorial [Polars vs pandas: What's the Difference?](https://realpython.com/polars-vs-pandas-difference/).
+
+You should create a new folder on your computer (for example, `polars_v_pandas`) and place each of these files inside it. You may also consider creating a [Python virtual environment](https://realpython.com/python-virtual-environments-a-primer/) within this folder.
+
+Your download bundle contains the following files:
+
+
+
+online\_retail.parquet		 	- This parquet file contains retail data used in some of the queries.
+
+data\_generation.py		 	- This script contains the data\_generation() function used to generate different quantities of data.
+
+code\_speed\_test.py		 		- This script performs time tests for pandas and Polars DataFrames.
+
+dataframe\_and\_lazyframe\_time\_tests.py	- This script performs time tests for DataFrames and a LazyFrame.
+
+streaming\_test.py	                - This script performs time tests for a LazyFrame with streaming enabled.
+
+
+
+dataframe\_conversions.py	 		- This file contains the code used to convert between pandas and Polars DataFrames, plus a Narwhals example.
+
+sample\_pandas\_and\_polars\_code.py 	- This file contains the code used to illustrate the differences between pandas and Polars syntax.
+
+DataFrame\_Plots.ipynb				- This Jupyter Notebook file contains the plotting code to demonstrate default plotting capabilities.
+
diff --git a/polars_v_pandas/data_generation.py b/polars_v_pandas/data_generation.py
@@ -0,0 +1,19 @@
+import numpy as np
+
+
+def data_generation(number_of_rows, seed=None):
+    """Return a dict of randomly generated sales columns.
+
+    Args:
+        number_of_rows: Number of values to generate for every column.
+        seed: Optional seed passed to np.random.default_rng(). Leave it as
+            None for fresh random data on each call, or pass a fixed value
+            to get the same data on every call.
+
+    Returns:
+        A dict with the keys "order_id", "region", "sales_person",
+        "product" and "sales_income". "order_id" is range(1,
+        number_of_rows + 1); the other columns are random draws of
+        length number_of_rows.
+    """
+    rng = np.random.default_rng(seed)
+
+    regions = ["North", "South", "East", "West"]
+    sales_people = ["Armstrong", "Aldrin", "Collins"]
+    products = ["Helmet", "Oxygen", "Boots", "Gloves"]
+
+    return {
+        "order_id": range(1, number_of_rows + 1),
+        "region": rng.choice(regions, size=number_of_rows),
+        "sales_person": rng.choice(sales_people, size=number_of_rows),
+        "product": rng.choice(products, size=number_of_rows),
+        # integers() excludes the upper bound, so values fall in 1..5000.
+        "sales_income": rng.integers(1, 5001, size=number_of_rows),
+    }
diff --git a/polars_v_pandas/dataframe_and_lazyframe_time_tests.py b/polars_v_pandas/dataframe_and_lazyframe_time_tests.py
@@ -0,0 +1,78 @@
+import functools
+import sys
+from timeit import Timer
+
+import pandas as pd
+import polars as pl
+from data_generation import data_generation
+
+
+def create_pandas_dataframe(test_data):
+    """Build a pandas DataFrame from test_data with pyarrow-backed dtypes."""
+    frame = pd.DataFrame(test_data)
+    return frame.convert_dtypes(dtype_backend="pyarrow")
+
+
+def create_polars_dataframe(test_data):
+    """Build a Polars DataFrame from test_data."""
+    frame = pl.DataFrame(test_data)
+    return frame
+
+
+def create_polars_lazyframe(test_data):
+    """Build a Polars LazyFrame from test_data."""
+    lazy_frame = pl.LazyFrame(test_data)
+    return lazy_frame
+
+
+def analyze_pandas_dataframe(pandas_df):
+    """Sum sales_income per region, product and sales_person.
+
+    The aggregated result is not returned; the function returns None.
+    """
+    group_keys = ["region", "product", "sales_person"]
+    grouped = pandas_df.groupby(group_keys)
+    grouped["sales_income"].sum()
+
+
+def analyze_polars_dataframe(polars_df):
+    """Sum sales_income per region, product and sales_person.
+
+    The aggregated result is not returned; the function returns None.
+    """
+    group_keys = ["region", "product", "sales_person"]
+    income_sum = pl.col("sales_income").sum()
+    polars_df.group_by(group_keys).agg(total_sales=income_sum)
+
+
+def analyze_polars_lazyframe(polars_lf):
+    """Build the grouped sales_income sum on a LazyFrame and collect it.
+
+    The collected result is not returned; the function returns None.
+    """
+    group_keys = ["region", "product", "sales_person"]
+    income_sum = pl.col("sales_income").sum()
+    query = polars_lf.group_by(group_keys).agg(total_sales=income_sum)
+    query.collect()
+
+
+# Read the row count from the command line once and reuse it for the data
+# generation and for every label printed below.
+if len(sys.argv) < 2:
+    sys.exit(
+        "Usage: python dataframe_and_lazyframe_time_tests.py NUMBER_OF_ROWS"
+    )
+number_of_rows = int(sys.argv[1])
+
+test_data = data_generation(number_of_rows)
+
+# Creation timings: each figure is the total time for 100 calls.
+print(f"Pandas dataframe creation time for {number_of_rows} rows:")
+print(
+    Timer(functools.partial(create_pandas_dataframe, test_data)).timeit(100)
+)
+print()
+print(f"Polars dataframe creation time for {number_of_rows} rows:")
+print(
+    Timer(functools.partial(create_polars_dataframe, test_data)).timeit(100)
+)
+print()
+print(f"Polars lazyframe creation time for {number_of_rows} rows:")
+print(
+    Timer(functools.partial(create_polars_lazyframe, test_data)).timeit(100)
+)
+
+print()
+
+# Build each frame once so the analysis timings exclude creation time.
+pandas_df = create_pandas_dataframe(test_data)
+polars_df = create_polars_dataframe(test_data)
+polars_lf = create_polars_lazyframe(test_data)
+
+# Analysis timings: each figure is the total time for 100 calls.
+print(f"Pandas dataframe analysis time for {number_of_rows} rows:")
+print(
+    Timer(functools.partial(analyze_pandas_dataframe, pandas_df)).timeit(100)
+)
+
+print()
+print(f"Polars dataframe analysis time for {number_of_rows} rows:")
+print(
+    Timer(functools.partial(analyze_polars_dataframe, polars_df)).timeit(100)
+)
+
+print()
+print(f"Polars lazyframe analysis time for {number_of_rows} rows:")
+print(
+    Timer(functools.partial(analyze_polars_lazyframe, polars_lf)).timeit(100)
+)
diff --git a/polars_v_pandas/dataframe_conversions.py b/polars_v_pandas/dataframe_conversions.py
@@ -0,0 +1,29 @@
+import narwhals as nw
+import polars as pl
+from data_generation import data_generation
+
+# Build a four-row Polars DataFrame from the generated sample data.
+polars_df = pl.DataFrame(data_generation(4))
+# A bare expression such as the next line only displays a value in an
+# interactive session (REPL or notebook); run as a script it prints nothing.
+polars_df
+
+# Convert the Polars DataFrame to pandas.
+pandas_df = polars_df.to_pandas()
+type(pandas_df)
+pandas_df
+
+# Convert the pandas DataFrame back to Polars, rebinding polars_df.
+polars_df = pl.from_pandas(pandas_df)
+type(polars_df)
+polars_df
+
+
+def agnostic_groupby(df):
+    """Return the summed sales_income per region for df.
+
+    The frame is wrapped with Narwhals via from_native(), grouped by
+    "region", aggregated with a sum of "sales_income", sorted by "region",
+    and converted back with to_native().
+
+    Args:
+        df: A DataFrame with "region" and "sales_income" columns. The rest
+            of this file passes both a pandas and a Polars DataFrame.
+    """
+    return (
+        nw.from_native(df)
+        .group_by("region")
+        .agg(nw.col("sales_income").sum())
+        .sort("region")
+        .to_native()
+    )
+
+
+# Keep the original, misspelled name working for existing callers.
+agnositic_groupby = agnostic_groupby
+
+
+# Call the same function with the pandas and then the Polars DataFrame. The
+# return values are neither stored nor printed here.
+agnositic_groupby(pandas_df)
+
+agnositic_groupby(polars_df)
diff --git a/polars_v_pandas/online_retail.parquet b/polars_v_pandas/online_retail.parquet
diff --git a/polars_v_pandas/sample_pandas_and_polars_code.py b/polars_v_pandas/sample_pandas_and_polars_code.py
@@ -0,0 +1,36 @@
+import pandas as pd
+import polars as pl
+
+# Load the retail data into a pandas DataFrame.
+orders_pandas = pd.read_parquet("online_retail.parquet")
+
+# Index-based pandas style: add a "Total" column to orders_pandas itself ...
+orders_pandas["Total"] = orders_pandas["Quantity"] * orders_pandas["UnitPrice"]
+
+# ... then select four columns and keep the rows whose Total exceeds 10.
+# The result is not stored, so it is only displayed in an interactive session.
+orders_pandas[["InvoiceNo", "Quantity", "UnitPrice", "Total"]][
+    orders_pandas["Total"] > 10
+].head(3)
+
+
+# Method-chaining pandas style: assign, select columns, then filter.
+# NOTE(review): this filter uses 100 while the index-based and Polars
+# examples use 10 - confirm the difference is intentional.
+(
+    orders_pandas
+    .assign(Total=orders_pandas["Quantity"] * orders_pandas["UnitPrice"])
+    .filter(["InvoiceNo", "Quantity", "UnitPrice", "Total"])
+    .query("Total > 100")
+).head(3)
+
+
+# The same chain with the .query() step commented out, so no rows are
+# filtered before head(3).
+(
+    orders_pandas.assign(
+        Total=orders_pandas["Quantity"] * orders_pandas["UnitPrice"]
+    ).filter(["InvoiceNo", "Quantity", "UnitPrice", "Total"])
+    #   .query("Total > 100")
+).head(3)
+
+
+# Load the same file into a Polars DataFrame.
+orders_polars = pl.read_parquet("online_retail.parquet")
+
+# Polars expression style: select three columns plus a computed "total"
+# column, then keep the rows whose total exceeds 10.
+(
+    orders_polars.select(
+        pl.col(["InvoiceNo", "Quantity", "UnitPrice"]),
+        total=pl.col("Quantity") * pl.col("UnitPrice"),
+    ).filter(pl.col("total") > 10)
+).head(3)
diff --git a/polars_v_pandas/streaming_test.py b/polars_v_pandas/streaming_test.py
@@ -0,0 +1,38 @@
+import functools
+import sys
+from timeit import Timer
+
+import polars as pl
+from data_generation import data_generation
+
+
+def create_polars_lazyframe(test_data):
+    """Build a Polars LazyFrame from test_data."""
+    lazy_frame = pl.LazyFrame(test_data)
+    return lazy_frame
+
+
+def analyze_polars_lazyframe(polars_lf):
+    """Build the grouped sales_income sum on a LazyFrame and collect it.
+
+    The collected result is not returned; the function returns None.
+    """
+    group_keys = ["region", "product", "sales_person"]
+    income_sum = pl.col("sales_income").sum()
+    query = polars_lf.group_by(group_keys).agg(total_sales=income_sum)
+    query.collect()
+
+
+def analyze_polars_streaming(polars_lf):
+    """Build the grouped sales_income sum and collect it with engine="streaming".
+
+    The collected result is not returned; the function returns None.
+    """
+    group_keys = ["region", "product", "sales_person"]
+    income_sum = pl.col("sales_income").sum()
+    query = polars_lf.group_by(group_keys).agg(total_sales=income_sum)
+    query.collect(engine="streaming")
+
+
+# Read the row count from the command line once and reuse it for the data
+# generation and for both labels printed below.
+if len(sys.argv) < 2:
+    sys.exit("Usage: python streaming_test.py NUMBER_OF_ROWS")
+number_of_rows = int(sys.argv[1])
+
+test_data = data_generation(number_of_rows)
+
+# Both timings run against the same LazyFrame.
+polars_lf = create_polars_lazyframe(test_data)
+
+# Each figure printed below is the total time for 100 calls.
+print()
+print(f"Polars lazyframe analysis time for {number_of_rows} rows:")
+print(
+    Timer(functools.partial(analyze_polars_lazyframe, polars_lf)).timeit(100)
+)
+
+print(f"Polars streaming analysis time for {number_of_rows} rows:")
+print(
+    Timer(functools.partial(analyze_polars_streaming, polars_lf)).timeit(100)
+)