Skip to content

Post TR1 Commit #686

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
207 changes: 207 additions & 0 deletions polars_v_pandas/DataFrame_Plots.ipynb

Large diffs are not rendered by default.

26 changes: 26 additions & 0 deletions polars_v_pandas/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
The materials contained in this download are designed to complement the Real Python tutorial [Polars vs pandas: What's the Difference?](https://realpython.com/polars-vs-pandas-difference/).

You should create a new folder named polars\_v\_pandas on your computer and place each of these files inside it. You may also consider creating a [Python virtual environment](https://realpython.com/python-virtual-environments-a-primer/) within this folder.

Your download bundle contains the following files:



online\_retail.parquet - This Parquet file contains retail data used in some of the queries.

data\_generation.py - This script contains the data\_generation() function used to generate different quantities of data.

code\_speed\_test.py - This script performs time tests for pandas and Polars DataFrames.

dataframe\_and\_lazyframe\_time\_tests.py - This script performs time tests for DataFrames and a LazyFrame.

streaming\_test.py - This script performs time tests for a LazyFrame with streaming enabled.



dataframe\_conversions.py - This file contains the code used to convert between pandas and Polars DataFrames, plus a Narwhals example.

sample\_pandas\_and\_polars\_code.py - This file contains the code used to illustrate the differences between pandas and Polars syntax.

DataFrame\_Plots.ipynb - This Jupyter Notebook file contains the plotting code to demonstrate default plotting capabilities.

19 changes: 19 additions & 0 deletions polars_v_pandas/data_generation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import numpy as np


def data_generation(number_of_rows, *, seed=None):
    """Generate random sales data as a dictionary of equal-length columns.

    Args:
        number_of_rows: Number of values to generate for every column.
        seed: Optional seed passed to ``np.random.default_rng()``. The
            default of ``None`` produces different data on every call;
            pass a fixed value to get identical data across runs, which
            makes timing comparisons repeatable.

    Returns:
        A dict with the keys ``"order_id"``, ``"region"``,
        ``"sales_person"``, ``"product"`` and ``"sales_income"``, each
        holding ``number_of_rows`` values.
    """
    rng = np.random.default_rng(seed)

    return {
        # Sequential identifiers starting at 1.
        "order_id": range(1, number_of_rows + 1),
        "region": rng.choice(
            ["North", "South", "East", "West"], size=number_of_rows
        ),
        "sales_person": rng.choice(
            ["Armstrong", "Aldrin", "Collins"], size=number_of_rows
        ),
        "product": rng.choice(
            ["Helmet", "Oxygen", "Boots", "Gloves"], size=number_of_rows
        ),
        # The upper bound of integers() is exclusive, so values are 1..5000.
        "sales_income": rng.integers(1, 5001, size=number_of_rows),
    }
78 changes: 78 additions & 0 deletions polars_v_pandas/dataframe_and_lazyframe_time_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import functools
import sys
from timeit import Timer

import pandas as pd
import polars as pl
from data_generation import data_generation


def create_pandas_dataframe(test_data):
return pd.DataFrame(test_data).convert_dtypes(dtype_backend="pyarrow")


def create_polars_dataframe(test_data):
    """Build an eager Polars DataFrame from ``test_data``."""
    eager_frame = pl.DataFrame(test_data)
    return eager_frame


def create_polars_lazyframe(test_data):
    """Build a Polars LazyFrame from ``test_data``."""
    lazy_frame = pl.LazyFrame(test_data)
    return lazy_frame


def analyze_pandas_dataframe(pandas_df):
    """Sum ``sales_income`` per region/product/sales_person group.

    The aggregated result is neither stored nor returned, so the function
    returns ``None``.
    """
    group_keys = ["region", "product", "sales_person"]
    grouped = pandas_df.groupby(group_keys)
    grouped["sales_income"].sum()


def analyze_polars_dataframe(polars_df):
    """Sum ``sales_income`` per region/product/sales_person group.

    The aggregated frame is neither stored nor returned, so the function
    returns ``None``.
    """
    grouped = polars_df.group_by(["region", "product", "sales_person"])
    grouped.agg(total_sales=pl.col("sales_income").sum())


def analyze_polars_lazyframe(polars_lf):
    """Build and collect a grouped ``sales_income`` total on a LazyFrame.

    The collected result is neither stored nor returned, so the function
    returns ``None``.
    """
    grouped = polars_lf.group_by(["region", "product", "sales_person"])
    lazy_query = grouped.agg(total_sales=pl.col("sales_income").sum())
    # collect() is what actually executes the lazy query.
    lazy_query.collect()


# The row count comes from the first command-line argument and is parsed once.
row_count = int(sys.argv[1])
test_data = data_generation(row_count)

# Creation benchmarks: (heading, factory) pairs, timed in this order.
creation_benchmarks = (
    (
        f"Pandas dataframe creation time for {row_count} rows:",
        create_pandas_dataframe,
    ),
    (
        f"Polars dataframe creation time for {row_count} rows:",
        create_polars_dataframe,
    ),
    (
        f"Polars lazyframe creation time for {row_count} rows:",
        create_polars_lazyframe,
    ),
)

for heading, factory in creation_benchmarks:
    print(heading)
    # timeit(100) calls the factory 100 times and reports the total seconds.
    print(Timer(functools.partial(factory, test_data)).timeit(100))
    print()

# Build each frame once so the analysis timings exclude creation cost.
pandas_df = create_pandas_dataframe(test_data)
polars_df = create_polars_dataframe(test_data)
polars_lf = create_polars_lazyframe(test_data)

# Analysis benchmarks: (heading, analysis function, frame) triples.
analysis_benchmarks = (
    (
        f"Pandas dataframe analysis time for {row_count} rows:",
        analyze_pandas_dataframe,
        pandas_df,
    ),
    (
        f"Polars dataframe analysis time for {row_count} rows:",
        analyze_polars_dataframe,
        polars_df,
    ),
    (
        f"Polars lazyframe analysis time for {row_count} rows:",
        analyze_polars_lazyframe,
        polars_lf,
    ),
)

for position, (heading, analyze, frame) in enumerate(analysis_benchmarks):
    # A blank line separates the reports, but none follows the last one.
    if position:
        print()
    print(heading)
    print(Timer(functools.partial(analyze, frame)).timeit(100))
29 changes: 29 additions & 0 deletions polars_v_pandas/dataframe_conversions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import narwhals as nw
import polars as pl
from data_generation import data_generation

# Demonstrates converting between Polars and pandas DataFrames.
# The bare expressions below (e.g. ``polars_df``, ``type(pandas_df)``) only
# display a value when entered in an interactive session or notebook; when
# this file runs as a script they are evaluated and discarded.

# Start from a four-row Polars DataFrame of generated data.
polars_df = pl.DataFrame(data_generation(4))
polars_df

# Polars -> pandas.
pandas_df = polars_df.to_pandas()
type(pandas_df)
pandas_df

# pandas -> Polars; this rebinds polars_df to the converted frame.
polars_df = pl.from_pandas(pandas_df)
type(polars_df)
polars_df


def agnostic_groupby(df):
    """Total ``sales_income`` per ``region`` using the Narwhals API.

    Args:
        df: A native dataframe accepted by ``nw.from_native()`` that has
            ``"region"`` and ``"sales_income"`` columns. This file passes
            both a pandas and a Polars DataFrame.

    Returns:
        The grouped totals sorted by ``"region"``, converted back with
        ``to_native()``.
    """
    return (
        nw.from_native(df)
        .group_by("region")
        .agg(nw.col("sales_income").sum())
        .sort("region")
        .to_native()
    )


# Backward-compatible alias for the original, misspelled function name.
agnositic_groupby = agnostic_groupby


agnostic_groupby(pandas_df)

agnostic_groupby(polars_df)
Binary file added polars_v_pandas/online_retail.parquet
Binary file not shown.
36 changes: 36 additions & 0 deletions polars_v_pandas/sample_pandas_and_polars_code.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import pandas as pd
import polars as pl

# Side-by-side examples of pandas and Polars syntax on the retail data.
# Each example ends in a bare expression, so its result is only displayed
# in an interactive session or notebook.
orders_pandas = pd.read_parquet("online_retail.parquet")

# pandas, bracket style: add a Total column in place, then select columns
# and apply a boolean mask.
orders_pandas["Total"] = orders_pandas["Quantity"] * orders_pandas["UnitPrice"]

orders_pandas[["InvoiceNo", "Quantity", "UnitPrice", "Total"]][
orders_pandas["Total"] > 10
].head(3)


# pandas, method-chaining style: assign -> filter (column selection) -> query.
# NOTE(review): this example filters on Total > 100, while the bracket
# example above and the Polars example below use 10 - confirm whether the
# thresholds are meant to match.
(
orders_pandas
.assign(Total=orders_pandas["Quantity"] * orders_pandas["UnitPrice"])
.filter(["InvoiceNo", "Quantity", "UnitPrice", "Total"])
.query("Total > 100")
).head(3)


# NOTE(review): this repeats the chained example with the row filter
# commented out, so it returns the first three rows unfiltered - confirm
# whether this variant is intentional or a leftover.
(
orders_pandas.assign(
Total=orders_pandas["Quantity"] * orders_pandas["UnitPrice"]
).filter(["InvoiceNo", "Quantity", "UnitPrice", "Total"])
# .query("Total > 100")
).head(3)


# Polars: select the columns plus a computed ``total`` expression, then
# filter on that new column.
orders_polars = pl.read_parquet("online_retail.parquet")

(
orders_polars.select(
pl.col(["InvoiceNo", "Quantity", "UnitPrice"]),
total=pl.col("Quantity") * pl.col("UnitPrice"),
).filter(pl.col("total") > 10)
).head(3)
38 changes: 38 additions & 0 deletions polars_v_pandas/streaming_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import functools
import sys
from timeit import Timer

import polars as pl
from data_generation import data_generation


def create_polars_lazyframe(test_data):
    """Build a Polars LazyFrame from ``test_data``."""
    lazy_frame = pl.LazyFrame(test_data)
    return lazy_frame


def analyze_polars_lazyframe(polars_lf):
    """Build and collect a grouped ``sales_income`` total on a LazyFrame.

    ``collect()`` is called without arguments. The collected result is
    neither stored nor returned, so the function returns ``None``.
    """
    grouped = polars_lf.group_by(["region", "product", "sales_person"])
    lazy_query = grouped.agg(total_sales=pl.col("sales_income").sum())
    lazy_query.collect()


def analyze_polars_streaming(polars_lf):
    """Run the grouped ``sales_income`` total with the streaming engine.

    Builds the same query as ``analyze_polars_lazyframe`` but collects it
    with ``engine="streaming"``. The collected result is neither stored nor
    returned, so the function returns ``None``.
    """
    grouped = polars_lf.group_by(["region", "product", "sales_person"])
    lazy_query = grouped.agg(total_sales=pl.col("sales_income").sum())
    lazy_query.collect(engine="streaming")


# The row count comes from the first command-line argument and is parsed once.
row_count = int(sys.argv[1])
test_data = data_generation(row_count)

polars_lf = create_polars_lazyframe(test_data)

# Both timings run against the same LazyFrame, in this order.
streaming_benchmarks = (
    (
        f"Polars lazyframe analysis time for {row_count} rows:",
        analyze_polars_lazyframe,
    ),
    (
        f"Polars streaming analysis time for {row_count} rows:",
        analyze_polars_streaming,
    ),
)

print()
for heading, analyze in streaming_benchmarks:
    print(heading)
    # timeit(100) calls the analysis 100 times and reports the total seconds.
    print(Timer(functools.partial(analyze, polars_lf)).timeit(100))
Loading