From 948eb4fd7c7bdaeeaa91b466ae57e5b061af0afd Mon Sep 17 00:00:00 2001 From: DrSaadLa Date: Sat, 6 Jan 2024 08:31:19 +0300 Subject: [PATCH] Add content to introduction to polars chapter --- .../IntroToPolars.ipynb | 169 ++++ ...PolarsVsPandas_TimingReading_CSVData.ipynb | 0 .../1_Series.ipynb | 0 .../2_DataFrames.ipynb | 0 .../3_CreaatingDataFrames.ipynb | 798 ++++++++++++++++++ .../01. Reading Plain Text Data.ipynb | 0 scripts/utils.py | 72 ++ 7 files changed, 1039 insertions(+) create mode 100644 Tutorials/01_IntroductionToPolars/IntroToPolars.ipynb rename Tutorials/{01_PolarsVsPandas => 02_PolarsVsPandas}/PolarsVsPandas_TimingReading_CSVData.ipynb (100%) rename Tutorials/{02_BasicDataStructures => 03_BasicDataStructures}/1_Series.ipynb (100%) rename Tutorials/{02_BasicDataStructures => 03_BasicDataStructures}/2_DataFrames.ipynb (100%) create mode 100644 Tutorials/03_BasicDataStructures/3_CreaatingDataFrames.ipynb rename Tutorials/{03_ReadingData => 04_ReadingData}/01. Reading Plain Text Data.ipynb (100%) create mode 100644 scripts/utils.py diff --git a/Tutorials/01_IntroductionToPolars/IntroToPolars.ipynb b/Tutorials/01_IntroductionToPolars/IntroToPolars.ipynb new file mode 100644 index 0000000..febbd6b --- /dev/null +++ b/Tutorials/01_IntroductionToPolars/IntroToPolars.ipynb @@ -0,0 +1,169 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4a699447-f6c2-4385-a32e-4db8f5202d01", + "metadata": {}, + "source": [ + "
\n", + " Basic Data Structures with Polars
\n", + " Series\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "c34b1ebb-1849-4b6c-bbd4-a66919a78008", + "metadata": {}, + "outputs": [], + "source": [ + "# ============================================================\n", + "# =\n", + "# Title: Basic Data Structures with Polars =\n", + "# Series =\n", + "# --------------------------------- =\n", + "# =\n", + "# Author: Dr. Saad Laouadi =\n", + "# =\n", + "# Copyright: Dr. Saad Laouadi =\n", + "# ============================================================\n", + "# =\n", + "# LICENSE =\n", + "# ---------------------- =\n", + "# =\n", + "# This material is intended for educational =\n", + "# purposes only and may not be used directly in =\n", + "# courses, video recordings, or similar =\n", + "# without prior consent from the author. =\n", + "# When using or referencing this material, =\n", + "# proper credit must be attributed to the =\n", + "# author. =\n", + "# ============================================================" + ] + }, + { + "cell_type": "markdown", + "id": "8433e4ea-e8e8-4839-9afa-26750adaf9d2", + "metadata": {}, + "source": [ + "## Importing Polars " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "bb9f4303-1437-414c-8d35-8418cf9f0d7e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# import polars as pl\n", + "import polars as pl" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "42fa0cf4-0846-4135-b277-64cb0aa916a3", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Polars version is: 0.20.2\n" + ] + } + ], + "source": [ + "# Check polars version\n", + "print(f\"Polars version is: {pl.__version__}\")" + ] + }, + { + "cell_type": "markdown", + "id": "692b5b6f-c2b0-4ee5-b23d-26090af142cd", + "metadata": {}, + "source": [ + "## Show Version with `show_versions()` Method" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": 
"b6907938-d41b-4146-a8e1-5344c8a59def", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------Version info---------\n", + "Polars: 0.20.2\n", + "Index type: UInt32\n", + "Platform: macOS-13.6.1-x86_64-i386-64bit\n", + "Python: 3.12.1 | packaged by conda-forge | (main, Dec 23 2023, 08:05:03) [Clang 16.0.6 ]\n", + "\n", + "----Optional dependencies----\n", + "adbc_driver_manager: \n", + "cloudpickle: \n", + "connectorx: \n", + "deltalake: \n", + "fsspec: \n", + "gevent: \n", + "matplotlib: 3.8.2\n", + "numpy: 1.26.2\n", + "openpyxl: 3.1.2\n", + "pandas: 2.1.4\n", + "pyarrow: 14.0.2\n", + "pydantic: 2.5.3\n", + "pyiceberg: \n", + "pyxlsb: 1.0.10\n", + "sqlalchemy: 2.0.25\n", + "xlsx2csv: 0.8.1\n", + "xlsxwriter: 3.1.9\n" + ] + } + ], + "source": [ + "# Check polars versions\n", + "pl.show_versions()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0f9c99b-1724-46cc-823a-b5b44723be68", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "PolarsEnv", + "language": "python", + "name": "plenv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Tutorials/01_PolarsVsPandas/PolarsVsPandas_TimingReading_CSVData.ipynb b/Tutorials/02_PolarsVsPandas/PolarsVsPandas_TimingReading_CSVData.ipynb similarity index 100% rename from Tutorials/01_PolarsVsPandas/PolarsVsPandas_TimingReading_CSVData.ipynb rename to Tutorials/02_PolarsVsPandas/PolarsVsPandas_TimingReading_CSVData.ipynb diff --git a/Tutorials/02_BasicDataStructures/1_Series.ipynb b/Tutorials/03_BasicDataStructures/1_Series.ipynb similarity index 100% rename from Tutorials/02_BasicDataStructures/1_Series.ipynb 
rename to Tutorials/03_BasicDataStructures/1_Series.ipynb diff --git a/Tutorials/02_BasicDataStructures/2_DataFrames.ipynb b/Tutorials/03_BasicDataStructures/2_DataFrames.ipynb similarity index 100% rename from Tutorials/02_BasicDataStructures/2_DataFrames.ipynb rename to Tutorials/03_BasicDataStructures/2_DataFrames.ipynb diff --git a/Tutorials/03_BasicDataStructures/3_CreaatingDataFrames.ipynb b/Tutorials/03_BasicDataStructures/3_CreaatingDataFrames.ipynb new file mode 100644 index 0000000..ed30b7c --- /dev/null +++ b/Tutorials/03_BasicDataStructures/3_CreaatingDataFrames.ipynb @@ -0,0 +1,798 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4a699447-f6c2-4385-a32e-4db8f5202d01", + "metadata": {}, + "source": [ + "
\n", + " Basic Data Structures with Polars
\n", + " DataFrames\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c34b1ebb-1849-4b6c-bbd4-a66919a78008", + "metadata": {}, + "outputs": [], + "source": [ + "# ============================================================\n", + "# =\n", + "# Title: Basic Data Structures with Polars =\n", + "# Data Frames =\n", + "# --------------------------------- =\n", + "# =\n", + "# Author: Dr. Saad Laouadi =\n", + "# =\n", + "# Copyright: Dr. Saad Laouadi =\n", + "# ============================================================\n", + "# =\n", + "# LICENSE =\n", + "# ---------------------- =\n", + "# =\n", + "# This material is intended for educational =\n", + "# purposes only and may not be used directly in =\n", + "# courses, video recordings, or similar =\n", + "# without prior consent from the author. =\n", + "# When using or referencing this material, =\n", + "# proper credit must be attributed to the =\n", + "# author. =\n", + "# ============================================================" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "7a9be308-bfec-44e1-b2f6-14187eba305e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Environment Setup\n", + "import sys\n", + "sys.path.append('../../scripts/') \n", + "\n", + "# import the working libraries\n", + "from importlibs import *\n", + "from utils import install_faker, update_pip" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "889cdc44-6fd2-4ea6-97a9-e63040f3aca6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "pip is already up-to-date.\n", + "Faker is already installed. 
Version: 22.0.0\n" + ] + } + ], + "source": [ + "update_pip()\n", + "install_faker()" + ] + }, + { + "cell_type": "markdown", + "id": "22b48f06-4cf6-4b4e-98d5-b531abc53d2d", + "metadata": {}, + "source": [ + "## Creating DataFrames with `DataFrame` Constructor " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "edf58a05-c25d-4cad-a878-2cbedb36087b", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "a79238cd-e4d7-4e2c-86c4-b483d045824c", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 4)
Var_1Var_2Var_3Var_4
f32f64f32f64
-0.09195-1.4633511.081792-0.239325
-0.491129-1.0022720.918822-1.103632
0.626493-0.5615140.028855-0.230767
0.5877520.752318-1.0585031.055972
0.747751.0646771.52013-1.488603
" + ], + "text/plain": [ + "shape: (5, 4)\n", + "┌───────────┬───────────┬───────────┬───────────┐\n", + "│ Var_1 ┆ Var_2 ┆ Var_3 ┆ Var_4 │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ f32 ┆ f64 ┆ f32 ┆ f64 │\n", + "╞═══════════╪═══════════╪═══════════╪═══════════╡\n", + "│ -0.09195 ┆ -1.463351 ┆ 1.081792 ┆ -0.239325 │\n", + "│ -0.491129 ┆ -1.002272 ┆ 0.918822 ┆ -1.103632 │\n", + "│ 0.626493 ┆ -0.561514 ┆ 0.028855 ┆ -0.230767 │\n", + "│ 0.587752 ┆ 0.752318 ┆ -1.058503 ┆ 1.055972 │\n", + "│ 0.74775 ┆ 1.064677 ┆ 1.52013 ┆ -1.488603 │\n", + "└───────────┴───────────┴───────────┴───────────┘" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Creating column names (Variables)\n", + "cols = [\"Var_\" + str(num) for num in range(1, 5)]\n", + "\n", + "# Generating Random Data\n", + "np.random.seed(22)\n", + "my_data = np.random.normal(size = 60).reshape(15, 4)\n", + "\n", + "dtypes = [pl.Float32, pl.Float64, pl.Float32, pl.Float64]\n", + "schema = {colname:coltype for (colname, coltype) in zip(cols, dtypes)}\n", + "\n", + "df = pl.DataFrame(data = my_data, schema=schema)\n", + "df.head() " + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "95ad44f5-106c-460b-8c99-91f6481137af", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "# Examine the type of the df object\n", + "print(type(df))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "93813cec-7fb4-4a4e-ab85-e1f7fe491589", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "OrderedDict([('Var_1', Float32),\n", + " ('Var_2', Float64),\n", + " ('Var_3', Float32),\n", + " ('Var_4', Float64)])" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.schema" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"c8bb9145-9ba9-4ca9-b97e-fce8601cf358", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "9a641ace-7b7d-4c95-bee4-f28c0cd3aa8c", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'Var1': [12.46, 13.79, 6.1, 20.07, 7.36, 10.89],\n", + " 'Var2': [20, 30, 40, 20, 10, 10],\n", + " 'Var3': [3, 2, 1, 2, 2, 2]}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create a random dictionary \n", + "np.random.seed(2)\n", + "data = {'Var1': list(np.round(np.random.normal(loc = 14, scale=3.7, size = 6), 2)), \n", + " 'Var2': list(np.random.poisson(lam = 3, size = 6)*10), \n", + " 'Var3': list(np.random.binomial(n = 5, p = 0.5, size = 6))}\n", + "# np.random.binomial()\n", + "# Display the dict object \n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "19a007f8-f0fe-477c-a73d-7727c8ba4bf9", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (6, 3)
Var1Var2Var3
f64i64i64
12.46203
13.79302
6.1401
20.07202
7.36102
10.89102
" + ], + "text/plain": [ + "shape: (6, 3)\n", + "┌───────┬──────┬──────┐\n", + "│ Var1 ┆ Var2 ┆ Var3 │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ f64 ┆ i64 ┆ i64 │\n", + "╞═══════╪══════╪══════╡\n", + "│ 12.46 ┆ 20 ┆ 3 │\n", + "│ 13.79 ┆ 30 ┆ 2 │\n", + "│ 6.1 ┆ 40 ┆ 1 │\n", + "│ 20.07 ┆ 20 ┆ 2 │\n", + "│ 7.36 ┆ 10 ┆ 2 │\n", + "│ 10.89 ┆ 10 ┆ 2 │\n", + "└───────┴──────┴──────┘" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Construct a dataframe object from the dict object\n", + "df_from_dict = pl.DataFrame(data)\n", + "df_from_dict" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "796390a5-df53-4fcc-9b00-b56cc01b4e60", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "OrderedDict([('Var1', Float64), ('Var2', Int64), ('Var3', Int64)])" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_from_dict.schema" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25436179-f600-4fc4-8c09-47dd1d1a3dbb", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "9e5aaf04-fef0-4e68-a591-ac574baad15e", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape: (6, 3)\n", + "┌─────────────────┬──────────────────────────────────┬───────────────┐\n", + "│ Cities ┆ Countries ┆ Country_codes │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ str │\n", + "╞═════════════════╪══════════════════════════════════╪═══════════════╡\n", + "│ Glennmouth ┆ Palau ┆ AL │\n", + "│ Romeroshire ┆ Saint Vincent and the Grenadines ┆ SM │\n", + "│ East Linda ┆ Nicaragua ┆ BI │\n", + "│ Sotoburgh ┆ Pakistan ┆ BH │\n", + "│ South Kevinland ┆ Cayman Islands ┆ AZ │\n", + "│ New Tinamouth ┆ Nigeria ┆ ET │\n", + "└─────────────────┴──────────────────────────────────┴───────────────┘\n" + ] + } + 
], + "source": [ + "# Generate fake data \n", + "from faker import Faker\n", + "fake = Faker()\n", + "\n", + "\n", + "# Set the seed\n", + "fake.seed_instance(11)\n", + "\n", + "# Generate a random data in a dict object\n", + "data = {\"Cities\": [fake.city() for i in range(6)],\n", + " \"Countries\": [fake.country() for i in range(6)],\n", + " \"Country_codes\": [fake.country_code() for i in range(6)]\n", + " }\n", + "\n", + "# Construct a data frame object \n", + "data_df = pl.DataFrame(data)\n", + "\n", + "# Display the data frame object\n", + "print(data_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a73f6ba6-b80b-4d6a-81e9-52245c12829e", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "OrderedDict({'Cities': Utf8, 'Countries': Utf8, 'Country_codes': Utf8})\n" + ] + } + ], + "source": [ + "print(data_df.schema)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33306d0b-5348-4a32-ae78-1bfe3a7a7000", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "d630fc7e-93bd-4770-a951-0f5a4082e330", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Generate a random list of lists\n", + "import random\n", + "fake.seed_instance(11)\n", + "lst_of_lsts = [\n", + " fake.pylist(nb_elements=10,\n", + " variable_nb_elements = False, \n", + " value_types='int') for i in range(5)]\n", + "\n", + "\n", + "# Generate random Keys\n", + "random.seed(11)\n", + "keys = fake.words(nb = 5)\n", + "keys = [key.title() for key in keys]\n", + "\n", + "# Create a dictionary whose values are lists\n", + "d_lst = {k:v for k, v in zip(keys, lst_of_lsts)}" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "a5a535be-d050-4215-98c4-01736abae56d", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'Politics': [9171, 7402, 3025, 3050, 
7316, 2323, 8825, 9755, 7421, 245],\n", + " 'Beyond': [975, 3116, 9824, 7601, 7217, 8505, 4819, 75, 7492, 6664],\n", + " 'Writer': [4161, 3762, 487, 9226, 6560, 4766, 1094, 8, 3436, 7700],\n", + " 'Degree': [6511, 1196, 4420, 1427, 5449, 6718, 2205, 1655, 981, 7976],\n", + " 'Election': [9163, 7330, 2145, 6287, 6469, 3487, 4420, 321, 3068, 9863]}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "d_lst" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "b57046db-6a13-492e-8e45-7fdb292c3238", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (10, 5)
PoliticsBeyondWriterDegreeElection
i64i64i64i64i64
9171975416165119163
74023116376211967330
3025982448744202145
30507601922614276287
73167217656054496469
23238505476667183487
88254819109422054420
97557581655321
7421749234369813068
2456664770079769863
" + ], + "text/plain": [ + "shape: (10, 5)\n", + "┌──────────┬────────┬────────┬────────┬──────────┐\n", + "│ Politics ┆ Beyond ┆ Writer ┆ Degree ┆ Election │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │\n", + "╞══════════╪════════╪════════╪════════╪══════════╡\n", + "│ 9171 ┆ 975 ┆ 4161 ┆ 6511 ┆ 9163 │\n", + "│ 7402 ┆ 3116 ┆ 3762 ┆ 1196 ┆ 7330 │\n", + "│ 3025 ┆ 9824 ┆ 487 ┆ 4420 ┆ 2145 │\n", + "│ 3050 ┆ 7601 ┆ 9226 ┆ 1427 ┆ 6287 │\n", + "│ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ 8825 ┆ 4819 ┆ 1094 ┆ 2205 ┆ 4420 │\n", + "│ 9755 ┆ 75 ┆ 8 ┆ 1655 ┆ 321 │\n", + "│ 7421 ┆ 7492 ┆ 3436 ┆ 981 ┆ 3068 │\n", + "│ 245 ┆ 6664 ┆ 7700 ┆ 7976 ┆ 9863 │\n", + "└──────────┴────────┴────────┴────────┴──────────┘" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pl.DataFrame(d_lst)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "87bfd89b-5bf1-4dac-be8a-738ca048fe17", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "e09c122a-1bfd-459a-b045-2cbe40b27f89", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape: (1, 5)\n", + "┌───────────┬──────┬──────┬───────┬────────┐\n", + "│ community ┆ kid ┆ road ┆ prove ┆ father │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │\n", + "╞═══════════╪══════╪══════╪═══════╪════════╡\n", + "│ 3539 ┆ 8396 ┆ 8275 ┆ 52 ┆ 6079 │\n", + "└───────────┴──────┴──────┴───────┴────────┘\n" + ] + } + ], + "source": [ + "# Generate random data\n", + "# Set the seed\n", + "fake.seed_instance(21)\n", + "\n", + "# Generate a random dict object \n", + "rnd_dict = fake.pydict(nb_elements = 5, \n", + " variable_nb_elements = False, \n", + " value_types='int')\n", + "\n", + "# Construct a dataframe\n", + "rnd_df = pl.DataFrame(rnd_dict)\n", + "\n", + "# Display the dataframe object\n", + "print(rnd_df)" + ] 
+ }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6775214-843d-4dfe-ad40-cf816c57a46c", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "5025072f-894a-414f-921c-1cd5f2543b99", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (10, 6)
Michael ReidDaniel SparksKayla WagnerAaron WatsonAndrea LeonDavid Williams
i64i64i64i64i64i64
72393848633340262659809
53893233439276371962287
18979515260428517916463
380200646614147185992
14499157114860142266041
3203294936366685697937
263923949117967540725045
657547832960567864638754
247573945705829255392282
60852095678405024776264
" + ], + "text/plain": [ + "shape: (10, 6)\n", + "┌──────────────┬───────────────┬──────────────┬──────────────┬─────────────┬────────────────┐\n", + "│ Michael Reid ┆ Daniel Sparks ┆ Kayla Wagner ┆ Aaron Watson ┆ Andrea Leon ┆ David Williams │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │\n", + "╞══════════════╪═══════════════╪══════════════╪══════════════╪═════════════╪════════════════╡\n", + "│ 7239 ┆ 3848 ┆ 6333 ┆ 4026 ┆ 2659 ┆ 809 │\n", + "│ 5389 ┆ 3233 ┆ 439 ┆ 2763 ┆ 7196 ┆ 2287 │\n", + "│ 1897 ┆ 9515 ┆ 2604 ┆ 2851 ┆ 7916 ┆ 463 │\n", + "│ 380 ┆ 2006 ┆ 46 ┆ 6141 ┆ 4718 ┆ 5992 │\n", + "│ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ 2639 ┆ 2394 ┆ 9117 ┆ 9675 ┆ 4072 ┆ 5045 │\n", + "│ 6575 ┆ 4783 ┆ 2960 ┆ 5678 ┆ 6463 ┆ 8754 │\n", + "│ 2475 ┆ 7394 ┆ 5705 ┆ 8292 ┆ 5539 ┆ 2282 │\n", + "│ 608 ┆ 5209 ┆ 5678 ┆ 4050 ┆ 2477 ┆ 6264 │\n", + "└──────────────┴───────────────┴──────────────┴──────────────┴─────────────┴────────────────┘" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Generate random data\n", + "fake.seed_instance(21)\n", + "rnd_dict = {k:v for k, v in zip([fake.name() for i in range(6)],\n", + " [fake.pylist(nb_elements=10, variable_nb_elements=False, \n", + " value_types='int') for i in range(10)])}\n", + "\n", + "rnd_df = pl.DataFrame(rnd_dict)\n", + "rnd_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1464e267-08c2-4c28-a338-c90fca098f25", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26bd4ef6-260e-4cef-be8b-3c9cc04818a9", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "ad02558e-087f-499f-9941-28dd8485328a", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape: (10, 3)\n", + "┌─────────┬──────┬────────┐\n", + "│ ocuntry ┆ Year ┆ 
GPD │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ str ┆ i64 ┆ f64 │\n", + "╞═════════╪══════╪════════╡\n", + "│ England ┆ 2018 ┆ 2000.0 │\n", + "│ England ┆ 2019 ┆ 2500.0 │\n", + "│ England ┆ 2020 ┆ 3000.0 │\n", + "│ Japan ┆ 2018 ┆ 5000.0 │\n", + "│ … ┆ … ┆ … │\n", + "│ Germany ┆ 2019 ┆ 4500.0 │\n", + "│ Germany ┆ 2020 ┆ 5000.0 │\n", + "│ Germany ┆ 2021 ┆ 5500.0 │\n", + "│ Germany ┆ 2022 ┆ 6000.0 │\n", + "└─────────┴──────┴────────┘\n" + ] + } + ], + "source": [ + "data = dict(ocuntry = list(np.repeat(\"England\", repeats = 3)) + \\\n", + " list(np.repeat(\"Japan\", repeats = 2)) + \\\n", + " list(np.repeat(\"Germany\", repeats = 5)),\n", + " Year = [*range(2018, 2021)] + \\\n", + " [*range(2018, 2020)] + \\\n", + " [*range(2018, 2023)],\n", + " GPD = list(np.linspace(2000, 3000, num = 3 )) + \\\n", + " list(np.linspace(5000, 6000, num = 2 )) + \\\n", + " list(np.linspace(4000, 6000, num = 5 )))\n", + "\n", + "# Create a data frame\n", + "df = pl.DataFrame(data)\n", + "\n", + "# Print the data frame\n", + "print(df)" + ] + }, + { + "cell_type": "markdown", + "id": "99b9f9d1-c4b2-4e4f-9807-0f5bd470e2b6", + "metadata": {}, + "source": [ + "## Creating Polars DataFrame from Pandas DataFrame " + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "bc689fc4-45ce-49be-933c-99803ab48fc6", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (10, 13)
jobcompanyssnresidencecurrent_locationblood_groupwebsiteusernamenamesexaddressmailbirthdate
strstrstrstrlist[f64]strlist[str]strstrstrstrstrdate
"Banker""Miller and Son…"754-86-5049""092 Little Uni…[-80.356638, -165.265656]"AB-"["https://snow-fox.net/", "http://www.johnson.com/", … "https://pollard-hayden.com/"]"framirez""Joel Moreno""M""775 Tucker For…"ncollins@yahoo…1967-05-24
"Youth worker""Mueller LLC""744-02-3136""Unit 1090 Box …[-18.638796, -48.556833]"B-"["https://www.cox.com/"]"watsondavid""Tiffany Johnso…"F""USNS Mills\n", + "FPO…"bauerjohn@gmai…1908-07-01
"Warden/ranger""Newton, Wilson…"032-40-7351""6927 Clarence …[49.126367, 134.045698]"O+"["https://www.green-obrien.biz/", "http://www.brown.com/", … "http://king.com/"]"hernandezpaul""Jay Fox""M""11023 Jeff Pik…"gailhodges@gma…1992-07-27
"Politician's a…"Hunt-Perkins""053-54-8603""750 Cheryl Hig…[-7.879592, -73.489453]"A+"["http://tucker.com/", "http://watson.com/", … "https://www.foster-martinez.com/"]"heatherjohnson…"Sarah Davidson…"F""745 Thomas Lan…"rfrederick@yah…1953-10-17
"Designer, grap…"Carlson-Dunn""713-87-1382""52826 Lauren S…[-1.573427, 55.603566]"B+"["http://www.hatfield.com/", "http://jacobson.com/", "https://daniel.org/"]"qhill""Christopher Di…"M""758 Reeves Sho…"edwardsalexand…1983-06-04
"Chief Marketin…"Powell and Son…"747-73-9936""68349 Tracy Ri…[-21.900311, 95.651652]"B+"["http://www.smith-washington.com/", "https://adams-shaw.com/", … "http://www.porter.com/"]"austin69""Kristin Rasmus…"F""2784 Jonathan …"laurenfuller@y…1912-10-06
"Programmer, sy…"Hall, Ramirez …"115-87-6214""5157 Jackson M…[28.647527, 163.393493]"B-"["https://www.barron.com/", "https://www.taylor-taylor.com/", "https://peterson.net/"]"amanda15""Paul Booth""M""45771 Leslie L…"usmith@gmail.c…2003-09-21
"Therapist, mus…"Weaver-Harriso…"155-47-5184""23542 Ward Cor…[-42.359901, -146.144115]"O+"["https://www.mills-bruce.com/", "https://www.long.info/", … "https://www.cruz-chavez.com/"]"longstephen""Mr. Timothy Fr…"M""53202 Santiago…"heatherweaver@…1931-03-03
"Accounting tec…"Morgan, Rice a…"381-63-0025""17309 Connie J…[-46.245894, 94.325648]"A-"["http://www.harris-peterson.com/", "https://www.edwards-evans.com/"]"jermaine28""Diane Cannon""F""833 Peter Cany…"vfernandez@gma…2004-12-05
"Therapist, spo…"Johnson-Gonzal…"111-18-8918""PSC 1743, Box …[20.779751, 43.046518]"A+"["https://www.park.com/", "https://medina-long.net/", … "https://salazar-gilbert.org/"]"kyle25""Pedro Horn""M""99243 Garcia T…"barnesmartha@g…1940-10-18
" + ], + "text/plain": [ + "shape: (10, 13)\n", + "┌────────────┬────────────┬────────────┬────────────┬───┬─────┬────────────┬───────────┬───────────┐\n", + "│ job ┆ company ┆ ssn ┆ residence ┆ … ┆ sex ┆ address ┆ mail ┆ birthdate │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ str ┆ str ┆ ┆ str ┆ str ┆ str ┆ date │\n", + "╞════════════╪════════════╪════════════╪════════════╪═══╪═════╪════════════╪═══════════╪═══════════╡\n", + "│ Banker ┆ Miller and ┆ 754-86-504 ┆ 092 Little ┆ … ┆ M ┆ 775 Tucker ┆ ncollins@ ┆ 1967-05-2 │\n", + "│ ┆ Sons ┆ 9 ┆ Unions ┆ ┆ ┆ Forges ┆ yahoo.com ┆ 4 │\n", + "│ ┆ ┆ ┆ Amymouth, ┆ ┆ ┆ Suite 294 ┆ ┆ │\n", + "│ ┆ ┆ ┆ NY 3… ┆ ┆ ┆ Coll… ┆ ┆ │\n", + "│ Youth ┆ Mueller ┆ 744-02-313 ┆ Unit 1090 ┆ … ┆ F ┆ USNS Mills ┆ bauerjohn ┆ 1908-07-0 │\n", + "│ worker ┆ LLC ┆ 6 ┆ Box 4931 ┆ ┆ ┆ FPO AE ┆ @gmail.co ┆ 1 │\n", + "│ ┆ ┆ ┆ DPO AP ┆ ┆ ┆ 51037 ┆ m ┆ │\n", + "│ ┆ ┆ ┆ 18392 ┆ ┆ ┆ ┆ ┆ │\n", + "│ Warden/ran ┆ Newton, ┆ 032-40-735 ┆ 6927 ┆ … ┆ M ┆ 11023 Jeff ┆ gailhodge ┆ 1992-07-2 │\n", + "│ ger ┆ Wilson and ┆ 1 ┆ Clarence ┆ ┆ ┆ Pike ┆ s@gmail.c ┆ 7 │\n", + "│ ┆ Rogers ┆ ┆ Spur ┆ ┆ ┆ Chavezside ┆ om ┆ │\n", + "│ ┆ ┆ ┆ Marytown, ┆ ┆ ┆ , SD 5… ┆ ┆ │\n", + "│ ┆ ┆ ┆ NV … ┆ ┆ ┆ ┆ ┆ │\n", + "│ Politician ┆ Hunt-Perki ┆ 053-54-860 ┆ 750 Cheryl ┆ … ┆ F ┆ 745 Thomas ┆ rfrederic ┆ 1953-10-1 │\n", + "│ 's ┆ ns ┆ 3 ┆ Highway ┆ ┆ ┆ Landing ┆ k@yahoo.c ┆ 7 │\n", + "│ assistant ┆ ┆ ┆ Kylecheste ┆ ┆ ┆ Toddfort, ┆ om ┆ │\n", + "│ ┆ ┆ ┆ r, … ┆ ┆ ┆ VA … ┆ ┆ │\n", + "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ Programmer ┆ Hall, ┆ 115-87-621 ┆ 5157 ┆ … ┆ M ┆ 45771 ┆ usmith@gm ┆ 2003-09-2 │\n", + "│ , systems ┆ Ramirez ┆ 4 ┆ Jackson ┆ ┆ ┆ Leslie ┆ ail.com ┆ 1 │\n", + "│ ┆ and ┆ ┆ Mills ┆ ┆ ┆ Light Apt. 
┆ ┆ │\n", + "│ ┆ Rosario ┆ ┆ Stevenberg ┆ ┆ ┆ 015 ┆ ┆ │\n", + "│ ┆ ┆ ┆ , P… ┆ ┆ ┆ West… ┆ ┆ │\n", + "│ Therapist, ┆ Weaver-Har ┆ 155-47-518 ┆ 23542 Ward ┆ … ┆ M ┆ 53202 ┆ heatherwe ┆ 1931-03-0 │\n", + "│ music ┆ rison ┆ 4 ┆ Corners ┆ ┆ ┆ Santiago ┆ aver@gmai ┆ 3 │\n", + "│ ┆ ┆ ┆ Suite 333 ┆ ┆ ┆ Course ┆ l.com ┆ │\n", + "│ ┆ ┆ ┆ Wes… ┆ ┆ ┆ Suite 707 ┆ ┆ │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ … ┆ ┆ │\n", + "│ Accounting ┆ Morgan, ┆ 381-63-002 ┆ 17309 ┆ … ┆ F ┆ 833 Peter ┆ vfernande ┆ 2004-12-0 │\n", + "│ technician ┆ Rice and ┆ 5 ┆ Connie ┆ ┆ ┆ Canyon ┆ z@gmail.c ┆ 5 │\n", + "│ ┆ Allen ┆ ┆ Junction ┆ ┆ ┆ Lake ┆ om ┆ │\n", + "│ ┆ ┆ ┆ Stronghave ┆ ┆ ┆ Andrew, ┆ ┆ │\n", + "│ ┆ ┆ ┆ … ┆ ┆ ┆ KS… ┆ ┆ │\n", + "│ Therapist, ┆ Johnson-Go ┆ 111-18-891 ┆ PSC 1743, ┆ … ┆ M ┆ 99243 ┆ barnesmar ┆ 1940-10-1 │\n", + "│ sports ┆ nzalez ┆ 8 ┆ Box 2548 ┆ ┆ ┆ Garcia ┆ tha@gmail ┆ 8 │\n", + "│ ┆ ┆ ┆ APO AP ┆ ┆ ┆ Turnpike ┆ .com ┆ │\n", + "│ ┆ ┆ ┆ 09352 ┆ ┆ ┆ Apt. 350 ┆ ┆ │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ A… ┆ ┆ │\n", + "└────────────┴────────────┴────────────┴────────────┴───┴─────┴────────────┴───────────┴───────────┘" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fake.seed_instance(2)\n", + "data = [fake.profile() for _ in range(10)]\n", + "data = pd.DataFrame(data)\n", + "\n", + "\n", + "pl.DataFrame(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad0c74ba-10fe-4228-af91-996d1b957938", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "PolarsEnv", + "language": "python", + "name": "plenv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Tutorials/03_ReadingData/01. 
"""
Helper functions used throughout the polars learning path.

The helpers keep the tutorial environment ready: they report and upgrade
``pip`` and make sure the ``Faker`` package is installed, always through the
interpreter that is running the notebook (``sys.executable``).
"""

import importlib
import re
import subprocess
import sys

from importlib.metadata import version, PackageNotFoundError

# ``pip --version`` prints e.g. "pip 24.0 from /path/to/pip (python 3.12)".
# The version token is captured up to the next whitespace, so two-component
# releases ("24.0") and pre-releases ("24.1b1") are recognised as well as the
# classic three-component form ("23.3.2").
_PIP_VERSION_RE = re.compile(r'pip (\d\S*)')

# ``pip index versions pip`` prints an indented "LATEST: <version>" line when
# the queried package is installed ...
_LATEST_LINE_RE = re.compile(r'^\s*LATEST:\s*(\S+)', re.MULTILINE)

# ... and always starts with a "pip (<latest version>)" header line.
_INDEX_HEADER_RE = re.compile(r'^pip \(([^)\s]+)\)', re.MULTILINE)


def parse_pip_version(output):
    """Extract the pip version from the text printed by ``pip --version``.

    Args:
        output: Text captured from ``python -m pip --version``.

    Returns:
        The version string (e.g. ``"23.3.2"`` or ``"24.0"``), or ``None`` when
        no version can be found in ``output``.
    """
    match = _PIP_VERSION_RE.search(output or "")
    return match.group(1) if match else None


def parse_latest_pip_version(output):
    """Extract the newest pip release from ``pip index versions pip`` output.

    The explicit ``LATEST:`` line is preferred; the ``pip (<version>)`` header
    is used as a fallback.

    Args:
        output: Text captured from ``python -m pip index versions pip``.

    Returns:
        The latest version string, or ``None`` when ``output`` is empty or has
        an unexpected shape (for example because the command failed).
    """
    text = output or ""
    match = _LATEST_LINE_RE.search(text) or _INDEX_HEADER_RE.search(text)
    return match.group(1) if match else None


def get_pip_version():
    """Return the version of pip installed for the running interpreter.

    Returns:
        The version string reported by ``python -m pip --version``, or ``None``
        if it cannot be determined.
    """
    result = subprocess.run([sys.executable, '-m', 'pip', '--version'],
                            capture_output=True, text=True)
    return parse_pip_version(result.stdout)


def update_pip():
    """Upgrade pip to the latest release if it is not already up to date.

    Progress is reported with ``print``; nothing is returned. The function
    gives up with a message, instead of raising, when either the installed
    or the latest version cannot be determined.
    """
    current_version = get_pip_version()
    if current_version is None:
        print("Unable to determine the current version of pip.")
        return

    # Ask the package index for the latest pip release. The command prints
    # nothing on stdout when it fails (no network, or a pip too old to have
    # the ``index`` sub-command), so the output must be parsed defensively.
    index_result = subprocess.run(
        [sys.executable, '-m', 'pip', 'index', 'versions', 'pip'],
        capture_output=True, text=True
    )
    latest_version = parse_latest_pip_version(index_result.stdout)
    if latest_version is None:
        print("Unable to determine the latest version of pip.")
        return

    if current_version == latest_version:
        print("pip is already up-to-date.")
        return

    try:
        subprocess.check_call(
            [sys.executable, '-m', 'pip', 'install', '--upgrade', 'pip'],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL
        )
    except subprocess.CalledProcessError:
        print("Failed to update pip.")
    else:
        print(f"pip has been successfully updated from {current_version} to {latest_version}.")


def install_faker():
    """Install the Faker package unless it is already available.

    If Faker is found in the current environment, its installed version is
    printed. Otherwise it is installed with pip through the running
    interpreter.

    Raises:
        subprocess.CalledProcessError: If the pip installation command exits
            with a non-zero status.
    """
    try:
        # Look up the installed distribution without importing the package.
        faker_version = version("Faker")
    except PackageNotFoundError:
        print("Faker is not installed. Installing Faker...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "Faker"])
        print("Faker has been successfully installed.")
    else:
        print(f"Faker is already installed. Version: {faker_version}")