diff --git a/lab-dw-data-structuring-and-combining.ipynb b/lab-dw-data-structuring-and-combining.ipynb new file mode 100644 index 00000000..8dcfce10 --- /dev/null +++ b/lab-dw-data-structuring-and-combining.ipynb @@ -0,0 +1,358 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "25d7736c-ba17-4aff-b6bb-66eba20fbf4e", + "metadata": { + "id": "25d7736c-ba17-4aff-b6bb-66eba20fbf4e" + }, + "source": [ + "# Lab | Data Structuring and Combining Data" + ] + }, + { + "cell_type": "markdown", + "id": "a2cdfc70-44c8-478c-81e7-2bc43fdf4986", + "metadata": { + "id": "a2cdfc70-44c8-478c-81e7-2bc43fdf4986" + }, + "source": [ + "## Challenge 1: Combining & Cleaning Data\n", + "\n", + "In this challenge, we will be working with the customer data from an insurance company, as we did in the two previous labs. The data can be found here:\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\n", + "\n", + "But this time, we got new data, which can be found in the following 2 CSV files located at the links below.\n", + "\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv\n", + "\n", + "Note that you'll need to clean and format the new data.\n", + "\n", + "Observation:\n", + "- One option is to first combine the three datasets and then apply the cleaning function to the new combined dataset\n", + "- Another option would be to read the clean file you saved in the previous lab, and just clean the two new files and concatenate the three clean datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "492d06e3-92c7-4105-ac72-536db98d3244", + "metadata": { + "id": "492d06e3-92c7-4105-ac72-536db98d3244" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Customer ST GENDER Education Customer Lifetime Value \\\n", + "0 RB50392 Washington Unknown Master Unknown \n", + "1 QZ44356 Arizona F Bachelor 
import pandas as pd

# The three CSV exports of the insurance customer data.
DATA_URLS = (
    'https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv',
    'https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv',
    'https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv',
)


def standardize_columns(frame):
    """Return a copy of ``frame`` with harmonised, snake_case column names.

    Headers are stripped, lower-cased and spaces are replaced by underscores.
    The abbreviated ``st`` header is renamed to ``state`` so that files using
    ``ST``/``GENDER`` line up with files using ``State``/``Gender`` instead of
    producing parallel, half-empty columns after concatenation.
    """
    renamed = frame.copy()
    renamed.columns = (
        renamed.columns.str.strip().str.lower().str.replace(' ', '_')
    )
    return renamed.rename(columns={'st': 'state'})


# Load the three CSV files (the raw frames stay available as file1..file3).
file1, file2, file3 = (pd.read_csv(url) for url in DATA_URLS)

# Align the schemas BEFORE concatenating; concatenating first is what created
# the duplicated ST/State and GENDER/Gender columns.
combined_data = pd.concat(
    [standardize_columns(frame) for frame in (file1, file2, file3)],
    ignore_index=True,
)

# Drop rows that carry no data at all, then exact duplicates, and rebuild a
# contiguous index so positional and label access agree again.
combined_data = (
    combined_data
    .dropna(how='all')
    .drop_duplicates()
    .reset_index(drop=True)
)

# Only label missing values in non-numeric columns; writing the string
# 'Unknown' into numeric columns would turn them into mixed-type columns.
text_columns = combined_data.select_dtypes(exclude='number').columns
combined_data[text_columns] = combined_data[text_columns].fillna('Unknown')

# Show the first rows of the combined, cleaned dataset.
print(combined_data.head())
def coalesce_duplicate_columns(frame, placeholder='Unknown'):
    """Merge columns that share a name into one column per name.

    For every duplicated name the first value in each row that is neither
    missing nor equal to ``placeholder`` is kept; rows with no real value get
    ``placeholder``. Columns with a unique name are passed through untouched.
    """
    if not frame.columns.duplicated().any():
        return frame
    merged = {}
    for name in dict.fromkeys(frame.columns):  # unique names, original order
        same_name = frame.loc[:, frame.columns == name]
        if same_name.shape[1] == 1:
            merged[name] = same_name.iloc[:, 0]
            continue
        # Treat the placeholder as missing so a real value in a later
        # duplicate column can fill the gap.
        merged[name] = (
            same_name.mask(same_name == placeholder)
            .bfill(axis=1)
            .iloc[:, 0]
            .fillna(placeholder)
        )
    return pd.DataFrame(merged)


# Clean and standardise the column names. `regex=True` is required: without it
# recent pandas versions treat the pattern as literal text and strip nothing.
combined_data.columns = (
    combined_data.columns.str.strip()
    .str.lower()
    .str.replace(' ', '_')
    .str.replace(r'[^a-z0-9_]', '', regex=True)
)

# 'ST' and 'State' (and 'GENDER'/'Gender') describe the same field; give them
# one name and merge them instead of keeping parallel half-filled columns.
combined_data = combined_data.rename(columns={'st': 'state'})
combined_data = coalesce_duplicate_columns(combined_data)

# Customer lifetime value is stored as text with a trailing '%'. Going through
# `astype(str)` keeps cells that are already numeric instead of turning them
# into NaN, which is what the `.str` accessor does with non-string cells.
clv_text = (
    combined_data['customer_lifetime_value']
    .astype(str)
    .str.replace('%', '', regex=False)
)
combined_data['customer_lifetime_value'] = pd.to_numeric(clv_text, errors='coerce')

# Plain numeric columns: anything unparseable (e.g. 'Unknown') becomes NaN.
for numeric_column in ('income', 'monthly_premium_auto', 'total_claim_amount'):
    combined_data[numeric_column] = pd.to_numeric(
        combined_data[numeric_column], errors='coerce'
    )

# Complaints show up as text such as '1/0/00', which a direct numeric
# conversion turns into NaN for every row.
# NOTE(review): presumably the middle field is the complaint count - confirm
# against the data description before relying on it.
complaints_text = combined_data['number_of_open_complaints'].astype(str)
middle_field = complaints_text.str.extract(r'^\d+/(\d+)/\d+$', expand=False)
combined_data['number_of_open_complaints'] = pd.to_numeric(
    middle_field.fillna(complaints_text), errors='coerce'
)

# Show the first rows of the cleaned DataFrame.
print(combined_data.head())
# Challenge 2 is defined on the marketing dataset, not on the combined
# file1/file2/file3 frame, so load it explicitly.
MARKETING_URL = (
    'https://raw.githubusercontent.com/data-bootcamp-v4/data/main/'
    'marketing_customer_analysis_clean.csv'
)

marketing_data = pd.read_csv(MARKETING_URL)
marketing_data.columns = (
    marketing_data.columns.str.strip().str.lower().str.replace(' ', '_')
)

# NOTE(review): the column names below are assumed from the lab description
# (sales channel, policy type, month, ...) - confirm against the actual file.
# If there is no ready-made month column, derive it from the policy date.
if 'month' not in marketing_data.columns and 'effective_to_date' in marketing_data.columns:
    marketing_data['month'] = pd.to_datetime(
        marketing_data['effective_to_date'], errors='coerce'
    ).dt.month

required_columns = {
    'sales_channel', 'total_claim_amount', 'customer_lifetime_value',
    'gender', 'education', 'policy_type', 'month', 'number_of_open_complaints',
}
missing_columns = required_columns - set(marketing_data.columns)
if missing_columns:
    # Fail loudly instead of silently producing an empty or misleading table.
    raise KeyError(f"marketing dataset is missing columns: {sorted(missing_columns)}")

# 1. Total revenue per sales channel, rounded to 2 decimals. The grouping key
#    must be the sales channel (the original cell grouped by policy type).
#    `total_claim_amount` is kept as the revenue measure, as in the original.
sales_channel_revenue = marketing_data.pivot_table(
    values='total_claim_amount',
    index='sales_channel',
    aggfunc='sum',
).round(2)

# Show the summary table.
print(sales_channel_revenue)

# 2. Average customer lifetime value per gender and education level.
avg_clv_pivot = marketing_data.pivot_table(
    values='customer_lifetime_value',
    index='gender',
    columns='education',
    aggfunc='mean',
).round(2)

# Show the summary table.
print(avg_clv_pivot)

# Bonus: number of complaints by policy type and month, in long format.
# The month comes from the dataset's date information; customer IDs are not
# dates, and parsing them as dates yields an empty result.
complaints_summary = (
    marketing_data
    .groupby(['policy_type', 'month'])['number_of_open_complaints']
    .sum()
    .reset_index()
)

# Rename the columns for readability.
complaints_summary.columns = ['Policy Type', 'Month', 'Number of Complaints']

# Show the long-format table.
print(complaints_summary)
# 1. Define a list called `products` that contains the following items:
#    "t-shirt", "mug", "hat", "book", "keychain".
products = ["t-shirt", "mug", "hat", "book", "keychain"]

# 2. Create an empty dictionary called `inventory`.
inventory = {}

# 3. Ask the user for the quantity of each product and store it under the
#    product name. The assignment into `inventory` is the step the original
#    loop was missing, which left the dictionary empty.
for product in products:
    while True:
        raw_quantity = input(f"Enter the quantity for {product}: ")
        try:
            quantity = int(raw_quantity)
        except ValueError:
            # Re-prompt instead of aborting the whole cell on a typo.
            print("Please enter a whole number.")
            continue
        if quantity < 0:
            print("The quantity cannot be negative.")
            continue
        break
    inventory[product] = quantity

# 4. Create an empty set called `customer_orders`.
customer_orders = set()

# 5. Ask the user to input the name of three products that a customer wants to
#    order (from those in the `products` list) and add each product name to
#    the `customer_orders` set.
# Collect three distinct products from the catalogue. A set silently ignores
# repeated names, so keep asking until three different valid products are in.
while len(customer_orders) < 3:
    order = input("Enter the product you want to order: ").strip().lower()
    if order not in products:
        print(f"'{order}' is not available. Choose from: {', '.join(products)}")
        continue
    if order in customer_orders:
        print(f"'{order}' has already been ordered.")
        continue
    customer_orders.add(order)
print(customer_orders)

# Order statistics: how many distinct products were ordered and which share
# of the catalogue that represents.
total_products_ordered = len(customer_orders)
percentage_ordered = (total_products_ordered / len(products)) * 100

# Keep both figures together in a tuple: (total, percentage).
order_status = (total_products_ordered, percentage_ordered)

print("Order Statistics:")
print(f"Total Products Ordered: {total_products_ordered}")
print(f"Percentage of Products Ordered: {percentage_ordered}%")
# Stock levels before the order is applied, keyed by product name.
inventory = {
    't-shirt': 1,
    'mug': 2,
    'hat': 3,
    'book': 4,
    'keychain': 5,
}
products_to_subtract = ['mug', 'hat', 'book']

# Take one unit off every listed product that actually exists in the
# inventory; names that are not stocked are skipped.
for product in (name for name in products_to_subtract if name in inventory):
    inventory[product] = inventory[product] - 1
print(inventory)

# Print the updated inventory, one "product quantity" pair per line.
for product, quantity in inventory.items():
    print(f"{product} {quantity}")