diff --git a/exercicios/para-casa/Rafaella-Fiel/tarefa.ipynb b/exercicios/para-casa/Rafaella-Fiel/tarefa.ipynb new file mode 100644 index 0000000..3a5aa5d --- /dev/null +++ b/exercicios/para-casa/Rafaella-Fiel/tarefa.ipynb @@ -0,0 +1,2379 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Importando as planilhas" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idcustomer_unique_idzip_code_prefixcustomer_citycustomer_state
006b8999e2fba1a1fbc88172c00ba8bc7861eff4711a542e4b93843c6dd7febb014409francaSP
118955e83d337fd6b2def6b18a428ac77290c77bc529b7ac935b93aa66c333dc39790sao bernardo do campoSP
24e7b3e00288586ebd08712fdd0374a03060e732b5b29e8181a18229c7b0b2b5e1151sao pauloSP
3b2b6027bc5c5109e529d4dc6358b12c3259dac757896d24d7702b9acbbff3f3c8775mogi das cruzesSP
44f2d8ab171c80ec8364f7c12e35b23ad345ecd01c38d18a9036ed96c73b8d06613056campinasSP
..................
9943617ddf5dd5d51696bb3d7c6291687be6f1a29b476fee25c95fbafc67c5ac95cf83937sao pauloSP
99437e7b71a9017aa05c9a7fd292d714858e8d52a67c98be1cf6a5c84435bd38d095d6764taboao da serraSP
994385e28dfe12db7fb50a4b2f691faecea5ee9f50caf99f032f0bf3c55141f019d9960115fortalezaCE
9943956b18e2166679b8a959d72dd06da27f973c2643a0a458b49f58cea58833b192e92120canoasRS
99440274fa6071e5e17fe303b9748641082c884732c5050c01db9b23e19ba398993986703cotiaSP
\n", + "

99441 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " customer_id customer_unique_id \\\n", + "0 06b8999e2fba1a1fbc88172c00ba8bc7 861eff4711a542e4b93843c6dd7febb0 \n", + "1 18955e83d337fd6b2def6b18a428ac77 290c77bc529b7ac935b93aa66c333dc3 \n", + "2 4e7b3e00288586ebd08712fdd0374a03 060e732b5b29e8181a18229c7b0b2b5e \n", + "3 b2b6027bc5c5109e529d4dc6358b12c3 259dac757896d24d7702b9acbbff3f3c \n", + "4 4f2d8ab171c80ec8364f7c12e35b23ad 345ecd01c38d18a9036ed96c73b8d066 \n", + "... ... ... \n", + "99436 17ddf5dd5d51696bb3d7c6291687be6f 1a29b476fee25c95fbafc67c5ac95cf8 \n", + "99437 e7b71a9017aa05c9a7fd292d714858e8 d52a67c98be1cf6a5c84435bd38d095d \n", + "99438 5e28dfe12db7fb50a4b2f691faecea5e e9f50caf99f032f0bf3c55141f019d99 \n", + "99439 56b18e2166679b8a959d72dd06da27f9 73c2643a0a458b49f58cea58833b192e \n", + "99440 274fa6071e5e17fe303b9748641082c8 84732c5050c01db9b23e19ba39899398 \n", + "\n", + " zip_code_prefix customer_city customer_state \n", + "0 14409 franca SP \n", + "1 9790 sao bernardo do campo SP \n", + "2 1151 sao paulo SP \n", + "3 8775 mogi das cruzes SP \n", + "4 13056 campinas SP \n", + "... ... ... ... \n", + "99436 3937 sao paulo SP \n", + "99437 6764 taboao da serra SP \n", + "99438 60115 fortaleza CE \n", + "99439 92120 canoas RS \n", + "99440 6703 cotia SP \n", + "\n", + "[99441 rows x 5 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_consumidor = pd.read_csv(\"olist_customers_dataset.csv\")\n", + "df_consumidor" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
zip_code_prefixgeolocation_latgeolocation_lnggeolocation_citygeolocation_state
01037-23.545621-46.639292sao pauloSP
11046-23.546081-46.644820sao pauloSP
21046-23.546129-46.642951sao pauloSP
31041-23.544392-46.639499sao pauloSP
41035-23.541578-46.641607sao pauloSP
..................
100015899950-28.068639-52.010705tapejaraRS
100015999900-27.877125-52.224882getulio vargasRS
100016099950-28.071855-52.014716tapejaraRS
100016199980-28.388932-51.846871david canabarroRS
100016299950-28.070104-52.018658tapejaraRS
\n", + "

1000163 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " zip_code_prefix geolocation_lat geolocation_lng geolocation_city \\\n", + "0 1037 -23.545621 -46.639292 sao paulo \n", + "1 1046 -23.546081 -46.644820 sao paulo \n", + "2 1046 -23.546129 -46.642951 sao paulo \n", + "3 1041 -23.544392 -46.639499 sao paulo \n", + "4 1035 -23.541578 -46.641607 sao paulo \n", + "... ... ... ... ... \n", + "1000158 99950 -28.068639 -52.010705 tapejara \n", + "1000159 99900 -27.877125 -52.224882 getulio vargas \n", + "1000160 99950 -28.071855 -52.014716 tapejara \n", + "1000161 99980 -28.388932 -51.846871 david canabarro \n", + "1000162 99950 -28.070104 -52.018658 tapejara \n", + "\n", + " geolocation_state \n", + "0 SP \n", + "1 SP \n", + "2 SP \n", + "3 SP \n", + "4 SP \n", + "... ... \n", + "1000158 RS \n", + "1000159 RS \n", + "1000160 RS \n", + "1000161 RS \n", + "1000162 RS \n", + "\n", + "[1000163 rows x 5 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_localizacao = pd.read_csv(\"olist_geolocation_dataset.csv\")\n", + "df_localizacao" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
seller_idzip_code_prefixseller_cityseller_state
03442f8959a84dea7ee197c632cb2df1513023campinasSP
1d1b65fc7debc3361ea86b5f14c68d2e213844mogi guacuSP
2ce3ad9de960102d0677a81f5d0bb7b2d20031rio de janeiroRJ
3c0f3eea2e14555b6faeea3dd58c1b1c34195sao pauloSP
451a04a8a6bdcb23deccc82b0b80742cf12914braganca paulistaSP
...............
309098dddbc4601dd4443ca174359b23716687111sarandiPR
3091f8201cab383e484733266d1906e2fdfa88137palhocaSC
309274871d19219c7d518d0090283e03c1374650sao pauloSP
3093e603cf3fec55f8697c9059638d6c8eb596080pelotasRS
30949e25199f6ef7e7c347120ff175652c3b12051taubateSP
\n", + "

3095 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " seller_id zip_code_prefix seller_city \\\n", + "0 3442f8959a84dea7ee197c632cb2df15 13023 campinas \n", + "1 d1b65fc7debc3361ea86b5f14c68d2e2 13844 mogi guacu \n", + "2 ce3ad9de960102d0677a81f5d0bb7b2d 20031 rio de janeiro \n", + "3 c0f3eea2e14555b6faeea3dd58c1b1c3 4195 sao paulo \n", + "4 51a04a8a6bdcb23deccc82b0b80742cf 12914 braganca paulista \n", + "... ... ... ... \n", + "3090 98dddbc4601dd4443ca174359b237166 87111 sarandi \n", + "3091 f8201cab383e484733266d1906e2fdfa 88137 palhoca \n", + "3092 74871d19219c7d518d0090283e03c137 4650 sao paulo \n", + "3093 e603cf3fec55f8697c9059638d6c8eb5 96080 pelotas \n", + "3094 9e25199f6ef7e7c347120ff175652c3b 12051 taubate \n", + "\n", + " seller_state \n", + "0 SP \n", + "1 SP \n", + "2 RJ \n", + "3 SP \n", + "4 SP \n", + "... ... \n", + "3090 PR \n", + "3091 SC \n", + "3092 SP \n", + "3093 RS \n", + "3094 SP \n", + "\n", + "[3095 rows x 4 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_vendedores = pd.read_csv(\"olist_sellers_dataset.csv\")\n", + "df_vendedores" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Limpando e tratando os dados" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 99441 entries, 0 to 99440\n", + "Data columns (total 5 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 customer_id 99441 non-null object\n", + " 1 customer_unique_id 99441 non-null object\n", + " 2 zip_code_prefix 99441 non-null int64 \n", + " 3 customer_city 99441 non-null object\n", + " 4 customer_state 99441 non-null object\n", + "dtypes: int64(1), object(4)\n", + "memory usage: 3.8+ MB\n" + ] + } + ], + "source": [ + "df_consumidor.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idzip_code_prefixcustomer_citycustomer_state
006b8999e2fba1a1fbc88172c00ba8bc714409francaSP
118955e83d337fd6b2def6b18a428ac779790sao bernardo do campoSP
24e7b3e00288586ebd08712fdd0374a031151sao pauloSP
3b2b6027bc5c5109e529d4dc6358b12c38775mogi das cruzesSP
44f2d8ab171c80ec8364f7c12e35b23ad13056campinasSP
...............
9943617ddf5dd5d51696bb3d7c6291687be6f3937sao pauloSP
99437e7b71a9017aa05c9a7fd292d714858e86764taboao da serraSP
994385e28dfe12db7fb50a4b2f691faecea5e60115fortalezaCE
9943956b18e2166679b8a959d72dd06da27f992120canoasRS
99440274fa6071e5e17fe303b9748641082c86703cotiaSP
\n", + "

99441 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " customer_id zip_code_prefix \\\n", + "0 06b8999e2fba1a1fbc88172c00ba8bc7 14409 \n", + "1 18955e83d337fd6b2def6b18a428ac77 9790 \n", + "2 4e7b3e00288586ebd08712fdd0374a03 1151 \n", + "3 b2b6027bc5c5109e529d4dc6358b12c3 8775 \n", + "4 4f2d8ab171c80ec8364f7c12e35b23ad 13056 \n", + "... ... ... \n", + "99436 17ddf5dd5d51696bb3d7c6291687be6f 3937 \n", + "99437 e7b71a9017aa05c9a7fd292d714858e8 6764 \n", + "99438 5e28dfe12db7fb50a4b2f691faecea5e 60115 \n", + "99439 56b18e2166679b8a959d72dd06da27f9 92120 \n", + "99440 274fa6071e5e17fe303b9748641082c8 6703 \n", + "\n", + " customer_city customer_state \n", + "0 franca SP \n", + "1 sao bernardo do campo SP \n", + "2 sao paulo SP \n", + "3 mogi das cruzes SP \n", + "4 campinas SP \n", + "... ... ... \n", + "99436 sao paulo SP \n", + "99437 taboao da serra SP \n", + "99438 fortaleza CE \n", + "99439 canoas RS \n", + "99440 cotia SP \n", + "\n", + "[99441 rows x 4 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# errors=\"ignore\" keeps this cell re-runnable: df_consumidor is reassigned here,\n", + "# so a second run would otherwise raise KeyError on the already-dropped column.\n", + "df_consumidor = df_consumidor.drop(columns=\"customer_unique_id\", errors=\"ignore\")\n", + "df_consumidor" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "customer_id 0\n", + "zip_code_prefix 0\n", + "customer_city 0\n", + "customer_state 0\n", + "dtype: int64" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_consumidor.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idzip_code_prefixcustomer_citycustomer_state
count9944199441.0000009944199441
unique99441NaN411927
top06b8999e2fba1a1fbc88172c00ba8bc7NaNsao pauloSP
freq1NaN1554041746
meanNaN35137.474583NaNNaN
stdNaN29797.938996NaNNaN
minNaN1003.000000NaNNaN
25%NaN11347.000000NaNNaN
50%NaN24416.000000NaNNaN
75%NaN58900.000000NaNNaN
maxNaN99990.000000NaNNaN
\n", + "
" + ], + "text/plain": [ + " customer_id zip_code_prefix customer_city \\\n", + "count 99441 99441.000000 99441 \n", + "unique 99441 NaN 4119 \n", + "top 06b8999e2fba1a1fbc88172c00ba8bc7 NaN sao paulo \n", + "freq 1 NaN 15540 \n", + "mean NaN 35137.474583 NaN \n", + "std NaN 29797.938996 NaN \n", + "min NaN 1003.000000 NaN \n", + "25% NaN 11347.000000 NaN \n", + "50% NaN 24416.000000 NaN \n", + "75% NaN 58900.000000 NaN \n", + "max NaN 99990.000000 NaN \n", + "\n", + " customer_state \n", + "count 99441 \n", + "unique 27 \n", + "top SP \n", + "freq 41746 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_consumidor.describe(include=\"all\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 1000163 entries, 0 to 1000162\n", + "Data columns (total 5 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 zip_code_prefix 1000163 non-null int64 \n", + " 1 geolocation_lat 1000163 non-null float64\n", + " 2 geolocation_lng 1000163 non-null float64\n", + " 3 geolocation_city 1000163 non-null object \n", + " 4 geolocation_state 1000163 non-null object \n", + "dtypes: float64(2), int64(1), object(2)\n", + "memory usage: 38.2+ MB\n" + ] + } + ], + "source": [ + "df_localizacao.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "zip_code_prefix 0\n", + "geolocation_lat 0\n", + "geolocation_lng 0\n", + "geolocation_city 0\n", + "geolocation_state 0\n", + "dtype: int64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_localizacao.isnull().sum()" + ] + }, + { + "cell_type": "code", + 
"execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
zip_code_prefixgeolocation_latgeolocation_lnggeolocation_citygeolocation_state
count1.000163e+061.000163e+061.000163e+0610001631000163
uniqueNaNNaNNaN801127
topNaNNaNNaNsao pauloSP
freqNaNNaNNaN135800404268
mean3.657417e+04-2.117615e+01-4.639054e+01NaNNaN
std3.054934e+045.715866e+004.269748e+00NaNNaN
min1.001000e+03-3.660537e+01-1.014668e+02NaNNaN
25%1.107500e+04-2.360355e+01-4.857317e+01NaNNaN
50%2.653000e+04-2.291938e+01-4.663788e+01NaNNaN
75%6.350400e+04-1.997962e+01-4.376771e+01NaNNaN
max9.999000e+044.506593e+011.211054e+02NaNNaN
\n", + "
" + ], + "text/plain": [ + " zip_code_prefix geolocation_lat geolocation_lng geolocation_city \\\n", + "count 1.000163e+06 1.000163e+06 1.000163e+06 1000163 \n", + "unique NaN NaN NaN 8011 \n", + "top NaN NaN NaN sao paulo \n", + "freq NaN NaN NaN 135800 \n", + "mean 3.657417e+04 -2.117615e+01 -4.639054e+01 NaN \n", + "std 3.054934e+04 5.715866e+00 4.269748e+00 NaN \n", + "min 1.001000e+03 -3.660537e+01 -1.014668e+02 NaN \n", + "25% 1.107500e+04 -2.360355e+01 -4.857317e+01 NaN \n", + "50% 2.653000e+04 -2.291938e+01 -4.663788e+01 NaN \n", + "75% 6.350400e+04 -1.997962e+01 -4.376771e+01 NaN \n", + "max 9.999000e+04 4.506593e+01 1.211054e+02 NaN \n", + "\n", + " geolocation_state \n", + "count 1000163 \n", + "unique 27 \n", + "top SP \n", + "freq 404268 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_localizacao.describe(include=\"all\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 3095 entries, 0 to 3094\n", + "Data columns (total 4 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 seller_id 3095 non-null object\n", + " 1 zip_code_prefix 3095 non-null int64 \n", + " 2 seller_city 3095 non-null object\n", + " 3 seller_state 3095 non-null object\n", + "dtypes: int64(1), object(3)\n", + "memory usage: 96.8+ KB\n" + ] + } + ], + "source": [ + "df_vendedores.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "seller_id 0\n", + "zip_code_prefix 0\n", + "seller_city 0\n", + "seller_state 0\n", + "dtype: int64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"df_vendedores.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
seller_idzip_code_prefixseller_cityseller_state
count30953095.00000030953095
unique3095NaN61123
top3442f8959a84dea7ee197c632cb2df15NaNsao pauloSP
freq1NaN6941849
meanNaN32291.059451NaNNaN
stdNaN32713.453830NaNNaN
minNaN1001.000000NaNNaN
25%NaN7093.500000NaNNaN
50%NaN14940.000000NaNNaN
75%NaN64552.500000NaNNaN
maxNaN99730.000000NaNNaN
\n", + "
" + ], + "text/plain": [ + " seller_id zip_code_prefix seller_city \\\n", + "count 3095 3095.000000 3095 \n", + "unique 3095 NaN 611 \n", + "top 3442f8959a84dea7ee197c632cb2df15 NaN sao paulo \n", + "freq 1 NaN 694 \n", + "mean NaN 32291.059451 NaN \n", + "std NaN 32713.453830 NaN \n", + "min NaN 1001.000000 NaN \n", + "25% NaN 7093.500000 NaN \n", + "50% NaN 14940.000000 NaN \n", + "75% NaN 64552.500000 NaN \n", + "max NaN 99730.000000 NaN \n", + "\n", + " seller_state \n", + "count 3095 \n", + "unique 23 \n", + "top SP \n", + "freq 1849 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_vendedores.describe(include=\"all\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Analisando os dados" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "customer_state\n", + "SP 41746\n", + "RJ 12852\n", + "MG 11635\n", + "RS 5466\n", + "PR 5045\n", + "SC 3637\n", + "BA 3380\n", + "DF 2140\n", + "ES 2033\n", + "GO 2020\n", + "PE 1652\n", + "CE 1336\n", + "PA 975\n", + "MT 907\n", + "MA 747\n", + "MS 715\n", + "PB 536\n", + "PI 495\n", + "RN 485\n", + "AL 413\n", + "SE 350\n", + "TO 280\n", + "RO 253\n", + "AM 148\n", + "AC 81\n", + "AP 68\n", + "RR 46\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Quais estados mais aparecem para consumidores? 
E para vendedores?\n", + "estado_cons = df_consumidor[\"customer_state\"].value_counts()\n", + "estado_cons" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "customer_state\n", + "SP 0.419807\n", + "RJ 0.129242\n", + "MG 0.117004\n", + "RS 0.054967\n", + "PR 0.050734\n", + "SC 0.036574\n", + "BA 0.033990\n", + "DF 0.021520\n", + "ES 0.020444\n", + "GO 0.020314\n", + "PE 0.016613\n", + "CE 0.013435\n", + "PA 0.009805\n", + "MT 0.009121\n", + "MA 0.007512\n", + "MS 0.007190\n", + "PB 0.005390\n", + "PI 0.004978\n", + "RN 0.004877\n", + "AL 0.004153\n", + "SE 0.003520\n", + "TO 0.002816\n", + "RO 0.002544\n", + "AM 0.001488\n", + "AC 0.000815\n", + "AP 0.000684\n", + "RR 0.000463\n", + "Name: proportion, dtype: float64" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Dados em percentual\n", + "estado_cons_perc = df_consumidor[\"customer_state\"].value_counts(normalize=True)\n", + "estado_cons_perc" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "seller_state\n", + "SP 1849\n", + "PR 349\n", + "MG 244\n", + "SC 190\n", + "RJ 171\n", + "RS 129\n", + "GO 40\n", + "DF 30\n", + "ES 23\n", + "BA 19\n", + "CE 13\n", + "PE 9\n", + "PB 6\n", + "RN 5\n", + "MS 5\n", + "MT 4\n", + "RO 2\n", + "SE 2\n", + "PI 1\n", + "AC 1\n", + "MA 1\n", + "AM 1\n", + "PA 1\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "estado_vend = df_vendedores[\"seller_state\"].value_counts()\n", + "estado_vend" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "seller_state\n", + "SP 0.597415\n", + "PR 0.112763\n", + "MG 0.078837\n", + "SC 0.061389\n", + "RJ 0.055250\n", + "RS 0.041680\n", + "GO 
0.012924\n", + "DF 0.009693\n", + "ES 0.007431\n", + "BA 0.006139\n", + "CE 0.004200\n", + "PE 0.002908\n", + "PB 0.001939\n", + "RN 0.001616\n", + "MS 0.001616\n", + "MT 0.001292\n", + "RO 0.000646\n", + "SE 0.000646\n", + "PI 0.000323\n", + "AC 0.000323\n", + "MA 0.000323\n", + "AM 0.000323\n", + "PA 0.000323\n", + "Name: proportion, dtype: float64" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Dados em percentual\n", + "estado_vend_perc = df_vendedores[\"seller_state\"].value_counts(normalize=True)\n", + "estado_vend_perc" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "geolocation_city\n", + "sao paulo 135800\n", + "rio de janeiro 62151\n", + "belo horizonte 27805\n", + "são paulo 24918\n", + "curitiba 16593\n", + " ... \n", + "jacuípe 1\n", + "mar vermelho 1\n", + "quebrangulo 1\n", + "poço das trincheiras 1\n", + "poxim 1\n", + "Name: count, Length: 8011, dtype: int64" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Quais as cidades que mais aparecem?\n", + "cidades = df_localizacao[\"geolocation_city\"].value_counts()\n", + "cidades" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Juntando os dados" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idzip_code_prefixcustomer_citycustomer_stategeolocation_latgeolocation_lnggeolocation_citygeolocation_state
006b8999e2fba1a1fbc88172c00ba8bc714409francaSP-20.509897-47.397866francaSP
106b8999e2fba1a1fbc88172c00ba8bc714409francaSP-20.497396-47.399241francaSP
206b8999e2fba1a1fbc88172c00ba8bc714409francaSP-20.510459-47.399553francaSP
306b8999e2fba1a1fbc88172c00ba8bc714409francaSP-20.480940-47.394161francaSP
406b8999e2fba1a1fbc88172c00ba8bc714409francaSP-20.515413-47.398194francaSP
...........................
15083450d9110683c7a282144e9fc97660026a2874980aparecida de goianiaGO-16.821866-49.244027aparecida de goianiaGO
15083451d9110683c7a282144e9fc97660026a2874980aparecida de goianiaGO-16.821866-49.244027aparecida de goianiaGO
15083452d9110683c7a282144e9fc97660026a2874980aparecida de goianiaGO-16.822945-49.244615aparecida de goianiaGO
150834536fb4f2354f36e554ac80141e9128f52899043passo fundoRS-28.226596-52.467505passo fundoRS
150834546fb4f2354f36e554ac80141e9128f52899043passo fundoRS-28.226596-52.467505passo fundoRS
\n", + "

15083455 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " customer_id zip_code_prefix \\\n", + "0 06b8999e2fba1a1fbc88172c00ba8bc7 14409 \n", + "1 06b8999e2fba1a1fbc88172c00ba8bc7 14409 \n", + "2 06b8999e2fba1a1fbc88172c00ba8bc7 14409 \n", + "3 06b8999e2fba1a1fbc88172c00ba8bc7 14409 \n", + "4 06b8999e2fba1a1fbc88172c00ba8bc7 14409 \n", + "... ... ... \n", + "15083450 d9110683c7a282144e9fc97660026a28 74980 \n", + "15083451 d9110683c7a282144e9fc97660026a28 74980 \n", + "15083452 d9110683c7a282144e9fc97660026a28 74980 \n", + "15083453 6fb4f2354f36e554ac80141e9128f528 99043 \n", + "15083454 6fb4f2354f36e554ac80141e9128f528 99043 \n", + "\n", + " customer_city customer_state geolocation_lat \\\n", + "0 franca SP -20.509897 \n", + "1 franca SP -20.497396 \n", + "2 franca SP -20.510459 \n", + "3 franca SP -20.480940 \n", + "4 franca SP -20.515413 \n", + "... ... ... ... \n", + "15083450 aparecida de goiania GO -16.821866 \n", + "15083451 aparecida de goiania GO -16.821866 \n", + "15083452 aparecida de goiania GO -16.822945 \n", + "15083453 passo fundo RS -28.226596 \n", + "15083454 passo fundo RS -28.226596 \n", + "\n", + " geolocation_lng geolocation_city geolocation_state \n", + "0 -47.397866 franca SP \n", + "1 -47.399241 franca SP \n", + "2 -47.399553 franca SP \n", + "3 -47.394161 franca SP \n", + "4 -47.398194 franca SP \n", + "... ... ... ... \n", + "15083450 -49.244027 aparecida de goiania GO \n", + "15083451 -49.244027 aparecida de goiania GO \n", + "15083452 -49.244615 aparecida de goiania GO \n", + "15083453 -52.467505 passo fundo RS \n", + "15083454 -52.467505 passo fundo RS \n", + "\n", + "[15083455 rows x 8 columns]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# The geolocation table holds many coordinate samples per zip-code prefix, so\n", + "# joining it directly repeats every customer once per sample (many-to-many)\n", + "# and inflates every count computed from the result.\n", + "# Keep a single geolocation row per prefix (the first one) before joining.\n", + "df_localizacao_unica = df_localizacao.drop_duplicates(subset=\"zip_code_prefix\")\n", + "\n", + "# validate=\"many_to_one\" makes pandas raise if the right side ever stops being\n", + "# unique per prefix, instead of silently multiplying rows again.\n", + "df_juncao = pd.merge(\n", + "    df_consumidor,\n", + "    df_localizacao_unica,\n", + "    on=\"zip_code_prefix\",\n", + "    validate=\"many_to_one\",\n", + ")\n", + "df_juncao" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idzip_code_prefixcustomer_citycustomer_stategeolocation_latgeolocation_lnggeolocation_citygeolocation_stateseller_idseller_cityseller_state
04f2d8ab171c80ec8364f7c12e35b23ad13056campinasSP-22.987222-47.151073campinasSP8d46553a36e68f95350a200c12f8f2e2campinasSP
14f2d8ab171c80ec8364f7c12e35b23ad13056campinasSP-22.964194-47.146534campinasSP8d46553a36e68f95350a200c12f8f2e2campinasSP
24f2d8ab171c80ec8364f7c12e35b23ad13056campinasSP-22.961982-47.146298campinasSP8d46553a36e68f95350a200c12f8f2e2campinasSP
34f2d8ab171c80ec8364f7c12e35b23ad13056campinasSP-22.968059-47.147139campinasSP8d46553a36e68f95350a200c12f8f2e2campinasSP
44f2d8ab171c80ec8364f7c12e35b23ad13056campinasSP-22.977905-47.145693campinasSP8d46553a36e68f95350a200c12f8f2e2campinasSP
....................................
114925107fb62e47282b83f023cf1fef8d8309c94319sao pauloSP-23.646627-46.638771sao pauloSPab91571efab27993ff2f6b36e38055c3sao pauloSP
114925117fb62e47282b83f023cf1fef8d8309c94319sao pauloSP-23.647430-46.636954sao pauloSPab91571efab27993ff2f6b36e38055c3sao pauloSP
114925127fb62e47282b83f023cf1fef8d8309c94319sao pauloSP-23.647765-46.635971sao pauloSPab91571efab27993ff2f6b36e38055c3sao pauloSP
114925137fb62e47282b83f023cf1fef8d8309c94319sao pauloSP-23.647430-46.636954sao pauloSPab91571efab27993ff2f6b36e38055c3sao pauloSP
114925147fb62e47282b83f023cf1fef8d8309c94319sao pauloSP-23.647471-46.636862são pauloSPab91571efab27993ff2f6b36e38055c3sao pauloSP
\n", + "

11492515 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " customer_id zip_code_prefix customer_city \\\n", + "0 4f2d8ab171c80ec8364f7c12e35b23ad 13056 campinas \n", + "1 4f2d8ab171c80ec8364f7c12e35b23ad 13056 campinas \n", + "2 4f2d8ab171c80ec8364f7c12e35b23ad 13056 campinas \n", + "3 4f2d8ab171c80ec8364f7c12e35b23ad 13056 campinas \n", + "4 4f2d8ab171c80ec8364f7c12e35b23ad 13056 campinas \n", + "... ... ... ... \n", + "11492510 7fb62e47282b83f023cf1fef8d8309c9 4319 sao paulo \n", + "11492511 7fb62e47282b83f023cf1fef8d8309c9 4319 sao paulo \n", + "11492512 7fb62e47282b83f023cf1fef8d8309c9 4319 sao paulo \n", + "11492513 7fb62e47282b83f023cf1fef8d8309c9 4319 sao paulo \n", + "11492514 7fb62e47282b83f023cf1fef8d8309c9 4319 sao paulo \n", + "\n", + " customer_state geolocation_lat geolocation_lng geolocation_city \\\n", + "0 SP -22.987222 -47.151073 campinas \n", + "1 SP -22.964194 -47.146534 campinas \n", + "2 SP -22.961982 -47.146298 campinas \n", + "3 SP -22.968059 -47.147139 campinas \n", + "4 SP -22.977905 -47.145693 campinas \n", + "... ... ... ... ... \n", + "11492510 SP -23.646627 -46.638771 sao paulo \n", + "11492511 SP -23.647430 -46.636954 sao paulo \n", + "11492512 SP -23.647765 -46.635971 sao paulo \n", + "11492513 SP -23.647430 -46.636954 sao paulo \n", + "11492514 SP -23.647471 -46.636862 são paulo \n", + "\n", + " geolocation_state seller_id seller_city \\\n", + "0 SP 8d46553a36e68f95350a200c12f8f2e2 campinas \n", + "1 SP 8d46553a36e68f95350a200c12f8f2e2 campinas \n", + "2 SP 8d46553a36e68f95350a200c12f8f2e2 campinas \n", + "3 SP 8d46553a36e68f95350a200c12f8f2e2 campinas \n", + "4 SP 8d46553a36e68f95350a200c12f8f2e2 campinas \n", + "... ... ... ... 
\n", + "11492510 SP ab91571efab27993ff2f6b36e38055c3 sao paulo \n", + "11492511 SP ab91571efab27993ff2f6b36e38055c3 sao paulo \n", + "11492512 SP ab91571efab27993ff2f6b36e38055c3 sao paulo \n", + "11492513 SP ab91571efab27993ff2f6b36e38055c3 sao paulo \n", + "11492514 SP ab91571efab27993ff2f6b36e38055c3 sao paulo \n", + "\n", + " seller_state \n", + "0 SP \n", + "1 SP \n", + "2 SP \n", + "3 SP \n", + "4 SP \n", + "... ... \n", + "11492510 SP \n", + "11492511 SP \n", + "11492512 SP \n", + "11492513 SP \n", + "11492514 SP \n", + "\n", + "[11492515 rows x 11 columns]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Join the seller table on the shared zip-code prefix (inner join, pandas' default).\n", + "# NOTE(review): this reassigns df_juncao from itself, so re-running this cell\n", + "# alone merges the sellers a second time; re-run from the first merge instead.\n", + "df_juncao = df_juncao.merge(df_vendedores, on=\"zip_code_prefix\")\n", + "df_juncao" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Exportando o arquivo em csv" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "# Write the joined table to disk without the positional row index.\n", + "df_juncao.to_csv(\"tabelas_unidas.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Continuando a análise pelo novo df" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "zip_code_prefix\n", + "22790 487770\n", + "38400 335820\n", + "35500 312008\n", + "14940 308308\n", + "22793 226996\n", + " ... \n", + "59775 5\n", + "12250 5\n", + "32419 5\n", + "9336 4\n", + "27972 3\n", + "Name: count, Length: 2160, dtype: int64" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Which zip-code prefix appears most often in the joined table?\n", + "# These are row counts of the join, not counts of distinct customers.\n", + "prefixo = df_juncao[\"zip_code_prefix\"].value_counts()\n", + "prefixo" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_id
zip_code_prefix
22790142
22793121
22775110
29101101
1321295
......
872301
66961
65131
65061
997301
\n", + "

2160 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " customer_id\n", + "zip_code_prefix \n", + "22790 142\n", + "22793 121\n", + "22775 110\n", + "29101 101\n", + "13212 95\n", + "... ...\n", + "87230 1\n", + "6696 1\n", + "6513 1\n", + "6506 1\n", + "99730 1\n", + "\n", + "[2160 rows x 1 columns]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Number of distinct customers per zip-code prefix, largest prefixes first.\n", + "# Selecting with a list keeps the result a one-column DataFrame.\n", + "df_pref = df_juncao.groupby(\"zip_code_prefix\")[[\"customer_id\"]].nunique()\n", + "df_pref.sort_values(\"customer_id\", ascending=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Análise visual\n", + "\n", + "O local dos maiores e menores consumidores é o mesmo local dos vendedores?" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0, 0.5, 'Contagem')" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Customers per state\n", + "\n", + "# df_juncao repeats each customer once per matched geolocation/seller row, so\n", + "# plotting it directly counts join rows. Keep one row per customer_id so that\n", + "# each bar is the number of distinct customers in that state.\n", + "consumidores_unicos = df_juncao.drop_duplicates(subset=\"customer_id\")\n", + "\n", + "plt.figure(figsize=(12, 6))\n", + "\n", + "sns.countplot(data=consumidores_unicos, x=\"customer_state\")\n", + "\n", + "# Title and axis labels\n", + "plt.title(\"Consumidores por estado\")\n", + "plt.xlabel(\"Estado\")\n", + "plt.ylabel(\"Contagem\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0, 0.5, 'Contagem')" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Sellers per state\n", + "\n", + "# df_juncao repeats each seller once per matched customer/geolocation row, so\n", + "# plotting it directly counts join rows. Keep one row per seller_id so that\n", + "# each bar is the number of distinct sellers in that state.\n", + "vendedores_unicos = df_juncao.drop_duplicates(subset=\"seller_id\")\n", + "\n", + "plt.figure(figsize=(12, 6))\n", + "\n", + "sns.countplot(data=vendedores_unicos, x=\"seller_state\")\n", + "\n", + "# Title and axis labels\n", + "plt.title(\"Vendedores por estado\")\n", + "plt.xlabel(\"Estado\")\n", + "plt.ylabel(\"Contagem\")" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0, 0.5, 'Contagem')" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Geolocalização por estado\n", + "\n", + "plt.figure(figsize=(12, 6))\n", + "\n", + "sns.countplot(data = df_juncao,\n", + " x = \"geolocation_state\")\n", + "\n", + "# definir texto do título e eixos x e y\n", + "\n", + "plt.title(\"Geolocalização por estado\")\n", + "plt.xlabel(\"Estado\")\n", + "plt.ylabel(\"Contagem\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Conclusão\n", + "Pela análise dos gráficos, os estados com mais e com menos consumidores são também os estados com mais e com menos vendedores, com pouca diferença.\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}