From c0450ffad1c08228e022442f97e5450040ce5f49 Mon Sep 17 00:00:00 2001 From: cdolfi Date: Mon, 18 Jan 2021 21:32:46 +0000 Subject: [PATCH] cleaning and saving of email files --- notebooks/add_new_data.ipynb | 172 ++++++ notebooks/hyperkitty_to_csv.ipynb | 841 ++++++++++++++++++++++++++++++ notebooks/parsing_mbox.ipynb | 106 +++- notebooks/retrieve_mbox.ipynb | 47 +- 4 files changed, 1146 insertions(+), 20 deletions(-) create mode 100644 notebooks/add_new_data.ipynb create mode 100644 notebooks/hyperkitty_to_csv.ipynb diff --git a/notebooks/add_new_data.ipynb b/notebooks/add_new_data.ipynb new file mode 100644 index 0000000..b5a6359 --- /dev/null +++ b/notebooks/add_new_data.ipynb @@ -0,0 +1,172 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np \n", + "import os \n", + "import boto3 \n", + "import gzip\n", + "from dotenv import load_dotenv, find_dotenv" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "https://s3.upshift.redhat.com\n", + "cdolfi\n" + ] + } + ], + "source": [ + "dotenv_path = find_dotenv()\n", + "load_dotenv(dotenv_path)\n", + "s3_secret_key = os.environ['AWS_SECRET_ACCESS_KEY']\n", + "s3_bucket_name = os.environ['JUPYTERHUB_USER']\n", + "s3_endpoint_url = os.environ['S3_ENDPOINT_URL']\n", + "s3_access_key = os.environ['AWS_ACCESS_KEY_ID']\n", + "s3bucket = os.environ['BUCKET']\n", + "\n", + "print(s3_endpoint_url)\n", + "print(s3_bucket_name)\n", + "s3 = boto3.client('s3','us-east-1', endpoint_url= s3_endpoint_url,\n", + " aws_access_key_id = s3_access_key,\n", + " aws_secret_access_key = s3_secret_key)" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "#CHANGE WITH WHAT YOU ARE UPDATING \n", + "folder = \"user\"\n", + "files = []\n", + "\n", + "for key in s3.list_objects(Bucket=s3_bucket_name)['Contents']:\n", + " if key['Key'][:4] == folder:\n", + " files.append(key['Key'])\n", + " \n", + "obj = s3.get_object(Bucket=s3_bucket_name, Key = files[0]) \n", + "combined_emails = pd.read_csv(obj['Body'])\n", + " \n", + "for f in files[1:]:\n", + " temp = s3.get_object(Bucket=s3_bucket_name, Key = f) \n", + " n_df = pd.read_csv(temp['Body'])\n", + " combined_emails = pd.concat([combined_emails, n_df])\n", + " \n", + " \n", + "combined_emails.sort_values(by= 'datetime', inplace = True)\n", + "combined_emails.drop('Unnamed: 0',axis=1, inplace=True )\n" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [], + "source": [ + "#update complete csv \n", + "combined_emails.to_csv('../data/interim/temp_complete.csv')\n", + "s3_location = folder + \"/\" + folder + \"_complete.csv\"\n", + "s3.upload_file(Filename='../data/interim/temp_complete.csv',Bucket=s3_bucket_name, Key=s3_location)" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [], + "source": [ + "#remove merged files\n", + "for k in files:\n", + " if k != s3_location:\n", + " s3.delete_object(Bucket=s3_bucket_name, Key=k)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To check what items are in the bucket" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "devl/devl_complete.csv\n", + "user/2004_3-2005_1.csv\n", + "user/2005_2-2006_1.csv\n", + "user/2006_02-2006_05.csv\n", + "user/2006_06-2006_07.csv\n", + "user/2006_07-2006_08.csv\n", + "user/2006_09-2007_01.csv\n", + "user/2007_01-2007_06.csv\n", + "user/2007_07-2008_01.csv\n", + "user/2008_01-2009_01.csv\n", + "user/2009_01-2009_06.csv\n", + "user/2009_07-2010_01.csv\n", + "user/2010_01-2010_01.csv\n", + "user/2010_01-2011_01.csv\n", + "user/2011_01-2012_01.csv\n", + "user/2012_01-2013_01.csv\n", + "user/2013_01-2014_01.csv\n", + "user/2014_01-2015_01.csv\n", + "user/2016_01-2017_01.csv\n", + "user/2017_01-2018_01.csv\n", + "user/2018_01-2019_01.csv\n", + "user/2019_01-2020_01.csv\n", + "user/2020_01-2021_01.csv\n", + "user/user_complete.csv\n" + ] + } + ], + "source": [ + "for key in s3.list_objects(Bucket=s3_bucket_name)['Contents']:\n", + " print(key['Key'])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mailing", + "language": "python", + "name": "mailing" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/hyperkitty_to_csv.ipynb b/notebooks/hyperkitty_to_csv.ipynb new file mode 100644 index 0000000..d8aaad2 --- /dev/null +++ b/notebooks/hyperkitty_to_csv.ipynb @@ -0,0 +1,841 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np \n", + "from datetime import datetime\n", + "import mailbox\n", + "import regex as re \n", + "import os \n", + "import boto3 \n", + "import gzip\n", + "from dotenv import load_dotenv, find_dotenv\n", + "import wget" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "https://s3.upshift.redhat.com\n", + "cdolfi\n" + ] + } + ], + "source": [ + "dotenv_path = find_dotenv()\n", + "load_dotenv(dotenv_path)\n", + "s3_secret_key = os.environ['AWS_SECRET_ACCESS_KEY']\n", + "s3_bucket_name = os.environ['JUPYTERHUB_USER']\n", + "s3_endpoint_url = os.environ['S3_ENDPOINT_URL']\n", + "s3_access_key = os.environ['AWS_ACCESS_KEY_ID']\n", + "s3bucket = os.environ['BUCKET']\n", + "\n", + "print(s3_endpoint_url)\n", + "print(s3_bucket_name)\n", + "s3 = boto3.client('s3','us-east-1', endpoint_url= s3_endpoint_url,\n", + " aws_access_key_id = s3_access_key,\n", + " aws_secret_access_key = s3_secret_key)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The \"msgs\" portion is where the errors come with trying to get the mbox" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "def gunzip(source_filepath, dest_filepath, block_size=65536):\n", + " with gzip.open(source_filepath, 'rb') as s_file, \\\n", + " open(dest_filepath, 'wb') as d_file:\n", + " while True:\n", + " block = s_file.read(block_size)\n", + " if not block:\n", + " break\n", + " else:\n", + " d_file.write(block)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "#how to get text from a message in the mbox \n", + "def get_text(msg):\n", + " while msg.is_multipart():\n", + " msg = msg.get_payload()[0]\n", + " return msg.get_payload()" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [], + "source": [ + "#strip thread text and dates in body text\n", + "def strip_thread(text):\n", + " text = text.replace(\"\\r\", \"\")\n", + " lines = text.split(\"\\n\")\n", + " lines = [l for l in lines if len(l) > 0]\n", + " lines = [line for line in lines if line[0] != \">\"]\n", + " lines = [line for line in lines if line[:3] != \"Re:\"]\n", + " lines = [line for line in lines if line[:7] != \"Subject\"]\n", + " lines = [line for line in lines if line[:5] != \"From:\"]\n", + " lines = [line for line in lines if line[:5] != \"Date:\"]\n", + " lines = [line for line in lines if \"BEGIN PGP SIGNED MESSAGE\" not in line]\n", + " lines = [line for line in lines if line[:5] != \"Hash:\"]\n", + " lines = [line for line in lines if line[:10] != \"Version: G\"]\n", + " lines = [line for line in lines if \"wrote:\" not in line]\n", + " lines = [line for line in lines if \"wrote :\" not in line]\n", + " lines = [line for line in lines if \"writes:\" not in line]\n", + " lines = [line for line in lines if line[:7] != \"Am Mit,\"]\n", + " lines = [line for line in lines if line[:7] != \"Am Don,\"]\n", + " lines = [line for line in lines if line[:7] != \"Am Mon,\"]\n", + " lines = [line for line in lines if line[:7] != \"Quoting\"]\n", + " lines = [line for line in lines if line[:10] != \"Em Quinta,\"]\n", + " lines = [line for line in lines if \"said:\" not in line]\n", + " lines = [line for line in lines if re.match(\".*n (Sun|Mon|Tue|Wed|Thu|Fri|Sat), .. (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec) 20..*\", line) == None]\n", + " lines = [line for line in lines if re.match(\".*n (Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday) .. (January|February|March|April|May|June|July|August|September|October|November|December) 20..*\", line) == None]\n", + " lines = [line for line in lines if re.match(\".*n (Sun|Mon|Tue|Wed|Thu|Fri|Sat), (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec) .., 20..*\", line) == None]\n", + " lines = [line for line in lines if re.match(\".*n (Sun|Mon|Tue|Wed|Thu|Fri|Sat), 20[\\d]{2}-[\\d]{2}-[\\d]{2} at.*\", line) == None]\n", + " lines = [line for line in lines if line[-6:] != \"said: \"]\n", + " lines = [line for line in lines if line[-8:] != \"babbled:\"]\n", + " lines = [line for line in lines if line[-7:] != \"wrot=e:\"]\n", + " lines = [line for line in lines if line[-8:] != \"A9crit :\"]\n", + " lines = [line for line in lines if line[0] != \"|\"]\n", + " return \"\\n\".join(lines)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "#build 2d list of email data with thread text and dates removed\n", + "def build_list(mbox):\n", + " chart = [] \n", + " for msg in mbox: \n", + " clean_body = strip_thread(get_text(msg))\n", + " entry = [clean_body, msg[\"Date\"][:-9], msg[\"From\"], msg[\"Subject\"], msg[\"Message-ID\"], msg[\"In-Reply-To\"]]\n", + " chart.append(entry)\n", + " return chart\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "#format for CSV, clean special characters, and remove extranous emails \n", + "def pandas_clean(chart):\n", + " emails = pd.DataFrame(chart, columns = [\"Body\", 'Date', \"From\", \"Subject\", \"Message ID\", \"In-Reply\"]) \n", + " emails['Body'].replace(to_replace=[r\"\\n\", \"\\n\",], value= \" \", regex=True, inplace=True)\n", + " emails['Body'].replace(to_replace=[r\"\\'\", \"\\'\",\">\",'<',\"= \", \"-\", \"http\\S+\" ], value=\"\", regex=True, inplace=True)\n", + " emails['Body'].replace(to_replace=[ r\"\\\\\\s+\", \"\\\\\\s+\" , \"=\"], value= \"\", regex=True, inplace=True)\n", + " emails['Body'].replace(to_replace= [ \" \", \" \"], value= \" \", regex=True, inplace=True)\n", + " emails['Body'].replace(to_replace= [\"_\",\"3D\"], value= \"\", regex=True, inplace=True)\n", + " emails['Body'].replace(to_replace= [ \" \", \" \"], value= \" \", regex=True, inplace=True)\n", + " emails['Body'].replace(to_replace= [ \" \", \" \"], value= \" \", regex=True, inplace=True)\n", + " \n", + " emails.drop(emails.index[emails['Body'] == ''], inplace = True)\n", + " emails.drop(emails.index[emails['Body'] == ' '], inplace = True)\n", + " emails.drop(emails.index[emails['Body'] == '+1'], inplace = True)\n", + " emails.dropna(subset=['Body'], inplace=True)\n", + " emails['location'] = emails['Body'].str.find('Missing expected images')\n", + " \n", + " emails.drop(emails.index[emails['location'] == 0], inplace = True)\n", + " emails.drop('location' ,axis=1, inplace=True )\n", + " emails['location'] = emails['Body'].str.find('OLD: Fedora')\n", + " emails.drop(emails.index[emails['location'] == 0], inplace = True)\n", + " emails.drop('location' ,axis=1, inplace=True )\n", + " \n", + " emails['datetime'] = pd.to_datetime(emails['Date'], format='%a, %d %b %Y %H:%M')\n", + " emails.sort_values(by= 'datetime', inplace = True)\n", + " \n", + " emails = emails.reset_index()\n", + " emails.drop('index',axis=1, inplace=True )\n", + " return emails\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "https://lists.fedoraproject.org/archives/list/users@lists.fedoraproject.org/export/users@lists.fedoraproject.org.mbox.gz?start=2006-06-01&end=2006-09-01\n" + ] + }, + { + "ename": "EOFError", + "evalue": "Compressed file ended before the end-of-stream marker was reached", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mEOFError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0mwget\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdownload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"../data/raw\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 24\u001b[0;31m \u001b[0mgunzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'../data/interim/pulled.mbox'\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 25\u001b[0m \u001b[0mmsgs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmailbox\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmbox\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'../data/interim/pulled.mbox'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36mgunzip\u001b[0;34m(source_filepath, dest_filepath, block_size)\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdest_filepath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'wb'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0md_file\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mblock\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0ms_file\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mblock_size\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mblock\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/rh/rh-python36/root/usr/lib64/python3.6/gzip.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, size)\u001b[0m\n\u001b[1;32m 274\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0merrno\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 275\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mOSError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merrno\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mEBADF\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"read() on write-only GzipFile object\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 276\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_buffer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 277\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 278\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mread1\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/rh/rh-python36/root/usr/lib64/python3.6/_compression.py\u001b[0m in \u001b[0;36mreadinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 66\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mreadinto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 67\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mmemoryview\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mview\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mview\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"B\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mbyte_view\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 68\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbyte_view\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 69\u001b[0m \u001b[0mbyte_view\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 70\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/rh/rh-python36/root/usr/lib64/python3.6/gzip.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, size)\u001b[0m\n\u001b[1;32m 480\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 481\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mbuf\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34mb\"\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 482\u001b[0;31m raise EOFError(\"Compressed file ended before the \"\n\u001b[0m\u001b[1;32m 483\u001b[0m \"end-of-stream marker was reached\")\n\u001b[1;32m 484\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mEOFError\u001b[0m: Compressed file ended before the end-of-stream marker was reached" + ] + } + ], + "source": [ + "#EDIT\n", + "f_list = 'user'\n", + "start_year = '2006'\n", + "start_month = '06'\n", + "finish_year = '2006'\n", + "finish_month = '09'\n", + "\n", + "\n", + "\n", + "if f_list == 'devl':\n", + " s = \"https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=\"\n", + " name = \"../data/raw/devel@lists.fedoraproject.org.mbox.gz\"\n", + "elif f_list == 'user':\n", + " s = \"https://lists.fedoraproject.org/archives/list/users@lists.fedoraproject.org/export/users@lists.fedoraproject.org.mbox.gz?start=\"\n", + " name = \"../data/raw/users@lists.fedoraproject.org.mbox.gz\"\n", + "\n", + "\n", + "\n", + "s = f\"{s}{start_year}-{start_month}-01&end={finish_year}-{finish_month}-01\"\n", + "print(s)\n", + "\n", + "wget.download(s, out=\"../data/raw\")\n", + "\n", + "gunzip(name,'../data/interim/pulled.mbox' )\n", + "msgs = mailbox.mbox('../data/interim/pulled.mbox')\n", + "\n", + "os.remove(name)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [], + "source": [ + "#preforming function on mbox \n", + "\n", + "mbox_info = build_list(msgs)\n", + "data = pandas_clean(mbox_info)\n", + "\n", + "#updated this to save to bucket with naming conventions depending on the mbox that are retrieved\n", + "#updated_email.to_csv('/opt/app-root/src/data/user_clean2.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [], + "source": [ + "data.to_csv('../data/interim/temp_clean.csv')\n", + "s3_location = f_list + \"/\" + start_year + \"_\" + start_month + \"-\" + finish_year + \"_\"+ finish_month + \".csv\"\n", + "s3.upload_file(Filename='../data/interim/temp_clean.csv',Bucket=s3_bucket_name, Key=s3_location)" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BodyDateFromSubjectMessage IDIn-Replydatetime
0This is roughly 45 to 75 C. 45C is fine, 75 is...Sat, 31 Jan 2004 18:27Stephen Walton <stephen.walton at csun.edu>Re: CPU Temp under lm sensors<1075602464.15697.3.camel@dhcppc1>1075595900.6869.0.camel@insomnia.lmig.com2004-01-31 18:27:00
1Whats a safe temp for a CPU to run at? My insp...Sat, 31 Jan 2004 19:38Troy Campano <troycampano at yahoo.com>Re: CPU Temp under lm sensors<1075595900.6869.0.camel@insomnia.lmig.com>401B3766.2090506@csun.edu2004-01-31 19:38:00
2Could be your sensors.conf file needs some twe...Sat, 31 Jan 2004 20:50Randy Kelsoe <randykel at swbell.net>Re: CPU Temp under lm sensors {Scanned}<401C696F.20003@swbell.net>1075602464.15697.3.camel@dhcppc12004-01-31 20:50:00
3Anyone else noticed that this rsync mirror isn...Sat, 31 Jan 2004 21:17Mike Chambers <mike at netlyncs.com>dulug.duke.edu mirror<1075605454.3081.13.camel@bart.netlyncs.com>None2004-01-31 21:17:00
4Did you do a yumarch on the directories in you...Sat, 31 Jan 2004 21:47Randy Kelsoe <randykel at swbell.net>Re: Yum and xmms problem {Scanned}<401C76D0.7010505@swbell.net>401C6FD5.6040903@margo.bijoux.nom.br2004-01-31 21:47:00
........................
84298Hi, I want to know if a project like this exis...Mon, 31 Jan 2005 23:22bsebastien at bluewin.chSystem Summary for Webpage<41FEAFA7.808@bluewin.ch>None2005-01-31 23:22:00
84299Hi, I want to know if a project like this exi...Mon, 31 Jan 2005 23:52bsebastien at bluewin.chRe: System Summary for Webpage<41FEB6A7.6010408@bluewin.ch>None2005-01-31 23:52:00
84300i have changed the file to enable Per user Web...Tue, 01 Feb 2005 00:51Prudhvi Krishna Surapaneni <prudhvikrishna at ...FC3 :Apache Problem<ac3b0f10501311121549f3847@mail.gmail.com>None2005-02-01 00:51:00
84301p: Are you using nscd? Try wiping out its cach...Tue, 01 Feb 2005 09:00Dan <ml at mutox.org>Re: Problems with ldap on Fedora Core 3<1107212431.10494.58.camel@devel2.x32.com.au>41FE8CEF.7020900@virtc.com2005-02-01 09:00:00
84302The work around in the bugzilla entry: says th...Tue, 01 Feb 2005 10:21Norman Gaywood <norm at turing.une.edu.au>Re: FC3 and ghostscript<20050131232132.GA16528@turing.une.edu.au>41FC87C8.7040908@prodigy.net.mx2005-02-01 10:21:00
\n", + "

84303 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " Body \\\n", + "0 This is roughly 45 to 75 C. 45C is fine, 75 is... \n", + "1 Whats a safe temp for a CPU to run at? My insp... \n", + "2 Could be your sensors.conf file needs some twe... \n", + "3 Anyone else noticed that this rsync mirror isn... \n", + "4 Did you do a yumarch on the directories in you... \n", + "... ... \n", + "84298 Hi, I want to know if a project like this exis... \n", + "84299 Hi, I want to know if a project like this exi... \n", + "84300 i have changed the file to enable Per user Web... \n", + "84301 p: Are you using nscd? Try wiping out its cach... \n", + "84302 The work around in the bugzilla entry: says th... \n", + "\n", + " Date \\\n", + "0 Sat, 31 Jan 2004 18:27 \n", + "1 Sat, 31 Jan 2004 19:38 \n", + "2 Sat, 31 Jan 2004 20:50 \n", + "3 Sat, 31 Jan 2004 21:17 \n", + "4 Sat, 31 Jan 2004 21:47 \n", + "... ... \n", + "84298 Mon, 31 Jan 2005 23:22 \n", + "84299 Mon, 31 Jan 2005 23:52 \n", + "84300 Tue, 01 Feb 2005 00:51 \n", + "84301 Tue, 01 Feb 2005 09:00 \n", + "84302 Tue, 01 Feb 2005 10:21 \n", + "\n", + " From \\\n", + "0 Stephen Walton \n", + "1 Troy Campano \n", + "2 Randy Kelsoe \n", + "3 Mike Chambers \n", + "4 Randy Kelsoe \n", + "... ... \n", + "84298 bsebastien at bluewin.ch \n", + "84299 bsebastien at bluewin.ch \n", + "84300 Prudhvi Krishna Surapaneni \n", + "84302 Norman Gaywood \n", + "\n", + " Subject \\\n", + "0 Re: CPU Temp under lm sensors \n", + "1 Re: CPU Temp under lm sensors \n", + "2 Re: CPU Temp under lm sensors {Scanned} \n", + "3 dulug.duke.edu mirror \n", + "4 Re: Yum and xmms problem {Scanned} \n", + "... ... \n", + "84298 System Summary for Webpage \n", + "84299 Re: System Summary for Webpage \n", + "84300 FC3 :Apache Problem \n", + "84301 Re: Problems with ldap on Fedora Core 3 \n", + "84302 Re: FC3 and ghostscript \n", + "\n", + " Message ID \\\n", + "0 <1075602464.15697.3.camel@dhcppc1> \n", + "1 <1075595900.6869.0.camel@insomnia.lmig.com> \n", + "2 <401C696F.20003@swbell.net> \n", + "3 <1075605454.3081.13.camel@bart.netlyncs.com> \n", + "4 <401C76D0.7010505@swbell.net> \n", + "... ... \n", + "84298 <41FEAFA7.808@bluewin.ch> \n", + "84299 <41FEB6A7.6010408@bluewin.ch> \n", + "84300 \n", + "84301 <1107212431.10494.58.camel@devel2.x32.com.au> \n", + "84302 <20050131232132.GA16528@turing.une.edu.au> \n", + "\n", + " In-Reply datetime \n", + "0 1075595900.6869.0.camel@insomnia.lmig.com 2004-01-31 18:27:00 \n", + "1 401B3766.2090506@csun.edu 2004-01-31 19:38:00 \n", + "2 1075602464.15697.3.camel@dhcppc1 2004-01-31 20:50:00 \n", + "3 None 2004-01-31 21:17:00 \n", + "4 401C6FD5.6040903@margo.bijoux.nom.br 2004-01-31 21:47:00 \n", + "... ... ... \n", + "84298 None 2005-01-31 23:22:00 \n", + "84299 None 2005-01-31 23:52:00 \n", + "84300 None 2005-02-01 00:51:00 \n", + "84301 41FE8CEF.7020900@virtc.com 2005-02-01 09:00:00 \n", + "84302 41FC87C8.7040908@prodigy.net.mx 2005-02-01 10:21:00 \n", + "\n", + "[84303 rows x 7 columns]" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Mon, 16 Oct 2017 09:30'" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.iloc[400]['Date']" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [], + "source": [ + "os.remove(name)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BodyDateFromSubjectMessage IDIn-Replydatetime
0Redistribution of flashplugin is completely il...Sat, 31 Jan 2004 17:00Warren Togami <warren at togami.com>Re: mplayer vs. xine<401C6BC1.8040600@togami.com>20040131152547.GC5507@thyrsus.com2004-01-31 17:00:00
1I am guessing what people are looking for is s...Sat, 31 Jan 2004 19:15Stephen Smoogen <smoogen at lanl.gov>Re: Upgrade of unmaintained packages<Pine.LNX.4.58.0401311913570.32075@rhel3dev.ds...1075478191.25735.14.camel@chip.laiskiainen.org2004-01-31 19:15:00
2Wow. Sweet. As an FYI, 1.5.3 not only fixes t...Sat, 31 Jan 2004 20:19Paul Iadonisi <pri.rhl1 at iadonisi.to>Evolution 1.5.3 (was: Re: Evolution 1.5.2)<1075598343.2711.5.camel@va.local.linuxlobbyis...1074557012.14797.11.camel@va.local.linuxlobbyi...2004-01-31 20:19:00
3I just did the upgrade and found that there wa...Sat, 31 Jan 2004 21:46Jim Cornette <cornette at insight.rr.com>Menus back, but where is...<401C6894.4020601@insight.rr.com>None2004-01-31 21:46:00
4This is now correct behaviour. Nautilus is no ...Sat, 31 Jan 2004 21:54Gerald Henriksen <ghenriks at rogers.com>Re: Nautilus toolbars<tgqo10t2n2840lpc3bm9f0sld73303trrn@4ax.com>1075592741.3699.3.camel@aurora.localdomain2004-01-31 21:54:00
........................
15872The \"problem\" I have about changelog is its du...Tue, 01 Feb 2005 00:02=?utf-8?q?F=C3=A9liciano_Matias_=3Cfeliciano=2...Re: radical suggestion for fc4 release<1107212547.17531.10.camel@one.myworld>1107209076.5291.26.camel@opus.phy.duke.edu2005-02-01 00:02:00
15873Hopefully not :). Joe is about the only useful...Tue, 01 Feb 2005 00:22Pekka Savola <pekkas at netcore.fi>Re: Volunteers? was Re: further package remova...<Pine.LNX.4.61.0502010020130.26769@netcore.fi>Pine.LNX.4.58.0501261047070.29773@devserv.deve...2005-02-01 00:22:00
15874first. As they are stored compressed, it is m...Tue, 01 Feb 2005 00:28Enrico Scholz <enrico.scholz at informatik.tu-...Re: radical suggestion for fc4 release<87mzupch6k.fsf@kosh.ultra.csn.tu-chemnitz.de>1107212145.30653.174.camel@shahms.mesd.k12.or.us2005-02-01 00:28:00
15875e first. For sake of completeness: $ rpm qa ch...Tue, 01 Feb 2005 00:59Ziga Mahkovec <ziga.mahkovec at klika.si>Re: radical suggestion for fc4 release<1107215993.14233.5.camel@serenity.klika.si>1107212145.30653.174.camel@shahms.mesd.k12.or.us2005-02-01 00:59:00
15876in ago. Actually, a web browser is a little l...Tue, 01 Feb 2005 07:42Rodd Clarkson <rodd at clarkson.id.au>Re: rawhide firefox and background colour of w...<1107204139.5316.6.camel@localhost.localdomain>c79487d605013105562a1a3ef0@mail.gmail.com2005-02-01 07:42:00
\n", + "

15877 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " Body \\\n", + "0 Redistribution of flashplugin is completely il... \n", + "1 I am guessing what people are looking for is s... \n", + "2 Wow. Sweet. As an FYI, 1.5.3 not only fixes t... \n", + "3 I just did the upgrade and found that there wa... \n", + "4 This is now correct behaviour. Nautilus is no ... \n", + "... ... \n", + "15872 The \"problem\" I have about changelog is its du... \n", + "15873 Hopefully not :). Joe is about the only useful... \n", + "15874 first. As they are stored compressed, it is m... \n", + "15875 e first. For sake of completeness: $ rpm qa ch... \n", + "15876 in ago. Actually, a web browser is a little l... \n", + "\n", + " Date \\\n", + "0 Sat, 31 Jan 2004 17:00 \n", + "1 Sat, 31 Jan 2004 19:15 \n", + "2 Sat, 31 Jan 2004 20:19 \n", + "3 Sat, 31 Jan 2004 21:46 \n", + "4 Sat, 31 Jan 2004 21:54 \n", + "... ... \n", + "15872 Tue, 01 Feb 2005 00:02 \n", + "15873 Tue, 01 Feb 2005 00:22 \n", + "15874 Tue, 01 Feb 2005 00:28 \n", + "15875 Tue, 01 Feb 2005 00:59 \n", + "15876 Tue, 01 Feb 2005 07:42 \n", + "\n", + " From \\\n", + "0 Warren Togami \n", + "1 Stephen Smoogen \n", + "2 Paul Iadonisi \n", + "3 Jim Cornette \n", + "4 Gerald Henriksen \n", + "... ... \n", + "15872 =?utf-8?q?F=C3=A9liciano_Matias_=3Cfeliciano=2... \n", + "15873 Pekka Savola \n", + "15874 Enrico Scholz \n", + "15876 Rodd Clarkson \n", + "\n", + " Subject \\\n", + "0 Re: mplayer vs. xine \n", + "1 Re: Upgrade of unmaintained packages \n", + "2 Evolution 1.5.3 (was: Re: Evolution 1.5.2) \n", + "3 Menus back, but where is... \n", + "4 Re: Nautilus toolbars \n", + "... ... \n", + "15872 Re: radical suggestion for fc4 release \n", + "15873 Re: Volunteers? was Re: further package remova... \n", + "15874 Re: radical suggestion for fc4 release \n", + "15875 Re: radical suggestion for fc4 release \n", + "15876 Re: rawhide firefox and background colour of w... \n", + "\n", + " Message ID \\\n", + "0 <401C6BC1.8040600@togami.com> \n", + "1 \n", + "4 \n", + "... ... \n", + "15872 <1107212547.17531.10.camel@one.myworld> \n", + "15873 \n", + "15874 <87mzupch6k.fsf@kosh.ultra.csn.tu-chemnitz.de> \n", + "15875 <1107215993.14233.5.camel@serenity.klika.si> \n", + "15876 <1107204139.5316.6.camel@localhost.localdomain> \n", + "\n", + " In-Reply datetime \n", + "0 20040131152547.GC5507@thyrsus.com 2004-01-31 17:00:00 \n", + "1 1075478191.25735.14.camel@chip.laiskiainen.org 2004-01-31 19:15:00 \n", + "2 1074557012.14797.11.camel@va.local.linuxlobbyi... 2004-01-31 20:19:00 \n", + "3 None 2004-01-31 21:46:00 \n", + "4 1075592741.3699.3.camel@aurora.localdomain 2004-01-31 21:54:00 \n", + "... ... ... \n", + "15872 1107209076.5291.26.camel@opus.phy.duke.edu 2005-02-01 00:02:00 \n", + "15873 Pine.LNX.4.58.0501261047070.29773@devserv.deve... 2005-02-01 00:22:00 \n", + "15874 1107212145.30653.174.camel@shahms.mesd.k12.or.us 2005-02-01 00:28:00 \n", + "15875 1107212145.30653.174.camel@shahms.mesd.k12.or.us 2005-02-01 00:59:00 \n", + "15876 c79487d605013105562a1a3ef0@mail.gmail.com 2005-02-01 07:42:00 \n", + "\n", + "[15877 rows x 7 columns]" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "ename": "EOFError", + "evalue": "Compressed file ended before the end-of-stream marker was reached", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mEOFError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mgunzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'../data/interim/pulled.mbox'\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in \u001b[0;36mgunzip\u001b[0;34m(source_filepath, dest_filepath, block_size)\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdest_filepath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'wb'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0md_file\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mblock\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0ms_file\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mblock_size\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mblock\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/rh/rh-python36/root/usr/lib64/python3.6/gzip.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, size)\u001b[0m\n\u001b[1;32m 274\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0merrno\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 275\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mOSError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merrno\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mEBADF\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"read() on write-only GzipFile object\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 276\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_buffer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 277\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 278\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mread1\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/rh/rh-python36/root/usr/lib64/python3.6/_compression.py\u001b[0m in \u001b[0;36mreadinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 66\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mreadinto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 67\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mmemoryview\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mview\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mview\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"B\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mbyte_view\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 68\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbyte_view\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 69\u001b[0m \u001b[0mbyte_view\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 70\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/rh/rh-python36/root/usr/lib64/python3.6/gzip.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, size)\u001b[0m\n\u001b[1;32m 480\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 481\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mbuf\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34mb\"\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 482\u001b[0;31m raise EOFError(\"Compressed file ended before the \"\n\u001b[0m\u001b[1;32m 483\u001b[0m \"end-of-stream marker was reached\")\n\u001b[1;32m 484\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mEOFError\u001b[0m: Compressed file ended before the end-of-stream marker was reached" + ] + } + ], + "source": [ + "gunzip(name,'../data/interim/pulled.mbox' )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mailing", + "language": "python", + "name": "mailing" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/parsing_mbox.ipynb b/notebooks/parsing_mbox.ipynb index f5c258a..03e1a9c 100644 --- a/notebooks/parsing_mbox.ipynb +++ b/notebooks/parsing_mbox.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 11, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -11,19 +11,103 @@ "from datetime import datetime\n", "import mailbox\n", "import regex as re \n", - "#make sure this works " + "import os \n", + "import boto3 \n", + "from dotenv import load_dotenv, find_dotenv" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "https://s3.upshift.redhat.com\n", + "cdolfi\n" + ] + } + ], + "source": [ + "dotenv_path = find_dotenv()\n", + "load_dotenv(dotenv_path)\n", + "s3_secret_key = os.environ['AWS_SECRET_ACCESS_KEY']\n", + "s3_bucket_name = os.environ['JUPYTERHUB_USER']\n", + "s3_endpoint_url = os.environ['S3_ENDPOINT_URL']\n", + "s3_access_key = os.environ['AWS_ACCESS_KEY_ID']\n", + "s3bucket = os.environ['BUCKET']\n", + "\n", + "print(s3_endpoint_url)\n", + "print(s3_bucket_name)\n", + "s3 = boto3.client('s3','us-east-1', endpoint_url= s3_endpoint_url,\n", + " aws_access_key_id = s3_access_key,\n", + " aws_secret_access_key = s3_secret_key)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The \"msgs\" portion is where the errors come with trying to get the mbox" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "#get mbox data to work with. Update later with retrieval from buckets when it is discovered how to get all emails\n", - "#from the archieve\n", + "obj = s3.get_object(Bucket =s3bucket, Key = 'cdolfi/gzip/2017_10-2018_11.gzip')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Are you looking at dev or user? dev\n", + "What year do you want to start with? example 2016: 2016\n", + "What month do you want to start with? example 01: 10\n", + "What year do you want to end with? example 2016: 2017\n", + "What month do you want to end with? example 01: 10\n" + ] + }, + { + "ename": "TypeError", + "evalue": "expected str, bytes or os.PathLike object, not StreamingBody", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mobj\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0ms3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_object\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mBucket\u001b[0m \u001b[0;34m=\u001b[0m\u001b[0ms3bucket\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mKey\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;31m#THIS IS WHERE THE ISSUE COMES\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0mmsgs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmailbox\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmbox\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Body'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/opt/rh/rh-python36/root/usr/lib64/python3.6/mailbox.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, path, factory, create)\u001b[0m\n\u001b[1;32m 845\u001b[0m \u001b[0;34m\"\"\"Initialize an mbox mailbox.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 846\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_message_factory\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmboxMessage\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 847\u001b[0;31m \u001b[0m_mboxMMDF\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfactory\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 848\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 849\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_post_message_hook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/rh/rh-python36/root/usr/lib64/python3.6/mailbox.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, path, factory, create)\u001b[0m\n\u001b[1;32m 577\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfactory\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 578\u001b[0m \u001b[0;34m\"\"\"Initialize a single-file mailbox.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 579\u001b[0;31m \u001b[0mMailbox\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfactory\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 580\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 581\u001b[0m \u001b[0mf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'rb+'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/rh/rh-python36/root/usr/lib64/python3.6/mailbox.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, path, factory, create)\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfactory\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[0;34m\"\"\"Initialize a Mailbox instance.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 38\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_path\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mabspath\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexpanduser\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 39\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_factory\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfactory\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 40\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/rh/rh-python36/root/usr/lib64/python3.6/posixpath.py\u001b[0m in \u001b[0;36mexpanduser\u001b[0;34m(path)\u001b[0m\n\u001b[1;32m 231\u001b[0m \"\"\"Expand ~ and ~user constructions. If user or $HOME is unknown,\n\u001b[1;32m 232\u001b[0m do nothing.\"\"\"\n\u001b[0;32m--> 233\u001b[0;31m \u001b[0mpath\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfspath\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 234\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbytes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 235\u001b[0m \u001b[0mtilde\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34mb'~'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mTypeError\u001b[0m: expected str, bytes or os.PathLike object, not StreamingBody" + ] + } + ], + "source": [ + "\n", + "f_list = input(\"Are you looking at dev or user? \")\n", + "start_year = input(\"What year do you want to start with? example 2016: \" )\n", + "start_month = input(\"What month do you want to start with? example 01: \" )\n", + "\n", + "finish_year = input(\"What year do you want to end with? example 2016: \" )\n", + "finish_month = input(\"What month do you want to end with? example 01: \" )\n", + "finish_month = str((int(finish_month)%12)+1)\n", "\n", - "msgs = mailbox.mbox(\"/opt/app-root/src/devel_2003_2008.mbox\")" + "file = \"cdolfi/mboxes/\" + start_year + \"_\" + start_month + \"-\" + finish_year + \"_\"+ finish_month + \".mbox\"\n", + "obj = s3.get_object(Bucket =s3bucket, Key = file)\n", + "#THIS IS WHERE THE ISSUE COMES\n", + "msgs = mailbox.mbox(obj['Body'])" ] }, { @@ -153,7 +237,11 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "data.to_csv('/opt/app-root/src/data/temp_clean.csv')\n", + "s3_location = \"cdolfi/cleaned_\" +f_list + \"/\" + start_year + \"_\" + start_month + \"-\" + finish_year + \"_\"+ finish_month + \".mbox\"\n", + "s3.upload_file(Filename='/opt/app-root/src/data/temp_clean.csv',Bucket=s3bucket, Key=s3_location)" + ] }, { "cell_type": "code", @@ -165,9 +253,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "mailing", "language": "python", - "name": "python3" + "name": "mailing" }, "language_info": { "codemirror_mode": { diff --git a/notebooks/retrieve_mbox.ipynb b/notebooks/retrieve_mbox.ipynb index 75b5629..49f5a1b 100644 --- a/notebooks/retrieve_mbox.ipynb +++ b/notebooks/retrieve_mbox.ipynb @@ -2,15 +2,16 @@ "cells": [ { "cell_type": "code", - "execution_count": 11, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "import mailbox\n", "import os\n", - "import boto3 \n", + "import io\n", + "import boto3\n", "import gzip\n", - "import wget" + "import wget\n", + "from dotenv import load_dotenv, find_dotenv" ] }, { @@ -22,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -46,7 +47,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -91,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 6, "metadata": { "scrolled": true }, @@ -106,6 +107,8 @@ } ], "source": [ + "dotenv_path = find_dotenv()\n", + "load_dotenv(dotenv_path)\n", "s3_secret_key = os.environ['AWS_SECRET_ACCESS_KEY']\n", "s3_bucket_name = os.environ['JUPYTERHUB_USER']\n", "s3_endpoint_url = os.environ['S3_ENDPOINT_URL']\n", @@ -121,11 +124,33 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "s3_location = \"cdolfi/mboxes/\" + start_year + \"_\" + start_month + \"-\" + finish_year + \"_\"+ finish_month + \".mbox\"\n", + "s3.upload_file(Filename=\"/opt/app-root/src/trying.mbox\",Bucket=s3bucket, Key=s3_location)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import mailbox\n", + "msgs = mailbox.mbox(\"/opt/app-root/src/trying.mbox\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ - "s3.upload_file(Filename=\"/opt/app-root/src/trying.mbox\",Bucket=s3bucket, Key='cdolfi/trial.mbox')" + "s3_location = \"cdolfi/gzip/\" + start_year + \"_\" + start_month + \"-\" + finish_year + \"_\"+ finish_month + \".gzip\"\n", + "s3.upload_file(Filename=name,Bucket=s3bucket, Key=s3_location)" ] }, { @@ -138,9 +163,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "mailing", "language": "python", - "name": "python3" + "name": "mailing" }, "language_info": { "codemirror_mode": {