From 5f89546cb054a896de6fd4480440ebe63f2b42dc Mon Sep 17 00:00:00 2001 From: "Saurabh S. Chaturvedi" Date: Tue, 8 Aug 2017 20:32:03 +0530 Subject: [PATCH] Complete Exercise 4-5: Threads, Files and Regex --- .gitignore | 2 ++ Chap4/mt_simple_header_analysis.py | 51 ++++++++++++++++++++++++++++++ Chap4/simple_header_analysis.py | 16 ++++++++++ README.md | 3 ++ 4 files changed, 72 insertions(+) create mode 100644 Chap4/mt_simple_header_analysis.py create mode 100644 Chap4/simple_header_analysis.py diff --git a/.gitignore b/.gitignore index f817c47..d961667 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,5 @@ email my_mbox.txt learn/ Chap4/bytes_file +Chap4/links.html +Chap4/links_v2.html diff --git a/Chap4/mt_simple_header_analysis.py b/Chap4/mt_simple_header_analysis.py new file mode 100644 index 0000000..5197ac7 --- /dev/null +++ b/Chap4/mt_simple_header_analysis.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +import linecache, re, sys +from threading import Lock, Thread + + +def render_HTML(start, end): + with open("links_v2.html", "a") as page: + for i in range(start, end): + line = linecache.getline(filename, i) + match = EMAIL_PATT.match(line) + if match: + sender = match.groups()[0] + addr = match.groups()[1] + site = 'http://www.' + addr.split('@')[1] + lock.acquire() + page.write("

{1}

\n".format(site, sender)) # [1:] removes leading '>'. + lock.release() + + +filename = sys.argv[1] +nthreads = int(sys.argv[2]) +messages = [] +EMAIL_PATT = re.compile(r"From: (.+) <(\S+)>") +lock = Lock() + +with open(filename) as f: + nlines = int(f.readline().strip()) # Assumes 1st line to be # of lines in the file. +if nlines < nthreads: + print("Too many threads for processing file. Use less threads.") + quit() + +chunksize = nlines // nthreads +threads = [] +i = 0 +with open("links_v2.html", "w") as f: + f.write("\n") + +while i < nlines: + thread = Thread(target=render_HTML, args=(i, i+chunksize)) + thread.start() + threads.append(thread) + i += chunksize + if i > nlines: + i = nlines + +for thread in threads: + thread.join() + +with open("links_v2.html", "a") as f: + f.write("") +print("HTML file written. Open links_v2.html to view the contents.") diff --git a/Chap4/simple_header_analysis.py b/Chap4/simple_header_analysis.py new file mode 100644 index 0000000..9c6c32d --- /dev/null +++ b/Chap4/simple_header_analysis.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python3 +import email, mailbox, re + +mbox = mailbox.mbox("../Chap3/my_mbox.txt") +messages = [] +with open("links.html", "w") as page: + page.write("\n") + for mail in mbox: + mail_str = str(mail) + msg = email.message_from_string(mail_str[mail_str.find('X-Received'):]) # [1:] removes the leading '\n'. + addr = msg['From'] + site = 'http://www.' + addr.split('@')[1] + name = msg["From"].split('<')[0][:-1] + page.write("

{1}

\n".format(site, name)) # [1:] removes leading '>'. + page.write("") +print("HTML file written. Open links.html to view the contents.") diff --git a/README.md b/README.md index f4cb74f..ba168ad 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ * [Exercise 4-3: Mulithreading on Multicore System (threads_multicore.md)][4-3] * [Exercise 4-4-a: Simple Byte Count (bytes_count.py)][4-4-a] * [Exercise 4-4-b: Multithreaded Byte Count (mt_bytes_count.py)][4-4-b] + * Exercise 4-5: Threads, Files and Regex ([mt_simple_header_analysis.py][4-5-i], [simple_header_analysis.py][4-5-ii]) [chap4]: /Chap4 [e4-10]: /Chap4/mtsleepF.py @@ -27,3 +28,5 @@ [4-3]: /Chap4/threads_multicore.md [4-4-a]: /Chap4/bytes_count.py [4-4-b]: /Chap4/mt_bytes_count.py +[4-5-i]: /Chap4/mt_simple_header_analysis.py +[4-5-ii]: /Chap4/simple_header_analysis.py