From 5f89546cb054a896de6fd4480440ebe63f2b42dc Mon Sep 17 00:00:00 2001
From: "Saurabh S. Chaturvedi" <saurabh.chaturvedi63@gmail.com>
Date: Tue, 8 Aug 2017 20:32:03 +0530
Subject: [PATCH] Complete Exercise 4-5: Threads, Files and Regex

---
 .gitignore                         |  2 ++
 Chap4/mt_simple_header_analysis.py | 51 ++++++++++++++++++++++++++++++
 Chap4/simple_header_analysis.py    | 16 ++++++++++
 README.md                          |  3 ++
 4 files changed, 72 insertions(+)
 create mode 100644 Chap4/mt_simple_header_analysis.py
 create mode 100644 Chap4/simple_header_analysis.py
diff --git a/.gitignore b/.gitignore
index f817c47..d961667 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,5 @@ email
 my_mbox.txt
 learn/
 Chap4/bytes_file
+Chap4/links.html
+Chap4/links_v2.html
diff --git a/Chap4/mt_simple_header_analysis.py b/Chap4/mt_simple_header_analysis.py
new file mode 100644
index 0000000..5197ac7
--- /dev/null
+++ b/Chap4/mt_simple_header_analysis.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+import linecache, re, sys
+from threading import Lock, Thread
+
+
+def render_HTML(start, end):
+    """Append a link to links_v2.html for each 'From:' header on lines [start, end) of filename."""
+    from html import escape  # Header text is untrusted; escape it before embedding in HTML.
+    with open("links_v2.html", "a") as page:
+        for i in range(start, end):
+            match = EMAIL_PATT.match(linecache.getline(filename, i))
+            if match and '@' in match.group(2):  # Skip headers without a usable address.
+                sender, addr = match.groups()
+                site = 'http://www.' + addr.split('@')[1]
+                with lock:  # Every thread has its own buffered handle: flush before unlocking.
+                    page.write("<p><a href='{0}'>{1}</a></p>\n".format(escape(site), escape(sender)))
+                    page.flush()
+
+
+filename = sys.argv[1]
+nthreads = int(sys.argv[2])
+EMAIL_PATT = re.compile(r"From: (.+) <(\S+)>")
+lock = Lock()  # Shared by the workers to serialise their appends to links_v2.html.
+
+with open(filename) as f:
+    nlines = int(f.readline().strip())  # Assumes 1st line to be # of lines in the file.
+if nthreads < 1:
+    sys.exit("Need at least one thread.")
+if nlines < nthreads:
+    sys.exit("Too many threads for processing file. Use less threads.")
+
+with open("links_v2.html", "w") as f:  # Start afresh; the workers reopen it in append mode.
+    f.write("<html>\n")
+
+# Spawn exactly nthreads workers over lines 1..nlines (linecache counts from 1);
+# the last worker also takes the remainder left over by the integer division.
+chunksize = nlines // nthreads
+threads = []
+for n in range(nthreads):
+    first = n * chunksize + 1
+    stop = nlines + 1 if n == nthreads - 1 else first + chunksize
+    thread = Thread(target=render_HTML, args=(first, stop))
+    thread.start()
+    threads.append(thread)
+
+for thread in threads:
+    thread.join()
+
+with open("links_v2.html", "a") as f:
+    f.write("</html>")
+print("HTML file written. Open links_v2.html to view the contents.")
diff --git a/Chap4/simple_header_analysis.py b/Chap4/simple_header_analysis.py
new file mode 100644
index 0000000..9c6c32d
--- /dev/null
+++ b/Chap4/simple_header_analysis.py
@@ -0,0 +1,16 @@
+#!/usr/bin/env python3
+import email, email.utils, html, mailbox, re
+
+mbox = mailbox.mbox("../Chap3/my_mbox.txt")
+with open("links.html", "w") as page:
+    page.write("<html>\n")
+    for mail in mbox:
+        mail_str = str(mail)
+        # Re-parse from the first 'X-Received' header; if there is none, parse the whole message.
+        msg = email.message_from_string(mail_str[max(mail_str.find('X-Received'), 0):])
+        name, addr = email.utils.parseaddr(msg['From'] or '')  # Drops the '<' '>' around the address.
+        if '@' in addr:  # Skip messages without a usable sender address.
+            site = html.escape('http://www.' + addr.split('@')[1])
+            page.write("<p><a href='{0}'>{1}</a></p>\n".format(site, html.escape(name or addr)))
+    page.write("</html>")
+print("HTML file written. Open links.html to view the contents.")
diff --git a/README.md b/README.md
index f4cb74f..ba168ad 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,7 @@
     * [Exercise 4-3: Mulithreading on Multicore System (threads_multicore.md)][4-3]
     * [Exercise 4-4-a: Simple Byte Count (bytes_count.py)][4-4-a]
     * [Exercise 4-4-b: Multithreaded Byte Count (mt_bytes_count.py)][4-4-b]
+    * Exercise 4-5: Threads, Files and Regex ([simple_header_analysis.py][4-5-i], [mt_simple_header_analysis.py][4-5-ii])
 
 [chap4]: /Chap4
 [e4-10]: /Chap4/mtsleepF.py
@@ -27,3 +28,5 @@
 [4-3]: /Chap4/threads_multicore.md
 [4-4-a]: /Chap4/bytes_count.py
 [4-4-b]: /Chap4/mt_bytes_count.py
+[4-5-i]: /Chap4/simple_header_analysis.py
+[4-5-ii]: /Chap4/mt_simple_header_analysis.py