Skip to content

Commit

Permalink
Complete Exercise 4-5: Threads, Files and Regex
Browse files Browse the repository at this point in the history
  • Loading branch information
schedutron committed Aug 8, 2017
1 parent f36e041 commit 5f89546
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,5 @@ email
my_mbox.txt
learn/
Chap4/bytes_file
Chap4/links.html
Chap4/links_v2.html
51 changes: 51 additions & 0 deletions Chap4/mt_simple_header_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/env python3
import linecache, re, sys
from threading import Lock, Thread


def render_HTML(start, end):
    """Append one HTML link per ``From:`` header found in a range of lines.

    Reads lines ``start`` (inclusive) to ``end`` (exclusive) of the module
    global ``filename`` through ``linecache`` and matches each against the
    module global ``EMAIL_PATT``.  For every match, a ``<p><a ...>`` entry
    linking to ``http://www.<domain>`` and labelled with the sender name is
    appended to ``links_v2.html``.

    Args:
        start: First line number to inspect (as understood by
            ``linecache.getline``, i.e. 1-based).
        end: Line number at which to stop (not inspected).

    Returns:
        None.  The only effect is the text appended to ``links_v2.html``.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    from html import escape

    entries = []
    for lineno in range(start, end):
        # linecache.getline() returns '' for out-of-range line numbers, so
        # an over-long range is harmless: '' simply fails to match.
        match = EMAIL_PATT.match(linecache.getline(filename, lineno))
        if not match:
            continue
        sender, addr = match.groups()
        _, at_sign, domain = addr.rpartition('@')
        if not at_sign or not domain:
            # Not a usable e-mail address; skip it rather than letting an
            # IndexError kill this worker thread.
            continue
        site = 'http://www.' + domain
        # Header text comes from the mail file, so escape it before
        # embedding it in markup (quote=True also escapes the quotes that
        # delimit the href attribute).
        entries.append(
            "<p><a href='{0}'>{1}</a></p>\n".format(
                escape(site, quote=True), escape(sender)
            )
        )

    if not entries:
        return

    # Open, write and close while holding the lock.  Each thread used to
    # keep its own buffered handle and lock only write(); the buffers were
    # flushed outside the lock, so partial lines from different threads
    # could interleave in the output file.
    with lock:
        with open("links_v2.html", "a") as page:
            page.writelines(entries)


# Shared state read by render_HTML() as module globals.
EMAIL_PATT = re.compile(r"From: (.+) <(\S+)>")
lock = Lock()
filename = None  # Assigned in main() from the command line.


def chunk_bounds(first, stop, parts):
    """Split the half-open range [first, stop) into contiguous chunks.

    Args:
        first: First value covered.
        stop: Value just past the last one covered.
        parts: Number of chunks to produce; must be at least 1.

    Returns:
        A list of exactly ``parts`` ``(lo, hi)`` half-open pairs that cover
        ``[first, stop)`` without gaps or overlap.  Chunk sizes differ by at
        most one, with the larger chunks first.
    """
    base, extra = divmod(stop - first, parts)
    bounds = []
    lo = first
    for k in range(parts):
        hi = lo + base + (1 if k < extra else 0)
        bounds.append((lo, hi))
        lo = hi
    return bounds


def main():
    """Build links_v2.html from a mail file using several worker threads.

    Command line: ``FILE NTHREADS``.  The first line of FILE must hold the
    number of lines in the file.  Exits with an error message when the
    arguments are missing or invalid, or when there are more threads than
    lines.
    """
    global filename  # render_HTML() looks this name up at module level.

    if len(sys.argv) != 3:
        sys.exit("Usage: {0} FILE NTHREADS".format(sys.argv[0]))
    filename = sys.argv[1]
    try:
        nthreads = int(sys.argv[2])
    except ValueError:
        sys.exit("NTHREADS must be an integer.")
    if nthreads < 1:
        # Zero used to raise ZeroDivisionError and a negative value used to
        # spawn threads forever.
        sys.exit("NTHREADS must be at least 1.")

    with open(filename) as f:
        header = f.readline().strip()  # Assumes 1st line to be # of lines in the file.
    try:
        nlines = int(header)
    except ValueError:
        sys.exit("First line of the file must be the number of lines.")
    if nlines < nthreads:
        sys.exit("Too many threads for processing file. Use less threads.")

    with open("links_v2.html", "w") as page:
        page.write("<html>\n")

    # linecache line numbers are 1-based and line 1 is the count header, so
    # data starts at line 2.  It is not known here whether the count
    # includes the header line itself; covering lines 2..nlines+1 is right
    # either way because linecache yields '' past the end of the file.
    threads = [
        Thread(target=render_HTML, args=bounds)
        for bounds in chunk_bounds(2, nlines + 2, nthreads)
    ]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

    with open("links_v2.html", "a") as page:
        page.write("</html>")
    print("HTML file written. Open links_v2.html to view the contents.")


if __name__ == "__main__":
    main()
16 changes: 16 additions & 0 deletions Chap4/simple_header_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/usr/bin/env python3
import email, mailbox, re

from email.utils import parseaddr
from html import escape


def render_link(from_header):
    """Return the HTML link line for one ``From`` header value.

    Args:
        from_header: Value of a message's ``From`` header, or None when the
            header is absent.

    Returns:
        A ``<p><a href='http://www.<domain>'>name</a></p>`` line ending in a
        newline, or None when no address with a domain can be extracted.
        The display name is used as link text, falling back to the address
        itself when there is no name.
    """
    if from_header is None:
        return None
    # parseaddr() separates the display name from the address, so the
    # domain no longer carries the trailing '>' that a plain split('@') on
    # "Name <user@host>" left in the URL.
    name, addr = parseaddr(str(from_header))
    _, at_sign, domain = addr.rpartition('@')
    if not at_sign or not domain:
        return None
    site = 'http://www.' + domain
    # Header text comes from the mailbox, so escape it before embedding it
    # in markup; the href is quoted so the attribute value is delimited.
    return "<p><a href='{0}'>{1}</a></p>\n".format(
        escape(site, quote=True), escape(name or addr)
    )


def main():
    """Write links.html with one link per message in the mbox file."""
    mbox = mailbox.mbox("../Chap3/my_mbox.txt")
    with open("links.html", "w") as page:
        page.write("<html>\n")
        for mail in mbox:
            mail_str = str(mail)
            # Re-parse starting at the 'X-Received' header, as before.  If
            # that header is missing, find() returns -1 and slicing from -1
            # would keep only the last character, so parse the whole text.
            offset = mail_str.find('X-Received')
            if offset != -1:
                mail_str = mail_str[offset:]
            msg = email.message_from_string(mail_str)
            link = render_link(msg['From'])
            if link is not None:
                page.write(link)
        page.write("</html>")
    print("HTML file written. Open links.html to view the contents.")


if __name__ == "__main__":
    main()
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
* [Exercise 4-3: Multithreading on Multicore System (threads_multicore.md)][4-3]
* [Exercise 4-4-a: Simple Byte Count (bytes_count.py)][4-4-a]
* [Exercise 4-4-b: Multithreaded Byte Count (mt_bytes_count.py)][4-4-b]
* Exercise 4-5: Threads, Files and Regex ([mt_simple_header_analysis.py][4-5-i], [simple_header_analysis.py][4-5-ii])

[chap4]: /Chap4
[e4-10]: /Chap4/mtsleepF.py
Expand All @@ -27,3 +28,5 @@
[4-3]: /Chap4/threads_multicore.md
[4-4-a]: /Chap4/bytes_count.py
[4-4-b]: /Chap4/mt_bytes_count.py
[4-5-i]: /Chap4/mt_simple_header_analysis.py
[4-5-ii]: /Chap4/simple_header_analysis.py

0 comments on commit 5f89546

Please sign in to comment.