diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 16191fe..ad7cced 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -1,4 +1,5 @@ import re +from regexps import regexps """ email_reply_parser is a python library port of GitHub's Email Reply Parser. @@ -12,17 +13,17 @@ class EmailReplyParser(object): """ @staticmethod - def read(text): + def read(text, locale='en'): """ Factory method that splits email into list of fragments text - A string email body Returns an EmailMessage instance """ - return EmailMessage(text).read() + return EmailMessage(text, locale).read() @staticmethod - def parse_reply(text): + def parse_reply(text, locale='en'): """ Provides the reply portion of email. text - A string email body @@ -36,17 +37,12 @@ class EmailMessage(object): """ An email message represents a parsed email body. """ - SIG_REGEX = r'(--|__|-\w)|(^Sent from my (\w+\s*){1,3})' - QUOTE_HDR_REGEX = r'^:etorw.*nO' - MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)' - QUOTED_REGEX = r'(>+)' - HEADER_REGEX = r'^(From|Sent|To|Subject): .+' - - def __init__(self, text): + def __init__(self, text, locale='en'): self.fragments = [] self.fragment = None self.text = text.replace('\r\n', '\n') self.found_visible = False + self.regexps = regexps(locale) def read(self): """ Creates new fragment for each line @@ -57,9 +53,9 @@ def read(self): self.found_visible = False - is_multi_quote_header = re.search(self.MULTI_QUOTE_HDR_REGEX, self.text, re.MULTILINE | re.DOTALL) + is_multi_quote_header = re.search(self.regexps['multi_quote_hdr'], self.text, re.MULTILINE | re.DOTALL) if is_multi_quote_header: - expr = re.compile(self.MULTI_QUOTE_HDR_REGEX, flags=re.DOTALL) + expr = re.compile(self.regexps['multi_quote_hdr'], flags=re.DOTALL) self.text = expr.sub( is_multi_quote_header.groups()[0].replace('\n', ''), self.text) @@ -92,11 +88,11 @@ def _scan_line(self, line): line - a row of text from an email message """ - 
# Name of the locale table; it lives in the same directory as this module.
_LOCALES_FILENAME = 'locales.yaml'

# Parsed ``regexps`` mapping from the locale table, filled on first use so the
# YAML file is read and parsed once instead of once per parsed message.
_regexps_by_locale = None


def regexps(locale):
    """Return the regular expressions used to parse e-mails in *locale*.

    locale - A locale code that is a key of the ``regexps`` mapping in
             locales.yaml (the table in this change defines 'en' and 'it')

    Returns a new dict mapping pattern names ('sig', 'quote_hdr',
    'multi_quote_hdr', 'quoted', 'header') to pattern strings. A copy is
    returned so callers cannot alter the cached table.

    Raises KeyError if *locale* is not defined in the table.
    """
    global _regexps_by_locale

    # Imported here so this block is self-contained: the module header only
    # imports ``yaml.load``.
    import os

    from yaml import safe_load

    if _regexps_by_locale is None:
        # Resolve the table next to this module rather than relative to the
        # current working directory, so the lookup also works when the caller
        # is not started from the repository root.
        # NOTE(review): for installed copies locales.yaml has to be shipped
        # with the package (e.g. setuptools ``package_data``) - confirm.
        path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            _LOCALES_FILENAME)
        # Binary mode lets PyYAML do its own encoding detection instead of
        # depending on the platform default encoding. ``safe_load`` only
        # builds plain Python objects, which is all this table needs, and
        # unlike a bare ``load`` it does not require an explicit Loader.
        with open(path, 'rb') as stream:
            _regexps_by_locale = safe_load(stream)['regexps']

    if locale not in _regexps_by_locale:
        raise KeyError('unknown locale %r (available: %s)'
                       % (locale, ', '.join(sorted(_regexps_by_locale))))
    return dict(_regexps_by_locale[locale])
a/test/test_email_reply_parser.py b/test/email_reply_parser_test.py similarity index 99% rename from test/test_email_reply_parser.py rename to test/email_reply_parser_test.py index 713ddfe..536fada 100644 --- a/test/test_email_reply_parser.py +++ b/test/email_reply_parser_test.py @@ -126,6 +126,5 @@ def get_email(self, name): text = f.read() return EmailReplyParser.read(text) - if __name__ == '__main__': unittest.main() diff --git a/test/emails/it/correct_sig.txt b/test/emails/it/correct_sig.txt new file mode 100644 index 0000000..4bb8a2a --- /dev/null +++ b/test/emails/it/correct_sig.txt @@ -0,0 +1,4 @@ +this is an email with a correct -- signature. + +-- +rick diff --git a/test/emails/it/email_1_1.txt b/test/emails/it/email_1_1.txt new file mode 100644 index 0000000..8b3ba6c --- /dev/null +++ b/test/emails/it/email_1_1.txt @@ -0,0 +1,13 @@ +Hi folks + +What is the best way to clear a Riak bucket of all key, values after +running a test? +I am currently using the Java HTTP API. + +-Abhishek Kona + + +_______________________________________________ +riak-users mailing list +riak-users@lists.basho.com +http://lists.basho.com/mailman/listinfo/riak-users_lists.basho.com diff --git a/test/emails/it/email_1_2.txt b/test/emails/it/email_1_2.txt new file mode 100644 index 0000000..991370d --- /dev/null +++ b/test/emails/it/email_1_2.txt @@ -0,0 +1,51 @@ +Hi, +Il giorno sabato 26 maggio 2012 16:58:20 UTC+2, Avv. Michele D'Auria ha scritto: +> Hi folks +> +> What is the best way to clear a Riak bucket of all key, values after +> running a test? +> I am currently using the Java HTTP API. + +You can list the keys for the bucket and call delete for each. Or if you +put the keys (and kept track of them in your test) you can delete them +one at a time (without incurring the cost of calling list first.) 
+ +Something like: + + String bucket = "my_bucket"; + BucketResponse bucketResponse = riakClient.listBucket(bucket); + RiakBucketInfo bucketInfo = bucketResponse.getBucketInfo(); + + for(String key : bucketInfo.getKeys()) { + riakClient.delete(bucket, key); + } + + +would do it. + +See also + +http://wiki.basho.com/REST-API.html#Bucket-operations + +which says + +"At the moment there is no straightforward way to delete an entire +Bucket. There is, however, an open ticket for the feature. To delete all +the keys in a bucket, you’ll need to delete them all individually." + +> +> -Abhishek Kona +> +> +> _______________________________________________ +> riak-users mailing list +> riak-users@lists.basho.com +> http://lists.basho.com/mailman/listinfo/riak-users_lists.basho.com + + + + +_______________________________________________ +riak-users mailing list +riak-users@lists.basho.com +http://lists.basho.com/mailman/listinfo/riak-users_lists.basho.com diff --git a/test/emails/it/email_1_3.txt b/test/emails/it/email_1_3.txt new file mode 100644 index 0000000..948461f --- /dev/null +++ b/test/emails/it/email_1_3.txt @@ -0,0 +1,55 @@ +Oh thanks. + +Having the function would be great. + +-Abhishek Kona + +Il giorno 12/dic/2015 11:35, "studiolaportamichele via Domiciliazioni Legali" ha scritto: +> Hi, +> On Tue, 2011-03-01 at 18:02 +0530, Abhishek Kona wrote: +>> Hi folks +>> +>> What is the best way to clear a Riak bucket of all key, values after +>> running a test? +>> I am currently using the Java HTTP API. +> You can list the keys for the bucket and call delete for each. Or if you +> put the keys (and kept track of them in your test) you can delete them +> one at a time (without incurring the cost of calling list first.) 
+> +> Something like: +> +> String bucket = "my_bucket"; +> BucketResponse bucketResponse = riakClient.listBucket(bucket); +> RiakBucketInfo bucketInfo = bucketResponse.getBucketInfo(); +> +> for(String key : bucketInfo.getKeys()) { +> riakClient.delete(bucket, key); +> } +> +> +> would do it. +> +> See also +> +> http://wiki.basho.com/REST-API.html#Bucket-operations +> +> which says +> +> "At the moment there is no straightforward way to delete an entire +> Bucket. There is, however, an open ticket for the feature. To delete all +> the keys in a bucket, you’ll need to delete them all individually." +> +>> -Abhishek Kona +>> +>> +>> _______________________________________________ +>> riak-users mailing list +>> riak-users@lists.basho.com +>> http://lists.basho.com/mailman/listinfo/riak-users_lists.basho.com +> + + +_______________________________________________ +riak-users mailing list +riak-users@lists.basho.com +http://lists.basho.com/mailman/listinfo/riak-users_lists.basho.com diff --git a/test/emails/it/email_1_4.txt b/test/emails/it/email_1_4.txt new file mode 100644 index 0000000..bfbf863 --- /dev/null +++ b/test/emails/it/email_1_4.txt @@ -0,0 +1,5 @@ +Awesome! I haven't had another problem with it. + +Il Domenica 6 Dicembre 2015 7:34, 'Elisa Di Maggio' via Domiciliazioni Legali ha scritto: + +> Loader seems to be working well. diff --git a/test/emails/it/email_1_5.txt b/test/emails/it/email_1_5.txt new file mode 100644 index 0000000..2498775 --- /dev/null +++ b/test/emails/it/email_1_5.txt @@ -0,0 +1,15 @@ +One: Here's what I've got. + +- This would be the first bullet point that wraps to the second line +to the next +- This is the second bullet point and it doesn't wrap +- This is the third bullet point and I'm having trouble coming up with enough +to say +- This is the fourth bullet point + +Two: +- Here is another bullet point +- And another one + +This is a paragraph that talks about a bunch of stuff. It goes on and on +for a while. 
diff --git a/test/emails/it/email_1_6.txt b/test/emails/it/email_1_6.txt new file mode 100644 index 0000000..f884ccc --- /dev/null +++ b/test/emails/it/email_1_6.txt @@ -0,0 +1,15 @@ +I get proper rendering as well. + +Sent from a magnificent torch of pixels + +Il 19/dic/2015 19:37, "Claudio Cardinali" + +ha scritto: + +> Was this caching related or fixed already? I get proper rendering here. +> +> ![](https://img.skitch.com/20111216-m9munqjsy112yqap5cjee5wr6c.jpg) +> +> --- +> Reply to this email directly or view it on GitHub: +> https://github.com/github/github/issues/2278#issuecomment-3182418 diff --git a/test/emails/it/email_1_7.txt b/test/emails/it/email_1_7.txt new file mode 100644 index 0000000..5d133ea --- /dev/null +++ b/test/emails/it/email_1_7.txt @@ -0,0 +1,11 @@ +:+1: + +Il 19/dic/2015 19:37, "Claudio Cardinali" ha scritto: + +> Steps 0-2 are in prod. Gonna let them sit for a bit then start cleaning up +> the old code with 3 & 4. +> +> +> Reply to this email directly or view it on GitHub. +> +> diff --git a/test/emails/it/email_2_1.txt b/test/emails/it/email_2_1.txt new file mode 100644 index 0000000..e919ea3 --- /dev/null +++ b/test/emails/it/email_2_1.txt @@ -0,0 +1,25 @@ +Outlook with a reply + + + ------------------------------ + +*Da:* Google Apps Sync Team [mailto:mail-noreply@google.com] +*Data:* Thursday, February 09, 2012 1:36 PM +*A:* jow@xxxx.com +*Ogg:* Google Apps Sync was updated! + + + +Dear Google Apps Sync user, + +Google Apps Sync for Microsoft Outlook® was recently updated. Your computer +now has the latest version (version 2.5). This release includes bug fixes +to improve product reliability. For more information about these and other +changes, please see the help article here: + +http://www.google.com/support/a/bin/answer.py?answer=153463 + +Sincerely, + +The Google Apps Sync Team. 
+ diff --git a/test/emails/it/email_BlackBerry.txt b/test/emails/it/email_BlackBerry.txt new file mode 100644 index 0000000..bdd3aa1 --- /dev/null +++ b/test/emails/it/email_BlackBerry.txt @@ -0,0 +1,3 @@ +Here is another email + +Inviato da BlackBerry diff --git a/test/emails/it/email_bullets.txt b/test/emails/it/email_bullets.txt new file mode 100644 index 0000000..f103052 --- /dev/null +++ b/test/emails/it/email_bullets.txt @@ -0,0 +1,22 @@ +test 2 this should list second + +and have spaces + +and retain this formatting + + + - how about bullets + - and another + + +Il 19/dic/2015 19:37, "Claudio Cardinali" ha scritto: + +> Give us an example of how you applied what they learned to achieve +> something in your organization + + + + +-- + +*Joe Smith | Director, Product Management* diff --git a/test/emails/it/email_headers_no_delimiter.txt b/test/emails/it/email_headers_no_delimiter.txt new file mode 100644 index 0000000..4878cac --- /dev/null +++ b/test/emails/it/email_headers_no_delimiter.txt @@ -0,0 +1,15 @@ +And another reply! + +Da: Dan Watson [mailto:user@host.com] +Data: Monday, November 26, 2012 10:48 AM +A: Watson, Dan +Ogg: Re: New Issue + +A reply + +-- +Sent from my iPhone + +Il 19/dic/2015 19:37, "Claudio Cardinali" ha scritto: +This is a message. +With a second line. 
diff --git a/test/emails/it/email_iPhone.txt b/test/emails/it/email_iPhone.txt new file mode 100644 index 0000000..81fee8f --- /dev/null +++ b/test/emails/it/email_iPhone.txt @@ -0,0 +1,3 @@ +Here is another email + +Inviato da iPhone diff --git a/test/emails/it/email_multi_word_sent_from_my_mobile_device.txt b/test/emails/it/email_multi_word_sent_from_my_mobile_device.txt new file mode 100644 index 0000000..63825a9 --- /dev/null +++ b/test/emails/it/email_multi_word_sent_from_my_mobile_device.txt @@ -0,0 +1,3 @@ +Here is another email + +Inviato da Verizon Wireless BlackBerry diff --git a/test/emails/it/email_one_is_not_on.txt b/test/emails/it/email_one_is_not_on.txt new file mode 100644 index 0000000..3ebf136 --- /dev/null +++ b/test/emails/it/email_one_is_not_on.txt @@ -0,0 +1,10 @@ +Thank, this is really helpful. + +One outstanding question I had: + +Locally (on development), when I run... + +Il 19/dic/2015 19:37, "Claudio Cardinali" ha scritto: + +> The good news is that I've found a much better query for lastLocation. +> diff --git a/test/emails/it/email_partial_quote_header.txt b/test/emails/it/email_partial_quote_header.txt new file mode 100644 index 0000000..974c4f9 --- /dev/null +++ b/test/emails/it/email_partial_quote_header.txt @@ -0,0 +1,13 @@ +On your remote host you can run: + + telnet 127.0.0.1 52698 + +This should connect to TextMate (on your Mac, via the tunnel). If that +fails, the tunnel is not working. + +Il 19/dic/2015 19:37, "Claudio Cardinali" ha scritto: + +> I am having an odd issue wherein suddenly port forwarding stopped +> working in a particular scenario for me. 
class ItEmailMessageTest(unittest.TestCase):
    """Checks EmailReplyParser against the Italian fixtures, using locale 'it'.

    Every test loads its fixture through get_email(), so each message is
    read from the ``emails/it`` directory and parsed with the Italian
    regular expressions.
    """

    # Fixture directory resolved from this file, so the suite can be started
    # from any working directory (the module already locates the package the
    # same way through sys.path).
    EMAILS_DIR = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'emails', 'it')

    def test_simple_body(self):
        message = self.get_email('email_1_1')

        self.assertEqual(3, len(message.fragments))
        self.assertEqual(
            [False, True, True],
            [f.signature for f in message.fragments]
        )
        self.assertEqual(
            [False, True, True],
            [f.hidden for f in message.fragments]
        )
        self.assertIn("folks", message.fragments[0].content)
        self.assertIn("riak-users", message.fragments[2].content)

    def test_reads_bottom_message(self):
        message = self.get_email('email_1_2')

        self.assertEqual(6, len(message.fragments))
        self.assertEqual(
            [False, True, False, True, False, False],
            [f.quoted for f in message.fragments]
        )

        self.assertEqual(
            [False, False, False, False, False, True],
            [f.signature for f in message.fragments]
        )

        self.assertEqual(
            [False, False, False, True, True, True],
            [f.hidden for f in message.fragments]
        )

        self.assertIn("Hi,", message.fragments[0].content)
        self.assertIn("Il", message.fragments[1].content)
        self.assertIn(">", message.fragments[3].content)
        self.assertIn("riak-users", message.fragments[5].content)

    def test_reads_top_post(self):
        message = self.get_email('email_1_3')

        self.assertEqual(5, len(message.fragments))

    def test_multiline_reply_headers(self):
        message = self.get_email('email_1_6')
        self.assertIn('I get', message.fragments[0].content)
        self.assertIn('Il', message.fragments[1].content)

    def test_captures_date_string(self):
        message = self.get_email('email_1_4')

        self.assertIn('Awesome', message.fragments[0].content)
        self.assertIn('Il', message.fragments[1].content)
        self.assertIn('Loader', message.fragments[1].content)

    def test_complex_body_with_one_fragment(self):
        message = self.get_email('email_1_5')

        self.assertEqual(1, len(message.fragments))

    def test_verify_reads_signature_correct(self):
        message = self.get_email('correct_sig')
        self.assertEqual(2, len(message.fragments))

        self.assertEqual(
            [False, False],
            [f.quoted for f in message.fragments]
        )

        self.assertEqual(
            [False, True],
            [f.signature for f in message.fragments]
        )

        self.assertEqual(
            [False, True],
            [f.hidden for f in message.fragments]
        )

        self.assertIn('--', message.fragments[1].content)

    def test_deals_with_windows_line_endings(self):
        msg = self.get_email('email_1_7')

        self.assertIn(':+1:', msg.fragments[0].content)
        self.assertIn('Il', msg.fragments[1].content)
        self.assertIn('Steps 0-2', msg.fragments[1].content)

    def test_reply_is_parsed(self):
        message = self.get_email('email_1_2')
        self.assertIn("You can list the keys for the bucket", message.reply)

    def test_sent_from_iphone(self):
        # Uses the Italian fixture and locale: the mobile signature line
        # "Inviato da iPhone" must not be part of the visible reply.
        message = self.get_email('email_iPhone')
        self.assertNotIn("Inviato da iPhone", message.reply)

    def test_email_one_is_not_on(self):
        # Uses the Italian fixture and locale: the "Il ... ha scritto:" quote
        # header must not leak into the visible reply.
        message = self.get_email('email_one_is_not_on')
        self.assertNotIn(
            'Il 19/dic/2015 19:37, "Claudio Cardinali" ha scritto:',
            message.reply)

    def test_partial_quote_header(self):
        message = self.get_email('email_partial_quote_header')
        self.assertIn("On your remote host you can run:", message.reply)
        self.assertIn("telnet 127.0.0.1 52698", message.reply)
        self.assertIn("This should connect to TextMate", message.reply)

    def test_email_headers_no_delimiter(self):
        message = self.get_email('email_headers_no_delimiter')
        self.assertEqual(message.reply.strip(), 'And another reply!')

    def get_email(self, name):
        """ Return EmailMessage instance

        name - Base name (without '.txt') of a fixture in the Italian
               fixture directory

        The fixture is parsed with locale 'it'.
        """
        with open(os.path.join(self.EMAILS_DIR, '%s.txt' % name)) as f:
            text = f.read()
        return EmailReplyParser.read(text, 'it')