Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Inbound email: download from S3 + convert HTML to plaintext #5348

Merged
merged 7 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
129 changes: 115 additions & 14 deletions engine/apps/email/inbound.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from anymail.inbound import AnymailInboundMessage
from anymail.signals import AnymailInboundEvent
from anymail.webhooks import amazon_ses, mailgun, mailjet, mandrill, postal, postmark, sendgrid, sparkpost
from bs4 import BeautifulSoup
from django.conf import settings
from django.http import HttpResponse, HttpResponseNotAllowed
from django.utils import timezone
from rest_framework import status
Expand All @@ -25,6 +27,15 @@ class AmazonSESValidatedInboundWebhookView(amazon_ses.AmazonSESInboundWebhookVie
# disable "Your Anymail webhooks are insecure and open to anyone on the web." warning
warn_if_no_basic_auth = False

def __init__(self):
super().__init__(
session_params={
"aws_access_key_id": settings.INBOUND_EMAIL_AWS_ACCESS_KEY_ID,
"aws_secret_access_key": settings.INBOUND_EMAIL_AWS_SECRET_ACCESS_KEY,
"region_name": settings.INBOUND_EMAIL_AWS_REGION,
},
)

Comment on lines +30 to +38
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

adding AWS creds so OnCall can download emails from S3

def validate_request(self, request):
"""Add SNS message validation to Amazon SES inbound webhook view, which is not implemented in Anymail."""
if not validate_amazon_sns_message(self._parse_sns_message(request)):
Expand Down Expand Up @@ -74,11 +85,10 @@ def dispatch(self, request):
if request.method.lower() == "head":
return HttpResponse(status=status.HTTP_200_OK)

integration_token = self.get_integration_token_from_request(request)
if integration_token is None:
if self.integration_token is None:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

a little bit of refactoring: instead of having a self.get_integration_token_from_request(request) method, it's now just a property self.integration_token. no functional changes, just looks better IMO

return HttpResponse(status=status.HTTP_400_BAD_REQUEST)
request.inbound_email_integration_token = integration_token # used in RequestTimeLoggingMiddleware
return super().dispatch(request, alert_channel_key=integration_token)
request.inbound_email_integration_token = self.integration_token # used in RequestTimeLoggingMiddleware
return super().dispatch(request, alert_channel_key=self.integration_token)

def post(self, request):
payload = self.get_alert_payload_from_email_message(self.message)
Expand All @@ -94,7 +104,8 @@ def post(self, request):
)
return Response("OK", status=status.HTTP_200_OK)

def get_integration_token_from_request(self, request) -> Optional[str]:
@cached_property
def integration_token(self) -> Optional[str]:
if not self.message:
return None
# First try envelope_recipient field.
Expand Down Expand Up @@ -151,7 +162,8 @@ def message(self) -> AnymailInboundMessage | None:
logger.error("Failed to parse inbound email message")
return None

def check_inbound_email_settings_set(self):
@staticmethod
def check_inbound_email_settings_set():
"""
Guard method to checks if INBOUND_EMAIL settings present.
Returns InternalServerError if not.
Expand All @@ -167,16 +179,105 @@ def check_inbound_email_settings_set(self):
logger.error("InboundEmailWebhookView: INBOUND_EMAIL_DOMAIN env variable must be set.")
return HttpResponse(status=status.HTTP_500_INTERNAL_SERVER_ERROR)

def get_alert_payload_from_email_message(self, email: AnymailInboundMessage) -> EmailAlertPayload:
subject = email.subject or ""
subject = subject.strip()
message = email.text or ""
message = message.strip()
sender = self.get_sender_from_email_message(email)
@classmethod
def get_alert_payload_from_email_message(cls, email: AnymailInboundMessage) -> EmailAlertPayload:
if email.text:
message = email.text.strip()
elif email.html:
message = cls.html_to_plaintext(email.html)
Comment on lines +186 to +187
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is the main change of the PR. If there's no plaintext representation for an email, we'll convert it from HTML to plaintext in the html_to_plaintext method below

else:
message = ""

return {
"subject": email.subject.strip() if email.subject else "",
"message": message,
"sender": cls.get_sender_from_email_message(email),
}

@staticmethod
def html_to_plaintext(html: str) -> str:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yikes, converting HTML to plaintext looks complicated! 😬

Copy link
Member Author

@vadimkerr vadimkerr Dec 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for the super quick review @joeyorlando ❤️
Yes converting HTML to plaintext is non-trivial (and technically not possible) because HTML is basically a "picture", not text. I added this mostly for backward compatibility so I tried to keep it similar to how Mailgun does it. Also see #5348 (comment)

"""
Converts HTML to plain text. Renders links as "text (href)" and removes any empty lines.
Converting HTML to plaintext is a non-trivial task, so this method may not work perfectly for all cases.
"""
Comment on lines +198 to +202
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are ESPs such as Mailgun that provide the same HTML -> plaintext functionality but there are also ESPs such as AWS SES that don't do it. In order to keep things relatively backward compatible when migrating from Mailgun to AWS SES, we want to convert HTML to plaintext in OnCall (and do it at least somewhat similarly to how Mailgun does it).

soup = BeautifulSoup(html, "html.parser")

# Browsers typically render these elements on their own line.
# There is no single official HTML5 list for this, so we go with HTML tags that render as
# display: block, display: list-item, display: table, display: table-row by default according to the HTML standard:
# https://html.spec.whatwg.org/multipage/rendering.html
newline_tags = [
"address",
"article",
"aside",
"blockquote",
"body",
"center",
"dd",
"details",
"dialog",
"dir",
"div",
"dl",
"dt",
"fieldset",
"figcaption",
"figure",
"footer",
"form",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"header",
"hgroup",
"hr",
"html",
"legend",
"li",
"listing",
"main",
"menu",
"nav",
"ol",
"p",
"plaintext",
"pre",
"search",
"section",
"summary",
"table",
"tr",
"ul",
"xmp",
]
# Insert a newline after each block-level element
for tag in soup.find_all(newline_tags):
tag.insert_before("\n")
tag.insert_after("\n")

# <br> tags are also typically rendered as newlines
for br in soup.find_all("br"):
br.replace_with("\n")

# example: "<a href="https://example.com">example</a>" -> "example (https://example.com)"
for a in soup.find_all("a"):
if href := a.get("href"):
a.append(f" ({href})")

for li in soup.find_all("li"):
li.insert_before("* ")

for hr in soup.find_all("hr"):
hr.replace_with("-" * 32)

return {"subject": subject, "message": message, "sender": sender}
# remove empty lines
return "\n".join(line.strip() for line in soup.get_text().splitlines() if line.strip())

def get_sender_from_email_message(self, email: AnymailInboundMessage) -> str:
@staticmethod
def get_sender_from_email_message(email: AnymailInboundMessage) -> str:
try:
if isinstance(email.from_email, list):
sender = email.from_email[0].addr_spec
Expand Down
Loading
Loading