Files
2025-03-31 11:03:35 -07:00

740 lines
24 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import math
import re
from abc import ABC, abstractmethod
from functools import lru_cache
from html import unescape
from os import path
from jinja2 import Environment, FileSystemLoader, select_autoescape
from markupsafe import Markup
from notifications_utils import MAGIC_SEQUENCE, SMS_CHAR_COUNT_LIMIT
from notifications_utils.field import Field, PlainTextField
from notifications_utils.formatters import (
add_prefix,
add_trailing_newline,
autolink_urls,
escape_html,
make_quotes_smart,
nl2br,
normalise_multiple_newlines,
normalise_whitespace,
normalise_whitespace_and_newlines,
remove_smart_quotes_from_email_addresses,
remove_whitespace_before_punctuation,
replace_hyphens_with_en_dashes,
sms_encode,
strip_leading_whitespace,
strip_unsupported_characters,
unlink_govuk_escaped,
)
from notifications_utils.insensitive_dict import InsensitiveDict
from notifications_utils.markdown import (
notify_email_markdown,
notify_email_preheader_markdown,
notify_plain_text_email_markdown,
)
from notifications_utils.sanitise_text import SanitiseSMS
from notifications_utils.take import Take
from notifications_utils.template_change import TemplateChange
# Shared Jinja environment used by the template classes in this module.
# Templates are loaded from the "jinja_templates" directory located next to
# this file.
# NOTE(review): select_autoescape() is called with its default arguments —
# confirm those defaults cover the ".jinja2" extension used by the templates
# loaded below; the classes in this file escape values by hand.
template_env = Environment(
    autoescape=select_autoescape(),
    loader=FileSystemLoader(
        path.join(
            path.dirname(path.abspath(__file__)),
            "jinja_templates",
        )
    ),
)
class Template(ABC):
    """
    Abstract base for every template type in this module.

    Wraps the raw template dict, exposes its content and personalisation
    values, and provides helpers for inspecting placeholders. Subclasses
    must define a `template_type` attribute and implement `__str__`.
    """

    encoding = "utf-8"

    def __init__(
        self,
        template,
        values=None,
        redact_missing_personalisation=False,
    ):
        """
        :param template: dict holding at least `content` and `template_type`
        :param values: optional dict of personalisation values
        :param redact_missing_personalisation: passed through to `Field`
            when the content is rendered
        :raises TypeError: if `template` or `values` has the wrong type, or
            the dict's `template_type` does not match this class
        """
        if not isinstance(template, dict):
            raise TypeError("Template must be a dict")

        if not (values is None or isinstance(values, dict)):
            raise TypeError("Values must be a dict")

        declared_type = template.get("template_type")
        if declared_type != self.template_type:
            raise TypeError(
                f"Cannot initialise {self.__class__.__name__} "
                f"with {declared_type} template_type"
            )

        self.id = template.get("id", None)
        self.name = template.get("name", None)
        self.content = template["content"]
        # Goes through the `values` property setter, which reads the
        # placeholders of the content assigned just above.
        self.values = values
        self._template = template
        self.redact_missing_personalisation = redact_missing_personalisation

    def __repr__(self):
        return f'{self.__class__.__name__}("{self.content}", {self.values})'

    @abstractmethod
    def __str__(self):
        pass

    @property
    def content_with_placeholders_filled_in(self):
        """The content with values substituted, stripped of outer whitespace."""
        filled_in = Field(
            self.content,
            self.values,
            html="passthrough",
            redact_missing_personalisation=self.redact_missing_personalisation,
            markdown_lists=True,
        )
        return str(filled_in).strip()

    @property
    def values(self):
        """The stored personalisation, or an empty dict if none was set yet."""
        return getattr(self, "_values", {})

    @values.setter
    def values(self, value):
        if not value:
            self._values = {}
            return

        known = InsensitiveDict.from_keys(self.placeholders)
        # Keys that do not correspond to any placeholder are kept as given.
        unmatched_keys = {
            key
            for key in value.keys()
            if InsensitiveDict.make_key(key) not in known.keys()
        }
        self._values = InsensitiveDict(value).as_dict_with_keys(
            self.placeholders | unmatched_keys
        )

    @property
    def placeholders(self):
        """Placeholders found in the template content."""
        return get_placeholders(self.content)

    @property
    def missing_data(self):
        """Placeholders for which no value (or a `None` value) is stored."""
        return [
            placeholder
            for placeholder in self.placeholders
            if self.values.get(placeholder) is None
        ]

    @property
    def additional_data(self):
        """Keys present in the values that are not placeholders."""
        provided_keys = self.values.keys()
        return provided_keys - self.placeholders

    def get_raw(self, key, default=None):
        """Look `key` up in the original template dict."""
        return self._template.get(key, default)

    def compare_to(self, new):
        """Return a `TemplateChange` between this template and `new`."""
        return TemplateChange(self, new)

    @property
    def content_count(self):
        """Number of characters in the content once placeholders are filled in."""
        return len(self.content_with_placeholders_filled_in)

    def is_message_empty(self):
        """Whether the rendered message would have no content."""
        if not self.content:
            return True

        wrapped_in_placeholders = self.content.startswith(
            "(("
        ) and self.content.endswith("))")
        if not wrapped_in_placeholders:
            # If the content doesn't both start and end with a placeholder
            # we can guarantee it's not empty, no matter what
            # personalisation has been provided.
            return False

        return self.content_count == 0

    def is_message_too_long(self):
        """The base template imposes no length limit."""
        return False
class BaseSMSTemplate(Template):
    """
    Shared behaviour for SMS-style templates: an optional prefix, a cached
    character count, and fragment counting.
    """

    template_type = "sms"

    # Matches a message made up solely of characters treated as GSM-7 by
    # `fragment_count`. Compiled once here rather than on every access.
    _GSM_CHARACTERS_PATTERN = re.compile(
        r'^[\sa-zA-Z0-9_@?£!1$"¥#è?¤é%ù&ì\\ò(Ç)*:Ø+;ÄäøÆ,<LÖlöæ\-=ÑñÅß.>ÜüåÉ/§à¡¿\']+$'
    )

    def __init__(
        self,
        template,
        values=None,
        prefix=None,
        show_prefix=True,
        sender=None,
    ):
        """
        :param template: dict describing the template (see `Template`)
        :param values: optional dict of personalisation values
        :param prefix: text placed in front of the message content
        :param show_prefix: when false, `prefix` reads back as `None`
        :param sender: stored as-is on the instance
        """
        self.prefix = prefix
        self.show_prefix = show_prefix
        self.sender = sender
        # Must exist before super().__init__ runs, because that assigns
        # `values`, whose setter resets this cache.
        self._content_count = None
        super().__init__(template, values)

    @property
    def values(self):
        return super().values

    @values.setter
    def values(self, value):
        # If we change the values of the template it's possible the
        # content count will have changed, so we need to reset the
        # cached count.
        self._content_count = None

        # Assigning to super().values doesn't work here. We need to get
        # the property object instead, which has the special method
        # fset, which invokes the setter as if we were assigning to it
        # outside this class.
        super(BaseSMSTemplate, type(self)).values.fset(self, value)

    @property
    def content_with_placeholders_filled_in(self):
        # We always call SMSMessageTemplate.__str__ regardless of
        # subclass, to avoid any HTML formatting. SMS templates differ
        # in that the content can include the service name as a prefix.
        # So historically we've returned the fully-formatted message,
        # rather than some plain-text representation of the content. To
        # preserve compatibility for consumers of the API we maintain
        # that behaviour by overriding this method here.
        return SMSMessageTemplate.__str__(self)

    @property
    def prefix(self):
        """The configured prefix, or `None` when `show_prefix` is false."""
        return self._prefix if self.show_prefix else None

    @prefix.setter
    def prefix(self, value):
        self._prefix = value
        # The cached character count includes the prefix, so a count
        # computed before this assignment would be stale.
        self._content_count = None

    @property
    def content_count(self):
        """
        Return the number of characters in the message, as produced by
        `_get_unsanitised_content`. The result is cached until `values` or
        `prefix` is reassigned.

        Note that we don't distinguish between GSM and non-GSM characters at
        this point, as `get_sms_fragment_count` handles that separately.

        NOTE(review): this docstring previously stated that unsubstituted
        placeholders are counted at their raw length (`foo ((placeholder))`
        having a length of 19). `_get_unsanitised_content` substitutes
        MAGIC_SEQUENCE for placeholders and then removes it, so confirm
        which behaviour is intended.
        """
        if self._content_count is None:
            self._content_count = len(self._get_unsanitised_content())
        return self._content_count

    @property
    def content_count_without_prefix(self):
        """Character count excluding the prefix and its separator."""
        if not self.prefix:
            return self.content_count

        # Subtract 2 extra characters to account for the colon and the
        # space. Clamp at zero in case the content is empty and the
        # surrounding whitespace has been stripped.
        return max(self.content_count - len(self.prefix) - 2, 0)

    @property
    def fragment_count(self):
        """
        Number of SMS fragments needed for the rendered message.

        The message is checked against the GSM-7 character pattern and its
        length in characters is divided according to multipart rules:
        160 characters for a single GSM message or 153 per part, otherwise
        70 characters for a single message or 67 per part. ASCII is not
        specifically called out as almost all messages will switch from
        7bit GSM to Unicode. An empty message yields 0.

        Calculations are based on
        https://messente.com/documentation/tools/sms-length-calculator
        """
        message_str = self.content_with_placeholders_filled_in
        content_len = len(message_str)

        if self._GSM_CHARACTERS_PATTERN.search(message_str) is not None:
            single_limit, multipart_limit = 160, 153
        else:
            single_limit, multipart_limit = 70, 67

        if content_len <= single_limit:
            return math.ceil(content_len / single_limit)
        return math.ceil(content_len / multipart_limit)

    def is_message_too_long(self):
        """
        Message is validated without the prefix.

        We have decided to be lenient and let the message go over the
        character limit. The SMS provider will send messages well over our
        limit. There were some inconsistencies with how we were validating
        the length of a message. This should be the method used anytime we
        want to reject a message for being too long.
        """
        return self.content_count_without_prefix > SMS_CHAR_COUNT_LIMIT

    def is_message_empty(self):
        """Whether nothing is left once the prefix is discounted."""
        return self.content_count_without_prefix == 0

    def _get_unsanitised_content(self):
        """
        Build the prefixed, whitespace-normalised message text.

        This is faster to call than SMSMessageTemplate.__str__ if all you
        need to know is how many characters are in the message.
        """
        if self.values:
            values = self.values
        else:
            # With no personalisation, give every placeholder the marker
            # value; the marker is removed again in the final step below.
            values = {key: MAGIC_SEQUENCE for key in self.placeholders}

        return (
            Take(PlainTextField(self.content, values, html="passthrough"))
            .then(add_prefix, self.prefix)
            .then(remove_whitespace_before_punctuation)
            .then(normalise_whitespace_and_newlines)
            .then(normalise_multiple_newlines)
            .then(str.strip)
            .then(str.replace, MAGIC_SEQUENCE, "")
        )
class SMSMessageTemplate(BaseSMSTemplate):
    """SMS template whose string form is the unsanitised content run through `sms_encode`."""

    def __str__(self):
        unsanitised = self._get_unsanitised_content()
        return sms_encode(unsanitised)
class SMSBodyPreviewTemplate(BaseSMSTemplate):
    """HTML-escaped preview of an SMS body, always rendered without a prefix."""

    def __init__(self, template, values=None):
        super().__init__(template, values, show_prefix=False)

    def __str__(self):
        body = Field(
            self.content,
            self.values,
            html="escape",
            redact_missing_personalisation=True,
        )
        # Applied in order, each step receiving the previous step's output.
        steps = (
            sms_encode,
            remove_whitespace_before_punctuation,
            normalise_whitespace_and_newlines,
            normalise_multiple_newlines,
            str.strip,
        )
        rendered = Take(body)
        for step in steps:
            rendered = rendered.then(step)
        return Markup(rendered)
class SMSPreviewTemplate(BaseSMSTemplate):
    """SMS rendered through a Jinja template for on-screen preview."""

    jinja_template = template_env.get_template("sms_preview_template.jinja2")

    def __init__(
        self,
        template,
        values=None,
        prefix=None,
        show_prefix=True,
        sender=None,
        show_recipient=False,
        show_sender=False,
        downgrade_non_sms_characters=True,
        redact_missing_personalisation=False,
    ):
        """
        Accepts the `BaseSMSTemplate` arguments plus display switches.

        :param show_recipient: passed to the Jinja template as `show_recipient`
        :param show_sender: passed to the Jinja template as `show_sender`
        :param downgrade_non_sms_characters: when true the body is passed
            through `sms_encode`, otherwise through `str`
        :param redact_missing_personalisation: forwarded to `Field` when
            the body is rendered
        """
        self.show_recipient = show_recipient
        self.show_sender = show_sender
        self.downgrade_non_sms_characters = downgrade_non_sms_characters
        super().__init__(template, values, prefix, show_prefix, sender)
        # Assigned after super().__init__, which does not take this argument
        # and would otherwise leave the base-class default in place.
        self.redact_missing_personalisation = redact_missing_personalisation

    def __str__(self):
        recipient = Field(
            "((phone number))",
            self.values,
            with_brackets=False,
            html="escape",
        )

        if self.show_prefix:
            escaped_prefix = escape_html(self.prefix) or None
        else:
            escaped_prefix = None

        if self.downgrade_non_sms_characters:
            encode = sms_encode
        else:
            encode = str

        body = (
            Take(
                Field(
                    self.content,
                    self.values,
                    html="escape",
                    redact_missing_personalisation=self.redact_missing_personalisation,
                )
            )
            .then(add_prefix, escaped_prefix)
            .then(encode)
            .then(remove_whitespace_before_punctuation)
            .then(normalise_whitespace_and_newlines)
            .then(normalise_multiple_newlines)
            .then(nl2br)
            .then(
                autolink_urls,
                classes="govuk-link govuk-link--no-visited-state",
            )
        )

        context = {
            "sender": self.sender,
            "show_sender": self.show_sender,
            "recipient": recipient,
            "show_recipient": self.show_recipient,
            "body": body,
        }
        return Markup(self.jinja_template.render(context))
class BaseBroadcastTemplate(BaseSMSTemplate):
    """Broadcast variant of the SMS template with its own length limits."""

    template_type = "broadcast"

    MAX_CONTENT_COUNT_GSM = 1_395
    MAX_CONTENT_COUNT_UCS2 = 615

    @property
    def encoded_content_count(self):
        """
        Character count used for the length check: the plain count when the
        content has non-GSM characters, otherwise the count plus the number
        of extended GSM characters in the filled-in content.
        """
        if self.non_gsm_characters:
            return self.content_count

        extended = count_extended_gsm_chars(self.content_with_placeholders_filled_in)
        return self.content_count + extended

    @property
    def non_gsm_characters(self):
        """Result of the module-level `non_gsm_characters` for the raw content."""
        return non_gsm_characters(self.content)

    @property
    def max_content_count(self):
        """The UCS-2 limit when non-GSM characters are present, else the GSM limit."""
        return (
            self.MAX_CONTENT_COUNT_UCS2
            if self.non_gsm_characters
            else self.MAX_CONTENT_COUNT_GSM
        )

    @property
    def content_too_long(self):
        """Whether the encoded count exceeds the applicable maximum."""
        limit = self.max_content_count
        return self.encoded_content_count > limit
class BroadcastPreviewTemplate(BaseBroadcastTemplate, SMSPreviewTemplate):
    # Combines the two base classes and only overrides which Jinja template
    # is used for rendering.
    jinja_template = template_env.get_template("broadcast_preview_template.jinja2")
class BroadcastMessageTemplate(BaseBroadcastTemplate, SMSMessageTemplate):
    """Broadcast message that can be built directly from content or an event."""

    @classmethod
    def from_content(cls, content):
        """Build an instance from a bare content string, with no personalisation."""
        template = {
            "template_type": cls.template_type,
            "content": content,
        }
        # events have already done interpolation of any personalisation
        return cls(template=template, values=None)

    @classmethod
    def from_event(cls, broadcast_event):
        """
        should be directly callable with the results of the BroadcastEvent.serialize() function from api/models.py
        """
        body = broadcast_event["transmitted_content"]["body"]
        return cls.from_content(body)

    def __str__(self):
        field = Field(
            self.content.strip(),
            self.values,
            html="escape",
        )
        steps = (
            sms_encode,
            remove_whitespace_before_punctuation,
            normalise_whitespace_and_newlines,
            normalise_multiple_newlines,
        )
        rendered = Take(field)
        for step in steps:
            rendered = rendered.then(step)
        return rendered
class SubjectMixin:
    """Adds a personalisable `subject` to a template class."""

    def __init__(self, template, values=None, **kwargs):
        # The subject has to be stored before delegating: the next
        # initialiser assigns `values`, which reads `placeholders`.
        self._subject = template["subject"]
        super().__init__(template, values, **kwargs)

    @property
    def subject(self):
        """The subject with values filled in, HTML-escaped and tidied."""
        field = Field(
            self._subject,
            self.values,
            html="escape",
            redact_missing_personalisation=self.redact_missing_personalisation,
        )
        tidied = Take(field).then(do_nice_typography).then(normalise_whitespace)
        return Markup(tidied)

    @property
    def placeholders(self):
        """Placeholders from the subject combined with those of the content."""
        from_subject = get_placeholders(self._subject)
        return from_subject | super().placeholders
class BaseEmailTemplate(SubjectMixin, Template):
    """Common behaviour for email templates: HTML body and size limit."""

    template_type = "email"

    @property
    def html_body(self):
        """The content rendered to HTML via the email markdown pipeline."""
        field = Field(
            self.content,
            self.values,
            html="escape",
            markdown_lists=True,
            redact_missing_personalisation=self.redact_missing_personalisation,
        )
        steps = (
            unlink_govuk_escaped,
            strip_unsupported_characters,
            add_trailing_newline,
            notify_email_markdown,
            do_nice_typography,
        )
        rendered = Take(field)
        for step in steps:
            rendered = rendered.then(step)
        return rendered

    @property
    def content_size_in_bytes(self):
        """Size of the filled-in content once encoded as UTF-8."""
        encoded = self.content_with_placeholders_filled_in.encode("utf8")
        return len(encoded)

    def is_message_too_long(self):
        """
        Whether the filled-in content is larger than 2,000,000 bytes.

        How the limit was chosen:

        - SES rejects email messages bigger than 10485760 bytes (just over
          10 MB per message, after base64 encoding):
          https://docs.aws.amazon.com/ses/latest/DeveloperGuide/quotas.html#limits-message
        - Base64 uses bytes (8-bit words) as 6-bit words, so three bytes of
          information need four bytes of transmission; the encoded form is
          4/3 the size of the original:
          https://lemire.me/blog/2019/01/30/what-is-the-space-overhead-of-base64-encoding/
          That brings the max safe size down to 7.5 MB == 7500000 bytes
          before base64 encoding.
        - The message sent to SES carries the body twice, once as plain text
          and once as HTML:

              "Message": {
                  'Subject': {
                      'Data': subject,
                  },
                  'Body': {'Text': {'Data': body}, 'Html': {'Data': html_body}}
              },

          so the plain text content needs to be much shorter to fit within
          the limit, especially since the HTML body can be much
          byte-heavier than the plain text body.

        The limit was therefore first set at 1MB (between 250 and 500 pages
        of text), leaving a safe margin when sending through Amazon SES.

        EDIT: putting size up to 2MB as GOV.UK email digests are hitting
        the limit.
        """
        max_bytes = 2_000_000
        return self.content_size_in_bytes > max_bytes
class PlainTextEmailTemplate(BaseEmailTemplate):
    """Email rendered as plain text."""

    def __str__(self):
        field = Field(
            self.content, self.values, html="passthrough", markdown_lists=True
        )
        # Applied in order; note the trailing newline is added both before
        # the markdown step and again at the very end.
        steps = (
            unlink_govuk_escaped,
            strip_unsupported_characters,
            add_trailing_newline,
            notify_plain_text_email_markdown,
            do_nice_typography,
            unescape,
            strip_leading_whitespace,
            add_trailing_newline,
        )
        text = Take(field)
        for step in steps:
            text = text.then(step)
        return text

    @property
    def subject(self):
        """The subject with values filled in and HTML passed through unchanged."""
        field = Field(
            self._subject,
            self.values,
            html="passthrough",
            redact_missing_personalisation=self.redact_missing_personalisation,
        )
        tidied = Take(field).then(do_nice_typography).then(normalise_whitespace)
        return Markup(tidied)
class HTMLEmailTemplate(BaseEmailTemplate):
    """Email rendered to HTML through `email_template.jinja2`, with branding options."""

    jinja_template = template_env.get_template("email_template.jinja2")

    PREHEADER_LENGTH_IN_CHARACTERS = 256

    def __init__(
        self,
        template,
        values=None,
        govuk_banner=True,
        complete_html=True,
        brand_logo=None,
        brand_text=None,
        brand_colour=None,
        brand_banner=False,
        brand_name=None,
    ):
        """
        :param template: dict describing the template (see `Template`)
        :param values: optional dict of personalisation values

        The remaining arguments are stored on the instance and handed to
        the Jinja template unchanged under the same names.
        """
        super().__init__(template, values)
        self.govuk_banner = govuk_banner
        self.complete_html = complete_html
        self.brand_logo = brand_logo
        self.brand_text = brand_text
        self.brand_colour = brand_colour
        self.brand_banner = brand_banner
        self.brand_name = brand_name

    @property
    def preheader(self):
        """
        The content rendered through the preheader markdown pipeline, with
        runs of whitespace collapsed to single spaces and the result cut to
        `PREHEADER_LENGTH_IN_CHARACTERS` characters.
        """
        field = Field(
            self.content,
            self.values,
            html="escape",
            markdown_lists=True,
        )
        words = (
            Take(field)
            .then(unlink_govuk_escaped)
            .then(strip_unsupported_characters)
            .then(add_trailing_newline)
            .then(notify_email_preheader_markdown)
            .then(do_nice_typography)
            .split()
        )
        single_line = " ".join(words)
        return single_line[: self.PREHEADER_LENGTH_IN_CHARACTERS].strip()

    def __str__(self):
        context = {
            "subject": self.subject,
            "body": self.html_body,
            "preheader": self.preheader,
            "govuk_banner": self.govuk_banner,
            "complete_html": self.complete_html,
            "brand_logo": self.brand_logo,
            "brand_text": self.brand_text,
            "brand_colour": self.brand_colour,
            "brand_banner": self.brand_banner,
            "brand_name": self.brand_name,
        }
        return self.jinja_template.render(context)
class EmailPreviewTemplate(BaseEmailTemplate):
    """Email rendered through `email_preview_template.jinja2` for on-screen preview."""

    jinja_template = template_env.get_template("email_preview_template.jinja2")

    def __init__(
        self,
        template,
        values=None,
        from_name=None,
        from_address=None,
        reply_to=None,
        show_recipient=True,
        redact_missing_personalisation=False,
    ):
        """
        :param template: dict describing the template (see `Template`)
        :param values: optional dict of personalisation values
        :param from_name: HTML-escaped before being given to the Jinja template
        :param from_address: given to the Jinja template as-is
        :param reply_to: given to the Jinja template as-is
        :param show_recipient: given to the Jinja template as-is
        :param redact_missing_personalisation: forwarded to the base class
        """
        super().__init__(
            template,
            values,
            redact_missing_personalisation=redact_missing_personalisation,
        )
        self.from_name = from_name
        self.from_address = from_address
        self.reply_to = reply_to
        self.show_recipient = show_recipient

    def __str__(self):
        recipient = Field("((email address))", self.values, with_brackets=False)
        context = {
            "body": self.html_body,
            "subject": self.subject,
            "from_name": escape_html(self.from_name),
            "from_address": self.from_address,
            "reply_to": self.reply_to,
            "recipient": recipient,
            "show_recipient": self.show_recipient,
        }
        return Markup(self.jinja_template.render(context))

    @property
    def subject(self):
        """The subject with values filled in, HTML-escaped and tidied (not wrapped in `Markup`)."""
        field = Field(
            self._subject,
            self.values,
            html="escape",
            redact_missing_personalisation=self.redact_missing_personalisation,
        )
        return Take(field).then(do_nice_typography).then(normalise_whitespace)
def get_sms_fragment_count(character_count, non_gsm_characters):
    """
    Return how many SMS fragments a message of `character_count` characters
    needs.

    When `non_gsm_characters` is truthy a single message holds up to 70
    characters and longer messages are split into parts of 67; otherwise the
    limits are 160 and 153. The result is never less than 1.
    """
    if non_gsm_characters:
        single_limit, multipart_limit = 70, 67
    else:
        single_limit, multipart_limit = 160, 153

    if character_count <= single_limit:
        return 1
    return math.ceil(float(character_count) / multipart_limit)
def non_gsm_characters(content):
    """
    Return the set of characters in `content` that also appear in
    `SanitiseSMS.WELSH_NON_GSM_CHARACTERS`.

    This doesn't include characters that we will downgrade (e.g. emoji,
    ellipsis, ñ, etc). It only includes Welsh non-GSM characters, which will
    force the entire SMS to be encoded with UCS-2.
    """
    return set(content).intersection(SanitiseSMS.WELSH_NON_GSM_CHARACTERS)
def count_extended_gsm_chars(content):
    """Total occurrences in `content` of each entry in `SanitiseSMS.EXTENDED_GSM_CHARACTERS`."""
    return sum(content.count(char) for char in SanitiseSMS.EXTENDED_GSM_CHARACTERS)
def do_nice_typography(value):
    """
    Run `value` through the typography formatters, in order: whitespace
    before punctuation, smart quotes, undoing smart quotes in email
    addresses, and hyphens to en dashes.
    """
    steps = (
        remove_whitespace_before_punctuation,
        make_quotes_smart,
        remove_smart_quotes_from_email_addresses,
        replace_hyphens_with_en_dashes,
    )
    result = Take(value)
    for step in steps:
        result = result.then(step)
    return result
@lru_cache(maxsize=1024)
def get_placeholders(content):
    """Return the placeholders of `content` as reported by `Field`; results are cached per content value."""
    field = Field(content)
    return field.placeholders