Files
notifications-api/tests/notifications_utils/test_formatters.py
2024-05-17 17:14:46 -04:00

578 lines
18 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import pytest
from markupsafe import Markup
from notifications_utils.formatters import (
autolink_urls,
escape_html,
formatted_list,
make_quotes_smart,
normalise_whitespace,
remove_smart_quotes_from_email_addresses,
remove_whitespace_before_punctuation,
replace_hyphens_with_en_dashes,
sms_encode,
strip_all_whitespace,
strip_and_remove_obscure_whitespace,
strip_unsupported_characters,
unlink_govuk_escaped,
)
from notifications_utils.template import (
HTMLEmailTemplate,
PlainTextEmailTemplate,
SMSMessageTemplate,
SMSPreviewTemplate,
)
@pytest.mark.parametrize(
    ("url", "expected_html"),
    [
        (
            """https://example.com/"onclick="alert('hi')""",
            (
                '<a class="govuk-link govuk-link--no-visited-state" '
                'href="https://example.com/%22onclick=%22alert%28%27hi%27%29">'
                """https://example.com/"onclick="alert('hi')</a>"""
            ),
        ),
        (
            """https://example.com/"style='text-decoration:blink'""",
            (
                '<a class="govuk-link govuk-link--no-visited-state" '
                'href="https://example.com/%22style=%27text-decoration:blink%27">'
                """https://example.com/"style='text-decoration:blink'</a>"""
            ),
        ),
    ],
)
def test_URLs_get_escaped_in_sms(url, expected_html):
    """Check that quote characters in a URL are percent-encoded in the link's href.

    The expected markup keeps the raw URL as the visible link text while the
    ``href`` attribute carries ``%22`` / ``%27`` / ``%28`` / ``%29`` instead of
    the literal quotes and parentheses.
    """
    template = SMSPreviewTemplate({"content": url, "template_type": "sms"})
    rendered = str(template)
    assert expected_html in rendered
def test_HTML_template_has_URLs_replaced_with_links():
    """Check that a bare URL in email content is rendered as a styled anchor.

    The rendered HTML email must contain an ``<a>`` element whose ``href`` and
    link text are both the URL that appeared in the template content.
    """
    expected_link = (
        '<a style="word-wrap: break-word; color: #1D70B8;" href="https://service.example.com/accept_invite/a1b2c3d4">'
        "https://service.example.com/accept_invite/a1b2c3d4"
        "</a>"
    )
    template = HTMLEmailTemplate(
        {
            "content": (
                # \u2019 is the typographic apostrophe; written as an escape
                # so it cannot be silently dropped or confused with "'".
                "You\u2019ve been invited to a service. Click this link:\n"
                "https://service.example.com/accept_invite/a1b2c3d4\n"
                "\n"
                "Thanks\n"
            ),
            "subject": "",
            "template_type": "email",
        }
    )
    assert expected_link in str(template)
def test_escaping_govuk_in_email_templates():
    """Check that "GOV.UK" gains a U+200B after the dot, directly and in emails.

    The same escaped form is expected from ``unlink_govuk_escaped`` itself and
    in the string output of both the plain-text and HTML email templates.
    """
    raw = "GOV.UK"
    escaped = "GOV.\u200BUK"
    assert unlink_govuk_escaped(raw) == escaped

    template_json = {"content": raw, "subject": "", "template_type": "email"}
    for template_class in (PlainTextEmailTemplate, HTMLEmailTemplate):
        assert escaped in str(template_class(template_json))
@pytest.mark.parametrize(
    ("template_content", "expected"),
    [
        # --- A zero-width space (U+200B) is inserted after "GOV." ---
        ("GOV.UK", "GOV.\u200BUK"),
        ("gov.uk", "gov.\u200Buk"),
        ("content with space infront GOV.UK", "content with space infront GOV.\u200BUK"),
        ("content with tab infront\tGOV.UK", "content with tab infront\tGOV.\u200BUK"),
        ("content with newline infront\nGOV.UK", "content with newline infront\nGOV.\u200BUK"),
        ("*GOV.UK", "*GOV.\u200BUK"),
        ("#GOV.UK", "#GOV.\u200BUK"),
        ("^GOV.UK", "^GOV.\u200BUK"),
        (" #GOV.UK", " #GOV.\u200BUK"),
        ("GOV.UK with CONTENT after", "GOV.\u200BUK with CONTENT after"),
        ("#GOV.UK with CONTENT after", "#GOV.\u200BUK with CONTENT after"),
        # --- Content is left untouched ---
        ("https://gov.uk", "https://gov.uk"),
        ("https://www.gov.uk", "https://www.gov.uk"),
        ("www.gov.uk", "www.gov.uk"),
        ("WWW.GOV.UK", "WWW.GOV.UK"),
        ("WWW.GOV.UK.", "WWW.GOV.UK."),
        ("https://www.gov.uk/?utm_source=gov.uk", "https://www.gov.uk/?utm_source=gov.uk"),
        ("mygov.uk", "mygov.uk"),
        ("www.this-site-is-not-gov.uk", "www.this-site-is-not-gov.uk"),
        (
            "www.gov.uk?websites=bbc.co.uk;gov.uk;nsh.scot",
            "www.gov.uk?websites=bbc.co.uk;gov.uk;nsh.scot",
        ),
        ("reply to: xxxx@xxx.gov.uk", "reply to: xxxx@xxx.gov.uk"),
        ("southwark.gov.uk", "southwark.gov.uk"),
        ("data.gov.uk", "data.gov.uk"),
        ("gov.uk/foo", "gov.uk/foo"),
        ("*GOV.UK/foo", "*GOV.UK/foo"),
        ("#GOV.UK/foo", "#GOV.UK/foo"),
        ("^GOV.UK/foo", "^GOV.UK/foo"),
        ("gov.uk#departments-and-policy", "gov.uk#departments-and-policy"),
        # --- Known gaps, documented as expected failures ---
        # These inputs should ideally be left alone but currently are not.
        # The imperfection is accepted because content like this is thought
        # to be very unlikely in practice.
        pytest.param("gov.uk.com", "gov.uk.com", marks=pytest.mark.xfail),
        pytest.param("gov.ukandi.com", "gov.ukandi.com", marks=pytest.mark.xfail),
        pytest.param("gov.uks", "gov.uks", marks=pytest.mark.xfail),
    ],
)
def test_unlink_govuk_escaped(template_content, expected):
    """Check which occurrences of "gov.uk" get a U+200B inserted after the dot."""
    actual = unlink_govuk_escaped(template_content)
    assert actual == expected
@pytest.mark.parametrize(
    ("prefix", "body", "expected"),
    [
        ("a", "b", "a: b"),
        (None, "b", "b"),
    ],
)
def test_sms_message_adds_prefix(prefix, body, expected):
    """Check that a prefix, when set, is rendered as "<prefix>: <body>"."""
    template = SMSMessageTemplate({"content": body, "template_type": "sms"})
    template.prefix = prefix
    template.sender = None

    rendered = str(template)
    assert rendered == expected
def test_sms_preview_adds_newlines():
    """Check that multi-line SMS content produces ``<br>`` in the preview output."""
    multi_line_content = "\nthe\nquick\nbrown fox\n"
    template = SMSPreviewTemplate(
        {"content": multi_line_content, "template_type": "sms"}
    )
    template.prefix = None
    template.sender = None

    rendered = str(template)
    assert "<br>" in rendered
def test_sms_encode(mocker):
    """Check that ``sms_encode`` returns whatever ``SanitiseSMS.encode`` returns.

    ``SanitiseSMS`` is patched in ``notifications_utils.formatters`` so the
    test can also verify it was called exactly once with the original text.
    """
    sanitise_mock = mocker.patch("notifications_utils.formatters.SanitiseSMS")

    encoded = sms_encode("foo")

    assert encoded == sanitise_mock.encode.return_value
    sanitise_mock.encode.assert_called_once_with("foo")
@pytest.mark.parametrize(
    ("items", "kwargs", "expected_output"),
    [
        # Default joining: commas, then "and" before the last item
        ([1], {}, "1"),
        ([1, 2], {}, "1 and 2"),
        ([1, 2, 3], {}, "1, 2 and 3"),
        # Singular vs plural prefix
        ([1, 2, 3], {"prefix": "foo", "prefix_plural": "bar"}, "bar 1, 2 and 3"),
        ([1], {"prefix": "foo", "prefix_plural": "bar"}, "foo 1"),
        # Wrapping each item and overriding the conjunction
        ([1, 2, 3], {"before_each": "a", "after_each": "b"}, "a1b, a2b and a3b"),
        ([1, 2, 3], {"conjunction": "foo"}, "1, 2 foo 3"),
        # Items are HTML-escaped; the wrappers are emitted as given
        (["&"], {"before_each": "<i>", "after_each": "</i>"}, "<i>&amp;</i>"),
        (
            [1, 2, 3],
            {"before_each": "<i>", "after_each": "</i>"},
            "<i>1</i>, <i>2</i> and <i>3</i>",
        ),
    ],
)
def test_formatted_list(items, kwargs, expected_output):
    """Check ``formatted_list`` output for various item counts and keyword options."""
    rendered = formatted_list(items, **kwargs)
    assert rendered == expected_output
def test_formatted_list_returns_markup():
    """Check that ``formatted_list`` returns a ``Markup`` instance."""
    result = formatted_list([0])
    assert isinstance(result, Markup)
def test_bleach_doesnt_try_to_make_valid_html_before_cleaning():
    """Check that tag-like text is escaped as-is, with its content kept intact."""
    tag_like_text = "<to cancel daily cat facts reply 'cancel'>"
    escaped = escape_html(tag_like_text)
    assert escaped == "&lt;to cancel daily cat facts reply 'cancel'&gt;"
@pytest.mark.parametrize(
    ("content", "expected_escaped"),
    [
        ("&?a;", "&amp;?a;"),
        ("&>a;", "&amp;&gt;a;"),
        ("&*a;", "&amp;*a;"),
        ("&a?;", "&amp;a?;"),
        ("&x?xa;", "&amp;x?xa;"),
        # We need to be careful that query arguments don't get turned into entities
        ("&timestamp=&times;", "&amp;timestamp=×"),
        ("&times=1,2,3", "&amp;times=1,2,3"),
        # &minus; should have a trailing semicolon according to the HTML5
        # spec but &micro doesn't need one.
        # The minus sign (U+2212) is written as a named escape so it cannot be
        # silently dropped or mistaken for an ASCII hyphen.
        ("2&minus;1", "2\N{MINUS SIGN}1"),
        ("200&micro;g", "200µg"),
        # …we ignore it when it's ambiguous
        ("2&minus1", "2&amp;minus1"),
        ("200&microg", "200&amp;microg"),
        # …we still ignore when there's a space afterwards
        ("2 &minus 1", "2 &amp;minus 1"),
        ("200&micro g", "200&amp;micro g"),
        # Things which aren't real entities are ignored, not removed
        ("This &isnotarealentity;", "This &amp;isnotarealentity;"),
        # We let users use &nbsp; for backwards compatibility
        ("Before&nbsp;after", "Before&nbsp;after"),
        # We let users use &amp; because it's often pasted in URLs
        ("?a=1&amp;b=2", "?a=1&amp;b=2"),
        # We let users use &lpar; and &rpar; because otherwise it's
        # impossible to put brackets in the body of conditional placeholders
        ("((var??&lpar;in brackets&rpar;))", "((var??&lpar;in brackets&rpar;))"),
    ],
)
def test_escaping_html_entities(
    content,
    expected_escaped,
):
    """Check how ``escape_html`` treats ampersands and HTML entity references.

    The cases cover ampersands that must be escaped to ``&amp;``, entity
    references that are expected to be decoded to their character, and
    entity references that are expected to pass through unchanged.
    """
    assert escape_html(content) == expected_escaped
@pytest.mark.parametrize(
    ("dirty", "clean"),
    [
        ("Hello ((name)) ,\n\nThis is a message", "Hello ((name)),\n\nThis is a message"),
        ("Hello Jo ,\n\nThis is a message", "Hello Jo,\n\nThis is a message"),
        ("\n \t , word", "\n, word"),
    ],
)
def test_removing_whitespace_before_commas(dirty, clean):
    """Check that spaces and tabs directly before a comma are removed."""
    tidied = remove_whitespace_before_punctuation(dirty)
    assert tidied == clean
@pytest.mark.parametrize(
    ("dirty", "clean"),
    [
        ("Hello ((name)) .\n\nThis is a message", "Hello ((name)).\n\nThis is a message"),
        ("Hello Jo .\n\nThis is a message", "Hello Jo.\n\nThis is a message"),
        ("\n \t . word", "\n. word"),
    ],
)
def test_removing_whitespace_before_full_stops(dirty, clean):
    """Check that spaces and tabs directly before a full stop are removed."""
    tidied = remove_whitespace_before_punctuation(dirty)
    assert tidied == clean
@pytest.mark.parametrize(
    ("dumb", "smart"),
    [
        (
            """And I said, "what about breakfast at Tiffany's"?""",
            # Typographic quotes are written as escapes so they cannot be
            # silently dropped or confused with their ASCII look-alikes:
            # \u201c / \u201d are the double quotes, \u2019 the apostrophe.
            "And I said, \u201cwhat about breakfast at Tiffany\u2019s\u201d?",
        ),
        (
            # Quotes inside the anchor markup are expected to be left alone,
            # so the expected value is identical to the input.
            (
                "\n"
                """<a href="http://example.com?q='foo'">http://example.com?q='foo'</a>"""
                "\n"
            ),
            (
                "\n"
                """<a href="http://example.com?q='foo'">http://example.com?q='foo'</a>"""
                "\n"
            ),
        ),
    ],
)
def test_smart_quotes(dumb, smart):
    """Check that straight quotes in prose become typographic quotes."""
    assert make_quotes_smart(dumb) == smart
@pytest.mark.parametrize(
    ("nasty", "nice"),
    [
        (
            (
                "The en dash - always with spaces in running text when, as "
                "discussed in this section, indicating a parenthesis or "
                "pause - and the spaced em dash both have a certain "
                "technical advantage over the unspaced em dash. "
            ),
            (
                "The en dash \u2013 always with spaces in running text when, as "
                "discussed in this section, indicating a parenthesis or "
                "pause \u2013 and the spaced em dash both have a certain "
                "technical advantage over the unspaced em dash. "
            ),
        ),
        (
            "double -- dash",
            "double \u2013 dash",
        ),
        (
            "triple --- dash",
            "triple \u2013 dash",
        ),
        (
            "quadruple ---- dash",
            "quadruple ---- dash",  # four or more hyphens are left alone
        ),
        (
            "em \u2014 dash",  # \u2014 is an em dash
            "em \u2013 dash",  # \u2013 is an en dash
        ),
        (
            # \u0020 is a normal space character; the text already has a
            # spaced en dash, so it should come back unchanged
            "already\u0020\u2013\u0020correct",
            "already\u0020\u2013\u0020correct",
        ),
        (
            "2004-2008",
            "2004-2008",  # no replacement
        ),
    ],
)
def test_en_dashes(nasty, nice):
    """Check that spaced runs of one to three dashes become a spaced en dash.

    Dash characters are written as ``\\uXXXX`` escapes throughout so that the
    en dash (U+2013) and em dash (U+2014) cannot be lost or confused with an
    ASCII hyphen.
    """
    assert replace_hyphens_with_en_dashes(nasty) == nice
def test_unicode_dash_lookup():
    """Check that the sequence U+0020 U+2013 is a plain space plus an en dash.

    The characters being compared are easy to confuse visually (or to lose
    entirely when the file is copied), so they are spelled with ``\\N{...}``
    named escapes rather than as literal characters.
    """
    en_dash_replacement_sequence = "\u0020\u2013"
    hyphen = "-"
    en_dash = "\N{EN DASH}"
    space = " "
    non_breaking_space = "\N{NO-BREAK SPACE}"
    assert en_dash_replacement_sequence == space + en_dash
    assert non_breaking_space not in en_dash_replacement_sequence
    assert hyphen not in en_dash_replacement_sequence
@pytest.mark.parametrize(
    "value",
    [
        "bar",
        " bar ",
        "\n\t bar\n",
        " \u180E\u200B \u200C bar \u200D \u2060\uFEFF ",
    ],
)
def test_strip_all_whitespace(value):
    """Check that surrounding ordinary and zero-width whitespace is all stripped."""
    stripped = strip_all_whitespace(value)
    assert stripped == "bar"
@pytest.mark.parametrize(
    "value",
    [
        "notifications-email",
        " \tnotifications-email \x0c ",
        "\rn\u200Coti\u200Dfi\u200Bcati\u2060ons-\u180Eemai\uFEFFl\uFEFF",
    ],
)
def test_strip_and_remove_obscure_whitespace(value):
    """Check that edge whitespace and embedded invisible characters are removed."""
    cleaned = strip_and_remove_obscure_whitespace(value)
    assert cleaned == "notifications-email"
def test_strip_and_remove_obscure_whitespace_only_removes_normal_whitespace_from_ends():
    """Check that interior spaces, newlines and tabs survive; only the ends are trimmed."""
    sentence = " words \n over multiple lines with \ttabs\t "
    expected = "words \n over multiple lines with \ttabs"
    assert strip_and_remove_obscure_whitespace(sentence) == expected
def test_remove_smart_quotes_from_email_addresses():
    """Check that a smart apostrophe inside an email address becomes a plain one.

    Smart quotes elsewhere in the text are expected to be left untouched.
    The typographic quotes (U+2018 / U+2019) are written as escapes so they
    cannot be silently dropped or confused with the ASCII apostrophe that the
    expected email address contains.
    """
    original = (
        "\n"
        "line one\u2019s quote\n"
        "first.o\u2019last@example.com is someone\u2019s email address\n"
        "line \u2018three\u2019\n"
    )
    expected = (
        "\n"
        "line one\u2019s quote\n"
        "first.o'last@example.com is someone\u2019s email address\n"
        "line \u2018three\u2019\n"
    )
    assert remove_smart_quotes_from_email_addresses(original) == expected
def test_strip_unsupported_characters():
    """Check that U+2028 is removed from the text without leaving a replacement."""
    text_with_line_separator = "line one\u2028line two"
    cleaned = strip_unsupported_characters(text_with_line_separator)
    assert cleaned == "line oneline two"
@pytest.mark.parametrize(
    "value",
    [
        "\u200C Your tax is\ndue\n\n",
        " Your tax is due ",
        # U+00A0 (non-breaking space) collapses to a single ordinary space
        "\u00A0Your\u00A0tax\u00A0 is\u00A0\u00A0due\u00A0",
        # zero-width characters disappear entirely
        "\u180EYour \u200Btax\u200C is \u200D\u2060due \uFEFF",
        # tabs collapse to a single ordinary space
        "\tYour tax\tis due ",
    ],
)
def test_normalise_whitespace(value):
    """Check that every whitespace variant normalises to single-spaced, trimmed text."""
    normalised = normalise_whitespace(value)
    assert normalised == "Your tax is due"
@pytest.mark.parametrize(
    ("content", "expected_html"),
    [
        # Explicit scheme is kept; a missing scheme gets http:// in the href
        ("http://example.com", '<a href="http://example.com">http://example.com</a>'),
        ("https://example.com", '<a href="https://example.com">https://example.com</a>'),
        ("example.com", '<a href="http://example.com">example.com</a>'),
        (
            "www.foo.bar.example.com",
            '<a href="http://www.foo.bar.example.com">www.foo.bar.example.com</a>',
        ),
        # Paths, query strings and fragments
        ("example.com/", '<a href="http://example.com/">example.com/</a>'),
        (
            "www.foo.bar.example.com/",
            '<a href="http://www.foo.bar.example.com/">www.foo.bar.example.com/</a>',
        ),
        ("example.com/foo", '<a href="http://example.com/foo">example.com/foo</a>'),
        ("example.com?foo", '<a href="http://example.com?foo">example.com?foo</a>'),
        ("example.com#foo", '<a href="http://example.com#foo">example.com#foo</a>'),
        # Trailing punctuation
        (
            "Go to gov.uk/example.",
            'Go to <a href="http://gov.uk/example">gov.uk/example</a>.',
        ),
        (
            "Go to gov.uk/example:",
            'Go to <a href="http://gov.uk/example">gov.uk/example</a>:',
        ),
        (
            "Go to gov.uk/example;",
            'Go to <a href="http://gov.uk/example;">gov.uk/example;</a>',
        ),
        # Brackets around and inside URLs
        (
            "(gov.uk/example)",
            '(<a href="http://gov.uk/example">gov.uk/example</a>)',
        ),
        (
            "(gov.uk/example)...",
            '(<a href="http://gov.uk/example">gov.uk/example</a>)...',
        ),
        (
            "(gov.uk/example.)",
            '(<a href="http://gov.uk/example">gov.uk/example</a>.)',
        ),
        (
            "(see example.com/foo_(bar))",
            '(see <a href="http://example.com/foo_%28bar%29">example.com/foo_(bar)</a>)',
        ),
        (
            "example.com/foo(((((((bar",
            '<a href="http://example.com/foo%28%28%28%28%28%28%28bar">example.com/foo(((((((bar</a>',
        ),
        (
            "government website (gov.uk). Other websites…",
            'government website (<a href="http://gov.uk">gov.uk</a>). Other websites…',
        ),
        (
            "[gov.uk/example]",
            '[<a href="http://gov.uk/example">gov.uk/example</a>]',
        ),
        # Several URLs, surrounding markup and entities
        (
            "gov.uk/foo, gov.uk/bar",
            '<a href="http://gov.uk/foo">gov.uk/foo</a>, <a href="http://gov.uk/bar">gov.uk/bar</a>',
        ),
        (
            "<p>gov.uk/foo</p>",
            '<p><a href="http://gov.uk/foo">gov.uk/foo</a></p>',
        ),
        (
            "gov.uk?foo&amp;",
            '<a href="http://gov.uk?foo&amp;">gov.uk?foo&amp;</a>',
        ),
        # A leading dot is not linked
        ("a .service.gov.uk domain", "a .service.gov.uk domain"),
        # Double quotes are percent-encoded in the href only
        (
            'http://foo.com/"bar"?x=1#2',
            '<a href="http://foo.com/%22bar%22?x=1#2">http://foo.com/"bar"?x=1#2</a>',
        ),
        # Email addresses are not linked
        ("firstname.lastname@example.com", "firstname.lastname@example.com"),
        ("with-subdomain@test.example.com", "with-subdomain@test.example.com"),
    ],
)
def test_autolink_urls_matches_correctly(content, expected_html):
    """Check exactly which part of the text ``autolink_urls`` wraps in an anchor."""
    linked = autolink_urls(content)
    assert linked == expected_html
@pytest.mark.parametrize(
    ("extra_kwargs", "expected_html"),
    [
        ({}, '<a href="http://example.com">http://example.com</a>'),
        (
            {"classes": "govuk-link"},
            '<a class="govuk-link" href="http://example.com">http://example.com</a>',
        ),
    ],
)
def test_autolink_urls_applies_correct_attributes(extra_kwargs, expected_html):
    """Check that the ``classes`` keyword adds a ``class`` attribute to the anchor."""
    linked = autolink_urls("http://example.com", **extra_kwargs)
    assert linked == expected_html
@pytest.mark.parametrize(
    "content",
    [
        "without link",
        "with link to https://example.com",
    ],
)
def test_autolink_urls_returns_markup(content):
    """Check that the result is ``Markup`` whether or not the text contains a URL."""
    result = autolink_urls(content)
    assert isinstance(result, Markup)