import pytest
from markupsafe import Markup
from notifications_utils.formatters import (
autolink_urls,
escape_html,
formatted_list,
make_quotes_smart,
normalise_whitespace,
remove_smart_quotes_from_email_addresses,
remove_whitespace_before_punctuation,
replace_hyphens_with_en_dashes,
sms_encode,
strip_all_whitespace,
strip_and_remove_obscure_whitespace,
strip_unsupported_characters,
unlink_govuk_escaped,
)
from notifications_utils.template import (
HTMLEmailTemplate,
PlainTextEmailTemplate,
SMSMessageTemplate,
SMSPreviewTemplate,
)
@pytest.mark.parametrize(
    ("url", "expected_html"),
    [
        (
            """https://example.com/"onclick="alert('hi')""",
            """https://example.com/"onclick="alert('hi')""",  # noqa
        ),
        (
            """https://example.com/"style='text-decoration:blink'""",
            """https://example.com/"style='text-decoration:blink'""",  # noqa
        ),
    ],
)
def test_URLs_get_escaped_in_sms(url, expected_html):
    """Render ``url`` as SMS preview content and look for ``expected_html``.

    NOTE(review): every case expects the rendered output to contain the input
    verbatim; given the test name the expected values were presumably
    HTML-escaped forms -- confirm against the upstream test data.
    """
    sms_preview = SMSPreviewTemplate({"content": url, "template_type": "sms"})
    rendered = str(sms_preview)
    assert expected_html in rendered
def test_HTML_template_has_URLs_replaced_with_links():
    """Check the invite URL shows up in the rendered HTML email template."""
    # NOTE(review): the empty literals on either side of the URL look like the
    # remains of link markup that was stripped -- confirm the intended value.
    expected_fragment = (
        ''
        "https://service.example.com/accept_invite/a1b2c3d4"
        ""
    )
    email_template = HTMLEmailTemplate(
        {
            "content": (
                "You’ve been invited to a service. Click this link:\n"
                "https://service.example.com/accept_invite/a1b2c3d4\n"
                "\n"
                "Thanks\n"
            ),
            "subject": "",
            "template_type": "email",
        }
    )
    assert expected_fragment in str(email_template)
def test_escaping_govuk_in_email_templates():
    """The U+200B-separated form of GOV.UK appears in both email renderings."""
    template_content = "GOV.UK"
    expected = "GOV.\u200BUK"

    # The formatter on its own produces the escaped form...
    assert unlink_govuk_escaped(template_content) == expected

    # ...and so does each email template built from the same content.
    template_json = {
        "content": template_content,
        "subject": "",
        "template_type": "email",
    }
    for template_class in (PlainTextEmailTemplate, HTMLEmailTemplate):
        assert expected in str(template_class(template_json))
@pytest.mark.parametrize(
    ("template_content", "expected"),
    [
        # Cases where a zero-width space (U+200B) is inserted after the dot
        ("GOV.UK", "GOV.\u200BUK"),
        ("gov.uk", "gov.\u200Buk"),
        (
            "content with space infront GOV.UK",
            "content with space infront GOV.\u200BUK",
        ),
        ("content with tab infront\tGOV.UK", "content with tab infront\tGOV.\u200BUK"),
        (
            "content with newline infront\nGOV.UK",
            "content with newline infront\nGOV.\u200BUK",
        ),
        ("*GOV.UK", "*GOV.\u200BUK"),
        ("#GOV.UK", "#GOV.\u200BUK"),
        ("^GOV.UK", "^GOV.\u200BUK"),
        (" #GOV.UK", " #GOV.\u200BUK"),
        ("GOV.UK with CONTENT after", "GOV.\u200BUK with CONTENT after"),
        ("#GOV.UK with CONTENT after", "#GOV.\u200BUK with CONTENT after"),
        # Cases where the content must come back unchanged
        ("https://gov.uk", "https://gov.uk"),
        ("https://www.gov.uk", "https://www.gov.uk"),
        ("www.gov.uk", "www.gov.uk"),
        ("WWW.GOV.UK", "WWW.GOV.UK"),
        ("WWW.GOV.UK.", "WWW.GOV.UK."),
        (
            "https://www.gov.uk/?utm_source=gov.uk",
            "https://www.gov.uk/?utm_source=gov.uk",
        ),
        ("mygov.uk", "mygov.uk"),
        ("www.this-site-is-not-gov.uk", "www.this-site-is-not-gov.uk"),
        (
            "www.gov.uk?websites=bbc.co.uk;gov.uk;nsh.scot",
            "www.gov.uk?websites=bbc.co.uk;gov.uk;nsh.scot",
        ),
        ("reply to: xxxx@xxx.gov.uk", "reply to: xxxx@xxx.gov.uk"),
        ("southwark.gov.uk", "southwark.gov.uk"),
        ("data.gov.uk", "data.gov.uk"),
        ("gov.uk/foo", "gov.uk/foo"),
        ("*GOV.UK/foo", "*GOV.UK/foo"),
        ("#GOV.UK/foo", "#GOV.UK/foo"),
        ("^GOV.UK/foo", "^GOV.UK/foo"),
        ("gov.uk#departments-and-policy", "gov.uk#departments-and-policy"),
        # Known gaps, recorded as expected failures: the regex is understood to
        # insert the extra character into these even though it should leave
        # them alone. This is accepted because such URLs are thought very
        # unlikely to appear in real content; the cases document the edge.
        pytest.param("gov.uk.com", "gov.uk.com", marks=pytest.mark.xfail),
        pytest.param("gov.ukandi.com", "gov.ukandi.com", marks=pytest.mark.xfail),
        pytest.param("gov.uks", "gov.uks", marks=pytest.mark.xfail),
    ],
)
def test_unlink_govuk_escaped(template_content, expected):
    """``unlink_govuk_escaped`` maps each input to its expected output."""
    escaped = unlink_govuk_escaped(template_content)
    assert escaped == expected
@pytest.mark.parametrize(
    ("prefix", "body", "expected"),
    [
        ("a", "b", "a: b"),
        (None, "b", "b"),
    ],
)
def test_sms_message_adds_prefix(prefix, body, expected):
    """A set prefix is rendered as ``prefix: body``; ``None`` leaves the body bare."""
    sms = SMSMessageTemplate({"content": body, "template_type": "sms"})
    sms.prefix = prefix
    sms.sender = None
    rendered = str(sms)
    assert rendered == expected
def test_sms_preview_adds_newlines():
    """Render multi-line SMS preview content and check for the line-break marker.

    The content and the asserted marker are written with explicit ``\\n``
    escapes so the literals do not depend on physical line breaks in this file
    (the asserted literal was previously split across two lines, which is not
    valid for a single-quoted string).
    """
    template = SMSPreviewTemplate(
        {
            "content": "\nthe\nquick\nbrown fox\n",
            "template_type": "sms",
        }
    )
    template.prefix = None
    template.sender = None
    # NOTE(review): the marker is kept as the newline character that the
    # broken literal contained. Since this is an HTML preview, the intended
    # marker was presumably a line-break tag that got lost -- confirm against
    # the upstream test before relying on this assertion.
    assert "\n" in str(template)
def test_sms_encode(mocker):
    """``sms_encode`` returns whatever the patched ``SanitiseSMS.encode`` returns."""
    sanitiser = mocker.patch("notifications_utils.formatters.SanitiseSMS")
    encoded = sms_encode("foo")
    assert encoded == sanitiser.encode.return_value
    # The input must be handed to the sanitiser exactly once, unchanged.
    sanitiser.encode.assert_called_once_with("foo")
@pytest.mark.parametrize(
    ("items", "kwargs", "expected_output"),
    [
        # Default quoting and conjunction
        ([1], {}, "‘1’"),
        ([1, 2], {}, "‘1’ and ‘2’"),
        ([1, 2, 3], {}, "‘1’, ‘2’ and ‘3’"),
        # Singular / plural prefixes
        ([1, 2, 3], {"prefix": "foo", "prefix_plural": "bar"}, "bar ‘1’, ‘2’ and ‘3’"),
        ([1], {"prefix": "foo", "prefix_plural": "bar"}, "foo ‘1’"),
        # Custom wrappers and conjunction
        ([1, 2, 3], {"before_each": "a", "after_each": "b"}, "a1b, a2b and a3b"),
        ([1, 2, 3], {"conjunction": "foo"}, "‘1’, ‘2’ foo ‘3’"),
        # Empty wrappers
        (["&"], {"before_each": "", "after_each": ""}, "&"),
        (
            [1, 2, 3],
            {"before_each": "", "after_each": ""},
            "1, 2 and 3",
        ),
    ],
)
def test_formatted_list(items, kwargs, expected_output):
    """``formatted_list`` joins ``items`` as expected for each keyword combination."""
    rendered = formatted_list(items, **kwargs)
    assert rendered == expected_output
def test_formatted_list_returns_markup():
    """The value returned by ``formatted_list`` is a ``Markup`` instance."""
    rendered = formatted_list([0])
    assert isinstance(rendered, Markup)
def test_bleach_doesnt_try_to_make_valid_html_before_cleaning():
    """Compare the output of ``escape_html`` with the expected literal.

    NOTE(review): the input is an empty string while the expected value is an
    angle-bracketed sentence, so the input (and possibly the escaping in the
    expected value) looks to have been lost -- confirm against upstream.
    """
    escaped = escape_html("")
    assert escaped == "<to cancel daily cat facts reply 'cancel'>"
@pytest.mark.parametrize(
    ("content", "expected_escaped"),
    [
        ("&?a;", "&?a;"),
        ("&>a;", "&>a;"),
        ("&*a;", "&*a;"),
        ("&a?;", "&a?;"),
        ("&x?xa;", "&x?xa;"),
        # Query arguments must not be turned into entities
        ("×tamp=×", "×tamp=×"),
        ("×=1,2,3", "×=1,2,3"),
        # Per the HTML5 spec the minus entity needs a trailing semicolon
        # whereas the micro entity does not
        ("2−1", "2−1"),
        ("200µg", "200µg"),
        # ...ambiguous forms are ignored
        ("2&minus1", "2&minus1"),
        ("200µg", "200µg"),
        # ...and still ignored when a space follows
        ("2 &minus 1", "2 &minus 1"),
        ("200µ g", "200µ g"),
        # Things which aren’t real entities are ignored, not removed
        ("This &isnotarealentity;", "This &isnotarealentity;"),
        # Kept for backwards compatibility
        # NOTE(review): presumably this case was about the non-breaking-space
        # entity -- confirm against upstream
        ("Before after", "Before after"),
        # Ampersands are allowed because they are often pasted in URLs
        ("?a=1&b=2", "?a=1&b=2"),
        # Brackets are allowed, otherwise it would be impossible to put them
        # in the body of conditional placeholders
        ("((var??(in brackets)))", "((var??(in brackets)))"),
    ],
)
def test_escaping_html_entities(
    content,
    expected_escaped,
):
    """``escape_html`` turns each ``content`` into ``expected_escaped``."""
    escaped = escape_html(content)
    assert escaped == expected_escaped
@pytest.mark.parametrize(
    ("dirty", "clean"),
    [
        (
            "Hello ((name)) ,\n\nThis is a message",
            "Hello ((name)),\n\nThis is a message",
        ),
        ("Hello Jo ,\n\nThis is a message", "Hello Jo,\n\nThis is a message"),
        (
            "\n \t , word",
            "\n, word",
        ),
    ],
)
def test_removing_whitespace_before_commas(dirty, clean):
    """Whitespace sitting directly before a comma is dropped."""
    tidied = remove_whitespace_before_punctuation(dirty)
    assert tidied == clean
@pytest.mark.parametrize(
    ("dirty", "clean"),
    [
        (
            "Hello ((name)) .\n\nThis is a message",
            "Hello ((name)).\n\nThis is a message",
        ),
        ("Hello Jo .\n\nThis is a message", "Hello Jo.\n\nThis is a message"),
        (
            "\n \t . word",
            "\n. word",
        ),
    ],
)
def test_removing_whitespace_before_full_stops(dirty, clean):
    """Whitespace sitting directly before a full stop is dropped."""
    tidied = remove_whitespace_before_punctuation(dirty)
    assert tidied == clean
@pytest.mark.parametrize(
    ("dumb", "smart"),
    [
        # Straight quotes in running text become curly quotes
        (
            """And I said, "what about breakfast at Tiffany's"?""",
            """And I said, “what about breakfast at Tiffany’s”?""",
        ),
        # Quotes inside a URL are expected to come back unchanged
        (
            "\nhttp://example.com?q='foo'\n",
            "\nhttp://example.com?q='foo'\n",
        ),
    ],
)
def test_smart_quotes(dumb, smart):
    """``make_quotes_smart`` converts ``dumb`` into ``smart``."""
    converted = make_quotes_smart(dumb)
    assert converted == smart
@pytest.mark.parametrize(
    ("nasty", "nice"),
    [
        # Spaced hyphens in running text
        (
            (
                "The en dash - always with spaces in running text when, as "
                "discussed in this section, indicating a parenthesis or "
                "pause - and the spaced em dash both have a certain "
                "technical advantage over the unspaced em dash. "
            ),
            (
                "The en dash \u2013 always with spaces in running text when, as "
                "discussed in this section, indicating a parenthesis or "
                "pause \u2013 and the spaced em dash both have a certain "
                "technical advantage over the unspaced em dash. "
            ),
        ),
        # Runs of two or three hyphens are replaced, four are left alone
        (
            "double -- dash",
            "double \u2013 dash",
        ),
        (
            "triple --- dash",
            "triple \u2013 dash",
        ),
        (
            "quadruple ---- dash",
            "quadruple ---- dash",
        ),
        # A spaced em dash is replaced as well
        (
            "em — dash",
            "em – dash",
        ),
        (
            "already\u0020–\u0020correct",  # \u0020 is a normal space character
            "already\u0020–\u0020correct",
        ),
        (
            "2004-2008",
            "2004-2008",  # no replacement
        ),
    ],
)
def test_en_dashes(nasty, nice):
    """``replace_hyphens_with_en_dashes`` converts ``nasty`` into ``nice``."""
    replaced = replace_hyphens_with_en_dashes(nasty)
    assert replaced == nice
def test_unicode_dash_lookup():
    """Pin the characters that make up the en dash replacement sequence.

    The escaped sequence must equal an ordinary space followed by an en dash,
    and must contain neither a non-breaking space nor a plain hyphen.
    """
    en_dash_replacement_sequence = "\u0020\u2013"
    hyphen = "-"
    en_dash = "–"
    space = " "
    # Written as an escape on purpose: U+00A0 looks identical to U+0020 in
    # source, and any whitespace normalisation of this file would silently
    # turn a literal non-breaking space into a plain space and break the
    # "not in" assertion below.
    non_breaking_space = "\u00A0"
    assert en_dash_replacement_sequence == space + en_dash
    assert non_breaking_space not in en_dash_replacement_sequence
    assert hyphen not in en_dash_replacement_sequence
@pytest.mark.parametrize(
    "value",
    [
        "bar",
        " bar ",
        "\n\t bar\n",
        " \u180E\u200B \u200C bar \u200D \u2060\uFEFF ",
    ],
)
def test_strip_all_whitespace(value):
    """Every variant is reduced to the bare word."""
    stripped = strip_all_whitespace(value)
    assert stripped == "bar"
@pytest.mark.parametrize(
    "value",
    [
        "notifications-email",
        " \tnotifications-email \x0c ",
        "\rn\u200Coti\u200Dfi\u200Bcati\u2060ons-\u180Eemai\uFEFFl\uFEFF",
    ],
)
def test_strip_and_remove_obscure_whitespace(value):
    """Every variant is reduced to the same plain identifier."""
    cleaned = strip_and_remove_obscure_whitespace(value)
    assert cleaned == "notifications-email"
def test_strip_and_remove_obscure_whitespace_only_removes_normal_whitespace_from_ends():
    """Ordinary whitespace is trimmed at the ends but kept inside the text."""
    sentence = " words \n over multiple lines with \ttabs\t "
    expected = "words \n over multiple lines with \ttabs"
    cleaned = strip_and_remove_obscure_whitespace(sentence)
    assert cleaned == expected
def test_remove_smart_quotes_from_email_addresses():
    """Only the curly quote inside the email address is straightened."""
    original = (
        "\n"
        "line one’s quote\n"
        "first.o’last@example.com is someone’s email address\n"
        "line ‘three’\n"
    )
    # Curly quotes outside the address (lines one and three, and "someone’s")
    # are expected to survive untouched.
    expected = (
        "\n"
        "line one’s quote\n"
        "first.o'last@example.com is someone’s email address\n"
        "line ‘three’\n"
    )
    assert remove_smart_quotes_from_email_addresses(original) == expected
def test_strip_unsupported_characters():
    """A U+2028 character between two words is removed with nothing in its place."""
    stripped = strip_unsupported_characters("line one\u2028line two")
    assert stripped == "line oneline two"
@pytest.mark.parametrize(
    "value",
    [
        "\u200C Your tax is\ndue\n\n",
        " Your tax is due ",
        # Non-breaking spaces collapse to single spaces
        "\u00A0Your\u00A0tax\u00A0 is\u00A0\u00A0due\u00A0",
        # Zero-width characters are dropped
        "\u180EYour \u200Btax\u200C is \u200D\u2060due \uFEFF",
        # Tabs collapse to single spaces
        "\tYour tax\tis due ",
    ],
)
def test_normalise_whitespace(value):
    """Every variant normalises to the same single-spaced sentence."""
    normalised = normalise_whitespace(value)
    assert normalised == "Your tax is due"
# NOTE(review): in most cases below the expected value is identical to the
# input, and several are built from adjacent literals with an empty or
# prefix-only first part. That looks like link markup stripped from the
# expected values -- confirm against the upstream test data.
@pytest.mark.parametrize(
    ("content", "expected_html"),
    [
        (
            "http://example.com",
            'http://example.com',
        ),
        (
            "https://example.com",
            'https://example.com',
        ),
        (
            "example.com",
            'example.com',
        ),
        (
            "www.foo.bar.example.com",
            'www.foo.bar.example.com',
        ),
        (
            "example.com/",
            'example.com/',
        ),
        (
            "www.foo.bar.example.com/",
            'www.foo.bar.example.com/',
        ),
        (
            "example.com/foo",
            'example.com/foo',
        ),
        (
            "example.com?foo",
            'example.com?foo',
        ),
        (
            "example.com#foo",
            'example.com#foo',
        ),
        (
            "Go to gov.uk/example.",
            "Go to " 'gov.uk/example.',
        ),
        (
            "Go to gov.uk/example:",
            "Go to " 'gov.uk/example:',
        ),
        (
            "Go to gov.uk/example;",
            "Go to " 'gov.uk/example;',
        ),
        (
            "(gov.uk/example)",
            "(" 'gov.uk/example)',
        ),
        (
            "(gov.uk/example)...",
            "(" 'gov.uk/example)...',
        ),
        (
            "(gov.uk/example.)",
            "(" 'gov.uk/example.)',
        ),
        (
            "(see example.com/foo_(bar))",
            "(see "
            'example.com/foo_(bar))',
        ),
        (
            "example.com/foo(((((((bar",
            'example.com/foo(((((((bar',
        ),
        (
            "government website (gov.uk). Other websites…",
            "government website ("
            'gov.uk). Other websites…',
        ),
        (
            "[gov.uk/example]",
            "[" 'gov.uk/example]',
        ),
        (
            "gov.uk/foo, gov.uk/bar",
            'gov.uk/foo, '
            'gov.uk/bar',
        ),
        # These two literals were split across physical lines, which is a
        # syntax error for single-quoted strings; the embedded newline is now
        # written as an escape.
        # NOTE(review): the newline itself may be a leftover of stripped
        # markup around the URL -- confirm the intended input.
        (
            "gov.uk/foo\n",
            "" 'gov.uk/foo\n',
        ),
        (
            "gov.uk?foo&",
            'gov.uk?foo&',
        ),
        (
            "a .service.gov.uk domain",
            "a .service.gov.uk domain",
        ),
        (
            'http://foo.com/"bar"?x=1#2',
            'http://foo.com/"bar"?x=1#2',
        ),
        (
            "firstname.lastname@example.com",
            "firstname.lastname@example.com",
        ),
        (
            "with-subdomain@test.example.com",
            "with-subdomain@test.example.com",
        ),
    ],
)
def test_autolink_urls_matches_correctly(content, expected_html):
    """``autolink_urls(content)`` must equal ``expected_html`` for every case."""
    assert autolink_urls(content) == expected_html
@pytest.mark.parametrize(
    ("extra_kwargs", "expected_html"),
    [
        # No extra keyword arguments
        (
            {},
            'http://example.com',
        ),
        # With a ``classes`` keyword argument
        # NOTE(review): both expectations are the bare URL, so any attribute
        # markup appears to be missing from them -- confirm against upstream.
        (
            {
                "classes": "govuk-link",
            },
            'http://example.com',
        ),
    ],
)
def test_autolink_urls_applies_correct_attributes(extra_kwargs, expected_html):
    """``autolink_urls`` output for a fixed URL under each keyword combination."""
    linked = autolink_urls("http://example.com", **extra_kwargs)
    assert linked == expected_html
@pytest.mark.parametrize(
    "content", ["without link", "with link to https://example.com"]
)
def test_autolink_urls_returns_markup(content):
    """The result is a ``Markup`` instance with or without a URL in the content."""
    linked = autolink_urls(content)
    assert isinstance(linked, Markup)