Make the guessing a bit more sophisticated

Things we’ve noticed from looking at real data that we could handle in a
smarter way:
- removing numbers (there might be a tom.smith2@dept.gov.uk if tom.smith
  is already taken)
- removing middle initials (again, these tend to be used for
  disambiguation and aren’t included when we ask people for their names)
- ignoring email addresses which only have someone’s initial, not their
  first name (because we can’t make a decent guess in this case)
This commit is contained in:
Chris Hill-Scott
2018-07-11 13:31:38 +01:00
parent 5c5e0bac02
commit 660fadbce7
2 changed files with 44 additions and 3 deletions

View File

@@ -27,6 +27,7 @@ from flask import (
from flask_login import current_user
from notifications_utils.formatters import make_quotes_smart
from notifications_utils.recipients import RecipientCSV
from notifications_utils.take import Take
from notifications_utils.template import (
EmailPreviewTemplate,
LetterImageTemplate,
@@ -616,11 +617,41 @@ def unicode_truncate(s, length):
return encoded.decode('utf-8', 'ignore')
def starts_with_initial(name):
    """Report whether *name* opens with one character followed by a dot.

    The first character may be anything except a newline; the second must
    be a literal ``.``.  Strings shorter than two characters never match.

    :param name: text to inspect, e.g. the local part of an email address
    :returns: ``True`` for inputs such as ``"f.last"``, otherwise ``False``
    """
    initial_then_dot = re.match(r'^.\.', name)
    return initial_then_dot is not None
def remove_middle_initial(name):
    """Strip single-character middle tokens from a whitespace-separated name.

    A run of one or more lone characters, each preceded by whitespace and
    followed by whitespace, is replaced by a single space.  Both
    ``"first m last"`` and ``"first a b last"`` therefore become
    ``"first last"``.  Tokens at the very start or end of the string are
    left alone because they are not surrounded by whitespace on both sides.

    :param name: name whose parts are separated by whitespace
    :returns: *name* with every run of middle initials collapsed to one space
    """
    # The group repeats so that back-to-back initials are removed in one
    # match.  A plain ``\s+.\s+`` swallows the separator that the following
    # initial needs, which leaves every second initial behind.
    return re.sub(r'(?:\s+.)+\s+', ' ', name)
def remove_digits(name):
    """Return *name* with every digit character dropped.

    A character counts as a digit when ``str.isdigit`` says so; all other
    characters, including whitespace, are kept in their original order.

    :param name: text that may contain digits
    :returns: the same text without its digit characters
    """
    kept = []
    for character in name:
        if character.isdigit():
            continue
        kept.append(character)
    return ''.join(kept)
def normalize_spaces(name):
    """Collapse whitespace in *name* to single spaces and trim both ends.

    :param name: text with arbitrary runs of whitespace
    :returns: the whitespace-separated words of *name* joined by one space
    """
    words = name.split()
    return ' '.join(words)
# Derive a display-name guess from the part of an email address that comes
# before the first '@' or '+'. Returns '' when no reasonable guess is possible.
# NOTE(review): this span looks like a rendered diff whose +/- markers and
# indentation were stripped, so superseded and replacement lines sit side by
# side — confirm against the real file before relying on statement order.
def guess_name_from_email_address(email_address):
# Keep only the text before the first '@' or '+'.
possible_name = re.split(r'[\@\+]', email_address)[0]
# NOTE(review): presumably the superseded check, replaced by the line below.
if '.' not in possible_name:
# Give up when there is no dot to separate name parts, or when the text
# starts with a single character followed by a dot (starts_with_initial).
if '.' not in possible_name or starts_with_initial(possible_name):
return ''
# NOTE(review): presumably the superseded return, replaced by the Take chain.
return make_quotes_smart(possible_name.replace('.', ' ').title())
# Callables are listed in this order: dots to spaces, remove_digits,
# remove_middle_initial, str.title, make_quotes_smart, normalize_spaces.
# NOTE(review): assumes each .then() passes the running value to the next
# callable as its first argument — confirm against notifications_utils.take.
return Take(
possible_name
).then(
str.replace, '.', ' '
).then(
remove_digits
).then(
remove_middle_initial
).then(
str.title
).then(
make_quotes_smart
).then(
normalize_spaces
)

View File

@@ -171,13 +171,23 @@ def test_register_with_existing_email_sends_emails(
@pytest.mark.parametrize('email_address, expected_value', [
("example123@example.com", ""),
("first.last@example.com", "First Last"),
("first.middle.last@example.com", "First Middle Last"),
("first.m.last@example.com", "First Last"),
("first.last-last@example.com", "First Last-Last"),
("first.o'last@example.com", "First OLast"),
("first.last+testing@example.com", "First Last"),
("first.last+testing+testing@example.com", "First Last"),
("first.last6@example.com", "First Last"),
("first.last.212@example.com", "First Last"),
("first.2.last@example.com", "First Last"),
("first.2b.last@example.com", "First Last"),
("first.1.2.3.last@example.com", "First Last"),
("first.last.1.2.3@example.com", "First Last"),
# Instances where we can't make a good-enough guess:
("example123@example.com", ""),
("f.last@example.com", ""),
("f.m.last@example.com", ""),
])
def test_shows_registration_page_from_invite(
client_request,