From 660fadbce7771f2844187ffe1fbb1917a03f73ea Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Wed, 11 Jul 2018 13:31:38 +0100 Subject: [PATCH] Make the guessing a bit more sophisticated MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Things we’ve noticed from looking at real data that we could handle in a smarter way: - removing numbers (there might be a tom.smith2@dept.gov.uk if tom.smith is already taken) - removing middle initials (again, these tend to be used for disambiguation and aren’t included when we ask people for their names) - ignoring email addresses which only have someone’s initial, not their first name (because we can’t make a decent guess in this case) --- app/utils.py | 35 +++++++++++++++++++++++++-- tests/app/main/views/test_register.py | 12 ++++++++- 2 files changed, 44 insertions(+), 3 deletions(-) diff --git a/app/utils.py b/app/utils.py index b57732bd4..2c57a5d9c 100644 --- a/app/utils.py +++ b/app/utils.py @@ -27,6 +27,7 @@ from flask import ( from flask_login import current_user from notifications_utils.formatters import make_quotes_smart from notifications_utils.recipients import RecipientCSV +from notifications_utils.take import Take from notifications_utils.template import ( EmailPreviewTemplate, LetterImageTemplate, @@ -616,11 +617,41 @@ def unicode_truncate(s, length): return encoded.decode('utf-8', 'ignore') +def starts_with_initial(name): + return bool(re.match(r'^.\.', name)) + + +def remove_middle_initial(name): + return re.sub(r'\s+.\s+', ' ', name) + + +def remove_digits(name): + return ''.join(c for c in name if not c.isdigit()) + + +def normalize_spaces(name): + return ' '.join(name.split()) + + def guess_name_from_email_address(email_address): possible_name = re.split(r'[\@\+]', email_address)[0] - if '.' not in possible_name: + if '.' not in possible_name or starts_with_initial(possible_name): return '' - return make_quotes_smart(possible_name.replace('.', ' ').title()) + return Take( + possible_name + ).then( + str.replace, '.', ' ' + ).then( + remove_digits + ).then( + remove_middle_initial + ).then( + str.title + ).then( + make_quotes_smart + ).then( + normalize_spaces + ) diff --git a/tests/app/main/views/test_register.py b/tests/app/main/views/test_register.py index 45c4aa714..d5cd1847c 100644 --- a/tests/app/main/views/test_register.py +++ b/tests/app/main/views/test_register.py @@ -171,13 +171,23 @@ def test_register_with_existing_email_sends_emails( @pytest.mark.parametrize('email_address, expected_value', [ - ("example123@example.com", ""), ("first.last@example.com", "First Last"), ("first.middle.last@example.com", "First Middle Last"), + ("first.m.last@example.com", "First Last"), ("first.last-last@example.com", "First Last-Last"), ("first.o'last@example.com", "First O’Last"), ("first.last+testing@example.com", "First Last"), ("first.last+testing+testing@example.com", "First Last"), + ("first.last6@example.com", "First Last"), + ("first.last.212@example.com", "First Last"), + ("first.2.last@example.com", "First Last"), + ("first.2b.last@example.com", "First Last"), + ("first.1.2.3.last@example.com", "First Last"), + ("first.last.1.2.3@example.com", "First Last"), + # Instances where we can’t make a good-enough guess: + ("example123@example.com", ""), + ("f.last@example.com", ""), + ("f.m.last@example.com", ""), ]) def test_shows_registration_page_from_invite( client_request,