From 660fadbce7771f2844187ffe1fbb1917a03f73ea Mon Sep 17 00:00:00 2001
From: Chris Hill-Scott <me@quis.cc>
Date: Wed, 11 Jul 2018 13:31:38 +0100
Subject: [PATCH] Make guessing names from email addresses a bit more sophisticated
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Things we’ve noticed from looking at real data that we could handle in a
smarter way:
- removing numbers (there might be a tom.smith2@dept.gov.uk if tom.smith
  is already taken)
- removing middle initials (again, these tend to be used for
  disambiguation and aren’t included when we ask people for their names)
- ignoring email addresses which only have someone’s initial, not their
  first name (because we can’t make a decent guess in this case)
---
 app/utils.py                          | 35 +++++++++++++++++++++++++--
 tests/app/main/views/test_register.py | 12 ++++++++-
 2 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/app/utils.py b/app/utils.py
index b57732bd4..2c57a5d9c 100644
--- a/app/utils.py
+++ b/app/utils.py
@@ -27,6 +27,7 @@ from flask import (
 from flask_login import current_user
 from notifications_utils.formatters import make_quotes_smart
 from notifications_utils.recipients import RecipientCSV
+from notifications_utils.take import Take
 from notifications_utils.template import (
     EmailPreviewTemplate,
     LetterImageTemplate,
@@ -616,11 +617,41 @@ def unicode_truncate(s, length):
     return encoded.decode('utf-8', 'ignore')
 
 
+def starts_with_initial(name):
+    return bool(re.match(r'^.\.', name))
+
+
+def remove_middle_initial(name):
+    return re.sub(r'\s+.(?=\s)', '', name)  # lookahead leaves the separator, so adjacent initials all match
+
+
+def remove_digits(name):
+    return ''.join(c for c in name if not c.isdigit())
+
+
+def normalize_spaces(name):
+    return ' '.join(name.split())
+
+
 def guess_name_from_email_address(email_address):
 
     possible_name = re.split(r'[\@\+]', email_address)[0]
 
-    if '.' not in possible_name:
+    if '.' not in possible_name or starts_with_initial(possible_name):
         return ''
 
-    return make_quotes_smart(possible_name.replace('.', ' ').title())
+    return Take(
+        possible_name
+    ).then(
+        str.replace, '.', ' '
+    ).then(
+        remove_digits
+    ).then(
+        remove_middle_initial
+    ).then(
+        str.title
+    ).then(
+        make_quotes_smart
+    ).then(
+        normalize_spaces
+    )
diff --git a/tests/app/main/views/test_register.py b/tests/app/main/views/test_register.py
index 45c4aa714..d5cd1847c 100644
--- a/tests/app/main/views/test_register.py
+++ b/tests/app/main/views/test_register.py
@@ -171,13 +171,23 @@ def test_register_with_existing_email_sends_emails(
 
 
 @pytest.mark.parametrize('email_address, expected_value', [
-    ("example123@example.com", ""),
     ("first.last@example.com", "First Last"),
     ("first.middle.last@example.com", "First Middle Last"),
+    ("first.m.last@example.com", "First Last"),
     ("first.last-last@example.com", "First Last-Last"),
     ("first.o'last@example.com", "First O’Last"),
     ("first.last+testing@example.com", "First Last"),
     ("first.last+testing+testing@example.com", "First Last"),
+    ("first.last6@example.com", "First Last"),
+    ("first.last.212@example.com", "First Last"),
+    ("first.2.last@example.com", "First Last"),
+    ("first.2b.last@example.com", "First Last"),
+    ("first.1.2.3.last@example.com", "First Last"),
+    ("first.last.1.2.3@example.com", "First Last"),
+    # Instances where we can’t make a good-enough guess:
+    ("example123@example.com", ""),
+    ("f.last@example.com", ""),
+    ("f.m.last@example.com", ""),
 ])
 def test_shows_registration_page_from_invite(
     client_request,