From 92ffe3a78c2691341bc9dc478d55ec341b5dca59 Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Tue, 26 May 2020 17:39:25 +0100 Subject: [PATCH 1/4] Use meta tag to tell search engines not to index MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Google’s documentation says: > robots.txt is not a mechanism for keeping a web page out of Google. To > keep a web page out of Google, you should use noindex directives A noindex directive means adding the following meta tag to pages that shouldn’t be indexed: ```html ``` It’s also possible to set the directive as a HTTP header, but this seems trickier to achieve on a per-view basis in Flask. I’ve implemented this as a decorator so it can quickly be added to any other pages that we decide shouldn’t appear in search results. --- app/main/views/feedback.py | 7 +++++++ app/main/views/register.py | 2 ++ app/main/views/sign_in.py | 2 ++ app/templates/admin_template.html | 3 +++ app/templates/views/support/public.html | 2 +- app/utils.py | 10 +++++++++- tests/app/main/views/test_index.py | 21 +++++++++++++++++++++ 7 files changed, 45 insertions(+), 2 deletions(-) diff --git a/app/main/views/feedback.py b/app/main/views/feedback.py index 65e2dc106..d2494be10 100644 --- a/app/main/views/feedback.py +++ b/app/main/views/feedback.py @@ -19,11 +19,13 @@ from app.models.feedback import ( PROBLEM_TICKET_TYPE, QUESTION_TICKET_TYPE, ) +from app.utils import hide_from_search_engines bank_holidays = BankHolidays(use_cached_holidays=True) @main.route('/support', methods=['GET', 'POST']) +@hide_from_search_engines def support(): if current_user.is_authenticated: @@ -50,12 +52,14 @@ def support(): @main.route('/support/public') +@hide_from_search_engines def support_public(): return render_template('views/support/public.html') @main.route('/support/triage', methods=['GET', 'POST']) @main.route('/support/triage/', methods=['GET', 'POST']) +@hide_from_search_engines def triage(ticket_type=PROBLEM_TICKET_TYPE): form = Triage() if form.validate_on_submit(): @@ -75,6 +79,7 @@ def triage(ticket_type=PROBLEM_TICKET_TYPE): @main.route('/support/', methods=['GET', 'POST']) +@hide_from_search_engines def feedback(ticket_type): form = FeedbackOrProblem() @@ -153,6 +158,7 @@ def feedback(ticket_type): @main.route('/support/escalate', methods=['GET', 'POST']) +@hide_from_search_engines def bat_phone(): if current_user.is_authenticated: @@ -162,6 +168,7 @@ def bat_phone(): @main.route('/support/thanks', methods=['GET', 'POST']) +@hide_from_search_engines def thanks(): return render_template( 'views/support/thanks.html', diff --git a/app/main/views/register.py b/app/main/views/register.py index 8134538f0..35693a5ee 100644 --- a/app/main/views/register.py +++ b/app/main/views/register.py @@ -11,9 +11,11 @@ from app.main.forms import ( ) from app.main.views.verify import activate_user from app.models.user import InvitedOrgUser, InvitedUser, User +from app.utils import hide_from_search_engines @main.route('/register', methods=['GET', 'POST']) +@hide_from_search_engines def register(): if current_user and current_user.is_authenticated: return redirect(url_for('main.show_accounts_or_dashboard')) diff --git a/app/main/views/sign_in.py b/app/main/views/sign_in.py index 567bb3f5a..ae15fa751 100644 --- a/app/main/views/sign_in.py +++ b/app/main/views/sign_in.py @@ -14,9 +14,11 @@ from app import login_manager from app.main import main from app.main.forms import LoginForm from app.models.user import InvitedUser, User +from app.utils import hide_from_search_engines @main.route('/sign-in', methods=(['GET', 'POST'])) +@hide_from_search_engines def sign_in(): if current_user and current_user.is_authenticated: return redirect(url_for('main.show_accounts_or_dashboard')) diff --git a/app/templates/admin_template.html b/app/templates/admin_template.html index 1371f4ff9..ec40d460b 100644 --- a/app/templates/admin_template.html +++ b/app/templates/admin_template.html @@ -17,6 +17,9 @@ + {% if g.hide_from_search_engines %} + + {% endif %} {% block meta_format_detection %} diff --git a/app/templates/views/support/public.html b/app/templates/views/support/public.html index c965e5e72..aca84aece 100644 --- a/app/templates/views/support/public.html +++ b/app/templates/views/support/public.html @@ -3,7 +3,7 @@ {% from "components/page-header.html" import page_header %} {% block per_page_title %} - The GOV.UK Notify team cannot give advice to members of the public + The GOV.UK Notify service is for people who work in the government {% endblock %} {% block maincolumn_content %} diff --git a/app/utils.py b/app/utils.py index 6de6baf3b..ec4313a90 100644 --- a/app/utils.py +++ b/app/utils.py @@ -16,7 +16,7 @@ import pyexcel import pyexcel_xlsx import pytz from dateutil import parser -from flask import abort, current_app, redirect, request, session, url_for +from flask import abort, current_app, g, redirect, request, session, url_for from flask_login import current_user, login_required from notifications_utils.field import Field from notifications_utils.formatters import ( @@ -767,3 +767,11 @@ def is_less_than_90_days_ago(date_from_db): return (datetime.utcnow() - datetime.strptime( date_from_db, "%Y-%m-%dT%H:%M:%S.%fZ" )).days < 90 + + +def hide_from_search_engines(f): + @wraps(f) + def decorated_function(*args, **kwargs): + g.hide_from_search_engines = True + return f(*args, **kwargs) + return decorated_function diff --git a/tests/app/main/views/test_index.py b/tests/app/main/views/test_index.py index ea6f4b54a..6bc322b44 100644 --- a/tests/app/main/views/test_index.py +++ b/tests/app/main/views/test_index.py @@ -3,6 +3,7 @@ from functools import partial import pytest from bs4 import BeautifulSoup from flask import url_for +from freezegun import freeze_time from app.main.forms import FieldWithNoneOption from tests.conftest import SERVICE_ONE_ID, normalize_spaces, sample_uuid @@ -76,6 +77,26 @@ def test_robots(client): ) +@pytest.mark.parametrize('endpoint, kwargs', ( + ('sign_in', {}), + ('support', {}), + ('support_public', {}), + ('triage', {}), + ('feedback', {'ticket_type': 'ask-question-give-feedback'}), + ('feedback', {'ticket_type': 'general'}), + ('feedback', {'ticket_type': 'report-problem'}), + ('bat_phone', {}), + ('thanks', {}), + ('register', {}), + pytest.param('index', {}, marks=pytest.mark.xfail(raises=AssertionError)), +)) +@freeze_time('2012-12-12 12:12') # So we don’t go out of business hours +def test_hiding_pages_from_search_engines(client_request, endpoint, kwargs): + client_request.logout() + page = client_request.get(f'main.{endpoint}', **kwargs) + assert page.select_one('meta[name=robots]')['content'] == 'noindex' + + @pytest.mark.parametrize('view', [ 'cookies', 'privacy', 'pricing', 'terms', 'roadmap', 'features', 'documentation', 'security', From f902205ef380da595cd38774a9f00c74667b19ff Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Tue, 26 May 2020 17:45:55 +0100 Subject: [PATCH 2/4] Remove email features page from search engines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reimplements https://github.com/alphagov/notifications-aws/pull/796 Since deploying alphagov/notifications-utils#736 I’ve been looking at how members of the public are ending up on our support page. The vast majority are landing on https://www.notifications.service.gov.uk/features/email Previously we thought that they were clicking the ‘contact us’ link in the page, which deep linked into the support journey, so we removed these deep links in alphagov/notifications-admin#3451 But the tickets are still coming in, so I think that people are still landing on this page, then going directly to ‘support’ in the top navigation. So the next measure we have available is to try to stop people from landing on this page in the first place. All the examples I’ve looked at show people coming from Google to this page. By putting the page’s URL in our robots.txt it should stop Google (and other search engines) listing it in search results. --- app/main/views/index.py | 3 ++- tests/app/main/views/test_index.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/app/main/views/index.py b/app/main/views/index.py index 4ed477e70..a36fdd133 100644 --- a/app/main/views/index.py +++ b/app/main/views/index.py @@ -21,7 +21,7 @@ from app.main.views.sub_navigation_dictionaries import ( using_notify_nav, ) from app.models.feedback import QUESTION_TICKET_TYPE -from app.utils import get_logo_cdn_domain +from app.utils import get_logo_cdn_domain, hide_from_search_engines @main.route('/') @@ -253,6 +253,7 @@ def roadmap(): @main.route('/features/email') +@hide_from_search_engines def features_email(): return render_template( 'views/features/emails.html', diff --git a/tests/app/main/views/test_index.py b/tests/app/main/views/test_index.py index 6bc322b44..7dc94201a 100644 --- a/tests/app/main/views/test_index.py +++ b/tests/app/main/views/test_index.py @@ -88,6 +88,7 @@ def test_robots(client): ('bat_phone', {}), ('thanks', {}), ('register', {}), + ('features_email', {}), pytest.param('index', {}, marks=pytest.mark.xfail(raises=AssertionError)), )) @freeze_time('2012-12-12 12:12') # So we don’t go out of business hours From f12f0fae87bf0301acfba54287852cbed73495d5 Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Tue, 26 May 2020 17:47:43 +0100 Subject: [PATCH 3/4] Remove robots.txt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Google’s documentation says: > robots.txt is not a mechanism for keeping a web page out of Google. To > keep a web page out of Google, you should use noindex directives We’ve implemented a noindex directive now, so we don’t need to serve robots.txt any more. --- app/main/views/index.py | 11 ----------- app/navigation.py | 4 ---- tests/app/main/views/test_index.py | 14 ++------------ 3 files changed, 2 insertions(+), 27 deletions(-) diff --git a/app/main/views/index.py b/app/main/views/index.py index a36fdd133..7e59a0347 100644 --- a/app/main/views/index.py +++ b/app/main/views/index.py @@ -36,17 +36,6 @@ def index(): ) -@main.route('/robots.txt') -def robots(): - return ( - 'User-agent: *\n' - 'Disallow: /sign-in\n' - 'Disallow: /support\n' - 'Disallow: /support/\n' - 'Disallow: /register\n' - ), 200, {'Content-Type': 'text/plain'} - - @main.route('/error/') def error(status_code): if status_code >= 500: diff --git a/app/navigation.py b/app/navigation.py index b407db0a5..d98fba4cb 100644 --- a/app/navigation.py +++ b/app/navigation.py @@ -256,7 +256,6 @@ class HeaderNavigation(Navigation): 'returned_letters', 'returned_letters_report', 'revoke_api_key', - 'robots', 'send_messages', 'send_notification', 'send_one_off', @@ -608,7 +607,6 @@ class MainNavigation(Navigation): 'returned_letters_report', 'revalidate_email_sent', 'roadmap', - 'robots', 'security', 'send_notification', 'send_from_contact_list', @@ -870,7 +868,6 @@ class CaseworkNavigation(Navigation): 'revalidate_email_sent', 'revoke_api_key', 'roadmap', - 'robots', 'security', 'send_messages', 'send_notification', @@ -1163,7 +1160,6 @@ class OrgNavigation(Navigation): 'revalidate_email_sent', 'revoke_api_key', 'roadmap', - 'robots', 'security', 'send_messages', 'send_notification', diff --git a/tests/app/main/views/test_index.py b/tests/app/main/views/test_index.py index 7dc94201a..9242b9c88 100644 --- a/tests/app/main/views/test_index.py +++ b/tests/app/main/views/test_index.py @@ -63,18 +63,8 @@ def test_logged_in_user_redirects_to_choose_account( ) -def test_robots(client): - assert url_for('main.robots') == '/robots.txt' - response = client.get(url_for('main.robots')) - assert response.headers['Content-Type'] == 'text/plain' - assert response.status_code == 200 - assert response.get_data(as_text=True) == ( - 'User-agent: *\n' - 'Disallow: /sign-in\n' - 'Disallow: /support\n' - 'Disallow: /support/\n' - 'Disallow: /register\n' - ) +def test_robots(client_request): + client_request.get_url('/robots.txt', _expected_status=404) @pytest.mark.parametrize('endpoint, kwargs', ( From 978ebcbe9b3bc21f592d8e8fb4617443ca797de4 Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Wed, 27 May 2020 09:03:14 +0100 Subject: [PATCH 4/4] Send HTTP header as well as inserting meta tag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This will let us use the decorator on endpoints that don’t return HTML. --- app/utils.py | 15 +++++++++++++-- tests/app/main/views/test_index.py | 13 ++++++++++--- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/app/utils.py b/app/utils.py index ec4313a90..01f85de3b 100644 --- a/app/utils.py +++ b/app/utils.py @@ -16,7 +16,16 @@ import pyexcel import pyexcel_xlsx import pytz from dateutil import parser -from flask import abort, current_app, g, redirect, request, session, url_for +from flask import ( + abort, + current_app, + g, + make_response, + redirect, + request, + session, + url_for, +) from flask_login import current_user, login_required from notifications_utils.field import Field from notifications_utils.formatters import ( @@ -773,5 +782,7 @@ def hide_from_search_engines(f): @wraps(f) def decorated_function(*args, **kwargs): g.hide_from_search_engines = True - return f(*args, **kwargs) + response = make_response(f(*args, **kwargs)) + response.headers['X-Robots-Tag'] = 'noindex' + return response return decorated_function diff --git a/tests/app/main/views/test_index.py b/tests/app/main/views/test_index.py index 9242b9c88..25bbc69d5 100644 --- a/tests/app/main/views/test_index.py +++ b/tests/app/main/views/test_index.py @@ -82,9 +82,16 @@ def test_robots(client_request): pytest.param('index', {}, marks=pytest.mark.xfail(raises=AssertionError)), )) @freeze_time('2012-12-12 12:12') # So we don’t go out of business hours -def test_hiding_pages_from_search_engines(client_request, endpoint, kwargs): - client_request.logout() - page = client_request.get(f'main.{endpoint}', **kwargs) +def test_hiding_pages_from_search_engines( + client, + mock_get_service_and_organisation_counts, + endpoint, + kwargs, +): + response = client.get(url_for(f'main.{endpoint}', **kwargs)) + assert 'X-Robots-Tag' in response.headers + assert response.headers['X-Robots-Tag'] == 'noindex' + page = BeautifulSoup(response.data.decode('utf-8'), 'html.parser') assert page.select_one('meta[name=robots]')['content'] == 'noindex'