Accept common spreadsheet formats, not just CSV

We require users to export their spreadsheets as CSV files before
uploading them. But this seems like the sort of thing a computer should
be able to do.

So this commit adds a wrapper class which:
- takes a the uploaded file
- returns it in a normalised format, or reads it using pyexcel[1]
- gives the data back in CSV format

This allows us to accept `.csv`, `.xlsx`, `.xls` (97 and 95), `.ods`,
`.xlsm` and `.tsv` files. We can upload the resultant CSV just like
normal, and process it for errors as before.

Testing
---

To test this I’ve added a selection of common spreadsheet files as test
data. They all contain the same data, so the tests look to see that the
resultant CSV output is the same for each.

UI changes
---

This commit doesn’t change the UI, apart from to give a different error
message if a user uploads a file type that we still don’t understand.

I intend to do this as a separate pull request, in order to fulfil
https://www.pivotaltracker.com/story/show/119371637
This commit is contained in:
Chris Hill-Scott
2016-05-05 15:41:11 +01:00
parent a348bce191
commit 1409ca36ca
13 changed files with 111 additions and 9 deletions

View File

@@ -1,6 +1,7 @@
import re
from wtforms import ValidationError
from notifications_utils.template import Template
from app.utils import Spreadsheet
class Blacklist(object):
@@ -20,8 +21,8 @@ class CsvFileValidator(object):
self.message = message
def __call__(self, form, field):
if not field.data.filename.lower().endswith('.csv'):
raise ValidationError("{} is not a CSV file".format(field.data.filename))
if not Spreadsheet.can_handle(field.data.filename):
raise ValidationError("{} isnt a spreadsheet that Notify can read".format(field.data.filename))
class ValidEmailDomainRegex(object):

View File

@@ -27,7 +27,7 @@ from app.main.uploader import (
s3download
)
from app import job_api_client, service_api_client, current_service, user_api_client, statistics_api_client
from app.utils import user_has_permissions, get_errors_for_csv
from app.utils import user_has_permissions, get_errors_for_csv, Spreadsheet
def get_send_button_text(template_type, number_of_messages):
@@ -112,10 +112,7 @@ def send_messages(service_id, template_id):
s3upload(
upload_id,
service_id,
{
'file_name': form.file.data.filename,
'data': form.file.data.read().decode('utf-8')
},
Spreadsheet.from_file(form.file.data.filename, form.file.data).as_dict,
current_app.config['AWS_REGION']
)
session['upload_data'] = {

View File

@@ -1,8 +1,14 @@
import re
import csv
from io import StringIO
from io import BytesIO, StringIO
from os import path
from functools import wraps
from flask import (abort, session, request, url_for)
import pyexcel
import pyexcel.ext.io
import pyexcel.ext.xls
import pyexcel.ext.xlsx
import pyexcel.ext.ods3
class BrowsableItem(object):
@@ -131,3 +137,51 @@ def email_safe(string):
return "".join([
character.lower() if character.isalnum() or character == "." else "" for character in re.sub("\s+", ".", string.strip()) # noqa
])
class Spreadsheet():
allowed_file_extensions = ['csv', 'xlsx', 'xls', 'ods', 'xlsm', 'tsv']
def __init__(self, filename, csv_data):
self.filename = filename
self.as_csv_data = csv_data
self.as_dict = {
'file_name': self.filename,
'data': self.as_csv_data
}
@classmethod
def can_handle(cls, filename):
return cls.get_extension(filename) in cls.allowed_file_extensions
@staticmethod
def get_extension(filename):
return path.splitext(filename)[1].lower().lstrip('.')
@staticmethod
def normalise_newlines(file_content):
return '\r\n'.join(file_content.getvalue().decode('utf-8').splitlines())
@classmethod
def from_file(cls, filename, file_content):
extension = cls.get_extension(filename)
if extension == 'csv':
return cls(filename, Spreadsheet.normalise_newlines(file_content))
if extension == 'tsv':
file_content = StringIO(Spreadsheet.normalise_newlines(file_content))
with StringIO() as converted:
output = csv.writer(converted)
for row in pyexcel.get_sheet(
file_type=extension,
file_content=file_content.getvalue()
).to_array():
output.writerow(row)
return cls(filename, converted.getvalue())

View File

@@ -8,6 +8,11 @@ Pygments==2.0.2
py-gfm==0.1.2
blinker==1.4
monotonic==0.3
pyexcel==0.2.1
pyexcel-io==0.1.0
pyexcel-xls==0.1.0
pyexcel-xlsx==0.1.0
pyexcel-ods3==0.1.1
git+https://github.com/alphagov/notifications-python-client.git@1.0.0#egg=notifications-python-client==1.0.0

View File

@@ -1,6 +1,8 @@
import pytest
import re
from io import BytesIO
from os import path
from glob import glob
from bs4 import BeautifulSoup
from flask import url_for
from unittest.mock import ANY
@@ -8,6 +10,43 @@ from tests import validate_route_permission, job_json_with_created_by
template_types = ['email', 'sms']
# The * ignores hidden files, eg .DS_Store
test_spreadsheet_files = glob(path.join('tests', 'spreadsheet_files', '*'))
def test_that_test_files_exist():
assert len(test_spreadsheet_files) == 8
@pytest.mark.parametrize("filename", test_spreadsheet_files)
def test_upload_files_in_different_formats(
filename,
app_,
api_user_active,
mocker,
mock_login,
mock_get_service,
mock_get_service_template,
mock_s3_upload,
mock_has_permissions,
fake_uuid
):
with app_.test_request_context(), app_.test_client() as client, open(filename, 'rb') as uploaded:
client.login(api_user_active)
client.post(
url_for('main.send_messages', service_id=fake_uuid, template_id=fake_uuid),
data={'file': (BytesIO(uploaded.read()), filename)},
content_type='multipart/form-data'
)
assert mock_s3_upload.call_args[0][2]['data'].strip() == (
"phone number,name,favourite colour,fruit\r\n"
"07739 468 050,Pete,Coral,tomato\r\n"
"07527 125 974,Not Pete,Magenta,Avacado\r\n"
"07512 058 823,Still Not Pete,Crimson,Pear"
)
def test_upload_csvfile_with_errors_shows_check_page_with_errors(
app_,
@@ -74,7 +113,7 @@ def test_upload_csv_invalid_extension(app_,
)
assert resp.status_code == 200
assert "{} is not a CSV file".format(filename) in resp.get_data(as_text=True)
assert "{} isnt a spreadsheet that Notify can read".format(filename) in resp.get_data(as_text=True)
def test_send_test_sms_message(

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,4 @@
phone number,name,favourite colour,fruit
07739 468 050,Pete,Coral,tomato
07527 125 974,Not Pete,Magenta,Avacado
07512 058 823,Still Not Pete,Crimson,Pear
1 phone number name favourite colour fruit
2 07739 468 050 Pete Coral tomato
3 07527 125 974 Not Pete Magenta Avacado
4 07512 058 823 Still Not Pete Crimson Pear

View File

@@ -0,0 +1 @@
phone number,name,favourite colour,fruit
1 phone number name favourite colour fruit 07739 468 050 Pete Coral tomato 07527 125 974 Not Pete Magenta Avacado 07512 058 823 Still Not Pete Crimson Pear

Binary file not shown.

View File

@@ -0,0 +1 @@
phone number name favourite colour fruit
1 phone number name favourite colour fruit 07739 468 050 Pete Coral tomato 07527 125 974 Not Pete Magenta Avacado 07512 058 823 Still Not Pete Crimson Pear