Files
notifications-admin/notifications_utils/sanitise_text.py

311 lines
12 KiB
Python
Raw Permalink Normal View History

import ast
import unicodedata
from regex import regex
class SanitiseText:
    """
    Base class for coercing text into a restricted character set.

    Subclasses define ALLOWED_CHARACTERS. `encode` keeps allowed characters
    and characters belonging to supported extended-language scripts,
    downgrades characters that have a sensible replacement (dashes, smart
    quotes, accented letters, ...), and replaces everything else with '?'.
    """

    # Overridden by subclasses with their permitted character set.
    ALLOWED_CHARACTERS = set()

    # Characters with a sensible downgrade. Keys are spelled as \uXXXX
    # escapes so the mapping survives any re-encoding of this file.
    REPLACEMENT_CHARACTERS = {
        "\u2013": "-",  # EN DASH (U+2013)
        "\u2014": "-",  # EM DASH (U+2014)
        "\u2026": "...",  # HORIZONTAL ELLIPSIS (U+2026)
        "\u2018": "'",  # LEFT SINGLE QUOTATION MARK (U+2018)
        "\u2019": "'",  # RIGHT SINGLE QUOTATION MARK (U+2019)
        "\u201c": '"',  # LEFT DOUBLE QUOTATION MARK (U+201C)
        "\u201d": '"',  # RIGHT DOUBLE QUOTATION MARK (U+201D)
        "\u180e": "",  # Mongolian vowel separator
        "\u200b": "",  # zero width space
        "\u200c": "",  # zero width non-joiner
        "\u200d": "",  # zero width joiner
        "\u2060": "",  # word joiner
        "\ufeff": "",  # zero width non-breaking space
        "\u00a0": " ",  # NO-BREAK SPACE (U+00A0; previously mislabelled as U+200B)
        "\t": " ",  # TAB
    }

    @classmethod
    def encode(cls, content):
        """Return `content` with every character made compatible via `encode_char`."""
        return "".join(cls.encode_char(char) for char in content)

    @classmethod
    def get_non_compatible_characters(cls, content):
        """
        Given an input string, return a set of non compatible characters.

        This follows the same rules as `cls.encode`, but returns just the
        characters that encode would replace with `?`: not allowed, not part
        of a supported extended-language script, and with no known downgrade.
        """
        return {
            c
            for c in content
            if c not in cls.ALLOWED_CHARACTERS
            and cls.is_extended_language(c) is False
            and cls.downgrade_character(c) is None
        }

    @staticmethod
    def get_unicode_char_from_codepoint(codepoint):
        """
        Given a unicode codepoint as a 4-digit uppercase hex string (eg
        '002E' for '.', '0061' for 'a'), return that actual unicode character.

        unicodedata.decomposition returns strings containing codepoints, so
        we need to parse them ourselves.

        Raises:
            ValueError: if `codepoint` is not exactly 4 uppercase hex digits.
        """
        # Validate strictly so we never turn junk into characters.
        if not set(codepoint) <= set("0123456789ABCDEF") or not len(codepoint) == 4:
            raise ValueError("{} is not a valid unicode codepoint".format(codepoint))
        # chr(int(..., 16)) gives the same result as the previous
        # ast.literal_eval('"\\uXXXX"') approach for every valid input,
        # without evaluating a constructed string.
        return chr(int(codepoint, 16))

    @classmethod
    def downgrade_character(cls, c):
        """
        Attempt to downgrade a non-compatible character to the allowed
        character set. May downgrade to multiple characters, eg U+2026 -> ...

        Will return None if character is either already valid or has no known
        downgrade.
        """
        decomposed = unicodedata.decomposition(c)
        if decomposed != "" and "<" not in decomposed:
            # decomposition lists the unicode code points a character is made
            # up of, if it's made up of multiple points. For example the á
            # character returns '0061 0301': the character a, followed by a
            # combining acute accent. The decomposition might, however, also
            # contain a decomposition mapping in angle brackets. For a full
            # list of the types, see:
            # https://www.compart.com/en/unicode/decomposition
            # If it's got a mapping, we're not sure how best to downgrade it,
            # so just see if it's in the REPLACEMENT_CHARACTERS map below.
            # ASSUMPTION: The first character of a combined unicode character
            # (eg 'á' == '0061 0301') will be the ascii char.
            try:
                return cls.get_unicode_char_from_codepoint(decomposed.split()[0])
            except ValueError:
                # Astral-plane characters (eg U+1D15E) decompose to 5-digit
                # codepoints, which previously escaped as a ValueError and
                # crashed `encode`; treat them as having no downgrade so
                # callers substitute '?'.
                return None
        else:
            # try and find a mapping (eg en dash U+2013 -> hyphen '-'),
            # else return None
            return cls.REPLACEMENT_CHARACTERS.get(c)

    @classmethod
    def is_japanese(cls, value):
        """True if `value` contains any Han, Hiragana or Katakana characters."""
        if regex.search(r"([\p{IsHan}\p{IsHiragana}\p{IsKatakana}]+)", value):
            return True
        return False

    @classmethod
    def is_chinese(cls, value):
        # This range supports all "CJK Unified Ideographs".
        # It may be missing some rare/historic characters that are not in
        # common use.
        # NOTE(review): several punctuation entries in this list were lost in
        # transcription; the \uXXXX entries below are reconstructed common
        # fullwidth/CJK punctuation - verify against the upstream source.
        if regex.search(r"[\u4e00-\u9fff]+", value) or value in [
            "\u3002",  # IDEOGRAPHIC FULL STOP
            "\uff0c",  # FULLWIDTH COMMA
            "\u3001",  # IDEOGRAPHIC COMMA
            "\uff1f",  # FULLWIDTH QUESTION MARK
            "\uff01",  # FULLWIDTH EXCLAMATION MARK
            ";",
            "(",
            ")",
            "\uff1a",  # FULLWIDTH COLON
            "\u300c",  # LEFT CORNER BRACKET
            "\u300d",  # RIGHT CORNER BRACKET
        ]:
            return True
        return False

    @classmethod
    def is_arabic(cls, value):
        # For some reason, the python definition of Arabic (IsArabic) doesn't
        # include some standard diacritics, so add them here.
        if (
            regex.search(r"\p{IsArabic}", value)
            or regex.search(r"[\uFE70]+", value)
            or regex.search(r"[\u064B]+", value)
            or regex.search(r"[\u064F]+", value)
        ):
            return True
        return False

    @classmethod
    def is_punjabi(cls, value):
        """True for Gurmukhi script or Shahmukhi (Arabic-based) script ranges."""
        if regex.search(r"[\u0A00-\u0A7F]+", value):  # Gurmukhi
            return True
        elif regex.search(r"[\u0600-\u06FF]+", value):  # Arabic
            return True
        elif regex.search(r"[\u0750-\u077F]+", value):  # Arabic Supplement
            return True
        elif regex.search(r"[\u08A0-\u08FF]+", value):  # Arabic Extended-A
            return True
        elif regex.search(r"[\uFB50-\uFDFF]+", value):  # Arabic Pres. Forms-A
            return True
        elif regex.search(r"[\uFE70-\uFEFF]+", value):  # Arabic Pres. Forms-B
            return True
        elif regex.search(r"[\u0900-\u097F]+", value):  # Devanagari
            return True
        return False

    @classmethod
    def _is_extended_language_group_one(cls, value):
        # Korean, Cyrillic, Arabic, Armenian, Bengali, Punjabi.
        if regex.search(r"\p{IsHangul}", value):  # Korean
            return True
        elif regex.search(r"\p{IsCyrillic}", value):
            return True
        elif cls.is_arabic(value):
            return True
        elif regex.search(r"\p{IsArmenian}", value):
            return True
        elif regex.search(r"\p{IsBengali}", value):
            return True
        elif cls.is_punjabi(value):
            return True
        return False

    @classmethod
    def _is_extended_language_group_two(cls, value):
        # Buhid through Georgian, one script property per check.
        if regex.search(r"\p{IsBuhid}", value):
            return True
        if regex.search(r"\p{IsCanadian_Aboriginal}", value):
            return True
        if regex.search(r"\p{IsCherokee}", value):
            return True
        if regex.search(r"\p{IsDevanagari}", value):
            return True
        if regex.search(r"\p{IsEthiopic}", value):
            return True
        if regex.search(r"\p{IsGeorgian}", value):
            return True
        return False

    @classmethod
    def _is_extended_language_group_three(cls, value):
        # Greek through Kannada, one script property per check.
        if regex.search(r"\p{IsGreek}", value):
            return True
        if regex.search(r"\p{IsGujarati}", value):
            return True
        if regex.search(r"\p{IsHanunoo}", value):
            return True
        if regex.search(r"\p{IsHebrew}", value):
            return True
        if regex.search(r"\p{IsLimbu}", value):
            return True
        if regex.search(r"\p{IsKannada}", value):
            return True
        return False

    @classmethod
    def _is_extended_language_group_four(cls, value):
        # Several scripts batched into single character classes, plus
        # accent-based checks for Vietnamese and Turkish.
        if regex.search(
            r"([\p{IsKhmer}\p{IsLao}\p{IsMongolian}\p{IsMyanmar}\p{IsTibetan}\p{IsYi}]+)",
            value,
        ):
            return True
        if regex.search(
            r"([\p{IsOgham}\p{IsOriya}\p{IsSinhala}\p{IsSyriac}\p{IsTagalog}]+)", value
        ):
            return True
        if regex.search(
            r"([\p{IsTagbanwa}\p{IsTaiLe}\p{IsTamil}\p{IsTelugu}\p{IsThaana}\p{IsThai}]+)",
            value,
        ):
            return True
        # Vietnamese
        if regex.search(
            r"\b\S*[AĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴđa-zA-Z]+\S*\b",  # noqa
            value,
        ):
            return True
        # Turkish
        # NOTE(review): this character class includes plain a-zA-Z, so any
        # word of ASCII letters also matches - presumably intentional, but
        # worth confirming against upstream.
        if regex.search(r"\b\S*[a-zA-ZçğışöüÇĞİŞÖÜ]+\S*\b", value):
            return True
        return False

    @classmethod
    def is_extended_language(cls, value):
        """
        True if `value` contains characters from a supported extended
        language script.

        Languages are combined in groups to handle cyclomatic complexity
        warnings.
        """
        if cls._is_extended_language_group_one(value):
            return True
        if cls._is_extended_language_group_two(value):
            return True
        if cls._is_extended_language_group_three(value):
            return True
        if cls.is_japanese(value):
            return True
        if cls._is_extended_language_group_four(value):
            return True
        if cls.is_chinese(value):
            return True
        return False

    @classmethod
    def encode_char(cls, c):
        """
        Given a single unicode character, return a compatible character from
        the allowed set: the character itself if allowed or in a supported
        extended language, its downgrade if one exists, else '?'.
        """
        # char is a good character already - return that native character.
        if c in cls.ALLOWED_CHARACTERS:
            return c
        elif cls.is_extended_language(c):
            return c
        else:
            c = cls.downgrade_character(c)
            return c if c is not None else "?"
class SanitiseSMS(SanitiseText):
    """
    Given an input string, makes it GSM and Welsh character compatible. This
    involves removing all non-gsm characters by applying the following rules:

    * characters within the GSM character set
      (https://en.wikipedia.org/wiki/GSM_03.38) and extension character set
      are kept
    * Welsh characters not included in the default GSM character set are kept
    * characters with sensible downgrades are replaced in place
    * characters with diacritics (accents, umlauts, cedillas etc) are
      replaced with their base character, eg é -> e
    * en dash and em dash (U+2013 and U+2014) are replaced with hyphen (-)
    * left/right quotation marks (U+2018, U+2019, U+201C, U+201D) are
      replaced with ' and "
    * zero width spaces (sometimes used to stop eg "gov.uk" linkifying) are
      removed
    * tabs are replaced with a single space
    * any remaining unicode characters (eg chinese/cyrillic/glyphs/emoji) are
      replaced with ?
    """

    # Welsh vowels (plus w/y) carrying each of the four accent types.
    WELSH_DIACRITICS = set(
        "àèìòùẁỳÀÈÌÒÙẀỲ"  # grave
        "áéíóúẃýÁÉÍÓÚẂÝ"  # acute
        "äëïöüẅÿÄËÏÖÜẄŸ"  # diaeresis
        "âêîôûŵŷÂÊÎÔÛŴŶ"  # circumflex (carets)
    )

    # Characters reachable through the GSM 03.38 escape mechanism.
    EXTENDED_GSM_CHARACTERS = {"^", "{", "}", "\\", "[", "~", "]", "|", "€"}

    # The default GSM 03.38 alphabet, combined with the extension characters.
    GSM_CHARACTERS = set(
        "@£$¥èéùìòÇ\nØø\rÅåΔ_ΦΓΛΩΠΨΣΘΞ\x1bÆæßÉ !\"%&'()*+,-./0123456789:;<=>?"
        "¡ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÑܧ¿abcdefghijklmnopqrstuvwxyzäöñüà"
    ).union(EXTENDED_GSM_CHARACTERS)

    ALLOWED_CHARACTERS = GSM_CHARACTERS.union(WELSH_DIACRITICS)

    # some welsh characters are in GSM and some aren't - we need to
    # distinguish between these for counting fragments
    WELSH_NON_GSM_CHARACTERS = WELSH_DIACRITICS.difference(GSM_CHARACTERS)
class SanitiseASCII(SanitiseText):
    """
    As SanitiseSMS above, but the allowed characters are printable ascii:
    the character range 32 to 126 inclusive.
    """

    # Exactly the printable ASCII characters, space through tilde.
    ALLOWED_CHARACTERS = {chr(code) for code in range(32, 127)}