import ast
import unicodedata

from regex import regex


class SanitiseText:
    ALLOWED_CHARACTERS = set()

    REPLACEMENT_CHARACTERS = {
        "–": "-",  # EN DASH (U+2013)
        "—": "-",  # EM DASH (U+2014)
        "…": "...",  # HORIZONTAL ELLIPSIS (U+2026)
        "‘": "'",  # LEFT SINGLE QUOTATION MARK (U+2018)
        "’": "'",  # RIGHT SINGLE QUOTATION MARK (U+2019)
        "“": '"',  # LEFT DOUBLE QUOTATION MARK (U+201C)
        "”": '"',  # RIGHT DOUBLE QUOTATION MARK (U+201D)
        "\u180e": "",  # Mongolian vowel separator
        "\u200b": "",  # zero width space
        "\u200c": "",  # zero width non-joiner
        "\u200d": "",  # zero width joiner
        "\u2060": "",  # word joiner
        "\ufeff": "",  # zero width no-break space
        "\u00a0": " ",  # NO-BREAK SPACE (U+00A0)
        "\t": " ",  # TAB
    }

    @classmethod
    def encode(cls, content):
        return "".join(cls.encode_char(char) for char in content)
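
    # Illustrative example (assumption, not part of the original file): with
    # the SanitiseSMS subclass defined below,
    #   SanitiseSMS.encode("‘Hi’ – bye…")  ->  "'Hi' - bye..."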

    @classmethod
    def get_non_compatible_characters(cls, content):
        """
        Given an input string, return the set of non-compatible characters.

        This follows the same rules as `cls.encode`, but returns just the
        characters that encode would replace with `?`.
        """
        return set(
            c
            for c in content
            if c not in cls.ALLOWED_CHARACTERS
            and cls.is_extended_language(c) is False
            and cls.downgrade_character(c) is None
        )
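
    # Illustrative example (assumption, not part of the original file): an
    # emoji has no downgrade and matches no extended-language check, so
    #   SanitiseSMS.get_non_compatible_characters("a📞b")  ->  {"📞"}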

    @staticmethod
    def get_unicode_char_from_codepoint(codepoint):
        """
        Given a unicode codepoint (eg 002E for '.', 0061 for 'a', etc), return the actual unicode character.

        unicodedata.decomposition returns strings containing codepoints, so we need to eval them ourselves.
        """
        # let's just make sure we aren't evaling anything weird
        if not set(codepoint) <= set("0123456789ABCDEF") or not len(codepoint) == 4:
            raise ValueError("{} is not a valid unicode codepoint".format(codepoint))
        return ast.literal_eval('"\\u{}"'.format(codepoint))
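
    # Illustrative examples (assumption, not part of the original file):
    #   SanitiseText.get_unicode_char_from_codepoint("0061")  ->  "a"
    #   SanitiseText.get_unicode_char_from_codepoint("00E9")  ->  "é"
    #   SanitiseText.get_unicode_char_from_codepoint("zz!!")  ->  raises ValueError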

    @classmethod
    def downgrade_character(cls, c):
        """
        Attempt to downgrade a non-compatible character to the allowed character
        set. May downgrade to multiple characters, eg `… -> ...`

        Will return None if the character is either already valid or has no
        known downgrade.
        """
        decomposed = unicodedata.decomposition(c)
        if decomposed != "" and "<" not in decomposed:
            # decomposition lists the unicode code points a character is made up of,
            # if it's made up of multiple points. For example the á character returns
            # '0061 0301', as in, the character a, followed by a combining acute
            # accent. The decomposition might, however, also contain a decomposition
            # mapping in angle brackets. For a full list of the types, see:
            # https://www.compart.com/en/unicode/decomposition.
            # If it's got a mapping, we're not sure how best to downgrade it, so just
            # see if it's in the REPLACEMENT_CHARACTERS map. If not, then it's
            # probably a letter with a modifier, eg á.
            # ASSUMPTION: The first character of a combined unicode character
            # (eg 'á' == '0061 0301') will be the ascii char.
            return cls.get_unicode_char_from_codepoint(decomposed.split()[0])
        else:
            # try to find a mapping (eg en dash -> hyphen ('–': '-')), else return None
            return cls.REPLACEMENT_CHARACTERS.get(c)
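
    # Illustrative examples (assumption, not part of the original file):
    #   downgrade_character("á")  ->  "a"   (decomposes to '0061 0301')
    #   downgrade_character("–")  ->  "-"   (via REPLACEMENT_CHARACTERS)
    #   downgrade_character("a")  ->  None  (already a plain character)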

    @classmethod
    def is_japanese(cls, value):
        if regex.search(r"([\p{IsHan}\p{IsHiragana}\p{IsKatakana}]+)", value):
            return True
        return False
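
    # Illustrative example (assumption, not part of the original file):
    #   SanitiseText.is_japanese("ひらがな")  ->  True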

    @classmethod
    def is_chinese(cls, value):
        # This range supports all "CJK Unified Ideographs".
        # It may be missing some rare/historic characters that are not in common use.
        if regex.search(r"[\u4e00-\u9fff]+", value) or value in [
            "。",
            "、",
            ":",
            "?",
            "!",
            ";",
            "(",
            ")",
            "“",
            "”",
            ",",
        ]:
            return True
        return False
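
    # Illustrative examples (assumption, not part of the original file):
    #   SanitiseText.is_chinese("中文")  ->  True
    #   SanitiseText.is_chinese("“")    ->  True  (single-character punctuation match)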

    @classmethod
    def is_arabic(cls, value):
        # For some reason, the python definition of Arabic (IsArabic) doesn't
        # include some standard diacritics, so add them here.
        if (
            regex.search(r"\p{IsArabic}", value)
            or regex.search(r"[\uFE70]+", value)
            or regex.search(r"[\u064B]+", value)
            or regex.search(r"[\u064F]+", value)
        ):
            return True
        return False
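
    # Illustrative example (assumption, not part of the original file):
    #   SanitiseText.is_arabic("مرحبا")  ->  True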

    @classmethod
    def is_punjabi(cls, value):
        # Gurmukhi script or Shahmukhi script

        if regex.search(r"[\u0A00-\u0A7F]+", value):  # Gurmukhi
            return True
        elif regex.search(r"[\u0600-\u06FF]+", value):  # Arabic
            return True
        elif regex.search(r"[\u0750-\u077F]+", value):  # Arabic Supplement
            return True
        elif regex.search(r"[\u08A0-\u08FF]+", value):  # Arabic Extended-A
            return True
        elif regex.search(r"[\uFB50-\uFDFF]+", value):  # Arabic Presentation Forms-A
            return True
        elif regex.search(r"[\uFE70-\uFEFF]+", value):  # Arabic Presentation Forms-B
            return True
        elif regex.search(r"[\u0900-\u097F]+", value):  # Devanagari
            return True
        return False
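
    # Illustrative example (assumption, not part of the original file):
    #   SanitiseText.is_punjabi("ਪੰਜਾਬੀ")  ->  True  (Gurmukhi range)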

    @classmethod
    def _is_extended_language_group_one(cls, value):
        if regex.search(r"\p{IsHangul}", value):  # Korean
            return True
        elif regex.search(r"\p{IsCyrillic}", value):
            return True
        elif SanitiseText.is_arabic(value):
            return True
        elif regex.search(r"\p{IsArmenian}", value):
            return True
        elif regex.search(r"\p{IsBengali}", value):
            return True
        elif SanitiseText.is_punjabi(value):
            return True
        return False

    @classmethod
    def _is_extended_language_group_two(cls, value):
        if regex.search(r"\p{IsBuhid}", value):
            return True
        if regex.search(r"\p{IsCanadian_Aboriginal}", value):
            return True
        if regex.search(r"\p{IsCherokee}", value):
            return True
        if regex.search(r"\p{IsDevanagari}", value):
            return True
        if regex.search(r"\p{IsEthiopic}", value):
            return True
        if regex.search(r"\p{IsGeorgian}", value):
            return True
        return False

    @classmethod
    def _is_extended_language_group_three(cls, value):
        if regex.search(r"\p{IsGreek}", value):
            return True
        if regex.search(r"\p{IsGujarati}", value):
            return True
        if regex.search(r"\p{IsHanunoo}", value):
            return True
        if regex.search(r"\p{IsHebrew}", value):
            return True
        if regex.search(r"\p{IsLimbu}", value):
            return True
        if regex.search(r"\p{IsKannada}", value):
            return True
        return False

    @classmethod
    def _is_extended_language_group_four(cls, value):
        if regex.search(
            r"([\p{IsKhmer}\p{IsLao}\p{IsMongolian}\p{IsMyanmar}\p{IsTibetan}\p{IsYi}]+)",
            value,
        ):
            return True

        if regex.search(
            r"([\p{IsOgham}\p{IsOriya}\p{IsSinhala}\p{IsSyriac}\p{IsTagalog}]+)", value
        ):
            return True

        if regex.search(
            r"([\p{IsTagbanwa}\p{IsTaiLe}\p{IsTamil}\p{IsTelugu}\p{IsThaana}\p{IsThai}]+)",
            value,
        ):
            return True

        # Vietnamese
        if regex.search(
            r"\b\S*[AĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴđa-zA-Z]+\S*\b",  # noqa
            value,
        ):
            return True

        # Turkish. Note this character class includes a-z and A-Z, so any word
        # containing a plain ASCII letter also matches.
        if regex.search(r"\b\S*[a-zA-ZçğışöüÇĞİŞÖÜ]+\S*\b", value):
            return True

        return False

    @classmethod
    def is_extended_language(cls, value):
        """
        Languages are combined in groups to avoid cyclomatic complexity warnings.
        """
        if cls._is_extended_language_group_one(value):
            return True
        if cls._is_extended_language_group_two(value):
            return True
        if cls._is_extended_language_group_three(value):
            return True
        if cls.is_japanese(value):
            return True
        if cls._is_extended_language_group_four(value):
            return True
        if cls.is_chinese(value):
            return True

        return False
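
    # Illustrative examples (assumption, not part of the original file):
    #   SanitiseText.is_extended_language("你好")    ->  True   (CJK)
    #   SanitiseText.is_extended_language("Привет")  ->  True   (Cyrillic)
    #   SanitiseText.is_extended_language("🚀")      ->  False  (no script match)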

    @classmethod
    def encode_char(cls, c):
        """
        Given a single unicode character, return a compatible character from
        the allowed set.
        """
        # char is a good character already - return that native character
        if c in cls.ALLOWED_CHARACTERS:
            return c
        elif cls.is_extended_language(c):
            return c
        else:
            c = cls.downgrade_character(c)
            return c if c is not None else "?"
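
    # Illustrative examples (assumption, not part of the original file):
    #   SanitiseASCII.encode_char("é")   ->  "e"  (downgraded via decomposition)
    #   SanitiseASCII.encode_char("📞")  ->  "?"  (no downgrade available)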


class SanitiseSMS(SanitiseText):
    """
    Given an input string, makes it GSM- and Welsh-character compatible. This
    involves removing all non-GSM characters by applying the following rules:

    * characters within the GSM character set (https://en.wikipedia.org/wiki/GSM_03.38)
      and extension character set are kept

    * Welsh characters not included in the default GSM character set are kept

    * characters with sensible downgrades are replaced in place:
      * characters with diacritics (accents, umlauts, cedillas etc) are replaced
        with their base character, eg é -> e
      * en dash and em dash (– and —) are replaced with hyphen (-)
      * left/right single quotation marks (‘, ’) are replaced with ' (note that
        “ and ” match the Chinese punctuation check in `is_chinese`, so they are
        kept as-is)
      * zero width spaces (sometimes used to stop eg "gov.uk" linkifying) are removed
      * tabs are replaced with a single space

    * any remaining unicode characters (eg chinese/cyrillic/glyphs/emoji) are
      replaced with ?
    """

    WELSH_DIACRITICS = set(
        "àèìòùẁỳ"
        "ÀÈÌÒÙẀỲ"  # grave
        "áéíóúẃý"
        "ÁÉÍÓÚẂÝ"  # acute
        "äëïöüẅÿ"
        "ÄËÏÖÜẄŸ"  # diaeresis
        "âêîôûŵŷ"
        "ÂÊÎÔÛŴŶ"  # circumflex
    )

    EXTENDED_GSM_CHARACTERS = set("^{}\\[~]|€")

    GSM_CHARACTERS = (
        set(
            "@£$¥èéùìòÇ\nØø\rÅåΔ_ΦΓΛΩΠΨΣΘΞ\x1bÆæßÉ !\"#¤%&'()*+,-./0123456789:;<=>?"
            + "¡ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÑܧ¿abcdefghijklmnopqrstuvwxyzäöñüà"
        )
        | EXTENDED_GSM_CHARACTERS
    )

    ALLOWED_CHARACTERS = GSM_CHARACTERS | WELSH_DIACRITICS
    # some welsh characters are in GSM and some aren't - we need to distinguish
    # between these for counting fragments
    WELSH_NON_GSM_CHARACTERS = WELSH_DIACRITICS - GSM_CHARACTERS
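
# Illustrative examples (assumption, not part of the original file):
#   SanitiseSMS.encode("Ŵyl – haf…")  ->  "Ŵyl - haf..."  (Welsh Ŵ kept)
#   SanitiseSMS.encode("Привет")      ->  "Привет"        (kept as extended language)
#   SanitiseSMS.encode("🚀")          ->  "?"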


class SanitiseASCII(SanitiseText):
    """
    As SMS above, but the allowed characters are printable ascii, from character
    range 32 to 126 inclusive: [chr(x) for x in range(32, 127)]
    """

    ALLOWED_CHARACTERS = set(
        " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        + "[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"
    )
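

# A minimal usage sketch (illustrative, not part of the original module):
# running this file directly shows how each sanitiser treats the same input.
if __name__ == "__main__":
    sample = "‘Hi’ – café…"
    print(SanitiseSMS.encode(sample))  # 'Hi' - café...
    print(SanitiseASCII.encode(sample))  # 'Hi' - cafe...
    print(SanitiseSMS.get_non_compatible_characters("a📞b"))  # {'📞'}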