Coverage for website/utils/sanitizer.py: 92%
13 statements
« prev ^ index » next coverage.py v7.5.0, created at 2025-09-13 15:29 -0300
« prev ^ index » next coverage.py v7.5.0, created at 2025-09-13 15:29 -0300
"""
Small sanitizer utility using bleach to clean user-provided HTML.

This centralizes allowed tags/attributes and can be tuned later.
"""
7import bleach
# Keep this conservative: allow a small set of formatting tags and images/links.
# Any tag not listed here is removed by sanitize_html() (bleach.clean is called
# with strip=True).
# NOTE(review): "iframe" is allowed, but ALLOWED_ATTRIBUTES has no entry for
# it, so bleach presumably drops every attribute (including src) from iframes.
# Confirm whether iframes are really meant to be accepted.
ALLOWED_TAGS = [
    "a",
    "b",
    "blockquote",
    "br",
    "code",
    "em",
    "i",
    "li",
    "ol",
    "p",
    "strong",
    "ul",
    "img",
    "iframe",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
]
# Per-tag attribute allowlist passed to bleach.clean(). Tags without an entry
# here keep no attributes at all.
ALLOWED_ATTRIBUTES = {
    "a": ["href", "title", "rel", "target"],
    "img": ["src", "alt", "title", "width", "height"],
}
# URL schemes accepted in URL-valued attributes such as href and src.
# NOTE(review): "data" is presumably here for inline images, but bleach checks
# this list for every URL-valued attribute, so data: URLs would also be
# accepted in <a href>. Confirm this is intended before relying on it.
ALLOWED_PROTOCOLS = ["http", "https", "mailto", "data"]
# Optional: when True, sanitize_html() also runs bleach.linkify() on the
# cleaned output. With bleach's default callbacks this adds rel="nofollow" to
# links.
LINKIFY = True
def sanitize_html(value: str) -> str:
    """Return a cleaned HTML string safe for storage and rendering.

    This removes dangerous tags and attributes while preserving basic
    formatting. Use this on user-submitted rich text before saving.

    Args:
        value: Raw, user-provided HTML. Falsy input (``""`` or ``None``) is
            returned unchanged without being processed.

    Returns:
        The HTML with control characters removed, reduced to ALLOWED_TAGS,
        ALLOWED_ATTRIBUTES and ALLOWED_PROTOCOLS, and, when LINKIFY is true,
        passed through ``bleach.linkify``.
    """
    if not value:
        return value

    # First, strip control characters. Only newline and tab survive among the
    # code points below 32, so carriage returns are removed as well.
    value = "".join(ch for ch in value if ord(ch) >= 32 or ch in "\n\t")

    # strip=True removes disallowed markup instead of escaping it.
    cleaned = bleach.clean(
        value,
        tags=ALLOWED_TAGS,
        attributes=ALLOWED_ATTRIBUTES,
        protocols=ALLOWED_PROTOCOLS,
        strip=True,
    )

    if LINKIFY:
        cleaned = bleach.linkify(cleaned)

    return cleaned