Coverage for website/utils/sanitizer.py: 92%

13 statements  

« prev     ^ index     » next       coverage.py v7.5.0, created at 2025-09-13 15:29 -0300

1""" 

2Small sanitizer utility using bleach to clean user-provided HTML. 

3 

4This centralizes allowed tags/attributes and can be tuned later. 

5""" 

6 

7import bleach 

8 

# Tags that survive sanitization. Keep this list conservative: basic text
# formatting, lists, headings, links and images.
# NOTE(review): "iframe" is allowed here, but ALLOWED_ATTRIBUTES has no entry
# for it, so an iframe that survives cleaning cannot carry a src -- confirm
# whether it is still wanted.
ALLOWED_TAGS = [
    # text formatting, links and lists (alphabetical)
    "a", "b", "blockquote", "br", "code", "em", "i",
    "li", "ol", "p", "strong", "ul",
    # embedded content
    "img", "iframe",
    # headings
    "h1", "h2", "h3", "h4", "h5", "h6",
]

32 

# Per-tag attribute whitelist handed to bleach.clean(); only the tags named
# here have any attributes listed.
ALLOWED_ATTRIBUTES = dict(
    a=["href", "title", "rel", "target"],
    img=["src", "alt", "title", "width", "height"],
)

37 

# URL schemes accepted in URL-valued attributes such as href and src.
# NOTE(review): "data" is wider than bleach's default set (http, https,
# mailto) and also applies to <a href>; presumably it is here for inline
# images -- confirm it is still required.
ALLOWED_PROTOCOLS = [
    "http",
    "https",
    "mailto",
    "data",
]

39 

# Feature switch: when truthy, sanitize_html() runs the cleaned markup through
# bleach.linkify(), which turns bare URLs into links (bleach's default
# linkify callback adds rel="nofollow").
LINKIFY = True

42 

43 

# Translation table that deletes ASCII control characters (code points 0-31)
# except tab (9) and newline (10).
_CONTROL_CHAR_TABLE = {
    code_point: None for code_point in range(32) if code_point not in (9, 10)
}


def sanitize_html(value: str) -> str:
    """Clean user-submitted rich text so it can be stored and rendered.

    Falsy input (``None``, ``""``) is handed back untouched. Otherwise
    control characters below code point 32 are dropped (tab and newline
    are kept), the markup is run through ``bleach.clean`` with the
    module-level whitelists, and, when ``LINKIFY`` is set, through
    ``bleach.linkify``.

    Args:
        value: Raw HTML supplied by a user.

    Returns:
        The sanitized HTML string, or ``value`` itself when it is falsy.
    """
    if not value:
        return value

    # Drop control characters before the HTML parser sees the text.
    printable = value.translate(_CONTROL_CHAR_TABLE)

    # strip=True removes disallowed markup instead of escaping it.
    sanitized = bleach.clean(
        printable,
        tags=ALLOWED_TAGS,
        attributes=ALLOWED_ATTRIBUTES,
        protocols=ALLOWED_PROTOCOLS,
        strip=True,
    )

    return bleach.linkify(sanitized) if LINKIFY else sanitized