Coverage for website/utils/youtube_validator.py: 15%

40 statements  

« prev     ^ index     » next       coverage.py v7.5.0, created at 2025-09-13 15:29 -0300

1""" 

2YouTube URL validation and sanitization utilities 

3Provides secure handling of YouTube video URLs for embedding 

4""" 

5 

6import re 

7 

8 

def extract_youtube_video_id(url):
    """
    Extract YouTube video ID from various YouTube URL formats

    Recognised forms (the scheme and a leading ``www.`` are optional):

    * ``youtube.com/watch?v=<id>``
    * ``youtu.be/<id>``
    * ``youtube.com/embed/<id>``
    * ``youtube-nocookie.com/embed/<id>``

    The URL must *begin* with one of these forms. A YouTube-looking
    fragment buried inside another URL (for example
    ``https://not-youtube.com/watch?v=...``) is rejected, and so is an ID
    that runs longer than 11 characters.

    Args:
        url (str): YouTube URL

    Returns:
        str: Video ID if valid, None otherwise
    """
    if not url or not isinstance(url, str):
        return None

    url = url.strip()

    # Exactly 11 ID characters; the lookahead refuses a 12th so that an
    # over-long ID is rejected instead of being silently truncated.
    id_group = r"([a-zA-Z0-9_-]{11})(?![a-zA-Z0-9_-])"

    # Common YouTube URL patterns
    patterns = [
        r"(?:https?://)?(?:www\.)?youtube\.com/watch\?v=" + id_group,
        r"(?:https?://)?(?:www\.)?youtu\.be/" + id_group,
        r"(?:https?://)?(?:www\.)?youtube\.com/embed/" + id_group,
        r"(?:https?://)?(?:www\.)?youtube-nocookie\.com/embed/" + id_group,
    ]

    for pattern in patterns:
        # re.match anchors at the start of the string. re.search would
        # also accept look-alike hosts such as "not-youtube.com" or a
        # YouTube URL smuggled into another site's query string.
        match = re.match(pattern, url)
        if match:
            return match.group(1)

    return None

41 

42 

def validate_youtube_url(url):
    """
    Report whether a video ID can be extracted from the given URL

    This is a boolean wrapper around ``extract_youtube_video_id``.

    Args:
        url (str): URL to validate

    Returns:
        bool: True if ``extract_youtube_video_id`` returns an ID for the
        URL, False if it returns None
    """
    return extract_youtube_video_id(url) is not None

55 

56 

def create_secure_embed_url(video_id):
    """
    Create a secure YouTube embed URL using youtube-nocookie.com

    Args:
        video_id (str): YouTube video ID; exactly 11 characters drawn
            from letters, digits, ``_`` and ``-``

    Returns:
        str: Secure embed URL

    Raises:
        ValueError: If ``video_id`` is empty, not a string, or not a
            well-formed video ID
    """
    # re.fullmatch rather than re.match with "$": "$" also matches just
    # before a trailing newline, which would let "<id>\n" through and
    # place a newline inside the generated URL.
    if not isinstance(video_id, str) or not re.fullmatch(
        r"[a-zA-Z0-9_-]{11}", video_id
    ):
        raise ValueError("Invalid YouTube video ID")

    return f"https://www.youtube-nocookie.com/embed/{video_id}"

71 

72 

def sanitize_youtube_content(content):
    """
    Rewrite YouTube iframe embeds in HTML content to a secure form

    Every ``<iframe>`` whose ``src`` points at a ``youtube.com`` or
    ``youtube-nocookie.com`` ``/embed/<id>`` URL is replaced with a freshly
    built iframe that targets youtube-nocookie.com over https and carries
    a fixed attribute set. Attributes and query parameters from the
    original tag are discarded.

    Only matching YouTube iframes are touched. All other markup,
    including non-YouTube iframes, is returned unchanged, so this is not a
    general HTML sanitizer.

    Args:
        content (str): HTML content

    Returns:
        str: Content with YouTube embeds rewritten to secure embeds
    """
    # This is a basic implementation - in production, consider using
    # a more robust HTML sanitization library like bleach

    # Pattern to match YouTube iframe elements. It accepts https, http
    # and protocol-relative sources so insecure embeds are upgraded too,
    # and tolerates whitespace before the closing tag.
    youtube_pattern = (
        r'<iframe[^>]*src=["\'](?:https?:)?//(?:www\.)?'
        r'youtube(?:-nocookie)?\.com/'
        r'embed/([a-zA-Z0-9_-]{11})[^"\']*["\'][^>]*>\s*</iframe>'
    )

    def replace_youtube_iframe(match):
        # The pattern already guarantees an 11-character ID, and
        # create_secure_embed_url validates it again, so no separate
        # check is needed here.
        secure_url = create_secure_embed_url(match.group(1))
        return (
            f'<iframe src="{secure_url}" frameborder="0" '
            'allow="accelerometer; autoplay; clipboard-write; '
            'encrypted-media; gyroscope; picture-in-picture" '
            'allowfullscreen loading="lazy"></iframe>'
        )

    # HTML tag and attribute names are case-insensitive, so <IFRAME SRC=...>
    # must be matched as well.
    return re.sub(
        youtube_pattern, replace_youtube_iframe, content, flags=re.IGNORECASE
    )

106 

107 

108# Example usage: 

if __name__ == "__main__":
    # Demo run: three YouTube URL shapes followed by two inputs that are
    # not YouTube URLs. The results for each are printed to stdout.
    divider = "-" * 50
    samples = [
        "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
        "https://youtu.be/dQw4w9WgXcQ",
        "https://www.youtube.com/embed/dQw4w9WgXcQ",
        "invalid-url",
        "https://not-youtube.com/watch?v=dQw4w9WgXcQ",
    ]

    for sample in samples:
        extracted_id = extract_youtube_video_id(sample)
        accepted = validate_youtube_url(sample)
        print(f"URL: {sample}")
        print(f"Video ID: {extracted_id}")
        print(f"Valid: {accepted}")
        # Only build an embed URL when an ID was actually extracted.
        if extracted_id:
            print(f"Secure Embed: {create_secure_embed_url(extracted_id)}")
        print(divider)