chg: fix: html detector

This commit is contained in:
2025-07-30 14:16:56 +00:00
parent 3b9f49a9c0
commit 9bb98beb61

View File

@@ -2,9 +2,7 @@ package utils
import ( import (
"fmt" "fmt"
"strings" "regexp"
"golang.org/x/net/html"
) )
// Filter filters a slice based on a predicate function. // Filter filters a slice based on a predicate function.
@@ -84,10 +82,19 @@ func StableUniq(s []string) []string {
return uniqValues return uniqValues
} }
// Patterns used by IsValidHTML to recognize the structural markers of an HTML
// document. All are case-insensitive and compiled once at package scope.
var (
	// doctypeRegex matches an HTML doctype declaration. Anything between the
	// "html" keyword and the closing ">" is accepted so that legacy doctypes
	// carrying public/system identifiers are recognized as well.
	doctypeRegex = regexp.MustCompile(`(?i)<!DOCTYPE\s+html\b[^>]*>`)
	// htmlTagRegex matches an opening <html> tag (attributes allowed) that is
	// followed later in the input by a closing </html> tag. The \b keeps
	// longer tag names such as <htmlfoo> from being accepted.
	htmlTagRegex = regexp.MustCompile(`(?i)<html\b[^>]*>[\s\S]*?</html>`)
	// bodyTagRegex matches an opening <body> tag (attributes allowed) that is
	// followed later in the input by a closing </body> tag. The \b keeps
	// longer tag names such as <bodyguard> from being accepted.
	bodyTagRegex = regexp.MustCompile(`(?i)<body\b[^>]*>[\s\S]*?</body>`)
)

// IsValidHTML reports whether input looks like an HTML document.
//
// It is a heuristic detector, not a validator: it returns true when input
// contains an HTML doctype declaration, an <html>...</html> pair, or a
// <body>...</body> pair, and false otherwise (including for the empty string
// and for bare fragments such as "<p>text</p>"). Well-formedness of the markup
// is not checked.
func IsValidHTML(input string) bool {
	return doctypeRegex.MatchString(input) ||
		htmlTagRegex.MatchString(input) ||
		bodyTagRegex.MatchString(input)
}
// FormatBytes formats a byte size into a human-readable string. // FormatBytes formats a byte size into a human-readable string.