Feat/Add post processors + refactor project (#37)
* chg: feat: clean known patterns from title * chg: refactor: remove duplicated code, and improve maintainability * chg: feat: add audio tagging post-processor * chg: refactor: add generic parallelMap function * chg: refactor: move more functions to common locations * chg: docs: add func docs
This commit is contained in:
@@ -1,10 +1,13 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
// Filter filters a slice based on a predicate function.
|
||||
func Filter[A any](arr []A, f func(A) bool) []A {
|
||||
var res []A
|
||||
res = make([]A, 0)
|
||||
@@ -16,6 +19,71 @@ func Filter[A any](arr []A, f func(A) bool) []A {
|
||||
return res
|
||||
}
|
||||
|
||||
// ParallelMap applies mapper to each item of iterable concurrently and
// returns the concatenated results. Each item is processed in its own
// goroutine, and every goroutine performs exactly one channel send, so no
// goroutine is ever leaked.
//
// When mapper returns an error, that item's results are discarded and the
// error is passed to every handler in errHandler; if no handler is given,
// the error is printed to stdout. Results are appended in completion order,
// which is non-deterministic.
func ParallelMap[T any, R any](iterable []T, mapper func(item T) ([]R, error), errHandler ...func(error)) []R {
	// result bundles a mapper's output so each goroutine does a single send.
	// The previous two-channel version sent on both channels when the mapper
	// failed, but the receive loop consumed only one message per item,
	// leaving erroring goroutines blocked forever on the unread send.
	type result struct {
		items []R
		err   error
	}

	results := make(chan result)
	for _, item := range iterable {
		// Pass item explicitly so the closure is safe under pre-Go 1.22
		// loop-variable capture semantics.
		go func(item T) {
			items, err := mapper(item)
			results <- result{items: items, err: err}
		}(item)
	}

	mappedItems := make([]R, 0, len(iterable))
	for range iterable {
		res := <-results
		if res.err != nil {
			for _, handler := range errHandler {
				handler(res.err)
			}
			if len(errHandler) == 0 {
				fmt.Println(res.err)
			}
			continue // discard partial results from a failed mapper
		}
		mappedItems = append(mappedItems, res.items...)
	}
	return mappedItems
}
|
||||
|
||||
// StableUniq removes duplicate strings from s. Each distinct value appears
// exactly once, at the position of its last occurrence in s, with positions
// in ascending index order — the same semantics as the original
// index-sorted implementation.
//
// This runs in O(n) time, replacing the previous map-of-
// map[string]interface{} with type assertions and an O(n²) bubble sort.
func StableUniq(s []string) []string {
	// First pass: record the index of the last occurrence of every value.
	last := make(map[string]int, len(s))
	for i, v := range s {
		last[v] = i
	}

	// Second pass: emit each value once, when its last occurrence is reached.
	var uniq []string
	for i, v := range s {
		if last[v] == i {
			uniq = append(uniq, v)
		}
	}
	return uniq
}
|
||||
|
||||
func IsValidHTML(input string) bool {
|
||||
r := strings.NewReader(input)
|
||||
_, err := html.Parse(r)
|
||||
|
||||
97
utils/website.go
Normal file
97
utils/website.go
Normal file
@@ -0,0 +1,97 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
)
|
||||
|
||||
// commonTLDs lists the top-level domains seen in known release-site names.
var commonTLDs = []string{
	".com",
	".net",
	".org",
	".info",
	".biz",
	".co",
	".io",
	".xyz",
	".me",
	".tv",
	".cc",
	".us",
	".online",
	".site",
	".la",
	".se",
	".to",
}

// commonSubdomains lists the subdomain prefixes tried for each site name.
var commonSubdomains = []string{
	"", // no prefix
	"www.",
}

// commonWebsiteSLDs lists second-level domain names of known release sites.
var commonWebsiteSLDs = []string{
	"bludv",
	"torrentdosfilmes",
	"comando",
	"comandotorrents",
	"comandohds",
	"redetorrent",
	"torrenting",
	"baixarfilmesdubladosviatorrent",
	"hidratorrents",
	"wolverdonfilmes",
	"starckfilmes",
	"rapidotorrents",
	"sitedetorrents",
	"vamostorrent",
	"AZTORRENTS",
}

// websitePatterns are fmt format strings; %s is replaced with a
// case-insensitive alternation of every known domain combination.
var websitePatterns = []string{
	`\[\s*ACESSE\s+%s\s*\]`,
	`\[?\s*%s(\s*\])?`,
}

var (
	regexesOnce sync.Once
	regexes     []*regexp.Regexp
)

// getRegexes lazily compiles (exactly once, guarded by sync.Once) one regexp
// per entry of websitePatterns, each embedding an alternation of every
// subdomain × name × TLD combination. Every combination is passed through
// regexp.QuoteMeta so the dot in a TLD matches a literal "." — previously
// ".com" could match any character followed by "com" (e.g. "bludvXcom").
func getRegexes() []*regexp.Regexp {
	regexesOnce.Do(func() {
		domains := make([]string, 0, len(commonSubdomains)*len(commonWebsiteSLDs)*len(commonTLDs))
		for _, prefix := range commonSubdomains {
			for _, name := range commonWebsiteSLDs {
				for _, tld := range commonTLDs {
					domains = append(domains, regexp.QuoteMeta(prefix+name+tld))
				}
			}
		}
		// strings.Join avoids the old trailing-pipe + Replace("|)") hack.
		alternation := "(?i)(" + strings.Join(domains, "|") + ")"

		for _, pattern := range websitePatterns {
			regexes = append(regexes, regexp.MustCompile(fmt.Sprintf(pattern, alternation)))
		}
	})
	return regexes
}
|
||||
|
||||
// RemoveKnownWebsites removes known website patterns from the title.
|
||||
// It uses a set of common prefixes, names, and TLDs to identify and remove
|
||||
// website references from the title.
|
||||
// It also removes any common patterns like "[ ACESSE bludv.com ]" or
|
||||
// "[ bludv.se ]" or "bludv.xyz".
|
||||
func RemoveKnownWebsites(title string) string {
|
||||
regexes := getRegexes()
|
||||
for _, re := range regexes {
|
||||
title = re.ReplaceAllString(title, "")
|
||||
}
|
||||
title = strings.TrimSpace(title)
|
||||
return title
|
||||
}
|
||||
Reference in New Issue
Block a user