Feat/Add post processors + refactor project (#37)
* chg: feat: clean known patterns from title * chg: refactor: remove duplicated code, and improve maintainability * chg: feat: add audio tagging post-processor * chg: refactor: add generic parallelMap function * chg: refactor: move more functions to common locations * chg: docs: add func docs
This commit is contained in:
@@ -1,10 +1,13 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
// Filter filters a slice based on a predicate function.
|
||||
func Filter[A any](arr []A, f func(A) bool) []A {
|
||||
var res []A
|
||||
res = make([]A, 0)
|
||||
@@ -16,6 +19,71 @@ func Filter[A any](arr []A, f func(A) bool) []A {
|
||||
return res
|
||||
}
|
||||
|
||||
// ParallelMap applies mapper to each item of iterable concurrently and
// returns the concatenated results. Each item is processed in its own
// goroutine, and every goroutine performs exactly one channel send, so no
// goroutine is ever leaked.
//
// When mapper returns an error, that item's results are discarded and the
// error is passed to every handler in errHandler; if no handler is given,
// the error is printed to stdout. Results are appended in completion order,
// which is non-deterministic.
func ParallelMap[T any, R any](iterable []T, mapper func(item T) ([]R, error), errHandler ...func(error)) []R {
	// result bundles a mapper's output so each goroutine does a single send.
	// The previous two-channel version sent on both channels when the mapper
	// failed, but the receive loop consumed only one message per item,
	// leaving erroring goroutines blocked forever on the unread send.
	type result struct {
		items []R
		err   error
	}

	results := make(chan result)
	for _, item := range iterable {
		// Pass item explicitly so the closure is safe under pre-Go 1.22
		// loop-variable capture semantics.
		go func(item T) {
			items, err := mapper(item)
			results <- result{items: items, err: err}
		}(item)
	}

	mappedItems := make([]R, 0, len(iterable))
	for range iterable {
		res := <-results
		if res.err != nil {
			for _, handler := range errHandler {
				handler(res.err)
			}
			if len(errHandler) == 0 {
				fmt.Println(res.err)
			}
			continue // discard partial results from a failed mapper
		}
		mappedItems = append(mappedItems, res.items...)
	}
	return mappedItems
}
|
||||
|
||||
// StableUniq removes duplicate strings from s. Each distinct value appears
// exactly once, at the position of its last occurrence in s, with positions
// in ascending index order — the same semantics as the original
// index-sorted implementation.
//
// This runs in O(n) time, replacing the previous map-of-
// map[string]interface{} with type assertions and an O(n²) bubble sort.
func StableUniq(s []string) []string {
	// First pass: record the index of the last occurrence of every value.
	last := make(map[string]int, len(s))
	for i, v := range s {
		last[v] = i
	}

	// Second pass: emit each value once, when its last occurrence is reached.
	var uniq []string
	for i, v := range s {
		if last[v] == i {
			uniq = append(uniq, v)
		}
	}
	return uniq
}
|
||||
|
||||
func IsValidHTML(input string) bool {
|
||||
r := strings.NewReader(input)
|
||||
_, err := html.Parse(r)
|
||||
|
||||
97
utils/website.go
Normal file
97
utils/website.go
Normal file
@@ -0,0 +1,97 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
)
|
||||
|
||||
// commonTLDs lists the top-level domains seen in known release-site names.
var commonTLDs = []string{
	".com",
	".net",
	".org",
	".info",
	".biz",
	".co",
	".io",
	".xyz",
	".me",
	".tv",
	".cc",
	".us",
	".online",
	".site",
	".la",
	".se",
	".to",
}

// commonSubdomains lists the subdomain prefixes tried for each site name.
var commonSubdomains = []string{
	"", // no prefix
	"www.",
}

// commonWebsiteSLDs lists second-level domain names of known release sites.
var commonWebsiteSLDs = []string{
	"bludv",
	"torrentdosfilmes",
	"comando",
	"comandotorrents",
	"comandohds",
	"redetorrent",
	"torrenting",
	"baixarfilmesdubladosviatorrent",
	"hidratorrents",
	"wolverdonfilmes",
	"starckfilmes",
	"rapidotorrents",
	"sitedetorrents",
	"vamostorrent",
	"AZTORRENTS",
}

// websitePatterns are fmt format strings; %s is replaced with a
// case-insensitive alternation of every known domain combination.
var websitePatterns = []string{
	`\[\s*ACESSE\s+%s\s*\]`,
	`\[?\s*%s(\s*\])?`,
}

var (
	regexesOnce sync.Once
	regexes     []*regexp.Regexp
)

// getRegexes lazily compiles (exactly once, guarded by sync.Once) one regexp
// per entry of websitePatterns, each embedding an alternation of every
// subdomain × name × TLD combination. Every combination is passed through
// regexp.QuoteMeta so the dot in a TLD matches a literal "." — previously
// ".com" could match any character followed by "com" (e.g. "bludvXcom").
func getRegexes() []*regexp.Regexp {
	regexesOnce.Do(func() {
		domains := make([]string, 0, len(commonSubdomains)*len(commonWebsiteSLDs)*len(commonTLDs))
		for _, prefix := range commonSubdomains {
			for _, name := range commonWebsiteSLDs {
				for _, tld := range commonTLDs {
					domains = append(domains, regexp.QuoteMeta(prefix+name+tld))
				}
			}
		}
		// strings.Join avoids the old trailing-pipe + Replace("|)") hack.
		alternation := "(?i)(" + strings.Join(domains, "|") + ")"

		for _, pattern := range websitePatterns {
			regexes = append(regexes, regexp.MustCompile(fmt.Sprintf(pattern, alternation)))
		}
	})
	return regexes
}
|
||||
|
||||
// RemoveKnownWebsites removes known website patterns from the title.
|
||||
// It uses a set of common prefixes, names, and TLDs to identify and remove
|
||||
// website references from the title.
|
||||
// It also removes any common patterns like "[ ACESSE bludv.com ]" or
|
||||
// "[ bludv.se ]" or "bludv.xyz".
|
||||
func RemoveKnownWebsites(title string) string {
|
||||
regexes := getRegexes()
|
||||
for _, re := range regexes {
|
||||
title = re.ReplaceAllString(title, "")
|
||||
}
|
||||
title = strings.TrimSpace(title)
|
||||
return title
|
||||
}
|
||||
Reference in New Issue
Block a user