Feat/Add post processors + refactor project (#37)

* chg: feat: clean known patterns from title

* chg: refactor: remove duplicated code, and improve maintainability

* chg: feat: add audio tagging post-processor

* chg: refactor: add generic parallelMap function

* chg: refactor: move more functions to common locations

* chg: docs: add func docs
This commit is contained in:
2025-07-24 01:03:38 -03:00
committed by GitHub
parent 6eba15d52a
commit 455f734c8a
12 changed files with 532 additions and 433 deletions

View File

@@ -6,12 +6,10 @@ import (
"fmt"
"net/http"
"net/url"
"slices"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/hbollon/go-edlib"
"github.com/felipemarinho97/torrent-indexer/magnet"
"github.com/felipemarinho97/torrent-indexer/schema"
@@ -20,15 +18,19 @@ import (
)
var bludv = IndexerMeta{
URL: "https://bludv.xyz/",
SearchURL: "?s=",
Label: "bludv",
URL: "https://bludv.xyz/",
SearchURL: "?s=",
PagePattern: "page/%s",
}
func (i *Indexer) HandlerBluDVIndexer(w http.ResponseWriter, r *http.Request) {
start := time.Now()
metadata := bludv
defer func() {
i.metrics.IndexerDuration.WithLabelValues("bludv").Observe(time.Since(start).Seconds())
i.metrics.IndexerRequests.WithLabelValues("bludv").Inc()
i.metrics.IndexerDuration.WithLabelValues(metadata.Label).Observe(time.Since(start).Seconds())
i.metrics.IndexerRequests.WithLabelValues(metadata.Label).Inc()
}()
ctx := r.Context()
@@ -38,11 +40,11 @@ func (i *Indexer) HandlerBluDVIndexer(w http.ResponseWriter, r *http.Request) {
// URL encode query param
q = url.QueryEscape(q)
url := bludv.URL
url := metadata.URL
if page != "" {
url = fmt.Sprintf("%spage/%s", url, page)
url = fmt.Sprintf(fmt.Sprintf("%s%s", url, metadata.PagePattern), page)
} else {
url = fmt.Sprintf("%s%s%s", url, bludv.SearchURL, q)
url = fmt.Sprintf("%s%s%s", url, metadata.SearchURL, q)
}
fmt.Println("URL:>", url)
@@ -53,7 +55,7 @@ func (i *Indexer) HandlerBluDVIndexer(w http.ResponseWriter, r *http.Request) {
if err != nil {
fmt.Println(err)
}
i.metrics.IndexerErrors.WithLabelValues("bludv").Inc()
i.metrics.IndexerErrors.WithLabelValues(metadata.Label).Inc()
return
}
defer resp.Close()
@@ -66,7 +68,7 @@ func (i *Indexer) HandlerBluDVIndexer(w http.ResponseWriter, r *http.Request) {
fmt.Println(err)
}
i.metrics.IndexerErrors.WithLabelValues("bludv").Inc()
i.metrics.IndexerErrors.WithLabelValues(metadata.Label).Inc()
return
}
@@ -77,57 +79,21 @@ func (i *Indexer) HandlerBluDVIndexer(w http.ResponseWriter, r *http.Request) {
links = append(links, link)
})
var itChan = make(chan []schema.IndexedTorrent)
var errChan = make(chan error)
indexedTorrents := []schema.IndexedTorrent{}
for _, link := range links {
go func(link string) {
torrents, err := getTorrentsBluDV(ctx, i, link)
if err != nil {
fmt.Println(err)
errChan <- err
}
itChan <- torrents
}(link)
}
for i := 0; i < len(links); i++ {
select {
case torrents := <-itChan:
indexedTorrents = append(indexedTorrents, torrents...)
case err := <-errChan:
fmt.Println(err)
}
}
for i, it := range indexedTorrents {
jLower := strings.ReplaceAll(strings.ToLower(fmt.Sprintf("%s %s", it.Title, it.OriginalTitle)), ".", " ")
qLower := strings.ToLower(q)
splitLength := 2
indexedTorrents[i].Similarity = edlib.JaccardSimilarity(jLower, qLower, splitLength)
}
// remove the ones with zero similarity
if len(indexedTorrents) > 20 && r.URL.Query().Get("filter_results") != "" && r.URL.Query().Get("q") != "" {
indexedTorrents = utils.Filter(indexedTorrents, func(it schema.IndexedTorrent) bool {
return it.Similarity > 0
})
}
// sort by similarity
slices.SortFunc(indexedTorrents, func(i, j schema.IndexedTorrent) int {
return int((j.Similarity - i.Similarity) * 1000)
// extract each torrent link
indexedTorrents := utils.ParallelMap(links, func(link string) ([]schema.IndexedTorrent, error) {
return getTorrentsBluDV(ctx, i, link)
})
// send to search index
go func() {
_ = i.search.IndexTorrents(indexedTorrents)
}()
// Apply post-processors
postProcessedTorrents := indexedTorrents
for _, processor := range i.postProcessors {
postProcessedTorrents = processor(i, r, postProcessedTorrents)
}
w.Header().Set("Content-Type", "application/json")
err = json.NewEncoder(w).Encode(Response{
Results: indexedTorrents,
Count: len(indexedTorrents),
Results: postProcessedTorrents,
Count: len(postProcessedTorrents),
})
if err != nil {
fmt.Println(err)
@@ -216,7 +182,7 @@ func getTorrentsBluDV(ctx context.Context, i *Indexer, link string) ([]schema.In
}
})
size = stableUniq(size)
size = utils.StableUniq(size)
var chanIndexedTorrent = make(chan schema.IndexedTorrent)
@@ -247,7 +213,7 @@ func getTorrentsBluDV(ctx context.Context, i *Indexer, link string) ([]schema.In
}
ixt := schema.IndexedTorrent{
Title: appendAudioISO639_2Code(releaseTitle, magnetAudio),
Title: releaseTitle,
OriginalTitle: title,
Details: link,
Year: year,

View File

@@ -1,15 +1,12 @@
package handler
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"regexp"
"slices"
"strings"
"time"
@@ -18,12 +15,13 @@ import (
"github.com/felipemarinho97/torrent-indexer/schema"
goscrape "github.com/felipemarinho97/torrent-indexer/scrape"
"github.com/felipemarinho97/torrent-indexer/utils"
"github.com/hbollon/go-edlib"
)
var comando = IndexerMeta{
URL: "https://comando.la/",
SearchURL: "?s=",
Label: "comando",
URL: "https://comando.la/",
SearchURL: "?s=",
PagePattern: "page/%s",
}
var replacer = strings.NewReplacer(
@@ -43,9 +41,11 @@ var replacer = strings.NewReplacer(
func (i *Indexer) HandlerComandoIndexer(w http.ResponseWriter, r *http.Request) {
start := time.Now()
metadata := comando
defer func() {
i.metrics.IndexerDuration.WithLabelValues("comando").Observe(time.Since(start).Seconds())
i.metrics.IndexerRequests.WithLabelValues("comando").Inc()
i.metrics.IndexerDuration.WithLabelValues(metadata.Label).Observe(time.Since(start).Seconds())
i.metrics.IndexerRequests.WithLabelValues(metadata.Label).Inc()
}()
ctx := r.Context()
@@ -55,11 +55,11 @@ func (i *Indexer) HandlerComandoIndexer(w http.ResponseWriter, r *http.Request)
// URL encode query param
q = url.QueryEscape(q)
url := comando.URL
url := metadata.URL
if q != "" {
url = fmt.Sprintf("%s%s%s", url, comando.SearchURL, q)
url = fmt.Sprintf("%s%s%s", url, metadata.SearchURL, q)
} else if page != "" {
url = fmt.Sprintf("%spage/%s", url, page)
url = fmt.Sprintf(fmt.Sprintf("%s%s", url, metadata.PagePattern), page)
}
fmt.Println("URL:>", url)
@@ -70,7 +70,7 @@ func (i *Indexer) HandlerComandoIndexer(w http.ResponseWriter, r *http.Request)
if err != nil {
fmt.Println(err)
}
i.metrics.IndexerErrors.WithLabelValues("comando").Inc()
i.metrics.IndexerErrors.WithLabelValues(metadata.Label).Inc()
return
}
defer resp.Close()
@@ -82,7 +82,7 @@ func (i *Indexer) HandlerComandoIndexer(w http.ResponseWriter, r *http.Request)
if err != nil {
fmt.Println(err)
}
i.metrics.IndexerErrors.WithLabelValues("comando").Inc()
i.metrics.IndexerErrors.WithLabelValues(metadata.Label).Inc()
return
}
@@ -93,57 +93,21 @@ func (i *Indexer) HandlerComandoIndexer(w http.ResponseWriter, r *http.Request)
links = append(links, link)
})
var itChan = make(chan []schema.IndexedTorrent)
var errChan = make(chan error)
indexedTorrents := []schema.IndexedTorrent{}
for _, link := range links {
go func(link string) {
torrents, err := getTorrents(ctx, i, link)
if err != nil {
fmt.Println(err)
errChan <- err
}
itChan <- torrents
}(link)
}
for i := 0; i < len(links); i++ {
select {
case torrents := <-itChan:
indexedTorrents = append(indexedTorrents, torrents...)
case err := <-errChan:
fmt.Println(err)
}
}
for i, it := range indexedTorrents {
jLower := strings.ReplaceAll(strings.ToLower(fmt.Sprintf("%s %s", it.Title, it.OriginalTitle)), ".", " ")
qLower := strings.ToLower(q)
splitLength := 2
indexedTorrents[i].Similarity = edlib.JaccardSimilarity(jLower, qLower, splitLength)
}
// remove the ones with zero similarity
if len(indexedTorrents) > 20 && r.URL.Query().Get("filter_results") != "" && r.URL.Query().Get("q") != "" {
indexedTorrents = utils.Filter(indexedTorrents, func(it schema.IndexedTorrent) bool {
return it.Similarity > 0
})
}
// sort by similarity
slices.SortFunc(indexedTorrents, func(i, j schema.IndexedTorrent) int {
return int((j.Similarity - i.Similarity) * 1000)
// extract each torrent link
indexedTorrents := utils.ParallelMap(links, func(link string) ([]schema.IndexedTorrent, error) {
return getTorrents(ctx, i, link)
})
// send to search index
go func() {
_ = i.search.IndexTorrents(indexedTorrents)
}()
// Apply post-processors
postProcessedTorrents := indexedTorrents
for _, processor := range i.postProcessors {
postProcessedTorrents = processor(i, r, postProcessedTorrents)
}
w.Header().Set("Content-Type", "application/json")
err = json.NewEncoder(w).Encode(Response{
Results: indexedTorrents,
Count: len(indexedTorrents),
Results: postProcessedTorrents,
Count: len(postProcessedTorrents),
})
if err != nil {
fmt.Println(err)
@@ -215,7 +179,7 @@ func getTorrents(ctx context.Context, i *Indexer, link string) ([]schema.Indexed
}
})
size = stableUniq(size)
size = utils.StableUniq(size)
var chanIndexedTorrent = make(chan schema.IndexedTorrent)
@@ -246,7 +210,7 @@ func getTorrents(ctx context.Context, i *Indexer, link string) ([]schema.Indexed
}
ixt := schema.IndexedTorrent{
Title: appendAudioISO639_2Code(releaseTitle, magnetAudio),
Title: releaseTitle,
OriginalTitle: title,
Details: link,
Year: year,
@@ -293,38 +257,6 @@ func parseLocalizedDate(datePublished string) (time.Time, error) {
return time.Time{}, nil
}
func stableUniq(s []string) []string {
var uniq []map[string]interface{}
m := make(map[string]map[string]interface{})
for i, v := range s {
m[v] = map[string]interface{}{
"v": v,
"i": i,
}
}
// to order by index
for _, v := range m {
uniq = append(uniq, v)
}
// sort by index
for i := 0; i < len(uniq); i++ {
for j := i + 1; j < len(uniq); j++ {
if uniq[i]["i"].(int) > uniq[j]["i"].(int) {
uniq[i], uniq[j] = uniq[j], uniq[i]
}
}
}
// get only values
var uniqValues []string
for _, v := range uniq {
uniqValues = append(uniqValues, v["v"].(string))
}
return uniqValues
}
func processTitle(title string, a []schema.Audio) string {
// remove ' - Donwload' from title
title = strings.Replace(title, " Download", "", -1)
@@ -337,38 +269,3 @@ func processTitle(title string, a []schema.Audio) string {
return title
}
func getDocument(ctx context.Context, i *Indexer, link string) (*goquery.Document, error) {
// try to get from redis first
docCache, err := i.redis.Get(ctx, link)
if err == nil {
i.metrics.CacheHits.WithLabelValues("document_body").Inc()
fmt.Printf("returning from long-lived cache: %s\n", link)
return goquery.NewDocumentFromReader(io.NopCloser(bytes.NewReader(docCache)))
}
defer i.metrics.CacheMisses.WithLabelValues("document_body").Inc()
resp, err := i.requester.GetDocument(ctx, link)
if err != nil {
return nil, err
}
defer resp.Close()
body, err := io.ReadAll(resp)
if err != nil {
return nil, err
}
// set cache
err = i.redis.Set(ctx, link, body)
if err != nil {
fmt.Println(err)
}
doc, err := goquery.NewDocumentFromReader(io.NopCloser(bytes.NewReader(body)))
if err != nil {
return nil, err
}
return doc, nil
}

View File

@@ -7,12 +7,10 @@ import (
"net/http"
"net/url"
"regexp"
"slices"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/hbollon/go-edlib"
"github.com/felipemarinho97/torrent-indexer/magnet"
"github.com/felipemarinho97/torrent-indexer/schema"
@@ -21,17 +19,21 @@ import (
)
var comandohds = IndexerMeta{
URL: "https://comandohds.org/",
SearchURL: "?s=",
Label: "comandohds",
URL: "https://comandohds.org/",
SearchURL: "?s=",
PagePattern: "page/%s",
}
var title_re = regexp.MustCompile(`^[(Filme)|(Série)\s]+`)
func (i *Indexer) HandlerComandoHDsIndexer(w http.ResponseWriter, r *http.Request) {
start := time.Now()
metadata := comandohds
defer func() {
i.metrics.IndexerDuration.WithLabelValues("comandohds").Observe(time.Since(start).Seconds())
i.metrics.IndexerRequests.WithLabelValues("comandohds").Inc()
i.metrics.IndexerDuration.WithLabelValues(metadata.Label).Observe(time.Since(start).Seconds())
i.metrics.IndexerRequests.WithLabelValues(metadata.Label).Inc()
}()
ctx := r.Context()
@@ -41,11 +43,11 @@ func (i *Indexer) HandlerComandoHDsIndexer(w http.ResponseWriter, r *http.Reques
// URL encode query param
q = url.QueryEscape(q)
url := comandohds.URL
url := metadata.URL
if q != "" {
url = fmt.Sprintf("%s%s%s", url, comandohds.SearchURL, q)
url = fmt.Sprintf("%s%s%s", url, metadata.SearchURL, q)
} else if page != "" {
url = fmt.Sprintf("%spage/%s", url, page)
url = fmt.Sprintf(fmt.Sprintf("%s%s", url, metadata.PagePattern), page)
}
fmt.Println("URL:>", url)
@@ -56,7 +58,7 @@ func (i *Indexer) HandlerComandoHDsIndexer(w http.ResponseWriter, r *http.Reques
if err != nil {
fmt.Println(err)
}
i.metrics.IndexerErrors.WithLabelValues("comandohds").Inc()
i.metrics.IndexerErrors.WithLabelValues(metadata.Label).Inc()
return
}
defer resp.Close()
@@ -69,7 +71,7 @@ func (i *Indexer) HandlerComandoHDsIndexer(w http.ResponseWriter, r *http.Reques
fmt.Println(err)
}
i.metrics.IndexerErrors.WithLabelValues("comandohds").Inc()
i.metrics.IndexerErrors.WithLabelValues(metadata.Label).Inc()
return
}
@@ -79,57 +81,21 @@ func (i *Indexer) HandlerComandoHDsIndexer(w http.ResponseWriter, r *http.Reques
links = append(links, link)
})
var itChan = make(chan []schema.IndexedTorrent)
var errChan = make(chan error)
indexedTorrents := []schema.IndexedTorrent{}
for _, link := range links {
go func(link string) {
torrents, err := getTorrentsComandoHDs(ctx, i, link)
if err != nil {
fmt.Println(err)
errChan <- err
}
itChan <- torrents
}(link)
}
for i := 0; i < len(links); i++ {
select {
case torrents := <-itChan:
indexedTorrents = append(indexedTorrents, torrents...)
case err := <-errChan:
fmt.Println(err)
}
}
for i, it := range indexedTorrents {
jLower := strings.ReplaceAll(strings.ToLower(fmt.Sprintf("%s %s", it.Title, it.OriginalTitle)), ".", " ")
qLower := strings.ToLower(q)
splitLength := 2
indexedTorrents[i].Similarity = edlib.JaccardSimilarity(jLower, qLower, splitLength)
}
// remove the ones with zero similarity
if len(indexedTorrents) > 20 && r.URL.Query().Get("filter_results") != "" && r.URL.Query().Get("q") != "" {
indexedTorrents = utils.Filter(indexedTorrents, func(it schema.IndexedTorrent) bool {
return it.Similarity > 0
})
}
// sort by similarity
slices.SortFunc(indexedTorrents, func(i, j schema.IndexedTorrent) int {
return int((j.Similarity - i.Similarity) * 1000)
// extract each torrent link
indexedTorrents := utils.ParallelMap(links, func(link string) ([]schema.IndexedTorrent, error) {
return getTorrentsComandoHDs(ctx, i, link)
})
// send to search index
go func() {
_ = i.search.IndexTorrents(indexedTorrents)
}()
// Apply post-processors
postProcessedTorrents := indexedTorrents
for _, processor := range i.postProcessors {
postProcessedTorrents = processor(i, r, postProcessedTorrents)
}
w.Header().Set("Content-Type", "application/json")
err = json.NewEncoder(w).Encode(Response{
Results: indexedTorrents,
Count: len(indexedTorrents),
Results: postProcessedTorrents,
Count: len(postProcessedTorrents),
})
if err != nil {
fmt.Println(err)
@@ -193,7 +159,7 @@ func getTorrentsComandoHDs(ctx context.Context, i *Indexer, link string) ([]sche
}
})
size = stableUniq(size)
size = utils.StableUniq(size)
var chanIndexedTorrent = make(chan schema.IndexedTorrent)
@@ -228,7 +194,7 @@ func getTorrentsComandoHDs(ctx context.Context, i *Indexer, link string) ([]sche
}
ixt := schema.IndexedTorrent{
Title: appendAudioISO639_2Code(releaseTitle, magnetAudio),
Title: releaseTitle,
OriginalTitle: title,
Details: link,
Year: year,

View File

@@ -1,7 +1,10 @@
package handler
import (
"bytes"
"context"
"fmt"
"io"
"regexp"
"slices"
"strings"
@@ -11,6 +14,43 @@ import (
"github.com/felipemarinho97/torrent-indexer/schema"
)
// getDocument retrieves a document from the cache or makes a request to get it.
// It first checks the Redis cache for the document body.
func getDocument(ctx context.Context, i *Indexer, link string) (*goquery.Document, error) {
// try to get from redis first
docCache, err := i.redis.Get(ctx, link)
if err == nil {
i.metrics.CacheHits.WithLabelValues("document_body").Inc()
fmt.Printf("returning from long-lived cache: %s\n", link)
return goquery.NewDocumentFromReader(io.NopCloser(bytes.NewReader(docCache)))
}
defer i.metrics.CacheMisses.WithLabelValues("document_body").Inc()
resp, err := i.requester.GetDocument(ctx, link)
if err != nil {
return nil, err
}
defer resp.Close()
body, err := io.ReadAll(resp)
if err != nil {
return nil, err
}
// set cache
err = i.redis.Set(ctx, link, body)
if err != nil {
fmt.Println(err)
}
doc, err := goquery.NewDocumentFromReader(io.NopCloser(bytes.NewReader(body)))
if err != nil {
return nil, err
}
return doc, nil
}
func getPublishedDateFromMeta(document *goquery.Document) time.Time {
var date time.Time
//<meta property="article:published_time" content="2019-08-23T13:20:57+00:00">

View File

@@ -13,17 +13,18 @@ import (
)
type Indexer struct {
redis *cache.Redis
metrics *monitoring.Metrics
requester *requester.Requster
search *meilisearch.SearchIndexer
redis *cache.Redis
metrics *monitoring.Metrics
requester *requester.Requster
search *meilisearch.SearchIndexer
postProcessors []PostProcessorFunc
}
type IndexerMeta struct {
URL string
SearchURL string
// pattern for pagination, e.g. "page/%s"
PagePattern string
Label string // Label is used for Prometheus metrics and logging. Must be alphanumeric optionally with underscores.
URL string // URL is the base URL of the indexer, e.g. "https://example.com/"
SearchURL string // SearchURL is the base URL for search queries, e.g. "?s="
PagePattern string // PagePattern for pagination, e.g. "page/%s"
}
type Response struct {
@@ -31,12 +32,22 @@ type Response struct {
Count int `json:"count"`
}
type PostProcessorFunc func(*Indexer, *http.Request, []schema.IndexedTorrent) []schema.IndexedTorrent
var GlobalPostProcessors = []PostProcessorFunc{
AddSimilarityCheck, // Jaccard similarity
CleanupTitleWebsites, // Remove website names from titles
AppendAudioTags, // Add (brazilian, eng, etc.) audio tags to titles
SendToSearchIndexer, // Send indexed torrents to Meilisearch
}
func NewIndexers(redis *cache.Redis, metrics *monitoring.Metrics, req *requester.Requster, si *meilisearch.SearchIndexer) *Indexer {
return &Indexer{
redis: redis,
metrics: metrics,
requester: req,
search: si,
redis: redis,
metrics: metrics,
requester: req,
search: si,
postProcessors: GlobalPostProcessors,
}
}

61
api/post_processors.go Normal file
View File

@@ -0,0 +1,61 @@
package handler
import (
"fmt"
"net/http"
"slices"
"strings"
"github.com/felipemarinho97/torrent-indexer/schema"
"github.com/felipemarinho97/torrent-indexer/utils"
"github.com/hbollon/go-edlib"
)
// CleanupTitleWebsites removes unwanted characters from the title
func CleanupTitleWebsites(_ *Indexer, _ *http.Request, torrents []schema.IndexedTorrent) []schema.IndexedTorrent {
for i := range torrents {
torrents[i].Title = utils.RemoveKnownWebsites(torrents[i].Title)
}
return torrents
}
func AppendAudioTags(_ *Indexer, _ *http.Request, torrents []schema.IndexedTorrent) []schema.IndexedTorrent {
for i, it := range torrents {
torrents[i].Title = appendAudioISO639_2Code(torrents[i].Title, it.Audio)
}
return torrents
}
// SendToSearchIndexer sends the indexed torrents to the search indexer
func SendToSearchIndexer(i *Indexer, _ *http.Request, torrents []schema.IndexedTorrent) []schema.IndexedTorrent {
go func() {
_ = i.search.IndexTorrents(torrents)
}()
return torrents
}
func AddSimilarityCheck(i *Indexer, r *http.Request, torrents []schema.IndexedTorrent) []schema.IndexedTorrent {
q := r.URL.Query().Get("q")
for i, it := range torrents {
jLower := strings.ReplaceAll(strings.ToLower(fmt.Sprintf("%s %s", it.Title, it.OriginalTitle)), ".", " ")
qLower := strings.ToLower(q)
splitLength := 2
torrents[i].Similarity = edlib.JaccardSimilarity(jLower, qLower, splitLength)
}
// remove the ones with zero similarity
if len(torrents) > 20 && r.URL.Query().Get("filter_results") != "" && r.URL.Query().Get("q") != "" {
torrents = utils.Filter(torrents, func(it schema.IndexedTorrent) bool {
return it.Similarity > 0
})
}
// sort by similarity
slices.SortFunc(torrents, func(i, j schema.IndexedTorrent) int {
return int((j.Similarity - i.Similarity) * 1000)
})
return torrents
}

View File

@@ -7,12 +7,10 @@ import (
"net/http"
"net/url"
"regexp"
"slices"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/hbollon/go-edlib"
"github.com/felipemarinho97/torrent-indexer/magnet"
"github.com/felipemarinho97/torrent-indexer/schema"
@@ -21,6 +19,7 @@ import (
)
var rede_torrent = IndexerMeta{
Label: "rede_torrent",
URL: "https://redetorrent.com/",
SearchURL: "index.php?s=",
PagePattern: "%s",
@@ -28,9 +27,11 @@ var rede_torrent = IndexerMeta{
func (i *Indexer) HandlerRedeTorrentIndexer(w http.ResponseWriter, r *http.Request) {
start := time.Now()
metadata := rede_torrent
defer func() {
i.metrics.IndexerDuration.WithLabelValues("rede_torrent").Observe(time.Since(start).Seconds())
i.metrics.IndexerRequests.WithLabelValues("rede_torrent").Inc()
i.metrics.IndexerDuration.WithLabelValues(metadata.Label).Observe(time.Since(start).Seconds())
i.metrics.IndexerRequests.WithLabelValues(metadata.Label).Inc()
}()
ctx := r.Context()
@@ -40,11 +41,11 @@ func (i *Indexer) HandlerRedeTorrentIndexer(w http.ResponseWriter, r *http.Reque
// URL encode query param
q = url.QueryEscape(q)
url := rede_torrent.URL
url := metadata.URL
if q != "" {
url = fmt.Sprintf("%s%s%s", url, rede_torrent.SearchURL, q)
url = fmt.Sprintf("%s%s%s", url, metadata.SearchURL, q)
} else if page != "" {
url = fmt.Sprintf(fmt.Sprintf("%s%s", url, rede_torrent.PagePattern), page)
url = fmt.Sprintf(fmt.Sprintf("%s%s", url, metadata.PagePattern), page)
}
fmt.Println("URL:>", url)
@@ -55,7 +56,7 @@ func (i *Indexer) HandlerRedeTorrentIndexer(w http.ResponseWriter, r *http.Reque
if err != nil {
fmt.Println(err)
}
i.metrics.IndexerErrors.WithLabelValues("rede_torrent").Inc()
i.metrics.IndexerErrors.WithLabelValues(metadata.Label).Inc()
return
}
defer resp.Close()
@@ -68,7 +69,7 @@ func (i *Indexer) HandlerRedeTorrentIndexer(w http.ResponseWriter, r *http.Reque
fmt.Println(err)
}
i.metrics.IndexerErrors.WithLabelValues("rede_torrent").Inc()
i.metrics.IndexerErrors.WithLabelValues(metadata.Label).Inc()
return
}
@@ -78,57 +79,21 @@ func (i *Indexer) HandlerRedeTorrentIndexer(w http.ResponseWriter, r *http.Reque
links = append(links, link)
})
var itChan = make(chan []schema.IndexedTorrent)
var errChan = make(chan error)
indexedTorrents := []schema.IndexedTorrent{}
for _, link := range links {
go func(link string) {
torrents, err := getTorrentsRedeTorrent(ctx, i, link)
if err != nil {
fmt.Println(err)
errChan <- err
}
itChan <- torrents
}(link)
}
for i := 0; i < len(links); i++ {
select {
case torrents := <-itChan:
indexedTorrents = append(indexedTorrents, torrents...)
case err := <-errChan:
fmt.Println(err)
}
}
for i, it := range indexedTorrents {
jLower := strings.ReplaceAll(strings.ToLower(fmt.Sprintf("%s %s", it.Title, it.OriginalTitle)), ".", " ")
qLower := strings.ToLower(q)
splitLength := 2
indexedTorrents[i].Similarity = edlib.JaccardSimilarity(jLower, qLower, splitLength)
}
// remove the ones with zero similarity
if len(indexedTorrents) > 20 && r.URL.Query().Get("filter_results") != "" && r.URL.Query().Get("q") != "" {
indexedTorrents = utils.Filter(indexedTorrents, func(it schema.IndexedTorrent) bool {
return it.Similarity > 0
})
}
// sort by similarity
slices.SortFunc(indexedTorrents, func(i, j schema.IndexedTorrent) int {
return int((j.Similarity - i.Similarity) * 1000)
// extract each torrent link
indexedTorrents := utils.ParallelMap(links, func(link string) ([]schema.IndexedTorrent, error) {
return getTorrentsRedeTorrent(ctx, i, link)
})
// send to search index
go func() {
_ = i.search.IndexTorrents(indexedTorrents)
}()
// Apply post-processors
postProcessedTorrents := indexedTorrents
for _, processor := range i.postProcessors {
postProcessedTorrents = processor(i, r, postProcessedTorrents)
}
w.Header().Set("Content-Type", "application/json")
err = json.NewEncoder(w).Encode(Response{
Results: indexedTorrents,
Count: len(indexedTorrents),
Results: postProcessedTorrents,
Count: len(postProcessedTorrents),
})
if err != nil {
fmt.Println(err)
@@ -222,7 +187,7 @@ func getTorrentsRedeTorrent(ctx context.Context, i *Indexer, link string) ([]sch
}
})
size = stableUniq(size)
size = utils.StableUniq(size)
var chanIndexedTorrent = make(chan schema.IndexedTorrent)
@@ -253,7 +218,7 @@ func getTorrentsRedeTorrent(ctx context.Context, i *Indexer, link string) ([]sch
}
ixt := schema.IndexedTorrent{
Title: appendAudioISO639_2Code(releaseTitle, magnetAudio),
Title: releaseTitle,
OriginalTitle: title,
Details: link,
Year: year,

View File

@@ -6,12 +6,10 @@ import (
"fmt"
"net/http"
"net/url"
"slices"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/hbollon/go-edlib"
"github.com/felipemarinho97/torrent-indexer/magnet"
"github.com/felipemarinho97/torrent-indexer/schema"
@@ -20,15 +18,19 @@ import (
)
var starck_filmes = IndexerMeta{
URL: "https://www.starckfilmes.online/",
SearchURL: "?s=",
Label: "starck_filmes",
URL: "https://www.starckfilmes.online/",
SearchURL: "?s=",
PagePattern: "page/%s",
}
func (i *Indexer) HandlerStarckFilmesIndexer(w http.ResponseWriter, r *http.Request) {
start := time.Now()
metadata := starck_filmes
defer func() {
i.metrics.IndexerDuration.WithLabelValues("starck_filmes").Observe(time.Since(start).Seconds())
i.metrics.IndexerRequests.WithLabelValues("starck_filmes").Inc()
i.metrics.IndexerDuration.WithLabelValues(metadata.Label).Observe(time.Since(start).Seconds())
i.metrics.IndexerRequests.WithLabelValues(metadata.Label).Inc()
}()
ctx := r.Context()
@@ -38,11 +40,11 @@ func (i *Indexer) HandlerStarckFilmesIndexer(w http.ResponseWriter, r *http.Requ
// URL encode query param
q = url.QueryEscape(q)
url := starck_filmes.URL
url := metadata.URL
if q != "" {
url = fmt.Sprintf("%s%s%s", url, starck_filmes.SearchURL, q)
url = fmt.Sprintf("%s%s%s", url, metadata.SearchURL, q)
} else if page != "" {
url = fmt.Sprintf("%spage/%s", url, page)
url = fmt.Sprintf(fmt.Sprintf("%s%s", url, metadata.PagePattern), page)
}
fmt.Println("URL:>", url)
@@ -53,7 +55,7 @@ func (i *Indexer) HandlerStarckFilmesIndexer(w http.ResponseWriter, r *http.Requ
if err != nil {
fmt.Println(err)
}
i.metrics.IndexerErrors.WithLabelValues("starck_filmes").Inc()
i.metrics.IndexerErrors.WithLabelValues(metadata.Label).Inc()
return
}
defer resp.Close()
@@ -66,7 +68,7 @@ func (i *Indexer) HandlerStarckFilmesIndexer(w http.ResponseWriter, r *http.Requ
fmt.Println(err)
}
i.metrics.IndexerErrors.WithLabelValues("starck_filmes").Inc()
i.metrics.IndexerErrors.WithLabelValues(metadata.Label).Inc()
return
}
@@ -76,57 +78,21 @@ func (i *Indexer) HandlerStarckFilmesIndexer(w http.ResponseWriter, r *http.Requ
links = append(links, link)
})
var itChan = make(chan []schema.IndexedTorrent)
var errChan = make(chan error)
indexedTorrents := []schema.IndexedTorrent{}
for _, link := range links {
go func(link string) {
torrents, err := getTorrentStarckFilmes(ctx, i, link)
if err != nil {
fmt.Println(err)
errChan <- err
}
itChan <- torrents
}(link)
}
for i := 0; i < len(links); i++ {
select {
case torrents := <-itChan:
indexedTorrents = append(indexedTorrents, torrents...)
case err := <-errChan:
fmt.Println(err)
}
}
for i, it := range indexedTorrents {
jLower := strings.ReplaceAll(strings.ToLower(fmt.Sprintf("%s %s", it.Title, it.OriginalTitle)), ".", " ")
qLower := strings.ToLower(q)
splitLength := 2
indexedTorrents[i].Similarity = edlib.JaccardSimilarity(jLower, qLower, splitLength)
}
// remove the ones with zero similarity
if len(indexedTorrents) > 20 && r.URL.Query().Get("filter_results") != "" && r.URL.Query().Get("q") != "" {
indexedTorrents = utils.Filter(indexedTorrents, func(it schema.IndexedTorrent) bool {
return it.Similarity > 0
})
}
// sort by similarity
slices.SortFunc(indexedTorrents, func(i, j schema.IndexedTorrent) int {
return int((j.Similarity - i.Similarity) * 1000)
// extract each torrent link
indexedTorrents := utils.ParallelMap(links, func(link string) ([]schema.IndexedTorrent, error) {
return getTorrentStarckFilmes(ctx, i, link)
})
// send to search index
go func() {
_ = i.search.IndexTorrents(indexedTorrents)
}()
// Apply post-processors
postProcessedTorrents := indexedTorrents
for _, processor := range i.postProcessors {
postProcessedTorrents = processor(i, r, postProcessedTorrents)
}
w.Header().Set("Content-Type", "application/json")
err = json.NewEncoder(w).Encode(Response{
Results: indexedTorrents,
Count: len(indexedTorrents),
Results: postProcessedTorrents,
Count: len(postProcessedTorrents),
})
if err != nil {
fmt.Println(err)
@@ -184,7 +150,7 @@ func getTorrentStarckFilmes(ctx context.Context, i *Indexer, link string) ([]sch
// TODO: find any link from imdb
imdbLink := ""
size = stableUniq(size)
size = utils.StableUniq(size)
var chanIndexedTorrent = make(chan schema.IndexedTorrent)
@@ -228,7 +194,7 @@ func getTorrentStarckFilmes(ctx context.Context, i *Indexer, link string) ([]sch
}
ixt := schema.IndexedTorrent{
Title: appendAudioISO639_2Code(releaseTitle, magnetAudio),
Title: releaseTitle,
OriginalTitle: title,
Details: link,
Year: year,

View File

@@ -6,12 +6,10 @@ import (
"fmt"
"net/http"
"net/url"
"slices"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/hbollon/go-edlib"
"github.com/felipemarinho97/torrent-indexer/magnet"
"github.com/felipemarinho97/torrent-indexer/schema"
@@ -20,15 +18,19 @@ import (
)
var torrent_dos_filmes = IndexerMeta{
URL: "https://torrentdosfilmes.se/",
SearchURL: "?s=",
Label: "torrent_dos_filmes",
URL: "https://torrentdosfilmes.se/",
SearchURL: "?s=",
PagePattern: "page/%s",
}
func (i *Indexer) HandlerTorrentDosFilmesIndexer(w http.ResponseWriter, r *http.Request) {
start := time.Now()
metadata := torrent_dos_filmes
defer func() {
i.metrics.IndexerDuration.WithLabelValues("torrent_dos_filmes").Observe(time.Since(start).Seconds())
i.metrics.IndexerRequests.WithLabelValues("torrent_dos_filmes").Inc()
i.metrics.IndexerDuration.WithLabelValues(metadata.Label).Observe(time.Since(start).Seconds())
i.metrics.IndexerRequests.WithLabelValues(metadata.Label).Inc()
}()
ctx := r.Context()
@@ -38,11 +40,11 @@ func (i *Indexer) HandlerTorrentDosFilmesIndexer(w http.ResponseWriter, r *http.
// URL encode query param
q = url.QueryEscape(q)
url := torrent_dos_filmes.URL
url := metadata.URL
if q != "" {
url = fmt.Sprintf("%s%s%s", url, torrent_dos_filmes.SearchURL, q)
url = fmt.Sprintf("%s%s%s", url, metadata.SearchURL, q)
} else if page != "" {
url = fmt.Sprintf("%spage/%s", url, page)
url = fmt.Sprintf(fmt.Sprintf("%s%s", url, metadata.PagePattern), page)
}
fmt.Println("URL:>", url)
@@ -53,7 +55,7 @@ func (i *Indexer) HandlerTorrentDosFilmesIndexer(w http.ResponseWriter, r *http.
if err != nil {
fmt.Println(err)
}
i.metrics.IndexerErrors.WithLabelValues("torrent_dos_filmes").Inc()
i.metrics.IndexerErrors.WithLabelValues(metadata.Label).Inc()
return
}
defer resp.Close()
@@ -66,7 +68,7 @@ func (i *Indexer) HandlerTorrentDosFilmesIndexer(w http.ResponseWriter, r *http.
fmt.Println(err)
}
i.metrics.IndexerErrors.WithLabelValues("torrent_dos_filmes").Inc()
i.metrics.IndexerErrors.WithLabelValues(metadata.Label).Inc()
return
}
@@ -76,57 +78,21 @@ func (i *Indexer) HandlerTorrentDosFilmesIndexer(w http.ResponseWriter, r *http.
links = append(links, link)
})
var itChan = make(chan []schema.IndexedTorrent)
var errChan = make(chan error)
indexedTorrents := []schema.IndexedTorrent{}
for _, link := range links {
go func(link string) {
torrents, err := getTorrentsTorrentDosFilmes(ctx, i, link)
if err != nil {
fmt.Println(err)
errChan <- err
}
itChan <- torrents
}(link)
}
for i := 0; i < len(links); i++ {
select {
case torrents := <-itChan:
indexedTorrents = append(indexedTorrents, torrents...)
case err := <-errChan:
fmt.Println(err)
}
}
for i, it := range indexedTorrents {
jLower := strings.ReplaceAll(strings.ToLower(fmt.Sprintf("%s %s", it.Title, it.OriginalTitle)), ".", " ")
qLower := strings.ToLower(q)
splitLength := 2
indexedTorrents[i].Similarity = edlib.JaccardSimilarity(jLower, qLower, splitLength)
}
// remove the ones with zero similarity
if len(indexedTorrents) > 20 && r.URL.Query().Get("filter_results") != "" && r.URL.Query().Get("q") != "" {
indexedTorrents = utils.Filter(indexedTorrents, func(it schema.IndexedTorrent) bool {
return it.Similarity > 0
})
}
// sort by similarity
slices.SortFunc(indexedTorrents, func(i, j schema.IndexedTorrent) int {
return int((j.Similarity - i.Similarity) * 1000)
// extract each torrent link
indexedTorrents := utils.ParallelMap(links, func(link string) ([]schema.IndexedTorrent, error) {
return getTorrentsTorrentDosFilmes(ctx, i, link)
})
// send to search index
go func() {
_ = i.search.IndexTorrents(indexedTorrents)
}()
// Apply post-processors
postProcessedTorrents := indexedTorrents
for _, processor := range i.postProcessors {
postProcessedTorrents = processor(i, r, postProcessedTorrents)
}
w.Header().Set("Content-Type", "application/json")
err = json.NewEncoder(w).Encode(Response{
Results: indexedTorrents,
Count: len(indexedTorrents),
Results: postProcessedTorrents,
Count: len(postProcessedTorrents),
})
if err != nil {
fmt.Println(err)
@@ -191,7 +157,7 @@ func getTorrentsTorrentDosFilmes(ctx context.Context, i *Indexer, link string) (
}
})
size = stableUniq(size)
size = utils.StableUniq(size)
var chanIndexedTorrent = make(chan schema.IndexedTorrent)
@@ -222,7 +188,7 @@ func getTorrentsTorrentDosFilmes(ctx context.Context, i *Indexer, link string) (
}
ixt := schema.IndexedTorrent{
Title: appendAudioISO639_2Code(releaseTitle, magnetAudio),
Title: releaseTitle,
OriginalTitle: title,
Details: link,
Year: year,

View File

@@ -25,6 +25,7 @@ const (
AudioMandarin3 = "Chines"
AudioRussian = "Russo"
AudioSwedish = "Sueco"
AudioSwedish2 = "Suéco"
AudioUkrainian = "Ucraniano"
AudioPolish = "Polaco"
AudioPolish2 = "Polonês"
@@ -34,9 +35,32 @@ const (
AudioTurkish = "Turco"
AudioHindi = "Hindi"
AudioFarsi = "Persa"
AudioFarsi2 = "Farsi"
AudioFarsi3 = "Iraniano"
AudioMalay = "Malaio"
AudioDutch = "Holandês"
AudioDutch2 = "Holandes"
AudioFinnish = "Finlandês"
AudioFinnish2 = "Finlandes"
AudioDanish = "Dinamarquês"
AudioDanish2 = "Dinamarques"
AudioNorwegian = "Norueguês"
AudioNorwegian2 = "Noruegues"
AudioIcelandic = "Islandês"
AudioIcelandic2 = "Islandes"
AudioGreek = "Grego"
AudioArabic = "Árabe"
AudioArabic2 = "Arabe"
AudioHebrew = "Hebraico"
AudioVietnamese = "Vietnamita"
AudioIndonesian = "Indonésio"
AudioIndonesian2 = "Indonesio"
AudioFilipino = "Filipino"
AudioBengali = "Bengali"
AudioTamil = "Tamil"
AudioTelugu = "Telugu"
AudioGujarati = "Gujarati"
AudioMarathi = "Marathi"
)
var AudioList = []Audio{
@@ -60,6 +84,7 @@ var AudioList = []Audio{
AudioMandarin3,
AudioRussian,
AudioSwedish,
AudioSwedish2,
AudioUkrainian,
AudioPolish,
AudioPolish2,
@@ -69,9 +94,32 @@ var AudioList = []Audio{
AudioTurkish,
AudioHindi,
AudioFarsi,
AudioFarsi2,
AudioFarsi3,
AudioMalay,
AudioDutch,
AudioDutch2,
AudioFinnish,
AudioFinnish2,
AudioDanish,
AudioDanish2,
AudioNorwegian,
AudioNorwegian2,
AudioIcelandic,
AudioIcelandic2,
AudioGreek,
AudioArabic,
AudioArabic2,
AudioHebrew,
AudioVietnamese,
AudioIndonesian,
AudioIndonesian2,
AudioFilipino,
AudioBengali,
AudioTamil,
AudioTelugu,
AudioGujarati,
AudioMarathi,
}
func (a Audio) String() string {
@@ -129,6 +177,8 @@ func (a Audio) toTag() string {
return "rus"
case AudioSwedish:
return "swe"
case AudioSwedish2:
return "swe"
case AudioUkrainian:
return "ukr"
case AudioPolish:
@@ -147,12 +197,58 @@ func (a Audio) toTag() string {
return "hin"
case AudioFarsi:
return "fas"
case AudioFarsi2:
return "fas"
case AudioFarsi3:
return "fas"
case AudioMalay:
return "msa"
case AudioDutch:
return "nld"
case AudioDutch2:
return "nld"
case AudioFinnish:
return "fin"
case AudioFinnish2:
return "fin"
case AudioDanish:
return "dan"
case AudioDanish2:
return "dan"
case AudioNorwegian:
return "nor"
case AudioNorwegian2:
return "nor"
case AudioIcelandic:
return "isl"
case AudioIcelandic2:
return "isl"
case AudioGreek:
return "ell"
case AudioArabic:
return "ara"
case AudioArabic2:
return "ara"
case AudioHebrew:
return "heb"
case AudioVietnamese:
return "vie"
case AudioIndonesian:
return "ind"
case AudioIndonesian2:
return "ind"
case AudioFilipino:
return "fil"
case AudioBengali:
return "ben"
case AudioTamil:
return "tam"
case AudioTelugu:
return "tel"
case AudioGujarati:
return "guj"
case AudioMarathi:
return "mar"
default:
return ""
}

View File

@@ -1,10 +1,13 @@
package utils
import (
"fmt"
"strings"
"golang.org/x/net/html"
)
// Filter filters a slice based on a predicate function.
func Filter[A any](arr []A, f func(A) bool) []A {
var res []A
res = make([]A, 0)
@@ -16,6 +19,71 @@ func Filter[A any](arr []A, f func(A) bool) []A {
return res
}
// ParallelMap applies mapper to every element of iterable concurrently and
// returns the concatenation of all successful results. Result order is not
// guaranteed to match input order (results are appended as workers finish).
//
// When mapper returns an error, that element contributes no results and the
// error is passed to every supplied errHandler; if no handler is given the
// error is printed to stdout. Each worker produces exactly one message, so
// the collector below terminates and no goroutine is leaked even when some
// mappers fail.
func ParallelMap[T any, R any](iterable []T, mapper func(item T) ([]R, error), errHandler ...func(error)) []R {
	itChan := make(chan []R)
	errChan := make(chan error)

	for _, item := range iterable {
		go func(item T) {
			items, err := mapper(item)
			if err != nil {
				// Report the error and stop: the collector consumes exactly
				// one message per input element, so sending on BOTH channels
				// here (as the original did) would block this goroutine on
				// the second send forever.
				errChan <- err
				return
			}
			itChan <- items
		}(item)
	}

	// Collect exactly one message (result slice or error) per input element.
	mappedItems := []R{}
	for range iterable {
		select {
		case items := <-itChan:
			mappedItems = append(mappedItems, items...)
		case err := <-errChan:
			for _, handler := range errHandler {
				handler(err)
			}
			if len(errHandler) == 0 {
				fmt.Println(err)
			}
		}
	}
	return mappedItems
}
// StableUniq removes duplicate strings from s while keeping a deterministic
// order: each value appears once, at the position of its LAST occurrence in
// the input (e.g. ["a","b","a"] -> ["b","a"]). This matches the behavior of
// the previous map-index implementation — NOTE(review): callers dedupe size
// strings with this; confirm last-occurrence (not first-occurrence) order is
// what they expect. Returns nil for empty input.
func StableUniq(s []string) []string {
	// First pass: remember the index of the last occurrence of each value
	// (later duplicates overwrite earlier ones).
	lastIdx := make(map[string]int, len(s))
	for i, v := range s {
		lastIdx[v] = i
	}
	// Second pass: emit each value only at its last occurrence, so the
	// output is ordered by last-occurrence position in O(n) — replacing the
	// original O(n^2) bubble sort over boxed map entries.
	var uniq []string
	for i, v := range s {
		if lastIdx[v] == i {
			uniq = append(uniq, v)
		}
	}
	return uniq
}
func IsValidHTML(input string) bool {
r := strings.NewReader(input)
_, err := html.Parse(r)

97
utils/website.go Normal file
View File

@@ -0,0 +1,97 @@
package utils
import (
"fmt"
"regexp"
"strings"
"sync"
)
// commonTLDs lists the top-level domains seen in site names that get
// embedded into release titles.
var commonTLDs = []string{
	".com",
	".net",
	".org",
	".info",
	".biz",
	".co",
	".io",
	".xyz",
	".me",
	".tv",
	".cc",
	".us",
	".online",
	".site",
	".la",
	".se",
	".to",
}

// commonSubdomains lists subdomain prefixes tried in front of each site name.
var commonSubdomains = []string{
	"", // no prefix
	"www.",
}

// commonWebsiteSLDs lists the second-level domain names of known torrent
// sites whose references should be stripped from titles.
var commonWebsiteSLDs = []string{
	"bludv",
	"torrentdosfilmes",
	"comando",
	"comandotorrents",
	"comandohds",
	"redetorrent",
	"torrenting",
	"baixarfilmesdubladosviatorrent",
	"hidratorrents",
	"wolverdonfilmes",
	"starckfilmes",
	"rapidotorrents",
	"sitedetorrents",
	"vamostorrent",
	"AZTORRENTS",
}

// websitePatterns are fmt templates; %s is replaced by a case-insensitive
// alternation over every subdomain+name+TLD combination.
var websitePatterns = []string{
	`\[\s*ACESSE\s+%s\s*\]`,
	`\[?\s*%s(\s*\])?`,
}

var regexesOnce sync.Once
var regexes []*regexp.Regexp

// getRegexes compiles (exactly once, guarded by regexesOnce) the regular
// expressions used to strip known website references from titles, and
// returns the cached slice.
func getRegexes() []*regexp.Regexp {
	regexesOnce.Do(func() {
		// Build every subdomain+name+TLD combination. Each candidate is
		// escaped with regexp.QuoteMeta so the "." in a TLD matches only a
		// literal dot — unescaped, "bludv.xyz" would also match "bludvXxyz"
		// for any character X.
		var sites []string
		for _, prefix := range commonSubdomains {
			for _, name := range commonWebsiteSLDs {
				for _, tld := range commonTLDs {
					sites = append(sites, regexp.QuoteMeta(prefix+name+tld))
				}
			}
		}
		// strings.Join avoids the original trailing-pipe fixup
		// (strings.Replace of "|)" with ")").
		alternation := "(?i)(" + strings.Join(sites, "|") + ")"
		for _, pattern := range websitePatterns {
			regexes = append(regexes, regexp.MustCompile(fmt.Sprintf(pattern, alternation)))
		}
	})
	return regexes
}
// RemoveKnownWebsites strips known torrent-site references from title.
// It deletes spans such as "[ ACESSE bludv.com ]", "[ bludv.se ]" or a bare
// "bludv.xyz" (built from the common subdomain/name/TLD lists) and trims
// any leftover surrounding whitespace before returning the cleaned title.
func RemoveKnownWebsites(title string) string {
	for _, pattern := range getRegexes() {
		title = pattern.ReplaceAllString(title, "")
	}
	return strings.TrimSpace(title)
}