Feat/Add post processors + refactor project (#37)

* chg: feat: clean known patterns from title

* chg: refactor: remove duplicated code, and improve maintainability

* chg: feat: add audio tagging post-processor

* chg: refactor: add generic parallelMap function

* chg: refactor: move more functions to common locations

* chg: docs: add func docs
This commit is contained in:
2025-07-24 01:03:38 -03:00
committed by GitHub
parent 6eba15d52a
commit 455f734c8a
12 changed files with 532 additions and 433 deletions

View File

@@ -1,7 +1,10 @@
package handler
import (
"bytes"
"context"
"fmt"
"io"
"regexp"
"slices"
"strings"
@@ -11,6 +14,43 @@ import (
"github.com/felipemarinho97/torrent-indexer/schema"
)
// getDocument returns the parsed HTML document for link, preferring a cached
// copy of the body over a fresh request.
//
// The cache (i.redis) is consulted first, keyed by link. Any error from that
// lookup is treated as a miss: the body is fetched through i.requester, read
// fully, written back to the cache, and then parsed. A failure to write the
// cache entry is printed and otherwise ignored, so it never fails the call.
//
// The "document_body" hit counter is incremented on a cache hit; the matching
// miss counter is incremented when the function returns after a miss, whether
// or not the fetch succeeded.
func getDocument(ctx context.Context, i *Indexer, link string) (*goquery.Document, error) {
	// Fast path: serve the body straight from the cache when the lookup succeeds.
	if cached, cacheErr := i.redis.Get(ctx, link); cacheErr == nil {
		i.metrics.CacheHits.WithLabelValues("document_body").Inc()
		fmt.Printf("returning from long-lived cache: %s\n", link)
		return goquery.NewDocumentFromReader(io.NopCloser(bytes.NewReader(cached)))
	}

	// The counter is resolved here; the increment itself runs on return.
	missCounter := i.metrics.CacheMisses.WithLabelValues("document_body")
	defer missCounter.Inc()

	stream, err := i.requester.GetDocument(ctx, link)
	if err != nil {
		return nil, err
	}
	defer stream.Close()

	// Buffer the whole body so the same bytes can be cached and parsed.
	payload, err := io.ReadAll(stream)
	if err != nil {
		return nil, err
	}

	// Best-effort cache write: report the failure but keep serving the document.
	if setErr := i.redis.Set(ctx, link, payload); setErr != nil {
		fmt.Println(setErr)
	}

	parsed, parseErr := goquery.NewDocumentFromReader(io.NopCloser(bytes.NewReader(payload)))
	if parseErr != nil {
		return nil, parseErr
	}
	return parsed, nil
}
func getPublishedDateFromMeta(document *goquery.Document) time.Time {
var date time.Time
//<meta property="article:published_time" content="2019-08-23T13:20:57+00:00">