Feat/Jaccard Similarity (#5)
* new: feat: add sorting and filtering by similarity * chg: fix: imdb link retrieval * chg: fix: do not filter when query is empty
This commit is contained in:
25
api/bludv.go
25
api/bludv.go
@@ -7,13 +7,17 @@ import (
|
||||
"net/http"
|
||||
"net/url"
|
||||
"regexp"
|
||||
"slices"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/hbollon/go-edlib"
|
||||
|
||||
"github.com/felipemarinho97/torrent-indexer/magnet"
|
||||
"github.com/felipemarinho97/torrent-indexer/schema"
|
||||
goscrape "github.com/felipemarinho97/torrent-indexer/scrape"
|
||||
"github.com/felipemarinho97/torrent-indexer/utils"
|
||||
)
|
||||
|
||||
var bludv = IndexerMeta{
|
||||
@@ -29,7 +33,7 @@ func (i *Indexer) HandlerBluDVIndexer(w http.ResponseWriter, r *http.Request) {
|
||||
}()
|
||||
|
||||
ctx := r.Context()
|
||||
// supported query params: q, season, episode
|
||||
// supported query params: q, season, episode, filter_results
|
||||
q := r.URL.Query().Get("q")
|
||||
|
||||
// URL encode query param
|
||||
@@ -87,6 +91,25 @@ func (i *Indexer) HandlerBluDVIndexer(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
}
|
||||
|
||||
for i, it := range indexedTorrents {
|
||||
jLower := strings.ReplaceAll(strings.ToLower(fmt.Sprintf("%s %s", it.Title, it.OriginalTitle)), ".", " ")
|
||||
qLower := strings.ToLower(q)
|
||||
splitLength := 2
|
||||
indexedTorrents[i].Similarity = edlib.JaccardSimilarity(jLower, qLower, splitLength)
|
||||
}
|
||||
|
||||
// remove the ones with zero similarity
|
||||
if len(indexedTorrents) > 20 && r.URL.Query().Get("filter_results") != "" && r.URL.Query().Get("q") != "" {
|
||||
indexedTorrents = utils.Filter(indexedTorrents, func(it IndexedTorrent) bool {
|
||||
return it.Similarity > 0
|
||||
})
|
||||
}
|
||||
|
||||
// sort by similarity
|
||||
slices.SortFunc(indexedTorrents, func(i, j IndexedTorrent) int {
|
||||
return int((j.Similarity - i.Similarity) * 1000)
|
||||
})
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
json.NewEncoder(w).Encode(Response{
|
||||
Results: indexedTorrents,
|
||||
|
||||
@@ -9,6 +9,7 @@ import (
|
||||
"net/http"
|
||||
"net/url"
|
||||
"regexp"
|
||||
"slices"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@@ -16,6 +17,8 @@ import (
|
||||
"github.com/felipemarinho97/torrent-indexer/magnet"
|
||||
"github.com/felipemarinho97/torrent-indexer/schema"
|
||||
goscrape "github.com/felipemarinho97/torrent-indexer/scrape"
|
||||
"github.com/felipemarinho97/torrent-indexer/utils"
|
||||
"github.com/hbollon/go-edlib"
|
||||
)
|
||||
|
||||
var comando = IndexerMeta{
|
||||
@@ -104,6 +107,25 @@ func (i *Indexer) HandlerComandoIndexer(w http.ResponseWriter, r *http.Request)
|
||||
}
|
||||
}
|
||||
|
||||
for i, it := range indexedTorrents {
|
||||
jLower := strings.ReplaceAll(strings.ToLower(fmt.Sprintf("%s %s", it.Title, it.OriginalTitle)), ".", " ")
|
||||
qLower := strings.ToLower(q)
|
||||
splitLength := 2
|
||||
indexedTorrents[i].Similarity = edlib.JaccardSimilarity(jLower, qLower, splitLength)
|
||||
}
|
||||
|
||||
// remove the ones with zero similarity
|
||||
if len(indexedTorrents) > 20 && r.URL.Query().Get("filter_results") != "" && r.URL.Query().Get("q") != "" {
|
||||
indexedTorrents = utils.Filter(indexedTorrents, func(it IndexedTorrent) bool {
|
||||
return it.Similarity > 0
|
||||
})
|
||||
}
|
||||
|
||||
// sort by similarity
|
||||
slices.SortFunc(indexedTorrents, func(i, j IndexedTorrent) int {
|
||||
return int((j.Similarity - i.Similarity) * 1000)
|
||||
})
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
json.NewEncoder(w).Encode(Response{
|
||||
Results: indexedTorrents,
|
||||
@@ -176,7 +198,7 @@ func getTorrents(ctx context.Context, i *Indexer, link string) ([]IndexedTorrent
|
||||
|
||||
// find any link from imdb
|
||||
imdbLink := ""
|
||||
article.Find("div.content a").Each(func(i int, s *goquery.Selection) {
|
||||
article.Find("a").Each(func(i int, s *goquery.Selection) {
|
||||
link, _ := s.Attr("href")
|
||||
re := regexp.MustCompile(`https://www.imdb.com/title/(tt\d+)`)
|
||||
matches := re.FindStringSubmatch(link)
|
||||
|
||||
@@ -39,6 +39,7 @@ type IndexedTorrent struct {
|
||||
Size string `json:"size"`
|
||||
LeechCount int `json:"leech_count"`
|
||||
SeedCount int `json:"seed_count"`
|
||||
Similarity float32 `json:"similarity"`
|
||||
}
|
||||
|
||||
func NewIndexers(redis *cache.Redis, metrics *monitoring.Metrics) *Indexer {
|
||||
@@ -59,14 +60,16 @@ func HandlerIndex(w http.ResponseWriter, r *http.Request) {
|
||||
"method": "GET",
|
||||
"description": "Indexer for comando torrents",
|
||||
"query_params": map[string]string{
|
||||
"q": "search query",
|
||||
"q": "search query",
|
||||
"filter_results": "if results with similarity equals to zero should be filtered (true/false)",
|
||||
},
|
||||
},
|
||||
"/indexers/bludv": map[string]interface{}{
|
||||
"method": "GET",
|
||||
"description": "Indexer for bludv",
|
||||
"query_params": map[string]string{
|
||||
"q": "search query",
|
||||
"q": "search query",
|
||||
"filter_results": "if results with similarity equals to zero should be filtered (true/false)",
|
||||
},
|
||||
},
|
||||
},
|
||||
|
||||
1
go.mod
1
go.mod
@@ -19,5 +19,6 @@ require (
|
||||
|
||||
require (
|
||||
github.com/PuerkitoBio/goquery v1.9.1
|
||||
github.com/hbollon/go-edlib v1.6.0
|
||||
github.com/prometheus/client_golang v1.19.0
|
||||
)
|
||||
|
||||
4
go.sum
4
go.sum
@@ -10,10 +10,14 @@ github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA=
|
||||
github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0=
|
||||
github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44=
|
||||
github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78=
|
||||
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc=
|
||||
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
|
||||
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
||||
github.com/hbollon/go-edlib v1.6.0 h1:ga7AwwVIvP8mHm9GsPueC0d71cfRU/52hmPJ7Tprv4E=
|
||||
github.com/hbollon/go-edlib v1.6.0/go.mod h1:wnt6o6EIVEzUfgbUZY7BerzQ2uvzp354qmS2xaLkrhM=
|
||||
github.com/prometheus/client_golang v1.19.0 h1:ygXvpU1AoN1MhdzckN+PyD9QJOSD4x7kmXYlnfbA6JU=
|
||||
github.com/prometheus/client_golang v1.19.0/go.mod h1:ZRM9uEAypZakd+q/x7+gmsvXdURP+DABIEIjnmDdp+k=
|
||||
github.com/prometheus/client_model v0.6.0 h1:k1v3CzpSRUTrKMppY35TLwPvxHqBu0bYgxZzqGIgaos=
|
||||
|
||||
12
utils/util.go
Normal file
12
utils/util.go
Normal file
@@ -0,0 +1,12 @@
|
||||
package utils
|
||||
|
||||
// Filter returns a new slice holding the elements of arr for which f
// reports true, in their original order. arr itself is not modified.
//
// The result is always non-nil, even when arr is nil or no element matches,
// so encoding/json renders an empty result as [] rather than null.
func Filter[A any](arr []A, f func(A) bool) []A {
	// At most len(arr) elements can be kept; reserving that capacity up
	// front avoids repeated growth copies while appending.
	res := make([]A, 0, len(arr))
	for _, v := range arr {
		if f(v) {
			res = append(res, v)
		}
	}
	return res
}
|
||||
Reference in New Issue
Block a user