Feat/Jaccard Similarity (#5)

* new: feat: add sorting and filtering by similarity

* chg: fix: imdb link retrieval

* chg: fix: do not filter when query is empty
This commit is contained in:
2024-03-10 14:58:15 -03:00
committed by GitHub
parent 322bb34ebb
commit 268ece5650
6 changed files with 69 additions and 4 deletions

View File

@@ -7,13 +7,17 @@ import (
"net/http"
"net/url"
"regexp"
"slices"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/hbollon/go-edlib"
"github.com/felipemarinho97/torrent-indexer/magnet"
"github.com/felipemarinho97/torrent-indexer/schema"
goscrape "github.com/felipemarinho97/torrent-indexer/scrape"
"github.com/felipemarinho97/torrent-indexer/utils"
)
var bludv = IndexerMeta{
@@ -29,7 +33,7 @@ func (i *Indexer) HandlerBluDVIndexer(w http.ResponseWriter, r *http.Request) {
}()
ctx := r.Context()
// supported query params: q, season, episode
// supported query params: q, season, episode, filter_results
q := r.URL.Query().Get("q")
// URL encode query param
@@ -87,6 +91,25 @@ func (i *Indexer) HandlerBluDVIndexer(w http.ResponseWriter, r *http.Request) {
}
}
for i, it := range indexedTorrents {
jLower := strings.ReplaceAll(strings.ToLower(fmt.Sprintf("%s %s", it.Title, it.OriginalTitle)), ".", " ")
qLower := strings.ToLower(q)
splitLength := 2
indexedTorrents[i].Similarity = edlib.JaccardSimilarity(jLower, qLower, splitLength)
}
// remove the ones with zero similarity
if len(indexedTorrents) > 20 && r.URL.Query().Get("filter_results") != "" && r.URL.Query().Get("q") != "" {
indexedTorrents = utils.Filter(indexedTorrents, func(it IndexedTorrent) bool {
return it.Similarity > 0
})
}
// sort by similarity
slices.SortFunc(indexedTorrents, func(i, j IndexedTorrent) int {
return int((j.Similarity - i.Similarity) * 1000)
})
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(Response{
Results: indexedTorrents,

View File

@@ -9,6 +9,7 @@ import (
"net/http"
"net/url"
"regexp"
"slices"
"strings"
"time"
@@ -16,6 +17,8 @@ import (
"github.com/felipemarinho97/torrent-indexer/magnet"
"github.com/felipemarinho97/torrent-indexer/schema"
goscrape "github.com/felipemarinho97/torrent-indexer/scrape"
"github.com/felipemarinho97/torrent-indexer/utils"
"github.com/hbollon/go-edlib"
)
var comando = IndexerMeta{
@@ -104,6 +107,25 @@ func (i *Indexer) HandlerComandoIndexer(w http.ResponseWriter, r *http.Request)
}
}
for i, it := range indexedTorrents {
jLower := strings.ReplaceAll(strings.ToLower(fmt.Sprintf("%s %s", it.Title, it.OriginalTitle)), ".", " ")
qLower := strings.ToLower(q)
splitLength := 2
indexedTorrents[i].Similarity = edlib.JaccardSimilarity(jLower, qLower, splitLength)
}
// remove the ones with zero similarity
if len(indexedTorrents) > 20 && r.URL.Query().Get("filter_results") != "" && r.URL.Query().Get("q") != "" {
indexedTorrents = utils.Filter(indexedTorrents, func(it IndexedTorrent) bool {
return it.Similarity > 0
})
}
// sort by similarity
slices.SortFunc(indexedTorrents, func(i, j IndexedTorrent) int {
return int((j.Similarity - i.Similarity) * 1000)
})
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(Response{
Results: indexedTorrents,
@@ -176,7 +198,7 @@ func getTorrents(ctx context.Context, i *Indexer, link string) ([]IndexedTorrent
// find any link from imdb
imdbLink := ""
article.Find("div.content a").Each(func(i int, s *goquery.Selection) {
article.Find("a").Each(func(i int, s *goquery.Selection) {
link, _ := s.Attr("href")
re := regexp.MustCompile(`https://www.imdb.com/title/(tt\d+)`)
matches := re.FindStringSubmatch(link)

View File

@@ -39,6 +39,7 @@ type IndexedTorrent struct {
Size string `json:"size"`
LeechCount int `json:"leech_count"`
SeedCount int `json:"seed_count"`
Similarity float32 `json:"similarity"`
}
func NewIndexers(redis *cache.Redis, metrics *monitoring.Metrics) *Indexer {
@@ -59,14 +60,16 @@ func HandlerIndex(w http.ResponseWriter, r *http.Request) {
"method": "GET",
"description": "Indexer for comando torrents",
"query_params": map[string]string{
"q": "search query",
"q": "search query",
"filter_results": "if results with similarity equals to zero should be filtered (true/false)",
},
},
"/indexers/bludv": map[string]interface{}{
"method": "GET",
"description": "Indexer for bludv",
"query_params": map[string]string{
"q": "search query",
"q": "search query",
"filter_results": "if results with similarity equals to zero should be filtered (true/false)",
},
},
},

1
go.mod
View File

@@ -19,5 +19,6 @@ require (
require (
github.com/PuerkitoBio/goquery v1.9.1
github.com/hbollon/go-edlib v1.6.0
github.com/prometheus/client_golang v1.19.0
)

4
go.sum
View File

@@ -10,10 +10,14 @@ github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA=
github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0=
github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44=
github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78=
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc=
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/hbollon/go-edlib v1.6.0 h1:ga7AwwVIvP8mHm9GsPueC0d71cfRU/52hmPJ7Tprv4E=
github.com/hbollon/go-edlib v1.6.0/go.mod h1:wnt6o6EIVEzUfgbUZY7BerzQ2uvzp354qmS2xaLkrhM=
github.com/prometheus/client_golang v1.19.0 h1:ygXvpU1AoN1MhdzckN+PyD9QJOSD4x7kmXYlnfbA6JU=
github.com/prometheus/client_golang v1.19.0/go.mod h1:ZRM9uEAypZakd+q/x7+gmsvXdURP+DABIEIjnmDdp+k=
github.com/prometheus/client_model v0.6.0 h1:k1v3CzpSRUTrKMppY35TLwPvxHqBu0bYgxZzqGIgaos=

12
utils/util.go Normal file
View File

@@ -0,0 +1,12 @@
package utils
// Filter returns a new slice holding the elements of arr for which f
// returns true, in their original order. arr itself is not modified.
//
// The result is always non-nil, even when nothing matches or arr is nil,
// so it encodes to JSON as [] rather than null.
func Filter[A any](arr []A, f func(A) bool) []A {
	// Deliberately make() rather than a nil slice: see the non-nil
	// guarantee in the doc comment above.
	res := make([]A, 0)
	for _, v := range arr {
		if f(v) {
			res = append(res, v)
		}
	}
	return res
}