Feat/Jaccard Similarity (#5)
* new: feat: add sorting and filtering by similarity * chg: fix: imdb link retrieval * chg: fix: do not filter when query is empty
This commit is contained in:
25
api/bludv.go
25
api/bludv.go
@@ -7,13 +7,17 @@ import (
|
|||||||
"net/http"
|
"net/http"
|
||||||
"net/url"
|
"net/url"
|
||||||
"regexp"
|
"regexp"
|
||||||
|
"slices"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/PuerkitoBio/goquery"
|
"github.com/PuerkitoBio/goquery"
|
||||||
|
"github.com/hbollon/go-edlib"
|
||||||
|
|
||||||
"github.com/felipemarinho97/torrent-indexer/magnet"
|
"github.com/felipemarinho97/torrent-indexer/magnet"
|
||||||
"github.com/felipemarinho97/torrent-indexer/schema"
|
"github.com/felipemarinho97/torrent-indexer/schema"
|
||||||
goscrape "github.com/felipemarinho97/torrent-indexer/scrape"
|
goscrape "github.com/felipemarinho97/torrent-indexer/scrape"
|
||||||
|
"github.com/felipemarinho97/torrent-indexer/utils"
|
||||||
)
|
)
|
||||||
|
|
||||||
var bludv = IndexerMeta{
|
var bludv = IndexerMeta{
|
||||||
@@ -29,7 +33,7 @@ func (i *Indexer) HandlerBluDVIndexer(w http.ResponseWriter, r *http.Request) {
|
|||||||
}()
|
}()
|
||||||
|
|
||||||
ctx := r.Context()
|
ctx := r.Context()
|
||||||
// supported query params: q, season, episode
|
// supported query params: q, season, episode, filter_results
|
||||||
q := r.URL.Query().Get("q")
|
q := r.URL.Query().Get("q")
|
||||||
|
|
||||||
// URL encode query param
|
// URL encode query param
|
||||||
@@ -87,6 +91,25 @@ func (i *Indexer) HandlerBluDVIndexer(w http.ResponseWriter, r *http.Request) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for i, it := range indexedTorrents {
|
||||||
|
jLower := strings.ReplaceAll(strings.ToLower(fmt.Sprintf("%s %s", it.Title, it.OriginalTitle)), ".", " ")
|
||||||
|
qLower := strings.ToLower(q)
|
||||||
|
splitLength := 2
|
||||||
|
indexedTorrents[i].Similarity = edlib.JaccardSimilarity(jLower, qLower, splitLength)
|
||||||
|
}
|
||||||
|
|
||||||
|
// remove the ones with zero similarity
|
||||||
|
if len(indexedTorrents) > 20 && r.URL.Query().Get("filter_results") != "" && r.URL.Query().Get("q") != "" {
|
||||||
|
indexedTorrents = utils.Filter(indexedTorrents, func(it IndexedTorrent) bool {
|
||||||
|
return it.Similarity > 0
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// sort by similarity
|
||||||
|
slices.SortFunc(indexedTorrents, func(i, j IndexedTorrent) int {
|
||||||
|
return int((j.Similarity - i.Similarity) * 1000)
|
||||||
|
})
|
||||||
|
|
||||||
w.Header().Set("Content-Type", "application/json")
|
w.Header().Set("Content-Type", "application/json")
|
||||||
json.NewEncoder(w).Encode(Response{
|
json.NewEncoder(w).Encode(Response{
|
||||||
Results: indexedTorrents,
|
Results: indexedTorrents,
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ import (
|
|||||||
"net/http"
|
"net/http"
|
||||||
"net/url"
|
"net/url"
|
||||||
"regexp"
|
"regexp"
|
||||||
|
"slices"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@@ -16,6 +17,8 @@ import (
|
|||||||
"github.com/felipemarinho97/torrent-indexer/magnet"
|
"github.com/felipemarinho97/torrent-indexer/magnet"
|
||||||
"github.com/felipemarinho97/torrent-indexer/schema"
|
"github.com/felipemarinho97/torrent-indexer/schema"
|
||||||
goscrape "github.com/felipemarinho97/torrent-indexer/scrape"
|
goscrape "github.com/felipemarinho97/torrent-indexer/scrape"
|
||||||
|
"github.com/felipemarinho97/torrent-indexer/utils"
|
||||||
|
"github.com/hbollon/go-edlib"
|
||||||
)
|
)
|
||||||
|
|
||||||
var comando = IndexerMeta{
|
var comando = IndexerMeta{
|
||||||
@@ -104,6 +107,25 @@ func (i *Indexer) HandlerComandoIndexer(w http.ResponseWriter, r *http.Request)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for i, it := range indexedTorrents {
|
||||||
|
jLower := strings.ReplaceAll(strings.ToLower(fmt.Sprintf("%s %s", it.Title, it.OriginalTitle)), ".", " ")
|
||||||
|
qLower := strings.ToLower(q)
|
||||||
|
splitLength := 2
|
||||||
|
indexedTorrents[i].Similarity = edlib.JaccardSimilarity(jLower, qLower, splitLength)
|
||||||
|
}
|
||||||
|
|
||||||
|
// remove the ones with zero similarity
|
||||||
|
if len(indexedTorrents) > 20 && r.URL.Query().Get("filter_results") != "" && r.URL.Query().Get("q") != "" {
|
||||||
|
indexedTorrents = utils.Filter(indexedTorrents, func(it IndexedTorrent) bool {
|
||||||
|
return it.Similarity > 0
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// sort by similarity
|
||||||
|
slices.SortFunc(indexedTorrents, func(i, j IndexedTorrent) int {
|
||||||
|
return int((j.Similarity - i.Similarity) * 1000)
|
||||||
|
})
|
||||||
|
|
||||||
w.Header().Set("Content-Type", "application/json")
|
w.Header().Set("Content-Type", "application/json")
|
||||||
json.NewEncoder(w).Encode(Response{
|
json.NewEncoder(w).Encode(Response{
|
||||||
Results: indexedTorrents,
|
Results: indexedTorrents,
|
||||||
@@ -176,7 +198,7 @@ func getTorrents(ctx context.Context, i *Indexer, link string) ([]IndexedTorrent
|
|||||||
|
|
||||||
// find any link from imdb
|
// find any link from imdb
|
||||||
imdbLink := ""
|
imdbLink := ""
|
||||||
article.Find("div.content a").Each(func(i int, s *goquery.Selection) {
|
article.Find("a").Each(func(i int, s *goquery.Selection) {
|
||||||
link, _ := s.Attr("href")
|
link, _ := s.Attr("href")
|
||||||
re := regexp.MustCompile(`https://www.imdb.com/title/(tt\d+)`)
|
re := regexp.MustCompile(`https://www.imdb.com/title/(tt\d+)`)
|
||||||
matches := re.FindStringSubmatch(link)
|
matches := re.FindStringSubmatch(link)
|
||||||
|
|||||||
@@ -39,6 +39,7 @@ type IndexedTorrent struct {
|
|||||||
Size string `json:"size"`
|
Size string `json:"size"`
|
||||||
LeechCount int `json:"leech_count"`
|
LeechCount int `json:"leech_count"`
|
||||||
SeedCount int `json:"seed_count"`
|
SeedCount int `json:"seed_count"`
|
||||||
|
Similarity float32 `json:"similarity"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewIndexers(redis *cache.Redis, metrics *monitoring.Metrics) *Indexer {
|
func NewIndexers(redis *cache.Redis, metrics *monitoring.Metrics) *Indexer {
|
||||||
@@ -59,14 +60,16 @@ func HandlerIndex(w http.ResponseWriter, r *http.Request) {
|
|||||||
"method": "GET",
|
"method": "GET",
|
||||||
"description": "Indexer for comando torrents",
|
"description": "Indexer for comando torrents",
|
||||||
"query_params": map[string]string{
|
"query_params": map[string]string{
|
||||||
"q": "search query",
|
"q": "search query",
|
||||||
|
"filter_results": "if results with similarity equals to zero should be filtered (true/false)",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
"/indexers/bludv": map[string]interface{}{
|
"/indexers/bludv": map[string]interface{}{
|
||||||
"method": "GET",
|
"method": "GET",
|
||||||
"description": "Indexer for bludv",
|
"description": "Indexer for bludv",
|
||||||
"query_params": map[string]string{
|
"query_params": map[string]string{
|
||||||
"q": "search query",
|
"q": "search query",
|
||||||
|
"filter_results": "if results with similarity equals to zero should be filtered (true/false)",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
|||||||
1
go.mod
1
go.mod
@@ -19,5 +19,6 @@ require (
|
|||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/PuerkitoBio/goquery v1.9.1
|
github.com/PuerkitoBio/goquery v1.9.1
|
||||||
|
github.com/hbollon/go-edlib v1.6.0
|
||||||
github.com/prometheus/client_golang v1.19.0
|
github.com/prometheus/client_golang v1.19.0
|
||||||
)
|
)
|
||||||
|
|||||||
4
go.sum
4
go.sum
@@ -10,10 +10,14 @@ github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA=
|
|||||||
github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0=
|
github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0=
|
||||||
github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44=
|
github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44=
|
||||||
github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
|
github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
|
||||||
|
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||||
|
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||||
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78=
|
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78=
|
||||||
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc=
|
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc=
|
||||||
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
|
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
|
||||||
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
||||||
|
github.com/hbollon/go-edlib v1.6.0 h1:ga7AwwVIvP8mHm9GsPueC0d71cfRU/52hmPJ7Tprv4E=
|
||||||
|
github.com/hbollon/go-edlib v1.6.0/go.mod h1:wnt6o6EIVEzUfgbUZY7BerzQ2uvzp354qmS2xaLkrhM=
|
||||||
github.com/prometheus/client_golang v1.19.0 h1:ygXvpU1AoN1MhdzckN+PyD9QJOSD4x7kmXYlnfbA6JU=
|
github.com/prometheus/client_golang v1.19.0 h1:ygXvpU1AoN1MhdzckN+PyD9QJOSD4x7kmXYlnfbA6JU=
|
||||||
github.com/prometheus/client_golang v1.19.0/go.mod h1:ZRM9uEAypZakd+q/x7+gmsvXdURP+DABIEIjnmDdp+k=
|
github.com/prometheus/client_golang v1.19.0/go.mod h1:ZRM9uEAypZakd+q/x7+gmsvXdURP+DABIEIjnmDdp+k=
|
||||||
github.com/prometheus/client_model v0.6.0 h1:k1v3CzpSRUTrKMppY35TLwPvxHqBu0bYgxZzqGIgaos=
|
github.com/prometheus/client_model v0.6.0 h1:k1v3CzpSRUTrKMppY35TLwPvxHqBu0bYgxZzqGIgaos=
|
||||||
|
|||||||
12
utils/util.go
Normal file
12
utils/util.go
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
package utils
|
||||||
|
|
||||||
|
// Filter returns a new slice holding the elements of arr for which f
// reports true, in their original order. arr itself is not modified.
//
// The result is always non-nil, even when arr is nil or nothing matches,
// so callers that JSON-encode it get an empty array rather than null.
func Filter[A any](arr []A, f func(A) bool) []A {
	// At most len(arr) elements can be kept; sizing the buffer up front
	// avoids repeated growth copies while appending.
	res := make([]A, 0, len(arr))
	for _, v := range arr {
		if f(v) {
			res = append(res, v)
		}
	}
	return res
}
|
||||||
Reference in New Issue
Block a user