Feat/Jaccard Similarity (#5)
* new: feat: add sorting and filtering by similarity * chg: fix: imdb link retrieval * chg: fix: do not filter when queryis empty
This commit is contained in:
@@ -9,6 +9,7 @@ import (
|
||||
"net/http"
|
||||
"net/url"
|
||||
"regexp"
|
||||
"slices"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@@ -16,6 +17,8 @@ import (
|
||||
"github.com/felipemarinho97/torrent-indexer/magnet"
|
||||
"github.com/felipemarinho97/torrent-indexer/schema"
|
||||
goscrape "github.com/felipemarinho97/torrent-indexer/scrape"
|
||||
"github.com/felipemarinho97/torrent-indexer/utils"
|
||||
"github.com/hbollon/go-edlib"
|
||||
)
|
||||
|
||||
var comando = IndexerMeta{
|
||||
@@ -104,6 +107,25 @@ func (i *Indexer) HandlerComandoIndexer(w http.ResponseWriter, r *http.Request)
|
||||
}
|
||||
}
|
||||
|
||||
for i, it := range indexedTorrents {
|
||||
jLower := strings.ReplaceAll(strings.ToLower(fmt.Sprintf("%s %s", it.Title, it.OriginalTitle)), ".", " ")
|
||||
qLower := strings.ToLower(q)
|
||||
splitLength := 2
|
||||
indexedTorrents[i].Similarity = edlib.JaccardSimilarity(jLower, qLower, splitLength)
|
||||
}
|
||||
|
||||
// remove the ones with zero similarity
|
||||
if len(indexedTorrents) > 20 && r.URL.Query().Get("filter_results") != "" && r.URL.Query().Get("q") != "" {
|
||||
indexedTorrents = utils.Filter(indexedTorrents, func(it IndexedTorrent) bool {
|
||||
return it.Similarity > 0
|
||||
})
|
||||
}
|
||||
|
||||
// sort by similarity
|
||||
slices.SortFunc(indexedTorrents, func(i, j IndexedTorrent) int {
|
||||
return int((j.Similarity - i.Similarity) * 1000)
|
||||
})
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
json.NewEncoder(w).Encode(Response{
|
||||
Results: indexedTorrents,
|
||||
@@ -176,7 +198,7 @@ func getTorrents(ctx context.Context, i *Indexer, link string) ([]IndexedTorrent
|
||||
|
||||
// find any link from imdb
|
||||
imdbLink := ""
|
||||
article.Find("div.content a").Each(func(i int, s *goquery.Selection) {
|
||||
article.Find("a").Each(func(i int, s *goquery.Selection) {
|
||||
link, _ := s.Attr("href")
|
||||
re := regexp.MustCompile(`https://www.imdb.com/title/(tt\d+)`)
|
||||
matches := re.FindStringSubmatch(link)
|
||||
|
||||
Reference in New Issue
Block a user