Feat/Jaccard Similarity (#5)

* new: feat: add sorting and filtering by similarity

* chg: fix: imdb link retrieval

* chg: fix: do not filter when queryis empty
This commit is contained in:
2024-03-10 14:58:15 -03:00
committed by GitHub
parent 322bb34ebb
commit 268ece5650
6 changed files with 69 additions and 4 deletions

View File

@@ -9,6 +9,7 @@ import (
"net/http"
"net/url"
"regexp"
"slices"
"strings"
"time"
@@ -16,6 +17,8 @@ import (
"github.com/felipemarinho97/torrent-indexer/magnet"
"github.com/felipemarinho97/torrent-indexer/schema"
goscrape "github.com/felipemarinho97/torrent-indexer/scrape"
"github.com/felipemarinho97/torrent-indexer/utils"
"github.com/hbollon/go-edlib"
)
var comando = IndexerMeta{
@@ -104,6 +107,25 @@ func (i *Indexer) HandlerComandoIndexer(w http.ResponseWriter, r *http.Request)
}
}
for i, it := range indexedTorrents {
jLower := strings.ReplaceAll(strings.ToLower(fmt.Sprintf("%s %s", it.Title, it.OriginalTitle)), ".", " ")
qLower := strings.ToLower(q)
splitLength := 2
indexedTorrents[i].Similarity = edlib.JaccardSimilarity(jLower, qLower, splitLength)
}
// remove the ones with zero similarity
if len(indexedTorrents) > 20 && r.URL.Query().Get("filter_results") != "" && r.URL.Query().Get("q") != "" {
indexedTorrents = utils.Filter(indexedTorrents, func(it IndexedTorrent) bool {
return it.Similarity > 0
})
}
// sort by similarity
slices.SortFunc(indexedTorrents, func(i, j IndexedTorrent) int {
return int((j.Similarity - i.Similarity) * 1000)
})
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(Response{
Results: indexedTorrents,
@@ -176,7 +198,7 @@ func getTorrents(ctx context.Context, i *Indexer, link string) ([]IndexedTorrent
// find any link from imdb
imdbLink := ""
article.Find("div.content a").Each(func(i int, s *goquery.Selection) {
article.Find("a").Each(func(i int, s *goquery.Selection) {
link, _ := s.Attr("href")
re := regexp.MustCompile(`https://www.imdb.com/title/(tt\d+)`)
matches := re.FindStringSubmatch(link)