diff --git a/api/comando_torrents.go b/api/comando_torrents.go index bc66a48..3d8ef26 100644 --- a/api/comando_torrents.go +++ b/api/comando_torrents.go @@ -13,6 +13,7 @@ import ( "time" "github.com/PuerkitoBio/goquery" + "github.com/felipemarinho97/torrent-indexer/magnet" "github.com/felipemarinho97/torrent-indexer/schema" goscrape "github.com/felipemarinho97/torrent-indexer/scrape" ) @@ -47,6 +48,7 @@ type IndexedTorrent struct { Date time.Time `json:"date"` InfoHash string `json:"info_hash"` Trackers []string `json:"trackers"` + Size string `json:"size"` LeechCount int `json:"leech_count"` SeedCount int `json:"seed_count"` } @@ -148,6 +150,7 @@ func getTorrents(ctx context.Context, i *Indexer, link string) ([]IndexedTorrent var audio []schema.Audio var year string + var size []string article.Find("div.entry-content > p").Each(func(i int, s *goquery.Selection) { // pattern: // Título Traduzido: Fundação @@ -167,45 +170,26 @@ func getTorrents(ctx context.Context, i *Indexer, link string) ([]IndexedTorrent // Servidor: Torrent text := s.Text() - //re := regexp.MustCompile(`Áudio: (.*)`) - re := regexp.MustCompile(`(Áudio|Idioma): (.*)`) - audioMatch := re.FindStringSubmatch(text) - if len(audioMatch) > 0 { - sep := getSeparator(audioMatch[2]) - langs_raw := strings.Split(audioMatch[2], sep) - for _, lang := range langs_raw { - lang = strings.TrimSpace(lang) - a := schema.GetAudioFromString(lang) - if a != nil { - audio = append(audio, *a) - } else { - fmt.Println("unknown language:", lang) - } - } - } - - re = regexp.MustCompile(`Lançamento: (.*)`) - yearMatch := re.FindStringSubmatch(text) - if len(yearMatch) > 0 { - year = yearMatch[1] - } - - // if year is empty, try to get it from title - if year == "" { - re = regexp.MustCompile(`\((\d{4})\)`) - yearMatch := re.FindStringSubmatch(title) - if len(yearMatch) > 0 { - year = yearMatch[1] - } - } + audio = append(audio, findAudioFromText(text)...) 
+ year = findYearFromText(text, title) + size = append(size, findSizesFromText(text)...) }) + size = stableUniq(size) + var chanIndexedTorrent = make(chan IndexedTorrent) // for each magnet link, create a new indexed torrent - for _, magnetLink := range magnetLinks { - go func(magnetLink string) { - releaseTitle := extractReleaseName(magnetLink) + for it, magnetLink := range magnetLinks { + it := it + go func(it int, magnetLink string) { + magnet, err := magnet.ParseMagnetUri(magnetLink) + if err != nil { + fmt.Println(err) + } + releaseTitle := magnet.DisplayName + infoHash := magnet.InfoHash.String() + trackers := magnet.Trackers magnetAudio := []schema.Audio{} if strings.Contains(strings.ToLower(releaseTitle), "dual") { magnetAudio = append(magnetAudio, audio...) @@ -219,11 +203,7 @@ func getTorrents(ctx context.Context, i *Indexer, link string) ([]IndexedTorrent } else { magnetAudio = append(magnetAudio, audio...) } - // decode url encoded title - releaseTitle, _ = url.QueryUnescape(releaseTitle) - infoHash := extractInfoHash(magnetLink) - trackers := extractTrackers(magnetLink) peer, seed, err := goscrape.GetLeechsAndSeeds(ctx, i.redis, infoHash, trackers) if err != nil { fmt.Println(err) @@ -231,8 +211,14 @@ func getTorrents(ctx context.Context, i *Indexer, link string) ([]IndexedTorrent title := processTitle(title, magnetAudio) - it := IndexedTorrent{ - Title: releaseTitle, + // if the number of sizes is equal to the number of magnets, then assign the size to each indexed torrent in order + var mySize string + if len(size) == len(magnetLinks) { + mySize = size[it] + } + + ixt := IndexedTorrent{ + Title: appendAudioISO639_2Code(releaseTitle, magnetAudio), OriginalTitle: title, Details: link, Year: year, @@ -243,9 +229,10 @@ func getTorrents(ctx context.Context, i *Indexer, link string) ([]IndexedTorrent Trackers: trackers, LeechCount: peer, SeedCount: seed, + Size: mySize, } - chanIndexedTorrent <- it - }(magnetLink) + chanIndexedTorrent <- ixt + }(it, 
// stableUniq returns the distinct values of s in the order in which each
// value first appears.
//
// Duplicates after the first occurrence are dropped. An empty or nil input
// yields a nil slice.
func stableUniq(s []string) []string {
	if len(s) == 0 {
		return nil
	}

	seen := make(map[string]struct{}, len(s))
	uniq := make([]string, 0, len(s))
	for _, v := range s {
		if _, dup := seen[v]; dup {
			continue
		}
		// Record the value the first time it is met so that the output
		// order follows first appearance, not last.
		seen[v] = struct{}{}
		uniq = append(uniq, v)
	}
	return uniq
}
// sizeRegexp matches a number followed by an optional single space and the
// unit GB or MB. The number is one or more digits, optionally followed by a
// ',' or '.' decimal separator and more digits, so "5 GB", "700MB",
// "5.77 GB" and "9,60 GB" all match.
var sizeRegexp = regexp.MustCompile(`(\d+(?:[.,]\d+)?) ?(GB|MB)`)

// findSizesFromText returns every size expression found in text, in order of
// appearance, exactly as written in the text (number, optional space, unit).
// It returns nil when text contains no size.
func findSizesFromText(text string) []string {
	return sizeRegexp.FindAllString(text, -1)
}
+package handler + +import ( + "reflect" + "testing" + + "github.com/felipemarinho97/torrent-indexer/schema" +) + +func Test_findAudioFromText(t *testing.T) { + type args struct { + text string + } + tests := []struct { + name string + args args + want []schema.Audio + }{ + { + name: "should return audio in portuguese", + args: args{ + text: "Áudio: Português", + }, + want: []schema.Audio{ + schema.AudioPortuguese, + }, + }, + { + name: "should return audio in portuguese", + args: args{ + text: "Idioma: Português", + }, + want: []schema.Audio{ + schema.AudioPortuguese, + }, + }, + { + name: "should return audio in portuguese", + args: args{ + text: "Audio: Português", + }, + want: []schema.Audio{ + schema.AudioPortuguese, + }, + }, + { + name: "should return audio in portuguese", + args: args{ + text: ` +»INFORMAÇÕES« +Título Traduzido: O Cangaceiro do Futuro +Título Original: O Cangaceiro do Futuro +IMDb: 7,1 +Gênero:Comédia +Lançamento: 2022 +Qualidade: WEB-DL +Áudio: Português +Legenda: S/L +Formato: MKV +Tamanho: 5.77 GB | 9.60 GB +Duração: 30 Min./Ep. +Qualidade de Áudio: 10 +Qualidade de Vídeo: 10 +Servidor Via: Torrent + `, + }, + want: []schema.Audio{ + schema.AudioPortuguese, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := findAudioFromText(tt.args.text); !reflect.DeepEqual(got, tt.want) { + t.Errorf("findAudioFromText() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/magnet/infohash.go b/magnet/infohash.go new file mode 100644 index 0000000..0a6db06 --- /dev/null +++ b/magnet/infohash.go @@ -0,0 +1,80 @@ +package magnet + +import ( + "crypto/sha1" + "encoding" + "encoding/hex" + "fmt" +) + +const Size = 20 + +// 20-byte SHA1 hash used for info and pieces. 
+type T [Size]byte + +var _ fmt.Formatter = (*T)(nil) + +func (t T) Format(f fmt.State, c rune) { + // TODO: I can't figure out a nice way to just override the 'x' rune, since it's meaningless + // with the "default" 'v', or .String() already returning the hex. + f.Write([]byte(t.HexString())) +} + +func (t T) Bytes() []byte { + return t[:] +} + +func (t T) AsString() string { + return string(t[:]) +} + +func (t T) String() string { + return t.HexString() +} + +func (t T) HexString() string { + return fmt.Sprintf("%x", t[:]) +} + +func (t *T) FromHexString(s string) (err error) { + if len(s) != 2*Size { + err = fmt.Errorf("hash hex string has bad length: %d", len(s)) + return + } + n, err := hex.Decode(t[:], []byte(s)) + if err != nil { + return + } + if n != Size { + panic(n) + } + return +} + +var ( + _ encoding.TextUnmarshaler = (*T)(nil) + _ encoding.TextMarshaler = T{} +) + +func (t *T) UnmarshalText(b []byte) error { + return t.FromHexString(string(b)) +} + +func (t T) MarshalText() (text []byte, err error) { + return []byte(t.HexString()), nil +} + +func FromHexString(s string) (h T) { + err := h.FromHexString(s) + if err != nil { + panic(err) + } + return +} + +func HashBytes(b []byte) (ret T) { + hasher := sha1.New() + hasher.Write(b) + copy(ret[:], hasher.Sum(nil)) + return +} diff --git a/magnet/magnet.go b/magnet/magnet.go new file mode 100644 index 0000000..a731ac9 --- /dev/null +++ b/magnet/magnet.go @@ -0,0 +1,93 @@ +package magnet + +import ( + "encoding/base32" + "encoding/hex" + "errors" + "fmt" + "net/url" + "strings" +) + +// Magnet link components. +type Magnet struct { + InfoHash T // Expected in this implementation + Trackers []string // "tr" values + DisplayName string // "dn" value, if not empty + Params url.Values // All other values, such as "x.pe", "as", "xs" etc. +} + +const xtPrefix = "urn:btih:" + +// Deprecated: Use ParseMagnetUri. 
+var ParseMagnetURI = ParseMagnetUri + +// ParseMagnetUri parses Magnet-formatted URIs into a Magnet instance +func ParseMagnetUri(uri string) (m Magnet, err error) { + u, err := url.Parse(uri) + if err != nil { + err = fmt.Errorf("error parsing uri: %w", err) + return + } + if u.Scheme != "magnet" { + err = fmt.Errorf("unexpected scheme %q", u.Scheme) + return + } + q := u.Query() + xt := q.Get("xt") + m.InfoHash, err = parseInfohash(q.Get("xt")) + if err != nil { + err = fmt.Errorf("error parsing infohash %q: %w", xt, err) + return + } + dropFirst(q, "xt") + m.DisplayName = q.Get("dn") + dropFirst(q, "dn") + m.Trackers = q["tr"] + delete(q, "tr") + if len(q) == 0 { + q = nil + } + m.Params = q + return +} + +func parseInfohash(xt string) (ih T, err error) { + if !strings.HasPrefix(xt, xtPrefix) { + err = errors.New("bad xt parameter prefix") + return + } + encoded := xt[len(xtPrefix):] + decode := func() func(dst, src []byte) (int, error) { + switch len(encoded) { + case 40: + return hex.Decode + case 32: + return base32.StdEncoding.Decode + } + return nil + }() + if decode == nil { + err = fmt.Errorf("unhandled xt parameter encoding (encoded length %d)", len(encoded)) + return + } + n, err := decode(ih[:], []byte(encoded)) + if err != nil { + err = fmt.Errorf("error decoding xt: %w", err) + return + } + if n != 20 { + panic(n) + } + return +} + +func dropFirst(vs url.Values, key string) { + sl := vs[key] + switch len(sl) { + case 0, 1: + vs.Del(key) + default: + vs[key] = sl[1:] + } +} diff --git a/schema/audio.go b/schema/audio.go index 0605171..b1fa270 100644 --- a/schema/audio.go +++ b/schema/audio.go @@ -3,42 +3,58 @@ package schema type Audio string const ( - AudioPortuguese = "Português" - AudioEnglish = "Inglês" - AudioSpanish = "Espanhol" - AudioFrench = "Francês" - AudioGerman = "Alemão" - AudioItalian = "Italiano" - AudioJapanese = "Japonês" - AudioKorean = "Coreano" - AudioMandarin = "Mandarim" - AudioMandarin2 = "Chinês" - AudioRussian = "Russo" - 
AudioSwedish = "Sueco" - AudioUkrainian = "Ucraniano" - AudioPolish = "Polaco" - AudioPolish2 = "Polonês" - AudioThai = "Tailandês" - AudioTurkish = "Turco" + AudioPortuguese = "Português" + AudioPortuguese2 = "Portugues" + AudioEnglish = "Inglês" + AudioEnglish2 = "Ingles" + AudioSpanish = "Espanhol" + AudioFrench = "Francês" + AudioFrench2 = "Frances" + AudioGerman = "Alemão" + AudioGerman2 = "Alemao" + AudioItalian = "Italiano" + AudioJapanese = "Japonês" + AudioJapanese2 = "Japones" + AudioKorean = "Coreano" + AudioMandarin = "Mandarim" + AudioMandarin2 = "Chinês" + AudioMandarin3 = "Chines" + AudioRussian = "Russo" + AudioSwedish = "Sueco" + AudioUkrainian = "Ucraniano" + AudioPolish = "Polaco" + AudioPolish2 = "Polonês" + AudioPolish3 = "Polones" + AudioThai = "Tailandês" + AudioThai2 = "Tailandes" + AudioTurkish = "Turco" ) var AudioList = []Audio{ AudioPortuguese, + AudioPortuguese2, AudioEnglish, + AudioEnglish2, AudioSpanish, AudioFrench, + AudioFrench2, AudioGerman, + AudioGerman2, AudioItalian, AudioJapanese, + AudioJapanese2, AudioKorean, AudioMandarin, AudioMandarin2, + AudioMandarin3, AudioRussian, AudioSwedish, AudioUkrainian, AudioPolish, AudioPolish2, + AudioPolish3, AudioThai, + AudioThai2, AudioTurkish, } @@ -59,24 +75,36 @@ func (a Audio) toISO639_2() string { switch a { case AudioPortuguese: return "por" + case AudioPortuguese2: + return "por" case AudioEnglish: return "eng" + case AudioEnglish2: + return "eng" case AudioSpanish: return "spa" case AudioFrench: return "fra" + case AudioFrench2: + return "fra" case AudioGerman: return "deu" + case AudioGerman2: + return "deu" case AudioItalian: return "ita" case AudioJapanese: return "jpn" + case AudioJapanese2: + return "jpn" case AudioKorean: return "kor" case AudioMandarin: return "chi" case AudioMandarin2: return "chi" + case AudioMandarin3: + return "chi" case AudioRussian: return "rus" case AudioSwedish: @@ -87,8 +115,12 @@ func (a Audio) toISO639_2() string { return "pol" case 
AudioPolish2: return "pol" + case AudioPolish3: + return "pol" case AudioThai: return "tha" + case AudioThai2: + return "tha" case AudioTurkish: return "tur" default: