diff --git a/client.go b/client.go index 438b8990..3cda4dc9 100644 --- a/client.go +++ b/client.go @@ -4,13 +4,30 @@ import ( "bytes" "context" "encoding/json" + "errors" "fmt" "io" "log" + "math/rand" "net/http" + "net/url" "strconv" + "sync" ) +const ( + Size1Kb = 1024 + Size1Mb = Size1Kb * 1024 + Size10Mb = Size1Mb * 10 +) + +var ( + ErrNoFormat = errors.New("no video format provided") +) + +// DefaultClient type to use. No reason to change but you could if you wanted to. +var DefaultClient = AndroidClient + // Client offers methods to download video metadata and video streams. type Client struct { // Debug enables debugging output through log package @@ -20,8 +37,24 @@ type Client struct { // If not set, http.DefaultClient will be used HTTPClient *http.Client + // MaxRoutines to use when downloading a video. + MaxRoutines int + + // ChunkSize to use when downloading videos in chunks. Default is Size10Mb. + ChunkSize int64 + // playerCache caches the JavaScript code of a player response playerCache playerCache + + client *clientInfo + + consentID string +} + +func (c *Client) assureClient() { + if c.client == nil { + c.client = &DefaultClient + } } // GetVideo fetches video metadata @@ -35,58 +68,62 @@ func (c *Client) GetVideoContext(ctx context.Context, url string) (*Video, error if err != nil { return nil, fmt.Errorf("extractVideoID failed: %w", err) } + return c.videoFromID(ctx, id) } func (c *Client) videoFromID(ctx context.Context, id string) (*Video, error) { - body, err := c.videoDataByInnertube(ctx, id, webClient) + c.assureClient() + + body, err := c.videoDataByInnertube(ctx, id) if err != nil { return nil, err } - v := &Video{ + v := Video{ ID: id, } - err = v.parseVideoInfo(body) // return early if all good - if err == nil { - return v, nil + if err = v.parseVideoInfo(body); err == nil { + return &v, nil } // If the uploader has disabled embedding the video on other sites, parse video page - if err == ErrNotPlayableInEmbed { + if errors.Is(err, 
ErrNotPlayableInEmbed) { // additional parameters are required to access clips with sensitiv content html, err := c.httpGetBodyBytes(ctx, "https://www.youtube.com/watch?v="+id+"&bpctr=9999999999&has_verified=1") if err != nil { return nil, err } - return v, v.parseVideoPage(html) + return &v, v.parseVideoPage(html) } // If the uploader marked the video as inappropriate for some ages, use embed player - if err == ErrLoginRequired { - bodyEmbed, errEmbed := c.videoDataByInnertube(ctx, id, embeddedClient) + if errors.Is(err, ErrLoginRequired) { + c.client = &EmbeddedClient + + bodyEmbed, errEmbed := c.videoDataByInnertube(ctx, id) if errEmbed == nil { errEmbed = v.parseVideoInfo(bodyEmbed) } if errEmbed == nil { - return v, nil + return &v, nil } // private video clearly not age-restricted and thus should be explicit if errEmbed == ErrVideoPrivate { - return v, errEmbed + return &v, errEmbed } // wrapping error so its clear whats happened - return v, fmt.Errorf("can't bypass age restriction: %w", errEmbed) + return &v, fmt.Errorf("can't bypass age restriction: %w", errEmbed) } // undefined error - return v, err + return &v, err } type innertubeRequest struct { @@ -94,7 +131,10 @@ type innertubeRequest struct { BrowseID string `json:"browseId,omitempty"` Continuation string `json:"continuation,omitempty"` Context inntertubeContext `json:"context"` - PlaybackContext playbackContext `json:"playbackContext,omitempty"` + PlaybackContext *playbackContext `json:"playbackContext,omitempty"` + ContentCheckOK bool `json:"contentCheckOk,omitempty"` + RacyCheckOk bool `json:"racyCheckOk,omitempty"` + Params string `json:"params"` } type playbackContext struct { @@ -102,7 +142,8 @@ type playbackContext struct { } type contentPlaybackContext struct { - SignatureTimestamp string `json:"signatureTimestamp"` + // SignatureTimestamp string `json:"signatureTimestamp"` + HTML5Preference string `json:"html5Preference"` } type inntertubeContext struct { @@ -110,68 +151,89 @@ type 
inntertubeContext struct { } type innertubeClient struct { - HL string `json:"hl"` - GL string `json:"gl"` - ClientName string `json:"clientName"` - ClientVersion string `json:"clientVersion"` + HL string `json:"hl"` + GL string `json:"gl"` + ClientName string `json:"clientName"` + ClientVersion string `json:"clientVersion"` + AndroidSDKVersion int `json:"androidSDKVersion,omitempty"` + UserAgent string `json:"userAgent,omitempty"` + TimeZone string `json:"timeZone"` + UTCOffset int `json:"utcOffsetMinutes"` } // client info for the innertube API type clientInfo struct { - name string - key string - version string + name string + key string + version string + userAgent string + androidVersion int } var ( - // might add ANDROID and other in future, but i don't see reason yet - webClient = clientInfo{ - name: "WEB", - version: "2.20210617.01.00", - key: "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8", - } - - embeddedClient = clientInfo{ - name: "WEB_EMBEDDED_PLAYER", - version: "1.19700101", - key: "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8", // seems like same key works for both clients + // WebClient, better to use Android client but go ahead. + WebClient = clientInfo{ + name: "WEB", + version: "2.20210617.01.00", + key: "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8", + userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", + } + + // AndroidClient, download go brrrrrr. + AndroidClient = clientInfo{ + name: "ANDROID", + version: "17.31.35", + key: "AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w", + userAgent: "com.google.android.youtube/17.31.35 (Linux; U; Android 11) gzip", + androidVersion: 30, + } + + // EmbeddedClient, not really tested. 
+ EmbeddedClient = clientInfo{ + name: "WEB_EMBEDDED_PLAYER", + version: "1.19700101", + key: "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8", // seems like same key works for both clients + userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", } ) -func (c *Client) videoDataByInnertube(ctx context.Context, id string, clientInfo clientInfo) ([]byte, error) { - config, err := c.getPlayerConfig(ctx, id) - if err != nil { - return nil, err - } - - // fetch sts first - sts, err := config.getSignatureTimestamp() - if err != nil { - return nil, err - } - - context := prepareInnertubeContext(clientInfo) - +func (c *Client) videoDataByInnertube(ctx context.Context, id string) ([]byte, error) { data := innertubeRequest{ - VideoID: id, - Context: context, - PlaybackContext: playbackContext{ + VideoID: id, + Context: prepareInnertubeContext(*c.client), + ContentCheckOK: true, + RacyCheckOk: true, + Params: "8AEB", + PlaybackContext: &playbackContext{ ContentPlaybackContext: contentPlaybackContext{ - SignatureTimestamp: sts, + // SignatureTimestamp: sts, + HTML5Preference: "HTML5_PREF_WANTS", }, }, } - return c.httpPostBodyBytes(ctx, "https://www.youtube.com/youtubei/v1/player?key="+clientInfo.key, data) + return c.httpPostBodyBytes(ctx, "https://www.youtube.com/youtubei/v1/player?key="+c.client.key, data) +} + +func (c *Client) transcriptDataByInnertube(ctx context.Context, id string) ([]byte, error) { + data := innertubeRequest{ + Context: prepareInnertubeContext(*c.client), + Params: transcriptVideoID(id), + } + + return c.httpPostBodyBytes(ctx, "https://www.youtube.com/youtubei/v1/get_transcript?key="+c.client.key, data) } func prepareInnertubeContext(clientInfo clientInfo) inntertubeContext { return inntertubeContext{ Client: innertubeClient{ - HL: "en", - GL: "US", - ClientName: clientInfo.name, - ClientVersion: clientInfo.version, + HL: "en", + GL: "US", + TimeZone: "UTC", + ClientName: 
clientInfo.name, + ClientVersion: clientInfo.version, + AndroidSDKVersion: clientInfo.androidVersion, + UserAgent: clientInfo.userAgent, }, } } @@ -180,10 +242,38 @@ func prepareInnertubePlaylistData(ID string, continuation bool, clientInfo clien context := prepareInnertubeContext(clientInfo) if continuation { - return innertubeRequest{Context: context, Continuation: ID} + return innertubeRequest{ + Context: context, + Continuation: ID, + ContentCheckOK: true, + RacyCheckOk: true, + Params: "8AEB", + } + } + + return innertubeRequest{ + Context: context, + BrowseID: "VL" + ID, + ContentCheckOK: true, + RacyCheckOk: true, + Params: "8AEB", } +} + +// transcriptVideoID encodes the video ID to the param used to fetch transcripts. +func transcriptVideoID(videoID string) string { + langCode := encTranscriptLang("en") - return innertubeRequest{Context: context, BrowseID: "VL" + ID} + // This can be optionally appened to the Sprintf str, not sure what it means + // *3engagement-panel-searchable-transcript-search-panel\x30\x00\x38\x01\x40\x01 + return base64Enc(fmt.Sprintf("\n\x0b%s\x12\x12%s\x18\x01", videoID, langCode)) +} + +func encTranscriptLang(languageCode string) string { + s := fmt.Sprintf("\n\x03asr\x12\x02%s\x1a\x00", languageCode) + s = base64PadEnc(s) + + return url.QueryEscape(s) } // GetPlaylist fetches playlist metadata @@ -195,13 +285,15 @@ func (c *Client) GetPlaylist(url string) (*Playlist, error) { // for these videos. Playlist entries cannot be downloaded, as they lack all the required metadata, but // can be used to enumerate all IDs, Authors, Titles, etc. 
func (c *Client) GetPlaylistContext(ctx context.Context, url string) (*Playlist, error) { + c.assureClient() + id, err := extractPlaylistID(url) if err != nil { return nil, fmt.Errorf("extractPlaylistID failed: %w", err) } - data := prepareInnertubePlaylistData(id, false, webClient) - body, err := c.httpPostBodyBytes(ctx, "https://www.youtube.com/youtubei/v1/browse?key="+webClient.key, data) + data := prepareInnertubePlaylistData(id, false, *c.client) + body, err := c.httpPostBodyBytes(ctx, "https://www.youtube.com/youtubei/v1/browse?key="+c.client.key, data) if err != nil { return nil, err } @@ -243,7 +335,19 @@ func (c *Client) GetStreamContext(ctx context.Context, video *Video, format *For contentLength = c.downloadOnce(req, w, format) } else { // we have length information, let's download by chunks! - go c.downloadChunked(req, w, format) + data, err := c.downloadChunked(ctx, req, format) + if err != nil { + return nil, 0, err + } + + go func() { + if _, err := w.Write(data); err != nil { + w.CloseWithError(err) + return + } + + w.Close() //nolint:errcheck + }() } return r, contentLength, nil @@ -252,8 +356,7 @@ func (c *Client) GetStreamContext(ctx context.Context, video *Video, format *For func (c *Client) downloadOnce(req *http.Request, w *io.PipeWriter, _ *Format) int64 { resp, err := c.httpDo(req) if err != nil { - //nolint:errcheck - w.CloseWithError(err) + w.CloseWithError(err) //nolint:errcheck return 0 } @@ -263,8 +366,7 @@ func (c *Client) downloadOnce(req *http.Request, w *io.PipeWriter, _ *Format) in if err == nil { w.Close() } else { - //nolint:errcheck - w.CloseWithError(err) + w.CloseWithError(err) //nolint:errcheck } }() @@ -274,40 +376,98 @@ func (c *Client) downloadOnce(req *http.Request, w *io.PipeWriter, _ *Format) in return length } -func (c *Client) downloadChunked(req *http.Request, w *io.PipeWriter, format *Format) { - const chunkSize int64 = 10_000_000 - // Loads a chunk a returns the written bytes. 
- // Downloading in multiple chunks is much faster: - // https://github.com/kkdai/youtube/pull/190 - loadChunk := func(pos int64) (int64, error) { - req.Header.Set("Range", fmt.Sprintf("bytes=%v-%v", pos, pos+chunkSize-1)) +type chunkData struct { + index int + data []byte +} - resp, err := c.httpDo(req) - if err != nil { - return 0, err - } - defer resp.Body.Close() +func (c *Client) getChunkSize() int64 { + if c.ChunkSize > 0 { + return c.ChunkSize + } - if resp.StatusCode != http.StatusPartialContent { - return 0, ErrUnexpectedStatusCode(resp.StatusCode) - } + return Size10Mb +} + +func (c *Client) getMaxRoutines(limit int) int { + routines := 10 + + if c.MaxRoutines > 0 { + routines = c.MaxRoutines + } + + if limit > 0 && routines > limit { + routines = limit + } + + return routines +} + +func (c *Client) downloadChunked(ctx context.Context, req *http.Request, format *Format) ([]byte, error) { + chunks := getChunks(format.ContentLength, c.getChunkSize()) + maxRoutines := c.getMaxRoutines(len(chunks)) + + chunkChan := make(chan chunk, len(chunks)) + chunkDataChan := make(chan chunkData, len(chunks)) + errChan := make(chan error, 1) + + for _, c := range chunks { + chunkChan <- c + } + close(chunkChan) - return io.Copy(w, resp.Body) + var wg sync.WaitGroup + + for i := 0; i < maxRoutines; i++ { + wg.Add(1) + + go func() { + defer wg.Done() + + for { + select { + case <-ctx.Done(): + errChan <- context.DeadlineExceeded + return + case ch, open := <-chunkChan: + if !open { + return + } + + data, err := c.downloadChunk(req.Clone(ctx), ch) + if err != nil { + errChan <- err + return + } + + chunkDataChan <- chunkData{ch.index, data} + } + } + }() } + wg.Wait() - defer w.Close() + close(errChan) + close(chunkDataChan) - //nolint:revive,errcheck - // load all the chunks - for pos := int64(0); pos < format.ContentLength; { - written, err := loadChunk(pos) + for err := range errChan { if err != nil { - w.CloseWithError(err) - return + return nil, err } + } - pos += 
written + chunkDatas := make([]chunkData, len(chunks)) + + for cd := range chunkDataChan { + chunkDatas[cd.index] = cd + } + + data := make([]byte, 0, format.ContentLength) + for _, chunk := range chunkDatas { + data = append(data, chunk.data...) } + + return data, nil } // GetStreamURL returns the url for a specific format @@ -317,10 +477,20 @@ func (c *Client) GetStreamURL(video *Video, format *Format) (string, error) { // GetStreamURLContext returns the url for a specific format with a context func (c *Client) GetStreamURLContext(ctx context.Context, video *Video, format *Format) (string, error) { + if format == nil { + return "", ErrNoFormat + } + if format.URL != "" { + if c.client.androidVersion > 0 { + return format.URL, nil + } + return c.unThrottle(ctx, video.ID, format.URL) } + // TODO: check rest of this function, is it redundant? + cipher := format.Cipher if cipher == "" { return "", ErrCipherNotFound @@ -345,6 +515,21 @@ func (c *Client) httpDo(req *http.Request) (*http.Response, error) { log.Println(req.Method, req.URL) } + req.Header.Set("User-Agent", c.client.userAgent) + req.Header.Set("Origin", "https://youtube.com") + req.Header.Set("Sec-Fetch-Mode", "navigate") + + if len(c.consentID) == 0 { + c.consentID = strconv.Itoa(rand.Intn(899) + 100) //nolint:gosec + } + + req.AddCookie(&http.Cookie{ + Name: "CONSENT", + Value: "YES+cb.20210328-17-p0.en+FX+" + c.consentID, + Path: "/", + Domain: ".youtube.com", + }) + res, err := client.Do(req) if c.Debug && res != nil { @@ -370,6 +555,7 @@ func (c *Client) httpGet(ctx context.Context, url string) (*http.Response, error resp.Body.Close() return nil, ErrUnexpectedStatusCode(resp.StatusCode) } + return resp, nil } @@ -396,6 +582,11 @@ func (c *Client) httpPost(ctx context.Context, url string, body interface{}) (*h return nil, err } + req.Header.Set("X-Youtube-Client-Name", "3") + req.Header.Set("X-Youtube-Client-Version", c.client.version) + req.Header.Set("Content-Type", "application/json") + 
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") + resp, err := c.httpDo(req) if err != nil { return nil, err @@ -405,6 +596,7 @@ func (c *Client) httpPost(ctx context.Context, url string, body interface{}) (*h resp.Body.Close() return nil, ErrUnexpectedStatusCode(resp.StatusCode) } + return resp, nil } @@ -418,3 +610,29 @@ func (c *Client) httpPostBodyBytes(ctx context.Context, url string, body interfa return io.ReadAll(resp.Body) } + +// downloadChunk returns the chunk bytes. +// Downloading in multiple chunks is much faster: +// https://github.com/kkdai/youtube/pull/190 +func (c *Client) downloadChunk(req *http.Request, chunk chunk) ([]byte, error) { + q := req.URL.Query() + q.Set("range", fmt.Sprintf("%d-%d", chunk.start, chunk.end)) + req.URL.RawQuery = q.Encode() + + resp, err := c.httpDo(req) + if err != nil { + return nil, ErrUnexpectedStatusCode(resp.StatusCode) + } + defer resp.Body.Close() + + if resp.StatusCode < http.StatusOK && resp.StatusCode >= 300 { + return nil, ErrUnexpectedStatusCode(resp.StatusCode) + } + + b, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("read chunk body: %w", err) + } + + return b, nil +} diff --git a/client_test.go b/client_test.go index 3d6205e0..92df2d66 100644 --- a/client_test.go +++ b/client_test.go @@ -83,7 +83,7 @@ func TestGetVideoWithoutManifestURL(t *testing.T) { assert, require := assert.New(t), require.New(t) video, err := testClient.GetVideo(dwlURL) - require.NoError(err) + require.NoError(err, "get video") require.NotNil(video) assert.NotEmpty(video.Thumbnails) @@ -95,9 +95,11 @@ func TestGetVideoWithoutManifestURL(t *testing.T) { assert.Equal("rFejpH_tAHM", video.ID) assert.Equal("dotGo 2015 - Rob Pike - Simplicity is Complicated", video.Title) assert.Equal("dotconferences", video.Author) - assert.Equal(1392*time.Second, video.Duration) + assert.GreaterOrEqual(video.Duration, 1390*time.Second) assert.Contains(video.Description, "Go is often 
described as a simple language.") - assert.Equal("2015-12-02 00:00:00 +0000 UTC", video.PublishDate.String()) + + // Publishing date doesn't seem to be present in android client + // assert.Equal("2015-12-02 00:00:00 +0000 UTC", video.PublishDate.String()) } func TestGetVideoWithManifestURL(t *testing.T) { @@ -175,8 +177,10 @@ func TestGetBigPlaylist(t *testing.T) { assert.NotEmpty(playlist.Description) assert.NotEmpty(playlist.Author) - assert.Greater(len(playlist.Videos), 100) - assert.NotEmpty(playlist.Videos[100].ID) + assert.Greater(len(playlist.Videos), 300) + assert.NotEmpty(playlist.Videos[300].ID) + + t.Logf("Playlist Title: %s, Video Count: %d", playlist.Title, len(playlist.Videos)) } func TestClient_httpGetBodyBytes(t *testing.T) { diff --git a/cmd/youtubedr/downloader.go b/cmd/youtubedr/downloader.go index 657608e8..3a478940 100644 --- a/cmd/youtubedr/downloader.go +++ b/cmd/youtubedr/downloader.go @@ -11,10 +11,11 @@ import ( "strconv" "time" - "github.com/kkdai/youtube/v2" - ytdl "github.com/kkdai/youtube/v2/downloader" "github.com/spf13/pflag" "golang.org/x/net/http/httpproxy" + + "github.com/kkdai/youtube/v2" + ytdl "github.com/kkdai/youtube/v2/downloader" ) var ( @@ -94,7 +95,7 @@ func getVideoWithFormat(id string) (*youtube.Video, *youtube.Format, error) { } case outputQuality != "": - format = formats.WithAudioChannels().FindByQuality(outputQuality) + format = formats.FindByQuality(outputQuality) if format == nil { return nil, nil, fmt.Errorf("unable to find format with quality %s", outputQuality) } diff --git a/decipher.go b/decipher.go index c5121752..a8ce33da 100644 --- a/decipher.go +++ b/decipher.go @@ -283,12 +283,3 @@ func (config playerConfig) parseDecipherOps() (operations []DecipherOperation, e } return ops, nil } - -func (config playerConfig) getSignatureTimestamp() (string, error) { - result := signatureRegexp.FindSubmatch(config) - if result == nil { - return "", ErrSignatureTimestampNotFound - } - - return string(result[1]), nil -} 
diff --git a/downloader/downloader_test.go b/downloader/downloader_test.go index a94468ff..84910715 100644 --- a/downloader/downloader_test.go +++ b/downloader/downloader_test.go @@ -7,9 +7,10 @@ import ( "testing" "time" - "github.com/kkdai/youtube/v2" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + + "github.com/kkdai/youtube/v2" ) var testDownloader = func() (dl Downloader) { @@ -39,7 +40,7 @@ func TestDownload_FirstStream(t *testing.T) { assert.Equal(`youtube-dl test video "'/\ä↭𝕐`, video.Title) assert.Equal(`Philipp Hagemeister`, video.Author) assert.Equal(10*time.Second, video.Duration) - assert.Len(video.Formats, 18) + assert.GreaterOrEqual(len(video.Formats), 18) if assert.Greater(len(video.Formats), 0) { assert.NoError(testDownloader.Download(ctx, video, &video.Formats[0], "")) diff --git a/fetch_testdata_helper.go b/fetch_testdata_helper.go index 9d106729..989970c3 100644 --- a/fetch_testdata_helper.go +++ b/fetch_testdata_helper.go @@ -1,3 +1,4 @@ +//go:build fetch // +build fetch package youtube diff --git a/format_list.go b/format_list.go index e2a0ae7e..827d2f0d 100644 --- a/format_list.go +++ b/format_list.go @@ -9,6 +9,8 @@ import ( type FormatList []Format // FindByQuality returns the first format matching Quality or QualityLabel +// +// Examples: tiny, small, medium, large, 720p, hd720, hd1080 func (list FormatList) FindByQuality(quality string) *Format { for i := range list { if list[i].Quality == quality || list[i].QualityLabel == quality { diff --git a/itag_test.go b/itag_test.go index 66e83bcb..bb919ce2 100644 --- a/itag_test.go +++ b/itag_test.go @@ -14,5 +14,5 @@ func TestYoutube_GetItagInfo(t *testing.T) { url := "https://www.youtube.com/watch?v=rFejpH_tAHM" video, err := client.GetVideo(url) require.NoError(err) - require.Len(video.Formats, 24) + require.GreaterOrEqual(len(video.Formats), 24) } diff --git a/player_parse.go b/player_parse.go index cf551aec..7790eeb2 100644 --- a/player_parse.go +++ 
b/player_parse.go @@ -15,11 +15,7 @@ type playerConfig []byte var basejsPattern = regexp.MustCompile(`(/s/player/\w+/player_ias.vflset/\w+/base.js)`) -// we may use \d{5} instead of \d+ since currently its 5 digits, but i can't be sure it will be 5 digits always -var signatureRegexp = regexp.MustCompile(`(?m)(?:^|,)(?:signatureTimestamp:)(\d+)`) - func (c *Client) getPlayerConfig(ctx context.Context, videoID string) (playerConfig, error) { - embedURL := fmt.Sprintf("https://youtube.com/embed/%s?hl=en", videoID) embedBody, err := c.httpGetBodyBytes(ctx, embedURL) if err != nil { diff --git a/playlist.go b/playlist.go index 4b73488d..60462f56 100644 --- a/playlist.go +++ b/playlist.go @@ -5,6 +5,7 @@ import ( "encoding/json" "fmt" "regexp" + "runtime/debug" "strconv" "time" @@ -68,8 +69,9 @@ func (p *Playlist) parsePlaylistInfo(ctx context.Context, client *Client, body [ } defer func() { + stack := debug.Stack() if r := recover(); r != nil { - err = fmt.Errorf("JSON parsing error: %v", r) + err = fmt.Errorf("JSON parsing error: %v\n%s", r, stack) } }() @@ -80,27 +82,70 @@ func (p *Playlist) parsePlaylistInfo(ctx context.Context, client *Client, body [ return ErrPlaylistStatus{Reason: message} } - p.Title = j.GetPath("metadata", "playlistMetadataRenderer", "title").MustString() - p.Description = j.GetPath("metadata", "playlistMetadataRenderer", "description").MustString() + // Metadata can be located in multiple places depending on client type + var metadata *sjson.Json + if node, ok := j.CheckGet("metadata"); ok { + metadata = node + } else if node, ok := j.CheckGet("header"); ok { + metadata = node + } else { + return fmt.Errorf("no playlist header / metadata found") + } + + metadata = metadata.Get("playlistHeaderRenderer") + + p.Title = sjsonGetText(metadata, "title") + p.Description = sjsonGetText(metadata, "description", "descriptionText") p.Author = j.GetPath("sidebar", "playlistSidebarRenderer", "items").GetIndex(1). 
GetPath("playlistSidebarSecondaryInfoRenderer", "videoOwner", "videoOwnerRenderer", "title", "runs"). GetIndex(0).Get("text").MustString() - vJSON, err := j.GetPath("contents", "twoColumnBrowseResultsRenderer", "tabs").GetIndex(0). - GetPath("tabRenderer", "content", "sectionListRenderer", "contents").GetIndex(0). - GetPath("itemSectionRenderer", "contents").GetIndex(0). - GetPath("playlistVideoListRenderer", "contents").MarshalJSON() + + if len(p.Author) == 0 { + p.Author = sjsonGetText(metadata, "owner", "ownerText") + } + + contents, ok := j.CheckGet("contents") + if !ok { + return fmt.Errorf("contents not found in json body") + } + + // contents can have different keys with same child structure + firstPart := getFirstKeyJSON(contents).GetPath("tabs").GetIndex(0). + GetPath("tabRenderer", "content", "sectionListRenderer", "contents").GetIndex(0) + + // This extra nested item is only set with the web client + if n := firstPart.GetPath("itemSectionRenderer", "contents").GetIndex(0); isValidJSON(n) { + firstPart = n + } + + vJSON, err := firstPart.GetPath("playlistVideoListRenderer", "contents").MarshalJSON() + if err != nil { + return err + } + + if len(vJSON) <= 4 { + return fmt.Errorf("no video data found in JSON") + } entries, continuation, err := extractPlaylistEntries(vJSON) if err != nil { return err } + if len(continuation) == 0 { + continuation = getContinuation(firstPart.Get("playlistVideoListRenderer")) + } + + if len(entries) == 0 { + return fmt.Errorf("no videos found in playlist") + } + p.Videos = entries for continuation != "" { - data := prepareInnertubePlaylistData(continuation, true, webClient) + data := prepareInnertubePlaylistData(continuation, true, *client.client) - body, err := client.httpPostBodyBytes(ctx, "https://www.youtube.com/youtubei/v1/browse?key="+webClient.key, data) + body, err := client.httpPostBodyBytes(ctx, "https://www.youtube.com/youtubei/v1/browse?key="+client.client.key, data) if err != nil { return err } @@ -110,9 +155,14 
@@ func (p *Playlist) parsePlaylistInfo(ctx context.Context, client *Client, body [ return err } - vJSON, err := j.GetPath("onResponseReceivedActions").GetIndex(0). - GetPath("appendContinuationItemsAction", "continuationItems").MarshalJSON() + next := j.GetPath("onResponseReceivedActions").GetIndex(0). + GetPath("appendContinuationItemsAction", "continuationItems") + + if !isValidJSON(next) { + next = j.GetPath("continuationContents", "playlistVideoListContinuation", "contents") + } + vJSON, err := next.MarshalJSON() if err != nil { return err } @@ -122,7 +172,13 @@ func (p *Playlist) parsePlaylistInfo(ctx context.Context, client *Client, body [ return err } - p.Videos, continuation = append(p.Videos, entries...), token + if len(token) > 0 { + continuation = token + } else { + continuation = getContinuation(j.GetPath("continuationContents", "playlistVideoListContinuation")) + } + + p.Videos = append(p.Videos, entries...) } return err diff --git a/transcript.go b/transcript.go new file mode 100644 index 00000000..84571f45 --- /dev/null +++ b/transcript.go @@ -0,0 +1,214 @@ +package youtube + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "strconv" + "strings" +) + +var ( + ErrTranscriptDisabled = errors.New("transcript is disabled on this video") +) + +// TranscriptSegment is a single transcipt segment spanning a few milliseconds. +type TranscriptSegment struct { + // Text is the transcipt text. + Text string `json:"text"` + + // StartMs is the start timestamp in ms. + StartMs int `json:"offset"` + + // OffsetText e.g. '4:00'. + OffsetText string `json:"offsetText"` + + // Duration the transcript segment spans in ms. 
+ Duration int `json:"duration"` +} + +func (tr TranscriptSegment) String() string { + return tr.OffsetText + " - " + strings.TrimSpace(tr.Text) +} + +type VideoTranscript []TranscriptSegment + +func (vt VideoTranscript) String() string { + var str string + for _, tr := range vt { + str += tr.String() + "\n" + } + + return str +} + +// GetTranscript fetches the video transcript if available. +// +// Not all videos have transcripts, only relatively new videos. +// If transcripts are disabled or not available, ErrTranscriptDisabled is returned. +func (c *Client) GetTranscript(video *Video) (VideoTranscript, error) { + return c.GetTranscriptCtx(context.Background(), video) +} + +// GetTranscriptCtx fetches the video transcript if available. +// +// Not all videos have transcripts, only relatively new videos. +// If transcripts are disabled or not available, ErrTranscriptDisabled is returned. +func (c *Client) GetTranscriptCtx(ctx context.Context, video *Video) (VideoTranscript, error) { + c.assureClient() + + if video == nil || video.ID == "" { + return nil, fmt.Errorf("no video provided") + } + + body, err := c.transcriptDataByInnertube(ctx, video.ID) + if err != nil { + return nil, err + } + + transcript, err := parseTranscript(body) + if err != nil { + return nil, err + } + + return transcript, nil +} + +func parseTranscript(body []byte) (VideoTranscript, error) { + var resp transcriptResp + if err := json.Unmarshal(body, &resp); err != nil { + return nil, err + } + + if len(resp.Actions) > 0 { + // Android client response + if app := resp.Actions[0].AppSegment; app != nil { + return getSegments(app) + } + + // Web client response + if web := resp.Actions[0].WebSegment; web != nil { + return nil, fmt.Errorf("not implemented") + } + } + + return nil, ErrTranscriptDisabled +} + +type segmenter interface { + ParseSegments() []TranscriptSegment +} + +func getSegments(f segmenter) (VideoTranscript, error) { + if segments := f.ParseSegments(); len(segments) > 0 { + 
return segments, nil + } + + return nil, ErrTranscriptDisabled +} + +// transcriptResp is the JSON structure as returned by the transcript API. +type transcriptResp struct { + Actions []struct { + AppSegment *appData `json:"elementsCommand"` + WebSegment *webData `json:"updateEngagementPanelAction"` + } `json:"actions"` +} + +type appData struct { + TEC struct { + Args struct { + ListArgs struct { + Ow struct { + InitialSeg []struct { + TranscriptSegment struct { + StartMs string `json:"startMs"` + EndMs string `json:"endMs"` + Text struct { + String struct { + // Content is the actual transctipt text + Content string `json:"content"` + } `json:"elementsAttributedString"` + } `json:"snippet"` + StartTimeText struct { + String struct { + // Content is the fomratted timestamp, e.g. '4:00' + Content string `json:"content"` + } `json:"elementsAttributedString"` + } `json:"startTimeText"` + } `json:"transcriptSegmentRenderer"` + } `json:"initialSegments"` + } `json:"overwrite"` + } `json:"transformTranscriptSegmentListArguments"` + } `json:"arguments"` + } `json:"transformEntityCommand"` +} + +func (s *appData) ParseSegments() []TranscriptSegment { + rawSegments := s.TEC.Args.ListArgs.Ow.InitialSeg + segments := make([]TranscriptSegment, 0, len(rawSegments)) + + for _, segment := range rawSegments { + startMs, _ := strconv.Atoi(segment.TranscriptSegment.StartMs) + endMs, _ := strconv.Atoi(segment.TranscriptSegment.EndMs) + + segments = append(segments, TranscriptSegment{ + Text: segment.TranscriptSegment.Text.String.Content, + StartMs: startMs, + OffsetText: segment.TranscriptSegment.StartTimeText.String.Content, + Duration: endMs - startMs, + }) + } + + return segments +} + +type webData struct { + Content struct { + TR struct { + Body struct { + TBR struct { + Cues []struct { + Transcript struct { + FormattedStartOffset struct { + SimpleText string `json:"simpleText"` + } `json:"formattedStartOffset"` + Cues []struct { + TranscriptCueRenderer struct { + Cue struct { + 
SimpleText string `json:"simpleText"` + } `json:"cue"` + StartOffsetMs string `json:"startOffsetMs"` + DurationMs string `json:"durationMs"` + } `json:"transcriptCueRenderer"` + } `json:"cues"` + } `json:"transcriptCueGroupRenderer"` + } `json:"cueGroups"` + } `json:"transcriptSearchPanelRenderer"` + } `json:"content"` + } `json:"transcriptRenderer"` + } `json:"content"` +} + +func (s *webData) ParseSegments() []TranscriptSegment { + // TODO: doesn't actually work now, check json. + cues := s.Content.TR.Body.TBR.Cues + segments := make([]TranscriptSegment, 0, len(cues)) + + for _, s := range cues { + formatted := s.Transcript.FormattedStartOffset.SimpleText + segment := s.Transcript.Cues[0].TranscriptCueRenderer + start, _ := strconv.Atoi(segment.StartOffsetMs) + duration, _ := strconv.Atoi(segment.DurationMs) + + segments = append(segments, TranscriptSegment{ + Text: segment.Cue.SimpleText, + StartMs: start, + OffsetText: formatted, + Duration: duration, + }) + } + + return segments +} diff --git a/transcript_test.go b/transcript_test.go new file mode 100644 index 00000000..748a3610 --- /dev/null +++ b/transcript_test.go @@ -0,0 +1,32 @@ +package youtube + +import ( + "strconv" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestTranscript(t *testing.T) { + client := Client{Debug: true} + + video := &Video{ID: "9_MbW9FK1fA"} + + transcript, err := client.GetTranscript(video) + require.NoError(t, err, "get transcript") + require.Greater(t, len(transcript), 0, "no transcript segments found") + + for i, segment := range transcript { + index := strconv.Itoa(i) + + require.NotEmpty(t, segment.Text, "text "+index) + require.NotEmpty(t, segment.Duration, "duration "+index) + require.NotEmpty(t, segment.OffsetText, "offset "+index) + + if i != 0 { + require.NotEmpty(t, segment.StartMs, "startMs "+index) + } + } + + t.Log(transcript.String()) +} diff --git a/utils.go b/utils.go new file mode 100644 index 00000000..4704a2aa --- /dev/null +++ b/utils.go @@ 
// chunk describes one contiguous byte range of a larger payload.
type chunk struct {
	index int   // position of the chunk in the sequence, starting at 0
	start int64 // offset of the first byte, inclusive
	end   int64 // offset of the last byte, inclusive
}

// getChunks splits a payload of totalSize bytes into consecutive ranges of at
// most chunkSize bytes each. Ranges are returned in ascending order with
// inclusive start and end offsets; the last range is shortened so that it ends
// at totalSize-1.
//
// It returns nil when totalSize or chunkSize is not positive. In particular a
// zero chunkSize no longer reaches the float division, whose Inf/NaN result has
// an implementation-defined integer conversion and could make the loop run
// without bound.
func getChunks(totalSize, chunkSize int64) []chunk {
	if totalSize <= 0 || chunkSize <= 0 {
		return nil
	}

	// Computed once: the number of chunks does not change while iterating.
	count := int(math.Ceil(float64(totalSize) / float64(chunkSize)))
	chunks := make([]chunk, 0, count)

	for i := 0; i < count; i++ {
		start := int64(i) * chunkSize
		end := start + chunkSize - 1
		if end >= totalSize {
			end = totalSize - 1
		}

		chunks = append(chunks, chunk{index: i, start: start, end: end})
	}

	return chunks
}
// base64PadEnc returns the standard base64 encoding of str, including the
// trailing '=' padding characters.
func base64PadEnc(str string) string {
	raw := []byte(str)
	encoded := make([]byte, base64.StdEncoding.EncodedLen(len(raw)))
	base64.StdEncoding.Encode(encoded, raw)

	return string(encoded)
}

// base64Enc returns the standard base64 encoding of str without any padding
// characters.
func base64Enc(str string) string {
	raw := []byte(str)
	encoded := make([]byte, base64.RawStdEncoding.EncodedLen(len(raw)))
	base64.RawStdEncoding.Encode(encoded, raw)

	return string(encoded)
}
youtube only provides separate streams for video and audio. + // If you want audio and video combined, take a look a the downloader package. + format := video.Formats.FindByQuality("hd1080") + + start := time.Now() + reader, _, err := client.GetStream(video, format) + require.NoError(t, err, "get stream") + + t.Log("Duration Milliseconds: ", time.Since(start).Milliseconds()) + + // do something with the reader + b, err := io.ReadAll(reader) + require.NoError(t, err, "read body") + + t.Log("Downloaded ", len(b)) +} + func TestDownload_Regular(t *testing.T) { testcases := []struct {