youtube: fix using the new layout and JSON extraction - frontends - front-ends for some sites (experiment)
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit a9b8d9a25d11ec18fdee7fa98ad93db35325672a
(DIR) parent 6f3fa93b7099d8bf5df5ba3fc04958aedd1bb099
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Thu, 3 Sep 2020 11:23:10 +0200
youtube: fix using the new layout and JSON extraction
Instead of scraping HTML from the site, it now extracts the initial JSON data
(the window["ytInitialData"] assignment embedded in the page) and parses it.
Diffstat:
M youtube/youtube.c | 399 ++++++++++++-------------------
1 file changed, 149 insertions(+), 250 deletions(-)
---
(DIR) diff --git a/youtube/youtube.c b/youtube/youtube.c
@@ -11,291 +11,192 @@
#include <unistd.h>
#include "https.h"
+#include "json.h"
#include "util.h"
#include "youtube.h"
-#include "xml.h"
-
-#define STRP(s) s,sizeof(s)-1
-
-/* temporary variables to copy for states */
-static char id[256], userid[256];
-
-/* states */
-static int metainfocount;
-static enum ItemState {
- None = 0,
- Item = 1, Pager = 2,
- Metainfo = 4, Title = 8, User = 16, Videotime = 32,
-} state;
-
-static struct item *videos;
-static size_t nvideos;
static char *
youtube_request(const char *path)
{
- return request("www.youtube.com", path,
- "User-Agent: Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)\r\n");
-}
-
-static int
-isclassmatch(const char *classes, const char *clss, size_t len)
-{
- const char *p;
-
- if (!(p = strstr(classes, clss)))
- return 0;
- return (p == classes || isspace((unsigned char)p[-1])) &&
- (isspace((unsigned char)p[len]) || !p[len]);
-}
-
-/* XML/HTML entity conversion */
-static const char *
-entitytostr(const char *s)
-{
- static char buf[16];
- ssize_t len;
-
- if ((len = xml_entitytostr(s, buf, sizeof(buf))) > 0)
- return buf;
-
- return s;
+ return request("www.youtube.com", path, "");
}
-static void
-xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
- const char *v, size_t vl)
+static char *
+request_search(const char *s, const char *page, const char *order)
{
- /* grouped channel index, used for channelid and channel title */
- static int grouped = -1;
-
- if (!strcmp(t, "div") && !strcmp(a, "class") && isclassmatch(v, STRP("search-pager"))) {
- /* last video */
- if (nvideos < MAX_VIDEOS && videos[nvideos].linktype) {
- if (grouped != -1 && !videos[nvideos].channelid[0]) {
- strlcpy(videos[nvideos].channelid, videos[grouped].channelid, sizeof(videos[nvideos].channelid));
- strlcpy(videos[nvideos].channeltitle, videos[grouped].channeltitle, sizeof(videos[nvideos].channeltitle));
- }
- nvideos++;
- }
- state &= ~Item;
- state |= Pager;
- }
-
- if (nvideos >= MAX_VIDEOS)
- return;
+ char path[4096];
- if (!strcmp(t, "div") && !strcmp(a, "class") &&
- isclassmatch(v, STRP("yt-lockup"))) {
- state |= Item;
- if (videos[nvideos].linktype) {
- if (videos[nvideos].channelid[0] || videos[nvideos].userid[0] ||
- videos[nvideos].linktype != Video)
- grouped = -1;
- if (videos[nvideos].linktype == Channel)
- grouped = nvideos;
- if (grouped != -1 && !videos[nvideos].channelid[0]) {
- strlcpy(videos[nvideos].channelid, videos[grouped].channelid, sizeof(videos[nvideos].channelid));
- strlcpy(videos[nvideos].channeltitle, videos[grouped].channeltitle, sizeof(videos[nvideos].channeltitle));
- }
- nvideos++;
- }
- if (strstr(v, " yt-lockup-channel "))
- videos[nvideos].linktype = Channel;
- else if (strstr(v, "yt-lockup-movie-"))
- videos[nvideos].linktype = Movie;
- else if (strstr(v, " yt-lockup-playlist "))
- videos[nvideos].linktype = Playlist;
- if (strstr(v, " yt-lockup-video "))
- videos[nvideos].linktype = Video;
- }
- if (!(state & Item))
- return;
+ snprintf(path, sizeof(path), "/results?search_query=%s", s);
- if (!strcmp(t, "span") && !strcmp(a, "class") && isclassmatch(v, STRP("video-time")))
- state |= Videotime;
- if (!strcmp(t, "ul") && !strcmp(a, "class") && isclassmatch(v, STRP("yt-lockup-meta-info"))) {
- state |= Metainfo;
- metainfocount = 0;
+ if (page[0]) {
+ strlcat(path, "&page=", sizeof(path));
+ strlcat(path, page, sizeof(path));
}
- if (!strcmp(t, "h3") && !strcmp(a, "class") && isclassmatch(v, STRP("yt-lockup-title")))
- state |= Title;
- if (!strcmp(t, "div") && !strcmp(a, "class") && isclassmatch(v, STRP("yt-lockup-byline")))
- state |= User;
- if ((state & Title) && !strcmp(t, "a") && !strcmp(a, "title")) {
- if (videos[nvideos].linktype == Channel)
- strlcat(videos[nvideos].channeltitle, v, sizeof(videos[nvideos].channeltitle));
- else
- strlcat(videos[nvideos].title, v, sizeof(videos[nvideos].title));
+ if (order[0]) {
+ strlcat(path, "&search_sort=", sizeof(path));
+ if (!strcmp(order, "date"))
+ strlcat(path, "video_date_uploaded", sizeof(path));
+ else if (!strcmp(order, "relevance"))
+ strlcat(path, "video_relevance", sizeof(path));
+ else if (!strcmp(order, "views"))
+ strlcat(path, "video_view_count", sizeof(path));
+ else if (!strcmp(order, "rating"))
+ strlcat(path, "video_avg_rating", sizeof(path));
}
- if ((state & Title) && !strcmp(t, "a") && !strcmp(a, "href"))
- strlcat(id, v, sizeof(id));
-
- if (!strcmp(t, "button") && !strcmp(a, "data-channel-external-id"))
- strlcat(videos[nvideos].channelid, v, sizeof(videos[nvideos].channelid));
+ /* check if request is too long (truncation) */
+ if (strlen(path) >= sizeof(path) - 1)
+ return NULL;
- if ((state & User) && !strcmp(t, "a") && !strcmp(a, "href"))
- strlcat(userid, v, sizeof(userid));
+ return youtube_request(path);
}
-static void
-xmlattrentity(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
- const char *v, size_t vl)
+int
+extractjson(const char *s, char **start, char **end)
{
- const char *s;
+ if (!(*start = strstr(s, "window[\"ytInitialData\"] = ")))
+ return -1;
+ if (!(*end = strstr(*start, "};\n")))
+ return -1;
- if (!(state & Pager) && nvideos >= MAX_VIDEOS)
- return;
+ (*start) += sizeof("window[\"ytInitialData\"] = ") - 1;
+ (*end)++;
- s = entitytostr(v);
- xmlattr(x, t, tl, a, al, s, strlen(s));
+ return 0;
}
-static void
-xmldata(XMLParser *x, const char *d, size_t dl)
+void
+processnode(struct json_node *nodes, size_t depth, const char *value,
+ void *pp)
{
- if ((state & Pager))
- return;
+ struct search_response *r = (struct search_response *)pp;
+ static struct item *item;
- /* optimization: no need to process and must not process videos after this */
- if (!state || nvideos >= MAX_VIDEOS)
+ if (r->nitems > MAX_VIDEOS)
return;
- /* use parsed link type for meta info since this metainfo differs per type like:
- channel, playlist, video */
- if ((state & Metainfo)) {
- switch (videos[nvideos].linktype) {
- case Playlist:
- break; /* ignore */
- case Channel:
- if (metainfocount == 1)
- strlcat(videos[nvideos].channelvideos, d, sizeof(videos[nvideos].channelvideos));
- break;
- default:
- if (metainfocount == 1)
- strlcat(videos[nvideos].publishedat, d, sizeof(videos[nvideos].publishedat));
- else if (metainfocount == 2)
- strlcat(videos[nvideos].viewcount, d, sizeof(videos[nvideos].viewcount));
- }
+ /* new item, structures can be very deep, just check the end for:
+ (items|contents)[].videoRenderer objects */
+ if (depth >= 3 &&
+ nodes[depth - 3].type == TYPE_ARRAY &&
+ nodes[depth - 2].type == TYPE_OBJECT &&
+ nodes[depth - 1].type == TYPE_OBJECT &&
+ (!strcmp(nodes[depth - 3].name, "items") ||
+ !strcmp(nodes[depth - 3].name, "contents")) &&
+ !strcmp(nodes[depth - 1].name, "videoRenderer")) {
+ r->nitems++;
+ return;
}
- if ((state & Videotime) && !strcmp(x->tag, "span"))
- strlcat(videos[nvideos].duration, d, sizeof(videos[nvideos].duration));
- if ((state & User) && !strcmp(x->tag, "a"))
- strlcat(videos[nvideos].channeltitle, d, sizeof(videos[nvideos].channeltitle));
-}
-
-static void
-xmldataentity(XMLParser *x, const char *d, size_t dl)
-{
- const char *s;
- /* optimization: no need for entity conversion */
- if (!state || nvideos >= MAX_VIDEOS)
+ if (r->nitems == 0)
return;
-
- s = entitytostr(d);
- xmldata(x, s, strlen(s));
-}
-
-static void
-xmltagend(XMLParser *x, const char *t, size_t tl, int isshort)
-{
- char *p;
-
- if ((state & Metainfo) && !strcmp(t, "ul"))
- state &= ~Metainfo;
- if ((state & Title) && !strcmp(t, "h3")) {
- state &= ~Title;
-
- if (nvideos >= MAX_VIDEOS)
- return;
-
- if (!strncmp(id, "/watch", sizeof("/watch") - 1)) {
- if (!videos[nvideos].linktype)
- videos[nvideos].linktype = Video;
- if ((p = getparam(id, "v"))) {
- if (decodeparam(videos[nvideos].id, sizeof(videos[nvideos].id), p) == -1)
- videos[nvideos].id[0] = '\0';
- }
- }
-
- id[0] = '\0';
+ item = &(r->items[r->nitems - 1]);
+
+ if (depth >= 4 &&
+ nodes[depth - 4].type == TYPE_ARRAY &&
+ nodes[depth - 3].type == TYPE_OBJECT &&
+ nodes[depth - 2].type == TYPE_OBJECT &&
+ nodes[depth - 1].type == TYPE_STRING &&
+ (!strcmp(nodes[depth - 4].name, "items") ||
+ !strcmp(nodes[depth - 4].name, "contents")) &&
+ !strcmp(nodes[depth - 2].name, "videoRenderer") &&
+ !strcmp(nodes[depth - 1].name, "videoId")) {
+ strlcpy(item->id, value, sizeof(item->id));
}
- if ((state & User)) {
- state &= ~User;
- if (nvideos >= MAX_VIDEOS)
- return;
+ if (depth >= 7 &&
+ nodes[depth - 7].type == TYPE_ARRAY &&
+ nodes[depth - 6].type == TYPE_OBJECT &&
+ nodes[depth - 5].type == TYPE_OBJECT &&
+ nodes[depth - 4].type == TYPE_OBJECT &&
+ nodes[depth - 3].type == TYPE_ARRAY &&
+ nodes[depth - 2].type == TYPE_OBJECT &&
+ nodes[depth - 1].type == TYPE_STRING &&
+ (!strcmp(nodes[depth - 7].name, "items") ||
+ !strcmp(nodes[depth - 7].name, "contents")) &&
+ !strcmp(nodes[depth - 5].name, "videoRenderer") &&
+ !strcmp(nodes[depth - 4].name, "title") &&
+ !strcmp(nodes[depth - 3].name, "runs") &&
+ !strcmp(nodes[depth - 1].name, "text") &&
+ !item->title[0]) {
+ strlcpy(item->title, value, sizeof(item->title));
+ }
- /* can be user or channel */
- if (!strncmp(userid, "/channel/", sizeof("/channel/") - 1)) {
- strlcpy(videos[nvideos].channelid,
- userid + sizeof("/channel/") - 1,
- sizeof(videos[nvideos].channelid));
- } else if (!strncmp(userid, "/user/", sizeof("/user/") - 1)) {
- strlcpy(videos[nvideos].userid,
- userid + sizeof("/user/") - 1,
- sizeof(videos[nvideos].userid));
+ if (depth >= 5 &&
+ nodes[depth - 5].type == TYPE_ARRAY &&
+ nodes[depth - 4].type == TYPE_OBJECT &&
+ nodes[depth - 3].type == TYPE_OBJECT &&
+ nodes[depth - 2].type == TYPE_OBJECT &&
+ nodes[depth - 1].type == TYPE_STRING &&
+ (!strcmp(nodes[depth - 5].name, "items") ||
+ !strcmp(nodes[depth - 5].name, "contents")) &&
+ !strcmp(nodes[depth - 3].name, "videoRenderer") &&
+ !strcmp(nodes[depth - 1].name, "simpleText")) {
+ if (!strcmp(nodes[depth - 2].name, "viewCountText") &&
+ !item->viewcount[0]) {
+ strlcpy(item->viewcount, value, sizeof(item->viewcount));
+ } else if (!strcmp(nodes[depth - 2].name, "lengthText") &&
+ !item->duration[0]) {
+ strlcpy(item->duration, value, sizeof(item->duration));
+ } else if (!strcmp(nodes[depth - 2].name, "publishedTimeText") &&
+ !item->publishedat[0]) {
+ strlcpy(item->publishedat, value, sizeof(item->publishedat));
}
-
- userid[0] = '\0';
}
- if ((state & Videotime))
- state &= ~Videotime;
-}
-static void
-xmltagstart(XMLParser *x, const char *t, size_t tl)
-{
- if ((state & Metainfo) && !strcmp(t, "li"))
- metainfocount++;
-}
-
-static char *
-request_search(const char *s, const char *page, const char *order)
-{
- char path[4096];
-
- snprintf(path, sizeof(path), "/results?search_query=%s", s);
- if (page[0]) {
- strlcat(path, "&page=", sizeof(path));
- strlcat(path, page, sizeof(path));
+ if (depth >= 9 &&
+ nodes[depth - 9].type == TYPE_ARRAY &&
+ nodes[depth - 8].type == TYPE_OBJECT &&
+ nodes[depth - 7].type == TYPE_OBJECT &&
+ nodes[depth - 6].type == TYPE_OBJECT &&
+ nodes[depth - 5].type == TYPE_ARRAY &&
+ nodes[depth - 4].type == TYPE_OBJECT &&
+ nodes[depth - 3].type == TYPE_OBJECT &&
+ nodes[depth - 2].type == TYPE_OBJECT &&
+ nodes[depth - 1].type == TYPE_STRING &&
+ (!strcmp(nodes[depth - 9].name, "items") ||
+ !strcmp(nodes[depth - 9].name, "contents")) &&
+ !strcmp(nodes[depth - 7].name, "videoRenderer") &&
+ !strcmp(nodes[depth - 6].name, "longBylineText") &&
+ !strcmp(nodes[depth - 5].name, "runs") &&
+ !strcmp(nodes[depth - 3].name, "navigationEndpoint") &&
+ !strcmp(nodes[depth - 2].name, "browseEndpoint")) {
+ if (!strcmp(nodes[depth - 1].name, "browseId")) {
+ strlcpy(item->channelid, value, sizeof(item->channelid));
+ }
}
- if (order[0]) {
- strlcat(path, "&search_sort=", sizeof(path));
- if (!strcmp(order, "date"))
- strlcat(path, "video_date_uploaded", sizeof(path));
- else if (!strcmp(order, "relevance"))
- strlcat(path, "video_relevance", sizeof(path));
- else if (!strcmp(order, "views"))
- strlcat(path, "video_view_count", sizeof(path));
- else if (!strcmp(order, "rating"))
- strlcat(path, "video_avg_rating", sizeof(path));
+ if (depth >= 7 &&
+ nodes[depth - 7].type == TYPE_ARRAY &&
+ nodes[depth - 6].type == TYPE_OBJECT &&
+ nodes[depth - 5].type == TYPE_OBJECT &&
+ nodes[depth - 4].type == TYPE_OBJECT &&
+ nodes[depth - 3].type == TYPE_ARRAY &&
+ nodes[depth - 2].type == TYPE_OBJECT &&
+ nodes[depth - 1].type == TYPE_STRING &&
+ (!strcmp(nodes[depth - 7].name, "items") ||
+ !strcmp(nodes[depth - 7].name, "contents")) &&
+ !strcmp(nodes[depth - 5].name, "videoRenderer") &&
+ !strcmp(nodes[depth - 4].name, "longBylineText") &&
+ !strcmp(nodes[depth - 3].name, "runs")) {
+ if (!strcmp(nodes[depth - 1].name, "text") &&
+ !item->channeltitle[0]) {
+ strlcpy(item->channeltitle, value, sizeof(item->channeltitle));
+ }
}
-
- /* force older youtube layout, else youtube will try to randomly serve
- a new layout sometimes breaking the parsing */
- strlcat(path, "&disable_polymer=1", sizeof(path));
-
- /* check if request is too long (truncation) */
- if (strlen(path) >= sizeof(path) - 1)
- return NULL;
-
- return youtube_request(path);
}
struct search_response *
youtube_search(const char *rawsearch, const char *page, const char *order)
{
struct search_response *r;
- XMLParser x = { 0 };
- char *data, *s;
+ char *data, *s, *start, *end;
+ int ret;
if (!(data = request_search(rawsearch, page, order)))
return NULL;
+
if (!(s = strstr(data, "\r\n\r\n")))
return NULL; /* invalid response */
/* skip header */
@@ -304,20 +205,18 @@ youtube_search(const char *rawsearch, const char *page, const char *order)
if (!(r = calloc(1, sizeof(*r))))
return NULL;
- nvideos = 0;
- videos = r->items;
-
- x.xmlattr = xmlattr;
- x.xmlattrentity = xmlattrentity;
- x.xmldata = xmldata;
- x.xmldataentity = xmldataentity;
- x.xmltagend = xmltagend;
- x.xmltagstart = xmltagstart;
-
- setxmldata(s, strlen(s));
- xml_parse(&x);
+ if (extractjson(s, &start, &end) == -1) {
+// fprintf(stderr, "error extracting JSON");
+ free(r);
+ return NULL;
+ }
- r->nitems = nvideos;
+ ret = parsejson(start, end - start, processnode, r);
+ if (ret < 0) {
+// fprintf(stderr, "error parsing JSON");
+ free(r);
+ return NULL;
+ }
return r;
}