extractjson

extract embedded JSON metadata from HTML pages
git clone git://git.codemadness.org/extractjson
Log | Files | Refs | README | LICENSE

commit ad11115ba705c4c5f88f0679f2f807e4d0883970
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date:   Sun, 14 Aug 2022 11:58:19 +0200

initial repo

Diffstat:
A LICENSE | 15 +++++++++++++++
A Makefile | 5 +++++
A README | 15 +++++++++++++++
A extractjson.1 | 29 +++++++++++++++++++++++++++++
A extractjson.c | 341 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 405 insertions(+), 0 deletions(-)

diff --git a/LICENSE b/LICENSE @@ -0,0 +1,15 @@ +ISC License + +Copyright (c) 2022 Hiltjo Posthuma <hiltjo@codemadness.org> + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. diff --git a/Makefile b/Makefile @@ -0,0 +1,5 @@ +build: clean + ${CC} -o extractjson extractjson.c ${CFLAGS} ${LDFLAGS} + +clean: + rm -f *.o extractjson diff --git a/README b/README @@ -0,0 +1,15 @@ +extractjson +----------- + +Extracts embedded JSON metadata from HTML pages, such as data in the tags: +<script type="application/ld+json"> + +It reads HTML from stdin and outputs JSON per line to stdout. + +Example: + + curl -s https://www.imdb.com/title/tt0107048/ | extractjson | sed 1q | json2tsv + +This extracts the JSON metadata from the IMDB page of Ground Hog Day. +It uses the first embedded JSON fragment and pipes it to json2tsv. +It can then be further processed using awk to get the relevant data. diff --git a/extractjson.1 b/extractjson.1 @@ -0,0 +1,29 @@ +.Dd May 2, 2022 +.Dt EXTRACTJSON 1 +.Os +.Sh NAME +.Nm extractjson +.Nd extracts embedded JSON metadata from HTML pages +.Sh SYNOPSIS +.Nm +.Sh DESCRIPTION +.Nm +extracts embedded JSON metadata from HTML pages, such as data in the tags: +<script type="application/ld+json"> +.Pp +It reads HTML from stdin and outputs JSON per line to stdout. 
+.Sh EXIT STATUS +.Ex -std +.Sh EXAMPLES +.Bd -literal +curl -s https://www.imdb.com/title/tt0107048/ | extractjson | sed 1q | json2tsv +.Ed +.Pp +This extracts the JSON metadata from the IMDB page of the movie "Ground Hog Day". +It uses the first embedded JSON fragment and pipes it to json2tsv. +It can then be further processed using awk to get the relevant data. +.Sh SEE ALSO +.Xr curl 1 , +.Xr json2tsv 1 +.Sh AUTHORS +.An Hiltjo Posthuma Aq Mt hiltjo@codemadness.org diff --git a/extractjson.c b/extractjson.c @@ -0,0 +1,341 @@ +#include <ctype.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> + +#define GETNEXT getnext + +#define ISALPHA(c) ((((unsigned)c) | 32) - 'a' < 26) +#define ISSPACE(c) ((c) == ' ' || ((((unsigned)c) - '\t') < 5)) + +typedef struct xmlparser { + /* current tag */ + char tag[1024]; + size_t taglen; + /* current tag is in shortform ? <tag /> */ + int isshorttag; + /* current attribute name */ + char name[1024]; + /* data buffer used for tag data, cdata and attribute data */ + char data[BUFSIZ]; +} XMLParser; + +static XMLParser parser; +static int isjson; +static const char *ignorestate, *endtag; +static int (*getnext)(void) = getchar; + +/* ignore parsing all HTML data inside <script> tags, because they may contain + characters such as '<' and '>' */ +static int +getnext_json(void) +{ + int c; + + if ((c = getchar()) == EOF) + return EOF; + + if (tolower(c) == tolower((unsigned char)*ignorestate)) { + ignorestate++; + if (*ignorestate == '\0') { + getnext = getchar; /* restore */ + putchar('\n'); + isjson = 0; + return c; + } + + } else { + ignorestate = endtag; + if (c != '\r' && c != '\n') + putchar(c); + } + + return ' '; +} + +static void +xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al, + const char *v, size_t vl) +{ + if (!strcasecmp(t, "script") && + !strcasecmp(a, "type") && + (strstr(v, "application/json") || + strstr(v, "application/ld+json") || + 
strstr(v, "text/json"))) + isjson = 1; +} + +static void +xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort) +{ + if (!strcasecmp(t, "script") && isjson) { + ignorestate = endtag = "</script>"; + getnext = getnext_json; + return; + } +} + +static void +xml_parseattrs(XMLParser *x) +{ + size_t namelen = 0, valuelen; + int c, endsep, endname = 0, valuestart = 0; + + while ((c = GETNEXT()) != EOF) { + if (ISSPACE(c)) { + if (namelen) + endname = 1; + continue; + } else if (c == '?') + ; /* ignore */ + else if (c == '=') { + x->name[namelen] = '\0'; + valuestart = 1; + endname = 1; + } else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) { + /* attribute without value */ + xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0); + x->name[namelen] = '\0'; + endname = 0; + x->name[0] = c; + namelen = 1; + } else if (namelen && valuestart) { + /* attribute with value */ + valuelen = 0; + if (c == '\'' || c == '"') { + endsep = c; + } else { + endsep = ' '; /* ISSPACE() */ + goto startvalue; + } + + while ((c = GETNEXT()) != EOF) { +startvalue: + if (c == '&') { /* entities */ + x->data[valuelen] = '\0'; + /* call data function with data before entity if there is data */ + if (valuelen) + xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); + x->data[0] = c; + valuelen = 1; + while ((c = GETNEXT()) != EOF) { + if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) + break; + if (valuelen < sizeof(x->data) - 1) + x->data[valuelen++] = c; + else { + /* entity too long for buffer, handle as normal data */ + x->data[valuelen] = '\0'; + xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); + x->data[0] = c; + valuelen = 1; + break; + } + if (c == ';') { + x->data[valuelen] = '\0'; + valuelen = 0; + break; + } + } + } else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) { + if (valuelen < sizeof(x->data) - 1) { + x->data[valuelen++] = c; + } else { + x->data[valuelen] = 
'\0'; + xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); + x->data[0] = c; + valuelen = 1; + } + } + if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) { + x->data[valuelen] = '\0'; + xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); + break; + } + } + namelen = endname = valuestart = 0; + } else if (namelen < sizeof(x->name) - 1) { + x->name[namelen++] = c; + } + if (c == '>') { + break; + } else if (c == '/') { + x->isshorttag = 1; + x->name[0] = '\0'; + namelen = 0; + } + } +} + +static void +xml_parsecomment(XMLParser *x) +{ + int c, i = 0; + + while ((c = GETNEXT()) != EOF) { + if (c == '-') { + if (++i > 2) + i = 2; + continue; + } else if (c == '>' && i == 2) { + return; + } else if (i) { + i = 0; + } + } +} + +static void +xml_parsecdata(XMLParser *x) +{ + size_t datalen = 0, i = 0; + int c; + + while ((c = GETNEXT()) != EOF) { + if (c == ']') { + if (++i > 2) + i = 2; + continue; + } else if (c == '>' && i == 2) { + return; + } else if (i) { + i = 0; + } + + if (datalen < sizeof(x->data) - 1) { + x->data[datalen++] = c; + } else { + x->data[datalen] = '\0'; + x->data[0] = c; + datalen = 1; + } + } +} + +static void +xml_parse(XMLParser *x) +{ + size_t datalen, tagdatalen; + int c, isend; + + while ((c = GETNEXT()) != EOF && c != '<') + ; /* skip until < */ + + while (c != EOF) { + if (c == '<') { /* parse tag */ + if ((c = GETNEXT()) == EOF) + return; + + if (c == '!') { /* cdata and comments */ + for (tagdatalen = 0; (c = GETNEXT()) != EOF;) { + /* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */ + if (tagdatalen <= sizeof("[CDATA[") - 1) + x->data[tagdatalen++] = c; + if (c == '>') + break; + else if (c == '-' && tagdatalen == sizeof("--") - 1 && + (x->data[0] == '-')) { + xml_parsecomment(x); + break; + } else if (c == '[') { + if (tagdatalen == sizeof("[CDATA[") - 1 && + !strncmp(x->data, "[CDATA[", tagdatalen)) { + xml_parsecdata(x); + break; + } + } + } + } else { + /* normal tag (open, 
short open, close), processing instruction. */ + x->tag[0] = c; + x->taglen = 1; + x->isshorttag = isend = 0; + + /* treat processing instruction as shorttag, don't strip "?" prefix. */ + if (c == '?') { + x->isshorttag = 1; + } else if (c == '/') { + if ((c = GETNEXT()) == EOF) + return; + x->tag[0] = c; + isend = 1; + } + + while ((c = GETNEXT()) != EOF) { + if (c == '/') + x->isshorttag = 1; /* short tag */ + else if (c == '>' || ISSPACE(c)) { + x->tag[x->taglen] = '\0'; + if (isend) { /* end tag, starts with </ */ + x->tag[0] = '\0'; + x->taglen = 0; + } else { + /* start tag */ + if (ISSPACE(c)) + xml_parseattrs(x); + xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag); + } + /* call tagend for shortform or processing instruction */ + if (x->isshorttag) { + x->tag[0] = '\0'; + x->taglen = 0; + } + break; + } else if (x->taglen < sizeof(x->tag) - 1) + x->tag[x->taglen++] = c; /* NOTE: tag name truncation */ + } + } + } else { + /* parse tag data */ + datalen = 0; + while ((c = GETNEXT()) != EOF) { + if (c == '&') { + if (datalen) + x->data[datalen] = '\0'; + x->data[0] = c; + datalen = 1; + while ((c = GETNEXT()) != EOF) { + if (c == '<') + break; + if (datalen < sizeof(x->data) - 1) + x->data[datalen++] = c; + else { + /* entity too long for buffer, handle as normal data */ + x->data[datalen] = '\0'; + x->data[0] = c; + datalen = 1; + break; + } + if (c == ';') { + x->data[datalen] = '\0'; + datalen = 0; + break; + } + } + } else if (c != '<') { + if (datalen < sizeof(x->data) - 1) { + x->data[datalen++] = c; + } else { + x->data[datalen] = '\0'; + x->data[0] = c; + datalen = 1; + } + } + if (c == '<') { + x->data[datalen] = '\0'; + break; + } + } + } + } +} + +int +main(void) +{ + xml_parse(&parser); + + return 0; +}