commit ad11115ba705c4c5f88f0679f2f807e4d0883970
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Sun, 14 Aug 2022 11:58:19 +0200
initial repo
Diffstat:
5 files changed, 405 insertions(+), 0 deletions(-)
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,15 @@
+ISC License
+
+Copyright (c) 2022 Hiltjo Posthuma <hiltjo@codemadness.org>
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
diff --git a/Makefile b/Makefile
@@ -0,0 +1,5 @@
+# build: compile extractjson.c into the extractjson binary; depends on clean, so it always rebuilds.
+build: clean
+ ${CC} -o extractjson extractjson.c ${CFLAGS} ${LDFLAGS}
+
+# clean: remove object files and the built binary.
+clean:
+ rm -f *.o extractjson
diff --git a/README b/README
@@ -0,0 +1,15 @@
+extractjson
+-----------
+
+Extracts embedded JSON metadata from HTML pages, such as data in the tags:
+<script type="application/ld+json">
+
+It reads HTML from stdin and outputs JSON per line to stdout.
+
+Example:
+
+ curl -s https://www.imdb.com/title/tt0107048/ | extractjson | sed 1q | json2tsv
+
+This extracts the JSON metadata from the IMDB page of the movie "Groundhog Day".
+It uses the first embedded JSON fragment and pipes it to json2tsv.
+It can then be further processed using awk to get the relevant data.
diff --git a/extractjson.1 b/extractjson.1
@@ -0,0 +1,29 @@
+.Dd May 2, 2022
+.Dt EXTRACTJSON 1
+.Os
+.Sh NAME
+.Nm extractjson
+.Nd extracts embedded JSON metadata from HTML pages
+.Sh SYNOPSIS
+.Nm
+.Sh DESCRIPTION
+.Nm
+extracts embedded JSON metadata from HTML pages, such as data in the tags:
+<script type="application/ld+json">
+.Pp
+It reads HTML from stdin and outputs JSON per line to stdout.
+.Sh EXIT STATUS
+.Ex -std
+.Sh EXAMPLES
+.Bd -literal
+curl -s https://www.imdb.com/title/tt0107048/ | extractjson | sed 1q | json2tsv
+.Ed
+.Pp
+This extracts the JSON metadata from the IMDB page of the movie "Groundhog Day".
+It uses the first embedded JSON fragment and pipes it to json2tsv.
+It can then be further processed using awk to get the relevant data.
+.Sh SEE ALSO
+.Xr curl 1 ,
+.Xr json2tsv 1
+.Sh AUTHORS
+.An Hiltjo Posthuma Aq Mt hiltjo@codemadness.org
diff --git a/extractjson.c b/extractjson.c
@@ -0,0 +1,341 @@
+#include <ctype.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+
+/* Reads the next input character. Expands to the getnext function pointer, so the reader can be swapped at run time. */
+#define GETNEXT getnext
+
+/* ASCII-only character tests. NOTE: in the cast the argument is not parenthesized, so pass a plain variable. */
+/* ISALPHA: true for 'A'-'Z' and 'a'-'z'; OR-ing with 32 folds upper case onto lower case. */
+#define ISALPHA(c) ((((unsigned)c) | 32) - 'a' < 26)
+/* ISSPACE: true for ' ' and for the five consecutive ASCII codes starting at '\t' ('\t', '\n', '\v', '\f', '\r'). */
+#define ISSPACE(c) ((c) == ' ' || ((((unsigned)c) - '\t') < 5))
+
+/* Parser state shared by the xml_parse*() functions. */
+typedef struct xmlparser {
+ /* current tag name, NUL-terminated; names that do not fit are truncated */
+ char tag[1024];
+ /* length of the name stored in tag, 0 when no tag is current */
+ size_t taglen;
+ /* current tag is in shortform ? <tag /> */
+ int isshorttag;
+ /* current attribute name */
+ char name[1024];
+ /* data buffer used for tag data, cdata and attribute data */
+ char data[BUFSIZ];
+} XMLParser;
+
+/* the single parser instance, handed to xml_parse() from main() */
+static XMLParser parser;
+/* set by xmlattr() when a <script> type attribute holds a JSON MIME type; cleared by getnext_json() at the closing tag */
+static int isjson;
+/* endtag: closing tag text to look for; ignorestate: the next character of it that has to match */
+static const char *ignorestate, *endtag;
+/* current input reader behind GETNEXT(): getchar by default, getnext_json while inside a JSON <script> */
+static int (*getnext)(void) = getchar;
+
+/*
+ * Input reader used while inside a JSON <script> element (installed by
+ * xmltagstartparsed()). The raw script content may contain characters such as
+ * '<' and '>', so it is copied to stdout here instead of being handed to the
+ * HTML parser.
+ *
+ * Every byte read is compared case-insensitively against the closing tag in
+ * endtag; ignorestate points at the next character that has to match. Bytes
+ * that match a prefix of the closing tag are withheld, because they must not
+ * be printed if they turn out to be the real closing tag. When the match fails
+ * (or the input ends) they were ordinary data and are printed after all, and
+ * the byte that broke the match is re-checked against the start of the closing
+ * tag. Restarting from the beginning is sufficient for "</script>", whose
+ * first character does not occur again inside it.
+ *
+ * CR and LF in the data are dropped so that each fragment stays on one line.
+ *
+ * Returns EOF at end of input, the last character of the closing tag once it
+ * has been matched completely (after printing '\n', clearing isjson and
+ * restoring getnext to getchar), and ' ' for every other byte so the caller
+ * only sees whitespace.
+ */
+static int
+getnext_json(void)
+{
+	/* input bytes that matched a prefix of endtag, in their original case */
+	static char held[16];
+	size_t i, n;
+	int c;
+
+	c = getchar();
+	if (c == EOF || tolower(c) != tolower((unsigned char)*ignorestate)) {
+		/* no closing tag after all: the withheld bytes were data */
+		n = (size_t)(ignorestate - endtag);
+		for (i = 0; i < n && i < sizeof(held); i++)
+			putchar((unsigned char)held[i]);
+		ignorestate = endtag;
+
+		if (c == EOF)
+			return EOF;
+		if (tolower(c) != tolower((unsigned char)*ignorestate)) {
+			if (c != '\r' && c != '\n')
+				putchar(c);
+			return ' ';
+		}
+		/* c itself starts a new candidate match, handled below */
+	}
+
+	/* c matches the next expected character of the closing tag */
+	n = (size_t)(ignorestate - endtag);
+	if (n < sizeof(held))
+		held[n] = (char)c;
+	ignorestate++;
+	if (*ignorestate == '\0') {
+		getnext = getchar; /* restore */
+		putchar('\n');
+		isjson = 0;
+		return c;
+	}
+
+	return ' ';
+}
+
+/*
+ * Attribute callback: t is the tag name, a the attribute name and v (a piece
+ * of) the attribute value. Sets isjson when a "script" tag has a "type"
+ * attribute whose value contains one of the JSON MIME types below. Tag and
+ * attribute names are compared case-insensitively, the MIME type is a
+ * case-sensitive substring match. x, tl, al and vl are unused.
+ */
+static void
+xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
+ const char *v, size_t vl)
+{
+	static const char *const jsontypes[] = {
+		"application/json",
+		"application/ld+json",
+		"text/json",
+	};
+	size_t i;
+
+	if (strcasecmp(t, "script") != 0 || strcasecmp(a, "type") != 0)
+		return;
+
+	for (i = 0; i < sizeof(jsontypes) / sizeof(jsontypes[0]); i++) {
+		if (strstr(v, jsontypes[i]) != NULL) {
+			isjson = 1;
+			return;
+		}
+	}
+}
+
+/*
+ * Called once a start tag and its attributes have been parsed. When the tag
+ * is a "script" tag (case-insensitive) and isjson is set, the closing tag to
+ * search for is armed and the input reader is switched to getnext_json().
+ * x, tl and isshort are unused.
+ */
+static void
+xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort)
+{
+	if (!isjson || strcasecmp(t, "script") != 0)
+		return;
+
+	endtag = "</script>";
+	ignorestate = endtag;
+	getnext = getnext_json;
+}
+
+/*
+ * Parse the attributes of the start tag named in x->tag, reading through
+ * GETNEXT() until the '>' that ends the tag or until EOF.
+ *
+ * xmlattr() is called for each attribute: once with an empty value for an
+ * attribute without a value, and one or more times with pieces of the value
+ * otherwise (a value is split when it contains an entity or does not fit in
+ * x->data). Values may be single-quoted, double-quoted or unquoted; an
+ * unquoted value ends at whitespace or '>'. A complete entity (&...;) inside
+ * a value is dropped rather than passed on.
+ *
+ * Sets x->isshorttag when a '/' is seen between attributes.
+ */
+static void
+xml_parseattrs(XMLParser *x)
+{
+	size_t namelen = 0, valuelen;
+	int c, endsep, endname = 0, valuestart = 0;
+
+	while ((c = GETNEXT()) != EOF) {
+		if (ISSPACE(c)) {
+			if (namelen)
+				endname = 1;
+			continue;
+		} else if (c == '?')
+			; /* ignore */
+		else if (c == '=') {
+			x->name[namelen] = '\0';
+			valuestart = 1;
+			endname = 1;
+		} else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
+			/* attribute without value */
+			/* terminate the name first: the callback reads it as a C string */
+			x->name[namelen] = '\0';
+			xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
+			endname = 0;
+			/* c already belongs to the next name (or is '>' / '/', handled below) */
+			x->name[0] = c;
+			namelen = 1;
+		} else if (namelen && valuestart) {
+			/* attribute with value */
+			valuelen = 0;
+			if (c == '\'' || c == '"') {
+				endsep = c;
+			} else {
+				endsep = ' '; /* ISSPACE() */
+				/* unquoted: c is already the first character of the value */
+				goto startvalue;
+			}
+
+			while ((c = GETNEXT()) != EOF) {
+startvalue:
+				if (c == '&') { /* entities */
+					x->data[valuelen] = '\0';
+					/* call data function with data before entity if there is data */
+					if (valuelen)
+						xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+					x->data[0] = c;
+					valuelen = 1;
+					while ((c = GETNEXT()) != EOF) {
+						if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c))))
+							break;
+						if (valuelen < sizeof(x->data) - 1)
+							x->data[valuelen++] = c;
+						else {
+							/* entity too long for buffer, handle as normal data */
+							x->data[valuelen] = '\0';
+							xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+							x->data[0] = c;
+							valuelen = 1;
+							break;
+						}
+						if (c == ';') {
+							/* complete entity: discard it */
+							x->data[valuelen] = '\0';
+							valuelen = 0;
+							break;
+						}
+					}
+				} else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) {
+					if (valuelen < sizeof(x->data) - 1) {
+						x->data[valuelen++] = c;
+					} else {
+						/* buffer full: hand over what we have and start again */
+						x->data[valuelen] = '\0';
+						xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+						x->data[0] = c;
+						valuelen = 1;
+					}
+				}
+				if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) {
+					/* end of value: report the remaining piece */
+					x->data[valuelen] = '\0';
+					xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+					break;
+				}
+			}
+			namelen = endname = valuestart = 0;
+		} else if (namelen < sizeof(x->name) - 1) {
+			x->name[namelen++] = c;
+		}
+		if (c == '>') {
+			break;
+		} else if (c == '/') {
+			x->isshorttag = 1;
+			x->name[0] = '\0';
+			namelen = 0;
+		}
+	}
+}
+
+/*
+ * Skip the body of a comment: consume input through GETNEXT() until a '>'
+ * that directly follows at least two '-' characters, or until EOF.
+ * x is unused.
+ */
+static void
+xml_parsecomment(XMLParser *x)
+{
+	int ch, dashes;
+
+	for (dashes = 0; (ch = GETNEXT()) != EOF; ) {
+		if (ch == '-') {
+			/* count consecutive dashes, two is all that matters */
+			if (dashes < 2)
+				dashes++;
+		} else if (ch == '>' && dashes == 2) {
+			break;
+		} else {
+			dashes = 0;
+		}
+	}
+}
+
+/*
+ * Skip a CDATA section: consume input through GETNEXT() until a '>' that
+ * directly follows at least two ']' characters, or until EOF. Characters
+ * other than ']' are stored in x->data, restarting at the beginning of the
+ * buffer whenever it is full; the buffer is not passed on to anything here.
+ */
+static void
+xml_parsecdata(XMLParser *x)
+{
+	size_t len = 0, brackets = 0;
+	int ch;
+
+	for (;;) {
+		ch = GETNEXT();
+		if (ch == EOF)
+			return;
+
+		if (ch == ']') {
+			/* count consecutive brackets, two is all that matters */
+			if (brackets < 2)
+				brackets++;
+			continue;
+		}
+		if (ch == '>' && brackets == 2)
+			return;
+		brackets = 0;
+
+		if (len >= sizeof(x->data) - 1) {
+			/* buffer full: terminate it and start over */
+			x->data[len] = '\0';
+			len = 0;
+		}
+		x->data[len++] = ch;
+	}
+}
+
+/*
+ * Main parse loop: reads characters through GETNEXT() until EOF and splits the
+ * input into tags, comments, CDATA sections and character data.
+ *
+ * For every tag that is not a closing tag, xml_parseattrs() is run when the
+ * tag name is followed by whitespace and xmltagstartparsed() is called
+ * afterwards. Character data and entities between tags are only stored in
+ * x->data; this function does not print them or pass them to a callback.
+ */
+static void
+xml_parse(XMLParser *x)
+{
+ size_t datalen, tagdatalen;
+ int c, isend;
+
+ while ((c = GETNEXT()) != EOF && c != '<')
+ ; /* skip until < */
+
+ while (c != EOF) {
+ if (c == '<') { /* parse tag */
+ if ((c = GETNEXT()) == EOF)
+ return;
+
+ if (c == '!') { /* cdata and comments */
+ /* keep the first characters after "<!" in x->data to tell "--" and "[CDATA[" apart */
+ for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
+ /* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
+ if (tagdatalen <= sizeof("[CDATA[") - 1)
+ x->data[tagdatalen++] = c;
+ if (c == '>')
+ break;
+ else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
+ (x->data[0] == '-')) {
+ xml_parsecomment(x);
+ break;
+ } else if (c == '[') {
+ if (tagdatalen == sizeof("[CDATA[") - 1 &&
+ !strncmp(x->data, "[CDATA[", tagdatalen)) {
+ xml_parsecdata(x);
+ break;
+ }
+ }
+ }
+ } else {
+ /* normal tag (open, short open, close), processing instruction. */
+ x->tag[0] = c;
+ x->taglen = 1;
+ x->isshorttag = isend = 0;
+
+ /* treat processing instruction as shorttag, don't strip "?" prefix. */
+ if (c == '?') {
+ x->isshorttag = 1;
+ } else if (c == '/') {
+ /* closing tag: the name starts at the character after '/' */
+ if ((c = GETNEXT()) == EOF)
+ return;
+ x->tag[0] = c;
+ isend = 1;
+ }
+
+ while ((c = GETNEXT()) != EOF) {
+ if (c == '/')
+ x->isshorttag = 1; /* short tag */
+ else if (c == '>' || ISSPACE(c)) {
+ x->tag[x->taglen] = '\0';
+ if (isend) { /* end tag, starts with </ */
+ x->tag[0] = '\0';
+ x->taglen = 0;
+ } else {
+ /* start tag */
+ if (ISSPACE(c))
+ xml_parseattrs(x);
+ xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
+ }
+ /* call tagend for shortform or processing instruction */
+ if (x->isshorttag) {
+ x->tag[0] = '\0';
+ x->taglen = 0;
+ }
+ /* c is still '>' or whitespace here, so the outer loop continues with the tag data branch */
+ break;
+ } else if (x->taglen < sizeof(x->tag) - 1)
+ x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
+ }
+ }
+ } else {
+ /* parse tag data */
+ datalen = 0;
+ while ((c = GETNEXT()) != EOF) {
+ if (c == '&') {
+ /* entity: restart the buffer with the '&' and collect up to ';' or '<' */
+ if (datalen)
+ x->data[datalen] = '\0';
+ x->data[0] = c;
+ datalen = 1;
+ while ((c = GETNEXT()) != EOF) {
+ if (c == '<')
+ break;
+ if (datalen < sizeof(x->data) - 1)
+ x->data[datalen++] = c;
+ else {
+ /* entity too long for buffer, handle as normal data */
+ x->data[datalen] = '\0';
+ x->data[0] = c;
+ datalen = 1;
+ break;
+ }
+ if (c == ';') {
+ x->data[datalen] = '\0';
+ datalen = 0;
+ break;
+ }
+ }
+ } else if (c != '<') {
+ if (datalen < sizeof(x->data) - 1) {
+ x->data[datalen++] = c;
+ } else {
+ /* buffer full: terminate it and start over at the beginning */
+ x->data[datalen] = '\0';
+ x->data[0] = c;
+ datalen = 1;
+ }
+ }
+ if (c == '<') {
+ /* start of the next tag: leave the data loop, the outer loop parses the tag */
+ x->data[datalen] = '\0';
+ break;
+ }
+ }
+ }
+ }
+}
+
+/*
+ * Reads HTML from stdin, writes the embedded JSON fragments to stdout (one per
+ * line) and reports I/O failures.
+ *
+ * Returns 0 on success and 1 when reading stdin or writing stdout failed, so
+ * that a broken pipeline is not mistaken for a page without JSON.
+ */
+int
+main(void)
+{
+	xml_parse(&parser);
+
+	if (ferror(stdin)) {
+		fputs("extractjson: read error: <stdin>\n", stderr);
+		return 1;
+	}
+	/* flush explicitly: buffered write errors would otherwise go unnoticed */
+	if (fflush(stdout) || ferror(stdout)) {
+		fputs("extractjson: write error: <stdout>\n", stderr);
+		return 1;
+	}
+
+	return 0;
+}