initial repo - extractjson - extract embedded JSON metadata from HTML pages

commit ad11115ba705c4c5f88f0679f2f807e4d0883970
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date:   Sun, 14 Aug 2022 11:58:19 +0200

initial repo

Diffstat:
A LICENSE  | 15 +++++++++++++++
A Makefile  | 5 +++++
A README  | 15 +++++++++++++++
A extractjson.1  | 29 +++++++++++++++++++++++++++++
A extractjson.c  | 341 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

5 files changed, 405 insertions(+), 0 deletions(-)
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,15 @@
+ISC License
+
+Copyright (c) 2022 Hiltjo Posthuma <hiltjo@codemadness.org>
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
diff --git a/Makefile b/Makefile
@@ -0,0 +1,5 @@
+# build the extractjson binary from its single source file; depends on
+# "clean", so every build starts from scratch
+build: clean
+	${CC} -o extractjson extractjson.c ${CFLAGS} ${LDFLAGS}
+
+# remove object files and the built binary
+clean:
+	rm -f *.o extractjson
diff --git a/README b/README
@@ -0,0 +1,15 @@
+extractjson
+-----------
+
+Extracts embedded JSON metadata from HTML pages, such as data in the tags:
+<script type="application/ld+json"> 
+
+It reads HTML from stdin and outputs JSON per line to stdout.
+
+Example:
+
+	curl -s https://www.imdb.com/title/tt0107048/ | extractjson | sed 1q | json2tsv
+
+This extracts the JSON metadata from the IMDB page of the movie "Groundhog Day".
+It uses the first embedded JSON fragment and pipes it to json2tsv.
+It can then be further processed using awk to get the relevant data.
diff --git a/extractjson.1 b/extractjson.1
@@ -0,0 +1,29 @@
+.Dd May 2, 2022
+.Dt EXTRACTJSON 1
+.Os
+.Sh NAME
+.Nm extractjson
+.Nd extracts embedded JSON metadata from HTML pages
+.Sh SYNOPSIS
+.Nm
+.Sh DESCRIPTION
+.Nm
+extracts embedded JSON metadata from HTML pages, such as data in the tags:
+<script type="application/ld+json">
+.Pp
+It reads HTML from stdin and outputs JSON per line to stdout.
+.Sh EXIT STATUS
+.Ex -std
+.Sh EXAMPLES
+.Bd -literal
+curl -s https://www.imdb.com/title/tt0107048/ | extractjson | sed 1q | json2tsv
+.Ed
+.Pp
+This extracts the JSON metadata from the IMDB page of the movie "Groundhog Day".
+It uses the first embedded JSON fragment and pipes it to json2tsv.
+It can then be further processed using awk to get the relevant data.
+.Sh SEE ALSO
+.Xr curl 1 ,
+.Xr json2tsv 1
+.Sh AUTHORS
+.An Hiltjo Posthuma Aq Mt hiltjo@codemadness.org
diff --git a/extractjson.c b/extractjson.c
@@ -0,0 +1,341 @@
+#include <ctype.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+
+/* reader used by the parser to fetch the next input byte */
+#define GETNEXT getnext
+
+/* ASCII-only character classes, independent of the locale.
+   ISALPHA: 'A'-'Z' and 'a'-'z'. ISSPACE: ' ' and '\t' up to '\r'.
+   The argument is fully parenthesised so that any expression can be passed;
+   ISSPACE may evaluate it twice, so it must not have side effects. */
+#define ISALPHA(c) ((((unsigned)(c)) | 32) - 'a' < 26)
+#define ISSPACE(c) ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))
+
+/* state of the HTML/XML parser; filled in by the xml_parse*() functions */
+typedef struct xmlparser {
+	/* current tag */
+	char tag[1024];
+	/* number of bytes stored in tag, not counting the terminating NUL */
+	size_t taglen;
+	/* current tag is in shortform ? <tag /> */
+	int isshorttag;
+	/* current attribute name */
+	char name[1024];
+	/* data buffer used for tag data, cdata and attribute data */
+	char data[BUFSIZ];
+} XMLParser;
+
+/* the parser instance handed to xml_parse() by main() */
+static XMLParser parser;
+/* set by xmlattr() when a <script> tag has a JSON "type" attribute, cleared
+   by getnext_json() once the end of that script is reached */
+static int isjson;
+/* endtag: text that terminates the raw script data;
+   ignorestate: position inside endtag that has been matched so far */
+static const char *ignorestate, *endtag;
+/* current input reader: getchar, or getnext_json while inside JSON data */
+static int (*getnext)(void) = getchar;
+
+/* Reader that is active while inside a JSON <script> element: the HTML parser
+   must not interpret the script data, because it may contain characters such
+   as '<' and '>'.
+
+   Every data byte except '\r' and '\n' is written to stdout and a space is
+   returned to the parser instead, so the parser sees harmless filler. The
+   input is matched case-insensitively against endtag; when it has been seen
+   completely a newline is written, the normal reader is restored, isjson is
+   cleared and the last byte of the end tag is returned.
+
+   Bytes that match only a prefix of endtag are held back and written out
+   unchanged as soon as the match fails (or the input ends), so data such as
+   "<p>" or "</b>" inside the JSON is not lost.
+
+   Returns EOF at end of input. */
+static int
+getnext_json(void)
+{
+	/* input bytes matching endtag so far, in their original case */
+	static char held[16];
+	size_t i, n;
+	int c;
+
+	n = (size_t)(ignorestate - endtag);
+	if (n > sizeof(held))
+		n = sizeof(held);
+
+	if ((c = getchar()) == EOF) {
+		/* input ended inside a partial match: it was data after all */
+		for (i = 0; i < n; i++)
+			putchar(held[i]);
+		ignorestate = endtag;
+		return EOF;
+	}
+
+	if (tolower(c) != tolower((unsigned char)*ignorestate)) {
+		/* not the end tag: write what was held back and start over */
+		for (i = 0; i < n; i++)
+			putchar(held[i]);
+		ignorestate = endtag;
+		n = 0;
+
+		/* the byte that broke the match may itself start the end tag,
+		   e.g. the second '<' in "<</script>". Retrying only from the
+		   start is sufficient because the first byte of "</script>"
+		   does not occur anywhere else in it. */
+		if (tolower(c) != tolower((unsigned char)*ignorestate)) {
+			if (c != '\r' && c != '\n')
+				putchar(c);
+			return ' ';
+		}
+	}
+
+	/* c continues (or starts) a match: hold it back for now */
+	if (n < sizeof(held))
+		held[n] = (char)c;
+	ignorestate++;
+	if (*ignorestate == '\0') {
+		getnext = getchar; /* restore */
+		putchar('\n');
+		isjson = 0;
+		return c;
+	}
+
+	return ' ';
+}
+
+/* Attribute callback: t is the tag name, a the attribute name and v the
+   (possibly partial) attribute value. Sets isjson when a "type" attribute of
+   a "script" tag contains one of the JSON media types. Names are compared
+   case-insensitively, the value case-sensitively. x and the length
+   parameters are unused. */
+static void
+xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
+        const char *v, size_t vl)
+{
+	if (strcasecmp(t, "script") != 0)
+		return;
+	if (strcasecmp(a, "type") != 0)
+		return;
+
+	if (strstr(v, "application/json") == NULL &&
+	    strstr(v, "application/ld+json") == NULL &&
+	    strstr(v, "text/json") == NULL)
+		return;
+
+	isjson = 1;
+}
+
+/* Callback for a completely parsed start tag t. When it is a "script" tag
+   that was flagged as JSON, switch the input reader to getnext_json() so the
+   script data is copied to stdout until "</script>" is seen. x, tl and
+   isshort are unused. */
+static void
+xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort)
+{
+	if (!isjson || strcasecmp(t, "script") != 0)
+		return;
+
+	endtag = "</script>";
+	ignorestate = endtag;
+	getnext = getnext_json;
+}
+
+/* Parse the attributes of the current tag (x->tag) and call xmlattr() for
+   each of them. Reading stops after the closing '>' of the tag or at EOF.
+
+   An attribute value is passed to xmlattr() in several pieces when it does
+   not fit in x->data or when it contains an entity ("&...;"); the text of a
+   complete entity itself is not passed on. x->isshorttag is set when a '/'
+   is seen outside of a value. */
+static void
+xml_parseattrs(XMLParser *x)
+{
+	size_t namelen = 0, valuelen;
+	int c, endsep, endname = 0, valuestart = 0;
+
+	while ((c = GETNEXT()) != EOF) {
+		if (ISSPACE(c)) {
+			if (namelen)
+				endname = 1;
+			continue;
+		} else if (c == '?')
+			; /* ignore */
+		else if (c == '=') {
+			x->name[namelen] = '\0';
+			valuestart = 1;
+			endname = 1;
+		} else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
+			/* attribute without value: terminate the name before
+			   handing it to the callback, otherwise the callback
+			   sees leftover bytes of an earlier, longer name */
+			x->name[namelen] = '\0';
+			xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
+			endname = 0;
+			/* c is already the first byte of the next name */
+			x->name[0] = c;
+			namelen = 1;
+		} else if (namelen && valuestart) {
+			/* attribute with value */
+			valuelen = 0;
+			if (c == '\'' || c == '"') {
+				endsep = c;
+			} else {
+				/* unquoted value: c is its first byte */
+				endsep = ' '; /* ISSPACE() */
+				goto startvalue;
+			}
+
+			while ((c = GETNEXT()) != EOF) {
+startvalue:
+				if (c == '&') { /* entities */
+					x->data[valuelen] = '\0';
+					/* call data function with data before entity if there is data */
+					if (valuelen)
+						xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+					x->data[0] = c;
+					valuelen = 1;
+					while ((c = GETNEXT()) != EOF) {
+						if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c))))
+							break;
+						if (valuelen < sizeof(x->data) - 1)
+							x->data[valuelen++] = c;
+						else {
+							/* entity too long for buffer, handle as normal data */
+							x->data[valuelen] = '\0';
+							xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+							x->data[0] = c;
+							valuelen = 1;
+							break;
+						}
+						if (c == ';') {
+							/* complete entity: drop it */
+							x->data[valuelen] = '\0';
+							valuelen = 0;
+							break;
+						}
+					}
+				} else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) {
+					if (valuelen < sizeof(x->data) - 1) {
+						x->data[valuelen++] = c;
+					} else {
+						/* buffer full: pass it on and start a new piece */
+						x->data[valuelen] = '\0';
+						xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+						x->data[0] = c;
+						valuelen = 1;
+					}
+				}
+				if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) {
+					/* end of value: pass on the last piece */
+					x->data[valuelen] = '\0';
+					xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+					break;
+				}
+			}
+			namelen = endname = valuestart = 0;
+		} else if (namelen < sizeof(x->name) - 1) {
+			/* NOTE: attribute names that do not fit are truncated */
+			x->name[namelen++] = c;
+		}
+		if (c == '>') {
+			break;
+		} else if (c == '/') {
+			x->isshorttag = 1;
+			x->name[0] = '\0';
+			namelen = 0;
+		}
+	}
+}
+
+/* Skip the remainder of a comment: consume input until a '>' that directly
+   follows at least two '-' characters, or until EOF. x is unused. */
+static void
+xml_parsecomment(XMLParser *x)
+{
+	int ch, dashes;
+
+	for (dashes = 0; (ch = GETNEXT()) != EOF; ) {
+		if (ch == '-') {
+			/* only "two or more" matters, so saturate at 2 */
+			if (dashes < 2)
+				dashes++;
+		} else if (ch == '>' && dashes == 2) {
+			break;
+		} else {
+			dashes = 0;
+		}
+	}
+}
+
+/* Consume a CDATA section: read input until a '>' that directly follows at
+   least two ']' characters, or until EOF. Every other byte, apart from ']',
+   is stored in x->data; when the buffer is full it is terminated and filled
+   again from the start. */
+static void
+xml_parsecdata(XMLParser *x)
+{
+	size_t used = 0, brackets = 0;
+	int ch;
+
+	for (;;) {
+		ch = GETNEXT();
+		if (ch == EOF)
+			return;
+
+		if (ch == ']') {
+			/* only "two or more" matters, so saturate at 2 */
+			if (brackets < 2)
+				brackets++;
+			continue;
+		}
+		if (ch == '>' && brackets == 2)
+			return;
+		brackets = 0;
+
+		if (used >= sizeof(x->data) - 1) {
+			/* buffer full: terminate it and reuse it from the start */
+			x->data[used] = '\0';
+			used = 0;
+		}
+		x->data[used++] = ch;
+	}
+}
+
+/* Main parser loop: read the input through GETNEXT() until EOF.
+   Everything before the first '<' is skipped. After that the input is
+   handled as a sequence of:
+   - "<!" constructs: comments and CDATA sections are consumed by
+     xml_parsecomment() and xml_parsecdata(), anything else is skipped up to
+     the next '>'.
+   - tags: the name is stored in x->tag; for start tags the attributes are
+     parsed by xml_parseattrs() and xmltagstartparsed() is called.
+   - tag data: buffered in x->data up to the next '<'; nothing in this
+     function uses the buffered data afterwards. */
+static void
+xml_parse(XMLParser *x)
+{
+	size_t datalen, tagdatalen;
+	int c, isend;
+
+	while ((c = GETNEXT()) != EOF && c != '<')
+		; /* skip until < */
+
+	while (c != EOF) {
+		if (c == '<') { /* parse tag */
+			if ((c = GETNEXT()) == EOF)
+				return;
+
+			if (c == '!') { /* cdata and comments */
+				for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
+					/* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
+					if (tagdatalen <= sizeof("[CDATA[") - 1)
+						x->data[tagdatalen++] = c;
+					if (c == '>')
+						break;
+					else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
+							(x->data[0] == '-')) {
+						/* "<!--": start of a comment */
+						xml_parsecomment(x);
+						break;
+					} else if (c == '[') {
+						/* "<![CDATA[": start of a CDATA section */
+						if (tagdatalen == sizeof("[CDATA[") - 1 &&
+						    !strncmp(x->data, "[CDATA[", tagdatalen)) {
+							xml_parsecdata(x);
+							break;
+						}
+					}
+				}
+			} else {
+				/* normal tag (open, short open, close), processing instruction. */
+				x->tag[0] = c;
+				x->taglen = 1;
+				x->isshorttag = isend = 0;
+
+				/* treat processing instruction as shorttag, don't strip "?" prefix. */
+				if (c == '?') {
+					x->isshorttag = 1;
+				} else if (c == '/') {
+					/* end tag: the name starts after the '/' */
+					if ((c = GETNEXT()) == EOF)
+						return;
+					x->tag[0] = c;
+					isend = 1;
+				}
+
+				while ((c = GETNEXT()) != EOF) {
+					if (c == '/')
+						x->isshorttag = 1; /* short tag */
+					else if (c == '>' || ISSPACE(c)) {
+						x->tag[x->taglen] = '\0';
+						if (isend) { /* end tag, starts with </ */
+							x->tag[0] = '\0';
+							x->taglen = 0;
+						} else {
+							/* start tag */
+							if (ISSPACE(c))
+								xml_parseattrs(x);
+							xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
+						}
+						/* forget the current tag for shortform or processing instruction */
+						if (x->isshorttag) {
+							x->tag[0] = '\0';
+							x->taglen = 0;
+						}
+						break;
+					} else if (x->taglen < sizeof(x->tag) - 1)
+						x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
+				}
+			}
+		} else {
+			/* parse tag data */
+			datalen = 0;
+			while ((c = GETNEXT()) != EOF) {
+				if (c == '&') {
+					/* entity: collect it at the start of the buffer */
+					if (datalen)
+						x->data[datalen] = '\0';
+					x->data[0] = c;
+					datalen = 1;
+					while ((c = GETNEXT()) != EOF) {
+						if (c == '<')
+							break;
+						if (datalen < sizeof(x->data) - 1)
+							x->data[datalen++] = c;
+						else {
+							/* entity too long for buffer, handle as normal data */
+							x->data[datalen] = '\0';
+							x->data[0] = c;
+							datalen = 1;
+							break;
+						}
+						if (c == ';') {
+							/* end of the entity */
+							x->data[datalen] = '\0';
+							datalen = 0;
+							break;
+						}
+					}
+				} else if (c != '<') {
+					if (datalen < sizeof(x->data) - 1) {
+						x->data[datalen++] = c;
+					} else {
+						/* buffer full: terminate it and reuse it from the start */
+						x->data[datalen] = '\0';
+						x->data[0] = c;
+						datalen = 1;
+					}
+				}
+				if (c == '<') {
+					/* start of the next tag ends the data */
+					x->data[datalen] = '\0';
+					break;
+				}
+			}
+		}
+	}
+}
+
+/* Read HTML from stdin and write each embedded JSON fragment as one line to
+   stdout. Returns 0 on success and 1 when reading the input or writing the
+   output failed, so callers in a pipeline can detect truncated output. */
+int
+main(void)
+{
+	xml_parse(&parser);
+
+	if (ferror(stdin)) {
+		fputs("read error: <stdin>\n", stderr);
+		return 1;
+	}
+	/* flush explicitly: buffered output may fail only when written out */
+	if (fflush(stdout) || ferror(stdout)) {
+		fputs("write error: <stdout>\n", stderr);
+		return 1;
+	}
+
+	return 0;
+}

	extractjson extract embedded JSON metadata from HTML pages
	git clone git://git.codemadness.org/extractjson
	Log \| Files \| Refs \| README \| LICENSE

A	LICENSE	\|	15	+++++++++++++++
A	Makefile	\|	5	+++++
A	README	\|	15	+++++++++++++++
A	extractjson.1	\|	29	+++++++++++++++++++++++++++++
A	extractjson.c	\|	341	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++