/* ** LuaHTML parser ** Written by Timm S. Mueller ** This file is licensed under the terms of the free MIT license. */ #include "luahtml_parser.h" static unsigned char *luahtml_encodeutf8(unsigned char *buf, int c) { if (c < 128) { *buf++ = c; } else if (c < 2048) { *buf++ = 0xc0 + (c >> 6); *buf++ = 0x80 + (c & 0x3f); } else if (c < 65536) { *buf++ = 0xe0 + (c >> 12); *buf++ = 0x80 + ((c & 0xfff) >> 6); *buf++ = 0x80 + (c & 0x3f); } else if (c < 2097152) { *buf++ = 0xf0 + (c >> 18); *buf++ = 0x80 + ((c & 0x3ffff) >> 12); *buf++ = 0x80 + ((c & 0xfff) >> 6); *buf++ = 0x80 + (c & 0x3f); } else if (c < 67108864) { *buf++ = 0xf8 + (c >> 24); *buf++ = 0x80 + ((c & 0xffffff) >> 18); *buf++ = 0x80 + ((c & 0x3ffff) >> 12); *buf++ = 0x80 + ((c & 0xfff) >> 6); *buf++ = 0x80 + (c & 0x3f); } else { *buf++ = 0xfc + (c >> 30); *buf++ = 0x80 + ((c & 0x3fffffff) >> 24); *buf++ = 0x80 + ((c & 0xffffff) >> 18); *buf++ = 0x80 + ((c & 0x3ffff) >> 12); *buf++ = 0x80 + ((c & 0xfff) >> 6); *buf++ = 0x80 + (c & 0x3f); } return buf; } int luahtml_readutf8(struct luahtml_utf8reader *rd) { int c; for (;;) { if (rd->bufc >= 0) { c = rd->bufc; rd->bufc = -1; } else c = rd->readchar(rd); if (c < 0) return c; if (c == 254 || c == 255) break; if (c < 128) { if (rd->numa > 0) { rd->bufc = c; break; } return c; } else if (c < 192) { if (rd->numa == 0) break; rd->accu <<= 6; rd->accu += c - 128; rd->numa--; if (rd->numa == 0) { if (rd->accu == 0 || rd->accu < rd->min || (rd->accu >= 55296 && rd->accu <= 57343)) break; c = rd->accu; rd->accu = 0; return c; } } else { if (rd->numa > 0) { rd->bufc = c; break; } if (c < 224) { rd->min = 128; rd->accu = c - 192; rd->numa = 1; } else if (c < 240) { rd->min = 2048; rd->accu = c - 224; rd->numa = 2; } else if (c < 248) { rd->min = 65536; rd->accu = c - 240; rd->numa = 3; } else if (c < 252) { rd->min = 2097152; rd->accu = c - 248; rd->numa = 4; } else { rd->min = 67108864; rd->accu = c - 252; rd->numa = 5; } } } /* bad char */ rd->accu = 0; rd->numa = 0; return 65533; } static unsigned char *luahtml_outchar(lua_State *L, unsigned char *buf, luahtml_parser_state_t state, int c) { if (state == PARSER_HTML) { if (c > 127 /*|| c == '[' || c == ']'*/) return buf + sprintf((char *) buf, "&#%02d;", c); } else if (state == PARSER_CODE) { if (c > 127) return luahtml_encodeutf8(buf, c); } else if (c > 127) { #if !defined(LUAHTML_STANDALONE) luaL_error(L, "Non-ASCII character outside code or HTML context"); #else fprintf(stderr, "Non-ASCII character outside code or HTML context\n"); exit(1); #endif } *buf++ = c; return buf; } const char *luahtml_readparsed(lua_State *L, void *udata, size_t *sz) { struct luahtml_readdata *rd = udata; luahtml_parser_state_t news = rd->state; int c; while ((c = luahtml_readutf8(&rd->utf8)) >= 0) { switch (news) { case PARSER_UNDEF: if (c == '<') { news = PARSER_OPEN1; continue; } rd->state = PARSER_HTML; rd->buf[0] = '['; rd->buf[1] = '['; *sz = luahtml_outchar(L, rd->buf + 2, rd->state, c) - rd->buf0; return (char *) rd->buf0; case PARSER_HTML: if (c == '<') { news = PARSER_OPEN1; continue; } break; case PARSER_OPEN1: if (c == '%') { news = PARSER_OPEN2; continue; } rd->buf[0] = '<'; rd->buf[1] = c; *sz = 2; return (char *) rd->buf; case PARSER_OPEN2: if (c == '=') { if (rd->state == PARSER_UNDEF) { rd->state = PARSER_VAR; *sz = rd->buf - rd->buf0; return (char *) rd->buf0; } rd->state = PARSER_VAR; strcpy((char *) rd->buf, "]])"); memcpy(rd->buf + 3, rd->buf0, rd->buf - rd->buf0); *sz = 3 + rd->buf - rd->buf0; return (char *) rd->buf; } if (rd->state == PARSER_UNDEF) rd->state = PARSER_CODE; else { rd->state = PARSER_CODE; rd->buf[0] = ']'; rd->buf[1] = ']'; rd->buf[2] = ')'; rd->buf[3] = ' '; rd->buf[4] = c; *sz = 5; return (char *) rd->buf; } break; case PARSER_VAR: case PARSER_CODE: if (c == '%') { rd->code_present = 1; news = PARSER_CLOSE; continue; } break; case PARSER_CLOSE: if (c == '>') { size_t len; if (rd->state == PARSER_CODE) { rd->state = PARSER_HTML; rd->buf[0] = '['; rd->buf[1] = '['; *sz = rd->buf + 2 - rd->buf0; return (char *) rd->buf0; } rd->state = PARSER_HTML; strcpy((char *) rd->buf, " or \"nil\")"); memcpy(rd->buf + 10, rd->buf0, rd->buf - rd->buf0); len = 10 + rd->buf - rd->buf0; rd->buf[len++] = '['; rd->buf[len++] = '['; *sz = len; return (char *) rd->buf; } rd->buf[0] = '%'; rd->buf[1] = c; *sz = 2; return (char *) rd->buf; } *sz = luahtml_outchar(L, rd->buf, rd->state, c) - rd->buf; return (char *) rd->buf; } rd->state = PARSER_UNDEF; if (news == PARSER_HTML) { *sz = 4; return "]]) "; } return NULL; } #if defined(LUAHTML_STANDALONE) #include static int luahtml_readstdin(struct luahtml_utf8reader *rd) { unsigned char buf[1]; ssize_t rdlen = read(STDIN_FILENO, &buf, 1); if (rdlen == 0) return EOF; return buf[0]; } int main(int argc, char **argv) { size_t outlen; unsigned char *bufptr; struct luahtml_readdata rd; const char *outcmd = "print"; if (argc >= 2) outcmd = argv[1]; rd.utf8.file = STDIN_FILENO; rd.utf8.readchar = luahtml_readstdin; rd.utf8.accu = 0; rd.utf8.numa = 0; rd.utf8.bufc = -1; rd.state = PARSER_UNDEF; strcpy((char *) rd.buf0, " "); strcat((char *) rd.buf0, outcmd); strcat((char *) rd.buf0, "("); rd.buf = rd.buf0 + strlen((char *) rd.buf0); while ((bufptr = (unsigned char *) luahtml_readparsed(NULL, &rd, &outlen))) { write(STDOUT_FILENO, bufptr, outlen); } return 0; } #endif