どう書く?org:一部のHTMLタグを通すフィルタ
LL、HTMLパーサー、正規表現を禁じて、わりと素のCで書いてみる。
結論。LL、ライブラリ、パーサー、正規表現は偉大だ。
/*
 * tagescape: a whitelist filter for HTML fragments, written in plain C
 * (no parser library, no regular expressions).
 *
 * Tags that pass through:
 *   <a>        only its href= and name= attributes are kept
 *   </a>, <br>, <strong>, </strong>
 *              all attributes are dropped; a trailing "/" is kept as " /"
 * Every other tag is neutralised: its '<' and '>' (including any that occur
 * inside its attribute values) are written as "&lt;" and "&gt;".
 *
 * NOTE(review): the URL scheme of href is NOT validated, so a value such as
 * "javascript:..." passes through unchanged. Callers that need protection
 * against script URLs must check the scheme separately.
 */
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>

#define MAXDATASIZE (1024 * 1024)
#define BUFSIZE 1024

/* Parser state: read cursor, write cursor and remaining output capacity. */
typedef struct {
    const char *p_src;  /* next unread input character; never moved past the NUL */
    char *p_dest;       /* next output position */
    int size;           /* output characters that may still be written */
} te_status;

/* Append one character to the output; silently dropped once the output is full. */
static void te_putc(te_status *status, char c)
{
    if (status->size > 0) {
        *(status->p_dest++) = c;
        status->size--;
    }
}

/* Append a NUL-terminated string to the output (truncated when full). */
static void te_puts(te_status *status, const char *str)
{
    char c;

    while ((c = *str++) != '\0') {
        te_putc(status, c);
    }
}

/* Advance the read cursor over whitespace. */
static void te_skip_space(te_status *status)
{
    /* <ctype.h> functions require a value representable as unsigned char. */
    while (isspace((unsigned char)*status->p_src)) {
        status->p_src++;
    }
}

/*
 * Copy a run of alphanumeric characters (at most `size`) into p and
 * NUL-terminate it. p must have room for size + 1 bytes.
 */
static void te_parse_name(te_status *status, char *p, int size)
{
    while ((size > 0) && isalnum((unsigned char)*status->p_src)) {
        *p++ = *status->p_src++;
        size--;
    }
    *p = '\0';
}

/* Like te_parse_name, but accepts one leading '/' (closing tags). */
static void te_parse_tagname(te_status *status, char *p, int size)
{
    if ((size > 0) && (*status->p_src == '/')) {
        status->p_src++;
        *p++ = '/';
        size--;
    }
    te_parse_name(status, p, size);
}

/*
 * Store one attribute-value character at *pp, percent-encoding '<' and '>'
 * so that the stored value can never open or close a tag.
 * *room is the number of bytes still available. Once a character does not
 * fit, *room is forced to 0 so the stored text stays a clean prefix.
 */
static void te_store_value_char(char **pp, int *room, char c)
{
    if ((c == '<') || (c == '>')) {
        if (*room >= 3) {
            memcpy(*pp, (c == '<') ? "%3C" : "%3E", 3);
            *pp += 3;
            *room -= 3;
        } else {
            *room = 0;
        }
    } else if (*room > 0) {
        *(*pp)++ = c;
        (*room)--;
    }
}

/*
 * Parse a quoted attribute value; the read cursor must be on the opening
 * quote. The value, including both quotes, is stored in p (room for
 * size + 1 bytes).
 *
 * HTML has no backslash escaping inside attribute values: the value ends at
 * the first matching quote. The whole value is always consumed from the
 * input, and the closing quote is always stored, even when the value is
 * truncated or the input ends before the closing quote.
 */
static void te_parse_quote(te_status *status, char *p, int size)
{
    const char quote = *(status->p_src++);
    const int fits = (size >= 2);       /* room for both quote characters? */
    int room = fits ? size - 2 : 0;     /* one byte stays reserved for the closing quote */
    char c;

    if (fits) {
        *p++ = quote;
    }
    while ((c = *status->p_src) != '\0') {
        status->p_src++;
        if (c == quote) {
            break;
        }
        te_store_value_char(&p, &room, c);
    }
    if (fits) {
        *p++ = quote;
    }
    *p = '\0';
}

/*
 * Parse an attribute value (quoted or bare) into p (room for size + 1 bytes).
 * A bare value ends at whitespace, '>' or end of input; it is consumed
 * completely even if only a prefix fits into p.
 */
static void te_parse_elmvalue(te_status *status, char *p, int size)
{
    if ((*status->p_src == '\'') || (*status->p_src == '\"')) {
        te_parse_quote(status, p, size);
    } else {
        int room = size;
        char c;

        while (((c = *status->p_src) != '\0')
               && (!isspace((unsigned char)c))
               && (c != '>')) {
            status->p_src++;
            te_store_value_char(&p, &room, c);
        }
        *p = '\0';
    }
}

/*
 * Discard the rest of a tag up to and including '>'.
 * Returns 1 when the character immediately before '>' (or before the end of
 * input) was '/', i.e. the tag was written in self-closing form.
 */
static int te_skip_tagbody(te_status *status)
{
    int slash_flag = 0;
    char c;

    /* Peek before advancing so the cursor never moves past the NUL. */
    while ((c = *status->p_src) != '\0') {
        status->p_src++;
        if (c == '>') {
            break;
        }
        slash_flag = (c == '/') ? 1 : 0;
    }
    return slash_flag;
}

/*
 * Copy the rest of a rejected tag to the output with '<' and '>' escaped.
 * A '>' inside a quoted attribute value does not end the tag. The closing
 * '>' itself is consumed but not written (the caller writes "&gt;").
 */
static void te_copy_tagbody(te_status *status)
{
    char in_quote = 0;  /* 0, or the quote character that opened the value */
    char c;

    while ((c = *status->p_src) != '\0') {
        status->p_src++;
        if ((!in_quote) && (c == '>')) {
            break;
        }
        if (c == '<') {
            te_puts(status, "&lt;");
        } else if (c == '>') {
            te_puts(status, "&gt;");
        } else {
            if (in_quote) {
                if (c == in_quote) {
                    in_quote = 0;
                }
            } else if ((c == '\"') || (c == '\'')) {
                in_quote = c;
            }
            te_putc(status, c);
        }
    }
}

/* True when c ends an attribute name: end of input, '=', '>' or whitespace. */
static int te_is_attr_delim(char c)
{
    return (c == '\0') || (c == '=') || (c == '>')
        || isspace((unsigned char)c);
}

/*
 * Parse the attribute list of an <a> tag, up to and including '>'.
 * Only href and name (case-insensitive) are written to the output.
 */
static void te_parse_a(te_status *status)
{
    char elmname[BUFSIZE];
    char value[BUFSIZE];

    while (1) {
        te_skip_space(status);
        if (*status->p_src == '\0') {
            return;
        }
        if (*status->p_src == '>') {
            status->p_src++;
            return;
        }

        te_parse_name(status, elmname, BUFSIZE - 1);
        if (!te_is_attr_delim(*status->p_src)) {
            /*
             * The token contains characters te_parse_name does not accept
             * ("data-href", "/", a stray quote, ...) or is too long.
             * Drop the whole token. This also guarantees that every
             * iteration consumes input, so the loop always terminates.
             */
            elmname[0] = '\0';
            while (!te_is_attr_delim(*status->p_src)) {
                status->p_src++;
            }
        }

        value[0] = '\0';
        te_skip_space(status);
        if (*status->p_src == '=') {
            status->p_src++;
            te_skip_space(status);
            te_parse_elmvalue(status, value, BUFSIZE - 1);
        }

        if ((strcasecmp(elmname, "href") == 0)
            || (strcasecmp(elmname, "name") == 0)) {
            te_putc(status, ' ');
            te_puts(status, elmname);
            if (value[0]) {
                te_putc(status, '=');
                te_puts(status, value);
            }
        }
    }
}

/*
 * Handle one tag; the read cursor is just past its '<'.
 * Whitelisted tags are re-emitted in cleaned form, anything else is escaped.
 */
static void te_parse_tag(te_status *status)
{
    char buf[BUFSIZE];

    te_skip_space(status);
    te_parse_tagname(status, buf, BUFSIZE - 1);

    if (strcasecmp(buf, "a") == 0) {
        te_putc(status, '<');
        te_puts(status, buf);
        te_parse_a(status);
        te_putc(status, '>');
    } else if ((strcasecmp(buf, "br") == 0)
               || (strcasecmp(buf, "/a") == 0)
               || (strcasecmp(buf, "strong") == 0)
               || (strcasecmp(buf, "/strong") == 0)) {
        te_putc(status, '<');
        te_puts(status, buf);
        if (te_skip_tagbody(status)) {
            te_puts(status, " /");
        }
        te_putc(status, '>');
    } else {
        te_puts(status, "&lt;");
        te_puts(status, buf);
        te_copy_tagbody(status);
        te_puts(status, "&gt;");
    }
}

/* Copy plain text to the output, handing every '<' to te_parse_tag. */
static void te_parse_plain(te_status *status)
{
    char c;

    while ((status->size > 0) && ((c = *status->p_src) != '\0')) {
        status->p_src++;
        if (c == '<') {
            te_parse_tag(status);
        } else {
            te_putc(status, c);
        }
    }
}

/*
 * Filter the NUL-terminated HTML fragment `src` into `dest`.
 *
 * dest  output buffer; must have room for size + 1 bytes, because the
 *       terminating NUL is written after at most `size` characters
 * src   input text; NULL is treated as an empty string
 * size  maximum number of characters to write, excluding the NUL;
 *       output that does not fit is silently truncated
 */
void tagescape(char *dest, const char *src, int size)
{
    te_status status_buf;

    if (dest == NULL) {
        return;
    }
    status_buf.p_src = (src != NULL) ? src : "";
    status_buf.p_dest = dest;
    status_buf.size = size;
    te_parse_plain(&status_buf);
    *(status_buf.p_dest) = '\0';
}

/* test */
char sample_input1[] = "<a href='www.google.com'>link</a> <blink>and</blink> <strong onClick='alert(\"NG\")'>click<br/>me!</strong>'";
char sample_input2[] = "<script foo=\"<script>alert('bar')</script>\">alert('foo')</script>";
char sample_input3[] = "<script foo=\"<a href='link'>link</a>\">alert('foo')</script>";
char sample_input4[] = "<a href='www.g>oogle.com'>link</a>";
char outbuf[MAXDATASIZE];

/*
 * Demo driver. Define TAGESCAPE_NO_MAIN to leave it out when the filter is
 * linked into another program or a test harness.
 */
#ifndef TAGESCAPE_NO_MAIN
int main(void)
{
    tagescape(outbuf, sample_input1, MAXDATASIZE - 1);
    puts(outbuf);
    tagescape(outbuf, sample_input2, MAXDATASIZE - 1);
    puts(outbuf);
    tagescape(outbuf, sample_input3, MAXDATASIZE - 1);
    puts(outbuf);
    tagescape(outbuf, sample_input4, MAXDATASIZE - 1);
    puts(outbuf);
    return 0;
}
#endif /* TAGESCAPE_NO_MAIN */
コメント
コメントの投稿
トラックバック
https://emasaka.blog.fc2.com/tb.php/286-4b891635