fc2ブログ

本を読む

読書やコンピュータなどに関するメモ

どう書く?org:一部のHTMLタグを通すフィルタ

 LL、HTMLパーサー、正規表現を禁じて、わりと素のCで書いてみる。

 結論。LL、ライブラリ、パーサー、正規表現は偉大だ。

#include <stdio.h>
#include <string.h>
#include <strings.h>
#include <ctype.h>

/* Size in bytes of the global output buffer used by the test driver. */
#define MAXDATASIZE     (1024 * 1024)
/* Size in bytes of the scratch buffers that hold one tag name, attribute
 * name or attribute value while a tag is being parsed. */
#define BUFSIZE         1024

/*
 * Conversion state threaded through every te_* helper: a read cursor
 * into the input, a write cursor into the output, and the number of
 * characters that may still be written.
 */
typedef struct te_status {
    char *p_src;    /* next unread character of the NUL-terminated input */
    char *p_dest;   /* next free position in the output buffer */
    int size;       /* characters that may still be stored through p_dest */
} te_status;

/*
 * Append one character to the output.
 *
 * When no capacity is left (status->size <= 0) the character is
 * silently dropped, so callers never have to check for overflow.
 */
static void te_putc(te_status *status, char c)
{
    if (status->size <= 0) {
        return;
    }
    *status->p_dest = c;
    status->p_dest++;
    status->size--;
}

/*
 * Append a NUL-terminated string to the output, one character at a
 * time through te_putc(), so the output capacity is respected and the
 * string is simply truncated when the output is full.
 *
 * str is only read, so it is const-qualified; callers pass string
 * literals as well as writable buffers.
 */
static void te_puts(te_status *status, const char *str)
{
    for (; *str != '\0'; str++) {
        te_putc(status, *str);
    }
}

/*
 * Advance the input cursor past any run of whitespace.
 *
 * The byte is converted to unsigned char before it is handed to
 * isspace(): passing a negative char (any byte >= 0x80 where char is
 * signed, e.g. multi-byte text) is undefined behaviour.  isspace('\0')
 * is false, so the loop also stops at the end of the input.
 */
static void te_skip_space(te_status *status)
{
    while (isspace((unsigned char)*(status->p_src))) {
        status->p_src++;
    }
}

/*
 * Copy a run of alphanumeric characters from the input into p.
 *
 * At most `size` characters are copied and a terminating NUL is always
 * written after them, so p must have room for size + 1 bytes.  The
 * input cursor is left on the first character that was not copied.
 *
 * The byte is converted to unsigned char before it is handed to
 * isalnum(): passing a negative char is undefined behaviour.
 * isalnum('\0') is false, so the loop also stops at the end of input.
 */
static void te_parse_name(te_status *status, char *p, int size)
{
    while ((size > 0) && isalnum((unsigned char)*(status->p_src))) {
        *p++ = *(status->p_src++);
        size--;
    }
    *p = '\0';
}

/*
 * Read a tag name into p: an optional leading '/' (closing tag)
 * followed by whatever te_parse_name() accepts.  The '/' counts
 * against `size` like any other stored character.
 */
static void te_parse_tagname(te_status *status, char *p, int size)
{
    int is_closing = (size > 0) && (status->p_src[0] == '/');

    if (is_closing) {
        *p = '/';
        p++;
        size--;
        status->p_src++;
    }
    te_parse_name(status, p, size);
}

/*
 * Read a quoted attribute value, including both quote characters, into p.
 *
 * On entry the input cursor must be on the opening quote (' or ").
 * The input is consumed up to and including the matching closing
 * quote, or up to the end of input when the value is unterminated.
 *
 * At most `size` characters are stored, followed by a terminating NUL,
 * so p must have room for size + 1 bytes.  '<' and '>' inside the
 * value are stored as "%3C" and "%3E".
 *
 * Safety properties:
 *  - A backslash does not escape the closing quote.  HTML attribute
 *    values have no backslash escapes, so treating \' as "still inside
 *    the value" would let an input such as  href='\' onmouseover=...
 *    pass extra attributes through the filter.
 *  - The stored value always ends with the closing quote, even when the
 *    input is unterminated or the value had to be truncated; otherwise
 *    the emitted tag would contain an open quote that swallows the
 *    markup following it.
 *  - When a '<' or '>' does not fit in its escaped form, the value is
 *    truncated there instead of storing the raw character.
 */
static void te_parse_quote(te_status *status, char *p, int size)
{
    static const char hexdigits[] = "0123456789ABCDEF";
    const char quote = *(status->p_src);
    const int store = (size >= 2);  /* room for at least both quotes? */
    int room = size - 2;            /* payload capacity between the quotes */
    char c;

    if (quote == '\0') {
        /* Nothing to read; never step past the end of the input. */
        *p = '\0';
        return;
    }

    status->p_src++;
    if (store) {
        *p++ = quote;
    }

    while ((c = *(status->p_src)) != '\0') {
        status->p_src++;
        if (c == quote) {
            break;
        }
        if ((c == '<') || (c == '>')) {
            if (room >= 3) {
                unsigned char uc = (unsigned char)c;

                *p++ = '%';
                *p++ = hexdigits[uc >> 4];
                *p++ = hexdigits[uc & 0x0F];
                room -= 3;
            } else {
                /* No room for the escape: truncate the value here. */
                room = 0;
            }
        } else if (room > 0) {
            *p++ = c;
            room--;
        }
        /* Characters that do not fit are consumed but not stored. */
    }

    if (store) {
        *p++ = quote;
    }
    *p = '\0';
}

/*
 * Read one attribute value into p.
 *
 * A value that starts with ' or " is delegated to te_parse_quote().
 * Otherwise characters are copied until whitespace, '>' or the end of
 * input.  At most `size` characters are copied and a terminating NUL
 * is always written, so p must have room for size + 1 bytes.
 *
 * The byte is converted to unsigned char before it is handed to
 * isspace(): passing a negative char is undefined behaviour.
 */
static void te_parse_elmvalue(te_status *status, char *p, int size)
{
    char c = *(status->p_src);

    if ((c == '\'') || (c == '\"')) {
        te_parse_quote(status, p, size);
        return;
    }

    while ((size > 0) && (c != '\0') && (c != '>') &&
           (! isspace((unsigned char)c))) {
        *p++ = c;
        size--;
        c = *(++status->p_src);
    }
    *p = '\0';
}

/*
 * Discard the rest of the current tag, up to and including the next
 * '>'.
 *
 * Returns 1 when the character immediately before that '>' was '/'
 * (self-closing form such as <br/>), 0 otherwise.
 *
 * When the input ends before a '>' is found, the cursor is left ON the
 * terminating NUL.  Advancing past it would make the caller's next read
 * run off the end of the input string.
 */
static int te_skip_tagbody(te_status *status)
{
    int slash_flag = 0;
    char c;

    while ((c = *(status->p_src)) != '\0') {
        status->p_src++;
        if (c == '>') {
            break;
        }
        slash_flag = (c == '/') ? 1 : 0;
    }
    return slash_flag;
}

/*
 * Copy the rest of a non-whitelisted tag to the output as inert text.
 *
 * Characters are copied up to (not including) the '>' that ends the
 * tag; that '>' is consumed.  A '>' inside a quoted attribute value
 * does not end the tag.  Every '<' is written as "&lt;" and every '>'
 * inside quotes as "&gt;".
 *
 * Quote tracking: a quote character opens a quoted run only when no
 * run is open, and only the same character closes it.  A ' inside a
 * "..." value must not change the state.
 *
 * When the input ends before the closing '>', the cursor is left ON
 * the terminating NUL so the caller's next read stays inside the input.
 */
static void te_copy_tagbody(te_status *status)
{
    char in_quote = 0;      /* 0, or the quote character currently open */
    char c;

    while ((c = *(status->p_src)) != '\0') {
        status->p_src++;
        if ((! in_quote) && (c == '>')) {
            break;
        }
        if (c == '<') {
            te_puts(status, "&lt;");
        } else if (c == '>') {
            te_puts(status, "&gt;");
        } else {
            if (in_quote) {
                if (c == in_quote) {
                    in_quote = 0;
                }
            } else if ((c == '\"') || (c == '\'')) {
                in_quote = c;
            }
            te_putc(status, c);
        }
    }
}

/*
 * Parse the attribute list of an <a ...> tag and emit only the
 * whitelisted attributes (href and name, case-insensitive), each
 * preceded by a single space.  Parsing stops after the closing '>'
 * (which is consumed) or at the end of input.
 *
 * Every pass through the loop must consume at least one input
 * character.  A character that is neither whitespace, '>', '=', nor
 * part of a name (for example the '/' in "<a />" or the '-' in
 * "<a href-x>") is consumed by none of the helpers, so it is skipped
 * explicitly; otherwise the loop would never terminate.
 *
 * NOTE(review): the href value is passed through without inspecting
 * its scheme, so "javascript:" URLs are not filtered here.
 */
static void te_parse_a(te_status *status)
{
    char elmname[BUFSIZE];
    char value[BUFSIZE];

    for (;;) {
        char *start;

        te_skip_space(status);
        if (*(status->p_src) == '\0') {
            return;
        }
        if (*(status->p_src) == '>') {
            status->p_src++;
            return;
        }

        start = status->p_src;
        te_parse_name(status, elmname, BUFSIZE - 1);
        value[0] = '\0';
        te_skip_space(status);
        if (*(status->p_src) == '=') {
            status->p_src++;
            te_skip_space(status);
            te_parse_elmvalue(status, value, BUFSIZE - 1);
        }

        if (status->p_src == start) {
            /* Stray character nobody consumed: drop it and go on. */
            status->p_src++;
            continue;
        }

        if ((strcasecmp(elmname, "href") == 0) ||
            (strcasecmp(elmname, "name") == 0) ) {
            te_putc(status, ' ');
            te_puts(status, elmname);
            if (value[0]) {
                te_putc(status, '=');
                te_puts(status, value);
            }
        }
    }
}

/*
 * Handle one tag; the input cursor is just past its '<'.
 *
 *  - <a ...>: emitted with only the attributes te_parse_a() lets through.
 *  - <br>, </a>, <strong>, </strong>: emitted with all attributes
 *    dropped; " /" is kept when te_skip_tagbody() reports a
 *    self-closing tag.
 *  - anything else: emitted as text, wrapped in "&lt;" ... "&gt;".
 *
 * Tag names are compared case-insensitively and echoed as written.
 */
static void te_parse_tag(te_status *status)
{
    static const char *const bare_tags[] = {
        "br", "/a", "strong", "/strong"
    };
    char tagname[BUFSIZE];
    unsigned int i;
    int is_bare = 0;

    te_skip_space(status);
    te_parse_tagname(status, tagname, BUFSIZE - 1);

    if (strcasecmp(tagname, "a") == 0) {
        te_putc(status, '<');
        te_puts(status, tagname);
        te_parse_a(status);
        te_putc(status, '>');
        return;
    }

    for (i = 0; i < sizeof bare_tags / sizeof bare_tags[0]; i++) {
        if (strcasecmp(tagname, bare_tags[i]) == 0) {
            is_bare = 1;
            break;
        }
    }

    if (! is_bare) {
        te_puts(status, "&lt;");
        te_puts(status, tagname);
        te_copy_tagbody(status);
        te_puts(status, "&gt;");
        return;
    }

    te_putc(status, '<');
    te_puts(status, tagname);
    if (te_skip_tagbody(status)) {
        te_puts(status, " /");
    }
    te_putc(status, '>');
}

/*
 * Top-level loop: copy ordinary text to the output unchanged and hand
 * every '<' to te_parse_tag().  Stops at the end of the input or as
 * soon as the output has no capacity left.
 */
static void te_parse_plain(te_status *status)
{
    for (;;) {
        char ch = *(status->p_src++);

        if ((ch == '\0') || (status->size <= 0)) {
            break;
        }
        if (ch != '<') {
            te_putc(status, ch);
            continue;
        }
        te_parse_tag(status);
    }
}

/*
 * Filter the NUL-terminated HTML text in src into dest.
 *
 * At most `size` characters are written, followed by a terminating
 * NUL, so dest must have room for size + 1 bytes.
 */
void tagescape(char *dest, char *src, int size)
{
    te_status st;

    st.size = size;
    st.p_dest = dest;
    st.p_src = src;

    te_parse_plain(&st);
    st.p_dest[0] = '\0';
}


/* test */

/* Whitelisted tags (a, strong, br), a non-whitelisted tag (blink) and
 * an onClick attribute on a whitelisted tag. */
char sample_input1[] = "<a href='www.google.com'>link</a> <blink>and</blink> <strong onClick='alert(\"NG\")'>click<br/>me!</strong>'";

/* Non-whitelisted tag whose quoted attribute value contains tag-like text. */
char sample_input2[] = "<script foo=\"<script>alert('bar')</script>\">alert('foo')</script>";

/* Non-whitelisted tag whose quoted attribute value contains an anchor tag. */
char sample_input3[] = "<script foo=\"<a href='link'>link</a>\">alert('foo')</script>";

/* Anchor whose quoted href value contains a '>'. */
char sample_input4[] = "<a href='www.g>oogle.com'>link</a>";

/* Output buffer shared by all test runs. */
char outbuf[MAXDATASIZE];

/*
 * Test driver: run every sample input through tagescape() and print
 * each result on its own line.
 */
int main()
{
    char *samples[] = {
        sample_input1, sample_input2, sample_input3, sample_input4
    };
    unsigned int i;

    for (i = 0; i < sizeof samples / sizeof samples[0]; i++) {
        tagescape(outbuf, samples[i], MAXDATASIZE - 1);
        puts(outbuf);
    }
    return 0;
}

コメント

コメントの投稿

管理者にだけ表示を許可する

トラックバック

https://emasaka.blog.fc2.com/tb.php/286-4b891635

 | HOME | 

Categories

Recent Entries

Recent Comments

Recent Trackbacks

Appendix

emasaka

emasaka

フリーター。
連絡先はこのへん

Monthly