入力文字列自身でマーク付けに使う文字を指定する #3

とりあえず書いてみた。
前述したように、コマンドラインオプションでメタ文字を変更する版の記事の#5のparser.yの末尾に、
以下の字句解析関数yylexのコードを追加しているだけである。

...snip

/* Heap-duplicates a string; exits the program if the allocation fails. */
static char *estrdup(const char *s);

/*
 * Initial capacity, in bytes, of the token buffer used by yylex.
 * May be overridden at build time (e.g. -DYYTEXT_INIT_SIZE=...).
 * yylex stores a one-character token plus its terminating NUL in the
 * buffer without growing it, so the capacity must be at least 2.
 */
#if !defined(YYTEXT_INIT_SIZE)
#  define YYTEXT_INIT_SIZE 8192
#endif
#if YYTEXT_INIT_SIZE <= 1
#  error YYTEXT_INIT_SIZE must be greater than 1
#endif

/*
 * Read the replacement character of a metacharacter-redefinition directive
 * from stdin.  Prints a diagnostic and exits if the input ends first.
 */
static int lex_read_replacement(void)
{
    int c = getchar();
    if (c == EOF) {
        fputs("error: yylex detected unexpected EOF\n", stderr);
        exit(1);
    }
    return c;
}

/*
 * Process the input that follows a start-mark character that has just been
 * read from stdin.
 *
 * A start-mark character followed by one of the three current metacharacters
 * is a redefinition directive: the character after that pair becomes the new
 * value of the metacharacter that was named.  Consecutive directives are
 * consumed in a row.
 *
 * Returns 1 if the start-mark character is followed by an ordinary character
 * (or EOF), i.e. it really opens a marked string; the following character is
 * pushed back onto stdin.  Returns 0 if only directives were consumed; *next
 * then receives the first character read after them (possibly EOF), which is
 * guaranteed not to be the current start-mark character.
 */
static int lex_after_start_mark(int *next)
{
    int c;
    do {
        int c2 = getchar();
        if (c2 == current_start_mark_char) {
            current_start_mark_char = lex_read_replacement();
        } else if (c2 == current_end_mark_char) {
            current_end_mark_char = lex_read_replacement();
        } else if (c2 == current_start_annotation_char) {
            current_start_annotation_char = lex_read_replacement();
        } else {
            if (c2 != EOF) {
                ungetc(c2, stdin);
            }
            return 1;
        }
    } while ((c = getchar()) == current_start_mark_char);
    *next = c;
    return 0;
}

/*
 * Lexical analyzer: reads stdin and returns the next token.
 *
 * Returns START_MARK, END_MARK, START_ANNOTATION (yylval = the annotation
 * character as a string), STRING_EXCEPT_SPECIAL (yylval = the run of ordinary
 * characters), or EOF at end of input.  Strings stored in yylval are
 * allocated with estrdup.  Metacharacter-redefinition directives embedded in
 * the input are applied silently and produce no token.
 *
 * Exits the program on allocation failure or on a truncated directive.
 */
int yylex(void)
{
    static size_t yytext_size = YYTEXT_INIT_SIZE;
    static char *yytext = NULL;
    /*
     * Set when a string token was terminated by a start mark.  The start
     * mark is reported on the next call instead of being pushed back:
     * the character after it already occupies the single pushback slot
     * that ISO C guarantees for ungetc.
     */
    static int start_mark_pending = 0;
    char *p;
    int c;

    if (start_mark_pending) {
        start_mark_pending = 0;
        return START_MARK;
    }
    if (! yytext && ! (yytext = malloc(yytext_size))) {
        fputs("error: yylex cannot allocate any memories\n", stderr);
        exit(1);
    }
    p = yytext;
    c = getchar();
    if (c == current_start_mark_char && lex_after_start_mark(&c)) {
        return START_MARK;
    }
    if (c == EOF) return c;
    if (c == current_end_mark_char) return END_MARK;
    if (c == current_start_annotation_char) {
        yytext[0] = current_start_annotation_char, yytext[1] = '\0';
        yylval = estrdup(yytext);
        return START_ANNOTATION;
    }
    while (1) {
        *p++ = c;
        /* Grow as soon as the buffer is full so there is always room for NUL. */
        if (p == yytext + yytext_size) {
            size_t s = yytext_size * 2;
            char *q = NULL;
            /* Refuse to grow if doubling would wrap around size_t. */
            if (yytext_size <= (size_t)-1 / 2) {
                q = realloc(yytext, s);
            }
            if (q == NULL) {
                fputs("error: yylex cannot allocate any memories, because token is too long.\n", stderr);
                exit(1);
            }
            yytext = q, p = yytext + yytext_size, yytext_size = s;
        }
        c = getchar();
        if (c == current_start_mark_char && lex_after_start_mark(&c)) {
            /* A real start mark ends the string; report it on the next call. */
            start_mark_pending = 1;
            *p = '\0';
            yylval = estrdup(yytext);
            return STRING_EXCEPT_SPECIAL;
        }
        if (c == EOF || c == current_end_mark_char || c == current_start_annotation_char) {
            /* Leave the terminator for the next call (EOF cannot be pushed back). */
            if (c != EOF) {
                ungetc(c, stdin);
            }
            *p = '\0';
            yylval = estrdup(yytext);
            return STRING_EXCEPT_SPECIAL;
        }
    }
}

/*
 * Return a newly malloc'ed copy of the NUL-terminated string s.
 * If the allocation fails, a diagnostic is written to stderr and the
 * program exits with status 1, so the result is never NULL.
 */
static char *estrdup(const char *s)
{
    size_t nbytes = strlen(s) + 1;  /* payload plus terminating NUL */
    char *copy = malloc(nbytes);

    if (!copy) {
        fputs("error: estrdup cannot allocate any memories.\n", stderr);
        exit(1);
    }
    memcpy(copy, s, nbytes);
    return copy;
}

同じような処理が散らばっていて見苦しいが、実行してみると以下のようになる。

$ echo -n '<html lang="ja"><body><img src="foo.jpg" alt="description"></body></html>' | ./parser -s"<" -e">" -a" "
marked-string [html]: annotation[lang="ja"]
marked-string [body]
marked-string [img]: annotation[src="foo.jpg" alt="description"]
marked-string [/body]
marked-string [/html]

$ echo -n '()>(: ((<<html lang="ja"><body><img src="foo.jpg" alt="description"></body></html>' | ./parser
marked-string [html]: annotation[lang="ja"]
marked-string [body]
marked-string [img]: annotation[src="foo.jpg" alt="description"]
marked-string [/body]
marked-string [/html]

$ echo -n '((<<)><: <html lang="ja"><body><img src="foo.jpg" alt="description"></body></html>' | ./parser
marked-string [html]: annotation[lang="ja"]
marked-string [body]
marked-string [img]: annotation[src="foo.jpg" alt="description"]
marked-string [/body]
marked-string [/html]

なんだか動いているようである。

$ echo -n "(({{({{({)" | ./parser
marked-string [({]

こんなのもうまく解析できているようである。