マーク付き文字列解析器の整理 #5 - 借りてきた猫のように静か

mclex_test2.cはyylex関数の骨格をそのままにmclexに適応させたものなので、
parser.yのyylex関数をmclex_test2.cの形に書き直す。
二つ目の%%より前にある宣言部と構文規則部については、元のparser.yから一切変更していない。
ただし、元のソースはこれまでの記事のあちこちで...snipによって切り刻まれ、断片化している。
そこで、利便のためにここに全文出しておく。

parser.y

%{
/* Every semantic value (yylval, $$, $n) in this grammar is a C string pointer. */
#define YYSTYPE char*

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>

/* Lexer called by the generated parser; defined after the grammar. */
int yylex(void);
/* Error reporter called by the generated parser; defined after the grammar. */
void yyerror(char const *);

/* Returns a newly allocated concatenation of s1 (may be NULL) and s2;
   used by the grammar actions below. */
static char *estrdupcat(const char *s1, const char *s2);
%}

/* Token kinds produced by yylex.  yylex sets yylval (a heap string) only for
   STRING_EXCEPT_SPECIAL and START_ANNOTATION; START_MARK and END_MARK carry
   no semantic value. */
%token STRING_EXCEPT_SPECIAL START_MARK END_MARK START_ANNOTATION

%%

/* First rule of the grammar (no %start is declared, so this is the start
   symbol): the whole input begins either with plain text or with a marked
   string. */
text :
      string_headed
    | marked_string_headed
    ;

/* Input that begins with a run of plain text, optionally followed by input
   that begins with a marked string.  The text ($1) is printed and freed in
   both alternatives; in the second one this is a mid-rule action, so it runs
   before marked_string_headed is parsed and output stays in input order. */
string_headed :
      string_except_parenthesis { printf("string [%s]\n", $1); free($1); }
    | string_except_parenthesis { printf("string [%s]\n", $1); free($1); } marked_string_headed
    ;

/* Input that begins with a marked string.  After it the input may end,
   continue with another marked string, or continue with plain text. */
marked_string_headed :
      marked_string
    | marked_string marked_string_headed
    | marked_string string_headed
    ;

/* A marked string: a body, with an optional annotation, enclosed between a
   START_MARK token and an END_MARK token.  Neither mark carries a semantic
   value, so nothing is freed here. */
marked_string :
      START_MARK marked_string_body_with_annotation END_MARK
    ;

/* Contents of a marked string: the marked text alone, or the marked text
   followed by START_ANNOTATION and the annotation text.  Each action prints
   the result and releases every string it received. */
marked_string_body_with_annotation :
      STRING_EXCEPT_SPECIAL
        {
            /* $1 was allocated by yylex (estrdup). */
            printf("marked-string [%s]\n", $1); free($1);
        }
    | STRING_EXCEPT_SPECIAL START_ANNOTATION string_except_parenthesis
        {
            printf("marked-string [%s]: annotation[%s]\n", $1, $3);
            /* $1 and $2 come from yylex (estrdup); $2 is the one-character
               annotation marker string and is not printed.  $3 was built by
               estrdupcat in string_except_parenthesis. */
            free($1); free($2); free($3);
        }
    ;

/* A run of one or more STRING_EXCEPT_SPECIAL / START_ANNOTATION tokens,
   concatenated left to right into a single string.  A START_ANNOTATION token
   appearing here is treated as ordinary text.  In every alternative $$ is a
   fresh heap string from estrdupcat and the operand strings are freed, so
   the rule that consumes $$ owns exactly one allocation. */
string_except_parenthesis :
      STRING_EXCEPT_SPECIAL
        {
            /* First piece: copy it (estrdupcat with a NULL prefix). */
            $$ = estrdupcat(NULL, $1);
            free($1);
        }
    | START_ANNOTATION
        {
            /* First piece is the annotation marker's text. */
            $$ = estrdupcat(NULL, $1);
            free($1);
        }
    | string_except_parenthesis STRING_EXCEPT_SPECIAL
        {
            /* Append the next piece to the text accumulated so far. */
            $$ = estrdupcat($1, $2);
            free($1); free($2);
        }
    | string_except_parenthesis START_ANNOTATION
        {
            /* Append the annotation marker's text to the accumulated text. */
            $$ = estrdupcat($1, $2);
            free($1); free($2);
        }
    ;

%%

#include "mclex.h"

/*
 * Return the meta character given as the argument of a command-line option.
 *
 * arg  - the option argument (optarg); must not be NULL.
 * what - human-readable name of the meta character, used in diagnostics.
 *
 * An empty argument is rejected with an error (exit status 1): it would
 * otherwise silently select the NUL character as a meta character.  An
 * argument longer than one character only produces a warning and its first
 * character is used.
 */
static int option_meta_char(const char *arg, const char *what)
{
    if (arg[0] == '\0') {
        fprintf(stderr, "error: %s requires one character\n", what);
        exit(1);
    }
    if (arg[1] != '\0') {
        fprintf(stderr, "warning: %s requires one character: %s\n", what, arg);
    }
    return arg[0];
}

/*
 * Program entry point.
 *
 * Parses the options -s, -e and -a, which override the start mark, end mark
 * and annotation start characters (defaults '(', ')' and ':'), checks that
 * the three characters are pairwise different, hands them to mcinit() and
 * runs the parser.  Returns the result of yyparse(); exits with status 1 on
 * a usage error.
 */
int main(int argc, char *argv[])
{
    int start_mark = '(';
    int end_mark = ')';
    int start_annotation = ':';
    int opt;

    while ((opt = getopt(argc, argv, "s:e:a:")) != -1) {
        switch (opt) {
        case 's':
            start_mark = option_meta_char(optarg, "start mark");
            break;
        case 'e':
            end_mark = option_meta_char(optarg, "end mark");
            break;
        case 'a':
            start_annotation = option_meta_char(optarg, "start annotation");
            break;
        default:
            fprintf(stderr, "usage: %s [-s start_mark] [-e end_mark] [-a start_annotation]\n", argv[0]);
            exit(1);
        }
    }

    /* The lexer cannot tell the meta characters apart if any two coincide. */
    if (start_mark == end_mark || end_mark == start_annotation || start_annotation == start_mark) {
        fprintf(stderr, "error: meta characters must differ from each other\n");
        exit(1);
    }

    mcinit(start_mark, end_mark, start_annotation);
    return yyparse();
}

/*
 * Error callback for the generated parser: writes the message s followed by
 * a newline to stderr.
 */
void yyerror(char const *s)
{
    fputs(s, stderr);
    fputc('\n', stderr);
}

/*
 * Build a newly allocated string holding s1 followed by s2.
 *
 * s1 may be NULL, in which case the result is simply a copy of s2.
 * s2 must not be NULL.  The caller owns the result and must free() it.
 * On allocation failure a message is written to stderr and the program
 * exits with status 1, so the function never returns NULL.
 */
static char *estrdupcat(const char *s1, const char *s2)
{
    size_t head_len = 0;
    size_t tail_len;
    char *joined;

    if (s1 != NULL) {
        head_len = strlen(s1);
    }
    tail_len = strlen(s2);

    joined = malloc(head_len + tail_len + 1);
    if (joined == NULL) {
        fputs("error: estrdupcat cannot allocate any memories.\n", stderr);
        exit(1);
    }

    if (s1 != NULL) {
        memcpy(joined, s1, head_len);
    }
    /* tail_len + 1 also copies the terminating NUL of s2. */
    memcpy(joined + head_len, s2, tail_len + 1);
    return joined;
}

/* Returns a newly allocated copy of s; defined after yylex. */
static char *estrdup(const char *s);

/* Initial size in bytes of yylex's token buffer; yylex doubles the buffer
   whenever it fills up.  Can be overridden at compile time. */
#ifndef YYTEXT_INIT_SIZE
#define YYTEXT_INIT_SIZE 8192
#endif /* YYTEXT_INIT_SIZE */
/* yylex stores one character plus a terminating NUL in the buffer without
   checking its size, so at least two bytes are required. */
#if YYTEXT_INIT_SIZE < 2
#error YYTEXT_INIT_SIZE must be greater than 1
#endif

/*
 * Lexer for the generated parser, built on top of mclex().
 *
 * Returns:
 *   - START_MARK, END_MARK or EOF exactly as mclex() reported them
 *     (yylval is left untouched);
 *   - START_ANNOTATION, with yylval set to a one-character string made
 *     from mclval;
 *   - STRING_EXCEPT_SPECIAL, with yylval set to the run of values read up
 *     to, but not including, the next of the four values above, which is
 *     pushed back with mcunput().
 *
 * yylval strings are allocated with estrdup(); the grammar actions free
 * them.  The working buffer is a function-local static that is allocated
 * on the first call and doubled whenever it fills up.  On allocation
 * failure a message is written to stderr and the program exits with
 * status 1.
 */
int yylex(void)
{
    static size_t yytext_size = YYTEXT_INIT_SIZE;
    static char *yytext = NULL;
    size_t used = 0;
    int tok;

    if (yytext == NULL) {
        yytext = malloc(yytext_size);
        if (yytext == NULL) {
            fputs("error: yylex cannot allocate any memories\n", stderr);
            exit(1);
        }
    }

    tok = mclex();

    if (tok == START_ANNOTATION) {
        /* The annotation marker carries its character as a string. */
        yytext[0] = mclval;
        yytext[1] = '\0';
        yylval = estrdup(yytext);
        return tok;
    }
    if (tok == EOF || tok == START_MARK || tok == END_MARK) {
        return tok;
    }

    /* Anything else starts a run of ordinary text. */
    do {
        yytext[used++] = tok;
        if (used == yytext_size) {
            /* Grow now so there is always room for the terminating NUL. */
            size_t doubled = yytext_size * 2;
            char *grown = realloc(yytext, doubled);
            if (grown == NULL) {
                fputs("error: yylex cannot allocate any memories, because token is too long.\n", stderr);
                exit(1);
            }
            yytext = grown;
            yytext_size = doubled;
        }
        tok = mclex();
    } while (tok != EOF && tok != START_MARK && tok != END_MARK && tok != START_ANNOTATION);

    /* The value that ended the run belongs to the next call. */
    mcunput(tok);
    yytext[used] = '\0';
    yylval = estrdup(yytext);
    return STRING_EXCEPT_SPECIAL;
}

/*
 * Return a newly allocated copy of the string s (s must not be NULL).
 *
 * The caller owns the result and must free() it.  On allocation failure a
 * message is written to stderr and the program exits with status 1, so the
 * function never returns NULL.
 */
static char *estrdup(const char *s)
{
    size_t bytes = strlen(s) + 1;   /* includes the terminating NUL */
    char *copy = malloc(bytes);

    if (copy == NULL) {
        fputs("error: estrdup cannot allocate any memories.\n", stderr);
        exit(1);
    }
    memcpy(copy, s, bytes);
    return copy;
}

メタ文字をyylex関数とやり取りするためのcurrent_start_mark_char等の三個のグローバル変数については、
mclex.cでメタ文字の割り当てをプライベートに管理するように変更したので不要になった。
そのため、main関数でコマンドラインオプションの解析結果をメタ文字の設定に反映するためにmcinit関数を使う。
yylex関数は上で述べたようにmclex_test2.cの骨格そのままに近い。
トークンを検出する度に構文解析器へそれを返すので、
全文字列を解析しつくさなければならないmclex_test2.cよりも簡単な構造になっているくらいだ。
また、元のparser.yのyylex関数と比べてかなりコンパクトになっている。
処理の一部を追い出したmclex.cと込みで考えても、元のものよりスマートになって見通しがよくなった。

$ bison -dy parser.y

$ gcc -std=c89 -pedantic -Wall -Wextra -o parser y.tab.c mclex.c -s

$ echo -n "a(b" | ./parser
string [a]
marked-string [b]
syntax error

$ echo -n "this symbol string ((''(''(''()')('):parens and quotes) is analysable" | ./parser
string [this symbol string ]
marked-string [('')]: annotation[parens and quotes]
string [ is analysable]

$ echo -n "this symbol string ()>((<<(''):parens and quotes> is analysable" | ./parser
string [this symbol string ]
marked-string [('')]: annotation[parens and quotes]
string [ is analysable]

問題なく実行できた。
メタ文字変更シーケンスとコマンドラインオプションの併用もできる。
その場合、コマンドラインオプションによる変更は、入力文字列の先頭で行われたものとして扱われる。

$ echo -n "ab[cd=ef=gh[[([i]jk" | ./parser -s"[" -e"]" -a"="
string [ab]
marked-string [cd]: annotation[ef=gh[i]
string [jk]

は、

$ echo -n "(([[)][:=ab[cd=ef=gh[[([i]jk" | ./parser
string [ab]
marked-string [cd]: annotation[ef=gh[i]
string [jk]

と等価である。