マーク付けに使う文字の変更 #2

構文規則を元にyacc/bison用ソースを書いてみた。

parser.y
%{
/* Every semantic value (token or nonterminal) in this grammar is a char pointer. */
#define YYSTYPE char*

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>

/* Lexer and error reporter used by the generated parser; both are defined
 * in the section after the grammar rules. */
int yylex(void);
void yyerror(char const *);

/* String-copy helpers that terminate the program when malloc fails
 * (defined in the section after the grammar rules). */
static char *estrdupcat(const char *s1, const char *s2);
static char *estrdup(const char *s);
%}

/* Token kinds returned by yylex.  STRING_EXCEPT_SPECIAL and START_ANNOTATION
 * come with a malloc'ed string in yylval; START_MARK and END_MARK are
 * returned without yylval being set. */
%token STRING_EXCEPT_SPECIAL START_MARK END_MARK START_ANNOTATION

%%

/* Start symbol (first rule of the grammar): the whole input begins either
 * with a plain string or with a marked string.  There is no empty
 * alternative. */
text :
      string_headed
    | marked_string_headed
    ;

/* A plain string, optionally followed by a sequence that starts with a
 * marked string.  The string is printed and released as soon as it is
 * complete; in the second alternative this is a mid-rule action, so it runs
 * before the following marked_string_headed is parsed. */
string_headed :
      string_except_parenthesis { printf("string [%s]\n", $1); free($1); }
    | string_except_parenthesis { printf("string [%s]\n", $1); free($1); } marked_string_headed
    ;

/* A marked string, optionally followed by more input that starts with
 * either another marked string or a plain string. */
marked_string_headed :
      marked_string
    | marked_string marked_string_headed
    | marked_string string_headed
    ;

/* A marked string: a body (with optional annotation) enclosed between the
 * START_MARK and END_MARK tokens.  Output is produced by the body rule. */
marked_string :
      START_MARK marked_string_body_with_annotation END_MARK
    ;

/* Contents of a marked string: a single STRING_EXCEPT_SPECIAL token, or that
 * token followed by START_ANNOTATION and an annotation string.  Each action
 * prints the result and frees the strings it consumed. */
marked_string_body_with_annotation :
      STRING_EXCEPT_SPECIAL
        {
            printf("marked-string [%s]\n", $1); free($1);
        }
    | STRING_EXCEPT_SPECIAL START_ANNOTATION string_except_parenthesis
        {
            printf("marked-string [%s]: annotation[%s]\n", $1, $3);
            /* $2 is the START_ANNOTATION text, which yylex also allocated
             * with estrdup, so it is freed together with $1 and $3. */
            free($1); free($2); free($3);
        }
    ;

/* One or more STRING_EXCEPT_SPECIAL / START_ANNOTATION tokens concatenated
 * into a single newly allocated string ($$).  The annotation character is
 * thus treated as ordinary text here.  Every action frees the pieces it
 * consumed after copying them, so only $$ stays allocated. */
string_except_parenthesis :
      STRING_EXCEPT_SPECIAL
        {
            /* First piece: copy it into a fresh string. */
            $$ = estrdupcat(NULL, $1);
            free($1);
        }
    | START_ANNOTATION
        {
            /* First piece is the annotation character itself. */
            $$ = estrdupcat(NULL, $1);
            free($1);
        }
    | string_except_parenthesis STRING_EXCEPT_SPECIAL
        {
            /* Append the next piece to what has been collected so far. */
            $$ = estrdupcat($1, $2);
            free($1); free($2);
        }
    | string_except_parenthesis START_ANNOTATION
        {
            /* Append an annotation character to what has been collected. */
            $$ = estrdupcat($1, $2);
            free($1); free($2);
        }
    ;

%%

/* Meta characters that yylex compares each input character against.
 * These are the defaults; main overwrites them from the -s, -e and -a
 * command-line options. */
static int start_mark_char = '(';        /* returned as START_MARK (-s) */
static int end_mark_char = ')';          /* returned as END_MARK (-e) */
static int start_annotation_char = ':';  /* returned as START_ANNOTATION (-a) */

/*
 * Derives a marker character from a command-line option argument.
 *
 * arg      - the option argument (optarg)
 * what     - name of the marker, used in warning messages
 * fallback - value returned when arg is empty
 *
 * The first byte of arg is converted through unsigned char so that it lies
 * in the same range as the values getchar() returns; a plain (possibly
 * signed) char >= 0x80 would become negative, never match any input
 * character, and 0xFF could even be confused with EOF.
 * An empty argument has no first character, so a warning is printed and
 * fallback is returned instead of silently using '\0' as the marker.
 * Arguments longer than one character produce a warning and only their
 * first byte is used.
 */
static int mark_char_from_arg(const char *arg, const char *what, int fallback)
{
    if (arg[0] == '\0') {
        fprintf(stderr, "warning: %s requires one character, but an empty string was given; keeping the current one\n", what);
        return fallback;
    }
    if (arg[1] != '\0') {
        fprintf(stderr, "warning: %s requires one character: %s\n", what, arg);
    }
    return (unsigned char)arg[0];
}

/*
 * Program entry point.
 *
 * Options:
 *   -s C  use C as the start mark
 *   -e C  use C as the end mark
 *   -a C  use C as the start-annotation character
 * Any other option prints a usage message to stderr and exits with status 1.
 *
 * Returns the value of yyparse().
 */
int main(int argc, char *argv[])
{
    int opt;
    while ((opt = getopt(argc, argv, "s:e:a:")) != -1) {
        switch (opt) {
        case 's':
            start_mark_char = mark_char_from_arg(optarg, "start mark", start_mark_char);
            break;
        case 'e':
            end_mark_char = mark_char_from_arg(optarg, "end mark", end_mark_char);
            break;
        case 'a':
            start_annotation_char = mark_char_from_arg(optarg, "start annotation", start_annotation_char);
            break;
        default:
            fprintf(stderr, "usage: %s [-s start_mark] [-e end_mark] [-a start_annotation]\n", argv[0]);
            exit(1);
        }
    }
    return yyparse();
}

/*
 * Error reporting function declared for the generated parser.
 * Writes the message s followed by a newline to stderr.
 */
void yyerror(char const *s)
{
    fputs(s, stderr);
    fputc('\n', stderr);
}

/*
 * Returns a newly malloc'ed string consisting of s1 followed by s2.
 *
 * s1 may be NULL, in which case the result is just a copy of s2.
 * s2 must be a valid string.
 * On allocation failure an error message is written to stderr and the
 * program exits with status 1, so the return value is never NULL.
 * The caller is responsible for freeing the result.
 */
static char *estrdupcat(const char *s1, const char *s2)
{
    size_t head_len = 0;
    size_t tail_len = strlen(s2);
    char *joined;

    if (s1 != NULL) {
        head_len = strlen(s1);
    }

    joined = malloc(head_len + tail_len + 1);
    if (joined == NULL) {
        fputs("error: estrdupcat cannot allocate any memories.\n", stderr);
        exit(1);
    }

    if (head_len > 0) {
        memcpy(joined, s1, head_len);
    }
    /* tail_len + 1 also copies s2's terminating NUL. */
    memcpy(joined + head_len, s2, tail_len + 1);
    return joined;
}

/*
 * Lexer for the generated parser: reads stdin and returns the next token.
 *
 * Returns
 *   EOF                    at end of input,
 *   START_MARK / END_MARK  for the start / end mark character
 *                          (yylval is not set),
 *   START_ANNOTATION       for the annotation character; yylval is a newly
 *                          allocated one-character string,
 *   STRING_EXCEPT_SPECIAL  for a run of one or more other characters;
 *                          yylval is a newly allocated copy of the run.
 *
 * The run is collected in a static buffer that is allocated on the first
 * call and doubled whenever it fills up.  Allocation failures print a
 * message to stderr and exit with status 1.
 */
int yylex(void)
{
    enum { INITIAL_CAPACITY = 8192 };
    static char *buf = NULL;
    static size_t cap = INITIAL_CAPACITY;
    size_t len = 0;
    int ch;

    if (buf == NULL) {
        buf = malloc(INITIAL_CAPACITY);
        if (buf == NULL) {
            fputs("error: yylex cannot allocate any memories\n", stderr);
            exit(1);
        }
    }

    ch = getchar();
    if (ch == EOF) {
        return ch;
    }
    if (ch == start_mark_char) {
        return START_MARK;
    }
    if (ch == end_mark_char) {
        return END_MARK;
    }
    if (ch == start_annotation_char) {
        buf[0] = start_annotation_char;
        buf[1] = '\0';
        yylval = estrdup(buf);
        return START_ANNOTATION;
    }

    /* Ordinary character: gather it and everything up to the next special
     * character or end of input. */
    for (;;) {
        buf[len++] = ch;
        if (len == cap) {
            /* Buffer full: double it so there is always room for the
             * terminating NUL written after the loop. */
            size_t grown = cap * 2;
            char *bigger = realloc(buf, grown);
            if (bigger == NULL) {
                fputs("error: yylex cannot allocate any memories, because token is too long.\n", stderr);
                exit(1);
            }
            buf = bigger;
            cap = grown;
        }
        ch = getchar();
        if (ch == EOF || ch == start_mark_char || ch == end_mark_char || ch == start_annotation_char) {
            break;
        }
    }

    /* Push the terminating character back so the next call sees it. */
    ungetc(ch, stdin);
    buf[len] = '\0';
    yylval = estrdup(buf);
    return STRING_EXCEPT_SPECIAL;
}

/*
 * Returns a newly malloc'ed copy of the string s.
 *
 * On allocation failure an error message is written to stderr and the
 * program exits with status 1, so the return value is never NULL.
 * The caller is responsible for freeing the result.
 */
static char *estrdup(const char *s)
{
    size_t nbytes = strlen(s) + 1;  /* includes the terminating NUL */
    char *copy = malloc(nbytes);

    if (!copy) {
        fputs("error: estrdup cannot allocate any memories.\n", stderr);
        exit(1);
    }
    memcpy(copy, s, nbytes);
    return copy;
}

このyylex関数は字句を溜めるバッファが一杯になるたびにreallocで2倍に拡張するので、扱える字句の長さはメモリの許す限りとなる。
コマンドラインオプションの解析にはgetopt関数を使っている。
オプションでメタ文字を指定しない場合は、今までのメタ文字がデフォルトで設定される。

$ echo -n "a (dog:bowwow) and a (cat:mewmew)" | ./parser
string [a ]
marked-string [dog]: annotation[bowwow]
string [ and a ]
marked-string [cat]: annotation[mewmew]

$ echo -n "a {dog:bowwow} and a {cat:mewmew}" | ./parser
string [a {dog:bowwow} and a {cat:mewmew}]

$ echo -n "a {dog:bowwow} and a {cat:mewmew}" | ./parser -s"{" -e"}"
string [a ]
marked-string [dog]: annotation[bowwow]
string [ and a ]
marked-string [cat]: annotation[mewmew]

$ echo -n "a {dog=bowwow} and a {cat=mewmew}" | ./parser -s"{" -e"}" -a"="
string [a ]
marked-string [dog]: annotation[bowwow]
string [ and a ]
marked-string [cat]: annotation[mewmew]

変な文字指定をすれば、

$ echo -n "a (dog:bowwow) and a (cat:mewmew)" | ./parser -e":" -aa
string [a ]
marked-string [dog]
string [bowwow) and a ]
marked-string [c]: annotation[t]
string [mewmew)]

のような変だが正しい結果も出てくる。