入力文字列自身でマーク付けに使う文字を指定する #3
とりあえず書いてみた。
前述したように、コマンドラインオプションでメタ文字を変更する版の記事の#5のparser.yの末尾に、
以下の字句解析関数yylexのコードを追加しているだけである。
...snip static char *estrdup(const char *s); #ifndef YYTEXT_INIT_SIZE #define YYTEXT_INIT_SIZE 8192 #endif /* YYTEXT_INIT_SIZE */ #if YYTEXT_INIT_SIZE < 2 #error YYTEXT_INIT_SIZE must be greater than 1 #endif int yylex(void) { static size_t yytext_size = YYTEXT_INIT_SIZE; static char *yytext = NULL; char *p; int c; if (! yytext && ! (yytext = malloc(yytext_size))) { fputs("error: yylex cannot allocate any memories\n", stderr); exit(1); } p = yytext; c = getchar(); if (c == current_start_mark_char) { do { int c2 = getchar(); if (c2 == current_start_mark_char) { int c3 = getchar(); if (c3 == EOF) { fputs("error: yylex detected unexpected EOF\n", stderr); exit(1); } current_start_mark_char = c3; } else if (c2 == current_end_mark_char) { int c3 = getchar(); if (c3 == EOF) { fputs("error: yylex detected unexpected EOF\n", stderr); exit(1); } current_end_mark_char = c3; } else if (c2 == current_start_annotation_char) { int c3 = getchar(); if (c3 == EOF) { fputs("error: yylex detected unexpected EOF\n", stderr); exit(1); } current_start_annotation_char = c3; } else { ungetc(c2, stdin); return START_MARK; } } while ((c = getchar()) == current_start_mark_char); } if (c == EOF) return c; if (c == current_end_mark_char) return END_MARK; if (c == current_start_annotation_char) { yytext[0] = current_start_annotation_char, yytext[1] = '\0'; yylval = estrdup(yytext); return START_ANNOTATION; } while (1) { *p++ = c; if (p == yytext + yytext_size) { size_t s = yytext_size * 2; char *q = realloc(yytext, s); if (q == NULL) { fputs("error: yylex cannot allocate any memories, because token is too long.\n", stderr); exit(1); } yytext = q, p = yytext + yytext_size, yytext_size = s; } c = getchar(); if (c == current_start_mark_char) { do { int c2 = getchar(); if (c2 == current_start_mark_char) { int c3 = getchar(); if (c3 == EOF) { fputs("error: yylex detected unexpected EOF\n", stderr); exit(1); } current_start_mark_char = c3; } else if (c2 == current_end_mark_char) { int c3 = getchar(); if (c3 == EOF) { fputs("error: yylex detected unexpected EOF\n", stderr); exit(1); } current_end_mark_char = c3; } else if (c2 == current_start_annotation_char) { int c3 = getchar(); if (c3 == EOF) { fputs("error: yylex detected unexpected EOF\n", stderr); exit(1); } current_start_annotation_char = c3; } else { ungetc(c2, stdin); break; } } while ((c = getchar()) == current_start_mark_char); } if (c == EOF || c == current_start_mark_char || c == current_end_mark_char || c == current_start_annotation_char) { ungetc(c, stdin); *p = '\0'; yylval = estrdup(yytext); return STRING_EXCEPT_SPECIAL; } } } static char *estrdup(const char *s) { char *p = malloc(strlen(s) + 1); if (p == NULL) { fputs("error: estrdup cannot allocate any memories.\n", stderr); exit(1); } return strcpy(p, s); }
同じような処理が散らばっていて見苦しいが、
$ echo -n '<html lang="ja"><body><img src="foo.jpg" alt="description"></body></html>' | ./parser -s"<" -e">" -a" " marked-string [html]: annotation[lang="ja"] marked-string [body] marked-string [img]: annotation[src="foo.jpg" alt="description"] marked-string [/body] marked-string [/html] $ echo -n '()>(: ((<<html lang="ja"><body><img src="foo.jpg" alt="description"></body></html>' | ./parser marked-string [html]: annotation[lang="ja"] marked-string [body] marked-string [img]: annotation[src="foo.jpg" alt="description"] marked-string [/body] marked-string [/html] $ echo -n '((<<)><: <html lang="ja"><body><img src="foo.jpg" alt="description"></body></html>' | ./parser marked-string [html]: annotation[lang="ja"] marked-string [body] marked-string [img]: annotation[src="foo.jpg" alt="description"] marked-string [/body] marked-string [/html]
なんだか動いているようである。
$ echo -n "(({{({{({)" | ./parser marked-string [({]
こんなのもうまく解析できているようである。