単純なマーク付けがなされた文字列の解析 #12

#10のプログラムから#11のプログラムへは手動で書き換えた。
試験データでは同様に動いているようだが、本当にちゃんと書き換えられているのだろうか。
というわけで、#10のプログラムから状態遷移表を作成してみて、
#11のアクション関数やtable等と突き合わせてみよう。

この状態遷移表の作成は間違いの無いように自動化して人間が介在しない方がいいだろう。
ということで、gvpr（え？）のありえない使い方をする。

gentable.g

BEGIN {
  if (ARGC != 1) exit(0);
  int fd = openF(ARGV[0], 'r');
  string s, state, c, next_state, action;
  string states[], cs[], list[];
  graph_t g = graph('', 'U');
  node_t items[];
  int nstates = 0, ncs = 0, nlist = 0, i, j, found;
  while (1) {
    s = readL(fd);
    if (s == '') break;
    if (match(s, '*([ \t])switch*([ \t])\\(state\\)') >= 0) {
      while (1) {
        s = readL(fd);
        if (match(s, '*([ \t])case+([ \t])') >= 0) {
          state = sub(sub(s, '*([ \t])case+([ \t])'), '*([ \t]):*');
          states[nstates++] = state;
          printf(2, '%s\n', state);
          while (1) {
            s = readL(fd);
            if (match(s, '*([ \t])switch*([ \t])\\(c\\)') >= 0) {
              while (1) {
                s = readL(fd);
                int in_case_or_default = 0;
                if (match(s, '*([ \t])case+([ \t])') >= 0) {
                  in_case_or_default = 1;
                  c = sub(sub(s, '*([ \t])case+([ \t])'), '*([ \t]):*');
                } else if (match(s, '*([ \t])default*([ \t]):') >= 0) {
                  c = 'default';
                  in_case_or_default = -1;
                }
                if (in_case_or_default != 0) {
                  found = -1;
                  for (cs[i]) {
                    if (length(cs[i]) == length(c) && index(cs[i], c) == 0) {
                      found = i;
                      break;
                    }
                  }
                  if (found < 0) {
                    cs[ncs++] = c;
                  }
                  printf(2, '    %s\n', c);
                  next_state = state;
                  action = '';
                  while (1) {
                    s = readL(fd);
                    if (match(s, '*([ \t])break*([ \t]);') >= 0) {
                      if (action == '') action = 'null action\n';
                      action = gsub(action, '\n', '\\l');
                      found = -1;
                      for (list[i]) {
                        if (length(list[i]) == length(action) && index(list[i], action) == 0) {
                          found = i;
                          break;
                        }
                      }
                      if (found < 0) {
                        found = nlist;
                        list[nlist++] = action;
                      }
                      printf(2, '        next: %s\n', next_state);
                      printf(2, '        action: %d : %s\n', found, list[found]);
                      items[c] = node(g, c);
                      aset(items[c], state, sprintf('%d, %s', found, next_state));
                      break;
                    } else if (match(s, '*([ \t])state*([ \t])=') >= 0) {
                      next_state = sub(sub(s, '*([ \t])state*([ \t])=*([ \t])'), '*([ \t]);*');
                    } else {
                      action += sub(s, '*([ \t])');
                    }
                  }
                  if (in_case_or_default < 0) break;
                }
              }
              while (1) {
                s = readL(fd);
                if (match(s, '*([ \t])break*([ \t]);') >= 0) {
                  break;
                }
              }
              break;
            }
          }
        }
        if (match(s, '*([ \t])default*([ \t]):') >= 0) break;
      }
    }
  }
  closeF(fd);
  printf(2, 'action list\n');
  for (list[i]) printf(2, '    %d : %s\n', i, list[i]);
  graph_t out = graph('', 'U');
  out.label = 'the table of (action, next_state)';
  out.labelloc = 't';
  node_t n = node(out, '1');
  n.shape = 'record';
  n.label = '{state \\\\ c';
  for (states[i]) n.label += '|' + states[i] + '\\l';
  n.label += '}';
  for (cs[i]) {
    n.label += '|{' + cs[i];
    for (states[j]) {
      n.label += '|' + aget(items[cs[i]], states[j]) + '\\l';
    }
    n.label += '}';
  }
  n = node(out, '2');
  n.shape = 'plaintext';
  n.label = 'action\\n';
  for (list[i]) n.label += sprintf('%d : %s', i, list[i]);
  write(out);
}

もっさりしていて長い。
ひどく冗長なところがあるくせに特定の形式しか受け付けなかったりでかなりいい加減である。
特定の形式というのは、switch文のcase群の最後がdefaultであること前提であったりとかである。
まず大まかなデザインだけ頭の中で描いて、あとは上から下へ後戻りなしでコーディングしていった結果である。
同じ結果を得るもっと短くまとまったコードを作れるはずだが、
やりたい目的を考えると、#10のコードを目と手で変換した#11のコードと、
#10のコードから別の手段で情報を抽出した結果との突合せをするためなので、
とりあえずこれでいいだろう。

gvprの-aオプションで#10のプログラムソースを指定すると、ソースを解析して、
標準出力にDOT形式で状態遷移表のグラフ？を出力し、
標準エラーに解析の途中経過等を出力する。
したがって、標準出力の方をdotコマンドに渡すと状態遷移表をレンダリングできる。

$ gvpr -f gentable.g -a parser.c | dot -T png -o stt.png
START
    '('
        next: HEAD
        action: 0 : null action\l
    ')'
        next: ERROR
        action: 1 : eprintf("illegal right parenthesis");\l
    EOF
        next: ERROR
        action: 2 : eprintf("unexpected EOF");\l
    default
        next: STRING
        action: 3 : printf("string [%c", c);\l
RESTART
    '('
        next: HEAD
        action: 0 : null action\l
    ')'
        next: ERROR
        action: 1 : eprintf("illegal right parenthesis");\l
    EOF
        next: ACCEPT
        action: 0 : null action\l
    default
        next: STRING
        action: 3 : printf("string [%c", c);\l
STRING
    '('
        next: HEAD
        action: 4 : puts("]");\l
    ')'
        next: ERROR
        action: 5 : neprintf("illegal right parenthesis");\l
    EOF
        next: ACCEPT
        action: 4 : puts("]");\l
    default
        next: STRING
        action: 6 : putchar(c);\l
HEAD
    '('
        next: ERROR
        action: 7 : eprintf("illegal left parenthesis");\l
    ')'
        next: ERROR
        action: 8 : eprintf("null string between parentheses");\l
    EOF
        next: ERROR
        action: 2 : eprintf("unexpected EOF");\l
    default
        next: BODY
        action: 9 : printf("marked-string [%c", c);\l
BODY
    '('
        next: ERROR
        action: 10 : neprintf("illegal left parenthesis");\l
    ')'
        next: RESTART
        action: 4 : puts("]");\l
    EOF
        next: ERROR
        action: 11 : neprintf("unexpected EOF");\l
    default
        next: BODY
        action: 6 : putchar(c);\l
action list
    0 : null action\l
    1 : eprintf("illegal right parenthesis");\l
    2 : eprintf("unexpected EOF");\l
    3 : printf("string [%c", c);\l
    4 : puts("]");\l
    5 : neprintf("illegal right parenthesis");\l
    6 : putchar(c);\l
    7 : eprintf("illegal left parenthesis");\l
    8 : eprintf("null string between parentheses");\l
    9 : printf("marked-string [%c", c);\l
    10 : neprintf("illegal left parenthesis");\l
    11 : neprintf("unexpected EOF");\l

同じアクションはアクションリストでは一つにまとまるようにしている。
状態遷移表の各項目は現在の状態stateと入力cで決まるアクション（のリスト番号）と次の状態である。
gvprではこういう妙なこともできるのだ。しかも図にするまでの操作がGraphvizだけで閉じている。
いや、やるにしても、もっと作りやすいスクリプト言語を使うべきであることは分かっているが……

この状態遷移表と#11のコードとの突き合わせだが、
DEFINE_ACTIONマクロで定義したアクションの数が13個であるのに対して、
状態遷移表のアクションの数が12個であるのは、
遷移表のリストでは一つにまとめてしまう同じ動作を行うアクションを、
end_string関数とend_marked_string関数の二つの名前で定義したからである。
二つは同じ動作だが、#11ではstringの終りとmarked-stringの終りという意味の違いで二つにしてある。
その点も考慮して、tableについては遷移表と合致しているようだ。
ということで、#10を手動変換した#11はたぶん等価なのだろう。

大体において元々この流れは順番が間違っている。
状態遷移表がまずあって分かりやすく可視化した遷移図を作ったりコーディングに供するとか、
遷移図で大まかに設計し、抜けがないことを確認しつつ遷移表を完成させてからコーディングするとかが普通の順序で、
最後に状態遷移表を作るのは何だか逆だなあと思う。
とりあえずgvprもそれなりに汎用性のあるスクリプト言語だということを確認したかっただけなのかも。
それでもまあC言語のソースをフィルタしてDOTファイルを吐き出すのに使うとかは……