動的ダブル配列を実装してみた - Negative/Positive Thinking

はじめに

形態素解析器などで利用されている「ダブル配列(double array)」の勉強するため、実装をしてみた。
動作を確認するために書いただけなので重い。。。

ダブル配列とは

文字列の検索をO(1)で行えるTrie(トライ木)の効率的なデータ構造
2つの配列BASEとCHECKで木の遷移を表現する
あるノードsにいるときに、次の文字がaだったとき、以下を満たす
- 次のノード : $t = BASE[s]+N(a)$
- 現在のノード :
  - ただし、N(a)は文字aを表す整数(内部コード)

トライ木の遷移の(2次元)配列構造を圧縮したものに相当
- 2次元配列構造とは、縦をノード番号、横を文字にしたとき次のノード番号を保持する
- この構造では、かなり疎になる(無駄が多い)→圧縮！！！

コード

以下の論文の前半部分を参考に改良なしの動的ダブル配列を実装してみた
- http://ci.nii.ac.jp/naid/110002725995
- 追加するときに、配列サイズに線形な処理時間のかかる関数があるので大きくなると非常に遅くなる
  - 論文では、未使用要素リスト法、不要ノード削除法、未使用要素圧縮法を提案している

#include <iostream>
#include <vector>
#include <map>

class double_array {
  std::map<char,int> N;
  std::vector<int> base, check;
  int N_MAX;
  //FORWARD関数 : ノードsから文字aで遷移したときの次のノード番号を返す
  int forward(int s, char a){
    if(N.count(a)==0) N[a] = N_MAX++;
    int t = base[s] + N[a];
    if(t < base.size()){
      if(check[t]==s) return t;
    }
    return 0;
  }
  //W_BASE関数 : 配列BASEのindexの値をvalにする
  void w_base(int index, int val){
    while(index>=base.size()){ base.push_back(0); }
    base[index] = val;
  }
  //W_CHECK関数 : 配列CHECKのindexの値をvalにする
  void w_check(int index, int val){
    while(index>=check.size()){ check.push_back(0); }
    check[index] = val;
  }
  //GET_LABEL関数 : indexからでているすべてのラベル集合を返す
  std::vector<char> get_label(int index){
    std::vector<char> ret;
    std::map<char,int>::iterator it = N.begin();
    while(it != N.end()){
      char a = (*it).first;
      int t = forward(index, a);
      if(t != 0) ret.push_back(a);
      ++it;
    }
    return ret;
  }
  //X_CHECK関数 : すべてのラベルが追加できるindexを返す
  int x_check(std::vector<char> A, char b){
    if(b!='\0') A.push_back(b);
    bool flg;
    int q = 1, t;
    while(q < base.size()){ //O(n)!!!
      flg = true;
      for(int i=0; i<A.size(); i++){
	if(N.count(A[i])==0) N[A[i]] = N_MAX++;
	t = q + N[A[i]];
	if(t<check.size() && check[t] != 0){ flg = false; break; }
      }
      if(flg) return q;
      ++q;
    }
    return q;
  }
  //MODIFY関数 : 衝突を回避できるノードに移し替える
  void modify(int index, std::vector<char> R, char b){
    int oldbase = base[index], old_t, t;
    w_base(index, x_check(R,b));
    for(int i=0; i<R.size(); i++){
      if(N.count(R[i])==0) N[R[i]] = N_MAX++;
      t = base[index] + N[R[i]];
      w_check(t,index);
      old_t = oldbase + N[R[i]];
      if(base[old_t] > 0){
	for(int q=1; q<check.size(); q++){
	  if(check[q]==old_t){
	    w_check(q, t);
	  }
	}
      }
      w_base(t, base[old_t]);
    }
    w_base(old_t, 0);
    w_check(old_t, 0);
  }
  //INSERT関数 : 指定のindex以降に文字列Xのpos以降の文字列を追加する
  void insert(int index, const std::string &X, int pos){
    char a = X[pos];
    if(N.count(a)==0) N[a] = N_MAX++;
    int t = base[index] + N[a], newbase;
    if(t < check.size()){
      if(check[t]>0){ //衝突
	std::vector<char> R = get_label(index);
	modify(index, R, a);
      }
    }
    t = base[index] + N[a];
    w_check(t, index);
    if(X[pos]=='#'){
      w_base(t,-1);
    }else{
      index = t;
      pos++;
      while(pos<X.size()){
	std::vector<char> R;
	newbase = x_check(R,X[pos]);
	w_base(index, newbase);
	if(N.count(X[pos])==0) N[X[pos]] = N_MAX++;
	t = base[index] + N[X[pos]];
	w_check(t, index);
	if(X[pos]=='#'){
	  w_base(t,-1);
	  break;
	}
	index = t;
	pos++;
      }
    }
  }
  //DELETE関数 : 指定indexを未使用にする
  void del(int index){
    w_base(index, 0);
    w_check(index, 0);
  }
public:
  double_array(){
    N_MAX = 1;
    base.push_back(-1);
    base.push_back(1);
    check.push_back(-1);
    check.push_back(1);
  }
  //要素の検索
  int trie_search(std::string X){
    X += "#"; //終端文字
    int index = 1, pos = 0, t;
    while(true){
      t = forward(index, X[pos]);
      std::cout << "search: " << X[pos] << " " << t << std::endl;
      if(t==0) break; //見つからなかった
      if(base[t]<0) return index; //見つかった
      if(X[pos]=='#') break;
      index = t;
      pos++;
    }
    return -1;
  }
  //要素の追加
  void add(std::string X){
    X += "#"; //終端文字
    int index = 1, pos = 0, t;
    while(true){
      t = forward(index, X[pos]);
      if(t==0){
	insert(index, X, pos);
	return;
      }
      if(X[pos]=='#') break;
      index = t;
      pos++;
    }
  }    
  //要素の削除
  int erase(std::string X){
    X += "#"; //終端文字
    int index = 1, pos = 0, t;
    while(true){
      t = forward(index, X[pos]);
      std::cout << "search: " << X[pos] << " " << t << std::endl;
      if(t==0) break; //見つからなかった
      if(base[t]<0){ //見つかった
	del(index);
	return 1;
      }
      if(X[pos]=='#') break;
      index = t;
      pos++;
    }
    return 0;
  }

  //内部状態のDEBUG出力
  void debug(){
    std::cout << "=====================================" << std::endl;
    std::cout << "BASE: ";
    for(int i=1; i<base.size(); i++){
      std::cout << base[i] << " ";
    }
    std::cout << std::endl;
    std::cout << "CHECK:";
    for(int i=1; i<check.size(); i++){
      std::cout << check[i] << " ";
    }
    std::cout << std::endl;
    std::cout << "CODE:" << std::endl;
    std::map<char,int>::iterator it = N.begin();
    while(it != N.end()){
      char a = (*it).first;
      std::cout << a << " : " << N[a] << std::endl;
      ++it;
    }
    int num_unuse = 0;
    for(int i=1; i<check.size(); i++){
      if(check[i]==0) num_unuse++;
    }
    std::cout << "num of unused element : " << num_unuse << std::endl;
    std::cout << "=====================================" << std::endl;
  }
};

int main(){
  using namespace std;
  double_array da;

  //追加
  da.add("bachelor");
  da.add("back");
  da.add("badge");
  da.add("badger");
  da.add("beach");
  da.add("beta");
  da.add("bevel");

  //内部状態の表示
  da.debug();

  //検索結果
  cout << da.trie_search("hoge") << endl;
  cout << da.trie_search("bachelor") << endl;
  cout << da.trie_search("back") << endl;
  cout << da.trie_search("badge") << endl;
  cout << da.trie_search("badger") << endl;
  cout << da.trie_search("beach") << endl;
  cout << da.trie_search("beta") << endl;
  cout << da.trie_search("bevel") << endl;

  return 0;
}

(できる限り論文にあわせたつもりだけど、間違ってるかもしれない...)
メモリやキャッシュ、計算量の効率化が行ってある以下のようなライブラリを使うのがいい
- darts
- darts-clone
- TinyDA
- DynDA
- Doar
- DasTrie
- dda
- など
http://d.hatena.ne.jp/nokuno/20101211/1292059102

実行結果

上記の実行結果

$ time ./a.out
=====================================
BASE: 1 17 16 16 1 1 1 1 1 -1 3 -1 1 10 9 -1 7 -1 10 14 2 22 -1 1 -1 21 19 -1 0 0 0 0 0 0 1 21 
CHECK:1 1 35 24 13 5 6 7 8 9 13 11 19 21 14 17 15 15 2 4 19 2 20 22 3 36 26 27 0 0 0 0 0 0 22 22 
CODE:
# : 9
a : 2
b : 1
c : 3
d : 11
e : 5
g : 12
h : 4
k : 10
l : 6
o : 7
r : 8
t : 13
v : 14
num of unused element : 6
=====================================
search: h 0
-1
search: b 2
search: a 19
search: c 13
search: h 5
search: e 6
search: l 7
search: o 8
search: r 9
search: # 10
9
search: b 2
search: a 19
search: c 13
search: k 11
search: # 12
11
search: b 2
search: a 19
search: d 21
search: g 14
search: e 15
search: # 18
15
search: b 2
search: a 19
search: d 21
search: g 14
search: e 15
search: r 17
search: # 16
17
search: b 2
search: e 22
search: a 24
search: c 4
search: h 20
search: # 23
20
search: b 2
search: e 22
search: t 35
search: a 3
search: # 25
3
search: b 2
search: e 22
search: v 36
search: e 26
search: l 27
search: # 28
27

8文字のランダムな文字列5000個を追加した結果

# ソート済みランダム文字列5000個を持つ辞書dic.txt
$ head dic.txt
002IiwSM
008ZJpS9
00Gx1Pwn
00O8X8iJ
00bvOWAv
01P2EZdN
01UHYDZN
01j35vrW
02uu1Jl6
0382TE54
$ time ./a.out
..................................................
=====================================
num of all element : 69146
num of unused element : 13
=====================================
./a.out  46.09s user 0.02s system 99% cpu 46.105 total