diff 周りの処理
きつねのしっぽで使っている diff まわりを公開してみます。元コードを作成した id:siokoshou さんへのフィードバックと感謝を込めて。どうもありがとうございました!
using System; using System.Collections.Generic; using System.Text.RegularExpressions; using System.Diagnostics; // http://d.hatena.ne.jp/siokoshou/20070315#p1 namespace FoxTails.Text { public enum EditType: byte { Nothing, Add, Delete } [Serializable] public class DiffResult { public readonly EditType Type; public readonly string Text; public readonly int Line; public DiffResult(EditType type, int line, string text) { this.Type = type; this.Line = line; this.Text = text; } public override string ToString() { return string.Format("{0}: {1}, {2}", Line, Type, Text); } } public class FastDiff { private readonly int[] dataA, dataB; private readonly string[] textA, textB; private readonly bool isSwap; private Snake[] fp; private static readonly Regex crlf = new Regex("\r\n|\r|\n", RegexOptions.Compiled); private FastDiff(string[] textA, string[] textB) { if (textA.Length <= textB.Length) { this.isSwap = false; this.textA = textA; this.textB = textB; } else { this.isSwap = true; this.textA = textB; this.textB = textA; } this.dataA = MakeHash(this.textA); this.dataB = MakeHash(this.textB); } /// <summary>複数行の文字列を行単位で比較します</summary> /// <param name="textA">元テキスト</param> /// <param name="textB">変更テキスト</param> /// <returns>比較結果</returns> public static DiffResult[] Diff(string textA, string textB) { if (textA == null || textB == null) { throw new ArgumentNullException(); } if (textA == textB) { return new DiffResult[0]; } if (textA == string.Empty) { return CreateSimpleResult(EditType.Add, textB); } if (textB == string.Empty) { return CreateSimpleResult(EditType.Delete, textA); } return new FastDiff(SplitLine(textA), SplitLine(textB)).DetectDiff(); } private static DiffResult[] CreateSimpleResult(EditType type, string text) { string[] line = SplitLine(text); DiffResult[] result = new DiffResult[line.Length]; for (int i = 0; i < line.Length; i++) { result[i] = new DiffResult(type, i, line[i]); } return result; } private static string[] SplitLine(string text) { return crlf.Split(text); } private static int[] MakeHash(string[] text) { List<int> list = new List<int>(); foreach (string line in text) { list.Add(MakeHash(line)); } return list.ToArray(); } private static int MakeHash(string line) { const int seed = 5381; int hash = seed; foreach (char c in line) { hash = ((hash << 5) ^ (hash >> 27)) ^ c; } return hash; } private DiffResult[] DetectDiff() { Debug.Assert(this.dataA.Length <= this.dataB.Length); int DELTA = dataB.Length - dataA.Length; this.fp = new Snake[dataA.Length + dataB.Length + 3]; int p = 0; do { for (int k = -p; k < DELTA; k++) { SearchSnake(k); } for (int k = DELTA + p; k >= DELTA; k--) { SearchSnake(k); } p++; } while (this.fp[dataB.Length + 1].posB != (dataB.Length + 1)); // 末尾検出用のCS CommonSubsequence endCS = new CommonSubsequence(this.dataA.Length, this.dataB.Length, 0, this.fp[this.dataB.Length + 1].CS); CommonSubsequence result = CommonSubsequence.Reverse(endCS); if (this.isSwap) return PresentDiffSwap(result); else return PresentDiff(result); } private void SearchSnake(int k) { int kk = this.dataA.Length + 1 + k; CommonSubsequence previousCS = null; int posA = 0, posB = 0; int lk = kk - 1; int rk = kk + 1; // 論文のfp[n]は-1始まりだが、0始まりのほうが初期化の都合がよいため、 // +1のゲタを履かせる。fpから読む際は-1し、書く際は+1する。 int lb = this.fp[lk].posB; int rb = this.fp[rk].posB - 1; if (lb > rb) { posB = lb; previousCS = this.fp[lk].CS; } else { posB = rb; previousCS = this.fp[rk].CS; } posA = posB - k; int startA = posA; int startB = posB; while ((posA < dataA.Length) && (posB < dataB.Length) && (dataA[posA] == dataB[posB]) && textA[posA] == textB[posB]) { posA++; posB++; } if (startA != posA) { this.fp[kk].CS = new CommonSubsequence(startA, startB, posA - startA, previousCS); } else { this.fp[kk].CS = previousCS; } this.fp[kk].posB = posB + 1; // fpへ+1して書く。論文のfpに+1のゲタを履かせる。 } private DiffResult[] PresentDiff(CommonSubsequence cs) { List<DiffResult> list = new List<DiffResult>(); int originalStart = 0, modifiedStart = 0; while (true) { if (originalStart < cs.StartA || modifiedStart < cs.StartB) { for (int line = originalStart; line < cs.StartA; line++) { list.Add(new DiffResult(EditType.Delete, line, this.textA[line])); } for (int line = modifiedStart; line < cs.StartB; line++) { list.Add(new DiffResult(EditType.Add, line, this.textB[line])); } } // 末尾検出 if (cs.Length == 0) break; originalStart = cs.StartA + cs.Length; modifiedStart = cs.StartB + cs.Length; cs = cs.Next; } return list.ToArray(); } private DiffResult[] PresentDiffSwap(CommonSubsequence cs) { List<DiffResult> list = new List<DiffResult>(); int originalStart = 0, modifiedStart = 0; while (true) { if (originalStart < cs.StartB || modifiedStart < cs.StartA) { for (int line = originalStart; line < cs.StartB; line++) { list.Add(new DiffResult(EditType.Delete, line, this.textB[line])); } for (int line = modifiedStart; line < cs.StartA; line++) { list.Add(new DiffResult(EditType.Add, line, this.textA[line])); } } // 末尾検出 if (cs.Length == 0) break; originalStart = cs.StartB + cs.Length; modifiedStart = cs.StartA + cs.Length; cs = cs.Next; } return list.ToArray(); } private struct Snake { public int posB; public CommonSubsequence CS; public override string ToString() { return "posB:" + this.posB + ", CS:" + ((this.CS == null) ? "null" : "exist"); } } private class CommonSubsequence { private int startA_, startB_; private int length_; public CommonSubsequence Next; public int StartA { get { return this.startA_; } } public int StartB { get { return this.startB_; } } public int Length { get { return this.length_; } } public CommonSubsequence() { } public CommonSubsequence(int startA, int startB, int length, CommonSubsequence next) { this.startA_ = startA; this.startB_ = startB; this.length_ = length; this.Next = next; } public static CommonSubsequence Reverse(CommonSubsequence old) { CommonSubsequence newTop = null; while (old != null) { CommonSubsequence next = old.Next; old.Next = newTop; newTop = old; old = next; } return newTop; } } } }
オリジナルには幾つか diff オプション付けられる様になっていましたが、きつねのしっぽでは不要だったのでぜんぶ外してしまいました。その他、オリジナルでは「変更箇所」を出力する形式だったのを、こちらでは「削除行」「追加行」に分けて出力するようになっています。
ロックマン3のスネークマンが使ってくる技名っぽい関数の中は、ほぼブラックボックス状態でした。ここには手を付けられませぬ(´・ω・`)