X-Git-Url: https://projects.mako.cc/source/wikiq/blobdiff_plain/a8bbe4703654e6d6131368c0c42995f8933b3dce..26c05eec96c89590ddf4d516ae0cf9fc6bdbec82:/dtl/Diff.hpp diff --git a/dtl/Diff.hpp b/dtl/Diff.hpp new file mode 100644 index 0000000..5840a71 --- /dev/null +++ b/dtl/Diff.hpp @@ -0,0 +1,629 @@ +/** + dtl-1.12 -- Diff Template Library + + In short, Diff Template Library is distributed under so called "BSD license", + + Copyright (c) 2008-2011 Tatsuhiko Kubo + All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name of the authors nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* If you use this library, you must include dtl.hpp only. */ + +#ifndef DTL_DIFF_H +#define DTL_DIFF_H + +namespace dtl { + + /** + * diff class template + * sequence must support random_access_iterator. + */ + template , typename comparator = Compare< elem > > + class Diff + { + private : + dtl_typedefs(elem, sequence) + sequence A; + sequence B; + size_t M; + size_t N; + size_t delta; + size_t offset; + long long *fp; + long long editDistance; + Lcs< elem > lcs; + Ses< elem > ses; + editPath path; + editPathCordinates pathCordinates; + bool reverse; + bool huge; + bool unserious; + bool onlyEditDistance; + uniHunkVec uniHunks; + comparator cmp; + public : + Diff () {} + + Diff (const sequence& a, + const sequence& b) : A(a), B(b) { + init(); + } + + Diff (const sequence& a, + const sequence& b, + const comparator& comp) : A(a), B(b), cmp(comp) { + init(); + } + + ~Diff() {} + + long long getEditDistance () const { + return editDistance; + } + + Lcs< elem > getLcs () const { + return lcs; + } + + elemVec getLcsVec () const { + return lcs.getSequence(); + } + + Ses< elem > getSes () const { + return ses; + } + + uniHunkVec getUniHunks () const { + return uniHunks; + } + + bool isHuge () const { + return huge; + } + + void onHuge () { + this->huge = true; + } + + void offHuge () { + this->huge = false; + } + + bool isUnserious () const { + return unserious; + } + + void onUnserious () { + this->unserious = true; + } + + void offUnserious () { + this->unserious = false; + } + + void onOnlyEditDistance () { + this->onlyEditDistance = true; + } + + /** + * patching with Unified Format Hunks + */ + sequence uniPatch (const sequence& seq) { + elemList seqLst(seq.begin(), seq.end()); + sesElemVec shunk; + sesElemVec_iter vsesIt; + elemList_iter lstIt = seqLst.begin(); + long long inc_dec_total = 0; + long long gap = 1; + for (uniHunkVec_iter it=uniHunks.begin();it!=uniHunks.end();++it) { + joinSesVec(shunk, it->common[0]); + joinSesVec(shunk, it->change); + joinSesVec(shunk, it->common[1]); + it->a += inc_dec_total; + inc_dec_total += it->inc_dec_count; + for (long long i=0;ia - gap;++i) { + ++lstIt; + } + gap = it->a + it->b + it->inc_dec_count; + vsesIt = shunk.begin(); + while (vsesIt!=shunk.end()) { + switch (vsesIt->second.type) { + case SES_ADD : + seqLst.insert(lstIt, vsesIt->first); + break; + case SES_DELETE : + if (lstIt != seqLst.end()) { + lstIt = seqLst.erase(lstIt); + } + break; + case SES_COMMON : + if (lstIt != seqLst.end()) { + ++lstIt; + } + break; + default : + // no through + break; + } + ++vsesIt; + } + shunk.clear(); + } + + sequence patchedSeq(seqLst.begin(), seqLst.end()); + return patchedSeq; + } + + /** + * patching with Shortest Edit Script + */ + sequence patch (const sequence& seq) const { + sesElemVec sesSeq = ses.getSequence(); + elemList seqLst(seq.begin(), seq.end()); + elemList_iter lstIt = seqLst.begin(); + for (sesElemVec_iter sesIt=sesSeq.begin();sesIt!=sesSeq.end();++sesIt) { + switch (sesIt->second.type) { + case SES_ADD : + seqLst.insert(lstIt, sesIt->first); + break; + case SES_DELETE : + lstIt = seqLst.erase(lstIt); + break; + case SES_COMMON : + ++lstIt; + break; + default : + // no through + break; + } + } + sequence patchedSeq(seqLst.begin(), seqLst.end()); + return patchedSeq; + } + + /** + * compose Longest Common Subsequence and Shortest Edit Script. + * The algorithm implemented here is based on "An O(NP) Sequence Comparison Algorithm" + * described by Sun Wu, Udi Manber and Gene Myers + */ + void compose() { + + if (isHuge()) { + pathCordinates.reserve(MAX_CORDINATES_SIZE); + } + + long long p = -1; + fp = new long long[M + N + 3]; + fill(&fp[0], &fp[M + N + 3], -1); + path = editPath(M + N + 3); + fill(path.begin(), path.end(), -1); + ONP: + do { + ++p; + for (long long k=-p;k<=static_cast(delta)-1;++k) { + fp[k+offset] = snake(k, fp[k-1+offset]+1, fp[k+1+offset]); + } + for (long long k=static_cast(delta)+p;k>=static_cast(delta)+1;--k) { + fp[k+offset] = snake(k, fp[k-1+offset]+1, fp[k+1+offset]); + } + fp[delta+offset] = snake(static_cast(delta), fp[delta-1+offset]+1, fp[delta+1+offset]); + } while (fp[delta+offset] != static_cast(N) && pathCordinates.size() < MAX_CORDINATES_SIZE); + + editDistance += static_cast(delta) + 2 * p; + long long r = path[delta+offset]; + P cordinate; + editPathCordinates epc(0); + + // only recoding editdistance + if (onlyEditDistance) { + delete[] this->fp; + return; + } + + while(r != -1) { + cordinate.x = pathCordinates[(size_t)r].x; + cordinate.y = pathCordinates[(size_t)r].y; + epc.push_back(cordinate); + r = pathCordinates[(size_t)r].k; + } + + // record Longest Common Subsequence & Shortest Edit Script + if (!recordSequence(epc)) { + pathCordinates.resize(0); + epc.resize(0); + p = -1; + goto ONP; + } + delete[] this->fp; + } + + /** + * print difference between A and B with SES + */ + template < typename stream > + void printSES (stream& out) const { + sesElemVec ses_v = ses.getSequence(); + for_each(ses_v.begin(), ses_v.end(), ChangePrinter< sesElem, stream >(out)); + } + + void printSES (ostream& out = cout) const { + printSES< ostream >(out); + } + + /** + * print difference with given SES + */ + template < typename stream > + static void printSES (const Ses< elem >& s, stream& out) { + sesElemVec ses_v = s.getSequence(); + for_each(ses_v.begin(), ses_v.end(), ChangePrinter< sesElem, stream >(out)); + } + + static void printSES (const Ses< elem >& s, ostream& out = cout) { + printSES< ostream >(s, out); + } + + /** + * print difference between A and B with the format such as Unified Format + */ + template < typename stream > + void printUnifiedFormat (stream& out) const { + for_each(uniHunks.begin(), uniHunks.end(), UniHunkPrinter< sesElem, stream >(out)); + } + + void printUnifiedFormat (ostream& out = cout) const { + printUnifiedFormat< ostream >(out); + } + + /** + * print unified format difference with given unified format hunks + */ + template < typename stream > + static void printUnifiedFormat (const uniHunkVec& hunks, stream& out) { + for_each(hunks.begin(), hunks.end(), UniHunkPrinter< sesElem >(out)); + } + + static void printUnifiedFormat (const uniHunkVec& hunks, ostream& out = cout) { + printUnifiedFormat< ostream >(hunks, out); + } + + /** + * compose Unified Format Hunks from Shortest Edit Script + */ + void composeUnifiedHunks () { + sesElemVec common[2]; + sesElemVec change; + sesElemVec ses_v = ses.getSequence(); + long long l_cnt = 1; + long long length = distance(ses_v.begin(), ses_v.end()); + long long middle = 0; + bool isMiddle, isAfter; + elem e; + elemInfo einfo; + long long a, b, c, d; // @@ -a,b +c,d @@ + long long inc_dec_count = 0; + uniHunk< sesElem > hunk; + sesElemVec adds; + sesElemVec deletes; + + isMiddle = isAfter = false; + a = b = c = d = 0; + + for (sesElemVec_iter it=ses_v.begin();it!=ses_v.end();++it, ++l_cnt) { + e = it->first; + einfo = it->second; + switch (einfo.type) { + case SES_ADD : + middle = 0; + ++inc_dec_count; + adds.push_back(*it); + if (!isMiddle) isMiddle = true; + if (isMiddle) ++d; + if (l_cnt >= length) { + joinSesVec(change, deletes); + joinSesVec(change, adds); + isAfter = true; + } + break; + case SES_DELETE : + middle = 0; + --inc_dec_count; + deletes.push_back(*it); + if (!isMiddle) isMiddle = true; + if (isMiddle) ++b; + if (l_cnt >= length) { + joinSesVec(change, deletes); + joinSesVec(change, adds); + isAfter = true; + } + break; + case SES_COMMON : + ++b;++d; + if (common[1].empty() && adds.empty() && deletes.empty() && change.empty()) { + if (static_cast(common[0].size()) < DTL_CONTEXT_SIZE) { + if (a == 0 && c == 0) { + a = einfo.beforeIdx; + c = einfo.afterIdx; + } + common[0].push_back(*it); + } else { + rotate(common[0].begin(), common[0].begin() + 1, common[0].end()); + common[0].pop_back(); + common[0].push_back(*it); + ++a;++c; + --b;--d; + } + } + if (isMiddle && !isAfter) { + ++middle; + joinSesVec(change, deletes); + joinSesVec(change, adds); + change.push_back(*it); + if (middle >= DTL_SEPARATE_SIZE || l_cnt >= length) { + isAfter = true; + } + adds.clear(); + deletes.clear(); + } + break; + default : + // no through + break; + } + // compose unified format hunk + if (isAfter && !change.empty()) { + sesElemVec_iter cit = it; + long long cnt = 0; + for (long long i=0;isecond.type == SES_COMMON) { + ++cnt; + } + } + if (cnt < DTL_SEPARATE_SIZE && l_cnt < length) { + middle = 0; + isAfter = false; + continue; + } + if (static_cast(common[0].size()) >= DTL_SEPARATE_SIZE) { + long long c0size = static_cast(common[0].size()); + rotate(common[0].begin(), + common[0].begin() + (size_t)c0size - DTL_SEPARATE_SIZE, + common[0].end()); + for (long long i=0;i + static Ses< elem > composeSesFromStream (stream& st) + { + elem line; + Ses< elem > ret; + long long x_idx, y_idx; + x_idx = y_idx = 1; + while (getline(st, line)) { + elem mark(line.begin(), line.begin() + 1); + elem e(line.begin() + 1, line.end()); + if (mark == SES_MARK_DELETE) { + ret.addSequence(e, x_idx, 0, SES_DELETE); + ++x_idx; + } else if (mark == SES_MARK_ADD) { + ret.addSequence(e, y_idx, 0, SES_ADD); + ++y_idx; + } else if (mark == SES_MARK_COMMON) { + ret.addSequence(e, x_idx, y_idx, SES_COMMON); + ++x_idx; + ++y_idx; + } + } + return ret; + } + + private : + /** + * initialize + */ + void init () { + M = distance(A.begin(), A.end()); + N = distance(B.begin(), B.end()); + if (M < N) { + reverse = false; + } else { + swap(A, B); + swap(M, N); + reverse = true; + } + editDistance = 0; + delta = N - M; + offset = M + 1; + huge = false; + unserious = false; + onlyEditDistance = false; + fp = NULL; + } + + /** + * search shortest path and record the path + */ + long long snake(const long long& k, const long long& above, const long long& below) { + long long r = above > below ? path[(size_t)k-1+offset] : path[(size_t)k+1+offset]; + long long y = max(above, below); + long long x = y - k; + while ((size_t)x < M && (size_t)y < N && cmp.impl(A[(size_t)x], B[(size_t)y])) { + ++x;++y; + } + + path[(size_t)k+offset] = static_cast(pathCordinates.size()); + if (!onlyEditDistance) { + P p; + p.x = x;p.y = y;p.k = r; + pathCordinates.push_back(p); + } + return y; + } + + /** + * record SES and LCS + */ + bool recordSequence (const editPathCordinates& v) { + sequence_const_iter x(A.begin()); + sequence_const_iter y(B.begin()); + long long x_idx, y_idx; // line number for Unified Format + long long px_idx, py_idx; // cordinates + bool complete = false; + x_idx = y_idx = 1; + px_idx = py_idx = 0; + for (size_t i=v.size()-1;!complete;--i) { + while(px_idx < v[i].x || py_idx < v[i].y) { + if (v[i].y - v[i].x > py_idx - px_idx) { + if (!isReverse()) { + ses.addSequence(*y, y_idx, 0, SES_ADD); + } else { + ses.addSequence(*y, y_idx, 0, SES_DELETE); + } + ++y; + ++y_idx; + ++py_idx; + } else if (v[i].y - v[i].x < py_idx - px_idx) { + if (!isReverse()) { + ses.addSequence(*x, x_idx, 0, SES_DELETE); + } else { + ses.addSequence(*x, x_idx, 0, SES_ADD); + } + ++x; + ++x_idx; + ++px_idx; + } else { + lcs.addSequence(*x); + ses.addSequence(*x, x_idx, y_idx, SES_COMMON); + ++x; + ++y; + ++x_idx; + ++y_idx; + ++px_idx; + ++py_idx; + } + } + if (i == 0) complete = true; + } + + if (x_idx > static_cast(M) && y_idx > static_cast(N)) { + // all recording success + } else { + // unserious difference + if (isUnserious()) { + if (!isReverse()) { + recordOddSequence(x_idx, M, x, SES_DELETE); + recordOddSequence(y_idx, N, y, SES_ADD); + } else { + recordOddSequence(x_idx, M, x, SES_ADD); + recordOddSequence(y_idx, N, y, SES_DELETE); + } + return true; + } + + // decent difference + sequence A_(A.begin() + (size_t)x_idx - 1, A.end()); + sequence B_(B.begin() + (size_t)y_idx - 1, B.end()); + A = A_; + B = B_; + M = distance(A.begin(), A.end()); + N = distance(B.begin(), B.end()); + delta = N - M; + offset = M + 1; + delete[] fp; + fp = new long long[M + N + 3]; + fill(&fp[0], &fp[M + N + 3], -1); + fill(path.begin(), path.end(), -1); + return false; + } + return true; + } + + /** + * record odd sequence to ses + */ + void inline recordOddSequence (long long idx, long long length, sequence_const_iter it, const edit_t et) { + while(idx < length){ + ses.addSequence(*it, idx, 0, et); + ++it; + ++idx; + ++editDistance; + } + ses.addSequence(*it, idx, 0, et); + ++editDistance; + } + + /** + * join ses vectors + */ + void inline joinSesVec (sesElemVec& s1, sesElemVec& s2) const { + if (!s2.empty()) { + for (sesElemVec_iter vit=s2.begin();vit!=s2.end();++vit) { + s1.push_back(*vit); + } + } + } + + /** + * check sequence is replaced each other + */ + bool inline isReverse () const { + return reverse; + } + + }; +} + +#endif // DTL_DIFF_H