From: Erik Garrison Date: Sun, 13 Mar 2011 18:17:39 +0000 (-0400) Subject: added (broken, but running) diff routines for block-level diffs X-Git-Url: https://projects.mako.cc/source/wikiq/commitdiff_plain/26c05eec96c89590ddf4d516ae0cf9fc6bdbec82 added (broken, but running) diff routines for block-level diffs --- diff --git a/Makefile b/Makefile index e92da1f..e1d898a 100644 --- a/Makefile +++ b/Makefile @@ -4,8 +4,8 @@ OBJECTS = disorder.o md5.o all: wikiq -wikiq: wikiq.c $(OBJECTS) - $(CXX) $(CFLAGS) wikiq.c $(OBJECTS) -o wikiq -lexpat +wikiq: wikiq.cpp $(OBJECTS) + $(CXX) $(CFLAGS) wikiq.cpp $(OBJECTS) -o wikiq -lexpat disorder.o: disorder.c disorder.h $(CXX) $(CFLAGS) -c disorder.c diff --git a/dtl/Diff.hpp b/dtl/Diff.hpp new file mode 100644 index 0000000..5840a71 --- /dev/null +++ b/dtl/Diff.hpp @@ -0,0 +1,629 @@ +/** + dtl-1.12 -- Diff Template Library + + In short, Diff Template Library is distributed under so called "BSD license", + + Copyright (c) 2008-2011 Tatsuhiko Kubo + All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name of the authors nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* If you use this library, you must include dtl.hpp only. */ + +#ifndef DTL_DIFF_H +#define DTL_DIFF_H + +namespace dtl { + + /** + * diff class template + * sequence must support random_access_iterator. + */ + template , typename comparator = Compare< elem > > + class Diff + { + private : + dtl_typedefs(elem, sequence) + sequence A; + sequence B; + size_t M; + size_t N; + size_t delta; + size_t offset; + long long *fp; + long long editDistance; + Lcs< elem > lcs; + Ses< elem > ses; + editPath path; + editPathCordinates pathCordinates; + bool reverse; + bool huge; + bool unserious; + bool onlyEditDistance; + uniHunkVec uniHunks; + comparator cmp; + public : + Diff () {} + + Diff (const sequence& a, + const sequence& b) : A(a), B(b) { + init(); + } + + Diff (const sequence& a, + const sequence& b, + const comparator& comp) : A(a), B(b), cmp(comp) { + init(); + } + + ~Diff() {} + + long long getEditDistance () const { + return editDistance; + } + + Lcs< elem > getLcs () const { + return lcs; + } + + elemVec getLcsVec () const { + return lcs.getSequence(); + } + + Ses< elem > getSes () const { + return ses; + } + + uniHunkVec getUniHunks () const { + return uniHunks; + } + + bool isHuge () const { + return huge; + } + + void onHuge () { + this->huge = true; + } + + void offHuge () { + this->huge = false; + } + + bool isUnserious () const { + return unserious; + } + + void onUnserious () { + this->unserious = true; + } + + void offUnserious () { + this->unserious = false; + } + + void onOnlyEditDistance () { + this->onlyEditDistance = true; + } + + /** + * patching with Unified Format Hunks + */ + sequence uniPatch (const sequence& seq) { + elemList seqLst(seq.begin(), seq.end()); + sesElemVec shunk; + sesElemVec_iter vsesIt; + elemList_iter lstIt = seqLst.begin(); + long long inc_dec_total = 0; + long long gap = 1; + for (uniHunkVec_iter it=uniHunks.begin();it!=uniHunks.end();++it) { + joinSesVec(shunk, it->common[0]); + joinSesVec(shunk, it->change); + joinSesVec(shunk, it->common[1]); + it->a += inc_dec_total; + inc_dec_total += it->inc_dec_count; + for (long long i=0;ia - gap;++i) { + ++lstIt; + } + gap = it->a + it->b + it->inc_dec_count; + vsesIt = shunk.begin(); + while (vsesIt!=shunk.end()) { + switch (vsesIt->second.type) { + case SES_ADD : + seqLst.insert(lstIt, vsesIt->first); + break; + case SES_DELETE : + if (lstIt != seqLst.end()) { + lstIt = seqLst.erase(lstIt); + } + break; + case SES_COMMON : + if (lstIt != seqLst.end()) { + ++lstIt; + } + break; + default : + // no through + break; + } + ++vsesIt; + } + shunk.clear(); + } + + sequence patchedSeq(seqLst.begin(), seqLst.end()); + return patchedSeq; + } + + /** + * patching with Shortest Edit Script + */ + sequence patch (const sequence& seq) const { + sesElemVec sesSeq = ses.getSequence(); + elemList seqLst(seq.begin(), seq.end()); + elemList_iter lstIt = seqLst.begin(); + for (sesElemVec_iter sesIt=sesSeq.begin();sesIt!=sesSeq.end();++sesIt) { + switch (sesIt->second.type) { + case SES_ADD : + seqLst.insert(lstIt, sesIt->first); + break; + case SES_DELETE : + lstIt = seqLst.erase(lstIt); + break; + case SES_COMMON : + ++lstIt; + break; + default : + // no through + break; + } + } + sequence patchedSeq(seqLst.begin(), seqLst.end()); + return patchedSeq; + } + + /** + * compose Longest Common Subsequence and Shortest Edit Script. + * The algorithm implemented here is based on "An O(NP) Sequence Comparison Algorithm" + * described by Sun Wu, Udi Manber and Gene Myers + */ + void compose() { + + if (isHuge()) { + pathCordinates.reserve(MAX_CORDINATES_SIZE); + } + + long long p = -1; + fp = new long long[M + N + 3]; + fill(&fp[0], &fp[M + N + 3], -1); + path = editPath(M + N + 3); + fill(path.begin(), path.end(), -1); + ONP: + do { + ++p; + for (long long k=-p;k<=static_cast(delta)-1;++k) { + fp[k+offset] = snake(k, fp[k-1+offset]+1, fp[k+1+offset]); + } + for (long long k=static_cast(delta)+p;k>=static_cast(delta)+1;--k) { + fp[k+offset] = snake(k, fp[k-1+offset]+1, fp[k+1+offset]); + } + fp[delta+offset] = snake(static_cast(delta), fp[delta-1+offset]+1, fp[delta+1+offset]); + } while (fp[delta+offset] != static_cast(N) && pathCordinates.size() < MAX_CORDINATES_SIZE); + + editDistance += static_cast(delta) + 2 * p; + long long r = path[delta+offset]; + P cordinate; + editPathCordinates epc(0); + + // only recoding editdistance + if (onlyEditDistance) { + delete[] this->fp; + return; + } + + while(r != -1) { + cordinate.x = pathCordinates[(size_t)r].x; + cordinate.y = pathCordinates[(size_t)r].y; + epc.push_back(cordinate); + r = pathCordinates[(size_t)r].k; + } + + // record Longest Common Subsequence & Shortest Edit Script + if (!recordSequence(epc)) { + pathCordinates.resize(0); + epc.resize(0); + p = -1; + goto ONP; + } + delete[] this->fp; + } + + /** + * print difference between A and B with SES + */ + template < typename stream > + void printSES (stream& out) const { + sesElemVec ses_v = ses.getSequence(); + for_each(ses_v.begin(), ses_v.end(), ChangePrinter< sesElem, stream >(out)); + } + + void printSES (ostream& out = cout) const { + printSES< ostream >(out); + } + + /** + * print difference with given SES + */ + template < typename stream > + static void printSES (const Ses< elem >& s, stream& out) { + sesElemVec ses_v = s.getSequence(); + for_each(ses_v.begin(), ses_v.end(), ChangePrinter< sesElem, stream >(out)); + } + + static void printSES (const Ses< elem >& s, ostream& out = cout) { + printSES< ostream >(s, out); + } + + /** + * print difference between A and B with the format such as Unified Format + */ + template < typename stream > + void printUnifiedFormat (stream& out) const { + for_each(uniHunks.begin(), uniHunks.end(), UniHunkPrinter< sesElem, stream >(out)); + } + + void printUnifiedFormat (ostream& out = cout) const { + printUnifiedFormat< ostream >(out); + } + + /** + * print unified format difference with given unified format hunks + */ + template < typename stream > + static void printUnifiedFormat (const uniHunkVec& hunks, stream& out) { + for_each(hunks.begin(), hunks.end(), UniHunkPrinter< sesElem >(out)); + } + + static void printUnifiedFormat (const uniHunkVec& hunks, ostream& out = cout) { + printUnifiedFormat< ostream >(hunks, out); + } + + /** + * compose Unified Format Hunks from Shortest Edit Script + */ + void composeUnifiedHunks () { + sesElemVec common[2]; + sesElemVec change; + sesElemVec ses_v = ses.getSequence(); + long long l_cnt = 1; + long long length = distance(ses_v.begin(), ses_v.end()); + long long middle = 0; + bool isMiddle, isAfter; + elem e; + elemInfo einfo; + long long a, b, c, d; // @@ -a,b +c,d @@ + long long inc_dec_count = 0; + uniHunk< sesElem > hunk; + sesElemVec adds; + sesElemVec deletes; + + isMiddle = isAfter = false; + a = b = c = d = 0; + + for (sesElemVec_iter it=ses_v.begin();it!=ses_v.end();++it, ++l_cnt) { + e = it->first; + einfo = it->second; + switch (einfo.type) { + case SES_ADD : + middle = 0; + ++inc_dec_count; + adds.push_back(*it); + if (!isMiddle) isMiddle = true; + if (isMiddle) ++d; + if (l_cnt >= length) { + joinSesVec(change, deletes); + joinSesVec(change, adds); + isAfter = true; + } + break; + case SES_DELETE : + middle = 0; + --inc_dec_count; + deletes.push_back(*it); + if (!isMiddle) isMiddle = true; + if (isMiddle) ++b; + if (l_cnt >= length) { + joinSesVec(change, deletes); + joinSesVec(change, adds); + isAfter = true; + } + break; + case SES_COMMON : + ++b;++d; + if (common[1].empty() && adds.empty() && deletes.empty() && change.empty()) { + if (static_cast(common[0].size()) < DTL_CONTEXT_SIZE) { + if (a == 0 && c == 0) { + a = einfo.beforeIdx; + c = einfo.afterIdx; + } + common[0].push_back(*it); + } else { + rotate(common[0].begin(), common[0].begin() + 1, common[0].end()); + common[0].pop_back(); + common[0].push_back(*it); + ++a;++c; + --b;--d; + } + } + if (isMiddle && !isAfter) { + ++middle; + joinSesVec(change, deletes); + joinSesVec(change, adds); + change.push_back(*it); + if (middle >= DTL_SEPARATE_SIZE || l_cnt >= length) { + isAfter = true; + } + adds.clear(); + deletes.clear(); + } + break; + default : + // no through + break; + } + // compose unified format hunk + if (isAfter && !change.empty()) { + sesElemVec_iter cit = it; + long long cnt = 0; + for (long long i=0;isecond.type == SES_COMMON) { + ++cnt; + } + } + if (cnt < DTL_SEPARATE_SIZE && l_cnt < length) { + middle = 0; + isAfter = false; + continue; + } + if (static_cast(common[0].size()) >= DTL_SEPARATE_SIZE) { + long long c0size = static_cast(common[0].size()); + rotate(common[0].begin(), + common[0].begin() + (size_t)c0size - DTL_SEPARATE_SIZE, + common[0].end()); + for (long long i=0;i + static Ses< elem > composeSesFromStream (stream& st) + { + elem line; + Ses< elem > ret; + long long x_idx, y_idx; + x_idx = y_idx = 1; + while (getline(st, line)) { + elem mark(line.begin(), line.begin() + 1); + elem e(line.begin() + 1, line.end()); + if (mark == SES_MARK_DELETE) { + ret.addSequence(e, x_idx, 0, SES_DELETE); + ++x_idx; + } else if (mark == SES_MARK_ADD) { + ret.addSequence(e, y_idx, 0, SES_ADD); + ++y_idx; + } else if (mark == SES_MARK_COMMON) { + ret.addSequence(e, x_idx, y_idx, SES_COMMON); + ++x_idx; + ++y_idx; + } + } + return ret; + } + + private : + /** + * initialize + */ + void init () { + M = distance(A.begin(), A.end()); + N = distance(B.begin(), B.end()); + if (M < N) { + reverse = false; + } else { + swap(A, B); + swap(M, N); + reverse = true; + } + editDistance = 0; + delta = N - M; + offset = M + 1; + huge = false; + unserious = false; + onlyEditDistance = false; + fp = NULL; + } + + /** + * search shortest path and record the path + */ + long long snake(const long long& k, const long long& above, const long long& below) { + long long r = above > below ? path[(size_t)k-1+offset] : path[(size_t)k+1+offset]; + long long y = max(above, below); + long long x = y - k; + while ((size_t)x < M && (size_t)y < N && cmp.impl(A[(size_t)x], B[(size_t)y])) { + ++x;++y; + } + + path[(size_t)k+offset] = static_cast(pathCordinates.size()); + if (!onlyEditDistance) { + P p; + p.x = x;p.y = y;p.k = r; + pathCordinates.push_back(p); + } + return y; + } + + /** + * record SES and LCS + */ + bool recordSequence (const editPathCordinates& v) { + sequence_const_iter x(A.begin()); + sequence_const_iter y(B.begin()); + long long x_idx, y_idx; // line number for Unified Format + long long px_idx, py_idx; // cordinates + bool complete = false; + x_idx = y_idx = 1; + px_idx = py_idx = 0; + for (size_t i=v.size()-1;!complete;--i) { + while(px_idx < v[i].x || py_idx < v[i].y) { + if (v[i].y - v[i].x > py_idx - px_idx) { + if (!isReverse()) { + ses.addSequence(*y, y_idx, 0, SES_ADD); + } else { + ses.addSequence(*y, y_idx, 0, SES_DELETE); + } + ++y; + ++y_idx; + ++py_idx; + } else if (v[i].y - v[i].x < py_idx - px_idx) { + if (!isReverse()) { + ses.addSequence(*x, x_idx, 0, SES_DELETE); + } else { + ses.addSequence(*x, x_idx, 0, SES_ADD); + } + ++x; + ++x_idx; + ++px_idx; + } else { + lcs.addSequence(*x); + ses.addSequence(*x, x_idx, y_idx, SES_COMMON); + ++x; + ++y; + ++x_idx; + ++y_idx; + ++px_idx; + ++py_idx; + } + } + if (i == 0) complete = true; + } + + if (x_idx > static_cast(M) && y_idx > static_cast(N)) { + // all recording success + } else { + // unserious difference + if (isUnserious()) { + if (!isReverse()) { + recordOddSequence(x_idx, M, x, SES_DELETE); + recordOddSequence(y_idx, N, y, SES_ADD); + } else { + recordOddSequence(x_idx, M, x, SES_ADD); + recordOddSequence(y_idx, N, y, SES_DELETE); + } + return true; + } + + // decent difference + sequence A_(A.begin() + (size_t)x_idx - 1, A.end()); + sequence B_(B.begin() + (size_t)y_idx - 1, B.end()); + A = A_; + B = B_; + M = distance(A.begin(), A.end()); + N = distance(B.begin(), B.end()); + delta = N - M; + offset = M + 1; + delete[] fp; + fp = new long long[M + N + 3]; + fill(&fp[0], &fp[M + N + 3], -1); + fill(path.begin(), path.end(), -1); + return false; + } + return true; + } + + /** + * record odd sequence to ses + */ + void inline recordOddSequence (long long idx, long long length, sequence_const_iter it, const edit_t et) { + while(idx < length){ + ses.addSequence(*it, idx, 0, et); + ++it; + ++idx; + ++editDistance; + } + ses.addSequence(*it, idx, 0, et); + ++editDistance; + } + + /** + * join ses vectors + */ + void inline joinSesVec (sesElemVec& s1, sesElemVec& s2) const { + if (!s2.empty()) { + for (sesElemVec_iter vit=s2.begin();vit!=s2.end();++vit) { + s1.push_back(*vit); + } + } + } + + /** + * check sequence is replaced each other + */ + bool inline isReverse () const { + return reverse; + } + + }; +} + +#endif // DTL_DIFF_H diff --git a/dtl/Diff3.hpp b/dtl/Diff3.hpp new file mode 100644 index 0000000..1740ec4 --- /dev/null +++ b/dtl/Diff3.hpp @@ -0,0 +1,245 @@ +/** + dtl-1.12 -- Diff Template Library + + In short, Diff Template Library is distributed under so called "BSD license", + + Copyright (c) 2008-2011 Tatsuhiko Kubo + All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name of the authors nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* If you use this library, you must include dtl.hpp only. */ + +#ifndef DTL_DIFF3_H +#define DTL_DIFF3_H + +namespace dtl { + + /** + * diff3 class template + * sequence must support random_access_iterator. + */ + template , typename comparator = Compare< elem > > + class Diff3 + { + private: + dtl_typedefs(elem, sequence) + sequence A; + sequence B; + sequence C; + sequence S; + Diff< elem, sequence, comparator > diff_ba; + Diff< elem, sequence, comparator > diff_bc; + bool conflict; + elem csepabegin; + elem csepa; + elem csepaend; + public : + Diff3 () {} + Diff3 (const sequence& a, + const sequence& b, + const sequence& c) : A(a), B(b), C(c), + diff_ba(b, a), diff_bc(b, c), + conflict(false) {} + + ~Diff3 () {} + + bool isConflict () const { + return conflict; + } + + sequence getMergedSequence () const { + return S; + } + + /** + * merge changes B and C to A + */ + bool merge () { + if (diff_ba.getEditDistance() == 0) { // A == B + if (diff_bc.getEditDistance() == 0) { // A == B == C + S = B; + return true; + } + S = C; + return true; + } else { // A != B + if (diff_bc.getEditDistance() == 0) { // A != B == C + S = A; + return true; + } else { // A != B != C + S = merge_(); + if (isConflict()) { // conflict occured + return false; + } + } + } + return true; + } + + /** + * compose differences + */ + void compose () { + diff_ba.compose(); + diff_bc.compose(); + } + + private : + /** + * merge implementation + */ + sequence merge_ () { + elemVec seq; + Ses< elem > ses_ba = diff_ba.getSes(); + Ses< elem > ses_bc = diff_bc.getSes(); + sesElemVec ses_ba_v = ses_ba.getSequence(); + sesElemVec ses_bc_v = ses_bc.getSequence(); + sesElemVec_iter ba_it = ses_ba_v.begin(); + sesElemVec_iter bc_it = ses_bc_v.begin(); + sesElemVec_iter ba_end = ses_ba_v.end(); + sesElemVec_iter bc_end = ses_bc_v.end(); + + while (!isEnd(ba_end, ba_it) || !isEnd(bc_end, bc_it)) { + while (true) { + if (!isEnd(ba_end, ba_it) && + !isEnd(bc_end, bc_it) && + ba_it->first == bc_it->first && + ba_it->second.type == SES_COMMON && + bc_it->second.type == SES_COMMON) { + // do nothing + } else { + break; + } + if (!isEnd(ba_end, ba_it)) seq.push_back(ba_it->first); + else if (!isEnd(bc_end, bc_it)) seq.push_back(bc_it->first); + forwardUntilEnd(ba_end, ba_it); + forwardUntilEnd(bc_end, bc_it); + } + if (isEnd(ba_end, ba_it) || isEnd(bc_end, bc_it)) break; + if ( ba_it->second.type == SES_COMMON + && bc_it->second.type == SES_DELETE) { + forwardUntilEnd(ba_end, ba_it); + forwardUntilEnd(bc_end, bc_it); + } else if (ba_it->second.type == SES_COMMON && + bc_it->second.type == SES_ADD) { + seq.push_back(bc_it->first); + forwardUntilEnd(bc_end, bc_it); + } else if (ba_it->second.type == SES_DELETE && + bc_it->second.type == SES_COMMON) { + forwardUntilEnd(ba_end, ba_it); + forwardUntilEnd(bc_end, bc_it); + } else if (ba_it->second.type == SES_DELETE && + bc_it->second.type == SES_DELETE) { + if (ba_it->first == bc_it->first) { + forwardUntilEnd(ba_end, ba_it); + forwardUntilEnd(bc_end, bc_it); + } else { + // conflict + conflict = true; + return B; + } + } else if (ba_it->second.type == SES_DELETE && + bc_it->second.type == SES_ADD) { + // conflict + conflict = true; + return B; + } else if (ba_it->second.type == SES_ADD && + bc_it->second.type == SES_COMMON) { + seq.push_back(ba_it->first); + forwardUntilEnd(ba_end, ba_it); + } else if (ba_it->second.type == SES_ADD && + bc_it->second.type == SES_DELETE) { + // conflict + conflict = true; + return B; + } else if (ba_it->second.type == SES_ADD && + bc_it->second.type == SES_ADD) { + if (ba_it->first == bc_it->first) { + seq.push_back(ba_it->first); + forwardUntilEnd(ba_end, ba_it); + forwardUntilEnd(bc_end, bc_it); + } else { + // conflict + conflict = true; + return B; + } + } + } + + if (isEnd(ba_end, ba_it)) { + addDecentSequence(bc_end, bc_it, seq); + } else if (isEnd(bc_end, bc_it)) { + addDecentSequence(ba_end, ba_it, seq); + } + + sequence mergedSeq(seq.begin(), seq.end()); + return mergedSeq; + } + + /** + * join elem vectors + */ + void inline joinElemVec (elemVec& s1, elemVec& s2) const { + if (!s2.empty()) { + for (elemVec_iter vit=s2.begin();vit!=s2.end();++vit) { + s1.push_back(*vit); + } + } + } + + /** + * check sequence is end + */ + template + bool inline isEnd (const T_iter& end, const T_iter& it) const { + return it == end ? true : false; + } + + /** + * increment iterator until iterator is end + */ + template + void inline forwardUntilEnd (const T_iter& end, T_iter& it) const { + if (!isEnd(end, it)) ++it; + } + + /** + * add elements which SES's type is ADD + */ + void inline addDecentSequence (const sesElemVec_iter& end, sesElemVec_iter& it, elemVec& seq) const { + while (!isEnd(end, it)) { + if (it->second.type == SES_ADD) seq.push_back(it->first); + ++it; + } + } + + }; +} + +#endif // DTL_DIFF3_H diff --git a/dtl/Lcs.hpp b/dtl/Lcs.hpp new file mode 100644 index 0000000..16e1440 --- /dev/null +++ b/dtl/Lcs.hpp @@ -0,0 +1,55 @@ +/** + dtl-1.12 -- Diff Template Library + + In short, Diff Template Library is distributed under so called "BSD license", + + Copyright (c) 2008-2011 Tatsuhiko Kubo + All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name of the authors nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* If you use this library, you must include dtl.hpp only. */ + +#ifndef DTL_LCS_H +#define DTL_LCS_H + +namespace dtl { + + /** + * Longest Common Subsequence template class + */ + template + class Lcs : public Sequence< elem > + { + public : + Lcs () {} + ~Lcs () {} + }; +} + +#endif // DTL_LCS_H diff --git a/dtl/Sequence.hpp b/dtl/Sequence.hpp new file mode 100644 index 0000000..7c3df9b --- /dev/null +++ b/dtl/Sequence.hpp @@ -0,0 +1,65 @@ +/** + dtl-1.12 -- Diff Template Library + + In short, Diff Template Library is distributed under so called "BSD license", + + Copyright (c) 2008-2011 Tatsuhiko Kubo + All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name of the authors nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* If you use this library, you must include dtl.hpp only. */ + +#ifndef DTL_SEQUENCE_H +#define DTL_SEQUENCE_H + +namespace dtl { + + /** + * sequence class template + */ + template + class Sequence + { + public : + typedef vector< elem > elemVec; + Sequence () {} + virtual ~Sequence () {} + + elemVec getSequence () const { + return sequence; + } + void addSequence (elem e) { + sequence.push_back(e); + } + protected : + elemVec sequence; + }; +} + +#endif // DTL_SEQUENCE_H diff --git a/dtl/Ses.hpp b/dtl/Ses.hpp new file mode 100644 index 0000000..ed50d40 --- /dev/null +++ b/dtl/Ses.hpp @@ -0,0 +1,112 @@ +/** + dtl-1.12 -- Diff Template Library + + In short, Diff Template Library is distributed under so called "BSD license", + + Copyright (c) 2008-2011 Tatsuhiko Kubo + All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name of the authors nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* If you use this library, you must include dtl.hpp only. */ + +#ifndef DTL_SES_H +#define DTL_SES_H + +namespace dtl { + + /** + * Shortest Edit Script template class + */ + template + class Ses : public Sequence< elem > + { + private : + typedef pair< elem, elemInfo > sesElem; + typedef vector< sesElem > sesElemVec; + public : + + Ses () : onlyAdd(true), onlyDelete(true), onlyCopy(true) { } + ~Ses () {} + + bool isOnlyAdd () const { + return onlyAdd; + } + + bool isOnlyDelete () const { + return onlyDelete; + } + + bool isOnlyCopy () const { + return onlyCopy; + } + + bool isOnlyOneOperation () const { + return isOnlyAdd() || isOnlyAdd() || isOnlyCopy(); + } + + bool isChange () const { + return !onlyCopy; + } + + using Sequence< elem >::addSequence; + void addSequence (elem e, long long beforeIdx, long long afterIdx, const edit_t type) { + elemInfo info; + info.beforeIdx = beforeIdx; + info.afterIdx = afterIdx; + info.type = type; + sesElem pe(e, info); + sequence.push_back(pe); + switch (type) { + case SES_DELETE: + onlyCopy = false; + onlyAdd = false; + break; + case SES_COMMON: + onlyAdd = false; + onlyDelete = false; + break; + case SES_ADD: + onlyDelete = false; + onlyCopy = false; + break; + } + } + + sesElemVec getSequence () const { + return sequence; + } + private : + sesElemVec sequence; + bool onlyAdd; + bool onlyDelete; + bool onlyCopy; + }; +} + +#endif // DTL_SES_H diff --git a/dtl/dtl.hpp b/dtl/dtl.hpp new file mode 100644 index 0000000..dd999f8 --- /dev/null +++ b/dtl/dtl.hpp @@ -0,0 +1,47 @@ +/** + dtl-1.12 -- Diff Template Library + + In short, Diff Template Library is distributed under so called "BSD license", + + Copyright (c) 2008-2011 Tatsuhiko Kubo + All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name of the authors nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef DTL_H +#define DTL_H + +#include "variables.hpp" +#include "functors.hpp" +#include "Sequence.hpp" +#include "Lcs.hpp" +#include "Ses.hpp" +#include "Diff.hpp" +#include "Diff3.hpp" + +#endif // DTL_H diff --git a/dtl/functors.hpp b/dtl/functors.hpp new file mode 100644 index 0000000..5ddf1eb --- /dev/null +++ b/dtl/functors.hpp @@ -0,0 +1,137 @@ +/** + dtl-1.12 -- Diff Template Library + + In short, Diff Template Library is distributed under so called "BSD license", + + Copyright (c) 2008-2011 Tatsuhiko Kubo + All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name of the authors nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* If you use this library, you must include dtl.hpp only. */ + +#ifndef DTL_FUNCTORS_H +#define DTL_FUNCTORS_H + +namespace dtl { + + /** + * printer class template + */ + template + class Printer + { + public : + Printer () : out_(cout) {} + Printer (stream& out) : out_(out) {} + virtual ~Printer () {} + virtual void operator() (const sesElem& se) const = 0; + protected : + stream& out_; + }; + + /** + * common element printer class template + */ + template + class CommonPrinter : public Printer < sesElem, stream > + { + public : + CommonPrinter () : Printer < sesElem, stream > () {} + CommonPrinter (stream& out) : Printer < sesElem, stream > (out) {} + ~CommonPrinter () {} + void operator() (const sesElem& se) const { + this->out_ << SES_MARK_COMMON << se.first << endl; + } + }; + + /** + * ses element printer class template + */ + template + class ChangePrinter : public Printer < sesElem, stream > + { + public : + ChangePrinter () : Printer < sesElem, stream > () {} + ChangePrinter (stream& out) : Printer < sesElem, stream > (out) {} + ~ChangePrinter () {} + void operator() (const sesElem& se) const { + switch (se.second.type) { + case SES_ADD: + this->out_ << SES_MARK_ADD << se.first << endl; + break; + case SES_DELETE: + this->out_ << SES_MARK_DELETE << se.first << endl; + break; + case SES_COMMON: + this->out_ << SES_MARK_COMMON << se.first << endl; + break; + } + } + }; + + /** + * unfiend format element printer class template + */ + template + class UniHunkPrinter + { + public : + UniHunkPrinter () : out_(cout) {} + UniHunkPrinter (stream& out) : out_(out) {} + ~UniHunkPrinter () {} + void operator() (const uniHunk< sesElem >& hunk) const { + out_ << "@@" + << " -" << hunk.a << "," << hunk.b + << " +" << hunk.c << "," << hunk.d + << " @@" << endl; + + for_each(hunk.common[0].begin(), hunk.common[0].end(), CommonPrinter< sesElem, stream >(out_)); + for_each(hunk.change.begin(), hunk.change.end(), ChangePrinter< sesElem, stream >(out_)); + for_each(hunk.common[1].begin(), hunk.common[1].end(), CommonPrinter< sesElem, stream >(out_)); + } + private : + stream& out_; + }; + + /** + * compare class template + */ + template + class Compare + { + public : + Compare () {} + virtual ~Compare () {} + virtual inline bool impl (const elem& e1, const elem& e2) const { + return e1 == e2; + } + }; +} + +#endif // DTL_FUNCTORS_H diff --git a/dtl/variables.hpp b/dtl/variables.hpp new file mode 100644 index 0000000..259ee29 --- /dev/null +++ b/dtl/variables.hpp @@ -0,0 +1,134 @@ +/** + dtl-1.12 -- Diff Template Library + + In short, Diff Template Library is distributed under so called "BSD license", + + Copyright (c) 2008-2011 Tatsuhiko Kubo + All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name of the authors nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* If you use this library, you must include dtl.hpp only. */ + +#ifndef DTL_VARIABLES_H +#define DTL_VARIABLES_H + +#include +#include +#include +#include +#include + +namespace dtl { + + using std::vector; + using std::string; + using std::pair; + using std::ostream; + using std::list; + using std::for_each; + using std::distance; + using std::fill; + using std::cout; + using std::endl; + using std::rotate; + using std::swap; + using std::max; + + /** + * type of edit for SES + */ + typedef int edit_t; + const edit_t SES_DELETE = -1; + const edit_t SES_COMMON = 0; + const edit_t SES_ADD = 1; + + /** + * mark of SES + */ +#define SES_MARK_DELETE "-" +#define SES_MARK_COMMON " " +#define SES_MARK_ADD "+" + + /** + * info for Unified Format + */ + typedef struct eleminfo { + long long beforeIdx; // index of prev sequence + long long afterIdx; // index of after sequence + edit_t type; // type of edit(Add, Delete, Common) + } elemInfo; + + const long long DTL_SEPARATE_SIZE = 3; + const long long DTL_CONTEXT_SIZE = 3; + + /** + * cordinate for registering route + */ + typedef struct Point { + long long x; // x cordinate + long long y; // y cordinate + long long k; // vertex + } P; + + /** + * limit of cordinate size + */ + const unsigned long long MAX_CORDINATES_SIZE = 2000000; + + typedef vector< long long > editPath; + typedef vector< P > editPathCordinates; + + /** + * Structure of Unified Format Hunk + */ + template + struct uniHunk { + long long a, b, c, d; // @@ -a,b +c,d @@ + vector< sesElem > common[2]; // anteroposterior commons on changes + vector< sesElem > change; // changes + long long inc_dec_count; // count of increace and decrease + }; + +#define dtl_typedefs(elem, sequence) \ + typedef pair< elem, elemInfo > sesElem; \ + typedef vector< sesElem > sesElemVec; \ + typedef vector< uniHunk< sesElem > > uniHunkVec; \ + typedef list< elem > elemList; \ + typedef vector< elem > elemVec; \ + typedef typename uniHunkVec::iterator uniHunkVec_iter; \ + typedef typename sesElemVec::iterator sesElemVec_iter; \ + typedef typename elemList::iterator elemList_iter; \ + typedef typename sequence::iterator sequence_iter; \ + typedef typename sequence::const_iterator sequence_const_iter; \ + typedef typename elemVec::iterator elemVec_iter; + + +} + +#endif // DTL_VARIABLES_H diff --git a/wikiq.c b/wikiq.cpp similarity index 88% rename from wikiq.c rename to wikiq.cpp index a8cf97d..144e563 100644 --- a/wikiq.c +++ b/wikiq.cpp @@ -5,6 +5,7 @@ */ #include +#include #include #include #include @@ -12,6 +13,10 @@ #include #include "disorder.h" #include "md5.h" +#include "dtl/dtl.hpp" +#include + +using namespace std; // timestamp of the form 2003-11-07T00:43:23Z #define DATE_LENGTH 10 @@ -47,6 +52,7 @@ typedef struct { char *editorid; char *comment; char *text; + vector last_tokens; // track string size of the elements, to prevent O(N^2) processing in charhndl // when we have to take strlen for every character which we append to the buffer @@ -133,6 +139,7 @@ free_data(revisionData *data, int title) free(data->editorid); free(data->comment); free(data->text); + data->last_tokens.clear(); } void cleanup_revision(revisionData *data) { @@ -141,6 +148,7 @@ void cleanup_revision(revisionData *data) { void cleanup_article(revisionData *data) { clean_data(data, 1); + data->last_tokens.clear(); } @@ -223,8 +231,57 @@ write_row(revisionData *data) sprintf(md5_hex_output + di * 2, "%02x", digest[di]); } + string text = string(data->text, data->text_size); + vector text_tokens; + size_t period_pos = 0; + size_t paragraph_pos = 0; + size_t start = 0; + while ((period_pos = text.find(".", period_pos + 1)) != string::npos && + (paragraph_pos = text.find("\n\n", paragraph_pos + 1)) != string::npos) { + if (paragraph_pos < period_pos) { + text_tokens.push_back(text.substr(start, paragraph_pos - start)); + start = paragraph_pos; + } else { + text_tokens.push_back(text.substr(start, period_pos - start)); + start = period_pos; + } + } + + vector additions; + vector deletions; + + if (data->last_tokens.empty()) { + data->last_tokens = text_tokens; + } else { + // do the diff + + dtl::Diff< string, vector > d(data->last_tokens, text_tokens); + //d.onOnlyEditDistance(); + d.compose(); + + vector > ses_v = d.getSes().getSequence(); + for (vector >::iterator sit=ses_v.begin(); sit!=ses_v.end(); ++sit) { + switch (sit->second.type) { + case dtl::SES_ADD: + cout << "ADD: \"" << sit->first << "\"" << endl; + additions.push_back(sit->first); + break; + case dtl::SES_DELETE: + cout << "DEL: \"" << sit->first << "\"" << endl; + deletions.push_back(sit->first); + break; + } + } + + // apply regex to the diff + + + data->last_tokens = text_tokens; + } + + // print line of tsv output - printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s\t%i\t%f\t%s\n", + printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s\t%i\t%f\t%s\t%i\t%i\n", data->title, data->articleid, data->revid, @@ -236,7 +293,9 @@ write_row(revisionData *data) (data->minor) ? "1" : "0", (unsigned int) data->text_size, shannon_H(data->text, data->text_size), - md5_hex_output + md5_hex_output, + (int) additions.size(), + (int) deletions.size() ); //