added (broken, but running) diff routines for block-level diffs
authorErik Garrison <erik@hypervolu.me>
Sun, 13 Mar 2011 18:17:39 +0000 (14:17 -0400)
committerErik Garrison <erik@hypervolu.me>
Sun, 13 Mar 2011 18:17:39 +0000 (14:17 -0400)
Makefile
dtl/Diff.hpp [new file with mode: 0644]
dtl/Diff3.hpp [new file with mode: 0644]
dtl/Lcs.hpp [new file with mode: 0644]
dtl/Sequence.hpp [new file with mode: 0644]
dtl/Ses.hpp [new file with mode: 0644]
dtl/dtl.hpp [new file with mode: 0644]
dtl/functors.hpp [new file with mode: 0644]
dtl/variables.hpp [new file with mode: 0644]
wikiq.cpp [moved from wikiq.c with 88% similarity]

index e92da1f6e54ee94590107a082644fca9ed853c30..e1d898a740de586ff5b22c5ae9cee7d56805d17e 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -4,8 +4,8 @@ OBJECTS = disorder.o md5.o
 
 all: wikiq
 
-wikiq: wikiq.c $(OBJECTS)
-       $(CXX) $(CFLAGS) wikiq.c $(OBJECTS) -o wikiq -lexpat
+wikiq: wikiq.cpp $(OBJECTS)
+       $(CXX) $(CFLAGS) wikiq.cpp $(OBJECTS) -o wikiq -lexpat
 
 disorder.o: disorder.c disorder.h
        $(CXX) $(CFLAGS) -c disorder.c
diff --git a/dtl/Diff.hpp b/dtl/Diff.hpp
new file mode 100644 (file)
index 0000000..5840a71
--- /dev/null
@@ -0,0 +1,629 @@
+/**
+   dtl-1.12 -- Diff Template Library
+   
+   In short, Diff Template Library is distributed under so called "BSD license",
+   
+   Copyright (c) 2008-2011 Tatsuhiko Kubo <cubicdaiya@gmail.com>
+   All rights reserved.
+   
+   Redistribution and use in source and binary forms, with or without modification,
+   are permitted provided that the following conditions are met:
+   
+   * Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+   
+   * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+   
+   * Neither the name of the authors nor the names of its contributors
+   may be used to endorse or promote products derived from this software 
+   without specific prior written permission.
+   
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* If you use this library, you must include dtl.hpp only. */
+
+#ifndef DTL_DIFF_H
+#define DTL_DIFF_H
+
+namespace dtl {
+    
+    /**
+     * diff class template
+     * sequence must support random_access_iterator.
+     */
+    template <typename elem, typename sequence = vector< elem >, typename comparator = Compare< elem > >
+    class Diff
+    {
+    private :
+        dtl_typedefs(elem, sequence)
+        sequence           A;
+        sequence           B;
+        size_t             M;
+        size_t             N;
+        size_t             delta;
+        size_t             offset;
+        long long          *fp;
+        long long          editDistance;
+        Lcs< elem >        lcs;
+        Ses< elem >        ses;
+        editPath           path;
+        editPathCordinates pathCordinates;
+        bool               reverse;
+        bool               huge;
+        bool               unserious;
+        bool               onlyEditDistance;
+        uniHunkVec         uniHunks;
+        comparator         cmp;
+    public :
+        Diff () {}
+        
+        Diff (const sequence& a, 
+              const sequence& b) : A(a), B(b) {
+            init();
+        }
+        
+        Diff (const sequence& a, 
+              const sequence& b, 
+              const comparator& comp) : A(a), B(b), cmp(comp) {
+            init();
+        }
+        
+        ~Diff() {}
+        
+        long long getEditDistance () const {
+            return editDistance;
+        }
+        
+        Lcs< elem > getLcs () const {
+            return lcs;
+        }
+        
+        elemVec getLcsVec () const {
+            return lcs.getSequence();
+        }
+        
+        Ses< elem > getSes () const {
+            return ses;
+        }
+        
+        uniHunkVec getUniHunks () const {
+            return uniHunks;
+        }
+        
+        bool isHuge () const {
+            return huge;
+        }
+        
+        void onHuge () {
+            this->huge = true;
+        }
+        
+        void offHuge () {
+            this->huge = false;
+        }
+        
+        bool isUnserious () const {
+            return unserious;
+        }
+        
+        void onUnserious () {
+            this->unserious = true;
+        }
+        
+        void offUnserious () {
+            this->unserious = false;
+        }
+        
+        void onOnlyEditDistance () {
+            this->onlyEditDistance = true;
+        }
+        
+        /**
+         * patching with Unified Format Hunks
+         */
+        sequence uniPatch (const sequence& seq) {
+            elemList        seqLst(seq.begin(), seq.end());
+            sesElemVec      shunk;
+            sesElemVec_iter vsesIt;
+            elemList_iter   lstIt         = seqLst.begin();
+            long long       inc_dec_total = 0;
+            long long       gap           = 1;
+            for (uniHunkVec_iter it=uniHunks.begin();it!=uniHunks.end();++it) {
+                joinSesVec(shunk, it->common[0]);
+                joinSesVec(shunk, it->change);
+                joinSesVec(shunk, it->common[1]);
+                it->a         += inc_dec_total;
+                inc_dec_total += it->inc_dec_count;
+                for (long long i=0;i<it->a - gap;++i) {
+                    ++lstIt;
+                }
+                gap = it->a + it->b + it->inc_dec_count;
+                vsesIt = shunk.begin();
+                while (vsesIt!=shunk.end()) {
+                    switch (vsesIt->second.type) {
+                    case SES_ADD :
+                        seqLst.insert(lstIt, vsesIt->first);
+                        break;
+                    case SES_DELETE :
+                        if (lstIt != seqLst.end()) {
+                            lstIt = seqLst.erase(lstIt);
+                        }
+                        break;
+                    case SES_COMMON :
+                        if (lstIt != seqLst.end()) {
+                            ++lstIt;
+                        }
+                        break;
+                    default :
+                        // no through
+                        break;
+                    }
+                    ++vsesIt;
+                }
+                shunk.clear();
+            }
+            
+            sequence patchedSeq(seqLst.begin(), seqLst.end());
+            return patchedSeq;
+        }
+        
+        /**
+         * patching with Shortest Edit Script
+         */
+        sequence patch (const sequence& seq) const {
+            sesElemVec    sesSeq = ses.getSequence();
+            elemList      seqLst(seq.begin(), seq.end());
+            elemList_iter lstIt  = seqLst.begin();
+            for (sesElemVec_iter sesIt=sesSeq.begin();sesIt!=sesSeq.end();++sesIt) {
+                switch (sesIt->second.type) {
+                case SES_ADD :
+                    seqLst.insert(lstIt, sesIt->first);
+                    break;
+                case SES_DELETE :
+                    lstIt = seqLst.erase(lstIt);
+                    break;
+                case SES_COMMON :
+                    ++lstIt;
+                    break;
+                default :
+                    // no through
+                    break;
+                }
+            }
+            sequence patchedSeq(seqLst.begin(), seqLst.end());
+            return patchedSeq;
+        }
+        
+        /**
+         * compose Longest Common Subsequence and Shortest Edit Script.
+         * The algorithm implemented here is based on "An O(NP) Sequence Comparison Algorithm"
+         * described by Sun Wu, Udi Manber and Gene Myers
+         */
+        void compose() {
+            
+            if (isHuge()) {
+                pathCordinates.reserve(MAX_CORDINATES_SIZE);
+            }
+            
+            long long p = -1;
+            fp = new long long[M + N + 3];
+            fill(&fp[0], &fp[M + N + 3], -1);
+            path = editPath(M + N + 3);
+            fill(path.begin(), path.end(), -1);
+        ONP:
+            do {
+                ++p;
+                for (long long k=-p;k<=static_cast<long long>(delta)-1;++k) {
+                    fp[k+offset] = snake(k, fp[k-1+offset]+1, fp[k+1+offset]);
+                }
+                for (long long k=static_cast<long long>(delta)+p;k>=static_cast<long long>(delta)+1;--k) {
+                    fp[k+offset] = snake(k, fp[k-1+offset]+1, fp[k+1+offset]);
+                }
+                fp[delta+offset] = snake(static_cast<long long>(delta), fp[delta-1+offset]+1, fp[delta+1+offset]);
+            } while (fp[delta+offset] != static_cast<long long>(N) && pathCordinates.size() < MAX_CORDINATES_SIZE);
+            
+            editDistance += static_cast<long long>(delta) + 2 * p;
+            long long r = path[delta+offset];
+            P cordinate;
+            editPathCordinates epc(0);
+            
+            // only recoding editdistance
+            if (onlyEditDistance) {
+                delete[] this->fp;
+                return;
+            }
+            
+            while(r != -1) {
+                cordinate.x = pathCordinates[(size_t)r].x;
+                cordinate.y = pathCordinates[(size_t)r].y;
+                epc.push_back(cordinate);
+                r = pathCordinates[(size_t)r].k;
+            }
+            
+            // record Longest Common Subsequence & Shortest Edit Script
+            if (!recordSequence(epc)) {
+                pathCordinates.resize(0);
+                epc.resize(0);
+                p = -1;
+                goto ONP;
+            }
+            delete[] this->fp;
+        }
+
+        /**
+         * print difference between A and B with SES
+         */
+        template < typename stream >
+        void printSES (stream& out) const {
+            sesElemVec ses_v = ses.getSequence();
+            for_each(ses_v.begin(), ses_v.end(), ChangePrinter< sesElem, stream >(out));
+        }
+        
+        void printSES (ostream& out = cout) const {
+            printSES< ostream >(out);
+        }
+        
+        /**
+         * print difference with given SES
+         */
+        template < typename stream >
+        static void printSES (const Ses< elem >& s, stream& out) {
+            sesElemVec ses_v = s.getSequence();
+            for_each(ses_v.begin(), ses_v.end(), ChangePrinter< sesElem, stream >(out));
+        }
+        
+        static void printSES (const Ses< elem >& s, ostream& out = cout) {
+            printSES< ostream >(s, out);
+        }
+        
+        /**
+         * print difference between A and B with the format such as Unified Format
+         */
+        template < typename stream >
+        void printUnifiedFormat (stream& out) const {
+            for_each(uniHunks.begin(), uniHunks.end(), UniHunkPrinter< sesElem, stream >(out));
+        }
+        
+        void printUnifiedFormat (ostream& out = cout) const {
+            printUnifiedFormat< ostream >(out);
+        }
+        
+        /**
+         * print unified format difference with given unified format hunks
+         */
+        template < typename stream >
+        static void printUnifiedFormat (const uniHunkVec& hunks, stream& out) {
+            for_each(hunks.begin(), hunks.end(), UniHunkPrinter< sesElem >(out));
+        }
+
+        static void printUnifiedFormat (const uniHunkVec& hunks, ostream& out = cout) {
+            printUnifiedFormat< ostream >(hunks, out);
+        }
+
+        /**
+         * compose Unified Format Hunks from Shortest Edit Script
+         */
+        void composeUnifiedHunks () {
+            sesElemVec         common[2];
+            sesElemVec         change;
+            sesElemVec         ses_v  = ses.getSequence();
+            long long          l_cnt  = 1;
+            long long          length = distance(ses_v.begin(), ses_v.end());
+            long long          middle = 0;
+            bool               isMiddle, isAfter;
+            elem               e;
+            elemInfo           einfo;
+            long long          a, b, c, d;        // @@ -a,b +c,d @@
+            long long          inc_dec_count = 0;
+            uniHunk< sesElem > hunk;
+            sesElemVec         adds;
+            sesElemVec         deletes;
+            
+            isMiddle = isAfter = false;
+            a = b = c = d = 0;
+            
+            for (sesElemVec_iter it=ses_v.begin();it!=ses_v.end();++it, ++l_cnt) {
+                e = it->first;
+                einfo = it->second;
+                switch (einfo.type) {
+                case SES_ADD :
+                    middle = 0;
+                    ++inc_dec_count;
+                    adds.push_back(*it);
+                    if (!isMiddle)       isMiddle = true;
+                    if (isMiddle)        ++d;
+                    if (l_cnt >= length) {
+                        joinSesVec(change, deletes);
+                        joinSesVec(change, adds);
+                        isAfter = true;
+                    }
+                    break;
+                case SES_DELETE :
+                    middle = 0;
+                    --inc_dec_count;
+                    deletes.push_back(*it);
+                    if (!isMiddle)       isMiddle = true;
+                    if (isMiddle)        ++b;
+                    if (l_cnt >= length) {
+                        joinSesVec(change, deletes);
+                        joinSesVec(change, adds);
+                        isAfter = true;
+                    }
+                    break;
+                case SES_COMMON :
+                    ++b;++d;
+                    if (common[1].empty() && adds.empty() && deletes.empty() && change.empty()) {
+                        if (static_cast<long long>(common[0].size()) < DTL_CONTEXT_SIZE) {
+                            if (a == 0 && c == 0) {
+                                a = einfo.beforeIdx;
+                                c = einfo.afterIdx;
+                            }
+                            common[0].push_back(*it);
+                        } else {
+                            rotate(common[0].begin(), common[0].begin() + 1, common[0].end());
+                            common[0].pop_back();
+                            common[0].push_back(*it);
+                            ++a;++c;
+                            --b;--d;
+                        }
+                    }
+                    if (isMiddle && !isAfter) {
+                        ++middle;
+                        joinSesVec(change, deletes);
+                        joinSesVec(change, adds);
+                        change.push_back(*it);
+                        if (middle >= DTL_SEPARATE_SIZE || l_cnt >= length) {
+                            isAfter = true;
+                        }
+                        adds.clear();
+                        deletes.clear();
+                    }
+                    break;
+                default :
+                    // no through
+                    break;
+                }
+                // compose unified format hunk
+                if (isAfter && !change.empty()) {
+                    sesElemVec_iter cit = it;
+                    long long       cnt = 0;
+                    for (long long i=0;i<DTL_SEPARATE_SIZE && (cit != ses_v.end());++i, ++cit) {
+                        if (cit->second.type == SES_COMMON) {
+                            ++cnt;
+                        }
+                    }
+                    if (cnt < DTL_SEPARATE_SIZE && l_cnt < length) {
+                        middle = 0;
+                        isAfter = false;
+                        continue;
+                    }
+                    if (static_cast<long long>(common[0].size()) >= DTL_SEPARATE_SIZE) {
+                        long long c0size = static_cast<long long>(common[0].size());
+                        rotate(common[0].begin(), 
+                               common[0].begin() + (size_t)c0size - DTL_SEPARATE_SIZE, 
+                               common[0].end());
+                        for (long long i=0;i<c0size - DTL_SEPARATE_SIZE;++i) {
+                            common[0].pop_back();
+                        }
+                        a += c0size - DTL_SEPARATE_SIZE;
+                        c += c0size - DTL_SEPARATE_SIZE;
+                    }
+                    if (a == 0) ++a;
+                    if (c == 0) ++c;
+                    if (isReverse()) swap(a, c);
+                    hunk.a = a;hunk.b = b;hunk.c = c;hunk.d = d;
+                    hunk.common[0] = common[0];
+                    hunk.change = change;
+                    hunk.common[1] = common[1];
+                    hunk.inc_dec_count = inc_dec_count;
+                    uniHunks.push_back(hunk);
+                    isMiddle = false;
+                    isAfter = false;
+                    common[0].clear();
+                    common[1].clear();
+                    adds.clear();
+                    deletes.clear();
+                    change.clear();
+                    a = b = c = d = middle = inc_dec_count = 0;
+                }
+            }
+        }
+        
+        /**
+         * compose ses from stream
+         */
+        template <typename stream>
+        static Ses< elem > composeSesFromStream (stream& st)
+        {
+            elem        line;
+            Ses< elem > ret;
+            long long   x_idx, y_idx;
+            x_idx = y_idx = 1;
+            while (getline(st, line)) {
+                elem mark(line.begin(), line.begin() + 1);
+                elem e(line.begin() + 1, line.end());
+                if (mark == SES_MARK_DELETE) {
+                    ret.addSequence(e, x_idx, 0, SES_DELETE);
+                    ++x_idx;
+                } else if (mark == SES_MARK_ADD) {
+                    ret.addSequence(e, y_idx, 0, SES_ADD);
+                    ++y_idx;
+                } else if (mark == SES_MARK_COMMON) {
+                    ret.addSequence(e, x_idx, y_idx, SES_COMMON);
+                    ++x_idx;
+                    ++y_idx;
+                }
+            }
+            return ret;
+        }
+
+    private :
+        /**
+         * initialize
+         */
+        void init () {
+            M = distance(A.begin(), A.end());
+            N = distance(B.begin(), B.end());
+            if (M < N) {
+                reverse = false;
+            } else {
+                swap(A, B);
+                swap(M, N);
+                reverse = true;
+            }
+            editDistance     = 0;
+            delta            = N - M;
+            offset           = M + 1;
+            huge             = false;
+            unserious        = false;
+            onlyEditDistance = false;
+            fp               = NULL;
+        }
+        
+        /**
+         * search shortest path and record the path
+         */
+        long long snake(const long long& k, const long long& above, const long long& below) {
+            long long r = above > below ? path[(size_t)k-1+offset] : path[(size_t)k+1+offset];
+            long long y = max(above, below);
+            long long x = y - k;
+            while ((size_t)x < M && (size_t)y < N && cmp.impl(A[(size_t)x], B[(size_t)y])) {
+                ++x;++y;
+            }
+            
+            path[(size_t)k+offset] = static_cast<long long>(pathCordinates.size());
+            if (!onlyEditDistance) {
+                P p;
+                p.x = x;p.y = y;p.k = r;
+                pathCordinates.push_back(p);      
+            }
+            return y;
+        }
+        
+        /**
+         * record SES and LCS
+         */
+        bool recordSequence (const editPathCordinates& v) {
+            sequence_const_iter x(A.begin());
+            sequence_const_iter y(B.begin());
+            long long           x_idx,  y_idx;  // line number for Unified Format
+            long long           px_idx, py_idx; // cordinates
+            bool                complete = false;
+            x_idx  = y_idx  = 1;
+            px_idx = py_idx = 0;
+            for (size_t i=v.size()-1;!complete;--i) {
+                while(px_idx < v[i].x || py_idx < v[i].y) {
+                    if (v[i].y - v[i].x > py_idx - px_idx) {
+                        if (!isReverse()) {
+                            ses.addSequence(*y, y_idx, 0, SES_ADD);
+                        } else {
+                            ses.addSequence(*y, y_idx, 0, SES_DELETE);
+                        }
+                        ++y;
+                        ++y_idx;
+                        ++py_idx;
+                    } else if (v[i].y - v[i].x < py_idx - px_idx) {
+                        if (!isReverse()) {
+                            ses.addSequence(*x, x_idx, 0, SES_DELETE);
+                        } else {
+                            ses.addSequence(*x, x_idx, 0, SES_ADD);
+                        }
+                        ++x;
+                        ++x_idx;
+                        ++px_idx;
+                    } else {
+                        lcs.addSequence(*x);
+                        ses.addSequence(*x, x_idx, y_idx, SES_COMMON);
+                        ++x;
+                        ++y;
+                        ++x_idx;
+                        ++y_idx;
+                        ++px_idx;
+                        ++py_idx;
+                    }
+                }
+                if (i == 0) complete = true;
+            }
+            
+            if (x_idx > static_cast<long long>(M) && y_idx > static_cast<long long>(N)) {
+                // all recording success
+            } else {
+                // unserious difference
+                if (isUnserious()) {
+                    if (!isReverse()) {
+                        recordOddSequence(x_idx, M, x, SES_DELETE);
+                        recordOddSequence(y_idx, N, y, SES_ADD);
+                    } else {
+                        recordOddSequence(x_idx, M, x, SES_ADD);
+                        recordOddSequence(y_idx, N, y, SES_DELETE);
+                    }
+                    return true;
+                }
+                
+                // decent difference
+                sequence A_(A.begin() + (size_t)x_idx - 1, A.end());
+                sequence B_(B.begin() + (size_t)y_idx - 1, B.end());
+                A        = A_;
+                B        = B_;
+                M        = distance(A.begin(), A.end());
+                N        = distance(B.begin(), B.end());
+                delta    = N - M;
+                offset   = M + 1;
+                delete[] fp;
+                fp = new long long[M + N + 3];
+                fill(&fp[0], &fp[M + N + 3], -1);
+                fill(path.begin(), path.end(), -1);
+                return false;
+            }
+            return true;
+        }
+        
+        /**
+         * record odd sequence to ses
+         */
+        void inline recordOddSequence (long long idx, long long length, sequence_const_iter it, const edit_t et) {
+            while(idx < length){
+                ses.addSequence(*it, idx, 0, et);
+                ++it;
+                ++idx;
+                ++editDistance;
+            }
+            ses.addSequence(*it, idx, 0, et);
+            ++editDistance;
+        }
+        
+        /**
+         * join ses vectors
+         */
+        void inline joinSesVec (sesElemVec& s1, sesElemVec& s2) const {
+            if (!s2.empty()) {
+                for (sesElemVec_iter vit=s2.begin();vit!=s2.end();++vit) {
+                    s1.push_back(*vit);
+                }
+            }      
+        }
+        
+        /**
+         * check sequence is replaced each other
+         */
+        bool inline isReverse () const {
+            return reverse;
+        }
+
+    };
+}
+
+#endif // DTL_DIFF_H
diff --git a/dtl/Diff3.hpp b/dtl/Diff3.hpp
new file mode 100644 (file)
index 0000000..1740ec4
--- /dev/null
@@ -0,0 +1,245 @@
+/**
+   dtl-1.12 -- Diff Template Library
+   
+   In short, Diff Template Library is distributed under so called "BSD license",
+   
+   Copyright (c) 2008-2011 Tatsuhiko Kubo <cubicdaiya@gmail.com>
+   All rights reserved.
+   
+   Redistribution and use in source and binary forms, with or without modification,
+   are permitted provided that the following conditions are met:
+   
+   * Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+   
+   * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+   
+   * Neither the name of the authors nor the names of its contributors
+   may be used to endorse or promote products derived from this software 
+   without specific prior written permission.
+   
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* If you use this library, you must include dtl.hpp only. */
+
+#ifndef DTL_DIFF3_H
+#define DTL_DIFF3_H
+
+namespace dtl {
+    
+    /**
+     * diff3 class template
+     * sequence must support random_access_iterator.
+     */
+    template <typename elem, typename sequence = vector< elem >, typename comparator = Compare< elem > >
+    class Diff3
+    {
+    private:
+        dtl_typedefs(elem, sequence)
+        sequence                           A;
+        sequence                           B;
+        sequence                           C;
+        sequence                           S;
+        Diff< elem, sequence, comparator > diff_ba;
+        Diff< elem, sequence, comparator > diff_bc;
+        bool                               conflict;
+        elem                               csepabegin;
+        elem                               csepa;
+        elem                               csepaend;
+    public :
+        Diff3 () {}
+        Diff3 (const sequence& a, 
+               const sequence& b, 
+               const sequence& c) : A(a), B(b), C(c), 
+                                    diff_ba(b, a), diff_bc(b, c), 
+                                    conflict(false) {} 
+        
+        ~Diff3 () {}
+        
+        bool isConflict () const {
+            return conflict;
+        }
+        
+        sequence getMergedSequence () const {
+            return S;
+        }
+        
+        /**
+         * merge changes B and C to A
+         */
+        bool merge () {
+            if (diff_ba.getEditDistance() == 0) {     // A == B
+                if (diff_bc.getEditDistance() == 0) { // A == B == C
+                    S = B;
+                    return true;
+                }
+                S = C;
+                return true;
+            } else {                                  // A != B
+                if (diff_bc.getEditDistance() == 0) { // A != B == C
+                    S = A;                              
+                    return true;
+                } else {                              // A != B != C
+                    S = merge_();
+                    if (isConflict()) {               // conflict occured
+                        return false;
+                    }
+                }
+            }
+            return true;
+        }
+        
+        /**
+         * compose differences
+         */
+        void compose () {
+            diff_ba.compose();
+            diff_bc.compose();
+        }
+        
+    private :
+        /**
+         * merge implementation
+         */
+        sequence merge_ () {
+            elemVec         seq;
+            Ses< elem >     ses_ba   = diff_ba.getSes();
+            Ses< elem >     ses_bc   = diff_bc.getSes();
+            sesElemVec      ses_ba_v = ses_ba.getSequence();
+            sesElemVec      ses_bc_v = ses_bc.getSequence();
+            sesElemVec_iter ba_it    = ses_ba_v.begin();
+            sesElemVec_iter bc_it    = ses_bc_v.begin();
+            sesElemVec_iter ba_end   = ses_ba_v.end();
+            sesElemVec_iter bc_end   = ses_bc_v.end();
+            
+            while (!isEnd(ba_end, ba_it) || !isEnd(bc_end, bc_it)) {
+                while (true) {
+                    if (!isEnd(ba_end, ba_it)            && 
+                        !isEnd(bc_end, bc_it)            &&
+                        ba_it->first == bc_it->first     && 
+                        ba_it->second.type == SES_COMMON && 
+                        bc_it->second.type == SES_COMMON) {
+                        // do nothing
+                    } else {
+                        break;
+                    }
+                    if      (!isEnd(ba_end, ba_it)) seq.push_back(ba_it->first);
+                    else if (!isEnd(bc_end, bc_it)) seq.push_back(bc_it->first);
+                    forwardUntilEnd(ba_end, ba_it);
+                    forwardUntilEnd(bc_end, bc_it);
+                }
+                if (isEnd(ba_end, ba_it) || isEnd(bc_end, bc_it)) break;
+                if (   ba_it->second.type == SES_COMMON 
+                       && bc_it->second.type == SES_DELETE) {
+                    forwardUntilEnd(ba_end, ba_it);
+                    forwardUntilEnd(bc_end, bc_it);
+                } else if (ba_it->second.type == SES_COMMON && 
+                           bc_it->second.type == SES_ADD) {
+                    seq.push_back(bc_it->first);
+                    forwardUntilEnd(bc_end, bc_it);
+                } else if (ba_it->second.type == SES_DELETE && 
+                           bc_it->second.type == SES_COMMON) {
+                    forwardUntilEnd(ba_end, ba_it);
+                    forwardUntilEnd(bc_end, bc_it);
+                } else if (ba_it->second.type == SES_DELETE && 
+                           bc_it->second.type == SES_DELETE) {
+                    if (ba_it->first == bc_it->first) {
+                        forwardUntilEnd(ba_end, ba_it);
+                        forwardUntilEnd(bc_end, bc_it);
+                    } else {
+                        // conflict
+                        conflict = true;
+                        return B;
+                    }
+                } else if (ba_it->second.type == SES_DELETE && 
+                           bc_it->second.type == SES_ADD) {
+                    // conflict
+                    conflict = true;
+                    return B;
+                } else if (ba_it->second.type == SES_ADD && 
+                           bc_it->second.type == SES_COMMON) {
+                    seq.push_back(ba_it->first);
+                    forwardUntilEnd(ba_end, ba_it);
+                } else if (ba_it->second.type == SES_ADD && 
+                           bc_it->second.type == SES_DELETE) {
+                    // conflict
+                    conflict = true;
+                    return B;
+                } else if (ba_it->second.type == SES_ADD && 
+                           bc_it->second.type == SES_ADD) {
+                    if (ba_it->first == bc_it->first) {
+                        seq.push_back(ba_it->first);
+                        forwardUntilEnd(ba_end, ba_it);
+                        forwardUntilEnd(bc_end, bc_it);
+                    } else {
+                        // conflict
+                        conflict = true;
+                        return B;
+                    }
+                }
+            }
+            
+            if (isEnd(ba_end, ba_it)) {
+                addDecentSequence(bc_end, bc_it, seq);
+            } else if (isEnd(bc_end, bc_it)) {
+                addDecentSequence(ba_end, ba_it, seq);
+            }
+            
+            sequence mergedSeq(seq.begin(), seq.end());
+            return mergedSeq;
+        }
+        
+        /**
+         * join elem vectors
+         */
+        void inline joinElemVec (elemVec& s1, elemVec& s2) const {
+            if (!s2.empty()) {
+                for (elemVec_iter vit=s2.begin();vit!=s2.end();++vit) {
+                    s1.push_back(*vit);
+                }
+            }
+        }
+        
+        /**
+         * check sequence is end
+         */
+        template <typename T_iter>
+        bool inline isEnd (const T_iter& end, const T_iter& it) const {
+            return it == end ? true : false;
+        }
+        
+        /**
+         * increment iterator until iterator is end
+         */
+        template <typename T_iter>
+        void inline forwardUntilEnd (const T_iter& end, T_iter& it) const {
+            if (!isEnd(end, it)) ++it;
+        }
+        
+        /**
+         * add elements which SES's type is ADD
+         */
+        void inline addDecentSequence (const sesElemVec_iter& end, sesElemVec_iter& it, elemVec& seq) const {
+            while (!isEnd(end, it)) {
+                if (it->second.type == SES_ADD) seq.push_back(it->first);
+                ++it;
+            }      
+        }
+        
+    };
+}
+
+#endif // DTL_DIFF3_H
diff --git a/dtl/Lcs.hpp b/dtl/Lcs.hpp
new file mode 100644 (file)
index 0000000..16e1440
--- /dev/null
@@ -0,0 +1,55 @@
+/**
+   dtl-1.12 -- Diff Template Library
+   
+   In short, Diff Template Library is distributed under so called "BSD license",
+   
+   Copyright (c) 2008-2011 Tatsuhiko Kubo <cubicdaiya@gmail.com>
+   All rights reserved.
+   
+   Redistribution and use in source and binary forms, with or without modification,
+   are permitted provided that the following conditions are met:
+   
+   * Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+   
+   * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+   
+   * Neither the name of the authors nor the names of its contributors
+   may be used to endorse or promote products derived from this software 
+   without specific prior written permission.
+   
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* If you use this library, you must include dtl.hpp only. */
+
+#ifndef DTL_LCS_H
+#define DTL_LCS_H
+
+namespace dtl {
+    
+    /**
+     * Longest Common Subsequence template class
+     */
+    template <typename elem>
+    class Lcs : public Sequence< elem >
+    {
+    public :
+        Lcs ()  {}
+        ~Lcs () {}
+    };
+}
+
+#endif // DTL_LCS_H
diff --git a/dtl/Sequence.hpp b/dtl/Sequence.hpp
new file mode 100644 (file)
index 0000000..7c3df9b
--- /dev/null
@@ -0,0 +1,65 @@
+/**
+   dtl-1.12 -- Diff Template Library
+   
+   In short, Diff Template Library is distributed under so called "BSD license",
+   
+   Copyright (c) 2008-2011 Tatsuhiko Kubo <cubicdaiya@gmail.com>
+   All rights reserved.
+   
+   Redistribution and use in source and binary forms, with or without modification,
+   are permitted provided that the following conditions are met:
+   
+   * Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+   
+   * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+   
+   * Neither the name of the authors nor the names of its contributors
+   may be used to endorse or promote products derived from this software 
+   without specific prior written permission.
+   
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* If you use this library, you must include dtl.hpp only. */
+
+#ifndef DTL_SEQUENCE_H
+#define DTL_SEQUENCE_H
+
+namespace dtl {
+    
+    /**  
+     * sequence class template
+     */
+    template <typename elem>
+    class Sequence
+    {
+    public :
+        typedef vector< elem > elemVec;
+        Sequence () {}
+        virtual ~Sequence () {}
+        
+        elemVec getSequence () const {
+            return sequence;
+        }
+        void addSequence (elem e) {
+            sequence.push_back(e);
+        }
+    protected :
+        elemVec sequence;
+    };
+}
+
+#endif // DTL_SEQUENCE_H
diff --git a/dtl/Ses.hpp b/dtl/Ses.hpp
new file mode 100644 (file)
index 0000000..ed50d40
--- /dev/null
@@ -0,0 +1,112 @@
+/**
+   dtl-1.12 -- Diff Template Library
+   
+   In short, Diff Template Library is distributed under so called "BSD license",
+   
+   Copyright (c) 2008-2011 Tatsuhiko Kubo <cubicdaiya@gmail.com>
+   All rights reserved.
+   
+   Redistribution and use in source and binary forms, with or without modification,
+   are permitted provided that the following conditions are met:
+   
+   * Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+   
+   * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+   
+   * Neither the name of the authors nor the names of its contributors
+   may be used to endorse or promote products derived from this software 
+   without specific prior written permission.
+   
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* If you use this library, you must include dtl.hpp only. */
+
+#ifndef DTL_SES_H
+#define DTL_SES_H
+
+namespace dtl {
+    
+    /**
+     * Shortest Edit Script template class
+     */
+    template <typename elem>
+    class Ses : public Sequence< elem >
+    {
+    private :
+        typedef pair< elem, elemInfo > sesElem;
+        typedef vector< sesElem >      sesElemVec;
+    public :
+        
+        Ses () : onlyAdd(true), onlyDelete(true), onlyCopy(true) { }
+        ~Ses () {}
+        
+        bool isOnlyAdd () const {
+            return onlyAdd;
+        }
+        
+        bool isOnlyDelete () const {
+            return onlyDelete;
+        }
+        
+        bool isOnlyCopy () const {
+            return onlyCopy;
+        }
+        
+        bool isOnlyOneOperation () const {
+            return isOnlyAdd() || isOnlyAdd() || isOnlyCopy();
+        }
+        
+        bool isChange () const {
+            return !onlyCopy;
+        }
+        
+        using Sequence< elem >::addSequence;
+        void addSequence (elem e, long long beforeIdx, long long afterIdx, const edit_t type) {
+            elemInfo info;
+            info.beforeIdx = beforeIdx;
+            info.afterIdx  = afterIdx;
+            info.type      = type;
+            sesElem pe(e, info);
+            sequence.push_back(pe);
+            switch (type) {
+            case SES_DELETE:
+                onlyCopy   = false;
+                onlyAdd    = false;
+                break;
+            case SES_COMMON:
+                onlyAdd    = false;
+                onlyDelete = false;
+                break;
+            case SES_ADD:
+                onlyDelete = false;
+                onlyCopy   = false;
+                break;
+            }
+        }
+        
+        sesElemVec getSequence () const {
+            return sequence;
+        }
+    private :
+        sesElemVec sequence;
+        bool       onlyAdd;
+        bool       onlyDelete;
+        bool       onlyCopy;
+    };
+}
+
+#endif // DTL_SES_H
diff --git a/dtl/dtl.hpp b/dtl/dtl.hpp
new file mode 100644 (file)
index 0000000..dd999f8
--- /dev/null
@@ -0,0 +1,47 @@
+/**
+   dtl-1.12 -- Diff Template Library
+   
+   In short, Diff Template Library is distributed under so called "BSD license",
+   
+   Copyright (c) 2008-2011 Tatsuhiko Kubo <cubicdaiya@gmail.com>
+   All rights reserved.
+   
+   Redistribution and use in source and binary forms, with or without modification,
+   are permitted provided that the following conditions are met:
+   
+   * Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+   
+   * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+   
+   * Neither the name of the authors nor the names of its contributors
+   may be used to endorse or promote products derived from this software 
+   without specific prior written permission.
+   
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef DTL_H
+#define DTL_H
+
+#include "variables.hpp"
+#include "functors.hpp"
+#include "Sequence.hpp"
+#include "Lcs.hpp"
+#include "Ses.hpp"
+#include "Diff.hpp"
+#include "Diff3.hpp"
+
+#endif // DTL_H
diff --git a/dtl/functors.hpp b/dtl/functors.hpp
new file mode 100644 (file)
index 0000000..5ddf1eb
--- /dev/null
@@ -0,0 +1,137 @@
+/**
+   dtl-1.12 -- Diff Template Library
+   
+   In short, Diff Template Library is distributed under so called "BSD license",
+   
+   Copyright (c) 2008-2011 Tatsuhiko Kubo <cubicdaiya@gmail.com>
+   All rights reserved.
+   
+   Redistribution and use in source and binary forms, with or without modification,
+   are permitted provided that the following conditions are met:
+   
+   * Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+   
+   * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+   
+   * Neither the name of the authors nor the names of its contributors
+   may be used to endorse or promote products derived from this software 
+   without specific prior written permission.
+   
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* If you use this library, you must include dtl.hpp only. */
+
+#ifndef DTL_FUNCTORS_H
+#define DTL_FUNCTORS_H
+
+namespace dtl {
+    
+    /**
+     * printer class template
+     */
+    template <typename sesElem, typename stream = ostream >
+    class Printer
+    {
+    public :
+        Printer ()            : out_(cout) {}
+        Printer (stream& out) : out_(out)  {}
+        virtual ~Printer () {}
+        virtual void operator() (const sesElem& se) const = 0;
+    protected :
+        stream& out_;
+    };
+    
+    /**
+     * common element printer class template
+     */
+    template <typename sesElem, typename stream = ostream >
+    class CommonPrinter : public Printer < sesElem, stream >
+    {
+    public :
+        CommonPrinter  ()            : Printer < sesElem, stream > ()    {}
+        CommonPrinter  (stream& out) : Printer < sesElem, stream > (out) {}
+        ~CommonPrinter () {}
+        void operator() (const sesElem& se) const {
+            this->out_ << SES_MARK_COMMON << se.first << endl;    
+        }
+    };
+    
+    /**
+     * ses element printer class template
+     */
+    template <typename sesElem, typename stream = ostream >
+    class ChangePrinter : public Printer < sesElem, stream >
+    {
+    public :
+        ChangePrinter  ()            : Printer < sesElem, stream > ()    {}
+        ChangePrinter  (stream& out) : Printer < sesElem, stream > (out) {}
+        ~ChangePrinter () {}
+        void operator() (const sesElem& se) const {
+            switch (se.second.type) {
+            case SES_ADD:
+                this->out_ << SES_MARK_ADD    << se.first << endl;
+                break;
+            case SES_DELETE:
+                this->out_ << SES_MARK_DELETE << se.first << endl;
+                break;
+            case SES_COMMON:
+                this->out_ << SES_MARK_COMMON << se.first << endl;
+                break;
+            }
+        }
+    };
+    
+    /**
+     * unfiend format element printer class template
+     */
+    template <typename sesElem, typename stream = ostream >
+    class UniHunkPrinter
+    {
+    public :
+        UniHunkPrinter  ()            : out_(cout) {}
+        UniHunkPrinter  (stream& out) : out_(out)  {}
+        ~UniHunkPrinter () {}
+        void operator() (const uniHunk< sesElem >& hunk) const {
+            out_ << "@@"
+                 << " -"  << hunk.a << "," << hunk.b
+                 << " +"  << hunk.c << "," << hunk.d
+                 << " @@" << endl;
+            
+            for_each(hunk.common[0].begin(), hunk.common[0].end(), CommonPrinter< sesElem, stream >(out_));
+            for_each(hunk.change.begin(),    hunk.change.end(),    ChangePrinter< sesElem, stream >(out_));
+            for_each(hunk.common[1].begin(), hunk.common[1].end(), CommonPrinter< sesElem, stream >(out_));
+        }
+    private :
+        stream& out_;
+    };
+    
+    /**
+     * compare class template
+     */
+    template <typename elem>
+    class Compare
+    {
+    public :
+        Compare () {}
+        virtual ~Compare () {}
+        virtual inline bool impl (const elem& e1, const elem& e2) const {
+            return e1 == e2;
+        }
+    };
+}
+
+#endif // DTL_FUNCTORS_H
diff --git a/dtl/variables.hpp b/dtl/variables.hpp
new file mode 100644 (file)
index 0000000..259ee29
--- /dev/null
@@ -0,0 +1,134 @@
+/**
+   dtl-1.12 -- Diff Template Library
+   
+   In short, Diff Template Library is distributed under so called "BSD license",
+   
+   Copyright (c) 2008-2011 Tatsuhiko Kubo <cubicdaiya@gmail.com>
+   All rights reserved.
+   
+   Redistribution and use in source and binary forms, with or without modification,
+   are permitted provided that the following conditions are met:
+   
+   * Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+   
+   * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+   
+   * Neither the name of the authors nor the names of its contributors
+   may be used to endorse or promote products derived from this software 
+   without specific prior written permission.
+   
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* If you use this library, you must include dtl.hpp only. */
+
+#ifndef DTL_VARIABLES_H
+#define DTL_VARIABLES_H
+
+#include <vector>
+#include <list>
+#include <string>
+#include <algorithm>
+#include <iostream>
+
+namespace dtl {
+    
+    using std::vector;
+    using std::string;
+    using std::pair;
+    using std::ostream;
+    using std::list;
+    using std::for_each;
+    using std::distance;
+    using std::fill;
+    using std::cout;
+    using std::endl;
+    using std::rotate;
+    using std::swap;
+    using std::max;
+    
+    /**
+     * type of edit for SES
+     */
+    typedef int edit_t;
+    const   edit_t SES_DELETE = -1;
+    const   edit_t SES_COMMON = 0;
+    const   edit_t SES_ADD    = 1;
+    
+    /**
+     * mark of SES
+     */
+#define SES_MARK_DELETE "-"
+#define SES_MARK_COMMON " "
+#define SES_MARK_ADD    "+"
+
+    /**
+     * info for Unified Format
+     */
+    typedef struct eleminfo {
+        long long beforeIdx;           // index of prev sequence
+        long long afterIdx;            // index of after sequence
+        edit_t    type;                // type of edit(Add, Delete, Common)
+    } elemInfo;
+    
+    const long long DTL_SEPARATE_SIZE = 3;
+    const long long DTL_CONTEXT_SIZE  = 3;
+    
+    /**
+     * cordinate for registering route
+     */
+    typedef struct Point {
+        long long x;                         // x cordinate
+        long long y;                         // y cordinate
+        long long k;                         // vertex
+    } P;
+    
+    /**
+     * limit of cordinate size
+     */
+    const unsigned long long MAX_CORDINATES_SIZE = 2000000;
+    
+    typedef vector< long long > editPath;
+    typedef vector< P >         editPathCordinates;
+    
+    /**
+     * Structure of Unified Format Hunk
+     */
+    template <typename sesElem>
+    struct uniHunk {
+        long long a, b, c, d;        // @@ -a,b +c,d @@
+        vector< sesElem > common[2]; // anteroposterior commons on changes
+        vector< sesElem > change;    // changes
+        long long inc_dec_count;     // count of increace and decrease
+    };
+
+#define dtl_typedefs(elem, sequence)                                    \
+    typedef pair< elem, elemInfo >            sesElem;                  \
+    typedef vector< sesElem >                 sesElemVec;               \
+    typedef vector< uniHunk< sesElem > >      uniHunkVec;               \
+    typedef list< elem >                      elemList;                 \
+    typedef vector< elem >                    elemVec;                  \
+    typedef typename uniHunkVec::iterator     uniHunkVec_iter;          \
+    typedef typename sesElemVec::iterator     sesElemVec_iter;          \
+    typedef typename elemList::iterator       elemList_iter;            \
+    typedef typename sequence::iterator       sequence_iter;            \
+    typedef typename sequence::const_iterator sequence_const_iter;      \
+    typedef typename elemVec::iterator        elemVec_iter;
+
+
+} 
+
+#endif // DTL_VARIABLES_H
similarity index 88%
rename from wikiq.c
rename to wikiq.cpp
index a8cf97de7245abca47810854ca3f96c03a79e279..144e563b213a9181716b4142c95e466d2eda70c7 100644 (file)
--- a/wikiq.c
+++ b/wikiq.cpp
@@ -5,6 +5,7 @@
  */
 
 #include <stdio.h>
+#include <iostream>
 #include <string.h>
 #include <ctype.h>
 #include <stdlib.h>
 #include <getopt.h>
 #include "disorder.h"
 #include "md5.h"
+#include "dtl/dtl.hpp"
+#include <vector>
+
+using namespace std;
 
 // timestamp of the form 2003-11-07T00:43:23Z
 #define DATE_LENGTH 10
@@ -47,6 +52,7 @@ typedef struct {
     char *editorid;
     char *comment;
     char *text;
+    vector<string> last_tokens;
 
     // track string size of the elements, to prevent O(N^2) processing in charhndl
     // when we have to take strlen for every character which we append to the buffer
@@ -133,6 +139,7 @@ free_data(revisionData *data, int title)
     free(data->editorid);
     free(data->comment);
     free(data->text);
+    data->last_tokens.clear();
 }
 
 void cleanup_revision(revisionData *data) {
@@ -141,6 +148,7 @@ void cleanup_revision(revisionData *data) {
 
 void cleanup_article(revisionData *data) {
     clean_data(data, 1);
+    data->last_tokens.clear();
 }
 
 
@@ -223,8 +231,57 @@ write_row(revisionData *data)
         sprintf(md5_hex_output + di * 2, "%02x", digest[di]);
     }
 
+    string text = string(data->text, data->text_size);
+    vector<string> text_tokens;
+    size_t period_pos = 0;
+    size_t paragraph_pos = 0;
+    size_t start = 0;
+    while ((period_pos = text.find(".", period_pos + 1)) != string::npos &&
+            (paragraph_pos = text.find("\n\n", paragraph_pos + 1)) != string::npos) {
+        if (paragraph_pos < period_pos) {
+            text_tokens.push_back(text.substr(start, paragraph_pos - start));
+            start = paragraph_pos;
+        } else {
+            text_tokens.push_back(text.substr(start, period_pos - start));
+            start = period_pos;
+        }
+    }
+
+    vector<string> additions;
+    vector<string> deletions;
+
+    if (data->last_tokens.empty()) {
+        data->last_tokens = text_tokens;
+    } else {
+        // do the diff
+        
+        dtl::Diff< string, vector<string> > d(data->last_tokens, text_tokens);
+        //d.onOnlyEditDistance();
+        d.compose();
+
+        vector<pair<string, dtl::elemInfo> > ses_v = d.getSes().getSequence();
+        for (vector<pair<string, dtl::elemInfo> >::iterator sit=ses_v.begin(); sit!=ses_v.end(); ++sit) {
+            switch (sit->second.type) {
+            case dtl::SES_ADD:
+                cout << "ADD: \"" << sit->first << "\"" << endl;
+                additions.push_back(sit->first);
+                break;
+            case dtl::SES_DELETE:
+                cout << "DEL: \"" << sit->first << "\"" << endl;
+                deletions.push_back(sit->first);
+                break;
+            }
+        }
+
+        // apply regex to the diff
+
+
+        data->last_tokens = text_tokens;
+    }
+
+
     // print line of tsv output
-    printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s\t%i\t%f\t%s\n",
+    printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s\t%i\t%f\t%s\t%i\t%i\n",
         data->title,
         data->articleid,
         data->revid,
@@ -236,7 +293,9 @@ write_row(revisionData *data)
         (data->minor) ? "1" : "0",
         (unsigned int) data->text_size,
         shannon_H(data->text, data->text_size),
-        md5_hex_output
+        md5_hex_output,
+        (int) additions.size(),
+        (int) deletions.size()
         );
 
     // 

Benjamin Mako Hill || Want to submit a patch?