]> projects.mako.cc - wikiq/blob - dtl/Diff.hpp
added (broken, but running) diff routines for block-level diffs
[wikiq] / dtl / Diff.hpp
1 /**
2    dtl-1.12 -- Diff Template Library
3    
4    In short, Diff Template Library is distributed under so called "BSD license",
5    
6    Copyright (c) 2008-2011 Tatsuhiko Kubo <cubicdaiya@gmail.com>
7    All rights reserved.
8    
9    Redistribution and use in source and binary forms, with or without modification,
10    are permitted provided that the following conditions are met:
11    
12    * Redistributions of source code must retain the above copyright notice,
13    this list of conditions and the following disclaimer.
14    
15    * Redistributions in binary form must reproduce the above copyright notice,
16    this list of conditions and the following disclaimer in the documentation
17    and/or other materials provided with the distribution.
18    
19    * Neither the name of the authors nor the names of its contributors
20    may be used to endorse or promote products derived from this software 
21    without specific prior written permission.
22    
23    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
29    TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 */
35
36 /* If you use this library, you must include dtl.hpp only. */
37
38 #ifndef DTL_DIFF_H
39 #define DTL_DIFF_H
40
41 namespace dtl {
42     
43     /**
44      * diff class template
45      * sequence must support random_access_iterator.
46      */
47     template <typename elem, typename sequence = vector< elem >, typename comparator = Compare< elem > >
48     class Diff
49     {
50     private :
51         dtl_typedefs(elem, sequence)
52         sequence           A;
53         sequence           B;
54         size_t             M;
55         size_t             N;
56         size_t             delta;
57         size_t             offset;
58         long long          *fp;
59         long long          editDistance;
60         Lcs< elem >        lcs;
61         Ses< elem >        ses;
62         editPath           path;
63         editPathCordinates pathCordinates;
64         bool               reverse;
65         bool               huge;
66         bool               unserious;
67         bool               onlyEditDistance;
68         uniHunkVec         uniHunks;
69         comparator         cmp;
70     public :
71         Diff () {}
72         
73         Diff (const sequence& a, 
74               const sequence& b) : A(a), B(b) {
75             init();
76         }
77         
78         Diff (const sequence& a, 
79               const sequence& b, 
80               const comparator& comp) : A(a), B(b), cmp(comp) {
81             init();
82         }
83         
84         ~Diff() {}
85         
86         long long getEditDistance () const {
87             return editDistance;
88         }
89         
90         Lcs< elem > getLcs () const {
91             return lcs;
92         }
93         
94         elemVec getLcsVec () const {
95             return lcs.getSequence();
96         }
97         
98         Ses< elem > getSes () const {
99             return ses;
100         }
101         
102         uniHunkVec getUniHunks () const {
103             return uniHunks;
104         }
105         
106         bool isHuge () const {
107             return huge;
108         }
109         
110         void onHuge () {
111             this->huge = true;
112         }
113         
114         void offHuge () {
115             this->huge = false;
116         }
117         
118         bool isUnserious () const {
119             return unserious;
120         }
121         
122         void onUnserious () {
123             this->unserious = true;
124         }
125         
126         void offUnserious () {
127             this->unserious = false;
128         }
129         
130         void onOnlyEditDistance () {
131             this->onlyEditDistance = true;
132         }
133         
134         /**
135          * patching with Unified Format Hunks
136          */
137         sequence uniPatch (const sequence& seq) {
138             elemList        seqLst(seq.begin(), seq.end());
139             sesElemVec      shunk;
140             sesElemVec_iter vsesIt;
141             elemList_iter   lstIt         = seqLst.begin();
142             long long       inc_dec_total = 0;
143             long long       gap           = 1;
144             for (uniHunkVec_iter it=uniHunks.begin();it!=uniHunks.end();++it) {
145                 joinSesVec(shunk, it->common[0]);
146                 joinSesVec(shunk, it->change);
147                 joinSesVec(shunk, it->common[1]);
148                 it->a         += inc_dec_total;
149                 inc_dec_total += it->inc_dec_count;
150                 for (long long i=0;i<it->a - gap;++i) {
151                     ++lstIt;
152                 }
153                 gap = it->a + it->b + it->inc_dec_count;
154                 vsesIt = shunk.begin();
155                 while (vsesIt!=shunk.end()) {
156                     switch (vsesIt->second.type) {
157                     case SES_ADD :
158                         seqLst.insert(lstIt, vsesIt->first);
159                         break;
160                     case SES_DELETE :
161                         if (lstIt != seqLst.end()) {
162                             lstIt = seqLst.erase(lstIt);
163                         }
164                         break;
165                     case SES_COMMON :
166                         if (lstIt != seqLst.end()) {
167                             ++lstIt;
168                         }
169                         break;
170                     default :
171                         // no through
172                         break;
173                     }
174                     ++vsesIt;
175                 }
176                 shunk.clear();
177             }
178             
179             sequence patchedSeq(seqLst.begin(), seqLst.end());
180             return patchedSeq;
181         }
182         
183         /**
184          * patching with Shortest Edit Script
185          */
186         sequence patch (const sequence& seq) const {
187             sesElemVec    sesSeq = ses.getSequence();
188             elemList      seqLst(seq.begin(), seq.end());
189             elemList_iter lstIt  = seqLst.begin();
190             for (sesElemVec_iter sesIt=sesSeq.begin();sesIt!=sesSeq.end();++sesIt) {
191                 switch (sesIt->second.type) {
192                 case SES_ADD :
193                     seqLst.insert(lstIt, sesIt->first);
194                     break;
195                 case SES_DELETE :
196                     lstIt = seqLst.erase(lstIt);
197                     break;
198                 case SES_COMMON :
199                     ++lstIt;
200                     break;
201                 default :
202                     // no through
203                     break;
204                 }
205             }
206             sequence patchedSeq(seqLst.begin(), seqLst.end());
207             return patchedSeq;
208         }
209         
210         /**
211          * compose Longest Common Subsequence and Shortest Edit Script.
212          * The algorithm implemented here is based on "An O(NP) Sequence Comparison Algorithm"
213          * described by Sun Wu, Udi Manber and Gene Myers
214          */
215         void compose() {
216             
217             if (isHuge()) {
218                 pathCordinates.reserve(MAX_CORDINATES_SIZE);
219             }
220             
221             long long p = -1;
222             fp = new long long[M + N + 3];
223             fill(&fp[0], &fp[M + N + 3], -1);
224             path = editPath(M + N + 3);
225             fill(path.begin(), path.end(), -1);
226         ONP:
227             do {
228                 ++p;
229                 for (long long k=-p;k<=static_cast<long long>(delta)-1;++k) {
230                     fp[k+offset] = snake(k, fp[k-1+offset]+1, fp[k+1+offset]);
231                 }
232                 for (long long k=static_cast<long long>(delta)+p;k>=static_cast<long long>(delta)+1;--k) {
233                     fp[k+offset] = snake(k, fp[k-1+offset]+1, fp[k+1+offset]);
234                 }
235                 fp[delta+offset] = snake(static_cast<long long>(delta), fp[delta-1+offset]+1, fp[delta+1+offset]);
236             } while (fp[delta+offset] != static_cast<long long>(N) && pathCordinates.size() < MAX_CORDINATES_SIZE);
237             
238             editDistance += static_cast<long long>(delta) + 2 * p;
239             long long r = path[delta+offset];
240             P cordinate;
241             editPathCordinates epc(0);
242             
243             // only recoding editdistance
244             if (onlyEditDistance) {
245                 delete[] this->fp;
246                 return;
247             }
248             
249             while(r != -1) {
250                 cordinate.x = pathCordinates[(size_t)r].x;
251                 cordinate.y = pathCordinates[(size_t)r].y;
252                 epc.push_back(cordinate);
253                 r = pathCordinates[(size_t)r].k;
254             }
255             
256             // record Longest Common Subsequence & Shortest Edit Script
257             if (!recordSequence(epc)) {
258                 pathCordinates.resize(0);
259                 epc.resize(0);
260                 p = -1;
261                 goto ONP;
262             }
263             delete[] this->fp;
264         }
265
266         /**
267          * print difference between A and B with SES
268          */
269         template < typename stream >
270         void printSES (stream& out) const {
271             sesElemVec ses_v = ses.getSequence();
272             for_each(ses_v.begin(), ses_v.end(), ChangePrinter< sesElem, stream >(out));
273         }
274         
275         void printSES (ostream& out = cout) const {
276             printSES< ostream >(out);
277         }
278         
279         /**
280          * print difference with given SES
281          */
282         template < typename stream >
283         static void printSES (const Ses< elem >& s, stream& out) {
284             sesElemVec ses_v = s.getSequence();
285             for_each(ses_v.begin(), ses_v.end(), ChangePrinter< sesElem, stream >(out));
286         }
287         
288         static void printSES (const Ses< elem >& s, ostream& out = cout) {
289             printSES< ostream >(s, out);
290         }
291         
292         /**
293          * print difference between A and B with the format such as Unified Format
294          */
295         template < typename stream >
296         void printUnifiedFormat (stream& out) const {
297             for_each(uniHunks.begin(), uniHunks.end(), UniHunkPrinter< sesElem, stream >(out));
298         }
299         
300         void printUnifiedFormat (ostream& out = cout) const {
301             printUnifiedFormat< ostream >(out);
302         }
303         
304         /**
305          * print unified format difference with given unified format hunks
306          */
307         template < typename stream >
308         static void printUnifiedFormat (const uniHunkVec& hunks, stream& out) {
309             for_each(hunks.begin(), hunks.end(), UniHunkPrinter< sesElem >(out));
310         }
311
312         static void printUnifiedFormat (const uniHunkVec& hunks, ostream& out = cout) {
313             printUnifiedFormat< ostream >(hunks, out);
314         }
315
316         /**
317          * compose Unified Format Hunks from Shortest Edit Script
318          */
319         void composeUnifiedHunks () {
320             sesElemVec         common[2];
321             sesElemVec         change;
322             sesElemVec         ses_v  = ses.getSequence();
323             long long          l_cnt  = 1;
324             long long          length = distance(ses_v.begin(), ses_v.end());
325             long long          middle = 0;
326             bool               isMiddle, isAfter;
327             elem               e;
328             elemInfo           einfo;
329             long long          a, b, c, d;        // @@ -a,b +c,d @@
330             long long          inc_dec_count = 0;
331             uniHunk< sesElem > hunk;
332             sesElemVec         adds;
333             sesElemVec         deletes;
334             
335             isMiddle = isAfter = false;
336             a = b = c = d = 0;
337             
338             for (sesElemVec_iter it=ses_v.begin();it!=ses_v.end();++it, ++l_cnt) {
339                 e = it->first;
340                 einfo = it->second;
341                 switch (einfo.type) {
342                 case SES_ADD :
343                     middle = 0;
344                     ++inc_dec_count;
345                     adds.push_back(*it);
346                     if (!isMiddle)       isMiddle = true;
347                     if (isMiddle)        ++d;
348                     if (l_cnt >= length) {
349                         joinSesVec(change, deletes);
350                         joinSesVec(change, adds);
351                         isAfter = true;
352                     }
353                     break;
354                 case SES_DELETE :
355                     middle = 0;
356                     --inc_dec_count;
357                     deletes.push_back(*it);
358                     if (!isMiddle)       isMiddle = true;
359                     if (isMiddle)        ++b;
360                     if (l_cnt >= length) {
361                         joinSesVec(change, deletes);
362                         joinSesVec(change, adds);
363                         isAfter = true;
364                     }
365                     break;
366                 case SES_COMMON :
367                     ++b;++d;
368                     if (common[1].empty() && adds.empty() && deletes.empty() && change.empty()) {
369                         if (static_cast<long long>(common[0].size()) < DTL_CONTEXT_SIZE) {
370                             if (a == 0 && c == 0) {
371                                 a = einfo.beforeIdx;
372                                 c = einfo.afterIdx;
373                             }
374                             common[0].push_back(*it);
375                         } else {
376                             rotate(common[0].begin(), common[0].begin() + 1, common[0].end());
377                             common[0].pop_back();
378                             common[0].push_back(*it);
379                             ++a;++c;
380                             --b;--d;
381                         }
382                     }
383                     if (isMiddle && !isAfter) {
384                         ++middle;
385                         joinSesVec(change, deletes);
386                         joinSesVec(change, adds);
387                         change.push_back(*it);
388                         if (middle >= DTL_SEPARATE_SIZE || l_cnt >= length) {
389                             isAfter = true;
390                         }
391                         adds.clear();
392                         deletes.clear();
393                     }
394                     break;
395                 default :
396                     // no through
397                     break;
398                 }
399                 // compose unified format hunk
400                 if (isAfter && !change.empty()) {
401                     sesElemVec_iter cit = it;
402                     long long       cnt = 0;
403                     for (long long i=0;i<DTL_SEPARATE_SIZE && (cit != ses_v.end());++i, ++cit) {
404                         if (cit->second.type == SES_COMMON) {
405                             ++cnt;
406                         }
407                     }
408                     if (cnt < DTL_SEPARATE_SIZE && l_cnt < length) {
409                         middle = 0;
410                         isAfter = false;
411                         continue;
412                     }
413                     if (static_cast<long long>(common[0].size()) >= DTL_SEPARATE_SIZE) {
414                         long long c0size = static_cast<long long>(common[0].size());
415                         rotate(common[0].begin(), 
416                                common[0].begin() + (size_t)c0size - DTL_SEPARATE_SIZE, 
417                                common[0].end());
418                         for (long long i=0;i<c0size - DTL_SEPARATE_SIZE;++i) {
419                             common[0].pop_back();
420                         }
421                         a += c0size - DTL_SEPARATE_SIZE;
422                         c += c0size - DTL_SEPARATE_SIZE;
423                     }
424                     if (a == 0) ++a;
425                     if (c == 0) ++c;
426                     if (isReverse()) swap(a, c);
427                     hunk.a = a;hunk.b = b;hunk.c = c;hunk.d = d;
428                     hunk.common[0] = common[0];
429                     hunk.change = change;
430                     hunk.common[1] = common[1];
431                     hunk.inc_dec_count = inc_dec_count;
432                     uniHunks.push_back(hunk);
433                     isMiddle = false;
434                     isAfter = false;
435                     common[0].clear();
436                     common[1].clear();
437                     adds.clear();
438                     deletes.clear();
439                     change.clear();
440                     a = b = c = d = middle = inc_dec_count = 0;
441                 }
442             }
443         }
444         
445         /**
446          * compose ses from stream
447          */
448         template <typename stream>
449         static Ses< elem > composeSesFromStream (stream& st)
450         {
451             elem        line;
452             Ses< elem > ret;
453             long long   x_idx, y_idx;
454             x_idx = y_idx = 1;
455             while (getline(st, line)) {
456                 elem mark(line.begin(), line.begin() + 1);
457                 elem e(line.begin() + 1, line.end());
458                 if (mark == SES_MARK_DELETE) {
459                     ret.addSequence(e, x_idx, 0, SES_DELETE);
460                     ++x_idx;
461                 } else if (mark == SES_MARK_ADD) {
462                     ret.addSequence(e, y_idx, 0, SES_ADD);
463                     ++y_idx;
464                 } else if (mark == SES_MARK_COMMON) {
465                     ret.addSequence(e, x_idx, y_idx, SES_COMMON);
466                     ++x_idx;
467                     ++y_idx;
468                 }
469             }
470             return ret;
471         }
472
473     private :
474         /**
475          * initialize
476          */
477         void init () {
478             M = distance(A.begin(), A.end());
479             N = distance(B.begin(), B.end());
480             if (M < N) {
481                 reverse = false;
482             } else {
483                 swap(A, B);
484                 swap(M, N);
485                 reverse = true;
486             }
487             editDistance     = 0;
488             delta            = N - M;
489             offset           = M + 1;
490             huge             = false;
491             unserious        = false;
492             onlyEditDistance = false;
493             fp               = NULL;
494         }
495         
496         /**
497          * search shortest path and record the path
498          */
499         long long snake(const long long& k, const long long& above, const long long& below) {
500             long long r = above > below ? path[(size_t)k-1+offset] : path[(size_t)k+1+offset];
501             long long y = max(above, below);
502             long long x = y - k;
503             while ((size_t)x < M && (size_t)y < N && cmp.impl(A[(size_t)x], B[(size_t)y])) {
504                 ++x;++y;
505             }
506             
507             path[(size_t)k+offset] = static_cast<long long>(pathCordinates.size());
508             if (!onlyEditDistance) {
509                 P p;
510                 p.x = x;p.y = y;p.k = r;
511                 pathCordinates.push_back(p);      
512             }
513             return y;
514         }
515         
516         /**
517          * record SES and LCS
518          */
519         bool recordSequence (const editPathCordinates& v) {
520             sequence_const_iter x(A.begin());
521             sequence_const_iter y(B.begin());
522             long long           x_idx,  y_idx;  // line number for Unified Format
523             long long           px_idx, py_idx; // cordinates
524             bool                complete = false;
525             x_idx  = y_idx  = 1;
526             px_idx = py_idx = 0;
527             for (size_t i=v.size()-1;!complete;--i) {
528                 while(px_idx < v[i].x || py_idx < v[i].y) {
529                     if (v[i].y - v[i].x > py_idx - px_idx) {
530                         if (!isReverse()) {
531                             ses.addSequence(*y, y_idx, 0, SES_ADD);
532                         } else {
533                             ses.addSequence(*y, y_idx, 0, SES_DELETE);
534                         }
535                         ++y;
536                         ++y_idx;
537                         ++py_idx;
538                     } else if (v[i].y - v[i].x < py_idx - px_idx) {
539                         if (!isReverse()) {
540                             ses.addSequence(*x, x_idx, 0, SES_DELETE);
541                         } else {
542                             ses.addSequence(*x, x_idx, 0, SES_ADD);
543                         }
544                         ++x;
545                         ++x_idx;
546                         ++px_idx;
547                     } else {
548                         lcs.addSequence(*x);
549                         ses.addSequence(*x, x_idx, y_idx, SES_COMMON);
550                         ++x;
551                         ++y;
552                         ++x_idx;
553                         ++y_idx;
554                         ++px_idx;
555                         ++py_idx;
556                     }
557                 }
558                 if (i == 0) complete = true;
559             }
560             
561             if (x_idx > static_cast<long long>(M) && y_idx > static_cast<long long>(N)) {
562                 // all recording success
563             } else {
564                 // unserious difference
565                 if (isUnserious()) {
566                     if (!isReverse()) {
567                         recordOddSequence(x_idx, M, x, SES_DELETE);
568                         recordOddSequence(y_idx, N, y, SES_ADD);
569                     } else {
570                         recordOddSequence(x_idx, M, x, SES_ADD);
571                         recordOddSequence(y_idx, N, y, SES_DELETE);
572                     }
573                     return true;
574                 }
575                 
576                 // decent difference
577                 sequence A_(A.begin() + (size_t)x_idx - 1, A.end());
578                 sequence B_(B.begin() + (size_t)y_idx - 1, B.end());
579                 A        = A_;
580                 B        = B_;
581                 M        = distance(A.begin(), A.end());
582                 N        = distance(B.begin(), B.end());
583                 delta    = N - M;
584                 offset   = M + 1;
585                 delete[] fp;
586                 fp = new long long[M + N + 3];
587                 fill(&fp[0], &fp[M + N + 3], -1);
588                 fill(path.begin(), path.end(), -1);
589                 return false;
590             }
591             return true;
592         }
593         
594         /**
595          * record odd sequence to ses
596          */
597         void inline recordOddSequence (long long idx, long long length, sequence_const_iter it, const edit_t et) {
598             while(idx < length){
599                 ses.addSequence(*it, idx, 0, et);
600                 ++it;
601                 ++idx;
602                 ++editDistance;
603             }
604             ses.addSequence(*it, idx, 0, et);
605             ++editDistance;
606         }
607         
608         /**
609          * join ses vectors
610          */
611         void inline joinSesVec (sesElemVec& s1, sesElemVec& s2) const {
612             if (!s2.empty()) {
613                 for (sesElemVec_iter vit=s2.begin();vit!=s2.end();++vit) {
614                     s1.push_back(*vit);
615                 }
616             }      
617         }
618         
619         /**
620          * check sequence is replaced each other
621          */
622         bool inline isReverse () const {
623             return reverse;
624         }
625
626     };
627 }
628
629 #endif // DTL_DIFF_H

Benjamin Mako Hill || Want to submit a patch?