2 dtl-1.12 -- Diff Template Library
4 In short, Diff Template Library is distributed under so called "BSD license",
6 Copyright (c) 2008-2011 Tatsuhiko Kubo <cubicdaiya@gmail.com>
9 Redistribution and use in source and binary forms, with or without modification,
10 are permitted provided that the following conditions are met:
12 * Redistributions of source code must retain the above copyright notice,
13 this list of conditions and the following disclaimer.
15 * Redistributions in binary form must reproduce the above copyright notice,
16 this list of conditions and the following disclaimer in the documentation
17 and/or other materials provided with the distribution.
19 * Neither the name of the authors nor the names of its contributors
20 may be used to endorse or promote products derived from this software
21 without specific prior written permission.
23 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
29 TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 /* If you use this library, you must include dtl.hpp only. */
45 * sequence must support random_access_iterator.
47 template <typename elem, typename sequence = vector< elem >, typename comparator = Compare< elem > >
51 dtl_typedefs(elem, sequence)
59 long long editDistance;
63 editPathCordinates pathCordinates;
67 bool onlyEditDistance;
73 Diff (const sequence& a,
74 const sequence& b) : A(a), B(b) {
78 Diff (const sequence& a,
80 const comparator& comp) : A(a), B(b), cmp(comp) {
86 long long getEditDistance () const {
90 Lcs< elem > getLcs () const {
94 elemVec getLcsVec () const {
95 return lcs.getSequence();
98 Ses< elem > getSes () const {
102 uniHunkVec getUniHunks () const {
106 bool isHuge () const {
118 bool isUnserious () const {
122 void onUnserious () {
123 this->unserious = true;
126 void offUnserious () {
127 this->unserious = false;
130 void onOnlyEditDistance () {
131 this->onlyEditDistance = true;
135 * patching with Unified Format Hunks
137 sequence uniPatch (const sequence& seq) {
138 elemList seqLst(seq.begin(), seq.end());
140 sesElemVec_iter vsesIt;
141 elemList_iter lstIt = seqLst.begin();
142 long long inc_dec_total = 0;
144 for (uniHunkVec_iter it=uniHunks.begin();it!=uniHunks.end();++it) {
145 joinSesVec(shunk, it->common[0]);
146 joinSesVec(shunk, it->change);
147 joinSesVec(shunk, it->common[1]);
148 it->a += inc_dec_total;
149 inc_dec_total += it->inc_dec_count;
150 for (long long i=0;i<it->a - gap;++i) {
153 gap = it->a + it->b + it->inc_dec_count;
154 vsesIt = shunk.begin();
155 while (vsesIt!=shunk.end()) {
156 switch (vsesIt->second.type) {
158 seqLst.insert(lstIt, vsesIt->first);
161 if (lstIt != seqLst.end()) {
162 lstIt = seqLst.erase(lstIt);
166 if (lstIt != seqLst.end()) {
179 sequence patchedSeq(seqLst.begin(), seqLst.end());
184 * patching with Shortest Edit Script
186 sequence patch (const sequence& seq) const {
187 sesElemVec sesSeq = ses.getSequence();
188 elemList seqLst(seq.begin(), seq.end());
189 elemList_iter lstIt = seqLst.begin();
190 for (sesElemVec_iter sesIt=sesSeq.begin();sesIt!=sesSeq.end();++sesIt) {
191 switch (sesIt->second.type) {
193 seqLst.insert(lstIt, sesIt->first);
196 lstIt = seqLst.erase(lstIt);
206 sequence patchedSeq(seqLst.begin(), seqLst.end());
211 * compose Longest Common Subsequence and Shortest Edit Script.
212 * The algorithm implemented here is based on "An O(NP) Sequence Comparison Algorithm"
213 * described by Sun Wu, Udi Manber and Gene Myers
218 pathCordinates.reserve(MAX_CORDINATES_SIZE);
222 fp = new long long[M + N + 3];
223 fill(&fp[0], &fp[M + N + 3], -1);
224 path = editPath(M + N + 3);
225 fill(path.begin(), path.end(), -1);
229 for (long long k=-p;k<=static_cast<long long>(delta)-1;++k) {
230 fp[k+offset] = snake(k, fp[k-1+offset]+1, fp[k+1+offset]);
232 for (long long k=static_cast<long long>(delta)+p;k>=static_cast<long long>(delta)+1;--k) {
233 fp[k+offset] = snake(k, fp[k-1+offset]+1, fp[k+1+offset]);
235 fp[delta+offset] = snake(static_cast<long long>(delta), fp[delta-1+offset]+1, fp[delta+1+offset]);
236 } while (fp[delta+offset] != static_cast<long long>(N) && pathCordinates.size() < MAX_CORDINATES_SIZE);
238 editDistance += static_cast<long long>(delta) + 2 * p;
239 long long r = path[delta+offset];
241 editPathCordinates epc(0);
243 // only recoding editdistance
244 if (onlyEditDistance) {
250 cordinate.x = pathCordinates[(size_t)r].x;
251 cordinate.y = pathCordinates[(size_t)r].y;
252 epc.push_back(cordinate);
253 r = pathCordinates[(size_t)r].k;
256 // record Longest Common Subsequence & Shortest Edit Script
257 if (!recordSequence(epc)) {
258 pathCordinates.resize(0);
267 * print difference between A and B with SES
269 template < typename stream >
270 void printSES (stream& out) const {
271 sesElemVec ses_v = ses.getSequence();
272 for_each(ses_v.begin(), ses_v.end(), ChangePrinter< sesElem, stream >(out));
275 void printSES (ostream& out = cout) const {
276 printSES< ostream >(out);
280 * print difference with given SES
282 template < typename stream >
283 static void printSES (const Ses< elem >& s, stream& out) {
284 sesElemVec ses_v = s.getSequence();
285 for_each(ses_v.begin(), ses_v.end(), ChangePrinter< sesElem, stream >(out));
288 static void printSES (const Ses< elem >& s, ostream& out = cout) {
289 printSES< ostream >(s, out);
293 * print difference between A and B with the format such as Unified Format
295 template < typename stream >
296 void printUnifiedFormat (stream& out) const {
297 for_each(uniHunks.begin(), uniHunks.end(), UniHunkPrinter< sesElem, stream >(out));
300 void printUnifiedFormat (ostream& out = cout) const {
301 printUnifiedFormat< ostream >(out);
305 * print unified format difference with given unified format hunks
307 template < typename stream >
308 static void printUnifiedFormat (const uniHunkVec& hunks, stream& out) {
309 for_each(hunks.begin(), hunks.end(), UniHunkPrinter< sesElem >(out));
312 static void printUnifiedFormat (const uniHunkVec& hunks, ostream& out = cout) {
313 printUnifiedFormat< ostream >(hunks, out);
317 * compose Unified Format Hunks from Shortest Edit Script
319 void composeUnifiedHunks () {
320 sesElemVec common[2];
322 sesElemVec ses_v = ses.getSequence();
324 long long length = distance(ses_v.begin(), ses_v.end());
325 long long middle = 0;
326 bool isMiddle, isAfter;
329 long long a, b, c, d; // @@ -a,b +c,d @@
330 long long inc_dec_count = 0;
331 uniHunk< sesElem > hunk;
335 isMiddle = isAfter = false;
338 for (sesElemVec_iter it=ses_v.begin();it!=ses_v.end();++it, ++l_cnt) {
341 switch (einfo.type) {
346 if (!isMiddle) isMiddle = true;
348 if (l_cnt >= length) {
349 joinSesVec(change, deletes);
350 joinSesVec(change, adds);
357 deletes.push_back(*it);
358 if (!isMiddle) isMiddle = true;
360 if (l_cnt >= length) {
361 joinSesVec(change, deletes);
362 joinSesVec(change, adds);
368 if (common[1].empty() && adds.empty() && deletes.empty() && change.empty()) {
369 if (static_cast<long long>(common[0].size()) < DTL_CONTEXT_SIZE) {
370 if (a == 0 && c == 0) {
374 common[0].push_back(*it);
376 rotate(common[0].begin(), common[0].begin() + 1, common[0].end());
377 common[0].pop_back();
378 common[0].push_back(*it);
383 if (isMiddle && !isAfter) {
385 joinSesVec(change, deletes);
386 joinSesVec(change, adds);
387 change.push_back(*it);
388 if (middle >= DTL_SEPARATE_SIZE || l_cnt >= length) {
399 // compose unified format hunk
400 if (isAfter && !change.empty()) {
401 sesElemVec_iter cit = it;
403 for (long long i=0;i<DTL_SEPARATE_SIZE && (cit != ses_v.end());++i, ++cit) {
404 if (cit->second.type == SES_COMMON) {
408 if (cnt < DTL_SEPARATE_SIZE && l_cnt < length) {
413 if (static_cast<long long>(common[0].size()) >= DTL_SEPARATE_SIZE) {
414 long long c0size = static_cast<long long>(common[0].size());
415 rotate(common[0].begin(),
416 common[0].begin() + (size_t)c0size - DTL_SEPARATE_SIZE,
418 for (long long i=0;i<c0size - DTL_SEPARATE_SIZE;++i) {
419 common[0].pop_back();
421 a += c0size - DTL_SEPARATE_SIZE;
422 c += c0size - DTL_SEPARATE_SIZE;
426 if (isReverse()) swap(a, c);
427 hunk.a = a;hunk.b = b;hunk.c = c;hunk.d = d;
428 hunk.common[0] = common[0];
429 hunk.change = change;
430 hunk.common[1] = common[1];
431 hunk.inc_dec_count = inc_dec_count;
432 uniHunks.push_back(hunk);
440 a = b = c = d = middle = inc_dec_count = 0;
446 * compose ses from stream
448 template <typename stream>
449 static Ses< elem > composeSesFromStream (stream& st)
453 long long x_idx, y_idx;
455 while (getline(st, line)) {
456 elem mark(line.begin(), line.begin() + 1);
457 elem e(line.begin() + 1, line.end());
458 if (mark == SES_MARK_DELETE) {
459 ret.addSequence(e, x_idx, 0, SES_DELETE);
461 } else if (mark == SES_MARK_ADD) {
462 ret.addSequence(e, y_idx, 0, SES_ADD);
464 } else if (mark == SES_MARK_COMMON) {
465 ret.addSequence(e, x_idx, y_idx, SES_COMMON);
478 M = distance(A.begin(), A.end());
479 N = distance(B.begin(), B.end());
492 onlyEditDistance = false;
497 * search shortest path and record the path
499 long long snake(const long long& k, const long long& above, const long long& below) {
500 long long r = above > below ? path[(size_t)k-1+offset] : path[(size_t)k+1+offset];
501 long long y = max(above, below);
503 while ((size_t)x < M && (size_t)y < N && cmp.impl(A[(size_t)x], B[(size_t)y])) {
507 path[(size_t)k+offset] = static_cast<long long>(pathCordinates.size());
508 if (!onlyEditDistance) {
510 p.x = x;p.y = y;p.k = r;
511 pathCordinates.push_back(p);
519 bool recordSequence (const editPathCordinates& v) {
520 sequence_const_iter x(A.begin());
521 sequence_const_iter y(B.begin());
522 long long x_idx, y_idx; // line number for Unified Format
523 long long px_idx, py_idx; // cordinates
524 bool complete = false;
527 for (size_t i=v.size()-1;!complete;--i) {
528 while(px_idx < v[i].x || py_idx < v[i].y) {
529 if (v[i].y - v[i].x > py_idx - px_idx) {
531 ses.addSequence(*y, y_idx, 0, SES_ADD);
533 ses.addSequence(*y, y_idx, 0, SES_DELETE);
538 } else if (v[i].y - v[i].x < py_idx - px_idx) {
540 ses.addSequence(*x, x_idx, 0, SES_DELETE);
542 ses.addSequence(*x, x_idx, 0, SES_ADD);
549 ses.addSequence(*x, x_idx, y_idx, SES_COMMON);
558 if (i == 0) complete = true;
561 if (x_idx > static_cast<long long>(M) && y_idx > static_cast<long long>(N)) {
562 // all recording success
564 // unserious difference
567 recordOddSequence(x_idx, M, x, SES_DELETE);
568 recordOddSequence(y_idx, N, y, SES_ADD);
570 recordOddSequence(x_idx, M, x, SES_ADD);
571 recordOddSequence(y_idx, N, y, SES_DELETE);
577 sequence A_(A.begin() + (size_t)x_idx - 1, A.end());
578 sequence B_(B.begin() + (size_t)y_idx - 1, B.end());
581 M = distance(A.begin(), A.end());
582 N = distance(B.begin(), B.end());
586 fp = new long long[M + N + 3];
587 fill(&fp[0], &fp[M + N + 3], -1);
588 fill(path.begin(), path.end(), -1);
595 * record odd sequence to ses
597 void inline recordOddSequence (long long idx, long long length, sequence_const_iter it, const edit_t et) {
599 ses.addSequence(*it, idx, 0, et);
604 ses.addSequence(*it, idx, 0, et);
611 void inline joinSesVec (sesElemVec& s1, sesElemVec& s2) const {
613 for (sesElemVec_iter vit=s2.begin();vit!=s2.end();++vit) {
620 * check sequence is replaced each other
622 bool inline isReverse () const {