vtools_genesis 1
vtools_genesis 2
vtools_genesis 3 transforming the first vector into the second vector through a
vtools_genesis 4 sequence of edits (inserts and deletes of one element each), this
vtools_genesis 5 sequence is short - or equivalently, if the ordered list of
vtools_genesis 6 elements that are untouched by these edits is long. For a good
vtools_genesis 7 introduction to the subject, read about the ``Levenshtein
vtools_genesis 8 distance'' in Wikipedia.
vtools_genesis 9
vtools_genesis 10 The basic algorithm is described in: ``An O(ND) Difference
vtools_genesis 11 Algorithm and its Variations'', Eugene W. Myers, Algorithmica
vtools_genesis 12 Vol. 1, 1986, pp. 251-266,
vtools_genesis 13 \url{http:
vtools_genesis 14 4.2, which describes the variation used below.
vtools_genesis 15
vtools_genesis 16 The basic algorithm was independently discovered as described in:
vtools_genesis 17 ``Algorithms for Approximate String Matching'', Esko Ukkonen,
vtools_genesis 18 Information and Control Vol. 64, 1985, pp. 100-118,
vtools_genesis 19 \url{http:
vtools_genesis 20
vtools_genesis 21 Unless the |find_minimal| flag is set, this code uses the
vtools_genesis 22 |TOO_EXPENSIVE| heuristic, by Paul Eggert, to limit the cost to
vtools_genesis 23 $O(N^1.5 \log N)$ at the price of producing suboptimal output for
vtools_genesis 24 large inputs with many differences. */
vtools_genesis 25
vtools_genesis 26
vtools_genesis 27 |ELEMENT| The element type of the vectors being compared.
vtools_genesis 28 |EQUAL| A two-argument macro that tests two elements for
vtools_genesis 29 equality.
vtools_genesis 30 |OFFSET| A signed integer type sufficient to hold the
vtools_genesis 31 difference between two indices. Usually
vtools_genesis 32 something like |ptrdiff_t|.
vtools_genesis 33 |EXTRA_CONTEXT_FIELDS| Declarations of fields for 'struct context'.
vtools_genesis 34 |NOTE_DELETE(ctxt, xoff)| Record the removal of the object xvec[xoff].
vtools_genesis 35 |NOTE_INSERT(ctxt, yoff)| Record the insertion of the object yvec[yoff].
vtools_genesis 36 |EARLY_ABORT(ctxt)| (Optional) A boolean expression that triggers an
vtools_genesis 37 early abort of the computation.
vtools_genesis 38 |USE_HEURISTIC| (Optional) Define if you want to support the
vtools_genesis 39 heuristic for large vectors.
vtools_genesis 40 It is also possible to use this file with abstract arrays. In this case,
vtools_genesis 41 xvec and yvec are not represented in memory. They only exist conceptually.
vtools_genesis 42 In this case, the list of defines above is amended as follows:
vtools_genesis 43 |ELEMENT| Undefined.
vtools_genesis 44 |EQUAL| Undefined.
vtools_genesis 45 |XVECREF_YVECREF_EQUAL(ctxt, xoff, yoff)|
vtools_genesis 46 A three-argument macro: References xvec[xoff] and
vtools_genesis 47 yvec[yoff] and tests these elements for equality.
vtools_genesis 48 Before including this file, you also need to include:
vtools_genesis 49 |#include <limits.h>
vtools_genesis 50 #include <stdbool.h>|
vtools_genesis 51 */
vtools_genesis 52
vtools_genesis 53
vtools_genesis 54 #define OFFSET_MAX \
vtools_genesis 55 ((((OFFSET)1 << (sizeof (OFFSET) * CHAR_BIT - 2)) - 1) * 2 + 1)
vtools_genesis 56
vtools_genesis 57
vtools_genesis 58 #ifndef EARLY_ABORT
vtools_genesis 59 # define EARLY_ABORT(ctxt) false
vtools_genesis 60 #endif
vtools_genesis 61
vtools_genesis 62
vtools_genesis 63 warnings. Beware: The Code argument must not contain commas. */
vtools_genesis 64 #ifndef IF_LINT
vtools_genesis 65 # if defined GCC_LINT || defined lint
vtools_genesis 66 # define IF_LINT(Code) Code
vtools_genesis 67 # else
vtools_genesis 68 # define IF_LINT(Code) /* empty */
vtools_genesis 69 # endif
vtools_genesis 70 #endif
vtools_genesis 71
vtools_genesis 72
vtools_genesis 73 #ifndef IF_LINT2
vtools_genesis 74 # if defined GCC_LINT || defined lint
vtools_genesis 75 # define IF_LINT2(Code1, Code2) Code1, Code2
vtools_genesis 76 # else
vtools_genesis 77 # define IF_LINT2(Code1, Code2) /* empty */
vtools_genesis 78 # endif
vtools_genesis 79 #endif
vtools_genesis 80
vtools_genesis 81
vtools_genesis 82 * Context of comparison operation.
vtools_genesis 83 */
vtools_genesis 84 struct context {
vtools_genesis 85 #ifdef ELEMENT
vtools_genesis 86
vtools_genesis 87 ELEMENT const *xvec;
vtools_genesis 88 ELEMENT const *yvec;
vtools_genesis 89 #endif
vtools_genesis 90
vtools_genesis 91
vtools_genesis 92 EXTRA_CONTEXT_FIELDS
vtools_genesis 93
vtools_genesis 94
vtools_genesis 95 the point furthest along the given diagonal in the forward
vtools_genesis 96 search of the edit matrix. */
vtools_genesis 97 OFFSET *fdiag;
vtools_genesis 98
vtools_genesis 99
vtools_genesis 100 point furthest along the given diagonal in the backward search
vtools_genesis 101 of the edit matrix. */
vtools_genesis 102 OFFSET *bdiag;
vtools_genesis 103
vtools_genesis 104 #ifdef USE_HEURISTIC
vtools_genesis 105
vtools_genesis 106 this heuristic, for vectors with a constant small density of
vtools_genesis 107 changes, the algorithm is linear in the vector size. */
vtools_genesis 108 bool heuristic;
vtools_genesis 109 #endif
vtools_genesis 110
vtools_genesis 111
vtools_genesis 112 OFFSET too_expensive;
vtools_genesis 113
vtools_genesis 114
vtools_genesis 115 #define SNAKE_LIMIT 20
vtools_genesis 116 };
vtools_genesis 117
vtools_genesis 118 struct partition {
vtools_genesis 119
vtools_genesis 120 OFFSET xmid;
vtools_genesis 121 OFFSET ymid;
vtools_genesis 122
vtools_genesis 123
vtools_genesis 124 bool lo_minimal;
vtools_genesis 125
vtools_genesis 126
vtools_genesis 127 bool hi_minimal;
vtools_genesis 128 };
vtools_genesis 129
vtools_genesis 130
vtools_genesis 131
vtools_genesis 132 portion of the two vectors.
vtools_genesis 133
vtools_genesis 134 Scan from the beginnings of the vectors, and simultaneously from
vtools_genesis 135 the ends, doing a breadth-first search through the space of
vtools_genesis 136 edit-sequence. When the two searches meet, we have found the
vtools_genesis 137 midpoint of the shortest edit sequence.
vtools_genesis 138
vtools_genesis 139 If |find_minimal| is true, find the minimal edit script regardless
vtools_genesis 140 of expense. Otherwise, if the search is too expensive, use
vtools_genesis 141 heuristics to stop the search and report a suboptimal answer.
vtools_genesis 142
vtools_genesis 143 Set |part->(xmid,ymid)| to the midpoint |(xmid,ymid)|. The
vtools_genesis 144 diagonal number |xmid - ymid| equals the number of inserted
vtools_genesis 145 elements minus the number of deleted elements (counting only
vtools_genesis 146 elements before the midpoint).
vtools_genesis 147
vtools_genesis 148 Set |part->lo_minimal| to true iff the minimal edit script for the
vtools_genesis 149 left half of the partition is known; similarly for
vtools_genesis 150 |part->hi_minimal|.
vtools_genesis 151
vtools_genesis 152 This function assumes that the first elements of the specified
vtools_genesis 153 portions of the two vectors do not match, and likewise that the
vtools_genesis 154 last elements do not match. The caller must trim matching elements
vtools_genesis 155 from the beginning and end of the portions it is going to specify.
vtools_genesis 156
vtools_genesis 157 If we return the "wrong" partitions, the worst this can do is cause
vtools_genesis 158 suboptimal diff output. It cannot cause incorrect diff output. */
vtools_genesis 159
vtools_genesis 160 static void
vtools_genesis 161 diag(OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal,
vtools_genesis 162 struct partition *part, struct context *ctxt) {
vtools_genesis 163 OFFSET *const fd = ctxt->fdiag;
vtools_genesis 164 OFFSET *const bd = ctxt->bdiag;
vtools_genesis 165 #ifdef ELEMENT
vtools_genesis 166 ELEMENT const *const xv = ctxt->xvec;
vtools_genesis 167 ELEMENT const *const yv = ctxt->yvec;
vtools_genesis 168 #define XREF_YREF_EQUAL(x, y) EQUAL (xv[x], yv[y])
vtools_genesis 169 #else
vtools_genesis 170 #define XREF_YREF_EQUAL(x,y) XVECREF_YVECREF_EQUAL (ctxt, x, y)
vtools_genesis 171 #endif
vtools_genesis 172 const OFFSET dmin = xoff - ylim;
vtools_genesis 173 const OFFSET dmax = xlim - yoff;
vtools_genesis 174 const OFFSET fmid = xoff - yoff;
vtools_genesis 175 const OFFSET bmid = xlim - ylim;
vtools_genesis 176 OFFSET fmin = fmid;
vtools_genesis 177 OFFSET fmax = fmid;
vtools_genesis 178 OFFSET bmin = bmid;
vtools_genesis 179 OFFSET bmax = bmid;
vtools_genesis 180 OFFSET c;
vtools_genesis 181 bool odd = (fmid - bmid) & 1;
vtools_genesis 182 diagonal with respect to the northwest. */
vtools_genesis 183
vtools_genesis 184 fd[fmid] = xoff;
vtools_genesis 185 bd[bmid] = xlim;
vtools_genesis 186
vtools_genesis 187 for (c = 1;; ++c) {
vtools_genesis 188 OFFSET d;
vtools_genesis 189 bool big_snake = false;
vtools_genesis 190
vtools_genesis 191
vtools_genesis 192 if (fmin > dmin)
vtools_genesis 193 fd[--fmin - 1] = -1;
vtools_genesis 194 else
vtools_genesis 195 ++fmin;
vtools_genesis 196 if (fmax < dmax)
vtools_genesis 197 fd[++fmax + 1] = -1;
vtools_genesis 198 else
vtools_genesis 199 --fmax;
vtools_genesis 200 for (d = fmax; d >= fmin; d -= 2) {
vtools_genesis 201 OFFSET x;
vtools_genesis 202 OFFSET y;
vtools_genesis 203 OFFSET tlo = fd[d - 1];
vtools_genesis 204 OFFSET thi = fd[d + 1];
vtools_genesis 205 OFFSET x0 = tlo < thi ? thi : tlo + 1;
vtools_genesis 206
vtools_genesis 207 for (x = x0, y = x0 - d;
vtools_genesis 208 x < xlim && y < ylim && XREF_YREF_EQUAL (x, y);
vtools_genesis 209 x++, y++)
vtools_genesis 210 continue;
vtools_genesis 211 if (x - x0 > SNAKE_LIMIT)
vtools_genesis 212 big_snake = true;
vtools_genesis 213 fd[d] = x;
vtools_genesis 214 if (odd && bmin <= d && d <= bmax && bd[d] <= x) {
vtools_genesis 215 part->xmid = x;
vtools_genesis 216 part->ymid = y;
vtools_genesis 217 part->lo_minimal = part->hi_minimal = true;
vtools_genesis 218 return;
vtools_genesis 219 }
vtools_genesis 220 }
vtools_genesis 221
vtools_genesis 222
vtools_genesis 223 if (bmin > dmin)
vtools_genesis 224 bd[--bmin - 1] = OFFSET_MAX;
vtools_genesis 225 else
vtools_genesis 226 ++bmin;
vtools_genesis 227 if (bmax < dmax)
vtools_genesis 228 bd[++bmax + 1] = OFFSET_MAX;
vtools_genesis 229 else
vtools_genesis 230 --bmax;
vtools_genesis 231 for (d = bmax; d >= bmin; d -= 2) {
vtools_genesis 232 OFFSET x;
vtools_genesis 233 OFFSET y;
vtools_genesis 234 OFFSET tlo = bd[d - 1];
vtools_genesis 235 OFFSET thi = bd[d + 1];
vtools_genesis 236 OFFSET x0 = tlo < thi ? tlo : thi - 1;
vtools_genesis 237
vtools_genesis 238 for (x = x0, y = x0 - d;
vtools_genesis 239 xoff < x && yoff < y && XREF_YREF_EQUAL (x - 1, y - 1);
vtools_genesis 240 x--, y--)
vtools_genesis 241 continue;
vtools_genesis 242 if (x0 - x > SNAKE_LIMIT)
vtools_genesis 243 big_snake = true;
vtools_genesis 244 bd[d] = x;
vtools_genesis 245 if (!odd && fmin <= d && d <= fmax && x <= fd[d]) {
vtools_genesis 246 part->xmid = x;
vtools_genesis 247 part->ymid = y;
vtools_genesis 248 part->lo_minimal = part->hi_minimal = true;
vtools_genesis 249 return;
vtools_genesis 250 }
vtools_genesis 251 }
vtools_genesis 252
vtools_genesis 253 if (find_minimal)
vtools_genesis 254 continue;
vtools_genesis 255
vtools_genesis 256 #ifdef USE_HEURISTIC
vtools_genesis 257
vtools_genesis 258 lots of progress compared with the edit distance. If we
vtools_genesis 259 have any such, find the one that has made the most progress
vtools_genesis 260 and return it as if it had succeeded.
vtools_genesis 261
vtools_genesis 262 With this heuristic, for vectors with a constant small
vtools_genesis 263 density of changes, the algorithm is linear in the vector
vtools_genesis 264 size. */
vtools_genesis 265
vtools_genesis 266 if (200 < c && big_snake && ctxt->heuristic) {
vtools_genesis 267 {
vtools_genesis 268 OFFSET best = 0;
vtools_genesis 269
vtools_genesis 270 for (d = fmax; d >= fmin; d -= 2) {
vtools_genesis 271 OFFSET dd = d - fmid;
vtools_genesis 272 OFFSET x = fd[d];
vtools_genesis 273 OFFSET y = x - d;
vtools_genesis 274 OFFSET v = (x - xoff) * 2 - dd;
vtools_genesis 275
vtools_genesis 276 if (v > 12 * (c + (dd < 0 ? -dd : dd))) {
vtools_genesis 277 if (v > best
vtools_genesis 278 && xoff + SNAKE_LIMIT <= x && x < xlim
vtools_genesis 279 && yoff + SNAKE_LIMIT <= y && y < ylim) {
vtools_genesis 280
vtools_genesis 281 now insist that it end with a
vtools_genesis 282 significant snake. */
vtools_genesis 283 int k;
vtools_genesis 284
vtools_genesis 285 for (k = 1; XREF_YREF_EQUAL (x - k, y - k); k++)
vtools_genesis 286 if (k == SNAKE_LIMIT) {
vtools_genesis 287 best = v;
vtools_genesis 288 part->xmid = x;
vtools_genesis 289 part->ymid = y;
vtools_genesis 290 break;
vtools_genesis 291 }
vtools_genesis 292 }
vtools_genesis 293 }
vtools_genesis 294 }
vtools_genesis 295 if (best > 0) {
vtools_genesis 296 part->lo_minimal = true;
vtools_genesis 297 part->hi_minimal = false;
vtools_genesis 298 return;
vtools_genesis 299 }
vtools_genesis 300 }
vtools_genesis 301
vtools_genesis 302 {
vtools_genesis 303 OFFSET best = 0;
vtools_genesis 304
vtools_genesis 305 for (d = bmax; d >= bmin; d -= 2) {
vtools_genesis 306 OFFSET dd = d - bmid;
vtools_genesis 307 OFFSET x = bd[d];
vtools_genesis 308 OFFSET y = x - d;
vtools_genesis 309 OFFSET v = (xlim - x) * 2 + dd;
vtools_genesis 310
vtools_genesis 311 if (v > 12 * (c + (dd < 0 ? -dd : dd))) {
vtools_genesis 312 if (v > best
vtools_genesis 313 && xoff < x && x <= xlim - SNAKE_LIMIT
vtools_genesis 314 && yoff < y && y <= ylim - SNAKE_LIMIT) {
vtools_genesis 315
vtools_genesis 316 now insist that it end with a
vtools_genesis 317 significant snake. */
vtools_genesis 318 int k;
vtools_genesis 319
vtools_genesis 320 for (k = 0; XREF_YREF_EQUAL (x + k, y + k); k++)
vtools_genesis 321 if (k == SNAKE_LIMIT - 1) {
vtools_genesis 322 best = v;
vtools_genesis 323 part->xmid = x;
vtools_genesis 324 part->ymid = y;
vtools_genesis 325 break;
vtools_genesis 326 }
vtools_genesis 327 }
vtools_genesis 328 }
vtools_genesis 329 }
vtools_genesis 330 if (best > 0) {
vtools_genesis 331 part->lo_minimal = false;
vtools_genesis 332 part->hi_minimal = true;
vtools_genesis 333 return;
vtools_genesis 334 }
vtools_genesis 335 }
vtools_genesis 336 }
vtools_genesis 337 #endif
vtools_genesis 338
vtools_genesis 339
vtools_genesis 340 up and report halfway between our best results so far. */
vtools_genesis 341 if (c >= ctxt->too_expensive) {
vtools_genesis 342 OFFSET fxybest;
vtools_genesis 343 OFFSET fxbest IF_LINT (= 0);
vtools_genesis 344 OFFSET bxybest;
vtools_genesis 345 OFFSET bxbest IF_LINT (= 0);
vtools_genesis 346
vtools_genesis 347
vtools_genesis 348 fxybest = -1;
vtools_genesis 349 for (d = fmax; d >= fmin; d -= 2) {
vtools_genesis 350 OFFSET x = MIN (fd[d], xlim);
vtools_genesis 351 OFFSET y = x - d;
vtools_genesis 352 if (ylim < y) {
vtools_genesis 353 x = ylim + d;
vtools_genesis 354 y = ylim;
vtools_genesis 355 }
vtools_genesis 356 if (fxybest < x + y) {
vtools_genesis 357 fxybest = x + y;
vtools_genesis 358 fxbest = x;
vtools_genesis 359 }
vtools_genesis 360 }
vtools_genesis 361
vtools_genesis 362
vtools_genesis 363 bxybest = OFFSET_MAX;
vtools_genesis 364 for (d = bmax; d >= bmin; d -= 2) {
vtools_genesis 365 OFFSET x = MAX (xoff, bd[d]);
vtools_genesis 366 OFFSET y = x - d;
vtools_genesis 367 if (y < yoff) {
vtools_genesis 368 x = yoff + d;
vtools_genesis 369 y = yoff;
vtools_genesis 370 }
vtools_genesis 371 if (x + y < bxybest) {
vtools_genesis 372 bxybest = x + y;
vtools_genesis 373 bxbest = x;
vtools_genesis 374 }
vtools_genesis 375 }
vtools_genesis 376
vtools_genesis 377
vtools_genesis 378 if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff)) {
vtools_genesis 379 part->xmid = fxbest;
vtools_genesis 380 part->ymid = fxybest - fxbest;
vtools_genesis 381 part->lo_minimal = true;
vtools_genesis 382 part->hi_minimal = false;
vtools_genesis 383 } else {
vtools_genesis 384 part->xmid = bxbest;
vtools_genesis 385 part->ymid = bxybest - bxbest;
vtools_genesis 386 part->lo_minimal = false;
vtools_genesis 387 part->hi_minimal = true;
vtools_genesis 388 }
vtools_genesis 389 return;
vtools_genesis 390 }
vtools_genesis 391 }
vtools_genesis 392 #undef XREF_YREF_EQUAL
vtools_genesis 393 }
vtools_genesis 394
vtools_genesis 395
vtools_genesis 396
vtools_genesis 397 are known, as a whole, to match each other.
vtools_genesis 398
vtools_genesis 399 The subsequence of vector 0 is |[xoff, xlim)| and likewise for
vtools_genesis 400 vector 1.
vtools_genesis 401
vtools_genesis 402 Note that |xlim, ylim| are exclusive bounds. All indices into the
vtools_genesis 403 vectors are origin-0.
vtools_genesis 404
vtools_genesis 405 If |find_minimal|, find a minimal difference no matter how
vtools_genesis 406 expensive it is.
vtools_genesis 407
vtools_genesis 408 The results are recorded by invoking |note_delete| and
vtools_genesis 409 |note_insert|.
vtools_genesis 410
vtools_genesis 411 Return false if terminated normally, or true if terminated through
vtools_genesis 412 early abort. */
vtools_genesis 413
vtools_genesis 414 static bool
vtools_genesis 415 compareseq(OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
vtools_genesis 416 bool find_minimal, struct context *ctxt) {
vtools_genesis 417 #ifdef ELEMENT
vtools_genesis 418 ELEMENT const *xv = ctxt->xvec;
vtools_genesis 419 ELEMENT const *yv = ctxt->yvec;
vtools_genesis 420 #define XREF_YREF_EQUAL(x, y) EQUAL (xv[x], yv[y])
vtools_genesis 421 #else
vtools_genesis 422 #define XREF_YREF_EQUAL(x,y) XVECREF_YVECREF_EQUAL (ctxt, x, y)
vtools_genesis 423 #endif
vtools_genesis 424
vtools_genesis 425
vtools_genesis 426 while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xoff, yoff)) {
vtools_genesis 427 xoff++;
vtools_genesis 428 yoff++;
vtools_genesis 429 }
vtools_genesis 430
vtools_genesis 431
vtools_genesis 432 while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xlim - 1, ylim - 1)) {
vtools_genesis 433 xlim--;
vtools_genesis 434 ylim--;
vtools_genesis 435 }
vtools_genesis 436
vtools_genesis 437
vtools_genesis 438 if (xoff == xlim)
vtools_genesis 439 while (yoff < ylim) {
vtools_genesis 440 NOTE_INSERT (ctxt, yoff);
vtools_genesis 441 if (EARLY_ABORT (ctxt))
vtools_genesis 442 return true;
vtools_genesis 443 yoff++;
vtools_genesis 444 }
vtools_genesis 445 else if (yoff == ylim)
vtools_genesis 446 while (xoff < xlim) {
vtools_genesis 447 NOTE_DELETE (ctxt, xoff);
vtools_genesis 448 if (EARLY_ABORT (ctxt))
vtools_genesis 449 return true;
vtools_genesis 450 xoff++;
vtools_genesis 451 }
vtools_genesis 452 else {
vtools_genesis 453 struct partition part IF_LINT2 (= { .xmid = 0, .ymid = 0 });
vtools_genesis 454
vtools_genesis 455
vtools_genesis 456 diag(xoff, xlim, yoff, ylim, find_minimal, &part, ctxt);
vtools_genesis 457
vtools_genesis 458
vtools_genesis 459 if (compareseq(xoff, part.xmid, yoff, part.ymid, part.lo_minimal, ctxt))
vtools_genesis 460 return true;
vtools_genesis 461 if (compareseq(part.xmid, xlim, part.ymid, ylim, part.hi_minimal, ctxt))
vtools_genesis 462 return true;
vtools_genesis 463 }
vtools_genesis 464
vtools_genesis 465 return false;
vtools_genesis 466 #undef XREF_YREF_EQUAL
vtools_genesis 467 }
vtools_genesis 468
vtools_genesis 469 #undef ELEMENT
vtools_genesis 470 #undef EQUAL
vtools_genesis 471 #undef OFFSET
vtools_genesis 472 #undef EXTRA_CONTEXT_FIELDS
vtools_genesis 473 #undef NOTE_DELETE
vtools_genesis 474 #undef NOTE_INSERT
vtools_genesis 475 #undef EARLY_ABORT
vtools_genesis 476 #undef USE_HEURISTIC
vtools_genesis 477 #undef XVECREF_YVECREF_EQUAL
vtools_genesis 478 #undef OFFSET_MAX