Thoughts on diff extensions

Matt Mackall mpm at selenic.com
Sun Jun 19 02:05:30 UTC 2005


The requirements for replacing mdiff.diff are:

- significantly faster (obviously)
- roughly the same size output or better
- works on line boundaries (so verify continues to work)
- doesn't choke on binary data including embedded nulls
- is reasonably simple.

Unfortunately both of the implementations that got posted have issues
with the above. Chris M's xdiff-derived version is block-based and
Chris L's diffutils-derived version is quite large and presumably has
trouble with embedded nulls in the same way that regular diff does.

So I took a stab at the diff extension problem. And what I came up
with is quite simple (<300 lines), works on line boundaries, deals
with binary data, and passed a fairly stringent regression test
(diffing and reapplying every delta in the kernel source). I've
attached it below.

Unfortunately, it is in fact slower than the stock Python difflib and
sometimes even generates larger diffs, despite using the same core
algorithm as GNU diff which generates a so-called "shortest edit
sequence".

It's slower than GNU diff because GNU diff gives up early and returns
suboptimal sequences unless you pass it the -d flag (see SNAKE_LIMIT
in the source). I haven't quite figured out how to do that with my
code yet.

And its optimal edit sequence is sometimes suboptimal for Mercurial
because it weighs inserts and deletes the same. This isn't a big deal,
but the ideal algorithm would actually take note of the relative
costs.

So now we're up to three not quite good enough implementations!


/*
 bdiff.c - efficient binary diff extension for Mercurial

 Copyright 2005 Matt Mackall <mpm at selenic.com>

 This software may be used and distributed according to the terms of
 the GNU General Public License, incorporated herein by reference.

 Based on:
  diff.c - compute a shortest edit script (SES) given two sequences
  Copyright (c) 2004 Michael B. Allen <mba2000 ioplex.com>

 This algorithm is basically Myers' solution to SES/LCS with the
 Hirschberg linear space refinement as described in the following
 publication:

   E. Myers, ``An O(ND) Difference Algorithm and Its Variations,''
   Algorithmica 1, 2 (1986), 251-266.
   http://www.eecs.berkeley.edu/~gene/Papers/diff.pdf

 This is the same core algorithm used by GNU diff(1).
*/

#include <Python.h>
#include <stdlib.h>
#include <string.h>
#include <netinet/in.h>
#include <sys/types.h>
#include <time.h>

struct line {
	int h, len;
	const char *l;
};

struct hunk {
	int a1, a2, b1, b2;
};

struct hunklist {
	struct hunk *base, *head;
};

struct snake {
	int x, y, u, v;
};

int splitlines(const char *a, int len, struct line **lr)
{
	int i = 1, h = 0;
	const char *p, *b = a;
	struct line *l;

	for (p = a; p < a + len; p++)
		if (*p == '\n' || p == a + len - 1)
			i++;

	*lr = l = malloc(sizeof(struct line) * i);
	if (!l)
		return -1;

	for (p = a; p < a + len; p++) {
		/* a simple hash from GNU diff */
		h = *p + ((h << 7) | (h >> (sizeof(h) * 8 - 7)));
		if (*p == '\n' || p == a + len - 1) {
			l->h = h;
			l->l = b;
			l->len = p - b + 1;
			l++;
			b = p + 1;
			h = 0;
		}
	}

	l->h = l->len = 0;
	l->l = a + len;

	return i - 1;
}

int inline cmp(struct line *a, struct line *b)
{
	return a->h != b->h || a->len != b->len; // || memcmp(a->l, b->l, a->len);
}

/* Pack -N to N into 0 to N * 2 */
static inline int fp(int k)
{
	return k <=0 ? -k * 4 : k * 4 - 2;
}

static inline int rp(int k)
{
	return k <=0 ? -k * 4 + 1 : k * 4 -1;
}

static int find_snake(struct line *a, int aoff, int n,
		      struct line *b, int boff, int m, struct snake *s, int *v)
{
	int d, k, x, y, q, l = n - m;
	int odd = l & 1;
	int mid = (n + m) / 2 + odd;

	v[fp(1)] = 0;
	v[rp(l - 1)] = n;

	for (d = 0; d <= mid; d++) {
		for (k = d; k >= -d; k -= 2) {
			/* find furthest reaching fwd D-path on diagonal k */
			if (k == -d || (k != d && v[fp(k - 1)] < v[fp(k + 1)]))
				x = v[fp(k + 1)];
			else
				x = v[fp(k - 1)] + 1;

			y = x - k;
			s->x = x;
			s->y = y;

			while (x < n && y < m &&
			       !cmp(a + aoff + x, b + boff + y)) {
				x++;
				y++;
			}

			v[fp(k)] = x;

			if (odd && k >= (l - d + 1) && k <= (l + d - 1)) {
				if (x >= v[rp(k)]) {
					s->u = x;
					s->v = y;
					return 2 * d - 1;
				}
			}
		}

		for (k = d; k >= -d; k -= 2) {
			/* find furthest reaching rev D-path on diagonal l+k */
			q = l + k;
			if (k == d || (k != -d && v[rp(q - 1)] < v[rp(q + 1)]))
				x = v[rp(q - 1)];
			else
				x = v[rp(q + 1)] - 1;

			y = x - q;
			s->u = x;
			s->v = y;

			while (x > 0 && y > 0 &&
			       !cmp(a + aoff + x - 1, b + boff + y - 1)) {
				x--;
				y--;
			}

			v[rp(q)] = x;

			if (!odd && q >= -d && q <= d) {
				if (x <= v[fp(q)]) {
					s->x = x;
					s->y = y;
					return 2 * d;
				}
			}
		}
	}
	return -1;
}

void addhunk(struct hunklist *l, int a1, int a2, int b1, int b2)
{
	if (l->head->a2 != a1 || l->head->b2 != b1) {
		l->head++;
		l->head->a1 = a1;
		l->head->b1 = b1;
	}
	l->head->a2 = a2;
	l->head->b2 = b2;
}

static int subdiff(struct line *a, int aoff, int n,
		   struct line *b, int boff, int m, struct hunklist *l, int *v)
{
	struct snake s;
	int d;

	if (!n) {
		addhunk(l, aoff, aoff, boff, boff + m);
		return m;
	} else if (!m) {
		addhunk(l, aoff, aoff + n, boff, boff);
		return n;
	}

	/* Find the middle "snake" around which we recurse */
	d = find_snake(a, aoff, n, b, boff, m, &s, v);
	if (d > 1) {
		subdiff(a, aoff, s.x, b, boff, s.y, l, v);
		subdiff(a, aoff + s.u, n - s.u, b, boff + s.v, m - s.v, l, v);
		return d;
	}

	/* There are only 4 base cases when the edit distance is 1. */
	if (m > n) {
		if (s.x == s.u) /* 11: insert at back */
			addhunk(l, aoff + n, aoff + n, boff + m - 1, boff + m);
		else              /* 10: insert at front */
			addhunk(l, aoff, aoff, boff, boff + 1);
	} else {
		if (s.x == s.u) /* 01: delete from back */
			addhunk(l, aoff + n - 1, aoff + n, boff + m, boff + m);
		else              /* 00: delete from front */
			addhunk(l, aoff, aoff + 1, boff, boff);
	}
	return d;
}

static PyObject *bdiff(PyObject *self, PyObject *args)
{
	PyObject *sa, *sb, *result = NULL;
	struct hunklist l;
	struct hunk *h;
	struct line *al, *bl;
	char encode[12], *rb;
	int m, n, len, t = 0, *v;

	if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))
		return NULL;

	n = splitlines(PyString_AsString(sa), PyString_Size(sa), &al);
	m = splitlines(PyString_AsString(sb), PyString_Size(sb), &bl);

	v = malloc(sizeof(int) * (m + n) * 6);
	l.head = l.base = malloc(sizeof(struct hunk) * (m + n + 1));
	if (!al || !bl || !v || !l.base)
		goto nomem;

	/* skip to the first difference */
	while (t < n && t < m && !cmp(al + t, bl + t))
		t++;

	/* prime the hunk list */
	l.base->a1 = l.base->a2 = l.base->b1 = l.base->b2 = t;

	/* generate the hunk list */
	subdiff(al, t, n - t, bl, t, m - t, &l, v);
	if (l.head->a2 > 0 || l.head->b2 > 0)
		l.head++;

	t = 0;
	for(h = l.base; h != l.head; h++)
		t += 12 + bl[h->b2].l - bl[h->b1].l;

	result = PyString_FromStringAndSize(NULL, t);
	if (!result)
		goto nomem;

	rb = PyString_AsString(result);
	for(h = l.base; h != l.head; h++) {
		len = bl[h->b2].l - bl[h->b1].l;
		*(uint32_t *)(encode)     = htonl(al[h->a1].l - al->l);
		*(uint32_t *)(encode + 4) = htonl(al[h->a2].l - al->l);
		*(uint32_t *)(encode + 8) = htonl(len);
		memcpy(rb, encode, 12);
		memcpy(rb + 12, bl[h->b1].l, len);
		rb += 12 + len;
	}

nomem:
	free(al);
	free(bl);
	free(l.base);
	free(v);
	return result ? result : PyErr_NoMemory();
}

static char mdiff_doc[] = "Efficient binary diff.";

static PyMethodDef methods[] = {
	{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},
	{NULL, NULL}
};

PyMODINIT_FUNC initbdiff(void)
{
	Py_InitModule3("bdiff", methods, mdiff_doc);
}


-- 
Mathematics is the supreme nostalgia of our time.



More information about the Mercurial mailing list