[PATCH 1 of 2] parsers: incrementally parse the revlog index in C
Bryan O'Sullivan
bos at serpentine.com
Thu Mar 22 18:44:12 UTC 2012
We only parse entries in a revlog index file when they are actually needed.
This makes a huge difference to performance on large revlogs when
performing numeric lookups, if only a few entries are used (a common
case). For instance, "hg tip" on a tree with 300,000 revs takes
0.3s before, 0.02 after.
For revlog-intensive operations (e.g. running "hg log" to completion),
the lazy approach is about 1% slower than the eager parse_index2.
diff --git a/mercurial/parsers.c b/mercurial/parsers.c
--- a/mercurial/parsers.c
+++ b/mercurial/parsers.c
@@ -241,8 +241,39 @@
return ret;
}
-const char nullid[20];
-const int nullrev = -1;
+/*
+ * A list-like object that decodes the contents of a RevlogNG index
+ * file on demand. It has limited support for insert and delete at the
+ * last element before the end. The last entry is always a sentinel
+ * nullid.
+ */
+typedef struct {
+ PyObject_HEAD
+ /* Type-specific fields go here. */
+ PyObject *data;
+ const char **offsets; /* populated on demand */
+ Py_ssize_t raw_length; /* original number of elements */
+ Py_ssize_t length; /* current number of elements */
+ PyObject *added; /* populated on demand */
+ int inlined;
+} indexObject;
+
+static Py_ssize_t index_length(indexObject *self)
+{
+ if (self->added == NULL)
+ return self->length;
+ return self->length + PyList_GET_SIZE(self->added);
+}
+
+static PyObject *nullentry;
+
+static long inline_scan(indexObject *self, const char **offsets);
+
+#if LONG_MAX == 0x7fffffffL
+static const char *tuple_format = "Kiiiiiis#";
+#else
+static const char *tuple_format = "kiiiiiis#";
+#endif
/* RevlogNG format (all in big endian, data may be inlined):
* 6 bytes: offset
@@ -255,138 +286,374 @@
* 4 bytes: parent 2 revision
* 32 bytes: nodeid (only 20 bytes used)
*/
-static int _parse_index_ng(const char *data, int size, int inlined,
- PyObject *index)
+static PyObject *index_get(indexObject *self, Py_ssize_t pos)
{
- PyObject *entry;
- int n = 0, err;
+ uint32_t decode[8]; /* to enforce alignment with inline data */
uint64_t offset_flags;
int comp_len, uncomp_len, base_rev, link_rev, parent_1, parent_2;
const char *c_node_id;
- const char *end = data + size;
- uint32_t decode[8]; /* to enforce alignment with inline data */
+ const char *data;
+ Py_ssize_t length = index_length(self);
+ PyObject *entry;
- while (data < end) {
- unsigned int step;
+ if (pos >= length) {
+ PyErr_SetString(PyExc_IndexError, "revlog index out of range");
+ return NULL;
+ }
- memcpy(decode, data, 32);
- offset_flags = ntohl(decode[1]);
- if (n == 0) /* mask out version number for the first entry */
- offset_flags &= 0xFFFF;
- else {
- uint32_t offset_high = ntohl(decode[0]);
- offset_flags |= ((uint64_t)offset_high) << 32;
+ if (pos == length - 1) {
+ Py_INCREF(nullentry);
+ return nullentry;
+ }
+
+ if (pos >= self->length - 1) {
+ PyObject *obj;
+ obj = PyList_GET_ITEM(self->added, pos - self->length + 1);
+ Py_INCREF(obj);
+ return obj;
+ }
+
+ if (self->inlined && pos > 0) {
+ if (self->offsets == NULL) {
+ self->offsets = malloc(self->raw_length *
+ sizeof(*self->offsets));
+ if (self->offsets == NULL)
+ return PyErr_NoMemory();
+ inline_scan(self, self->offsets);
}
+ data = self->offsets[pos];
+ } else
+ data = PyString_AS_STRING(self->data) + pos * 64;
- comp_len = ntohl(decode[2]);
- uncomp_len = ntohl(decode[3]);
- base_rev = ntohl(decode[4]);
- link_rev = ntohl(decode[5]);
- parent_1 = ntohl(decode[6]);
- parent_2 = ntohl(decode[7]);
- c_node_id = data + 32;
+ memcpy(decode, data, 8 * sizeof(uint32_t));
- entry = Py_BuildValue("Liiiiiis#", offset_flags, comp_len,
+ offset_flags = ntohl(decode[1]);
+ if (pos == 0) /* mask out version number for the first entry */
+ offset_flags &= 0xFFFF;
+ else {
+ uint32_t offset_high = ntohl(decode[0]);
+ offset_flags |= ((uint64_t)offset_high) << 32;
+ }
+
+ comp_len = ntohl(decode[2]);
+ uncomp_len = ntohl(decode[3]);
+ base_rev = ntohl(decode[4]);
+ link_rev = ntohl(decode[5]);
+ parent_1 = ntohl(decode[6]);
+ parent_2 = ntohl(decode[7]);
+ c_node_id = data + 32;
+
+ entry = Py_BuildValue(tuple_format, offset_flags, comp_len,
uncomp_len, base_rev, link_rev,
parent_1, parent_2, c_node_id, 20);
- if (!entry)
- return 0;
+ if (entry)
+ PyObject_GC_UnTrack(entry);
- PyObject_GC_UnTrack(entry); /* don't waste time with this */
+ return entry;
+}
- if (inlined) {
- err = PyList_Append(index, entry);
- Py_DECREF(entry);
- if (err)
- return 0;
- } else
- PyList_SET_ITEM(index, n, entry); /* steals reference */
+static PyObject *index_insert(indexObject *self, PyObject *args)
+{
+ PyObject *obj, *node;
+ long offset;
+ Py_ssize_t len;
- n++;
- step = 64 + (inlined ? comp_len : 0);
- if (data + step > end || data + step < data)
- break;
- data += step;
+ if (!PyArg_ParseTuple(args, "lO", &offset, &obj))
+ return NULL;
+
+ if (!PyTuple_Check(obj) || PyTuple_GET_SIZE(obj) != 8) {
+ PyErr_SetString(PyExc_ValueError, "8-tuple required");
+ return NULL;
}
- if (data != end) {
- if (!PyErr_Occurred())
- PyErr_SetString(PyExc_ValueError, "corrupt index file");
+
+ node = PyTuple_GET_ITEM(obj, 7);
+ if (!PyString_Check(node) || PyString_GET_SIZE(node) != 20) {
+ PyErr_SetString(PyExc_ValueError,
+ "20-byte hash required as last element");
+ return NULL;
+ }
+
+ len = index_length(self);
+
+ if (offset < 0)
+ offset += len;
+
+ if (offset != len - 1) {
+ PyErr_SetString(PyExc_IndexError,
+ "insert only supported at index -1");
+ return NULL;
+ }
+
+ if (self->added == NULL) {
+ self->added = PyList_New(0);
+ if (self->added == NULL)
+ return NULL;
+ }
+
+ if (PyList_Append(self->added, obj) == -1)
+ return NULL;
+
+ Py_RETURN_NONE;
+}
+
+static int
+index_ass_subscript(indexObject* self, PyObject* item, PyObject* value)
+{
+ Py_ssize_t start, stop, step, slicelength;
+ Py_ssize_t length = index_length(self);
+
+ if (!PySlice_Check(item) || value != NULL) {
+ PyErr_SetString(PyExc_TypeError,
+ "revlog index only supports slice deletion");
+ return -1;
+ }
+
+ if (PySlice_GetIndicesEx((PySliceObject*)item, length,
+ &start, &stop, &step, &slicelength) < 0)
+ return -1;
+
+ if (slicelength <= 0)
+ return 0;
+
+ if ((step < 0 && start < stop) || (step > 0 && start > stop))
+ stop = start;
+
+ if (step < 0) {
+ stop = start + 1;
+ start = stop + step*(slicelength - 1) - 1;
+ step = -step;
+ }
+
+ if (step != 1) {
+ PyErr_SetString(PyExc_ValueError,
+ "revlog index delete requires step size of 1");
+ return -1;
+ }
+
+ if (stop != length - 1) {
+ PyErr_SetString(PyExc_IndexError,
+ "revlog index deletion indices are invalid");
+ return -1;
+ }
+
+ if (start < self->length) {
+ self->length = start + 1;
+ if (self->added) {
+ Py_DECREF(self->added);
+ self->added = NULL;
+ }
return 0;
}
- /* create the magic nullid entry in the index at [-1] */
- entry = Py_BuildValue("Liiiiiis#", (uint64_t)0, 0, 0, -1, -1, -1, -1, nullid, 20);
-
- if (!entry)
- return 0;
-
- PyObject_GC_UnTrack(entry); /* don't waste time with this */
-
- if (inlined) {
- err = PyList_Append(index, entry);
- Py_DECREF(entry);
- if (err)
- return 0;
- } else
- PyList_SET_ITEM(index, n, entry); /* steals reference */
-
- return 1;
+ return PyList_SetSlice(self->added, start - self->length + 1,
+ PyList_GET_SIZE(self->added),
+ NULL);
}
-/* This function parses a index file and returns a Python tuple of the
- * following format: (index, cache)
+static long inline_scan(indexObject *self, const char **offsets)
+{
+ const char *data = PyString_AS_STRING(self->data);
+ const char *end = data + PyString_GET_SIZE(self->data);
+ const long hdrsize = 64;
+ long incr = hdrsize;
+ Py_ssize_t len = 0;
+
+ while (data + hdrsize <= end) {
+ uint32_t comp_len;
+ const char *old_data;
+ /* 3rd element of header is length of compressed inline data */
+ memcpy(&comp_len, data + 8, sizeof(uint32_t));
+ incr = hdrsize + ntohl(comp_len);
+ if (incr < hdrsize)
+ break;
+ if (offsets)
+ offsets[len] = data;
+ len++;
+ old_data = data;
+ data += incr;
+ if (data <= old_data)
+ break;
+ }
+
+ if (data != end && data + hdrsize != end) {
+ if (!PyErr_Occurred())
+ PyErr_SetString(PyExc_ValueError, "corrupt index file");
+ return -1;
+ }
+
+ return len;
+}
+
+static int index_real_init(indexObject *self, const char *data, int size,
+ PyObject *inlined_obj, PyObject *data_obj)
+{
+ self->inlined = inlined_obj && PyObject_IsTrue(inlined_obj);
+ self->data = data_obj;
+
+ self->added = NULL;
+ self->offsets = NULL;
+ Py_INCREF(self->data);
+
+ if (self->inlined) {
+ long len = inline_scan(self, NULL);
+ if (len == -1)
+ goto bail;
+ self->raw_length = len;
+ self->length = len + 1;
+ } else {
+ if (size % 64) {
+ PyErr_SetString(PyExc_ValueError, "corrupt index file");
+ goto bail;
+ }
+ self->raw_length = size / 64;
+ self->length = self->raw_length + 1;
+ }
+
+ return 0;
+bail:
+ return -1;
+}
+
+static int
+index_init(indexObject *self, PyObject *args, PyObject *kwds)
+{
+ const char *data;
+ int size;
+ PyObject *inlined_obj;
+
+ if (!PyArg_ParseTuple(args, "s#O", &data, &size, &inlined_obj))
+ return -1;
+
+ return index_real_init(self, data, size, inlined_obj,
+ PyTuple_GET_ITEM(args, 0));
+}
+
+static void
+index_dealloc(indexObject *self)
+{
+ Py_DECREF(self->data);
+ Py_XDECREF(self->added);
+ free(self->offsets);
+ PyObject_Del(self);
+}
+
+static PySequenceMethods index_sequence_methods = {
+ (lenfunc)index_length, /* sq_length */
+ 0, /* sq_concat */
+ 0, /* sq_repeat */
+ (ssizeargfunc)index_get, /* sq_item */
+};
+
+static PyMappingMethods index_mapping_methods = {
+ (lenfunc)index_length, /* mp_length */
+ NULL, /* mp_subscript */
+ (objobjargproc)index_ass_subscript, /* mp_ass_subscript */
+};
+
+static PyMethodDef index_methods[] = {
+ {"insert", (PyCFunction)index_insert, METH_VARARGS,
+ "insert an index entry"},
+ {NULL} /* Sentinel */
+};
+
+static PyTypeObject indexType = {
+ PyObject_HEAD_INIT(NULL)
+ 0, /*ob_size*/
+ "index.index", /*tp_name*/
+ sizeof(indexObject), /*tp_basicsize*/
+ 0, /*tp_itemsize*/
+ (destructor)index_dealloc, /*tp_dealloc*/
+ 0, /*tp_print*/
+ 0, /*tp_getattr*/
+ 0, /*tp_setattr*/
+ 0, /*tp_compare*/
+ 0, /*tp_repr*/
+ 0, /*tp_as_number*/
+ &index_sequence_methods, /*tp_as_sequence*/
+ &index_mapping_methods, /*tp_as_mapping*/
+ 0, /*tp_hash */
+ 0, /*tp_call*/
+ 0, /*tp_str*/
+ 0, /*tp_getattro*/
+ 0, /*tp_setattro*/
+ 0, /*tp_as_buffer*/
+ Py_TPFLAGS_DEFAULT, /*tp_flags*/
+ "revlog index", /* tp_doc */
+ 0, /* tp_traverse */
+ 0, /* tp_clear */
+ 0, /* tp_richcompare */
+ 0, /* tp_weaklistoffset */
+ 0, /* tp_iter */
+ 0, /* tp_iternext */
+ index_methods, /* tp_methods */
+ 0, /* tp_members */
+ 0, /* tp_getset */
+ 0, /* tp_base */
+ 0, /* tp_dict */
+ 0, /* tp_descr_get */
+ 0, /* tp_descr_set */
+ 0, /* tp_dictoffset */
+ (initproc)index_init, /* tp_init */
+ 0, /* tp_alloc */
+ PyType_GenericNew, /* tp_new */
+};
+
+/*
+ * returns a tuple of the form (index, None, cache) with elements as
+ * follows:
*
- * index: a list of tuples containing the RevlogNG records
- * cache: if data is inlined, a tuple (index_file_content, 0) else None
+ * index: an index object that lazily parses the RevlogNG records
+ * cache: if data is inlined, a tuple (index_file_content, 0), else None
+ *
+ * added complications are for backwards compatibility
*/
static PyObject *parse_index2(PyObject *self, PyObject *args)
{
const char *data;
- int size, inlined;
- PyObject *rval = NULL, *index = NULL, *cache = NULL;
- PyObject *data_obj = NULL, *inlined_obj;
+ int size, ret;
+ PyObject *inlined_obj, *tuple = NULL, *cache = NULL, *none = NULL;
+ indexObject *idx;
if (!PyArg_ParseTuple(args, "s#O", &data, &size, &inlined_obj))
return NULL;
- inlined = inlined_obj && PyObject_IsTrue(inlined_obj);
- /* If no data is inlined, we know the size of the index list in
- * advance: size divided by the size of one revlog record (64 bytes)
- * plus one for nullid */
- index = inlined ? PyList_New(0) : PyList_New(size / 64 + 1);
- if (!index)
- goto quit;
+ idx = PyObject_New(indexObject, &indexType);
- /* set up the cache return value */
- if (inlined) {
- /* Note that the reference to data_obj is only borrowed */
- data_obj = PyTuple_GET_ITEM(args, 0);
- cache = Py_BuildValue("iO", 0, data_obj);
- if (!cache)
- goto quit;
+ if (idx == NULL)
+ goto bail;
+
+ ret = index_real_init(idx, data, size, inlined_obj,
+ PyTuple_GET_ITEM(args, 0));
+ if (ret)
+ goto bail;
+
+ if (idx->inlined) {
+ Py_INCREF(idx->data);
+ cache = Py_BuildValue("iO", 0, idx->data);
+ if (cache == NULL)
+ goto bail;
} else {
cache = Py_None;
- Py_INCREF(Py_None);
+ Py_INCREF(cache);
}
- /* actually populate the index with data */
- if (!_parse_index_ng(data, size, inlined, index))
- goto quit;
+ none = Py_None;
+ Py_INCREF(none);
- rval = Py_BuildValue("NN", index, cache);
- if (!rval)
- goto quit;
- return rval;
+ tuple = Py_BuildValue("NNN", idx, none, cache);
+ if (!tuple)
+ goto bail;
+ return tuple;
-quit:
- Py_XDECREF(index);
+bail:
+ Py_XDECREF(idx);
Py_XDECREF(cache);
- Py_XDECREF(rval);
+ Py_XDECREF(none);
+ Py_XDECREF(tuple);
return NULL;
}
-
static char parsers_doc[] = "Efficient content parsing.";
static PyMethodDef methods[] = {
@@ -396,6 +663,22 @@
{NULL, NULL}
};
+static void module_init(PyObject *mod)
+{
+ static const char nullid[20];
+
+ if (PyType_Ready(&indexType) < 0)
+ return;
+ Py_INCREF(&indexType);
+
+ PyModule_AddObject(mod, "index", (PyObject *)&indexType);
+
+ nullentry = Py_BuildValue("iiiiiiis#", 0, 0, 0,
+ -1, -1, -1, -1, nullid, 20);
+ if (nullentry)
+ PyObject_GC_UnTrack(nullentry);
+}
+
#ifdef IS_PY3K
static struct PyModuleDef parsers_module = {
PyModuleDef_HEAD_INIT,
@@ -407,12 +690,15 @@
PyMODINIT_FUNC PyInit_parsers(void)
{
- return PyModule_Create(&parsers_module);
+ PyObject *mod = PyModule_Create(&parsers_module);
+ module_init(mod);
+ return mod;
}
#else
PyMODINIT_FUNC initparsers(void)
{
- Py_InitModule3("parsers", methods, parsers_doc);
+ PyObject *mod = Py_InitModule3("parsers", methods, parsers_doc);
+ module_init(mod);
}
#endif
diff --git a/mercurial/revlog.py b/mercurial/revlog.py
--- a/mercurial/revlog.py
+++ b/mercurial/revlog.py
@@ -171,10 +171,7 @@
def __init__(self):
self.size = struct.calcsize(indexformatng)
- def parseindex(self, data, inline):
- # call the C implementation to parse the index data
- index, cache = parsers.parse_index2(data, inline)
- return index, None, cache
+ parseindex = staticmethod(parsers.parse_index2)
def packentry(self, entry, node, version, rev):
p = _pack(indexformatng, *entry)
diff --git a/tests/test-parseindex2.py b/tests/test-parseindex2.py
--- a/tests/test-parseindex2.py
+++ b/tests/test-parseindex2.py
@@ -52,7 +52,6 @@
return index, cache
-
data_inlined = '\x00\x01\x00\x01\x00\x00\x00\x00\x00\x00\x01\x8c' \
'\x00\x00\x04\x07\x00\x00\x00\x00\x00\x00\x15\x15\xff\xff\xff' \
'\xff\xff\xff\xff\xff\xebG\x97\xb7\x1fB\x04\xcf\x13V\x81\tw\x1b' \
@@ -94,13 +93,16 @@
'\xb6\r\x98B\xcb\x07\xbd`\x8f\x92\xd9\xc4\x84\xbdK\x00\x00\x00' \
'\x00\x00\x00\x00\x00\x00\x00\x00\x00'
+def parse_index2(data, inline):
+ index, alwaysnone, chunkcache = parsers.parse_index2(data, inline)
+ return list(index), chunkcache
+
def runtest() :
-
py_res_1 = py_parseindex(data_inlined, True)
- c_res_1 = parsers.parse_index2(data_inlined, True)
+ c_res_1 = parse_index2(data_inlined, True)
py_res_2 = py_parseindex(data_non_inlined, False)
- c_res_2 = parsers.parse_index2(data_non_inlined, False)
+ c_res_2 = parse_index2(data_non_inlined, False)
if py_res_1 != c_res_1:
print "Parse index result (with inlined data) differs!"
More information about the Mercurial-devel
mailing list