D9450: cext: isolate hash size in the revlog handling in a single place

joerg.sonnenberger (Joerg Sonnenberger) phabricator at mercurial-scm.org
Sat Nov 28 22:40:03 UTC 2020


joerg.sonnenberger created this revision.
Herald added a reviewer: hg-reviewers.
Herald added a subscriber: mercurial-patches.

REPOSITORY
  rHG Mercurial

BRANCH
  default

REVISION DETAIL
  https://phab.mercurial-scm.org/D9450

AFFECTED FILES
  mercurial/cext/revlog.c

CHANGE DETAILS

diff --git a/mercurial/cext/revlog.c b/mercurial/cext/revlog.c
--- a/mercurial/cext/revlog.c
+++ b/mercurial/cext/revlog.c
@@ -54,6 +54,7 @@
 typedef struct {
 	indexObject *index;
 	nodetreenode *nodes;
+	Py_ssize_t nodelen;
 	unsigned length;   /* # nodes in use */
 	unsigned capacity; /* # nodes allocated */
 	int depth;         /* maximum depth of tree */
@@ -80,6 +81,8 @@
 	PyObject_HEAD
 	    /* Type-specific fields go here. */
 	    PyObject *data;     /* raw bytes of index */
+	Py_ssize_t nodelen;     /* digest size of the hash, 20 for SHA-1 */
+	PyObject *nullentry;    /* fast path for references to null */
 	Py_buffer buf;          /* buffer of data */
 	const char **offsets;   /* populated on demand */
 	Py_ssize_t length;      /* current on-disk number of elements */
@@ -101,14 +104,12 @@
 	return self->length + self->new_length;
 }
 
-static PyObject *nullentry = NULL;
-static const char nullid[20] = {0};
+static const char nullid[32] = {0};
 static const Py_ssize_t nullrev = -1;
 
 static Py_ssize_t inline_scan(indexObject *self, const char **offsets);
 
-static int index_find_node(indexObject *self, const char *node,
-                           Py_ssize_t nodelen);
+static int index_find_node(indexObject *self, const char *node);
 
 #if LONG_MAX == 0x7fffffffL
 static const char *const tuple_format = PY23("Kiiiiiis#", "Kiiiiiiy#");
@@ -274,7 +275,7 @@
  *    4 bytes: link revision
  *    4 bytes: parent 1 revision
  *    4 bytes: parent 2 revision
- *   32 bytes: nodeid (only 20 bytes used)
+ *   32 bytes: nodeid (only 20 bytes used with SHA-1)
  */
 static PyObject *index_get(indexObject *self, Py_ssize_t pos)
 {
@@ -285,8 +286,8 @@
 	Py_ssize_t length = index_length(self);
 
 	if (pos == nullrev) {
-		Py_INCREF(nullentry);
-		return nullentry;
+		Py_INCREF(self->nullentry);
+		return self->nullentry;
 	}
 
 	if (pos < 0 || pos >= length) {
@@ -320,11 +321,11 @@
 
 	return Py_BuildValue(tuple_format, offset_flags, comp_len, uncomp_len,
 	                     base_rev, link_rev, parent_1, parent_2, c_node_id,
-	                     (Py_ssize_t)20);
+	                     self->nodelen);
 }
 
 /*
- * Return the 20-byte SHA of the node corresponding to the given rev.
+ * Return the hash of node corresponding to the given rev.
  */
 static const char *index_node(indexObject *self, Py_ssize_t pos)
 {
@@ -342,7 +343,7 @@
 }
 
 /*
- * Return the 20-byte SHA of the node corresponding to the given rev. The
+ * Return the hash of the node corresponding to the given rev. The
  * rev is assumed to be existing. If not, an exception is set.
  */
 static const char *index_node_existing(indexObject *self, Py_ssize_t pos)
@@ -357,14 +358,15 @@
 
 static int nt_insert(nodetree *self, const char *node, int rev);
 
-static int node_check(PyObject *obj, char **node)
+static int node_check(Py_ssize_t nodelen, PyObject *obj, char **node)
 {
-	Py_ssize_t nodelen;
-	if (PyBytes_AsStringAndSize(obj, node, &nodelen) == -1)
+	Py_ssize_t thisnodelen;
+	if (PyBytes_AsStringAndSize(obj, node, &thisnodelen) == -1)
 		return -1;
-	if (nodelen == 20)
+	if (nodelen == thisnodelen)
 		return 0;
-	PyErr_SetString(PyExc_ValueError, "20-byte hash required");
+	PyErr_Format(PyExc_ValueError, "node len %zd != expected node len %zd",
+	             thisnodelen, nodelen);
 	return -1;
 }
 
@@ -382,7 +384,7 @@
 		PyErr_SetString(PyExc_TypeError, "8-tuple required");
 		return NULL;
 	}
-	if (c_node_id_len != 20 && c_node_id_len != 32) {
+	if (c_node_id_len != self->nodelen) {
 		PyErr_SetString(PyExc_TypeError, "invalid node");
 		return NULL;
 	}
@@ -697,9 +699,9 @@
 	if (iterator == NULL)
 		return -2;
 	while ((item = PyIter_Next(iterator))) {
-		if (node_check(item, &node) == -1)
+		if (node_check(self->nodelen, item, &node) == -1)
 			goto failed;
-		rev = index_find_node(self, node, 20);
+		rev = index_find_node(self, node);
 		/* null is implicitly public, so negative is invalid */
 		if (rev < 0 || rev >= len)
 			goto failed;
@@ -1493,13 +1495,17 @@
 	int (*getnybble)(const char *, Py_ssize_t) = hex ? hexdigit : nt_level;
 	int level, maxlevel, off;
 
-	if (nodelen == 20 && node[0] == '\0' && memcmp(node, nullid, 20) == 0)
+	/* If the input is binary, do a fast check for the nullid first. */
+	if (!hex && nodelen == self->nodelen && node[0] == '\0' &&
+	    node[1] == '\0' && memcmp(node, nullid, self->nodelen) == 0)
 		return -1;
 
 	if (hex)
-		maxlevel = nodelen > 40 ? 40 : (int)nodelen;
+		maxlevel = nodelen;
 	else
-		maxlevel = nodelen > 20 ? 40 : ((int)nodelen * 2);
+		maxlevel = 2 * nodelen;
+	if (maxlevel > 2 * self->nodelen)
+		maxlevel = 2 * self->nodelen;
 
 	for (level = off = 0; level < maxlevel; level++) {
 		int k = getnybble(node, level);
@@ -1557,7 +1563,7 @@
 	int level = 0;
 	int off = 0;
 
-	while (level < 40) {
+	while (level < 2 * self->nodelen) {
 		int k = nt_level(node, level);
 		nodetreenode *n;
 		int v;
@@ -1576,7 +1582,7 @@
 
 			if (oldnode == NULL)
 				return -1;
-			if (!memcmp(oldnode, node, 20)) {
+			if (!memcmp(oldnode, node, self->nodelen)) {
 				n->children[k] = -rev - 2;
 				return 0;
 			}
@@ -1634,6 +1640,7 @@
 	/* The input capacity is in terms of revisions, while the field is in
 	 * terms of nodetree nodes. */
 	self->capacity = (capacity < 4 ? 4 : capacity / 2);
+	self->nodelen = index->nodelen;
 	self->depth = 0;
 	self->splits = 0;
 	if ((size_t)self->capacity > INT_MAX / sizeof(nodetreenode)) {
@@ -1678,7 +1685,7 @@
 {
 	int level, off;
 
-	for (level = off = 0; level < 40; level++) {
+	for (level = off = 0; level < 2 * self->nodelen; level++) {
 		int k, v;
 		nodetreenode *n = &self->nodes[off];
 		k = nt_level(node, level);
@@ -1689,7 +1696,7 @@
 			n = index_node_existing(self->index, v);
 			if (n == NULL)
 				return -3;
-			if (memcmp(node, n, 20) != 0)
+			if (memcmp(node, n, self->nodelen) != 0)
 				/*
 				 * Found a unique prefix, but it wasn't for the
 				 * requested node (i.e the requested node does
@@ -1719,7 +1726,7 @@
 
 	if (!PyArg_ParseTuple(args, "O", &val))
 		return NULL;
-	if (node_check(val, &node) == -1)
+	if (node_check(self->nt.nodelen, val, &node) == -1)
 		return NULL;
 
 	length = nt_shortest(&self->nt, node);
@@ -1819,8 +1826,7 @@
  *   -2: not found (no exception set)
  * rest: valid rev
  */
-static int index_find_node(indexObject *self, const char *node,
-                           Py_ssize_t nodelen)
+static int index_find_node(indexObject *self, const char *node)
 {
 	int rev;
 
@@ -1828,7 +1834,7 @@
 		return -3;
 
 	self->ntlookups++;
-	rev = nt_find(&self->nt, node, nodelen, 0);
+	rev = nt_find(&self->nt, node, self->nodelen, 0);
 	if (rev >= -1)
 		return rev;
 
@@ -1846,7 +1852,7 @@
 			const char *n = index_node_existing(self, rev);
 			if (n == NULL)
 				return -3;
-			if (memcmp(node, n, nodelen > 20 ? 20 : nodelen) == 0) {
+			if (memcmp(node, n, self->nodelen) == 0) {
 				if (nt_insert(&self->nt, n, rev) == -1)
 					return -3;
 				break;
@@ -1861,7 +1867,7 @@
 				self->ntrev = rev + 1;
 				return -3;
 			}
-			if (memcmp(node, n, nodelen > 20 ? 20 : nodelen) == 0) {
+			if (memcmp(node, n, self->nodelen) == 0) {
 				break;
 			}
 		}
@@ -1886,9 +1892,9 @@
 		return index_get(self, idx);
 	}
 
-	if (node_check(value, &node) == -1)
+	if (node_check(self->nodelen, value, &node) == -1)
 		return NULL;
-	rev = index_find_node(self, node, 20);
+	rev = index_find_node(self, node);
 	if (rev >= -1)
 		return PyInt_FromLong(rev);
 	if (rev == -2)
@@ -1930,7 +1936,7 @@
 		return NULL;
 	}
 
-	if (nodelen > 40) {
+	if (nodelen > 2 * self->nodelen) {
 		PyErr_SetString(PyExc_ValueError, "key too long");
 		return NULL;
 	}
@@ -1956,14 +1962,14 @@
 	case -2:
 		Py_RETURN_NONE;
 	case -1:
-		return PyBytes_FromStringAndSize(nullid, 20);
+		return PyBytes_FromStringAndSize(nullid, self->nodelen);
 	}
 
 	fullnode = index_node_existing(self, rev);
 	if (fullnode == NULL) {
 		return NULL;
 	}
-	return PyBytes_FromStringAndSize(fullnode, 20);
+	return PyBytes_FromStringAndSize(fullnode, self->nodelen);
 }
 
 static PyObject *index_shortest(indexObject *self, PyObject *args)
@@ -1974,7 +1980,7 @@
 
 	if (!PyArg_ParseTuple(args, "O", &val))
 		return NULL;
-	if (node_check(val, &node) == -1)
+	if (node_check(self->nodelen, val, &node) == -1)
 		return NULL;
 
 	self->ntlookups++;
@@ -2000,9 +2006,9 @@
 
 	if (!PyArg_ParseTuple(args, "O", &val))
 		return NULL;
-	if (node_check(val, &node) == -1)
+	if (node_check(self->nodelen, val, &node) == -1)
 		return NULL;
-	rev = index_find_node(self, node, 20);
+	rev = index_find_node(self, node);
 	if (rev == -3)
 		return NULL;
 	if (rev == -2)
@@ -2022,10 +2028,10 @@
 		return rev >= -1 && rev < index_length(self);
 	}
 
-	if (node_check(value, &node) == -1)
+	if (node_check(self->nodelen, value, &node) == -1)
 		return -1;
 
-	switch (index_find_node(self, node, 20)) {
+	switch (index_find_node(self, node)) {
 	case -3:
 		return -1;
 	case -2:
@@ -2048,9 +2054,9 @@
 	char *node;
 	int rev;
 
-	if (node_check(val, &node) == -1)
+	if (node_check(self->nodelen, val, &node) == -1)
 		return NULL;
-	rev = index_find_node(self, node, 20);
+	rev = index_find_node(self, node);
 	if (rev >= -1)
 		return PyInt_FromLong(rev);
 	if (rev == -2)
@@ -2529,7 +2535,7 @@
 	if (PySlice_Check(item) && value == NULL)
 		return index_slice_del(self, item);
 
-	if (node_check(item, &node) == -1)
+	if (node_check(self->nodelen, item, &node) == -1)
 		return -1;
 
 	if (value == NULL)
@@ -2596,6 +2602,8 @@
 	Py_INCREF(Py_None);
 	self->ntinitialized = 0;
 	self->offsets = NULL;
+	self->nodelen = 20;
+	self->nullentry = NULL;
 
 	if (!PyArg_ParseTuple(args, "OO", &data_obj, &inlined_obj))
 		return -1;
@@ -2604,6 +2612,16 @@
 		                "data does not support buffer interface");
 		return -1;
 	}
+	if (self->nodelen < 20 || self->nodelen > sizeof(nullid)) {
+		PyErr_SetString(PyExc_RuntimeError, "unsupported node size");
+		return -1;
+	}
+
+	self->nullentry = Py_BuildValue(PY23("iiiiiiis#", "iiiiiiiy#"), 0, 0, 0,
+	                                -1, -1, -1, -1, nullid, self->nodelen);
+	if (!self->nullentry)
+		return -1;
+	PyObject_GC_UnTrack(self->nullentry);
 
 	if (PyObject_GetBuffer(data_obj, &self->buf, PyBUF_SIMPLE) == -1)
 		return -1;
@@ -2671,6 +2689,7 @@
 	}
 	Py_XDECREF(self->data);
 	PyMem_Free(self->added);
+	Py_XDECREF(self->nullentry);
 	PyObject_Del(self);
 }
 
@@ -2841,14 +2860,6 @@
 	Py_INCREF(&nodetreeType);
 	PyModule_AddObject(mod, "nodetree", (PyObject *)&nodetreeType);
 
-	if (!nullentry) {
-		nullentry =
-		    Py_BuildValue(PY23("iiiiiiis#", "iiiiiiiy#"), 0, 0, 0, -1,
-		                  -1, -1, -1, nullid, (Py_ssize_t)20);
-	}
-	if (nullentry)
-		PyObject_GC_UnTrack(nullentry);
-
 	caps = PyCapsule_New(&CAPI, "mercurial.cext.parsers.revlog_CAPI", NULL);
 	if (caps != NULL)
 		PyModule_AddObject(mod, "revlog_CAPI", caps);



To: joerg.sonnenberger, #hg-reviewers
Cc: mercurial-patches, mercurial-devel


More information about the Mercurial-devel mailing list