[PATCH] avoid some false positives for addremove -s

Erling Ellingsen erlingalf+mrc at gmail.com
Wed Feb 14 23:56:41 UTC 2007


# HG changeset patch
# User Erling Ellingsen <erlingalf at gmail.com>
# Date 1171497082 -3600
# Node ID 475fc420289b50577d59a293e7e24586d0c81ec4
# Parent  5b1f663ef86d68ce11d70de8e5ab61d93341a18c
avoid some false positives for addremove -s

The original code uses the similarity score

  1 - len(diff(after, before)) / len(after)

The diff can at most be the size of the 'before' file, so any small
'before' file would be considered very similar to any much larger
'after' file. Removing an empty file would cause all files added in
the same revision to be considered copies of the removed file.

This changes the metric to

  2 * bytes_overlap(before, after) / len(before + after)

i.e. the actual percentage of bytes shared between the two files.

diff -r 5b1f663ef86d -r 475fc420289b mercurial/cmdutil.py
--- a/mercurial/cmdutil.py	Tue Feb 06 16:12:22 2007 -0600
+++ b/mercurial/cmdutil.py	Thu Feb 15 00:51:22 2007 +0100
@@ -8,7 +8,6 @@ from node import *
 from node import *
 from i18n import _
 import os, sys, mdiff, util, templater, patch
-
 revrangesep = ':'

 def revpair(repo, revs):
@@ -146,20 +145,31 @@ def walk(repo, pats=[], opts={}, node=No
         yield src, fn, util.pathto(repo.getcwd(), fn), fn in exact

 def findrenames(repo, added=None, removed=None, threshold=0.5):
+    '''find renamed files -- yields (before, after, score) tuples'''
     if added is None or removed is None:
         added, removed = repo.status()[1:3]
+    import bdiff
     ctx = repo.changectx()
     for a in added:
         aa = repo.wread(a)
-        bestscore, bestname = None, None
+        bestname, bestscore = None, threshold
         for r in removed:
             rr = ctx.filectx(r).data()
-            delta = mdiff.textdiff(aa, rr)
-            if len(delta) < len(aa):
-                myscore = 1.0 - (float(len(delta)) / len(aa))
-                if bestscore is None or myscore > bestscore:
-                    bestscore, bestname = myscore, r
-        if bestname and bestscore >= threshold:
+
+            # bdiff.blocks() returns blocks of matching lines
+            # count the number of bytes in each
+            equal = 0
+            alines = mdiff.splitnewlines(aa)
+            matches = bdiff.blocks(aa, rr)[:-1]
+            for x1,x2,y1,y2 in matches:
+                assert x2-x1 == y2-y1
+                for line in alines[x1:x2]:
+                    equal += len(line)
+
+            myscore = equal*2.0 / (len(aa)+len(rr))
+            if myscore >= bestscore:
+                bestscore, bestname = myscore, r
+        if bestname:
             yield bestname, a, bestscore

 def addremove(repo, pats=[], opts={}, wlock=None, dry_run=None,
diff -r 5b1f663ef86d -r 475fc420289b tests/test-addremove-similar
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test-addremove-similar	Thu Feb 15 00:51:22 2007 +0100
@@ -0,0 +1,19 @@
+#!/bin/sh
+
+hg init rep
+cd rep
+
+touch empty-file
+python -c 'for x in range(10000): print x' > large-file
+
+hg addremove
+
+hg commit -m A
+
+rm large-file empty-file
+python -c 'for x in range(10,10000): print x' > another-file
+
+hg addremove -s50
+
+hg commit -m B
+
diff -r 5b1f663ef86d -r 475fc420289b tests/test-addremove-similar-2
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test-addremove-similar-2	Thu Feb 15 00:51:22 2007 +0100
@@ -0,0 +1,20 @@
+#!/bin/sh
+
+hg init rep
+cd rep
+
+python -c 'for x in range(10000): print x' > large-file
+python -c 'for x in range(50): print x' > tiny-file
+
+hg addremove
+
+hg commit -m A
+
+python -c 'for x in range(70): print x' > small-file
+rm tiny-file
+rm large-file
+
+hg addremove -s50
+
+hg commit -m B
+
diff -r 5b1f663ef86d -r 475fc420289b tests/test-addremove-similar-2.out
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test-addremove-similar-2.out	Thu Feb 15 00:51:22 2007 +0100
@@ -0,0 +1,6 @@
+adding large-file
+adding tiny-file
+adding small-file
+removing large-file
+removing tiny-file
+recording removal of tiny-file as rename to small-file (82% similar)
diff -r 5b1f663ef86d -r 475fc420289b tests/test-addremove-similar.out
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test-addremove-similar.out	Thu Feb 15 00:51:22 2007 +0100
@@ -0,0 +1,6 @@
+adding empty-file
+adding large-file
+adding another-file
+removing empty-file
+removing large-file
+recording removal of large-file as rename to another-file (99% similar)



More information about the Mercurial-devel mailing list