From c44b50c507a56814e7585982f7bc09aa95ed81be Mon Sep 17 00:00:00 2001 From: j--baker Date: Tue, 3 Mar 2015 13:29:46 -0500 Subject: [PATCH 1/8] commons-text - SANDBOX-491: - Restore the non-threshold implementation of the Levenshtein distance algorithm from commons-lang3's StringUtils. This version avoids copying arrays. compare(left, right) no longer delegates to compare(left, right, threshold). - Fix the threshold implementation's table striping ASCII art. - Add "threshold" field to LevenshteinDistance. If this is null, then the unlimited version of the algorithm is used. Else, the limited version is used. - Add default and threshold versions of the constructor. - Add static getDefaultInstance() which returns the unlimited version. --- .../text/similarity/LevenshteinDistance.java | 227 +++++++++++++++--- .../similarity/LevenshteinDistanceTest.java | 110 +++++---- 2 files changed, 250 insertions(+), 87 deletions(-) diff --git a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java index 4bc2e8a5f4..e1bb08b7be 100644 --- a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java +++ b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java @@ -33,27 +33,52 @@ */ public class LevenshteinDistance implements StringMetric { + private static final LevenshteinDistance DEFAULT_INSTANCE = new LevenshteinDistance(); + + private final Integer threshold; + /** - * Find the Levenshtein distance between two Strings. - * - *

A higher score indicates a greater distance.

- * *

- * The previous implementation of the Levenshtein distance algorithm was - * from http://www.merriampark.com - * /ld.htm + * This returns the default instance that uses a version + * of the algorithm that does not use a threshold parameter. *

* + * @see {@link #getDefaultInstance()} + */ + public LevenshteinDistance() { + this(null); + } + + /** *

- * Chas Emerick has written an implementation in Java, which avoids an - * OutOfMemoryError which can occur when my Java implementation is used with - * very large strings.
- * This implementation of the Levenshtein distance algorithm is from http://www.merriampark.com/ - * ldjava.htm + * If the threshold is not null, distance calculations will be limited to a maximum length. + * If the threshold is null, the unlimited version of the algorithm will be used. *

* + * @param threshold + * If this is null then distances calculations will not be limited. + * This may not be negative. + */ + public LevenshteinDistance(final Integer threshold) { + if (threshold != null && threshold < 0) { + throw new IllegalArgumentException("Threshold must not be negative"); + } + this.threshold = threshold; + } + + /** + *

Find the Levenshtein distance between two Strings.

+ * + *

A higher score indicates a greater distance.

+ * + *

The previous implementation of the Levenshtein distance algorithm + * was from http://www.merriampark.com/ld.htm

+ * + *

Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError + * which can occur when my Java implementation is used with very large strings.
+ * This implementation of the Levenshtein distance algorithm + * is from http://www.merriampark.com/ldjava.htm

+ * *
      * distance.compare(null, *)             = IllegalArgumentException
      * distance.compare(*, null)             = IllegalArgumentException
@@ -68,14 +93,25 @@ public class LevenshteinDistance implements StringMetric {
      * distance.compare("hello", "hallo")    = 1
      * 
* - * @param left the first string, must not be null - * @param right the second string, must not be null - * @return result distance + * @param s the first String, must not be null + * @param t the second String, must not be null + * @return result distance, or -1 * @throws IllegalArgumentException if either String input {@code null} */ - @Override - public Integer compare(CharSequence left, CharSequence right) { - return compare(left, right, Integer.MAX_VALUE); + public Integer compare(CharSequence s, CharSequence t) { + if (threshold != null) { + return limitedCompare(s, t, threshold); + } else { + return unlimitedCompare(s, t); + } + } + + public static LevenshteinDistance getDefaultInstance() { + return DEFAULT_INSTANCE; + } + + public Integer getThreshold() { + return threshold; } /** @@ -91,27 +127,25 @@ public Integer compare(CharSequence left, CharSequence right) { *

* *
-     * distance.compare(null, *, *)             = IllegalArgumentException
-     * distance.compare(*, null, *)             = IllegalArgumentException
-     * distance.compare(*, *, -1)               = IllegalArgumentException
-     * distance.compare("","", 0)               = 0
-     * distance.compare("aaapppp", "", 8)       = 7
-     * distance.compare("aaapppp", "", 7)       = 7
-     * distance.compare("aaapppp", "", 6))      = -1
-     * distance.compare("elephant", "hippo", 7) = 7
-     * distance.compare("elephant", "hippo", 6) = -1
-     * distance.compare("hippo", "elephant", 7) = 7
-     * distance.compare("hippo", "elephant", 6) = -1
+     * limitedCompare(null, *, *)             = IllegalArgumentException
+     * limitedCompare(*, null, *)             = IllegalArgumentException
+     * limitedCompare(*, *, -1)               = IllegalArgumentException
+     * limitedCompare("","", 0)               = 0
+     * limitedCompare("aaapppp", "", 8)       = 7
+     * limitedCompare("aaapppp", "", 7)       = 7
+     * limitedCompare("aaapppp", "", 6))      = -1
+     * limitedCompare("elephant", "hippo", 7) = 7
+     * limitedCompare("elephant", "hippo", 6) = -1
+     * limitedCompare("hippo", "elephant", 7) = 7
+     * limitedCompare("hippo", "elephant", 6) = -1
      * 
* * @param left the first string, must not be null * @param right the second string, must not be null * @param threshold the target threshold, must not be negative - * @return result distance - * @throws IllegalArgumentException if either String input {@code null} or - * negative threshold + * @return result distance, or -1 */ - public Integer compare(CharSequence left, CharSequence right, int threshold) { + private static int limitedCompare(CharSequence left, CharSequence right, int threshold) { if (left == null || right == null) { throw new IllegalArgumentException("Strings must not be null"); } @@ -124,7 +158,7 @@ public Integer compare(CharSequence left, CharSequence right, int threshold) { * equal to the threshold value, returning -1 if it's greater. The * advantage is performance: unbounded distance is O(nm), but a bound of * k allows us to reduce it to O(km) time by only computing a diagonal - * stripe of width 2k + 1 of the cost table. It is also possible to use + * stripe of width 2k + 1 of the cost table. It is also possible to use* this to compute the unbounded Levenshtein distance by starting the * this to compute the unbounded Levenshtein distance by starting the * threshold at 1 and doubling each time until the distance is found; * this is O(dm), where d is the distance. @@ -143,8 +177,16 @@ public Integer compare(CharSequence left, CharSequence right, int threshold) { * and our threshold is 1. In this case we're going to walk a stripe of * length 3. The matrix would look like so: * - * 1 2 3 4 5 1 |#|#| | | | 2 |#|#|#| | | 3 | |#|#|#| | 4 | | |#|#|#| 5 | - * | | |#|#| 6 | | | | |#| 7 | | | | | | + *
+         *    1 2 3 4 5
+         * 1 |#|#| | | |
+         * 2 |#|#|#| | |
+         * 3 | |#|#|#| |
+         * 4 | | |#|#|#|
+         * 5 | | | |#|#|
+         * 6 | | | | |#|
+         * 7 | | | | | |
+         * 
* * Note how the stripe leads off the table as there is no possible way * to turn a string of length 5 into one of length 7 in edit distance of @@ -243,4 +285,113 @@ public Integer compare(CharSequence left, CharSequence right, int threshold) { return -1; } + /** + *

Find the Levenshtein distance between two Strings.

+ * + *

A higher score indicates a greater distance.

+ * + *

The previous implementation of the Levenshtein distance algorithm + * was from http://www.merriampark.com/ld.htm

+ * + *

Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError + * which can occur when my Java implementation is used with very large strings.
+ * This implementation of the Levenshtein distance algorithm + * is from http://www.merriampark.com/ldjava.htm

+ * + *
+     * unlimitedCompare(null, *)             = IllegalArgumentException
+     * unlimitedCompare(*, null)             = IllegalArgumentException
+     * unlimitedCompare("","")               = 0
+     * unlimitedCompare("","a")              = 1
+     * unlimitedCompare("aaapppp", "")       = 7
+     * unlimitedCompare("frog", "fog")       = 1
+     * unlimitedCompare("fly", "ant")        = 3
+     * unlimitedCompare("elephant", "hippo") = 7
+     * unlimitedCompare("hippo", "elephant") = 7
+     * unlimitedCompare("hippo", "zzzzzzzz") = 8
+     * unlimitedCompare("hello", "hallo")    = 1
+     * 
+ * + * @param s the first String, must not be null + * @param t the second String, must not be null + * @return result distance, or -1 + * @throws IllegalArgumentException if either String input {@code null} + */ + private static int unlimitedCompare(CharSequence s, CharSequence t) { + if (s == null || t == null) { + throw new IllegalArgumentException("Strings must not be null"); + } + + /* + The difference between this impl. and the previous is that, rather + than creating and retaining a matrix of size s.length() + 1 by t.length() + 1, + we maintain two single-dimensional arrays of length s.length() + 1. The first, d, + is the 'current working' distance array that maintains the newest distance cost + counts as we iterate through the characters of String s. Each time we increment + the index of String t we are comparing, d is copied to p, the second int[]. Doing so + allows us to retain the previous cost counts as required by the algorithm (taking + the minimum of the cost count to the left, up one, and diagonally up and to the left + of the current cost count being calculated). (Note that the arrays aren't really + copied anymore, just switched...this is clearly much better than cloning an array + or doing a System.arraycopy() each time through the outer loop.) + + Effectively, the difference between the two implementations is this one does not + cause an out of memory condition when calculating the LD over two very large strings. 
+ */ + + int n = s.length(); // length of s + int m = t.length(); // length of t + + if (n == 0) { + return m; + } else if (m == 0) { + return n; + } + + if (n > m) { + // swap the input strings to consume less memory + final CharSequence tmp = s; + s = t; + t = tmp; + n = m; + m = t.length(); + } + + int p[] = new int[n + 1]; //'previous' cost array, horizontally + int d[] = new int[n + 1]; // cost array, horizontally + int _d[]; //placeholder to assist in swapping p and d + + // indexes into strings s and t + int i; // iterates through s + int j; // iterates through t + + char t_j; // jth character of t + + int cost; // cost + + for (i = 0; i <= n; i++) { + p[i] = i; + } + + for (j = 1; j <= m; j++) { + t_j = t.charAt(j - 1); + d[0] = j; + + for (i = 1; i <= n; i++) { + cost = s.charAt(i - 1) == t_j ? 0 : 1; + // minimum of cell to the left+1, to the top+1, diagonally left and up +cost + d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost); + } + + // copy current distance counts to 'previous row' distance counts + _d = p; + p = d; + d = _d; + } + + // our last action in the above loop was to switch d and p, so p now + // actually has the most recent cost counts + return p[n]; + } + } diff --git a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java index 1d4e6fa2ce..ea5bd348e6 100644 --- a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java +++ b/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java @@ -18,6 +18,9 @@ import static org.junit.Assert.assertEquals; +import java.util.Map; +import java.util.TreeMap; + import org.junit.BeforeClass; import org.junit.Test; @@ -26,15 +29,10 @@ */ public class LevenshteinDistanceTest { - private static LevenshteinDistance distance; - - @BeforeClass - public static void setUp() { - distance = new LevenshteinDistance(); - } - @Test public void 
testGetLevenshteinDistance_StringString() { + LevenshteinDistance distance = new LevenshteinDistance(); + assertEquals(0, (int) distance.compare("", "")); assertEquals(1, (int) distance.compare("", "a")); assertEquals(7, (int) distance.compare("aaapppp", "")); @@ -49,98 +47,112 @@ public void testGetLevenshteinDistance_StringString() { @Test(expected = IllegalArgumentException.class) public void testGetLevenshteinDistance_NullString() throws Exception { + LevenshteinDistance distance = new LevenshteinDistance(); + distance.compare("a", null); } @Test(expected = IllegalArgumentException.class) public void testGetLevenshteinDistance_StringNull() throws Exception { + LevenshteinDistance distance = new LevenshteinDistance(); + distance.compare(null, "a"); } @Test public void testGetLevenshteinDistance_StringStringInt() { + Map distances = new TreeMap(); + for (int threshold : new int[] { 0, 1, 2, 3, 4, 6, 7, 8, 10, Integer.MAX_VALUE }) { + distances.put(threshold, new LevenshteinDistance(threshold)); + } + // empty strings - assertEquals(0, (int) distance.compare("", "", 0)); - assertEquals(7, (int) distance.compare("aaapppp", "", 8)); - assertEquals(7, (int) distance.compare("aaapppp", "", 7)); - assertEquals(-1, (int) distance.compare("aaapppp", "", 6)); + assertEquals(0, (int) distances.get(0).compare("", "")); + assertEquals(7, (int) distances.get(8).compare("aaapppp", "")); + assertEquals(7, (int) distances.get(7).compare("aaapppp", "")); + assertEquals(-1, (int) distances.get(6).compare("aaapppp", "")); // unequal strings, zero threshold - assertEquals(-1, (int) distance.compare("b", "a", 0)); - assertEquals(-1, (int) distance.compare("a", "b", 0)); + assertEquals(-1, (int) distances.get(0).compare("b", "a")); + assertEquals(-1, (int) distances.get(0).compare("a", "b")); // equal strings - assertEquals(0, (int) distance.compare("aa", "aa", 0)); - assertEquals(0, (int) distance.compare("aa", "aa", 2)); + assertEquals(0, (int) distances.get(0).compare("aa", "aa")); 
+ assertEquals(0, (int) distances.get(2).compare("aa", "aa")); // same length - assertEquals(-1, (int) distance.compare("aaa", "bbb", 2)); - assertEquals(3, (int) distance.compare("aaa", "bbb", 3)); + assertEquals(-1, (int) distances.get(2).compare("aaa", "bbb")); + assertEquals(3, (int) distances.get(3).compare("aaa", "bbb")); // big stripe - assertEquals(6, (int) distance.compare("aaaaaa", "b", 10)); + assertEquals(6, (int) distances.get(10).compare("aaaaaa", "b")); // distance less than threshold - assertEquals(7, (int) distance.compare("aaapppp", "b", 8)); - assertEquals(3, (int) distance.compare("a", "bbb", 4)); + assertEquals(7, (int) distances.get(8).compare("aaapppp", "b")); + assertEquals(3, (int) distances.get(4).compare("a", "bbb")); // distance equal to threshold - assertEquals(7, (int) distance.compare("aaapppp", "b", 7)); - assertEquals(3, (int) distance.compare("a", "bbb", 3)); + assertEquals(7, (int) distances.get(7).compare("aaapppp", "b")); + assertEquals(3, (int) distances.get(3).compare("a", "bbb")); // distance greater than threshold - assertEquals(-1, (int) distance.compare("a", "bbb", 2)); - assertEquals(-1, (int) distance.compare("bbb", "a", 2)); - assertEquals(-1, (int) distance.compare("aaapppp", "b", 6)); + assertEquals(-1, (int) distances.get(2).compare("a", "bbb")); + assertEquals(-1, (int) distances.get(2).compare("bbb", "a")); + assertEquals(-1, (int) distances.get(6).compare("aaapppp", "b")); // stripe runs off array, strings not similar - assertEquals(-1, (int) distance.compare("a", "bbb", 1)); - assertEquals(-1, (int) distance.compare("bbb", "a", 1)); + assertEquals(-1, (int) distances.get(1).compare("a", "bbb")); + assertEquals(-1, (int) distances.get(1).compare("bbb", "a")); // stripe runs off array, strings are similar - assertEquals(-1, (int) distance.compare("12345", "1234567", 1)); - assertEquals(-1, (int) distance.compare("1234567", "12345", 1)); + assertEquals(-1, (int) distances.get(1).compare("12345", "1234567")); + 
assertEquals(-1, (int) distances.get(1).compare("1234567", "12345")); // old getLevenshteinDistance test cases - assertEquals(1, (int) distance.compare("frog", "fog", 1)); - assertEquals(3, (int) distance.compare("fly", "ant", 3)); - assertEquals(7, (int) distance.compare("elephant", "hippo", 7)); - assertEquals(-1, (int) distance.compare("elephant", "hippo", 6)); - assertEquals(7, (int) distance.compare("hippo", "elephant", 7)); - assertEquals(-1, (int) distance.compare("hippo", "elephant", 6)); - assertEquals(8, (int) distance.compare("hippo", "zzzzzzzz", 8)); - assertEquals(8, (int) distance.compare("zzzzzzzz", "hippo", 8)); - assertEquals(1, (int) distance.compare("hello", "hallo", 1)); + assertEquals(1, (int) distances.get(1).compare("frog", "fog")); + assertEquals(3, (int) distances.get(3).compare("fly", "ant")); + assertEquals(7, (int) distances.get(7).compare("elephant", "hippo")); + assertEquals(-1, (int) distances.get(6).compare("elephant", "hippo")); + assertEquals(7, (int) distances.get(7).compare("hippo", "elephant")); + assertEquals(-1, (int) distances.get(6).compare("hippo", "elephant")); + assertEquals(8, (int) distances.get(8).compare("hippo", "zzzzzzzz")); + assertEquals(8, (int) distances.get(8).compare("zzzzzzzz", "hippo")); + assertEquals(1, (int) distances.get(1).compare("hello", "hallo")); assertEquals(1, - (int) distance.compare("frog", "fog", Integer.MAX_VALUE)); - assertEquals(3, (int) distance.compare("fly", "ant", Integer.MAX_VALUE)); + (int) distances.get(Integer.MAX_VALUE).compare("frog", "fog")); + assertEquals(3, (int) distances.get(Integer.MAX_VALUE).compare("fly", "ant")); assertEquals(7, - (int) distance.compare("elephant", "hippo", Integer.MAX_VALUE)); + (int) distances.get(Integer.MAX_VALUE).compare("elephant", "hippo")); assertEquals(7, - (int) distance.compare("hippo", "elephant", Integer.MAX_VALUE)); + (int) distances.get(Integer.MAX_VALUE).compare("hippo", "elephant")); assertEquals(8, - (int) distance.compare("hippo", 
"zzzzzzzz", Integer.MAX_VALUE)); + (int) distances.get(Integer.MAX_VALUE).compare("hippo", "zzzzzzzz")); assertEquals(8, - (int) distance.compare("zzzzzzzz", "hippo", Integer.MAX_VALUE)); + (int) distances.get(Integer.MAX_VALUE).compare("zzzzzzzz", "hippo")); assertEquals(1, - (int) distance.compare("hello", "hallo", Integer.MAX_VALUE)); + (int) distances.get(Integer.MAX_VALUE).compare("hello", "hallo")); } @Test(expected = IllegalArgumentException.class) public void testGetLevenshteinDistance_NullStringInt() throws Exception { - distance.compare(null, "a", 0); + LevenshteinDistance distance = new LevenshteinDistance(0); + + distance.compare(null, "a"); } @Test(expected = IllegalArgumentException.class) public void testGetLevenshteinDistance_StringNullInt() throws Exception { - distance.compare("a", null, 0); + LevenshteinDistance distance = new LevenshteinDistance(0); + + distance.compare("a", null); } @Test(expected = IllegalArgumentException.class) - public void testGetLevenshteinDistance_StringStringNegativeInt() + public void testConstructorWithNegativeThreshold() throws Exception { - distance.compare("a", "a", -1); + + LevenshteinDistance distance = new LevenshteinDistance(-1); } } From e1148a951d23c677f15aa9a94e8cb4c0a8409ca0 Mon Sep 17 00:00:00 2001 From: j--baker Date: Tue, 3 Mar 2015 13:54:38 -0500 Subject: [PATCH 2/8] commons-text - SANDBOX-491: - Add "locale" field to FuzzyDistance. Add default and Locale constructor. Add static getDefaultInstance(). A null locale will cause Locale.getLocale() to be called each time compare(left, right) is called. - Fix method name type in FuzzyScoreTest. - Remove tests that no longer fail because of a null Locale. The algorithm is no longer public. If the field is null, Locale.getLocale() is used. 
--- .../commons/text/similarity/FuzzyScore.java | 40 ++++++++++++++++-- .../text/similarity/FuzzyScoreTest.java | 42 +++++++++---------- 2 files changed, 56 insertions(+), 26 deletions(-) diff --git a/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java b/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java index 3e72d05a77..9c32ced10c 100644 --- a/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java +++ b/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java @@ -33,10 +33,35 @@ */ public class FuzzyScore implements StringMetric { + public static final FuzzyScore DEFAULT_INSTANCE = new FuzzyScore(); + + private final Locale locale; + + /* + *

+ * This returns an instance that call {@link Locale#getLocale()} + * to get a {@link Locale} for each + * call to {@link #compare(CharSequence, CharSequence)} + *

+ * + * @see {@link #getDefaultInstance()} + */ + public FuzzyScore() { + this(null); + } + + public FuzzyScore(final Locale locale) { + this.locale = locale; + } + /** *

* Find the Fuzzy Score which indicates the similarity score between two - * Strings. This method uses the default locale. + * Strings. + *

+ *

+ * If this instance's {@link Locale} is null, the current default will be + * determined using {@link Locale#getDefault()}. *

* * @param term a full term that should be matched against, must not be null @@ -47,7 +72,8 @@ public class FuzzyScore implements StringMetric { */ @Override public Integer compare(CharSequence term, CharSequence query) { - return compare(term, query, Locale.getDefault()); + Locale locale = this.locale != null ? this.locale : Locale.getDefault(); + return compare(term, query, locale); } /** @@ -76,7 +102,7 @@ public Integer compare(CharSequence term, CharSequence query) { * @throws IllegalArgumentException if either String input {@code null} or * Locale input {@code null} */ - public Integer compare(CharSequence term, CharSequence query, Locale locale) { + private static int compare(CharSequence term, CharSequence query, Locale locale) { if (term == null || query == null) { throw new IllegalArgumentException("Strings must not be null"); } else if (locale == null) { @@ -130,4 +156,12 @@ public Integer compare(CharSequence term, CharSequence query, Locale locale) { return score; } + public static FuzzyScore getDefaultInstance() { + return DEFAULT_INSTANCE; + } + + public Locale getLocale() { + return locale; + } + } diff --git a/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java b/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java index b2fab14074..3d6bcd94e3 100644 --- a/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java +++ b/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java @@ -28,48 +28,44 @@ */ public class FuzzyScoreTest { - private static FuzzyScore score; - - @BeforeClass - public static void setUp() { - score = new FuzzyScore(); - } - @Test public void testGetFuzzyScore() throws Exception { - assertEquals(0, (int) score.compare("", "", Locale.ENGLISH)); + FuzzyScore score = new FuzzyScore(Locale.ENGLISH); + + assertEquals(0, (int) score.compare("", "")); assertEquals(0, - (int) score.compare("Workshop", "b", Locale.ENGLISH)); + (int) score.compare("Workshop", "b")); assertEquals(1, - 
(int) score.compare("Room", "o", Locale.ENGLISH)); + (int) score.compare("Room", "o")); assertEquals(1, - (int) score.compare("Workshop", "w", Locale.ENGLISH)); + (int) score.compare("Workshop", "w")); assertEquals(2, - (int) score.compare("Workshop", "ws", Locale.ENGLISH)); + (int) score.compare("Workshop", "ws")); assertEquals(4, - (int) score.compare("Workshop", "wo", Locale.ENGLISH)); + (int) score.compare("Workshop", "wo")); assertEquals(3, (int) score.compare( - "Apache Software Foundation", "asf", Locale.ENGLISH)); + "Apache Software Foundation", "asf")); } @Test(expected = IllegalArgumentException.class) - public void testGetFuzzyScore_NullNullNull() throws Exception { - score.compare(null, null, null); - } + public void testGetFuzzyScore_StringNullLocale() throws Exception { + FuzzyScore score = new FuzzyScore(Locale.ENGLISH); - @Test(expected = IllegalArgumentException.class) - public void testGetFuzzyScore_StringNullLoclae() throws Exception { - score.compare(" ", null, Locale.ENGLISH); + score.compare("not null", null); } @Test(expected = IllegalArgumentException.class) public void testGetFuzzyScore_NullStringLocale() throws Exception { - score.compare(null, "clear", Locale.ENGLISH); + FuzzyScore score = new FuzzyScore(Locale.ENGLISH); + + score.compare(null, "not null"); } @Test(expected = IllegalArgumentException.class) - public void testGetFuzzyScore_StringStringNull() throws Exception { - score.compare(" ", "clear", null); + public void testGetFuzzyScore_NullNullLocale() throws Exception { + FuzzyScore score = new FuzzyScore(Locale.ENGLISH); + + score.compare(null, null); } } From 98d7b6ae89061abdabe9397025da55cc05c91497 Mon Sep 17 00:00:00 2001 From: j--baker Date: Tue, 3 Mar 2015 14:08:45 -0500 Subject: [PATCH 3/8] commons-text - SANDBOX-491: - Fix "s" and "t" -> "left" and "right" variable in Levenshtein distance algorithms. 
--- .../text/similarity/LevenshteinDistance.java | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java index e1bb08b7be..38e5f14b1f 100644 --- a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java +++ b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java @@ -93,16 +93,16 @@ public LevenshteinDistance(final Integer threshold) { * distance.compare("hello", "hallo") = 1 * * - * @param s the first String, must not be null - * @param t the second String, must not be null + * @param left the first string, must not be null + * @param right the second string, must not be null * @return result distance, or -1 * @throws IllegalArgumentException if either String input {@code null} */ - public Integer compare(CharSequence s, CharSequence t) { + public Integer compare(CharSequence left, CharSequence right) { if (threshold != null) { - return limitedCompare(s, t, threshold); + return limitedCompare(left, right, threshold); } else { - return unlimitedCompare(s, t); + return unlimitedCompare(left, right); } } @@ -203,8 +203,8 @@ private static int limitedCompare(CharSequence left, CharSequence right, int thr * some discussion. 
*/ - int n = left.length(); // length of s - int m = right.length(); // length of t + int n = left.length(); // length of left + int m = right.length(); // length of right // if one string is empty, the edit distance is necessarily the length // of the other @@ -239,7 +239,7 @@ private static int limitedCompare(CharSequence left, CharSequence right, int thr // iterates through t for (int j = 1; j <= m; j++) { - final char t_j = right.charAt(j - 1); // jth character of t + final char right_j = right.charAt(j - 1); // jth character of right d[0] = j; // compute stripe indices, constrain to array size @@ -260,7 +260,7 @@ private static int limitedCompare(CharSequence left, CharSequence right, int thr // iterates through [min, max] in s for (int i = min; i <= max; i++) { - if (left.charAt(i - 1) == t_j) { + if (left.charAt(i - 1) == right_j) { // diagonally left and up d[i] = p[i - 1]; } else { @@ -312,13 +312,13 @@ private static int limitedCompare(CharSequence left, CharSequence right, int thr * unlimitedCompare("hello", "hallo") = 1 * * - * @param s the first String, must not be null - * @param t the second String, must not be null + * @param left the first String, must not be null + * @param right the second String, must not be null * @return result distance, or -1 * @throws IllegalArgumentException if either String input {@code null} */ - private static int unlimitedCompare(CharSequence s, CharSequence t) { - if (s == null || t == null) { + private static int unlimitedCompare(CharSequence left, CharSequence right) { + if (left == null || right == null) { throw new IllegalArgumentException("Strings must not be null"); } @@ -339,8 +339,8 @@ allows us to retain the previous cost counts as required by the algorithm (takin cause an out of memory condition when calculating the LD over two very large strings. 
 */ - int n = s.length(); // length of s - int m = t.length(); // length of t + int n = left.length(); // length of left + int m = right.length(); // length of right if (n == 0) { return m; @@ -350,22 +350,22 @@ allows us to retain the previous cost counts as required by the algorithm (takin if (n > m) { // swap the input strings to consume less memory - final CharSequence tmp = s; - s = t; - t = tmp; + final CharSequence tmp = left; + left = right; + right = tmp; n = m; - m = t.length(); + m = right.length(); } int p[] = new int[n + 1]; //'previous' cost array, horizontally int d[] = new int[n + 1]; // cost array, horizontally int _d[]; //placeholder to assist in swapping p and d - // indexes into strings s and t - int i; // iterates through s - int j; // iterates through t + // indexes into strings left and right + int i; // iterates through left + int j; // iterates through right - char t_j; // jth character of t + char right_j; // jth character of right int cost; // cost @@ -374,11 +374,11 @@ allows us to retain the previous cost counts as required by the algorithm (takin } for (j = 1; j <= m; j++) { - t_j = t.charAt(j - 1); + right_j = right.charAt(j - 1); d[0] = j; for (i = 1; i <= n; i++) { - cost = s.charAt(i - 1) == t_j ? 0 : 1; + cost = left.charAt(i - 1) == right_j ? 0 : 1; // minimum of cell to the left+1, to the top+1, diagonally left and up +cost d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost); } From a31d8d00045b2d116a29a26fbedf251b297ad171 Mon Sep 17 00:00:00 2001 From: j--baker Date: Tue, 3 Mar 2015 14:47:47 -0500 Subject: [PATCH 4/8] commons-text - SANDBOX-491: - Remove unused org.junit.BeforeClass imports from FuzzyScoreTest and LevenshteinDistanceTest. 
--- .../java/org/apache/commons/text/similarity/FuzzyScoreTest.java | 1 - .../apache/commons/text/similarity/LevenshteinDistanceTest.java | 1 - 2 files changed, 2 deletions(-) diff --git a/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java b/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java index 3d6bcd94e3..b54215df8d 100644 --- a/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java +++ b/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java @@ -20,7 +20,6 @@ import java.util.Locale; -import org.junit.BeforeClass; import org.junit.Test; /** diff --git a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java index ea5bd348e6..385049a7fd 100644 --- a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java +++ b/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java @@ -21,7 +21,6 @@ import java.util.Map; import java.util.TreeMap; -import org.junit.BeforeClass; import org.junit.Test; /** From 390d1d7f67d01c901cd3e7d59b8366fcabc785d1 Mon Sep 17 00:00:00 2001 From: j--baker Date: Tue, 3 Mar 2015 16:07:56 -0500 Subject: [PATCH 5/8] commons-text - SANDBOX-491: - Remove the Locale.getDefault() logic in FuzzyScore. There is no longer a DEFAULT_INSTANCE and the Locale may not be null. 
--- .../commons/text/similarity/FuzzyScore.java | 56 ++++--------------- .../text/similarity/FuzzyScoreTest.java | 5 ++ 2 files changed, 17 insertions(+), 44 deletions(-) diff --git a/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java b/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java index 9c32ced10c..3cf6df24f6 100644 --- a/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java +++ b/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java @@ -33,47 +33,22 @@ */ public class FuzzyScore implements StringMetric { - public static final FuzzyScore DEFAULT_INSTANCE = new FuzzyScore(); - private final Locale locale; - /* - *

<p>
- * This returns an instance that call {@link Locale#getLocale()} - * to get a {@link Locale} for each - * call to {@link #compare(CharSequence, CharSequence)} - *
</p>
- * - * @see {@link #getDefaultInstance()} - */ - public FuzzyScore() { - this(null); - } - - public FuzzyScore(final Locale locale) { - this.locale = locale; - } /** - *

<p>
- * Find the Fuzzy Score which indicates the similarity score between two - * Strings. - *
</p>
- *
<p>
- * If this instance's {@link Locale} is null, the current default will be - * determined using {@link Locale#getDefault()}. - *
</p>
+ * <p>This returns a {@link Locale}-specific {@link FuzzyScore}.</p>
* - * @param term a full term that should be matched against, must not be null - * @param query the query that will be matched against a term, must not be - * null - * @return result score - * @throws IllegalArgumentException if either String input {@code null} + * @param locale The string matching logic is case insensitive. + A {@link Locale} is necessary to normalize both Strings to lower case. + * @throws IllegalArgumentException + * This is thrown if the {@link Locale} parameter is {@code null}. */ - @Override - public Integer compare(CharSequence term, CharSequence query) { - Locale locale = this.locale != null ? this.locale : Locale.getDefault(); - return compare(term, query, locale); + public FuzzyScore(final Locale locale) { + if (locale == null) { + throw new IllegalArgumentException("Locale must not be null"); + } + this.locale = locale; } /** @@ -96,17 +71,14 @@ public Integer compare(CharSequence term, CharSequence query) { * @param term a full term that should be matched against, must not be null * @param query the query that will be matched against a term, must not be * null - * @param locale This string matching logic is case insensitive. A locale is - * necessary to normalize both Strings to lower case. * @return result score * @throws IllegalArgumentException if either String input {@code null} or * Locale input {@code null} */ - private static int compare(CharSequence term, CharSequence query, Locale locale) { + @Override + public Integer compare(CharSequence term, CharSequence query) { if (term == null || query == null) { throw new IllegalArgumentException("Strings must not be null"); - } else if (locale == null) { - throw new IllegalArgumentException("Locale must not be null"); } // fuzzy logic is case insensitive. 
We normalize the Strings to lower @@ -156,10 +128,6 @@ private static int compare(CharSequence term, CharSequence query, Locale locale) return score; } - public static FuzzyScore getDefaultInstance() { - return DEFAULT_INSTANCE; - } - public Locale getLocale() { return locale; } diff --git a/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java b/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java index b54215df8d..5d5d995bde 100644 --- a/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java +++ b/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java @@ -67,4 +67,9 @@ public void testGetFuzzyScore_NullNullLocale() throws Exception { score.compare(null, null); } + @Test(expected = IllegalArgumentException.class) + public void testMissingLocale() throws Exception { + FuzzyScore score = new FuzzyScore((Locale) null); + } + } From 10cfb00ca8a8d21cc3109ca90fbdbe21ca1c014f Mon Sep 17 00:00:00 2001 From: j--baker Date: Wed, 4 Mar 2015 12:43:14 -0500 Subject: [PATCH 6/8] commons-test - SANDBOX-491: - Replace instances of FuzzyScore(Locale.ENGLISH) in FuzzyScoreTest with a static final ENGLISH_SCORE constant. - Replace instances of LevenshteinDistance() in LevenshteinDistanceTest with a static final UNLIMITED_DISTANCE constant. - Convert LevenshteinDistanceTest.testGetLevenshteinDistance_StringStringInt() into ParameterizedLevenshteinDistanceTest. 
--- .../text/similarity/FuzzyScoreTest.java | 37 ++--- .../similarity/LevenshteinDistanceTest.java | 122 +++-------------- .../ParameterizedLevenshteinDistanceTest.java | 127 ++++++++++++++++++ 3 files changed, 157 insertions(+), 129 deletions(-) create mode 100644 src/test/java/org/apache/commons/text/similarity/ParameterizedLevenshteinDistanceTest.java diff --git a/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java b/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java index 5d5d995bde..2ccc60d0f0 100644 --- a/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java +++ b/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java @@ -27,44 +27,33 @@ */ public class FuzzyScoreTest { + private static final FuzzyScore ENGLISH_SCORE = new FuzzyScore(Locale.ENGLISH); + @Test public void testGetFuzzyScore() throws Exception { - FuzzyScore score = new FuzzyScore(Locale.ENGLISH); - - assertEquals(0, (int) score.compare("", "")); - assertEquals(0, - (int) score.compare("Workshop", "b")); - assertEquals(1, - (int) score.compare("Room", "o")); - assertEquals(1, - (int) score.compare("Workshop", "w")); - assertEquals(2, - (int) score.compare("Workshop", "ws")); - assertEquals(4, - (int) score.compare("Workshop", "wo")); - assertEquals(3, (int) score.compare( - "Apache Software Foundation", "asf")); + assertEquals(0, (int) ENGLISH_SCORE.compare("", "")); + assertEquals(0, (int) ENGLISH_SCORE.compare("Workshop", "b")); + assertEquals(1, (int) ENGLISH_SCORE.compare("Room", "o")); + assertEquals(1, (int) ENGLISH_SCORE.compare("Workshop", "w")); + assertEquals(2, (int) ENGLISH_SCORE.compare("Workshop", "ws")); + assertEquals(4, (int) ENGLISH_SCORE.compare("Workshop", "wo")); + assertEquals(3, (int) ENGLISH_SCORE.compare( + "Apache Software Foundation", "asf")); } @Test(expected = IllegalArgumentException.class) public void testGetFuzzyScore_StringNullLocale() throws Exception { - FuzzyScore score = new 
FuzzyScore(Locale.ENGLISH); - - score.compare("not null", null); + ENGLISH_SCORE.compare("not null", null); } @Test(expected = IllegalArgumentException.class) public void testGetFuzzyScore_NullStringLocale() throws Exception { - FuzzyScore score = new FuzzyScore(Locale.ENGLISH); - - score.compare(null, "not null"); + ENGLISH_SCORE.compare(null, "not null"); } @Test(expected = IllegalArgumentException.class) public void testGetFuzzyScore_NullNullLocale() throws Exception { - FuzzyScore score = new FuzzyScore(Locale.ENGLISH); - - score.compare(null, null); + ENGLISH_SCORE.compare(null, null); } @Test(expected = IllegalArgumentException.class) diff --git a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java index 385049a7fd..14b842c205 100644 --- a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java +++ b/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java @@ -18,9 +18,6 @@ import static org.junit.Assert.assertEquals; -import java.util.Map; -import java.util.TreeMap; - import org.junit.Test; /** @@ -28,129 +25,44 @@ */ public class LevenshteinDistanceTest { + private static final LevenshteinDistance UNLIMITED_DISTANCE = new LevenshteinDistance(); + @Test public void testGetLevenshteinDistance_StringString() { - LevenshteinDistance distance = new LevenshteinDistance(); - - assertEquals(0, (int) distance.compare("", "")); - assertEquals(1, (int) distance.compare("", "a")); - assertEquals(7, (int) distance.compare("aaapppp", "")); - assertEquals(1, (int) distance.compare("frog", "fog")); - assertEquals(3, (int) distance.compare("fly", "ant")); - assertEquals(7, (int) distance.compare("elephant", "hippo")); - assertEquals(7, (int) distance.compare("hippo", "elephant")); - assertEquals(8, (int) distance.compare("hippo", "zzzzzzzz")); - assertEquals(8, (int) distance.compare("zzzzzzzz", "hippo")); - assertEquals(1, 
(int) distance.compare("hello", "hallo")); + assertEquals(0, (int) UNLIMITED_DISTANCE.compare("", "")); + assertEquals(1, (int) UNLIMITED_DISTANCE.compare("", "a")); + assertEquals(7, (int) UNLIMITED_DISTANCE.compare("aaapppp", "")); + assertEquals(1, (int) UNLIMITED_DISTANCE.compare("frog", "fog")); + assertEquals(3, (int) UNLIMITED_DISTANCE.compare("fly", "ant")); + assertEquals(7, (int) UNLIMITED_DISTANCE.compare("elephant", "hippo")); + assertEquals(7, (int) UNLIMITED_DISTANCE.compare("hippo", "elephant")); + assertEquals(8, (int) UNLIMITED_DISTANCE.compare("hippo", "zzzzzzzz")); + assertEquals(8, (int) UNLIMITED_DISTANCE.compare("zzzzzzzz", "hippo")); + assertEquals(1, (int) UNLIMITED_DISTANCE.compare("hello", "hallo")); } @Test(expected = IllegalArgumentException.class) public void testGetLevenshteinDistance_NullString() throws Exception { - LevenshteinDistance distance = new LevenshteinDistance(); - - distance.compare("a", null); + UNLIMITED_DISTANCE.compare("a", null); } @Test(expected = IllegalArgumentException.class) public void testGetLevenshteinDistance_StringNull() throws Exception { - LevenshteinDistance distance = new LevenshteinDistance(); - - distance.compare(null, "a"); - } - - @Test - public void testGetLevenshteinDistance_StringStringInt() { - Map distances = new TreeMap(); - for (int threshold : new int[] { 0, 1, 2, 3, 4, 6, 7, 8, 10, Integer.MAX_VALUE }) { - distances.put(threshold, new LevenshteinDistance(threshold)); - } - - // empty strings - assertEquals(0, (int) distances.get(0).compare("", "")); - assertEquals(7, (int) distances.get(8).compare("aaapppp", "")); - assertEquals(7, (int) distances.get(7).compare("aaapppp", "")); - assertEquals(-1, (int) distances.get(6).compare("aaapppp", "")); - - // unequal strings, zero threshold - assertEquals(-1, (int) distances.get(0).compare("b", "a")); - assertEquals(-1, (int) distances.get(0).compare("a", "b")); - - // equal strings - assertEquals(0, (int) distances.get(0).compare("aa", "aa")); - 
assertEquals(0, (int) distances.get(2).compare("aa", "aa")); - - // same length - assertEquals(-1, (int) distances.get(2).compare("aaa", "bbb")); - assertEquals(3, (int) distances.get(3).compare("aaa", "bbb")); - - // big stripe - assertEquals(6, (int) distances.get(10).compare("aaaaaa", "b")); - - // distance less than threshold - assertEquals(7, (int) distances.get(8).compare("aaapppp", "b")); - assertEquals(3, (int) distances.get(4).compare("a", "bbb")); - - // distance equal to threshold - assertEquals(7, (int) distances.get(7).compare("aaapppp", "b")); - assertEquals(3, (int) distances.get(3).compare("a", "bbb")); - - // distance greater than threshold - assertEquals(-1, (int) distances.get(2).compare("a", "bbb")); - assertEquals(-1, (int) distances.get(2).compare("bbb", "a")); - assertEquals(-1, (int) distances.get(6).compare("aaapppp", "b")); - - // stripe runs off array, strings not similar - assertEquals(-1, (int) distances.get(1).compare("a", "bbb")); - assertEquals(-1, (int) distances.get(1).compare("bbb", "a")); - - // stripe runs off array, strings are similar - assertEquals(-1, (int) distances.get(1).compare("12345", "1234567")); - assertEquals(-1, (int) distances.get(1).compare("1234567", "12345")); - - // old getLevenshteinDistance test cases - assertEquals(1, (int) distances.get(1).compare("frog", "fog")); - assertEquals(3, (int) distances.get(3).compare("fly", "ant")); - assertEquals(7, (int) distances.get(7).compare("elephant", "hippo")); - assertEquals(-1, (int) distances.get(6).compare("elephant", "hippo")); - assertEquals(7, (int) distances.get(7).compare("hippo", "elephant")); - assertEquals(-1, (int) distances.get(6).compare("hippo", "elephant")); - assertEquals(8, (int) distances.get(8).compare("hippo", "zzzzzzzz")); - assertEquals(8, (int) distances.get(8).compare("zzzzzzzz", "hippo")); - assertEquals(1, (int) distances.get(1).compare("hello", "hallo")); - - assertEquals(1, - (int) distances.get(Integer.MAX_VALUE).compare("frog", "fog")); 
- assertEquals(3, (int) distances.get(Integer.MAX_VALUE).compare("fly", "ant")); - assertEquals(7, - (int) distances.get(Integer.MAX_VALUE).compare("elephant", "hippo")); - assertEquals(7, - (int) distances.get(Integer.MAX_VALUE).compare("hippo", "elephant")); - assertEquals(8, - (int) distances.get(Integer.MAX_VALUE).compare("hippo", "zzzzzzzz")); - assertEquals(8, - (int) distances.get(Integer.MAX_VALUE).compare("zzzzzzzz", "hippo")); - assertEquals(1, - (int) distances.get(Integer.MAX_VALUE).compare("hello", "hallo")); + UNLIMITED_DISTANCE.compare(null, "a"); } @Test(expected = IllegalArgumentException.class) public void testGetLevenshteinDistance_NullStringInt() throws Exception { - LevenshteinDistance distance = new LevenshteinDistance(0); - - distance.compare(null, "a"); + UNLIMITED_DISTANCE.compare(null, "a"); } @Test(expected = IllegalArgumentException.class) public void testGetLevenshteinDistance_StringNullInt() throws Exception { - LevenshteinDistance distance = new LevenshteinDistance(0); - - distance.compare("a", null); + UNLIMITED_DISTANCE.compare("a", null); } @Test(expected = IllegalArgumentException.class) - public void testConstructorWithNegativeThreshold() - throws Exception { - + public void testConstructorWithNegativeThreshold() throws Exception { LevenshteinDistance distance = new LevenshteinDistance(-1); } diff --git a/src/test/java/org/apache/commons/text/similarity/ParameterizedLevenshteinDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/ParameterizedLevenshteinDistanceTest.java new file mode 100644 index 0000000000..5a224219c6 --- /dev/null +++ b/src/test/java/org/apache/commons/text/similarity/ParameterizedLevenshteinDistanceTest.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +import static org.hamcrest.core.IsEqual.equalTo; +import static org.junit.Assert.assertThat; + +import java.util.Arrays; +import java.util.Map; +import java.util.TreeMap; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; + +/** + * Unit tests for {@link org.apache.commons.text.LevenshteinDistance}. 
+ */ +@RunWith(Parameterized.class) +public class ParameterizedLevenshteinDistanceTest { + + private final Integer distance; + private final CharSequence left; + private final CharSequence right; + private final Integer threshold; + + public ParameterizedLevenshteinDistanceTest( + final Integer threshold, + final CharSequence left, final CharSequence right, + final Integer distance) { + + this.threshold = threshold; + this.left = left; + this.right = right; + this.distance = distance; + } + + @Parameters + public static Iterable parameters() { + return Arrays.asList( new Object[][] { + + /* empty strings */ + { 0, "", "", 0 }, + { 8, "aaapppp", "", 7 }, + { 7, "aaapppp", "", 7 }, + { 6, "aaapppp", "", -1 }, + + /* unequal strings, zero threshold */ + { 0, "b", "a", -1 }, + { 0, "a", "b", -1 }, + + /* equal strings */ + { 0, "aa", "aa", 0 }, + { 2, "aa", "aa", 0 }, + + /* same length */ + { 2, "aaa", "bbb", -1 }, + { 3, "aaa", "bbb", 3 }, + + /* big stripe */ + { 10, "aaaaaa", "b", 6 }, + + /* distance less than threshold */ + { 8, "aaapppp", "b", 7 }, + { 4, "a", "bbb", 3 }, + + /* distance equal to threshold */ + { 7, "aaapppp", "b", 7 }, + { 3, "a", "bbb", 3 }, + + /* distance greater than threshold */ + { 2, "a", "bbb", -1 }, + { 2, "bbb", "a", -1 }, + { 6, "aaapppp", "b", -1 }, + + /* stripe runs off array, strings not similar */ + { 1, "a", "bbb", -1 }, + { 1, "bbb", "a", -1 }, + + /* stripe runs off array, strings are similar */ + { 1, "12345", "1234567", -1 }, + { 1, "1234567", "12345", -1 }, + + /* old getLevenshteinDistance test cases */ + { 1, "frog", "fog", 1 }, + { 3, "fly", "ant", 3 }, + { 7, "elephant", "hippo", 7 }, + { 6, "elephant", "hippo", -1 }, + { 7, "hippo", "elephant", 7 }, + { 6, "hippo", "elephant", -1 }, + { 8, "hippo", "zzzzzzzz", 8 }, + { 8, "zzzzzzzz", "hippo", 8 }, + { 1, "hello", "hallo", 1 }, + + { Integer.MAX_VALUE, "frog", "fog", 1 }, + { Integer.MAX_VALUE, "fly", "ant", 3 }, + { Integer.MAX_VALUE, "elephant", "hippo", 7 }, + { 
Integer.MAX_VALUE, "hippo", "elephant", 7 }, + { Integer.MAX_VALUE, "hippo", "zzzzzzzz", 8 }, + { Integer.MAX_VALUE, "zzzzzzzz", "hippo", 8 }, + { Integer.MAX_VALUE, "hello", "hallo", 1 } + + } ); + } + + @Test + public void test() { + LevenshteinDistance metric = new LevenshteinDistance(threshold); + assertThat(metric.compare(left, right), equalTo(distance)); + } + +} From aa1ce97f8ffca895e36e3401988573cd5b5b7416 Mon Sep 17 00:00:00 2001 From: j--baker Date: Wed, 4 Mar 2015 13:23:18 -0500 Subject: [PATCH 7/8] commons-text - SANDBOX-491: Add missing ".similarity" part of package path to @link's in the unit test classes. --- .../java/org/apache/commons/text/similarity/FuzzyScoreTest.java | 2 +- .../commons/text/similarity/JaroWrinklerDistanceTest.java | 2 +- .../apache/commons/text/similarity/LevenshteinDistanceTest.java | 2 +- .../text/similarity/ParameterizedLevenshteinDistanceTest.java | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java b/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java index 2ccc60d0f0..88778fc4ef 100644 --- a/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java +++ b/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java @@ -23,7 +23,7 @@ import org.junit.Test; /** - * Unit tests for {@link org.apache.commons.text.FuzzyScore}. + * Unit tests for {@link org.apache.commons.text.similarity.FuzzyScore}. 
*/ public class FuzzyScoreTest { diff --git a/src/test/java/org/apache/commons/text/similarity/JaroWrinklerDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/JaroWrinklerDistanceTest.java index 73e18f735e..7050b05ac2 100644 --- a/src/test/java/org/apache/commons/text/similarity/JaroWrinklerDistanceTest.java +++ b/src/test/java/org/apache/commons/text/similarity/JaroWrinklerDistanceTest.java @@ -22,7 +22,7 @@ import org.junit.Test; /** - * Unit tests for {@link org.apache.commons.text.JaroWrinklerDistance}. + * Unit tests for {@link org.apache.commons.text.similarity.JaroWrinklerDistance}. */ public class JaroWrinklerDistanceTest { diff --git a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java index 14b842c205..814677d35e 100644 --- a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java +++ b/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java @@ -21,7 +21,7 @@ import org.junit.Test; /** - * Unit tests for {@link org.apache.commons.text.LevenshteinDistance}. + * Unit tests for {@link org.apache.commons.text.similarity.LevenshteinDistance}. */ public class LevenshteinDistanceTest { diff --git a/src/test/java/org/apache/commons/text/similarity/ParameterizedLevenshteinDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/ParameterizedLevenshteinDistanceTest.java index 5a224219c6..ada1e38864 100644 --- a/src/test/java/org/apache/commons/text/similarity/ParameterizedLevenshteinDistanceTest.java +++ b/src/test/java/org/apache/commons/text/similarity/ParameterizedLevenshteinDistanceTest.java @@ -29,7 +29,7 @@ import org.junit.runners.Parameterized.Parameters; /** - * Unit tests for {@link org.apache.commons.text.LevenshteinDistance}. + * Unit tests for {@link org.apache.commons.text.similarity.LevenshteinDistance}. 
*/ @RunWith(Parameterized.class) public class ParameterizedLevenshteinDistanceTest { From 45bfffce0ff2c2c4fb4f5c829db60d20c468aeda Mon Sep 17 00:00:00 2001 From: j--baker Date: Wed, 4 Mar 2015 13:29:40 -0500 Subject: [PATCH 8/8] commons-text - SANDBOX-491: Remove unused Map and TreeMap imports from ParameterizedLevenshteinDistanceTest. --- .../text/similarity/ParameterizedLevenshteinDistanceTest.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/test/java/org/apache/commons/text/similarity/ParameterizedLevenshteinDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/ParameterizedLevenshteinDistanceTest.java index ada1e38864..c6fd116292 100644 --- a/src/test/java/org/apache/commons/text/similarity/ParameterizedLevenshteinDistanceTest.java +++ b/src/test/java/org/apache/commons/text/similarity/ParameterizedLevenshteinDistanceTest.java @@ -20,8 +20,6 @@ import static org.junit.Assert.assertThat; import java.util.Arrays; -import java.util.Map; -import java.util.TreeMap; import org.junit.Test; import org.junit.runner.RunWith;