diff --git a/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java b/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java
index 3e72d05a77..3cf6df24f6 100644
--- a/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java
+++ b/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java
@@ -33,21 +33,22 @@
*/
public class FuzzyScore implements StringMetric
- * Find the Fuzzy Score which indicates the similarity score between two
- * Strings. This method uses the default locale.
- * This returns a {@link Locale}-specific {@link FuzzyScore}. A higher score indicates a greater distance.
- * The previous implementation of the Levenshtein distance algorithm was
- * from http://www.merriampark.com
- * /ld.htm
+ * This returns the default instance that uses a version
+ * of the algorithm that does not use a threshold parameter.
*
- * Chas Emerick has written an implementation in Java, which avoids an
- * OutOfMemoryError which can occur when my Java implementation is used with
- * very large strings. Find the Levenshtein distance between two Strings. A higher score indicates a greater distance. The previous implementation of the Levenshtein distance algorithm
+ * was from http://www.merriampark.com/ld.htm Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError
+ * which can occur when my Java implementation is used with very large strings.
- * This implementation of the Levenshtein distance algorithm is from http://www.merriampark.com/
- * ldjava.htm
+ * If the threshold is not null, distance calculations will be limited to a maximum length.
+ * If the threshold is null, the unlimited version of the algorithm will be used.
*
+ * This implementation of the Levenshtein distance algorithm
+ * is from http://www.merriampark.com/ldjava.htm
* distance.compare(null, *) = IllegalArgumentException
* distance.compare(*, null) = IllegalArgumentException
@@ -70,12 +95,23 @@ public class LevenshteinDistance implements StringMetric
- * distance.compare(null, *, *) = IllegalArgumentException
- * distance.compare(*, null, *) = IllegalArgumentException
- * distance.compare(*, *, -1) = IllegalArgumentException
- * distance.compare("","", 0) = 0
- * distance.compare("aaapppp", "", 8) = 7
- * distance.compare("aaapppp", "", 7) = 7
- * distance.compare("aaapppp", "", 6)) = -1
- * distance.compare("elephant", "hippo", 7) = 7
- * distance.compare("elephant", "hippo", 6) = -1
- * distance.compare("hippo", "elephant", 7) = 7
- * distance.compare("hippo", "elephant", 6) = -1
+ * limitedCompare(null, *, *) = IllegalArgumentException
+ * limitedCompare(*, null, *) = IllegalArgumentException
+ * limitedCompare(*, *, -1) = IllegalArgumentException
+ * limitedCompare("","", 0) = 0
+ * limitedCompare("aaapppp", "", 8) = 7
+ * limitedCompare("aaapppp", "", 7) = 7
+ * limitedCompare("aaapppp", "", 6)) = -1
+ * limitedCompare("elephant", "hippo", 7) = 7
+ * limitedCompare("elephant", "hippo", 6) = -1
+ * limitedCompare("hippo", "elephant", 7) = 7
+ * limitedCompare("hippo", "elephant", 6) = -1
*
*
* @param left the first string, must not be null
* @param right the second string, must not be null
* @param threshold the target threshold, must not be negative
- * @return result distance
- * @throws IllegalArgumentException if either String input {@code null} or
- * negative threshold
+ * @return result distance, or -1
*/
- public Integer compare(CharSequence left, CharSequence right, int threshold) {
+ private static int limitedCompare(CharSequence left, CharSequence right, int threshold) {
if (left == null || right == null) {
throw new IllegalArgumentException("Strings must not be null");
}
@@ -124,7 +158,7 @@ public Integer compare(CharSequence left, CharSequence right, int threshold) {
* equal to the threshold value, returning -1 if it's greater. The
* advantage is performance: unbounded distance is O(nm), but a bound of
* k allows us to reduce it to O(km) time by only computing a diagonal
- * stripe of width 2k + 1 of the cost table. It is also possible to use
+ * stripe of width 2k + 1 of the cost table. It is also possible to use* this to compute the unbounded Levenshtein distance by starting the
* this to compute the unbounded Levenshtein distance by starting the
* threshold at 1 and doubling each time until the distance is found;
* this is O(dm), where d is the distance.
@@ -143,8 +177,16 @@ public Integer compare(CharSequence left, CharSequence right, int threshold) {
* and our threshold is 1. In this case we're going to walk a stripe of
* length 3. The matrix would look like so:
*
- * 1 2 3 4 5 1 |#|#| | | | 2 |#|#|#| | | 3 | |#|#|#| | 4 | | |#|#|#| 5 |
- * | | |#|#| 6 | | | | |#| 7 | | | | | |
+ * + * 1 2 3 4 5 + * 1 |#|#| | | | + * 2 |#|#|#| | | + * 3 | |#|#|#| | + * 4 | | |#|#|#| + * 5 | | | |#|#| + * 6 | | | | |#| + * 7 | | | | | | + ** * Note how the stripe leads off the table as there is no possible way * to turn a string of length 5 into one of length 7 in edit distance of @@ -161,8 +203,8 @@ public Integer compare(CharSequence left, CharSequence right, int threshold) { * some discussion. */ - int n = left.length(); // length of s - int m = right.length(); // length of t + int n = left.length(); // length of left + int m = right.length(); // length of right // if one string is empty, the edit distance is necessarily the length // of the other @@ -197,7 +239,7 @@ public Integer compare(CharSequence left, CharSequence right, int threshold) { // iterates through t for (int j = 1; j <= m; j++) { - final char t_j = right.charAt(j - 1); // jth character of t + final char right_j = right.charAt(j - 1); // jth character of right d[0] = j; // compute stripe indices, constrain to array size @@ -218,7 +260,7 @@ public Integer compare(CharSequence left, CharSequence right, int threshold) { // iterates through [min, max] in s for (int i = min; i <= max; i++) { - if (left.charAt(i - 1) == t_j) { + if (left.charAt(i - 1) == right_j) { // diagonally left and up d[i] = p[i - 1]; } else { @@ -243,4 +285,113 @@ public Integer compare(CharSequence left, CharSequence right, int threshold) { return -1; } + /** + *
Find the Levenshtein distance between two Strings.
+ * + *A higher score indicates a greater distance.
+ * + *The previous implementation of the Levenshtein distance algorithm + * was from http://www.merriampark.com/ld.htm
+ * + *Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError
+ * which can occur when my Java implementation is used with very large strings.
+ * This implementation of the Levenshtein distance algorithm
+ * is from http://www.merriampark.com/ldjava.htm
+ * unlimitedCompare(null, *) = IllegalArgumentException
+ * unlimitedCompare(*, null) = IllegalArgumentException
+ * unlimitedCompare("","") = 0
+ * unlimitedCompare("","a") = 1
+ * unlimitedCompare("aaapppp", "") = 7
+ * unlimitedCompare("frog", "fog") = 1
+ * unlimitedCompare("fly", "ant") = 3
+ * unlimitedCompare("elephant", "hippo") = 7
+ * unlimitedCompare("hippo", "elephant") = 7
+ * unlimitedCompare("hippo", "zzzzzzzz") = 8
+ * unlimitedCompare("hello", "hallo") = 1
+ *
+ *
+ * @param left the first String, must not be null
+ * @param right the second String, must not be null
+ * @return result distance, or -1
+ * @throws IllegalArgumentException if either String input {@code null}
+ */
+ private static int unlimitedCompare(CharSequence left, CharSequence right) {
+ if (left == null || right == null) {
+ throw new IllegalArgumentException("Strings must not be null");
+ }
+
+ /*
+ The difference between this impl. and the previous is that, rather
+ than creating and retaining a matrix of size s.length() + 1 by t.length() + 1,
+ we maintain two single-dimensional arrays of length s.length() + 1. The first, d,
+ is the 'current working' distance array that maintains the newest distance cost
+ counts as we iterate through the characters of String s. Each time we increment
+ the index of String t we are comparing, d is copied to p, the second int[]. Doing so
+ allows us to retain the previous cost counts as required by the algorithm (taking
+ the minimum of the cost count to the left, up one, and diagonally up and to the left
+ of the current cost count being calculated). (Note that the arrays aren't really
+ copied anymore, just switched...this is clearly much better than cloning an array
+ or doing a System.arraycopy() each time through the outer loop.)
+
+ Effectively, the difference between the two implementations is this one does not
+ cause an out of memory condition when calculating the LD over two very large strings.
+ */
+
+ int n = left.length(); // length of left
+ int m = right.length(); // length of right
+
+ if (n == 0) {
+ return m;
+ } else if (m == 0) {
+ return n;
+ }
+
+ if (n > m) {
+ // swap the input strings to consume less memory
+ final CharSequence tmp = left;
+ left = right;
+ right = tmp;
+ n = m;
+ m = right.length();
+ }
+
+ int p[] = new int[n + 1]; //'previous' cost array, horizontally
+ int d[] = new int[n + 1]; // cost array, horizontally
+ int _d[]; //placeholder to assist in swapping p and d
+
+ // indexes into strings left and right
+ int i; // iterates through left
+ int j; // iterates through right
+
+ char right_j; // jth character of right
+
+ int cost; // cost
+
+ for (i = 0; i <= n; i++) {
+ p[i] = i;
+ }
+
+ for (j = 1; j <= m; j++) {
+ right_j = right.charAt(j - 1);
+ d[0] = j;
+
+ for (i = 1; i <= n; i++) {
+ cost = left.charAt(i - 1) == right_j ? 0 : 1;
+ // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
+ d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost);
+ }
+
+ // copy current distance counts to 'previous row' distance counts
+ _d = p;
+ p = d;
+ d = _d;
+ }
+
+ // our last action in the above loop was to switch d and p, so p now
+ // actually has the most recent cost counts
+ return p[n];
+ }
+
}
diff --git a/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java b/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java
index b2fab14074..88778fc4ef 100644
--- a/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java
+++ b/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java
@@ -20,56 +20,45 @@
import java.util.Locale;
-import org.junit.BeforeClass;
import org.junit.Test;
/**
- * Unit tests for {@link org.apache.commons.text.FuzzyScore}.
+ * Unit tests for {@link org.apache.commons.text.similarity.FuzzyScore}.
*/
public class FuzzyScoreTest {
- private static FuzzyScore score;
-
- @BeforeClass
- public static void setUp() {
- score = new FuzzyScore();
- }
+ private static final FuzzyScore ENGLISH_SCORE = new FuzzyScore(Locale.ENGLISH);
@Test
public void testGetFuzzyScore() throws Exception {
- assertEquals(0, (int) score.compare("", "", Locale.ENGLISH));
- assertEquals(0,
- (int) score.compare("Workshop", "b", Locale.ENGLISH));
- assertEquals(1,
- (int) score.compare("Room", "o", Locale.ENGLISH));
- assertEquals(1,
- (int) score.compare("Workshop", "w", Locale.ENGLISH));
- assertEquals(2,
- (int) score.compare("Workshop", "ws", Locale.ENGLISH));
- assertEquals(4,
- (int) score.compare("Workshop", "wo", Locale.ENGLISH));
- assertEquals(3, (int) score.compare(
- "Apache Software Foundation", "asf", Locale.ENGLISH));
+ assertEquals(0, (int) ENGLISH_SCORE.compare("", ""));
+ assertEquals(0, (int) ENGLISH_SCORE.compare("Workshop", "b"));
+ assertEquals(1, (int) ENGLISH_SCORE.compare("Room", "o"));
+ assertEquals(1, (int) ENGLISH_SCORE.compare("Workshop", "w"));
+ assertEquals(2, (int) ENGLISH_SCORE.compare("Workshop", "ws"));
+ assertEquals(4, (int) ENGLISH_SCORE.compare("Workshop", "wo"));
+ assertEquals(3, (int) ENGLISH_SCORE.compare(
+ "Apache Software Foundation", "asf"));
}
@Test(expected = IllegalArgumentException.class)
- public void testGetFuzzyScore_NullNullNull() throws Exception {
- score.compare(null, null, null);
+ public void testGetFuzzyScore_StringNullLocale() throws Exception {
+ ENGLISH_SCORE.compare("not null", null);
}
@Test(expected = IllegalArgumentException.class)
- public void testGetFuzzyScore_StringNullLoclae() throws Exception {
- score.compare(" ", null, Locale.ENGLISH);
+ public void testGetFuzzyScore_NullStringLocale() throws Exception {
+ ENGLISH_SCORE.compare(null, "not null");
}
@Test(expected = IllegalArgumentException.class)
- public void testGetFuzzyScore_NullStringLocale() throws Exception {
- score.compare(null, "clear", Locale.ENGLISH);
+ public void testGetFuzzyScore_NullNullLocale() throws Exception {
+ ENGLISH_SCORE.compare(null, null);
}
@Test(expected = IllegalArgumentException.class)
- public void testGetFuzzyScore_StringStringNull() throws Exception {
- score.compare(" ", "clear", null);
+ public void testMissingLocale() throws Exception {
+ FuzzyScore score = new FuzzyScore((Locale) null);
}
}
diff --git a/src/test/java/org/apache/commons/text/similarity/JaroWrinklerDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/JaroWrinklerDistanceTest.java
index 73e18f735e..7050b05ac2 100644
--- a/src/test/java/org/apache/commons/text/similarity/JaroWrinklerDistanceTest.java
+++ b/src/test/java/org/apache/commons/text/similarity/JaroWrinklerDistanceTest.java
@@ -22,7 +22,7 @@
import org.junit.Test;
/**
- * Unit tests for {@link org.apache.commons.text.JaroWrinklerDistance}.
+ * Unit tests for {@link org.apache.commons.text.similarity.JaroWrinklerDistance}.
*/
public class JaroWrinklerDistanceTest {
diff --git a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java
index 1d4e6fa2ce..814677d35e 100644
--- a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java
+++ b/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java
@@ -18,129 +18,52 @@
import static org.junit.Assert.assertEquals;
-import org.junit.BeforeClass;
import org.junit.Test;
/**
- * Unit tests for {@link org.apache.commons.text.LevenshteinDistance}.
+ * Unit tests for {@link org.apache.commons.text.similarity.LevenshteinDistance}.
*/
public class LevenshteinDistanceTest {
- private static LevenshteinDistance distance;
-
- @BeforeClass
- public static void setUp() {
- distance = new LevenshteinDistance();
- }
+ private static final LevenshteinDistance UNLIMITED_DISTANCE = new LevenshteinDistance();
@Test
public void testGetLevenshteinDistance_StringString() {
- assertEquals(0, (int) distance.compare("", ""));
- assertEquals(1, (int) distance.compare("", "a"));
- assertEquals(7, (int) distance.compare("aaapppp", ""));
- assertEquals(1, (int) distance.compare("frog", "fog"));
- assertEquals(3, (int) distance.compare("fly", "ant"));
- assertEquals(7, (int) distance.compare("elephant", "hippo"));
- assertEquals(7, (int) distance.compare("hippo", "elephant"));
- assertEquals(8, (int) distance.compare("hippo", "zzzzzzzz"));
- assertEquals(8, (int) distance.compare("zzzzzzzz", "hippo"));
- assertEquals(1, (int) distance.compare("hello", "hallo"));
+ assertEquals(0, (int) UNLIMITED_DISTANCE.compare("", ""));
+ assertEquals(1, (int) UNLIMITED_DISTANCE.compare("", "a"));
+ assertEquals(7, (int) UNLIMITED_DISTANCE.compare("aaapppp", ""));
+ assertEquals(1, (int) UNLIMITED_DISTANCE.compare("frog", "fog"));
+ assertEquals(3, (int) UNLIMITED_DISTANCE.compare("fly", "ant"));
+ assertEquals(7, (int) UNLIMITED_DISTANCE.compare("elephant", "hippo"));
+ assertEquals(7, (int) UNLIMITED_DISTANCE.compare("hippo", "elephant"));
+ assertEquals(8, (int) UNLIMITED_DISTANCE.compare("hippo", "zzzzzzzz"));
+ assertEquals(8, (int) UNLIMITED_DISTANCE.compare("zzzzzzzz", "hippo"));
+ assertEquals(1, (int) UNLIMITED_DISTANCE.compare("hello", "hallo"));
}
@Test(expected = IllegalArgumentException.class)
public void testGetLevenshteinDistance_NullString() throws Exception {
- distance.compare("a", null);
+ UNLIMITED_DISTANCE.compare("a", null);
}
@Test(expected = IllegalArgumentException.class)
public void testGetLevenshteinDistance_StringNull() throws Exception {
- distance.compare(null, "a");
- }
-
- @Test
- public void testGetLevenshteinDistance_StringStringInt() {
- // empty strings
- assertEquals(0, (int) distance.compare("", "", 0));
- assertEquals(7, (int) distance.compare("aaapppp", "", 8));
- assertEquals(7, (int) distance.compare("aaapppp", "", 7));
- assertEquals(-1, (int) distance.compare("aaapppp", "", 6));
-
- // unequal strings, zero threshold
- assertEquals(-1, (int) distance.compare("b", "a", 0));
- assertEquals(-1, (int) distance.compare("a", "b", 0));
-
- // equal strings
- assertEquals(0, (int) distance.compare("aa", "aa", 0));
- assertEquals(0, (int) distance.compare("aa", "aa", 2));
-
- // same length
- assertEquals(-1, (int) distance.compare("aaa", "bbb", 2));
- assertEquals(3, (int) distance.compare("aaa", "bbb", 3));
-
- // big stripe
- assertEquals(6, (int) distance.compare("aaaaaa", "b", 10));
-
- // distance less than threshold
- assertEquals(7, (int) distance.compare("aaapppp", "b", 8));
- assertEquals(3, (int) distance.compare("a", "bbb", 4));
-
- // distance equal to threshold
- assertEquals(7, (int) distance.compare("aaapppp", "b", 7));
- assertEquals(3, (int) distance.compare("a", "bbb", 3));
-
- // distance greater than threshold
- assertEquals(-1, (int) distance.compare("a", "bbb", 2));
- assertEquals(-1, (int) distance.compare("bbb", "a", 2));
- assertEquals(-1, (int) distance.compare("aaapppp", "b", 6));
-
- // stripe runs off array, strings not similar
- assertEquals(-1, (int) distance.compare("a", "bbb", 1));
- assertEquals(-1, (int) distance.compare("bbb", "a", 1));
-
- // stripe runs off array, strings are similar
- assertEquals(-1, (int) distance.compare("12345", "1234567", 1));
- assertEquals(-1, (int) distance.compare("1234567", "12345", 1));
-
- // old getLevenshteinDistance test cases
- assertEquals(1, (int) distance.compare("frog", "fog", 1));
- assertEquals(3, (int) distance.compare("fly", "ant", 3));
- assertEquals(7, (int) distance.compare("elephant", "hippo", 7));
- assertEquals(-1, (int) distance.compare("elephant", "hippo", 6));
- assertEquals(7, (int) distance.compare("hippo", "elephant", 7));
- assertEquals(-1, (int) distance.compare("hippo", "elephant", 6));
- assertEquals(8, (int) distance.compare("hippo", "zzzzzzzz", 8));
- assertEquals(8, (int) distance.compare("zzzzzzzz", "hippo", 8));
- assertEquals(1, (int) distance.compare("hello", "hallo", 1));
-
- assertEquals(1,
- (int) distance.compare("frog", "fog", Integer.MAX_VALUE));
- assertEquals(3, (int) distance.compare("fly", "ant", Integer.MAX_VALUE));
- assertEquals(7,
- (int) distance.compare("elephant", "hippo", Integer.MAX_VALUE));
- assertEquals(7,
- (int) distance.compare("hippo", "elephant", Integer.MAX_VALUE));
- assertEquals(8,
- (int) distance.compare("hippo", "zzzzzzzz", Integer.MAX_VALUE));
- assertEquals(8,
- (int) distance.compare("zzzzzzzz", "hippo", Integer.MAX_VALUE));
- assertEquals(1,
- (int) distance.compare("hello", "hallo", Integer.MAX_VALUE));
+ UNLIMITED_DISTANCE.compare(null, "a");
}
@Test(expected = IllegalArgumentException.class)
public void testGetLevenshteinDistance_NullStringInt() throws Exception {
- distance.compare(null, "a", 0);
+ UNLIMITED_DISTANCE.compare(null, "a");
}
@Test(expected = IllegalArgumentException.class)
public void testGetLevenshteinDistance_StringNullInt() throws Exception {
- distance.compare("a", null, 0);
+ UNLIMITED_DISTANCE.compare("a", null);
}
@Test(expected = IllegalArgumentException.class)
- public void testGetLevenshteinDistance_StringStringNegativeInt()
- throws Exception {
- distance.compare("a", "a", -1);
+ public void testConstructorWithNegativeThreshold() throws Exception {
+ LevenshteinDistance distance = new LevenshteinDistance(-1);
}
}
diff --git a/src/test/java/org/apache/commons/text/similarity/ParameterizedLevenshteinDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/ParameterizedLevenshteinDistanceTest.java
new file mode 100644
index 0000000000..c6fd116292
--- /dev/null
+++ b/src/test/java/org/apache/commons/text/similarity/ParameterizedLevenshteinDistanceTest.java
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import static org.hamcrest.core.IsEqual.equalTo;
+import static org.junit.Assert.assertThat;
+
+import java.util.Arrays;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+/**
+ * Unit tests for {@link org.apache.commons.text.similarity.LevenshteinDistance}.
+ */
+@RunWith(Parameterized.class)
+public class ParameterizedLevenshteinDistanceTest {
+
+ private final Integer distance;
+ private final CharSequence left;
+ private final CharSequence right;
+ private final Integer threshold;
+
+ public ParameterizedLevenshteinDistanceTest(
+ final Integer threshold,
+ final CharSequence left, final CharSequence right,
+ final Integer distance) {
+
+ this.threshold = threshold;
+ this.left = left;
+ this.right = right;
+ this.distance = distance;
+ }
+
+ @Parameters
+ public static Iterable