From c44b50c507a56814e7585982f7bc09aa95ed81be Mon Sep 17 00:00:00 2001
From: j--baker
Date: Tue, 3 Mar 2015 13:29:46 -0500
Subject: [PATCH 1/8] commons-text - SANDBOX-491: - Restore the non-threshold
implementation of the Levenshtein distance algorithm from commons-lang3's
StringUtils. This version avoids copying arrays. compare(left, right) no
longer delegates to compare(left, right, threshold). - Fix the threshold
implementation's table striping ASCII art. - Add "threshold" field to
LevenshteinDistance. If this is null, then the unlimited version of the
algorithm is used. Else, the limited version is used. - Add default and
threshold versions of the constructor. - Add static getDefaultInstance()
which returns the unlimited version.
---
.../text/similarity/LevenshteinDistance.java | 227 +++++++++++++++---
.../similarity/LevenshteinDistanceTest.java | 110 +++++----
2 files changed, 250 insertions(+), 87 deletions(-)
diff --git a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
index 4bc2e8a5f4..e1bb08b7be 100644
--- a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
+++ b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
@@ -33,27 +33,52 @@
*/
public class LevenshteinDistance implements StringMetric {
+ private static final LevenshteinDistance DEFAULT_INSTANCE = new LevenshteinDistance();
+
+ private final Integer threshold;
+
/**
- * Find the Levenshtein distance between two Strings.
- *
- *
A higher score indicates a greater distance.
- *
*
- * The previous implementation of the Levenshtein distance algorithm was
- * from http://www.merriampark.com
- * /ld.htm
+ * This returns the default instance that uses a version
+ * of the algorithm that does not use a threshold parameter.
*
- * Chas Emerick has written an implementation in Java, which avoids an
- * OutOfMemoryError which can occur when my Java implementation is used with
- * very large strings.
- * This implementation of the Levenshtein distance algorithm is from http://www.merriampark.com/
- * ldjava.htm
+ * If the threshold is not null, distance calculations will be limited to a maximum length.
+ * If the threshold is null, the unlimited version of the algorithm will be used.
*
*
+ * @param threshold
+ * If this is null then distance calculations will not be limited.
+ * This may not be negative.
+ */
+ public LevenshteinDistance(final Integer threshold) {
+ if (threshold != null && threshold < 0) {
+ throw new IllegalArgumentException("Threshold must not be negative");
+ }
+ this.threshold = threshold;
+ }
+
+ /**
+ *
Find the Levenshtein distance between two Strings.
Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError
+ * which can occur when my Java implementation is used with very large strings.
+ * This implementation of the Levenshtein distance algorithm
+ * is from http://www.merriampark.com/ldjava.htm
*
- * @param left the first string, must not be null
- * @param right the second string, must not be null
- * @return result distance
+ * @param s the first String, must not be null
+ * @param t the second String, must not be null
+ * @return result distance, or -1
* @throws IllegalArgumentException if either String input {@code null}
*/
- @Override
- public Integer compare(CharSequence left, CharSequence right) {
- return compare(left, right, Integer.MAX_VALUE);
+ public Integer compare(CharSequence s, CharSequence t) {
+ if (threshold != null) {
+ return limitedCompare(s, t, threshold);
+ } else {
+ return unlimitedCompare(s, t);
+ }
+ }
+
+ public static LevenshteinDistance getDefaultInstance() {
+ return DEFAULT_INSTANCE;
+ }
+
+ public Integer getThreshold() {
+ return threshold;
}
/**
@@ -91,27 +127,25 @@ public Integer compare(CharSequence left, CharSequence right) {
*
*
* @param left the first string, must not be null
* @param right the second string, must not be null
* @param threshold the target threshold, must not be negative
- * @return result distance
- * @throws IllegalArgumentException if either String input {@code null} or
- * negative threshold
+ * @return result distance, or -1
*/
- public Integer compare(CharSequence left, CharSequence right, int threshold) {
+ private static int limitedCompare(CharSequence left, CharSequence right, int threshold) {
if (left == null || right == null) {
throw new IllegalArgumentException("Strings must not be null");
}
@@ -124,7 +158,7 @@ public Integer compare(CharSequence left, CharSequence right, int threshold) {
* equal to the threshold value, returning -1 if it's greater. The
* advantage is performance: unbounded distance is O(nm), but a bound of
* k allows us to reduce it to O(km) time by only computing a diagonal
- * stripe of width 2k + 1 of the cost table. It is also possible to use
+ * stripe of width 2k + 1 of the cost table. It is also possible to use
* this to compute the unbounded Levenshtein distance by starting the
* threshold at 1 and doubling each time until the distance is found;
* this is O(dm), where d is the distance.
@@ -143,8 +177,16 @@ public Integer compare(CharSequence left, CharSequence right, int threshold) {
* and our threshold is 1. In this case we're going to walk a stripe of
* length 3. The matrix would look like so:
*
- * 1 2 3 4 5 1 |#|#| | | | 2 |#|#|#| | | 3 | |#|#|#| | 4 | | |#|#|#| 5 |
- * | | |#|#| 6 | | | | |#| 7 | | | | | |
+ *
*
* Note how the stripe leads off the table as there is no possible way
* to turn a string of length 5 into one of length 7 in edit distance of
@@ -243,4 +285,113 @@ public Integer compare(CharSequence left, CharSequence right, int threshold) {
return -1;
}
+ /**
+ *
Find the Levenshtein distance between two Strings.
Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError
+ * which can occur when my Java implementation is used with very large strings.
+ * This implementation of the Levenshtein distance algorithm
+ * is from http://www.merriampark.com/ldjava.htm
+ *
+ * @param s the first String, must not be null
+ * @param t the second String, must not be null
+ * @return result distance, or -1
+ * @throws IllegalArgumentException if either String input {@code null}
+ */
+ private static int unlimitedCompare(CharSequence s, CharSequence t) {
+ if (s == null || t == null) {
+ throw new IllegalArgumentException("Strings must not be null");
+ }
+
+ /*
+ The difference between this impl. and the previous is that, rather
+ than creating and retaining a matrix of size s.length() + 1 by t.length() + 1,
+ we maintain two single-dimensional arrays of length s.length() + 1. The first, d,
+ is the 'current working' distance array that maintains the newest distance cost
+ counts as we iterate through the characters of String s. Each time we increment
+ the index of String t we are comparing, d is copied to p, the second int[]. Doing so
+ allows us to retain the previous cost counts as required by the algorithm (taking
+ the minimum of the cost count to the left, up one, and diagonally up and to the left
+ of the current cost count being calculated). (Note that the arrays aren't really
+ copied anymore, just switched...this is clearly much better than cloning an array
+ or doing a System.arraycopy() each time through the outer loop.)
+
+ Effectively, the difference between the two implementations is this one does not
+ cause an out of memory condition when calculating the LD over two very large strings.
+ */
+
+ int n = s.length(); // length of s
+ int m = t.length(); // length of t
+
+ if (n == 0) {
+ return m;
+ } else if (m == 0) {
+ return n;
+ }
+
+ if (n > m) {
+ // swap the input strings to consume less memory
+ final CharSequence tmp = s;
+ s = t;
+ t = tmp;
+ n = m;
+ m = t.length();
+ }
+
+ int p[] = new int[n + 1]; //'previous' cost array, horizontally
+ int d[] = new int[n + 1]; // cost array, horizontally
+ int _d[]; //placeholder to assist in swapping p and d
+
+ // indexes into strings s and t
+ int i; // iterates through s
+ int j; // iterates through t
+
+ char t_j; // jth character of t
+
+ int cost; // cost
+
+ for (i = 0; i <= n; i++) {
+ p[i] = i;
+ }
+
+ for (j = 1; j <= m; j++) {
+ t_j = t.charAt(j - 1);
+ d[0] = j;
+
+ for (i = 1; i <= n; i++) {
+ cost = s.charAt(i - 1) == t_j ? 0 : 1;
+ // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
+ d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost);
+ }
+
+ // copy current distance counts to 'previous row' distance counts
+ _d = p;
+ p = d;
+ d = _d;
+ }
+
+ // our last action in the above loop was to switch d and p, so p now
+ // actually has the most recent cost counts
+ return p[n];
+ }
+
}
diff --git a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java
index 1d4e6fa2ce..ea5bd348e6 100644
--- a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java
+++ b/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java
@@ -18,6 +18,9 @@
import static org.junit.Assert.assertEquals;
+import java.util.Map;
+import java.util.TreeMap;
+
import org.junit.BeforeClass;
import org.junit.Test;
@@ -26,15 +29,10 @@
*/
public class LevenshteinDistanceTest {
- private static LevenshteinDistance distance;
-
- @BeforeClass
- public static void setUp() {
- distance = new LevenshteinDistance();
- }
-
@Test
public void testGetLevenshteinDistance_StringString() {
+ LevenshteinDistance distance = new LevenshteinDistance();
+
assertEquals(0, (int) distance.compare("", ""));
assertEquals(1, (int) distance.compare("", "a"));
assertEquals(7, (int) distance.compare("aaapppp", ""));
@@ -49,98 +47,112 @@ public void testGetLevenshteinDistance_StringString() {
@Test(expected = IllegalArgumentException.class)
public void testGetLevenshteinDistance_NullString() throws Exception {
+ LevenshteinDistance distance = new LevenshteinDistance();
+
distance.compare("a", null);
}
@Test(expected = IllegalArgumentException.class)
public void testGetLevenshteinDistance_StringNull() throws Exception {
+ LevenshteinDistance distance = new LevenshteinDistance();
+
distance.compare(null, "a");
}
@Test
public void testGetLevenshteinDistance_StringStringInt() {
+ Map distances = new TreeMap();
+ for (int threshold : new int[] { 0, 1, 2, 3, 4, 6, 7, 8, 10, Integer.MAX_VALUE }) {
+ distances.put(threshold, new LevenshteinDistance(threshold));
+ }
+
// empty strings
- assertEquals(0, (int) distance.compare("", "", 0));
- assertEquals(7, (int) distance.compare("aaapppp", "", 8));
- assertEquals(7, (int) distance.compare("aaapppp", "", 7));
- assertEquals(-1, (int) distance.compare("aaapppp", "", 6));
+ assertEquals(0, (int) distances.get(0).compare("", ""));
+ assertEquals(7, (int) distances.get(8).compare("aaapppp", ""));
+ assertEquals(7, (int) distances.get(7).compare("aaapppp", ""));
+ assertEquals(-1, (int) distances.get(6).compare("aaapppp", ""));
// unequal strings, zero threshold
- assertEquals(-1, (int) distance.compare("b", "a", 0));
- assertEquals(-1, (int) distance.compare("a", "b", 0));
+ assertEquals(-1, (int) distances.get(0).compare("b", "a"));
+ assertEquals(-1, (int) distances.get(0).compare("a", "b"));
// equal strings
- assertEquals(0, (int) distance.compare("aa", "aa", 0));
- assertEquals(0, (int) distance.compare("aa", "aa", 2));
+ assertEquals(0, (int) distances.get(0).compare("aa", "aa"));
+ assertEquals(0, (int) distances.get(2).compare("aa", "aa"));
// same length
- assertEquals(-1, (int) distance.compare("aaa", "bbb", 2));
- assertEquals(3, (int) distance.compare("aaa", "bbb", 3));
+ assertEquals(-1, (int) distances.get(2).compare("aaa", "bbb"));
+ assertEquals(3, (int) distances.get(3).compare("aaa", "bbb"));
// big stripe
- assertEquals(6, (int) distance.compare("aaaaaa", "b", 10));
+ assertEquals(6, (int) distances.get(10).compare("aaaaaa", "b"));
// distance less than threshold
- assertEquals(7, (int) distance.compare("aaapppp", "b", 8));
- assertEquals(3, (int) distance.compare("a", "bbb", 4));
+ assertEquals(7, (int) distances.get(8).compare("aaapppp", "b"));
+ assertEquals(3, (int) distances.get(4).compare("a", "bbb"));
// distance equal to threshold
- assertEquals(7, (int) distance.compare("aaapppp", "b", 7));
- assertEquals(3, (int) distance.compare("a", "bbb", 3));
+ assertEquals(7, (int) distances.get(7).compare("aaapppp", "b"));
+ assertEquals(3, (int) distances.get(3).compare("a", "bbb"));
// distance greater than threshold
- assertEquals(-1, (int) distance.compare("a", "bbb", 2));
- assertEquals(-1, (int) distance.compare("bbb", "a", 2));
- assertEquals(-1, (int) distance.compare("aaapppp", "b", 6));
+ assertEquals(-1, (int) distances.get(2).compare("a", "bbb"));
+ assertEquals(-1, (int) distances.get(2).compare("bbb", "a"));
+ assertEquals(-1, (int) distances.get(6).compare("aaapppp", "b"));
// stripe runs off array, strings not similar
- assertEquals(-1, (int) distance.compare("a", "bbb", 1));
- assertEquals(-1, (int) distance.compare("bbb", "a", 1));
+ assertEquals(-1, (int) distances.get(1).compare("a", "bbb"));
+ assertEquals(-1, (int) distances.get(1).compare("bbb", "a"));
// stripe runs off array, strings are similar
- assertEquals(-1, (int) distance.compare("12345", "1234567", 1));
- assertEquals(-1, (int) distance.compare("1234567", "12345", 1));
+ assertEquals(-1, (int) distances.get(1).compare("12345", "1234567"));
+ assertEquals(-1, (int) distances.get(1).compare("1234567", "12345"));
// old getLevenshteinDistance test cases
- assertEquals(1, (int) distance.compare("frog", "fog", 1));
- assertEquals(3, (int) distance.compare("fly", "ant", 3));
- assertEquals(7, (int) distance.compare("elephant", "hippo", 7));
- assertEquals(-1, (int) distance.compare("elephant", "hippo", 6));
- assertEquals(7, (int) distance.compare("hippo", "elephant", 7));
- assertEquals(-1, (int) distance.compare("hippo", "elephant", 6));
- assertEquals(8, (int) distance.compare("hippo", "zzzzzzzz", 8));
- assertEquals(8, (int) distance.compare("zzzzzzzz", "hippo", 8));
- assertEquals(1, (int) distance.compare("hello", "hallo", 1));
+ assertEquals(1, (int) distances.get(1).compare("frog", "fog"));
+ assertEquals(3, (int) distances.get(3).compare("fly", "ant"));
+ assertEquals(7, (int) distances.get(7).compare("elephant", "hippo"));
+ assertEquals(-1, (int) distances.get(6).compare("elephant", "hippo"));
+ assertEquals(7, (int) distances.get(7).compare("hippo", "elephant"));
+ assertEquals(-1, (int) distances.get(6).compare("hippo", "elephant"));
+ assertEquals(8, (int) distances.get(8).compare("hippo", "zzzzzzzz"));
+ assertEquals(8, (int) distances.get(8).compare("zzzzzzzz", "hippo"));
+ assertEquals(1, (int) distances.get(1).compare("hello", "hallo"));
assertEquals(1,
- (int) distance.compare("frog", "fog", Integer.MAX_VALUE));
- assertEquals(3, (int) distance.compare("fly", "ant", Integer.MAX_VALUE));
+ (int) distances.get(Integer.MAX_VALUE).compare("frog", "fog"));
+ assertEquals(3, (int) distances.get(Integer.MAX_VALUE).compare("fly", "ant"));
assertEquals(7,
- (int) distance.compare("elephant", "hippo", Integer.MAX_VALUE));
+ (int) distances.get(Integer.MAX_VALUE).compare("elephant", "hippo"));
assertEquals(7,
- (int) distance.compare("hippo", "elephant", Integer.MAX_VALUE));
+ (int) distances.get(Integer.MAX_VALUE).compare("hippo", "elephant"));
assertEquals(8,
- (int) distance.compare("hippo", "zzzzzzzz", Integer.MAX_VALUE));
+ (int) distances.get(Integer.MAX_VALUE).compare("hippo", "zzzzzzzz"));
assertEquals(8,
- (int) distance.compare("zzzzzzzz", "hippo", Integer.MAX_VALUE));
+ (int) distances.get(Integer.MAX_VALUE).compare("zzzzzzzz", "hippo"));
assertEquals(1,
- (int) distance.compare("hello", "hallo", Integer.MAX_VALUE));
+ (int) distances.get(Integer.MAX_VALUE).compare("hello", "hallo"));
}
@Test(expected = IllegalArgumentException.class)
public void testGetLevenshteinDistance_NullStringInt() throws Exception {
- distance.compare(null, "a", 0);
+ LevenshteinDistance distance = new LevenshteinDistance(0);
+
+ distance.compare(null, "a");
}
@Test(expected = IllegalArgumentException.class)
public void testGetLevenshteinDistance_StringNullInt() throws Exception {
- distance.compare("a", null, 0);
+ LevenshteinDistance distance = new LevenshteinDistance(0);
+
+ distance.compare("a", null);
}
@Test(expected = IllegalArgumentException.class)
- public void testGetLevenshteinDistance_StringStringNegativeInt()
+ public void testConstructorWithNegativeThreshold()
throws Exception {
- distance.compare("a", "a", -1);
+
+ LevenshteinDistance distance = new LevenshteinDistance(-1);
}
}
From e1148a951d23c677f15aa9a94e8cb4c0a8409ca0 Mon Sep 17 00:00:00 2001
From: j--baker
Date: Tue, 3 Mar 2015 13:54:38 -0500
Subject: [PATCH 2/8] commons-text - SANDBOX-491: - Add "locale" field to
FuzzyScore. Add default and Locale constructor. Add static
getDefaultInstance(). A null locale will cause Locale.getDefault() to be
called each time compare(left, right) is called. - Fix method name typo in
FuzzyScoreTest. - Remove tests that no longer fail because of a null Locale.
The algorithm is no longer public. If the field is null, Locale.getDefault()
is used.
---
.../commons/text/similarity/FuzzyScore.java | 40 ++++++++++++++++--
.../text/similarity/FuzzyScoreTest.java | 42 +++++++++----------
2 files changed, 56 insertions(+), 26 deletions(-)
diff --git a/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java b/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java
index 3e72d05a77..9c32ced10c 100644
--- a/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java
+++ b/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java
@@ -33,10 +33,35 @@
*/
public class FuzzyScore implements StringMetric {
+ public static final FuzzyScore DEFAULT_INSTANCE = new FuzzyScore();
+
+ private final Locale locale;
+
+ /*
+ *
+ * This returns an instance that call {@link Locale#getLocale()}
+ * to get a {@link Locale} for each
+ * call to {@link #compare(CharSequence, CharSequence)}
+ *
* Find the Fuzzy Score which indicates the similarity score between two
- * Strings. This method uses the default locale.
+ * Strings.
+ *
+ *
+ * If this instance's {@link Locale} is null, the current default will be
+ * determined using {@link Locale#getDefault()}.
*
*
* @param term a full term that should be matched against, must not be null
@@ -47,7 +72,8 @@ public class FuzzyScore implements StringMetric {
*/
@Override
public Integer compare(CharSequence term, CharSequence query) {
- return compare(term, query, Locale.getDefault());
+ Locale locale = this.locale != null ? this.locale : Locale.getDefault();
+ return compare(term, query, locale);
}
/**
@@ -76,7 +102,7 @@ public Integer compare(CharSequence term, CharSequence query) {
* @throws IllegalArgumentException if either String input {@code null} or
* Locale input {@code null}
*/
- public Integer compare(CharSequence term, CharSequence query, Locale locale) {
+ private static int compare(CharSequence term, CharSequence query, Locale locale) {
if (term == null || query == null) {
throw new IllegalArgumentException("Strings must not be null");
} else if (locale == null) {
@@ -130,4 +156,12 @@ public Integer compare(CharSequence term, CharSequence query, Locale locale) {
return score;
}
+ public static FuzzyScore getDefaultInstance() {
+ return DEFAULT_INSTANCE;
+ }
+
+ public Locale getLocale() {
+ return locale;
+ }
+
}
diff --git a/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java b/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java
index b2fab14074..3d6bcd94e3 100644
--- a/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java
+++ b/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java
@@ -28,48 +28,44 @@
*/
public class FuzzyScoreTest {
- private static FuzzyScore score;
-
- @BeforeClass
- public static void setUp() {
- score = new FuzzyScore();
- }
-
@Test
public void testGetFuzzyScore() throws Exception {
- assertEquals(0, (int) score.compare("", "", Locale.ENGLISH));
+ FuzzyScore score = new FuzzyScore(Locale.ENGLISH);
+
+ assertEquals(0, (int) score.compare("", ""));
assertEquals(0,
- (int) score.compare("Workshop", "b", Locale.ENGLISH));
+ (int) score.compare("Workshop", "b"));
assertEquals(1,
- (int) score.compare("Room", "o", Locale.ENGLISH));
+ (int) score.compare("Room", "o"));
assertEquals(1,
- (int) score.compare("Workshop", "w", Locale.ENGLISH));
+ (int) score.compare("Workshop", "w"));
assertEquals(2,
- (int) score.compare("Workshop", "ws", Locale.ENGLISH));
+ (int) score.compare("Workshop", "ws"));
assertEquals(4,
- (int) score.compare("Workshop", "wo", Locale.ENGLISH));
+ (int) score.compare("Workshop", "wo"));
assertEquals(3, (int) score.compare(
- "Apache Software Foundation", "asf", Locale.ENGLISH));
+ "Apache Software Foundation", "asf"));
}
@Test(expected = IllegalArgumentException.class)
- public void testGetFuzzyScore_NullNullNull() throws Exception {
- score.compare(null, null, null);
- }
+ public void testGetFuzzyScore_StringNullLocale() throws Exception {
+ FuzzyScore score = new FuzzyScore(Locale.ENGLISH);
- @Test(expected = IllegalArgumentException.class)
- public void testGetFuzzyScore_StringNullLoclae() throws Exception {
- score.compare(" ", null, Locale.ENGLISH);
+ score.compare("not null", null);
}
@Test(expected = IllegalArgumentException.class)
public void testGetFuzzyScore_NullStringLocale() throws Exception {
- score.compare(null, "clear", Locale.ENGLISH);
+ FuzzyScore score = new FuzzyScore(Locale.ENGLISH);
+
+ score.compare(null, "not null");
}
@Test(expected = IllegalArgumentException.class)
- public void testGetFuzzyScore_StringStringNull() throws Exception {
- score.compare(" ", "clear", null);
+ public void testGetFuzzyScore_NullNullLocale() throws Exception {
+ FuzzyScore score = new FuzzyScore(Locale.ENGLISH);
+
+ score.compare(null, null);
}
}
From 98d7b6ae89061abdabe9397025da55cc05c91497 Mon Sep 17 00:00:00 2001
From: j--baker
Date: Tue, 3 Mar 2015 14:08:45 -0500
Subject: [PATCH 3/8] commons-text - SANDBOX-491: - Rename the "s" and "t"
variables to "left" and "right" in the Levenshtein distance algorithms.
---
.../text/similarity/LevenshteinDistance.java | 50 +++++++++----------
1 file changed, 25 insertions(+), 25 deletions(-)
diff --git a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
index e1bb08b7be..38e5f14b1f 100644
--- a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
+++ b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
@@ -93,16 +93,16 @@ public LevenshteinDistance(final Integer threshold) {
* distance.compare("hello", "hallo") = 1
*
*
- * @param s the first String, must not be null
- * @param t the second String, must not be null
+ * @param left the first string, must not be null
+ * @param right the second string, must not be null
* @return result distance, or -1
* @throws IllegalArgumentException if either String input {@code null}
*/
- public Integer compare(CharSequence s, CharSequence t) {
+ public Integer compare(CharSequence left, CharSequence right) {
if (threshold != null) {
- return limitedCompare(s, t, threshold);
+ return limitedCompare(left, right, threshold);
} else {
- return unlimitedCompare(s, t);
+ return unlimitedCompare(left, right);
}
}
@@ -203,8 +203,8 @@ private static int limitedCompare(CharSequence left, CharSequence right, int thr
* some discussion.
*/
- int n = left.length(); // length of s
- int m = right.length(); // length of t
+ int n = left.length(); // length of left
+ int m = right.length(); // length of right
// if one string is empty, the edit distance is necessarily the length
// of the other
@@ -239,7 +239,7 @@ private static int limitedCompare(CharSequence left, CharSequence right, int thr
// iterates through t
for (int j = 1; j <= m; j++) {
- final char t_j = right.charAt(j - 1); // jth character of t
+ final char right_j = right.charAt(j - 1); // jth character of right
d[0] = j;
// compute stripe indices, constrain to array size
@@ -260,7 +260,7 @@ private static int limitedCompare(CharSequence left, CharSequence right, int thr
// iterates through [min, max] in s
for (int i = min; i <= max; i++) {
- if (left.charAt(i - 1) == t_j) {
+ if (left.charAt(i - 1) == right_j) {
// diagonally left and up
d[i] = p[i - 1];
} else {
@@ -312,13 +312,13 @@ private static int limitedCompare(CharSequence left, CharSequence right, int thr
* unlimitedCompare("hello", "hallo") = 1
*
*
- * @param s the first String, must not be null
- * @param t the second String, must not be null
+ * @param left the first String, must not be null
+ * @param right the second String, must not be null
* @return result distance, or -1
* @throws IllegalArgumentException if either String input {@code null}
*/
- private static int unlimitedCompare(CharSequence s, CharSequence t) {
- if (s == null || t == null) {
+ private static int unlimitedCompare(CharSequence left, CharSequence right) {
+ if (left == null || right == null) {
throw new IllegalArgumentException("Strings must not be null");
}
@@ -339,8 +339,8 @@ allows us to retain the previous cost counts as required by the algorithm (takin
cause an out of memory condition when calculating the LD over two very large strings.
*/
- int n = s.length(); // length of s
- int m = t.length(); // length of t
+ int n = left.length(); // length of left
+ int m = right.length(); // length of right
if (n == 0) {
return m;
@@ -350,22 +350,22 @@ allows us to retain the previous cost counts as required by the algorithm (takin
if (n > m) {
// swap the input strings to consume less memory
- final CharSequence tmp = s;
- s = t;
- t = tmp;
+ final CharSequence tmp = left;
+ left = right;
+ right = tmp;
n = m;
- m = t.length();
+ m = right.length();
}
int p[] = new int[n + 1]; //'previous' cost array, horizontally
int d[] = new int[n + 1]; // cost array, horizontally
int _d[]; //placeholder to assist in swapping p and d
- // indexes into strings s and t
- int i; // iterates through s
- int j; // iterates through t
+ // indexes into strings left and right
+ int i; // iterates through left
+ int j; // iterates through right
- char t_j; // jth character of t
+ char right_j; // jth character of right
int cost; // cost
@@ -374,11 +374,11 @@ allows us to retain the previous cost counts as required by the algorithm (takin
}
for (j = 1; j <= m; j++) {
- t_j = t.charAt(j - 1);
+ right_j = right.charAt(j - 1);
d[0] = j;
for (i = 1; i <= n; i++) {
- cost = s.charAt(i - 1) == t_j ? 0 : 1;
+ cost = left.charAt(i - 1) == right_j ? 0 : 1;
// minimum of cell to the left+1, to the top+1, diagonally left and up +cost
d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost);
}
From a31d8d00045b2d116a29a26fbedf251b297ad171 Mon Sep 17 00:00:00 2001
From: j--baker
Date: Tue, 3 Mar 2015 14:47:47 -0500
Subject: [PATCH 4/8] commons-text - SANDBOX-491: - Remove unused
org.junit.BeforeClass imports from FuzzyScoreTest and
LevenshteinDistanceTest.
---
.../java/org/apache/commons/text/similarity/FuzzyScoreTest.java | 1 -
.../apache/commons/text/similarity/LevenshteinDistanceTest.java | 1 -
2 files changed, 2 deletions(-)
diff --git a/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java b/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java
index 3d6bcd94e3..b54215df8d 100644
--- a/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java
+++ b/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java
@@ -20,7 +20,6 @@
import java.util.Locale;
-import org.junit.BeforeClass;
import org.junit.Test;
/**
diff --git a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java
index ea5bd348e6..385049a7fd 100644
--- a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java
+++ b/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java
@@ -21,7 +21,6 @@
import java.util.Map;
import java.util.TreeMap;
-import org.junit.BeforeClass;
import org.junit.Test;
/**
From 390d1d7f67d01c901cd3e7d59b8366fcabc785d1 Mon Sep 17 00:00:00 2001
From: j--baker
Date: Tue, 3 Mar 2015 16:07:56 -0500
Subject: [PATCH 5/8] commons-text - SANDBOX-491: - Remove the
Locale.getDefault() logic in FuzzyScore. There is no longer a
DEFAULT_INSTANCE and the Locale may not be null.
---
.../commons/text/similarity/FuzzyScore.java | 56 ++++---------------
.../text/similarity/FuzzyScoreTest.java | 5 ++
2 files changed, 17 insertions(+), 44 deletions(-)
diff --git a/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java b/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java
index 9c32ced10c..3cf6df24f6 100644
--- a/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java
+++ b/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java
@@ -33,47 +33,22 @@
*/
public class FuzzyScore implements StringMetric {
- public static final FuzzyScore DEFAULT_INSTANCE = new FuzzyScore();
-
private final Locale locale;
- /*
- *
- * This returns an instance that call {@link Locale#getLocale()}
- * to get a {@link Locale} for each
- * call to {@link #compare(CharSequence, CharSequence)}
- *