Skip to content

Commit 899036a

Browse files
committed
Refactor for implementing difference() API in both Soundex and RefinedSoundex.
http://nagoya.apache.org/bugzilla/show_bug.cgi?id=25243 git-svn-id: https://svn.apache.org/repos/asf/jakarta/commons/proper/codec/trunk@130271 13f79535-47bb-0310-9956-ffa450edef68
1 parent 8d85bf9 commit 899036a

4 files changed

Lines changed: 406 additions & 270 deletions

File tree

src/java/org/apache/commons/codec/language/RefinedSoundex.java

Lines changed: 126 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,8 @@
3030
* from this software without prior written permission. For written
3131
* permission, please contact apache@apache.org.
3232
*
33-
* 5. Products derived from this software may not be called "Apache",
34-
* "Apache" nor may "Apache" appear in their name without prior
33+
* 5. Products derived from this software may not be called "Apache"
34+
* nor may "Apache" appear in their name without prior
3535
* written permission of the Apache Software Foundation.
3636
*
3737
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
@@ -61,139 +61,168 @@
6161
import org.apache.commons.codec.StringEncoder;
6262

6363
/**
64-
* Encodes a string into a refined soundex value.
65-
* A refined soundex code is optimized for spell checking word.
66-
* "Soundex" method originally developed by Margaret Odell and
67-
* Robert Russell
64+
* Encodes a string into a refined soundex value. A refined soundex code is
65+
* optimized for spell checking words. "Soundex" method originally developed by
66+
* Margaret Odell and Robert Russell
6867
*
6968
* @author Tim O'Brien
70-
* @author ggregory@seagullsw.com
71-
* @version $Id: RefinedSoundex.java,v 1.12 2003/11/24 00:11:56 ggregory Exp $
69+
* @author Gary D. Gregory
70+
* @version $Id: RefinedSoundex.java,v 1.13 2003/12/11 23:44:11 ggregory Exp $
7271
*/
7372
public class RefinedSoundex implements StringEncoder {
7473

7574
/**
76-
* RefinedSoundex is *refined* for a number of
77-
* reasons one being that the mappings have been
78-
* altered. This implementation contains default
79-
* mappings for US English.
80-
*/
81-
public static final char[] US_ENGLISH_MAPPING =
82-
"01360240043788015936020505".toCharArray();
75+
* This static variable contains an instance of the RefinedSoundex using
76+
* the US_ENGLISH mapping.
77+
*/
78+
public static final RefinedSoundex US_ENGLISH = new RefinedSoundex();
8379

8480
/**
85-
* This static variable contains an instance of the
86-
* RefinedSoundex using the US_ENGLISH mapping.
87-
*/
88-
public static final RefinedSoundex US_ENGLISH = new RefinedSoundex();
89-
81+
* RefinedSoundex is *refined* for a number of reasons one being that the
82+
* mappings have been altered. This implementation contains default
83+
* mappings for US English.
84+
*/
85+
public static final char[] US_ENGLISH_MAPPING = "01360240043788015936020505".toCharArray();
86+
9087
/**
91-
* Every letter of the alphabet is "mapped" to a numerical
92-
* value. This char array holds the values to which each
93-
* letter is mapped. This implementation contains a default
94-
* map for US_ENGLISH
95-
*/
88+
* Every letter of the alphabet is "mapped" to a numerical value. This char
89+
* array holds the values to which each letter is mapped. This
90+
* implementation contains a default map for US_ENGLISH
91+
*/
9692
private char[] soundexMapping;
9793

9894
/**
99-
* Creates an instance of the RefinedSoundex object using the
100-
* default US English mapping.
101-
*/
95+
* Creates an instance of the RefinedSoundex object using the default US
96+
* English mapping.
97+
*/
10298
public RefinedSoundex() {
    // Delegate to the mapping constructor with the built-in US English table.
    this(RefinedSoundex.US_ENGLISH_MAPPING);
}
105101

106102
/**
107-
* Creates a refined soundex instance using a custom mapping. This
108-
* constructor can be used to customize the mapping, and/or possibly
109-
* provide an internationalized mapping for a non-Western character
110-
* set.
111-
*
112-
* @param mapping Mapping array to use when finding the corresponding
113-
* code for a given character
114-
*/
103+
* Creates a refined soundex instance using a custom mapping. This
104+
* constructor can be used to customize the mapping, and/or possibly
105+
* provide an internationalized mapping for a non-Western character set.
106+
*
107+
* @param mapping
108+
* Mapping array to use when finding the corresponding code for
109+
* a given character
110+
*/
115111
public RefinedSoundex(char[] mapping) {
    // Keep a private copy so that later changes to the caller's array
    // (including the public US_ENGLISH_MAPPING array) cannot alter the
    // codes this instance produces. A null mapping is stored as null,
    // exactly as before, so construction itself never fails.
    this.soundexMapping = (mapping == null) ? null : (char[]) mapping.clone();
}
118114

119115
/**
120-
* Retreives the Refined Soundex code for a given String object.
121-
*
122-
* @param str String to encode using the Refined Soundex algorithm
123-
* @return A soundex code for the String supplied
124-
*/
125-
public String soundex(String str) {
126-
if (null == str || str.length() == 0) { return str; }
127-
128-
StringBuffer sBuf = new StringBuffer();
129-
str = str.toUpperCase();
130-
131-
sBuf.append(str.charAt(0));
132-
133-
char last, current;
134-
last = '*';
135-
136-
for (int i = 0; i < str.length(); i++) {
137-
138-
current = getMappingCode(str.charAt(i));
139-
if (current == last) {
140-
continue;
141-
} else if (current != 0) {
142-
sBuf.append(current);
143-
}
144-
145-
last = current;
146-
147-
}
148-
149-
return sBuf.toString();
150-
}
151-
152-
/**
153-
* Encodes a String using the refined soundex algorithm.
154-
*
155-
* @param pString A String object to encode
156-
* @return A Soundex code corresponding to the String supplied
157-
*/
158-
public String encode(String pString) {
159-
return soundex(pString);
116+
* Returns the number of characters in the two encoded Strings that are the
117+
* same. This return value ranges from 0 to the length of the shortest
118+
* encoded String: 0 indicates little or no similarity, and 4 out of 4 (for
119+
* example) indicates strong similarity or identical values. For refined
120+
* Soundex, the return value can be greater than 4.
121+
*
122+
* @param s1
123+
* A String that will be encoded and compared.
124+
* @param s2
125+
* A String that will be encoded and compared.
126+
* @return The number of characters in the two encoded Strings that are the
127+
* same, from 0 to the length of the shortest encoded String.
128+
*
129+
* @see SoundexUtils#difference(StringEncoder,String,String)
130+
* @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
131+
* MS T-SQL DIFFERENCE</a>
132+
*
133+
* @throws EncoderException
134+
* if an error occurs encoding one of the strings
135+
*/
136+
public int difference(String s1, String s2) throws EncoderException {
137+
return SoundexUtils.difference(this, s1, s2);
160138
}
161139

162140
/**
163-
* Encodes an Object using the refined soundex algorithm. This method
164-
* is provided in order to satisfy the requirements of the
165-
* Encoder interface, and will throw an EncoderException if the
166-
* supplied object is not of type java.lang.String.
167-
*
168-
* @param pObject Object to encode
169-
* @return An object (or type java.lang.String) containing the
170-
* refined soundex code which corresponds to the String supplied.
171-
* @throws EncoderException if the parameter supplied is not
172-
* of type java.lang.String
173-
*/
141+
* Encodes an Object using the refined soundex algorithm. This method is
142+
* provided in order to satisfy the requirements of the Encoder interface,
143+
* and will throw an EncoderException if the supplied object is not of type
144+
* java.lang.String.
145+
*
146+
* @param pObject
147+
* Object to encode
148+
* @return An object (of type java.lang.String) containing the refined
149+
* soundex code which corresponds to the String supplied.
150+
* @throws EncoderException
151+
* if the parameter supplied is not of type java.lang.String
152+
*/
174153
public Object encode(Object pObject) throws EncoderException {
175154
Object result;
176155
if (!(pObject instanceof java.lang.String)) {
177-
throw new EncoderException("Parameter supplied to RefinedSoundex encode is not of type java.lang.String");
156+
throw new EncoderException("Parameter supplied to RefinedSoundex encode is not of type java.lang.String");
178157
} else {
179158
result = soundex((String) pObject);
180159
}
181160
return result;
182161
}
183162

184163
/**
185-
* Returns the mapping code for a given character. The mapping
186-
* codes are maintained in an internal char array named soundexMapping,
187-
* and the default values of these mappings are US English.
188-
*
189-
* @param c char to get mapping for
190-
* @return A character (really a numeral) to return for the given char
191-
*/
164+
* Encodes a String using the refined soundex algorithm.
165+
*
166+
* @param pString
167+
* A String object to encode
168+
* @return A Soundex code corresponding to the String supplied
169+
*/
170+
public String encode(String pString) {
171+
return soundex(pString);
172+
}
173+
174+
/**
175+
* Returns the mapping code for a given character. The mapping codes are
176+
* maintained in an internal char array named soundexMapping, and the
177+
* default values of these mappings are US English.
178+
*
179+
* @param c
180+
* char to get mapping for
181+
* @return A character (really a numeral) to return for the given char
182+
*/
192183
private char getMappingCode(char c) {
193184
if (!Character.isLetter(c)) {
194185
return 0;
195186
} else {
196-
return soundexMapping[Character.toUpperCase(c) - 'A'];
187+
return this.soundexMapping[Character.toUpperCase(c) - 'A'];
188+
}
189+
}
190+
191+
/**
192+
* Retreives the Refined Soundex code for a given String object.
193+
*
194+
* @param str
195+
* String to encode using the Refined Soundex algorithm
196+
* @return A soundex code for the String supplied
197+
*/
198+
public String soundex(String str) {
199+
if (str == null) {
200+
return null;
201+
}
202+
str = SoundexUtils.clean(str);
203+
if (str.length() == 0) {
204+
return str;
197205
}
206+
207+
StringBuffer sBuf = new StringBuffer();
208+
sBuf.append(str.charAt(0));
209+
210+
char last, current;
211+
last = '*';
212+
213+
for (int i = 0; i < str.length(); i++) {
214+
215+
current = getMappingCode(str.charAt(i));
216+
if (current == last) {
217+
continue;
218+
} else if (current != 0) {
219+
sBuf.append(current);
220+
}
221+
222+
last = current;
223+
224+
}
225+
226+
return sBuf.toString();
198227
}
199-
}
228+
}

0 commit comments

Comments
 (0)