Skip to content

Commit 6b0e453

Browse files
committed
1 parent 9da7d2a commit 6b0e453

3 files changed

Lines changed: 455 additions & 224 deletions

File tree

src/java/org/apache/commons/codec/language/Soundex.java

Lines changed: 173 additions & 126 deletions
Original file line numberDiff line numberDiff line change
@@ -2,142 +2,150 @@
22
* ====================================================================
33
*
44
* The Apache Software License, Version 1.1
5-
*
6-
* Copyright (c) 2001-2003 The Apache Software Foundation. All rights
7-
* reserved.
8-
*
5+
*
6+
* Copyright (c) 2001-2003 The Apache Software Foundation. All rights reserved.
7+
*
98
* Redistribution and use in source and binary forms, with or without
10-
* modification, are permitted provided that the following conditions
11-
* are met:
12-
*
13-
* 1. Redistributions of source code must retain the above copyright
14-
* notice, this list of conditions and the following disclaimer.
15-
*
16-
* 2. Redistributions in binary form must reproduce the above copyright
17-
* notice, this list of conditions and the following disclaimer in
18-
* the documentation and/or other materials provided with the
19-
* distribution.
20-
*
21-
* 3. The end-user documentation included with the redistribution,
22-
* if any, must include the following acknowledgement:
23-
* "This product includes software developed by the
24-
* Apache Software Foundation (http://www.apache.org/)."
25-
* Alternately, this acknowledgement may appear in the software itself,
26-
* if and wherever such third-party acknowledgements normally appear.
27-
*
28-
* 4. The names "Apache", "The Jakarta Project", "Commons", and "Apache Software
29-
* Foundation" must not be used to endorse or promote products derived
30-
* from this software without prior written permission. For written
31-
* permission, please contact apache@apache.org.
32-
*
33-
* 5. Products derived from this software may not be called "Apache",
34-
* "Apache" nor may "Apache" appear in their name without prior
35-
* written permission of the Apache Software Foundation.
36-
*
37-
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
38-
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
39-
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
40-
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
41-
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42-
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43-
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
44-
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
45-
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
46-
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
47-
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48-
* SUCH DAMAGE.
9+
* modification, are permitted provided that the following conditions are met: 1.
10+
* Redistributions of source code must retain the above copyright notice, this
11+
* list of conditions and the following disclaimer. 2. Redistributions in
12+
* binary form must reproduce the above copyright notice, this list of
13+
* conditions and the following disclaimer in the documentation and/or other
14+
* materials provided with the distribution. 3. The end-user documentation
15+
* included with the redistribution, if any, must include the following
16+
* acknowledgement: "This product includes software developed by the Apache
17+
* Software Foundation (http://www.apache.org/)." Alternately, this
18+
* acknowledgement may appear in the software itself, if and wherever such
19+
* third-party acknowledgements normally appear. 4. The names "Apache", "The
20+
* Jakarta Project", "Commons", and "Apache Software Foundation" must not be
21+
* used to endorse or promote products derived from this software without prior
22+
* written permission. For written permission, please contact
23+
* apache@apache.org. 5. Products derived from this software may not be called
24+
* "Apache", "Apache" nor may "Apache" appear in their name without prior
25+
* written permission of the Apache Software Foundation.
26+
*
27+
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES,
28+
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
29+
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
30+
* APACHE SOFTWARE FOUNDATION OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
31+
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
32+
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
33+
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
34+
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
35+
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
36+
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4937
* ====================================================================
50-
*
51-
* This software consists of voluntary contributions made by many
52-
* individuals on behalf of the Apache Software Foundation. For more
53-
* information on the Apache Software Foundation, please see
54-
* <http://www.apache.org/>.
55-
*
56-
*/
38+
*
39+
* This software consists of voluntary contributions made by many individuals
40+
* on behalf of the Apache Software Foundation. For more information on the
41+
* Apache Software Foundation, please see <http://www.apache.org/> .
42+
*
43+
*/
5744

5845
package org.apache.commons.codec.language;
5946

6047
import org.apache.commons.codec.EncoderException;
6148
import org.apache.commons.codec.StringEncoder;
6249

6350
/**
64-
* Encodes a string into a soundex value. Soundex is an encoding used to
65-
* relate similar names, but can also be used as a general purpose
66-
* scheme to find word with similar phonemes.
51+
* Encodes a string into a soundex value. Soundex is an encoding used to relate
52+
* similar names, but can also be used as a general purpose scheme to find words
53+
* with similar phonemes.
6754
*
6855
* @author bayard@generationjava.com
6956
* @author Tim O'Brien
7057
* @author Gary Gregory
71-
* @version $Id: Soundex.java,v 1.10 2003/11/04 02:43:09 ggregory Exp $
58+
* @version $Id: Soundex.java,v 1.11 2003/11/06 16:31:47 ggregory Exp $
7259
*/
7360
public class Soundex implements StringEncoder {
7461

7562
/**
76-
* This static variable contains an instance of the
77-
* Soundex using the US_ENGLISH mapping.
78-
*/
63+
* This static variable contains an instance of the Soundex using the
64+
* US_ENGLISH mapping.
65+
*/
7966
public static final Soundex US_ENGLISH = new Soundex();
8067

8168
/**
82-
* This is a default mapping of the 26 letters used
83-
* in US english.
84-
*/
85-
public static final char[] US_ENGLISH_MAPPING =
86-
"01230120022455012623010202".toCharArray();
69+
* This is a default mapping of the 26 letters used in US english.
70+
* A value of <code>0</code> for a letter position means do not encode.
71+
*/
72+
public static final char[] US_ENGLISH_MAPPING = "01230120022455012623010202".toCharArray();
8773

8874
/**
89-
* The maximum length of a Soundex code - Soundex codes are
90-
* only four characters by definition.
91-
*/
75+
* The maximum length of a Soundex code - Soundex codes are only four
76+
* characters by definition.
77+
*/
9278
private int maxLength = 4;
93-
79+
9480
/**
95-
* Every letter of the alphabet is "mapped" to a numerical
96-
* value. This char array holds the values to which each
97-
* letter is mapped. This implementation contains a default
98-
* map for US_ENGLISH
99-
*/
81+
* Every letter of the alphabet is "mapped" to a numerical value. This char
82+
* array holds the values to which each letter is mapped. This
83+
* implementation contains a default map for US_ENGLISH
84+
*/
10085
private char[] soundexMapping;
10186

10287
/**
103-
* Creates an instance of the Soundex object using the default
104-
* US_ENGLISH mapping.
105-
*/
88+
* Creates an instance of the Soundex object using the default US_ENGLISH
89+
* mapping.
90+
*/
10691
public Soundex() {
10792
this(US_ENGLISH_MAPPING);
10893
}
10994

11095
/**
111-
* Creates a soundex instance using a custom mapping. This
112-
* constructor can be used to customize the mapping, and/or possibly
113-
* provide an internationalized mapping for a non-Western character
114-
* set.
115-
*
116-
* @param mapping Mapping array to use when finding the corresponding
117-
* code for a given character
118-
*/
96+
* Creates a soundex instance using a custom mapping. This constructor can
97+
* be used to customize the mapping, and/or possibly provide an
98+
* internationalized mapping for a non-Western character set.
99+
*
100+
* @param mapping
101+
* Mapping array to use when finding the corresponding code for
102+
* a given character
103+
*/
119104
public Soundex(char[] mapping) {
120105
this.setSoundexMapping(mapping);
121106
}
122107

123108
/**
124-
* Encodes an Object using the soundex algorithm. This method
125-
* is provided in order to satisfy the requirements of the
126-
* Encoder interface, and will throw an EncoderException if the
127-
* supplied object is not of type java.lang.String.
128-
*
129-
* @param pObject Object to encode
130-
* @return An object (or type java.lang.String) containing the
131-
* soundex code which corresponds to the String supplied.
132-
* @throws EncoderException if the parameter supplied is not
133-
* of type java.lang.String
134-
*/
109+
* Cleans up the input string before Soundex processing by removing every
110+
* character that is not a letter. The string is returned in upper-case.
111+
*/
112+
private String clean(String str) {
113+
if (str == null || str.length() == 0) {
114+
return str;
115+
}
116+
int len = str.length();
117+
char[] chars = new char[len];
118+
int count = 0;
119+
for (int i = 0; i < len; i++) {
120+
if (Character.isLetter(str.charAt(i))) {
121+
chars[count++] = str.charAt(i);
122+
}
123+
}
124+
if (count == len) {
125+
return str.toUpperCase();
126+
}
127+
return new String(chars, 0, count).toUpperCase();
128+
}
129+
130+
/**
131+
* Encodes an Object using the soundex algorithm. This method is provided
132+
* in order to satisfy the requirements of the Encoder interface, and will
133+
* throw an EncoderException if the supplied object is not of type
134+
* java.lang.String.
135+
*
136+
* @param pObject
137+
* Object to encode
138+
* @return An object (of type java.lang.String) containing the soundex code
139+
* which corresponds to the String supplied.
140+
* @throws EncoderException
141+
* if the parameter supplied is not of type java.lang.String
142+
*/
135143
public Object encode(Object pObject) throws EncoderException {
136144

137145
Object result;
138146

139147
if (!(pObject instanceof java.lang.String)) {
140-
throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String");
148+
throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String");
141149
} else {
142150
result = soundex((String) pObject);
143151
}
@@ -147,79 +155,118 @@ public Object encode(Object pObject) throws EncoderException {
147155
}
148156

149157
/**
150-
* Encodes a String using the soundex algorithm.
151-
*
152-
* @param pString A String object to encode
153-
* @return A Soundex code corresponding to the String supplied
154-
*/
158+
* Encodes a String using the soundex algorithm.
159+
*
160+
* @param pString
161+
* A String object to encode
162+
* @return A Soundex code corresponding to the String supplied
163+
*/
155164
public String encode(String pString) {
156-
return (soundex(pString));
165+
return soundex(pString);
157166
}
158167

159168
/**
160-
* Used internally by the SoundEx algorithm.
161-
*
162-
* @param c character to use to retrieve mapping code
163-
* @return Mapping code for a particular character
164-
*/
165-
private char getMappingCode(char c) {
169+
* Used internally by the SoundEx algorithm.
170+
*
171+
* Consonants from the same code group separated by W or H are treated as one.
172+
*
173+
* @param str
174+
* the whole string
175+
* @param index
176+
* the character position to encode
177+
* @return Mapping code for a particular character
178+
*/
179+
private char getMappingCode(String str, int index) {
180+
char c = str.charAt(index);
166181
if (!Character.isLetter(c)) {
167182
return 0;
168183
} else {
169-
return this.getSoundexMapping()[Character.toUpperCase(c) - 'A'];
184+
char mappedChar = this.map(c);
185+
// HW rule check
186+
if (index > 1 && mappedChar != '0') {
187+
char hwChar = str.charAt(index-1);
188+
if ('H' == hwChar || 'W' == hwChar) {
189+
char preHWChar = str.charAt(index - 2);
190+
char firstCode = this.map(preHWChar);
191+
if (firstCode == mappedChar || 'H' == preHWChar || 'W' == preHWChar) {
192+
return 0;
193+
}
194+
}
195+
}
196+
return mappedChar;
170197
}
171198
}
172199

173200
/**
174-
* Returns the maxLength. Standard Soundex
175-
* @return int
176-
*/
201+
* Returns the maxLength. Standard Soundex codes are four characters long.
202+
*
203+
* @return int
204+
*/
177205
public int getMaxLength() {
178206
return this.maxLength;
179207
}
180208

181209
/**
182-
* @return Returns the soundexMapping.
183-
*/
210+
* @return Returns the soundexMapping.
211+
*/
184212
private char[] getSoundexMapping() {
185213
return this.soundexMapping;
186214
}
187215

188216
/**
189-
* Sets the maxLength.
190-
* @param maxLength The maxLength to set
217+
* Maps the given upper-case character to its Soundex code.
191218
*/
219+
private char map(char c) {
220+
return this.getSoundexMapping()[c - 'A'];
221+
}
222+
223+
/**
224+
* Sets the maxLength.
225+
*
226+
* @param maxLength
227+
* The maxLength to set
228+
*/
192229
public void setMaxLength(int maxLength) {
193230
this.maxLength = maxLength;
194231
}
195232

196233
/**
197-
* @param soundexMapping The soundexMapping to set.
198-
*/
234+
* @param soundexMapping
235+
* The soundexMapping to set.
236+
*/
199237
private void setSoundexMapping(char[] soundexMapping) {
200238
this.soundexMapping = soundexMapping;
201239
}
202240

203241
/**
204-
* Retreives the Soundex code for a given String object.
205-
*
206-
* @param str String to encode using the Soundex algorithm
207-
* @return A soundex code for the String supplied
208-
*/
242+
* Retrieves the Soundex code for a given String object.
243+
*
244+
* @param str
245+
* String to encode using the Soundex algorithm
246+
* @return A soundex code for the String supplied
247+
*/
209248
public String soundex(String str) {
210-
if (null == str || str.length() == 0) { return str; }
211-
249+
if (str == null) {
250+
return null;
251+
}
252+
str = this.clean(str);
253+
if (str.length() == 0) {
254+
return str;
255+
}
256+
212257
char out[] = { '0', '0', '0', '0' };
213258
char last, mapped;
214259
int incount = 1, count = 1;
215-
out[0] = Character.toUpperCase(str.charAt(0));
216-
last = getMappingCode(str.charAt(0));
217-
while ((incount < str.length()) && (mapped = getMappingCode(str.charAt(incount++))) != 0 && (count < this.getMaxLength())) {
260+
out[0] = str.charAt(0);
261+
last = getMappingCode(str, 0);
262+
while ((incount < str.length()) && (count < this.getMaxLength())) {
263+
if ((mapped = getMappingCode(str, incount++)) != 0) {
218264
if ((mapped != '0') && (mapped != last)) {
219265
out[count++] = mapped;
220266
}
221267
last = mapped;
222268
}
269+
}
223270
return new String(out);
224271
}
225272

0 commit comments

Comments
 (0)