22 * ====================================================================
33 *
44 * The Apache Software License, Version 1.1
5- *
6- * Copyright (c) 2001-2003 The Apache Software Foundation. All rights
7- * reserved.
8- *
5+ *
6+ * Copyright (c) 2001-2003 The Apache Software Foundation. All rights reserved.
7+ *
98 * Redistribution and use in source and binary forms, with or without
10- * modification, are permitted provided that the following conditions
11- * are met:
12- *
13- * 1. Redistributions of source code must retain the above copyright
14- * notice, this list of conditions and the following disclaimer.
15- *
16- * 2. Redistributions in binary form must reproduce the above copyright
17- * notice, this list of conditions and the following disclaimer in
18- * the documentation and/or other materials provided with the
19- * distribution.
20- *
21- * 3. The end-user documentation included with the redistribution,
22- * if any, must include the following acknowledgement:
23- * "This product includes software developed by the
24- * Apache Software Foundation (http://www.apache.org/)."
25- * Alternately, this acknowledgement may appear in the software itself,
26- * if and wherever such third-party acknowledgements normally appear.
27- *
28- * 4. The names "Apache", "The Jakarta Project", "Commons", and "Apache Software
29- * Foundation" must not be used to endorse or promote products derived
30- * from this software without prior written permission. For written
31- * permission, please contact apache@apache.org.
32- *
33- * 5. Products derived from this software may not be called "Apache",
34- * "Apache" nor may "Apache" appear in their name without prior
35- * written permission of the Apache Software Foundation.
36- *
37- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
38- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
39- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
40- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
41- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
44- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
45- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
46- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
47- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48- * SUCH DAMAGE.
9+ * modification, are permitted provided that the following conditions are met: 1.
10+ * Redistributions of source code must retain the above copyright notice, this
11+ * list of conditions and the following disclaimer. 2. Redistributions in
12+ * binary form must reproduce the above copyright notice, this list of
13+ * conditions and the following disclaimer in the documentation and/or other
14+ * materials provided with the distribution. 3. The end-user documentation
15+ * included with the redistribution, if any, must include the following
16+ * acknowledgement: "This product includes software developed by the Apache
17+ * Software Foundation (http://www.apache.org/)." Alternately, this
18+ * acknowledgement may appear in the software itself, if and wherever such
19+ * third-party acknowledgements normally appear. 4. The names "Apache", "The
20+ * Jakarta Project", "Commons", and "Apache Software Foundation" must not be
21+ * used to endorse or promote products derived from this software without prior
22+ * written permission. For written permission, please contact
23+ * apache@apache.org. 5. Products derived from this software may not be called
24+ * "Apache", "Apache" nor may "Apache" appear in their name without prior
25+ * written permission of the Apache Software Foundation.
26+ *
27+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES,
28+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
29+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
30+ * APACHE SOFTWARE FOUNDATION OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
31+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
32+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
33+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
34+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
35+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
36+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4937 * ====================================================================
50- *
51- * This software consists of voluntary contributions made by many
52- * individuals on behalf of the Apache Software Foundation. For more
53- * information on the Apache Software Foundation, please see
54- * <http://www.apache.org/>.
55- *
56- */
38+ *
39+ * This software consists of voluntary contributions made by many individuals
40+ * on behalf of the Apache Software Foundation. For more information on the
41+ * Apache Software Foundation, please see <http://www.apache.org/> .
42+ *
43+ */
5744
5845package org .apache .commons .codec .language ;
5946
6047import org .apache .commons .codec .EncoderException ;
6148import org .apache .commons .codec .StringEncoder ;
6249
6350/**
64- * Encodes a string into a soundex value. Soundex is an encoding used to
65- * relate similar names, but can also be used as a general purpose
66- * scheme to find word with similar phonemes.
51+ * Encodes a string into a soundex value. Soundex is an encoding used to relate
52+ * similar names, but can also be used as a general purpose scheme to find words
53+ * with similar phonemes.
6754 *
6855 * @author bayard@generationjava.com
6956 * @author Tim O'Brien
7057 * @author Gary Gregory
71- * @version $Id: Soundex.java,v 1.10 2003/11/04 02:43:09 ggregory Exp $
58+ * @version $Id: Soundex.java,v 1.11 2003/11/06 16:31:47 ggregory Exp $
7259 */
7360public class Soundex implements StringEncoder {
7461
7562 /**
76- * This static variable contains an instance of the
77- * Soundex using the US_ENGLISH mapping.
78- */
63+ * This static variable contains an instance of the Soundex using the
64+ * US_ENGLISH mapping.
65+ */
7966 public static final Soundex US_ENGLISH = new Soundex ();
8067
8168 /**
82- * This is a default mapping of the 26 letters used
83- * in US english.
84- */
85- public static final char [] US_ENGLISH_MAPPING =
86- "01230120022455012623010202" .toCharArray ();
69+ * This is a default mapping of the 26 letters used in US english.
70+ * A value of <code>0</code> for a letter position means do not encode.
71+ */
72+ public static final char [] US_ENGLISH_MAPPING = "01230120022455012623010202" .toCharArray ();
8773
8874 /**
89- * The maximum length of a Soundex code - Soundex codes are
90- * only four characters by definition.
91- */
75+ * The maximum length of a Soundex code - Soundex codes are only four
76+ * characters by definition.
77+ */
9278 private int maxLength = 4 ;
93-
79+
9480 /**
95- * Every letter of the alphabet is "mapped" to a numerical
96- * value. This char array holds the values to which each
97- * letter is mapped. This implementation contains a default
98- * map for US_ENGLISH
99- */
81+ * Every letter of the alphabet is "mapped" to a numerical value. This char
82+ * array holds the values to which each letter is mapped. This
83+ * implementation contains a default map for US_ENGLISH
84+ */
10085 private char [] soundexMapping ;
10186
10287 /**
103- * Creates an instance of the Soundex object using the default
104- * US_ENGLISH mapping.
105- */
88+ * Creates an instance of the Soundex object using the default US_ENGLISH
89+ * mapping.
90+ */
public Soundex() {
    // Delegate to the mapping-based constructor, passing the built-in
    // US_ENGLISH_MAPPING table.
    this(US_ENGLISH_MAPPING);
}
10994
11095 /**
111- * Creates a soundex instance using a custom mapping. This
112- * constructor can be used to customize the mapping, and/or possibly
113- * provide an internationalized mapping for a non-Western character
114- * set.
115- *
116- * @param mapping Mapping array to use when finding the corresponding
117- * code for a given character
118- */
96+ * Creates a soundex instance using a custom mapping. This constructor can
97+ * be used to customize the mapping, and/or possibly provide an
98+ * internationalized mapping for a non-Western character set.
99+ *
100+ * @param mapping
101+ * Mapping array to use when finding the corresponding code for
102+ * a given character
103+ */
public Soundex(char[] mapping) {
    // Store the caller's array as-is (no copy is made); it is consulted
    // later whenever a letter is mapped to its code.
    this.soundexMapping = mapping;
}
122107
123108 /**
124- * Encodes an Object using the soundex algorithm. This method
125- * is provided in order to satisfy the requirements of the
126- * Encoder interface, and will throw an EncoderException if the
127- * supplied object is not of type java.lang.String.
128- *
129- * @param pObject Object to encode
130- * @return An object (or type java.lang.String) containing the
131- * soundex code which corresponds to the String supplied.
132- * @throws EncoderException if the parameter supplied is not
133- * of type java.lang.String
134- */
109+ * Cleans up the input string before Soundex processing by removing every
110+ * non-letter character. The string is returned in upper-case.
111+ */
112+ private String clean (String str ) {
113+ if (str == null || str .length () == 0 ) {
114+ return str ;
115+ }
116+ int len = str .length ();
117+ char [] chars = new char [len ];
118+ int count = 0 ;
119+ for (int i = 0 ; i < len ; i ++) {
120+ if (Character .isLetter (str .charAt (i ))) {
121+ chars [count ++] = str .charAt (i );
122+ }
123+ }
124+ if (count == len ) {
125+ return str .toUpperCase ();
126+ }
127+ return new String (chars , 0 , count ).toUpperCase ();
128+ }
129+
130+ /**
131+ * Encodes an Object using the soundex algorithm. This method is provided
132+ * in order to satisfy the requirements of the Encoder interface, and will
133+ * throw an EncoderException if the supplied object is not of type
134+ * java.lang.String.
135+ *
136+ * @param pObject
137+ * Object to encode
138+ * @return An object (of type java.lang.String) containing the soundex code
139+ * which corresponds to the String supplied.
140+ * @throws EncoderException
141+ * if the parameter supplied is not of type java.lang.String
142+ */
135143 public Object encode (Object pObject ) throws EncoderException {
136144
137145 Object result ;
138146
139147 if (!(pObject instanceof java .lang .String )) {
140- throw new EncoderException ("Parameter supplied to Soundex encode is not of type java.lang.String" );
148+ throw new EncoderException ("Parameter supplied to Soundex encode is not of type java.lang.String" );
141149 } else {
142150 result = soundex ((String ) pObject );
143151 }
@@ -147,79 +155,118 @@ public Object encode(Object pObject) throws EncoderException {
147155 }
148156
149157 /**
150- * Encodes a String using the soundex algorithm.
151- *
152- * @param pString A String object to encode
153- * @return A Soundex code corresponding to the String supplied
154- */
158+ * Encodes a String using the soundex algorithm.
159+ *
160+ * @param pString
161+ * A String object to encode
162+ * @return A Soundex code corresponding to the String supplied
163+ */
155164 public String encode (String pString ) {
156- return ( soundex (pString ));
165+ return soundex (pString );
157166 }
158167
159168 /**
160- * Used internally by the SoundEx algorithm.
161- *
162- * @param c character to use to retrieve mapping code
163- * @return Mapping code for a particular character
164- */
165- private char getMappingCode (char c ) {
169+ * Used internally by the SoundEx algorithm.
170+ *
171+ * Consonants from the same code group separated by W or H are treated as one.
172+ *
173+ * @param str
174+ * the whole string
175+ * @param index
176+ * the character position to encode
177+ * @return Mapping code for a particular character
178+ */
179+ private char getMappingCode (String str , int index ) {
180+ char c = str .charAt (index );
166181 if (!Character .isLetter (c )) {
167182 return 0 ;
168183 } else {
169- return this .getSoundexMapping ()[Character .toUpperCase (c ) - 'A' ];
184+ char mappedChar = this .map (c );
185+ // HW rule check
186+ if (index > 1 && mappedChar != '0' ) {
187+ char hwChar = str .charAt (index -1 );
188+ if ('H' == hwChar || 'W' == hwChar ) {
189+ char preHWChar = str .charAt (index - 2 );
190+ char firstCode = this .map (preHWChar );
191+ if (firstCode == mappedChar || 'H' == preHWChar || 'W' == preHWChar ) {
192+ return 0 ;
193+ }
194+ }
195+ }
196+ return mappedChar ;
170197 }
171198 }
172199
173200 /**
174- * Returns the maxLength. Standard Soundex
175- * @return int
176- */
201+ * Returns the maxLength. Standard Soundex codes are four characters long.
202+ *
203+ * @return int
204+ */
177205 public int getMaxLength () {
178206 return this .maxLength ;
179207 }
180208
181209 /**
182- * @return Returns the soundexMapping.
183- */
210+ * @return Returns the soundexMapping.
211+ */
184212 private char [] getSoundexMapping () {
185213 return this .soundexMapping ;
186214 }
187215
188216 /**
189- * Sets the maxLength.
190- * @param maxLength The maxLength to set
217+ * Maps the given upper-case character to its Soundex code.
191218 */
219+ private char map (char c ) {
220+ return this .getSoundexMapping ()[c - 'A' ];
221+ }
222+
223+ /**
224+ * Sets the maxLength.
225+ *
226+ * @param maxLength
227+ * The maxLength to set
228+ */
192229 public void setMaxLength (int maxLength ) {
193230 this .maxLength = maxLength ;
194231 }
195232
196233 /**
197- * @param soundexMapping The soundexMapping to set.
198- */
234+ * @param soundexMapping
235+ * The soundexMapping to set.
236+ */
199237 private void setSoundexMapping (char [] soundexMapping ) {
200238 this .soundexMapping = soundexMapping ;
201239 }
202240
203241 /**
204- * Retreives the Soundex code for a given String object.
205- *
206- * @param str String to encode using the Soundex algorithm
207- * @return A soundex code for the String supplied
208- */
242+ * Retrieves the Soundex code for a given String object.
243+ *
244+ * @param str
245+ * String to encode using the Soundex algorithm
246+ * @return A soundex code for the String supplied
247+ */
209248 public String soundex (String str ) {
210- if (null == str || str .length () == 0 ) { return str ; }
211-
249+ if (str == null ) {
250+ return null ;
251+ }
252+ str = this .clean (str );
253+ if (str .length () == 0 ) {
254+ return str ;
255+ }
256+
212257 char out [] = { '0' , '0' , '0' , '0' };
213258 char last , mapped ;
214259 int incount = 1 , count = 1 ;
215- out [0 ] = Character .toUpperCase (str .charAt (0 ));
216- last = getMappingCode (str .charAt (0 ));
217- while ((incount < str .length ()) && (mapped = getMappingCode (str .charAt (incount ++))) != 0 && (count < this .getMaxLength ())) {
260+ out [0 ] = str .charAt (0 );
261+ last = getMappingCode (str , 0 );
262+ while ((incount < str .length ()) && (count < this .getMaxLength ())) {
263+ if ((mapped = getMappingCode (str , incount ++)) != 0 ) {
218264 if ((mapped != '0' ) && (mapped != last )) {
219265 out [count ++] = mapped ;
220266 }
221267 last = mapped ;
222268 }
269+ }
223270 return new String (out );
224271 }
225272
0 commit comments