3030 * from this software without prior written permission. For written
3131 * permission, please contact apache@apache.org.
3232 *
33- * 5. Products derived from this software may not be called "Apache",
34- * "Apache" nor may "Apache" appear in their name without prior
33+ * 5. Products derived from this software may not be called "Apache"
34+ * nor may "Apache" appear in their name without prior
3535 * written permission of the Apache Software Foundation.
3636 *
3737 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
6161import org .apache .commons .codec .StringEncoder ;
6262
6363/**
64- * Encodes a string into a refined soundex value.
65- * A refined soundex code is optimized for spell checking word.
66- * "Soundex" method originally developed by Margaret Odell and
67- * Robert Russell
64+ * Encodes a string into a refined soundex value. A refined soundex code is
65+ * optimized for spell checking words. "Soundex" method originally developed by
66+ * Margaret Odell and Robert Russell
6867 *
6968 * @author Tim O'Brien
70- * @author ggregory@seagullsw.com
71- * @version $Id: RefinedSoundex.java,v 1.12 2003/11/24 00:11:56 ggregory Exp $
69+ * @author Gary D. Gregory
70+ * @version $Id: RefinedSoundex.java,v 1.13 2003/12/11 23:44:11 ggregory Exp $
7271 */
7372public class RefinedSoundex implements StringEncoder {
7473
7574 /**
76- * RefinedSoundex is *refined* for a number of
77- * reasons one being that the mappings have been
78- * altered. This implementation contains default
79- * mappings for US English.
80- */
81- public static final char [] US_ENGLISH_MAPPING =
82- "01360240043788015936020505" .toCharArray ();
75+ * This static variable contains an instance of the RefinedSoundex using
76+ * the US_ENGLISH mapping.
77+ */
78+ public static final RefinedSoundex US_ENGLISH = new RefinedSoundex ();
8379
8480 /**
85- * This static variable contains an instance of the
86- * RefinedSoundex using the US_ENGLISH mapping.
87- */
88- public static final RefinedSoundex US_ENGLISH = new RefinedSoundex ();
89-
81+ * RefinedSoundex is *refined* for a number of reasons, one being that the
82+ * mappings have been altered. This implementation contains default
83+ * mappings for US English.
84+ */
85+ public static final char [] US_ENGLISH_MAPPING = "01360240043788015936020505" .toCharArray ();
86+
9087 /**
91- * Every letter of the alphabet is "mapped" to a numerical
92- * value. This char array holds the values to which each
93- * letter is mapped. This implementation contains a default
94- * map for US_ENGLISH
95- */
88+ * Every letter of the alphabet is "mapped" to a numerical value. This char
89+ * array holds the values to which each letter is mapped. This
90+ * implementation contains a default map for US_ENGLISH
91+ */
9692 private char [] soundexMapping ;
9793
9894 /**
99- * Creates an instance of the RefinedSoundex object using the
100- * default US English mapping.
101- */
95+ * Creates an instance of the RefinedSoundex object using the default US
96+ * English mapping.
97+ */
10298 public RefinedSoundex () {
10399 this (US_ENGLISH_MAPPING );
104100 }
105101
106102 /**
107- * Creates a refined soundex instance using a custom mapping. This
108- * constructor can be used to customize the mapping, and/or possibly
109- * provide an internationalized mapping for a non-Western character
110- * set.
111- *
112- * @param mapping Mapping array to use when finding the corresponding
113- * code for a given character
114- */
103+ * Creates a refined soundex instance using a custom mapping. This
104+ * constructor can be used to customize the mapping, and/or possibly
105+ * provide an internationalized mapping for a non-Western character set.
106+ *
107+ * @param mapping
108+ * Mapping array to use when finding the corresponding code for
109+ * a given character
110+ */
115111 public RefinedSoundex (char [] mapping ) {
116112 this .soundexMapping = mapping ;
117113 }
118114
119115 /**
120- * Retreives the Refined Soundex code for a given String object.
121- *
122- * @param str String to encode using the Refined Soundex algorithm
123- * @return A soundex code for the String supplied
124- */
125- public String soundex (String str ) {
126- if (null == str || str .length () == 0 ) { return str ; }
127-
128- StringBuffer sBuf = new StringBuffer ();
129- str = str .toUpperCase ();
130-
131- sBuf .append (str .charAt (0 ));
132-
133- char last , current ;
134- last = '*' ;
135-
136- for (int i = 0 ; i < str .length (); i ++) {
137-
138- current = getMappingCode (str .charAt (i ));
139- if (current == last ) {
140- continue ;
141- } else if (current != 0 ) {
142- sBuf .append (current );
143- }
144-
145- last = current ;
146-
147- }
148-
149- return sBuf .toString ();
150- }
151-
152- /**
153- * Encodes a String using the refined soundex algorithm.
154- *
155- * @param pString A String object to encode
156- * @return A Soundex code corresponding to the String supplied
157- */
158- public String encode (String pString ) {
159- return soundex (pString );
116+ * Returns the number of characters in the two encoded Strings that are the
117+ * same. This return value ranges from 0 to the length of the shortest
118+ * encoded String: 0 indicates little or no similarity, and 4 out of 4 (for
119+ * example) indicates strong similarity or identical values. For refined
120+ * Soundex, the return value can be greater than 4.
121+ *
122+ * @param s1
123+ * A String that will be encoded and compared.
124+ * @param s2
125+ * A String that will be encoded and compared.
126+ * @return The number of characters in the two encoded Strings that are the
127+ * same from 0 to the length of the shortest encoded String.
128+ *
129+ * @see SoundexUtils#difference(StringEncoder,String,String)
130+ * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
131+ * MS T-SQL DIFFERENCE</a>
132+ *
133+ * @throws EncoderException
134+ * if an error occurs encoding one of the strings
135+ */
136+ public int difference (String s1 , String s2 ) throws EncoderException {
137+ return SoundexUtils .difference (this , s1 , s2 );
160138 }
161139
162140 /**
163- * Encodes an Object using the refined soundex algorithm. This method
164- * is provided in order to satisfy the requirements of the
165- * Encoder interface, and will throw an EncoderException if the
166- * supplied object is not of type java.lang.String.
167- *
168- * @param pObject Object to encode
169- * @return An object (or type java.lang.String) containing the
170- * refined soundex code which corresponds to the String supplied.
171- * @throws EncoderException if the parameter supplied is not
172- * of type java.lang.String
173- */
141+ * Encodes an Object using the refined soundex algorithm. This method is
142+ * provided in order to satisfy the requirements of the Encoder interface,
143+ * and will throw an EncoderException if the supplied object is not of type
144+ * java.lang.String.
145+ *
146+ * @param pObject
147+ * Object to encode
148+ * @return An object (of type java.lang.String) containing the refined
149+ * soundex code which corresponds to the String supplied.
150+ * @throws EncoderException
151+ * if the parameter supplied is not of type java.lang.String
152+ */
174153 public Object encode (Object pObject ) throws EncoderException {
175154 Object result ;
176155 if (!(pObject instanceof java .lang .String )) {
177- throw new EncoderException ("Parameter supplied to RefinedSoundex encode is not of type java.lang.String" );
156+ throw new EncoderException ("Parameter supplied to RefinedSoundex encode is not of type java.lang.String" );
178157 } else {
179158 result = soundex ((String ) pObject );
180159 }
181160 return result ;
182161 }
183162
184163 /**
185- * Returns the mapping code for a given character. The mapping
186- * codes are maintained in an internal char array named soundexMapping,
187- * and the default values of these mappings are US English.
188- *
189- * @param c char to get mapping for
190- * @return A character (really a numeral) to return for the given char
191- */
164+ * Encodes a String using the refined soundex algorithm.
165+ *
166+ * @param pString
167+ * A String object to encode
168+ * @return A Soundex code corresponding to the String supplied
169+ */
170+ public String encode (String pString ) {
171+ return soundex (pString );
172+ }
173+
174+ /**
175+ * Returns the mapping code for a given character. The mapping codes are
176+ * maintained in an internal char array named soundexMapping, and the
177+ * default values of these mappings are US English.
178+ *
179+ * @param c
180+ * char to get mapping for
181+ * @return A character (really a numeral) to return for the given char
182+ */
192183 private char getMappingCode (char c ) {
193184 if (!Character .isLetter (c )) {
194185 return 0 ;
195186 } else {
196- return soundexMapping [Character .toUpperCase (c ) - 'A' ];
187+ return this .soundexMapping [Character .toUpperCase (c ) - 'A' ];
188+ }
189+ }
190+
191+ /**
192+ * Retrieves the Refined Soundex code for a given String object.
193+ *
194+ * @param str
195+ * String to encode using the Refined Soundex algorithm
196+ * @return A soundex code for the String supplied
197+ */
198+ public String soundex (String str ) {
199+ if (str == null ) {
200+ return null ;
201+ }
202+ str = SoundexUtils .clean (str );
203+ if (str .length () == 0 ) {
204+ return str ;
197205 }
206+
207+ StringBuffer sBuf = new StringBuffer ();
208+ sBuf .append (str .charAt (0 ));
209+
210+ char last , current ;
211+ last = '*' ;
212+
213+ for (int i = 0 ; i < str .length (); i ++) {
214+
215+ current = getMappingCode (str .charAt (i ));
216+ if (current == last ) {
217+ continue ;
218+ } else if (current != 0 ) {
219+ sBuf .append (current );
220+ }
221+
222+ last = current ;
223+
224+ }
225+
226+ return sBuf .toString ();
198227 }
199- }
228+ }
0 commit comments