3131 */
3232public class Soundex implements StringEncoder {
3333
34+ /**
35+ * The marker character used to indicate a silent (ignored) character.
36+ * These are ignored except when they appear as the first character.
37+ * <p>
38+ * Note: the {@link #US_ENGLISH_MAPPING_STRING} does not use this mechanism
39+ * because changing it might break existing code. Mappings that don't contain
40+ * a silent marker code are treated as though H and W are silent.
41+ * <p>
42+ * To override this, use the {@link #Soundex(String, boolean)} constructor.
43+ * @since 1.11
44+ */
45+ public static final char SILENT_MARKER = '-' ;
46+
3447 /**
3548 * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position
36- * means do not encode.
49+ * means do not encode, but treat as a separator when it occurs between consonants with the same code.
3750 * <p>
3851 * (This constant is provided as both an implementation convenience and to allow Javadoc to pick
3952 * up the value for the constant values page.)
40- * </p>
41- *
53+ * <p>
54+ * <b>Note that letters H and W are treated specially.</b>
55+ * They are ignored (after the first letter) and don't act as separators
56+ * between consonants with the same code.
4257 * @see #US_ENGLISH_MAPPING
4358 */
59+ // ABCDEFGHIJKLMNOPQRSTUVWXYZ
4460 public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202" ;
4561
4662 /**
@@ -53,11 +69,44 @@ public class Soundex implements StringEncoder {
5369
5470 /**
5571 * An instance of Soundex using the US_ENGLISH_MAPPING mapping.
72+ * This treats H and W as silent letters.
73+ * Apart from when they appear as the first letter, they are ignored.
74+ * They don't act as separators between duplicate codes.
5675 *
5776 * @see #US_ENGLISH_MAPPING
77+ * @see #US_ENGLISH_MAPPING_STRING
5878 */
5979 public static final Soundex US_ENGLISH = new Soundex ();
6080
81+ /**
82+ * An instance of Soundex using the Simplified Soundex mapping, as described here:
83+ * http://west-penwith.org.uk/misc/soundex.htm
84+ * <p>
85+ * This treats H and W the same as vowels (AEIOUY).
86+ * Such letters aren't encoded (after the first), but they do
87+ * act as separators when dropping duplicate codes.
88+ * The mapping is otherwise the same as for {@link #US_ENGLISH}.
89+ *
90+ * @since 1.11
91+ */
92+ public static final Soundex US_ENGLISH_SIMPLIFIED = new Soundex (US_ENGLISH_MAPPING_STRING , false );
93+
94+ /**
95+ * An instance of Soundex using the mapping as per the Genealogy site:
96+ * http://www.genealogy.com/articles/research/00000060.html
97+ * <p>
98+ * This treats vowels (AEIOUY), H and W as silent letters.
99+ * Such letters are ignored (after the first) and do not
100+ * act as separators when dropping duplicate codes.
101+ * <p>
102+ * The codes for consonants are otherwise the same as for
103+ * {@link #US_ENGLISH_MAPPING_STRING} and {@link #US_ENGLISH_SIMPLIFIED}
104+ *
105+ * @since 1.11
106+ */
107+ public static final Soundex US_ENGLISH_GENEALOGY = new Soundex ("-123-12--22455-12623-1-2-2" );
108+ // ABCDEFGHIJKLMNOPQRSTUVWXYZ
109+
61110 /**
62111 * The maximum length of a Soundex code - Soundex codes are only four characters by definition.
63112 *
@@ -72,6 +121,15 @@ public class Soundex implements StringEncoder {
72121 */
73122 private final char [] soundexMapping ;
74123
124+ /**
125+ * Should H and W be treated specially?
126+ * <p>
127+ * In versions of the code prior to 1.11,
128+ * the code always treated H and W as silent (ignored) letters.
129+ * If this field is false, H and W are no longer special-cased.
130+ */
131+ private final boolean specialCaseHW ;
132+
75133 /**
76134 * Creates an instance using US_ENGLISH_MAPPING
77135 *
@@ -80,6 +138,7 @@ public class Soundex implements StringEncoder {
80138 */
public Soundex() {
    // Default instance: H and W are special-cased, i.e. skipped after the first letter.
    this.specialCaseHW = true;
    // Uses the shared US English table directly; no per-instance copy is made here.
    this.soundexMapping = US_ENGLISH_MAPPING;
}
84143
85144 /**
@@ -88,25 +147,54 @@ public Soundex() {
88147 *
89148 * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
90149 * letter is mapped. This implementation contains a default map for US_ENGLISH
150+ * <p>
151+ * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment.
91152 *
92153 * @param mapping
93154 * Mapping array to use when finding the corresponding code for a given character
94155 */
95156 public Soundex (final char [] mapping ) {
96157 this .soundexMapping = new char [mapping .length ];
97158 System .arraycopy (mapping , 0 , this .soundexMapping , 0 , mapping .length );
159+ this .specialCaseHW = !hasMarker (this .soundexMapping );
160+ }
161+
162+ private boolean hasMarker (char [] mapping ) {
163+ for (char ch : mapping ) {
164+ if (ch == SILENT_MARKER ) {
165+ return true ;
166+ }
167+ }
168+ return false ;
98169 }
99170
100171 /**
101172 * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping,
102173 * and/or possibly provide an internationalized mapping for a non-Western character set.
174+ * <p>
175+ * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment
103176 *
104177 * @param mapping
105178 * Mapping string to use when finding the corresponding code for a given character
106179 * @since 1.4
107180 */
108181 public Soundex (final String mapping ) {
109182 this .soundexMapping = mapping .toCharArray ();
183+ this .specialCaseHW = !hasMarker (this .soundexMapping );
184+ }
185+
/**
 * Creates a Soundex instance using a custom mapping and an explicit choice of how H and W are handled.
 * This constructor can be used to customize the mapping, and/or possibly provide an internationalized
 * mapping for a non-Western character set.
 * <p>
 * Unlike the single-argument constructors, the mapping is not scanned for {@link #SILENT_MARKER};
 * the caller's flag is used as given.
 *
 * @param mapping
 *            Mapping string to use when finding the corresponding code for a given character
 * @param specialCaseHW
 *            if true, then H and W are skipped (after the first letter) without being looked up in the
 *            mapping, so they do not act as separators between identical codes; if false, H and W are
 *            encoded through the mapping like any other letter
 * @since 1.11
 */
public Soundex(final String mapping, final boolean specialCaseHW) {
    this.soundexMapping = mapping.toCharArray();
    this.specialCaseHW = specialCaseHW;
}
111199
112200 /**
@@ -190,7 +278,7 @@ public int getMaxLength() {
190278 private char map (final char ch ) {
191279 final int index = ch - 'A' ;
192280 if (index < 0 || index >= this .soundexMapping .length ) {
193- throw new IllegalArgumentException ("The character is not mapped: " + ch );
281+ throw new IllegalArgumentException ("The character is not mapped: " + ch + " (index=" + index + ")" );
194282 }
195283 return this .soundexMapping [index ];
196284 }
@@ -231,10 +319,13 @@ public String soundex(String str) {
231319 char lastDigit = map (first ); // previous digit
232320 for (int i = 1 ; i < str .length () && count < out .length ; i ++) {
233321 char ch = str .charAt (i );
234- if (ch == 'H' || ch == 'W' ) { // these are ignored completely
322+ if (( this . specialCaseHW ) && ( ch == 'H' || ch == 'W' ) ) { // these are ignored completely
235323 continue ;
236324 }
237325 char digit = map (ch );
326+ if (digit == SILENT_MARKER ) {
327+ continue ;
328+ }
238329 if (digit != '0' && digit != lastDigit ) { // don't store vowels or repeats
239330 out [count ++] = digit ;
240331 }
0 commit comments