Skip to content

Commit 87f142f

Browse files
committed
CODEC-233 Soundex should support more algorithm variants
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/codec/trunk@1789911 13f79535-47bb-0310-9956-ffa450edef68
1 parent 29a7e67 commit 87f142f

3 files changed

Lines changed: 126 additions & 5 deletions

File tree

src/changes/changes.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ The <action> type attribute can be add,update,fix,remove.
4545
<release version="1.11" date="2017-MM-DD" description="Feature and fix release.">
4646
<!-- The first attribute below should be the issue id; makes it easier to navigate in the IDE outline -->
4747

48+
<action issue="CODEC-233" dev="sebb" type="update" due-to="Yossi Tamari">Soundex should support more algorithm variants</action>
4849
<action issue="CODEC-145" dev="sebb" type="fix" due-to="Jesse Glick">Base64.encodeBase64String could better use newStringUsAscii (ditto encodeBase64URLSafeString)</action>
4950
<action issue="CODEC-144" dev="sebb" type="fix">BaseNCodec: encodeToString and encodeAsString methods are identical</action>
5051
<action issue="CODEC-232" dev="sebb" type="fix">URLCodec is neither immutable nor threadsafe</action>

src/main/java/org/apache/commons/codec/language/Soundex.java

Lines changed: 96 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,16 +31,32 @@
3131
*/
3232
public class Soundex implements StringEncoder {
3333

34+
/**
35+
* The marker character used to indicate a silent (ignored) character.
36+
* These are ignored except when they appear as the first character.
37+
* <p>
38+
* Note: the {@link #US_ENGLISH_MAPPING_STRING} does not use this mechanism
39+
* because changing it might break existing code. Mappings that don't contain
40+
* a silent marker code are treated as though H and W are silent.
41+
* <p>
42+
* To override this, use the {@link #Soundex(String, boolean)} constructor.
43+
* @since 1.11
44+
*/
45+
public static final char SILENT_MARKER = '-';
46+
3447
/**
3548
* This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position
36-
* means do not encode.
49+
* means do not encode, but treat as a separator when it occurs between consonants with the same code.
3750
* <p>
3851
* (This constant is provided as both an implementation convenience and to allow Javadoc to pick
3952
* up the value for the constant values page.)
40-
* </p>
41-
*
53+
* <p>
54+
* <b>Note that letters H and W are treated specially.</b>
55+
* They are ignored (after the first letter) and don't act as separators
56+
* between consonants with the same code.
4257
* @see #US_ENGLISH_MAPPING
4358
*/
59+
// ABCDEFGHIJKLMNOPQRSTUVWXYZ
4460
public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202";
4561

4662
/**
@@ -53,11 +69,44 @@ public class Soundex implements StringEncoder {
5369

5470
/**
5571
* An instance of Soundex using the US_ENGLISH_MAPPING mapping.
72+
* This treats H and W as silent letters.
73+
* Apart from when they appear as the first letter, they are ignored.
74+
* They don't act as separators between duplicate codes.
5675
*
5776
* @see #US_ENGLISH_MAPPING
77+
* @see #US_ENGLISH_MAPPING_STRING
5878
*/
5979
public static final Soundex US_ENGLISH = new Soundex();
6080

81+
/**
82+
* An instance of Soundex using the Simplified Soundex mapping, as described here:
83+
* http://west-penwith.org.uk/misc/soundex.htm
84+
* <p>
85+
* This treats H and W the same as vowels (AEIOUY).
86+
* Such letters aren't encoded (after the first), but they do
87+
* act as separators when dropping duplicate codes.
88+
* The mapping is otherwise the same as for {@link #US_ENGLISH}.
89+
* <p>
90+
* @since 1.11
91+
*/
92+
public static final Soundex US_ENGLISH_SIMPLIFIED = new Soundex(US_ENGLISH_MAPPING_STRING, false);
93+
94+
/**
95+
* An instance of Soundex using the mapping as per the Genealogy site:
96+
* http://www.genealogy.com/articles/research/00000060.html
97+
* <p>
98+
* This treats vowels (AEIOUY), H and W as silent letters.
99+
* Such letters are ignored (after the first) and do not
100+
* act as separators when dropping duplicate codes.
101+
* <p>
102+
* The codes for consonants are otherwise the same as for
103+
* {@link #US_ENGLISH_MAPPING_STRING} and {@link #US_ENGLISH_SIMPLIFIED}
104+
*
105+
* @since 1.11
106+
*/
107+
public static final Soundex US_ENGLISH_GENEALOGY = new Soundex("-123-12--22455-12623-1-2-2");
108+
// ABCDEFGHIJKLMNOPQRSTUVWXYZ
109+
61110
/**
62111
* The maximum length of a Soundex code - Soundex codes are only four characters by definition.
63112
*
@@ -72,6 +121,15 @@ public class Soundex implements StringEncoder {
72121
*/
73122
private final char[] soundexMapping;
74123

124+
/**
125+
* Should H and W be treated specially?
126+
* <p>
127+
* In versions of the code prior to 1.11,
128+
* the code always treated H and W as silent (ignored) letters.
129+
* If this field is false, H and W are no longer special-cased.
130+
*/
131+
private final boolean specialCaseHW;
132+
75133
/**
76134
* Creates an instance using US_ENGLISH_MAPPING
77135
*
@@ -80,6 +138,7 @@ public class Soundex implements StringEncoder {
80138
*/
81139
public Soundex() {
82140
this.soundexMapping = US_ENGLISH_MAPPING;
141+
this.specialCaseHW = true;
83142
}
84143

85144
/**
@@ -88,25 +147,54 @@ public Soundex() {
88147
*
89148
* Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
90149
* letter is mapped. This implementation contains a default map for US_ENGLISH
150+
* <p>
151+
* If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment; otherwise H and W are treated as silent letters.
91152
*
92153
* @param mapping
93154
* Mapping array to use when finding the corresponding code for a given character
94155
*/
95156
public Soundex(final char[] mapping) {
96157
this.soundexMapping = new char[mapping.length];
97158
System.arraycopy(mapping, 0, this.soundexMapping, 0, mapping.length);
159+
this.specialCaseHW = !hasMarker(this.soundexMapping);
160+
}
161+
162+
private boolean hasMarker(char[] mapping) {
163+
for(char ch : mapping) {
164+
if (ch == SILENT_MARKER) {
165+
return true;
166+
}
167+
}
168+
return false;
98169
}
99170

100171
/**
101172
* Creates a soundex instance using a custom mapping. This constructor can be used to customize the mapping,
102173
* and/or possibly provide an internationalized mapping for a non-Western character set.
174+
* <p>
175+
* If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment; otherwise H and W are treated as silent letters.
103176
*
104177
* @param mapping
105178
* Mapping string to use when finding the corresponding code for a given character
106179
* @since 1.4
107180
*/
108181
public Soundex(final String mapping) {
109182
this.soundexMapping = mapping.toCharArray();
183+
this.specialCaseHW = !hasMarker(this.soundexMapping);
184+
}
185+
186+
/**
187+
* Creates a soundex instance using a custom mapping. This constructor can be used to customize the mapping,
188+
* and/or possibly provide an internationalized mapping for a non-Western character set.
189+
*
190+
* @param mapping
191+
* Mapping string to use when finding the corresponding code for a given character
192+
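* @param specialCaseHW if true, then H and W (after the first letter) are ignored completely and do not act as separators between duplicate codes; if false, H and W are encoded via the mapping like any other letter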
193+
* @since 1.11
194+
*/
195+
public Soundex(final String mapping, boolean specialCaseHW) {
196+
this.soundexMapping = mapping.toCharArray();
197+
this.specialCaseHW = specialCaseHW;
110198
}
111199

112200
/**
@@ -190,7 +278,7 @@ public int getMaxLength() {
190278
private char map(final char ch) {
191279
final int index = ch - 'A';
192280
if (index < 0 || index >= this.soundexMapping.length) {
193-
throw new IllegalArgumentException("The character is not mapped: " + ch);
281+
throw new IllegalArgumentException("The character is not mapped: " + ch + " (index=" + index + ")");
194282
}
195283
return this.soundexMapping[index];
196284
}
@@ -231,10 +319,13 @@ public String soundex(String str) {
231319
char lastDigit = map(first); // previous digit
232320
for(int i = 1; i < str.length() && count < out.length ; i++) {
233321
char ch = str.charAt(i);
234-
if (ch == 'H' || ch == 'W') { // these are ignored completely
322+
if ((this.specialCaseHW) && (ch == 'H' || ch == 'W')) { // these are ignored completely
235323
continue;
236324
}
237325
char digit = map(ch);
326+
if (digit == SILENT_MARKER) {
327+
continue;
328+
}
238329
if (digit != '0' && digit != lastDigit) { // don't store vowels or repeats
239330
out[count++] = digit;
240331
}

src/test/java/org/apache/commons/codec/language/SoundexTest.java

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -403,4 +403,33 @@ public void testWikipediaAmericanSoundex() {
403403
Assert.assertEquals("T522", this.getStringEncoder().encode("Tymczak"));
404404
Assert.assertEquals("P236", this.getStringEncoder().encode("Pfister"));
405405
}
406+
407+
@Test
408+
// examples and algorithm rules from: http://www.genealogy.com/articles/research/00000060.html
409+
public void testGenealogy() { // treat vowels and HW as silent
410+
Soundex s = Soundex.US_ENGLISH_GENEALOGY;
411+
Assert.assertEquals("H251", s.encode("Heggenburger"));
412+
Assert.assertEquals("B425", s.encode("Blackman"));
413+
Assert.assertEquals("S530", s.encode("Schmidt"));
414+
Assert.assertEquals("L150", s.encode("Lippmann"));
415+
// Additional local example
416+
Assert.assertEquals("D200", s.encode("Dodds")); // 'o' is not a separator here - it is silent
417+
Assert.assertEquals("D200", s.encode("Dhdds")); // 'h' is silent
418+
Assert.assertEquals("D200", s.encode("Dwdds")); // 'w' is silent
419+
}
420+
421+
@Test
422+
// examples and algorithm rules from: http://west-penwith.org.uk/misc/soundex.htm
423+
public void testSimplifiedSoundex() { // treat vowels and HW as separators
424+
Soundex s = Soundex.US_ENGLISH_SIMPLIFIED;
425+
Assert.assertEquals("W452", s.encode("WILLIAMS"));
426+
Assert.assertEquals("B625", s.encode("BARAGWANATH"));
427+
Assert.assertEquals("D540", s.encode("DONNELL"));
428+
Assert.assertEquals("L300", s.encode("LLOYD"));
429+
Assert.assertEquals("W422", s.encode("WOOLCOCK"));
430+
// Additional local examples
431+
Assert.assertEquals("D320", s.encode("Dodds"));
432+
Assert.assertEquals("D320", s.encode("Dwdds")); // w is a separator
433+
Assert.assertEquals("D320", s.encode("Dhdds")); // h is a separator
434+
}
406435
}

0 commit comments

Comments
 (0)