Skip to content

Commit 2cdfac1

Browse files
committed
[CODEC-317] ColognePhonetic can create duplicate consecutive codes in
some cases.
1 parent ca9a599 commit 2cdfac1

3 files changed

Lines changed: 25 additions & 21 deletions

File tree

src/changes/changes.xml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -46,6 +46,7 @@ The <action> type attribute can be add,update,fix,remove.
4646
<release version="1.22.0" date="YYYY-MM-DD" description="This is a feature and maintenance release. Java 8 or later is required.">
4747
<!-- FIX -->
4848
<action type="fix" dev="ggregory" due-to="Shalu Jha, Andrey, Gary Gregory" issue="CODEC-249">Fix Incorrect transform of CH digraph according to Metaphone basic rules #423.</action>
49+
<action type="fix" dev="ggregory" due-to="DRUser123, Shalu Jha, Gary Gregory" issue="CODEC-317">ColognePhonetic can create duplicate consecutive codes in some cases.</action>
4950
<!-- ADD -->
5051
<action type="add" dev="ggregory" due-to="Inkeet, Gary Gregory, Wolff Bock von Wuelfingen" issue="CODEC-326">Add Base58 support.</action>
5152
<action type="add" dev="ggregory" due-to="Gary Gregory">Add BaseNCodecInputStream.AbstracBuilder.setByteArray(byte[]).</action>

src/main/java/org/apache/commons/codec/language/ColognePhonetic.java

Lines changed: 19 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -270,11 +270,15 @@ protected char[] copyData(final int start, final int length) {
270270
* @param code the code to store.
271271
*/
272272
public void put(final char code) {
273-
if (code != CHAR_IGNORE && lastCode != code && (code != '0' || length == 0)) {
273+
final boolean accept = code != CHAR_IGNORE;
274+
final boolean nonZ = code != '0';
275+
if (accept && lastCode != code && (nonZ || length == 0)) {
274276
data[length] = code;
275277
length++;
276278
}
277-
lastCode = code;
279+
if (nonZ && accept) {
280+
lastCode = code;
281+
}
278282
}
279283
}
280284
// Predefined char arrays for better performance and less GC load
@@ -398,8 +402,8 @@ public String colognePhonetic(final String text) {
398402
@Override
399403
public Object encode(final Object object) throws EncoderException {
400404
if (!(object instanceof String)) {
401-
throw new EncoderException("This method's parameter was expected to be of the type " + String.class.getName() + ". But actually it was of the type "
402-
+ object.getClass().getName() + ".");
405+
throw new EncoderException(String.format("This method's parameter was expected to be of the type %s. But actually it was of the type %s.",
406+
String.class.getName(), object.getClass().getName()));
403407
}
404408
return encode((String) object);
405409
}
@@ -434,20 +438,19 @@ public boolean isEncodeEqual(final String text1, final String text2) {
434438
private char[] preprocess(final String text) {
435439
// This converts German small sharp s (Eszett) to SS
436440
final char[] chrs = text.toUpperCase(Locale.GERMAN).toCharArray();
437-
438441
for (int index = 0; index < chrs.length; index++) {
439442
switch (chrs[index]) {
440-
case '\u00C4': // capital A, umlaut mark
441-
chrs[index] = 'A';
442-
break;
443-
case '\u00DC': // capital U, umlaut mark
444-
chrs[index] = 'U';
445-
break;
446-
case '\u00D6': // capital O, umlaut mark
447-
chrs[index] = 'O';
448-
break;
449-
default:
450-
break;
443+
case '\u00C4': // capital A, umlaut mark
444+
chrs[index] = 'A';
445+
break;
446+
case '\u00DC': // capital U, umlaut mark
447+
chrs[index] = 'U';
448+
break;
449+
case '\u00D6': // capital O, umlaut mark
450+
chrs[index] = 'O';
451+
break;
452+
default:
453+
break;
451454
}
452455
}
453456
return chrs;

src/test/java/org/apache/commons/codec/language/ColognePhoneticTest.java

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -162,14 +162,14 @@ static Stream<Arguments> testExamples() {
162162
Arguments.arguments("weber", "317"),
163163
Arguments.arguments("wagner", "3467"),
164164
Arguments.arguments("becker", "147"),
165-
Arguments.arguments("hoffmann", "0366"),
165+
Arguments.arguments("hoffmann", "036"),
166166
Arguments.arguments("sch\u00C4fer", "837"), // schÄfer - why upper case A-umlaut ?
167167
Arguments.arguments("sch\u00e4fer", "837"), // schäfer - add equivalent lower-case
168168
Arguments.arguments("Breschnew", "17863"),
169169
Arguments.arguments("Wikipedia", "3412"),
170170
Arguments.arguments("peter", "127"),
171171
Arguments.arguments("pharma", "376"),
172-
Arguments.arguments("m\u00f6nchengladbach", "664645214"), // mönchengladbach
172+
Arguments.arguments("m\u00f6nchengladbach", "64645214"), // mönchengladbach
173173
Arguments.arguments("deutsch", "28"),
174174
Arguments.arguments("deutz", "28"),
175175
Arguments.arguments("hamburg", "06174"),
@@ -181,9 +181,9 @@ static Stream<Arguments> testExamples() {
181181
Arguments.arguments("matsch", "68"),
182182
Arguments.arguments("matz", "68"),
183183
Arguments.arguments("Arbeitsamt", "071862"),
184-
Arguments.arguments("Eberhard", "01772"),
185-
Arguments.arguments("Eberhardt", "01772"),
186-
Arguments.arguments("Celsius", "8588"),
184+
Arguments.arguments("Eberhard", "0172"),
185+
Arguments.arguments("Eberhardt", "0172"),
186+
Arguments.arguments("Celsius", "858"),
187187
Arguments.arguments("Ace", "08"),
188188
Arguments.arguments("shch", "84"), // CODEC-254
189189
Arguments.arguments("xch", "484"), // CODEC-255

0 commit comments

Comments
 (0)