Skip to content

Commit 642477a

Browse files
committed
[CODEC-63] Merged duplicate unit tests, added algorithm outline to class description
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/codec/trunk@1298958 13f79535-47bb-0310-9956-ffa450edef68
1 parent 86beeaf commit 642477a

2 files changed

Lines changed: 66 additions & 101 deletions

File tree

src/main/java/org/apache/commons/codec/language/Nysiis.java

Lines changed: 49 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -27,36 +27,67 @@
2727
*
2828
* Encodes a string into a NYSIIS value. NYSIIS is an encoding used to relate similar names, but can also be used as a
2929
* general purpose scheme to find words with similar phonemes.
30-
*
30+
*
3131
* <p>
3232
* NYSIIS features an accuracy increase of 2.7% over the traditional Soundex algorithm.
3333
* </p>
34-
*
34+
*
35+
* <p>Algorithm description:
36+
* <pre>
37+
* 1. Transcode first characters of name
38+
* 1a. MAC -> MCC
39+
* 1b. KN -> NN
40+
* 1c. K -> C
41+
* 1d. PH -> FF
42+
* 1e. PF -> FF
43+
* 1f. SCH -> SSS
44+
* 2. Transcode last characters of name
45+
* 2a. EE, IE -> Y
46+
* 2b. DT,RT,RD,NT,ND -> D
47+
* 3. First character of key = first character of name
48+
* 4. Transcode remaining characters by following these rules, incrementing by one character each time
49+
* 4a. EV -> AF else A,E,I,O,U -> A
50+
* 4b. Q -> G
51+
* 4c. Z -> S
52+
* 4d. M -> N
53+
* 4e. KN -> N else K -> C
54+
* 4f. SCH -> SSS
55+
* 4g. PH -> FF
56+
* 4h. H -> If previous or next is nonvowel, previous
57+
* 4i. W -> If previous is vowel, previous
58+
* 4j. Add current to key if current != last key character
59+
* 5. If last character is S, remove it
60+
* 6. If last characters are AY, replace with Y
61+
* 7. If last character is A, remove it
62+
* 8. Collapse all strings of repeated characters
63+
* 9. Add original first character of name as first character of key
64+
* </pre></p>
65+
*
3566
* @see <a href="http://en.wikipedia.org/wiki/NYSIIS">http://en.wikipedia.org/wiki/NYSIIS</a>
3667
* @see <a href="http://www.dropby.com/NYSIIS.html">http://www.dropby.com/NYSIIS.html</a>
3768
* @see Soundex
3869
* @version $Id$
3970
*/
4071
public class Nysiis implements StringEncoder {
4172

42-
private static final char[] CHARS_A = new char[] { 'A' };
43-
private static final char[] CHARS_AF = new char[] { 'A', 'F' };
44-
private static final char[] CHARS_C = new char[] { 'C' };
45-
private static final char[] CHARS_FF = new char[] { 'F', 'F' };
46-
private static final char[] CHARS_G = new char[] { 'G' };
47-
private static final char[] CHARS_N = new char[] { 'N' };
48-
private static final char[] CHARS_NN = new char[] { 'N', 'N' };
49-
private static final char[] CHARS_S = new char[] { 'S' };
73+
private static final char[] CHARS_A = new char[] { 'A' };
74+
private static final char[] CHARS_AF = new char[] { 'A', 'F' };
75+
private static final char[] CHARS_C = new char[] { 'C' };
76+
private static final char[] CHARS_FF = new char[] { 'F', 'F' };
77+
private static final char[] CHARS_G = new char[] { 'G' };
78+
private static final char[] CHARS_N = new char[] { 'N' };
79+
private static final char[] CHARS_NN = new char[] { 'N', 'N' };
80+
private static final char[] CHARS_S = new char[] { 'S' };
5081
private static final char[] CHARS_SSS = new char[] { 'S', 'S', 'S' };
51-
52-
private static final Pattern PAT_MAC = Pattern.compile("^MAC");
53-
private static final Pattern PAT_KN = Pattern.compile("^KN");
54-
private static final Pattern PAT_K = Pattern.compile("^K");
55-
private static final Pattern PAT_PH_PF = Pattern.compile("^(PH|PF)");
56-
private static final Pattern PAT_SCH = Pattern.compile("^SCH");
57-
private static final Pattern PAT_EE_IE = Pattern.compile("(EE|IE)$");
82+
83+
private static final Pattern PAT_MAC = Pattern.compile("^MAC");
84+
private static final Pattern PAT_KN = Pattern.compile("^KN");
85+
private static final Pattern PAT_K = Pattern.compile("^K");
86+
private static final Pattern PAT_PH_PF = Pattern.compile("^(PH|PF)");
87+
private static final Pattern PAT_SCH = Pattern.compile("^SCH");
88+
private static final Pattern PAT_EE_IE = Pattern.compile("(EE|IE)$");
5889
private static final Pattern PAT_DT_ETC = Pattern.compile("(DT|RT|RD|NT|ND)$");
59-
90+
6091
private static final char SPACE = ' ';
6192
private static final int TRUE_LENGTH = 6;
6293

src/test/java/org/apache/commons/codec/language/NysiisTest.java

Lines changed: 17 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,6 @@
1717

1818
package org.apache.commons.codec.language;
1919

20-
import java.util.Arrays;
21-
import java.util.List;
22-
2320
import org.apache.commons.codec.EncoderException;
2421
import org.apache.commons.codec.StringEncoder;
2522
import org.apache.commons.codec.StringEncoderAbstractTest;
@@ -83,109 +80,46 @@ public void testDan() throws EncoderException {
8380
this.encodeAll(new String[] { "Dane", "Dean", "Dionne" }, "DAN");
8481
}
8582

86-
@Test
87-
public void testDropBy() throws EncoderException {
88-
this.assertEncodings(
89-
new String[] { "MACINTOSH", "MCANT" },
90-
new String[] { "KNUTH", "NAT" },
91-
new String[] { "KOEHN", "CAN" },
92-
new String[] { "PHILLIPSON", "FALAPSAN" },
93-
new String[] { "PFEISTER", "FASTAR" },
94-
new String[] { "MCKEE", "MCY" },
95-
new String[] { "MACKIE", "MCY" },
96-
new String[] { "HEITSCHMIDT", "HATSNAD" },
97-
new String[] { "BART", "BAD" },
98-
new String[] { "HURD", "HAD" },
99-
new String[] { "HUNT", "HAD" },
100-
new String[] { "WESTERLUND", "WASTARLAD" },
101-
new String[] { "CASSTEVENS", "CASTAFAN" },
102-
new String[] { "VASQUEZ", "VASG" },
103-
new String[] { "FRAZIER", "FRASAR" },
104-
new String[] { "BOWMAN", "BANAN" },
105-
new String[] { "RICKERT", "RACAD" },
106-
new String[] { "DEUTSCH", "DAT" },
107-
new String[] { "WESTPHAL", "WASTFAL" },
108-
new String[] { "SHRIVER", "SRAVAR" },
109-
new String[] { "KUHL", "CAL" },
110-
new String[] { "RAWSON", "RASAN" },
111-
new String[] { "JILES", "JAL" },
112-
new String[] { "CARRAWAY", "CARY" },
113-
new String[] { "YAMADA", "YANAD" });
114-
}
115-
11683
/**
117-
* Tests data gathered from around the internets.
84+
* Tests data gathered from around the internet.
11885
*
86+
* @see <a href="http://www.dropby.com/NYSIISTextStrings.html">http://www.dropby.com/NYSIISTextStrings.html</a>
11987
* @throws EncoderException
12088
*/
12189
@Test
122-
public void testDropBy2() throws EncoderException {
123-
// Explanation of differences between this implementation and the one at dropby.com.
124-
//
125-
// Algorithm (taken from www.dropby.com/NYSIIS.html):
126-
//
127-
// 1. Transcode first characters of name:
128-
// MAC > MCC
129-
// KN > NN
130-
// K > C
131-
// PH > FF
132-
// PF > FF
133-
// SCH > SSS
134-
//
135-
// 2. Transcode last characters of name:
136-
// EE, IE > Y
137-
// DT,RT,RD,NT,ND > D
138-
//
139-
// 3. First character of key = first character of name.
140-
//
141-
// 4. Transcode remaining characters by following these rules, incrementing by one character each time:
142-
// 4a. EV > AF else A,E,I,O,U > A
143-
// 4b. Q > G
144-
// 4c. Z > S
145-
// 4d. M > N
146-
// 4e. KN > N else K > C
147-
// 4f. SCH > SSS
148-
// 4g. PH > FF
149-
// 4h. H > If previous or next is nonvowel, previous
150-
// 4i. W > If previous is vowel, previous
151-
// 4j. Add current to key if current != last key character
152-
//
153-
// 5. If last character is S, remove it
154-
// 6. If last characters are AY, replace with Y
155-
// 7. If last character is A, remove it
156-
// 8. Collapse all strings of repeated characters
157-
// 9. Add original first character of name as first character of key
90+
public void testDropBy() throws EncoderException {
91+
// Explanation of differences between this implementation and the one at dropby.com is
92+
// prepended to the test string. The referenced rules refer to the steps outlined in the
93+
// class description for Nysiis.
15894

15995
this.assertEncodings(
160-
// http://www.dropby.com/indexLF.html?content=/NYSIIS.html
16196
// 1. Transcode first characters of name
16297
new String[] { "MACINTOSH", "MCANT" },
16398
// violates 4j: the second N should not be added, as the first
16499
// key char is already a N
165-
new String[] { "KNUTH", "NAT" }, // Original: NNAT; modified: NATH
100+
new String[] { "KNUTH", "NAT" }, // Original: NNAT; modified: NATH
166101
// O and E are transcoded to A because of rule 4a
167102
// H also to A because of rule 4h
168103
// the N gets mysteriously lost, maybe because of a wrongly implemented rule 4h
169104
// that skips the next char in such a case?
170105
// the remaining A is removed because of rule 7
171-
new String[] { "KOEHN", "CAN" }, // Original: C
106+
new String[] { "KOEHN", "CAN" }, // Original: C
172107
// violates 4j: see also KNUTH
173108
new String[] { "PHILLIPSON", "FALAPSAN" }, // Original: FFALAP[SAN]
174109
// violates 4j: see also KNUTH
175-
new String[] { "PFEISTER", "FASTAR" }, // Original: FFASTA[R]
110+
new String[] { "PFEISTER", "FASTAR" }, // Original: FFASTA[R]
176111
// violates 4j: see also KNUTH
177-
new String[] { "SCHOENHOEFT", "SANAFT" }, // Original: SSANAF[T]
178-
// http://www.dropby.com/indexLF.html?content=/NYSIIS.html
179-
// 2.Transcode last characters of name:
112+
new String[] { "SCHOENHOEFT", "SANAFT" }, // Original: SSANAF[T]
113+
// 2. Transcode last characters of name:
180114
new String[] { "MCKEE", "MCY" },
181115
new String[] { "MACKIE", "MCY" },
182116
new String[] { "HEITSCHMIDT", "HATSNAD" },
183117
new String[] { "BART", "BAD" },
184118
new String[] { "HURD", "HAD" },
185119
new String[] { "HUNT", "HAD" },
186120
new String[] { "WESTERLUND", "WASTARLAD" },
187-
// http://www.dropby.com/indexLF.html?content=/NYSIIS.html
188-
// 4. Transcode remaining characters by following these rules, incrementing by one character each time:
121+
// 4. Transcode remaining characters by following these rules,
122+
// incrementing by one character each time:
189123
new String[] { "CASSTEVENS", "CASTAFAN" },
190124
new String[] { "VASQUEZ", "VASG" },
191125
new String[] { "FRAZIER", "FRASAR" },
@@ -195,18 +129,18 @@ public void testDropBy2() throws EncoderException {
195129
// violates 5: the last S is not removed
196130
// when comparing to DEUTS, which is phonetically similar
197131
// the result is also DAT, which is correct for DEUTSCH too imo
198-
new String[] { "DEUTSCH", "DAT" }, // Original: DATS
132+
new String[] { "DEUTSCH", "DAT" }, // Original: DATS
199133
new String[] { "WESTPHAL", "WASTFAL" },
200134
// violates 4h: the H should be transcoded to S and thus ignored as
201135
// the first key character is also S
202-
new String[] { "SHRIVER", "SRAVAR" }, // Original: SHRAVA[R]
136+
new String[] { "SHRIVER", "SRAVAR" }, // Original: SHRAVA[R]
203137
// same as KOEHN, the L gets mysteriously lost
204-
new String[] { "KUHL", "CAL" }, // Original: C
138+
new String[] { "KUHL", "CAL" }, // Original: C
205139
new String[] { "RAWSON", "RASAN" },
206140
// If last character is S, remove it
207141
new String[] { "JILES", "JAL" },
208142
// violates 6: if the last two characters are AY, remove A
209-
new String[] { "CARRAWAY", "CARY" }, // Original: CARAY
143+
new String[] { "CARRAWAY", "CARY" }, // Original: CARAY
210144
new String[] { "YAMADA", "YANAD" });
211145
}
212146

0 commit comments

Comments
 (0)