Skip to content

Commit 711c2e1

Browse files
committed
[CODEC-187] Beider Morse Phonetic Matching producing incorrect tokens. Apply patch https://issues.apache.org/jira/secure/attachment/12651251/CODEC-187_ashkenazi_approx_any_v2.patch
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/codec/trunk@1603689 13f79535-47bb-0310-9956-ffa450edef68
1 parent 1189e1b commit 711c2e1

2 files changed

Lines changed: 152 additions & 146 deletions

File tree

src/main/resources/org/apache/commons/codec/language/bm/ash_approx_any.txt

Lines changed: 138 additions & 136 deletions
Original file line numberDiff line numberDiff line change
@@ -15,139 +15,141 @@
1515
* limitations under the License.
1616
*/
1717

18-
// CONSTONANTS
19-
"ph" "" "" "f" // foreign
20-
"sh" "" "" "S" // foreign
21-
"kh" "" "" "x" // foreign
22-
23-
"gli" "" "" "(gli|l[italian])"
24-
"gni" "" "" "(gni|ni[italian+french])"
25-
"gn" "" "[aeou]" "(n[italian+french]|nj[italian+french]|gn)
26-
"gh" "" "" "g" // It + translit. from Arabic
27-
"dh" "" "" "d" // translit. from Arabic
28-
"bh" "" "" "d" // translit. from Arabic
29-
"th" "" "" "t" // translit. from Arabic
30-
"lh" "" "" "l" // Port
31-
"nh" "" "" "nj" // Port
32-
33-
"ig" "[aeiou]" "" "(ig|tS[spanish])"
34-
"ix" "[aeiou]" "" "S" // Sp
35-
"tx" "" "" "tS" // Sp
36-
"tj" "" "$" "tS" // Sp
37-
"tj" "" "" "dZ" // Sp
38-
"tg" "" "" "(tg|dZ[spanish])"
39-
40-
"gi" "" "[aeou]" "dZ" // Italian
41-
"g" "" "y" "Z" // French
42-
"gg" "" "[ei]" "(gZ[portuguese+french]|dZ[italian+spanish]|x[spanish])"
43-
"g" "" "[ei]" "(Z[portuguese+french]|dZ[italian+spanish]|x[spanish])"
44-
45-
"guy" "" "" "gi"
46-
"gue" "" "$" "(k[french]|ge)"
47-
"gu" "" "[ei]" "(g|gv") // not It
48-
"gu" "" "[ao]" "gv" // not It
49-
50-
"ñ" "" "" "(n|nj)"
51-
"ny" "" "" "nj"
52-
53-
"sc" "" "[ei]" "(s|S[italian])"
54-
"sç" "" "[aeiou]" "s" // not It
55-
"ss" "" "" "s"
56-
"ç" "" "" "s" // not It
57-
58-
"ch" "" "[ei]" "(k[italian]|S[portuguese+french]|tS[spanish]|dZ[spanish])"
59-
"ch" "" "" "(S|tS[spanish]|dZ[spanish])"
60-
61-
"ci" "" "[aeou]" "(tS[italian]|si)"
62-
"cc" "" "[eiyéèê]" "(tS[italian]|ks[portuguese+french+spanish])"
63-
"c" "" "[eiyéèê]" "(tS[italian]|s[portuguese+french+spanish])"
64-
//array("c" "" "[aou]" "(k|C[".($portuguese+$spanish)."])" // "C" means that the actual letter could be "ç" (cedille omitted)
65-
66-
"s" "^" "" "s"
67-
"s" "[aáuiíoóeéêy]" "[aáuiíoóeéêy]" "(s[spanish]|z[portuguese+french+italian])"
68-
"s" "" "[dglmnrv]" "(z|Z[portuguese])"
69-
70-
"z" "" "$" "(s|ts[italian]|S[portuguese])" // ts It, s/S/Z Port, s in Sp, z Fr
71-
"z" "" "[bdgv]" "(z|dz[italian]|Z[portuguese])" // dz It, Z/z Port, z Sp & Fr
72-
"z" "" "[ptckf]" "(s|ts[italian]|S[portuguese])" // ts It, s/S/z Port, z/s Sp
73-
"z" "" "" "(z|dz[italian]|ts[italian]|s[spanish])" // ts/dz It, z Port & Fr, z/s Sp
74-
75-
"que" "" "$" "(k[french]|ke)"
76-
"qu" "" "[eiu]" "k"
77-
"qu" "" "[ao]" "(kv|k)" // k is It
78-
79-
"ex" "" "[aáuiíoóeéêy]" "(ez[portuguese]|eS[portuguese]|eks|egz)"
80-
"ex" "" "[cs]" "(e[portuguese]|ek)"
81-
82-
"m" "" "[cdglnrst]" "(m|n[portuguese])"
83-
"m" "" "[bfpv]" "(m|n[portuguese+spanish])"
84-
"m" "" "$" "(m|n[portuguese])"
85-
86-
"b" "^" "" "(b|V[spanish])"
87-
"v" "^" "" "(v|B[spanish])"
88-
89-
// VOWELS
90-
"eau" "" "" "o" // Fr
91-
92-
"ouh" "" "[aioe]" "(v[french]|uh)"
93-
"uh" "" "[aioe]" "(v|uh)"
94-
"ou" "" "[aioe]" "v" // french
95-
"uo" "" "" "(vo|o)"
96-
"u" "" "[aie]" "v"
97-
98-
"i" "[aáuoóeéê]" "" "j"
99-
"i" "" "[aeou]" "j"
100-
"y" "[aáuiíoóeéê]" "" "j"
101-
"y" "" "[aeiíou]" "j"
102-
"e" "" "$" "(e|E[french])"
103-
104-
"ão" "" "" "(au|an)" // Port
105-
"ãe" "" "" "(aj|an)" // Port
106-
"ãi" "" "" "(aj|an)" // Port
107-
"õe" "" "" "(oj|on)" // Port
108-
"où" "" "" "u" // Fr
109-
"ou" "" "" "(ou|u[french])"
110-
111-
"â" "" "" "a" // Port & Fr
112-
"à" "" "" "a" // Port
113-
"á" "" "" "a" // Port & Sp
114-
"ã" "" "" "(a|an)" // Port
115-
"é" "" "" "e"
116-
"ê" "" "" "e" // Port & Fr
117-
"è" "" "" "e" // Sp & Fr & It
118-
"í" "" "" "i" // Port & Sp
119-
"î" "" "" "i" // Fr
120-
"ô" "" "" "o" // Port & Fr
121-
"ó" "" "" "o" // Port & Sp & It
122-
"õ" "" "" "(o|on)" // Port
123-
"ò" "" "" "o" // Sp & It
124-
"ú" "" "" "u" // Port & Sp
125-
"ü" "" "" "u" // Port & Sp
126-
127-
// LATIN ALPHABET
128-
"a" "" "" "a"
129-
"b" "" "" "(b|v[spanish])"
130-
"c" "" "" "k"
131-
"d" "" "" "d"
132-
"e" "" "" "e"
133-
"f" "" "" "f"
134-
"g" "" "" "g"
135-
"h" "" "" "h"
136-
"i" "" "" "i"
137-
"j" "" "" "(x[spanish]|Z)" // not It
138-
"k" "" "" "k"
139-
"l" "" "" "l"
140-
"m" "" "" "m"
141-
"n" "" "" "n"
142-
"o" "" "" "o"
143-
"p" "" "" "p"
144-
"q" "" "" "k"
145-
"r" "" "" "r"
146-
"s" "" "" "(s|S[portuguese])"
147-
"t" "" "" "t"
148-
"u" "" "" "u"
149-
"v" "" "" "(v|b[spanish])"
150-
"w" "" "" "v" // foreign
151-
"x" "" "" "(ks|gz|S[portuguese+spanish])" // S/ks Port & Sp, gz Sp, It only ks
152-
"y" "" "" "i"
153-
"z" "" "" "z"
18+
// ASHKENAZIC
19+
20+
// A, E, I, O, P, U should create variants, but a, e, i, o, u should not create any new variant
21+
// Q = ü ; Y = ä = ö
22+
// H = initial "H" in German/English
23+
24+
// CONSONANTS
25+
"b" "" "" "(b|v[spanish])"
26+
"J" "" "" "z" // Argentina Spanish: "ll" = /Z/, but approximately /Z/ = /z/
27+
28+
// VOWELS
29+
// "ALL" DIPHTHONGS are interchangeable WITH ONE ANOTHER and with the monophthongs of which they are composed ("D" means "diphthong")
30+
// {a,o} are totally interchangeable if non-stressed; in German "a/o" can actually be from "ä/ö" (which are equivalent to "e")
31+
// {i,e} are interchangeable if non-stressed, while in German "u" can actually be from "ü" (which is equivalent to "i")
32+
33+
"aiB" "" "[bp]" "(D|Dm)"
34+
"AiB" "" "[bp]" "(D|Dm)"
35+
"oiB" "" "[bp]" "(D|Dm)"
36+
"OiB" "" "[bp]" "(D|Dm)"
37+
"uiB" "" "[bp]" "(D|Dm)"
38+
"UiB" "" "[bp]" "(D|Dm)"
39+
"eiB" "" "[bp]" "(D|Dm)"
40+
"EiB" "" "[bp]" "(D|Dm)"
41+
"iiB" "" "[bp]" "(D|Dm)"
42+
"IiB" "" "[bp]" "(D|Dm)"
43+
44+
"aiB" "" "[dgkstvz]" "(D|Dn)"
45+
"AiB" "" "[dgkstvz]" "(D|Dn)"
46+
"oiB" "" "[dgkstvz]" "(D|Dn)"
47+
"OiB" "" "[dgkstvz]" "(D|Dn)"
48+
"uiB" "" "[dgkstvz]" "(D|Dn)"
49+
"UiB" "" "[dgkstvz]" "(D|Dn)"
50+
"eiB" "" "[dgkstvz]" "(D|Dn)"
51+
"EiB" "" "[dgkstvz]" "(D|Dn)"
52+
"iiB" "" "[dgkstvz]" "(D|Dn)"
53+
"IiB" "" "[dgkstvz]" "(D|Dn)"
54+
55+
"B" "" "[bp]" "(o|om[polish]|im[polish])"
56+
"B" "" "[dgkstvz]" "(a|o|on[polish]|in[polish])"
57+
"B" "" "" "(a|o)"
58+
59+
"aiF" "" "[bp]" "(D|Dm)"
60+
"AiF" "" "[bp]" "(D|Dm)"
61+
"oiF" "" "[bp]" "(D|Dm)"
62+
"OiF" "" "[bp]" "(D|Dm)"
63+
"uiF" "" "[bp]" "(D|Dm)"
64+
"UiF" "" "[bp]" "(D|Dm)"
65+
"eiF" "" "[bp]" "(D|Dm)"
66+
"EiF" "" "[bp]" "(D|Dm)"
67+
"iiF" "" "[bp]" "(D|Dm)"
68+
"IiF" "" "[bp]" "(D|Dm)"
69+
70+
"aiF" "" "[dgkstvz]" "(D|Dn)"
71+
"AiF" "" "[dgkstvz]" "(D|Dn)"
72+
"oiF" "" "[dgkstvz]" "(D|Dn)"
73+
"OiF" "" "[dgkstvz]" "(D|Dn)"
74+
"uiF" "" "[dgkstvz]" "(D|Dn)"
75+
"UiF" "" "[dgkstvz]" "(D|Dn)"
76+
"eiF" "" "[dgkstvz]" "(D|Dn)"
77+
"EiF" "" "[dgkstvz]" "(D|Dn)"
78+
"iiF" "" "[dgkstvz]" "(D|Dn)"
79+
"IiF" "" "[dgkstvz]" "(D|Dn)"
80+
81+
"F" "" "[bp]" "(i|im[polish]|om[polish])"
82+
"F" "" "[dgkstvz]" "(i|in[polish]|on[polish])"
83+
"F" "" "" "i"
84+
85+
"P" "" "" "(o|u)"
86+
87+
"I" "[aeiouAEIBFOUQY]" "" "i"
88+
"I" "" "[^aeiouAEBFIOU]e" "(Q[german]|i|D[english])" // "line"
89+
"I" "" "$" "i"
90+
"I" "" "[^k]$" "i"
91+
"Ik" "[lr]" "$" "(ik|Qk[german])"
92+
"Ik" "" "$" "ik"
93+
"sIts" "" "$" "(sits|sQts[german])"
94+
"Its" "" "$" "its"
95+
"I" "" "" "(Q[german]|i)"
96+
97+
"lE" "[bdfgkmnprsStvzZ]" "$" "(li|il[english])" // Apple < Appel
98+
"lE" "[bdfgkmnprsStvzZ]" "" "(li|il[english]|lY[german])" // Applebaum < Appelbaum
99+
100+
"au" "" "" "(D|a|u)"
101+
"ou" "" "" "(D|o|u)"
102+
103+
"ai" "" "" "(D|a|i)"
104+
"Ai" "" "" "(D|a|i)"
105+
"oi" "" "" "(D|o|i)"
106+
"Oi" "" "" "(D|o|i)"
107+
"ui" "" "" "(D|u|i)"
108+
"Ui" "" "" "(D|u|i)"
109+
"ei" "" "" "(D|i)"
110+
"Ei" "" "" "(D|i)"
111+
112+
"iA" "" "$" "(ia|io)"
113+
"iA" "" "" "(ia|io|iY[german])"
114+
"A" "" "[^aeiouAEBFIOU]e" "(a|o|Y[german]|D[english])" // "plane"
115+
116+
"E" "i[^aeiouAEIOU]" "" "(i|Y[german]|[english])" // Wineberg (vineberg/vajneberg) --> vajnberg
117+
"E" "a[^aeiouAEIOU]" "" "(i|Y[german]|[english])" // Shaneberg (shaneberg/shejneberg) --> shejnberg
118+
119+
"e" "" "[fklmnprstv]$" "i"
120+
"e" "" "ts$" "i"
121+
"e" "" "$" "i"
122+
"e" "[DaoiuAOIUQY]" "" "i"
123+
"e" "" "[aoAOQY]" "i"
124+
"e" "" "" "(i|Y[german])"
125+
126+
"E" "" "[fklmnprst]$" "i"
127+
"E" "" "ts$" "i"
128+
"E" "" "$" "i"
129+
"E" "[DaoiuAOIUQY]" "" "i"
130+
"E" "" "[aoAOQY]" "i"
131+
"E" "" "" "(i|Y[german])"
132+
133+
"a" "" "" "(a|o)"
134+
135+
"O" "" "[fklmnprstv]$" "o"
136+
"O" "" "ts$" "o"
137+
"O" "" "$" "o"
138+
"O" "[oeiuQY]" "" "o"
139+
"O" "" "" "(o|Y[german])"
140+
141+
"A" "" "[fklmnprst]$" "(a|o)"
142+
"A" "" "ts$" "(a|o)"
143+
"A" "" "$" "(a|o)"
144+
"A" "[oeiuQY]" "" "(a|o)"
145+
"A" "" "" "(a|o|Y[german])"
146+
147+
"U" "" "$" "u"
148+
"U" "[DoiuQY]" "" "u"
149+
"U" "" "[^k]$" "u"
150+
"Uk" "[lr]" "$" "(uk|Qk[german])"
151+
"Uk" "" "$" "uk"
152+
153+
"sUts" "" "$" "(suts|sQts[german])"
154+
"Uts" "" "$" "uts"
155+
"U" "" "" "(u|Q[german])"

src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ public void testSolrASHKENAZI() {
8686
// concat is true, ruleType is EXACT
8787
args = new TreeMap<String, String>();
8888
args.put("nameType", "ASHKENAZI");
89-
assertEquals(encode(args, true, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
89+
assertEquals(encode(args, true, "Angelo"), "YngYlo|Yngilo|angYlo|angilo|anilo|anxilo|anzilo|ongYlo|ongilo|onilo|onxilo|onzilo");
9090
args.put("ruleType", "EXACT");
9191
assertEquals(encode(args, true, "Angelo"), "andZelo|angelo|anhelo|anxelo");
9292
assertEquals(encode(args, true, "D'Angelo"), "dandZelo|dangelo|danhelo|danxelo");
@@ -97,7 +97,7 @@ public void testSolrASHKENAZI() {
9797
// concat is false, ruleType is EXACT
9898
args = new TreeMap<String, String>();
9999
args.put("nameType", "ASHKENAZI");
100-
assertEquals(encode(args, false, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
100+
assertEquals(encode(args, false, "Angelo"), "YngYlo|Yngilo|angYlo|angilo|anilo|anxilo|anzilo|ongYlo|ongilo|onilo|onxilo|onzilo");
101101
args.put("ruleType", "EXACT");
102102
assertEquals(encode(args, false, "Angelo"), "andZelo|angelo|anhelo|anxelo");
103103
assertEquals(encode(args, false, "D'Angelo"), "dandZelo|dangelo|danhelo|danxelo");
@@ -108,23 +108,23 @@ public void testSolrASHKENAZI() {
108108
// concat is true, ruleType is APPROX
109109
args = new TreeMap<String, String>();
110110
args.put("nameType", "ASHKENAZI");
111-
assertEquals(encode(args, true, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
111+
assertEquals(encode(args, true, "Angelo"), "YngYlo|Yngilo|angYlo|angilo|anilo|anxilo|anzilo|ongYlo|ongilo|onilo|onxilo|onzilo");
112112
args.put("ruleType", "APPROX");
113-
assertEquals(encode(args, true, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
114-
assertEquals(encode(args, true, "D'Angelo"), "dAnElO|dAnSelO|dAngElO|dAngzelO|dAnkselO|dAnzelO");
113+
assertEquals(encode(args, true, "Angelo"), "YngYlo|Yngilo|angYlo|angilo|anilo|anxilo|anzilo|ongYlo|ongilo|onilo|onxilo|onzilo");
114+
assertEquals(encode(args, true, "D'Angelo"), "dYngYlo|dYngilo|dangYlo|dangilo|danilo|danxilo|danzilo|dongYlo|dongilo|donilo|donxilo|donzilo");
115115
args.put("languageSet", "italian,greek,spanish");
116-
assertEquals(encode(args, true, "Angelo"), "AnSelO|AngElO|AngzelO|AnkselO");
116+
assertEquals(encode(args, true, "Angelo"), "angilo|anxilo|ongilo|onxilo");
117117
assertEquals(encode(args, true, "1234"), "");
118118

119119
// concat is false, ruleType is APPROX
120120
args = new TreeMap<String, String>();
121121
args.put("nameType", "ASHKENAZI");
122-
assertEquals(encode(args, false, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
122+
assertEquals(encode(args, false, "Angelo"), "YngYlo|Yngilo|angYlo|angilo|anilo|anxilo|anzilo|ongYlo|ongilo|onilo|onxilo|onzilo");
123123
args.put("ruleType", "APPROX");
124-
assertEquals(encode(args, false, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
125-
assertEquals(encode(args, false, "D'Angelo"), "dAnElO|dAnSelO|dAngElO|dAngzelO|dAnkselO|dAnzelO");
124+
assertEquals(encode(args, false, "Angelo"), "YngYlo|Yngilo|angYlo|angilo|anilo|anxilo|anzilo|ongYlo|ongilo|onilo|onxilo|onzilo");
125+
assertEquals(encode(args, false, "D'Angelo"), "dYngYlo|dYngilo|dangYlo|dangilo|danilo|danxilo|danzilo|dongYlo|dongilo|donilo|donxilo|donzilo");
126126
args.put("languageSet", "italian,greek,spanish");
127-
assertEquals(encode(args, false, "Angelo"), "AnSelO|AngElO|AngzelO|AnkselO");
127+
assertEquals(encode(args, false, "Angelo"), "angilo|anxilo|ongilo|onxilo");
128128
assertEquals(encode(args, false, "1234"), "");
129129
}
130130

@@ -186,6 +186,10 @@ public void testCompatibilityWithOriginalVersion() {
186186
args.put("nameType", "GENERIC");
187187
args.put("ruleType", "APPROX");
188188
assertEquals(encode(args, true, "abram"), "Ybram|Ybrom|abram|abran|abrom|abron|avram|avrom|obram|obran|obrom|obron|ovram|ovrom");
189+
190+
args.put("nameType", "ASHKENAZI");
191+
args.put("ruleType", "APPROX");
192+
assertEquals(encode(args, true, "abram"), "Ybram|Ybrom|abram|abrom|avram|avrom|imbram|imbrom|obram|obrom|ombram|ombrom|ovram|ovrom");
189193
}
190194

191195
/**

0 commit comments

Comments
 (0)