Skip to content

Commit 5dd9e60

Browse files
committed
<action dev="ggregory" type="fix" issue="CODEC-187" due-to="Michael Tobias, Thomas Neidhart">Beider Morse Phonetic Matching producing incorrect tokens</action>
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/codec/trunk@1602044 13f79535-47bb-0310-9956-ffa450edef68
1 parent 1ab9a1c commit 5dd9e60

7 files changed

Lines changed: 36 additions & 14 deletions

File tree

NOTICE.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,11 @@ The Apache Software Foundation (http://www.apache.org/).
77
src/test/org/apache/commons/codec/language/DoubleMetaphoneTest.java
88
contains test data from http://aspell.net/test/orig/batch0.tab.
99
Copyright (C) 2002 Kevin Atkinson (kevina@gnu.org)
10+
11+
===============================================================================
12+
13+
The content of package org.apache.commons.codec.language.bm has been translated
14+
from the original PHP source code available at http://stevemorse.org/phoneticinfo.htm
15+
with permission from the original authors.
16+
Original source copyright:
17+
Copyright (c) 2008 Alexander Beider & Stephen P. Morse.

src/changes/changes.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ The <action> type attribute can be add,update,fix,remove.
4343
</properties>
4444
<body>
4545
<release version="1.10" date="DD Mmmm 2014" description="Feature and fix release.">
46+
<action dev="ggregory" type="fix" issue="CODEC-187" due-to="Michael Tobias, Thomas Neidhart">Beider Morse Phonetic Matching producing incorrect tokens</action>
4647
<action dev="ggregory" type="fix" issue="CODEC-184" due-to="Cyrille Artho">NullPointerException in DoubleMetaPhone.isDoubleMetaphoneEqual when using empty strings</action>
4748
<action dev="ggregory" type="add" issue="CODEC-181" due-to="Ivan Martinez-Ortiz">Make possible to provide padding byte to BaseNCodec in constructor</action>
4849
<action dev="ggregory" type="fix" issue="CODEC-180" due-to="Ville Skyttä">Fix Javadoc 1.8.0 errors</action>

src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,8 @@
6565
* Down-stream applications may wish to further process the encoding for indexing or lookup purposes, for example, by
6666
* splitting on pipe (<code>|</code>) and indexing under each of these alternatives.
6767
*
68+
* @see <a href="http://stevemorse.org/phonetics/bmpm.htm">Beider-Morse Phonetic Matching</a>
69+
* @see <a href="http://stevemorse.org/phoneticinfo.htm">Reference implementation</a>
6870
* @since 1.6
6971
* @version $Id$
7072
*/

src/main/resources/org/apache/commons/codec/language/bm/ash_approx_any.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@
9999
"i" "" "[aeou]" "j"
100100
"y" "[aáuiíoóeéê]" "" "j"
101101
"y" "" "[aeiíou]" "j"
102-
"e" "" "$" "(e|E[$french])"
102+
"e" "" "$" "(e|E[french])"
103103

104104
"ão" "" "" "(au|an)" // Port
105105
"ãe" "" "" "(aj|an)" // Port

src/main/resources/org/apache/commons/codec/language/bm/ash_rules_any.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@
8484
"cz" "" "" "tS" // Polish
8585

8686
"cia" "" "[bcdgkpstwzż]" "(tSB[polish]|tsB)"
87-
"cia" "" "" "(tSa[$polish]|tsa)"
87+
"cia" "" "" "(tSa[polish]|tsa)"
8888
"cią" "" "[bp]" "(tSom[polish]|tsom)"
8989
"cią" "" "" "(tSon[polish]|tson)"
9090
"cię" "" "[bp]" "(tSem[polish]|tsem)"

src/main/resources/org/apache/commons/codec/language/bm/gen_approx_any.txt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@
4141
"s" "" "$" "(s|[french])"
4242
"t" "[aeiouAEIOU]" "[^aeiouAEIOU]" "(t|[french])" // Petitjean
4343
"s" "[aeiouAEIOU]" "[^aeiouAEIOU]" "(s|[french])" // Groslot, Grosleau
44-
//array("p" "[aeiouAEIOU]" "[^aeiouAEIOU]" "(p|[$french])"
44+
//array("p" "[aeiouAEIOU]" "[^aeiouAEIOU]" "(p|[french])"
4545

4646
"I" "[aeiouAEIBFOUQY]" "" "i"
4747
"I" "" "[^aeiouAEBFIOU]e" "(Q[german]|i|D[english])" // "line"
@@ -86,22 +86,22 @@
8686
"E" "" "$" "i"
8787
"E" "[DaoiuAOIUQY]" "" "i"
8888
"E" "" "[aoAOQY]" "i"
89-
"E" "" "" "(i|Y[$german])"
89+
"E" "" "" "(i|Y[german])"
9090

9191
"P" "" "" "(o|u)"
9292

9393
"O" "" "[fklmnprstv]$" "o"
9494
"O" "" "ts$" "o"
9595
"O" "" "$" "o"
9696
"O" "[oeiuQY]" "" "o"
97-
"O" "" "" "(o|Y[$german])"
97+
"O" "" "" "(o|Y[german])"
9898
"O" "" "" "o"
9999

100100
"A" "" "[fklmnprst]$" "(a|o)"
101101
"A" "" "ts$" "(a|o)"
102102
"A" "" "$" "(a|o)"
103103
"A" "[oeiuQY]" "" "(a|o)"
104-
"A" "" "" "(a|o|Y[$german])"
104+
"A" "" "" "(a|o|Y[german])"
105105
"A" "" "" "(a|o)"
106106

107107
"U" "" "$" "u"

src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ public void testSolrGENERIC() {
4040
// concat is true, ruleType is EXACT
4141
args = new TreeMap<String, String>();
4242
args.put("nameType", "GENERIC");
43-
assertEquals(encode(args, true, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
43+
assertEquals(encode(args, true, "Angelo"), "YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo");
4444
args.put("ruleType", "EXACT");
4545
assertEquals(encode(args, true, "Angelo"), "anZelo|andZelo|angelo|anhelo|anjelo|anxelo");
4646
assertEquals(encode(args, true, "D'Angelo"), "(anZelo|andZelo|angelo|anhelo|anjelo|anxelo)-(danZelo|dandZelo|dangelo|danhelo|danjelo|danxelo)");
@@ -50,7 +50,7 @@ public void testSolrGENERIC() {
5050

5151
// concat is false, ruleType is EXACT
5252
args = new TreeMap<String, String>();
53-
assertEquals(encode(args, false, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
53+
assertEquals(encode(args, false, "Angelo"), "YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo");
5454
args.put("ruleType", "EXACT");
5555
assertEquals(encode(args, false, "Angelo"), "anZelo|andZelo|angelo|anhelo|anjelo|anxelo");
5656
assertEquals(encode(args, false, "D'Angelo"), "(anZelo|andZelo|angelo|anhelo|anjelo|anxelo)-(danZelo|dandZelo|dangelo|danhelo|danjelo|danxelo)");
@@ -60,20 +60,20 @@ public void testSolrGENERIC() {
6060

6161
// concat is true, ruleType is APPROX
6262
args = new TreeMap<String, String>();
63-
assertEquals(encode(args, true, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
63+
assertEquals(encode(args, true, "Angelo"), "YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo");
6464
args.put("ruleType", "APPROX");
65-
assertEquals(encode(args, true, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
66-
assertEquals(encode(args, true, "D'Angelo"), "(agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo)-(dagilo|dangilo|daniilo|danilo|danxilo|danzilo|dogilo|dongilo|doniilo|donilo|donxilo|donzilo)");
65+
assertEquals(encode(args, true, "Angelo"), "YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo");
66+
assertEquals(encode(args, true, "D'Angelo"), "(YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo)-(dYngYlo|dYngilo|dagilo|dangYlo|dangilo|daniilo|danilo|danxilo|danzilo|dogilo|dongYlo|dongilo|doniilo|donilo|donxilo|donzilo)");
6767
args.put("languageSet", "italian,greek,spanish");
6868
assertEquals(encode(args, true, "Angelo"), "angilo|anxilo|anzilo|ongilo|onxilo|onzilo");
6969
assertEquals(encode(args, true, "1234"), "");
7070

7171
// concat is false, ruleType is APPROX
7272
args = new TreeMap<String, String>();
73-
assertEquals(encode(args, false, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
73+
assertEquals(encode(args, false, "Angelo"), "YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo");
7474
args.put("ruleType", "APPROX");
75-
assertEquals(encode(args, false, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
76-
assertEquals(encode(args, false, "D'Angelo"), "(agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo)-(dagilo|dangilo|daniilo|danilo|danxilo|danzilo|dogilo|dongilo|doniilo|donilo|donxilo|donzilo)");
75+
assertEquals(encode(args, false, "Angelo"), "YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo");
76+
assertEquals(encode(args, false, "D'Angelo"), "(YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo)-(dYngYlo|dYngilo|dagilo|dangYlo|dangilo|daniilo|danilo|danxilo|danzilo|dogilo|dongYlo|dongilo|doniilo|donilo|donxilo|donzilo)");
7777
args.put("languageSet", "italian,greek,spanish");
7878
assertEquals(encode(args, false, "Angelo"), "angilo|anxilo|anzilo|ongilo|onxilo|onzilo");
7979
assertEquals(encode(args, false, "1234"), "");
@@ -177,6 +177,17 @@ public void testSolrSEPHARDIC() {
177177
assertEquals(encode(args, false, "1234"), "");
178178
}
179179

180+
@Test
181+
public void testCompatibilityWithOriginalVersion() {
182+
// see CODEC-187
183+
// comparison: http://stevemorse.org/census/soundex.html
184+
185+
Map<String, String> args = new TreeMap<String, String>();
186+
args.put("nameType", "GENERIC");
187+
args.put("ruleType", "APPROX");
188+
assertEquals(encode(args, true, "abram"), "Ybram|Ybrom|abram|abran|abrom|abron|avram|avrom|obram|obran|obrom|obron|ovram|ovrom");
189+
}
190+
180191
/**
181192
* This code is similar in style to code found in Solr:
182193
* solr/core/src/java/org/apache/solr/analysis/BeiderMorseFilterFactory.java

0 commit comments

Comments
 (0)