Skip to content

Commit 39d5df2

Browse files
committed
[CODEC-187] Apply patch to make BeiderMorse phonetic engine compatible with v3.3 of the reference implementation.
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/codec/trunk@1608115 13f79535-47bb-0310-9956-ffa450edef68
1 parent 50a1d17 commit 39d5df2

24 files changed

Lines changed: 776 additions & 367 deletions

src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,12 @@
6464
* encodings as they consider a wider range of possible, approximate phonetic interpretations of the original word.
6565
* Down-stream applications may wish to further process the encoding for indexing or lookup purposes, for example, by
6666
* splitting on pipe (<code>|</code>) and indexing under each of these alternatives.
67+
* <p>
68+
* <b>Note</b>: this version of the Beider-Morse encoding is equivalent to v3.3 of the reference implementation.
6769
*
6870
* @see <a href="http://stevemorse.org/phonetics/bmpm.htm">Beider-Morse Phonetic Matching</a>
6971
* @see <a href="http://stevemorse.org/phoneticinfo.htm">Reference implementation</a>
72+
*
7073
* @since 1.6
7174
* @version $Id$
7275
*/

src/main/java/org/apache/commons/codec/language/bm/Lang.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,11 +95,11 @@ public boolean matches(final String txt) {
9595

9696
private static final Map<NameType, Lang> Langs = new EnumMap<NameType, Lang>(NameType.class);
9797

98-
private static final String LANGUAGE_RULES_RN = "org/apache/commons/codec/language/bm/lang.txt";
98+
private static final String LANGUAGE_RULES_RN = "org/apache/commons/codec/language/bm/%s_lang.txt";
9999

100100
static {
101101
for (final NameType s : NameType.values()) {
102-
Langs.put(s, loadFromResource(LANGUAGE_RULES_RN, Languages.getInstance(s)));
102+
Langs.put(s, loadFromResource(String.format(LANGUAGE_RULES_RN, s.getName()), Languages.getInstance(s)));
103103
}
104104
}
105105

src/main/java/org/apache/commons/codec/language/bm/Languages.java

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ public static LanguageSet from(final Set<String> langs) {
7373
public abstract boolean isSingleton();
7474

7575
public abstract LanguageSet restrictTo(LanguageSet other);
76+
77+
public abstract LanguageSet merge(LanguageSet other);
7678
}
7779

7880
/**
@@ -127,6 +129,22 @@ public LanguageSet restrictTo(final LanguageSet other) {
127129
}
128130
}
129131

132+
/**
 * Returns a language set combining this set's languages with those of {@code other}.
 * <p>
 * {@code NO_LANGUAGES} contributes nothing, so this set is returned as-is;
 * {@code ANY_LANGUAGE} is returned unchanged when it is the argument. Any other
 * argument is cast to {@code SomeLanguages} and its languages are added to a copy
 * of this set's languages.
 *
 * @param other the language set to merge with this one
 * @return this set, {@code other}, or the result of {@code from(...)} on the combined languages
 */
@Override
133+
public LanguageSet merge(final LanguageSet other) {
134+
if (other == NO_LANGUAGES) {
135+
return this;
136+
} else if (other == ANY_LANGUAGE) {
137+
return other;
138+
} else {
139+
// NOTE(review): this cast throws ClassCastException for any other LanguageSet
// implementation; presumably only the three kinds handled here exist - confirm.
final SomeLanguages sl = (SomeLanguages) other;
140+
// Work on a copy so that this set's own languages collection is not modified.
final Set<String> ls = new HashSet<String>(languages);
141+
for (String lang : sl.languages) {
142+
ls.add(lang);
143+
}
144+
return from(ls);
145+
}
146+
}
147+
130148
@Override
131149
public String toString() {
132150
return "Languages(" + languages.toString() + ")";
@@ -216,6 +234,11 @@ public LanguageSet restrictTo(final LanguageSet other) {
216234
return this;
217235
}
218236

237+
/**
 * Merges this language set with {@code other} by returning {@code other} unchanged.
 * <p>
 * The {@code toString()} that follows this method returns "NO_LANGUAGES", so this
 * appears to be the override on the empty language set, for which the argument
 * itself is the union.
 *
 * @param other the language set to merge with
 * @return {@code other}
 */
@Override
238+
public LanguageSet merge(final LanguageSet other) {
239+
return other;
240+
}
241+
219242
@Override
220243
public String toString() {
221244
return "NO_LANGUAGES";
@@ -251,6 +274,11 @@ public LanguageSet restrictTo(final LanguageSet other) {
251274
return other;
252275
}
253276

277+
/**
 * Merges this language set with {@code other} by returning {@code other} unchanged.
 * <p>
 * The {@code toString()} that follows this method returns "ANY_LANGUAGE", so this
 * appears to be the override on the any-language set.
 * <p>
 * NOTE(review): the set-backed {@code merge} in this file returns ANY_LANGUAGE when
 * ANY_LANGUAGE is passed as its argument, whereas this override returns the argument
 * (which may be a narrower set). The two directions therefore disagree; confirm that
 * this asymmetry is intended before relying on {@code merge} as a symmetric union.
 *
 * @param other the language set to merge with
 * @return {@code other}
 */
@Override
278+
public LanguageSet merge(final LanguageSet other) {
279+
return other;
280+
}
281+
254282
@Override
255283
public String toString() {
256284
return "ANY_LANGUAGE";

src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
import java.util.Locale;
2929
import java.util.Map;
3030
import java.util.Set;
31-
import java.util.TreeSet;
31+
import java.util.TreeMap;
3232

3333
import org.apache.commons.codec.language.bm.Languages.LanguageSet;
3434
import org.apache.commons.codec.language.bm.Rule.Phoneme;
@@ -335,7 +335,8 @@ private PhonemeBuilder applyFinalRules(final PhonemeBuilder phonemeBuilder,
335335
return phonemeBuilder;
336336
}
337337

338-
final Set<Rule.Phoneme> phonemes = new TreeSet<Rule.Phoneme>(Rule.Phoneme.COMPARATOR);
338+
final Map<Rule.Phoneme, Rule.Phoneme> phonemes =
339+
new TreeMap<Rule.Phoneme, Rule.Phoneme>(Rule.Phoneme.COMPARATOR);
339340

340341
for (final Rule.Phoneme phoneme : phonemeBuilder.getPhonemes()) {
341342
PhonemeBuilder subBuilder = PhonemeBuilder.empty(phoneme.getLanguages());
@@ -355,10 +356,21 @@ private PhonemeBuilder applyFinalRules(final PhonemeBuilder phonemeBuilder,
355356
i = rulesApplication.getI();
356357
}
357358

358-
phonemes.addAll(subBuilder.getPhonemes());
359+
// the phonemes map orders the phonemes only based on their text, but ignores the language set
360+
// when adding new phonemes, check for equal phonemes and merge their language set, otherwise
361+
// phonemes with the same text but different language set get lost
362+
for (final Rule.Phoneme newPhoneme : subBuilder.getPhonemes()) {
363+
if (phonemes.containsKey(newPhoneme)) {
364+
final Rule.Phoneme oldPhoneme = phonemes.remove(newPhoneme);
365+
final Rule.Phoneme mergedPhoneme = oldPhoneme.mergeWithLanguage(newPhoneme.getLanguages());
366+
phonemes.put(mergedPhoneme, mergedPhoneme);
367+
} else {
368+
phonemes.put(newPhoneme, newPhoneme);
369+
}
370+
}
359371
}
360372

361-
return new PhonemeBuilder(phonemes);
373+
return new PhonemeBuilder(phonemes.keySet());
362374
}
363375

364376
/**

src/main/java/org/apache/commons/codec/language/bm/Rule.java

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,22 @@ public Phoneme join(final Phoneme right) {
147147
return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(),
148148
this.languages.restrictTo(right.languages));
149149
}
150+
151+
/**
152+
* Returns a new Phoneme with the same text but a union of its
153+
* current language set and the given one.
* <p>
* The result is constructed from this phoneme's text and
* {@code this.languages.merge(lang)}; no field of this phoneme is assigned.
154+
*
155+
* @param lang the language set to merge
156+
* @return a new Phoneme
157+
*/
158+
public Phoneme mergeWithLanguage(final LanguageSet lang) {
159+
return new Phoneme(this.phonemeText.toString(), this.languages.merge(lang));
160+
}
161+
162+
/**
 * Returns the phoneme text followed by the language set in square brackets,
 * i.e. {@code text[languages]}.
 *
 * @return the text and language set of this phoneme as a single string
 */
@Override
163+
public String toString() {
164+
return phonemeText.toString() + "[" + languages + "]";
165+
}
150166
}
151167

152168
public interface PhonemeExpr {
@@ -442,6 +458,9 @@ public String toString() {
442458
sb.append("Rule");
443459
sb.append("{line=").append(myLine);
444460
sb.append(", loc='").append(loc).append('\'');
461+
sb.append(", pat='").append(pat).append('\'');
462+
sb.append(", lcon='").append(lCon).append('\'');
463+
sb.append(", rcon='").append(rCon).append('\'');
445464
sb.append('}');
446465
return sb.toString();
447466
}

src/main/resources/org/apache/commons/codec/language/bm/ash_exact_russian.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,4 @@
1616
*/
1717

1818
"E" "" "" "e"
19-
"I "" "" "i"
19+
"I" "" "" "i"

src/main/resources/org/apache/commons/codec/language/bm/ash_hebrew_common.txt

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
"b" "^" "" "b"
2525
"b" "" "" "(b|v)"
2626

27+
"J" "" "" "l"
2728
"ja" "" "" "i"
2829
"jA" "" "" "i"
2930
"jB" "" "" "i"
@@ -75,17 +76,20 @@
7576
"ou" "^" "" "(u|v|1)"
7677
"o" "^" "" "(u|v|1)"
7778
"O" "^" "" "(u|v|1)"
79+
"P" "^" "" "(u|v|1)"
7880
"U" "^" "" "(u|v|1)"
7981
"u" "^" "" "(u|v|1)"
8082

8183
"o" "" "$" "(u|1)"
8284
"O" "" "$" "(u|1)"
85+
"P" "" "$" "(u|1)"
8386
"u" "" "$" "(u|1)"
8487
"U" "" "$" "(u|1)"
8588

8689
"ou" "" "" "u"
8790
"o" "" "" "u"
8891
"O" "" "" "u"
92+
"P" "" "" "u"
8993
"U" "" "" "u"
9094

9195
"VV" "" "" "u" // alef/ayin + vov from ruleshebrew
@@ -102,8 +106,9 @@
102106
//"z" "" "" "(z|Z)"
103107
//"d" "" "" "(d|dZ)"
104108

105-
"TB" "" "$" "(t|s)" // tav from ruleshebrew; only Ashkenazic
106-
"TB" "" "" "t" // tav from ruleshebrew; only Ashkenazic
109+
"TB" "^" "" "t" // tav from ruleshebrew; only Ashkenazic
110+
"TB" "" "$" "s" // tav from ruleshebrew; only Ashkenazic
111+
"TB" "" "" "(t|s)" // tav from ruleshebrew; only Ashkenazic
107112
"T" "" "" "t" // tet from ruleshebrew
108113

109114
//"k" "" "" "(k|x)"

0 commit comments

Comments
 (0)