Skip to content

Commit 29524ee

Browse files
committed
[CODEC-125] Implement a Beider-Morse phonetic matching codec. Applied patch https://issues.apache.org/jira/secure/attachment/12489755/fightingMemoryChurn.patch
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/codec/trunk@1155135 13f79535-47bb-0310-9956-ffa450edef68
1 parent 26951aa commit 29524ee

7 files changed

Lines changed: 155 additions & 192 deletions

File tree

src/java/org/apache/commons/codec/language/bm/Lang.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import java.util.EnumMap;
2525
import java.util.HashSet;
2626
import java.util.List;
27+
import java.util.Locale;
2728
import java.util.Map;
2829
import java.util.Scanner;
2930
import java.util.Set;
@@ -71,7 +72,7 @@
7172
*/
7273
public class Lang {
7374

74-
private static class LangRule {
75+
private static final class LangRule {
7576
private final boolean acceptOnMatch;
7677
private final Set<String> languages;
7778
private final Pattern pattern;
@@ -199,7 +200,7 @@ private Lang(List<LangRule> rules, Languages languages) {
199200
*/
200201
public String guessLanguage(String text) {
201202
Languages.LanguageSet ls = guessLanguages(text);
202-
return ls.isSingleton() ? ls.getAny() : Languages.ANY;
203+
return ls.isSingleton() ? ls.getAny() : Languages.ANY;
203204
}
204205

205206
/**
@@ -210,7 +211,7 @@ public String guessLanguage(String text) {
210211
* @return a Set of Strings of language names that are potential matches for the input word
211212
*/
212213
public Languages.LanguageSet guessLanguages(String input) {
213-
String text = input.toLowerCase(); // todo: locale?
214+
String text = input.toLowerCase(Locale.ENGLISH);
214215
// System.out.println("Testing text: '" + text + "'");
215216

216217
Set<String> langs = new HashSet<String>(this.languages.getLanguages());

src/java/org/apache/commons/codec/language/bm/Languages.java

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,9 @@ public class Languages {
5858
* A set of languages.
5959
*/
6060
public static abstract class LanguageSet {
61-
61+
6262
public static LanguageSet from(Set<String> langs) {
63-
return langs.isEmpty() ? NO_LANGUAGES : new SomeLanguages(langs);
63+
return langs.isEmpty() ? NO_LANGUAGES : new SomeLanguages(langs);
6464
}
6565

6666
public abstract boolean contains(String language);
@@ -77,7 +77,7 @@ public static LanguageSet from(Set<String> langs) {
7777
/**
7878
* Some languages, explicitly enumerated.
7979
*/
80-
public static class SomeLanguages extends LanguageSet {
80+
public static final class SomeLanguages extends LanguageSet {
8181
private final Set<String> languages;
8282

8383
private SomeLanguages(Set<String> languages) {
@@ -116,9 +116,13 @@ public LanguageSet restrictTo(LanguageSet other) {
116116
return this;
117117
} else {
118118
SomeLanguages sl = (SomeLanguages) other;
119-
Set<String> ls = new HashSet<String>(this.languages);
120-
ls.retainAll(sl.languages);
121-
return from(ls);
119+
if (sl.languages.containsAll(languages)) {
120+
return this;
121+
} else {
122+
Set<String> ls = new HashSet<String>(this.languages);
123+
ls.retainAll(sl.languages);
124+
return from(ls);
125+
}
122126
}
123127
}
124128

src/java/org/apache/commons/codec/language/bm/NameType.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,13 @@
2424
* @since 2.0
2525
*/
2626
public enum NameType {
27-
27+
2828
/** Ashkenazi family names */
2929
ASHKENAZI("ash"),
30-
30+
3131
/** Generic names and words */
3232
GENERIC("gen"),
33-
33+
3434
/** Sephardic family names */
3535
SEPHARDIC("sep");
3636

src/java/org/apache/commons/codec/language/bm/PhoneticEngine.java

Lines changed: 33 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@
5151
*/
5252
public class PhoneticEngine {
5353

54-
static class PhonemeBuilder {
54+
static final class PhonemeBuilder {
5555

5656
public static PhonemeBuilder empty(Languages.LanguageSet languages) {
5757
return new PhonemeBuilder(Collections.singleton(new Rule.Phoneme("", languages)));
@@ -108,7 +108,7 @@ public String makeString() {
108108
}
109109
}
110110

111-
private static class RulesApplication {
111+
private static final class RulesApplication {
112112
private final List<Rule> finalRules;
113113
private final CharSequence input;
114114

@@ -176,6 +176,32 @@ public boolean isFound() {
176176
"de la", "della", "des", "di", "do", "dos", "du", "van", "von"))));
177177
}
178178

179+
private static CharSequence cacheSubSequence(final CharSequence cached) {
180+
// return cached;
181+
final CharSequence[][] cache = new CharSequence[cached.length()][cached.length()];
182+
return new CharSequence() {
183+
public char charAt(int index) {
184+
return cached.charAt(index);
185+
}
186+
187+
public int length() {
188+
return cached.length();
189+
}
190+
191+
public CharSequence subSequence(int start, int end) {
192+
if (start == end)
193+
return "";
194+
195+
CharSequence res = cache[start][end - 1];
196+
if (res == null) {
197+
res = cached.subSequence(start, end);
198+
cache[start][end - 1] = res;
199+
}
200+
return res;
201+
}
202+
};
203+
}
204+
179205
private static String join(Iterable<String> strings, String sep) {
180206
StringBuilder sb = new StringBuilder();
181207
Iterator<String> si = strings.iterator();
@@ -229,7 +255,7 @@ private PhonemeBuilder applyFinalRules(PhonemeBuilder phonemeBuilder, List<Rule>
229255

230256
for (Rule.Phoneme phoneme : phonemeBuilder.getPhonemes()) {
231257
PhonemeBuilder subBuilder = PhonemeBuilder.empty(phoneme.getLanguages());
232-
CharSequence phonemeText = phoneme.getPhonemeText();
258+
CharSequence phonemeText = cacheSubSequence(phoneme.getPhonemeText());
233259
// System.err.println("Expanding: " + phonemeText);
234260

235261
for (int i = 0; i < phonemeText.length();) {
@@ -248,7 +274,7 @@ private PhonemeBuilder applyFinalRules(PhonemeBuilder phonemeBuilder, List<Rule>
248274
}
249275

250276
// System.err.println("Expanded to: " + subBuilder.makeString());
251-
277+
// System.err.println("phonemes in collection of type: " + subBuilder.getPhonemes().getClass());
252278
phonemes.addAll(subBuilder.getPhonemes());
253279
}
254280

@@ -345,8 +371,9 @@ public String encode(String input, final Languages.LanguageSet languageSet) {
345371
PhonemeBuilder phonemeBuilder = PhonemeBuilder.empty(languageSet);
346372

347373
// loop over each char in the input - we will handle the increment manually
348-
for (int i = 0; i < input.length();) {
349-
RulesApplication rulesApplication = new RulesApplication(rules, input, phonemeBuilder, i).invoke();
374+
CharSequence inputCache = cacheSubSequence(input);
375+
for (int i = 0; i < inputCache.length();) {
376+
RulesApplication rulesApplication = new RulesApplication(rules, inputCache, phonemeBuilder, i).invoke();
350377
i = rulesApplication.getI();
351378
phonemeBuilder = rulesApplication.getPhonemeBuilder();
352379
// System.err.println(input + " " + i + ": " + phonemeBuilder.makeString());

0 commit comments

Comments
 (0)