Skip to content

Commit facbc3a

Browse files
committed
[CODEC-132] BeiderMorseEncoder OOM issues
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/codec/trunk@1298118 13f79535-47bb-0310-9956-ffa450edef68
1 parent 7e8c20b commit facbc3a

5 files changed

Lines changed: 127 additions & 32 deletions

File tree

src/changes/changes.xml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@
2626
org.apache.commons.codec.net.URLCodec charset field final. </action> </release>
2727
-->
2828
<release version="1.6.1" date="TBD" description="Feature and fix release.">
29+
<action dev="ggregory" type="fix" issue="CODEC-132" due-to="rcmuir">
30+
BeiderMorseEncoder OOM issues
31+
</action>
2932
<action dev="ggregory" type="fix" issue="CODEC-121" due-to="javajohn">
3033
QuotedPrintableCodec does not support soft line break per the 'quoted-printable' example on Wikipedia
3134
</action>

src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ public String encode(String source) throws EncoderException {
100100

101101
/**
102102
* Gets the name type currently in operation.
103-
*
103+
*
104104
* @return the NameType currently being used
105105
*/
106106
public NameType getNameType() {
@@ -109,7 +109,7 @@ public NameType getNameType() {
109109

110110
/**
111111
* Gets the rule type currently in operation.
112-
*
112+
*
113113
* @return the RuleType currently being used
114114
*/
115115
public RuleType getRuleType() {
@@ -118,7 +118,7 @@ public RuleType getRuleType() {
118118

119119
/**
120120
* Discovers if multiple possible encodings are concatenated.
121-
*
121+
*
122122
* @return true if multiple encodings are concatenated, false if just the first one is returned
123123
*/
124124
public boolean isConcat() {
@@ -127,33 +127,55 @@ public boolean isConcat() {
127127

128128
/**
129129
* Sets how multiple possible phonetic encodings are combined.
130-
*
130+
*
131131
* @param concat
132132
* true if multiple encodings are to be combined with a '|', false if just the first one is to be considered
133133
*/
134134
public void setConcat(boolean concat) {
135-
this.engine = new PhoneticEngine(this.engine.getNameType(), this.engine.getRuleType(), concat);
135+
this.engine = new PhoneticEngine(this.engine.getNameType(),
136+
this.engine.getRuleType(),
137+
concat,
138+
this.engine.getMaxPhonemes());
136139
}
137140

138141
/**
139142
* Sets the type of name. Use {@link NameType#GENERIC} unless you specifically want phonetic encodings optimized for Ashkenazi or
140143
* Sephardic Jewish family names.
141-
*
144+
*
142145
* @param nameType
143146
* the NameType in use
144147
*/
145148
public void setNameType(NameType nameType) {
146-
this.engine = new PhoneticEngine(nameType, this.engine.getRuleType(), this.engine.isConcat());
149+
this.engine = new PhoneticEngine(nameType,
150+
this.engine.getRuleType(),
151+
this.engine.isConcat(),
152+
this.engine.getMaxPhonemes());
147153
}
148154

149155
/**
150156
* Sets the rule type to apply. This will widen or narrow the range of phonetic encodings considered.
151-
*
157+
*
152158
* @param ruleType
153159
* {@link RuleType#APPROX} or {@link RuleType#EXACT} for approximate or exact phonetic matches
154160
*/
155161
public void setRuleType(RuleType ruleType) {
156-
this.engine = new PhoneticEngine(this.engine.getNameType(), ruleType, this.engine.isConcat());
162+
this.engine = new PhoneticEngine(this.engine.getNameType(),
163+
ruleType,
164+
this.engine.isConcat(),
165+
this.engine.getMaxPhonemes());
166+
}
167+
168+
/**
169+
* Sets the maximum number of phonemes that shall be considered by the engine.
170+
*
171+
* @param maxPhonemes
172+
* the maximum number of phonemes returned by the engine
173+
*/
174+
public void setMaxPhonemes(int maxPhonemes) {
175+
this.engine = new PhoneticEngine(this.engine.getNameType(),
176+
this.engine.getRuleType(),
177+
this.engine.isConcat(),
178+
maxPhonemes);
157179
}
158180

159181
}

src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java

Lines changed: 48 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -101,17 +101,22 @@ public PhonemeBuilder append(CharSequence str) {
101101
* incompatible.
102102
*
103103
* @param phonemeExpr the expression to apply
104+
* @param maxPhonemes the maximum number of phonemes to build up
104105
* @return a new phoneme builder containing the results of <code>phonemeExpr</code> applied to each phoneme
105106
* in turn
106107
*/
107-
public PhonemeBuilder apply(Rule.PhonemeExpr phonemeExpr) {
108+
public PhonemeBuilder apply(Rule.PhonemeExpr phonemeExpr, int maxPhonemes) {
108109
Set<Rule.Phoneme> newPhonemes = new HashSet<Rule.Phoneme>();
109110

110-
for (Rule.Phoneme left : this.phonemes) {
111+
EXPR: for (Rule.Phoneme left : this.phonemes) {
111112
for (Rule.Phoneme right : phonemeExpr.getPhonemes()) {
112113
Rule.Phoneme join = left.join(right);
113114
if (!join.getLanguages().isEmpty()) {
114-
newPhonemes.add(join);
115+
if (newPhonemes.size() < maxPhonemes) {
116+
newPhonemes.add(join);
117+
} else {
118+
break EXPR;
119+
}
115120
}
116121
}
117122
}
@@ -168,16 +173,19 @@ private static final class RulesApplication {
168173

169174
private PhonemeBuilder phonemeBuilder;
170175
private int i;
176+
private int maxPhonemes;
171177
private boolean found;
172178

173-
public RulesApplication(List<Rule> finalRules, CharSequence input, PhonemeBuilder phonemeBuilder, int i) {
179+
public RulesApplication(List<Rule> finalRules, CharSequence input,
180+
PhonemeBuilder phonemeBuilder, int i, int maxPhonemes) {
174181
if (finalRules == null) {
175182
throw new NullPointerException("The finalRules argument must not be null");
176183
}
177184
this.finalRules = finalRules;
178185
this.phonemeBuilder = phonemeBuilder;
179186
this.input = input;
180187
this.i = i;
188+
this.maxPhonemes = maxPhonemes;
181189
}
182190

183191
public int getI() {
@@ -208,7 +216,7 @@ public RulesApplication invoke() {
208216
continue RULES;
209217
}
210218

211-
this.phonemeBuilder = this.phonemeBuilder.apply(rule.getPhoneme());
219+
this.phonemeBuilder = this.phonemeBuilder.apply(rule.getPhoneme(), maxPhonemes);
212220
this.found = true;
213221
break RULES;
214222
}
@@ -289,6 +297,8 @@ private static String join(Iterable<String> strings, String sep) {
289297
return sb.toString();
290298
}
291299

300+
private static final int DEFAULT_MAX_PHONEMES = 20;
301+
292302
private final Lang lang;
293303

294304
private final NameType nameType;
@@ -297,9 +307,11 @@ private static String join(Iterable<String> strings, String sep) {
297307

298308
private final boolean concat;
299309

310+
private final int maxPhonemes;
311+
300312
/**
301313
* Generates a new, fully-configured phonetic engine.
302-
*
314+
*
303315
* @param nameType
304316
* the type of names it will use
305317
* @param ruleType
@@ -308,13 +320,30 @@ private static String join(Iterable<String> strings, String sep) {
308320
* if it will concatenate multiple encodings
309321
*/
310322
public PhoneticEngine(NameType nameType, RuleType ruleType, boolean concat) {
323+
this(nameType, ruleType, concat, DEFAULT_MAX_PHONEMES);
324+
}
325+
326+
/**
327+
* Generates a new, fully-configured phonetic engine.
328+
*
329+
* @param nameType
330+
* the type of names it will use
331+
* @param ruleType
332+
* the type of rules it will apply
333+
* @param concat
334+
* if it will concatenate multiple encodings
335+
* @param maxPhonemes
336+
* the maximum number of phonemes that will be handled
337+
*/
338+
public PhoneticEngine(NameType nameType, RuleType ruleType, boolean concat, int maxPhonemes) {
311339
if (ruleType == RuleType.RULES) {
312340
throw new IllegalArgumentException("ruleType must not be " + RuleType.RULES);
313341
}
314342
this.nameType = nameType;
315343
this.ruleType = ruleType;
316344
this.concat = concat;
317345
this.lang = Lang.instance(nameType);
346+
this.maxPhonemes = maxPhonemes;
318347
}
319348

320349
/**
@@ -341,7 +370,8 @@ private PhonemeBuilder applyFinalRules(PhonemeBuilder phonemeBuilder, List<Rule>
341370
// System.err.println("Expanding: " + phonemeText);
342371

343372
for (int i = 0; i < phonemeText.length();) {
344-
RulesApplication rulesApplication = new RulesApplication(finalRules, phonemeText, subBuilder, i).invoke();
373+
RulesApplication rulesApplication =
374+
new RulesApplication(finalRules, phonemeText, subBuilder, i, maxPhonemes).invoke();
345375
boolean found = rulesApplication.isFound();
346376
subBuilder = rulesApplication.getPhonemeBuilder();
347377

@@ -459,7 +489,8 @@ public String encode(String input, final Languages.LanguageSet languageSet) {
459489
// loop over each char in the input - we will handle the increment manually
460490
CharSequence inputCache = cacheSubSequence(input);
461491
for (int i = 0; i < inputCache.length();) {
462-
RulesApplication rulesApplication = new RulesApplication(rules, inputCache, phonemeBuilder, i).invoke();
492+
RulesApplication rulesApplication =
493+
new RulesApplication(rules, inputCache, phonemeBuilder, i, maxPhonemes).invoke();
463494
i = rulesApplication.getI();
464495
phonemeBuilder = rulesApplication.getPhonemeBuilder();
465496
// System.err.println(input + " " + i + ": " + phonemeBuilder.makeString());
@@ -508,4 +539,13 @@ public RuleType getRuleType() {
508539
public boolean isConcat() {
509540
return this.concat;
510541
}
542+
543+
/**
544+
* Gets the maximum number of phonemes the engine will calculate for a given input.
545+
*
546+
* @return the maximum number of phonemes
547+
*/
548+
public int getMaxPhonemes() {
549+
return this.maxPhonemes;
550+
}
511551
}

src/test/java/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
import static org.junit.Assert.assertEquals;
2121
import static org.junit.Assert.assertFalse;
22+
import static org.junit.Assert.assertTrue;
2223

2324
import org.apache.commons.codec.EncoderException;
2425
import org.apache.commons.codec.StringEncoder;
@@ -60,15 +61,15 @@ protected StringEncoder createStringEncoder() {
6061
public void testAllChars() throws EncoderException {
6162
BeiderMorseEncoder bmpm = createGenericApproxEncoder();
6263
for (char c = Character.MIN_VALUE; c < Character.MAX_VALUE; c++) {
63-
bmpm.encode("" + c);
64+
bmpm.encode(Character.toString(c));
6465
}
6566
}
6667

6768
@Test
6869
public void testAsciiEncodeNotEmpty1Letter() throws EncoderException {
6970
BeiderMorseEncoder bmpm = createGenericApproxEncoder();
7071
for (char c = 'a'; c <= 'z'; c++) {
71-
final String value = "" + c;
72+
final String value = Character.toString(c);
7273
final String valueU = value.toUpperCase();
7374
assertNotEmpty(bmpm, value);
7475
assertNotEmpty(bmpm, valueU);
@@ -137,6 +138,24 @@ public void testNegativeIndexForRuleMatchIndexOutOfBoundsException() {
137138
r.patternAndContextMatches("bob", -1);
138139
}
139140

141+
@Test
142+
public void testOOM() throws EncoderException {
143+
String phrase = "200697900'-->&#1913348150;</ bceaeef >aadaabcf\"aedfbff<!--\'-->?>cae"
144+
+ "cfaaa><?&#<!--</script>&lang&fc;aadeaf?>>&bdquo< cc =\"abff\" /></ afe >"
145+
+ "<script><!-- f(';< cf aefbeef = \"bfabadcf\" ebbfeedd = fccabeb >";
146+
147+
BeiderMorseEncoder encoder = new BeiderMorseEncoder();
148+
encoder.setNameType(NameType.GENERIC);
149+
encoder.setRuleType(RuleType.EXACT);
150+
encoder.setMaxPhonemes(10);
151+
152+
String phonemes = encoder.encode(phrase);
153+
assertTrue(phonemes.length() > 0);
154+
155+
String[] phonemeArr = phonemes.split("\\|");
156+
assertTrue(phonemeArr.length <= 10);
157+
}
158+
140159
@Test
141160
public void testSetConcat() {
142161
BeiderMorseEncoder bmpm = new BeiderMorseEncoder();

src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineTest.java

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
package org.apache.commons.codec.language.bm;
1919

2020
import static org.junit.Assert.assertEquals;
21+
import static org.junit.Assert.assertTrue;
2122

2223
import java.util.Arrays;
2324
import java.util.List;
@@ -38,46 +39,56 @@ public class PhoneticEngineTest {
3839
@Parameterized.Parameters
3940
public static List<Object[]> data() {
4041
return Arrays
41-
.asList(new Object[] {
42-
"Renault",
43-
"rinD|rinDlt|rina|rinalt|rino|rinolt|rinu|rinult",
44-
NameType.GENERIC,
45-
RuleType.APPROX,
46-
true },
47-
new Object[] { "Renault", "rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult", NameType.ASHKENAZI, RuleType.APPROX, true },
48-
new Object[] { "Renault", "rinDlt", NameType.SEPHARDIC, RuleType.APPROX, true },
49-
new Object[] { "SntJohn-Smith", "sntjonsmit", NameType.GENERIC, RuleType.EXACT, true },
50-
new Object[] { "d'ortley", "(ortlaj|ortlej)-(dortlaj|dortlej)", NameType.GENERIC, RuleType.EXACT, true },
42+
.asList(new Object[] { "Renault", "rinD|rinDlt|rina|rinalt|rino|rinolt|rinu|rinult", NameType.GENERIC, RuleType.APPROX, true, 10 },
43+
new Object[] { "Renault", "rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult", NameType.ASHKENAZI, RuleType.APPROX, true, 10 },
44+
new Object[] { "Renault", "rYnDlt", NameType.ASHKENAZI, RuleType.APPROX, true, 1 },
45+
new Object[] { "Renault", "rinDlt", NameType.SEPHARDIC, RuleType.APPROX, true, 10 },
46+
new Object[] { "SntJohn-Smith", "sntjonsmit", NameType.GENERIC, RuleType.EXACT, true, 10 },
47+
new Object[] { "d'ortley", "(ortlaj|ortlej)-(dortlaj|dortlej)", NameType.GENERIC, RuleType.EXACT, true, 10 },
5148
new Object[] {
5249
"van helsing",
5350
"(elSink|elsink|helSink|helsink|helzink|xelsink)-(banhelsink|fanhelsink|fanhelzink|vanhelsink|vanhelzink|vanjelsink)",
5451
NameType.GENERIC,
5552
RuleType.EXACT,
56-
false });
53+
false, 10 });
5754
}
5855

5956
private final boolean concat;
6057
private final String name;
6158
private final NameType nameType;
6259
private final String phoneticExpected;
6360
private final RuleType ruleType;
61+
private final int maxPhonemes;
6462

65-
public PhoneticEngineTest(String name, String phoneticExpected, NameType nameType, RuleType ruleType, boolean concat) {
63+
public PhoneticEngineTest(String name, String phoneticExpected, NameType nameType,
64+
RuleType ruleType, boolean concat, int maxPhonemes) {
6665
this.name = name;
6766
this.phoneticExpected = phoneticExpected;
6867
this.nameType = nameType;
6968
this.ruleType = ruleType;
7069
this.concat = concat;
70+
this.maxPhonemes = maxPhonemes;
7171
}
7272

7373
@Test(timeout = 10000L)
7474
public void testEncode() {
75-
PhoneticEngine engine = new PhoneticEngine(this.nameType, this.ruleType, this.concat);
75+
PhoneticEngine engine = new PhoneticEngine(this.nameType, this.ruleType, this.concat, this.maxPhonemes);
7676

7777
String phoneticActual = engine.encode(this.name);
7878

7979
//System.err.println("expecting: " + this.phoneticExpected);
8080
//System.err.println("actual: " + phoneticActual);
8181
assertEquals("phoneme incorrect", this.phoneticExpected, phoneticActual);
82+
83+
if (this.concat) {
84+
String[] split = phoneticActual.split("\\|");
85+
assertTrue(split.length <= this.maxPhonemes);
86+
} else {
87+
String[] words = phoneticActual.split("-");
88+
for (String word : words) {
89+
String[] split = word.split("\\|");
90+
assertTrue(split.length <= this.maxPhonemes);
91+
}
92+
}
8293
}
8394
}

0 commit comments

Comments
 (0)