5151 */
5252public class PhoneticEngine {
5353
54+ /**
55+ * Utility for manipulating a set of phonemes as they are being built up. Not intended for use outside this package,
56+ * and probably not outside the {@link PhoneticEngine} class.
57+ *
58+ * @author Apache Software Foundation
59+ * @since 1.6
60+ */
5461 static final class PhonemeBuilder {
5562
63+ /**
64+ * An empty builder where all phonemes must come from some set of languages. This will contain a single
65+ * phoneme of zero characters. This can then be appended to. This should be the only way to create a new
66+ * phoneme from scratch.
67+ *
68+ * @param languages the set of languages
69+ * @return a new, empty phoneme builder
70+ */
5671 public static PhonemeBuilder empty (Languages .LanguageSet languages ) {
5772 return new PhonemeBuilder (Collections .singleton (new Rule .Phoneme ("" , languages )));
5873 }
@@ -63,6 +78,12 @@ private PhonemeBuilder(Set<Rule.Phoneme> phonemes) {
6378 this .phonemes = phonemes ;
6479 }
6580
81+ /**
82+ * Create a new phoneme builder containing all phonemes in this one extended by <code>str</code>.
83+ *
84+ * @param str the characters to append to the phonemes
85+ * @return a new phoneme builder lengthened by <code>str</code>
86+ */
6687 public PhonemeBuilder append (CharSequence str ) {
6788 Set <Rule .Phoneme > newPhonemes = new HashSet <Rule .Phoneme >();
6889
@@ -73,6 +94,16 @@ public PhonemeBuilder append(CharSequence str) {
7394 return new PhonemeBuilder (newPhonemes );
7495 }
7596
97+ /**
98+ * Create a new phoneme builder containing the application of the expression to all phonemes in this builder.
99+ *
100+ * This will lengthen phonemes that have compatible language sets to the expression, and drop those that are
101+ * incompatible.
102+ *
103+ * @param phonemeExpr the expression to apply
104+ * @return a new phoneme builder containing the results of <code>phonemeExpr</code> applied to each phoneme
105+ * in turn
106+ */
76107 public PhonemeBuilder apply (Rule .PhonemeExpr phonemeExpr ) {
77108 Set <Rule .Phoneme > newPhonemes = new HashSet <Rule .Phoneme >();
78109
@@ -88,10 +119,22 @@ public PhonemeBuilder apply(Rule.PhonemeExpr phonemeExpr) {
88119 return new PhonemeBuilder (newPhonemes );
89120 }
90121
122+ /**
123+ * The underlying phoneme set. Please don't mutate.
124+ *
125+ * @return the phoneme set
126+ */
91127 public Set <Rule .Phoneme > getPhonemes () {
92128 return this .phonemes ;
93129 }
94130
131+ /**
132+ * Stringify the phoneme set. This produces a single string of the strings of each phoneme, joined with a pipe.
133+ * This is explicitly provided in place of toString as it is a potentially expensive operation, which should be
134+ * avoided when debugging.
135+ *
136+ * @return the stringified phoneme set
137+ */
95138 public String makeString () {
96139
97140 StringBuilder sb = new StringBuilder ();
@@ -108,6 +151,17 @@ public String makeString() {
108151 }
109152 }
110153
154+ /**
155+ * A function closure capturing the application of a list of rules to an input sequence at a particular offset.
156+ * After invocation, the values <code>i</code> and <code>found</code> are updated. <code>i</code> points to the
157+ * index of the next char in <code>input</code> that must be processed (the input up to that index having been
158+ * processed already), and <code>found</code> indicates if a matching rule was found or not. In the case where a
159+ * matching rule was found, <code>phonemeBuilder</code> is replaced with a new builder containing the phonemes
160+ * updated by the matching rule.
161+ *
162+ * @author Apache Software Foundation
163+ * @since 1.6
164+ */
111165 private static final class RulesApplication {
112166 private final List <Rule > finalRules ;
113167 private final CharSequence input ;
@@ -134,6 +188,13 @@ public PhonemeBuilder getPhonemeBuilder() {
134188 return this .phonemeBuilder ;
135189 }
136190
191+ /**
192+ * This invokes the rules. It loops over the rules list, stopping at the first one that has a matching context
193+ * and pattern. It then applies this rule to the phoneme builder to produce updated phonemes. If there was no
194+ * match, <code>i</code> is advanced by one and the character is silently dropped from the phonetic spelling.
195+ *
196+ * @return <code>this</code>
197+ */
137198 public RulesApplication invoke () {
138199 this .found = false ;
139200 int patternLength = 0 ;
@@ -176,6 +237,12 @@ public boolean isFound() {
176237 "de la" , "della" , "des" , "di" , "do" , "dos" , "du" , "van" , "von" ))));
177238 }
178239
240+ /**
241+ * This is a performance hack to avoid overhead associated with very frequent CharSequence.subSequence calls.
242+ *
243+ * @param cached the character sequence to cache
244+ * @return a <code>CharSequence</code> that internally memoises subSequence values
245+ */
179246 private static CharSequence cacheSubSequence (final CharSequence cached ) {
180247 // return cached;
181248 final CharSequence [][] cache = new CharSequence [cached .length ()][cached .length ()];
@@ -203,6 +270,12 @@ public CharSequence subSequence(int start, int end) {
203270 };
204271 }
205272
273+ /**
274+ * Join some strings with an internal separator.
275+ * @param strings Strings to join
276+ * @param sep String to separate them with
277+ * @return a single String consisting of each element of <code>strings</code> interleaved by <code>sep</code>
278+ */
206279 private static String join (Iterable <String > strings , String sep ) {
207280 StringBuilder sb = new StringBuilder ();
208281 Iterator <String > si = strings .iterator ();
@@ -244,6 +317,14 @@ public PhoneticEngine(NameType nameType, RuleType ruleType, boolean concat) {
244317 this .lang = Lang .instance (nameType );
245318 }
246319
320+ /**
321+ * Apply the final rules to convert from a language-specific phonetic representation to a language-independent
322+ * representation.
323+ *
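324+ * @param phonemeBuilder the phoneme builder holding the phonemes the final rules are applied to
325+ * @param finalRules the final rules to apply; must not be <code>null</code>
326+ * @return a phoneme builder containing the result of applying <code>finalRules</code>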
327+ */
247328 private PhonemeBuilder applyFinalRules (PhonemeBuilder phonemeBuilder , List <Rule > finalRules ) {
248329 if (finalRules == null ) {
249330 throw new NullPointerException ("finalRules can not be null" );
@@ -304,8 +385,11 @@ public String encode(String input) {
304385 */
305386 public String encode (String input , final Languages .LanguageSet languageSet ) {
306387 final List <Rule > rules = Rule .getInstance (this .nameType , RuleType .RULES , languageSet );
388+ // rules common across many (all) languages
307389 final List <Rule > finalRules1 = Rule .getInstance (this .nameType , this .ruleType , "common" );
390+ // rules that apply to a specific language that may be ambiguous or wrong if applied to other languages
308391 final List <Rule > finalRules2 = Rule .getInstance (this .nameType , this .ruleType , languageSet );
392+
309393 // System.err.println("Languages: " + languageSet);
310394 // System.err.println("Rules: " + rules);
311395
@@ -333,6 +417,7 @@ public String encode(String input, final Languages.LanguageSet languageSet) {
333417 final List <String > words = Arrays .asList (input .split ("\\ s+" ));
334418 final List <String > words2 = new ArrayList <String >();
335419
420+ // special-case handling of word prefixes based upon the name type
336421 switch (this .nameType ) {
337422 case SEPHARDIC :
338423 for (String aWord : words ) {
@@ -380,13 +465,10 @@ public String encode(String input, final Languages.LanguageSet languageSet) {
380465 // System.err.println(input + " " + i + ": " + phonemeBuilder.makeString());
381466 }
382467
383- // System.err.println("Applying general rules");
468+ // Apply the general rules
384469 phonemeBuilder = applyFinalRules (phonemeBuilder , finalRules1 );
385- // System.err.println("Now got: " + phonemeBuilder.makeString());
386- // System.err.println("Applying language-specific rules");
470+ // Apply the language-specific rules
387471 phonemeBuilder = applyFinalRules (phonemeBuilder , finalRules2 );
388- // System.err.println("Now got: " + phonemeBuilder.makeString());
389- // System.err.println("Done");
390472
391473 return phonemeBuilder .makeString ();
392474 }
0 commit comments