Apply documentation patch from Matthew Pocock. Thank you Matthew!

garydgregory · garydgregory · commit b5657da870ca · 2011-11-13T20:59:07.000Z
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/codec/trunk@1201511 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java b/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java
@@ -31,11 +31,56 @@
  * This encoder is intentionally mutable to allow dynamic configuration through bean properties. As such, it is mutable, and may not be
  * thread-safe. If you require a guaranteed thread-safe encoding then use {@link PhoneticEngine} directly.
  * </p>
+ *
+ * <h2>Encoding overview</h2>
+ *
+ * <p>
+ * Beider-Morse phonetic encoding is a multi-step process. Firstly, a table of rules is consulted to guess what
+ * language the word comes from. For example, if it ends in "<code>ault</code>" then it infers that the word is French. Next,
+ * the word is translated into a phonetic representation using a language-specific phonetics table. Some runs of letters
+ * can be pronounced in multiple ways, and a single run of letters may be potentially broken up into phonemes at
+ * different places, so this stage results in a set of possible language-specific phonetic representations. Lastly,
+ * this language-specific phonetic representation is processed by a table of rules that re-writes it phonetically taking
+ * into account systematic pronunciation differences between languages, to move it towards a pan-Indo-European phonetic
+ * representation. Again, sometimes there are multiple ways this could be done and sometimes things that can be
+ * pronounced in several ways in the source language have only one way to represent them in this average phonetic
+ * language, so the result is again a set of phonetic spellings.
+ * </p>
+ *
+ * <p>
+ * Some names are treated as having multiple parts. This can be due to two things. Firstly, they may be hyphenated. In
+ * this case, each individual hyphenated word is encoded, and then these are combined end-to-end for the final encoding.
+ * Secondly, some names have standard prefixes, for example, "<code>Mac/Mc</code>" in Scottish (English) names. As sometimes it is
+ * ambiguous whether the prefix is intended or is an accident of the spelling, the word is encoded once with the prefix
+ * and once without it. The resulting encoding contains one and then the other result.
+ * </p>
+ *
+ *
+ * <h2>Encoding format</h2>
+ *
+ * Individual phonetic spellings of an input word are represented in upper- and lower-case roman characters. Where there
+ * are multiple possible phonetic representations, these are joined with a pipe (<code>|</code>) character. If multiple hyphenated
+ * words were found, or if the word may contain a name prefix, each encoded word is placed in parentheses and these blocks
+ * are then joined with hyphens. For example, "<code>d'ortley</code>" has a possible prefix. The form without prefix encodes to
+ * "<code>ortlaj|ortlej</code>", while the form with prefix encodes to "<code>dortlaj|dortlej</code>". Thus, the full, combined encoding is
+ * "<code>(ortlaj|ortlej)-(dortlaj|dortlej)</code>".
+ *
+ * <p>
+ * The encoded forms are often quite a bit longer than the input strings. This is because a single input may have many
+ * potential phonetic interpretations. For example, "<code>Renault</code>" encodes to
+ * "<code>rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult</code>". The <code>APPROX</code> rules will tend to produce larger
+ * encodings as they consider a wider range of possible, approximate phonetic interpretations of the original word.
+ * Down-stream applications may wish to further process the encoding for indexing or lookup purposes, for example, by
+ * splitting on pipe (<code>|</code>) and indexing under each of these alternatives.
+ * </p>
  * 
  * @author Apache Software Foundation
  * @since 1.6
  */
 public class BeiderMorseEncoder implements StringEncoder {
+    // implementation note: This class is a spring-friendly facade to PhoneticEngine. It allows read/write configuration
+    // of an immutable PhoneticEngine instance that will be delegated to for the actual encoding.
+
     // a cached object
     private PhoneticEngine engine = new PhoneticEngine(NameType.GENERIC, RuleType.APPROX, true);
 
diff --git a/src/main/java/org/apache/commons/codec/language/bm/Lang.java b/src/main/java/org/apache/commons/codec/language/bm/Lang.java
@@ -71,6 +71,13 @@
  * @since 1.6
  */
 public class Lang {
+    // implementation note: This class is divided into two sections. The first part is a static factory interface that
+    // exposes the LANGUAGE_RULES_RN resource as a Lang instance. The second part is the Lang instance methods that
+    // encapsulate a particular language-guessing rule table and the language guessing itself.
+    //
+    // It may make sense in the future to expose the private constructor to allow power users to build custom language-
+    // guessing rules, perhaps by marking it protected and allowing sub-classing. However, the vast majority of users
+    // should be strongly encouraged to use the static factory <code>instance</code> method to get their Lang instances.
 
     private static final class LangRule {
         private final boolean acceptOnMatch;
diff --git a/src/main/java/org/apache/commons/codec/language/bm/Languages.java b/src/main/java/org/apache/commons/codec/language/bm/Languages.java
@@ -53,6 +53,9 @@
  * @since 1.6
  */
 public class Languages {
+    // implementation note: This class is divided into two sections. The first part is a static factory interface that
+    // exposes org/apache/commons/codec/language/bm/%s_languages.txt for %s in NameType.* as a list of supported
+    // languages, and a second part that provides instance methods for accessing this set of supported languages.
 
     /**
      * A set of languages.
diff --git a/src/main/java/org/apache/commons/codec/language/bm/NameType.java b/src/main/java/org/apache/commons/codec/language/bm/NameType.java
@@ -18,7 +18,9 @@
 package org.apache.commons.codec.language.bm;
 
 /**
- * Supported types of names. Unless you are matching particular family names, use {@link #GENERIC}.
+ * Supported types of names. Unless you are matching particular family names, use {@link #GENERIC}. The
+ * <code>GENERIC</code> NameType should work reasonably well for non-name words. The other encodings are specifically
+ * tuned to family names, and may not work well at all for general text.
  * 
  * @author Apache Software Foundation
  * @since 1.6
diff --git a/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java b/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java
@@ -51,8 +51,23 @@
  */
 public class PhoneticEngine {
 
+    /**
+     * Utility for manipulating a set of phonemes as they are being built up. Not intended for use outside this package,
+     * and probably not outside the {@link PhoneticEngine} class.
+     *
+     * @author Apache Software Foundation
+     * @since 1.6
+     */
     static final class PhonemeBuilder {
 
+        /**
+         * An empty builder where all phonemes must come from some set of languages. This will contain a single
+         * phoneme of zero characters. This can then be appended to. This should be the only way to create a new
+         * phoneme from scratch.
+         *
+         * @param languages the set of languages
+         * @return  a new, empty phoneme builder
+         */
         public static PhonemeBuilder empty(Languages.LanguageSet languages) {
             return new PhonemeBuilder(Collections.singleton(new Rule.Phoneme("", languages)));
         }
@@ -63,6 +78,12 @@ private PhonemeBuilder(Set<Rule.Phoneme> phonemes) {
             this.phonemes = phonemes;
         }
 
+        /**
+         * Create a new phoneme builder containing all phonemes in this one extended by <code>str</code>.
+         *
+         * @param str   the characters to append to the phonemes
+         * @return  a new phoneme builder lengthened by <code>str</code>
+         */
         public PhonemeBuilder append(CharSequence str) {
             Set<Rule.Phoneme> newPhonemes = new HashSet<Rule.Phoneme>();
 
@@ -73,6 +94,16 @@ public PhonemeBuilder append(CharSequence str) {
             return new PhonemeBuilder(newPhonemes);
         }
 
+        /**
+         * Create a new phoneme builder containing the application of the expression to all phonemes in this builder.
+         *
+         * This will lengthen phonemes that have compatible language sets to the expression, and drop those that are
+         * incompatible.
+         *
+         * @param phonemeExpr   the expression to apply
+         * @return  a new phoneme builder containing the results of <code>phonemeExpr</code> applied to each phoneme
+         *      in turn
+         */
         public PhonemeBuilder apply(Rule.PhonemeExpr phonemeExpr) {
             Set<Rule.Phoneme> newPhonemes = new HashSet<Rule.Phoneme>();
 
@@ -88,10 +119,22 @@ public PhonemeBuilder apply(Rule.PhonemeExpr phonemeExpr) {
             return new PhonemeBuilder(newPhonemes);
         }
 
+        /**
+         * The underlying phoneme set. Please don't mutate.
+         *
+         * @return  the phoneme set
+         */
         public Set<Rule.Phoneme> getPhonemes() {
             return this.phonemes;
         }
 
+        /**
+         * Stringify the phoneme set. This produces a single string of the strings of each phoneme, joined with a pipe.
+         * This is explicitly provided in place of toString as it is a potentially expensive operation, which should be
+         * avoided when debugging.
+         *
+         * @return  the stringified phoneme set
+         */
         public String makeString() {
 
             StringBuilder sb = new StringBuilder();
@@ -108,6 +151,17 @@ public String makeString() {
         }
     }
 
+    /**
+     * A function closure capturing the application of a list of rules to an input sequence at a particular offset.
+     * After invocation, the values <code>i</code> and <code>found</code> are updated. <code>i</code> points to the
+     * index of the next char in <code>input</code> that must be processed next (the input up to that index having been
+     * processed already), and <code>found</code> indicates if a matching rule was found or not. In the case where a
+     * matching rule was found, <code>phonemeBuilder</code> is replaced with a new builder containing the phonemes
+     * updated by the matching rule.
+     *
+     * @author Apache Software Foundation
+     * @since 1.6
+     */
     private static final class RulesApplication {
         private final List<Rule> finalRules;
         private final CharSequence input;
@@ -134,6 +188,13 @@ public PhonemeBuilder getPhonemeBuilder() {
             return this.phonemeBuilder;
         }
 
+        /**
+         * This invokes the rules. It loops over the rules list, stopping at the first one that has a matching context
+         * and pattern. It then applies this rule to the phoneme builder to produce updated phonemes. If there was no
+         * match, <code>i</code> is advanced by one and the character is silently dropped from the phonetic spelling.
+         *
+         * @return <code>this</code>
+         */
         public RulesApplication invoke() {
             this.found = false;
             int patternLength = 0;
@@ -176,6 +237,12 @@ public boolean isFound() {
                 "de la", "della", "des", "di", "do", "dos", "du", "van", "von"))));
     }
 
+    /**
+     * This is a performance hack to avoid overhead associated with very frequent CharSequence.subSequence calls.
+     *
+     * @param cached the character sequence to cache
+     * @return a <code>CharSequence</code> that internally memoises subSequence values
+     */
     private static CharSequence cacheSubSequence(final CharSequence cached) {
         // return cached;
         final CharSequence[][] cache = new CharSequence[cached.length()][cached.length()];
@@ -203,6 +270,12 @@ public CharSequence subSequence(int start, int end) {
         };
     }
 
+    /**
+     * Join some strings with an internal separator.
+     * @param strings   Strings to join
+     * @param sep       String to separate them with
+     * @return          a single String consisting of each element of <code>strings</code> interleaved by <code>sep</code>
+     */
     private static String join(Iterable<String> strings, String sep) {
         StringBuilder sb = new StringBuilder();
         Iterator<String> si = strings.iterator();
@@ -244,6 +317,14 @@ public PhoneticEngine(NameType nameType, RuleType ruleType, boolean concat) {
         this.lang = Lang.instance(nameType);
     }
 
+    /**
+     * Apply the final rules to convert from a language-specific phonetic representation to a language-independent
+     * representation.
+     *
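+     * @param phonemeBuilder the builder holding the language-specific phonemes to process
+     * @param finalRules the final rules to apply; must not be <code>null</code>
+     * @return a phoneme builder holding the result of applying the rules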
+     */
     private PhonemeBuilder applyFinalRules(PhonemeBuilder phonemeBuilder, List<Rule> finalRules) {
         if (finalRules == null) {
             throw new NullPointerException("finalRules can not be null");
@@ -304,8 +385,11 @@ public String encode(String input) {
      */
     public String encode(String input, final Languages.LanguageSet languageSet) {
         final List<Rule> rules = Rule.getInstance(this.nameType, RuleType.RULES, languageSet);
+        // rules common across many (all) languages
         final List<Rule> finalRules1 = Rule.getInstance(this.nameType, this.ruleType, "common");
+        // rules that apply to a specific language that may be ambiguous or wrong if applied to other languages
         final List<Rule> finalRules2 = Rule.getInstance(this.nameType, this.ruleType, languageSet);
+
         // System.err.println("Languages: " + languageSet);
         // System.err.println("Rules: " + rules);
 
@@ -333,6 +417,7 @@ public String encode(String input, final Languages.LanguageSet languageSet) {
         final List<String> words = Arrays.asList(input.split("\\s+"));
         final List<String> words2 = new ArrayList<String>();
 
+        // special-case handling of word prefixes based upon the name type
         switch (this.nameType) {
         case SEPHARDIC:
             for (String aWord : words) {
@@ -380,13 +465,10 @@ public String encode(String input, final Languages.LanguageSet languageSet) {
             // System.err.println(input + " " + i + ": " + phonemeBuilder.makeString());
         }
 
-        // System.err.println("Applying general rules");
+        // Apply the general rules
         phonemeBuilder = applyFinalRules(phonemeBuilder, finalRules1);
-        // System.err.println("Now got: " + phonemeBuilder.makeString());
-        // System.err.println("Applying language-specific rules");
+        // Apply the language-specific rules
         phonemeBuilder = applyFinalRules(phonemeBuilder, finalRules2);
-        // System.err.println("Now got: " + phonemeBuilder.makeString());
-        // System.err.println("Done");
 
         return phonemeBuilder.makeString();
     }
diff --git a/src/main/java/org/apache/commons/codec/language/bm/Rule.java b/src/main/java/org/apache/commons/codec/language/bm/Rule.java
@@ -583,7 +583,9 @@ public RPattern getRContext() {
     }
 
     /**
-     * Decides if the pattern and context match the input starting at a position.
+     * Decides if the pattern and context match the input starting at a position. It is a match if the
+     * <code>lContext</code> matches <code>input</code> up to <code>i</code>, <code>pattern</code> matches at i and
+     * <code>rContext</code> matches from the end of the match of <code>pattern</code> to the end of <code>input</code>.
      * 
      * @param input
      *            the input String
@@ -604,6 +606,9 @@ public boolean patternAndContextMatches(CharSequence input, int i) {
             return false;
         }
 
+        // fixme: this is a readability/speed trade-off - these 3 expressions should be inlined for speed to avoid
+        // evaluating later ones if earlier ones have already failed, but that would make the code a lot harder to
+        // read
         boolean patternMatches = input.subSequence(i, ipl).equals(this.pattern);
         boolean rContextMatches = this.rContext.isMatch(input.subSequence(ipl, input.length()));
         boolean lContextMatches = this.lContext.isMatch(input.subSequence(0, i));
diff --git a/src/main/java/org/apache/commons/codec/language/bm/RuleType.java b/src/main/java/org/apache/commons/codec/language/bm/RuleType.java
@@ -25,7 +25,12 @@
  */
 public enum RuleType {
 
-    APPROX("approx"), EXACT("exact"), RULES("rules");
+    /** Approximate rules, which will lead to the largest number of phonetic interpretations. */
+    APPROX("approx"),
+    /** Exact rules, which will lead to a minimum number of phonetic interpretations. */
+    EXACT("exact"),
+    /** For internal use only. Please use {@link #APPROX} or {@link #EXACT}. */
+    RULES("rules");
 
     private final String name;
 

Original file line number	Diff line number	Diff line change
`@@ -583,7 +583,9 @@ public RPattern getRContext() {`
`583`	`583`	`}`
`584`	`584`
`585`	`585`	`/**`
`586`		`- * Decides if the pattern and context match the input starting at a position.`
	`586`	`+ * Decides if the pattern and context match the input starting at a position. It is a match if the`
	`587`	`+ * <code>lContext</code> matches <code>input</code> up to <code>i</code>, <code>pattern</code> matches at i and`
	`588`	`+ * <code>rContext</code> matches from the end of the match of <code>pattern</code> to the end of <code>input</code>.`
`587`	`589`	`*`
`588`	`590`	`* @param input`
`589`	`591`	`* the input String`
`@@ -604,6 +606,9 @@ public boolean patternAndContextMatches(CharSequence input, int i) {`
`604`	`606`	`return false;`
`605`	`607`	`}`
`606`	`608`
	`609`	`+ // fixme: this is a readability/speed trade-off - these 3 expressions should be inlined for speed to avoid`
	`610`	`+ // evaluating later ones if earlier ones have already failed, but that would make the code a lot harder to`
	`611`	`+ // read`
`607`	`612`	`boolean patternMatches = input.subSequence(i, ipl).equals(this.pattern);`
`608`	`613`	`boolean rContextMatches = this.rContext.isMatch(input.subSequence(ipl, input.length()));`
`609`	`614`	`boolean lContextMatches = this.lContext.isMatch(input.subSequence(0, i));`