3030import org .xml .sax .InputSource ;
3131import org .xml .sax .SAXException ;
3232import org .xml .sax .ext .EntityResolver2 ;
33+ import org .xml .sax .ext .LexicalHandler ;
3334
3435import io .sf .carte .util .agent .AgentUtil ;
3536
@@ -279,8 +280,88 @@ public void addHostToWhiteList(String fqdn) {
279280 }
280281 }
281282
283+ /**
284+ * Allows applications to provide an external subset for documents that don't
285+ * explicitly define one.
286+ * <p>
287+ * Documents with {@code DOCTYPE} declarations that omit an external subset can
288+ * thus augment the declarations available for validation, entity processing,
289+ * and attribute processing (normalization, defaulting, and reporting types
290+ * including {@code ID}). This augmentation is reported through the
291+ * {@link LexicalHandler#startDTD startDTD()} method as if the document text had
292+ * originally included the external subset; this callback is made before any
293+ * internal subset data or errors are reported.
294+ * </p>
295+ * <p>
296+ * This method can also be used with documents that have no {@code DOCTYPE}
297+ * declaration. When the root element is encountered but no {@code DOCTYPE}
298+ * declaration has been seen, this method is invoked. If it returns a value for
299+ * the external subset, that root element is declared to be the root element,
300+ * giving the effect of splicing a {@code DOCTYPE} declaration at the end of the
301+ * prolog of a document that could not otherwise be valid. The sequence of
302+ * parser callbacks in that case logically resembles this:
303+ * </p>
304+ *
305+ * <pre>
306+ * ... comments and PIs from the prolog (as usual)
307+ * startDTD ("rootName", source.getPublicId (), source.getSystemId ());
308+ * startEntity ("[dtd]");
309+ * ... declarations, comments, and PIs from the external subset
310+ * endEntity ("[dtd]");
311+ * endDTD ();
312+ * ... then the rest of the document (as usual)
313+ * startElement (..., "rootName", ...);
314+ * </pre>
315+ *
316+ * <p>
317+ * Note that the {@code InputSource} gets no further resolution. Also, this
318+ * method will never be used by a (non-validating) processor that is not
319+ * including external parameter entities.
320+ * </p>
321+ * <p>
322+ * Uses for this method include facilitating data validation when interoperating
323+ * with XML processors that would always require undesirable network accesses
324+ * for external entities, or which for other reasons adopt a "no DTDs" policy.
325+ * </p>
326+ * <p>
327+ * <strong>Warning:</strong> returning an external subset modifies the input
328+ * document. By providing definitions for general entities, it can make a
329+ * malformed document appear to be well formed.
330+ * </p>
331+ *
332+ * @param name Identifies the document root element. This name comes from a
333+ * {@code DOCTYPE} declaration (where available) or from the
334+ * actual root element.
335+ * @param baseURI The document's base URI, serving as an additional hint for
336+ * selecting the external subset. This is always an absolute URI,
337+ * unless it is {@code null} because the {@code XMLReader} was
338+ * given an {@code InputSource} without one.
339+ *
340+ * @return an {@code InputSource} object describing the new external subset to
341+ * be used by the parser. If no specific subset could be determined, an
342+ * input source describing the HTML5 entities is returned.
343+ *
344+ * @throws SAXException if either the provided arguments or the input
345+ * source were invalid or not allowed.
346+ * @throws java.io.IOException if an I/O problem was found while loading the
347+ * input source.
348+ */
282349 @ Override
283350 public InputSource getExternalSubset (String name , String baseURI ) throws SAXException , IOException {
351+ InputSource is = findExternalSubset (name , baseURI );
352+ if (is == null ) {
353+ String fname = systemIdToFilename .get ("https://www.w3.org/TR/html5/entities.dtd" );
354+ Reader re = dtdLoader .loadDTDfromClasspath (loader , fname );
355+ if (re != null ) {
356+ is = new InputSource (re );
357+ } else {
358+ throw new IOException ("Could not find resource: " + fname );
359+ }
360+ }
361+ return is ;
362+ }
363+
364+ private InputSource findExternalSubset (String name , String baseURI ) throws SAXException , IOException {
284365 InputSource is ;
285366 if ("html" .equalsIgnoreCase (name )) {
286367 is = resolveEntity ("[dtd]" , XHTML1_TRA_PUBLICID , baseURI , XHTML1_TRA_SYSTEMID );
@@ -291,7 +372,6 @@ public InputSource getExternalSubset(String name, String baseURI) throws SAXExce
291372 is .setPublicId (null );
292373 is .setSystemId (null );
293374 } else {
294- // This method can return null safely: there is no SystemId URL to connect to.
295375 is = null ;
296376 }
297377 return is ;
@@ -374,10 +454,7 @@ protected boolean registerSystemIdFilename(String systemId, String filename) {
374454 * the XML specification to be the one associated with the
375455 * "{@literal <}" starting the relevant declaration.
376456 * @param systemId The system identifier of the external entity being
377- * referenced; either a relative or absolute URI. This is never
378- * {@code null} when invoked by a SAX2 parser; only declared
379- * entities, and any external subset, are resolved by such
380- * parsers.
457+ * referenced; either a relative or absolute URI.
381458 *
382459 * @return an {@code InputSource} object describing the new input source to be
383460 * used by the parser. This implementation never returns {@code null} if
@@ -461,7 +538,7 @@ public final InputSource resolveEntity(String name, String publicId, String base
461538 InputStream is = con .getInputStream ();
462539 isrc .setCharacterStream (new InputStreamReader (is , charset ));
463540 } else {
464- isrc = getExternalSubset (name , baseURI );
541+ isrc = findExternalSubset (name , baseURI );
465542 }
466543 return isrc ;
467544 }
0 commit comments