From 5203590bbc5672cc6cc2df1b95179a97a2e42cda Mon Sep 17 00:00:00 2001 From: Fredrik Tolf Date: Sun, 13 Dec 2009 17:02:05 +0100 Subject: [PATCH] Added XHTML validation support. --- build.xml | 3 + etc/catalog/xhtml1-strict.xsd | 2211 ++++++++++++++++++++++++++++++++++++++ etc/catalog/xml.xsd | 287 +++++ src/dolda/jsvc/next/DomUtil.java | 70 ++ src/dolda/jsvc/next/Html.java | 19 +- 5 files changed, 2589 insertions(+), 1 deletion(-) create mode 100644 etc/catalog/xhtml1-strict.xsd create mode 100644 etc/catalog/xml.xsd diff --git a/build.xml b/build.xml index c51aa71..eda27f3 100644 --- a/build.xml +++ b/build.xml @@ -22,6 +22,9 @@ + + + diff --git a/etc/catalog/xhtml1-strict.xsd b/etc/catalog/xhtml1-strict.xsd new file mode 100644 index 0000000..93b80b6 --- /dev/null +++ b/etc/catalog/xhtml1-strict.xsd @@ -0,0 +1,2211 @@ + + + + + + XHTML 1.0 (Second Edition) Strict in XML Schema + + This is the same as HTML 4 Strict except for + changes due to the differences between XML and SGML. + + Namespace = http://www.w3.org/1999/xhtml + + For further information, see: http://www.w3.org/TR/xhtml1 + + Copyright (c) 1998-2002 W3C (MIT, INRIA, Keio), + All Rights Reserved. 
+ + The DTD version is identified by the PUBLIC and SYSTEM identifiers: + + PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" + SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd" + + $Id: xhtml1-strict.xsd,v 1.2 2002/08/28 08:05:44 mimasa Exp $ + + + + + + + + ================ Character mnemonic entities ========================= + + XHTML entity sets are identified by the PUBLIC and SYSTEM identifiers: + + PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN" + SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent" + + PUBLIC "-//W3C//ENTITIES Special for XHTML//EN" + SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml-special.ent" + + PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN" + SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent" + + + + + + ================== Imported Names ==================================== + + + + + + + media type, as per [RFC2045] + + + + + + + + + comma-separated list of media types, as per [RFC2045] + + + + + + + + + a character encoding, as per [RFC2045] + + + + + + + + + a space separated list of character encodings, as per [RFC2045] + + + + + + + + + a language code, as per [RFC3066] + + + + + + + + + a single character, as per section 2.2 of [XML] + + + + + + + + + + + one or more digits + + + + + + + + + + + tabindex attribute specifies the position of the current element + in the tabbing order for the current document. This value must be + a number between 0 and 32767. User agents should ignore leading zeros. + + + + + + + + + + + + space-separated list of link types + + + + + + + + + single or comma-separated list of media descriptors + + + + + + + + + + + a Uniform Resource Identifier, see [RFC2396] + + + + + + + + + a space separated list of Uniform Resource Identifiers + + + + + + + + + date and time information. ISO date format + + + + + + + + + script expression + + + + + + + + + style sheet data + + + + + + + + + used for titles etc. 
+ + + + + + + + + nn for pixels or nn% for percentage length + + + + + + + + + + + pixel, percentage, or relative + + + + + + + + + + + integer representing length in pixels + + + + + + + + these are used for image maps + + + + + + + + + + + + + + + + comma separated list of lengths + + + + + + + + + + =================== Generic Attributes =============================== + + + + + + + core attributes common to most elements + id document-wide unique id + class space separated list of classes + style associated style info + title advisory title/amplification + + + + + + + + + + + + internationalization attributes + lang language code (backwards compatible) + xml:lang language code (as per XML 1.0 spec) + dir direction for weak/neutral text + + + + + + + + + + + + + + + + + + attributes for common UI events + onclick a pointer button was clicked + ondblclick a pointer button was double clicked + onmousedown a pointer button was pressed down + onmouseup a pointer button was released + onmousemove a pointer was moved onto the element + onmouseout a pointer was moved away from the element + onkeypress a key was pressed and released + onkeydown a key was pressed down + onkeyup a key was released + + + + + + + + + + + + + + + + + + attributes for elements that can get the focus + accesskey accessibility key character + tabindex position in tabbing order + onfocus the element got the focus + onblur the element lost the focus + + + + + + + + + + + + + + + + + =================== Text Elements ==================================== + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + these can only occur at block level + + + + + + + + + + + + + + + + + + + + + + "Inline" covers inline or "text-level" elements + + + + + + + + + + + ================== Block level elements ============================== + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + "Flow" mixes block and inline and is used for list items etc. + + + + + + + + + + + + + ================== Content models for exclusions ===================== + + + + + + + a elements use "Inline" excluding a + + + + + + + + + + + + + + + pre uses "Inline" excluding big, small, sub or sup + + + + + + + + + + + + + + + + form uses "Block" excluding form + + + + + + + + + + + + button uses "Flow" but excludes a, form and form controls + + + + + + + + + + + + + + + + + + + ================ Document Structure ================================== + + + + + + + + + + + + + + + + + ================ Document Head ======================================= + + + + + + + + + + + + + + + + + + + content model is "head.misc" combined with a single + title and an optional base element in any order + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The title element is not considered part of the flow of text. + It should be displayed, for example as the page header or + window title. Exactly one title is required per document. + + + + + + + + + + + + document base URI + + + + + + + + + + + + generic metainformation + + + + + + + + + + + + + + + + Relationship values can be used in principle: + + a) for document specific toolbars/menus when used + with the link element in document head e.g. + start, contents, previous, next, index, end, help + b) to link to a separate style sheet (rel="stylesheet") + c) to make a link to a script (rel="script") + d) by stylesheets to control how collections of + html nodes are rendered into printed documents + e) to make a link to a printable version of this document + e.g. 
a PostScript or PDF version (rel="alternate" media="print") + + + + + + + + + + + + + + + + + + style info, which may include CDATA sections + + + + + + + + + + + + + + + + script statements, which may include CDATA sections + + + + + + + + + + + + + + + + + + + + + + alternate content container for non script-based rendering + + + + + + + + + + + + + + =================== Document Body ==================================== + + + + + + + + + + + + + + + + + + + generic language/style container + + + + + + + + + + + + + + =================== Paragraphs ======================================= + + + + + + + + + + + + + + + + =================== Headings ========================================= + + There are six levels of headings from h1 (the most important) + to h6 (the least important). + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + =================== Lists ============================================ + + + + + + + Unordered list + + + + + + + + + + + + + + Ordered (numbered) list + + + + + + + + + + + + + + list item + + + + + + + + + + + + + + definition lists - dt for term, dd for its definition + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + =================== Address ========================================== + + + + + + + information on author + + + + + + + + + + + + + + =================== Horizontal Rule ================================== + + + + + + + + + + + + =================== Preformatted Text ================================ + + + + + + + content is "Inline" excluding "img|object|big|small|sub|sup" + + + + + + + + + + + + + + + =================== Block-like Quotes ================================ + + + + + + + + + + + + + + + + + =================== Inserted/Deleted Text ============================ + + ins/del are allowed in block and inline content, but it's + inappropriate to include block content within an ins element + occurring in 
inline content. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ================== The Anchor Element ================================ + + + + + + + content is "Inline" except that anchors shouldn't be nested + + + + + + + + + + + + + + + + + + + + + + + + ===================== Inline Elements ================================ + + + + + + + generic language/style container + + + + + + + + + + + + + + + I18N BiDi over-ride + + + + + + + + + + + + + + + + + + + + + + + + + + forced line break + + + + + + + + + + + emphasis + + + + + + + + + + + + + + + strong emphasis + + + + + + + + + + + + + + + definitional + + + + + + + + + + + + + + + program code + + + + + + + + + + + + + + + sample + + + + + + + + + + + + + + + something user would type + + + + + + + + + + + + + + + variable + + + + + + + + + + + + + + + citation + + + + + + + + + + + + + + + abbreviation + + + + + + + + + + + + + + + acronym + + + + + + + + + + + + + + + inlined quote + + + + + + + + + + + + + + + + subscript + + + + + + + + + + + + + + + superscript + + + + + + + + + + + + + + + fixed pitch font + + + + + + + + + + + + + + + italic font + + + + + + + + + + + + + + + bold font + + + + + + + + + + + + + + + bigger font + + + + + + + + + + + + + + + smaller font + + + + + + + + + + + + + + ==================== Object ====================================== + + object is used to embed objects as part of HTML pages. + param elements should precede other content. Parameters + can also be expressed as attribute/value pairs on the + object element itself when brevity is desired. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + param is used to supply a named property value. + In XML it would seem natural to follow RDF and support an + abbreviated syntax where the param elements are replaced + by attribute value pairs on the object start tag. 
+ + + + + + + + + + + + + + + + + + + + + + =================== Images =========================================== + + To avoid accessibility problems for people who aren't + able to see the image, you should provide a text + description using the alt and longdesc attributes. + In addition, avoid the use of server-side image maps. + Note that in this DTD there is no name attribute. That + is only available in the transitional and frameset DTD. + + + + + + + + + + + + + + + usemap points to a map element which may be in this document + or an external document, although the latter is not widely supported + + + + + + + + + + + + + + + + ================== Client-side image maps ============================ + + These can be placed in the same document or grouped in a + separate document although this isn't yet widely supported + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ================ Forms =============================================== + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Each label must not contain more than ONE field + Label elements shouldn't be nested. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + form control + + + + + + + + + + the name attribute is required for all but submit & reset + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + option selector + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + option group + + + + + + + + + + + + + + + + + + + + + + selectable choice + + + + + + + + + + + + + + + + + + + + + + + + + + + multi-line text field + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The fieldset element is used to group form fields. + Only one legend element should occur in the content + and if present should only be preceded by whitespace. 
+ + NOTE: this content model is different from the XHTML 1.0 DTD, + closer to the intended content model in HTML4 DTD + + + + + + + + + + + + + + + + + + + + fieldset label + + + + + + + + + + + + + + + + Content is "Flow" excluding a, form and form controls + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ======================= Tables ======================================= + + Derived from IETF HTML table standard, see [RFC1942] + + + + + + + The border attribute sets the thickness of the frame around the + table. The default units are screen pixels. + + The frame attribute specifies which parts of the frame around + the table should be rendered. The values are not the same as + CALS to avoid a name clash with the valign attribute. + + + + + + + + + + + + + + + + + + + The rules attribute defines which rules to draw between cells: + + If rules is absent then assume: + "none" if border is absent or border="0" otherwise "all" + + + + + + + + + + + + + + + horizontal alignment attributes for cell contents + + char alignment char, e.g. char=':' + charoff offset for alignment char + + + + + + + + + + + + + + + + + + + + + vertical alignment attributes for cell contents + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Use thead to duplicate headers when breaking table + across page boundaries, or for static headers when + tbody sections are rendered in scrolling panel. + + Use tfoot to duplicate footers when breaking table + across page boundaries, or for static footers when + tbody sections are rendered in scrolling panel. + + Use multiple tbody sections when rules are needed + between groups of table rows. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + colgroup groups a set of col elements. It allows you to group + several semantically related columns together. 
+ + + + + + + + + + + + + + + + + + col elements define the alignment properties for cells in + one or more columns. + + The width attribute specifies the width of the columns, e.g. + + width=64 width in screen pixels + width=0.5* relative width of 0.5 + + The span attribute causes the attributes of one + col element to apply to more than one column. + + + + + + + + + + + + + + + + + + + + + + + + + + + Scope is simpler than headers attribute for common tables + + + + + + + + + + + + + th is for headers, td for data and for cells acting as both + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/etc/catalog/xml.xsd b/etc/catalog/xml.xsd new file mode 100644 index 0000000..aea7d0d --- /dev/null +++ b/etc/catalog/xml.xsd @@ -0,0 +1,287 @@ + + + + + + +
+

About the XML namespace

+ +
+

+ This schema document describes the XML namespace, in a form + suitable for import by other schema documents. +

+

+ See + http://www.w3.org/XML/1998/namespace.html and + + http://www.w3.org/TR/REC-xml for information + about this namespace. +

+

+ Note that local names in this namespace are intended to be + defined only by the World Wide Web Consortium or its subgroups. + The names currently defined in this namespace are listed below. + They should not be used with conflicting semantics by any Working + Group, specification, or document instance. +

+

+ See further below in this document for more information about how to refer to this schema document from your own + XSD schema documents and about the + namespace-versioning policy governing this schema document. +

+
+
+
+
+ + + + +
+ +

lang (as an attribute name)

+

+ denotes an attribute whose value + is a language code for the natural language of the content of + any element; its value is inherited. This name is reserved + by virtue of its definition in the XML specification.

+ +
+
+

Notes

+

+ Attempting to install the relevant ISO 2- and 3-letter + codes as the enumerated possible values is probably never + going to be a realistic possibility. +

+

+ See BCP 47 at + http://www.rfc-editor.org/rfc/bcp/bcp47.txt + and the IANA language subtag registry at + + http://www.iana.org/assignments/language-subtag-registry + for further information. +

+

+ The union allows for the 'un-declaration' of xml:lang with + the empty string. +

+
+
+
+ + + + + + + + + +
+ + + + +
+ +

space (as an attribute name)

+

+ denotes an attribute whose + value is a keyword indicating what whitespace processing + discipline is intended for the content of the element; its + value is inherited. This name is reserved by virtue of its + definition in the XML specification.

+ +
+
+
+ + + + + + +
+ + + +
+ +

base (as an attribute name)

+

+ denotes an attribute whose value + provides a URI to be used as the base for interpreting any + relative URIs in the scope of the element on which it + appears; its value is inherited. This name is reserved + by virtue of its definition in the XML Base specification.

+ +

+ See http://www.w3.org/TR/xmlbase/ + for information about this attribute. +

+
+
+
+
+ + + + +
+ +

id (as an attribute name)

+

+ denotes an attribute whose value + should be interpreted as if declared to be of type ID. + This name is reserved by virtue of its definition in the + xml:id specification.

+ +

+ See http://www.w3.org/TR/xml-id/ + for information about this attribute. +

+
+
+
+
+ + + + + + + + + + +
+ +

Father (in any context at all)

+ +
+

+ denotes Jon Bosak, the chair of + the original XML Working Group. This name is reserved by + the following decision of the W3C XML Plenary and + XML Coordination groups: +

+
+

+ In appreciation for his vision, leadership and + dedication the W3C XML Plenary on this 10th day of + February, 2000, reserves for Jon Bosak in perpetuity + the XML name "xml:Father". +

+
+
+
+
+
+ + + +
+

About this schema document

+ +
+

+ This schema defines attributes and an attribute group suitable + for use by schemas wishing to allow xml:base, + xml:lang, xml:space or + xml:id attributes on elements they define. +

+

+ To enable this, such a schema must import this schema for + the XML namespace, e.g. as follows: +

+
+          <schema . . .>
+           . . .
+           <import namespace="http://www.w3.org/XML/1998/namespace"
+                      schemaLocation="http://www.w3.org/2001/xml.xsd"/>
+     
+

+ or +

+
+           <import namespace="http://www.w3.org/XML/1998/namespace"
+                      schemaLocation="http://www.w3.org/2009/01/xml.xsd"/>
+     
+

+ Subsequently, qualified reference to any of the attributes or the + group defined below will have the desired effect, e.g. +

+
+          <type . . .>
+           . . .
+           <attributeGroup ref="xml:specialAttrs"/>
+     
+

+ will define a type which will schema-validate an instance element + with any of those attributes. +

+
+
+
+
+ + + +
+

Versioning policy for this schema document

+
+

+ In keeping with the XML Schema WG's standard versioning + policy, this schema document will persist at + + http://www.w3.org/2009/01/xml.xsd. +

+

+ At the date of issue it can also be found at + + http://www.w3.org/2001/xml.xsd. +

+

+ The schema document at that URI may however change in the future, + in order to remain compatible with the latest version of XML + Schema itself, or with the XML namespace itself. In other words, + if the XML Schema or XML namespaces change, the version of this + document at + http://www.w3.org/2001/xml.xsd + + will change accordingly; the version at + + http://www.w3.org/2009/01/xml.xsd + + will not change. +

+

+ Previous dated (and unchanging) versions of this schema + document are at: +

+ +
+
+
+
+ +
+ diff --git a/src/dolda/jsvc/next/DomUtil.java b/src/dolda/jsvc/next/DomUtil.java index bedb992..4275689 100644 --- a/src/dolda/jsvc/next/DomUtil.java +++ b/src/dolda/jsvc/next/DomUtil.java @@ -2,9 +2,31 @@ package dolda.jsvc.next; import org.w3c.dom.*; import org.w3c.dom.bootstrap.*; +import org.w3c.dom.ls.*; +import javax.xml.validation.*; +import java.io.*; public class DomUtil { private static final DOMImplementation domimp; + private static final SchemaFactory xsdfac; + + static { + xsdfac = SchemaFactory.newInstance(javax.xml.XMLConstants.W3C_XML_SCHEMA_NS_URI); + xsdfac.setResourceResolver(new LSResourceResolver() { + public LSInput resolveResource(String type, String ns, String pubid, String sysid, String base) { + if(sysid.indexOf('/') >= 0) { + InputStream in = getcatalog(sysid.substring(sysid.lastIndexOf('/') + 1)); + if(in != null) { + LSInput ret = new LSInputAdapter(pubid, sysid, base); + ret.setByteStream(in); + ret.setEncoding("us-ascii"); + return(ret); + } + } + throw(new RuntimeException(String.format("Will not load external resources (for %s); please fix catalog.", sysid))); + } + }); + } static { DOMImplementationRegistry reg; @@ -51,4 +73,52 @@ public class DomUtil { p.appendChild(t); return(t); } + + public static class LSInputAdapter implements LSInput { + private String pubid, sysid, baseuri, encoding = null, data = null; + private boolean cert = false; + private InputStream bs = null; + private Reader cs = null; + + public LSInputAdapter(String pubid, String sysid, String baseuri) { + this.pubid = pubid; + this.sysid = sysid; + this.baseuri = baseuri; + } + + public String getBaseURI() {return(baseuri);} + public String getPublicId() {return(pubid);} + public String getSystemId() {return(sysid);} + public void setBaseURI(String baseuri) {this.baseuri = baseuri;} + public void setPublicId(String pubid) {this.pubid = pubid;} + public void setSystemId(String sysid) {this.sysid = sysid;} + + public InputStream getByteStream() {return(bs);} + 
public boolean getCertifiedText() {return(cert);} + public Reader getCharacterStream() {return(cs);} + public String getEncoding() {return(encoding);} + public String getStringData() {return(data);} + public void setByteStream(InputStream bs) {this.bs = bs;} + public void setCertifiedText(boolean cert) {this.cert = cert;} + public void setCharacterStream(Reader cs) {this.cs = cs;} + public void setEncoding(String encoding) {this.encoding = encoding;} + public void setStringData(String data) {this.data = data;} + } + + private static InputStream getcatalog(String name) { + if(name.indexOf('/') >= 0) + throw(new RuntimeException("Illegal catalog resource name `" + name + "'")); + return(DomUtil.class.getResourceAsStream("catalog/" + name)); + } + + public static Schema loadxsd(String name) { + InputStream in = getcatalog(name); + if(in == null) + throw(new RuntimeException("Could not find schema `" + name + "'")); + try { + return(xsdfac.newSchema(new javax.xml.transform.stream.StreamSource(in))); + } catch(org.xml.sax.SAXException e) { + throw(new RuntimeException(e)); + } + } } diff --git a/src/dolda/jsvc/next/Html.java b/src/dolda/jsvc/next/Html.java index 6d6c24e..c90bb40 100644 --- a/src/dolda/jsvc/next/Html.java +++ b/src/dolda/jsvc/next/Html.java @@ -1,10 +1,15 @@ package dolda.jsvc.next; import org.w3c.dom.*; +import org.w3c.dom.ls.*; +import javax.xml.validation.*; +import java.net.*; +import java.io.*; public class Html extends DocBuffer { public static final String ns = "http://www.w3.org/1999/xhtml"; - + private static final Schema schema = DomUtil.loadxsd("xhtml1-strict.xsd"); + private Html(String pubid, String sysid) { super(ns, "html", "html", pubid, sysid); } @@ -36,4 +41,16 @@ public class Html extends DocBuffer { public void addcss(String href, String name) { insert("head", csslink(href, name)); } + + public void validate() { + Validator val = schema.newValidator(); + try { + val.validate(new javax.xml.transform.dom.DOMSource(doc)); + } 
catch(org.xml.sax.SAXException e) { + throw(new RuntimeException(e)); + } catch(java.io.IOException e) { + /* Should never happen. */ + throw(new Error(e)); + } + } } -- 2.11.0