Replaced the NEXT parser with a new, better, recursive-descent one.
authorFredrik Tolf <fredrik@dolda2000.com>
Sun, 15 Nov 2009 18:16:31 +0000 (19:16 +0100)
committerFredrik Tolf <fredrik@dolda2000.com>
Sun, 15 Nov 2009 18:16:31 +0000 (19:16 +0100)
src/dolda/jsvc/next/Parser.java

index b6bec77..dd03710 100644 (file)
@@ -21,6 +21,15 @@ public class Parser {
        domimp = di;
     }
 
+    /**
+     * Per-parse state shared by the recursive parsing methods: the DOM
+     * document used as node factory and the input being consumed.
+     */
+    public class State {
+       /** Scratch document created from the enclosing parser's DOM implementation. */
+       public final Document doc;
+       /** Input wrapped in a PeekReader so callers can look ahead. */
+       public final PeekReader in;
+
+       private State(Reader src) {
+           this.doc = domimp.createDocument(null, "dummy", null);
+           this.in = new PeekReader(src);
+       }
+    }
+
     private static boolean namechar(char c) {
        return((c == ':') || (c == '_') || (c == '$') || (c == '.') || (c == '-') || ((c >= '0') && (c <= '9')) || ((c >= 'A') && (c <= 'Z')) || ((c >= 'a') && (c <= 'z')));
     }
@@ -43,253 +52,200 @@ public class Parser {
        return(doc.createElementNS(null, name));
     }
     
+    /**
+     * Creates an attribute node named {@code name} in the namespace of
+     * {@code el} and assigns it the value {@code val}. The attribute is
+     * returned without being attached to {@code el}.
+     */
+    protected Attr makeattr(Document doc, Element el, String name, String val) {
+       String ns = el.getNamespaceURI();
+       Attr attr = doc.createAttributeNS(ns, name);
+       attr.setValue(val);
+       return(attr);
+    }
+
     protected Attr makeattr(Document doc, Element el, String name) {
        return(doc.createAttributeNS(el.getNamespaceURI(), name));
     }
 
-    public DocumentFragment parse(Reader in) throws IOException {
-       Stack<Node> stack = new Stack<Node>();
-       Document doc = domimp.createDocument(null, "dummy", null);
-       DocumentFragment frag = doc.createDocumentFragment();
-       stack.push(frag);
-       String st = "content";
-       int c = in.read();
+    protected String name(State s) throws IOException {
        StringBuilder buf = new StringBuilder();
-       StringBuilder ebuf = new StringBuilder();
-       char atype = 0;
-       int cdashcnt = 0;
        while(true) {
-           if(st == "content") {
-               if(c == '<') {
-                   st = "tag";
-                   c = in.read();
-               } else if(c < 0) {
-                   if(stack.peek() == frag)
-                       return(frag);
-                   else
-                       throw(new ParseException("Unexpected end-of-file while parsing non-root element"));
-               } else {
-                   st = "text";
-               }
-           } else if(st == "tag") {
-               if(Character.isWhitespace((char)c)) {
-                   c = in.read();
-               } else if(c == '!') {
-                   cdashcnt = 0;
-                   c = in.read();
-                   st = "comment";
-               } else if(namechar((char)c)) {
-                   st = "stag";
-               } else if(c == '/') {
-                   c = in.read();
-                   st = "etag";
-               } else if(c < 0) {
-                   throw(new ParseException("Unexpected end-of-file while parsing tag"));
-               } else {
-                   throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in tag name"));
-               }
-           } else if(st == "stag") {
-               boolean flush = false;
-               if(namechar((char)c)) {
-                   buf.append((char)c);
-                   c = in.read();
-               } else if(c == '>') {
-                   flush = true;
-               } else if(Character.isWhitespace((char)c)) {
-                   flush = true;
-                   c = in.read();
-               } else if(c < 0) {
-                   throw(new ParseException("Unexpected end-of-file while parsing tag name"));
-               } else {
-                   throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in tag name"));
-               }
-               if(flush) {
-                   Element n = makenode(doc, buf.toString());
-                   buf = new StringBuilder();
-                   stack.peek().appendChild(n);
-                   stack.push(n);
-                   st = "attr";
-               }
-           } else if(st == "comment") {
-               if(c == '-') {
-                   cdashcnt++;
-                   c = in.read();
-               } else if((c == '>') && (cdashcnt == 4)) {
-                   stack.peek().appendChild(doc.createComment(buf.toString()));
-                   buf = new StringBuilder();
-                   st = "content";
-                   c = in.read();
-               } else if(cdashcnt >= 2) {
-                   if(cdashcnt > 2)
-                       cdashcnt = 2;
-                   buf.append((char)c);
-                   c = in.read();
-               } else if(c < 0) {
-                   throw(new ParseException("Unexpected end-of-file while parsing comment"));
-               } else {
-                   throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in comment"));
-               }
-           } else if(st == "attr") {
-               if(namechar((char)c)) {
-                   st = "aname";
-               } else if(c == '>') {
-                   st = "content";
-                   c = in.read();
-               } else if(c == '/') {
-                   st = "stagend";
-                   c = in.read();
-               } else if(Character.isWhitespace((char)c)) {
-                   c = in.read();
-               } else if(c < 0) {
-                   throw(new ParseException("Unexpected end-of-file while parsing attributes"));
-               } else {
-                   throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered inside tag"));
-               }
-           } else if(st == "stagend") {
-               if(c == '>') {
-                   stack.pop();
-                   c = in.read();
-                   st = "content";
-               } else if(Character.isWhitespace((char)c)) {
-                   c = in.read();
-               } else if(c < 0) {
-                   throw(new ParseException("Unexpected end-of-file at end of empty tag"));
-               } else {
-                   throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered at and of empty tag"));
-               }
-           } else if(st == "aname") {
-               if(namechar((char)c)) {
-                   buf.append((char)c);
-                   c = in.read();
-               } else if(Character.isWhitespace((char)c)) {
-                   c = in.read();
-               } else if(c == '=') {
-                   Element el = (Element)stack.peek();
-                   Attr attr = makeattr(doc, el, buf.toString());
-                   el.setAttributeNodeNS(attr);
-                   buf = new StringBuilder();
-                   stack.push(attr);
-                   st = "avalstart";
-                   c = in.read();
-               } else if(c < 0) {
-                   throw(new ParseException("Unexpected end-of-file while parsing attribute name"));
-               } else {
-                   throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in attribute name"));
-               }
-           } else if(st == "avalstart") {
-               if((c == '\'') || (c == '"')) {
-                   atype = (char)c;
-                   c = in.read();
-                   st = "aval";
-               } else if(Character.isWhitespace((char)c)) {
-                   c = in.read();
-               } else if(c < 0) {
-                   throw(new ParseException("Unexpected end-of-file while parsing attribute value"));
-               } else {
-                   throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in attribute value"));
-               }
-           } else if(st == "aval") {
-               if(c == atype) {
-                   c = in.read();
-                   Attr a = (Attr)stack.pop();
-                   a.setValue(buf.toString());
-                   buf = new StringBuilder();
-                   st = "attr";
-               } else if(c == '&') {
-                   c = in.read();
-                   st = "aent";
-               } else if(c < 0) {
-                   throw(new ParseException("Unexpected end-of-file while parsing attribute value"));
-               } else {
-                   buf.append((char)c);
-                   c = in.read();
-               }
-           } else if(st == "etag") {
-               if(namechar((char)c)) {
-                   buf.append((char)c);
-                   c = in.read();
-               } else if(c == '>') {
-                   String nm = buf.toString();
-                   buf = new StringBuilder();
-                   Node n = stack.pop();
-                   if(n instanceof DocumentFragment)
-                       throw(new ParseException("Unexpected end tag for `" + nm + "' while parsing root content"));
-                   Element el = (Element)n;
-                   if(!nm.equals(el.getTagName()))
-                       throw(new ParseException("Unexpected end tag for `" + nm + "' while parsing `" + el.getTagName() + "'"));
-                   c = in.read();
-                   st = "content";
-               } else if(c < 0) {
-                   throw(new ParseException("Unexpected end-of-file while parsing end tag"));
-               } else {
-                   throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in end tag"));
-               }
-           } else if(st == "text") {
-               boolean flush = false;
-               if(c == '&') {
-                   st = "ent";
-                   c = in.read();
-               } else if(c == '<') {
-                   flush = true;
-                   st = "content";
-               } else if(c < 0) {
-                   flush = true;
-                   st = "content";
-               } else {
-                   buf.append((char)c);
-                   c = in.read();
-               }
-               if(flush) {
-                   Text n = doc.createTextNode(buf.toString());
-                   buf = new StringBuilder();
-                   stack.peek().appendChild(n);
-               }
-           } else if(st == "ent") {
-               if(c == ';') {
-                   String ename = ebuf.toString();
-                   ebuf = new StringBuilder();
-                   String rep = entity(ename);
-                   if(rep == null)
-                       throw(new ParseException("Unknown entity `" + ename + "' encountered"));
-                   buf.append(rep);
-                   st = "text";
-                   c = in.read();
-               } else if(c < 0) {
-                   throw(new ParseException("Unexpected end-of-file while parsing entity name"));
-               } else if(namechar((char)c)) {
-                   ebuf.append((char)c);
-                   c = in.read();
+           int c = s.in.peek();
+           if(c < 0) {
+               break;
+           } else if(namechar((char)c)) {
+               buf.append((char)s.in.read());
+           } else {
+               break;
+           }
+       }
+       if(buf.length() == 0)
+           throw(new ParseException("Expected name, got `" + printable(s.in.peek()) + "'"));
+       return(buf.toString());
+    }
+    
+    /**
+     * Reads an entity reference of the form {@code &name;} from the input
+     * and returns its replacement text as given by {@code entity(String)}.
+     *
+     * @throws ParseException if the input does not start with {@code &},
+     *         the name is not followed by {@code ;}, or the entity name
+     *         has no known replacement
+     */
+    protected String entity(State s) throws IOException {
+       int c = s.in.read();
+       if(c != '&')
+           throw(new ParseException("Expected `&' while reading entity, got `" + printable(c) + "'"));
+       String nm = name(s);
+       c = s.in.read();
+       if(c != ';')
+           throw(new ParseException("Expected `;' while reading entity, got `" + printable(c) + "'"));
+       String rep = entity(nm);
+       // A null lookup result must not reach the callers' StringBuilder,
+       // where it would be appended as the literal text "null".
+       if(rep == null)
+           throw(new ParseException("Unknown entity `" + nm + "' encountered"));
+       return(rep);
+    }
+
+    /**
+     * Parses one {@code name = "value"} (or single-quoted) attribute and
+     * returns it as an attribute node created through {@code makeattr}.
+     * Entity references inside the value are expanded. The node is not
+     * attached to {@code el}; that is left to the caller.
+     *
+     * @throws ParseException on a missing {@code =}, a missing opening
+     *         quote, or end-of-file inside the value
+     */
+    protected Attr attribute(State s, Element el) throws IOException {
+       String attrname = name(s);
+       s.in.peek(true);
+       int eq = s.in.read();
+       if(eq != '=')
+           throw(new ParseException("Expected `=' while reading attribute, got `" + printable(eq) + "'"));
+       s.in.peek(true);
+       int quote = s.in.read();
+       if(!((quote == '"') || (quote == '\'')))
+           throw(new ParseException("Expected double or single quote while reading attribute, got `" + printable(quote) + "'"));
+       StringBuilder val = new StringBuilder();
+       // Collect characters up to the matching quote, which is consumed below.
+       for(int c = s.in.peek(); c != quote; c = s.in.peek()) {
+           if(c < 0)
+               throw(new ParseException("Unexpected end-of-file while reading attribute value"));
+           if(c == '&')
+               val.append(entity(s));
+           else
+               val.append((char)s.in.read());
+       }
+       s.in.read();
+       return(makeattr(s.doc, el, attrname, val.toString()));
+    }
+    
+    /**
+     * Parses an element, starting at the first character of its tag name
+     * (the callers have already consumed the opening {@code <}). Reads the
+     * attributes, then either returns at once for an empty tag
+     * ({@code .../>}) or parses child text, comments and elements until
+     * the matching end tag has been consumed.
+     *
+     * @throws ParseException on malformed start or end tags, on an end tag
+     *         whose name differs from the start tag, or when the input
+     *         ends before the element has been closed
+     */
+    protected Element element(State s) throws IOException {
+       Element n = makenode(s.doc, name(s));
+       while(true) {
+           int c = s.in.peek(true);
+           if(c < 0) {
+               throw(new ParseException("Unexpected end-of-file while parsing start tag"));
+           } else if(c == '>') {
+               s.in.read();
+               break;
+           } else if(c == '/') {
+               s.in.read();
+               s.in.peek(true);
+               c = s.in.read();
+               if(c != '>')
+                   throw(new ParseException("Unexpected character `" + printable(c) + "' encountered in end of empty tag"));
+               return(n);
+           } else if(namechar((char)c)) {
+               n.setAttributeNodeNS(attribute(s, n));
+           } else {
+               throw(new ParseException("Unexpected character `" + printable(c) + "' encountered in start tag"));
+           }
+       }
+       while(true) {
+           int c = s.in.peek();
+           if(c < 0) {
+               // An element that was opened must be closed; do not
+               // silently accept truncated input.
+               throw(new ParseException("Unexpected end-of-file while parsing `" + n.getTagName() + "'"));
+           } else if(c == '<') {
+               s.in.read();
+               c = s.in.peek(true);
+               if(c == '/') {
+                   s.in.read();
+                   s.in.peek(true);
+                   String nm = name(s);
+                   if(!nm.equals(n.getTagName()))
+                       throw(new ParseException("Unexpected end tag for `" + nm + "' while parsing `" + n.getTagName() + "'"));
+                   // Re-read the lookahead so the message reports the
+                   // offending character rather than the earlier `/'.
+                   c = s.in.peek(true);
+                   if(c != '>')
+                       throw(new ParseException("Expected `>' while reading end tag, got `" + printable(c) + "'"));
+                   s.in.read();
+                   break;
                } else {
-                   throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in entity name"));
+                   n.appendChild(stag(s));
                }
-           } else if(st == "aent") {
-               if(c == ';') {
-                   String ename = ebuf.toString();
-                   ebuf = new StringBuilder();
-                   String rep = entity(ename);
-                   if(rep == null)
-                       throw(new ParseException("Unknown entity `" + ename + "' encountered"));
-                   buf.append(rep);
-                   st = "aval";
-                   c = in.read();
-               } else if(c < 0) {
-                   throw(new ParseException("Unexpected end-of-file while parsing entity name"));
-               } else if(namechar((char)c)) {
-                   ebuf.append((char)c);
-                   c = in.read();
+           } else {
+               n.appendChild(text(s));
+           }
+       }
+       return(n);
+    }
+    
+    /**
+     * Parses a comment, starting at the {@code !} that follows the opening
+     * {@code <}, and returns a comment node holding the text between
+     * {@code <!--} and the terminating {@code -->}.
+     *
+     * NOTE(review): a dash run longer than two directly before {@code >}
+     * (for instance {@code --->}) is consumed as comment text and does not
+     * terminate the comment.
+     *
+     * @throws ParseException if the input does not start with {@code !--}
+     *         or ends before {@code -->} is seen
+     */
+    protected Comment comment(State s) throws IOException {
+       if((s.in.read() != '!') ||
+          (s.in.read() != '-') ||
+          (s.in.read() != '-'))
+           throw(new ParseException("Illegal start of comment"));
+       StringBuilder buf = new StringBuilder();
+       while(true) {
+           int c = s.in.peek();
+           if(c < 0) {
+               throw(new ParseException("Unexpected end-of-file while parsing comment"));
+           } else if(c == '-') {
+               s.in.read();
+               if(s.in.peek() == '-') {
+                   s.in.read();
+                   if(s.in.peek() == '>') {
+                       s.in.read();
+                       break;
+                   } else {
+                       buf.append("--");
+                   }
                } else {
-                   throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in entity name"));
+                   buf.append("-");
                }
            } else {
-               throw(new Error("BUG: Typoed state " + st));
+               buf.append((char)s.in.read());
            }
        }
+       return(s.doc.createComment(buf.toString()));
     }
-    
-    private static String printable(char c) {
+
+    /**
+     * Parses whatever follows an already-consumed {@code <}: a comment if
+     * the next character returned by {@code peek(true)} is {@code !},
+     * otherwise an element.
+     *
+     * @throws ParseException if the input ends right after the {@code <}
+     */
+    protected Node stag(State s) throws IOException {
+       int next = s.in.peek(true);
+       if(next < 0)
+           throw(new ParseException("Unexpected end-of-file while parsing tag type"));
+       if(next == '!')
+           return(comment(s));
+       return(element(s));
+    }
+
+    /**
+     * Reads character data up to, but not including, the next {@code <}
+     * or end-of-file, expanding entity references along the way, and
+     * returns it as a text node.
+     */
+    protected Text text(State s) throws IOException {
+       StringBuilder chars = new StringBuilder();
+       for(int c = s.in.peek(); (c >= 0) && (c != '<'); c = s.in.peek()) {
+           if(c == '&')
+               chars.append(entity(s));
+           else
+               chars.append((char)s.in.read());
+       }
+       return(s.doc.createTextNode(chars.toString()));
+    }
+
+    /**
+     * Parses the whole of {@code in} and returns its top-level text,
+     * comments and elements as children of a new document fragment.
+     *
+     * @throws ParseException if the input is malformed
+     */
+    public DocumentFragment parse(Reader in) throws IOException {
+       State st = new State(in);
+       DocumentFragment root = st.doc.createDocumentFragment();
+       for(int c = st.in.peek(); c >= 0; c = st.in.peek()) {
+           Node child;
+           if(c == '<') {
+               st.in.read();
+               child = stag(st);
+           } else {
+               child = text(st);
+           }
+           root.appendChild(child);
+       }
+       return(root);
+    }
+
+    /**
+     * Renders a character code for use in error messages: {@code "EOF"}
+     * for negative values, a backslash followed by three octal digits for
+     * codes below 32, and the character itself otherwise.
+     */
+    private static String printable(int c) {
+       if(c < 0)
+           return("EOF");
        if(c < 32)
            return(String.format("\\%03o", (int)c));
-       return(Character.toString(c));
+       return(Character.toString((char)c));
     }
 
     public static void main(String[] args) throws Exception {