From 7c0e72ac740a2d9a3579977e53d935da849344f2 Mon Sep 17 00:00:00 2001 From: Fredrik Tolf Date: Sun, 15 Nov 2009 19:16:31 +0100 Subject: [PATCH] Replaced the NEXT parser with a new, better, recursive-descent one. --- src/dolda/jsvc/next/Parser.java | 418 ++++++++++++++++++---------------------- 1 file changed, 187 insertions(+), 231 deletions(-) diff --git a/src/dolda/jsvc/next/Parser.java b/src/dolda/jsvc/next/Parser.java index b6bec77..dd03710 100644 --- a/src/dolda/jsvc/next/Parser.java +++ b/src/dolda/jsvc/next/Parser.java @@ -21,6 +21,15 @@ public class Parser { domimp = di; } + public class State { + public final Document doc = domimp.createDocument(null, "dummy", null); + public final PeekReader in; + + private State(Reader in) { + this.in = new PeekReader(in); + } + } + private static boolean namechar(char c) { return((c == ':') || (c == '_') || (c == '$') || (c == '.') || (c == '-') || ((c >= '0') && (c <= '9')) || ((c >= 'A') && (c <= 'Z')) || ((c >= 'a') && (c <= 'z'))); } @@ -43,253 +52,200 @@ public class Parser { return(doc.createElementNS(null, name)); } + protected Attr makeattr(Document doc, Element el, String name, String val) { + Attr a = doc.createAttributeNS(el.getNamespaceURI(), name); + a.setValue(val); + return(a); + } + protected Attr makeattr(Document doc, Element el, String name) { return(doc.createAttributeNS(el.getNamespaceURI(), name)); } - public DocumentFragment parse(Reader in) throws IOException { - Stack stack = new Stack(); - Document doc = domimp.createDocument(null, "dummy", null); - DocumentFragment frag = doc.createDocumentFragment(); - stack.push(frag); - String st = "content"; - int c = in.read(); + protected String name(State s) throws IOException { StringBuilder buf = new StringBuilder(); - StringBuilder ebuf = new StringBuilder(); - char atype = 0; - int cdashcnt = 0; while(true) { - if(st == "content") { - if(c == '<') { - st = "tag"; - c = in.read(); - } else if(c < 0) { - if(stack.peek() == frag) - return(frag); - else 
- throw(new ParseException("Unexpected end-of-file while parsing non-root element")); - } else { - st = "text"; - } - } else if(st == "tag") { - if(Character.isWhitespace((char)c)) { - c = in.read(); - } else if(c == '!') { - cdashcnt = 0; - c = in.read(); - st = "comment"; - } else if(namechar((char)c)) { - st = "stag"; - } else if(c == '/') { - c = in.read(); - st = "etag"; - } else if(c < 0) { - throw(new ParseException("Unexpected end-of-file while parsing tag")); - } else { - throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in tag name")); - } - } else if(st == "stag") { - boolean flush = false; - if(namechar((char)c)) { - buf.append((char)c); - c = in.read(); - } else if(c == '>') { - flush = true; - } else if(Character.isWhitespace((char)c)) { - flush = true; - c = in.read(); - } else if(c < 0) { - throw(new ParseException("Unexpected end-of-file while parsing tag name")); - } else { - throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in tag name")); - } - if(flush) { - Element n = makenode(doc, buf.toString()); - buf = new StringBuilder(); - stack.peek().appendChild(n); - stack.push(n); - st = "attr"; - } - } else if(st == "comment") { - if(c == '-') { - cdashcnt++; - c = in.read(); - } else if((c == '>') && (cdashcnt == 4)) { - stack.peek().appendChild(doc.createComment(buf.toString())); - buf = new StringBuilder(); - st = "content"; - c = in.read(); - } else if(cdashcnt >= 2) { - if(cdashcnt > 2) - cdashcnt = 2; - buf.append((char)c); - c = in.read(); - } else if(c < 0) { - throw(new ParseException("Unexpected end-of-file while parsing comment")); - } else { - throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in comment")); - } - } else if(st == "attr") { - if(namechar((char)c)) { - st = "aname"; - } else if(c == '>') { - st = "content"; - c = in.read(); - } else if(c == '/') { - st = "stagend"; - c = in.read(); - } else 
if(Character.isWhitespace((char)c)) { - c = in.read(); - } else if(c < 0) { - throw(new ParseException("Unexpected end-of-file while parsing attributes")); - } else { - throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered inside tag")); - } - } else if(st == "stagend") { - if(c == '>') { - stack.pop(); - c = in.read(); - st = "content"; - } else if(Character.isWhitespace((char)c)) { - c = in.read(); - } else if(c < 0) { - throw(new ParseException("Unexpected end-of-file at end of empty tag")); - } else { - throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered at and of empty tag")); - } - } else if(st == "aname") { - if(namechar((char)c)) { - buf.append((char)c); - c = in.read(); - } else if(Character.isWhitespace((char)c)) { - c = in.read(); - } else if(c == '=') { - Element el = (Element)stack.peek(); - Attr attr = makeattr(doc, el, buf.toString()); - el.setAttributeNodeNS(attr); - buf = new StringBuilder(); - stack.push(attr); - st = "avalstart"; - c = in.read(); - } else if(c < 0) { - throw(new ParseException("Unexpected end-of-file while parsing attribute name")); - } else { - throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in attribute name")); - } - } else if(st == "avalstart") { - if((c == '\'') || (c == '"')) { - atype = (char)c; - c = in.read(); - st = "aval"; - } else if(Character.isWhitespace((char)c)) { - c = in.read(); - } else if(c < 0) { - throw(new ParseException("Unexpected end-of-file while parsing attribute value")); - } else { - throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in attribute value")); - } - } else if(st == "aval") { - if(c == atype) { - c = in.read(); - Attr a = (Attr)stack.pop(); - a.setValue(buf.toString()); - buf = new StringBuilder(); - st = "attr"; - } else if(c == '&') { - c = in.read(); - st = "aent"; - } else if(c < 0) { - throw(new ParseException("Unexpected end-of-file while 
parsing attribute value")); - } else { - buf.append((char)c); - c = in.read(); - } - } else if(st == "etag") { - if(namechar((char)c)) { - buf.append((char)c); - c = in.read(); - } else if(c == '>') { - String nm = buf.toString(); - buf = new StringBuilder(); - Node n = stack.pop(); - if(n instanceof DocumentFragment) - throw(new ParseException("Unexpected end tag for `" + nm + "' while parsing root content")); - Element el = (Element)n; - if(!nm.equals(el.getTagName())) - throw(new ParseException("Unexpected end tag for `" + nm + "' while parsing `" + el.getTagName() + "'")); - c = in.read(); - st = "content"; - } else if(c < 0) { - throw(new ParseException("Unexpected end-of-file while parsing end tag")); - } else { - throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in end tag")); - } - } else if(st == "text") { - boolean flush = false; - if(c == '&') { - st = "ent"; - c = in.read(); - } else if(c == '<') { - flush = true; - st = "content"; - } else if(c < 0) { - flush = true; - st = "content"; - } else { - buf.append((char)c); - c = in.read(); - } - if(flush) { - Text n = doc.createTextNode(buf.toString()); - buf = new StringBuilder(); - stack.peek().appendChild(n); - } - } else if(st == "ent") { - if(c == ';') { - String ename = ebuf.toString(); - ebuf = new StringBuilder(); - String rep = entity(ename); - if(rep == null) - throw(new ParseException("Unknown entity `" + ename + "' encountered")); - buf.append(rep); - st = "text"; - c = in.read(); - } else if(c < 0) { - throw(new ParseException("Unexpected end-of-file while parsing entity name")); - } else if(namechar((char)c)) { - ebuf.append((char)c); - c = in.read(); + int c = s.in.peek(); + if(c < 0) { + break; + } else if(namechar((char)c)) { + buf.append((char)s.in.read()); + } else { + break; + } + } + if(buf.length() == 0) + throw(new ParseException("Expected name, got `" + printable(s.in.peek()) + "'")); + return(buf.toString()); + } + + protected String 
entity(State s) throws IOException { + int c = s.in.read(); + if(c != '&') + throw(new ParseException("Expected `&' while reading entity, got `" + printable(c) + "'")); + String nm = name(s); + c = s.in.read(); + if(c != ';') + throw(new ParseException("Expected `;' while reading entity, got `" + printable(c) + "'")); + return(entity(nm)); + } + + protected Attr attribute(State s, Element el) throws IOException { + String nm = name(s); + s.in.peek(true); + int c = s.in.read(); + if(c != '=') + throw(new ParseException("Expected `=' while reading attribute, got `" + printable(c) + "'")); + s.in.peek(true); + int qt = s.in.read(); + if((qt != '"') && (qt != '\'')) + throw(new ParseException("Expected double or single quote while reading attribute, got `" + printable(qt) + "'")); + StringBuilder buf = new StringBuilder(); + while(true) { + c = s.in.peek(); + if(c < 0) { + throw(new ParseException("Unexpected end-of-file while reading attribute value")); + } else if(c == qt) { + s.in.read(); + break; + } else if(c == '&') { + buf.append(entity(s)); + } else { + buf.append((char)s.in.read()); + } + } + return(makeattr(s.doc, el, nm, buf.toString())); + } + + protected Element element(State s) throws IOException { + Element n = makenode(s.doc, name(s)); + while(true) { + int c = s.in.peek(true); + if(c < 0) { + throw(new ParseException("Unexpected end-of-file while parsing start tag")); + } else if(c == '>') { + s.in.read(); + break; + } else if(c == '/') { + s.in.read(); + s.in.peek(true); + c = s.in.read(); + if(c != '>') + throw(new ParseException("Unexpected character `" + printable(c) + "' encountered in end of empty tag")); + return(n); + } else if(namechar((char)c)) { + n.setAttributeNodeNS(attribute(s, n)); + } else { + throw(new ParseException("Unexpected character `" + printable(c) + "' encountered in start tag")); + } + } + while(true) { + int c = s.in.peek(); + if(c < 0) { + break; + } else if(c == '<') { + s.in.read(); + c = s.in.peek(true); + if(c == '/') { 
+ s.in.read(); + s.in.peek(true); + String nm = name(s); + if(!nm.equals(n.getTagName())) + throw(new ParseException("Unexpected end tag for `" + nm + "' while parsing `" + n.getTagName() + "'")); + if(s.in.peek(true) != '>') + throw(new ParseException("Expected `>' while reading end tag, got `" + printable(c) + "'")); + s.in.read(); + break; } else { - throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in entity name")); + n.appendChild(stag(s)); } - } else if(st == "aent") { - if(c == ';') { - String ename = ebuf.toString(); - ebuf = new StringBuilder(); - String rep = entity(ename); - if(rep == null) - throw(new ParseException("Unknown entity `" + ename + "' encountered")); - buf.append(rep); - st = "aval"; - c = in.read(); - } else if(c < 0) { - throw(new ParseException("Unexpected end-of-file while parsing entity name")); - } else if(namechar((char)c)) { - ebuf.append((char)c); - c = in.read(); + } else { + n.appendChild(text(s)); + } + } + return(n); + } + + protected Comment comment(State s) throws IOException { + if((s.in.read() != '!') || + (s.in.read() != '-') || + (s.in.read() != '-')) + throw(new ParseException("Illegal start of comment")); + StringBuilder buf = new StringBuilder(); + while(true) { + int c = s.in.peek(); + if(c < 0) { + throw(new ParseException("Unexpected end-of-file while parsing comment")); + } else if(c == '-') { + s.in.read(); + if(s.in.peek() == '-') { + s.in.read(); + if(s.in.peek() == '>') { + s.in.read(); + break; + } else { + buf.append("--"); + } } else { - throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in entity name")); + buf.append("-"); } } else { - throw(new Error("BUG: Typoed state " + st)); + buf.append((char)s.in.read()); } } + return(s.doc.createComment(buf.toString())); } - - private static String printable(char c) { + + protected Node stag(State s) throws IOException { + int c = s.in.peek(true); + if(c < 0) { + throw(new 
ParseException("Unexpected end-of-file while parsing tag type")); + } else if(c == '!') { + return(comment(s)); + } else { + return(element(s)); + } + } + + protected Text text(State s) throws IOException { + StringBuilder buf = new StringBuilder(); + while(true) { + int c = s.in.peek(); + if(c < 0) { + break; + } else if(c == '<') { + break; + } else if(c == '&') { + buf.append(entity(s)); + } else { + buf.append((char)s.in.read()); + } + } + return(s.doc.createTextNode(buf.toString())); + } + + public DocumentFragment parse(Reader in) throws IOException { + State s = new State(in); + DocumentFragment frag = s.doc.createDocumentFragment(); + while(true) { + int c = s.in.peek(); + if(c < 0) { + return(frag); + } else if(c == '<') { + s.in.read(); + frag.appendChild(stag(s)); + } else { + frag.appendChild(text(s)); + } + } + } + + private static String printable(int c) { + if(c < 0) + return("EOF"); if(c < 32) return(String.format("\\%03o", (int)c)); - return(Character.toString(c)); + return(Character.toString((char)c)); } public static void main(String[] args) throws Exception { -- 2.11.0