using System; using System.IO; using System.Text; using System.Collections; using System.Globalization; /* * Copyright 2003 Paulo Soares * * The contents of this file are subject to the Mozilla Public License Version 1.1 * (the "License"); you may not use this file except in compliance with the License. * You may obtain a copy of the License at http://www.mozilla.org/MPL/ * * Software distributed under the License is distributed on an "AS IS" basis, * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License * for the specific language governing rights and limitations under the License. * * The Original Code is 'iText, a free JAVA-PDF library'. * * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by * the Initial Developer are Copyright (C) 1999, 2000, 2001, 2002 by Bruno Lowagie. * All Rights Reserved. * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer * are Copyright (C) 2000, 2001, 2002 by Paulo Soares. All Rights Reserved. * * Contributor(s): all the names of the contributors are added in the source code * where applicable. * * Alternatively, the contents of this file may be used under the terms of the * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the * provisions of LGPL are applicable instead of those above. If you wish to * allow use of your version of this file only under the terms of the LGPL * License and not to allow others to use your version of this file under * the MPL, indicate your decision by deleting the provisions above and * replace them with the notice and other provisions required by the LGPL. * If you do not delete the provisions above, a recipient may use your version * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE. * * This library is free software; you can redistribute it and/or modify it * under the terms of the MPL as stated above or under the terms of the GNU * Library General Public License as published by the Free Software Foundation; * either version 2 of the License, or any later version. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more * details. * * If you didn't download this code from the following link, you should check if * you aren't using an obsolete version: * http://www.lowagie.com/iText/ * * The code to recognize the encoding in this class and in the convenience class IanaEncodings was taken from Apache Xerces published under the following license: * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * Part of this code is based on the Quick-and-Dirty XML parser by Steven Brandt. * The code for the Quick-and-Dirty parser was published in JavaWorld (java tip 128). * Steven Brandt and JavaWorld gave permission to use the code for free. * (Bruno Lowagie and Paulo Soares chose to use it under the MPL/LGPL in * conformance with the rest of the code). * The original code can be found on this url: http://www.javaworld.com/javatips/jw-javatip128_p.html. * It was substantially refactored by Bruno Lowagie. * * The method 'private static String getEncodingName(byte[] b4)' was found * in org.apache.xerces.impl.XMLEntityManager, originaly published by the * Apache Software Foundation under the Apache Software License; now being * used in iText under the MPL. */ namespace iTextSharp.text.xml.simpleparser { /** * A simple XML and HTML parser. This parser is, like the SAX parser, * an event based parser, but with much less functionality. *

* The parser can: *

*

*

* The code is based on * http://www.javaworld.com/javaworld/javatips/javatip128/ with some extra * code from XERCES to recognize the encoding. */ public sealed class SimpleXMLParser { /** possible states */ private const int UNKNOWN = 0; private const int TEXT = 1; private const int TAG_ENCOUNTERED = 2; private const int EXAMIN_TAG = 3; private const int TAG_EXAMINED = 4; private const int IN_CLOSETAG = 5; private const int SINGLE_TAG = 6; private const int CDATA = 7; private const int COMMENT = 8; private const int PI = 9; private const int ENTITY = 10; private const int QUOTE = 11; private const int ATTRIBUTE_KEY = 12; private const int ATTRIBUTE_EQUAL = 13; private const int ATTRIBUTE_VALUE = 14; /** the state stack */ internal Stack stack; /** The current character. */ internal int character = 0; /** The previous character. */ internal int previousCharacter = -1; /** the line we are currently reading */ internal int lines = 1; /** the column where the current character occurs */ internal int columns = 0; /** was the last character equivalent to a newline? */ internal bool eol = false; /** the current state */ internal int state; /** Are we parsing HTML? */ internal bool html; /** current text (whatever is encountered between tags) */ internal StringBuilder text = new StringBuilder(); /** current entity (whatever is encountered between & and ;) */ internal StringBuilder entity = new StringBuilder(); /** current tagname */ internal String tag = null; /** current attributes */ internal Hashtable attributes = null; /** The handler to which we are going to forward document content */ internal ISimpleXMLDocHandler doc; /** The handler to which we are going to forward comments. */ internal ISimpleXMLDocHandlerComment comment; /** Keeps track of the number of tags that are open. */ internal int nested = 0; /** the quote character that was used to open the quote. */ internal int quoteCharacter = '"'; /** the attribute key. */ internal String attributekey = null; /** the attribute value. */ internal String attributevalue = null; /** * Creates a Simple XML parser object. * Call Go(BufferedReader) immediately after creation. */ private SimpleXMLParser(ISimpleXMLDocHandler doc, ISimpleXMLDocHandlerComment comment, bool html) { this.doc = doc; this.comment = comment; this.html = html; stack = new Stack(); state = html ? TEXT : UNKNOWN; } /** * Does the actual parsing. Perform this immediately * after creating the parser object. */ private void Go(TextReader reader) { doc.StartDocument(); while (true) { // read a new character if (previousCharacter == -1) { character = reader.Read(); } // or re-examin the previous character else { character = previousCharacter; previousCharacter = -1; } // the end of the file was reached if (character == -1) { if (html) { if (html && state == TEXT) Flush(); doc.EndDocument(); } else { ThrowException("Missing end tag"); } return; } // dealing with \n and \r if (character == '\n' && eol) { eol = false; continue; } else if (eol) { eol = false; } else if (character == '\n') { lines++; columns = 0; } else if (character == '\r') { eol = true; character = '\n'; lines++; columns = 0; } else { columns++; } switch (state) { // we are in an unknown state before there's actual content case UNKNOWN: if (character == '<') { SaveState(TEXT); state = TAG_ENCOUNTERED; } break; // we can encounter any content case TEXT: if (character == '<') { Flush(); SaveState(state); state = TAG_ENCOUNTERED; } else if (character == '&') { SaveState(state); entity.Length = 0; state = ENTITY; } else text.Append((char)character); break; // we have just seen a < and are wondering what we are looking at // , , , etc. case TAG_ENCOUNTERED: InitTag(); if (character == '/') { state = IN_CLOSETAG; } else if (character == '?') { RestoreState(); state = PI; } else { text.Append((char)character); state = EXAMIN_TAG; } break; // we are processing something like this . // It could still be a or something. case EXAMIN_TAG: if (character == '>') { DoTag(); ProcessTag(true); InitTag(); state = RestoreState(); } else if (character == '/') { state = SINGLE_TAG; } else if (character == '-' && text.ToString().Equals("!-")) { Flush(); state = COMMENT; } else if (character == '[' && text.ToString().Equals("![CDATA")) { Flush(); state = CDATA; } else if (character == 'E' && text.ToString().Equals("!DOCTYP")) { Flush(); state = PI; } else if (char.IsWhiteSpace((char)character)) { DoTag(); state = TAG_EXAMINED; } else { text.Append((char)character); } break; // we know the name of the tag now. case TAG_EXAMINED: if (character == '>') { ProcessTag(true); InitTag(); state = RestoreState(); } else if (character == '/') { state = SINGLE_TAG; } else if (char.IsWhiteSpace((char)character)) { // empty } else { text.Append((char)character); state = ATTRIBUTE_KEY; } break; // we are processing a closing tag: e.g. case IN_CLOSETAG: if (character == '>') { DoTag(); ProcessTag(false); if (!html && nested==0) return; state = RestoreState(); } else { if (!char.IsWhiteSpace((char)character)) text.Append((char)character); } break; // we have just seen something like this: . case SINGLE_TAG: if (character != '>') ThrowException("Expected > for tag: <"+tag+"/>"); DoTag(); ProcessTag(true); ProcessTag(false); InitTag(); if (!html && nested==0) { doc.EndDocument(); return; } state = RestoreState(); break; // we are processing CDATA case CDATA: if (character == '>' && text.ToString().EndsWith("]]")) { text.Length = text.Length - 2; Flush(); state = RestoreState(); } else text.Append((char)character); break; // we are processing a comment. We are inside // the looking for the -->. case COMMENT: if (character == '>' && text.ToString().EndsWith("--")) { text.Length = text.Length - 2; Flush(); state = RestoreState(); } else text.Append((char)character); break; // We are inside one of these or one of these case PI: if (character == '>') { state = RestoreState(); if (state == TEXT) state = UNKNOWN; } break; // we are processing an entity, e.g. <, », etc. case ENTITY: if (character == ';') { state = RestoreState(); String cent = entity.ToString(); entity.Length = 0; char ce = EntitiesToUnicode.DecodeEntity(cent); if (ce == '\0') text.Append('&').Append(cent).Append(';'); else text.Append(ce); } else if ((character != '#' && (character < '0' || character > '9') && (character < 'a' || character > 'z') && (character < 'A' || character > 'Z')) || entity.Length >= 7) { state = RestoreState(); previousCharacter = character; text.Append('&').Append(entity.ToString()); entity.Length = 0; } else { entity.Append((char)character); } break; // We are processing the quoted right-hand side of an element's attribute. case QUOTE: if (html && quoteCharacter == ' ' && character == '>') { Flush(); ProcessTag(true); InitTag(); state = RestoreState(); } else if (html && quoteCharacter == ' ' && char.IsWhiteSpace((char)character)) { Flush(); state = TAG_EXAMINED; } else if (html && quoteCharacter == ' ') { text.Append((char)character); } else if (character == quoteCharacter) { Flush(); state = TAG_EXAMINED; } else if (" \r\n\u0009".IndexOf((char)character)>=0) { text.Append(' '); } else if (character == '&') { SaveState(state); state = ENTITY; entity.Length = 0; } else { text.Append((char)character); } break; case ATTRIBUTE_KEY: if (char.IsWhiteSpace((char)character)) { Flush(); state = ATTRIBUTE_EQUAL; } else if (character == '=') { Flush(); state = ATTRIBUTE_VALUE; } else if (html && character == '>') { text.Length = 0; ProcessTag(true); InitTag(); state = RestoreState(); } else { text.Append((char)character); } break; case ATTRIBUTE_EQUAL: if (character == '=') { state = ATTRIBUTE_VALUE; } else if (char.IsWhiteSpace((char)character)) { // empty } else if (html && character == '>') { text.Length = 0; ProcessTag(true); InitTag(); state = RestoreState(); } else if (html && character == '/') { Flush(); state = SINGLE_TAG; } else if (html) { Flush(); text.Append((char)character); state = ATTRIBUTE_KEY; } else { ThrowException("Error in attribute processing."); } break; case ATTRIBUTE_VALUE: if (character == '"' || character == '\'') { quoteCharacter = character; state = QUOTE; } else if (char.IsWhiteSpace((char)character)) { // empty } else if (html && character == '>') { Flush(); ProcessTag(true); InitTag(); state = RestoreState(); } else if (html) { text.Append((char)character); quoteCharacter = ' '; state = QUOTE; } else { ThrowException("Error in attribute processing"); } break; } } } /** * Gets a state from the stack * @return the previous state */ private int RestoreState() { if (stack.Count != 0) return (int)stack.Pop(); else return UNKNOWN; } /** * Adds a state to the stack. * @param s a state to add to the stack */ private void SaveState(int s) { stack.Push(s); } /** * Flushes the text that is currently in the buffer. * The text can be ignored, added to the document * as content or as comment,... depending on the current state. */ private void Flush() { switch (state){ case TEXT: case CDATA: if (text.Length > 0) { doc.Text(text.ToString()); } break; case COMMENT: if (comment != null) { comment.Comment(text.ToString()); } break; case ATTRIBUTE_KEY: attributekey = text.ToString(); if (html) attributekey = attributekey.ToLower(CultureInfo.InvariantCulture); break; case QUOTE: case ATTRIBUTE_VALUE: attributevalue = text.ToString(); attributes[attributekey] = attributevalue; break; default: // do nothing break; } text.Length = 0; } /** * Initialized the tag name and attributes. */ private void InitTag() { tag = null; attributes = new Hashtable(); } /** Sets the name of the tag. */ private void DoTag() { if (tag == null) tag = text.ToString(); if (html) tag = tag.ToLower(CultureInfo.InvariantCulture); text.Length = 0; } /** * processes the tag. * @param start if true we are dealing with a tag that has just been opened; if false we are closing a tag. */ private void ProcessTag(bool start) { if (start) { nested++; doc.StartElement(tag,attributes); } else { nested--; doc.EndElement(tag); } } /** Throws an exception */ private void ThrowException(String s) { throw new IOException(s+" near line " + lines + ", column " + columns); } /** * Parses the XML document firing the events to the handler. * @param doc the document handler * @param r the document. The encoding is already resolved. The reader is not closed * @throws IOException on error */ public static void Parse(ISimpleXMLDocHandler doc, ISimpleXMLDocHandlerComment comment, TextReader r, bool html) { SimpleXMLParser parser = new SimpleXMLParser(doc, comment, html); parser.Go(r); } /** * Parses the XML document firing the events to the handler. * @param doc the document handler * @param in the document. The encoding is deduced from the stream. The stream is not closed * @throws IOException on error */ public static void Parse(ISimpleXMLDocHandler doc, Stream inp) { byte[] b4 = new byte[4]; int count = inp.Read(b4, 0, b4.Length); if (count != 4) throw new IOException("Insufficient length."); String encoding = GetEncodingName(b4); String decl = null; if (encoding.Equals("UTF-8")) { StringBuilder sb = new StringBuilder(); int c; while ((c = inp.ReadByte()) != -1) { if (c == '>') break; sb.Append((char)c); } decl = sb.ToString(); } else if (encoding.Equals("CP037")) { MemoryStream bi = new MemoryStream(); int c; while ((c = inp.ReadByte()) != -1) { if (c == 0x6e) // that's '>' in ebcdic break; bi.WriteByte((byte)c); } decl = Encoding.GetEncoding(37).GetString(bi.ToArray());//cp037 ebcdic } if (decl != null) { decl = GetDeclaredEncoding(decl); if (decl != null) encoding = decl; } Parse(doc, new StreamReader(inp, IanaEncodings.GetEncodingEncoding(encoding))); } private static String GetDeclaredEncoding(String decl) { if (decl == null) return null; int idx = decl.IndexOf("encoding"); if (idx < 0) return null; int idx1 = decl.IndexOf('"', idx); int idx2 = decl.IndexOf('\'', idx); if (idx1 == idx2) return null; if ((idx1 < 0 && idx2 > 0) || (idx2 > 0 && idx2 < idx1)) { int idx3 = decl.IndexOf('\'', idx2 + 1); if (idx3 < 0) return null; return decl.Substring(idx2 + 1, idx3 - (idx2 + 1)); } if ((idx2 < 0 && idx1 > 0) || (idx1 > 0 && idx1 < idx2)) { int idx3 = decl.IndexOf('"', idx1 + 1); if (idx3 < 0) return null; return decl.Substring(idx1 + 1, idx3 - (idx1 + 1)); } return null; } public static void Parse(ISimpleXMLDocHandler doc, TextReader r) { Parse(doc, null, r, false); } /** * Escapes a string with the appropriated XML codes. * @param s the string to be escaped * @param onlyASCII codes above 127 will always be escaped with &#nn; if true * @return the escaped string */ public static String EscapeXML(String s, bool onlyASCII) { char[] cc = s.ToCharArray(); int len = cc.Length; StringBuilder sb = new StringBuilder(); for (int k = 0; k < len; ++k) { int c = cc[k]; switch (c) { case '<': sb.Append("<"); break; case '>': sb.Append(">"); break; case '&': sb.Append("&"); break; case '"': sb.Append("""); break; case '\'': sb.Append("'"); break; default: if (onlyASCII && c > 127) sb.Append("&#").Append(c).Append(';'); else sb.Append((char)c); break; } } return sb.ToString(); } /** * Returns the IANA encoding name that is auto-detected from * the bytes specified, with the endian-ness of that encoding where appropriate. * (method found in org.apache.xerces.impl.XMLEntityManager, originaly published * by the Apache Software Foundation under the Apache Software License; now being * used in iText under the MPL) * @param b4 The first four bytes of the input. * @return an IANA-encoding string */ private static String GetEncodingName(byte[] b4) { // UTF-16, with BOM int b0 = b4[0] & 0xFF; int b1 = b4[1] & 0xFF; if (b0 == 0xFE && b1 == 0xFF) { // UTF-16, big-endian return "UTF-16BE"; } if (b0 == 0xFF && b1 == 0xFE) { // UTF-16, little-endian return "UTF-16LE"; } // UTF-8 with a BOM int b2 = b4[2] & 0xFF; if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) { return "UTF-8"; } // other encodings int b3 = b4[3] & 0xFF; if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) { // UCS-4, big endian (1234) return "ISO-10646-UCS-4"; } if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) { // UCS-4, little endian (4321) return "ISO-10646-UCS-4"; } if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) { // UCS-4, unusual octet order (2143) // REVISIT: What should this be? return "ISO-10646-UCS-4"; } if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) { // UCS-4, unusual octect order (3412) // REVISIT: What should this be? return "ISO-10646-UCS-4"; } if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) { // UTF-16, big-endian, no BOM // (or could turn out to be UCS-2... // REVISIT: What should this be? return "UTF-16BE"; } if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) { // UTF-16, little-endian, no BOM // (or could turn out to be UCS-2... return "UTF-16LE"; } if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) { // EBCDIC // a la xerces1, return CP037 instead of EBCDIC here return "CP037"; } // default encoding return "UTF-8"; } } }