using System; using System.IO; using System.Text; using System.Collections; using System.Globalization; /* * Copyright 2003 Paulo Soares * * The contents of this file are subject to the Mozilla Public License Version 1.1 * (the "License"); you may not use this file except in compliance with the License. * You may obtain a copy of the License at http://www.mozilla.org/MPL/ * * Software distributed under the License is distributed on an "AS IS" basis, * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License * for the specific language governing rights and limitations under the License. * * The Original Code is 'iText, a free JAVA-PDF library'. * * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by * the Initial Developer are Copyright (C) 1999, 2000, 2001, 2002 by Bruno Lowagie. * All Rights Reserved. * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer * are Copyright (C) 2000, 2001, 2002 by Paulo Soares. All Rights Reserved. * * Contributor(s): all the names of the contributors are added in the source code * where applicable. * * Alternatively, the contents of this file may be used under the terms of the * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the * provisions of LGPL are applicable instead of those above. If you wish to * allow use of your version of this file only under the terms of the LGPL * License and not to allow others to use your version of this file under * the MPL, indicate your decision by deleting the provisions above and * replace them with the notice and other provisions required by the LGPL. * If you do not delete the provisions above, a recipient may use your version * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE. 
* * This library is free software; you can redistribute it and/or modify it * under the terms of the MPL as stated above or under the terms of the GNU * Library General Public License as published by the Free Software Foundation; * either version 2 of the License, or any later version. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more * details. * * If you didn't download this code from the following link, you should check if * you aren't using an obsolete version: * http://www.lowagie.com/iText/ * * The code to recognize the encoding in this class and in the convenience class IanaEncodings was taken from Apache Xerces published under the following license: * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * Part of this code is based on the Quick-and-Dirty XML parser by Steven Brandt. * The code for the Quick-and-Dirty parser was published in JavaWorld (java tip 128). * Steven Brandt and JavaWorld gave permission to use the code for free. * (Bruno Lowagie and Paulo Soares chose to use it under the MPL/LGPL in * conformance with the rest of the code). 
* The original code can be found on this url: http://www.javaworld.com/javatips/jw-javatip128_p.html. * It was substantially refactored by Bruno Lowagie. * * The method 'private static String getEncodingName(byte[] b4)' was found * in org.apache.xerces.impl.XMLEntityManager, originaly published by the * Apache Software Foundation under the Apache Software License; now being * used in iText under the MPL. */ namespace iTextSharp.text.xml.simpleparser { /** * A simple XML and HTML parser. This parser is, like the SAX parser, * an event based parser, but with much less functionality. *
* The parser can:
* - recognize the <![CDATA[ ... ]]> construct
* - map line endings \r\n and \r to \n on input, in accordance with the XML Specification, Section 2.11
*
* The code is based on
* http://www.javaworld.com/javaworld/javatips/javatip128/ with some extra
* code from XERCES to recognize the encoding.
*/
public sealed class SimpleXMLParser {
/**
 * Possible values of the <code>state</code> field.
 * The constructor selects UNKNOWN as the initial state for XML input and TEXT for HTML input.
 */
/** initial XML state: everything is ignored until the first '<' is read */
private const int UNKNOWN = 0;
/** character data between tags; '<' starts a tag, '&' starts an entity, anything else is appended to <code>text</code> */
private const int TEXT = 1;
/** a '<' has just been read and the kind of markup is not yet known */
private const int TAG_ENCOUNTERED = 2;
private const int EXAMIN_TAG = 3;
private const int TAG_EXAMINED = 4;
private const int IN_CLOSETAG = 5;
private const int SINGLE_TAG = 6;
private const int CDATA = 7;
private const int COMMENT = 8;
private const int PI = 9;
/** an '&' has been read; the parser switches to this state after clearing <code>entity</code> */
private const int ENTITY = 10;
private const int QUOTE = 11;
private const int ATTRIBUTE_KEY = 12;
private const int ATTRIBUTE_EQUAL = 13;
private const int ATTRIBUTE_VALUE = 14;
/** the state stack (created empty by the constructor; presumably filled through SaveState - confirm in the rest of the file) */
internal Stack stack;
/** The current character as returned by the reader; -1 signals the end of the input. */
internal int character = 0;
/** A character that has to be examined again on the next loop iteration, or -1 when there is none. */
internal int previousCharacter = -1;
/** the line we are currently reading; starts at 1 and is incremented for every '\n' or '\r' (a '\n' directly after a '\r' is skipped, so "\r\n" counts once) */
internal int lines = 1;
/** the column where the current character occurs; reset to 0 at every line break */
internal int columns = 0;
/** true when the previous character was a '\r' (already reported as '\n'), so that a directly following '\n' can be skipped */
internal bool eol = false;
/** the current state; one of the state constants declared above */
internal int state;
/** Are we parsing HTML? If so, parsing starts in the TEXT state and reaching the end of the input is not an error. */
internal bool html;
/** current text (whatever is encountered between tags) */
internal StringBuilder text = new StringBuilder();
/** current entity (whatever is encountered between & and ;); emptied each time a '&' is read in the TEXT state */
internal StringBuilder entity = new StringBuilder();
/** current tagname */
internal String tag = null;
/** current attributes */
internal Hashtable attributes = null;
/** The handler to which we are going to forward document content (StartDocument / EndDocument are called on it by Go) */
internal ISimpleXMLDocHandler doc;
/** The handler to which we are going to forward comments. */
internal ISimpleXMLDocHandlerComment comment;
/** Keeps track of the number of tags that are open. */
internal int nested = 0;
/** the quote character that was used to open the quote; defaults to the double quote */
internal int quoteCharacter = '"';
/** the attribute key. */
internal String attributekey = null;
/** the attribute value. */
internal String attributevalue = null;
/**
 * Builds a parser that reports to the given handlers.
 * The constructor only stores its arguments and prepares the parser state;
 * Go(TextReader) has to be called afterwards to do the actual parsing.
 * @param doc the handler that receives the document content
 * @param comment the handler that receives the comments
 * @param html true to parse HTML (the parser then starts in the TEXT state),
 *        false to parse XML (the parser then starts in the UNKNOWN state)
 */
private SimpleXMLParser(ISimpleXMLDocHandler doc, ISimpleXMLDocHandlerComment comment, bool html) {
    stack = new Stack();
    this.html = html;
    this.comment = comment;
    this.doc = doc;
    // HTML has no prolog to wait for, so text is accepted right away.
    if (html) {
        state = TEXT;
    } else {
        state = UNKNOWN;
    }
}
/**
* Does the actual parsing. Perform this immediately
* after creating the parser object.
*/
private void Go(TextReader reader) {
doc.StartDocument();
while (true) {
// read a new character
if (previousCharacter == -1) {
character = reader.Read();
}
// or re-examin the previous character
else {
character = previousCharacter;
previousCharacter = -1;
}
// the end of the file was reached
if (character == -1) {
if (html) {
if (html && state == TEXT)
Flush();
doc.EndDocument();
} else {
ThrowException("Missing end tag");
}
return;
}
// dealing with \n and \r
if (character == '\n' && eol) {
eol = false;
continue;
} else if (eol) {
eol = false;
} else if (character == '\n') {
lines++;
columns = 0;
} else if (character == '\r') {
eol = true;
character = '\n';
lines++;
columns = 0;
} else {
columns++;
}
switch (state) {
// we are in an unknown state before there's actual content
case UNKNOWN:
if (character == '<') {
SaveState(TEXT);
state = TAG_ENCOUNTERED;
}
break;
// we can encounter any content
case TEXT:
if (character == '<') {
Flush();
SaveState(state);
state = TAG_ENCOUNTERED;
} else if (character == '&') {
SaveState(state);
entity.Length = 0;
state = ENTITY;
} else
text.Append((char)character);
break;
// we have just seen a < and are wondering what we are looking at
// true
* @return the escaped string
*/
/**
 * Escapes a string so that it can be written as XML character data or as an
 * attribute value. The characters &lt; &gt; &amp; &quot; and &apos; are replaced
 * by the corresponding predefined XML entities.
 * @param s the string to be escaped; must not be null
 * @param onlyASCII if true, every character above 127 is written as a decimal
 *        numeric character reference (&#nnn;); if false, such characters are
 *        copied unchanged
 * @return the escaped string
 */
public static String EscapeXML(String s, bool onlyASCII) {
    int len = s.Length;
    StringBuilder sb = new StringBuilder(len);
    for (int k = 0; k < len; ++k) {
        char c = s[k];
        switch (c) {
            case '<':
                sb.Append("&lt;");
                break;
            case '>':
                sb.Append("&gt;");
                break;
            case '&':
                sb.Append("&amp;");
                break;
            case '"':
                sb.Append("&quot;");
                break;
            case '\'':
                sb.Append("&apos;");
                break;
            default:
                if (onlyASCII && c > 127) {
                    int codePoint = c;
                    // A surrogate pair stands for a single character; a reference to
                    // each half would not be a legal XML character reference, so the
                    // pair is combined into one code point. An unpaired surrogate is
                    // still written as its own code unit.
                    if (char.IsHighSurrogate(c) && k + 1 < len && char.IsLowSurrogate(s[k + 1])) {
                        codePoint = char.ConvertToUtf32(c, s[k + 1]);
                        ++k;
                    }
                    sb.Append("&#").Append(codePoint).Append(';');
                } else {
                    sb.Append(c);
                }
                break;
        }
    }
    return sb.ToString();
}
/**
 * Guesses the IANA name of the encoding of an XML stream from its first
 * bytes, including the byte order where that applies.
 * (method found in org.apache.xerces.impl.XMLEntityManager, originally published
 * by the Apache Software Foundation under the Apache Software License; now being
 * used in iText under the MPL)
 * The bytes are read lazily: a 16 bit byte order mark is recognized from the
 * first two bytes alone, a UTF-8 byte order mark from the first three.
 * @param b4 The first four bytes of the input.
 * @return an IANA-encoding string; "UTF-8" when nothing else is recognized
 */
private static String GetEncodingName(byte[] b4) {
    int first = b4[0];
    int second = b4[1];
    int leadingPair = (first << 8) | second;

    // UTF-16 byte order marks
    if (leadingPair == 0xFEFF) {
        return "UTF-16BE";
    }
    if (leadingPair == 0xFFFE) {
        return "UTF-16LE";
    }

    // UTF-8 byte order mark
    int third = b4[2];
    if (leadingPair == 0xEFBB && third == 0xBF) {
        return "UTF-8";
    }

    // No byte order mark: look at how the leading "<" or "<?" is laid out.
    int fourth = b4[3];
    uint signature = ((uint)leadingPair << 16) | ((uint)third << 8) | (uint)fourth;
    switch (signature) {
        case 0x0000003Cu: // UCS-4, big endian (1234)
        case 0x3C000000u: // UCS-4, little endian (4321)
        case 0x00003C00u: // UCS-4, unusual octet order (2143)
        case 0x003C0000u: // UCS-4, unusual octet order (3412)
            return "ISO-10646-UCS-4";
        case 0x003C003Fu: // UTF-16, big-endian, no BOM (could also be UCS-2)
            return "UTF-16BE";
        case 0x3C003F00u: // UTF-16, little-endian, no BOM (could also be UCS-2)
            return "UTF-16LE";
        case 0x4C6FA794u: // EBCDIC; like xerces1, CP037 is reported
            return "CP037";
        default:
            return "UTF-8";
    }
}
}
}