// C# source file: 741 lines, 30 KiB
using System;
|
|
using System.IO;
|
|
using System.Text;
|
|
using System.Collections;
|
|
using System.Globalization;
|
|
/*
|
|
* Copyright 2003 Paulo Soares
|
|
*
|
|
* The contents of this file are subject to the Mozilla Public License Version 1.1
|
|
* (the "License"); you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at http://www.mozilla.org/MPL/
|
|
*
|
|
* Software distributed under the License is distributed on an "AS IS" basis,
|
|
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
|
* for the specific language governing rights and limitations under the License.
|
|
*
|
|
* The Original Code is 'iText, a free JAVA-PDF library'.
|
|
*
|
|
* The Initial Developer of the Original Code is Bruno Lowagie. Portions created by
|
|
* the Initial Developer are Copyright (C) 1999, 2000, 2001, 2002 by Bruno Lowagie.
|
|
* All Rights Reserved.
|
|
* Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer
|
|
* are Copyright (C) 2000, 2001, 2002 by Paulo Soares. All Rights Reserved.
|
|
*
|
|
* Contributor(s): all the names of the contributors are added in the source code
|
|
* where applicable.
|
|
*
|
|
* Alternatively, the contents of this file may be used under the terms of the
|
|
* LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the
|
|
* provisions of LGPL are applicable instead of those above. If you wish to
|
|
* allow use of your version of this file only under the terms of the LGPL
|
|
* License and not to allow others to use your version of this file under
|
|
* the MPL, indicate your decision by deleting the provisions above and
|
|
* replace them with the notice and other provisions required by the LGPL.
|
|
* If you do not delete the provisions above, a recipient may use your version
|
|
* of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE.
|
|
*
|
|
* This library is free software; you can redistribute it and/or modify it
|
|
* under the terms of the MPL as stated above or under the terms of the GNU
|
|
* Library General Public License as published by the Free Software Foundation;
|
|
* either version 2 of the License, or any later version.
|
|
*
|
|
* This library is distributed in the hope that it will be useful, but WITHOUT
|
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
* FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more
|
|
* details.
|
|
*
|
|
* If you didn't download this code from the following link, you should check if
|
|
* you aren't using an obsolete version:
|
|
* http://www.lowagie.com/iText/
|
|
*
|
|
* The code to recognize the encoding in this class and in the convenience class IanaEncodings was taken from Apache Xerces published under the following license:
|
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
* contributor license agreements. See the NOTICE file distributed with
|
|
* this work for additional information regarding copyright ownership.
|
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
* (the "License"); you may not use this file except in compliance with
|
|
* the License. You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*
|
|
* Part of this code is based on the Quick-and-Dirty XML parser by Steven Brandt.
|
|
* The code for the Quick-and-Dirty parser was published in JavaWorld (java tip 128).
|
|
* Steven Brandt and JavaWorld gave permission to use the code for free.
|
|
* (Bruno Lowagie and Paulo Soares chose to use it under the MPL/LGPL in
|
|
* conformance with the rest of the code).
|
|
* The original code can be found on this url: <A HREF="http://www.javaworld.com/javatips/jw-javatip128_p.html">http://www.javaworld.com/javatips/jw-javatip128_p.html</A>.
|
|
* It was substantially refactored by Bruno Lowagie.
|
|
*
|
|
* The method 'private static String getEncodingName(byte[] b4)' was found
|
|
* in org.apache.xerces.impl.XMLEntityManager, originally published by the
|
|
* Apache Software Foundation under the Apache Software License; now being
|
|
* used in iText under the MPL.
|
|
*/
|
|
|
|
namespace iTextSharp.text.xml.simpleparser {
|
|
/**
|
|
* A simple XML and HTML parser. This parser is, like the SAX parser,
|
|
* an event based parser, but with much less functionality.
|
|
* <p>
|
|
* The parser can:
|
|
* <p>
|
|
* <ul>
|
|
* <li>It recognizes the encoding used
|
|
* <li>It recognizes all the elements' start tags and end tags
|
|
* <li>It lists attributes, where attribute values can be enclosed in single or double quotes
|
|
* <li>It recognizes the <code>&lt;![CDATA[ ... ]]&gt;</code> construct
|
|
* <li>It recognizes the standard entities: &amp;, &lt;, &gt;, &quot;, and &apos;, as well as numeric entities
|
|
* <li>It maps lines ending in <code>\r\n</code> and <code>\r</code> to <code>\n</code> on input, in accordance with the XML Specification, Section 2.11
|
|
* </ul>
|
|
* <p>
|
|
* The code is based on <A HREF="http://www.javaworld.com/javaworld/javatips/javatip128/">
|
|
* http://www.javaworld.com/javaworld/javatips/javatip128/</A> with some extra
|
|
* code from XERCES to recognize the encoding.
|
|
*/
|
|
public sealed class SimpleXMLParser {
|
|
/**
 * The states of the parser's state machine. Each value is stored in the
 * <code>state</code> field and may be pushed on the state stack.
 */
private const int
    UNKNOWN = 0,            // before any content: waiting for the first '<'
    TEXT = 1,               // character data between tags
    TAG_ENCOUNTERED = 2,    // a '<' was just read
    EXAMIN_TAG = 3,         // reading what follows '<' (tag name, comment, CDATA, DOCTYPE)
    TAG_EXAMINED = 4,       // the tag name is known
    IN_CLOSETAG = 5,        // inside a closing tag such as </foo>
    SINGLE_TAG = 6,         // a '/' was read inside a tag; the final '>' is expected
    CDATA = 7,              // inside a CDATA section
    COMMENT = 8,            // inside a comment
    PI = 9,                 // inside <? ... ?> or <!DOCTYPE ... >
    ENTITY = 10,            // between '&' and ';'
    QUOTE = 11,             // inside an attribute value
    ATTRIBUTE_KEY = 12,     // reading an attribute name
    ATTRIBUTE_EQUAL = 13,   // attribute name read, waiting for '='
    ATTRIBUTE_VALUE = 14;   // '=' read, waiting for the attribute value
|
|
|
|
/** Stack of saved parser states. */
internal Stack stack;

/** The character currently being processed. */
internal int character;

/** A character waiting to be processed again, or -1 when there is none. */
internal int previousCharacter = -1;

/** Number of the line being read; counting starts at 1. */
internal int lines = 1;

/** Column at which the current character occurs. */
internal int columns;

/** Whether the last character was equivalent to a newline. */
internal bool eol;

/** The state the parser is currently in. */
internal int state;

/** Whether the input is parsed as HTML rather than XML. */
internal bool html;

/** Buffer for the text met between tags (also reused for names and values). */
internal StringBuilder text = new StringBuilder();

/** Buffer for the entity name met between &amp; and ; */
internal StringBuilder entity = new StringBuilder();

/** Name of the tag being processed. */
internal String tag;

/** Attributes of the tag being processed. */
internal Hashtable attributes;

/** Handler receiving the document content. */
internal ISimpleXMLDocHandler doc;

/** Handler receiving the comments; may be null. */
internal ISimpleXMLDocHandlerComment comment;

/** Number of tags that are currently open. */
internal int nested;

/** The character that opened the attribute value being read. */
internal int quoteCharacter = '"';

/** Key of the attribute being read. */
internal String attributekey;

/** Value of the attribute being read. */
internal String attributevalue;
|
|
|
|
/**
 * Builds a parser bound to its handlers. Parsing itself is done by
 * Go(TextReader), which is meant to be called right after construction.
 * @param doc the handler that receives the document content
 * @param comment the handler that receives the comments
 * @param html <code>true</code> to parse HTML, <code>false</code> to parse XML
 */
private SimpleXMLParser(ISimpleXMLDocHandler doc, ISimpleXMLDocHandlerComment comment, bool html) {
    stack = new Stack();
    this.html = html;
    this.comment = comment;
    this.doc = doc;
    // HTML may start with plain text; XML waits for the first tag.
    if (html) {
        state = TEXT;
    } else {
        state = UNKNOWN;
    }
}
|
|
|
|
/**
 * Does the actual parsing. Perform this immediately after creating the
 * parser object.
 * <p>
 * Characters are read one at a time and fed to a state machine that fires
 * StartDocument, StartElement, EndElement, Text, Comment and EndDocument
 * events on the handlers.
 * <p>
 * In XML mode parsing stops as soon as the root element is closed; in HTML
 * mode it stops at the end of the input.
 * @param reader the source of the characters; it is not closed
 * @throws IOException (via ThrowException) in XML mode when the input ends
 * before the root element is closed, when a '/' inside a tag is not followed
 * by '>', or when an attribute is malformed
 */
private void Go(TextReader reader) {
    doc.StartDocument();
    while (true) {
        // Either re-examine a character that was pushed back, or read a new one.
        bool reexamined = previousCharacter != -1;
        if (reexamined) {
            character = previousCharacter;
            previousCharacter = -1;
        }
        else {
            character = reader.Read();
        }

        // the end of the file was reached
        if (character == -1) {
            if (html) {
                if (state == TEXT)
                    Flush();
                doc.EndDocument();
            } else {
                ThrowException("Missing end tag");
            }
            return;
        }

        // Map \r\n and \r to \n and keep track of the position. This is only
        // done for freshly read characters: a pushed-back character already
        // went through it, and doing it twice would drop a newline that came
        // from a \r and count the character twice.
        if (!reexamined) {
            if (eol) {
                eol = false;
                if (character == '\n')
                    continue; // second half of a \r\n pair
            }
            if (character == '\n') {
                lines++;
                columns = 0;
            } else if (character == '\r') {
                eol = true;
                character = '\n';
                lines++;
                columns = 0;
            } else {
                columns++;
            }
        }

        switch (state) {
        // we are in an unknown state before there's actual content
        case UNKNOWN:
            if (character == '<') {
                SaveState(TEXT);
                state = TAG_ENCOUNTERED;
            }
            break;

        // we can encounter any content
        case TEXT:
            if (character == '<') {
                Flush();
                SaveState(state);
                state = TAG_ENCOUNTERED;
            } else if (character == '&') {
                SaveState(state);
                entity.Length = 0;
                state = ENTITY;
            } else
                text.Append((char)character);
            break;

        // we have just seen a < and are wondering what we are looking at
        // <foo>, </foo>, <!-- ... -->, etc.
        case TAG_ENCOUNTERED:
            InitTag();
            if (character == '/') {
                state = IN_CLOSETAG;
            } else if (character == '?') {
                // drop the state saved when '<' was seen; the PI state
                // restores whatever is left on the stack at its '>'
                RestoreState();
                state = PI;
            } else {
                text.Append((char)character);
                state = EXAMIN_TAG;
            }
            break;

        // we are processing something like this <foo ... >.
        // It could still be a <!-- ... --> or something.
        case EXAMIN_TAG:
            if (character == '>') {
                DoTag();
                ProcessTag(true);
                InitTag();
                state = RestoreState();
            } else if (character == '/') {
                state = SINGLE_TAG;
            } else if (character == '-' && text.ToString().Equals("!-")) {
                Flush();
                state = COMMENT;
            } else if (character == '[' && text.ToString().Equals("![CDATA")) {
                Flush();
                state = CDATA;
            } else if (character == 'E' && text.ToString().Equals("!DOCTYP")) {
                Flush();
                state = PI;
            } else if (char.IsWhiteSpace((char)character)) {
                DoTag();
                state = TAG_EXAMINED;
            } else {
                text.Append((char)character);
            }
            break;

        // we know the name of the tag now.
        case TAG_EXAMINED:
            if (character == '>') {
                ProcessTag(true);
                InitTag();
                state = RestoreState();
            } else if (character == '/') {
                state = SINGLE_TAG;
            } else if (char.IsWhiteSpace((char)character)) {
                // empty
            } else {
                text.Append((char)character);
                state = ATTRIBUTE_KEY;
            }
            break;

        // we are processing a closing tag: e.g. </foo>
        case IN_CLOSETAG:
            if (character == '>') {
                DoTag();
                ProcessTag(false);
                if (!html && nested == 0) {
                    // the root element was closed: the XML document is done.
                    // EndDocument is fired here just as it is for a
                    // self-closing root element in the SINGLE_TAG state.
                    doc.EndDocument();
                    return;
                }
                state = RestoreState();
            } else {
                if (!char.IsWhiteSpace((char)character))
                    text.Append((char)character);
            }
            break;

        // we have just seen something like this: <foo a="b"/
        // and are looking for the final >.
        case SINGLE_TAG:
            if (character != '>')
                ThrowException("Expected > for tag: <"+tag+"/>");
            DoTag();
            ProcessTag(true);
            ProcessTag(false);
            InitTag();
            if (!html && nested == 0) {
                doc.EndDocument();
                return;
            }
            state = RestoreState();
            break;

        // we are processing CDATA
        case CDATA:
            if (character == '>' && TextEndsWithPair(']')) {
                text.Length = text.Length - 2;
                Flush();
                state = RestoreState();
            } else
                text.Append((char)character);
            break;

        // we are processing a comment. We are inside
        // the <!-- .... --> looking for the -->.
        case COMMENT:
            if (character == '>' && TextEndsWithPair('-')) {
                text.Length = text.Length - 2;
                Flush();
                state = RestoreState();
            } else
                text.Append((char)character);
            break;

        // We are inside one of these <? ... ?> or one of these <!DOCTYPE ... >
        case PI:
            if (character == '>') {
                state = RestoreState();
                if (state == TEXT) state = UNKNOWN;
            }
            break;

        // we are processing an entity, e.g. &lt;, &raquo;, etc.
        case ENTITY:
            if (character == ';') {
                state = RestoreState();
                String cent = entity.ToString();
                entity.Length = 0;
                char ce = EntitiesToUnicode.DecodeEntity(cent);
                // '\0' is treated as "not decoded": keep the text as it was written
                if (ce == '\0')
                    text.Append('&').Append(cent).Append(';');
                else
                    text.Append(ce);
            } else if ((character != '#' && (character < '0' || character > '9') && (character < 'a' || character > 'z')
                && (character < 'A' || character > 'Z')) || entity.Length >= 7) {
                // not an entity after all: emit what was collected as plain
                // text and process the current character again in the
                // restored state
                state = RestoreState();
                previousCharacter = character;
                text.Append('&').Append(entity.ToString());
                entity.Length = 0;
            }
            else {
                entity.Append((char)character);
            }
            break;

        // We are processing the quoted right-hand side of an element's attribute.
        // In HTML a quote character of ' ' marks a value written without quotes.
        case QUOTE:
            if (html && quoteCharacter == ' ' && character == '>') {
                Flush();
                ProcessTag(true);
                InitTag();
                state = RestoreState();
            }
            else if (html && quoteCharacter == ' ' && char.IsWhiteSpace((char)character)) {
                Flush();
                state = TAG_EXAMINED;
            }
            else if (html && quoteCharacter == ' ') {
                text.Append((char)character);
            }
            else if (character == quoteCharacter) {
                Flush();
                state = TAG_EXAMINED;
            } else if (" \r\n\u0009".IndexOf((char)character) >= 0) {
                // white space inside a value becomes a single space character
                text.Append(' ');
            } else if (character == '&') {
                SaveState(state);
                state = ENTITY;
                entity.Length = 0;
            } else {
                text.Append((char)character);
            }
            break;

        // we are reading the name of an attribute
        case ATTRIBUTE_KEY:
            if (char.IsWhiteSpace((char)character)) {
                Flush();
                state = ATTRIBUTE_EQUAL;
            } else if (character == '=') {
                Flush();
                state = ATTRIBUTE_VALUE;
            } else if (html && character == '>') {
                text.Length = 0;
                ProcessTag(true);
                InitTag();
                state = RestoreState();
            } else {
                text.Append((char)character);
            }
            break;

        // the attribute name was read; we expect the '='
        case ATTRIBUTE_EQUAL:
            if (character == '=') {
                state = ATTRIBUTE_VALUE;
            } else if (char.IsWhiteSpace((char)character)) {
                // empty
            } else if (html && character == '>') {
                text.Length = 0;
                ProcessTag(true);
                InitTag();
                state = RestoreState();
            } else if (html && character == '/') {
                Flush();
                state = SINGLE_TAG;
            } else if (html) {
                Flush();
                text.Append((char)character);
                state = ATTRIBUTE_KEY;
            } else {
                ThrowException("Error in attribute processing.");
            }
            break;

        // the '=' was read; we expect the attribute value
        case ATTRIBUTE_VALUE:
            if (character == '"' || character == '\'') {
                quoteCharacter = character;
                state = QUOTE;
            } else if (char.IsWhiteSpace((char)character)) {
                // empty
            } else if (html && character == '>') {
                Flush();
                ProcessTag(true);
                InitTag();
                state = RestoreState();
            } else if (html) {
                text.Append((char)character);
                quoteCharacter = ' ';
                state = QUOTE;
            } else {
                ThrowException("Error in attribute processing");
            }
            break;
        }
    }
}

/**
 * Tells whether the text buffer ends with two consecutive occurrences of a
 * character. The characters are compared by value, without copying the buffer.
 * @param c the character to look for
 * @return <code>true</code> if the last two characters of the buffer are both <code>c</code>
 */
private bool TextEndsWithPair(char c) {
    int n = text.Length;
    return n >= 2 && text[n - 1] == c && text[n - 2] == c;
}
|
|
|
|
/**
 * Takes the most recently saved state off the stack.
 * @return the saved state, or UNKNOWN when the stack is empty
 */
private int RestoreState() {
    return stack.Count == 0 ? UNKNOWN : (int)stack.Pop();
}
|
|
/**
 * Pushes a state on the stack so that it can be restored later.
 * @param s the state to remember
 */
private void SaveState(int s) {
    this.stack.Push(s);
}
|
|
/**
 * Hands over the content of the text buffer and empties the buffer.
 * What happens with the content depends on the current state: it is sent
 * to the document handler as text, sent to the comment handler, stored as
 * attribute key, stored as attribute value, or simply discarded.
 */
private void Flush() {
    if (state == TEXT || state == CDATA) {
        // empty text is not reported
        if (text.Length > 0) {
            doc.Text(text.ToString());
        }
    } else if (state == COMMENT) {
        // comments are only reported when a comment handler was given
        if (comment != null) {
            comment.Comment(text.ToString());
        }
    } else if (state == ATTRIBUTE_KEY) {
        String key = text.ToString();
        // in HTML the attribute names are lower-cased
        attributekey = html ? key.ToLower(CultureInfo.InvariantCulture) : key;
    } else if (state == QUOTE || state == ATTRIBUTE_VALUE) {
        attributevalue = text.ToString();
        attributes[attributekey] = attributevalue;
    }
    // in any other state the buffered text is dropped
    text.Length = 0;
}
|
|
/**
 * Resets the tag data: the tag name is cleared and a new, empty
 * attribute table is created.
 */
private void InitTag() {
    attributes = new Hashtable();
    tag = null;
}
|
|
/**
 * Sets the name of the tag. If no name is known yet, the content of the
 * text buffer becomes the name. In HTML the name is lower-cased. The text
 * buffer is emptied in any case.
 */
private void DoTag() {
    if (tag == null) {
        tag = text.ToString();
    }
    text.Length = 0;
    if (html) {
        tag = tag.ToLower(CultureInfo.InvariantCulture);
    }
}
|
|
/**
 * Reports the current tag to the document handler and updates the count
 * of open tags.
 * @param start <code>true</code> when the tag has just been opened,
 * <code>false</code> when it is being closed
 */
private void ProcessTag(bool start) {
    if (!start) {
        nested--;
        doc.EndElement(tag);
        return;
    }
    nested++;
    doc.StartElement(tag, attributes);
}
|
|
/**
 * Throws an IOException whose message is the given text followed by the
 * current line and column.
 * @param s the description of the problem
 * @throws IOException always
 */
private void ThrowException(String s) {
    StringBuilder message = new StringBuilder();
    message.Append(s).Append(" near line ").Append(lines).Append(", column ").Append(columns);
    throw new IOException(message.ToString());
}
|
|
|
|
/**
 * Parses a document, firing the events to the handlers.
 * @param doc the document handler
 * @param comment the comment handler; passed on to the parser as is
 * @param r the document. The encoding is already resolved. The reader is not closed
 * @param html <code>true</code> to parse the document as HTML, <code>false</code> to parse it as XML
 * @throws IOException on error
 */
public static void Parse(ISimpleXMLDocHandler doc, ISimpleXMLDocHandlerComment comment, TextReader r, bool html) {
    new SimpleXMLParser(doc, comment, html).Go(r);
}
|
|
|
|
/**
 * Parses the XML document firing the events to the handler.
 * <p>
 * The encoding is guessed from the first four bytes. When that guess is
 * UTF-8 or CP037, the bytes up to and including the first '>' are read as
 * the XML declaration, and an encoding named there replaces the guess.
 * All bytes read during this detection are consumed; the parser starts
 * with the byte that follows them.
 * @param doc the document handler
 * @param inp the document. The encoding is deduced from the stream. The stream is not closed
 * @throws IOException on error, or when the stream holds fewer than four bytes
 */
public static void Parse(ISimpleXMLDocHandler doc, Stream inp) {
    byte[] b4 = new byte[4];
    // A single Read call may return fewer bytes than requested even though
    // more data will follow, so keep reading until the buffer is full or
    // the stream ends.
    int count = 0;
    while (count < b4.Length) {
        int n = inp.Read(b4, count, b4.Length - count);
        if (n <= 0)
            break;
        count += n;
    }
    if (count != 4)
        throw new IOException("Insufficient length.");
    String encoding = GetEncodingName(b4);
    String decl = null;
    if (encoding.Equals("UTF-8")) {
        // collect the declaration byte by byte, each byte taken as one character
        StringBuilder sb = new StringBuilder();
        int c;
        while ((c = inp.ReadByte()) != -1) {
            if (c == '>')
                break;
            sb.Append((char)c);
        }
        decl = sb.ToString();
    }
    else if (encoding.Equals("CP037")) {
        MemoryStream bi = new MemoryStream();
        int c;
        while ((c = inp.ReadByte()) != -1) {
            if (c == 0x6e) // that's '>' in ebcdic
                break;
            bi.WriteByte((byte)c);
        }
        decl = Encoding.GetEncoding(37).GetString(bi.ToArray());//cp037 ebcdic
    }
    if (decl != null) {
        // an encoding named in the declaration wins over the guess
        decl = GetDeclaredEncoding(decl);
        if (decl != null)
            encoding = decl;
    }
    Parse(doc, new StreamReader(inp, IanaEncodings.GetEncodingEncoding(encoding)));
}
|
|
|
|
/**
 * Extracts the value of the encoding pseudo-attribute from an XML declaration.
 * The value is the text between the first quote character (single or double)
 * found after the word "encoding" and the next quote character of the same kind.
 * @param decl the text of the declaration; may be null
 * @return the declared encoding, or null if <code>decl</code> is null, does not
 * contain "encoding", or has no properly quoted value after it
 */
private static String GetDeclaredEncoding(String decl) {
    if (decl == null)
        return null;
    // ordinal search: this is a fixed keyword, not text in a human language
    int idx = decl.IndexOf("encoding", StringComparison.Ordinal);
    if (idx < 0)
        return null;
    int doubleQuote = decl.IndexOf('"', idx);
    int singleQuote = decl.IndexOf('\'', idx);
    // the opening quote is whichever kind of quote comes first
    int open;
    if (doubleQuote < 0)
        open = singleQuote;
    else if (singleQuote < 0)
        open = doubleQuote;
    else
        open = Math.Min(doubleQuote, singleQuote);
    if (open < 0)
        return null;
    int close = decl.IndexOf(decl[open], open + 1);
    if (close < 0)
        return null;
    return decl.Substring(open + 1, close - (open + 1));
}
|
|
|
|
/**
 * Parses a document as XML, without a comment handler, firing the events
 * to the handler.
 * @param doc the document handler
 * @param r the document. The reader is not closed
 * @throws IOException on error
 */
public static void Parse(ISimpleXMLDocHandler doc, TextReader r) {
    new SimpleXMLParser(doc, null, false).Go(r);
}
|
|
|
|
/**
 * Escapes a string with the appropriate XML codes. The five characters
 * &lt;, &gt;, &amp;, " and ' are replaced by their predefined entities.
 * @param s the string to be escaped
 * @param onlyASCII codes above 127 will always be escaped with &amp;#nn; if <CODE>true</CODE>.
 * A surrogate pair is written as one reference to the code point it stands for
 * @return the escaped string
 */
public static String EscapeXML(String s, bool onlyASCII) {
    int len = s.Length;
    StringBuilder sb = new StringBuilder(len);
    for (int k = 0; k < len; ++k) {
        char c = s[k];
        switch (c) {
            case '<':
                sb.Append("&lt;");
                break;
            case '>':
                sb.Append("&gt;");
                break;
            case '&':
                sb.Append("&amp;");
                break;
            case '"':
                sb.Append("&quot;");
                break;
            case '\'':
                sb.Append("&apos;");
                break;
            default:
                if (onlyASCII && c > 127) {
                    int codePoint = c;
                    // The two halves of a surrogate pair are not characters on
                    // their own; a reference to either half is not valid XML.
                    if (char.IsHighSurrogate(c) && k + 1 < len && char.IsLowSurrogate(s[k + 1])) {
                        codePoint = char.ConvertToUtf32(c, s[k + 1]);
                        ++k;
                    }
                    sb.Append("&#").Append(codePoint.ToString(CultureInfo.InvariantCulture)).Append(';');
                }
                else
                    sb.Append(c);
                break;
        }
    }
    return sb.ToString();
}
|
|
|
|
/**
 * Guesses the IANA name of the encoding of a document from its first bytes,
 * including the byte order where that applies.
 * (method found in org.apache.xerces.impl.XMLEntityManager, originally published
 * by the Apache Software Foundation under the Apache Software License; now being
 * used in iText under the MPL)
 * @param b4 The first four bytes of the input.
 * @return an IANA-encoding string; "UTF-8" when nothing else is recognized
 */
private static String GetEncodingName(byte[] b4) {
    byte first = b4[0];
    byte second = b4[1];

    // UTF-16 byte order marks
    if (first == 0xFE && second == 0xFF)
        return "UTF-16BE";
    if (first == 0xFF && second == 0xFE)
        return "UTF-16LE";

    // UTF-8 byte order mark
    byte third = b4[2];
    if (first == 0xEF && second == 0xBB && third == 0xBF)
        return "UTF-8";

    // Everything else is recognized by all four bytes at once,
    // packed here with the first byte in the highest position.
    uint signature = ((uint)first << 24) | ((uint)second << 16) | ((uint)third << 8) | b4[3];
    switch (signature) {
        case 0x0000003C: // UCS-4, big endian (1234)
        case 0x3C000000: // UCS-4, little endian (4321)
        case 0x00003C00: // UCS-4, unusual octet order (2143)
        case 0x003C0000: // UCS-4, unusual octet order (3412)
            // REVISIT: the unusual octet orders get the same name as the regular ones
            return "ISO-10646-UCS-4";
        case 0x003C003F:
            // UTF-16, big-endian, no BOM (or could turn out to be UCS-2)
            return "UTF-16BE";
        case 0x3C003F00:
            // UTF-16, little-endian, no BOM (or could turn out to be UCS-2)
            return "UTF-16LE";
        case 0x4C6FA794:
            // EBCDIC; a la xerces1, return CP037 instead of EBCDIC here
            return "CP037";
        default:
            // default encoding
            return "UTF-8";
    }
}
|
|
}
|
|
}
|