2023-06-21 12:46:23 -04:00

442 lines
28 KiB
C#

using System;
using System.Collections;
using System.Text;
using System.Globalization;
/*
* $Id: EntitiesToUnicode.cs,v 1.3 2008/05/13 11:26:14 psoares33 Exp $
*
*
* Copyright 2003-2007 Paulo Soares and Bruno Lowagie.
*
* The contents of this file are subject to the Mozilla Public License Version 1.1
* (the "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the License.
*
* The Original Code is 'iText, a free JAVA-PDF library'.
*
* The Initial Developer of the Original Code is Bruno Lowagie. Portions created by
* the Initial Developer are Copyright (C) 1999, 2000, 2001, 2002 by Bruno Lowagie.
* All Rights Reserved.
* Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer
* are Copyright (C) 2000, 2001, 2002 by Paulo Soares. All Rights Reserved.
*
* Contributor(s): all the names of the contributors are added in the source code
* where applicable.
*
* Alternatively, the contents of this file may be used under the terms of the
* LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the
* provisions of LGPL are applicable instead of those above. If you wish to
* allow use of your version of this file only under the terms of the LGPL
* License and not to allow others to use your version of this file under
* the MPL, indicate your decision by deleting the provisions above and
* replace them with the notice and other provisions required by the LGPL.
* If you do not delete the provisions above, a recipient may use your version
* of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE.
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the MPL as stated above or under the terms of the GNU
* Library General Public License as published by the Free Software Foundation;
* either version 2 of the License, or any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more
* details.
*
* If you didn't download this code from the following link, you should check if
* you aren't using an obsolete version:
* http://www.lowagie.com/iText/
*/
namespace iTextSharp.text.xml.simpleparser {
/**
* This class contains entities that can be used in an entity tag.
*/
public class EntitiesToUnicode {
/**
* This is a map that contains the names of entities and their unicode value.
* Keys are entity names as strings, without the leading '&' and the
* trailing ';' (for example "nbsp"); values are boxed char values.
* Names are case-sensitive: "prime" and "Prime" are distinct entries.
* The table is filled by the static constructor of this class.
*/
public static readonly Hashtable map = new Hashtable();
/**
* Fills the entity table. Each assignment maps one entity name to a single
* char; the trailing comment on each line gives the character name and its
* code point. Entries are grouped by Unicode block, as marked by the
* section comments.
*/
static EntitiesToUnicode() {
map["nbsp"] = '\u00a0'; // no-break space = non-breaking space, U+00A0 ISOnum
map["iexcl"] = '\u00a1'; // inverted exclamation mark, U+00A1 ISOnum
map["cent"] = '\u00a2'; // cent sign, U+00A2 ISOnum
map["pound"] = '\u00a3'; // pound sign, U+00A3 ISOnum
map["curren"] = '\u00a4'; // currency sign, U+00A4 ISOnum
map["yen"] = '\u00a5'; // yen sign = yuan sign, U+00A5 ISOnum
map["brvbar"] = '\u00a6'; // broken bar = broken vertical bar, U+00A6 ISOnum
map["sect"] = '\u00a7'; // section sign, U+00A7 ISOnum
map["uml"] = '\u00a8'; // diaeresis = spacing diaeresis, U+00A8 ISOdia
map["copy"] = '\u00a9'; // copyright sign, U+00A9 ISOnum
map["ordf"] = '\u00aa'; // feminine ordinal indicator, U+00AA ISOnum
map["laquo"] = '\u00ab'; // left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum
map["not"] = '\u00ac'; // not sign, U+00AC ISOnum
map["shy"] = '\u00ad'; // soft hyphen = discretionary hyphen, U+00AD ISOnum
map["reg"] = '\u00ae'; // registered sign = registered trade mark sign, U+00AE ISOnum
map["macr"] = '\u00af'; // macron = spacing macron = overline = APL overbar, U+00AF ISOdia
map["deg"] = '\u00b0'; // degree sign, U+00B0 ISOnum
map["plusmn"] = '\u00b1'; // plus-minus sign = plus-or-minus sign, U+00B1 ISOnum
map["sup2"] = '\u00b2'; // superscript two = superscript digit two = squared, U+00B2 ISOnum
map["sup3"] = '\u00b3'; // superscript three = superscript digit three = cubed, U+00B3 ISOnum
map["acute"] = '\u00b4'; // acute accent = spacing acute, U+00B4 ISOdia
map["micro"] = '\u00b5'; // micro sign, U+00B5 ISOnum
map["para"] = '\u00b6'; // pilcrow sign = paragraph sign, U+00B6 ISOnum
map["middot"] = '\u00b7'; // middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum
map["cedil"] = '\u00b8'; // cedilla = spacing cedilla, U+00B8 ISOdia
map["sup1"] = '\u00b9'; // superscript one = superscript digit one, U+00B9 ISOnum
map["ordm"] = '\u00ba'; // masculine ordinal indicator, U+00BA ISOnum
map["raquo"] = '\u00bb'; // right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum
map["frac14"] = '\u00bc'; // vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum
map["frac12"] = '\u00bd'; // vulgar fraction one half = fraction one half, U+00BD ISOnum
map["frac34"] = '\u00be'; // vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum
map["iquest"] = '\u00bf'; // inverted question mark = turned question mark, U+00BF ISOnum
map["Agrave"] = '\u00c0'; // latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1
map["Aacute"] = '\u00c1'; // latin capital letter A with acute, U+00C1 ISOlat1
map["Acirc"] = '\u00c2'; // latin capital letter A with circumflex, U+00C2 ISOlat1
map["Atilde"] = '\u00c3'; // latin capital letter A with tilde, U+00C3 ISOlat1
map["Auml"] = '\u00c4'; // latin capital letter A with diaeresis, U+00C4 ISOlat1
map["Aring"] = '\u00c5'; // latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1
map["AElig"] = '\u00c6'; // latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1
map["Ccedil"] = '\u00c7'; // latin capital letter C with cedilla, U+00C7 ISOlat1
map["Egrave"] = '\u00c8'; // latin capital letter E with grave, U+00C8 ISOlat1
map["Eacute"] = '\u00c9'; // latin capital letter E with acute, U+00C9 ISOlat1
map["Ecirc"] = '\u00ca'; // latin capital letter E with circumflex, U+00CA ISOlat1
map["Euml"] = '\u00cb'; // latin capital letter E with diaeresis, U+00CB ISOlat1
map["Igrave"] = '\u00cc'; // latin capital letter I with grave, U+00CC ISOlat1
map["Iacute"] = '\u00cd'; // latin capital letter I with acute, U+00CD ISOlat1
map["Icirc"] = '\u00ce'; // latin capital letter I with circumflex, U+00CE ISOlat1
map["Iuml"] = '\u00cf'; // latin capital letter I with diaeresis, U+00CF ISOlat1
map["ETH"] = '\u00d0'; // latin capital letter ETH, U+00D0 ISOlat1
map["Ntilde"] = '\u00d1'; // latin capital letter N with tilde, U+00D1 ISOlat1
map["Ograve"] = '\u00d2'; // latin capital letter O with grave, U+00D2 ISOlat1
map["Oacute"] = '\u00d3'; // latin capital letter O with acute, U+00D3 ISOlat1
map["Ocirc"] = '\u00d4'; // latin capital letter O with circumflex, U+00D4 ISOlat1
map["Otilde"] = '\u00d5'; // latin capital letter O with tilde, U+00D5 ISOlat1
map["Ouml"] = '\u00d6'; // latin capital letter O with diaeresis, U+00D6 ISOlat1
map["times"] = '\u00d7'; // multiplication sign, U+00D7 ISOnum
map["Oslash"] = '\u00d8'; // latin capital letter O with stroke = latin capital letter O slash, U+00D8 ISOlat1
map["Ugrave"] = '\u00d9'; // latin capital letter U with grave, U+00D9 ISOlat1
map["Uacute"] = '\u00da'; // latin capital letter U with acute, U+00DA ISOlat1
map["Ucirc"] = '\u00db'; // latin capital letter U with circumflex, U+00DB ISOlat1
map["Uuml"] = '\u00dc'; // latin capital letter U with diaeresis, U+00DC ISOlat1
map["Yacute"] = '\u00dd'; // latin capital letter Y with acute, U+00DD ISOlat1
map["THORN"] = '\u00de'; // latin capital letter THORN, U+00DE ISOlat1
map["szlig"] = '\u00df'; // latin small letter sharp s = ess-zed, U+00DF ISOlat1
map["agrave"] = '\u00e0'; // latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1
map["aacute"] = '\u00e1'; // latin small letter a with acute, U+00E1 ISOlat1
map["acirc"] = '\u00e2'; // latin small letter a with circumflex, U+00E2 ISOlat1
map["atilde"] = '\u00e3'; // latin small letter a with tilde, U+00E3 ISOlat1
map["auml"] = '\u00e4'; // latin small letter a with diaeresis, U+00E4 ISOlat1
map["aring"] = '\u00e5'; // latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1
map["aelig"] = '\u00e6'; // latin small letter ae = latin small ligature ae, U+00E6 ISOlat1
map["ccedil"] = '\u00e7'; // latin small letter c with cedilla, U+00E7 ISOlat1
map["egrave"] = '\u00e8'; // latin small letter e with grave, U+00E8 ISOlat1
map["eacute"] = '\u00e9'; // latin small letter e with acute, U+00E9 ISOlat1
map["ecirc"] = '\u00ea'; // latin small letter e with circumflex, U+00EA ISOlat1
map["euml"] = '\u00eb'; // latin small letter e with diaeresis, U+00EB ISOlat1
map["igrave"] = '\u00ec'; // latin small letter i with grave, U+00EC ISOlat1
map["iacute"] = '\u00ed'; // latin small letter i with acute, U+00ED ISOlat1
map["icirc"] = '\u00ee'; // latin small letter i with circumflex, U+00EE ISOlat1
map["iuml"] = '\u00ef'; // latin small letter i with diaeresis, U+00EF ISOlat1
map["eth"] = '\u00f0'; // latin small letter eth, U+00F0 ISOlat1
map["ntilde"] = '\u00f1'; // latin small letter n with tilde, U+00F1 ISOlat1
map["ograve"] = '\u00f2'; // latin small letter o with grave, U+00F2 ISOlat1
map["oacute"] = '\u00f3'; // latin small letter o with acute, U+00F3 ISOlat1
map["ocirc"] = '\u00f4'; // latin small letter o with circumflex, U+00F4 ISOlat1
map["otilde"] = '\u00f5'; // latin small letter o with tilde, U+00F5 ISOlat1
map["ouml"] = '\u00f6'; // latin small letter o with diaeresis, U+00F6 ISOlat1
map["divide"] = '\u00f7'; // division sign, U+00F7 ISOnum
map["oslash"] = '\u00f8'; // latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1
map["ugrave"] = '\u00f9'; // latin small letter u with grave, U+00F9 ISOlat1
map["uacute"] = '\u00fa'; // latin small letter u with acute, U+00FA ISOlat1
map["ucirc"] = '\u00fb'; // latin small letter u with circumflex, U+00FB ISOlat1
map["uuml"] = '\u00fc'; // latin small letter u with diaeresis, U+00FC ISOlat1
map["yacute"] = '\u00fd'; // latin small letter y with acute, U+00FD ISOlat1
map["thorn"] = '\u00fe'; // latin small letter thorn, U+00FE ISOlat1
map["yuml"] = '\u00ff'; // latin small letter y with diaeresis, U+00FF ISOlat1
// Latin Extended-B
map["fnof"] = '\u0192'; // latin small f with hook = function = florin, U+0192 ISOtech
// Greek
map["Alpha"] = '\u0391'; // greek capital letter alpha, U+0391
map["Beta"] = '\u0392'; // greek capital letter beta, U+0392
map["Gamma"] = '\u0393'; // greek capital letter gamma, U+0393 ISOgrk3
map["Delta"] = '\u0394'; // greek capital letter delta, U+0394 ISOgrk3
map["Epsilon"] = '\u0395'; // greek capital letter epsilon, U+0395
map["Zeta"] = '\u0396'; // greek capital letter zeta, U+0396
map["Eta"] = '\u0397'; // greek capital letter eta, U+0397
map["Theta"] = '\u0398'; // greek capital letter theta, U+0398 ISOgrk3
map["Iota"] = '\u0399'; // greek capital letter iota, U+0399
map["Kappa"] = '\u039a'; // greek capital letter kappa, U+039A
map["Lambda"] = '\u039b'; // greek capital letter lambda, U+039B ISOgrk3
map["Mu"] = '\u039c'; // greek capital letter mu, U+039C
map["Nu"] = '\u039d'; // greek capital letter nu, U+039D
map["Xi"] = '\u039e'; // greek capital letter xi, U+039E ISOgrk3
map["Omicron"] = '\u039f'; // greek capital letter omicron, U+039F
map["Pi"] = '\u03a0'; // greek capital letter pi, U+03A0 ISOgrk3
map["Rho"] = '\u03a1'; // greek capital letter rho, U+03A1
// there is no Sigmaf, and no U+03A2 character either
map["Sigma"] = '\u03a3'; // greek capital letter sigma, U+03A3 ISOgrk3
map["Tau"] = '\u03a4'; // greek capital letter tau, U+03A4
map["Upsilon"] = '\u03a5'; // greek capital letter upsilon, U+03A5 ISOgrk3
map["Phi"] = '\u03a6'; // greek capital letter phi, U+03A6 ISOgrk3
map["Chi"] = '\u03a7'; // greek capital letter chi, U+03A7
map["Psi"] = '\u03a8'; // greek capital letter psi, U+03A8 ISOgrk3
map["Omega"] = '\u03a9'; // greek capital letter omega, U+03A9 ISOgrk3
map["alpha"] = '\u03b1'; // greek small letter alpha, U+03B1 ISOgrk3
map["beta"] = '\u03b2'; // greek small letter beta, U+03B2 ISOgrk3
map["gamma"] = '\u03b3'; // greek small letter gamma, U+03B3 ISOgrk3
map["delta"] = '\u03b4'; // greek small letter delta, U+03B4 ISOgrk3
map["epsilon"] = '\u03b5'; // greek small letter epsilon, U+03B5 ISOgrk3
map["zeta"] = '\u03b6'; // greek small letter zeta, U+03B6 ISOgrk3
map["eta"] = '\u03b7'; // greek small letter eta, U+03B7 ISOgrk3
map["theta"] = '\u03b8'; // greek small letter theta, U+03B8 ISOgrk3
map["iota"] = '\u03b9'; // greek small letter iota, U+03B9 ISOgrk3
map["kappa"] = '\u03ba'; // greek small letter kappa, U+03BA ISOgrk3
map["lambda"] = '\u03bb'; // greek small letter lambda, U+03BB ISOgrk3
map["mu"] = '\u03bc'; // greek small letter mu, U+03BC ISOgrk3
map["nu"] = '\u03bd'; // greek small letter nu, U+03BD ISOgrk3
map["xi"] = '\u03be'; // greek small letter xi, U+03BE ISOgrk3
map["omicron"] = '\u03bf'; // greek small letter omicron, U+03BF NEW
map["pi"] = '\u03c0'; // greek small letter pi, U+03C0 ISOgrk3
map["rho"] = '\u03c1'; // greek small letter rho, U+03C1 ISOgrk3
map["sigmaf"] = '\u03c2'; // greek small letter final sigma, U+03C2 ISOgrk3
map["sigma"] = '\u03c3'; // greek small letter sigma, U+03C3 ISOgrk3
map["tau"] = '\u03c4'; // greek small letter tau, U+03C4 ISOgrk3
map["upsilon"] = '\u03c5'; // greek small letter upsilon, U+03C5 ISOgrk3
map["phi"] = '\u03c6'; // greek small letter phi, U+03C6 ISOgrk3
map["chi"] = '\u03c7'; // greek small letter chi, U+03C7 ISOgrk3
map["psi"] = '\u03c8'; // greek small letter psi, U+03C8 ISOgrk3
map["omega"] = '\u03c9'; // greek small letter omega, U+03C9 ISOgrk3
map["thetasym"] = '\u03d1'; // greek small letter theta symbol, U+03D1 NEW
map["upsih"] = '\u03d2'; // greek upsilon with hook symbol, U+03D2 NEW
map["piv"] = '\u03d6'; // greek pi symbol, U+03D6 ISOgrk3
// General Punctuation
map["bull"] = '\u2022'; // bullet = black small circle, U+2022 ISOpub
// bullet is NOT the same as bullet operator, U+2219
map["hellip"] = '\u2026'; // horizontal ellipsis = three dot leader, U+2026 ISOpub
map["prime"] = '\u2032'; // prime = minutes = feet, U+2032 ISOtech
map["Prime"] = '\u2033'; // double prime = seconds = inches, U+2033 ISOtech
map["oline"] = '\u203e'; // overline = spacing overscore, U+203E NEW
map["frasl"] = '\u2044'; // fraction slash, U+2044 NEW
// Letterlike Symbols
map["weierp"] = '\u2118'; // script capital P = power set = Weierstrass p, U+2118 ISOamso
map["image"] = '\u2111'; // blackletter capital I = imaginary part, U+2111 ISOamso
map["real"] = '\u211c'; // blackletter capital R = real part symbol, U+211C ISOamso
map["trade"] = '\u2122'; // trade mark sign, U+2122 ISOnum
map["alefsym"] = '\u2135'; // alef symbol = first transfinite cardinal, U+2135 NEW
// alef symbol is NOT the same as hebrew letter alef,
// U+05D0 although the same glyph could be used to depict both characters
// Arrows
map["larr"] = '\u2190'; // leftwards arrow, U+2190 ISOnum
map["uarr"] = '\u2191'; // upwards arrow, U+2191 ISOnum
map["rarr"] = '\u2192'; // rightwards arrow, U+2192 ISOnum
map["darr"] = '\u2193'; // downwards arrow, U+2193 ISOnum
map["harr"] = '\u2194'; // left right arrow, U+2194 ISOamsa
map["crarr"] = '\u21b5'; // downwards arrow with corner leftwards = carriage return, U+21B5 NEW
map["lArr"] = '\u21d0'; // leftwards double arrow, U+21D0 ISOtech
// ISO 10646 does not say that lArr is the same as the 'is implied by' arrow
// but also does not have any other character for that function. So ? lArr can
// be used for 'is implied by' as ISOtech suggests
map["uArr"] = '\u21d1'; // upwards double arrow, U+21D1 ISOamsa
map["rArr"] = '\u21d2'; // rightwards double arrow, U+21D2 ISOtech
// ISO 10646 does not say this is the 'implies' character but does not have
// another character with this function so ?
// rArr can be used for 'implies' as ISOtech suggests
map["dArr"] = '\u21d3'; // downwards double arrow, U+21D3 ISOamsa
map["hArr"] = '\u21d4'; // left right double arrow, U+21D4 ISOamsa
// Mathematical Operators
map["forall"] = '\u2200'; // for all, U+2200 ISOtech
map["part"] = '\u2202'; // partial differential, U+2202 ISOtech
map["exist"] = '\u2203'; // there exists, U+2203 ISOtech
map["empty"] = '\u2205'; // empty set = null set = diameter, U+2205 ISOamso
map["nabla"] = '\u2207'; // nabla = backward difference, U+2207 ISOtech
map["isin"] = '\u2208'; // element of, U+2208 ISOtech
map["notin"] = '\u2209'; // not an element of, U+2209 ISOtech
map["ni"] = '\u220b'; // contains as member, U+220B ISOtech
// should there be a more memorable name than 'ni'?
map["prod"] = '\u220f'; // n-ary product = product sign, U+220F ISOamsb
// prod is NOT the same character as U+03A0 'greek capital letter pi' though
// the same glyph might be used for both
map["sum"] = '\u2211'; // n-ary summation, U+2211 ISOamsb
// sum is NOT the same character as U+03A3 'greek capital letter sigma'
// though the same glyph might be used for both
map["minus"] = '\u2212'; // minus sign, U+2212 ISOtech
map["lowast"] = '\u2217'; // asterisk operator, U+2217 ISOtech
map["radic"] = '\u221a'; // square root = radical sign, U+221A ISOtech
map["prop"] = '\u221d'; // proportional to, U+221D ISOtech
map["infin"] = '\u221e'; // infinity, U+221E ISOtech
map["ang"] = '\u2220'; // angle, U+2220 ISOamso
map["and"] = '\u2227'; // logical and = wedge, U+2227 ISOtech
map["or"] = '\u2228'; // logical or = vee, U+2228 ISOtech
map["cap"] = '\u2229'; // intersection = cap, U+2229 ISOtech
map["cup"] = '\u222a'; // union = cup, U+222A ISOtech
map["int"] = '\u222b'; // integral, U+222B ISOtech
map["there4"] = '\u2234'; // therefore, U+2234 ISOtech
map["sim"] = '\u223c'; // tilde operator = varies with = similar to, U+223C ISOtech
// tilde operator is NOT the same character as the tilde, U+007E,
// although the same glyph might be used to represent both
map["cong"] = '\u2245'; // approximately equal to, U+2245 ISOtech
map["asymp"] = '\u2248'; // almost equal to = asymptotic to, U+2248 ISOamsr
map["ne"] = '\u2260'; // not equal to, U+2260 ISOtech
map["equiv"] = '\u2261'; // identical to, U+2261 ISOtech
map["le"] = '\u2264'; // less-than or equal to, U+2264 ISOtech
map["ge"] = '\u2265'; // greater-than or equal to, U+2265 ISOtech
map["sub"] = '\u2282'; // subset of, U+2282 ISOtech
map["sup"] = '\u2283'; // superset of, U+2283 ISOtech
// note that nsup, 'not a superset of, U+2285' is not covered by the Symbol
// font encoding and is not included. Should it be, for symmetry?
// It is in ISOamsn
map["nsub"] = '\u2284'; // not a subset of, U+2284 ISOamsn
map["sube"] = '\u2286'; // subset of or equal to, U+2286 ISOtech
map["supe"] = '\u2287'; // superset of or equal to, U+2287 ISOtech
map["oplus"] = '\u2295'; // circled plus = direct sum, U+2295 ISOamsb
map["otimes"] = '\u2297'; // circled times = vector product, U+2297 ISOamsb
map["perp"] = '\u22a5'; // up tack = orthogonal to = perpendicular, U+22A5 ISOtech
map["sdot"] = '\u22c5'; // dot operator, U+22C5 ISOamsb
// dot operator is NOT the same character as U+00B7 middle dot
// Miscellaneous Technical
map["lceil"] = '\u2308'; // left ceiling = apl upstile, U+2308 ISOamsc
map["rceil"] = '\u2309'; // right ceiling, U+2309 ISOamsc
map["lfloor"] = '\u230a'; // left floor = apl downstile, U+230A ISOamsc
map["rfloor"] = '\u230b'; // right floor, U+230B ISOamsc
map["lang"] = '\u2329'; // left-pointing angle bracket = bra, U+2329 ISOtech
// lang is NOT the same character as U+003C 'less than'
// or U+2039 'single left-pointing angle quotation mark'
map["rang"] = '\u232a'; // right-pointing angle bracket = ket, U+232A ISOtech
// rang is NOT the same character as U+003E 'greater than'
// or U+203A 'single right-pointing angle quotation mark'
// Geometric Shapes
map["loz"] = '\u25ca'; // lozenge, U+25CA ISOpub
// Miscellaneous Symbols
map["spades"] = '\u2660'; // black spade suit, U+2660 ISOpub
// black here seems to mean filled as opposed to hollow
map["clubs"] = '\u2663'; // black club suit = shamrock, U+2663 ISOpub
map["hearts"] = '\u2665'; // black heart suit = valentine, U+2665 ISOpub
map["diams"] = '\u2666'; // black diamond suit, U+2666 ISOpub
// C0 Controls and Basic Latin
map["quot"] = '\u0022'; // quotation mark = APL quote, U+0022 ISOnum
map["amp"] = '\u0026'; // ampersand, U+0026 ISOnum
map["apos"] = '\''; // apostrophe, U+0027
map["lt"] = '\u003c'; // less-than sign, U+003C ISOnum
map["gt"] = '\u003e'; // greater-than sign, U+003E ISOnum
// Latin Extended-A
map["OElig"] = '\u0152'; // latin capital ligature OE, U+0152 ISOlat2
map["oelig"] = '\u0153'; // latin small ligature oe, U+0153 ISOlat2
// ligature is a misnomer, this is a separate character in some languages
map["Scaron"] = '\u0160'; // latin capital letter S with caron, U+0160 ISOlat2
map["scaron"] = '\u0161'; // latin small letter s with caron, U+0161 ISOlat2
map["Yuml"] = '\u0178'; // latin capital letter Y with diaeresis, U+0178 ISOlat2
// Spacing Modifier Letters
map["circ"] = '\u02c6'; // modifier letter circumflex accent, U+02C6 ISOpub
map["tilde"] = '\u02dc'; // small tilde, U+02DC ISOdia
// General Punctuation
map["ensp"] = '\u2002'; // en space, U+2002 ISOpub
map["emsp"] = '\u2003'; // em space, U+2003 ISOpub
map["thinsp"] = '\u2009'; // thin space, U+2009 ISOpub
map["zwnj"] = '\u200c'; // zero width non-joiner, U+200C NEW RFC 2070
map["zwj"] = '\u200d'; // zero width joiner, U+200D NEW RFC 2070
map["lrm"] = '\u200e'; // left-to-right mark, U+200E NEW RFC 2070
map["rlm"] = '\u200f'; // right-to-left mark, U+200F NEW RFC 2070
map["ndash"] = '\u2013'; // en dash, U+2013 ISOpub
map["mdash"] = '\u2014'; // em dash, U+2014 ISOpub
map["lsquo"] = '\u2018'; // left single quotation mark, U+2018 ISOnum
map["rsquo"] = '\u2019'; // right single quotation mark, U+2019 ISOnum
map["sbquo"] = '\u201a'; // single low-9 quotation mark, U+201A NEW
map["ldquo"] = '\u201c'; // left double quotation mark, U+201C ISOnum
map["rdquo"] = '\u201d'; // right double quotation mark, U+201D ISOnum
map["bdquo"] = '\u201e'; // double low-9 quotation mark, U+201E NEW
map["dagger"] = '\u2020'; // dagger, U+2020 ISOpub
map["Dagger"] = '\u2021'; // double dagger, U+2021 ISOpub
map["permil"] = '\u2030'; // per mille sign, U+2030 ISOtech
map["lsaquo"] = '\u2039'; // single left-pointing angle quotation mark, U+2039 ISO proposed
// lsaquo is proposed but not yet ISO standardized
map["rsaquo"] = '\u203a'; // single right-pointing angle quotation mark, U+203A ISO proposed
// rsaquo is proposed but not yet ISO standardized
map["euro"] = '\u20ac'; // euro sign, U+20AC NEW
}
/**
* Translates an entity to a unicode character.
* <p>
* Three forms are recognised, tested in this order: a hexadecimal character
* reference ("#x" followed by hex digits), a decimal character reference
* ("#" followed by a decimal number) and an entity name looked up in the
* entity table of this class.
*
* @param name the name of the entity, without the leading '&' and the
* trailing ';'
* @return the corresponding unicode character, or '\0' when the name is
* null, is not a known entity, is not a valid number, or designates a value
* that does not fit in a single char (outside U+0000..U+FFFF)
*/
public static char DecodeEntity(String name) {
    if (name == null) {
        return '\0';
    }
    // Ordinal comparison: the prefixes are markup syntax, not linguistic
    // text. A culture-sensitive match can skip ignorable characters, after
    // which the fixed Substring offsets below would cut at the wrong place.
    if (name.StartsWith("#x", StringComparison.Ordinal)) {
        return ParseCharReference(name.Substring(2), NumberStyles.AllowHexSpecifier);
    }
    if (name.StartsWith("#", StringComparison.Ordinal)) {
        // NumberStyles.Integer is what int.Parse(String) uses by default, so
        // the set of accepted decimal spellings stays the same as before.
        return ParseCharReference(name.Substring(1), NumberStyles.Integer);
    }
    object c = map[name];
    if (c == null) {
        return '\0';
    }
    return (char)c;
}

/**
* Parses the numeric part of a character reference.
*
* @param digits the text following "#" or "#x"
* @param style the number style to parse with (decimal or hexadecimal)
* @return the character with that code, or '\0' when the text is not a
* number or the number is outside the range of a single char
*/
private static char ParseCharReference(String digits, NumberStyles style) {
    int code;
    if (!int.TryParse(digits, style, CultureInfo.InvariantCulture, out code)) {
        return '\0';
    }
    // An unchecked (char) cast would keep only the low 16 bits, turning for
    // example 0x1F600 into U+F600 and -1 into U+FFFF. Reject such values.
    if (code < char.MinValue || code > char.MaxValue) {
        return '\0';
    }
    return (char)code;
}
/**
* Translates a String with entities (&...;) to a String without entities,
* replacing the entity with the right (unicode) character.
* <p>
* An entity that cannot be decoded is copied to the result unchanged, as is
* an ampersand that is followed by another ampersand before the next
* semicolon, or that has no semicolon after it at all. Numeric references
* to code points U+10000..U+10FFFF are replaced by the corresponding
* surrogate pair.
*
* @param s the text to decode; may be null
* @return the decoded text; s itself when it is null or contains no '&'
*/
public static String DecodeString(String s) {
    if (s == null) {
        return null;
    }
    int pos_amp = s.IndexOf('&');
    if (pos_amp == -1) {
        return s;
    }
    StringBuilder buf = new StringBuilder(s.Length);
    buf.Append(s, 0, pos_amp);
    while (true) {
        int pos_sc = s.IndexOf(';', pos_amp);
        if (pos_sc == -1) {
            // No terminator left: the rest of the text is literal.
            buf.Append(s, pos_amp, s.Length - pos_amp);
            return buf.ToString();
        }
        // Ampersands that are followed by another ampersand before the
        // semicolon do not start an entity; copy them through and move the
        // candidate start to the last ampersand before the semicolon.
        int pos_a = s.IndexOf('&', pos_amp + 1);
        while (pos_a != -1 && pos_a < pos_sc) {
            buf.Append(s, pos_amp, pos_a - pos_amp);
            pos_amp = pos_a;
            pos_a = s.IndexOf('&', pos_amp + 1);
        }
        String name = s.Substring(pos_amp + 1, pos_sc - (pos_amp + 1));
        String pair = DecodeSupplementary(name);
        if (pair != null) {
            buf.Append(pair);
        }
        else {
            char replace = DecodeEntity(name);
            if (replace == '\0') {
                // Unknown entity: keep "&name;" as written.
                buf.Append(s, pos_amp, pos_sc + 1 - pos_amp);
            }
            else {
                buf.Append(replace);
            }
        }
        pos_amp = s.IndexOf('&', pos_sc);
        if (pos_amp == -1) {
            buf.Append(s, pos_sc + 1, s.Length - (pos_sc + 1));
            return buf.ToString();
        }
        // Literal text between this entity and the next ampersand.
        buf.Append(s, pos_sc + 1, pos_amp - (pos_sc + 1));
    }
}

/**
* Decodes a numeric character reference that designates a code point above
* U+FFFF, which cannot be returned as a single char.
*
* @param name the entity text without the leading '&' and the trailing ';'
* @return the surrogate pair for the code point, or null when name is not a
* numeric reference in the range U+10000..U+10FFFF
*/
private static String DecodeSupplementary(String name) {
    int code;
    bool parsed;
    if (name.StartsWith("#x", StringComparison.Ordinal)) {
        parsed = int.TryParse(name.Substring(2), NumberStyles.AllowHexSpecifier,
            CultureInfo.InvariantCulture, out code);
    }
    else if (name.StartsWith("#", StringComparison.Ordinal)) {
        parsed = int.TryParse(name.Substring(1), NumberStyles.Integer,
            CultureInfo.InvariantCulture, out code);
    }
    else {
        return null;
    }
    if (!parsed || code < 0x10000 || code > 0x10FFFF) {
        return null;
    }
    return char.ConvertFromUtf32(code);
}
}
}