// ========================================================================
// Copyright 2006 - Volian Enterprises, Inc. All rights reserved.
// Volian Enterprises - Proprietary Information - DO NOT COPY OR DISTRIBUTE
// ------------------------------------------------------------------------
// $Workfile: $     $Revision: $
// $Author: $   $Date: $
//
// $History: $
// ========================================================================
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using Volian.Base.Library;

namespace DataLoader
{
	/// <summary>
	/// Static text-conversion helpers. They rewrite special characters and
	/// single-character formatting codes found in the input text as RTF-style
	/// tokens (\u####?, \'hh, \up2, \dn2, \ul, \b, \i, \par ...).
	/// </summary>
	public static class TextConvert
	{
		/// <summary>
		/// Builds both lookup tables (and <see cref="Reg2"/>) once, before first use.
		/// </summary>
		static TextConvert()
		{
			BuildDictionarySeq();
			BuildDictionaryText();
		}

		// Code point -> byte value table filled by BuildDictionarySeq.
		// Kept separate from dicCharText so that rebuilding one table can
		// never change the other (they used to share a single field).
		private static Dictionary<int, int> dicCharSeq;

		// Code point -> replacement code point table filled by
		// BuildDictionaryText and read by ReplaceChars.
		private static Dictionary<int, int> dicCharText;

		// Code points that are assigned, in order, to the values 128..255 by
		// BuildDictionarySeq (entry 0 maps to 128, entry 127 maps to 255).
		// NOTE(review): the layout looks like the upper half of IBM code page
		// 437, the code page ConvertSeq converts to - confirm if relied upon.
		private static readonly int[] _SeqHighCodePoints =
		{
			199, 252, 233, 226, 228, 224, 229, 231, 234, 235, 232, 239, 238, 236, 196, 197,
			201, 230, 198, 244, 246, 242, 251, 249, 255, 214, 220, 162, 163, 165, 8359, 402,
			225, 237, 243, 250, 241, 209, 170, 186, 191, 8976, 172, 189, 188, 161, 171, 187,
			9617, 9618, 9619, 9474, 9508, 9569, 9570, 9558, 9557, 9571, 9553, 9559, 9565, 9564, 9563, 9488,
			9492, 9524, 9516, 9500, 9472, 9532, 9566, 9567, 9562, 9556, 9577, 9574, 9568, 9552, 9580, 9575,
			9576, 9572, 9573, 9561, 9560, 9554, 9555, 9579, 9578, 9496, 9484, 9608, 9604, 9612, 9616, 9600,
			945, 223, 915, 960, 931, 963, 181, 964, 934, 920, 937, 948, 8734, 966, 949, 8745,
			8801, 177, 8805, 8804, 8992, 8993, 247, 8776, 176, 8729, 183, 8730, 8319, 178, 9632, 160
		};

		/// <summary>
		/// (Re)builds the sequence table: 0..127 map to themselves and each
		/// entry of the high-half table maps to 128 + its position.
		/// </summary>
		public static void BuildDictionarySeq()
		{
			Dictionary<int, int> map = new Dictionary<int, int>();
			for (int i = 0; i < 128; i++)
				map[i] = i;
			for (int i = 0; i < _SeqHighCodePoints.Length; i++)
				map[_SeqHighCodePoints[i]] = 128 + i;
			dicCharSeq = map;
		}

		/// <summary>
		/// Converts <paramref name="s1"/> to code page 437 bytes and returns a
		/// string whose characters carry those byte values (one char per byte).
		/// </summary>
		/// <param name="s1">Text to convert.</param>
		/// <returns>String built from the code page 437 byte values.</returns>
		public static string ConvertSeq(string s1)
		{
			Encoding Eibm437 = Encoding.GetEncoding(437);
			Encoding Eunicode = Encoding.Unicode;
			Byte[] bs1 = Eunicode.GetBytes(s1);
			Byte[] bs2 = Encoding.Convert(Eunicode, Eibm437, bs1);
			char[] cs2 = new char[Eibm437.GetCharCount(bs2)];
			for (int i = 0; i < cs2.Length; i++)
				cs2[i] = (char)bs2[i];
			return new string(cs2);
		}

		/// <summary>
		/// Character-class regex that matches any single key of the text
		/// table; rebuilt by <see cref="BuildDictionaryText"/>.
		/// </summary>
		public static Regex Reg2;

		/// <summary>
		/// (Re)builds the text replacement table used by
		/// <see cref="ReplaceChars"/> and the matching <see cref="Reg2"/>.
		/// </summary>
		public static void BuildDictionaryText()
		{
			Dictionary<int, int> map = new Dictionary<int, int>();
			map[966] = 216;
			map[201] = 274;
			map[127] = 916;
			map[964] = 947;
			map[920] = 952;
			map[915] = 961;
			map[191] = 964;
			map[8801] = 8773;
			map[8734] = 8857;
			map[7] = 9679;
			map[8976] = 9830;
			map[9632] = 9604;
			//dicChar[236] = 38914;

			// One regex character class containing every key of the table.
			char[] creg = new char[map.Count];
			int i = 0;
			foreach (int ic in map.Keys)
			{
				creg[i] = (char)ic;
				i++;
			}
			dicCharText = map;
			Reg2 = new Regex("[" + new string(creg) + "]");
		}

		/// <summary>
		/// Match evaluator for <see cref="Reg2"/>: every character of the match
		/// that is a key of the text table is replaced by its mapped character;
		/// other characters are left unchanged.
		/// </summary>
		/// <param name="m">The regex match to rewrite.</param>
		/// <returns>The rewritten match text.</returns>
		public static string ReplaceChars(Match m)
		{
			char[] cs = m.Value.ToCharArray();
			for (int i = 0; i < cs.Length; i++)
			{
				int mapped;
				if (dicCharText.TryGetValue((int)cs[i], out mapped))
					cs[i] = (char)mapped;
			}
			return new string(cs);
		}

		/// <summary>
		/// Same as <see cref="ConvertText(string)"/>, optionally replacing each
		/// caret with the token \u916? first.
		/// </summary>
		/// <param name="s1">Text to convert.</param>
		/// <param name="DoCaret">True to replace "^" with \u916? before converting.</param>
		/// <returns>The converted text.</returns>
		public static string ConvertText(string s1, bool DoCaret)
		{
			string s2 = s1;
			if (DoCaret)
				s2 = s2.Replace("^", @"\u916?");
			return ConvertText(s2);
		}

		/// <summary>
		/// Debug aid: writes <paramref name="str"/> to the console, showing every
		/// character outside 32..126 (except '\n') as a 4-digit hex code in
		/// double angle brackets.
		/// </summary>
		private static void ShowRawString(string str, string title)
		{
			Console.WriteLine("Raw Start --{0}:\n", title);
			foreach (char c in str)
			{
				int ic = (int)c;
				if (c != '\n' && (ic > 126 || ic < 32))
					Console.Write("<<{0:x4}>>", ic);
				else
					Console.Write(c);
			}
			Console.WriteLine("\n-- Raw End:{0}", title);
		}

		/// <summary>
		/// Calls <see cref="ReplaceUnicode(string, bool)"/> without caret replacement.
		/// </summary>
		public static string ReplaceUnicode(string s2)
		{
			return ReplaceUnicode(s2, false);
		}

		/// <summary>
		/// Replaces individual special characters with RTF tokens, converts
		/// E-style numbers, and normalizes a few token sequences.
		/// </summary>
		/// <param name="s2">Text to convert.</param>
		/// <param name="DoCaret">True to replace "^" with \u916?.</param>
		/// <returns>The converted text; "\par " is turned back into "\r\n".</returns>
		public static string ReplaceUnicode(string s2, bool DoCaret)
		{
			//char[] tmp;
			//tmp = s2.ToCharArray();
			//ShowRawString(s2, "ReplaceUnicode");
			s2 = s2.Replace("`", @"\'b0"); // convert backquote to degree - left over from DOS days.
			s2 = s2.Replace("\xa0",@"\u160?"); // hardspace
			s2 = s2.Replace("\xb0", @"\'b0"); // degree
			s2 = s2.Replace("\x7f", @"\u916?"); // delta
			s2 = s2.Replace("\x2265",@"\u8805?"); // greater than or equal
			s2 = s2.Replace("\x2264",@"\u8804?"); // less than or equal
			s2 = s2.Replace("\xB1",@"\'b1"); // plus minus
			s2 = s2.Replace("\x3A3",@"\u931?"); // sigma
			s2 = s2.Replace("\x3C4",@"\u947?"); // gamma
			s2 = s2.Replace("\xBD",@"\'bd"); // half
			s2 = s2.Replace("\x25A0",@"\u9604?"); // accum 2584
			s2 = s2.Replace("\x7",@"\u9679?"); // bullet 25CF
			s2 = s2.Replace("\x2248",@"\u8776?"); // approx eq
			s2 = s2.Replace("\x2261",@"\u8773?"); // similar eq 2245
			s2 = s2.Replace("\xF7",@"\'f7"); // division
			s2 = s2.Replace("\x221A",@"\u8730?"); // square root
			s2 = s2.Replace("\x393",@"\u961?"); // rho 3C1
			s2 = s2.Replace("\x3C0",@"\u960?"); // pi
			s2 = s2.Replace("\xb5", @"\u956?"); // micro 3BC (try e6, if not work try 109)
			s2 = s2.Replace("\x3B4", @"\u948?"); // lower case delta
			s2 = s2.Replace("\x3C3", @"\u963?"); // lower case sigma
			s2 = s2.Replace("\xBC", @"\'bc"); // quarter
			s2 = s2.Replace("\x256A", @"\'d8"); // dist zero, D8
			s2 = s2.Replace("\x3C6", @"\'d8"); // dist zero, D8
			s2 = s2.Replace("\xC9", @"\u274?"); // energy, 112
			s2 = s2.Replace("\xEC", @"\'ec"); // grave
			s2 = s2.Replace("\x2502", @"\u9474?"); // bar
			s2 = s2.Replace("\x3B5", @"\u949?"); // epsilon
			s2 = s2.Replace("\x398", @"\u952?"); // theta, 3B8
			s2 = s2.Replace("\x221E", @"\u8857?"); // dot in oval, 2299
			s2 = s2.Replace("\xBF", @"\u964?"); // tau, 3C4
			s2 = s2.Replace("\x2310", @"\u9830?"); // diamond, 2666
			s2 = s2.Replace("\x2192", @"\u8594?");
			s2 = s2.Replace("\x2190", @"\u8592?");
			s2 = s2.Replace("\x2191", @"\u8593?");
			s2 = s2.Replace("\x2193", @"\u8595?");
			s2 = s2.Replace("\x2207", @"\u8711?");
			s2 = s2.Replace("\x2591", @"\'b0"); // Degree Symbol
			s2 = s2.Replace("\xFF", @"\u8593?"); // Up Arrow
			s2 = s2.Replace("\xD6", @"\u8595?"); // Down Arrow
			if (DoCaret)
				s2 = s2.Replace("^", @"\u916?");
			//s2 = s2.Replace("^", @"\u916");
			//s2 = ConvertDOSSuperAndSubScripts(s2);

			// Record the before/after text whenever the E-number conversion changed something.
			string sBefore = s2;
			s2 = ConvertFortranFormatToScienctificNotation(s2);
			if (sBefore != s2)
				MyGlitches.Add("ConvertFortranFormatToScienctificNotation", sBefore, s2);

			// Convert dash to a non-breaking dash. This is a unicode character.
			// This character will be used in veproms rather than a dash.
			// If the dash is preceded by a token, remove the space following the token.
			//#if DEBUG
			if (VlnSettings.DebugMode)
			{
				if (s2.Contains(@"\super "))
					Console.WriteLine("RTF Super token");
			}
			//#endif
			s2 = Regex.Replace(s2, @"(\\[^ \\?]*) \-", @"$1\u8209?");
			// The blanket dash replacement below is disabled here; ConvertText
			// performs it after its super/sub/bold handling.
			//KBR s2 = s2.Replace("-", @"\u8209?");

			// Remove spaces between comment end and next token
			s2 = s2.Replace(@"\v0 \", @"\v0\");
			// Change token order to match RTB output
			s2 = s2.Replace(@"\v0\b0", @"\b0\v0");
			s2 = s2.Replace(@"\b0\ulnone", @"\ulnone\b0");
			s2 = s2.Replace(@"\par ", "\r\n");
			return s2;
		}

		private static DataLoaderGlitches _MyGlitches;

		/// <summary>
		/// Glitch collector used by <see cref="ReplaceUnicode(string, bool)"/>;
		/// created on first read when none has been assigned.
		/// </summary>
		public static DataLoaderGlitches MyGlitches
		{
			get
			{
				if (_MyGlitches == null)
					_MyGlitches = new DataLoaderGlitches();
				return _MyGlitches;
			}
			set { _MyGlitches = value; }
		}

		/// <summary>
		/// Rewrites E-style numbers (for example 1.50E-3) as a mantissa followed
		/// by "x10" and the exponent wrapped in \up2 ... \up0.
		/// </summary>
		/// <param name="str">Text to scan.</param>
		/// <returns>The text with every matching number rewritten.</returns>
		public static string ConvertFortranFormatToScienctificNotation(string str)
		{
			// Groups: 1 = sign, 2 = integer part, 3 = fraction without trailing zeros, 4 = exponent
			string retval = Regex.Replace(str, "([+-]?)([0-9]+)[.]([0-9]*?)0*E([+-]?[0-9]+)", new MatchEvaluator(FixFortranNumber));
			return retval;
		}

		/// <summary>
		/// Rewrites #text# as \up2 text\up0 and ~text~ as \dn2 text\up0.
		/// </summary>
		/// <param name="instr">Text to convert.</param>
		/// <returns>The converted text, or an empty string when the conversion throws.</returns>
		public static string ConvertDOSSuperAndSubScripts(string instr)
		{
			try
			{
				string retval = Regex.Replace(instr, "[#](.*?)[#]", "\\up2 $1\\up0 ");// DOS Superscript
				retval = Regex.Replace(retval, "[~](.*?)[~]", "\\dn2 $1\\up0 ");// DOS Subscript
				return retval;
			}
			catch (Exception ex)
			{
				// Best effort: report what went wrong and fall through to the empty result.
				Console.WriteLine("Error in ConvertDOSSuperAndSubScripts: {0}", ex.Message);
			}
			return "";
		}

		/// <summary>
		/// Match evaluator for <see cref="ConvertFortranFormatToScienctificNotation"/>.
		/// </summary>
		/// <param name="match">Match with groups sign, integer part, fraction, exponent.</param>
		/// <returns>The replacement text for one number.</returns>
		private static string FixFortranNumber(Match match)
		{
			StringBuilder sb = new StringBuilder(match.Groups[1].Value);
			if (match.Groups[3].Length == 0) // Nothing to the right of the decimal
			{
				if (match.Groups[2].Value != "1") // Other than "1", multiply it times 10 raised to a power
					sb.Append(match.Groups[2].Value + "x10");
				else // The number is simply 1 so it can be ignored and 10 can be raised to a power
					sb.Append("10");
			}
			else // A number with a decimal point
			{
				sb.Append(match.Groups[2].Value + "." + match.Groups[3].Value + "x10");
			}
			// Add the exponent as superscript
			return sb.ToString() + "\\up2 " + match.Groups[4].Value + "\\up0 ";
		}

		#region CommentedOut
		//private static string ConvertFortranFormatToScienctificNotation(string str)
		//{
		//    string outstr = "";
		//    int orglen = str.Length;
		//    int cnt = 0;
		//    int ptr;
		//    int nbytes;
		//    int tstr, tstr2, rptr, start = 0;
		//    while (cnt < orglen)
		//    {
		//        // position up to the the next number, sign, or period
		//        ptr = str.IndexOfAny("+-0123456789.".ToCharArray(), cnt);
		//        if (ptr == -1)
		//        {
		//            outstr += str.Substring(cnt);
		//            break; // jump out of while loop - nothing else to process
		//        }
		//        if ((ptr - cnt) > 0)
		//        {
		//            outstr += str.Substring(cnt, ptr - cnt);
		//            cnt = ptr;
		//        }
		//        if (cnt > start && str[cnt - 1] == '\'')
		//        {
		//            //B2003-053: only remove the single quote character
		//            // if str ptr is not at the end of the string or
		//            // the next char (after the str ptr) is not a space
		//            // or newline... (as per Paul Linn on 7/17/03)
		//            int len = orglen - cnt;
		//            if (len <= 1 || str[cnt + 1] == ' ' || str[cnt + 1] == '\n')
		//                start = cnt;
		//            else
		//                start = cnt - 1;
		//        }
		//        else start = cnt;
		//        tstr = cnt;
		//        //Skip preceeding signs
		//        if (str[cnt] == '+' || str[cnt] == '-')
		//            cnt++;
		//        cnt = NextNonNumber(str, cnt);
		//        if ((cnt < str.Length -1) && str[cnt] == '.')
		//        {
		//            cnt = NextNonNumber(str, cnt + 1);
		//            if (str[start] == '\'')
		//            {
		//                start++;
		//            }
		//            else if ((cnt < str.Length -1) && (str[cnt] == 'E') && (cnt > tstr))
		//            {
		//                nbytes = (cnt - tstr); // don't include the 'E'
		//                outstr += str.Substring(tstr, nbytes);
		//                cnt++;
		//                rptr = outstr.Length - 1;
		//                while (outstr[rptr] == '0') rptr--;
		//                if (outstr[rptr] != '.') rptr++;
		//                if (rptr < (outstr.Length - 1))
		//                    outstr = outstr.Substring(0, rptr + 1); // trim trailing 0's
		//                int poutstr = 0;
		//                if (outstr[poutstr] == '+' || outstr[poutstr] == '-') poutstr++;
		//                if (!outstr[poutstr].Equals("1"))
		//                {
		//                    outstr += "x1";
		//                }
		//                outstr += "0\\super ";
		//                tstr2 = cnt;
		//                if (str[cnt] == '+' || str[cnt] == '-') cnt++;
		//                cnt = NextNonNumber(str, cnt);
		//                if (str[cnt] == '.' && char.IsDigit(str, cnt + 1))
		//                    cnt = NextNonNumber(str, cnt + 1);
		//                nbytes = cnt - tstr2; // +1;
		//                outstr += str.Substring(tstr2, nbytes);
		//                outstr += "\\nosupersub ";
		//                if (!char.IsLetterOrDigit(str, cnt) && !char.IsWhiteSpace(str, cnt))
		//                    return (str.Substring(tstr));
		//            }
		//            else if (cnt > 0)
		//            {
		//                outstr += str.Substring(start, cnt - start + ((cnt < str.Length) ? 1 : 0));
		//            }
		//        }
		//        else
		//        {
		//            outstr += str.Substring(start, cnt - start + ((cnt < str.Length)?1:0));
		//            cnt++;
		//        }
		//    }
		//    return (outstr);
		//}
		//private static int NextNonNumber(string str, int cnt)
		//{
		//    int rtn = 0;
		//    string tstr = str.Substring(cnt);
		//    int len = tstr.Length;
		//    while (rtn < len && char.IsDigit(tstr, rtn)) rtn++;
		//    return rtn + cnt;
		//}
		//public static string ConvertDOSSuperAndSubScripts(string instr)
		//{
		//    string outstr = "";
		//    string tstr = instr;
		//    int cnt = 0;
		//    int ptr = 0;
		//    bool issupper = false, issub = false;
		//    while (tstr != null && (ptr = tstr.IndexOfAny("#~".ToCharArray(), cnt)) >= 0)
		//    {
		//        if (ptr > cnt)
		//            outstr += tstr.Substring(cnt, ptr - cnt);
		//        switch (tstr[ptr])
		//        {
		//            case '#':
		//                if (issub || issupper)
		//                    outstr += "\\nosupersub ";
		//                else
		//                    outstr += "\\super ";
		//                issupper = !issupper;
		//                issub = false;
		//                break;
		//            case '~':
		//                if (issupper || issub)
		//                    outstr += "\\nosupersub ";
		//                else
		//                    outstr += "\\sub ";
		//                issub = !issub;
		//                issupper = false;
		//                break;
		//        }
		//        cnt = ptr + 1;
		//        if (cnt >= tstr.Length)
		//            tstr = null;
		//        else
		//            tstr = instr.Substring(cnt);
		//        cnt = 0;
		//    }
		//    if (tstr != null)
		//        outstr += tstr;
		//    return outstr;
		//}
		#endregion

		/// <summary>
		/// Full conversion: runs <see cref="ReplaceUnicode(string)"/>, turns the
		/// formatting on/off characters and the "next word" control codes into
		/// RTF tokens, applies the text table through <see cref="Reg2"/>,
		/// escapes curly braces and turns "\n" into "\par ".
		/// </summary>
		/// <param name="s1">Text to convert.</param>
		/// <returns>The converted text.</returns>
		public static string ConvertText(string s1)
		{
			string s2 = s1;
			//CompareBeforeAndAfter(s1);
			s2 = ReplaceUnicode(s2);
			// now replace underline on/off (AE,AF), super on/off (C6,C7)
			// bold on/off (D5, D6), subscript on/off (D1 A6), and
			// italics on/off (B2, DD)
			//s2 = s2.Replace("\xAB", "\\ul ");
			//s2 = s2.Replace("\xBB", "\\ulnone ");
			//s2 = s2.Replace("\x255E", "\\super ");
			//s2 = s2.Replace("\x255F", "\\nosupersub ");
			//\xAB -> \\ul and \xBB -> \\ulnone - look for pairs:
			s2 = Regex.Replace(s2, @"\xAB([^\xBB]*?)\xBB", @"\ul $1\ulnone ");
			// if there is an underline on without underline off or vice versa, just remove it.
			if (s2.Contains("\xAB") || s2.Contains("\xBB"))
				s2 = s2.Replace("\xAB", "").Replace("\xBB", "");
			s2 = s2.Replace("\x255E", "\\up2 ");
			s2 = s2.Replace("\x255F", "\\up0 ");
			s2 = s2.Replace("\x2552", "\\b ");
			s2 = s2.Replace("\x2553", "\\b0 ");
			//s2 = s2.Replace("\x2564", "\\sub ");
			//s2 = s2.Replace("\xAA", "\\nosupersub ");
			s2 = s2.Replace("\x2564", "\\dn2 ");
			s2 = s2.Replace("\xAA", "\\up0 ");
			s2 = s2.Replace("\x2593", "\\i ");
			s2 = s2.Replace("\x258C", "\\i0 ");
			s2 = s2.Replace("\x2559", "\\ul\\b ");
			s2 = s2.Replace("\x2558", "\\b0\\ulnone ");
			// underline next word is 0x17
			// superscript next is 0x18
			// subscript next is 0x19
			// bold next is 0x13
			// \x18([A-Za-z0-9]+)(?:[\x18]|(?= )|\Z|(?=[^A-Za-z0-9]))(.*?)
			s2 = Regex.Replace(s2, @"\x18([A-Za-z0-9\-]+)(?:[\x18]|(?= )|\Z|(?=[^A-Za-z0-9]))(.*?)", @"\up2 $1\up0 $2");
			s2 = Regex.Replace(s2, @"\x19([A-Za-z0-9\-]+)(?:[\x19]|(?= )|\Z|(?=[^A-Za-z0-9]))(.*?)", @"\dn2 $1\up0 $2");
			s2 = Regex.Replace(s2, @"\x13([A-Za-z0-9\-]+)(?:[\x13]|(?= )|\Z|(?=[^A-Za-z0-9]))(.*?)", @"\b $1\b0 $2");
			s2 = s2.Replace("\x11", ""); // this was an 'end' string for the above, 16bit just removed this char
			s2 = s2.Replace("-", @"\u8209?"); // do this here so that super/sub & bold next work.
			// if the underline is at beginning of text, don't replace the underline token with a space:
			s2 = Regex.Replace(s2, @"^\x17(([A-Za-z0-9]|\\u[0-9]+\?)+)", @"\ul $1\ulnone ");
			s2 = Regex.Replace(s2, @"\x17(([A-Za-z0-9]|\\u[0-9]+\?)+)", @" \ul $1\ulnone ");
			s2 = Reg2.Replace(s2, new MatchEvaluator(ReplaceChars));
			// Now prepend an escape character, '\', to any curly brace. The curly brace
			// is used in rtf land.
			s2 = s2.Replace(@"{", @"\{");
			s2 = s2.Replace(@"}", @"\}");
			s2 = s2.Replace("\n", @"\par "); // line break in tables
			s2 = s2.Replace(@"\up0 \up2 ", @"\up2 "); //jsj - 18MAR2010 - rbt.Save() seems to do this automatically
			s2 = s2.Replace(@"\up0 \dn2 ", @"\dn2 "); //jsj - 18MAR2010 - rbt.Save() seems to do this automatically
			return s2;
		}

		/// <summary>
		/// Debug aid: lists the differences between the text and its
		/// <see cref="ConvertSeq"/> result, if there are any.
		/// </summary>
		private static void CompareBeforeAndAfter(string txtBefore)
		{
			string txtAfter = ConvertSeq(txtBefore);
			if (txtAfter != txtBefore)
			{
				ListDifference(txtBefore, txtAfter);
			}
		}

		// Character code -> number of times ListDifference saw it differ.
		private static Dictionary<int, int> _SpecialChars = new Dictionary<int, int>();

		/// <summary>
		/// Compares the two strings position by position. The first time a
		/// differing "before" character is seen it is reported on the console
		/// with its surrounding text; later sightings only bump its counter.
		/// A length mismatch is reported as well.
		/// </summary>
		private static void ListDifference(string txtBefore, string txtAfter)
		{
			int nBefore = txtBefore.Length;
			int nAfter = txtAfter.Length;
			int n = nBefore > nAfter ? nAfter : nBefore;
			for (int i = 0; i < n; i++)
			{
				int chrBefore = (int)(txtBefore[i]);
				int chrAfter = (int)(txtAfter[i]);
				if (chrBefore == chrAfter)
					continue;

				int seen;
				if (_SpecialChars.TryGetValue(chrBefore, out seen))
				{
					_SpecialChars[chrBefore] = seen + 1;
					continue;
				}

				_SpecialChars.Add(chrBefore, 1);
				Console.WriteLine("Character Difference 0x{0:X0000} 0x{1:X0000} @ {2}", chrBefore, chrAfter, i);
				//Console.WriteLine("Text Difference\r\nBefore '{0}'\r\nAfter '{0}'", txtBefore, txtAfter);
				// Everything before position i (previously one character short)
				// and everything after it.
				string prefix = txtBefore.Substring(0, i);
				string suffix = txtBefore.Substring(i + 1);
				Console.WriteLine("Found in '{0}' 0x{1:X} '{2}'", prefix, chrBefore, suffix);
			}
			if (nBefore != nAfter)
			{
				// {1} so the "After" line really shows txtAfter.
				Console.WriteLine("Length Difference\r\nBefore '{0}'\r\nAfter '{1}'", txtBefore, txtAfter);
			}
		}

		/// <summary>
		/// Writes every character code collected so far and its occurrence count to the console.
		/// </summary>
		public static void ListSpecialCharacters()
		{
			Console.WriteLine("Special Characters");
			foreach (int chr in _SpecialChars.Keys)
			{
				Console.WriteLine("0x{0:X0000} - {1} occurances", chr, _SpecialChars[chr]);
			}
		}

		/// <summary>
		/// Discards all collected character counts.
		/// </summary>
		public static void ResetSpecialCharacters()
		{
			_SpecialChars = new Dictionary<int, int>();
		}
	}
}