From f784beb00578578fd80fc6d420114b072193e2ce Mon Sep 17 00:00:00 2001 From: John Date: Mon, 30 Mar 2009 15:14:21 +0000 Subject: [PATCH] --- PROMS/EPocalipse.IFilter/ComHelper.cs | 110 +++++ .../EPocalipse.IFilter.csproj | 52 +++ .../EPocalipse.IFilter/EPocalipse.IFilter.sln | 20 + PROMS/EPocalipse.IFilter/FilterLoader.cs | 226 +++++++++ PROMS/EPocalipse.IFilter/FilterReader.cs | 112 +++++ PROMS/EPocalipse.IFilter/IFilter.cs | 435 ++++++++++++++++++ 6 files changed, 955 insertions(+) create mode 100644 PROMS/EPocalipse.IFilter/ComHelper.cs create mode 100644 PROMS/EPocalipse.IFilter/EPocalipse.IFilter.csproj create mode 100644 PROMS/EPocalipse.IFilter/EPocalipse.IFilter.sln create mode 100644 PROMS/EPocalipse.IFilter/FilterLoader.cs create mode 100644 PROMS/EPocalipse.IFilter/FilterReader.cs create mode 100644 PROMS/EPocalipse.IFilter/IFilter.cs diff --git a/PROMS/EPocalipse.IFilter/ComHelper.cs b/PROMS/EPocalipse.IFilter/ComHelper.cs new file mode 100644 index 00000000..0950df90 --- /dev/null +++ b/PROMS/EPocalipse.IFilter/ComHelper.cs @@ -0,0 +1,110 @@ +using System; +using System.Collections.Generic; +using System.Text; +using System.Runtime.InteropServices; + +namespace EPocalipse.IFilter +{ + [ComVisible(false)] + [ComImport, InterfaceType(ComInterfaceType.InterfaceIsIUnknown), Guid("00000001-0000-0000-C000-000000000046")] + internal interface IClassFactory + { + void CreateInstance([MarshalAs(UnmanagedType.Interface)] object pUnkOuter, ref Guid refiid, [MarshalAs(UnmanagedType.Interface)] out object ppunk); + void LockServer(bool fLock); + } + + /// + /// Utility class to get a Class Factory for a certain Class ID + /// by loading the dll that implements that class + /// + internal static class ComHelper + { + //DllGetClassObject fuction pointer signature + private delegate int DllGetClassObject(ref Guid ClassId, ref Guid InterfaceId, [Out, MarshalAs(UnmanagedType.Interface)] out object ppunk); + + //Some win32 methods to load\unload dlls and get a function pointer + private class Win32NativeMethods + { + [DllImport("kernel32.dll", CharSet=CharSet.Ansi)] + public static extern IntPtr GetProcAddress(IntPtr hModule, string lpProcName); + + [DllImport("kernel32.dll")] + public static extern bool FreeLibrary(IntPtr hModule); + + [DllImport("kernel32.dll")] + public static extern IntPtr LoadLibrary(string lpFileName); + } + + /// + /// Holds a list of dll handles and unloads the dlls + /// in the destructor + /// + private class DllList + { + private List _dllList=new List(); + public void AddDllHandle(IntPtr dllHandle) + { + lock (_dllList) + { + _dllList.Add(dllHandle); + } + } + + ~DllList() + { + foreach (IntPtr dllHandle in _dllList) + { + try + { + Win32NativeMethods.FreeLibrary(dllHandle); + } + catch { }; + } + } + } + + static DllList _dllList=new DllList(); + + /// + /// Gets a class factory for a specific COM Class ID. + /// + /// The dll where the COM class is implemented + /// The requested Class ID + /// IClassFactory instance used to create instances of that class + internal static IClassFactory GetClassFactory(string dllName, string filterPersistClass) + { + //Load the class factory from the dll + IClassFactory classFactory=GetClassFactoryFromDll(dllName, filterPersistClass); + return classFactory; + } + + private static IClassFactory GetClassFactoryFromDll(string dllName, string filterPersistClass) + { + //Load the dll + IntPtr dllHandle=Win32NativeMethods.LoadLibrary(dllName); + if (dllHandle==IntPtr.Zero) + return null; + + //Keep a reference to the dll until the process\AppDomain dies + _dllList.AddDllHandle(dllHandle); + + //Get a pointer to the DllGetClassObject function + IntPtr dllGetClassObjectPtr=Win32NativeMethods.GetProcAddress(dllHandle, "DllGetClassObject"); + if (dllGetClassObjectPtr==IntPtr.Zero) + return null; + + //Convert the function pointer to a .net delegate + DllGetClassObject dllGetClassObject=(DllGetClassObject)Marshal.GetDelegateForFunctionPointer(dllGetClassObjectPtr, typeof(DllGetClassObject)); + + //Call the DllGetClassObject to retreive a class factory for out Filter class + Guid filterPersistGUID=new Guid(filterPersistClass); + Guid IClassFactoryGUID=new Guid("00000001-0000-0000-C000-000000000046"); //IClassFactory class id + Object unk; + if (dllGetClassObject(ref filterPersistGUID, ref IClassFactoryGUID, out unk)!=0) + return null; + + //Yippie! cast the returned object to IClassFactory + return (unk as IClassFactory); + } + } +} diff --git a/PROMS/EPocalipse.IFilter/EPocalipse.IFilter.csproj b/PROMS/EPocalipse.IFilter/EPocalipse.IFilter.csproj new file mode 100644 index 00000000..c5c93a3b --- /dev/null +++ b/PROMS/EPocalipse.IFilter/EPocalipse.IFilter.csproj @@ -0,0 +1,52 @@ + + + Debug + AnyCPU + 8.0.50727 + 2.0 + {400462CE-40B3-498F-B95A-B1D2AE679359} + Library + Properties + EPocalipse.IFilter + EPocalipse.IFilter + + + true + full + false + bin\Debug\ + DEBUG;TRACE + prompt + 4 + + + + + pdbonly + true + bin\Release\ + TRACE + prompt + 4 + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/PROMS/EPocalipse.IFilter/EPocalipse.IFilter.sln b/PROMS/EPocalipse.IFilter/EPocalipse.IFilter.sln new file mode 100644 index 00000000..52dad765 --- /dev/null +++ b/PROMS/EPocalipse.IFilter/EPocalipse.IFilter.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 9.00 +# Visual Studio 2005 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "EPocalipse.IFilter", "EPocalipse.IFilter.csproj", "{400462CE-40B3-498F-B95A-B1D2AE679359}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {400462CE-40B3-498F-B95A-B1D2AE679359}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {400462CE-40B3-498F-B95A-B1D2AE679359}.Debug|Any CPU.Build.0 = Debug|Any CPU + {400462CE-40B3-498F-B95A-B1D2AE679359}.Release|Any CPU.ActiveCfg = Release|Any CPU + {400462CE-40B3-498F-B95A-B1D2AE679359}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/PROMS/EPocalipse.IFilter/FilterLoader.cs b/PROMS/EPocalipse.IFilter/FilterLoader.cs new file mode 100644 index 00000000..bf55e512 --- /dev/null +++ b/PROMS/EPocalipse.IFilter/FilterLoader.cs @@ -0,0 +1,226 @@ +using System; +using System.Collections.Generic; +using System.Text; +using Microsoft.Win32; +using System.IO; +using System.Runtime.InteropServices.ComTypes; +using System.Runtime.InteropServices; + +namespace EPocalipse.IFilter +{ + /// + /// FilterLoader finds the dll and ClassID of the COM object responsible + /// for filtering a specific file extension. + /// It then loads that dll, creates the appropriate COM object and returns + /// a pointer to an IFilter instance + /// + static class FilterLoader + { + #region CacheEntry + private class CacheEntry + { + public string DllName; + public string ClassName; + + public CacheEntry(string dllName, string className) + { + DllName=dllName; + ClassName=className; + } + } + #endregion + + static Dictionary _cache=new Dictionary(); + + #region Registry Read String helper + static string ReadStrFromHKLM(string key) + { + return ReadStrFromHKLM(key,null); + } + static string ReadStrFromHKLM(string key, string value) + { + RegistryKey rk=Registry.LocalMachine.OpenSubKey(key); + if (rk==null) + return null; + + using (rk) + { + return (string)rk.GetValue(value); + } + } + #endregion + + /// + /// finds an IFilter implementation for a file type + /// + /// The extension of the file + /// an IFilter instance used to retreive text from that file type + private static IFilter LoadIFilter(string ext) + { + string dllName, filterPersistClass; + + //Find the dll and ClassID + if (GetFilterDllAndClass(ext, out dllName, out filterPersistClass)) + { + //load the dll and return an IFilter instance. + return LoadFilterFromDll(dllName, filterPersistClass); + } + return null; + } + + internal static IFilter LoadAndInitIFilter(string fileName) + { + return LoadAndInitIFilter(fileName,Path.GetExtension(fileName)); + } + + internal static IFilter LoadAndInitIFilter(string fileName, string extension) + { + IFilter filter=LoadIFilter(extension); + if (filter==null) + return null; + IPersistFile persistFile=(filter as IPersistFile); + if (persistFile!=null) + { + persistFile.Load(fileName, 0); + IFILTER_FLAGS flags; + IFILTER_INIT iflags = + IFILTER_INIT.CANON_HYPHENS | + IFILTER_INIT.CANON_PARAGRAPHS | + IFILTER_INIT.CANON_SPACES | + IFILTER_INIT.APPLY_INDEX_ATTRIBUTES | + IFILTER_INIT.HARD_LINE_BREAKS | + IFILTER_INIT.FILTER_OWNED_VALUE_OK; + + if (filter.Init(iflags, 0, IntPtr.Zero, out flags)==IFilterReturnCode.S_OK) + return filter; + } + //If we failed to retreive an IPersistFile interface or to initialize + //the filter, we release it and return null. + Marshal.ReleaseComObject(filter); + return null; + } + + private static IFilter LoadFilterFromDll(string dllName, string filterPersistClass) + { + //Get a classFactory for our classID + IClassFactory classFactory=ComHelper.GetClassFactory(dllName, filterPersistClass); + if (classFactory==null) + return null; + + //And create an IFilter instance using that class factory + Guid IFilterGUID=new Guid("89BCB740-6119-101A-BCB7-00DD010655AF"); + Object obj; + classFactory.CreateInstance(null, ref IFilterGUID, out obj); + return (obj as IFilter); + } + + private static bool GetFilterDllAndClass(string ext, out string dllName, out string filterPersistClass) + { + if (!GetFilterDllAndClassFromCache(ext, out dllName, out filterPersistClass)) + { + string persistentHandlerClass; + + persistentHandlerClass=GetPersistentHandlerClass(ext,true); + if (persistentHandlerClass!=null) + { + GetFilterDllAndClassFromPersistentHandler(persistentHandlerClass, + out dllName, out filterPersistClass); + } + AddExtensionToCache(ext, dllName, filterPersistClass); + } + return (dllName!=null && filterPersistClass!=null); + } + + private static void AddExtensionToCache(string ext, string dllName, string filterPersistClass) + { + lock (_cache) + { + _cache.Add(ext.ToLower(), new CacheEntry(dllName, filterPersistClass)); + } + } + + private static bool GetFilterDllAndClassFromPersistentHandler(string persistentHandlerClass, out string dllName, out string filterPersistClass) + { + dllName=null; + filterPersistClass=null; + + //Read the CLASS ID of the IFilter persistent handler + filterPersistClass=ReadStrFromHKLM(@"Software\Classes\CLSID\" + persistentHandlerClass + + @"\PersistentAddinsRegistered\{89BCB740-6119-101A-BCB7-00DD010655AF}"); + if (String.IsNullOrEmpty(filterPersistClass)) + return false; + + //Read the dll name + dllName=ReadStrFromHKLM(@"Software\Classes\CLSID\" + filterPersistClass + + @"\InprocServer32"); + return (!String.IsNullOrEmpty(dllName)); + } + + private static string GetPersistentHandlerClass(string ext, bool searchContentType) + { + //Try getting the info from the file extension + string persistentHandlerClass=GetPersistentHandlerClassFromExtension(ext); + if (String.IsNullOrEmpty(persistentHandlerClass)) + //try getting the info from the document type + persistentHandlerClass=GetPersistentHandlerClassFromDocumentType(ext); + if (searchContentType && String.IsNullOrEmpty(persistentHandlerClass)) + //Try getting the info from the Content Type + persistentHandlerClass=GetPersistentHandlerClassFromContentType(ext); + return persistentHandlerClass; + } + + private static string GetPersistentHandlerClassFromContentType(string ext) + { + string contentType=ReadStrFromHKLM(@"Software\Classes\"+ext,"Content Type"); + if (String.IsNullOrEmpty(contentType)) + return null; + + string contentTypeExtension=ReadStrFromHKLM(@"Software\Classes\MIME\Database\Content Type\"+contentType, + "Extension"); + if (ext.Equals(contentTypeExtension, StringComparison.CurrentCultureIgnoreCase)) + return null; //No need to look further. This extension does not have any persistent handler + + //We know the extension that is assciated with that content type. Simply try again with the new extension + return GetPersistentHandlerClass(contentTypeExtension, false); //Don't search content type this time. + } + + private static string GetPersistentHandlerClassFromDocumentType(string ext) + { + //Get the DocumentType of this file extension + string docType=ReadStrFromHKLM(@"Software\Classes\"+ext); + if (String.IsNullOrEmpty(docType)) + return null; + + //Get the Class ID for this document type + string docClass=ReadStrFromHKLM(@"Software\Classes\" + docType + @"\CLSID"); + if (String.IsNullOrEmpty(docType)) + return null; + + //Now get the PersistentHandler for that Class ID + return ReadStrFromHKLM(@"Software\Classes\CLSID\" + docClass + @"\PersistentHandler"); + } + + private static string GetPersistentHandlerClassFromExtension(string ext) + { + return ReadStrFromHKLM(@"Software\Classes\"+ext+@"\PersistentHandler"); + } + + private static bool GetFilterDllAndClassFromCache(string ext, out string dllName, out string filterPersistClass) + { + string lowerExt=ext.ToLower(); + lock (_cache) + { + CacheEntry cacheEntry; + if (_cache.TryGetValue(lowerExt, out cacheEntry)) + { + dllName=cacheEntry.DllName; + filterPersistClass=cacheEntry.ClassName; + return true; + } + } + dllName=null; + filterPersistClass=null; + return false; + } + } +} diff --git a/PROMS/EPocalipse.IFilter/FilterReader.cs b/PROMS/EPocalipse.IFilter/FilterReader.cs new file mode 100644 index 00000000..2f98f8de --- /dev/null +++ b/PROMS/EPocalipse.IFilter/FilterReader.cs @@ -0,0 +1,112 @@ +using System; +using System.Collections.Generic; +using System.Text; +using System.IO; +using System.Runtime.InteropServices; + +namespace EPocalipse.IFilter +{ + /// + /// Implements a TextReader that reads from an IFilter. + /// + public class FilterReader : TextReader + { + IFilter _filter; + private bool _done; + private STAT_CHUNK _currentChunk; + private bool _currentChunkValid; + private char[] _charsLeftFromLastRead; + + public override void Close() + { + Dispose(true); + GC.SuppressFinalize(this); + } + + ~FilterReader() + { + Dispose(false); + } + + protected override void Dispose(bool disposing) + { + if (_filter!=null) + Marshal.ReleaseComObject(_filter); + } + + public override int Read(char[] array, int offset, int count) + { + int endOfChunksCount=0; + int charsRead=0; + + while (!_done && charsRead1) + _done=true; //That's it. no more chuncks available + } + + if (_currentChunkValid) + { + uint bufLength=(uint)(count-charsRead); + if (bufLength<8192) + bufLength=8192; //Read ahead + + char[] buffer=new char[bufLength]; + IFilterReturnCode res=_filter.GetText(ref bufLength, buffer); + buffer[bufLength++] = ' '; + if (res==IFilterReturnCode.S_OK || res==IFilterReturnCode.FILTER_S_LAST_TEXT) + { + int cRead=(int)bufLength; + if (cRead+charsRead>count) + { + int charsLeft=(cRead+charsRead-count); + _charsLeftFromLastRead=new char[charsLeft]; + Array.Copy(buffer, cRead-charsLeft, _charsLeftFromLastRead, 0, charsLeft); + cRead-=charsLeft; + } + else + _charsLeftFromLastRead=null; + + Array.Copy(buffer, 0, array, offset+charsRead, cRead); + charsRead+=cRead; + } + + if (res==IFilterReturnCode.FILTER_S_LAST_TEXT || res==IFilterReturnCode.FILTER_E_NO_MORE_TEXT) + _currentChunkValid=false; + } + } + return charsRead; + } + + public FilterReader(string fileName) + { + _filter=FilterLoader.LoadAndInitIFilter(fileName); + if (_filter==null) + throw new ArgumentException("no filter defined for "+fileName); + } + } +} diff --git a/PROMS/EPocalipse.IFilter/IFilter.cs b/PROMS/EPocalipse.IFilter/IFilter.cs new file mode 100644 index 00000000..e71de152 --- /dev/null +++ b/PROMS/EPocalipse.IFilter/IFilter.cs @@ -0,0 +1,435 @@ +using System; +using System.Text; +using System.Runtime.InteropServices; + +//Contains IFilter interface translation +//Most translations are from PInvoke.net + +namespace EPocalipse.IFilter +{ + [StructLayout(LayoutKind.Sequential)] + public struct FULLPROPSPEC + { + public Guid guidPropSet; + public PROPSPEC psProperty; + } + + [StructLayout(LayoutKind.Sequential)] + internal struct FILTERREGION + { + public int idChunk; + public int cwcStart; + public int cwcExtent; + } + + [StructLayout(LayoutKind.Explicit)] + public struct PROPSPEC + { + [FieldOffset(0)] public int ulKind; // 0 - string used; 1 - PROPID + [FieldOffset(4)] public int propid; + [FieldOffset(4)] public IntPtr lpwstr; + } + + [Flags] + internal enum IFILTER_FLAGS + { + /// + /// The caller should use the IPropertySetStorage and IPropertyStorage + /// interfaces to locate additional properties. + /// When this flag is set, properties available through COM + /// enumerators should not be returned from IFilter. + /// + IFILTER_FLAGS_OLE_PROPERTIES = 1 + } + + /// + /// Flags controlling the operation of the FileFilter + /// instance. + /// + [Flags] + internal enum IFILTER_INIT + { + NONE = 0, + /// + /// Paragraph breaks should be marked with the Unicode PARAGRAPH + /// SEPARATOR (0x2029) + /// + CANON_PARAGRAPHS = 1, + + /// + /// Soft returns, such as the newline character in Microsoft Word, should + /// be replaced by hard returnsLINE SEPARATOR (0x2028). Existing hard + /// returns can be doubled. A carriage return (0x000D), line feed (0x000A), + /// or the carriage return and line feed in combination should be considered + /// a hard return. The intent is to enable pattern-expression matches that + /// match against observed line breaks. + /// + HARD_LINE_BREAKS = 2, + + /// + /// Various word-processing programs have forms of hyphens that are not + /// represented in the host character set, such as optional hyphens + /// (appearing only at the end of a line) and nonbreaking hyphens. This flag + /// indicates that optional hyphens are to be converted to nulls, and + /// non-breaking hyphens are to be converted to normal hyphens (0x2010), or + /// HYPHEN-MINUSES (0x002D). + /// + CANON_HYPHENS = 4, + + /// + /// Just as the CANON_HYPHENS flag standardizes hyphens, + /// this one standardizes spaces. All special space characters, such as + /// nonbreaking spaces, are converted to the standard space character + /// (0x0020). + /// + CANON_SPACES = 8, + + /// + /// Indicates that the client wants text split into chunks representing + /// public value-type properties. + /// + APPLY_INDEX_ATTRIBUTES = 16, + + /// + /// Indicates that the client wants text split into chunks representing + /// properties determined during the indexing process. + /// + APPLY_CRAWL_ATTRIBUTES = 256, + + /// + /// Any properties not covered by the APPLY_INDEX_ATTRIBUTES + /// and APPLY_CRAWL_ATTRIBUTES flags should be emitted. + /// + APPLY_OTHER_ATTRIBUTES = 32, + + /// + /// Optimizes IFilter for indexing because the client calls the + /// IFilter::Init method only once and does not call IFilter::BindRegion. + /// This eliminates the possibility of accessing a chunk both before and + /// after accessing another chunk. + /// + INDEXING_ONLY = 64, + + /// + /// The text extraction process must recursively search all linked + /// objects within the document. If a link is unavailable, the + /// IFilter::GetChunk call that would have obtained the first chunk of the + /// link should return FILTER_E_LINK_UNAVAILABLE. + /// + SEARCH_LINKS = 128, + + /// + /// The content indexing process can return property values set by the filter. + /// + FILTER_OWNED_VALUE_OK = 512 + } + + public struct STAT_CHUNK + { + /// + /// The chunk identifier. Chunk identifiers must be unique for the + /// current instance of the IFilter interface. + /// Chunk identifiers must be in ascending order. The order in which + /// chunks are numbered should correspond to the order in which they appear + /// in the source document. Some search engines can take advantage of the + /// proximity of chunks of various properties. If so, the order in which + /// chunks with different properties are emitted will be important to the + /// search engine. + /// + public int idChunk; + + /// + /// The type of break that separates the previous chunk from the current + /// chunk. Values are from the CHUNK_BREAKTYPE enumeration. + /// + [MarshalAs(UnmanagedType.U4)] + public CHUNK_BREAKTYPE breakType; + + /// + /// Flags indicate whether this chunk contains a text-type or a + /// value-type property. + /// Flag values are taken from the CHUNKSTATE enumeration. If the CHUNK_TEXT flag is set, + /// IFilter::GetText should be used to retrieve the contents of the chunk + /// as a series of words. + /// If the CHUNK_VALUE flag is set, IFilter::GetValue should be used to retrieve + /// the value and treat it as a single property value. If the filter dictates that the same + /// content be treated as both text and as a value, the chunk should be emitted twice in two + /// different chunks, each with one flag set. + /// + [MarshalAs(UnmanagedType.U4)] + public CHUNKSTATE flags; + + /// + /// The language and sublanguage associated with a chunk of text. Chunk locale is used + /// by document indexers to perform proper word breaking of text. If the chunk is + /// neither text-type nor a value-type with data type VT_LPWSTR, VT_LPSTR or VT_BSTR, + /// this field is ignored. + /// + public int locale; + + /// + /// The property to be applied to the chunk. If a filter requires that the same text + /// have more than one property, it needs to emit the text once for each property + /// in separate chunks. + /// + public FULLPROPSPEC attribute; + + /// + /// The ID of the source of a chunk. The value of the idChunkSource member depends on the nature of the chunk: + /// If the chunk is a text-type property, the value of the idChunkSource member must be the same as the value of the idChunk member. + /// If the chunk is an public value-type property derived from textual content, the value of the idChunkSource member is the chunk ID for the + /// text-type chunk from which it is derived. + /// If the filter attributes specify to return only public value-type + /// properties, there is no content chunk from which to derive the current + /// public value-type property. In this case, the value of the + /// idChunkSource member must be set to zero, which is an invalid chunk. + /// + public int idChunkSource; + + /// + /// The offset from which the source text for a derived chunk starts in + /// the source chunk. + /// + public int cwcStartSource; + + /// + /// The length in characters of the source text from which the current + /// chunk was derived. + /// A zero value signifies character-by-character correspondence between + /// the source text and + /// the derived text. A nonzero value means that no such direct + /// correspondence exists + /// + public int cwcLenSource; + } + + /// + /// Enumerates the different breaking types that occur between + /// chunks of text read out by the FileFilter. + /// + public enum CHUNK_BREAKTYPE + { + /// + /// No break is placed between the current chunk and the previous chunk. + /// The chunks are glued together. + /// + CHUNK_NO_BREAK = 0, + /// + /// A word break is placed between this chunk and the previous chunk that + /// had the same attribute. + /// Use of CHUNK_EOW should be minimized because the choice of word + /// breaks is language-dependent, + /// so determining word breaks is best left to the search engine. + /// + CHUNK_EOW = 1, + /// + /// A sentence break is placed between this chunk and the previous chunk + /// that had the same attribute. + /// + CHUNK_EOS = 2, + /// + /// A paragraph break is placed between this chunk and the previous chunk + /// that had the same attribute. + /// + CHUNK_EOP = 3, + /// + /// A chapter break is placed between this chunk and the previous chunk + /// that had the same attribute. + /// + CHUNK_EOC = 4 + } + + + public enum CHUNKSTATE + { + /// + /// The current chunk is a text-type property. + /// + CHUNK_TEXT = 0x1, + /// + /// The current chunk is a value-type property. + /// + CHUNK_VALUE = 0x2, + /// + /// Reserved + /// + CHUNK_FILTER_OWNED_VALUE = 0x4 + } + + internal enum IFilterReturnCode : uint + { + /// + /// Success + /// + S_OK = 0, + /// + /// The function was denied access to the filter file. + /// + E_ACCESSDENIED = 0x80070005, + /// + /// The function encountered an invalid handle, + /// probably due to a low-memory situation. + /// + E_HANDLE = 0x80070006, + /// + /// The function received an invalid parameter. + /// + E_INVALIDARG = 0x80070057, + /// + /// Out of memory + /// + E_OUTOFMEMORY = 0x8007000E, + /// + /// Not implemented + /// + E_NOTIMPL = 0x80004001, + /// + /// Unknown error + /// + E_FAIL = 0x80000008, + /// + /// File not filtered due to password protection + /// + FILTER_E_PASSWORD = 0x8004170B, + /// + /// The document format is not recognised by the filter + /// + FILTER_E_UNKNOWNFORMAT = 0x8004170C, + /// + /// No text in current chunk + /// + FILTER_E_NO_TEXT = 0x80041705, + /// + /// No more chunks of text available in object + /// + FILTER_E_END_OF_CHUNKS = 0x80041700, + /// + /// No more text available in chunk + /// + FILTER_E_NO_MORE_TEXT = 0x80041701, + /// + /// No more property values available in chunk + /// + FILTER_E_NO_MORE_VALUES = 0x80041702, + /// + /// Unable to access object + /// + FILTER_E_ACCESS = 0x80041703, + /// + /// Moniker doesn't cover entire region + /// + FILTER_W_MONIKER_CLIPPED = 0x00041704, + /// + /// Unable to bind IFilter for embedded object + /// + FILTER_E_EMBEDDING_UNAVAILABLE = 0x80041707, + /// + /// Unable to bind IFilter for linked object + /// + FILTER_E_LINK_UNAVAILABLE = 0x80041708, + /// + /// This is the last text in the current chunk + /// + FILTER_S_LAST_TEXT = 0x00041709, + /// + /// This is the last value in the current chunk + /// + FILTER_S_LAST_VALUES = 0x0004170A + } + + [ComImport, Guid("89BCB740-6119-101A-BCB7-00DD010655AF")] + [InterfaceType(ComInterfaceType.InterfaceIsIUnknown)] + internal interface IFilter + { + /// + /// The IFilter::Init method initializes a filtering session. + /// + [PreserveSig] + IFilterReturnCode Init( + //[in] Flag settings from the IFILTER_INIT enumeration for + // controlling text standardization, property output, embedding + // scope, and IFilter access patterns. + IFILTER_INIT grfFlags, + + // [in] The size of the attributes array. When nonzero, cAttributes + // takes + // precedence over attributes specified in grfFlags. If no + // attribute flags + // are specified and cAttributes is zero, the default is given by + // the + // PSGUID_STORAGE storage property set, which contains the date and + // time + // of the last write to the file, size, and so on; and by the + // PID_STG_CONTENTS + // 'contents' property, which maps to the main contents of the + // file. + // For more information about properties and property sets, see + // Property Sets. + int cAttributes, + + //[in] Array of pointers to FULLPROPSPEC structures for the + // requested properties. + // When cAttributes is nonzero, only the properties in aAttributes + // are returned. + IntPtr aAttributes, + + // [out] Information about additional properties available to the + // caller; from the IFILTER_FLAGS enumeration. + out IFILTER_FLAGS pdwFlags); + + /// + /// The IFilter::GetChunk method positions the filter at the beginning + /// of the next chunk, + /// or at the first chunk if this is the first call to the GetChunk + /// method, and returns a description of the current chunk. + /// + [PreserveSig] + IFilterReturnCode GetChunk(out STAT_CHUNK pStat); + + /// + /// The IFilter::GetText method retrieves text (text-type properties) + /// from the current chunk, + /// which must have a CHUNKSTATE enumeration value of CHUNK_TEXT. + /// + [PreserveSig] + IFilterReturnCode GetText( + // [in/out] On entry, the size of awcBuffer array in wide/Unicode + // characters. On exit, the number of Unicode characters written to + // awcBuffer. + // Note that this value is not the number of bytes in the buffer. + ref uint pcwcBuffer, + + // Text retrieved from the current chunk. Do not terminate the + // buffer with a character. + [Out(), MarshalAs(UnmanagedType.LPArray)] + char[] awcBuffer); + + /// + /// The IFilter::GetValue method retrieves a value (public + /// value-type property) from a chunk, + /// which must have a CHUNKSTATE enumeration value of CHUNK_VALUE. + /// + [PreserveSig] + int GetValue( + // Allocate the PROPVARIANT structure with CoTaskMemAlloc. Some + // PROPVARIANT + // structures contain pointers, which can be freed by calling the + // PropVariantClear function. + // It is up to the caller of the GetValue method to call the + // PropVariantClear method. + // ref IntPtr ppPropValue + // [MarshalAs(UnmanagedType.Struct)] + ref IntPtr PropVal); + + /// + /// The IFilter::BindRegion method retrieves an interface representing + /// the specified portion of the object. + /// Currently reserved for future use. + /// + [PreserveSig] + int BindRegion(ref FILTERREGION origPos, + ref Guid riid, ref object ppunk); + } + + +}