using System;
using System.Text;
using System.Runtime.InteropServices;

// Contains the managed translation of the native IFilter interface and the
// structures/enumerations it uses.
// Most translations are from PInvoke.net.
namespace EPocalipse.IFilter
{
    /// <summary>
    /// Managed counterpart of the native FULLPROPSPEC structure: a property
    /// set GUID followed by the property specifier inside that set.
    /// </summary>
    [StructLayout(LayoutKind.Sequential)]
    public struct FULLPROPSPEC
    {
        public Guid guidPropSet;
        public PROPSPEC psProperty;
    }

    /// <summary>
    /// Managed counterpart of the native FILTERREGION structure (chunk id
    /// plus a start/extent pair), used only by <see cref="IFilter.BindRegion"/>.
    /// </summary>
    [StructLayout(LayoutKind.Sequential)]
    internal struct FILTERREGION
    {
        public int idChunk;
        public int cwcStart;
        public int cwcExtent;
    }

    /// <summary>
    /// Managed counterpart of the native PROPSPEC structure.
    /// </summary>
    /// <remarks>
    /// The native structure is a ULONG followed by a union of a PROPID and a
    /// wide-string pointer. Because the union contains a pointer it is
    /// pointer-aligned: it starts at offset 4 in a 32-bit process but at
    /// offset 8 in a 64-bit process. A fixed <c>FieldOffset(4)</c> is therefore
    /// only correct on 32-bit. Using sequential layout with a single
    /// pointer-sized backing field lets the marshaler insert the right padding
    /// on both platforms; <see cref="propid"/> and <see cref="lpwstr"/> are two
    /// views over that one field.
    /// </remarks>
    [StructLayout(LayoutKind.Sequential)]
    public struct PROPSPEC
    {
        /// <summary>
        /// Discriminator for the union: 0 - string used; 1 - PROPID.
        /// </summary>
        public int ulKind;

        // Storage for the native union. Pointer-sized so that it receives the
        // same natural alignment as the native union on every platform.
        private IntPtr _union;

        /// <summary>
        /// The property identifier; meaningful when <see cref="ulKind"/> is 1.
        /// </summary>
        public int propid
        {
            // The PROPID occupies the low 32 bits of the union storage.
            get { return unchecked((int)_union.ToInt64()); }
            set { _union = new IntPtr(value); }
        }

        /// <summary>
        /// Pointer to the property name (wide string); meaningful when
        /// <see cref="ulKind"/> is 0.
        /// </summary>
        public IntPtr lpwstr
        {
            get { return _union; }
            set { _union = value; }
        }
    }

    /// <summary>
    /// Flags returned by <see cref="IFilter.Init"/> describing additional
    /// properties available to the caller.
    /// </summary>
    [Flags]
    internal enum IFILTER_FLAGS
    {
        /// <summary>
        /// The caller should use the IPropertySetStorage and IPropertyStorage
        /// interfaces to locate additional properties.
        /// When this flag is set, properties available through COM
        /// enumerators should not be returned from IFilter.
        /// </summary>
        IFILTER_FLAGS_OLE_PROPERTIES = 1
    }

    /// <summary>
    /// Flags controlling the operation of the FileFilter
    /// instance.
    /// </summary>
    [Flags]
    internal enum IFILTER_INIT
    {
        NONE = 0,

        /// <summary>
        /// Paragraph breaks should be marked with the Unicode PARAGRAPH
        /// SEPARATOR (0x2029)
        /// </summary>
        CANON_PARAGRAPHS = 1,

        /// <summary>
        /// Soft returns, such as the newline character in Microsoft Word, should
        /// be replaced by hard returns - LINE SEPARATOR (0x2028). Existing hard
        /// returns can be doubled. A carriage return (0x000D), line feed (0x000A),
        /// or the carriage return and line feed in combination should be considered
        /// a hard return. The intent is to enable pattern-expression matches that
        /// match against observed line breaks.
        /// </summary>
        HARD_LINE_BREAKS = 2,

        /// <summary>
        /// Various word-processing programs have forms of hyphens that are not
        /// represented in the host character set, such as optional hyphens
        /// (appearing only at the end of a line) and nonbreaking hyphens. This flag
        /// indicates that optional hyphens are to be converted to nulls, and
        /// non-breaking hyphens are to be converted to normal hyphens (0x2010), or
        /// HYPHEN-MINUSES (0x002D).
        /// </summary>
        CANON_HYPHENS = 4,

        /// <summary>
        /// Just as the CANON_HYPHENS flag standardizes hyphens,
        /// this one standardizes spaces. All special space characters, such as
        /// nonbreaking spaces, are converted to the standard space character
        /// (0x0020).
        /// </summary>
        CANON_SPACES = 8,

        /// <summary>
        /// Indicates that the client wants text split into chunks representing
        /// internal value-type properties.
        /// </summary>
        APPLY_INDEX_ATTRIBUTES = 16,

        /// <summary>
        /// Indicates that the client wants text split into chunks representing
        /// properties determined during the indexing process.
        /// </summary>
        APPLY_CRAWL_ATTRIBUTES = 256,

        /// <summary>
        /// Any properties not covered by the APPLY_INDEX_ATTRIBUTES
        /// and APPLY_CRAWL_ATTRIBUTES flags should be emitted.
        /// </summary>
        APPLY_OTHER_ATTRIBUTES = 32,

        /// <summary>
        /// Optimizes IFilter for indexing because the client calls the
        /// IFilter::Init method only once and does not call IFilter::BindRegion.
        /// This eliminates the possibility of accessing a chunk both before and
        /// after accessing another chunk.
        /// </summary>
        INDEXING_ONLY = 64,

        /// <summary>
        /// The text extraction process must recursively search all linked
        /// objects within the document. If a link is unavailable, the
        /// IFilter::GetChunk call that would have obtained the first chunk of the
        /// link should return FILTER_E_LINK_UNAVAILABLE.
        /// </summary>
        SEARCH_LINKS = 128,

        /// <summary>
        /// The content indexing process can return property values set by the filter.
        /// </summary>
        FILTER_OWNED_VALUE_OK = 512
    }

    /// <summary>
    /// Managed counterpart of the native STAT_CHUNK structure, filled in by
    /// <see cref="IFilter.GetChunk"/> to describe the current chunk.
    /// </summary>
    public struct STAT_CHUNK
    {
        /// <summary>
        /// The chunk identifier. Chunk identifiers must be unique for the
        /// current instance of the IFilter interface.
        /// Chunk identifiers must be in ascending order. The order in which
        /// chunks are numbered should correspond to the order in which they appear
        /// in the source document. Some search engines can take advantage of the
        /// proximity of chunks of various properties. If so, the order in which
        /// chunks with different properties are emitted will be important to the
        /// search engine.
        /// </summary>
        public int idChunk;

        /// <summary>
        /// The type of break that separates the previous chunk from the current
        /// chunk. Values are from the CHUNK_BREAKTYPE enumeration.
        /// </summary>
        [MarshalAs(UnmanagedType.U4)]
        public CHUNK_BREAKTYPE breakType;

        /// <summary>
        /// Flags indicate whether this chunk contains a text-type or a
        /// value-type property.
        /// Flag values are taken from the CHUNKSTATE enumeration. If the CHUNK_TEXT flag is set,
        /// IFilter::GetText should be used to retrieve the contents of the chunk
        /// as a series of words.
        /// If the CHUNK_VALUE flag is set, IFilter::GetValue should be used to retrieve
        /// the value and treat it as a single property value. If the filter dictates that the same
        /// content be treated as both text and as a value, the chunk should be emitted twice in two
        /// different chunks, each with one flag set.
        /// </summary>
        [MarshalAs(UnmanagedType.U4)]
        public CHUNKSTATE flags;

        /// <summary>
        /// The language and sublanguage associated with a chunk of text. Chunk locale is used
        /// by document indexers to perform proper word breaking of text. If the chunk is
        /// neither text-type nor a value-type with data type VT_LPWSTR, VT_LPSTR or VT_BSTR,
        /// this field is ignored.
        /// </summary>
        public int locale;

        /// <summary>
        /// The property to be applied to the chunk. If a filter requires that the same text
        /// have more than one property, it needs to emit the text once for each property
        /// in separate chunks.
        /// </summary>
        public FULLPROPSPEC attribute;

        /// <summary>
        /// The ID of the source of a chunk. The value of the idChunkSource member depends on the nature of the chunk:
        /// If the chunk is a text-type property, the value of the idChunkSource member must be the same as the value of the idChunk member.
        /// If the chunk is an internal value-type property derived from textual content, the value of the idChunkSource member is the chunk ID for the
        /// text-type chunk from which it is derived.
        /// If the filter attributes specify to return only internal value-type
        /// properties, there is no content chunk from which to derive the current
        /// internal value-type property. In this case, the value of the
        /// idChunkSource member must be set to zero, which is an invalid chunk.
        /// </summary>
        public int idChunkSource;

        /// <summary>
        /// The offset from which the source text for a derived chunk starts in
        /// the source chunk.
        /// </summary>
        public int cwcStartSource;

        /// <summary>
        /// The length in characters of the source text from which the current
        /// chunk was derived.
        /// A zero value signifies character-by-character correspondence between
        /// the source text and
        /// the derived text. A nonzero value means that no such direct
        /// correspondence exists
        /// </summary>
        public int cwcLenSource;
    }

    /// <summary>
    /// Enumerates the different breaking types that occur between
    /// chunks of text read out by the FileFilter.
    /// </summary>
    public enum CHUNK_BREAKTYPE
    {
        /// <summary>
        /// No break is placed between the current chunk and the previous chunk.
        /// The chunks are glued together.
        /// </summary>
        CHUNK_NO_BREAK = 0,

        /// <summary>
        /// A word break is placed between this chunk and the previous chunk that
        /// had the same attribute.
        /// Use of CHUNK_EOW should be minimized because the choice of word
        /// breaks is language-dependent,
        /// so determining word breaks is best left to the search engine.
        /// </summary>
        CHUNK_EOW = 1,

        /// <summary>
        /// A sentence break is placed between this chunk and the previous chunk
        /// that had the same attribute.
        /// </summary>
        CHUNK_EOS = 2,

        /// <summary>
        /// A paragraph break is placed between this chunk and the previous chunk
        /// that had the same attribute.
        /// </summary>
        CHUNK_EOP = 3,

        /// <summary>
        /// A chapter break is placed between this chunk and the previous chunk
        /// that had the same attribute.
        /// </summary>
        CHUNK_EOC = 4
    }

    /// <summary>
    /// Describes whether a chunk carries a text-type or a value-type property.
    /// </summary>
    public enum CHUNKSTATE
    {
        /// <summary>
        /// The current chunk is a text-type property.
        /// </summary>
        CHUNK_TEXT = 0x1,

        /// <summary>
        /// The current chunk is a value-type property.
        /// </summary>
        CHUNK_VALUE = 0x2,

        /// <summary>
        /// Reserved
        /// </summary>
        CHUNK_FILTER_OWNED_VALUE = 0x4
    }

    /// <summary>
    /// HRESULT / SCODE values returned by the <see cref="IFilter"/> methods.
    /// </summary>
    internal enum IFilterReturnCode : uint
    {
        /// <summary>
        /// Success
        /// </summary>
        S_OK = 0,

        /// <summary>
        /// The function was denied access to the filter file.
        /// </summary>
        E_ACCESSDENIED = 0x80070005,

        /// <summary>
        /// The function encountered an invalid handle,
        /// probably due to a low-memory situation.
        /// </summary>
        E_HANDLE = 0x80070006,

        /// <summary>
        /// The function received an invalid parameter.
        /// </summary>
        E_INVALIDARG = 0x80070057,

        /// <summary>
        /// Out of memory
        /// </summary>
        E_OUTOFMEMORY = 0x8007000E,

        /// <summary>
        /// Not implemented
        /// </summary>
        E_NOTIMPL = 0x80004001,

        /// <summary>
        /// Unknown error
        /// </summary>
        // Win32 value of E_FAIL. 0x80000008 is the legacy 16-bit definition
        // and is never returned by Win32 COM components.
        E_FAIL = 0x80004005,

        /// <summary>
        /// File not filtered due to password protection
        /// </summary>
        FILTER_E_PASSWORD = 0x8004170B,

        /// <summary>
        /// The document format is not recognised by the filter
        /// </summary>
        FILTER_E_UNKNOWNFORMAT = 0x8004170C,

        /// <summary>
        /// No text in current chunk
        /// </summary>
        FILTER_E_NO_TEXT = 0x80041705,

        /// <summary>
        /// No more chunks of text available in object
        /// </summary>
        FILTER_E_END_OF_CHUNKS = 0x80041700,

        /// <summary>
        /// No more text available in chunk
        /// </summary>
        FILTER_E_NO_MORE_TEXT = 0x80041701,

        /// <summary>
        /// No more property values available in chunk
        /// </summary>
        FILTER_E_NO_MORE_VALUES = 0x80041702,

        /// <summary>
        /// Unable to access object
        /// </summary>
        FILTER_E_ACCESS = 0x80041703,

        /// <summary>
        /// Moniker doesn't cover entire region
        /// </summary>
        FILTER_W_MONIKER_CLIPPED = 0x00041704,

        /// <summary>
        /// Unable to bind IFilter for embedded object
        /// </summary>
        FILTER_E_EMBEDDING_UNAVAILABLE = 0x80041707,

        /// <summary>
        /// Unable to bind IFilter for linked object
        /// </summary>
        FILTER_E_LINK_UNAVAILABLE = 0x80041708,

        /// <summary>
        /// This is the last text in the current chunk
        /// </summary>
        FILTER_S_LAST_TEXT = 0x00041709,

        /// <summary>
        /// This is the last value in the current chunk
        /// </summary>
        FILTER_S_LAST_VALUES = 0x0004170A
    }

    /// <summary>
    /// Managed declaration of the native IFilter COM interface.
    /// The method order below defines the vtable layout and must not change.
    /// </summary>
    [ComImport, Guid("89BCB740-6119-101A-BCB7-00DD010655AF")]
    [InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
    internal interface IFilter
    {
        /// <summary>
        /// The IFilter::Init method initializes a filtering session.
        /// </summary>
        /// <param name="grfFlags">
        /// [in] Flag settings from the IFILTER_INIT enumeration for
        /// controlling text standardization, property output, embedding
        /// scope, and IFilter access patterns.
        /// </param>
        /// <param name="cAttributes">
        /// [in] The size of the attributes array. When nonzero, cAttributes
        /// takes precedence over attributes specified in grfFlags. If no
        /// attribute flags are specified and cAttributes is zero, the default
        /// is given by the PSGUID_STORAGE storage property set, which contains
        /// the date and time of the last write to the file, size, and so on;
        /// and by the PID_STG_CONTENTS 'contents' property, which maps to the
        /// main contents of the file.
        /// For more information about properties and property sets, see
        /// Property Sets.
        /// </param>
        /// <param name="aAttributes">
        /// [in] Array of pointers to FULLPROPSPEC structures for the
        /// requested properties.
        /// When cAttributes is nonzero, only the properties in aAttributes
        /// are returned.
        /// </param>
        /// <param name="pdwFlags">
        /// [out] Information about additional properties available to the
        /// caller; from the IFILTER_FLAGS enumeration.
        /// </param>
        [PreserveSig]
        IFilterReturnCode Init(
            IFILTER_INIT grfFlags,
            int cAttributes,
            IntPtr aAttributes,
            out IFILTER_FLAGS pdwFlags);

        /// <summary>
        /// The IFilter::GetChunk method positions the filter at the beginning
        /// of the next chunk,
        /// or at the first chunk if this is the first call to the GetChunk
        /// method, and returns a description of the current chunk.
        /// </summary>
        [PreserveSig]
        IFilterReturnCode GetChunk(out STAT_CHUNK pStat);

        /// <summary>
        /// The IFilter::GetText method retrieves text (text-type properties)
        /// from the current chunk,
        /// which must have a CHUNKSTATE enumeration value of CHUNK_TEXT.
        /// </summary>
        /// <param name="pcwcBuffer">
        /// [in/out] On entry, the size of awcBuffer array in wide/Unicode
        /// characters. On exit, the number of Unicode characters written to
        /// awcBuffer.
        /// Note that this value is not the number of bytes in the buffer.
        /// </param>
        /// <param name="awcBuffer">
        /// Text retrieved from the current chunk. Do not terminate the
        /// buffer with a character.
        /// </param>
        [PreserveSig]
        IFilterReturnCode GetText(
            ref uint pcwcBuffer,
            [Out(), MarshalAs(UnmanagedType.LPArray)] char[] awcBuffer);

        /// <summary>
        /// The IFilter::GetValue method retrieves a value (internal
        /// value-type property) from a chunk,
        /// which must have a CHUNKSTATE enumeration value of CHUNK_VALUE.
        /// </summary>
        /// <param name="PropVal">
        /// Allocate the PROPVARIANT structure with CoTaskMemAlloc. Some
        /// PROPVARIANT structures contain pointers, which can be freed by
        /// calling the PropVariantClear function.
        /// It is up to the caller of the GetValue method to call the
        /// PropVariantClear method.
        /// </param>
        [PreserveSig]
        int GetValue(
            ref IntPtr PropVal);

        /// <summary>
        /// The IFilter::BindRegion method retrieves an interface representing
        /// the specified portion of the object.
        /// Currently reserved for future use.
        /// </summary>
        // NOTE(review): the native declaration appears to take FILTERREGION by
        // value and return the interface through an out pointer; this managed
        // signature passes both by ref. Left unchanged to keep existing callers
        // compiling - confirm against filter.h before actually calling it.
        [PreserveSig]
        int BindRegion(ref FILTERREGION origPos, ref Guid riid, ref object ppunk);
    }
}