436 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			C#
		
	
	
	
	
	
			
		
		
	
	
			436 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			C#
		
	
	
	
	
	
| using System;
 | |
| using System.Text;
 | |
| using System.Runtime.InteropServices;
 | |
| 
 | |
| //Contains IFilter interface translation
 | |
| //Most translations are from PInvoke.net
 | |
| 
 | |
| namespace EPocalipse.IFilter
 | |
| {
 | |
|   [StructLayout(LayoutKind.Sequential)]
 | |
|   public struct FULLPROPSPEC 
 | |
|   {
 | |
|     public Guid guidPropSet;
 | |
|     public PROPSPEC psProperty;
 | |
|   }
 | |
| 
 | |
|   [StructLayout(LayoutKind.Sequential)]
 | |
|   internal struct FILTERREGION 
 | |
|   {
 | |
|     public int idChunk;
 | |
|     public int cwcStart;
 | |
|     public int cwcExtent;
 | |
|   }
 | |
| 
 | |
|   [StructLayout(LayoutKind.Explicit)]
 | |
|   public struct PROPSPEC
 | |
|   {
 | |
|     [FieldOffset(0)] public int ulKind;     // 0 - string used; 1 - PROPID
 | |
|     [FieldOffset(4)] public int propid;    
 | |
|     [FieldOffset(4)] public IntPtr lpwstr;
 | |
|   }
 | |
| 
 | |
|   [Flags]
 | |
|   internal enum IFILTER_FLAGS 
 | |
|   {
 | |
|     /// <summary>
 | |
|     /// The caller should use the IPropertySetStorage and IPropertyStorage
 | |
|     /// interfaces to locate additional properties. 
 | |
|     /// When this flag is set, properties available through COM
 | |
|     /// enumerators should not be returned from IFilter. 
 | |
|     /// </summary>
 | |
|     IFILTER_FLAGS_OLE_PROPERTIES = 1
 | |
|   }
 | |
| 
 | |
|   /// <summary>
 | |
|   /// Flags controlling the operation of the FileFilter
 | |
|   /// instance.
 | |
|   /// </summary>
 | |
|   [Flags]
 | |
|   internal enum IFILTER_INIT
 | |
|   {
 | |
|     NONE = 0,
 | |
|     /// <summary>
 | |
|     /// Paragraph breaks should be marked with the Unicode PARAGRAPH
 | |
|     /// SEPARATOR (0x2029)
 | |
|     /// </summary>
 | |
|     CANON_PARAGRAPHS = 1,
 | |
| 
 | |
|     /// <summary>
 | |
|     /// Soft returns, such as the newline character in Microsoft Word, should
 | |
|     /// be replaced by hard returnsLINE SEPARATOR (0x2028). Existing hard
 | |
|     /// returns can be doubled. A carriage return (0x000D), line feed (0x000A),
 | |
|     /// or the carriage return and line feed in combination should be considered
 | |
|     /// a hard return. The intent is to enable pattern-expression matches that
 | |
|     /// match against observed line breaks. 
 | |
|     /// </summary>
 | |
|     HARD_LINE_BREAKS = 2,
 | |
| 
 | |
|     /// <summary>
 | |
|     /// Various word-processing programs have forms of hyphens that are not
 | |
|     /// represented in the host character set, such as optional hyphens
 | |
|     /// (appearing only at the end of a line) and nonbreaking hyphens. This flag
 | |
|     /// indicates that optional hyphens are to be converted to nulls, and
 | |
|     /// non-breaking hyphens are to be converted to normal hyphens (0x2010), or
 | |
|     /// HYPHEN-MINUSES (0x002D). 
 | |
|     /// </summary>
 | |
|     CANON_HYPHENS = 4,
 | |
| 
 | |
|     /// <summary>
 | |
|     /// Just as the CANON_HYPHENS flag standardizes hyphens,
 | |
|     /// this one standardizes spaces. All special space characters, such as
 | |
|     /// nonbreaking spaces, are converted to the standard space character
 | |
|     /// (0x0020). 
 | |
|     /// </summary>
 | |
|     CANON_SPACES = 8,
 | |
| 
 | |
|     /// <summary>
 | |
|     /// Indicates that the client wants text split into chunks representing
 | |
|     /// public value-type properties. 
 | |
|     /// </summary>
 | |
|     APPLY_INDEX_ATTRIBUTES = 16,
 | |
| 
 | |
|     /// <summary>
 | |
|     /// Indicates that the client wants text split into chunks representing
 | |
|     /// properties determined during the indexing process. 
 | |
|     /// </summary>
 | |
|     APPLY_CRAWL_ATTRIBUTES = 256,
 | |
| 
 | |
|     /// <summary>
 | |
|     /// Any properties not covered by the APPLY_INDEX_ATTRIBUTES
 | |
|     /// and APPLY_CRAWL_ATTRIBUTES flags should be emitted. 
 | |
|     /// </summary>
 | |
|     APPLY_OTHER_ATTRIBUTES = 32,
 | |
| 
 | |
|     /// <summary>
 | |
|     /// Optimizes IFilter for indexing because the client calls the
 | |
|     /// IFilter::Init method only once and does not call IFilter::BindRegion.
 | |
|     /// This eliminates the possibility of accessing a chunk both before and
 | |
|     /// after accessing another chunk. 
 | |
|     /// </summary>
 | |
|     INDEXING_ONLY = 64,
 | |
| 
 | |
|     /// <summary>
 | |
|     /// The text extraction process must recursively search all linked
 | |
|     /// objects within the document. If a link is unavailable, the
 | |
|     /// IFilter::GetChunk call that would have obtained the first chunk of the
 | |
|     /// link should return FILTER_E_LINK_UNAVAILABLE. 
 | |
|     /// </summary>
 | |
|     SEARCH_LINKS = 128,
 | |
| 
 | |
|     /// <summary>
 | |
|     /// The content indexing process can return property values set by the  filter. 
 | |
|     /// </summary>
 | |
|     FILTER_OWNED_VALUE_OK = 512
 | |
|   }
 | |
| 
 | |
|   public struct STAT_CHUNK 
 | |
|   {
 | |
|     /// <summary>
 | |
|     /// The chunk identifier. Chunk identifiers must be unique for the
 | |
|     /// current instance of the IFilter interface. 
 | |
|     /// Chunk identifiers must be in ascending order. The order in which
 | |
|     /// chunks are numbered should correspond to the order in which they appear
 | |
|     /// in the source document. Some search engines can take advantage of the
 | |
|     /// proximity of chunks of various properties. If so, the order in which
 | |
|     /// chunks with different properties are emitted will be important to the
 | |
|     /// search engine. 
 | |
|     /// </summary>
 | |
|     public int idChunk;
 | |
| 
 | |
|     /// <summary>
 | |
|     /// The type of break that separates the previous chunk from the current
 | |
|     ///  chunk. Values are from the CHUNK_BREAKTYPE enumeration. 
 | |
|     /// </summary>
 | |
|     [MarshalAs(UnmanagedType.U4)]
 | |
|     public CHUNK_BREAKTYPE breakType;
 | |
| 
 | |
|     /// <summary>
 | |
|     /// Flags indicate whether this chunk contains a text-type or a
 | |
|     /// value-type property. 
 | |
|     /// Flag values are taken from the CHUNKSTATE enumeration. If the CHUNK_TEXT flag is set, 
 | |
|     /// IFilter::GetText should be used to retrieve the contents of the chunk
 | |
|     /// as a series of words. 
 | |
|     /// If the CHUNK_VALUE flag is set, IFilter::GetValue should be used to retrieve 
 | |
|     /// the value and treat it as a single property value. If the filter dictates that the same 
 | |
|     /// content be treated as both text and as a value, the chunk should be emitted twice in two       
 | |
|     /// different chunks, each with one flag set. 
 | |
|     /// </summary>
 | |
|     [MarshalAs(UnmanagedType.U4)]
 | |
|     public CHUNKSTATE flags;
 | |
| 
 | |
|     /// <summary>
 | |
|     /// The language and sublanguage associated with a chunk of text. Chunk locale is used 
 | |
|     /// by document indexers to perform proper word breaking of text. If the chunk is 
 | |
|     /// neither text-type nor a value-type with data type VT_LPWSTR, VT_LPSTR or VT_BSTR, 
 | |
|     /// this field is ignored. 
 | |
|     /// </summary>
 | |
|     public int locale;
 | |
| 
 | |
|     /// <summary>
 | |
|     /// The property to be applied to the chunk. If a filter requires that       the same text 
 | |
|     /// have more than one property, it needs to emit the text once for each       property 
 | |
|     /// in separate chunks. 
 | |
|     /// </summary>
 | |
|     public FULLPROPSPEC attribute;
 | |
| 
 | |
|     /// <summary>
 | |
|     /// The ID of the source of a chunk. The value of the idChunkSource     member depends on the nature of the chunk: 
 | |
|     /// If the chunk is a text-type property, the value of the idChunkSource       member must be the same as the value of the idChunk member. 
 | |
|     /// If the chunk is an public value-type property derived from textual       content, the value of the idChunkSource member is the chunk ID for the
 | |
|     /// text-type chunk from which it is derived. 
 | |
|     /// If the filter attributes specify to return only public value-type
 | |
|     /// properties, there is no content chunk from which to derive the current
 | |
|     /// public value-type property. In this case, the value of the
 | |
|     /// idChunkSource member must be set to zero, which is an invalid chunk. 
 | |
|     /// </summary>
 | |
|     public int idChunkSource;
 | |
| 
 | |
|     /// <summary>
 | |
|     /// The offset from which the source text for a derived chunk starts in
 | |
|     /// the source chunk. 
 | |
|     /// </summary>
 | |
|     public int cwcStartSource;
 | |
| 
 | |
|     /// <summary>
 | |
|     /// The length in characters of the source text from which the current
 | |
|     /// chunk was derived. 
 | |
|     /// A zero value signifies character-by-character correspondence between
 | |
|     /// the source text and 
 | |
|     /// the derived text. A nonzero value means that no such direct
 | |
|     /// correspondence exists
 | |
|     /// </summary>
 | |
|     public int cwcLenSource;
 | |
|   }
 | |
| 
 | |
|   /// <summary>
 | |
|   /// Enumerates the different breaking types that occur between 
 | |
|   /// chunks of text read out by the FileFilter.
 | |
|   /// </summary>
 | |
|   public enum CHUNK_BREAKTYPE
 | |
|   {
 | |
|     /// <summary>
 | |
|     /// No break is placed between the current chunk and the previous chunk.
 | |
|     /// The chunks are glued together. 
 | |
|     /// </summary>
 | |
|     CHUNK_NO_BREAK = 0,
 | |
|     /// <summary>
 | |
|     /// A word break is placed between this chunk and the previous chunk that
 | |
|     /// had the same attribute. 
 | |
|     /// Use of CHUNK_EOW should be minimized because the choice of word
 | |
|     /// breaks is language-dependent, 
 | |
|     /// so determining word breaks is best left to the search engine. 
 | |
|     /// </summary>
 | |
|     CHUNK_EOW = 1,
 | |
|     /// <summary>
 | |
|     /// A sentence break is placed between this chunk and the previous chunk
 | |
|     /// that had the same attribute. 
 | |
|     /// </summary>
 | |
|     CHUNK_EOS = 2,
 | |
|     /// <summary>
 | |
|     /// A paragraph break is placed between this chunk and the previous chunk
 | |
|     /// that had the same attribute.
 | |
|     /// </summary>     
 | |
|     CHUNK_EOP = 3,
 | |
|     /// <summary>
 | |
|     /// A chapter break is placed between this chunk and the previous chunk
 | |
|     /// that had the same attribute. 
 | |
|     /// </summary>
 | |
|     CHUNK_EOC = 4
 | |
|   }
 | |
| 
 | |
| 
 | |
|   public enum CHUNKSTATE 
 | |
|   {
 | |
|     /// <summary>
 | |
|     /// The current chunk is a text-type property.
 | |
|     /// </summary>
 | |
|     CHUNK_TEXT = 0x1,
 | |
|     /// <summary>
 | |
|     /// The current chunk is a value-type property. 
 | |
|     /// </summary>
 | |
|     CHUNK_VALUE = 0x2,
 | |
|     /// <summary>
 | |
|     /// Reserved
 | |
|     /// </summary>
 | |
|     CHUNK_FILTER_OWNED_VALUE = 0x4
 | |
|   }
 | |
| 
 | |
|   internal enum IFilterReturnCode : uint 
 | |
|   {
 | |
|     /// <summary>
 | |
|     /// Success
 | |
|     /// </summary>
 | |
|     S_OK = 0,
 | |
|     /// <summary>
 | |
|     /// The function was denied access to the filter file. 
 | |
|     /// </summary>
 | |
|     E_ACCESSDENIED = 0x80070005,
 | |
|     /// <summary>
 | |
|     /// The function encountered an invalid handle,
 | |
|     /// probably due to a low-memory situation. 
 | |
|     /// </summary>
 | |
|     E_HANDLE = 0x80070006,
 | |
|     /// <summary>
 | |
|     /// The function received an invalid parameter.
 | |
|     /// </summary>
 | |
|     E_INVALIDARG = 0x80070057,
 | |
|     /// <summary>
 | |
|     /// Out of memory
 | |
|     /// </summary>
 | |
|     E_OUTOFMEMORY = 0x8007000E,
 | |
|     /// <summary>
 | |
|     /// Not implemented
 | |
|     /// </summary>
 | |
|     E_NOTIMPL = 0x80004001,
 | |
|     /// <summary>
 | |
|     /// Unknown error
 | |
|     /// </summary>
 | |
|     E_FAIL = 0x80000008,
 | |
|     /// <summary>
 | |
|     /// File not filtered due to password protection
 | |
|     /// </summary>
 | |
|     FILTER_E_PASSWORD = 0x8004170B,
 | |
|     /// <summary>
 | |
|     /// The document format is not recognised by the filter
 | |
|     /// </summary>
 | |
|     FILTER_E_UNKNOWNFORMAT = 0x8004170C,
 | |
|     /// <summary>
 | |
|     /// No text in current chunk
 | |
|     /// </summary>
 | |
|     FILTER_E_NO_TEXT = 0x80041705,
 | |
|     /// <summary>
 | |
|     /// No more chunks of text available in object
 | |
|     /// </summary>
 | |
|     FILTER_E_END_OF_CHUNKS = 0x80041700,
 | |
|     /// <summary>
 | |
|     /// No more text available in chunk
 | |
|     /// </summary>
 | |
|     FILTER_E_NO_MORE_TEXT = 0x80041701,
 | |
|     /// <summary>
 | |
|     /// No more property values available in chunk
 | |
|     /// </summary>
 | |
|     FILTER_E_NO_MORE_VALUES = 0x80041702,
 | |
|     /// <summary>
 | |
|     /// Unable to access object
 | |
|     /// </summary>
 | |
|     FILTER_E_ACCESS = 0x80041703,
 | |
|     /// <summary>
 | |
|     /// Moniker doesn't cover entire region
 | |
|     /// </summary>
 | |
|     FILTER_W_MONIKER_CLIPPED = 0x00041704,
 | |
|     /// <summary>
 | |
|     /// Unable to bind IFilter for embedded object
 | |
|     /// </summary>
 | |
|     FILTER_E_EMBEDDING_UNAVAILABLE = 0x80041707,
 | |
|     /// <summary>
 | |
|     /// Unable to bind IFilter for linked object
 | |
|     /// </summary>
 | |
|     FILTER_E_LINK_UNAVAILABLE = 0x80041708,
 | |
|     /// <summary>
 | |
|     ///  This is the last text in the current chunk
 | |
|     /// </summary>
 | |
|     FILTER_S_LAST_TEXT = 0x00041709,
 | |
|     /// <summary>
 | |
|     /// This is the last value in the current chunk
 | |
|     /// </summary>
 | |
|     FILTER_S_LAST_VALUES = 0x0004170A
 | |
|   }
 | |
| 
 | |
|   [ComImport, Guid("89BCB740-6119-101A-BCB7-00DD010655AF")]
 | |
|   [InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
 | |
|   internal interface IFilter
 | |
|   {
 | |
|     /// <summary>
 | |
|     /// The IFilter::Init method initializes a filtering session.
 | |
|     /// </summary>
 | |
|     [PreserveSig]
 | |
|     IFilterReturnCode Init(
 | |
|       //[in] Flag settings from the IFILTER_INIT enumeration for
 | |
|       // controlling text standardization, property output, embedding
 | |
|       // scope, and IFilter access patterns. 
 | |
|       IFILTER_INIT grfFlags,
 | |
| 
 | |
|       // [in] The size of the attributes array. When nonzero, cAttributes
 | |
|       //  takes 
 | |
|       // precedence over attributes specified in grfFlags. If no
 | |
|       // attribute flags 
 | |
|       // are specified and cAttributes is zero, the default is given by
 | |
|       // the 
 | |
|       // PSGUID_STORAGE storage property set, which contains the date and
 | |
|       //  time 
 | |
|       // of the last write to the file, size, and so on; and by the
 | |
|       //  PID_STG_CONTENTS 
 | |
|       // 'contents' property, which maps to the main contents of the
 | |
|       // file. 
 | |
|       // For more information about properties and property sets, see
 | |
|       // Property Sets. 
 | |
|       int cAttributes,
 | |
| 
 | |
|       //[in] Array of pointers to FULLPROPSPEC structures for the
 | |
|       // requested properties. 
 | |
|       // When cAttributes is nonzero, only the properties in aAttributes
 | |
|       // are returned. 
 | |
|       IntPtr aAttributes,
 | |
| 
 | |
|       // [out] Information about additional properties available to the
 | |
|       //  caller; from the IFILTER_FLAGS enumeration. 
 | |
|       out IFILTER_FLAGS pdwFlags);
 | |
| 
 | |
|     /// <summary>
 | |
|     /// The IFilter::GetChunk method positions the filter at the beginning
 | |
|     /// of the next chunk, 
 | |
|     /// or at the first chunk if this is the first call to the GetChunk
 | |
|     /// method, and returns a description of the current chunk. 
 | |
|     /// </summary>
 | |
|     [PreserveSig]
 | |
|     IFilterReturnCode GetChunk(out STAT_CHUNK pStat);
 | |
| 
 | |
|     /// <summary>
 | |
|     /// The IFilter::GetText method retrieves text (text-type properties)
 | |
|     /// from the current chunk, 
 | |
|     /// which must have a CHUNKSTATE enumeration value of CHUNK_TEXT.
 | |
|     /// </summary>
 | |
|     [PreserveSig]
 | |
|     IFilterReturnCode GetText(
 | |
|       // [in/out] On entry, the size of awcBuffer array in wide/Unicode
 | |
|       // characters. On exit, the number of Unicode characters written to
 | |
|       // awcBuffer. 
 | |
|       // Note that this value is not the number of bytes in the buffer. 
 | |
|       ref uint pcwcBuffer,
 | |
| 
 | |
|       // Text retrieved from the current chunk. Do not terminate the
 | |
|       // buffer with a character.  
 | |
|       [Out(), MarshalAs(UnmanagedType.LPArray)] 
 | |
|       char[] awcBuffer);
 | |
| 
 | |
|     /// <summary>
 | |
|     /// The IFilter::GetValue method retrieves a value (public
 | |
|     /// value-type property) from a chunk, 
 | |
|     /// which must have a CHUNKSTATE enumeration value of CHUNK_VALUE.
 | |
|     /// </summary>
 | |
|     [PreserveSig]
 | |
|     int GetValue(
 | |
|       // Allocate the PROPVARIANT structure with CoTaskMemAlloc. Some
 | |
|       // PROPVARIANT 
 | |
|       // structures contain pointers, which can be freed by calling the
 | |
|       // PropVariantClear function. 
 | |
|       // It is up to the caller of the GetValue method to call the
 | |
|       //   PropVariantClear method.            
 | |
|       // ref IntPtr ppPropValue
 | |
|       // [MarshalAs(UnmanagedType.Struct)]
 | |
|       ref IntPtr PropVal);
 | |
| 
 | |
|     /// <summary>
 | |
|     /// The IFilter::BindRegion method retrieves an interface representing
 | |
|     /// the specified portion of the object. 
 | |
|     /// Currently reserved for future use.
 | |
|     /// </summary>
 | |
|     [PreserveSig]
 | |
|     int BindRegion(ref FILTERREGION origPos,
 | |
|       ref Guid riid, ref object ppunk);
 | |
|   }
 | |
| 
 | |
| 
 | |
| }
 |