436 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			C#
		
	
	
	
	
	
			
		
		
	
	
			436 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			C#
		
	
	
	
	
	
using System;
 | 
						|
using System.Text;
 | 
						|
using System.Runtime.InteropServices;
 | 
						|
 | 
						|
// Contains managed translations of the native IFilter interface and its supporting types.
// Most of the translations are taken from PInvoke.net.
 | 
						|
 | 
						|
namespace EPocalipse.IFilter
 | 
						|
{
 | 
						|
  /// <summary>
  /// Managed counterpart of the native FULLPROPSPEC structure: a property-set
  /// GUID paired with the <see cref="PROPSPEC"/> that selects a property.
  /// </summary>
  [StructLayout(LayoutKind.Sequential)]
  public struct FULLPROPSPEC
  {
    /// <summary>Identifier of the property set.</summary>
    public Guid guidPropSet;

    /// <summary>Property specifier (numeric ID or name pointer, see <see cref="PROPSPEC"/>).</summary>
    public PROPSPEC psProperty;
  }
 | 
						|
 | 
						|
  /// <summary>
  /// Describes a region by chunk identifier, start position and extent.
  /// Used as the position argument of IFilter.BindRegion.
  /// </summary>
  [StructLayout(LayoutKind.Sequential)]
  internal struct FILTERREGION
  {
    /// <summary>Identifier of the chunk the region refers to.</summary>
    public int idChunk;

    /// <summary>
    /// Start of the region (presumably counted in wide characters, going by
    /// the "cwc" prefix - confirm against the native header).
    /// </summary>
    public int cwcStart;

    /// <summary>
    /// Extent of the region (presumably counted in wide characters - confirm
    /// against the native header).
    /// </summary>
    public int cwcExtent;
  }
 | 
						|
 | 
						|
  /// <summary>
  /// Managed counterpart of the native PROPSPEC structure: a kind discriminator
  /// followed by a union holding either a numeric property ID or a pointer to
  /// a wide-character property name.
  /// </summary>
  /// <remarks>
  /// The union is declared as a single pointer-sized field under sequential
  /// layout so that the runtime aligns it on the pointer size: offset 4 in a
  /// 32-bit process and offset 8 in a 64-bit process. A hard-coded
  /// FieldOffset(4) is only correct for 32-bit processes; in a 64-bit process
  /// it misplaces the union and shrinks the structure, which also shifts every
  /// field that follows an embedded PROPSPEC.
  /// </remarks>
  [StructLayout(LayoutKind.Sequential)]
  public struct PROPSPEC
  {
    /// <summary>Discriminator: 0 - string used (lpwstr); 1 - PROPID (propid).</summary>
    public int ulKind;

    /// <summary>
    /// Union storage. When ulKind is 0 this is the pointer to the property
    /// name; when ulKind is 1 its low 32 bits carry the property ID.
    /// </summary>
    public IntPtr lpwstr;

    /// <summary>
    /// Numeric property ID view of the union (meaningful when ulKind is 1).
    /// </summary>
    public int propid
    {
      // Only the low 32 bits belong to the property ID; on 64-bit the upper half
      // of the union is unspecified, so truncate instead of using ToInt32(),
      // which would throw on overflow. Assumes a little-endian platform.
      get { return unchecked((int)lpwstr.ToInt64()); }
      set { lpwstr = new IntPtr(value); }
    }
  }
 | 
						|
 | 
						|
  /// <summary>
  /// Flags reported to the caller through the pdwFlags output of IFilter.Init.
  /// </summary>
  [Flags]
  internal enum IFILTER_FLAGS
  {
    /// <summary>
    /// Additional properties should be located by the caller through the
    /// IPropertySetStorage and IPropertyStorage interfaces. While this flag
    /// is set, IFilter should not return properties that are available
    /// through COM enumerators.
    /// </summary>
    IFILTER_FLAGS_OLE_PROPERTIES = 0x1
  }
 | 
						|
 | 
						|
  /// <summary>
  /// Options passed to IFilter.Init that control how the FileFilter
  /// instance operates. Members are listed in ascending bit order.
  /// </summary>
  [Flags]
  internal enum IFILTER_INIT
  {
    /// <summary>No option is set.</summary>
    NONE = 0x000,

    /// <summary>
    /// Mark paragraph breaks with the Unicode PARAGRAPH SEPARATOR (0x2029).
    /// </summary>
    CANON_PARAGRAPHS = 0x001,

    /// <summary>
    /// Replace soft returns (for example the newline character in Microsoft
    /// Word) with hard returns - LINE SEPARATOR (0x2028). Existing hard
    /// returns can be doubled. A carriage return (0x000D), a line feed
    /// (0x000A), or the two in combination count as a hard return. The intent
    /// is to let pattern expressions match against observed line breaks.
    /// </summary>
    HARD_LINE_BREAKS = 0x002,

    /// <summary>
    /// Normalize hyphens that the host character set cannot represent:
    /// optional hyphens (which appear only at the end of a line) become
    /// nulls, and non-breaking hyphens become normal hyphens (0x2010) or
    /// HYPHEN-MINUSES (0x002D).
    /// </summary>
    CANON_HYPHENS = 0x004,

    /// <summary>
    /// Normalize spaces in the same way CANON_HYPHENS normalizes hyphens:
    /// every special space character, such as a non-breaking space, becomes
    /// the standard space character (0x0020).
    /// </summary>
    CANON_SPACES = 0x008,

    /// <summary>
    /// The client wants text split into chunks representing internal
    /// value-type properties.
    /// </summary>
    APPLY_INDEX_ATTRIBUTES = 0x010,

    /// <summary>
    /// Emit any properties that are not covered by APPLY_INDEX_ATTRIBUTES
    /// and APPLY_CRAWL_ATTRIBUTES.
    /// </summary>
    APPLY_OTHER_ATTRIBUTES = 0x020,

    /// <summary>
    /// Optimize IFilter for indexing: the client calls IFilter::Init only
    /// once and never calls IFilter::BindRegion, so a chunk is never accessed
    /// both before and after another chunk.
    /// </summary>
    INDEXING_ONLY = 0x040,

    /// <summary>
    /// Text extraction must recursively search all linked objects within the
    /// document. When a link is unavailable, the IFilter::GetChunk call that
    /// would have obtained the first chunk of the link should return
    /// FILTER_E_LINK_UNAVAILABLE.
    /// </summary>
    SEARCH_LINKS = 0x080,

    /// <summary>
    /// The client wants text split into chunks representing properties
    /// determined during the indexing process.
    /// </summary>
    APPLY_CRAWL_ATTRIBUTES = 0x100,

    /// <summary>
    /// The content indexing process can return property values set by the
    /// filter.
    /// </summary>
    FILTER_OWNED_VALUE_OK = 0x200
  }
 | 
						|
 | 
						|
  /// <summary>
  /// Description of a chunk as filled in by IFilter.GetChunk.
  /// Fields are marshaled sequentially in declaration order.
  /// </summary>
  [StructLayout(LayoutKind.Sequential)]
  public struct STAT_CHUNK
  {
    /// <summary>
    /// Chunk identifier. Identifiers must be unique within the current
    /// IFilter instance and must be handed out in ascending order, matching
    /// the order in which the chunks appear in the source document. Some
    /// search engines exploit the proximity of chunks of different
    /// properties, so the emission order of such chunks matters to them.
    /// </summary>
    public int idChunk;

    /// <summary>
    /// Kind of break separating the previous chunk from this one; a value of
    /// the CHUNK_BREAKTYPE enumeration.
    /// </summary>
    [MarshalAs(UnmanagedType.U4)]
    public CHUNK_BREAKTYPE breakType;

    /// <summary>
    /// Tells whether the chunk holds a text-type or a value-type property;
    /// values come from the CHUNKSTATE enumeration. With CHUNK_TEXT set, use
    /// IFilter::GetText to read the chunk as a series of words. With
    /// CHUNK_VALUE set, use IFilter::GetValue and treat the result as a
    /// single property value. When the same content must be treated both as
    /// text and as a value, it should be emitted twice, in two different
    /// chunks, each with one flag set.
    /// </summary>
    [MarshalAs(UnmanagedType.U4)]
    public CHUNKSTATE flags;

    /// <summary>
    /// Language and sublanguage of the chunk's text, used by document
    /// indexers for proper word breaking. Ignored unless the chunk is
    /// text-type or a value-type with data type VT_LPWSTR, VT_LPSTR or
    /// VT_BSTR.
    /// </summary>
    public int locale;

    /// <summary>
    /// Property to apply to the chunk. A filter that needs the same text to
    /// carry more than one property must emit the text once per property, in
    /// separate chunks.
    /// </summary>
    public FULLPROPSPEC attribute;

    /// <summary>
    /// ID of the chunk's source, which depends on the nature of the chunk.
    /// For a text-type property it must equal idChunk. For an internal
    /// value-type property derived from textual content it is the ID of the
    /// text-type chunk it was derived from. When the filter attributes ask
    /// for internal value-type properties only, there is no content chunk to
    /// derive from, and the value must be zero, which is an invalid chunk.
    /// </summary>
    public int idChunkSource;

    /// <summary>
    /// Offset in the source chunk at which the source text for a derived
    /// chunk starts.
    /// </summary>
    public int cwcStartSource;

    /// <summary>
    /// Length, in characters, of the source text from which this chunk was
    /// derived. Zero signifies character-by-character correspondence between
    /// the source text and the derived text; a nonzero value means no such
    /// direct correspondence exists.
    /// </summary>
    public int cwcLenSource;
  }
 | 
						|
 | 
						|
  /// <summary>
  /// Kinds of break that can occur between the chunks of text read out by
  /// the FileFilter.
  /// </summary>
  public enum CHUNK_BREAKTYPE : int
  {
    /// <summary>
    /// No break between this chunk and the previous one; the chunks are
    /// glued together.
    /// </summary>
    CHUNK_NO_BREAK = 0,

    /// <summary>
    /// Word break between this chunk and the previous chunk that had the
    /// same attribute. Keep the use of CHUNK_EOW to a minimum: word breaking
    /// is language-dependent and is best left to the search engine.
    /// </summary>
    CHUNK_EOW = 1,

    /// <summary>
    /// Sentence break between this chunk and the previous chunk that had the
    /// same attribute.
    /// </summary>
    CHUNK_EOS = 2,

    /// <summary>
    /// Paragraph break between this chunk and the previous chunk that had
    /// the same attribute.
    /// </summary>
    CHUNK_EOP = 3,

    /// <summary>
    /// Chapter break between this chunk and the previous chunk that had the
    /// same attribute.
    /// </summary>
    CHUNK_EOC = 4
  }
 | 
						|
 | 
						|
 | 
						|
  /// <summary>
  /// Bit flags describing what kind of property a chunk holds.
  /// Marked with [Flags] because the members are distinct bits (1, 2, 4), so
  /// combined values format as member names rather than as a bare number.
  /// </summary>
  [Flags]
  public enum CHUNKSTATE
  {
    /// <summary>
    /// The current chunk is a text-type property.
    /// </summary>
    CHUNK_TEXT = 0x1,

    /// <summary>
    /// The current chunk is a value-type property.
    /// </summary>
    CHUNK_VALUE = 0x2,

    /// <summary>
    /// Reserved.
    /// </summary>
    CHUNK_FILTER_OWNED_VALUE = 0x4
  }
 | 
						|
 | 
						|
  /// <summary>
  /// Result codes returned by the IFilter methods, expressed as unsigned
  /// 32-bit HRESULT/SCODE values.
  /// </summary>
  internal enum IFilterReturnCode : uint
  {
    /// <summary>
    /// Success
    /// </summary>
    S_OK = 0,

    /// <summary>
    /// The function was denied access to the filter file.
    /// </summary>
    E_ACCESSDENIED = 0x80070005,

    /// <summary>
    /// The function encountered an invalid handle,
    /// probably due to a low-memory situation.
    /// </summary>
    E_HANDLE = 0x80070006,

    /// <summary>
    /// The function received an invalid parameter.
    /// </summary>
    E_INVALIDARG = 0x80070057,

    /// <summary>
    /// Out of memory
    /// </summary>
    E_OUTOFMEMORY = 0x8007000E,

    /// <summary>
    /// Not implemented
    /// </summary>
    E_NOTIMPL = 0x80004001,

    /// <summary>
    /// Unknown error
    /// </summary>
    // 0x80004005 is the Win32 value of E_FAIL. 0x80000008 is the legacy
    // 16-bit definition and is never returned by Win32 COM components.
    E_FAIL = 0x80004005,

    /// <summary>
    /// File not filtered due to password protection
    /// </summary>
    FILTER_E_PASSWORD = 0x8004170B,

    /// <summary>
    /// The document format is not recognised by the filter
    /// </summary>
    FILTER_E_UNKNOWNFORMAT = 0x8004170C,

    /// <summary>
    /// No text in current chunk
    /// </summary>
    FILTER_E_NO_TEXT = 0x80041705,

    /// <summary>
    /// No more chunks of text available in object
    /// </summary>
    FILTER_E_END_OF_CHUNKS = 0x80041700,

    /// <summary>
    /// No more text available in chunk
    /// </summary>
    FILTER_E_NO_MORE_TEXT = 0x80041701,

    /// <summary>
    /// No more property values available in chunk
    /// </summary>
    FILTER_E_NO_MORE_VALUES = 0x80041702,

    /// <summary>
    /// Unable to access object
    /// </summary>
    FILTER_E_ACCESS = 0x80041703,

    /// <summary>
    /// Moniker doesn't cover entire region
    /// </summary>
    FILTER_W_MONIKER_CLIPPED = 0x00041704,

    /// <summary>
    /// Unable to bind IFilter for embedded object
    /// </summary>
    FILTER_E_EMBEDDING_UNAVAILABLE = 0x80041707,

    /// <summary>
    /// Unable to bind IFilter for linked object
    /// </summary>
    FILTER_E_LINK_UNAVAILABLE = 0x80041708,

    /// <summary>
    /// This is the last text in the current chunk
    /// </summary>
    FILTER_S_LAST_TEXT = 0x00041709,

    /// <summary>
    /// This is the last value in the current chunk
    /// </summary>
    FILTER_S_LAST_VALUES = 0x0004170A
  }
 | 
						|
 | 
						|
  /// <summary>
  /// COM import of the IFilter interface (IUnknown-based). Every method is
  /// declared with PreserveSig, so failures come back as return codes instead
  /// of being thrown as exceptions.
  /// </summary>
  /// <remarks>
  /// The declaration order of the members defines the COM vtable layout and
  /// must not be changed.
  /// </remarks>
  [ComImport, Guid("89BCB740-6119-101A-BCB7-00DD010655AF")]
  [InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
  internal interface IFilter
  {
    /// <summary>
    /// Initializes a filtering session (IFilter::Init).
    /// </summary>
    /// <param name="grfFlags">
    /// [in] Flag settings from the IFILTER_INIT enumeration that control text
    /// standardization, property output, embedding scope, and IFilter access
    /// patterns.
    /// </param>
    /// <param name="cAttributes">
    /// [in] Size of the attributes array. When nonzero, it takes precedence
    /// over attributes specified in grfFlags. If no attribute flags are
    /// specified and cAttributes is zero, the default is given by the
    /// PSGUID_STORAGE storage property set (date and time of the last write
    /// to the file, size, and so on) and by the PID_STG_CONTENTS 'contents'
    /// property, which maps to the main contents of the file.
    /// </param>
    /// <param name="aAttributes">
    /// [in] Array of pointers to FULLPROPSPEC structures for the requested
    /// properties. When cAttributes is nonzero, only the properties in
    /// aAttributes are returned.
    /// </param>
    /// <param name="pdwFlags">
    /// [out] Information about additional properties available to the
    /// caller; from the IFILTER_FLAGS enumeration.
    /// </param>
    [PreserveSig]
    IFilterReturnCode Init(
      IFILTER_INIT grfFlags,
      int cAttributes,
      IntPtr aAttributes,
      out IFILTER_FLAGS pdwFlags);

    /// <summary>
    /// Positions the filter at the beginning of the next chunk - or at the
    /// first chunk on the first call - and returns a description of that
    /// chunk (IFilter::GetChunk).
    /// </summary>
    /// <param name="pStat">[out] Description of the current chunk.</param>
    [PreserveSig]
    IFilterReturnCode GetChunk(out STAT_CHUNK pStat);

    /// <summary>
    /// Retrieves text (text-type properties) from the current chunk, which
    /// must have a CHUNKSTATE enumeration value of CHUNK_TEXT
    /// (IFilter::GetText).
    /// </summary>
    /// <param name="pcwcBuffer">
    /// [in/out] On entry, the size of awcBuffer in wide/Unicode characters.
    /// On exit, the number of Unicode characters written to awcBuffer. This
    /// is a character count, not a byte count.
    /// </param>
    /// <param name="awcBuffer">
    /// Receives the text retrieved from the current chunk. Do not terminate
    /// the buffer with a character.
    /// </param>
    [PreserveSig]
    IFilterReturnCode GetText(
      ref uint pcwcBuffer,
      [Out, MarshalAs(UnmanagedType.LPArray)] char[] awcBuffer);

    /// <summary>
    /// Retrieves a value (internal value-type property) from a chunk, which
    /// must have a CHUNKSTATE enumeration value of CHUNK_VALUE
    /// (IFilter::GetValue).
    /// </summary>
    /// <param name="PropVal">
    /// Pointer to the PROPVARIANT structure, which is allocated with
    /// CoTaskMemAlloc. Some PROPVARIANT structures contain pointers, which
    /// can be freed by calling PropVariantClear; it is up to the caller of
    /// GetValue to call PropVariantClear.
    /// </param>
    [PreserveSig]
    int GetValue(ref IntPtr PropVal);

    /// <summary>
    /// Retrieves an interface representing the specified portion of the
    /// object (IFilter::BindRegion). Currently reserved for future use.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the native declaration appears to take origPos by value
    /// and to return ppunk as a raw interface pointer - confirm the
    /// marshaling against the IFilter documentation before calling this.
    /// </remarks>
    [PreserveSig]
    int BindRegion(
      ref FILTERREGION origPos,
      ref Guid riid,
      ref object ppunk);
  }
 | 
						|
 | 
						|
 | 
						|
}
 |