436 lines
15 KiB
C#
436 lines
15 KiB
C#
using System;
|
|
using System.Text;
|
|
using System.Runtime.InteropServices;
|
|
|
|
// Contains the managed translation of the COM IFilter interface and its supporting types.
// Most of the translations are adapted from PInvoke.net.
|
|
|
|
namespace EPocalipse.IFilter
|
|
{
|
|
/// <summary>
/// Managed counterpart of the FULLPROPSPEC structure: identifies a single
/// property by pairing a property-set GUID with a property specifier.
/// Fields are laid out sequentially, in declaration order, for marshaling.
/// </summary>
[StructLayout(LayoutKind.Sequential)]
public struct FULLPROPSPEC
{
    /// <summary>
    /// GUID of the property set the property belongs to.
    /// </summary>
    public Guid guidPropSet;

    /// <summary>
    /// Specifier (string name or numeric id) of the property inside
    /// <see cref="guidPropSet"/>.
    /// </summary>
    public PROPSPEC psProperty;
}
|
|
|
|
/// <summary>
/// Managed counterpart of the FILTERREGION structure: describes a span of
/// text inside one chunk. Consumed by <c>IFilter.BindRegion</c>.
/// </summary>
[StructLayout(LayoutKind.Sequential)]
internal struct FILTERREGION
{
    /// <summary>Identifier of the chunk the region lies in.</summary>
    public int idChunk;

    /// <summary>Start of the region, counted in wide characters (the "cwc" prefix).</summary>
    public int cwcStart;

    /// <summary>Extent of the region, counted in wide characters.</summary>
    public int cwcExtent;
}
|
|
|
|
/// <summary>
/// Managed counterpart of the PROPSPEC structure: a discriminator
/// (<see cref="ulKind"/>) followed by a union that holds either a numeric
/// property id or a pointer to a wide-character property name.
/// </summary>
/// <remarks>
/// The native union contains a pointer, so it is pointer-aligned: it starts
/// at offset 4 in a 32-bit process but at offset 8 in a 64-bit process.
/// A hard-coded <c>[FieldOffset(4)]</c> is therefore only correct for
/// 32-bit. Sequential layout lets the runtime insert the platform-correct
/// padding in front of the pointer-sized member, and <see cref="propid"/>
/// is exposed as a view over the low 32 bits of that same storage
/// (valid on little-endian platforms, which covers Windows).
/// </remarks>
[StructLayout(LayoutKind.Sequential)]
public struct PROPSPEC
{
    /// <summary>
    /// Selects the active union member: 0 - string used (<see cref="lpwstr"/>);
    /// 1 - PROPID (<see cref="propid"/>).
    /// </summary>
    public int ulKind;

    /// <summary>
    /// Union storage. When <see cref="ulKind"/> is 0 this is the pointer to
    /// the wide-character property name.
    /// </summary>
    public IntPtr lpwstr;

    /// <summary>
    /// Numeric property id, valid when <see cref="ulKind"/> is 1. Shares its
    /// storage with <see cref="lpwstr"/>, exactly like the native union.
    /// </summary>
    public int propid
    {
        // Only the low 32 bits of the union carry the property id.
        get { return unchecked((int)lpwstr.ToInt64()); }
        set { lpwstr = new IntPtr(value); }
    }
}
|
|
|
|
/// <summary>
/// Flags handed back through the <c>pdwFlags</c> output of <c>IFilter.Init</c>,
/// describing additional properties available to the caller.
/// </summary>
[Flags]
internal enum IFILTER_FLAGS
{
    /// <summary>
    /// Additional properties should be located by the caller through the
    /// IPropertySetStorage and IPropertyStorage interfaces. While this flag
    /// is set, IFilter should not return properties that are available
    /// through COM enumerators.
    /// </summary>
    IFILTER_FLAGS_OLE_PROPERTIES = 0x1
}
|
|
|
|
/// <summary>
/// Bit flags passed to <c>IFilter.Init</c> that control how the filter
/// instance operates: text canonicalization, which properties are emitted,
/// link handling and access pattern. Members are listed in ascending
/// numeric order.
/// </summary>
[Flags]
internal enum IFILTER_INIT
{
    /// <summary>No flags set.</summary>
    NONE = 0x000,

    /// <summary>
    /// Mark paragraph breaks with the Unicode PARAGRAPH SEPARATOR (0x2029).
    /// </summary>
    CANON_PARAGRAPHS = 0x001,

    /// <summary>
    /// Replace soft returns (such as the newline character in Microsoft Word)
    /// with hard returns, i.e. LINE SEPARATOR (0x2028). Existing hard returns
    /// can be doubled. A carriage return (0x000D), a line feed (0x000A), or
    /// the two in combination count as a hard return. The intent is to enable
    /// pattern-expression matches against observed line breaks.
    /// </summary>
    HARD_LINE_BREAKS = 0x002,

    /// <summary>
    /// Canonicalize hyphens. Word processors have hyphen forms that are not
    /// represented in the host character set, such as optional hyphens
    /// (appearing only at the end of a line) and nonbreaking hyphens. With
    /// this flag, optional hyphens are converted to nulls and nonbreaking
    /// hyphens to normal hyphens (0x2010) or HYPHEN-MINUSES (0x002D).
    /// </summary>
    CANON_HYPHENS = 0x004,

    /// <summary>
    /// Canonicalize spaces, in the same way CANON_HYPHENS canonicalizes
    /// hyphens: every special space character, such as a nonbreaking space,
    /// is converted to the standard space character (0x0020).
    /// </summary>
    CANON_SPACES = 0x008,

    /// <summary>
    /// The client wants text split into chunks representing value-type
    /// properties.
    /// </summary>
    APPLY_INDEX_ATTRIBUTES = 0x010,

    /// <summary>
    /// Emit any properties that are covered by neither
    /// APPLY_INDEX_ATTRIBUTES nor APPLY_CRAWL_ATTRIBUTES.
    /// </summary>
    APPLY_OTHER_ATTRIBUTES = 0x020,

    /// <summary>
    /// Optimize the filter for indexing: the client calls IFilter::Init only
    /// once and never calls IFilter::BindRegion, which rules out accessing a
    /// chunk both before and after accessing another chunk.
    /// </summary>
    INDEXING_ONLY = 0x040,

    /// <summary>
    /// Text extraction must recursively search all linked objects within the
    /// document. When a link is unavailable, the IFilter::GetChunk call that
    /// would have obtained the first chunk of the link should return
    /// FILTER_E_LINK_UNAVAILABLE.
    /// </summary>
    SEARCH_LINKS = 0x080,

    /// <summary>
    /// The client wants text split into chunks representing properties
    /// determined during the indexing process.
    /// </summary>
    APPLY_CRAWL_ATTRIBUTES = 0x100,

    /// <summary>
    /// The content indexing process can return property values set by the
    /// filter.
    /// </summary>
    FILTER_OWNED_VALUE_OK = 0x200
}
|
|
|
|
/// <summary>
/// Managed counterpart of the STAT_CHUNK structure filled in by
/// <c>IFilter.GetChunk</c>; describes the chunk the filter is positioned on.
/// </summary>
/// <remarks>
/// Sequential layout is what the C# compiler applies to structs by default;
/// it is spelled out here because the field order is part of the interop
/// contract.
/// </remarks>
[StructLayout(LayoutKind.Sequential)]
public struct STAT_CHUNK
{
    /// <summary>
    /// Chunk identifier. Identifiers must be unique within the current
    /// IFilter instance and must be in ascending order; the numbering should
    /// follow the order in which chunks appear in the source document. Some
    /// search engines exploit the proximity of chunks of various properties,
    /// in which case the order in which chunks with different properties are
    /// emitted matters to the search engine.
    /// </summary>
    public int idChunk;

    /// <summary>
    /// Kind of break separating the previous chunk from this one; a
    /// <see cref="CHUNK_BREAKTYPE"/> value, marshaled as a 32-bit unsigned
    /// integer.
    /// </summary>
    [MarshalAs(UnmanagedType.U4)] public CHUNK_BREAKTYPE breakType;

    /// <summary>
    /// <see cref="CHUNKSTATE"/> flags telling whether this chunk holds a
    /// text-type or a value-type property. With CHUNK_TEXT set, retrieve the
    /// contents as a series of words through IFilter::GetText. With
    /// CHUNK_VALUE set, retrieve the value through IFilter::GetValue and
    /// treat it as a single property value. A filter that wants the same
    /// content treated as both text and value emits it twice, in two
    /// different chunks, each with one flag set.
    /// </summary>
    [MarshalAs(UnmanagedType.U4)] public CHUNKSTATE flags;

    /// <summary>
    /// Language and sublanguage associated with a chunk of text; document
    /// indexers use it to perform proper word breaking. Ignored when the
    /// chunk is neither text-type nor a value-type with data type VT_LPWSTR,
    /// VT_LPSTR or VT_BSTR.
    /// </summary>
    public int locale;

    /// <summary>
    /// Property to be applied to the chunk. A filter that needs the same text
    /// to carry more than one property emits the text once per property, in
    /// separate chunks.
    /// </summary>
    public FULLPROPSPEC attribute;

    /// <summary>
    /// Identifier of the source of the chunk. Its value depends on the nature
    /// of the chunk:
    /// for a text-type property it must equal <see cref="idChunk"/>;
    /// for a value-type property derived from textual content it is the chunk
    /// id of the text-type chunk it was derived from;
    /// when the filter attributes specify that only value-type properties are
    /// returned, there is no content chunk to derive from, and the member
    /// must be zero, which is an invalid chunk.
    /// </summary>
    public int idChunkSource;

    /// <summary>
    /// Offset in the source chunk at which the source text for a derived
    /// chunk starts.
    /// </summary>
    public int cwcStartSource;

    /// <summary>
    /// Length, in characters, of the source text the current chunk was
    /// derived from. Zero signifies character-by-character correspondence
    /// between the source text and the derived text; a nonzero value means
    /// that no such direct correspondence exists.
    /// </summary>
    public int cwcLenSource;
}
|
|
|
|
/// <summary>
/// Kinds of break that can occur between consecutive chunks of text read
/// out by the FileFilter.
/// </summary>
public enum CHUNK_BREAKTYPE
{
    /// <summary>
    /// No break between the current chunk and the previous chunk; the two
    /// chunks are glued together.
    /// </summary>
    CHUNK_NO_BREAK = 0x0,

    /// <summary>
    /// Word break between this chunk and the previous chunk that had the
    /// same attribute. Keep the use of CHUNK_EOW to a minimum: the choice of
    /// word breaks is language-dependent, so determining them is best left
    /// to the search engine.
    /// </summary>
    CHUNK_EOW = 0x1,

    /// <summary>
    /// Sentence break between this chunk and the previous chunk that had the
    /// same attribute.
    /// </summary>
    CHUNK_EOS = 0x2,

    /// <summary>
    /// Paragraph break between this chunk and the previous chunk that had
    /// the same attribute.
    /// </summary>
    CHUNK_EOP = 0x3,

    /// <summary>
    /// Chapter break between this chunk and the previous chunk that had the
    /// same attribute.
    /// </summary>
    CHUNK_EOC = 0x4
}
|
|
|
|
|
|
/// <summary>
/// Flags describing what kind of property a chunk contains; stored in
/// <c>STAT_CHUNK.flags</c>.
/// </summary>
/// <remarks>
/// The members are distinct bits, so the enumeration is marked
/// <see cref="FlagsAttribute"/>: combined values then format and parse as a
/// list of names instead of a bare number.
/// </remarks>
[Flags]
public enum CHUNKSTATE
{
    /// <summary>
    /// The current chunk is a text-type property.
    /// </summary>
    CHUNK_TEXT = 0x1,

    /// <summary>
    /// The current chunk is a value-type property.
    /// </summary>
    CHUNK_VALUE = 0x2,

    /// <summary>
    /// Reserved.
    /// </summary>
    CHUNK_FILTER_OWNED_VALUE = 0x4
}
|
|
|
|
/// <summary>
/// Status codes returned by the <c>IFilter</c> methods. Values with the high
/// bit set (0x8.......) are failures; the remaining ones are success or
/// informational codes.
/// </summary>
internal enum IFilterReturnCode : uint
{
    /// <summary>
    /// Success
    /// </summary>
    S_OK = 0,

    /// <summary>
    /// The function was denied access to the filter file.
    /// </summary>
    E_ACCESSDENIED = 0x80070005,

    /// <summary>
    /// The function encountered an invalid handle,
    /// probably due to a low-memory situation.
    /// </summary>
    E_HANDLE = 0x80070006,

    /// <summary>
    /// The function received an invalid parameter.
    /// </summary>
    E_INVALIDARG = 0x80070057,

    /// <summary>
    /// Out of memory
    /// </summary>
    E_OUTOFMEMORY = 0x8007000E,

    /// <summary>
    /// Not implemented
    /// </summary>
    E_NOTIMPL = 0x80004001,

    /// <summary>
    /// Unknown (unspecified) error
    /// </summary>
    // The Win32 value of E_FAIL is 0x80004005. The previously declared
    // 0x80000008 is the legacy 16-bit value and never matches the HRESULT
    // a filter actually returns.
    E_FAIL = 0x80004005,

    /// <summary>
    /// File not filtered due to password protection
    /// </summary>
    FILTER_E_PASSWORD = 0x8004170B,

    /// <summary>
    /// The document format is not recognised by the filter
    /// </summary>
    FILTER_E_UNKNOWNFORMAT = 0x8004170C,

    /// <summary>
    /// No text in current chunk
    /// </summary>
    FILTER_E_NO_TEXT = 0x80041705,

    /// <summary>
    /// No more chunks of text available in object
    /// </summary>
    FILTER_E_END_OF_CHUNKS = 0x80041700,

    /// <summary>
    /// No more text available in chunk
    /// </summary>
    FILTER_E_NO_MORE_TEXT = 0x80041701,

    /// <summary>
    /// No more property values available in chunk
    /// </summary>
    FILTER_E_NO_MORE_VALUES = 0x80041702,

    /// <summary>
    /// Unable to access object
    /// </summary>
    FILTER_E_ACCESS = 0x80041703,

    /// <summary>
    /// Moniker doesn't cover entire region
    /// </summary>
    FILTER_W_MONIKER_CLIPPED = 0x00041704,

    /// <summary>
    /// Unable to bind IFilter for embedded object
    /// </summary>
    FILTER_E_EMBEDDING_UNAVAILABLE = 0x80041707,

    /// <summary>
    /// Unable to bind IFilter for linked object
    /// </summary>
    FILTER_E_LINK_UNAVAILABLE = 0x80041708,

    /// <summary>
    /// This is the last text in the current chunk
    /// </summary>
    FILTER_S_LAST_TEXT = 0x00041709,

    /// <summary>
    /// This is the last value in the current chunk
    /// </summary>
    FILTER_S_LAST_VALUES = 0x0004170A
}
|
|
|
|
/// <summary>
/// Managed declaration of the COM IFilter interface, used to read the text
/// and property values of a document chunk by chunk.
/// </summary>
/// <remarks>
/// Every method is <c>[PreserveSig]</c>, so the status code comes back as
/// the return value instead of being turned into an exception. Do not
/// reorder the methods: for an imported IUnknown-based interface the
/// declaration order is the vtable order.
/// </remarks>
[ComImport, Guid("89BCB740-6119-101A-BCB7-00DD010655AF")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
internal interface IFilter
{
    /// <summary>
    /// The IFilter::Init method initializes a filtering session.
    /// </summary>
    /// <param name="grfFlags">
    /// [in] Flag settings from the IFILTER_INIT enumeration for controlling
    /// text standardization, property output, embedding scope, and IFilter
    /// access patterns.
    /// </param>
    /// <param name="cAttributes">
    /// [in] The size of the attributes array. When nonzero, cAttributes takes
    /// precedence over attributes specified in grfFlags. If no attribute
    /// flags are specified and cAttributes is zero, the default is given by
    /// the PSGUID_STORAGE storage property set, which contains the date and
    /// time of the last write to the file, size, and so on; and by the
    /// PID_STG_CONTENTS 'contents' property, which maps to the main contents
    /// of the file.
    /// </param>
    /// <param name="aAttributes">
    /// [in] Pointer to the FULLPROPSPEC entries for the requested properties.
    /// When cAttributes is nonzero, only the properties in aAttributes are
    /// returned.
    /// </param>
    /// <param name="pdwFlags">
    /// [out] Information about additional properties available to the
    /// caller; from the IFILTER_FLAGS enumeration.
    /// </param>
    /// <returns>The status code reported by the filter.</returns>
    [PreserveSig]
    IFilterReturnCode Init(
        IFILTER_INIT grfFlags,
        int cAttributes,
        IntPtr aAttributes,
        out IFILTER_FLAGS pdwFlags);

    /// <summary>
    /// The IFilter::GetChunk method positions the filter at the beginning of
    /// the next chunk, or at the first chunk if this is the first call to
    /// GetChunk, and returns a description of the current chunk.
    /// </summary>
    /// <param name="pStat">[out] Description of the current chunk.</param>
    /// <returns>The status code reported by the filter.</returns>
    [PreserveSig]
    IFilterReturnCode GetChunk(out STAT_CHUNK pStat);

    /// <summary>
    /// The IFilter::GetText method retrieves text (text-type properties)
    /// from the current chunk, which must have a CHUNKSTATE enumeration
    /// value of CHUNK_TEXT.
    /// </summary>
    /// <param name="pcwcBuffer">
    /// [in/out] On entry, the size of the awcBuffer array in wide/Unicode
    /// characters. On exit, the number of Unicode characters written to
    /// awcBuffer. Note that this is a character count, not a byte count.
    /// </param>
    /// <param name="awcBuffer">
    /// [out] Receives the text retrieved from the current chunk. The text is
    /// not terminated by a character; use the count returned in pcwcBuffer
    /// to find its end.
    /// </param>
    /// <returns>The status code reported by the filter.</returns>
    [PreserveSig]
    IFilterReturnCode GetText(
        ref uint pcwcBuffer,
        [Out(), MarshalAs(UnmanagedType.LPArray)] char[] awcBuffer);

    /// <summary>
    /// The IFilter::GetValue method retrieves a value (value-type property)
    /// from a chunk, which must have a CHUNKSTATE enumeration value of
    /// CHUNK_VALUE.
    /// </summary>
    /// <param name="PropVal">
    /// Receives a pointer to a PROPVARIANT structure allocated with
    /// CoTaskMemAlloc. Some PROPVARIANT structures contain pointers, which
    /// can be freed by calling PropVariantClear; calling it is the
    /// responsibility of the caller of GetValue.
    /// </param>
    /// <returns>The raw status code reported by the filter.</returns>
    [PreserveSig]
    int GetValue(ref IntPtr PropVal);

    /// <summary>
    /// The IFilter::BindRegion method retrieves an interface representing
    /// the specified portion of the object.
    /// Currently reserved for future use.
    /// </summary>
    /// <param name="origPos">Region of the object to bind to.</param>
    /// <param name="riid">Identifier of the requested interface.</param>
    /// <param name="ppunk">Receives the requested interface.</param>
    /// <returns>The raw status code reported by the filter.</returns>
    // ppunk is a void** on the native side. Without an explicit MarshalAs,
    // a by-ref System.Object is marshaled as VARIANT*, which does not match.
    // NOTE(review): the native method takes FILTERREGION by value, while this
    // declaration passes it by reference. The signature is kept as-is so
    // existing callers still compile - confirm before relying on this method.
    [PreserveSig]
    int BindRegion(
        ref FILTERREGION origPos,
        ref Guid riid,
        [MarshalAs(UnmanagedType.IUnknown)] ref object ppunk);
}
|
|
|
|
|
|
}
|