IWordBreaker とファイル検索
ããããªãã¥ã¢ãã§æ¤ç´¢ãããããã¼ããã£ããããªãã¥ã¢ãã«ãããããªããã¨ãã Windows Search ã®è©±ï¼
Windows7ã«æ·±å»ãªãã°ãçºè¦ããã®ã§ãè¦éãé³´ããããã«æãã¦ã¿ã¾ãã
åç¾ã«ä½¿ç¨ããOSã¯Windows7 Home Premium x64ã§ãããã°ã®åç¾æé
ã
ï¼ï¼ï¼ æªç¨å³ç¦ ï¼ï¼ï¼
ã
âï¼ï¼é©å½ã«ãã©ã«ããä½ã ååã¯ä½ã§ãOK
ã
âï¼ï¼ä½ã£ããã©ã«ãã¼ãéãã¦ã
ããã¼ããã£ããããªãã¥ã¢ã
ããµããã¯ããªãã¥ã¢ã
ãããªãã¥ã¢ã
ã®ï¼ã¤ã®ãã©ã«ããæ°è¦ä½æãã
ãâï¼ï¼æ¤ç´¢çªã«ãããªãã¥ã¢ãã¨å ¥åãã¦ã¿ã
ãâï¼ï¼ããã¼ããã£ããããªãã¥ã¢ããç¡ãã£ããã¨ã«ããã
ã¡ããããï¼èª°ããããªãã¨ãï¼ã¡ãã£ã¼ãã¯ï¼ï¼ã¡ãã£ã¼ã¼ã¼ã¼ãã¯ï¼ï¼
対処方法
æ¤ç´¢çªã«ã*ããªãã¥ã¢ãã¨å ¥ããã¨å ¨é¨ãããããã¿ããã
ã§ããXPã®é ã¯ãããªãã¥ã¢ãã§å ¨é¨ããããã¦ãã®ã§ãªããè ã«è½ã¡ãªãã¢ã¬ãã
ちなみに検索インデックスの有無は関係ないみたいです。
ï¼2010/10/30 11:05 追è¨
VistaãMacOSã§ãåç¾ããã¨ãã
Windowsの人は、「Everything」を使うと幸せになれるらしいです。
ãå¾æ¥ä½ãèããã«ãã¡ã¤ã«åã®é¨åæååã§æ¤ç´¢ã§ãã¦ããã®ãã®ãï¼ã©ããã¦ã¢ã¹ã¿ãªã¹ã¯ãå¿
è¦ã«ãã¡ãã£ãã®ï¼ãã¨ããæ¹åã®è©±ã®ãããªæ°ããã¾ããï¼ãã®è¾ºã¯ç½®ãã¦ããã¦ä¹
ãã¶ãã« IWordBreaker ã¨ãï¼
Windows 7 ã«æ¨æºã§ä»ãã¦ããæ¥æ¬èªåã IWordBreaker å®è£
ã«ããã¼ããã£ããããªãã¥ã¢ãçãé£ããã¦ã¿ã¾ãï¼
using System;
using System.Collections.Generic;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Security;
using Microsoft.Win32;
using WordBreaker;

namespace WordBreakerTest
{
    using HRESULT = System.UInt32;

    /// <summary>
    /// HRESULT constants returned by, or passed to, the word breaker interfaces declared below.
    /// </summary>
    public struct HResults
    {
        public const HRESULT S_OK = 0x00000000;
        public const HRESULT S_FALSE = 0x00000001;
        public const HRESULT E_FAIL = 0x80004005;
        public const HRESULT WBREAK_E_END_OF_TEXT = 0x80041780;
        public const HRESULT LANGUAGE_S_LARGE_WORD = 0x00041781;
        public const HRESULT WBREAK_E_QUERY_ONLY = 0x80041782;
        public const HRESULT WBREAK_E_BUFFER_TOO_SMALL = 0x80041783;
        public const HRESULT LANGUAGE_E_DATABASE_NOT_FOUND = 0x80041784;
        public const HRESULT WBREAK_E_INIT_FAILED = 0x80041785;
    }

    /// <summary>
    /// Kind of break reported through <see cref="IWordSink.PutBreak"/>.
    /// </summary>
    public enum WORDREP_BREAK_TYPE
    {
        WORDREP_BREAK_EOW = 0,
        WORDREP_BREAK_EOS = 1,
        WORDREP_BREAK_EOP = 2,
        WORDREP_BREAK_EOC = 3
    }

    /// <summary>
    /// COM import of the sink that receives words and breaks from a word breaker.
    /// </summary>
    [SuppressUnmanagedCodeSecurity]
    [ComImport, Guid("CC907054-C058-101A-B554-08002B33B0E6"), InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
    public interface IWordSink
    {
        [PreserveSig, MethodImpl(MethodImplOptions.InternalCall, MethodCodeType = MethodCodeType.Runtime)]
        HRESULT PutWord(
            uint cwc,
            [In][MarshalAs(UnmanagedType.LPArray, SizeParamIndex = 0, ArraySubType = UnmanagedType.U2)] char[] pwcInBuf,
            uint cwcSrcLen,
            uint cwcSrcPos);

        [PreserveSig, MethodImpl(MethodImplOptions.InternalCall, MethodCodeType = MethodCodeType.Runtime)]
        HRESULT PutAltWord(
            uint cwc,
            [In][MarshalAs(UnmanagedType.LPArray, SizeParamIndex = 0, ArraySubType = UnmanagedType.U2)] char[] pwcInBuf,
            uint cwcSrcLen,
            uint cwcSrcPos);

        [PreserveSig, MethodImpl(MethodImplOptions.InternalCall, MethodCodeType = MethodCodeType.Runtime)]
        HRESULT StartAltPhrase();

        [PreserveSig, MethodImpl(MethodImplOptions.InternalCall, MethodCodeType = MethodCodeType.Runtime)]
        HRESULT EndAltPhrase();

        [PreserveSig, MethodImpl(MethodImplOptions.InternalCall, MethodCodeType = MethodCodeType.Runtime)]
        HRESULT PutBreak(WORDREP_BREAK_TYPE breakType);
    }

    /// <summary>
    /// COM import of the sink that receives phrases from a word breaker.
    /// </summary>
    [SuppressUnmanagedCodeSecurity]
    [ComImport, Guid("CC906FF0-C058-101A-B554-08002B33B0E6"), InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
    public interface IPhraseSink
    {
        [Obsolete("Not supported.")]
        [PreserveSig, MethodImpl(MethodImplOptions.InternalCall, MethodCodeType = MethodCodeType.Runtime)]
        HRESULT PutSmallPhrase(
            [In][MarshalAs(UnmanagedType.LPArray, SizeParamIndex = 1, ArraySubType = UnmanagedType.U2)] char[] pwcNoun,
            uint cwcNoun,
            [In][MarshalAs(UnmanagedType.LPArray, SizeParamIndex = 3, ArraySubType = UnmanagedType.U2)] char[] pwcModifier,
            uint cwcModifier,
            uint ulAttachmentType);

        [PreserveSig, MethodImpl(MethodImplOptions.InternalCall, MethodCodeType = MethodCodeType.Runtime)]
        HRESULT PutPhrase(
            [In][MarshalAs(UnmanagedType.LPArray, SizeParamIndex = 1, ArraySubType = UnmanagedType.U2)] char[] pwcPhrase,
            uint cwcPhrase);
    }

    /// <summary>
    /// Managed <see cref="IWordSink"/> that forwards each notification to an optional delegate.
    /// Every method returns <see cref="HResults.S_OK"/>.
    /// </summary>
    public class WordSink : IWordSink
    {
        /// <summary>Invoked by <see cref="PutWord"/> with (word, cwcSrcLen, cwcSrcPos).</summary>
        public Action<string, uint, uint> OnWord { get; set; }

        /// <summary>Invoked by <see cref="PutAltWord"/> with (word, cwcSrcLen, cwcSrcPos).</summary>
        public Action<string, uint, uint> OnAltWord { get; set; }

        /// <summary>Invoked by <see cref="PutBreak"/> with the reported break type.</summary>
        public Action<WORDREP_BREAK_TYPE> OnBreak { get; set; }

        #region IWordSink Members

        public HRESULT PutWord(
            uint cwc,
            [In][MarshalAs(UnmanagedType.LPArray, SizeParamIndex = 0, ArraySubType = UnmanagedType.U2)] char[] pwcInBuf,
            uint cwcSrcLen,
            uint cwcSrcPos)
        {
            // Read the property once so the null check and the call see the same delegate.
            var handler = OnWord;
            if (handler != null)
            {
                handler(new string(pwcInBuf), cwcSrcLen, cwcSrcPos);
            }
            return HResults.S_OK;
        }

        public HRESULT PutAltWord(
            uint cwc,
            [In][MarshalAs(UnmanagedType.LPArray, SizeParamIndex = 0, ArraySubType = UnmanagedType.U2)] char[] pwcInBuf,
            uint cwcSrcLen,
            uint cwcSrcPos)
        {
            var handler = OnAltWord;
            if (handler != null)
            {
                handler(new string(pwcInBuf), cwcSrcLen, cwcSrcPos);
            }
            return HResults.S_OK;
        }

        public HRESULT StartAltPhrase()
        {
            return HResults.S_OK;
        }

        public HRESULT EndAltPhrase()
        {
            return HResults.S_OK;
        }

        public HRESULT PutBreak(WORDREP_BREAK_TYPE breakType)
        {
            var handler = OnBreak;
            if (handler != null)
            {
                handler(breakType);
            }
            return HResults.S_OK;
        }

        #endregion
    }

    /// <summary>
    /// Managed <see cref="IPhraseSink"/> that ignores every phrase and returns <see cref="HResults.S_OK"/>.
    /// </summary>
    public class CPhraseSink : IPhraseSink
    {
        #region IPhraseSink Members

        public HRESULT PutSmallPhrase(
            [In][MarshalAs(UnmanagedType.LPArray, SizeParamIndex = 1, ArraySubType = UnmanagedType.U2)] char[] pwcNoun,
            uint cwcNoun,
            [In][MarshalAs(UnmanagedType.LPArray, SizeParamIndex = 3, ArraySubType = UnmanagedType.U2)] char[] pwcModifier,
            uint cwcModifier,
            uint ulAttachmentType)
        {
            return HResults.S_OK;
        }

        public HRESULT PutPhrase(
            [In][MarshalAs(UnmanagedType.LPArray, SizeParamIndex = 1, ArraySubType = UnmanagedType.U2)] char[] pwcPhrase,
            uint cwcPhrase)
        {
            return HResults.S_OK;
        }

        #endregion
    }

    /// <summary>
    /// Callback stored in <see cref="TEXT_SOURCE.pfnFillTextBuffer"/>; returns an HRESULT.
    /// </summary>
    [UnmanagedFunctionPointer(CallingConvention.StdCall)]
    public delegate uint FillTextBufferDelegate(ref TEXT_SOURCE pTextSource);

    /// <summary>
    /// Text buffer descriptor handed to <see cref="IWordBreaker.BreakText"/>.
    /// </summary>
    [StructLayout(LayoutKind.Sequential)]
    public struct TEXT_SOURCE
    {
        [MarshalAs(UnmanagedType.FunctionPtr)]
        public FillTextBufferDelegate pfnFillTextBuffer;

        [MarshalAs(UnmanagedType.LPWStr)]
        public string awcBuffer;

        public uint iEnd;

        public uint iCur;
    }

    /// <summary>
    /// COM import of the word breaker interface.
    /// </summary>
    [SuppressUnmanagedCodeSecurity]
    [ComImport, Guid("D53552C8-77E3-101A-B552-08002B33B0E6"), InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
    public interface IWordBreaker
    {
        [PreserveSig, MethodImpl(MethodImplOptions.InternalCall, MethodCodeType = MethodCodeType.Runtime)]
        HRESULT Init(
            [MarshalAs(UnmanagedType.Bool)] bool fQuery,
            uint maxTokenSize,
            [MarshalAs(UnmanagedType.Bool)] out bool pfLicense);

        [PreserveSig, MethodImpl(MethodImplOptions.InternalCall, MethodCodeType = MethodCodeType.Runtime)]
        HRESULT BreakText(
            ref TEXT_SOURCE pTextSource,
            [MarshalAs(UnmanagedType.Interface)] IWordSink pWordSink,
            [MarshalAs(UnmanagedType.Interface)] IPhraseSink pPhraseSink);

        [PreserveSig, MethodImpl(MethodImplOptions.InternalCall, MethodCodeType = MethodCodeType.Runtime)]
        HRESULT GetLicenseToUse([MarshalAs(UnmanagedType.LPWStr)] out string ppwcsLicense);
    }

    /// <summary>
    /// Console driver: runs the registered Japanese word breaker over a few sample strings
    /// and prints the words and alternative words it reports.
    /// </summary>
    public static class Program
    {
        // Registry key whose "WBreakerClass" value holds the CLSID of the word breaker to create.
        private const string WordBreakerKey = @"HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\ContentIndex\Language\Japanese_Default";

        // Markers written into the output lists when the breaker reports a break.
        private static readonly Dictionary<WORDREP_BREAK_TYPE, string> BreakLabels =
            new Dictionary<WORDREP_BREAK_TYPE, string>
            {
                { WORDREP_BREAK_TYPE.WORDREP_BREAK_EOC, "[EOC]" },
                { WORDREP_BREAK_TYPE.WORDREP_BREAK_EOP, "[EOP]" },
                { WORDREP_BREAK_TYPE.WORDREP_BREAK_EOS, "[EOS]" },
                { WORDREP_BREAK_TYPE.WORDREP_BREAK_EOW, "[EOW]" },
            };

        // True when the severity bit of the HRESULT is set.
        private static bool Failed(HRESULT hr)
        {
            return (hr & 0x80000000u) != 0;
        }

        // Returns the marker for a break type, falling back to the enum name for unknown values
        // so that a sink callback never throws on an unexpected break type.
        private static string LabelFor(WORDREP_BREAK_TYPE type)
        {
            string label;
            if (!BreakLabels.TryGetValue(type, out label))
            {
                label = "[" + type + "]";
            }
            return label;
        }

        /// <summary>
        /// Breaks <paramref name="text"/> with the word breaker registered under
        /// <c>Japanese_Default</c> and writes two lines to standard output:
        /// "Words: ..." and "Alt Words: ...", each joined with "/".
        /// </summary>
        /// <param name="text">Text to break. Must not be null.</param>
        /// <param name="forQuery">Passed to <see cref="IWordBreaker.Init"/> as <c>fQuery</c>.</param>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
        /// <exception cref="InvalidOperationException">The registry has no usable <c>WBreakerClass</c> value.</exception>
        /// <remarks>
        /// Failures while creating or calling the word breaker are written to standard error
        /// and do not propagate, so a caller can continue with the next input.
        /// </remarks>
        public static void BreakText(string text, bool forQuery)
        {
            if (text == null)
            {
                throw new ArgumentNullException("text");
            }

            var clsid = Registry.GetValue(WordBreakerKey, @"WBreakerClass", string.Empty) as string;
            if (string.IsNullOrEmpty(clsid))
            {
                throw new InvalidOperationException(
                    "No WBreakerClass value found under " + WordBreakerKey + ".");
            }

            var wordBreakerType = Type.GetTypeFromCLSID(new Guid(clsid));
            // A newer wordbreaker shipped with MS Office 2010.
            // wordBreakerType = Type.GetTypeFromProgID("NLG.Japanese Wordbreaker.4.1");

            object instance = null;
            try
            {
                instance = Activator.CreateInstance(wordBreakerType);

                // Direct cast: a wrong interface fails here with InvalidCastException
                // instead of a later NullReferenceException.
                var wordBreaker = (IWordBreaker)instance;

                bool license;
                var initResult = wordBreaker.Init(forQuery, 4096, out license);
                if (Failed(initResult))
                {
                    Marshal.ThrowExceptionForHR(unchecked((int)initResult));
                }

                // The whole text is supplied up front, so a refill request always reports end of text.
                var filler = (FillTextBufferDelegate)((ref TEXT_SOURCE _) => HResults.WBREAK_E_END_OF_TEXT);
                var pTextSource = new TEXT_SOURCE()
                {
                    pfnFillTextBuffer = filler,
                    awcBuffer = text,
                    iCur = 0,
                    iEnd = checked((uint)text.Length),
                };

                var words = new List<string>();
                var altWords = new List<string>();
                var sink = new WordSink
                {
                    OnWord = (word, _, __) => words.Add(word),
                    OnAltWord = (word, _, __) => altWords.Add(word),
                    OnBreak = type =>
                    {
                        var label = LabelFor(type);
                        words.Add(label);
                        altWords.Add(label);
                    },
                };

                var breakResult = wordBreaker.BreakText(ref pTextSource, sink, new CPhraseSink());

                // Keep the delegate reachable until the native call has returned.
                GC.KeepAlive(filler);

                Console.WriteLine("Words: " + string.Join("/", words));
                Console.WriteLine("Alt Words: " + string.Join("/", altWords));

                // Results are still printed above; a failing HRESULT is only reported.
                if (Failed(breakResult) && breakResult != HResults.WBREAK_E_END_OF_TEXT)
                {
                    Console.Error.WriteLine("BreakText returned 0x" + breakResult.ToString("X8"));
                }
            }
            catch (Exception ex)
            {
                // Previously every failure was discarded silently; report it and let the caller go on.
                Console.Error.WriteLine("Word breaking failed for \"" + text + "\": " + ex);
            }
            finally
            {
                // Release on every path, not only after a failure.
                if (instance != null && Marshal.IsComObject(instance))
                {
                    Marshal.ReleaseComObject(instance);
                }
            }
        }

        [MTAThread]
        static void Main(string[] args)
        {
            // NOTE(review): these literals look mis-encoded (UTF-8 Japanese text read as Latin-1).
            // They are kept exactly as found; restore them from the original source before running.
            BreakText("ããªãã¥ã¢", false);
            BreakText("ãµããã¯ããªãã¥ã¢", false);
            BreakText("ãã¼ããã£ããããªãã¥ã¢", false);
            BreakText("ãã¤ã³ã³ãã¥ã¼ã¿ã¼", false);
            BreakText("æ ã·ã¹", false);
        }
    }
}
Words: ããªãã¥ã¢ Alt Words: Words: ãµãã/ã¯/ããªãã¥ã¢ Alt Words: Words: ãããã¢ããããªãã¥ã¢ Alt Words: ãã¼ããã£ããããªãã¥ã¢ Words: ãã¤ã³ã³ãã¥ã¿ Alt Words: ãã¤ã³ã³ãã¥ã¼ã¿ã¼ Words: æ /ã·ã¹ Alt Words:
ãããã« "ããªãã¥ã¢" ã§åå²ãã¦ããããã¯ããªãããã§ããï¼ã¨ãããããããï¼ã欧æå°å以å¤ã®è¤åèªãã«ã¿ã«ã表è¨ããã¨ãã¯åãã¡æ¸ããã¨ãã Microsoft のスタイルガイド ãéµå®ããã¦ããã®ãåæãªã®ãï¼ã«ã¿ã«ãã®é£ç¶ã¯ä½ãèããã«ãã£ã¤ãã¦ããã ãã®ãããªæåã«ãè¦ãã¾ããï¼ããã¾ãã¡ããã¨å®é¨ãã¦ã¾ãããï¼
ã¡ãªã¿ã«ï¼SharePoint ã«ä»å±ãã WordBreaker ã§ã¯ï¼ä»¥ä¸ã®ããã«ã¦ã¼ã¶è¾æ¸ãã¡ã¤ã«ã使ããã¨ãåºæ¥ãããã§ãï¼
4. 以ä¸ã«å¾ãããã¡ã¤ã«ãä¿åãã¾ãã
å ´æ "C:\Program Files\Microsoft Office Servers\12.0\Bin"
(æ¥æ¬èªã¯ã¼ããã¬ã¼ã« nlsdata0011.dllãåå¨ããå ´æ)ãã¡ã¤ã«å "Custom0011.lex" (0011 ã¯è¨èª ID) æåã³ã¼ã "Unicode"
ããã«ãã® nlsdata0011.dll ã¨ãããã¡ã¤ã«ã§ããï¼æå ã® Windows 7 Ja ç°å¢ã§ã¯ååã®ãã¡ã¤ã«ãã·ã¹ãã ãã£ã¬ã¯ããªã«åå¨ãã¾ãï¼è©¦ãã« %SystemRoot%\System32\Custom0011.lex (㨠%SystemRoot%\SysWOW64\Custom0011.lex) ã¨ãããã¡ã¤ã«ãä½ãï¼ä»¥ä¸ã®å 容ãå ¥åãï¼BOM ä»ã UTF-16 ãã¡ã¤ã«ã§ä¿åãã¦ã¿ã¾ãï¼
#CUSTOMER_WB æ ã·ã¹ ããªãã¥ã¢
æ¹ãã¦æåã®ã³ã¼ããå®è¡ããã¨ï¼çµæã¯ä»¥ä¸ã®ããã«ãªãã¾ããï¼
Words: ããªãã¥ã¢ Alt Words: Words: ãµãã/ã¯/ããªãã¥ã¢ Alt Words: Words: ãããã¢ããããªãã¥ã¢ Alt Words: ãã¼ããã£ããããªãã¥ã¢ Words: ãã¤ã³ã³ãã¥ã¿ Alt Words: ãã¤ã³ã³ãã¥ã¼ã¿ã¼ Words: æ ã·ã¹ Alt Words:
å°ãªãã¨ããæ
ã·ã¹ãã®æ¹ã¯ 1 word ã¨ãã¦èªèãããããã«ãªãã¾ããï¼ã¾ãï¼å®è¡ä¸ã« Custom0011.lex ãèªã¿è¾¼ã¾ãã¦ãããã¨ãï¼Process Monitor ã®ãã°ãã確ããããã¾ããï¼
ä¸æ¹ï¼ã¦ã¼ã¶è¾æ¸ã«ãããªãã¥ã¢ãã追å ãã¦ãï¼"ãã¼ããã£ãã/ããªãã¥ã¢" ã¨åå²ããã¾ããã§ããï¼ããã¯ï¼ä»¥ä¸ã® SharePoint ã§ã®äºä¾ã¨åããã®ã®ããã§ãï¼
çºç«¯ã®è©±ãï¼ãã¯ã¼ããåå²ãã¦èªèãããããªãããã®ä¸ç¨®ã ã¨æãã¾ããï¼ã©ããç¾ä¸ä»£ã® Microsoft 製 IWordBreaker å®è£ ã§ã¯ã¦ã¼ã¶è¾æ¸ã使ã£ã¦ããã®åé¡ãåé¿ã§ããªããããªæãã§ãï¼ æ¬¡ãªãæ段ã¨ãã¦ã¯ï¼èªå㧠IWordBreaker を実装 ãã¦ï¼HKLM\SYSTEM\CurrentControlSet\Control\ContentIndex\Language\Japanese_Default 以ä¸ã® WBreakerClass ãç½®ãæãã¦ãã¾ãï¼ãããã§ããããï¼è©¦ãã¦ã¯ããªãã®ã§ï¼ãã¾ããããã¯åããã¾ãããï¼ã¯ã¼ããã¬ã¼ãã³ã° (è¨å®ç®æ : ãµã¼ãã¼å®ç¾©ãã¡ã¤ã«)
ãã¡ãã¯ãã»ããã¼ã®è³æã§ã¯çç¥ãã¦ãã¾ããããæ親ä¼ã§ã質åãããã¾ããã®ã§è¨è¼ãã¦ããã¾ã (æ親ä¼å ´ã§ãåçããã¦é ãã¾ãã)ã
ä¾ãã°ããããã&ã«ããªã·ã£ã¹ãã®ãããªãã¼ã¯ã¼ããæ¤ç´¢ãããå ´åãã¤ã³ããã¯ã¹åéæã«ãéã®è¨å·(ã¢ã³ããµã³ã &)ã«ãã£ã¦ããããããã¨ãã«ããªã·ã£ã¹ãã§ãã¼ã¯ã¼ããèªåçã«åºåããã¾ããããããå ´åã«ã¯ãã«ã¹ã¿ã ãã£ã¯ã·ã§ããªã¼(Custom Dictionary) ãè¨å®ãããã¨ã§ãããããèªåãã¬ã¼ã¯ãé»æ¢ãããããã&ã«ããªã·ã£ã¹ãã§å®å ¨ãããã®æ¤ç´¢ããããªããã¨ãã§ãã¾ãã
ã«ã¹ã¿ã ãã£ã¯ã·ã§ããªã®è¨å®ãã¡ã¤ã«ãé ç½®ããå ´æã¯ãã·ã½ã¼ã©ã¹ãã¡ã¤ã«ã¨ã¯ç°ãªãã%programfiles%\Microsoft Office Servers\12\bin\CustomLANG.lex ã§ãã(æ¥æ¬èªã®å ´åã¯ãCustom0011.lex ã§ãã) è¨å®ãåæ ãããã«ã¯ãã¤ã³ããã¯ã¹ã®ååé以å¤ã«ãã¯ã¨ãªã¼æã®ãã¬ã¼ã¯ç®æãæ£ããèªèãããå¿ è¦ãããããããã¡ã¤ã«ç·¨éå¾ã¯ã Office SharePoint Server Search ãµã¼ãã¹ (osearch) ã®åèµ·åã¨ãåã¯ãã¼ã«ã®åæ¹ããããªã£ã¦ãã ããã
ã«ã¹ã¿ã ãã£ã¯ã·ã§ããªã®ä½ææ¹æ³ã«ã¤ãã¦ã¯ã以ä¸ã®è¨äºãåèã«ãªãã¾ãã
TechNet : ã¦ã¼ã¶ã¼è¾æ¸ãä½æãã (Office SharePoint Server 2007)
http://technet.microsoft.com/ja-jp/library/cc263242.aspxå®ã¯ãæ親ä¼ã§ã¯ããã¯ã¼ããã¬ã¼ã¯ãé»æ¢ãããã ã¨ããã質åã§ã¯ãªããéã« ãã¯ã¼ããåå²ãã¦èªèãããããªããã ã¨ãããã®ã§ãããç§ã¯ããã®åçã¨ãã¦ããã«ã¹ã¿ã è¾æ¸ (ä¸è¨ã® CustomLANG.lex) ãç·¨éãããã¨ã§èªèãããããå¯è½æ§ããããããããªãã ã¨ãçããã¦ãã¾ãã¾ãããããã¿ã¾ãããåä½ã確èªãã¦ã¿ãã¨ãããæ¬æ¥åå²ããã¦ããªãã¯ã¼ããåå²ãã¦èªèããããã¨ã¯ä¸å¯è½ã§ããã(ãã®äºæ¸¬ã¯èª¤ã£ã¦ããã¾ãããç³ã訳ããã¾ãã . . .)