C#で高精度なテキストファイル文字コード自動判別（2014年版）
C#（.NET Framework）に限ったことではありませんが、汎用的にテキストファイルを扱うようなアプリケーションを作っていると、よく
- 特定の文字コードのファイルしか読み出せないのでは困る
→文字コードを自動判別し、テキストの内容を取り出したい
- 読み出したファイルと同じ文字コードでファイルを書き出したい
→読み出したファイルの文字コードを知りたい
といった場面に出くわします。
ですが、C#（.NET Framework）標準のライブラリではそのような機能は提供されていないため、文字コードを判定するには、
- 自分で文字コード判定のロジックを実装する
- 出来合いの外部ライブラリ、Windows版NKF32.dll、ICU4Cなどを利用する
- IE用の文字コード判別ライブラリ（mlang.dll）を利用する ※COMコンポーネント呼び出し要
のいずれかの方法を取ることになります。
HNXgrepという自作のgrepツール（Vectorからダウンロードできます）では、自前のソースコードで文字コードを判別しています。
当初はJCode.pmのC#移植版を改良して使っていたのですが（2012年時点のソースコードはこちら）、その後、不具合改善・チューニング・機能追加を重ね、
- ANSI（ISO-8859-1拡張：欧米版Windowsのデフォルト文字コード）ファイル
- EUC補助漢字（0x8F始まりの３バイト文字コード）使用ファイル
- エスケープシーケンスを伴わない半角カナJISファイル
の検出・デコードにも対応し、またEUCやUTF8NのファイルをShiftJISと誤判別する可能性をさらに低減、検出精度を高めたバージョンとなりました。
2014年8月時点版のソースコードを以下に公開します。
■アルゴリズム概要
- ASCII制御コードのうち0x00-0x03、ならびに0x7F(DEL)が出現した場合
- 原則として非テキストファイルとみなす
- ただしファイル先頭2バイトで0x00が登場した場合は、BOMなしUTF16の可能性を調べる
- 非ASCIIコード（0x80以降）が出現しなかった場合
- JISエスケープシーケンスがあればJIS、なければASCII
- 非ASCIIコード（0x80以降）が出現した場合
- 以下４種類の文字コードに該当するか、可能性を調査する
- ANSI（CP1252）
- BOMなしUTF8（CP65001）
- EUC（CP51932、補助漢字使用時はCP20932相当）
→登場するコード範囲の関係で（必ず0x80以上となる）、上記３種はまとめてチェック
- ShiftJIS（CP932）
→２バイト目が0x20-0x7EのASCIIになることがあるので、個別にチェック
- 文字コード体系として明らかに妥当ではない場合、可能性なしとみなす
- 半角文字（半角カナ）・全角文字が連続していれば可能性が高いとみなす
- 逆に不連続であれば可能性は低いものとみなす
- 誤判別が起こりやすい箇所は個々に配点チューニング
（過去の誤判別事例などをフィードバックしています）
- 誤判別が起こりやすい箇所は個々に配点チューニング
- 全バイト走査完了後、可能性が高いものから順にテキスト取り出しを試みる
- デコードエラーが発生しなければその文字コードで確定
- 以下４種類の文字コードに該当するか、可能性を調査する
以下、ソースコード抜粋です。
ソースコード全量は、Vectorライブラリに「ReadJEnc」という名前で登録してありますので、こちらからダウンロードしてください。
（動作サンプル・利用例ソースコードも同梱してあります）
■ソースコード抜粋：文字コード判別クラス・判別メソッド本体
public class ReadJEnc
{
    ////////////////////////////////////////////////////////////////////////
    // <ReadJEnc.cs> ReadJEnc character-encoding auto-detection core (excerpt)
    // Copyright (C) 2014 hnx8(H.Takahashi)
    // http://hp.vector.co.jp/authors/VA055804/
    //
    // Released under the MIT license
    // http://opensource.org/licenses/mit-license.php
    ////////////////////////////////////////////////////////////////////////

    /// <summary>DEL control code (treated as a binary marker); also the boundary between ASCII and non-ASCII byte values.</summary>
    const byte DEL = (byte)0x7F;

    /// <summary>Largest control-code value that marks the data as non-text.</summary>
    // Tunable in roughly the 0x01-0x07 range: 0x08 (BS) shows up in terminal logs such as TeraTerm,
    // and 0x09 (TAB) is ordinary text. Around 0x03 works well; HNXgrep uses 0x03.
    const byte BINARY = (byte)0x03;

    // Encoding detection ====================================================

    /// <summary>Scans the whole byte array and auto-detects its character encoding.</summary>
    /// <param name="Bytes">Byte array to examine.</param>
    /// <param name="Length">File size (number of bytes, counted from the start of the array, to decode).</param>
    /// <param name="Text">Receives the text decoded with the detected encoding, or null when the data is not text.</param>
    /// <returns>The detected encoding, or null when the data is judged to be non-text (binary).</returns>
    public CharCode GetEncoding(byte[] Bytes, int Length, out string Text)
    {
        byte b1 = (Length > 0) ? Bytes[0] : (byte)0; // current byte; always pre-read before each loop test
        byte b2;                                      // scratch: value returned by JIS.isEscape / EUC trailing byte

        // [1] Scan the leading 7-bit range: ASCII check, locate the first non-ASCII byte,
        //     and check for BOM-less UTF-16 and JIS escape sequences on the way.
        int jisScore = 0;    // incremented each time a JIS escape sequence is seen
        int asciiEndPos = 0; // loop cursor; ends up at the first non-ASCII byte position
        while (b1 < DEL)     // leave the loop at the first byte >= DEL (b1 is already loaded)
        {
            if (b1 <= BINARY)
            {
                // Binary control code found. If it sits within the first two bytes the data may be
                // BOM-less UTF-16; otherwise the data is binary.
                CharCode ret = (asciiEndPos < 2 ? SeemsUTF16N(Bytes, Length) : null);
                if (ret != null && (Text = ret.GetString(Bytes, Length)) != null)
                {
                    // UTF-16 decode succeeded: reject the result if it contains non-text characters
                    // U+FFFD, U+FFFE, U+FFFF, U+0000..BINARY or DEL.
                    int i;
                    for (i = -3; i <= BINARY; i++)
                    {
                        // -3..-1 deliberately wrap to 0xFFFD..0xFFFF; 'unchecked' keeps that wrap
                        // even when the project is compiled with overflow checking enabled.
                        if (Text.IndexOf(unchecked((char)i), 0, Text.Length) != -1) { break; }
                    }
                    if (i > BINARY && Text.IndexOf((char)DEL, 0, Text.Length) == -1)
                    {
                        return ret; // decided: BOM-less UTF-16 (no non-text characters found)
                    }
                }
                Text = null;
                return null; // decided: binary
            }
            if (b1 == 0x1B && (b2 = JIS.isEscape(Bytes, Length, asciiEndPos)) > 0)
            {
                // JIS escape sequence: count it and advance by the value JIS.isEscape returned
                // (presumably the number of additional escape bytes - confirm against JIS.isEscape).
                jisScore++;
                asciiEndPos += b2;
            }
            // Advance to the next byte.
            if ((++asciiEndPos) >= Length)
            {
                // Whole input scanned without meeting a non-ASCII byte: JIS or plain ASCII.
                // NOTE(review): at least two escape sequences are required here; confirm the threshold.
                if (jisScore >= 2)
                {
                    return JIS.GetEncoding(Bytes, Length, out Text); // decided: JIS (exact variant resolved by JIS.GetEncoding)
                }
                if (JIS.hasSOSI(Bytes, Length))
                {
                    // SO/SI shift codes detected (excerpt).
                    if ((Text = CharCode.JIS50222.GetString(Bytes, Length)) != null)
                    {
                        return CharCode.JIS50222; // decided: JIS using only SO/SI half-width kana shifts
                    }
                }
                // decided: ASCII (or binary when decoding fails)
                return ((Text = CharCode.ASCII.GetString(Bytes, Length)) != null) ? CharCode.ASCII : null;
            }
            b1 = Bytes[asciiEndPos];
        }

        // [2] Scan the rest, which contains non-ASCII bytes: score CP1252 / UTF-8 / EUC and keep
        //     counting JIS escapes. A score of int.MinValue means "ruled out".
        int cp1252Score = 0;
        int utfScore = 0;
        int eucScore = 0;
        int sjisScore = 0;
        bool existsEUC0x8F = false; // set once an EUC supplementary kanji (0x8F lead byte) is seen
        for (int cp1252Pos = asciiEndPos; cp1252Pos < Length; ) // cp1252Pos is advanced inside the body
        {
            if (b1 == DEL)
            {
                // DEL rules out everything except a slim chance of JIS; check whether JIS survives.
                cp1252Score = int.MinValue;
                utfScore = int.MinValue;
                eucScore = int.MinValue;
                sjisScore = int.MinValue;
                // Pre-increment is required: the bounds test must apply to the position of the
                // byte that follows DEL, which is then read.
                if (jisScore == 0 || (++cp1252Pos) >= Length || (b1 = Bytes[cp1252Pos]) < 0x21 || b1 >= DEL)
                {
                    // No JIS escape so far, or DEL is the last byte, or the following byte is
                    // outside 0x21-0x7E: JIS is ruled out as well.
                    Text = null;
                    return null; // decided: binary
                }
            }

            // CP1252 check; also determines the non-ASCII run [notAsciiStart, cp1252Pos). b1 is loaded.
            int notAsciiStart = cp1252Pos;
            switch (cp1252Score)
            {
                case int.MinValue:
                    // CP1252 already ruled out: just skip over the non-ASCII run.
                    while (b1 > DEL && (++cp1252Pos) < Length) { b1 = Bytes[cp1252Pos]; }
                    break;

                default:
                    // CP1252 still possible: look for undefined code points and add CP1252 points.
                    while (b1 > DEL)
                    {
                        // Bit mask 0x2001A002 flags the bytes undefined in CP1252: 81, 8D, 8F, 90, 9D.
                        //        FEDC BA98 7654 3210         FEDC BA98 7654 3210
                        //        ---- ---- ---- ----         ---- ---- ---- ----
                        // (0x9#) 0010 0000 0000 0001  (0x8#) 1010 0000 0000 0010
                        if (b1 <= 0x9D && (0x2001A002 & (1u << (b1 % 32))) != 0)
                        {
                            // Undefined in CP1252: rule it out and fall back to plain skipping.
                            cp1252Score = int.MinValue;
                            goto case int.MinValue;
                        }
                        if ((++cp1252Pos) >= Length) { break; }
                        b1 = Bytes[cp1252Pos];
                    }

                    // End of the non-ASCII run: add points according to the run length.
                    if (cp1252Pos == notAsciiStart + 1)
                    {
                        // Single byte (CP1252 more likely than SJIS): same weight as a first SJIS
                        // kanji and higher than SJIS kana.
                        cp1252Score += 2;
                    }
                    else if (cp1252Pos == notAsciiStart + 2 && Bytes[cp1252Pos - 1] >= 0xC0)
                    {
                        // Exactly two bytes, the second one in the accented-letter range.
                        byte first = Bytes[cp1252Pos - 2];
                        byte second = Bytes[cp1252Pos - 1];
                        if (second == first)
                        {
                            cp1252Score += 5; // a doubled letter is very characteristic (beats SJIS kana)
                        }
                        else if (first >= 0xC0)
                        {
                            // If the following byte (b1) or the preceding ASCII byte looks alphabetic,
                            // weight CP1252 above SJIS kana; otherwise still above EUC.
                            if (b1 > 0x40 || (notAsciiStart > 0 && Bytes[notAsciiStart - 1] > 0x40)) { cp1252Score += 5; }
                            else { cp1252Score += 3; }
                        }
                        else
                        {
                            cp1252Score++; // otherwise a small bonus
                        }
                    }
                    else
                    {
                        cp1252Score++; // none of the above: a slightly smaller bonus
                    }
                    break;
            }

            // UTF-8 check over [notAsciiStart, cp1252Pos). Every byte in the run is known to be >= 0x80.
            if (utfScore >= 0)
            {
                bool prevIsKanji = false;
                for (int utfPos = notAsciiStart; utfPos < cp1252Pos; utfPos++)
                {
                    b1 = Bytes[utfPos];
                    // Lead byte and 2nd byte.
                    if (b1 < 0xC2 || (++utfPos) >= cp1252Pos || Bytes[utfPos] > 0xBF)
                    {
                        utfScore = int.MinValue; // UTF-8 ruled out
                        break;
                    }
                    else if (b1 < 0xE0)
                    {
                        // Valid 2-byte character (scored as a half-width character).
                        if (prevIsKanji == false) { utfScore += 6; }
                        else { utfScore += 2; prevIsKanji = false; }
                    }
                    // 3rd byte.
                    else if ((++utfPos) >= cp1252Pos || Bytes[utfPos] > 0xBF)
                    {
                        utfScore = int.MinValue; // UTF-8 ruled out
                        break;
                    }
                    else if (b1 < 0xF0)
                    {
                        // Valid 3-byte character (scored as a full-width character).
                        if (prevIsKanji == true) { utfScore += 8; }
                        else { utfScore += 4; prevIsKanji = true; }
                    }
                    // 4th byte.
                    else if ((++utfPos) >= cp1252Pos || Bytes[utfPos] > 0xBF)
                    {
                        utfScore = int.MinValue; // UTF-8 ruled out
                        break;
                    }
                    else if (b1 < 0xF5)
                    {
                        // Valid 4-byte character (scored as a full-width character).
                        if (prevIsKanji == true) { utfScore += 12; }
                        else { utfScore += 6; prevIsKanji = true; }
                    }
                    else
                    {
                        utfScore = int.MinValue; // lead bytes 0xF5 and above are undefined in UTF-8
                        break;
                    }
                }
            }

            // EUC check over [notAsciiStart, cp1252Pos).
            if (eucScore >= 0)
            {
                // Markers used to reward runs of the same character class.
                const int PREV_KANA = 1;    // previous character was half-width kana
                const int PREV_ZENKAKU = 2; // previous character was full-width
                int prevChar = 0;           // neither of the above
                for (int eucPos = notAsciiStart; eucPos < cp1252Pos; eucPos++)
                {
                    // Lead byte (0xA1-0xFE, 0x8E, 0x8F) and 2nd byte (valid range depends on the lead byte).
                    b1 = Bytes[eucPos];
                    if (b1 == 0xFF || (++eucPos) >= cp1252Pos)
                    {
                        eucScore = int.MinValue; // EUC ruled out
                        break;
                    }
                    b2 = Bytes[eucPos];
                    if (b1 >= 0xA1)
                    {
                        // Full-width lead byte: validate the 2-byte full-width character.
                        if (b2 < 0xA1 || b2 == 0xFF)
                        {
                            eucScore = int.MinValue; // EUC ruled out
                            break;
                        }
                        if (prevChar == PREV_ZENKAKU) { eucScore += 5; }
                        else { eucScore += 2; prevChar = PREV_ZENKAKU; }
                    }
                    else if (b1 == 0x8E)
                    {
                        // Kana lead byte (or an EUC-TW 4-byte character): validate 2-byte half-width kana.
                        if (b2 < 0xA1 || b2 > 0xDF)
                        {
                            eucScore = int.MinValue; // EUC ruled out
                            break;
                        }
                        // Valid half-width character (excerpt).
                        if (prevChar == PREV_KANA) { eucScore += 6; }
                        else { eucScore += 2; prevChar = PREV_KANA; }
                    }
                    else if (b1 == 0x8F
                        && b2 >= 0xA1 && b2 < 0xFF
                        && (++eucPos) < cp1252Pos
                        && (b2 = Bytes[eucPos]) >= 0xA1 && b2 < 0xFF)
                    {
                        // Remaining possibility: valid 3-byte character (full-width supplementary kanji).
                        if (prevChar == PREV_ZENKAKU) { eucScore += 8; }
                        else { eucScore += 3; prevChar = PREV_ZENKAKU; }
                        existsEUC0x8F = true; // supplementary kanji present
                    }
                    else
                    {
                        eucScore = int.MinValue; // EUC ruled out
                        break;
                    }
                }
            }

            // Skip the following ASCII run (binary check + JIS escape check); on exit b1 holds the
            // byte at the next non-ASCII position.
            while (cp1252Pos < Length && (b1 = Bytes[cp1252Pos]) < DEL)
            {
                if (b1 <= BINARY)
                {
                    Text = null;
                    return null; // decided: binary
                }
                if (b1 == 0x1B && (b2 = JIS.isEscape(Bytes, Length, cp1252Pos)) > 0)
                {
                    // JIS escape sequence: count it and step over it with THIS loop's cursor.
                    // asciiEndPos must stay at the first non-ASCII byte because the Shift_JIS
                    // scan in step [3] starts from it.
                    jisScore++;
                    cp1252Pos += b2;
                }
                cp1252Pos++;
            }
        }

        // [3] Shift_JIS check, starting at the first non-ASCII byte (skipped when already ruled
        //     out, e.g. after DEL was seen).
        if (sjisScore != int.MinValue)
        {
            sjisScore = GetEncoding(Bytes, asciiEndPos, Length);
        }

        // [4] Choose the encoding from the scores (excerpt). A candidate is accepted only when the
        //     data actually decodes with it.
        if (jisScore >= 2 && jisScore > (Length / 100000))
        {
            // JIS escape sequences occur at or above a minimum rate.
            return JIS.GetEncoding(Bytes, Length, out Text); // decided: JIS (exact variant resolved by JIS.GetEncoding)
        }
        if (eucScore > 0 && eucScore > sjisScore && eucScore > utfScore)
        {
            // EUC is the most likely candidate.
            if (cp1252Score > eucScore)
            {
                // ...but try CP1252 first when it scored even higher.
                if ((Text = CharCode.ANSI.GetString(Bytes, Length)) != null) { return CharCode.ANSI; }
            }
            if (existsEUC0x8F && (Text = CharCode.EUCH.GetString(Bytes, Length)) != null) { return CharCode.EUCH; }
            if ((Text = CharCode.EUC.GetString(Bytes, Length)) != null) { return CharCode.EUC; }
        }
        if (utfScore > 0 && utfScore >= sjisScore)
        {
            // UTF-8 is the most likely candidate.
            if ((Text = CharCode.UTF8N.GetString(Bytes, Length)) != null) { return CharCode.UTF8N; }
        }
        if (sjisScore >= 0)
        {
            // Shift_JIS is likely (but try CP1252 first when it scored higher).
            if (cp1252Score > sjisScore && (Text = CharCode.ANSI.GetString(Bytes, Length)) != null) { return CharCode.ANSI; }
            if ((Text = CharCode.SJIS.GetString(Bytes, Length)) != null) { return CharCode.SJIS; }
        }
        if (cp1252Score > 0)
        {
            // Only CP1252 is left as a possibility.
            if ((Text = CharCode.ANSI.GetString(Bytes, Length)) != null) { return CharCode.ANSI; }
        }
        // Nothing matched: treat as a binary file.
        Text = null;
        return null;
    }
BOMなしUTF16／ShiftJISの判定は別メソッドに切り出しています。
以下、そのソースコードです。
■ソースコード抜粋：BOMなしUTF16可能性判定メソッド／ShiftJIS判定メソッド
/// <summary>Guesses whether the data could be BOM-less UTF-16, based on whether its first character looks like ASCII.</summary>
/// <param name="Bytes">Byte array to examine.</param>
/// <param name="Length">File size in bytes.</param>
/// <returns>The UTF-16 encoding (big- or little-endian) that seems to apply, or null when BOM-less UTF-16 is not plausible.</returns>
public static CharCode SeemsUTF16N(byte[] Bytes, int Length)
{
    // UTF-16 data has an even, non-zero byte count.
    if (Length < 2 || Length % 2 != 0)
    {
        return null;
    }

    if (Bytes[0] == 0x00)
    {
        // 00 xx [00 ..]: an ASCII character stored high byte first.
        if (Bytes[1] > BINARY && Bytes[1] < DEL && (Length == 2 || Bytes[2] == 0))
        {
            return CharCode.UTF16BE; // possibly UTF-16 big-endian
        }
    }
    else if (Bytes[1] == 0x00)
    {
        // xx 00 [.. 00]: an ASCII character stored low byte first.
        // Length is even, so when it is not 2 it is at least 4 and Bytes[3] is inside the data.
        if (Bytes[0] > BINARY && Bytes[0] < DEL && (Length == 2 || Bytes[3] == 0))
        {
            return CharCode.UTF16LE; // possibly UTF-16 little-endian
        }
    }
    return null; // BOM-less UTF-16 is not plausible
}

/// <summary>Computes the Shift_JIS likelihood score over the range from the start position to the end of the file.</summary>
/// <param name="Bytes">Byte array to examine.</param>
/// <param name="pos">Start position (where a non-ASCII byte first appears).</param>
/// <param name="Length">File size (number of bytes, counted from the start of the array, to decode).</param>
/// <returns>The score; int.MinValue when the data cannot be Shift_JIS (CP932).</returns>
protected override int GetEncoding(byte[] Bytes, int pos, int Length)
{
    // Nothing to scan: return the neutral score the loop below would produce, instead of
    // reading Bytes[pos] outside the range to examine.
    if (pos >= Length)
    {
        return 0;
    }

    // Markers used to reward runs of the same character class.
    const int PREV_KANA = 1;    // previous character was half-width kana
    const int PREV_ZENKAKU = 2; // previous character was full-width

    int score = 0; // scoring starts from zero
    byte b1 = Bytes[pos];
    byte b2;
    while (pos < Length)
    {
        int prevChar = 0; // reset for each non-ASCII run: neither KANA nor ZENKAKU
        while (b1 > DEL)
        {
            if (b1 >= 0xA1 && b1 <= 0xDF)
            {
                // 1-byte half-width kana: valid. Consecutive kana score higher than in EUC or CP1252.
                if (prevChar == PREV_KANA) { score += 3; }
                else { score += 1; prevChar = PREV_KANA; }
            }
            // Bit masks (0x00000061, 0xE0009800) flag lead bytes that CP932 cannot decode:
            //        FEDC BA98 7654 3210         FEDC BA98 7654 3210
            //        ---- ---- ---- ----         ---- ---- ---- ----
            // (0x9#) 0000 0000 0000 0000  (0x8#) 0000 0000 0110 0001 - 80 (bit also matches A0): undefined; 85,86: unused
            //                                    (used by shift_jis2004 etc., but not decodable as CP932)
            // (0xF#) 1110 0000 0000 0000  (0xE#) 1001 1000 0000 0000 - FD,FE,FF: undefined; EB,EC,EF: unused
            //                                    (F0-F9 user-defined characters are allowed; set those bits to reject them)
            else if (((b1 < 0xE0 ? 0x00000061 : 0xE0009800) & (1u << (b1 % 32))) != 0
                || (++pos) >= Length
                || (b2 = Bytes[pos]) < 0x40
                || b2 > 0xFC)
            {
                // Lead byte undefined/unused in SJIS, or no 2nd byte available, or 2nd byte out of range.
                return int.MinValue; // Shift_JIS ruled out
            }
            else
            {
                // Full-width character (scored a little lower than in EUC).
                if (prevChar == PREV_ZENKAKU)
                {
                    score += 4;
                }
                else
                {
                    // A run that starts with a lead byte above 0x98 is more likely
                    // UTF-8 / EUC / CP1252, so it earns no points.
                    score += (b1 > 0x98 ? 0 : 2);
                    prevChar = PREV_ZENKAKU;
                }
            }
            // Next character.
            if ((++pos) >= Length) { break; }
            b1 = Bytes[pos];
        }
        // Skip the single-byte (<= DEL) range.
        while (b1 <= DEL && (++pos) < Length) { b1 = Bytes[pos]; }
    }
    return score;
}
文字コードの種類については別クラスにて定義しています。
実際の文字コード判別処理にあたっては、ファイル内容を全バイト走査する前に、先頭バイト列(ByteOrderMark)からファイルの文字コード種類を識別できるかどうかをチェックします。
BOMつきであればUnicodeと判定します。
以下、文字コード定義クラスとBOMつきUTF判別メソッドです。
各ファイル種類を定義するにあたり、先頭バイトデータの情報を持たせるようにしています。
BOMはEncoding.GetPreamble()を利用して把握します。
またテキスト取り出し時にはBOMバイトをデコード範囲から除外します。
■ソースコード抜粋：文字コード定義クラス・およびBOMつきUTF判別メソッド
public class CharCode
{
    ////////////////////////////////////////////////////////////////////////
    // <CharCode.cs> ReadJEnc character-encoding kind definitions (excerpt)
    // Copyright (C) 2014 hnx8(H.Takahashi)
    // http://hp.vector.co.jp/authors/VA055804/
    //
    // Released under the MIT license
    // http://opensource.org/licenses/mit-license.php
    ////////////////////////////////////////////////////////////////////////

    // Unicode encodings
    /// <summary>UTF-8 (with BOM)</summary>
    public static readonly Text UTF8 = new Text("UTF-8", new UTF8Encoding(true, true)); //BOM : 0xEF, 0xBB, 0xBF
    /// <summary>UTF-32 (with BOM, little-endian)</summary>
    public static readonly Text UTF32 = new Text("UTF-32", new UTF32Encoding(false, true, true)); //BOM : 0xFF, 0xFE, 0x00, 0x00
    /// <summary>UTF-32 (with BOM, big-endian)</summary>
    public static readonly Text UTF32B = new Text("UTF-32B", new UTF32Encoding(true, true, true)); //BOM : 0x00, 0x00, 0xFE, 0xFF
    /// <summary>UTF-16 (with BOM, little-endian)</summary><remarks>The standard "Unicode" on Windows</remarks>
    public static readonly Text UTF16 = new Text("UTF-16", new UnicodeEncoding(false, true, true)); //BOM : 0xFF, 0xFE
    /// <summary>UTF-16 (with BOM, big-endian)</summary>
    public static readonly Text UTF16B = new Text("UTF-16B", new UnicodeEncoding(true, true, true)); //BOM : 0xFE, 0xFF
    /// <summary>UTF-16 (no BOM, little-endian)</summary>
    public static readonly Text UTF16LE = new Text("UTF-16LE", new UnicodeEncoding(false, false, true));
    /// <summary>UTF-16 (no BOM, big-endian)</summary>
    public static readonly Text UTF16BE = new Text("UTF-16BE", new UnicodeEncoding(true, false, true));
    /// <summary>UTF-8 (no BOM)</summary>
    public static readonly Text UTF8N = new Text("UTF-8N", new UTF8Encoding(false, true));

    // Non-Unicode encodings
    /// <summary>ASCII</summary><remarks>Decoding reuses the UTF8Encoding instance of UTF8N</remarks>
    public static readonly Text ASCII = new Text("ASCII", UTF8N.Encoding);
    /// <summary>1252 ISO8859 Western European languages</summary>
    public static readonly Text ANSI = new Text("ANSI欧米", 1252);
    /// <summary>50221 iso-2022-jp Japanese (JIS-Allow 1 byte Kana), Microsoft variant</summary>
    public static readonly Text JIS = new Text("JIS50221", 50221);
    /// <summary>50222 iso-2022-jp Japanese (JIS-Allow 1 byte Kana - SO/SI), Microsoft variant</summary><remarks>Files that use only SO/SI kana shifts are also treated as CP50222</remarks>
    public static readonly Text JIS50222 = new Text("JIS50222", 50222);
    /// <summary>EUC with supplementary kanji (0x8F); decoded forcibly by way of MS CP20932</summary><remarks>Beware: encoding with this corrupts the file</remarks>
    // The display name was stored as mis-decoded UTF-8 bytes; restored to the intended Japanese text.
    public static readonly Text EUCH = new EucH("EUC補漢");
    /// <summary>51932 euc-jp Japanese (EUC), Microsoft variant</summary>
    public static readonly Text EUC = new Text("EUCJP", 51932);
    /// <summary>932 shift_jis Japanese (Shift-JIS), Microsoft-specific</summary>
    public static readonly Text SJIS = new Text("ShiftJIS", 932);

    // Encoding (file kind) detection

    /// <summary>Identifies the encoding of a UTF file that starts with a BOM.</summary>
    /// <param name="Bytes">Byte array to examine.</param>
    /// <param name="Read">Number of bytes already read into the array.</param>
    /// <returns>The encoding identified from the BOM, or null when no BOM matches.</returns>
    public static CharCode GetPreamble(byte[] Bytes, int Read)
    {
        // UTF32 (FF FE 00 00) is tested before UTF16 (FF FE), whose BOM is a prefix of it.
        return GetPreamble(Bytes, Read, UTF8, UTF32, UTF32B, UTF16, UTF16B);
    }

    /// <summary>Returns the first candidate whose leading-byte signature matches the start of the data.</summary>
    /// <param name="Bytes">Byte array to examine.</param>
    /// <param name="Read">Number of bytes already read into the array.</param>
    /// <param name="Array">Candidates, tested in the given order.</param>
    /// <returns>The first matching candidate, or null when none matches.</returns>
    protected static CharCode GetPreamble(byte[] Bytes, int Read, params CharCode[] Array)
    {
        foreach (CharCode c in Array)
        {
            byte[] bom = c.Bytes;
            // Candidates without a signature, or with one longer than the data read, cannot match.
            if (bom == null || Read < bom.Length) { continue; }

            // Compare the BOM / magic number from its last byte backwards.
            int i = bom.Length;
            while (i > 0 && Bytes[i - 1] == bom[i - 1]) { i--; }
            if (i == 0) { return c; } // every signature byte matched
        }
        return null; // file kind could not be determined
    }

    #region Text base class definition (excerpt) ----------------------------
    /// <summary>Encoding kind: text</summary>
    public class Text : CharCode
    {
        /// <summary>Defines a kind from an Encoding instance; its preamble becomes the identifying BOM.</summary>
        internal Text(string Name, Encoding Encoding)
            : base(Name, Encoding, Encoding.GetPreamble()) { }

        /// <summary>Defines a kind from a code page number, with exception fallbacks and no BOM.</summary>
        internal Text(string Name, int enc)
            : base(Name, System.Text.Encoding.GetEncoding(enc, EncoderFallback.ExceptionFallback, DecoderFallback.ExceptionFallback), null) { }
    }

    /// <summary>Name of the file encoding kind</summary>
    public readonly string Name;
    /// <summary>Leading identification bytes (BOM / magic number)</summary>
    protected readonly byte[] Bytes = null;
    /// <summary>Encoding used for decoding</summary>
    private Encoding Encoding;

    /// <summary>Base constructor</summary>
    protected CharCode(string Name, Encoding Encoding, byte[] Bytes)
    {
        this.Name = Name;
        this.Encoding = Encoding;
        this.Bytes = Bytes;
    }

    /// <summary>Decodes a string from the given byte array; returns null on failure.</summary>
    /// <param name="Bytes">Byte array to decode (starting with this kind's BOM, if it has one).</param>
    /// <param name="Length">Number of bytes, counted from the start of the array, to decode.</param>
    /// <returns>The decoded text with the BOM excluded, or null when decoding fails.</returns>
    public virtual string GetString(byte[] Bytes, int Length)
    {
        // Determine the BOM size and decode only the part that follows it.
        int bomBytes = (this.Bytes == null ? 0 : this.Bytes.Length);
        if (Length < bomBytes)
        {
            // Data shorter than the BOM cannot be in this encoding; report failure rather than
            // letting Encoding.GetString throw on a negative count.
            return null;
        }
        try
        {
            return Encoding.GetString(Bytes, bomBytes, Length - bomBytes);
        }
        catch (DecoderFallbackException)
        {
            // Decoding failed (e.g. the data contains bytes with no mapping).
            return null;
        }
    }
    #endregion
}
※ここまでに掲載したソースコードは、文字コード判定ロジックコア部分の抜粋です。
ソースコード全量は、Vectorライブラリ 「 ReadJEnc 」からダウンロードしてください。
また、この自動判別ソースコードを使用したgrepツール 「 HNXgrep 」についても、Vectorライブラリに登録されています。よろしければお試しください。
記事からは割愛しましたが、ReadJEncには
- ファイル読み出し機能提供クラス
- 非テキストファイルの種類（画像ファイル・圧縮ファイル等）を判別する機能
- EUC補助漢字(0x8Fから始まる3バイトの文字コード)のデコード機能
- 日本語以外の文字コード（中国語繁体字・中国語簡体字・ハングル）についての文字コード判別機能
なども備わっています。
(2015.01.20追記)
テキストファイルからの読み出し・文字コード判別については、Vectorからのダウンロードzipファイル内にサンプルアプリケーション（およびそのソースコード）を同梱していますので、そちらを参照してください。
また、@ITの.NET TIPSにて、Webページをbyte配列で取得し文字コードを判別（推定）するやり方が紹介されています。
ReadJEncを使って文字エンコーディングを推定するには?[C#、VB]:.NET TIPS - @IT
なおJIS判別に関するメソッドやEUCのデコード処理については、C#で利用できるEncoding(CP5022x/CP20932)の仕様が特異であること等、やや厄介な事情があり、この記事では掲載・説明を割愛します。
後日補足記事が書けたらいいなとは思っておりますが、書ける見込みが立ちそうにありません。。。
ライセンスはソースコード記載のとおりMITライセンスです。
ご意見ご感想、バグ情報のフィードバックなどは、こちらの記事のコメントへお寄せください。