HNXgrepのC#による文字コード判定
※2014.08.24追記
　この記事の文字コード判別ソースコードは、2012年時点の古いバージョンのものです。
　最新バージョンの文字コード判別は、「C#で高精度なテキストファイル文字コード自動判別(2014年版) - hnx8 開発室」の記事を参照ください。
C#(.net Framework)には、テキストファイルの文字コード（エンコーディング）を自動判別して読み込むような機能、JavaのJISAutoDetectに相当する機能は用意されていません。
なので、読み込むテキストファイルの文字コードは自分で判定する、あるいはそういう機能をもつ外部dllを利用する必要があります。
拙作ソフトHNXgrep http://www.vector.co.jp/soft/winnt/util/se494966.html では、独自実装のソースコードで文字コードの判定を行っています。
ASCII,JIS、EUC,ShiftJIS,UTF8、および(半角英数文字のみで書かれている)UTF16Nの判定が出来ます。
基本的には、DOBON.NET様のサイトで公開されているJcode.pmのC#移植版 http://dobon.net/vb/dotnet/string/detectcode.html の考え方をベースに、
- UTF16(BOMなし半角英数のみ)ファイルの検出機能を追加
- EUC/SJISの判別を強化するため、半角カナ文字の連続／全角文字の連続についてはポイント評価を高めに補正(※)
といった独自対応を行っています。
※テキストファイルの内容によっては、文字コード的にShiftJISとしてもEUCとしても妥当である、またUTF8としても妥当であるような場合があり得ます。そのため、どの文字コードでデコードするのがより適切なのかを決定する基準として、全角半角文字が連続していることを重要視してみました。（ただしポイント配点はざっくり適当に決めています）
2012.02.25時点の文字コード判別ソースコードを以下に公開します。
2014.05.13ï¼ESC(JIS)å¤å®ã®ï¼ãã¤ãç®ä»¥éææ¡ã®ãã°ãä¿®æ£ãã¾ãããé»ãç«ããããããã¨ããããã¾ããï¼
バグ・ロジック上の問題点などを見つけた方は、ぜひぜひご指摘ください。
/// <summary>
/// Heuristic detector for the character encoding of text data that carries no BOM.
/// Recognizes ASCII, JIS (ISO-2022-JP escape sequences), EUC, Shift_JIS, UTF-8 and
/// UTF-16 consisting solely of ASCII-range characters.
/// </summary>
class CharCodeDetector
{
    /// <summary>
    /// Kinds of character encoding the detector can report.
    /// Declared public because the public detection method returns it.
    /// </summary>
    public enum CharCode
    {
        ASCII,
        BINARY,
        EUC,
        JIS,
        SJIS,
        UTF16BE,
        UTF16LE,
        UTF8N
    }

    /// <summary>
    /// Guesses the encoding of the bytes that were read from a file.
    /// </summary>
    /// <param name="data">Byte data read from the file.</param>
    /// <param name="datasize">Number of valid bytes in <paramref name="data"/> (may be smaller than the array length).</param>
    /// <returns>The detected kind of encoding; <see cref="CharCode.BINARY"/> when the data does not look like text.</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="data"/> is null.</exception>
    /// <exception cref="System.ArgumentOutOfRangeException"><paramref name="datasize"/> is negative or exceeds the array length.</exception>
    public CharCode detectCharCode(byte[] data, int datasize)
    {
        if (data == null)
        {
            throw new System.ArgumentNullException("data");
        }
        if (datasize < 0 || datasize > data.Length)
        {
            throw new System.ArgumentOutOfRangeException("datasize");
        }

        // Empty input contains no non-ASCII byte; without this guard the UTF-16 pre-check
        // below would accept it vacuously (zero loop iterations).
        if (datasize == 0)
        {
            return CharCode.ASCII;
        }

        // BOM-less UTF-16 can only be recognized when every character is in the ASCII range:
        // one byte of each pair is then 0x00. Big endian is tried first.
        if (datasize % 2 == 0)
        {
            if (data[0] == 0x00)
            {
                return IsAsciiOnlyUtf16(data, datasize, 0) ? CharCode.UTF16BE : CharCode.BINARY;
            }
            if (data[1] == 0x00)
            {
                return IsAsciiOnlyUtf16(data, datasize, 1) ? CharCode.UTF16LE : CharCode.BINARY;
            }
        }

        // Pass 1: walk the leading ASCII run, counting JIS escape sequences on the way.
        int jisCount;
        int notAsciiPos = ScanAsciiAndJisEscapes(data, datasize, out jisCount);
        if (notAsciiPos == datasize)
        {
            // Nothing but 7-bit bytes: the presence of escapes decides between JIS and ASCII.
            return (jisCount > 0) ? CharCode.JIS : CharCode.ASCII;
        }

        // Pass 2: score the remainder as UTF-8; this pass also performs the binary check.
        int utfCount;
        if (!TryScoreUtf8(data, notAsciiPos, datasize, out utfCount))
        {
            return CharCode.BINARY;
        }

        // Pass 3 and 4: score the same remainder as Shift_JIS and as EUC.
        int sjisCount = ScoreSjis(data, notAsciiPos, datasize);
        int eucCount = ScoreEuc(data, notAsciiPos, datasize);

        // The highest score wins; ties fall through to the later candidates.
        if (eucCount > sjisCount && eucCount > utfCount)
        {
            return CharCode.EUC;
        }
        if (utfCount > sjisCount)
        {
            return CharCode.UTF8N;
        }
        if (sjisCount > -1)
        {
            return CharCode.SJIS;
        }
        return CharCode.BINARY;
    }

    /// <summary>Returns the byte at <paramref name="index"/>, or 0 when the index is at or past <paramref name="datasize"/>.</summary>
    private static byte ByteAt(byte[] data, int datasize, int index)
    {
        return (index < datasize) ? data[index] : (byte)0;
    }

    /// <summary>True when the byte is in the UTF-8 continuation range 0x80..0xBF.</summary>
    private static bool IsUtf8Trail(byte b)
    {
        return b >= 0x80 && b <= 0xbf;
    }

    /// <summary>
    /// Checks that every 2-byte unit is 0x00 plus a byte in 0x06..0x7E.
    /// <paramref name="zeroOffset"/> is the position of the 0x00 byte inside each unit
    /// (0 = big endian, 1 = little endian). The caller guarantees an even, non-zero size.
    /// </summary>
    private static bool IsAsciiOnlyUtf16(byte[] data, int datasize, int zeroOffset)
    {
        for (int i = 0; i < datasize; i += 2)
        {
            byte zero = data[i + zeroOffset];
            byte ch = data[i + 1 - zeroOffset];
            if (zero != 0x00 || ch < 0x06 || ch >= 0x7f)
            {
                return false;
            }
        }
        return true;
    }

    /// <summary>
    /// Returns how many bytes after ESC belong to a recognized JIS escape sequence
    /// (given the three bytes that follow ESC), or 0 when the bytes are not such a sequence.
    /// </summary>
    private static int JisEscapeLength(byte b2, byte b3, byte b4)
    {
        if (b2 == 0x24)
        {
            // ESC $ @ , ESC $ B
            if (b3 == 0x40 || b3 == 0x42)
            {
                return 2;
            }
            // ESC $ ( D , ESC $ ( O , ESC $ ( Q , ESC $ ( P
            if (b3 == 0x28 && (b4 == 0x44 || b4 == 0x4F || b4 == 0x51 || b4 == 0x50))
            {
                return 3;
            }
        }
        else if (b2 == 0x26)
        {
            // ESC & @
            if (b3 == 0x40)
            {
                return 2;
            }
        }
        else if (b2 == 0x28)
        {
            // ESC ( J , ESC ( I , ESC ( B
            if (b3 == 0x4A || b3 == 0x49 || b3 == 0x42)
            {
                return 2;
            }
        }
        return 0;
    }

    /// <summary>
    /// Advances over the leading bytes in 0x03..0x7E and counts JIS escape sequences.
    /// Returns the index of the first byte outside that range, or <paramref name="datasize"/>
    /// when every byte is inside it.
    /// </summary>
    private static int ScanAsciiAndJisEscapes(byte[] data, int datasize, out int jisCount)
    {
        jisCount = 0;
        int pos = 0;
        while (pos < datasize)
        {
            byte b1 = data[pos];
            if (b1 < 0x03 || b1 >= 0x7f)
            {
                // Non-ASCII byte (UTF-8, Shift_JIS, ...) found: the scoring passes take over.
                break;
            }
            if (b1 == 0x1b)
            {
                int escLen = JisEscapeLength(
                    ByteAt(data, datasize, pos + 1),
                    ByteAt(data, datasize, pos + 2),
                    ByteAt(data, datasize, pos + 3));
                if (escLen > 0)
                {
                    jisCount++;
                    pos += escLen;
                }
            }
            pos++;
        }
        return pos;
    }

    /// <summary>
    /// Scores the data from <paramref name="start"/> as UTF-8. Returns false when a byte that
    /// marks the data as binary (below 0x03, 0x7F or 0xFF) is met. <paramref name="score"/> is -1
    /// when the data cannot be UTF-8; otherwise runs of multi-byte characters earn bonus points.
    /// </summary>
    private static bool TryScoreUtf8(byte[] data, int start, int datasize, out int score)
    {
        score = 0;
        bool prevIsKanji = false; // starts fresh so this pass does not depend on any other pass
        int pos = start;
        while (pos < datasize)
        {
            byte b1 = data[pos];
            pos++;
            if (b1 < 0x03 || b1 == 0x7f || b1 == 0xff)
            {
                return false;
            }
            if (b1 < 0x80 || score < 0)
            {
                // Single-byte characters are not scored; once UTF-8 is ruled out only the
                // binary check above keeps running.
                continue;
            }

            byte b2 = ByteAt(data, datasize, pos);
            if (b1 < 0xC2 || b1 >= 0xf5)
            {
                // 0x80..0xC1 and 0xF5.. never start a UTF-8 sequence.
                score = -1;
            }
            else if (b1 < 0xe0)
            {
                // 2-byte sequence, scored like a half-width character.
                if (IsUtf8Trail(b2))
                {
                    score += prevIsKanji ? 1 : 2;
                    prevIsKanji = false;
                    pos++;
                }
                else
                {
                    score = -1;
                }
            }
            else if (b1 < 0xf0)
            {
                // 3-byte sequence, scored like a full-width character.
                byte b3 = ByteAt(data, datasize, pos + 1);
                if (IsUtf8Trail(b2) && IsUtf8Trail(b3))
                {
                    score += prevIsKanji ? 4 : 3;
                    prevIsKanji = true;
                    pos += 2;
                }
                else
                {
                    score = -1;
                }
            }
            else
            {
                // 4-byte sequence, scored like a full-width character.
                byte b3 = ByteAt(data, datasize, pos + 1);
                byte b4 = ByteAt(data, datasize, pos + 2);
                if (IsUtf8Trail(b2) && IsUtf8Trail(b3) && IsUtf8Trail(b4))
                {
                    score += prevIsKanji ? 6 : 4;
                    prevIsKanji = true;
                    pos += 3;
                }
                else
                {
                    score = -1;
                }
            }
        }
        return true;
    }

    /// <summary>
    /// Scores the data from <paramref name="start"/> as Shift_JIS; -1 means it cannot be Shift_JIS.
    /// Consecutive double-byte characters and consecutive half-width kana earn extra points.
    /// </summary>
    private static int ScoreSjis(byte[] data, int start, int datasize)
    {
        int score = 0;
        bool prevIsKanji = false;
        int pos = start;
        while (score >= 0 && pos < datasize)
        {
            byte b1 = data[pos];
            pos++;
            if (b1 < 0x80)
            {
                continue; // single-byte characters are not scored
            }
            if (b1 == 0x80 || b1 == 0xA0 || b1 >= 0xFD)
            {
                // Not a valid Shift_JIS byte.
                score = -1;
            }
            else if ((b1 > 0x80 && b1 < 0xA0) || b1 > 0xDF)
            {
                // Lead byte of a double-byte character: validate the trail byte.
                byte b2 = ByteAt(data, datasize, pos);
                if (b2 < 0x40 || b2 == 0x7f || b2 > 0xFC)
                {
                    score = -1;
                }
                else
                {
                    score += prevIsKanji ? 2 : 1;
                    prevIsKanji = true;
                    pos++;
                }
            }
            else if (!prevIsKanji)
            {
                // Half-width kana following a non-kanji character.
                score += 1;
            }
            else
            {
                // Half-width kana right after a double-byte character: no point, just switch state.
                prevIsKanji = false;
            }
        }
        return score;
    }

    /// <summary>
    /// Scores the data from <paramref name="start"/> as EUC; -1 means it cannot be EUC.
    /// </summary>
    private static int ScoreEuc(byte[] data, int start, int datasize)
    {
        int score = 0;
        bool prevIsKanji = false;
        int pos = start;
        while (score >= 0 && pos < datasize)
        {
            byte b1 = data[pos];
            pos++;
            if (b1 < 0x80)
            {
                continue; // single-byte characters are not scored
            }

            byte b2 = ByteAt(data, datasize, pos);
            if (b1 == 0x8e)
            {
                // 0x8E introduces a 2-byte half-width kana.
                if (b2 < 0xA1 || b2 > 0xdf)
                {
                    score = -1;
                }
                else
                {
                    score += prevIsKanji ? 1 : 2;
                    prevIsKanji = false;
                    pos++;
                }
            }
            else if (b1 == 0x8f)
            {
                // 0x8F introduces a 3-byte character. A missing third byte reads as 0 and is
                // rejected, the same way the other passes treat a sequence cut off at the end.
                byte b3 = ByteAt(data, datasize, pos + 1);
                if (b2 < 0xa1 || b3 < 0xa1)
                {
                    score = -1;
                }
                else
                {
                    score += prevIsKanji ? 3 : 1;
                    prevIsKanji = true;
                    pos += 2;
                }
            }
            else if (b1 < 0xa1 || b2 < 0xa1)
            {
                // Expected a 2-byte character but one of the bytes is out of range.
                score = -1;
            }
            else
            {
                // Valid 2-byte (full-width) character.
                score += prevIsKanji ? 2 : 1;
                prevIsKanji = true;
                pos++;
            }
        }
        return score;
    }
}
使い方：
- 文字コード判別したいテキストファイルの内容を、byte配列に読み込む
- 読み込んだファイルの内容にBOMが付いていないか確認し、BOMなしであればdetectCharCode()を呼び出す。引数にはbyte配列とファイル長を指定する。（byte配列長＞ファイル長でもOKです）
- 判定結果に応じたEncodingクラスのオブジェクトを用意し、encoding.getString(data, 変換開始位置, 変換するバイト数)でbyte配列内容をString文字列に変換する
BOM(Byte Order Mark)つきのテキストファイルは、先頭数バイトの内容から文字コードが決まります。（getStringで文字列へと変換する際には、BOM部分を変換対象に含めないよう、注意してください）
BOM判定用のコード(抜粋)を以下に公開しておきます。enumは適宜追加してください。
/// <summary>
/// Determines the character encoding from a byte order mark (BOM) at the start of the data.
/// </summary>
/// <remarks>
/// The CharCode members used here (UTF32, UTF32B, UTF16, UTF16B, UTF8, Unknown) are not among
/// the members of the CharCode enum declared in CharCodeDetector and have to be added to it.
/// </remarks>
/// <param name="data">Byte data read from the file.</param>
/// <param name="datasize">Number of valid bytes in <paramref name="data"/>.</param>
/// <returns>The kind of encoding identified by the BOM, or CharCode.Unknown when no BOM is found.</returns>
public CharCode detectCharCodeWithBomHeader(byte[] data, int datasize)
{
    // Leading bytes; positions beyond the data read as 0, except the fourth byte, which reads
    // as 1 so that a buffer shorter than four bytes starting with FF FE (00) can never satisfy
    // the UTF-32 little-endian test FF FE 00 00 below.
    byte b1 = (datasize > 0) ? data[0] : (byte)0;
    byte b2 = (datasize > 1) ? data[1] : (byte)0;
    byte b3 = (datasize > 2) ? data[2] : (byte)0;
    byte b4 = (datasize > 3) ? data[3] : (byte)1;

    // The UTF-32 tests come first: the UTF-32 LE BOM begins with the UTF-16 LE BOM (FF FE).
    if (b1 == 0xFF && b2 == 0xFE && b3 == 0x00 && b4 == 0x00)
    {
        // BOM for UTF-32 (little endian)
        return CharCode.UTF32;
    }
    if (b1 == 0x00 && b2 == 0x00 && b3 == 0xFE && b4 == 0xFF)
    {
        // BOM for UTF-32 (big endian)
        return CharCode.UTF32B;
    }
    if (b1 == 0xff && b2 == 0xfe)
    {
        // BOM for UTF-16 little endian (the Windows "Unicode" default)
        return CharCode.UTF16;
    }
    if (b1 == 0xfe && b2 == 0xff)
    {
        // BOM for UTF-16 big endian
        return CharCode.UTF16B;
    }
    if (b1 == 0xef && b2 == 0xbb && b3 == 0xbf)
    {
        // BOM for UTF-8
        return CharCode.UTF8;
    }

    // No BOM. (The original spelled the type "Charcode", which does not exist: C# is case-sensitive.)
    return CharCode.Unknown;
}
以上、あまり綺麗なソースコードではありませんが、ShiftJIS判定のバグが見つかったこと（Nabe様、報告ありがとうございました）もあり、公開してみることにしました。
「C# 文字コード判定」といったキーワードでこのblogにたどり着いた方もいらっしゃるようなので、参考になれば幸いです。
参考にしたサイト：
DOBON.NET 文字コードを判別する
http://dobon.net/vb/dotnet/string/detectcode.html
雅階凡の C# プログラミング C#2008 文字コードの判定
http://www.geocities.jp/gakaibon/tips/csharp2008/charset-check.html