Windows の日本語環境では、テキストのエンコーディングとして UTF-8 と MS932 の2つが今でも主流です。しかし、両者の純粋なマッピングテーブルはあまり見かけない気がしました。つまり、Shift_JIS に加えてベンダー拡張文字まで含めてマッピングしているものが、意外と見当たらないなと。あったら便利そうだと思い、じゃあ作ってみるかということで作成してみました。学術的な定義というより、Windows の実情に合わせた実務寄りのテーブルとしてまとめています。
用途として
ここに収録している文字コードは、実際にテキストファイルに現れることを想定した文字のみです。
そのため、この表にある MS932 のコード列だけで構成されたファイルは、実質的に「純粋な MS932 テキスト」と言えます。
また、この表にある UTF-8 のコード列のみ(+ UTF-8 BOM)で構成されたファイルは、確実に MS932 に変換可能な UTF-8 テキストであると言えます。
● ユーザー定義外字は未収録
MS932 なのにテーブルに存在しない文字が出てくる場合、それは **ユーザー定義外字(F040–F9FC)** である可能性が高いです。
これは PC 環境に依存する文字で、別の PC では表示できなくなることもあります。また、Unicode への正しいマッピングも存在しません。
そのため、本テーブルには収録していません。
いずれにせよ、この種の文字への対応は現場の判断に委ねられるケースが多いと思います。
● ワンチャン今でもありそうなコントロールコード
コントロールコード(0x00–0x1F, 0x7F)で、実質的に現在でも使われるものは TAB / LF / CR くらいだと思います。
ただし、MS-DOS 時代の名残のようなものまで含めるなら、例えば次のようなものが残っている可能性もあります。
・テキストファイルの終端を表す EOF(0x1A)
・文字の色付けやテキストアニメーションなどで使われたエスケープシーケンスの ESC(0x1B)
……あたりは、環境によっては今でも見かけるかもしれません。
使用例
マッピングテーブルのリソースファイルは <src_root>/tests/charset_detectors/res/ms932_utf8_mapping_table.txt に配置しています。
判定のみ_ver:
package tests.charset_detectors;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Decides whether a byte array is made up exclusively of code sequences listed
 * in the mapping table resource {@code res/ms932_utf8_mapping_table.txt}.
 *
 * <p>Each table row holds hex-encoded columns: column 0 is used as the MS932
 * code and column 1 as the UTF-8 code. Column-0 codes whose first byte is below
 * {@code 0x80} additionally form the US-ASCII set.
 *
 * <p>Every code set is stored as a byte-wise state machine whose final
 * transition of each code leads back to the root state, so "the input is valid"
 * means "after consuming all bytes we are back at the root".
 */
public class CharsetDetector {
    private static final String MAPPING_TABLE_RESOURCE = "res/ms932_utf8_mapping_table.txt";

    /** Minimum number of columns a table row must provide (MS932 code, UTF-8 code). */
    private static final int REQUIRED_COLUMNS = 2;

    private CodeSequence usAscii;
    private CodeSequence ms932;
    private CodeSequence utf8;

    /**
     * Loads the mapping table from the classpath and builds the three state machines.
     *
     * @throws RuntimeException if the resource is missing or cannot be parsed
     */
    public CharsetDetector() {
        loadCodeSequences();
    }

    /**
     * @param fileData bytes to inspect
     * @return {@code true} if {@code fileData} is a concatenation of single-byte table
     *         codes below {@code 0x80} (an empty array qualifies); {@code false}
     *         otherwise, including for {@code null}
     */
    public boolean isUSAscii(byte[] fileData) {
        return check(usAscii, fileData);
    }

    /**
     * @param fileData bytes to inspect
     * @return {@code true} if {@code fileData} is a concatenation of complete MS932 codes
     *         from the table (an empty array qualifies); {@code false} otherwise,
     *         including for {@code null}
     */
    public boolean isMS932(byte[] fileData) {
        return check(ms932, fileData);
    }

    /**
     * A leading UTF-8 BOM (EF BB BF) is skipped before checking.
     *
     * @param fileData bytes to inspect
     * @return {@code true} if the remaining bytes are a concatenation of complete UTF-8
     *         codes from the table (an empty array qualifies); {@code false} otherwise,
     *         including for {@code null}
     */
    public boolean isUTF8(byte[] fileData) {
        return check(utf8, removeUTF8BomIfNeeded(fileData));
    }

    /** Returns a copy without the leading UTF-8 BOM, or the input itself when there is none. */
    private byte[] removeUTF8BomIfNeeded(byte[] fileData) {
        boolean hasBom =
                fileData != null
                        && fileData.length >= 3
                        && (fileData[0] & 0xff) == 0xEF
                        && (fileData[1] & 0xff) == 0xBB
                        && (fileData[2] & 0xff) == 0xBF;
        if (!hasBom) {
            return fileData;
        }
        // Copying is not the cheapest option, but keeps check() free of offset handling.
        return Arrays.copyOfRange(fileData, 3, fileData.length);
    }

    /**
     * Runs {@code fileData} through the state machine rooted at {@code root}.
     *
     * @return {@code true} only if every byte has a transition and the walk ends on the root
     */
    private boolean check(CodeSequence root, byte[] fileData) {
        if (fileData == null) {
            return false;
        }
        CodeSequence state = root;
        for (byte b : fileData) {
            state = state.nexts[b & 0xff];
            if (state == null) {
                // No transition: this byte cannot occur at this position.
                return false;
            }
        }
        // Only the root carries head == true; any other state means a truncated code.
        return state.head;
    }

    /** Builds the US-ASCII, MS932 and UTF-8 state machines from the mapping table. */
    private void loadCodeSequences() {
        byte[][][] mappingTable = getMappingTable();

        usAscii = new CodeSequence();
        ms932 = new CodeSequence();
        utf8 = new CodeSequence();

        usAscii.head = true;
        ms932.head = true;
        utf8.head = true;

        for (byte[][] mapping : mappingTable) {
            byte[] ms932Code = mapping[0];
            byte[] utf8Code = mapping[1];
            if ((ms932Code[0] & 0xff) < 0x80) {
                usAscii.addCode(ms932Code);
            }
            ms932.addCode(ms932Code);
            utf8.addCode(utf8Code);
        }
    }

    /** One state of the byte-wise state machine; the instance addCode() is called on is the root. */
    private static class CodeSequence {
        /** {@code true} only for the root state, i.e. "positioned at a code boundary". */
        boolean head = false;

        /** Transition per unsigned byte value; {@code null} means "byte not allowed here". */
        final CodeSequence[] nexts = new CodeSequence[256];

        /**
         * Registers one code. Must be invoked on the root state.
         *
         * <p>The {@code assert}s detect a code that is a strict prefix of another one;
         * they are only evaluated when the JVM runs with {@code -ea}.
         */
        void addCode(byte[] code) {
            CodeSequence curr = this;
            int last = code.length - 1;

            // All bytes but the last one descend into (possibly new) intermediate states.
            for (int i = 0; i < last; i++) {
                int bi = code[i] & 0xff;
                if (curr.nexts[bi] == null) {
                    curr.nexts[bi] = new CodeSequence();
                } else {
                    assert curr.nexts[bi] != this;
                }
                curr = curr.nexts[bi];
            }

            // The last byte loops back to the root so that the next code can start.
            int bi = code[last] & 0xff;
            if (curr.nexts[bi] == null) {
                curr.nexts[bi] = this;
            } else {
                assert curr.nexts[bi] == this;
            }
        }
    }

    /**
     * Reads the mapping table resource. {@code #} starts a comment, blank lines are
     * skipped, and the remaining whitespace-separated tokens are hex-encoded byte strings.
     *
     * @return one {@code byte[][]} per table row, one {@code byte[]} per column
     * @throws RuntimeException if the resource is missing, unreadable or malformed
     */
    private byte[][][] getMappingTable() {
        List<byte[][]> rows = new ArrayList<>();

        try {
            InputStream is = this.getClass().getResourceAsStream(MAPPING_TABLE_RESOURCE);
            if (is == null) {
                // getResourceAsStream() signals "not found" with null rather than an exception.
                throw new IllegalStateException("Resource not found on the classpath.");
            }
            try (BufferedReader reader =
                    new BufferedReader(new InputStreamReader(is, StandardCharsets.US_ASCII))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    int commentStart = line.indexOf('#');
                    if (commentStart != -1) {
                        line = line.substring(0, commentStart);
                    }
                    line = line.trim();
                    if (line.isEmpty()) {
                        continue;
                    }

                    String[] tokens = line.split("\\s+");
                    if (tokens.length < REQUIRED_COLUMNS) {
                        throw new IllegalStateException(
                                "Expected at least " + REQUIRED_COLUMNS + " columns: '" + line + "'");
                    }
                    byte[][] row = new byte[tokens.length][];
                    for (int i = 0; i < tokens.length; i++) {
                        row[i] = hexToBytes(tokens[i]);
                    }
                    rows.add(row);
                }
            }
        } catch (Exception e) {
            throw new RuntimeException("Problem with resource file '" + MAPPING_TABLE_RESOURCE + "'.", e);
        }
        return rows.toArray(new byte[0][][]);
    }

    /**
     * Decodes a string of hex digit pairs (e.g. {@code "82A0"}) into bytes.
     *
     * @throws IllegalArgumentException if the token has an odd length or contains non-hex characters
     */
    private static byte[] hexToBytes(String strHex) {
        if (strHex.length() % 2 != 0) {
            throw new IllegalArgumentException("Odd-length hex token: '" + strHex + "'");
        }
        byte[] data = new byte[strHex.length() / 2];
        for (int i = 0; i < data.length; i++) {
            data[i] = (byte) Integer.parseInt(strHex.substring(2 * i, 2 * i + 2), 16);
        }
        return data;
    }
}
変換のみ_ver:
package tests.charset_detectors;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Converts byte arrays between MS932 and UTF-8 using only the mapping table
 * resource {@code res/ms932_utf8_mapping_table.txt}.
 *
 * <p>Each table row holds hex-encoded columns. MS932 to UTF-8 maps column 0 to
 * column 1; UTF-8 to MS932 maps column 1 to column 2.
 * NOTE(review): column 2 is presumably the MS932 code chosen for the reverse
 * direction when several MS932 codes share one UTF-8 code; confirm against the table.
 *
 * <p>Each direction is a byte-wise state machine: every transition carries the
 * bytes to emit (empty for non-final bytes of a code) and the next state; the
 * final transition of each code leads back to the root.
 */
public class CharsetConverter {
    private static final String MAPPING_TABLE_RESOURCE = "res/ms932_utf8_mapping_table.txt";

    /** Columns read by {@link #loadCodeSequences()}: indexes 0, 1 and 2. */
    private static final int REQUIRED_COLUMNS = 3;

    private CodeSequence ms932ToUTF8;
    private CodeSequence utf8ToMS932;

    /**
     * Loads the mapping table from the classpath and builds both state machines.
     *
     * @throws RuntimeException if the resource is missing or cannot be parsed
     */
    public CharsetConverter() {
        loadCodeSequences();
    }

    /**
     * @param fileData MS932 bytes consisting only of codes from the table
     * @return the UTF-8 bytes (no BOM is added)
     * @throws IllegalArgumentException if a byte has no mapping at its position or the
     *         input ends inside a multi-byte code
     */
    public byte[] convertMS932ToUTF8(byte[] fileData) {
        return conv(ms932ToUTF8, fileData);
    }

    /**
     * A leading UTF-8 BOM (EF BB BF) is dropped before converting.
     *
     * @param fileData UTF-8 bytes consisting only of codes from the table
     * @return the MS932 bytes
     * @throws IllegalArgumentException if a byte has no mapping at its position or the
     *         input ends inside a multi-byte code
     */
    public byte[] convertUTF8ToMS932(byte[] fileData) {
        return conv(utf8ToMS932, removeUTF8BomIfNeeded(fileData));
    }

    /** Returns a copy without the leading UTF-8 BOM, or the input itself when there is none. */
    private byte[] removeUTF8BomIfNeeded(byte[] fileData) {
        boolean hasBom =
                fileData.length >= 3
                        && (fileData[0] & 0xff) == 0xEF
                        && (fileData[1] & 0xff) == 0xBB
                        && (fileData[2] & 0xff) == 0xBF;
        if (!hasBom) {
            return fileData;
        }
        // Copying is not the cheapest option, but keeps conv() free of offset handling.
        return Arrays.copyOfRange(fileData, 3, fileData.length);
    }

    /**
     * Feeds {@code fileData} through the state machine rooted at {@code root} and
     * collects the bytes attached to the transitions taken.
     */
    private byte[] conv(CodeSequence root, byte[] fileData) {
        ByteArrayOutputStream buff = new ByteArrayOutputStream(fileData.length);
        CodeSequence state = root;

        for (int offset = 0; offset < fileData.length; offset++) {
            int value = fileData[offset] & 0xff;
            CodeLink link = state.nexts[value];
            if (link == null) {
                throw new IllegalArgumentException(
                        String.format("Unmapped byte 0x%02X at offset %d.", value, offset));
            }
            // write(byte[], int, int) on ByteArrayOutputStream declares no IOException.
            buff.write(link.destinationCode, 0, link.destinationCode.length);
            state = link.next;
        }

        if (state != root) {
            // Not back at a code boundary: the last code is incomplete and produced no output.
            throw new IllegalArgumentException("Input ends in the middle of a multi-byte code sequence.");
        }
        return buff.toByteArray();
    }

    /** Builds the MS932-to-UTF-8 and UTF-8-to-MS932 state machines from the mapping table. */
    private void loadCodeSequences() {
        byte[][][] mappingTable = getMappingTable();

        ms932ToUTF8 = new CodeSequence();
        utf8ToMS932 = new CodeSequence();

        for (byte[][] mapping : mappingTable) {
            ms932ToUTF8.addCode(mapping[0], mapping[1]);
            utf8ToMS932.addCode(mapping[1], mapping[2]);
        }
    }

    /** One state of the byte-wise state machine; the instance addCode() is called on is the root. */
    private static class CodeSequence {
        private static final byte[] EMPTY_BYTES = new byte[0];

        /** Transition per unsigned byte value; {@code null} means "byte not allowed here". */
        final CodeLink[] nexts = new CodeLink[256];

        /**
         * Registers the conversion {@code code -> code2}. Must be invoked on the root state.
         * If the final transition for {@code code} already exists it is kept, so the first
         * registration of a source code wins.
         *
         * <p>The {@code assert}s detect a source code that is a strict prefix of another
         * one; they are only evaluated when the JVM runs with {@code -ea}.
         */
        void addCode(byte[] code, byte[] code2) {
            CodeSequence curr = this;
            int last = code.length - 1;

            // Non-final bytes emit nothing and descend into (possibly new) intermediate states.
            for (int i = 0; i < last; i++) {
                int bi = code[i] & 0xff;
                if (curr.nexts[bi] == null) {
                    curr.nexts[bi] = new CodeLink(EMPTY_BYTES, new CodeSequence());
                } else {
                    assert curr.nexts[bi].next != this;
                }
                curr = curr.nexts[bi].next;
            }

            // The final byte emits the destination code and returns to the root.
            int bi = code[last] & 0xff;
            if (curr.nexts[bi] == null) {
                curr.nexts[bi] = new CodeLink(code2, this);
            } else {
                assert curr.nexts[bi].next == this;
            }
        }
    }

    /** A transition: the bytes to emit when it is taken and the state it leads to. */
    private static final class CodeLink {
        final byte[] destinationCode;
        final CodeSequence next;

        CodeLink(byte[] destinationCode, CodeSequence next) {
            this.destinationCode = destinationCode;
            this.next = next;
        }
    }

    /**
     * Reads the mapping table resource. {@code #} starts a comment, blank lines are
     * skipped, and the remaining whitespace-separated tokens are hex-encoded byte strings.
     *
     * @return one {@code byte[][]} per table row, one {@code byte[]} per column
     * @throws RuntimeException if the resource is missing, unreadable or malformed
     */
    private byte[][][] getMappingTable() {
        List<byte[][]> rows = new ArrayList<>();

        try {
            InputStream is = this.getClass().getResourceAsStream(MAPPING_TABLE_RESOURCE);
            if (is == null) {
                // getResourceAsStream() signals "not found" with null rather than an exception.
                throw new IllegalStateException("Resource not found on the classpath.");
            }
            try (BufferedReader reader =
                    new BufferedReader(new InputStreamReader(is, StandardCharsets.US_ASCII))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    int commentStart = line.indexOf('#');
                    if (commentStart != -1) {
                        line = line.substring(0, commentStart);
                    }
                    line = line.trim();
                    if (line.isEmpty()) {
                        continue;
                    }

                    String[] tokens = line.split("\\s+");
                    if (tokens.length < REQUIRED_COLUMNS) {
                        throw new IllegalStateException(
                                "Expected at least " + REQUIRED_COLUMNS + " columns: '" + line + "'");
                    }
                    byte[][] row = new byte[tokens.length][];
                    for (int i = 0; i < tokens.length; i++) {
                        row[i] = hexToBytes(tokens[i]);
                    }
                    rows.add(row);
                }
            }
        } catch (Exception e) {
            throw new RuntimeException("Problem with resource file '" + MAPPING_TABLE_RESOURCE + "'.", e);
        }
        return rows.toArray(new byte[0][][]);
    }

    /**
     * Decodes a string of hex digit pairs (e.g. {@code "82A0"}) into bytes.
     *
     * @throws IllegalArgumentException if the token has an odd length or contains non-hex characters
     */
    private static byte[] hexToBytes(String strHex) {
        if (strHex.length() % 2 != 0) {
            throw new IllegalArgumentException("Odd-length hex token: '" + strHex + "'");
        }
        byte[] data = new byte[strHex.length() / 2];
        for (int i = 0; i < data.length; i++) {
            data[i] = (byte) Integer.parseInt(strHex.substring(2 * i, 2 * i + 2), 16);
        }
        return data;
    }
}
――
テスト実行:
package tests.charset_detectors; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Paths; import java.util.ArrayList; import java.util.Arrays; import java.util.List; public class Test_CharsetDetectorConverter { public static void main(String[] args) { try { testMain(); } catch (Throwable e) { e.printStackTrace(); } } private static void testMain() throws IOException { String[] TARGET_ROOT_DIRS = new String[] { "C:\\Dev", "C:\\home\\Java_workspace\\MetroCore" }; String[] TARGET_LOWER_EXTS = new String[] { ".c", ".h", ".cs", ".cpp", ".java", ".txt" }; List<String> testeeFilePaths = new ArrayList<String>(); for (String root : TARGET_ROOT_DIRS) { Files.walk(Paths.get(root)) .filter(Files::isRegularFile) .forEach(path -> { String filePath = path.toString(); String filePath_lower = filePath.toLowerCase(); if (Arrays.stream(TARGET_LOWER_EXTS).anyMatch(ext_lower -> filePath_lower.endsWith(ext_lower))) { testeeFilePaths.add(filePath); } }); } CharsetDetector cd = new CharsetDetector(); CharsetConverter cc = new CharsetConverter(); for (String filePath : testeeFilePaths) { byte[] fileData = Files.readAllBytes(Paths.get(filePath)); System.out.println(String.join("\t" , "" + cd.isUSAscii(fileData) , "" + cd.isMS932(fileData) , "" + cd.isUTF8(fileData) , filePath )); if (cd.isMS932(fileData)) { //System.out.println("convertMS932ToUTF8() test."); byte[] fileData2 = cc.convertMS932ToUTF8(fileData); byte[] fileData3 = new String(fileData, "MS932").getBytes("UTF-8"); if (!Arrays.equals(fileData2, fileData3)) { throw null; } System.out.println("convertMS932ToUTF8() test OK!"); } if (cd.isUTF8(fileData)) { //System.out.println("convertUTF8ToMS932() test."); byte[] fileData2 = cc.convertUTF8ToMS932(fileData); byte[] fileData3 = new String(test_removeUTF8BomIfNeeded(fileData), "UTF-8").getBytes("MS932"); if (!Arrays.equals(fileData2, fileData3)) { throw null; } System.out.println("convertUTF8ToMS932() test OK!"); } } System.out.println("All test OK!"); } 
private static byte[] test_removeUTF8BomIfNeeded(byte[] fileData) { if ( fileData.length >= 3 && (fileData[0] & 0xff) == 0xEF && (fileData[1] & 0xff) == 0xBB && (fileData[2] & 0xff) == 0xBF ) { fileData = Arrays.copyOfRange(fileData, 3, fileData.length); } return fileData; } }
テスト実行_出力:
false true false C:\Dev\Dev\Annex\Commit\doc\Readme.txt convertMS932ToUTF8() test OK! false false true C:\Dev\Dev\Annex\Commit\HLTConsole\HLTConsole\Common.cs convertUTF8ToMS932() test OK! false false true C:\Dev\Dev\Annex\Commit\HLTConsole\HLTConsole\Commons\ArgsReader.cs convertUTF8ToMS932() test OK! false false true C:\Dev\Dev\Annex\Commit\HLTConsole\HLTConsole\Commons\ProcMain.cs convertUTF8ToMS932() test OK! false false true C:\Dev\Dev\Annex\Commit\HLTConsole\HLTConsole\Commons\Randomizer.cs convertUTF8ToMS932() test OK! false false true C:\Dev\Dev\Annex\Commit\HLTConsole\HLTConsole\Commons\SCommon.cs convertUTF8ToMS932() test OK! false false true C:\Dev\Dev\Annex\Commit\HLTConsole\HLTConsole\Commons\SimpleDateTime.cs convertUTF8ToMS932() test OK! false false true C:\Dev\Dev\Annex\Commit\HLTConsole\HLTConsole\Commons\WorkingDir.cs convertUTF8ToMS932() test OK! false false true C:\Dev\Dev\Annex\Commit\HLTConsole\HLTConsole\Consts.cs convertUTF8ToMS932() test OK! false false true C:\Dev\Dev\Annex\Commit\HLTConsole\HLTConsole\Extensions.cs convertUTF8ToMS932() test OK! true true true C:\Dev\Dev\Annex\Commit\HLTConsole\HLTConsole\obj\x86\Release\.NETFramework,Version=v4.8.AssemblyAttributes.cs convertMS932ToUTF8() test OK! convertUTF8ToMS932() test OK! true true true C:\Dev\Dev\Annex\Commit\HLTConsole\HLTConsole\obj\x86\Release\HLTConsole.csproj.FileListAbsolute.txt convertMS932ToUTF8() test OK! convertUTF8ToMS932() test OK! false false true C:\Dev\Dev\Annex\Commit\HLTConsole\HLTConsole\Program.cs convertUTF8ToMS932() test OK! false false true C:\Dev\Dev\Annex\Commit\HLTConsole\HLTConsole\Properties\AssemblyInfo.cs convertUTF8ToMS932() test OK! false true false C:\Dev\Dev\Annex\ExcelToCsv\doc\Readme.txt convertMS932ToUTF8() test OK! false false true C:\Dev\Dev\Annex\ExcelToCsv\HLTConsole\HLTConsole\Common.cs convertUTF8ToMS932() test OK! false false true C:\Dev\Dev\Annex\ExcelToCsv\HLTConsole\HLTConsole\Commons\ArgsReader.cs convertUTF8ToMS932() test OK! 
……(省略)…… false true false C:\Dev\Factory\Build\_Cx\SolutionOrder.c convertMS932ToUTF8() test OK! true true true C:\Dev\Factory\Build\_Cx\SolutionOrder.h convertMS932ToUTF8() test OK! convertUTF8ToMS932() test OK! false true false C:\Dev\Factory\Build\_Cx\_Cx.c convertMS932ToUTF8() test OK! false true false C:\Dev\Factory\Common\all.h convertMS932ToUTF8() test OK! false true false C:\Dev\Factory\Common\autoBlock.c convertMS932ToUTF8() test OK! true true true C:\Dev\Factory\Common\autoBlock.h convertMS932ToUTF8() test OK! convertUTF8ToMS932() test OK! true true true C:\Dev\Factory\Common\autoBlockTools.c convertMS932ToUTF8() test OK! convertUTF8ToMS932() test OK! false true false C:\Dev\Factory\Common\autoBlockTools.h convertMS932ToUTF8() test OK! false true false C:\Dev\Factory\Common\autoList.c convertMS932ToUTF8() test OK! false true false C:\Dev\Factory\Common\autoList.h convertMS932ToUTF8() test OK! false true false C:\Dev\Factory\Common\Compare.c convertMS932ToUTF8() test OK! true true true C:\Dev\Factory\Common\Compare.h convertMS932ToUTF8() test OK! convertUTF8ToMS932() test OK! false true false C:\Dev\Factory\Common\Cout.c convertMS932ToUTF8() test OK! true true true C:\Dev\Factory\Common\Cout.h convertMS932ToUTF8() test OK! convertUTF8ToMS932() test OK! true true true C:\Dev\Factory\Common\Data.c convertMS932ToUTF8() test OK! convertUTF8ToMS932() test OK! true true true C:\Dev\Factory\Common\Data.h convertMS932ToUTF8() test OK! convertUTF8ToMS932() test OK! false true false C:\Dev\Factory\Common\DataConv.c convertMS932ToUTF8() test OK! false true false C:\Dev\Factory\Common\DataConv.h convertMS932ToUTF8() test OK! false true false C:\Dev\Factory\Common\Define.h convertMS932ToUTF8() test OK! ……(省略)…… true true true C:\home\Java_workspace\MetroCore\src\hltstudio\tools\ResourceTools.java convertMS932ToUTF8() test OK! convertUTF8ToMS932() test OK! true true true C:\home\Java_workspace\MetroCore\src\hltstudio\tools\RunnableEx.java convertMS932ToUTF8() test OK! 
convertUTF8ToMS932() test OK! true true true C:\home\Java_workspace\MetroCore\src\hltstudio\tools\TCommon.java convertMS932ToUTF8() test OK! convertUTF8ToMS932() test OK! true true true C:\home\Java_workspace\MetroCore\src\tests\charset_detectors\CharsetConverter.java convertMS932ToUTF8() test OK! convertUTF8ToMS932() test OK! true true true C:\home\Java_workspace\MetroCore\src\tests\charset_detectors\CharsetDetector.java convertMS932ToUTF8() test OK! convertUTF8ToMS932() test OK! true true true C:\home\Java_workspace\MetroCore\src\tests\charset_detectors\res\ms932_utf8_mapping_table.txt convertMS932ToUTF8() test OK! convertUTF8ToMS932() test OK! true true true C:\home\Java_workspace\MetroCore\src\tests\charset_detectors\Test0001.java convertMS932ToUTF8() test OK! convertUTF8ToMS932() test OK! true true true C:\home\Java_workspace\MetroCore\src\tests\charset_detectors\tests\Test0001.java convertMS932ToUTF8() test OK! convertUTF8ToMS932() test OK! true true true C:\home\Java_workspace\MetroCore\src\tests\charset_detectors\Test_CharsetDetectorConverter.java convertMS932ToUTF8() test OK! convertUTF8ToMS932() test OK! false false true C:\home\Java_workspace\MetroCore\src\tests\ObjectDumper.java convertUTF8ToMS932() test OK! true true true C:\home\Java_workspace\MetroCore\src\tests\resource_tests\res\CP932.txt convertMS932ToUTF8() test OK! convertUTF8ToMS932() test OK! true true true C:\home\Java_workspace\MetroCore\src\tests\resource_tests\res\JIS0208.txt convertMS932ToUTF8() test OK! convertUTF8ToMS932() test OK! true true true C:\home\Java_workspace\MetroCore\src\tests\resource_tests\Test0001.java convertMS932ToUTF8() test OK! convertUTF8ToMS932() test OK! true true true C:\home\Java_workspace\MetroCore\src\tests\Test0001.java convertMS932ToUTF8() test OK! convertUTF8ToMS932() test OK! true true true C:\home\Java_workspace\MetroCore\src\tests\Test0002.java convertMS932ToUTF8() test OK! convertUTF8ToMS932() test OK! All test OK!











