<!DOCTYPE html>
<title>Character Decoding</title>
<script src="../../resources/testharness.js"></script>
<script src="../../resources/testharnessreport.js"></script>
<script src="resources/char-decoding-utils.js"></script>
<script>
// Decoding checks for legacy and Unicode encodings, grouped by encoding family.
//
// The helpers come from resources/char-decoding-utils.js, which is not part of
// this file. Judging from the call sites below (confirm against the helper
// source before relying on this):
//   testDecode(label, escapedBytes, expected)
//     label        - encoding label or alias to decode with
//     escapedBytes - the input bytes, percent-escaped
//     expected     - expected result in U+XXXX notation; several code units are
//                    separated by '/', e.g. a surrogate pair 'U+D869/U+DED6'
//   batchTestDecode({encodings, encoded, unicode})
//     `encoded` and `unicode` are parallel arrays (same length, matched by
//     index); presumably every pair is checked against every entry of
//     `encodings`.

testDecode('UTF-8', '%E2%88%9A', 'U+221A');

// \xA3\xA0 in GBK should be mapped to U+3000 instead of U+E5E5.
// Checked for every GB2312/GBK alias as well as for gb18030.
testDecode('gb2312', '%A3%A0', 'U+3000');
testDecode('gb_2312', '%A3%A0', 'U+3000');
testDecode('gb_2312-80', '%A3%A0', 'U+3000');
testDecode('csgb2312', '%A3%A0', 'U+3000');
testDecode('iso-ir-58', '%A3%A0', 'U+3000');
testDecode('csiso58gb231280', '%A3%A0', 'U+3000');
testDecode('chinese', '%A3%A0', 'U+3000');
testDecode('gbk', '%A3%A0', 'U+3000');
testDecode('x-gbk', '%A3%A0', 'U+3000');
testDecode('gb18030', '%A3%A0', 'U+3000');

// Align GBK with GB18030: the same bytes must decode to the same code point
// under both labels. %A8%BC is the one exception and is handled separately
// below.
testDecode('gbk', '%A8%BF', 'U+01F9');
testDecode('gbk', '%A1%AD', 'U+2026');
testDecode('gbk', '%A1%AB', 'U+FF5E');
testDecode('gb18030', '%A8%BF', 'U+01F9');
testDecode('gb18030', '%A8%BC', 'U+1E3F');
testDecode('gb18030', '%A1%AD', 'U+2026');
testDecode('gb18030', '%A1%AB', 'U+FF5E');

// Replace U+E7C7 with U+1E3F once
// https://www.w3.org/Bugs/Public/show_bug.cgi?id=28740#c3 is resolved.
// Until then gbk is expected to differ from gb18030 for these bytes.
testDecode('gbk', '%A8%BC', 'U+E7C7');

// Test Shift_JIS aliases: every label must decode the same bytes identically.
testDecode('Shift_JIS', '%82%d0', 'U+3072');
testDecode('shift-jis', '%82%d0', 'U+3072');
testDecode('csshiftjis', '%82%d0', 'U+3072');
testDecode('sjis', '%82%d0', 'U+3072');
testDecode('x-sjis', '%82%d0', 'U+3072');
testDecode('ms_kanji', '%82%d0', 'U+3072');
testDecode('windows-31j', '%82%d0', 'U+3072');

// Test Big5 peculiarities: each pair below is two different byte sequences
// that must decode to the same code point.
// See https://www.w3.org/Bugs/Public/show_bug.cgi?id=27878
testDecode('Big5', '%A4%51', 'U+5341');
testDecode('Big5', '%A2%CC', 'U+5341');
testDecode('Big5', '%A4%CA', 'U+5345');
testDecode('Big5', '%A2%CE', 'U+5345');
testDecode('Big5', '%A2%A4', 'U+2550');
testDecode('Big5', '%F9%F9', 'U+2550');
testDecode('Big5', '%A2%A5', 'U+255E');
testDecode('Big5', '%F9%E9', 'U+255E');
testDecode('Big5', '%A2%A7', 'U+2561');
testDecode('Big5', '%F9%EB', 'U+2561');
testDecode('Big5', '%A2%A6', 'U+256A');
testDecode('Big5', '%F9%EA', 'U+256A');

// Test that all Korean encodings of EUC-KR family are treated as windows-949.
// `encoded[i]` is expected to decode to `unicode[i]`.
var korean = {
  encodings: ['korean', 'EUC-KR', 'windows-949', 'cseuckr', 'csksc56011987',
              'iso-ir-149', 'KS_C_5601-1987', 'KS_C_5601-1989',
              'KSC5601', 'KSC_5601'],
  encoded: ['%A2%E6', '%A1%A4', '%A1%A9', '%A1%AA', '%A1%AD', '%A2%A6',
            '%A2%C1', '%1A', '%1C', '%8F%A1', '%B4%D3', '%A2%41'],
  unicode: ['U+20AC', 'U+00B7', 'U+00AD', 'U+2015', 'U+223C', 'U+FF5E',
            'U+2299', 'U+001A', 'U+001C', 'U+B8EA', 'U+B2D2', 'U+C910']
};

batchTestDecode(korean);

// Test that ISO-8859-9 (Turkish) is upgraded to windows-1254 with Euro symbol.
// 0x80 and 0x9F are the bytes that tell the two apart: they must decode to
// U+20AC and U+0178 rather than to C1 control characters.
var turkish = {
  encodings: ['iso-8859-9', 'latin5', 'windows-1254'],
  encoded: ['%80', '%9F', '%FD'],
  unicode: ['U+20AC', 'U+0178', 'U+0131']
};

batchTestDecode(turkish);

// FIXME: Have to add tests for Euro and a few new characters added to ISO-8859-x
// that are NOT subsets of the corresponding Windows codepages.
// The ISO-8859-7:2003 additions (Euro at 0xA4, plus 0xA5 and 0xAA) are already
// covered under "Greek encodings fine points" below; other parts are not.

// Baltic encodings fine points: bytes where ISO-8859-13 and windows-1257 must
// NOT be conflated (0xB4 and 0xFF differ between the two).
testDecode('ISO-8859-13', '%A1', 'U+201D');
testDecode('ISO-8859-13', '%A5', 'U+201E');
testDecode('ISO-8859-13', '%B4', 'U+201C');
testDecode('ISO-8859-13', '%FF', 'U+2019');
testDecode('windows-1257', '%80', 'U+20AC');
testDecode('windows-1257', '%B4', 'U+00B4');
testDecode('windows-1257', '%FF', 'U+02D9');

// Greek encodings fine points: bytes where iso-8859-7 and windows-1253 must
// NOT be conflated (0xA1, 0xB5 and 0xB6 differ between the two).
testDecode('iso-8859-7', '%A1', 'U+2018');
testDecode('iso-8859-7', '%B5', 'U+0385');
testDecode('iso-8859-7', '%B6', 'U+0386');
testDecode('iso-8859-7', '%A4', 'U+20AC');
testDecode('iso-8859-7', '%A5', 'U+20AF');
testDecode('iso-8859-7', '%AA', 'U+037A');
testDecode('windows-1253', '%80', 'U+20AC');
testDecode('windows-1253', '%A1', 'U+0385');
testDecode('windows-1253', '%B5', 'U+00B5');
testDecode('windows-1253', '%B6', 'U+00B6');

// KOI-8 variants: KOI8-R keeps box-drawing characters at 0xA4/0xAD where
// KOI8-U has Cyrillic letters; KOI8-RU is expected to match KOI8-U at
// 0xAE/0xBE.
testDecode('KOI8-R', '%A4', 'U+2553');
testDecode('KOI8-R', '%AD', 'U+255C');
testDecode('KOI8-U', '%A4', 'U+0454');
testDecode('KOI8-U', '%AD', 'U+0491');
testDecode('KOI8-U', '%AE', 'U+045E');
testDecode('KOI8-U', '%BE', 'U+040E');
testDecode('KOI8-RU', '%AE', 'U+045E');
testDecode('KOI8-RU', '%BE', 'U+040E');

// Test that TIS-620 and ISO-8859-11 (Thai) are upgraded to windows-874.
// 0x80 and 0x96 are the bytes that prove the upgrade: they must decode to
// U+20AC and U+2013 rather than to C1 control characters.
var thai = {
  encodings: ['TIS-620', 'ISO-8859-11', 'windows-874', 'dos-874'],
  encoded: ['%80', '%96', '%A0', '%A1', '%DA'],
  unicode: ['U+20AC', 'U+2013', 'U+00A0', 'U+0E01', 'U+0E3A']
};

batchTestDecode(thai);

// UTF-16LE and variants. The input is the little-endian byte sequence of the
// surrogate pair U+D869 U+DED6.
testDecode('UTF-16LE', '%69%D8%D6%DE', 'U+D869/U+DED6');
testDecode('unicodeFEFF', '%69%D8%D6%DE', 'U+D869/U+DED6');
// According to HTML5 and for IE compatibility, UTF-16 is treated as little
// endian. The following tests fail as of Firefox 3.6.13.
testDecode('UTF-16', '%69%D8%D6%DE', 'U+D869/U+DED6');
testDecode('ISO-10646-UCS-2', '%69%D8%D6%DE', 'U+D869/U+DED6');
testDecode('UCS-2', '%69%D8%D6%DE', 'U+D869/U+DED6');
testDecode('Unicode', '%69%D8%D6%DE', 'U+D869/U+DED6');
testDecode('csUnicode', '%69%D8%D6%DE', 'U+D869/U+DED6');

// UTF-16BE and variants. Same surrogate pair, big-endian byte order.
testDecode('UTF-16BE', '%D8%69%DE%D6', 'U+D869/U+DED6');
testDecode('unicodeFFFE', '%D8%69%DE%D6', 'U+D869/U+DED6');

</script>