| type codepoint = int |
| type unicode = codepoint list |
| |
| exception Utf8 |
| |
| let con n = 0x80 lor (n land 0x3f) |
| |
| let rec encode ns = Lib.String.implode (List.map Char.chr (encode' ns)) |
| and encode' = function |
| | [] -> [] |
| | n::ns when n < 0 -> |
| raise Utf8 |
| | n::ns when n < 0x80 -> |
| n :: encode' ns |
| | n::ns when n < 0x800 -> |
| 0xc0 lor (n lsr 6) :: con n :: encode' ns |
| | n::ns when n < 0x10000 -> |
| 0xe0 lor (n lsr 12) :: con (n lsr 6) :: con n :: encode' ns |
| | n::ns when n < 0x110000 -> |
| 0xf0 lor (n lsr 18) :: con (n lsr 12) :: con (n lsr 6) :: con n |
| :: encode' ns |
| | _ -> |
| raise Utf8 |
| |
| let con b = if b land 0xc0 = 0x80 then b land 0x3f else raise Utf8 |
| let code min n = |
| if n < min || (0xd800 <= n && n < 0xe000) || n >= 0x110000 then raise Utf8 |
| else n |
| |
| let rec decode s = decode' (List.map Char.code (Lib.String.explode s)) |
| and decode' = function |
| | [] -> [] |
| | b1::bs when b1 < 0x80 -> |
| code 0x0 b1 :: decode' bs |
| | b1::bs when b1 < 0xc0 -> |
| raise Utf8 |
| | b1::b2::bs when b1 < 0xe0 -> |
| code 0x80 ((b1 land 0x1f) lsl 6 + con b2) :: decode' bs |
| | b1::b2::b3::bs when b1 < 0xf0 -> |
| code 0x800 ((b1 land 0x0f) lsl 12 + con b2 lsl 6 + con b3) :: decode' bs |
| | b1::b2::b3::b4::bs when b1 < 0xf8 -> |
| code 0x10000 ((b1 land 0x07) lsl 18 + con b2 lsl 12 + con b3 lsl 6 + con b4) |
| :: decode' bs |
| | _ -> |
| raise Utf8 |