blob: b922954882af90d797b2240757c41109e00743fd [file] [log] [blame] [edit]
(** A single Unicode code point, represented as a plain integer. *)
type codepoint = int

(** A Unicode string, represented as a list of code points. *)
type unicode = codepoint list

(** Raised by the encoder when a value cannot be encoded, and by the decoder
    when the input bytes are not acceptable UTF-8. *)
exception Utf8
(** [con n] builds a UTF-8 continuation byte: the low six bits of [n] with the
    top two bits set to [10]. *)
let con n =
  let payload = n land 0x3f in
  payload lor 0x80
(** [encode ns] returns the UTF-8 encoding of the code points [ns] as a string.

    @raise Utf8 if any element of [ns] is negative, lies in the surrogate
    range [0xd800 .. 0xdfff], or is [>= 0x110000]. *)
let rec encode ns =
  (* rev_map followed by rev keeps the byte order while staying
     tail-recursive on every compiler version. *)
  encode' ns |> List.rev_map Char.chr |> List.rev |> Lib.String.implode

(* [encode' ns] is the byte-level encoder: it maps code points to the list of
   byte values (as ints) of their UTF-8 encoding. Raises [Utf8] under the same
   conditions as [encode]. *)
and encode' ns =
  (* [acc] holds the bytes emitted so far in reverse order, so each sequence
     is pushed last-byte-first and the whole list is reversed once at the
     end. This keeps the loop tail-recursive for arbitrarily long inputs. *)
  let rec go acc = function
    | [] -> List.rev acc
    | n :: _ when n < 0 -> raise Utf8
    | n :: rest when n < 0x80 -> go (n :: acc) rest
    | n :: rest when n < 0x800 ->
      go (con n :: (0xc0 lor (n lsr 6)) :: acc) rest
    | n :: _ when 0xd800 <= n && n < 0xe000 ->
      (* Surrogates are not Unicode scalar values; encoding them would yield
         ill-formed UTF-8 that the decoder in this file rejects. *)
      raise Utf8
    | n :: rest when n < 0x10000 ->
      go (con n :: con (n lsr 6) :: (0xe0 lor (n lsr 12)) :: acc) rest
    | n :: rest when n < 0x110000 ->
      go
        (con n :: con (n lsr 6) :: con (n lsr 12)
         :: (0xf0 lor (n lsr 18)) :: acc)
        rest
    | _ -> raise Utf8
  in
  go [] ns
(** [con b] checks that [b] is a UTF-8 continuation byte (top two bits [10])
    and returns its six payload bits.

    @raise Utf8 if the top two bits of [b] are not [10]. *)
let con b =
  match b land 0xc0 with
  | 0x80 -> b land 0x3f
  | _ -> raise Utf8
(** [code min n] returns [n] unchanged when it is an acceptable decoded code
    point: at least [min], outside the range [0xd800 .. 0xdfff], and below
    [0x110000].

    @raise Utf8 otherwise. *)
let code min n =
  if n >= min && (n < 0xd800 || n >= 0xe000) && n < 0x110000 then n
  else raise Utf8
(** [decode s] interprets the bytes of [s] as UTF-8 and returns the code
    points they encode, in order.

    @raise Utf8 if [s] contains an unexpected continuation byte, a truncated
    sequence, a lead byte [>= 0xf8], a malformed continuation byte, or a
    sequence whose value is rejected by [code] (below the minimum for its
    length, in [0xd800 .. 0xdfff], or [>= 0x110000]). *)
let rec decode s =
  (* rev_map followed by rev keeps the byte order while staying
     tail-recursive on every compiler version. *)
  Lib.String.explode s |> List.rev_map Char.code |> List.rev |> decode'

(* [decode' bytes] is the byte-level decoder: it maps a list of byte values to
   the list of code points they encode. Raises [Utf8] under the same
   conditions as [decode]. *)
and decode' bytes =
  (* [acc] collects decoded code points in reverse order; using an accumulator
     keeps the loop tail-recursive so large inputs cannot exhaust the stack. *)
  let rec go acc = function
    | [] -> List.rev acc
    | b1 :: rest when b1 < 0x80 ->
      go (code 0x0 b1 :: acc) rest
    | b1 :: _ when b1 < 0xc0 ->
      (* A continuation byte where a lead byte is required. *)
      raise Utf8
    | b1 :: b2 :: rest when b1 < 0xe0 ->
      let n = ((b1 land 0x1f) lsl 6) + con b2 in
      go (code 0x80 n :: acc) rest
    | b1 :: b2 :: b3 :: rest when b1 < 0xf0 ->
      let n = ((b1 land 0x0f) lsl 12) + (con b2 lsl 6) + con b3 in
      go (code 0x800 n :: acc) rest
    | b1 :: b2 :: b3 :: b4 :: rest when b1 < 0xf8 ->
      let n =
        ((b1 land 0x07) lsl 18) + (con b2 lsl 12) + (con b3 lsl 6) + con b4
      in
      go (code 0x10000 n :: acc) rest
    | _ ->
      (* Either a lead byte >= 0xf8 or a sequence cut short by end of input. *)
      raise Utf8
  in
  go [] bytes