blob: 69763ca32136af9e2f6ec6bfc7851977993c2836 [file] [log] [blame]
// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 2002-2016, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: ucnvbocu.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002mar27
* created by: Markus W. Scherer
*
* This is an implementation of the Binary Ordered Compression for Unicode,
* in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
#include "unicode/ucnv.h"
#include "unicode/ucnv_cb.h"
#include "unicode/utf16.h"
#include "putilimp.h"
#include "ucnv_bld.h"
#include "ucnv_cnv.h"
#include "uassert.h"
/* BOCU-1 constants and macros ---------------------------------------------- */
/*
* BOCU-1 encodes the code points of a Unicode string as
* a sequence of byte-encoded differences (slope detection),
* preserving lexical order.
*
* Optimize the difference-taking for runs of Unicode text within
* small scripts:
*
* Most small scripts are allocated within aligned 128-blocks of Unicode
* code points. Lexical order is preserved if the "previous code point" state
* is always moved into the middle of such a block.
*
* Additionally, "prev" is moved from anywhere in the Unihan and Hangul
* areas into the middle of those areas.
*
* C0 control codes and space are encoded with their US-ASCII bytes.
* "prev" is reset for C0 controls but not for space.
*/
/* initial value for "prev": middle of the ASCII range */
#define BOCU1_ASCII_PREV 0x40
/* bounding byte values for differences */
#define BOCU1_MIN 0x21
#define BOCU1_MIDDLE 0x90
#define BOCU1_MAX_LEAD 0xfe
#define BOCU1_MAX_TRAIL 0xff
#define BOCU1_RESET 0xff
/* number of lead bytes */
#define BOCU1_COUNT (BOCU1_MAX_LEAD-BOCU1_MIN+1)
/* adjust trail byte counts for the use of some C0 control byte values */
#define BOCU1_TRAIL_CONTROLS_COUNT 20
#define BOCU1_TRAIL_BYTE_OFFSET (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)
/* number of trail bytes */
#define BOCU1_TRAIL_COUNT ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)
/*
* number of positive and negative single-byte codes
* (counting 0==BOCU1_MIDDLE among the positive ones)
*/
#define BOCU1_SINGLE 64
/* number of lead bytes for positive and negative 2/3/4-byte sequences */
#define BOCU1_LEAD_2 43
#define BOCU1_LEAD_3 3
#define BOCU1_LEAD_4 1
/* The difference value range for single-byters. */
#define BOCU1_REACH_POS_1 (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1 (-BOCU1_SINGLE)
/* The difference value range for double-byters. */
#define BOCU1_REACH_POS_2 (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2 (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
/* The difference value range for 3-byters. */
#define BOCU1_REACH_POS_3 \
(BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_3 (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
/* The lead byte start values. */
#define BOCU1_START_POS_2 (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3 (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4 (BOCU1_START_POS_3+BOCU1_LEAD_3)
/* ==BOCU1_MAX_LEAD */
#define BOCU1_START_NEG_2 (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3 (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4 (BOCU1_START_NEG_3-BOCU1_LEAD_3)
/* ==BOCU1_MIN+1 */
/* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
(BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
(BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)
/* The length of a byte sequence, according to its packed form. */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
/*
* 12 commonly used C0 control codes (and space) are only used to encode
* themselves directly,
* which makes BOCU-1 MIME-usable and reasonably safe for
* ASCII-oriented software.
*
* These controls are
* 0 NUL
*
* 7 BEL
* 8 BS
*
* 9 TAB
* a LF
* b VT
* c FF
* d CR
*
* e SO
* f SI
*
* 1a SUB
* 1b ESC
*
* The other 20 C0 controls are also encoded directly (to preserve order)
* but are also used as trail bytes in difference encoding
* (for better compression).
*/
/*
* Map a trail value t, as used in the difference calculation, to its external
* byte value: values below BOCU1_TRAIL_CONTROLS_COUNT are looked up in
* bocu1TrailToByte[], all others are shifted by BOCU1_TRAIL_BYTE_OFFSET.
* t is evaluated more than once and is not range-checked here.
*/
#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
/*
 * Decoder-side map for the byte values 0x00..0x20:
 * index  = external byte value,
 * result = trail value 0..19 (0..0x13) used in the difference calculation,
 *          or -1 when the byte is not a legal trail byte.
 */
static const int8_t
bocu1ByteToTrail[BOCU1_MIN]={
    /* 0x00 (NUL): not a trail byte */
    -1,
    /* 0x01..0x06 -> trail values 0..5 */
    0, 1, 2, 3, 4, 5,
    /* 0x07..0x0f (BEL, BS, TAB, LF, VT, FF, CR, SO, SI): not trail bytes */
    -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x10..0x19 -> trail values 6..15 */
    6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
    /* 0x1a (SUB), 0x1b (ESC): not trail bytes */
    -1, -1,
    /* 0x1c..0x1f -> trail values 16..19 */
    16, 17, 18, 19,
    /* 0x20 (space): not a trail byte */
    -1
};
/*
 * Encoder-side map for the C0 control codes that double as trail bytes:
 * index  = trail value 0..19 (0..0x13) used in the difference calculation,
 * result = external byte value in 0x01..0x1f.
 */
static const int8_t
bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
    /* trail values 0..5 -> 0x01..0x06 */
    0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    /* trail values 6..15 -> 0x10..0x19 */
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
    /* trail values 16..19 -> 0x1c..0x1f */
    0x1c, 0x1d, 0x1e, 0x1f
};
/**
 * Floor division with a non-negative remainder.
 *
 * Integer division and modulo with negative numerators
 * yield negative modulo results and quotients that are one more than
 * what we need here.
 * This macro adjusts the results so that the modulo-value m is always >=0
 * (for a positive divisor d).
 *
 * For non-negative n, the if() condition is always false.
 *
 * The body is wrapped in do { } while(0) so that the macro behaves like a
 * single statement, e.g. in an unbraced if/else. The arguments are evaluated
 * more than once; pass side-effect-free expressions.
 *
 * @param n Number to be split into quotient and rest.
 *          Will be modified to contain the quotient.
 * @param d Divisor.
 * @param m Output variable for the rest (modulo result).
 */
#define NEGDIVMOD(n, d, m) do { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        --(n); \
        (m)+=(d); \
    } \
} while(0)
/*
* Faster versions of packDiff() for single-byte-encoded diff values.
* Each of these macros may evaluate its argument more than once.
*/
/** Is a diff value encodable in a single byte, i.e. within BOCU1_REACH_NEG_1..BOCU1_REACH_POS_1? */
#define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)
/** Encode a diff value in a single byte; the result is a byte only if DIFF_IS_SINGLE(diff). */
#define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))
/**
* Is a diff value within the two-byte reach?
* This is also true for single-byte diffs, so test DIFF_IS_SINGLE() first.
*/
#define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)
/* BOCU-1 implementation functions ------------------------------------------ */
/**
* Default "prev" for code point c: offset BOCU1_ASCII_PREV (the middle)
* within the aligned 128-code-point block that contains c.
*/
#define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
/**
 * Derive the next "previous code point" state from the current code point
 * for the ranges that need more than the simple 128-block rule.
 *
 * The lower bound 0x3040 and the upper bound 0xd7a3 are not re-checked here;
 * the BOCU1_PREV() macro only calls this function for c inside that range.
 *
 * @param c current code point, 0x3040..0xd7a3
 * @return "previous code point" state value
 */
static inline int32_t
bocu1Prev(int32_t c) {
    if(c<=0x309f) {
        /* Hiragana: the block is not 128-aligned, use its own middle */
        return 0x3070;
    }
    if(c>=0x4e00 && c<=0x9fa5) {
        /* CJK Unihan: one fixed state for the whole area */
        return 0x4e00-BOCU1_REACH_NEG_2;
    }
    if(c>=0xac00) {
        /* Korean Hangul: middle of 0xac00..0xd7a3 */
        return (0xd7a3+0xac00)/2;
    }
    /* everything else in between: mostly small scripts */
    return BOCU1_SIMPLE_PREV(c);
}
/**
* Fast version of bocu1Prev() for most scripts:
* code points below 0x3040 or above 0xd7a3 use BOCU1_SIMPLE_PREV() directly,
* only the range in between calls bocu1Prev().
* c is evaluated more than once.
*/
#define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
/*
* The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
* The UConverter fields are used as follows:
*
* fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
*
* toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
* mode decoder's incomplete (diff<<2)|count (ignored when toULength==0)
*/
/* BOCU-1-from-Unicode conversion functions --------------------------------- */
/**
 * Encode a multi-byte difference and return its bytes packed into one integer.
 *
 * Small absolute differences get short encodings so that runs of
 * same-script characters compress well.
 * Single-byte differences must be handled by the caller
 * (DIFF_IS_SINGLE()/PACK_SINGLE_DIFF()); this is asserted in debug builds.
 *
 * The per-length cases are written out one by one instead of looping,
 * and the last division/modulo of each 4-byte case is replaced by its
 * known result.
 *
 * @param diff difference value -0x10ffff..0x10ffff, not encodable in one byte
 * @return
 *      0x0200yyzz for 2-byte sequence yy zz
 *      0x03xxyyzz for 3-byte sequence xx yy zz
 *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
 */
static int32_t
packDiff(int32_t diff) {
    int32_t packed, trail;

    /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */
    U_ASSERT(!DIFF_IS_SINGLE(diff));

    if(diff<BOCU1_REACH_NEG_1) {
        /* two- to four-byte negative differences */
        if(diff>=BOCU1_REACH_NEG_2) {
            /* two bytes: length tag, lead, trail */
            diff-=BOCU1_REACH_NEG_1;

            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, trail);
            packed=0x02000000|BOCU1_TRAIL_TO_BYTE(trail);

            packed|=(BOCU1_START_NEG_2+diff)<<8;
        } else if(diff>=BOCU1_REACH_NEG_3) {
            /* three bytes: length tag, lead, two trails */
            diff-=BOCU1_REACH_NEG_2;

            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, trail);
            packed=0x03000000|BOCU1_TRAIL_TO_BYTE(trail);

            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, trail);
            packed|=BOCU1_TRAIL_TO_BYTE(trail)<<8;

            packed|=(BOCU1_START_NEG_3+diff)<<16;
        } else {
            /* four bytes: the lead byte takes the place of the length tag */
            diff-=BOCU1_REACH_NEG_3;

            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, trail);
            packed=BOCU1_TRAIL_TO_BYTE(trail);

            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, trail);
            packed|=BOCU1_TRAIL_TO_BYTE(trail)<<8;

            /*
             * A third NEGDIVMOD would deliver
             * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
             * Use that directly instead of dividing again.
             */
            trail=diff+BOCU1_TRAIL_COUNT;
            packed|=BOCU1_TRAIL_TO_BYTE(trail)<<16;

            packed|=BOCU1_MIN<<24;
        }
    } else if(diff<=BOCU1_REACH_POS_2) {
        /* two bytes, positive difference */
        diff-=BOCU1_REACH_POS_1+1;

        trail=diff%BOCU1_TRAIL_COUNT;
        diff/=BOCU1_TRAIL_COUNT;
        packed=0x02000000|BOCU1_TRAIL_TO_BYTE(trail);

        packed|=(BOCU1_START_POS_2+diff)<<8;
    } else if(diff<=BOCU1_REACH_POS_3) {
        /* three bytes, positive difference */
        diff-=BOCU1_REACH_POS_2+1;

        trail=diff%BOCU1_TRAIL_COUNT;
        diff/=BOCU1_TRAIL_COUNT;
        packed=0x03000000|BOCU1_TRAIL_TO_BYTE(trail);

        trail=diff%BOCU1_TRAIL_COUNT;
        diff/=BOCU1_TRAIL_COUNT;
        packed|=BOCU1_TRAIL_TO_BYTE(trail)<<8;

        packed|=(BOCU1_START_POS_3+diff)<<16;
    } else {
        /* four bytes, positive difference */
        diff-=BOCU1_REACH_POS_3+1;

        trail=diff%BOCU1_TRAIL_COUNT;
        diff/=BOCU1_TRAIL_COUNT;
        packed=BOCU1_TRAIL_TO_BYTE(trail);

        trail=diff%BOCU1_TRAIL_COUNT;
        diff/=BOCU1_TRAIL_COUNT;
        packed|=BOCU1_TRAIL_TO_BYTE(trail)<<8;

        /*
         * Another / and % would deliver quotient 0 and rest=diff,
         * so diff itself is the third trail value.
         */
        packed|=BOCU1_TRAIL_TO_BYTE(diff)<<16;

        packed|=((uint32_t)BOCU1_START_POS_4)<<24;
    }
    return packed;
}
/**
 * Convert UTF-16 text to BOCU-1 bytes and record, for every output byte,
 * the source index of the character that produced it.
 *
 * Reads pArgs->source..sourceLimit and writes to pArgs->target..targetLimit;
 * pArgs->source, pArgs->target and pArgs->offsets are advanced on return.
 * pArgs->offsets is written unconditionally, one entry per output byte that
 * goes to the target.
 *
 * Converter state used:
 *   cnv->fromUnicodeStatus  encoder's prev (0 is interpreted as BOCU1_ASCII_PREV)
 *   cnv->fromUChar32        lead surrogate left over from the previous buffer, or 0
 *   cnv->charErrorBuffer    receives the bytes of a character that did not
 *                           fit completely into the target
 *
 * A character that began in the previous buffer gets the offset -1.
 *
 * @param pArgs      conversion arguments (source, target, offsets, converter)
 * @param pErrorCode set to U_BUFFER_OVERFLOW_ERROR when the target is full
 *                   while input remains or output had to be buffered
 */
static void U_CALLCONV
_Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
                             UErrorCode *pErrorCode) {
    UConverter *cnv;
    const UChar *source, *sourceLimit;
    uint8_t *target;
    int32_t targetCapacity;
    int32_t *offsets;

    int32_t prev, c, diff;

    int32_t sourceIndex, nextSourceIndex;

    /* set up the local pointers */
    cnv=pArgs->converter;
    source=pArgs->source;
    sourceLimit=pArgs->sourceLimit;
    target=(uint8_t *)pArgs->target;
    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
    offsets=pArgs->offsets;

    /* get the converter state from UConverter */
    c=cnv->fromUChar32;
    prev=(int32_t)cnv->fromUnicodeStatus;
    if(prev==0) {
        prev=BOCU1_ASCII_PREV;
    }

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex= c==0 ? 0 : -1;
    nextSourceIndex=0;

    /* conversion loop */
    if(c!=0) {
        if(targetCapacity>0) {
            goto getTrail;
        }
        /*
         * A lead surrogate is pending but the target has no room.
         * Neither loop below can consume it in this call (both require
         * targetCapacity>0), so mark it as "incomplete" (negative) now.
         * Otherwise the state write-back at the end would store 0
         * and the lead surrogate would be lost.
         */
        c=-c;
    }

fastSingle:
    /* fast loop for single-byte differences */
    /* use only one loop counter variable, targetCapacity, not also source */
    diff=(int32_t)(sourceLimit-source);
    if(targetCapacity>diff) {
        targetCapacity=diff;
    }
    while(targetCapacity>0 && (c=*source)<0x3000) {
        if(c<=0x20) {
            if(c!=0x20) {
                prev=BOCU1_ASCII_PREV;
            }
            *target++=(uint8_t)c;
            *offsets++=nextSourceIndex++;
            ++source;
            --targetCapacity;
        } else {
            diff=c-prev;
            if(DIFF_IS_SINGLE(diff)) {
                prev=BOCU1_SIMPLE_PREV(c);
                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
                *offsets++=nextSourceIndex++;
                ++source;
                --targetCapacity;
            } else {
                /* needs more than one byte: leave it to the regular loop */
                break;
            }
        }
    }
    /* restore real values */
    targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
    sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */

    /* regular loop for all cases */
    while(source<sourceLimit) {
        if(targetCapacity>0) {
            c=*source++;
            ++nextSourceIndex;

            if(c<=0x20) {
                /*
                 * ISO C0 control & space:
                 * Encode directly for MIME compatibility,
                 * and reset state except for space, to not disrupt compression.
                 */
                if(c!=0x20) {
                    prev=BOCU1_ASCII_PREV;
                }
                *target++=(uint8_t)c;
                *offsets++=sourceIndex;
                --targetCapacity;

                sourceIndex=nextSourceIndex;
                continue;
            }

            if(U16_IS_LEAD(c)) {
getTrail:
                if(source<sourceLimit) {
                    /* test the following code unit */
                    UChar trail=*source;
                    if(U16_IS_TRAIL(trail)) {
                        ++source;
                        ++nextSourceIndex;
                        c=U16_GET_SUPPLEMENTARY(c, trail);
                    }
                    /* otherwise the unpaired lead surrogate is encoded as is */
                } else {
                    /* no more input */
                    c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
                    break;
                }
            }

            /*
             * all other Unicode code points c==U+0021..U+10ffff
             * are encoded with the difference c-prev
             *
             * a new prev is computed from c,
             * placed in the middle of a 0x80-block (for most small scripts) or
             * in the middle of the Unihan and Hangul blocks
             * to statistically minimize the following difference
             */
            diff=c-prev;
            prev=BOCU1_PREV(c);
            if(DIFF_IS_SINGLE(diff)) {
                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
                *offsets++=sourceIndex;
                --targetCapacity;
                sourceIndex=nextSourceIndex;
                if(c<0x3000) {
                    goto fastSingle;
                }
            } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
                /* optimize 2-byte case */
                int32_t m;

                if(diff>=0) {
                    diff-=BOCU1_REACH_POS_1+1;
                    m=diff%BOCU1_TRAIL_COUNT;
                    diff/=BOCU1_TRAIL_COUNT;
                    diff+=BOCU1_START_POS_2;
                } else {
                    diff-=BOCU1_REACH_NEG_1;
                    NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
                    diff+=BOCU1_START_NEG_2;
                }
                *target++=(uint8_t)diff;
                *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
                *offsets++=sourceIndex;
                *offsets++=sourceIndex;
                targetCapacity-=2;
                sourceIndex=nextSourceIndex;
            } else {
                int32_t length; /* will be 2..4 */

                diff=packDiff(diff);
                length=BOCU1_LENGTH_FROM_PACKED(diff);

                /* write the output character bytes from diff and length */
                /* from the first if in the loop we know that targetCapacity>0 */
                if(length<=targetCapacity) {
                    switch(length) {
                        /* each branch falls through to the next one */
                    case 4:
                        *target++=(uint8_t)(diff>>24);
                        *offsets++=sourceIndex;
                        U_FALLTHROUGH;
                    case 3:
                        *target++=(uint8_t)(diff>>16);
                        *offsets++=sourceIndex;
                        U_FALLTHROUGH;
                    case 2:
                        *target++=(uint8_t)(diff>>8);
                        *offsets++=sourceIndex;
                    /* case 1: handled above */
                        *target++=(uint8_t)diff;
                        *offsets++=sourceIndex;
                        U_FALLTHROUGH;
                    default:
                        /* will never occur */
                        break;
                    }
                    targetCapacity-=length;
                    sourceIndex=nextSourceIndex;
                } else {
                    uint8_t *charErrorBuffer;

                    /*
                     * We actually do this backwards here:
                     * In order to save an intermediate variable, we output
                     * first to the overflow buffer what does not fit into the
                     * regular target.
                     */
                    /* we know that 1<=targetCapacity<length<=4 */
                    length-=targetCapacity;
                    charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
                    switch(length) {
                        /* each branch falls through to the next one */
                    case 3:
                        *charErrorBuffer++=(uint8_t)(diff>>16);
                        U_FALLTHROUGH;
                    case 2:
                        *charErrorBuffer++=(uint8_t)(diff>>8);
                        U_FALLTHROUGH;
                    case 1:
                        *charErrorBuffer=(uint8_t)diff;
                        U_FALLTHROUGH;
                    default:
                        /* will never occur */
                        break;
                    }
                    cnv->charErrorBufferLength=(int8_t)length;

                    /* now output what fits into the regular target */
                    diff>>=8*length; /* length was reduced by targetCapacity */
                    switch(targetCapacity) {
                        /* each branch falls through to the next one */
                    case 3:
                        *target++=(uint8_t)(diff>>16);
                        *offsets++=sourceIndex;
                        U_FALLTHROUGH;
                    case 2:
                        *target++=(uint8_t)(diff>>8);
                        *offsets++=sourceIndex;
                        U_FALLTHROUGH;
                    case 1:
                        *target++=(uint8_t)diff;
                        *offsets++=sourceIndex;
                        U_FALLTHROUGH;
                    default:
                        /* will never occur */
                        break;
                    }

                    /* target overflow */
                    targetCapacity=0;
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                    break;
                }
            }
        } else {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }

    /* set the converter state back into UConverter */
    cnv->fromUChar32= c<0 ? -c : 0;
    cnv->fromUnicodeStatus=(uint32_t)prev;

    /* write back the updated pointers */
    pArgs->source=source;
    pArgs->target=(char *)target;
    pArgs->offsets=offsets;
}
/**
 * Convert UTF-16 text to BOCU-1 bytes, without offset handling.
 *
 * Mirrors _Bocu1FromUnicodeWithOffsets() minus the variables
 * offsets, sourceIndex, and nextSourceIndex.
 * If a change is made in one of the two functions, then make the same
 * change in the other one.
 *
 * Reads pArgs->source..sourceLimit and writes to pArgs->target..targetLimit;
 * pArgs->source and pArgs->target are advanced on return.
 *
 * Converter state used:
 *   cnv->fromUnicodeStatus  encoder's prev (0 is interpreted as BOCU1_ASCII_PREV)
 *   cnv->fromUChar32        lead surrogate left over from the previous buffer, or 0
 *   cnv->charErrorBuffer    receives the bytes of a character that did not
 *                           fit completely into the target
 *
 * @param pArgs      conversion arguments (source, target, converter)
 * @param pErrorCode set to U_BUFFER_OVERFLOW_ERROR when the target is full
 *                   while input remains or output had to be buffered
 */
static void U_CALLCONV
_Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
                  UErrorCode *pErrorCode) {
    UConverter *cnv;
    const UChar *source, *sourceLimit;
    uint8_t *target;
    int32_t targetCapacity;

    int32_t prev, c, diff;

    /* set up the local pointers */
    cnv=pArgs->converter;
    source=pArgs->source;
    sourceLimit=pArgs->sourceLimit;
    target=(uint8_t *)pArgs->target;
    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);

    /* get the converter state from UConverter */
    c=cnv->fromUChar32;
    prev=(int32_t)cnv->fromUnicodeStatus;
    if(prev==0) {
        prev=BOCU1_ASCII_PREV;
    }

    /* conversion loop */
    if(c!=0) {
        if(targetCapacity>0) {
            goto getTrail;
        }
        /*
         * A lead surrogate is pending but the target has no room.
         * Neither loop below can consume it in this call (both require
         * targetCapacity>0), so mark it as "incomplete" (negative) now.
         * Otherwise the state write-back at the end would store 0
         * and the lead surrogate would be lost.
         */
        c=-c;
    }

fastSingle:
    /* fast loop for single-byte differences */
    /* use only one loop counter variable, targetCapacity, not also source */
    diff=(int32_t)(sourceLimit-source);
    if(targetCapacity>diff) {
        targetCapacity=diff;
    }
    while(targetCapacity>0 && (c=*source)<0x3000) {
        if(c<=0x20) {
            if(c!=0x20) {
                prev=BOCU1_ASCII_PREV;
            }
            *target++=(uint8_t)c;
        } else {
            diff=c-prev;
            if(DIFF_IS_SINGLE(diff)) {
                prev=BOCU1_SIMPLE_PREV(c);
                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
            } else {
                /* needs more than one byte: leave it to the regular loop */
                break;
            }
        }
        ++source;
        --targetCapacity;
    }
    /* restore real values */
    targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);

    /* regular loop for all cases */
    while(source<sourceLimit) {
        if(targetCapacity>0) {
            c=*source++;

            if(c<=0x20) {
                /*
                 * ISO C0 control & space:
                 * Encode directly for MIME compatibility,
                 * and reset state except for space, to not disrupt compression.
                 */
                if(c!=0x20) {
                    prev=BOCU1_ASCII_PREV;
                }
                *target++=(uint8_t)c;
                --targetCapacity;
                continue;
            }

            if(U16_IS_LEAD(c)) {
getTrail:
                if(source<sourceLimit) {
                    /* test the following code unit */
                    UChar trail=*source;
                    if(U16_IS_TRAIL(trail)) {
                        ++source;
                        c=U16_GET_SUPPLEMENTARY(c, trail);
                    }
                    /* otherwise the unpaired lead surrogate is encoded as is */
                } else {
                    /* no more input */
                    c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
                    break;
                }
            }

            /*
             * all other Unicode code points c==U+0021..U+10ffff
             * are encoded with the difference c-prev
             *
             * a new prev is computed from c,
             * placed in the middle of a 0x80-block (for most small scripts) or
             * in the middle of the Unihan and Hangul blocks
             * to statistically minimize the following difference
             */
            diff=c-prev;
            prev=BOCU1_PREV(c);
            if(DIFF_IS_SINGLE(diff)) {
                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
                --targetCapacity;
                if(c<0x3000) {
                    goto fastSingle;
                }
            } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
                /* optimize 2-byte case */
                int32_t m;

                if(diff>=0) {
                    diff-=BOCU1_REACH_POS_1+1;
                    m=diff%BOCU1_TRAIL_COUNT;
                    diff/=BOCU1_TRAIL_COUNT;
                    diff+=BOCU1_START_POS_2;
                } else {
                    diff-=BOCU1_REACH_NEG_1;
                    NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
                    diff+=BOCU1_START_NEG_2;
                }
                *target++=(uint8_t)diff;
                *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
                targetCapacity-=2;
            } else {
                int32_t length; /* will be 2..4 */

                diff=packDiff(diff);
                length=BOCU1_LENGTH_FROM_PACKED(diff);

                /* write the output character bytes from diff and length */
                /* from the first if in the loop we know that targetCapacity>0 */
                if(length<=targetCapacity) {
                    switch(length) {
                        /* each branch falls through to the next one */
                    case 4:
                        *target++=(uint8_t)(diff>>24);
                        U_FALLTHROUGH;
                    case 3:
                        *target++=(uint8_t)(diff>>16);
                    /* case 2: handled above */
                        *target++=(uint8_t)(diff>>8);
                    /* case 1: handled above */
                        *target++=(uint8_t)diff;
                        U_FALLTHROUGH;
                    default:
                        /* will never occur */
                        break;
                    }
                    targetCapacity-=length;
                } else {
                    uint8_t *charErrorBuffer;

                    /*
                     * We actually do this backwards here:
                     * In order to save an intermediate variable, we output
                     * first to the overflow buffer what does not fit into the
                     * regular target.
                     */
                    /* we know that 1<=targetCapacity<length<=4 */
                    length-=targetCapacity;
                    charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
                    switch(length) {
                        /* each branch falls through to the next one */
                    case 3:
                        *charErrorBuffer++=(uint8_t)(diff>>16);
                        U_FALLTHROUGH;
                    case 2:
                        *charErrorBuffer++=(uint8_t)(diff>>8);
                        U_FALLTHROUGH;
                    case 1:
                        *charErrorBuffer=(uint8_t)diff;
                        U_FALLTHROUGH;
                    default:
                        /* will never occur */
                        break;
                    }
                    cnv->charErrorBufferLength=(int8_t)length;

                    /* now output what fits into the regular target */
                    diff>>=8*length; /* length was reduced by targetCapacity */
                    switch(targetCapacity) {
                        /* each branch falls through to the next one */
                    case 3:
                        *target++=(uint8_t)(diff>>16);
                        U_FALLTHROUGH;
                    case 2:
                        *target++=(uint8_t)(diff>>8);
                        U_FALLTHROUGH;
                    case 1:
                        *target++=(uint8_t)diff;
                        U_FALLTHROUGH;
                    default:
                        /* will never occur */
                        break;
                    }

                    /* target overflow */
                    targetCapacity=0;
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                    break;
                }
            }
        } else {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }

    /* set the converter state back into UConverter */
    cnv->fromUChar32= c<0 ? -c : 0;
    cnv->fromUnicodeStatus=(uint32_t)prev;

    /* write back the updated pointers */
    pArgs->source=source;
    pArgs->target=(char *)target;
}
/* BOCU-1-to-Unicode conversion functions ----------------------------------- */
/**
 * Function for BOCU-1 decoder; handles multi-byte lead bytes.
 *
 * Computes the part of the difference that is determined by the lead byte
 * alone, together with the number of trail bytes that must follow.
 *
 * @param b lead byte;
 *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<=BOCU1_MAX_LEAD
 * @return (diff<<2)|count, where count is the number of trail bytes (1..3);
 *         callers split it again with count=state&3 and diff=state>>2
 */
static inline int32_t
decodeBocu1LeadByte(int32_t b) {
    int32_t diff, count;

    if(b>=BOCU1_START_NEG_2) {
        /* positive difference */
        if(b<BOCU1_START_POS_3) {
            /* two bytes */
            diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
            count=1;
        } else if(b<BOCU1_START_POS_4) {
            /* three bytes */
            diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
            count=2;
        } else {
            /* four bytes */
            diff=BOCU1_REACH_POS_3+1;
            count=3;
        }
    } else {
        /* negative difference */
        if(b>=BOCU1_START_NEG_3) {
            /* two bytes */
            diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
            count=1;
        } else if(b>BOCU1_MIN) {
            /* three bytes */
            diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
            count=2;
        } else {
            /* four bytes */
            diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
            count=3;
        }
    }

    /*
     * Return the state for decoding the trail byte(s).
     * diff may be negative; shift it as an unsigned value because
     * left-shifting a negative signed integer is undefined behavior.
     */
    return (int32_t)(((uint32_t)diff<<2)|(uint32_t)count);
}
/**
 * Function for BOCU-1 decoder; handles multi-byte trail bytes.
 *
 * Maps the external byte to its trail value and scales it by the weight of
 * its position in the sequence.
 *
 * @param count number of remaining trail bytes including this one (1..3)
 * @param b trail byte
 * @return new delta for diff including b - <0 indicates an error
 *
 * @see decodeBocu1
 */
static inline int32_t
decodeBocu1TrailByte(int32_t count, int32_t b) {
    int32_t trail;

    if(b<=0x20) {
        /*
         * C0 control or space: only some of them are trail bytes.
         * An illegal one maps to -1, which makes the result below negative.
         */
        trail=bocu1ByteToTrail[b];
    }
#if BOCU1_MAX_TRAIL<0xff
    else if(b>BOCU1_MAX_TRAIL) {
        return -99;
    }
#endif
    else {
        /* all other bytes form a contiguous range after the controls */
        trail=b-BOCU1_TRAIL_BYTE_OFFSET;
    }

    /* weight the trail value by its position */
    switch(count) {
    case 1:
        return trail;
    case 2:
        return trail*BOCU1_TRAIL_COUNT;
    default: /* count==3 */
        return trail*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
    }
}
/**
 * Convert BOCU-1 bytes to UTF-16 and record, for every output code unit,
 * the source index of the byte sequence that produced it.
 *
 * Reads pArgs->source..sourceLimit and writes to pArgs->target..targetLimit;
 * pArgs->source, pArgs->target and pArgs->offsets are advanced on return.
 * pArgs->offsets is written unconditionally, one entry per output code unit
 * that goes to the target.
 *
 * Converter state used:
 *   cnv->toUnicodeStatus    decoder's prev (0 is interpreted as BOCU1_ASCII_PREV)
 *   cnv->mode               decoder's incomplete (diff<<2)|count
 *                           (ignored when toULength==0)
 *   cnv->toUBytes/toULength bytes of an incomplete or illegal sequence
 *   cnv->UCharErrorBuffer   receives a trail surrogate that did not fit
 *
 * A sequence that began in the previous buffer gets the offset -1.
 *
 * @param pArgs      conversion arguments (source, target, offsets, converter)
 * @param pErrorCode set to U_BUFFER_OVERFLOW_ERROR when the target is full,
 *                   or to U_ILLEGAL_CHAR_FOUND for an illegal trail byte or
 *                   a result outside 0..0x10ffff; after an illegal sequence
 *                   the decoder state is reset
 */
static void U_CALLCONV
_Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                           UErrorCode *pErrorCode) {
    UConverter *cnv;
    const uint8_t *source, *sourceLimit;
    UChar *target;
    const UChar *targetLimit;
    int32_t *offsets;

    int32_t prev, count, diff, c;

    int8_t byteIndex;
    uint8_t *bytes;

    int32_t sourceIndex, nextSourceIndex;

    /* set up the local pointers */
    cnv=pArgs->converter;
    source=(const uint8_t *)pArgs->source;
    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    target=pArgs->target;
    targetLimit=pArgs->targetLimit;
    offsets=pArgs->offsets;

    /* get the converter state from UConverter */
    prev=(int32_t)cnv->toUnicodeStatus;
    if(prev==0) {
        prev=BOCU1_ASCII_PREV;
    }
    diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
    count=diff&3;
    diff>>=2;

    byteIndex=cnv->toULength;
    bytes=cnv->toUBytes;

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex=byteIndex==0 ? 0 : -1;
    nextSourceIndex=0;

    /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
    if(count>0 && byteIndex>0) {
        if(target<targetLimit) {
            goto getTrail;
        }
        /*
         * A multi-byte sequence is pending but the target is full.
         * The fast loop below reuses diff and count as scratch variables,
         * which would destroy the pending state. Leave everything untouched,
         * report the overflow if there is input left, and write the
         * unchanged state back.
         */
        if(source<sourceLimit) {
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
        }
        goto endloop;
    }

fastSingle:
    /* fast loop for single-byte differences */
    /* use count as the only loop counter variable */
    diff=(int32_t)(sourceLimit-source);
    count=(int32_t)(pArgs->targetLimit-target);
    if(count>diff) {
        count=diff;
    }
    while(count>0) {
        if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
            c=prev+(c-BOCU1_MIDDLE);
            if(c<0x3000) {
                *target++=(UChar)c;
                *offsets++=nextSourceIndex++;
                prev=BOCU1_SIMPLE_PREV(c);
            } else {
                /* needs bocu1Prev(): leave it to the regular loop */
                break;
            }
        } else if(c<=0x20) {
            if(c!=0x20) {
                prev=BOCU1_ASCII_PREV;
            }
            *target++=(UChar)c;
            *offsets++=nextSourceIndex++;
        } else {
            /* lead byte of a multi-byte sequence, or BOCU1_RESET */
            break;
        }
        ++source;
        --count;
    }
    sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */

    /* decode a sequence of single and lead bytes */
    while(source<sourceLimit) {
        if(target>=targetLimit) {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            break;
        }

        ++nextSourceIndex;
        c=*source++;
        if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
            /* Write a code point directly from a single-byte difference. */
            c=prev+(c-BOCU1_MIDDLE);
            if(c<0x3000) {
                *target++=(UChar)c;
                *offsets++=sourceIndex;
                prev=BOCU1_SIMPLE_PREV(c);
                sourceIndex=nextSourceIndex;
                goto fastSingle;
            }
        } else if(c<=0x20) {
            /*
             * Direct-encoded C0 control code or space.
             * Reset prev for C0 control codes but not for space.
             */
            if(c!=0x20) {
                prev=BOCU1_ASCII_PREV;
            }
            *target++=(UChar)c;
            *offsets++=sourceIndex;
            sourceIndex=nextSourceIndex;
            continue;
        } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
            /* Optimize two-byte case. */
            if(c>=BOCU1_MIDDLE) {
                diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
            } else {
                diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
            }

            /* trail byte */
            ++nextSourceIndex;
            c=decodeBocu1TrailByte(1, *source++);
            if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
                /* hand both bytes of the illegal sequence to the caller */
                bytes[0]=source[-2];
                bytes[1]=source[-1];
                byteIndex=2;
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                break;
            }
        } else if(c==BOCU1_RESET) {
            /* only reset the state, no code point */
            prev=BOCU1_ASCII_PREV;
            sourceIndex=nextSourceIndex;
            continue;
        } else {
            /*
             * For multi-byte difference lead bytes, set the decoder state
             * with the partial difference value from the lead byte and
             * with the number of trail bytes.
             */
            bytes[0]=(uint8_t)c;
            byteIndex=1;

            diff=decodeBocu1LeadByte(c);
            count=diff&3;
            diff>>=2;
getTrail:
            for(;;) {
                if(source>=sourceLimit) {
                    /* incomplete sequence: diff, count and byteIndex are saved below */
                    goto endloop;
                }
                ++nextSourceIndex;
                c=bytes[byteIndex++]=*source++;

                /* trail byte in any position */
                c=decodeBocu1TrailByte(count, c);
                if(c<0) {
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                    goto endloop;
                }

                diff+=c;
                if(--count==0) {
                    /* final trail byte, deliver a code point */
                    byteIndex=0;
                    c=prev+diff;
                    if((uint32_t)c>0x10ffff) {
                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                        goto endloop;
                    }
                    break;
                }
            }
        }

        /* calculate the next prev and output c */
        prev=BOCU1_PREV(c);
        if(c<=0xffff) {
            *target++=(UChar)c;
            *offsets++=sourceIndex;
        } else {
            /* output surrogate pair */
            *target++=U16_LEAD(c);
            if(target<targetLimit) {
                *target++=U16_TRAIL(c);
                *offsets++=sourceIndex;
                *offsets++=sourceIndex;
            } else {
                /* target overflow */
                *offsets++=sourceIndex;
                cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
                cnv->UCharErrorBufferLength=1;
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                break;
            }
        }
        sourceIndex=nextSourceIndex;
    }
endloop:

    if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
        /* set the converter state in UConverter to deal with the next character */
        cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
        cnv->mode=0;
    } else {
        /* set the converter state back into UConverter */
        cnv->toUnicodeStatus=(uint32_t)prev;
        /* diff may be negative: shift as unsigned to avoid undefined behavior */
        cnv->mode=(int32_t)(((uint32_t)diff<<2)|(uint32_t)count);
    }
    cnv->toULength=byteIndex;

    /* write back the updated pointers */
    pArgs->source=(const char *)source;
    pArgs->target=target;
    pArgs->offsets=offsets;
    return;
}
/**
 * Convert BOCU-1 bytes to UTF-16, without offset handling.
 *
 * Mirrors _Bocu1ToUnicodeWithOffsets() minus the variables
 * offsets, sourceIndex, and nextSourceIndex.
 * If a change is made in one of the two functions, then make the same
 * change in the other one.
 *
 * Reads pArgs->source..sourceLimit and writes to pArgs->target..targetLimit;
 * pArgs->source and pArgs->target are advanced on return.
 *
 * Converter state used:
 *   cnv->toUnicodeStatus    decoder's prev (0 is interpreted as BOCU1_ASCII_PREV)
 *   cnv->mode               decoder's incomplete (diff<<2)|count
 *                           (ignored when toULength==0)
 *   cnv->toUBytes/toULength bytes of an incomplete or illegal sequence
 *   cnv->UCharErrorBuffer   receives a trail surrogate that did not fit
 *
 * @param pArgs      conversion arguments (source, target, converter)
 * @param pErrorCode set to U_BUFFER_OVERFLOW_ERROR when the target is full,
 *                   or to U_ILLEGAL_CHAR_FOUND for an illegal trail byte or
 *                   a result outside 0..0x10ffff; after an illegal sequence
 *                   the decoder state is reset
 */
static void U_CALLCONV
_Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
                UErrorCode *pErrorCode) {
    UConverter *cnv;
    const uint8_t *source, *sourceLimit;
    UChar *target;
    const UChar *targetLimit;

    int32_t prev, count, diff, c;

    int8_t byteIndex;
    uint8_t *bytes;

    /* set up the local pointers */
    cnv=pArgs->converter;
    source=(const uint8_t *)pArgs->source;
    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    target=pArgs->target;
    targetLimit=pArgs->targetLimit;

    /* get the converter state from UConverter */
    prev=(int32_t)cnv->toUnicodeStatus;
    if(prev==0) {
        prev=BOCU1_ASCII_PREV;
    }
    diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
    count=diff&3;
    diff>>=2;

    byteIndex=cnv->toULength;
    bytes=cnv->toUBytes;

    /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
    if(count>0 && byteIndex>0) {
        if(target<targetLimit) {
            goto getTrail;
        }
        /*
         * A multi-byte sequence is pending but the target is full.
         * The fast loop below reuses diff and count as scratch variables,
         * which would destroy the pending state. Leave everything untouched,
         * report the overflow if there is input left, and write the
         * unchanged state back.
         */
        if(source<sourceLimit) {
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
        }
        goto endloop;
    }

fastSingle:
    /* fast loop for single-byte differences */
    /* use count as the only loop counter variable */
    diff=(int32_t)(sourceLimit-source);
    count=(int32_t)(pArgs->targetLimit-target);
    if(count>diff) {
        count=diff;
    }
    while(count>0) {
        if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
            c=prev+(c-BOCU1_MIDDLE);
            if(c<0x3000) {
                *target++=(UChar)c;
                prev=BOCU1_SIMPLE_PREV(c);
            } else {
                /* needs bocu1Prev(): leave it to the regular loop */
                break;
            }
        } else if(c<=0x20) {
            if(c!=0x20) {
                prev=BOCU1_ASCII_PREV;
            }
            *target++=(UChar)c;
        } else {
            /* lead byte of a multi-byte sequence, or BOCU1_RESET */
            break;
        }
        ++source;
        --count;
    }

    /* decode a sequence of single and lead bytes */
    while(source<sourceLimit) {
        if(target>=targetLimit) {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            break;
        }

        c=*source++;
        if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
            /* Write a code point directly from a single-byte difference. */
            c=prev+(c-BOCU1_MIDDLE);
            if(c<0x3000) {
                *target++=(UChar)c;
                prev=BOCU1_SIMPLE_PREV(c);
                goto fastSingle;
            }
        } else if(c<=0x20) {
            /*
             * Direct-encoded C0 control code or space.
             * Reset prev for C0 control codes but not for space.
             */
            if(c!=0x20) {
                prev=BOCU1_ASCII_PREV;
            }
            *target++=(UChar)c;
            continue;
        } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
            /* Optimize two-byte case. */
            if(c>=BOCU1_MIDDLE) {
                diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
            } else {
                diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
            }

            /* trail byte */
            c=decodeBocu1TrailByte(1, *source++);
            if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
                /* hand both bytes of the illegal sequence to the caller */
                bytes[0]=source[-2];
                bytes[1]=source[-1];
                byteIndex=2;
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                break;
            }
        } else if(c==BOCU1_RESET) {
            /* only reset the state, no code point */
            prev=BOCU1_ASCII_PREV;
            continue;
        } else {
            /*
             * For multi-byte difference lead bytes, set the decoder state
             * with the partial difference value from the lead byte and
             * with the number of trail bytes.
             */
            bytes[0]=(uint8_t)c;
            byteIndex=1;

            diff=decodeBocu1LeadByte(c);
            count=diff&3;
            diff>>=2;
getTrail:
            for(;;) {
                if(source>=sourceLimit) {
                    /* incomplete sequence: diff, count and byteIndex are saved below */
                    goto endloop;
                }
                c=bytes[byteIndex++]=*source++;

                /* trail byte in any position */
                c=decodeBocu1TrailByte(count, c);
                if(c<0) {
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                    goto endloop;
                }

                diff+=c;
                if(--count==0) {
                    /* final trail byte, deliver a code point */
                    byteIndex=0;
                    c=prev+diff;
                    if((uint32_t)c>0x10ffff) {
                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                        goto endloop;
                    }
                    break;
                }
            }
        }

        /* calculate the next prev and output c */
        prev=BOCU1_PREV(c);
        if(c<=0xffff) {
            *target++=(UChar)c;
        } else {
            /* output surrogate pair */
            *target++=U16_LEAD(c);
            if(target<targetLimit) {
                *target++=U16_TRAIL(c);
            } else {
                /* target overflow */
                cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
                cnv->UCharErrorBufferLength=1;
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                break;
            }
        }
    }
endloop:

    if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
        /* set the converter state in UConverter to deal with the next character */
        cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
        cnv->mode=0;
    } else {
        /* set the converter state back into UConverter */
        cnv->toUnicodeStatus=(uint32_t)prev;
        /* diff may be negative: shift as unsigned to avoid undefined behavior */
        cnv->mode=(int32_t)(((uint32_t)diff<<2)|(uint32_t)count);
    }
    cnv->toULength=byteIndex;

    /* write back the updated pointers */
    pArgs->source=(const char *)source;
    pArgs->target=target;
    return;
}
/* miscellaneous ------------------------------------------------------------ */
/*
* Implementation table for the BOCU-1 converter.
* Only the converter type, the four conversion functions and the
* Unicode-set getter are provided; every other slot is NULL.
* NOTE(review): the meaning of each positional slot comes from the
* UConverterImpl declaration, which is not visible in this file -
* verify the order against that declaration before adding or moving entries.
*/
static const UConverterImpl _Bocu1Impl={
UCNV_BOCU1,
NULL,
NULL,
NULL,
NULL,
NULL,
_Bocu1ToUnicode,
_Bocu1ToUnicodeWithOffsets,
_Bocu1FromUnicode,
_Bocu1FromUnicodeWithOffsets,
NULL,
NULL,
NULL,
NULL,
NULL,
ucnv_getCompleteUnicodeSet,
NULL,
NULL
};
/*
* Static description of the BOCU-1 converter: name, CCSID, platform and
* converter type, byte-length limits and the substitution byte.
* NOTE(review): fields without a trailing comment are positional members of
* UConverterStaticData, declared outside this file - confirm their meaning there.
*/
static const UConverterStaticData _Bocu1StaticData={
sizeof(UConverterStaticData),
"BOCU-1",
1214, /* CCSID for BOCU-1 */
UCNV_IBM, UCNV_BOCU1,
1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
{ 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
FALSE, FALSE,
0,
0,
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/*
* Shared data object for BOCU-1; ties the static description to the
* implementation table. This is the only non-static definition in this file.
*/
const UConverterSharedData _Bocu1Data=
UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Bocu1StaticData, &_Bocu1Impl);
#endif