blob: 83d79160e8f35ddc801bd3a3a4d78d3a06dc181e [file] [log] [blame]
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 2009-2015, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
#include "unicode/utypes.h"
#include "unicode/ures.h"
#include "unicode/putil.h"
#include "unicode/uloc.h"
#include "ustr_imp.h"
#include "charstr.h"
#include "cmemory.h"
#include "cstring.h"
#include "putilimp.h"
#include "uinvchar.h"
#include "ulocimp.h"
#include "uvector.h"
#include "uassert.h"
/*
 * Node of a singly linked list holding a single variant subtag.
 * `variant` points at a NUL-terminated string; in the list built by
 * _appendVariantsToLanguageTag() the string lives in the caller's buffer
 * and only the node itself is freed.
 */
typedef struct VariantListEntry {
const char *variant;              /* variant subtag text */
struct VariantListEntry *next;    /* next node, or NULL at the end of the list */
} VariantListEntry;
/*
 * Node of a singly linked list holding a single attribute value.
 * Whether `attribute` is owned by the node depends on the code that builds
 * the list (it is heap-allocated in _appendKeywordsToLanguageTag() and
 * points into a local buffer in _appendLDMLExtensionAsKeywords()).
 */
typedef struct AttributeListEntry {
const char *attribute;              /* attribute text, NUL-terminated */
struct AttributeListEntry *next;    /* next node, or NULL at the end of the list */
} AttributeListEntry;
/* struct holding a single extension */
typedef struct ExtensionListEntry {
const char *key;
const char *value;
struct ExtensionListEntry *next;
} ExtensionListEntry;
/* Number of slots in ULanguageTag::extlang. */
#define MAXEXTLANG 3
/*
 * Parsed representation of a language tag.
 * _initializeULanguageTag() sets the string fields to EMPTY and the
 * pointer/list fields to NULL.
 */
typedef struct ULanguageTag {
char *buf; /* holding parsed subtags */
const char *language;                 /* language subtag, EMPTY when absent */
const char *extlang[MAXEXTLANG];      /* extlang subtags, NULL for unused slots */
const char *script;                   /* script subtag, EMPTY when absent */
const char *region;                   /* region subtag, EMPTY when absent */
VariantListEntry *variants;           /* head of the variant list, or NULL */
ExtensionListEntry *extensions;       /* head of the extension list, or NULL */
const char *privateuse;               /* private use part, EMPTY when absent */
const char *grandfathered;            /* grandfathered tag, EMPTY when absent */
} ULanguageTag;
#define MINLEN 2
#define SEP '-'
#define PRIVATEUSE 'x'
#define LDMLEXT 'u'
#define LOCALE_SEP '_'
#define LOCALE_EXT_SEP '@'
#define LOCALE_KEYWORD_SEP ';'
#define LOCALE_KEY_TYPE_SEP '='
#define ISALPHA(c) uprv_isASCIILetter(c)
#define ISNUMERIC(c) ((c)>='0' && (c)<='9')
static const char EMPTY[] = "";
static const char LANG_UND[] = "und";
static const char PRIVATEUSE_KEY[] = "x";
static const char _POSIX[] = "_POSIX";
static const char POSIX_KEY[] = "va";
static const char POSIX_VALUE[] = "posix";
static const char LOCALE_ATTRIBUTE_KEY[] = "attribute";
static const char PRIVUSE_VARIANT_PREFIX[] = "lvariant";
static const char LOCALE_TYPE_YES[] = "yes";
#define LANG_UND_LEN 3
/*
Updated on 2018-09-12 from
https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
This table has 2 parts. The part for grandfathered tags is generated by the
following script from the IANA language tag registry.
curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\
egrep -A 7 'Type: grandfathered' | \
egrep 'Tag|Prefe' | grep -B1 'Preferred' | grep -v '^--' | \
awk -n '/Tag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' |\
tr 'A-Z' 'a-z'
The 2nd part is made of five ICU-specific entries. They're kept for
the backward compatibility for now, even though there are no preferred
values. They may have to be removed for the strict BCP 47 compliance.
*/
/*
 * Flat list of string pairs, all lower case: the entry at an even index is
 * a grandfathered tag and the entry right after it is its replacement.
 */
static const char* const GRANDFATHERED[] = {
/* grandfathered preferred */
"art-lojban", "jbo",
"en-gb-oed", "en-gb-oxendict",
"i-ami", "ami",
"i-bnn", "bnn",
"i-hak", "hak",
"i-klingon", "tlh",
"i-lux", "lb",
"i-navajo", "nv",
"i-pwn", "pwn",
"i-tao", "tao",
"i-tay", "tay",
"i-tsu", "tsu",
"no-bok", "nb",
"no-nyn", "nn",
"sgn-be-fr", "sfb",
"sgn-be-nl", "vgt",
"sgn-ch-de", "sgg",
"zh-guoyu", "cmn",
"zh-hakka", "hak",
"zh-min-nan", "nan",
"zh-xiang", "hsn",
// Grandfathered tags with no preferred value in the IANA
// registry. Kept for now for backward compatibility
// because ICU has mapped them this way.
"cel-gaulish", "xtg-x-cel-gaulish",
"i-default", "en-x-i-default",
"i-enochian", "und-x-i-enochian",
"i-mingo", "see-x-i-mingo",
"zh-min", "nan-x-zh-min",
};
/*
Updated on 2018-09-12 from
https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
The table lists redundant tags with a preferred value in the IANA language tag registry.
It's generated with the following command:
curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\
grep 'Type: redundant' -A 5 | egrep '^(Tag:|Prefer)' | grep -B1 'Preferred' | \
awk -n '/Tag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' | \
tr 'A-Z' 'a-z'
In addition, ja-latn-hepburn-heploc is mapped to ja-latn-alalc97 because
the variant tag 'hepburn-heploc' has the preferred subtag 'alalc97'.
*/
/*
 * Flat list of string pairs, all lower case: the entry at an even index is
 * a redundant tag and the entry right after it is its preferred value.
 */
static const char* const REDUNDANT[] = {
// redundant preferred
"sgn-br", "bzs",
"sgn-co", "csn",
"sgn-de", "gsg",
"sgn-dk", "dsl",
"sgn-es", "ssp",
"sgn-fr", "fsl",
"sgn-gb", "bfi",
"sgn-gr", "gss",
"sgn-ie", "isg",
"sgn-it", "ise",
"sgn-jp", "jsl",
"sgn-mx", "mfs",
"sgn-ni", "ncs",
"sgn-nl", "dse",
"sgn-no", "nsl",
"sgn-pt", "psr",
"sgn-se", "swl",
"sgn-us", "ase",
"sgn-za", "sfs",
"zh-cmn", "cmn",
"zh-cmn-hans", "cmn-hans",
"zh-cmn-hant", "cmn-hant",
"zh-gan", "gan",
"zh-wuu", "wuu",
"zh-yue", "yue",
// variant tag with preferred value
"ja-latn-hepburn-heploc", "ja-latn-alalc97",
};
/*
Updated on 2018-09-12 from
https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
grep 'Type: language' -A 7 language-subtag-registry | egrep 'Subtag|Prefe' | \
grep -B1 'Preferred' | grep -v '^--' | \
awk -n '/Subtag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}'
Make sure that 2-letter language subtags come before 3-letter subtags.
*/
/*
 * Flat list of pairs: the entry at an even index is a deprecated language
 * subtag and the entry right after it is its replacement. Each entry holds
 * at most 3 letters plus the terminating NUL.
 * _appendLanguageToLanguageTag() scans the even entries in order and stops
 * early for 2-letter input once it reaches the first 3-letter entry, so all
 * 2-letter entries must stay ahead of the 3-letter ones.
 */
static const char DEPRECATEDLANGS[][4] = {
/* deprecated new */
"in", "id",
"iw", "he",
"ji", "yi",
"jw", "jv",
"mo", "ro",
"aam", "aas",
"adp", "dz",
"aue", "ktz",
"ayx", "nun",
"bgm", "bcg",
"bjd", "drl",
"ccq", "rki",
"cjr", "mom",
"cka", "cmr",
"cmk", "xch",
"coy", "pij",
"cqu", "quh",
"drh", "khk",
"drw", "prs",
"gav", "dev",
"gfx", "vaj",
"ggn", "gvr",
"gti", "nyc",
"guv", "duz",
"hrr", "jal",
"ibi", "opa",
"ilw", "gal",
"jeg", "oyb",
"kgc", "tdf",
"kgh", "kml",
"koj", "kwv",
"krm", "bmf",
"ktr", "dtp",
"kvs", "gdj",
"kwq", "yam",
"kxe", "tvd",
"kzj", "dtp",
"kzt", "dtp",
"lii", "raq",
"lmm", "rmx",
"meg", "cir",
"mst", "mry",
"mwj", "vaj",
"myt", "mry",
"nad", "xny",
"ncp", "kdz",
"nnx", "ngv",
"nts", "pij",
"oun", "vaj",
"pcr", "adx",
"pmc", "huw",
"pmu", "phr",
"ppa", "bfy",
"ppr", "lcq",
"pry", "prt",
"puz", "pub",
"sca", "hle",
"skk", "oyb",
"tdu", "dtp",
"thc", "tpo",
"thx", "oyb",
"tie", "ras",
"tkk", "twm",
"tlw", "weo",
"tmp", "tyj",
"tne", "kak",
"tnf", "prs",
"tsf", "taj",
"uok", "ema",
"xba", "cax",
"xia", "acn",
"xkh", "waw",
"xsj", "suj",
"ybd", "rki",
"yma", "lrr",
"ymt", "mtm",
"yos", "zom",
"yuu", "yug",
};
/*
Updated on 2018-04-24 from
curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry | \
grep 'Type: region' -A 7 | egrep 'Subtag|Prefe' | \
grep -B1 'Preferred' | \
awk -n '/Subtag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}'
*/
/*
 * Flat list of pairs: the entry at an even index is a deprecated 2-letter
 * region subtag and the entry right after it is its replacement.
 * _appendRegionToLanguageTag() walks the table two entries at a time.
 */
static const char DEPRECATEDREGIONS[][3] = {
/* deprecated new */
"BU", "MM",
"DD", "DE",
"FX", "FR",
"TP", "TL",
"YD", "YE",
"ZR", "CD",
};
/*
* -------------------------------------------------
*
* These ultag_ functions may be exposed as APIs later
*
* -------------------------------------------------
*/
/*
 * Forward declarations of the file-local ULanguageTag parser and accessors.
 * The definitions are not part of the section shown here (presumably they
 * follow later in the file). The prototypes inside "#if 0" are compiled out.
 */
static ULanguageTag*
ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* status);
static void
ultag_close(ULanguageTag* langtag);
static const char*
ultag_getLanguage(const ULanguageTag* langtag);
#if 0
static const char*
ultag_getJDKLanguage(const ULanguageTag* langtag);
#endif
static const char*
ultag_getExtlang(const ULanguageTag* langtag, int32_t idx);
static int32_t
ultag_getExtlangSize(const ULanguageTag* langtag);
static const char*
ultag_getScript(const ULanguageTag* langtag);
static const char*
ultag_getRegion(const ULanguageTag* langtag);
static const char*
ultag_getVariant(const ULanguageTag* langtag, int32_t idx);
static int32_t
ultag_getVariantsSize(const ULanguageTag* langtag);
static const char*
ultag_getExtensionKey(const ULanguageTag* langtag, int32_t idx);
static const char*
ultag_getExtensionValue(const ULanguageTag* langtag, int32_t idx);
static int32_t
ultag_getExtensionsSize(const ULanguageTag* langtag);
static const char*
ultag_getPrivateUse(const ULanguageTag* langtag);
#if 0
static const char*
ultag_getGrandfathered(const ULanguageTag* langtag);
#endif
namespace {

// Owns a growing set of heap-allocated CharString objects so that callers can
// hand out stable pointers without tracking each allocation themselves.
// Intended for stack allocation only, hence no UMemory base class.
class CharStringPool {
public:
    CharStringPool() : status(U_ZERO_ERROR), pool(&deleter, nullptr, status) {}
    ~CharStringPool() = default;

    // Not copyable: the pool owns its elements.
    CharStringPool(const CharStringPool&) = delete;
    CharStringPool& operator=(const CharStringPool&) = delete;

    // Returns a new, empty CharString owned by the pool, or nullptr when the
    // pool is already in an error state, the allocation fails, or the object
    // cannot be stored in the pool. Once an error is recorded, every later
    // call returns nullptr.
    icu::CharString* create() {
        if (U_SUCCESS(status)) {
            icu::CharString* const fresh = new icu::CharString;
            if (fresh != nullptr) {
                pool.addElement(fresh, status);
                if (U_SUCCESS(status)) {
                    return fresh;
                }
                // The vector did not take the object; release it here.
                delete fresh;
            } else {
                status = U_MEMORY_ALLOCATION_ERROR;
            }
        }
        return nullptr;
    }

private:
    // Element deleter handed to the UVector in the constructor.
    static void U_CALLCONV deleter(void* obj) {
        delete static_cast<icu::CharString*>(obj);
    }

    // Declared before `pool` because the constructor passes it to pool's
    // constructor.
    UErrorCode status;
    icu::UVector pool;
};

}  // namespace
/*
* -------------------------------------------------
*
* Language subtag syntax validation functions
*
* -------------------------------------------------
*/
/* Returns TRUE when the first `len` chars of `s` are all ASCII letters
 * (vacuously TRUE for len <= 0). */
static UBool
_isAlphaString(const char* s, int32_t len) {
    for (int32_t remaining = len; remaining > 0; --remaining, ++s) {
        if (!ISALPHA(*s)) {
            return FALSE;
        }
    }
    return TRUE;
}
/* Returns TRUE when the first `len` chars of `s` are all ASCII digits
 * (vacuously TRUE for len <= 0). */
static UBool
_isNumericString(const char* s, int32_t len) {
    for (int32_t remaining = len; remaining > 0; --remaining, ++s) {
        if (!ISNUMERIC(*s)) {
            return FALSE;
        }
    }
    return TRUE;
}
/* Returns TRUE when the first `len` chars of `s` are all ASCII letters or
 * digits (vacuously TRUE for len <= 0). */
static UBool
_isAlphaNumericString(const char* s, int32_t len) {
    for (int32_t remaining = len; remaining > 0; --remaining, ++s) {
        if (!(ISALPHA(*s) || ISNUMERIC(*s))) {
            return FALSE;
        }
    }
    return TRUE;
}
/*
 * Checks the syntax of a language subtag: 2 to 8 ASCII letters.
 *
 * language      = 2*3ALPHA            ; shortest ISO 639 code
 *                 ["-" extlang]       ; sometimes followed by
 *                                     ;   extended language subtags
 *               / 4ALPHA              ; or reserved for future use
 *               / 5*8ALPHA            ; or registered language subtag
 *
 * A negative `len` means `s` is NUL-terminated.
 */
static UBool
_isLanguageSubtag(const char* s, int32_t len) {
    const int32_t n = (len < 0) ? (int32_t)uprv_strlen(s) : len;
    if (n < 2 || n > 8) {
        return FALSE;
    }
    return _isAlphaString(s, n);
}
/*
 * Checks the syntax of a single extlang subtag: exactly 3 ASCII letters.
 *
 * extlang       = 3ALPHA              ; selected ISO 639 codes
 *                 *2("-" 3ALPHA)      ; permanently reserved
 *
 * A negative `len` means `s` is NUL-terminated.
 */
static UBool
_isExtlangSubtag(const char* s, int32_t len) {
    const int32_t n = (len < 0) ? (int32_t)uprv_strlen(s) : len;
    if (n != 3) {
        return FALSE;
    }
    return _isAlphaString(s, n);
}
/*
 * Checks the syntax of a script subtag: exactly 4 ASCII letters.
 *
 * script        = 4ALPHA              ; ISO 15924 code
 *
 * A negative `len` means `s` is NUL-terminated.
 */
static UBool
_isScriptSubtag(const char* s, int32_t len) {
    const int32_t n = (len < 0) ? (int32_t)uprv_strlen(s) : len;
    if (n != 4) {
        return FALSE;
    }
    return _isAlphaString(s, n);
}
/*
 * Checks the syntax of a region subtag: 2 ASCII letters or 3 ASCII digits.
 *
 * region        = 2ALPHA              ; ISO 3166-1 code
 *               / 3DIGIT              ; UN M.49 code
 *
 * A negative `len` means `s` is NUL-terminated.
 */
static UBool
_isRegionSubtag(const char* s, int32_t len) {
    const int32_t n = (len < 0) ? (int32_t)uprv_strlen(s) : len;
    switch (n) {
    case 2:
        return _isAlphaString(s, n);
    case 3:
        return _isNumericString(s, n);
    default:
        return FALSE;
    }
}
/*
 * Checks the syntax of a variant subtag: 5 to 8 alphanumerics, or a digit
 * followed by 3 alphanumerics.
 *
 * variant       = 5*8alphanum         ; registered variants
 *               / (DIGIT 3alphanum)
 *
 * A negative `len` means `s` is NUL-terminated.
 */
static UBool
_isVariantSubtag(const char* s, int32_t len) {
    const int32_t n = (len < 0) ? (int32_t)uprv_strlen(s) : len;
    if (n >= 5 && n <= 8) {
        return _isAlphaNumericString(s, n);
    }
    if (n == 4 && ISNUMERIC(*s)) {
        return _isAlphaNumericString(s + 1, 3);
    }
    return FALSE;
}
/*
 * Checks the syntax of a variant carried in the private use part:
 * 1 to 8 alphanumerics.
 *
 * A negative `len` means `s` is NUL-terminated.
 */
static UBool
_isPrivateuseVariantSubtag(const char* s, int32_t len) {
    const int32_t n = (len < 0) ? (int32_t)uprv_strlen(s) : len;
    if (n < 1 || n > 8) {
        return FALSE;
    }
    return _isAlphaNumericString(s, n);
}
/*
 * Checks whether `s` is an extension singleton: a single ASCII letter other
 * than the private use singleton 'x' (compared case-insensitively).
 *
 * extension     = singleton 1*("-" (2*8alphanum))
 *
 * A negative `len` means `s` is NUL-terminated.
 */
static UBool
_isExtensionSingleton(const char* s, int32_t len) {
    const int32_t n = (len < 0) ? (int32_t)uprv_strlen(s) : len;
    if (n != 1 || !ISALPHA(*s)) {
        return FALSE;
    }
    return (uprv_tolower(*s) == PRIVATEUSE) ? FALSE : TRUE;
}
/*
 * Checks the syntax of one subtag inside an extension: 2 to 8 alphanumerics.
 *
 * extension     = singleton 1*("-" (2*8alphanum))
 *
 * A negative `len` means `s` is NUL-terminated.
 */
static UBool
_isExtensionSubtag(const char* s, int32_t len) {
    const int32_t n = (len < 0) ? (int32_t)uprv_strlen(s) : len;
    if (n < 2 || n > 8) {
        return FALSE;
    }
    return _isAlphaNumericString(s, n);
}
/*
 * Checks a '-'-separated sequence of extension subtags. Every piece must
 * satisfy _isExtensionSubtag(); an empty input, an empty piece, or a leading
 * or trailing separator makes the whole sequence invalid.
 *
 * A negative `len` means `s` is NUL-terminated.
 */
static UBool
_isExtensionSubtags(const char* s, int32_t len) {
    const int32_t total = (len < 0) ? (int32_t)uprv_strlen(s) : len;
    int32_t start = -1;     /* index where the current subtag began, -1 if none */

    for (int32_t i = 0; i < total; i++) {
        if (s[i] != SEP) {
            if (start < 0) {
                start = i;
            }
            continue;
        }
        /* separator: the subtag before it must exist and be well-formed */
        if (start < 0 || !_isExtensionSubtag(s + start, i - start)) {
            return FALSE;
        }
        start = -1;
    }
    if (start < 0) {
        return FALSE;
    }
    return _isExtensionSubtag(s + start, total - start);
}
/*
 * Checks the syntax of one private use subtag: 1 to 8 alphanumerics.
 *
 * privateuse    = "x" 1*("-" (1*8alphanum))
 *
 * A negative `len` means `s` is NUL-terminated.
 */
static UBool
_isPrivateuseValueSubtag(const char* s, int32_t len) {
    const int32_t n = (len < 0) ? (int32_t)uprv_strlen(s) : len;
    if (n < 1 || n > 8) {
        return FALSE;
    }
    return _isAlphaNumericString(s, n);
}
/*
 * Checks a '-'-separated sequence of private use subtags. Every piece must
 * satisfy _isPrivateuseValueSubtag(); an empty input, an empty piece, or a
 * leading or trailing separator makes the whole sequence invalid.
 *
 * A negative `len` means `s` is NUL-terminated.
 */
static UBool
_isPrivateuseValueSubtags(const char* s, int32_t len) {
    const int32_t total = (len < 0) ? (int32_t)uprv_strlen(s) : len;
    int32_t start = -1;     /* index where the current subtag began, -1 if none */

    for (int32_t i = 0; i < total; i++) {
        if (s[i] != SEP) {
            if (start < 0) {
                start = i;
            }
            continue;
        }
        /* separator: the subtag before it must exist and be well-formed */
        if (start < 0 || !_isPrivateuseValueSubtag(s + start, i - start)) {
            return FALSE;
        }
        start = -1;
    }
    if (start < 0) {
        return FALSE;
    }
    return _isPrivateuseValueSubtag(s + start, total - start);
}
/*
 * Returns TRUE when `s` has the shape of a Unicode locale extension key:
 * exactly 2 ASCII alphanumerics. A negative `len` means `s` is NUL-terminated.
 */
U_CFUNC UBool
ultag_isUnicodeLocaleKey(const char* s, int32_t len) {
    const int32_t n = (len < 0) ? (int32_t)uprv_strlen(s) : len;
    if (n != 2) {
        return FALSE;
    }
    return _isAlphaNumericString(s, n);
}
/*
 * Returns TRUE when `s` has the shape of a Unicode locale extension type:
 * one or more '-'-separated subtags, each made of 3 to 8 ASCII
 * alphanumerics. A negative `len` means `s` is NUL-terminated.
 */
U_CFUNC UBool
ultag_isUnicodeLocaleType(const char*s, int32_t len) {
    const int32_t total = (len < 0) ? (int32_t)uprv_strlen(s) : len;
    int32_t run = 0;    /* length of the subtag currently being scanned */

    for (int32_t i = 0; i < total; i++) {
        const char c = s[i];
        if (c == SEP) {
            /* the subtag that just ended must have at least 3 chars */
            if (run < 3) {
                return FALSE;
            }
            run = 0;
            continue;
        }
        if (!(ISALPHA(c) || ISNUMERIC(c))) {
            return FALSE;
        }
        if (++run > 8) {
            return FALSE;
        }
    }
    return (run >= 3);
}
/*
* -------------------------------------------------
*
* Helper functions
*
* -------------------------------------------------
*/
/*
 * Appends `var` to the end of the list headed by `*first`, keeping the
 * insertion order. Returns FALSE, leaving the list and `var` untouched,
 * when an entry with the same variant string is already present; the caller
 * keeps ownership of `var` in that case.
 */
static UBool
_addVariantToList(VariantListEntry **first, VariantListEntry *var) {
    /* `link` always addresses the pointer that leads to the node under test */
    VariantListEntry **link = first;
    while (*link != NULL) {
        if (uprv_compareInvCharsAsAscii(var->variant, (*link)->variant) == 0) {
            /* duplicated variant */
            return FALSE;
        }
        link = &(*link)->next;
    }
    var->next = NULL;
    *link = var;
    return TRUE;
}
/*
 * Inserts `attr` into the list headed by `*first`, keeping the attributes
 * in ascending order. Returns FALSE, leaving the list and `attr` untouched,
 * when an equal attribute is already present; the caller keeps ownership of
 * `attr` in that case.
 */
static UBool
_addAttributeToList(AttributeListEntry **first, AttributeListEntry *attr) {
    /* `slot` always addresses the pointer that leads to the node under test */
    AttributeListEntry **slot = first;
    for (; *slot != NULL; slot = &(*slot)->next) {
        const int32_t order = uprv_compareInvCharsAsAscii(attr->attribute, (*slot)->attribute);
        if (order == 0) {
            /* duplicated attribute */
            return FALSE;
        }
        if (order < 0) {
            /* attr sorts before this node: insert here */
            break;
        }
    }
    attr->next = *slot;
    *slot = attr;
    return TRUE;
}
/*
 * Inserts `ext` into the list headed by `*first`, keeping the list sorted
 * by key. Returns FALSE, leaving the list and `ext` untouched, when an entry
 * comparing equal is already present; the caller keeps ownership of `ext`
 * in that case.
 *
 * With localeToBCP == FALSE the keys are compared as plain strings.
 * With localeToBCP == TRUE the keys come from an ICU locale and are ordered
 * the way they appear in a BCP 47 tag:
 *   - two single-letter keys: by letter, except that 'x' (private use)
 *     sorts after every other singleton;
 *   - a single-letter key against a longer key: the longer key stands for
 *     the 'u' extension, so the single letter is compared with 'u';
 *   - two longer keys (both inside the 'u' extension): by string, except
 *     that the "attribute" key sorts before every other key.
 */
static UBool
_addExtensionToList(ExtensionListEntry **first, ExtensionListEntry *ext, UBool localeToBCP) {
    /* `slot` always addresses the pointer that leads to the node under test */
    ExtensionListEntry **slot = first;

    for (; *slot != NULL; slot = &(*slot)->next) {
        const ExtensionListEntry *other = *slot;
        int32_t order;

        if (!localeToBCP) {
            order = uprv_compareInvCharsAsAscii(ext->key, other->key);
        } else {
            /* special handling for locale to bcp conversion */
            const int32_t newLen = (int32_t)uprv_strlen(ext->key);
            const int32_t otherLen = (int32_t)uprv_strlen(other->key);

            if (newLen == 1 && otherLen == 1) {
                if (*(ext->key) == *(other->key)) {
                    order = 0;
                } else if (*(ext->key) == PRIVATEUSE) {
                    order = 1;
                } else if (*(other->key) == PRIVATEUSE) {
                    order = -1;
                } else {
                    order = *(ext->key) - *(other->key);
                }
            } else if (newLen == 1) {
                order = *(ext->key) - LDMLEXT;
            } else if (otherLen == 1) {
                order = LDMLEXT - *(other->key);
            } else {
                order = uprv_compareInvCharsAsAscii(ext->key, other->key);
                /* Both are u extension keys - 'attribute' always goes first */
                if (order != 0) {
                    if (uprv_strcmp(other->key, LOCALE_ATTRIBUTE_KEY) == 0) {
                        order = 1;
                    } else if (uprv_strcmp(ext->key, LOCALE_ATTRIBUTE_KEY) == 0) {
                        order = -1;
                    }
                }
            }
        }

        if (order == 0) {
            /* duplicated extension key */
            return FALSE;
        }
        if (order < 0) {
            /* ext sorts before this node: insert here */
            break;
        }
    }
    ext->next = *slot;
    *slot = ext;
    return TRUE;
}
/*
 * Puts `langtag` into its "nothing parsed yet" state: string fields refer to
 * the shared EMPTY string, pointer and list fields are NULL.
 */
static void
_initializeULanguageTag(ULanguageTag* langtag) {
    /* string fields */
    langtag->language = EMPTY;
    langtag->script = EMPTY;
    langtag->region = EMPTY;
    langtag->privateuse = EMPTY;
    langtag->grandfathered = EMPTY;

    /* pointer and list fields */
    langtag->buf = NULL;
    langtag->variants = NULL;
    langtag->extensions = NULL;
    for (int32_t slot = MAXEXTLANG - 1; slot >= 0; slot--) {
        langtag->extlang[slot] = NULL;
    }
}
/*
 * Writes the language subtag for `localeID` to `appendAt`.
 *
 * An empty language becomes "und". A language that cannot be extracted or is
 * not a well-formed language subtag is an error in strict mode
 * (U_ILLEGAL_ARGUMENT_ERROR, returns 0) and becomes "und" otherwise.
 * Deprecated language codes listed in DEPRECATEDLANGS are replaced by their
 * current equivalents.
 *
 * Returns the full length of the subtag even when it does not fit into
 * `capacity`; at most `capacity` chars are written and termination is
 * delegated to u_terminateChars(). Does nothing and returns 0 when *status
 * already holds a failure.
 */
static int32_t
_appendLanguageToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UErrorCode* status) {
    if (U_FAILURE(*status)) {
        return 0;
    }

    char lang[ULOC_LANG_CAPACITY];
    UErrorCode localStatus = U_ZERO_ERROR;
    int32_t langLen = uloc_getLanguage(localeID, lang, sizeof(lang), &localStatus);
    if (U_FAILURE(localStatus) || localStatus == U_STRING_NOT_TERMINATED_WARNING) {
        if (strict) {
            *status = U_ILLEGAL_ARGUMENT_ERROR;
            return 0;
        }
        langLen = 0;
    }

    /* What gets written; "und" unless a usable language is found below. */
    const char *out = LANG_UND;
    int32_t outLen = LANG_UND_LEN;

    /* Note: returned language code is in lower case letters */
    if (langLen != 0) {
        if (!_isLanguageSubtag(lang, langLen)) {
            /* invalid language code */
            if (strict) {
                *status = U_ILLEGAL_ARGUMENT_ERROR;
                return 0;
            }
        } else {
            /* resolve deprecated */
            for (int32_t i = 0; i < UPRV_LENGTHOF(DEPRECATEDLANGS); i += 2) {
                /* 2-letter deprecated subtags are listed before the 3-letter
                   ones in DEPRECATEDLANGS[]. For 2-letter input, leave the
                   loop at the first 3-letter entry: nothing after it can
                   match. */
                if (uprv_strlen(lang) < uprv_strlen(DEPRECATEDLANGS[i])) {
                    break;
                }
                if (uprv_compareInvCharsAsAscii(lang, DEPRECATEDLANGS[i]) == 0) {
                    uprv_strcpy(lang, DEPRECATEDLANGS[i + 1]);
                    langLen = (int32_t)uprv_strlen(lang);
                    break;
                }
            }
            out = lang;
            outLen = langLen;
        }
    }

    if (capacity > 0) {
        uprv_memcpy(appendAt, out, uprv_min(outLen, capacity));
    }
    u_terminateChars(appendAt, capacity, outLen, status);
    return outLen;
}
/*
 * Writes "-<script>" for `localeID` to `appendAt` when the locale has a
 * script.
 *
 * A script that cannot be extracted or is not a well-formed script subtag
 * is skipped (returns 0); in strict mode *status is additionally set to
 * U_ILLEGAL_ARGUMENT_ERROR.
 *
 * Returns the full length of the appended text, including the separator,
 * even when it does not fit into `capacity`; 0 when there is no script.
 * Does nothing and returns 0 when *status already holds a failure.
 */
static int32_t
_appendScriptToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UErrorCode* status) {
    if (U_FAILURE(*status)) {
        return 0;
    }

    char script[ULOC_SCRIPT_CAPACITY];
    UErrorCode localStatus = U_ZERO_ERROR;
    const int32_t scriptLen = uloc_getScript(localeID, script, sizeof(script), &localStatus);

    if (U_FAILURE(localStatus) || localStatus == U_STRING_NOT_TERMINATED_WARNING
            || (scriptLen > 0 && !_isScriptSubtag(script, scriptLen))) {
        /* script missing from the output: extraction failed or invalid code */
        if (strict) {
            *status = U_ILLEGAL_ARGUMENT_ERROR;
        }
        return 0;
    }

    int32_t written = 0;
    if (scriptLen > 0) {
        if (capacity > 0) {
            appendAt[0] = SEP;
        }
        written = 1;
        if (written < capacity) {
            uprv_memcpy(appendAt + written, script, uprv_min(scriptLen, capacity - written));
        }
        written += scriptLen;
    }
    u_terminateChars(appendAt, capacity, written, status);
    return written;
}
/*
 * Writes "-<region>" for `localeID` to `appendAt` when the locale has a
 * region. Deprecated region codes listed in DEPRECATEDREGIONS are replaced
 * by their current equivalents.
 *
 * A region that cannot be extracted or is not a well-formed region subtag
 * is skipped (returns 0); in strict mode *status is additionally set to
 * U_ILLEGAL_ARGUMENT_ERROR.
 *
 * Returns the full length of the appended text, including the separator,
 * even when it does not fit into `capacity`; 0 when there is no region.
 * Does nothing and returns 0 when *status already holds a failure.
 */
static int32_t
_appendRegionToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UErrorCode* status) {
    if (U_FAILURE(*status)) {
        return 0;
    }

    char region[ULOC_COUNTRY_CAPACITY];
    UErrorCode localStatus = U_ZERO_ERROR;
    int32_t regionLen = uloc_getCountry(localeID, region, sizeof(region), &localStatus);

    if (U_FAILURE(localStatus) || localStatus == U_STRING_NOT_TERMINATED_WARNING
            || (regionLen > 0 && !_isRegionSubtag(region, regionLen))) {
        /* region missing from the output: extraction failed or invalid code */
        if (strict) {
            *status = U_ILLEGAL_ARGUMENT_ERROR;
        }
        return 0;
    }

    int32_t written = 0;
    if (regionLen > 0) {
        if (capacity > 0) {
            appendAt[0] = SEP;
        }
        written = 1;

        /* resolve deprecated */
        for (int i = 0; i < UPRV_LENGTHOF(DEPRECATEDREGIONS); i += 2) {
            if (uprv_compareInvCharsAsAscii(region, DEPRECATEDREGIONS[i]) == 0) {
                uprv_strcpy(region, DEPRECATEDREGIONS[i + 1]);
                regionLen = (int32_t)uprv_strlen(region);
                break;
            }
        }

        if (written < capacity) {
            uprv_memcpy(appendAt + written, region, uprv_min(regionLen, capacity - written));
        }
        written += regionLen;
    }
    u_terminateChars(appendAt, capacity, written, status);
    return written;
}
/*
 * Writes the variants of localeID to appendAt, each one lower-cased and
 * preceded by a '-' separator, in the order they appear in the locale.
 *
 * The variant string is split in place at '-' and '_'. For each piece:
 *  - an empty piece is an error in strict mode and ignored otherwise;
 *  - a well-formed variant subtag is added to the output list; a repeated
 *    variant is dropped (and is an error in strict mode);
 *  - when the whole variant string is exactly "posix" nothing is written
 *    and *hadPosix is set to TRUE instead (it is never set to FALSE here);
 *  - a piece that is not a well-formed variant subtag is an error in strict
 *    mode; otherwise, if it is a well-formed private use subtag, scanning
 *    stops at that piece, and if not, the piece is skipped.
 *
 * Returns the full length of the appended text even when it does not fit
 * into capacity. Returns 0 when *status holds a failure on entry or on
 * exit, and when the variant cannot be extracted from localeID.
 */
static int32_t
_appendVariantsToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UBool *hadPosix, UErrorCode* status) {
char buf[ULOC_FULLNAME_CAPACITY];
UErrorCode tmpStatus = U_ZERO_ERROR;
int32_t len, i;
int32_t reslen = 0;
if (U_FAILURE(*status)) {
return 0;
}
len = uloc_getVariant(localeID, buf, sizeof(buf), &tmpStatus);
if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
if (strict) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
}
return 0;
}
if (len > 0) {
/* p scans buf; pVar marks the start of the piece being scanned (NULL between pieces) */
char *p, *pVar;
UBool bNext = TRUE;
VariantListEntry *var;
VariantListEntry *varFirst = NULL;
pVar = NULL;
p = buf;
while (bNext) {
if (*p == SEP || *p == LOCALE_SEP || *p == 0) {
if (*p == 0) {
bNext = FALSE;
} else {
*p = 0; /* terminate */
}
if (pVar == NULL) {
if (strict) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
break;
}
/* ignore empty variant */
} else {
/* ICU uses upper case letters for variants, but
the canonical format is lowercase in BCP47 */
for (i = 0; *(pVar + i) != 0; i++) {
*(pVar + i) = uprv_tolower(*(pVar + i));
}
/* validate */
if (_isVariantSubtag(pVar, -1)) {
/* true for everything except a variant string that is exactly "posix" */
if (uprv_strcmp(pVar,POSIX_VALUE) || len != (int32_t)uprv_strlen(POSIX_VALUE)) {
/* emit the variant to the list */
var = (VariantListEntry*)uprv_malloc(sizeof(VariantListEntry));
if (var == NULL) {
*status = U_MEMORY_ALLOCATION_ERROR;
break;
}
/* the node refers to the text inside buf, it does not copy it */
var->variant = pVar;
if (!_addVariantToList(&varFirst, var)) {
/* duplicated variant */
uprv_free(var);
if (strict) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
break;
}
}
} else {
/* Special handling for POSIX variant, need to remember that we had it and then */
/* treat it like an extension later. */
*hadPosix = TRUE;
}
} else if (strict) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
break;
} else if (_isPrivateuseValueSubtag(pVar, -1)) {
/* Handle private use subtags separately */
break;
}
}
/* reset variant starting position */
pVar = NULL;
} else if (pVar == NULL) {
pVar = p;
}
p++;
}
if (U_SUCCESS(*status)) {
if (varFirst != NULL) {
int32_t varLen;
/* write out validated/normalized variants to the target */
var = varFirst;
while (var != NULL) {
if (reslen < capacity) {
*(appendAt + reslen) = SEP;
}
reslen++;
varLen = (int32_t)uprv_strlen(var->variant);
if (reslen < capacity) {
uprv_memcpy(appendAt + reslen, var->variant, uprv_min(varLen, capacity - reslen));
}
/* reslen keeps counting past capacity so the caller learns the required size */
reslen += varLen;
var = var->next;
}
}
}
/* clean up */
var = varFirst;
while (var != NULL) {
VariantListEntry *tmpVar = var->next;
uprv_free(var);
var = tmpVar;
}
if (U_FAILURE(*status)) {
return 0;
}
}
u_terminateChars(appendAt, capacity, reslen, status);
return reslen;
}
/*
 * Writes the keywords of localeID to appendAt as BCP 47 extension subtags.
 *
 * Each keyword is turned into an ExtensionListEntry and inserted into a
 * list sorted with _addExtensionToList(..., TRUE):
 *  - the special "attribute" keyword is split at '-' into a sorted list of
 *    attributes and represented in the extension list by a placeholder
 *    entry whose value is NULL;
 *  - keywords with a key longer than one char go into the 'u' extension,
 *    with key and value mapped by uloc_toUnicodeLocaleKey() and
 *    uloc_toUnicodeLocaleType(); a value without a known mapping is
 *    lower-cased;
 *  - single-char keys are other extensions or private use ('x') and are
 *    copied after their value has been validated.
 * When hadPosix is TRUE an extra "va-posix" entry is added.
 *
 * A keyword that cannot be converted is an error in strict mode
 * (U_ILLEGAL_ARGUMENT_ERROR) and skipped otherwise. A single attribute that
 * does not fit into the internal attribute buffer is always reported as
 * U_ILLEGAL_ARGUMENT_ERROR.
 *
 * Returns the full length of the appended text even when it does not fit
 * into capacity (via u_terminateChars()), or 0 when *status holds a failure.
 */
static int32_t
_appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UBool hadPosix, UErrorCode* status) {
    char attrBuf[ULOC_KEYWORD_AND_VALUES_CAPACITY] = { 0 };
    int32_t attrBufLength = 0;
    UEnumeration *keywordEnum = NULL;
    int32_t reslen = 0;

    keywordEnum = uloc_openKeywords(localeID, status);
    if (U_FAILURE(*status) && !hadPosix) {
        uenum_close(keywordEnum);
        return 0;
    }
    if (keywordEnum != NULL || hadPosix) {
        /* reorder extensions */
        int32_t len;
        const char *key;
        ExtensionListEntry *firstExt = NULL;
        ExtensionListEntry *ext;
        AttributeListEntry *firstAttr = NULL;
        AttributeListEntry *attr;
        char *attrValue;
        CharStringPool extBufPool;  /* keeps converted values alive until the output is written */
        const char *bcpKey=nullptr, *bcpValue=nullptr;
        UErrorCode tmpStatus = U_ZERO_ERROR;
        int32_t keylen;
        UBool isBcpUExt;

        while (TRUE) {
            icu::CharString buf;
            key = uenum_next(keywordEnum, NULL, status);
            if (key == NULL) {
                break;
            }

            /* Fetch the keyword value, growing the buffer until it fits. */
            char* buffer;
            int32_t resultCapacity = ULOC_KEYWORD_AND_VALUES_CAPACITY;

            for (;;) {
                buffer = buf.getAppendBuffer(
                        /*minCapacity=*/resultCapacity,
                        /*desiredCapacityHint=*/resultCapacity,
                        resultCapacity,
                        tmpStatus);

                if (U_FAILURE(tmpStatus)) {
                    break;
                }

                len = uloc_getKeywordValue(
                        localeID, key, buffer, resultCapacity, &tmpStatus);

                if (tmpStatus != U_BUFFER_OVERFLOW_ERROR) {
                    break;
                }

                resultCapacity = len;
                tmpStatus = U_ZERO_ERROR;
            }

            if (U_FAILURE(tmpStatus)) {
                if (tmpStatus == U_MEMORY_ALLOCATION_ERROR) {
                    *status = U_MEMORY_ALLOCATION_ERROR;
                    break;
                }
                if (strict) {
                    *status = U_ILLEGAL_ARGUMENT_ERROR;
                    break;
                }
                /* ignore this keyword */
                tmpStatus = U_ZERO_ERROR;
                continue;
            }

            buf.append(buffer, len, tmpStatus);
            if (tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
                tmpStatus = U_ZERO_ERROR;  // Terminators provided by CharString.
            }

            keylen = (int32_t)uprv_strlen(key);
            isBcpUExt = (keylen > 1);

            /* special keyword used for representing Unicode locale attributes */
            if (uprv_strcmp(key, LOCALE_ATTRIBUTE_KEY) == 0) {
                /* NOTE(review): when len == 0 this branch leaves bcpKey/bcpValue
                   at whatever the previous keyword set (or nullptr) and still
                   falls through to the entry creation below - confirm that an
                   empty attribute value cannot reach this point. */
                if (len > 0) {
                    int32_t i = 0;
                    while (TRUE) {
                        /* copy the next '-'-delimited attribute into attrBuf */
                        attrBufLength = 0;
                        for (; i < len; i++) {
                            if (buf[i] == '-') {
                                i++;
                                break;
                            }
                            /* The value lives in a growable CharString but
                               attrBuf is fixed-size: refuse an attribute that
                               would not fit together with its terminating NUL. */
                            if (attrBufLength >= (int32_t)sizeof(attrBuf) - 1) {
                                *status = U_ILLEGAL_ARGUMENT_ERROR;
                                break;
                            }
                            attrBuf[attrBufLength++] = buf[i];
                        }
                        if (U_FAILURE(*status)) {
                            break;
                        }
                        if (attrBufLength > 0) {
                            attrBuf[attrBufLength] = 0;
                        } else if (i >= len){
                            break;
                        }

                        /* create AttributeListEntry */
                        attr = (AttributeListEntry*)uprv_malloc(sizeof(AttributeListEntry));
                        if (attr == NULL) {
                            *status = U_MEMORY_ALLOCATION_ERROR;
                            break;
                        }
                        attrValue = (char*)uprv_malloc(attrBufLength + 1);
                        if (attrValue == NULL) {
                            /* attr is not in the list yet, so release it here */
                            uprv_free(attr);
                            *status = U_MEMORY_ALLOCATION_ERROR;
                            break;
                        }
                        uprv_strcpy(attrValue, attrBuf);
                        attr->attribute = attrValue;

                        if (!_addAttributeToList(&firstAttr, attr)) {
                            /* duplicated attribute */
                            uprv_free(attr);
                            uprv_free(attrValue);
                            if (strict) {
                                *status = U_ILLEGAL_ARGUMENT_ERROR;
                                break;
                            }
                        }
                    }
                    if (U_FAILURE(*status)) {
                        /* nothing is written on failure; skip the placeholder entry */
                        break;
                    }
                    /* for a place holder ExtensionListEntry */
                    bcpKey = LOCALE_ATTRIBUTE_KEY;
                    bcpValue = NULL;
                }
            } else if (isBcpUExt) {
                bcpKey = uloc_toUnicodeLocaleKey(key);
                if (bcpKey == NULL) {
                    if (strict) {
                        *status = U_ILLEGAL_ARGUMENT_ERROR;
                        break;
                    }
                    continue;
                }

                /* we've checked buf is null-terminated above */
                bcpValue = uloc_toUnicodeLocaleType(key, buf.data());
                if (bcpValue == NULL) {
                    if (strict) {
                        *status = U_ILLEGAL_ARGUMENT_ERROR;
                        break;
                    }
                    continue;
                }
                if (bcpValue == buf.data()) {
                    /*
                    When uloc_toUnicodeLocaleType(key, buf) returns the
                    input value as is, the value is well-formed, but has
                    no known mapping. This implementation normalizes the
                    value to lower case
                    */
                    icu::CharString* extBuf = extBufPool.create();

                    if (extBuf == nullptr) {
                        *status = U_MEMORY_ALLOCATION_ERROR;
                        break;
                    }

                    int32_t bcpValueLen = static_cast<int32_t>(uprv_strlen(bcpValue));
                    int32_t extCapacity;
                    char* pExtBuf = extBuf->getAppendBuffer(
                            /*minCapacity=*/bcpValueLen,
                            /*desiredCapacityHint=*/bcpValueLen,
                            extCapacity,
                            tmpStatus);

                    if (U_FAILURE(tmpStatus)) {
                        *status = tmpStatus;
                        break;
                    }

                    uprv_strcpy(pExtBuf, bcpValue);
                    T_CString_toLowerCase(pExtBuf);

                    extBuf->append(pExtBuf, bcpValueLen, tmpStatus);

                    if (U_FAILURE(tmpStatus)) {
                        *status = tmpStatus;
                        break;
                    }

                    bcpValue = extBuf->data();
                }
            } else {
                if (*key == PRIVATEUSE) {
                    if (!_isPrivateuseValueSubtags(buf.data(), len)) {
                        if (strict) {
                            *status = U_ILLEGAL_ARGUMENT_ERROR;
                            break;
                        }
                        continue;
                    }
                } else {
                    if (!_isExtensionSingleton(key, keylen) || !_isExtensionSubtags(buf.data(), len)) {
                        if (strict) {
                            *status = U_ILLEGAL_ARGUMENT_ERROR;
                            break;
                        }
                        continue;
                    }
                }
                bcpKey = key;
                /* buf dies at the end of this iteration; keep a copy in the pool */
                icu::CharString* extBuf = extBufPool.create();
                if (extBuf == nullptr) {
                    *status = U_MEMORY_ALLOCATION_ERROR;
                    break;
                }
                extBuf->append(buf.data(), len, tmpStatus);
                if (U_FAILURE(tmpStatus)) {
                    *status = tmpStatus;
                    break;
                }
                bcpValue = extBuf->data();
            }

            /* create ExtensionListEntry */
            ext = (ExtensionListEntry*)uprv_malloc(sizeof(ExtensionListEntry));
            if (ext == NULL) {
                *status = U_MEMORY_ALLOCATION_ERROR;
                break;
            }
            ext->key = bcpKey;
            ext->value = bcpValue;

            if (!_addExtensionToList(&firstExt, ext, TRUE)) {
                /* duplicated extension key */
                uprv_free(ext);
                if (strict) {
                    *status = U_ILLEGAL_ARGUMENT_ERROR;
                    break;
                }
            }
        }

        /* Special handling for POSIX variant - add the keywords for POSIX */
        if (hadPosix) {
            /* create ExtensionListEntry for POSIX */
            ext = (ExtensionListEntry*)uprv_malloc(sizeof(ExtensionListEntry));
            if (ext == NULL) {
                *status = U_MEMORY_ALLOCATION_ERROR;
                goto cleanup;
            }
            ext->key = POSIX_KEY;
            ext->value = POSIX_VALUE;

            if (!_addExtensionToList(&firstExt, ext, TRUE)) {
                uprv_free(ext);
            }
        }

        if (U_SUCCESS(*status) && (firstExt != NULL || firstAttr != NULL)) {
            UBool startLDMLExtension = FALSE;
            for (ext = firstExt; ext; ext = ext->next) {
                if (!startLDMLExtension && uprv_strlen(ext->key) > 1) {
                    /* first LDML u singleton extension */
                    if (reslen < capacity) {
                        *(appendAt + reslen) = SEP;
                    }
                    reslen++;
                    if (reslen < capacity) {
                        *(appendAt + reslen) = LDMLEXT;
                    }
                    reslen++;
                    startLDMLExtension = TRUE;
                }

                /* write out the sorted BCP47 attributes, extensions and private use */
                if (uprv_strcmp(ext->key, LOCALE_ATTRIBUTE_KEY) == 0) {
                    /* write the value for the attributes */
                    for (attr = firstAttr; attr; attr = attr->next) {
                        if (reslen < capacity) {
                            *(appendAt + reslen) = SEP;
                        }
                        reslen++;
                        len = (int32_t)uprv_strlen(attr->attribute);
                        if (reslen < capacity) {
                            uprv_memcpy(appendAt + reslen, attr->attribute, uprv_min(len, capacity - reslen));
                        }
                        reslen += len;
                    }
                } else {
                    if (reslen < capacity) {
                        *(appendAt + reslen) = SEP;
                    }
                    reslen++;
                    len = (int32_t)uprv_strlen(ext->key);
                    if (reslen < capacity) {
                        uprv_memcpy(appendAt + reslen, ext->key, uprv_min(len, capacity - reslen));
                    }
                    reslen += len;
                    if (reslen < capacity) {
                        *(appendAt + reslen) = SEP;
                    }
                    reslen++;
                    len = (int32_t)uprv_strlen(ext->value);
                    if (reslen < capacity) {
                        uprv_memcpy(appendAt + reslen, ext->value, uprv_min(len, capacity - reslen));
                    }
                    reslen += len;
                }
            }
        }
cleanup:
        /* clean up */
        ext = firstExt;
        while (ext != NULL) {
            ExtensionListEntry *tmpExt = ext->next;
            uprv_free(ext);
            ext = tmpExt;
        }

        /* attribute strings were allocated above and are owned by their nodes */
        attr = firstAttr;
        while (attr != NULL) {
            AttributeListEntry *tmpAttr = attr->next;
            char *pValue = (char *)attr->attribute;
            uprv_free(pValue);
            uprv_free(attr);
            attr = tmpAttr;
        }

        uenum_close(keywordEnum);

        if (U_FAILURE(*status)) {
            return 0;
        }
    }

    return u_terminateChars(appendAt, capacity, reslen, status);
}
/**
 * Append keywords parsed from LDML extension value
 * e.g. "u-ca-gregory-co-trad" -> {calendar = gregorian} {collation = traditional}
 * Note: char* buf is used for storing keywords
 *
 * Leading subtags that are not Unicode locale keys are collected as
 * attributes and emitted as a single keyword whose key is
 * LOCALE_ATTRIBUTE_KEY. The remaining subtags are read as key/type pairs
 * and converted with uloc_toLegacyKey() / uloc_toLegacyType().
 *
 * @param ldmlext      NUL-terminated value of the LDML extension, subtags separated by SEP
 * @param appendTo     list that receives the resulting keyword entries
 * @param buf          caller-owned storage for strings created here; entries added
 *                     to appendTo may point into it
 * @param bufSize      size of buf in bytes
 * @param posixVariant in: treated as "a variant already exists"; out: TRUE when
 *                     va=posix was found while no variant existed (it is then not
 *                     added as a keyword)
 * @param status       set to U_MEMORY_ALLOCATION_ERROR, U_ILLEGAL_ARGUMENT_ERROR or
 *                     U_BUFFER_OVERFLOW_ERROR on failure; appendTo is left untouched
 *                     in that case
 */
static void
_appendLDMLExtensionAsKeywords(const char* ldmlext, ExtensionListEntry** appendTo, char* buf, int32_t bufSize, UBool *posixVariant, UErrorCode *status) {
    const char *pTag;   /* beginning of current subtag */
    const char *pKwds;  /* beginning of key-type pairs */
    UBool variantExists = *posixVariant;

    ExtensionListEntry *kwdFirst = NULL;    /* first LDML keyword */
    ExtensionListEntry *kwd, *nextKwd;

    AttributeListEntry *attrFirst = NULL;   /* first attribute */
    AttributeListEntry *attr, *nextAttr;

    int32_t len;
    int32_t bufIdx = 0;

    char attrBuf[ULOC_KEYWORD_AND_VALUES_CAPACITY];
    int32_t attrBufIdx = 0;

    /* Reset the posixVariant value */
    *posixVariant = FALSE;

    pTag = ldmlext;
    pKwds = NULL;

    /* Iterate through u extension attributes */
    while (*pTag) {
        /* locate next separator char */
        for (len = 0; *(pTag + len) && *(pTag + len) != SEP; len++);

        if (ultag_isUnicodeLocaleKey(pTag, len)) {
            /* first key subtag - attributes end here */
            pKwds = pTag;
            break;
        }

        /* add this attribute to the list */
        attr = (AttributeListEntry*)uprv_malloc(sizeof(AttributeListEntry));
        if (attr == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            goto cleanup;
        }

        /* the attribute text is staged in attrBuf (NUL-terminated) */
        if (len < (int32_t)sizeof(attrBuf) - attrBufIdx) {
            uprv_memcpy(&attrBuf[attrBufIdx], pTag, len);
            attrBuf[attrBufIdx + len] = 0;
            attr->attribute = &attrBuf[attrBufIdx];
            attrBufIdx += (len + 1);
        } else {
            *status = U_ILLEGAL_ARGUMENT_ERROR;
            uprv_free(attr);
            goto cleanup;
        }

        if (!_addAttributeToList(&attrFirst, attr)) {
            *status = U_ILLEGAL_ARGUMENT_ERROR;
            uprv_free(attr);
            goto cleanup;
        }

        /* next tag */
        pTag += len;
        if (*pTag) {
            /* next to the separator */
            pTag++;
        }
    }

    if (attrFirst) {
        /* emit attributes as an LDML keyword, e.g. attribute=attr1-attr2 */

        if (attrBufIdx > bufSize) {
            /* attrBufIdx == <total length of attribute subtag> + 1 */
            *status = U_ILLEGAL_ARGUMENT_ERROR;
            goto cleanup;
        }

        kwd = (ExtensionListEntry*)uprv_malloc(sizeof(ExtensionListEntry));
        if (kwd == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            goto cleanup;
        }

        kwd->key = LOCALE_ATTRIBUTE_KEY;
        kwd->value = buf;

        /* attribute subtags sorted in alphabetical order as type */
        attr = attrFirst;
        while (attr != NULL) {
            nextAttr = attr->next;

            /* buffer size check is done above */
            if (attr != attrFirst) {
                *(buf + bufIdx) = SEP;
                bufIdx++;
            }

            len = static_cast<int32_t>(uprv_strlen(attr->attribute));
            uprv_memcpy(buf + bufIdx, attr->attribute, len);
            bufIdx += len;

            attr = nextAttr;
        }
        *(buf + bufIdx) = 0;
        bufIdx++;

        if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) {
            *status = U_ILLEGAL_ARGUMENT_ERROR;
            uprv_free(kwd);
            goto cleanup;
        }

        /* once keyword entry is created, delete the attribute list */
        attr = attrFirst;
        while (attr != NULL) {
            nextAttr = attr->next;
            uprv_free(attr);
            attr = nextAttr;
        }
        attrFirst = NULL;
    }

    if (pKwds) {
        const char *pBcpKey = NULL;     /* u extension key subtag */
        const char *pBcpType = NULL;    /* beginning of u extension type subtag(s) */
        int32_t bcpKeyLen = 0;
        int32_t bcpTypeLen = 0;
        UBool isDone = FALSE;

        pTag = pKwds;
        /* BCP47 representation of LDML key/type pairs */
        while (!isDone) {
            const char *pNextBcpKey = NULL;
            int32_t nextBcpKeyLen = 0;
            UBool emitKeyword = FALSE;

            if (*pTag) {
                /* locate next separator char */
                for (len = 0; *(pTag + len) && *(pTag + len) != SEP; len++);

                if (ultag_isUnicodeLocaleKey(pTag, len)) {
                    if (pBcpKey) {
                        /* a new key closes the pending key/type pair */
                        emitKeyword = TRUE;
                        pNextBcpKey = pTag;
                        nextBcpKeyLen = len;
                    } else {
                        pBcpKey = pTag;
                        bcpKeyLen = len;
                    }
                } else {
                    U_ASSERT(pBcpKey != NULL);
                    /* within LDML type subtags */
                    if (pBcpType) {
                        /* +1 accounts for the separator between type subtags */
                        bcpTypeLen += (len + 1);
                    } else {
                        pBcpType = pTag;
                        bcpTypeLen = len;
                    }
                }

                /* next tag */
                pTag += len;
                if (*pTag) {
                    /* next to the separator */
                    pTag++;
                }
            } else {
                /* processing last one */
                emitKeyword = TRUE;
                isDone = TRUE;
            }

            if (emitKeyword) {
                const char *pKey = NULL;    /* LDML key */
                const char *pType = NULL;   /* LDML type */

                char bcpKeyBuf[9];          /* BCP key length is always 2 for now */

                U_ASSERT(pBcpKey != NULL);

                if (bcpKeyLen >= (int32_t)sizeof(bcpKeyBuf)) {
                    /* the BCP key is invalid */
                    *status = U_ILLEGAL_ARGUMENT_ERROR;
                    goto cleanup;
                }

                uprv_strncpy(bcpKeyBuf, pBcpKey, bcpKeyLen);
                bcpKeyBuf[bcpKeyLen] = 0;

                /* u extension key to LDML key */
                pKey = uloc_toLegacyKey(bcpKeyBuf);
                if (pKey == NULL) {
                    *status = U_ILLEGAL_ARGUMENT_ERROR;
                    goto cleanup;
                }
                if (pKey == bcpKeyBuf) {
                    /*
                    The key returned by toLegacyKey points to the input buffer.
                    We normalize the result key to lower case.
                    */
                    T_CString_toLowerCase(bcpKeyBuf);
                    /* bcpKeyBuf is a local: keep a copy of the key in buf */
                    if (bufSize - bufIdx - 1 >= bcpKeyLen) {
                        uprv_memcpy(buf + bufIdx, bcpKeyBuf, bcpKeyLen);
                        pKey = buf + bufIdx;
                        bufIdx += bcpKeyLen;
                        *(buf + bufIdx) = 0;
                        bufIdx++;
                    } else {
                        *status = U_BUFFER_OVERFLOW_ERROR;
                        goto cleanup;
                    }
                }

                if (pBcpType) {
                    char bcpTypeBuf[128];       /* practically long enough even considering multiple subtag type */
                    if (bcpTypeLen >= (int32_t)sizeof(bcpTypeBuf)) {
                        /* the BCP type is too long */
                        *status = U_ILLEGAL_ARGUMENT_ERROR;
                        goto cleanup;
                    }

                    uprv_strncpy(bcpTypeBuf, pBcpType, bcpTypeLen);
                    bcpTypeBuf[bcpTypeLen] = 0;

                    /* BCP type to locale type */
                    pType = uloc_toLegacyType(pKey, bcpTypeBuf);
                    if (pType == NULL) {
                        *status = U_ILLEGAL_ARGUMENT_ERROR;
                        goto cleanup;
                    }
                    if (pType == bcpTypeBuf) {
                        /*
                        The type returned by toLegacyType points to the input buffer.
                        We normalize the result type to lower case.
                        */
                        T_CString_toLowerCase(bcpTypeBuf);
                        /* bcpTypeBuf is a local: keep a copy of the type in buf */
                        if (bufSize - bufIdx - 1 >= bcpTypeLen) {
                            uprv_memcpy(buf + bufIdx, bcpTypeBuf, bcpTypeLen);
                            pType = buf + bufIdx;
                            bufIdx += bcpTypeLen;
                            *(buf + bufIdx) = 0;
                            bufIdx++;
                        } else {
                            *status = U_BUFFER_OVERFLOW_ERROR;
                            goto cleanup;
                        }
                    }
                } else {
                    /* typeless - default type value is "yes" */
                    pType = LOCALE_TYPE_YES;
                }

                /* Special handling for u-va-posix, since we want to treat this as a variant,
                   not as a keyword */
                if (!variantExists && !uprv_strcmp(pKey, POSIX_KEY) && !uprv_strcmp(pType, POSIX_VALUE) ) {
                    *posixVariant = TRUE;
                } else {
                    /* create an ExtensionListEntry for this keyword */
                    kwd = (ExtensionListEntry*)uprv_malloc(sizeof(ExtensionListEntry));
                    if (kwd == NULL) {
                        *status = U_MEMORY_ALLOCATION_ERROR;
                        goto cleanup;
                    }

                    kwd->key = pKey;
                    kwd->value = pType;

                    if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) {
                        // duplicate keyword is allowed, Only the first
                        // is honored.
                        uprv_free(kwd);
                    }
                }

                pBcpKey = pNextBcpKey;
                bcpKeyLen = pNextBcpKey != NULL ? nextBcpKeyLen : 0;
                pBcpType = NULL;
                bcpTypeLen = 0;
            }
        }
    }

    /* hand the collected keywords over to the caller's list */
    kwd = kwdFirst;
    while (kwd != NULL) {
        /* read the link first: a successful insertion re-links the entry */
        nextKwd = kwd->next;
        if (!_addExtensionToList(appendTo, kwd, FALSE)) {
            /* a rejected entry is not owned by any list - release it
               here so that it does not leak */
            uprv_free(kwd);
        }
        kwd = nextKwd;
    }

    return;

cleanup:
    attr = attrFirst;
    while (attr != NULL) {
        nextAttr = attr->next;
        uprv_free(attr);
        attr = nextAttr;
    }

    kwd = kwdFirst;
    while (kwd != NULL) {
        nextKwd = kwd->next;
        uprv_free(kwd);
        kwd = nextKwd;
    }
}
/**
 * Writes the extensions and the private use part of a parsed language tag
 * as locale ID keywords ("@key=type;key=type"), optionally preceded by
 * "_POSIX" when the LDML extension contained va=posix.
 *
 * @param langtag   parsed language tag to read extensions / private use from
 * @param appendAt  destination buffer
 * @param capacity  size of appendAt; also used as the size of the scratch
 *                  buffer that holds converted keyword strings
 * @param status    in/out error code; nothing is done if it is already a failure
 * @return          the result of u_terminateChars() for the produced length
 *                  (the length may exceed capacity), or 0 on failure
 */
static int32_t
_appendKeywords(ULanguageTag* langtag, char* appendAt, int32_t capacity, UErrorCode* status) {
    int32_t reslen = 0;
    int32_t i, n;
    int32_t len;
    ExtensionListEntry *kwdFirst = NULL;
    ExtensionListEntry *kwd;
    const char *key, *type;
    char *kwdBuf = NULL;
    int32_t kwdBufLength = capacity;
    UBool variantExists;
    UBool posixVariant = FALSE;

    if (U_FAILURE(*status)) {
        return 0;
    }

    kwdBuf = (char*)uprv_malloc(kwdBufLength);
    if (kwdBuf == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }

    /* Determine if variants already exists */
    variantExists = (ultag_getVariantsSize(langtag) > 0) ? TRUE : FALSE;

    n = ultag_getExtensionsSize(langtag);

    /* resolve locale keywords and reordering keys */
    for (i = 0; i < n; i++) {
        key = ultag_getExtensionKey(langtag, i);
        type = ultag_getExtensionValue(langtag, i);
        if (*key == LDMLEXT) {
            /*
             * posixVariant is an in/out argument: it tells the callee whether
             * variants exist and comes back TRUE only when va=posix has to be
             * written as a variant. It is seeded right here, so that a tag
             * that has variants but no LDML extension never produces a
             * spurious "_POSIX" below.
             */
            posixVariant = variantExists;
            _appendLDMLExtensionAsKeywords(type, &kwdFirst, kwdBuf, kwdBufLength, &posixVariant, status);
            if (U_FAILURE(*status)) {
                break;
            }
        } else {
            kwd = (ExtensionListEntry*)uprv_malloc(sizeof(ExtensionListEntry));
            if (kwd == NULL) {
                *status = U_MEMORY_ALLOCATION_ERROR;
                break;
            }
            kwd->key = key;
            kwd->value = type;
            if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) {
                uprv_free(kwd);
                *status = U_ILLEGAL_ARGUMENT_ERROR;
                break;
            }
        }
    }

    if (U_SUCCESS(*status)) {
        type = ultag_getPrivateUse(langtag);
        if ((int32_t)uprv_strlen(type) > 0) {
            /* add private use as a keyword */
            kwd = (ExtensionListEntry*)uprv_malloc(sizeof(ExtensionListEntry));
            if (kwd == NULL) {
                *status = U_MEMORY_ALLOCATION_ERROR;
            } else {
                kwd->key = PRIVATEUSE_KEY;
                kwd->value = type;
                if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) {
                    uprv_free(kwd);
                    *status = U_ILLEGAL_ARGUMENT_ERROR;
                }
            }
        }
    }

    /* If a POSIX variant was in the extensions, write it out before writing the keywords. */
    if (U_SUCCESS(*status) && posixVariant) {
        len = (int32_t) uprv_strlen(_POSIX);
        if (reslen < capacity) {
            uprv_memcpy(appendAt + reslen, _POSIX, uprv_min(len, capacity - reslen));
        }
        reslen += len;
    }

    if (U_SUCCESS(*status) && kwdFirst != NULL) {
        /* write out the sorted keywords */
        UBool firstValue = TRUE;
        kwd = kwdFirst;
        do {
            /* reslen keeps counting past capacity so that the full length is reported */
            if (reslen < capacity) {
                if (firstValue) {
                    /* '@' */
                    *(appendAt + reslen) = LOCALE_EXT_SEP;
                    firstValue = FALSE;
                } else {
                    /* ';' */
                    *(appendAt + reslen) = LOCALE_KEYWORD_SEP;
                }
            }
            reslen++;

            /* key */
            len = (int32_t)uprv_strlen(kwd->key);
            if (reslen < capacity) {
                uprv_memcpy(appendAt + reslen, kwd->key, uprv_min(len, capacity - reslen));
            }
            reslen += len;

            /* '=' */
            if (reslen < capacity) {
                *(appendAt + reslen) = LOCALE_KEY_TYPE_SEP;
            }
            reslen++;

            /* type */
            len = (int32_t)uprv_strlen(kwd->value);
            if (reslen < capacity) {
                uprv_memcpy(appendAt + reslen, kwd->value, uprv_min(len, capacity - reslen));
            }
            reslen += len;

            kwd = kwd->next;
        } while (kwd);
    }

    /* clean up */
    kwd = kwdFirst;
    while (kwd != NULL) {
        ExtensionListEntry *tmpKwd = kwd->next;
        uprv_free(kwd);
        kwd = tmpKwd;
    }

    uprv_free(kwdBuf);

    if (U_FAILURE(*status)) {
        return 0;
    }

    return u_terminateChars(appendAt, capacity, reslen, status);
}
/**
 * Converts the parts of a locale ID's variant that cannot be expressed as
 * BCP 47 variant subtags into a private use sequence
 * ("-x-" + PRIVUSE_VARIANT_PREFIX + "-" + subtags) and writes it to appendAt.
 *
 * The text is first staged in a local buffer and only copied to appendAt
 * when no error occurred.
 *
 * @param localeID  locale ID whose variant is read with uloc_getVariant()
 * @param appendAt  destination buffer
 * @param capacity  size of appendAt
 * @param strict    when TRUE, an unreadable variant or an invalid private use
 *                  subtag sets U_ILLEGAL_ARGUMENT_ERROR; otherwise processing
 *                  just stops at that point
 * @param hadPosix  unused
 * @param status    in/out error code; nothing is done if it is already a failure
 * @return          length of the produced text (may exceed capacity), 0 on failure
 */
static int32_t
_appendPrivateuseToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UBool hadPosix, UErrorCode* status) {
    (void)hadPosix;
    char buf[ULOC_FULLNAME_CAPACITY];
    /*
     * The staged text is at most 12 bytes longer than the variant held in
     * buf: every emitted subtag replaces "subtag + separator/terminator" by
     * "separator + subtag", and the private use header adds 11 bytes once.
     */
    char tmpAppend[ULOC_FULLNAME_CAPACITY + 16];
    /* never write past the staging buffer, whatever capacity the caller has */
    int32_t tmpCapacity = uprv_min(capacity, (int32_t)sizeof(tmpAppend));
    UErrorCode tmpStatus = U_ZERO_ERROR;
    int32_t len, i;
    int32_t reslen = 0;

    if (U_FAILURE(*status)) {
        return 0;
    }

    len = uloc_getVariant(localeID, buf, sizeof(buf), &tmpStatus);
    if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
        if (strict) {
            *status = U_ILLEGAL_ARGUMENT_ERROR;
        }
        return 0;
    }

    if (len > 0) {
        char *p, *pPriv;
        UBool bNext = TRUE;
        UBool firstValue = TRUE;
        UBool writeValue;

        pPriv = NULL;
        p = buf;
        while (bNext) {
            writeValue = FALSE;
            if (*p == SEP || *p == LOCALE_SEP || *p == 0) {
                if (*p == 0) {
                    bNext = FALSE;
                } else {
                    *p = 0; /* terminate */
                }
                if (pPriv != NULL) {
                    /* Private use in the canonical format is lowercase in BCP47 */
                    for (i = 0; *(pPriv + i) != 0; i++) {
                        *(pPriv + i) = uprv_tolower(*(pPriv + i));
                    }

                    /* validate */
                    if (_isPrivateuseValueSubtag(pPriv, -1)) {
                        if (firstValue) {
                            /* leading subtags that are valid variant subtags are not written here */
                            if (!_isVariantSubtag(pPriv, -1)) {
                                writeValue = TRUE;
                            }
                        } else {
                            writeValue = TRUE;
                        }
                    } else if (strict) {
                        *status = U_ILLEGAL_ARGUMENT_ERROR;
                        break;
                    } else {
                        break;
                    }

                    if (writeValue) {
                        if (reslen < tmpCapacity) {
                            tmpAppend[reslen++] = SEP;
                        }

                        if (firstValue) {
                            if (reslen < tmpCapacity) {
                                tmpAppend[reslen++] = *PRIVATEUSE_KEY;
                            }

                            if (reslen < tmpCapacity) {
                                tmpAppend[reslen++] = SEP;
                            }

                            len = (int32_t)uprv_strlen(PRIVUSE_VARIANT_PREFIX);
                            if (reslen < tmpCapacity) {
                                uprv_memcpy(tmpAppend + reslen, PRIVUSE_VARIANT_PREFIX, uprv_min(len, tmpCapacity - reslen));
                            }
                            reslen += len;

                            if (reslen < tmpCapacity) {
                                tmpAppend[reslen++] = SEP;
                            }

                            firstValue = FALSE;
                        }

                        len = (int32_t)uprv_strlen(pPriv);
                        if (reslen < tmpCapacity) {
                            uprv_memcpy(tmpAppend + reslen, pPriv, uprv_min(len, tmpCapacity - reslen));
                        }
                        reslen += len;
                    }
                }
                /* reset private use starting position */
                pPriv = NULL;
            } else if (pPriv == NULL) {
                pPriv = p;
            }
            p++;
        }

        if (U_FAILURE(*status)) {
            return 0;
        }
    }

    if (U_SUCCESS(*status)) {
        /*
         * Copy everything that was staged and fits into the destination.
         * Only the first min(reslen, tmpCapacity) bytes of tmpAppend were
         * written, so exactly that many are copied.
         */
        len = uprv_min(reslen, tmpCapacity);
        if (len > 0) {
            uprv_memcpy(appendAt, tmpAppend, len);
        }
    }

    u_terminateChars(appendAt, capacity, reslen, status);

    return reslen;
}
/*
* -------------------------------------------------
*
* ultag_ functions
*
* -------------------------------------------------
*/
/* Bit flags used by the parser */
#define LANG 0x0001
#define EXTL 0x0002
#define SCRT 0x0004
#define REGN 0x0008
#define VART 0x0010
#define EXTS 0x0020
#define EXTV 0x0040
#define PRIV 0x0080
/**
* Ticket #12705 - Visual Studio 2015 Update 3 contains a new code optimizer which has problems optimizing
* this function. (See https://blogs.msdn.microsoft.com/vcblog/2016/05/04/new-code-optimizer/ )
* As a workaround, we will turn off optimization just for this function on VS2015 Update 3 and above.
*/
#if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210))
#pragma optimize( "", off )
#endif
/**
 * Parses a BCP 47 language tag into a newly allocated ULanguageTag.
 *
 * The tag text is copied into a private buffer that is modified in place
 * (case normalization, NUL terminators between subtags); the fields of the
 * result point into that buffer. Grandfathered and redundant tags are
 * replaced by their preferred form before parsing. Parsing stops quietly at
 * the first subtag that is not well-formed.
 *
 * @param tag        input language tag
 * @param tagLen     length of tag, or a negative value if tag is NUL-terminated
 * @param parsedLen  if not NULL, receives the number of input characters that
 *                   were parsed successfully (0 when the tag is shorter than MINLEN)
 * @param status     in/out error code; set to U_MEMORY_ALLOCATION_ERROR when
 *                   an allocation fails
 * @return           the parsed tag, to be released with ultag_close(), or NULL
 *                   on failure
 */
static ULanguageTag*
ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* status) {
    ULanguageTag *t;
    char *tagBuf;
    int16_t next;
    char *pSubtag, *pNext, *pLastGoodPosition;
    int32_t subtagLen;
    int32_t extlangIdx;
    ExtensionListEntry *pExtension;
    char *pExtValueSubtag, *pExtValueSubtagEnd;
    int32_t i;
    UBool privateuseVar = FALSE;
    int32_t grandfatheredLen = 0;

    if (parsedLen != NULL) {
        *parsedLen = 0;
    }

    if (U_FAILURE(*status)) {
        return NULL;
    }

    if (tagLen < 0) {
        tagLen = (int32_t)uprv_strlen(tag);
    }

    /* copy the entire string */
    tagBuf = (char*)uprv_malloc(tagLen + 1);
    if (tagBuf == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    uprv_memcpy(tagBuf, tag, tagLen);
    *(tagBuf + tagLen) = 0;

    /* create a ULanguageTag */
    t = (ULanguageTag*)uprv_malloc(sizeof(ULanguageTag));
    if (t == NULL) {
        uprv_free(tagBuf);
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    _initializeULanguageTag(t);
    t->buf = tagBuf;

    if (tagLen < MINLEN) {
        /* the input tag is too short - return empty ULanguageTag */
        return t;
    }

    /* check if the tag is grandfathered */
    for (i = 0; i < UPRV_LENGTHOF(GRANDFATHERED); i += 2) {
        if (uprv_stricmp(GRANDFATHERED[i], tagBuf) == 0) {
            int32_t newTagLength;

            grandfatheredLen = tagLen;  /* back up for output parsedLen */
            newTagLength = static_cast<int32_t>(uprv_strlen(GRANDFATHERED[i+1]));
            if (tagLen < newTagLength) {
                /*
                 * Allocate the larger buffer before releasing the old one:
                 * t->buf still owns the old buffer, so on failure
                 * ultag_close() frees it exactly once.
                 */
                char *newBuf = (char*)uprv_malloc(newTagLength + 1);
                if (newBuf == NULL) {
                    *status = U_MEMORY_ALLOCATION_ERROR;
                    ultag_close(t);
                    return NULL;
                }
                uprv_free(tagBuf);
                tagBuf = newBuf;
                t->buf = tagBuf;
                tagLen = newTagLength;
            }
            uprv_strcpy(t->buf, GRANDFATHERED[i + 1]);
            break;
        }
    }

    size_t parsedLenDelta = 0;
    if (grandfatheredLen == 0) {
        for (i = 0; i < UPRV_LENGTHOF(REDUNDANT); i += 2) {
            const char* redundantTag = REDUNDANT[i];
            size_t redundantTagLen = uprv_strlen(redundantTag);
            // The preferred tag for a redundant tag is always shorter than redundant
            // tag. A redundant tag may or may not be followed by other subtags.
            // (i.e. "zh-yue" or "zh-yue-u-co-pinyin").
            if (uprv_strnicmp(redundantTag, tagBuf, static_cast<uint32_t>(redundantTagLen)) == 0) {
                const char* redundantTagEnd = tagBuf + redundantTagLen;
                if (*redundantTagEnd  == '\0' || *redundantTagEnd == SEP) {
                    const char* preferredTag = REDUNDANT[i + 1];
                    size_t preferredTagLen = uprv_strlen(preferredTag);
                    uprv_strncpy(t->buf, preferredTag, preferredTagLen);
                    if (*redundantTagEnd == SEP) {
                        /* shift the remaining subtags (including the NUL) down */
                        uprv_memmove(tagBuf + preferredTagLen,
                                     redundantTagEnd,
                                     tagLen - redundantTagLen + 1);
                    } else {
                        tagBuf[preferredTagLen] = '\0';
                    }
                    // parsedLen should be the length of the input
                    // before redundantTag is replaced by preferredTag.
                    // Save the delta to add it back later.
                    parsedLenDelta = redundantTagLen - preferredTagLen;
                    break;
                }
            }
        }
    }

    /*
     * langtag      =   language
     *                  ["-" script]
     *                  ["-" region]
     *                  *("-" variant)
     *                  *("-" extension)
     *                  ["-" privateuse]
     */

    /* 'next' is a bit set of the subtag kinds that are acceptable at this point */
    next = LANG | PRIV;
    pNext = pLastGoodPosition = tagBuf;
    extlangIdx = 0;
    pExtension = NULL;
    pExtValueSubtag = NULL;
    pExtValueSubtagEnd = NULL;

    while (pNext) {
        char *pSep;

        pSubtag = pNext;

        /* locate next separator char */
        pSep = pSubtag;
        while (*pSep) {
            if (*pSep == SEP) {
                break;
            }
            pSep++;
        }
        if (*pSep == 0) {
            /* last subtag */
            pNext = NULL;
        } else {
            pNext = pSep + 1;
        }
        subtagLen = (int32_t)(pSep - pSubtag);

        if (next & LANG) {
            if (_isLanguageSubtag(pSubtag, subtagLen)) {
                *pSep = 0;  /* terminate */
                // TODO: move deprecated language code handling here.
                t->language = T_CString_toLowerCase(pSubtag);

                pLastGoodPosition = pSep;
                next = SCRT | REGN | VART | EXTS | PRIV;
                if (subtagLen <= 3)
                  next |= EXTL;
                continue;
            }
        }
        if (next & EXTL) {
            if (_isExtlangSubtag(pSubtag, subtagLen)) {
                *pSep = 0;
                t->extlang[extlangIdx++] = T_CString_toLowerCase(pSubtag);

                pLastGoodPosition = pSep;
                if (extlangIdx < MAXEXTLANG) {
                    next = EXTL | SCRT | REGN | VART | EXTS | PRIV;
                } else {
                    next = SCRT | REGN | VART | EXTS | PRIV;
                }
                continue;
            }
        }
        if (next & SCRT) {
            if (_isScriptSubtag(pSubtag, subtagLen)) {
                char *p = pSubtag;

                *pSep = 0;

                /* to title case */
                *p = uprv_toupper(*p);
                p++;
                for (; *p; p++) {
                    *p = uprv_tolower(*p);
                }

                t->script = pSubtag;

                pLastGoodPosition = pSep;
                next = REGN | VART | EXTS | PRIV;
                continue;
            }
        }
        if (next & REGN) {
            if (_isRegionSubtag(pSubtag, subtagLen)) {
                *pSep = 0;
                // TODO: move deprecated region code handling here.
                t->region = T_CString_toUpperCase(pSubtag);

                pLastGoodPosition = pSep;
                next = VART | EXTS | PRIV;
                continue;
            }
        }
        if (next & VART) {
            if (_isVariantSubtag(pSubtag, subtagLen) ||
               (privateuseVar && _isPrivateuseVariantSubtag(pSubtag, subtagLen))) {
                VariantListEntry *var;
                UBool isAdded;

                var = (VariantListEntry*)uprv_malloc(sizeof(VariantListEntry));
                if (var == NULL) {
                    *status = U_MEMORY_ALLOCATION_ERROR;
                    goto error;
                }
                *pSep = 0;
                var->variant = T_CString_toUpperCase(pSubtag);
                isAdded = _addVariantToList(&(t->variants), var);
                if (!isAdded) {
                    /* duplicated variant entry */
                    uprv_free(var);
                    break;
                }
                pLastGoodPosition = pSep;
                next = VART | EXTS | PRIV;
                continue;
            }
        }
        if (next & EXTS) {
            if (_isExtensionSingleton(pSubtag, subtagLen)) {
                if (pExtension != NULL) {
                    if (pExtValueSubtag == NULL || pExtValueSubtagEnd == NULL) {
                        /* the previous extension is incomplete */
                        uprv_free(pExtension);
                        pExtension = NULL;
                        break;
                    }

                    /* terminate the previous extension value */
                    *pExtValueSubtagEnd = 0;
                    pExtension->value = T_CString_toLowerCase(pExtValueSubtag);

                    /* insert the extension to the list */
                    if (_addExtensionToList(&(t->extensions), pExtension, FALSE)) {
                        pLastGoodPosition = pExtValueSubtagEnd;
                    } else {
                        /* stop parsing here */
                        uprv_free(pExtension);
                        pExtension = NULL;
                        break;
                    }
                }

                /* create a new extension */
                pExtension = (ExtensionListEntry*)uprv_malloc(sizeof(ExtensionListEntry));
                if (pExtension == NULL) {
                    *status = U_MEMORY_ALLOCATION_ERROR;
                    goto error;
                }
                *pSep = 0;
                pExtension->key = T_CString_toLowerCase(pSubtag);
                pExtension->value = NULL;   /* will be set later */

                /*
                 * reset the start and the end location of extension value
                 * subtags for this extension
                 */
                pExtValueSubtag = NULL;
                pExtValueSubtagEnd = NULL;

                next = EXTV;
                continue;
            }
        }
        if (next & EXTV) {
            if (_isExtensionSubtag(pSubtag, subtagLen)) {
                if (pExtValueSubtag == NULL) {
                    /* if the start position of this extension's value is not yet,
                        this one is the first value subtag */
                    pExtValueSubtag = pSubtag;
                }

                /* Mark the end of this subtag */
                pExtValueSubtagEnd = pSep;
                next = EXTS | EXTV | PRIV;

                continue;
            }
        }
        if (next & PRIV) {
            if (uprv_tolower(*pSubtag) == PRIVATEUSE && subtagLen == 1) {
                char *pPrivuseVal;

                if (pExtension != NULL) {
                    /* Process the last extension */
                    if (pExtValueSubtag == NULL || pExtValueSubtagEnd == NULL) {
                        /* the previous extension is incomplete */
                        uprv_free(pExtension);
                        pExtension = NULL;
                        break;
                    } else {
                        /* terminate the previous extension value */
                        *pExtValueSubtagEnd = 0;
                        pExtension->value = T_CString_toLowerCase(pExtValueSubtag);

                        /* insert the extension to the list */
                        if (_addExtensionToList(&(t->extensions), pExtension, FALSE)) {
                            pLastGoodPosition = pExtValueSubtagEnd;
                            pExtension = NULL;
                        } else {
                        /* stop parsing here */
                            uprv_free(pExtension);
                            pExtension = NULL;
                            break;
                        }
                    }
                }

                /* The rest of part will be private use value subtags */
                if (pNext == NULL) {
                    /* empty private use subtag */
                    break;
                }
                /* back up the private use value start position */
                pPrivuseVal = pNext;

                /* validate private use value subtags */
                while (pNext) {
                    pSubtag = pNext;
                    pSep = pSubtag;
                    while (*pSep) {
                        if (*pSep == SEP) {
                            break;
                        }
                        pSep++;
                    }
                    if (*pSep == 0) {
                        /* last subtag */
                        pNext = NULL;
                    } else {
                        pNext = pSep + 1;
                    }
                    subtagLen = (int32_t)(pSep - pSubtag);

                    if (uprv_strncmp(pSubtag, PRIVUSE_VARIANT_PREFIX, uprv_strlen(PRIVUSE_VARIANT_PREFIX)) == 0) {
                        /* the subtags after the prefix are parsed as variants */
                        *pSep = 0;
                        next = VART;
                        privateuseVar = TRUE;
                        break;
                    } else if (_isPrivateuseValueSubtag(pSubtag, subtagLen)) {
                        pLastGoodPosition = pSep;
                    } else {
                        break;
                    }
                }

                if (next == VART) {
                    continue;
                }

                if (pLastGoodPosition - pPrivuseVal > 0) {
                    *pLastGoodPosition = 0;
                    t->privateuse = T_CString_toLowerCase(pPrivuseVal);
                }
                /* No more subtags, exiting the parse loop */
                break;
            }
            break;
        }

        /* If we fell through here, it means this subtag is illegal - quit parsing */
        break;
    }

    if (pExtension != NULL) {
        /* Process the last extension */
        if (pExtValueSubtag == NULL || pExtValueSubtagEnd == NULL) {
            /* the previous extension is incomplete */
            uprv_free(pExtension);
        } else {
            /* terminate the previous extension value */
            *pExtValueSubtagEnd = 0;
            pExtension->value = T_CString_toLowerCase(pExtValueSubtag);
            /* insert the extension to the list */
            if (_addExtensionToList(&(t->extensions), pExtension, FALSE)) {
                pLastGoodPosition = pExtValueSubtagEnd;
            } else {
                uprv_free(pExtension);
            }
        }
    }

    if (parsedLen != NULL) {
        *parsedLen = (grandfatheredLen > 0) ? grandfatheredLen :
            (int32_t)(pLastGoodPosition - t->buf + parsedLenDelta);
    }

    return t;

error:
    ultag_close(t);
    return NULL;
}
/**
* Ticket #12705 - Turn optimization back on.
*/
#if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210))
#pragma optimize( "", on )
#endif
/**
 * Releases a ULanguageTag together with its text buffer and its variant
 * and extension lists. A NULL argument is ignored.
 */
static void
ultag_close(ULanguageTag* langtag) {
    VariantListEntry *variantNode;
    ExtensionListEntry *extensionNode;

    if (langtag == NULL) {
        return;
    }

    uprv_free(langtag->buf);

    /* release the variant list node by node */
    for (variantNode = langtag->variants; variantNode != NULL; ) {
        VariantListEntry *following = variantNode->next;
        uprv_free(variantNode);
        variantNode = following;
    }

    /* release the extension list node by node */
    for (extensionNode = langtag->extensions; extensionNode != NULL; ) {
        ExtensionListEntry *following = extensionNode->next;
        uprv_free(extensionNode);
        extensionNode = following;
    }

    uprv_free(langtag);
}
/** Returns the language field of the parsed tag. */
static const char*
ultag_getLanguage(const ULanguageTag* langtag) {
return langtag->language;
}
#if 0
/*
 * Disabled code (excluded from compilation by the enclosing #if 0).
 * Looks the language up in DEPRECATEDLANGS, which is walked as pairs, and
 * returns the second element of the matching pair; returns the language
 * itself when no pair matches.
 */
static const char*
ultag_getJDKLanguage(const ULanguageTag* langtag) {
int32_t i;
for (i = 0; DEPRECATEDLANGS[i] != NULL; i += 2) {
if (uprv_compareInvCharsAsAscii(DEPRECATEDLANGS[i], langtag->language) == 0) {
return DEPRECATEDLANGS[i + 1];
}
}
return langtag->language;
}
#endif
/**
 * Returns the extlang entry stored at position idx, or NULL when idx is
 * outside [0, MAXEXTLANG).
 */
static const char*
ultag_getExtlang(const ULanguageTag* langtag, int32_t idx) {
    if (idx < 0 || idx >= MAXEXTLANG) {
        return NULL;
    }
    return langtag->extlang[idx];
}
/** Returns the number of extlang slots that hold a non-NULL entry. */
static int32_t
ultag_getExtlangSize(const ULanguageTag* langtag) {
    int32_t count = 0;
    int32_t slot = 0;

    while (slot < MAXEXTLANG) {
        if (langtag->extlang[slot] != NULL) {
            ++count;
        }
        ++slot;
    }
    return count;
}
/** Returns the script field of the parsed tag. */
static const char*
ultag_getScript(const ULanguageTag* langtag) {
return langtag->script;
}
/** Returns the region field of the parsed tag. */
static const char*
ultag_getRegion(const ULanguageTag* langtag) {
return langtag->region;
}
static const char*
ultag_getVariant(const ULanguageTag* langtag, int32_t idx) {
const char *var = NULL;
VariantListEntry *cur = langtag->variants;
int32_t i = 0;
while (cur) {
if (i == idx) {
var = cur->variant;
break;
}
cur = cur->next;
i++;
}
return var;
}
/** Returns the number of entries in the variant list. */
static int32_t
ultag_getVariantsSize(const ULanguageTag* langtag) {
    int32_t count = 0;
    const VariantListEntry *entry;

    for (entry = langtag->variants; entry != NULL; entry = entry->next) {
        ++count;
    }
    return count;
}
static const char*
ultag_getExtensionKey(const ULanguageTag* langtag, int32_t idx) {
const char *key = NULL;
ExtensionListEntry *cur = langtag->extensions;
int32_t i = 0;
while (cur) {
if (i == idx) {
key = cur->key;
break;
}
cur = cur->next;
i++;
}
return key;
}
static const char*
ultag_getExtensionValue(const ULanguageTag* langtag, int32_t idx) {
const char *val = NULL;
ExtensionListEntry *cur = langtag->extensions;
int32_t i = 0;
while (cur) {
if (i == idx) {
val = cur->value;
break;
}
cur = cur->next;
i++;
}
return val;
}
/** Returns the number of entries in the extension list. */
static int32_t
ultag_getExtensionsSize(const ULanguageTag* langtag) {
    int32_t count = 0;
    const ExtensionListEntry *entry;

    for (entry = langtag->extensions; entry != NULL; entry = entry->next) {
        ++count;
    }
    return count;
}
/** Returns the private use field of the parsed tag. */
static const char*
ultag_getPrivateUse(const ULanguageTag* langtag) {
return langtag->privateuse;
}
#if 0
/*
 * Disabled code (excluded from compilation by the enclosing #if 0).
 * Returns the grandfathered field of the parsed tag.
 */
static const char*
ultag_getGrandfathered(const ULanguageTag* langtag) {
return langtag->grandfathered;
}
#endif
/*
* -------------------------------------------------
*
* Locale/BCP47 conversion APIs, exposed as uloc_*
*
* -------------------------------------------------
*/
U_CAPI int32_t U_EXPORT2
uloc_toLanguageTag(const char* localeID,
char* langtag,
int32_t langtagCapacity,
UBool strict,
UErrorCode* status) {
icu::CharString canonical;
int32_t reslen;
UErrorCode tmpStatus = U_ZERO_ERROR;
UBool hadPosix = FALSE;
const char* pKeywordStart;
/* Note: uloc_canonicalize returns "en_US_POSIX" for input locale ID "". See #6835 */
int32_t resultCapacity = static_cast<int32_t>(uprv_strlen(localeID));
if (resultCapacity > 0) {
char* buffer;
for (;;) {
buffer = canonical.getAppendBuffer(
/*minCapacity=*/resultCapacity,
/*desiredCapacityHint=*/resultCapacity,
resultCapacity,
tmpStatus);
if (U_FAILURE(tmpStatus)) {
*status = tmpStatus;
return 0;
}
reslen =
uloc_canonicalize(localeID, buffer, resultCapacity, &tmpStatus);
if (tmpStatus != U_BUFFER_OVERFLOW_ERROR) {
break;
}
resultCapacity = reslen;
tmpStatus = U_ZERO_ERROR;
}
if (U_FAILURE(tmpStatus)) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
canonical.append(buffer, reslen, tmpStatus);
if (tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
tmpStatus = U_ZERO_ERROR; // Terminators provided by CharString.
}