blob: c2a7011b5093eb2e126bd81671d50bd837ea48f8 [file] [log] [blame]
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// loclikelysubtags.cpp
// created: 2019may08 Markus W. Scherer
#include <utility>
#include "unicode/utypes.h"
#include "unicode/bytestrie.h"
#include "unicode/localpointer.h"
#include "unicode/locid.h"
#include "unicode/uobject.h"
#include "unicode/ures.h"
#include "unicode/uscript.h"
#include "charstr.h"
#include "cstring.h"
#include "loclikelysubtags.h"
#include "lsr.h"
#include "uassert.h"
#include "ucln_cmn.h"
#include "uhash.h"
#include "uinvchar.h"
#include "umutex.h"
#include "uniquecharstr.h"
#include "uresdata.h"
#include "uresimp.h"
#include "uvector.h"
U_NAMESPACE_BEGIN
namespace {
constexpr char PSEUDO_ACCENTS_PREFIX = '\''; // -XA, -PSACCENT
constexpr char PSEUDO_BIDI_PREFIX = '+'; // -XB, -PSBIDI
constexpr char PSEUDO_CRACKED_PREFIX = ','; // -XC, -PSCRACK
} // namespace
LocaleDistanceData::LocaleDistanceData(LocaleDistanceData &&data) :
distanceTrieBytes(data.distanceTrieBytes),
regionToPartitions(data.regionToPartitions),
partitions(data.partitions),
paradigms(data.paradigms), paradigmsLength(data.paradigmsLength),
distances(data.distances) {
data.partitions = nullptr;
data.paradigms = nullptr;
}
LocaleDistanceData::~LocaleDistanceData() {
uprv_free(partitions);
delete[] paradigms;
}
// TODO(ICU-20777): Rename to just LikelySubtagsData.
struct XLikelySubtagsData {
UResourceBundle *langInfoBundle = nullptr;
UniqueCharStrings strings;
CharStringMap languageAliases;
CharStringMap regionAliases;
const uint8_t *trieBytes = nullptr;
LSR *lsrs = nullptr;
int32_t lsrsLength = 0;
LocaleDistanceData distanceData;
XLikelySubtagsData(UErrorCode &errorCode) : strings(errorCode) {}
~XLikelySubtagsData() {
ures_close(langInfoBundle);
delete[] lsrs;
}
void load(UErrorCode &errorCode) {
langInfoBundle = ures_openDirect(nullptr, "langInfo", &errorCode);
if (U_FAILURE(errorCode)) { return; }
StackUResourceBundle stackTempBundle;
ResourceDataValue value;
ures_getValueWithFallback(langInfoBundle, "likely", stackTempBundle.getAlias(),
value, errorCode);
ResourceTable likelyTable = value.getTable(errorCode);
if (U_FAILURE(errorCode)) { return; }
// Read all strings in the resource bundle and convert them to invariant char *.
LocalMemory<int32_t> languageIndexes, regionIndexes, lsrSubtagIndexes;
int32_t languagesLength = 0, regionsLength = 0, lsrSubtagsLength = 0;
ResourceArray m49Array;
if (likelyTable.findValue("m49", value)) {
m49Array = value.getArray(errorCode);
} else {
errorCode = U_MISSING_RESOURCE_ERROR;
return;
}
if (!readStrings(likelyTable, "languageAliases", value,
languageIndexes, languagesLength, errorCode) ||
!readStrings(likelyTable, "regionAliases", value,
regionIndexes, regionsLength, errorCode) ||
!readLSREncodedStrings(likelyTable, "lsrnum", value, m49Array,
lsrSubtagIndexes,lsrSubtagsLength, errorCode)) {
return;
}
if ((languagesLength & 1) != 0 ||
(regionsLength & 1) != 0 ||
(lsrSubtagsLength % 3) != 0) {
errorCode = U_INVALID_FORMAT_ERROR;
return;
}
if (lsrSubtagsLength == 0) {
errorCode = U_MISSING_RESOURCE_ERROR;
return;
}
if (!likelyTable.findValue("trie", value)) {
errorCode = U_MISSING_RESOURCE_ERROR;
return;
}
int32_t length;
trieBytes = value.getBinary(length, errorCode);
if (U_FAILURE(errorCode)) { return; }
// Also read distance/matcher data if available,
// to open & keep only one resource bundle pointer
// and to use one single UniqueCharStrings.
UErrorCode matchErrorCode = U_ZERO_ERROR;
ures_getValueWithFallback(langInfoBundle, "match", stackTempBundle.getAlias(),
value, matchErrorCode);
LocalMemory<int32_t> partitionIndexes, paradigmSubtagIndexes;
int32_t partitionsLength = 0, paradigmSubtagsLength = 0;
if (U_SUCCESS(matchErrorCode)) {
ResourceTable matchTable = value.getTable(errorCode);
if (U_FAILURE(errorCode)) { return; }
if (matchTable.findValue("trie", value)) {
distanceData.distanceTrieBytes = value.getBinary(length, errorCode);
if (U_FAILURE(errorCode)) { return; }
}
if (matchTable.findValue("regionToPartitions", value)) {
distanceData.regionToPartitions = value.getBinary(length, errorCode);
if (U_FAILURE(errorCode)) { return; }
if (length < LSR::REGION_INDEX_LIMIT) {
errorCode = U_INVALID_FORMAT_ERROR;
return;
}
}
if (!readStrings(matchTable, "partitions", value,
partitionIndexes, partitionsLength, errorCode) ||
!readLSREncodedStrings(matchTable, "paradigmnum", value, m49Array,
paradigmSubtagIndexes, paradigmSubtagsLength, errorCode)) {
return;
}
if ((paradigmSubtagsLength % 3) != 0) {
errorCode = U_INVALID_FORMAT_ERROR;
return;
}
if (matchTable.findValue("distances", value)) {
distanceData.distances = value.getIntVector(length, errorCode);
if (U_FAILURE(errorCode)) { return; }
if (length < 4) { // LocaleDistance IX_LIMIT
errorCode = U_INVALID_FORMAT_ERROR;
return;
}
}
} else if (matchErrorCode == U_MISSING_RESOURCE_ERROR) {
// ok for likely subtags
} else { // error other than missing resource
errorCode = matchErrorCode;
return;
}
// Fetch & store invariant-character versions of strings
// only after we have collected and de-duplicated all of them.
strings.freeze();
languageAliases = CharStringMap(languagesLength / 2, errorCode);
for (int32_t i = 0; i < languagesLength; i += 2) {
languageAliases.put(strings.get(languageIndexes[i]),
strings.get(languageIndexes[i + 1]), errorCode);
}
regionAliases = CharStringMap(regionsLength / 2, errorCode);
for (int32_t i = 0; i < regionsLength; i += 2) {
regionAliases.put(strings.get(regionIndexes[i]),
strings.get(regionIndexes[i + 1]), errorCode);
}
if (U_FAILURE(errorCode)) { return; }
lsrsLength = lsrSubtagsLength / 3;
lsrs = new LSR[lsrsLength];
if (lsrs == nullptr) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return;
}
for (int32_t i = 0, j = 0; i < lsrSubtagsLength; i += 3, ++j) {
lsrs[j] = LSR(strings.get(lsrSubtagIndexes[i]),
strings.get(lsrSubtagIndexes[i + 1]),
strings.get(lsrSubtagIndexes[i + 2]),
LSR::IMPLICIT_LSR);
}
if (partitionsLength > 0) {
distanceData.partitions = static_cast<const char **>(
uprv_malloc(partitionsLength * sizeof(const char *)));
if (distanceData.partitions == nullptr) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return;
}
for (int32_t i = 0; i < partitionsLength; ++i) {
distanceData.partitions[i] = strings.get(partitionIndexes[i]);
}
}
if (paradigmSubtagsLength > 0) {
distanceData.paradigmsLength = paradigmSubtagsLength / 3;
LSR *paradigms = new LSR[distanceData.paradigmsLength];
if (paradigms == nullptr) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return;
}
for (int32_t i = 0, j = 0; i < paradigmSubtagsLength; i += 3, ++j) {
paradigms[j] = LSR(strings.get(paradigmSubtagIndexes[i]),
strings.get(paradigmSubtagIndexes[i + 1]),
strings.get(paradigmSubtagIndexes[i + 2]),
LSR::DONT_CARE_FLAGS);
}
distanceData.paradigms = paradigms;
}
}
private:
bool readStrings(const ResourceTable &table, const char *key, ResourceValue &value,
LocalMemory<int32_t> &indexes, int32_t &length, UErrorCode &errorCode) {
if (table.findValue(key, value)) {
ResourceArray stringArray = value.getArray(errorCode);
if (U_FAILURE(errorCode)) { return false; }
length = stringArray.getSize();
if (length == 0) { return true; }
int32_t *rawIndexes = indexes.allocateInsteadAndCopy(length);
if (rawIndexes == nullptr) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return false;
}
for (int i = 0; i < length; ++i) {
if (stringArray.getValue(i, value)) { // returns true because i < length
int32_t strLength = 0;
rawIndexes[i] = strings.add(value.getString(strLength, errorCode), errorCode);
if (U_FAILURE(errorCode)) { return false; }
}
}
}
return true;
}
UnicodeString toLanguage(int encoded) {
if (encoded == 0) {
return UNICODE_STRING_SIMPLE("");
}
if (encoded == 1) {
return UNICODE_STRING_SIMPLE("skip");
}
encoded &= 0x00ffffff;
encoded %= 27*27*27;
char lang[3];
lang[0] = 'a' + ((encoded % 27) - 1);
lang[1] = 'a' + (((encoded / 27 ) % 27) - 1);
if (encoded / (27 * 27) == 0) {
return UnicodeString(lang, 2, US_INV);
}
lang[2] = 'a' + ((encoded / (27 * 27)) - 1);
return UnicodeString(lang, 3, US_INV);
}
UnicodeString toScript(int encoded) {
if (encoded == 0) {
return UNICODE_STRING_SIMPLE("");
}
if (encoded == 1) {
return UNICODE_STRING_SIMPLE("script");
}
encoded = (encoded >> 24) & 0x000000ff;
const char* script = uscript_getShortName(static_cast<UScriptCode>(encoded));
if (script == nullptr) {
return UNICODE_STRING_SIMPLE("");
}
U_ASSERT(uprv_strlen(script) == 4);
return UnicodeString(script, 4, US_INV);
}
UnicodeString m49IndexToCode(const ResourceArray &m49Array, ResourceValue &value, int index, UErrorCode &errorCode) {
if (U_FAILURE(errorCode)) {
return UNICODE_STRING_SIMPLE("");
}
if (m49Array.getValue(index, value)) {
return value.getUnicodeString(errorCode);
}
// "m49" does not include the index.
errorCode = U_MISSING_RESOURCE_ERROR;
return UNICODE_STRING_SIMPLE("");
}
UnicodeString toRegion(const ResourceArray& m49Array, ResourceValue &value, int encoded, UErrorCode &errorCode) {
if (encoded == 0 || encoded == 1) {
return UNICODE_STRING_SIMPLE("");
}
encoded &= 0x00ffffff;
encoded /= 27 * 27 * 27;
encoded %= 27 * 27;
if (encoded < 27) {
// Selected M49 code index, find the code from "m49" resource.
return m49IndexToCode(m49Array, value, encoded, errorCode);
}
char region[2];
region[0] = 'A' + ((encoded % 27) - 1);
region[1] = 'A' + (((encoded / 27) % 27) - 1);
return UnicodeString(region, 2, US_INV);
}
bool readLSREncodedStrings(const ResourceTable &table, const char* key, ResourceValue &value, const ResourceArray& m49Array,
LocalMemory<int32_t> &indexes, int32_t &length, UErrorCode &errorCode) {
if (table.findValue(key, value)) {
const int32_t* vectors = value.getIntVector(length, errorCode);
if (U_FAILURE(errorCode)) { return false; }
if (length == 0) { return true; }
int32_t *rawIndexes = indexes.allocateInsteadAndCopy(length * 3);
if (rawIndexes == nullptr) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return false;
}
for (int i = 0; i < length; ++i) {
rawIndexes[i*3] = strings.addByValue(toLanguage(vectors[i]), errorCode);
rawIndexes[i*3+1] = strings.addByValue(toScript(vectors[i]), errorCode);
rawIndexes[i*3+2] = strings.addByValue(
toRegion(m49Array, value, vectors[i], errorCode), errorCode);
if (U_FAILURE(errorCode)) { return false; }
}
length *= 3;
}
return true;
}
};
namespace {
XLikelySubtags *gLikelySubtags = nullptr;
UVector *gMacroregions = nullptr;
UInitOnce gInitOnce {};
UBool U_CALLCONV cleanup() {
delete gLikelySubtags;
gLikelySubtags = nullptr;
delete gMacroregions;
gMacroregions = nullptr;
gInitOnce.reset();
return true;
}
static const char16_t RANGE_MARKER = 0x7E; /* '~' */
UVector* loadMacroregions(UErrorCode &status) {
LocalPointer<UVector> newMacroRegions(new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status), status);
LocalUResourceBundlePointer supplementalData(ures_openDirect(nullptr,"supplementalData",&status));
LocalUResourceBundlePointer idValidity(ures_getByKey(supplementalData.getAlias(),"idValidity",nullptr,&status));
LocalUResourceBundlePointer regionList(ures_getByKey(idValidity.getAlias(),"region",nullptr,&status));
LocalUResourceBundlePointer regionMacro(ures_getByKey(regionList.getAlias(),"macroregion",nullptr,&status));
if (U_FAILURE(status)) {
return nullptr;
}
while (U_SUCCESS(status) && ures_hasNext(regionMacro.getAlias())) {
UnicodeString regionName = ures_getNextUnicodeString(regionMacro.getAlias(),nullptr,&status);
int32_t rangeMarkerLocation = regionName.indexOf(RANGE_MARKER);
char16_t buf[6];
regionName.extract(buf,6,status);
if ( rangeMarkerLocation > 0 ) {
char16_t endRange = regionName.charAt(rangeMarkerLocation+1);
buf[rangeMarkerLocation] = 0;
while ( buf[rangeMarkerLocation-1] <= endRange && U_SUCCESS(status)) {
LocalPointer<UnicodeString> newRegion(new UnicodeString(buf), status);
newMacroRegions->adoptElement(newRegion.orphan(),status);
buf[rangeMarkerLocation-1]++;
}
} else {
LocalPointer<UnicodeString> newRegion(new UnicodeString(regionName), status);
newMacroRegions->adoptElement(newRegion.orphan(),status);
}
}
return newMacroRegions.orphan();
}
} // namespace
void U_CALLCONV XLikelySubtags::initLikelySubtags(UErrorCode &errorCode) {
// This function is invoked only via umtx_initOnce().
U_ASSERT(gLikelySubtags == nullptr);
XLikelySubtagsData data(errorCode);
data.load(errorCode);
if (U_FAILURE(errorCode)) { return; }
gLikelySubtags = new XLikelySubtags(data);
gMacroregions = loadMacroregions(errorCode);
if (U_FAILURE(errorCode) || gLikelySubtags == nullptr || gMacroregions == nullptr) {
delete gLikelySubtags;
delete gMacroregions;
errorCode = U_MEMORY_ALLOCATION_ERROR;
return;
}
ucln_common_registerCleanup(UCLN_COMMON_LIKELY_SUBTAGS, cleanup);
}
const XLikelySubtags *XLikelySubtags::getSingleton(UErrorCode &errorCode) {
if (U_FAILURE(errorCode)) { return nullptr; }
umtx_initOnce(gInitOnce, &XLikelySubtags::initLikelySubtags, errorCode);
return gLikelySubtags;
}
XLikelySubtags::XLikelySubtags(XLikelySubtagsData &data) :
langInfoBundle(data.langInfoBundle),
strings(data.strings.orphanCharStrings()),
languageAliases(std::move(data.languageAliases)),
regionAliases(std::move(data.regionAliases)),
trie(data.trieBytes),
lsrs(data.lsrs),
#if U_DEBUG
lsrsLength(data.lsrsLength),
#endif
distanceData(std::move(data.distanceData)) {
data.langInfoBundle = nullptr;
data.lsrs = nullptr;
// Cache the result of looking up language="und" encoded as "*", and "und-Zzzz" ("**").
UStringTrieResult result = trie.next(u'*');
U_ASSERT(USTRINGTRIE_HAS_NEXT(result));
trieUndState = trie.getState64();
result = trie.next(u'*');
U_ASSERT(USTRINGTRIE_HAS_NEXT(result));
trieUndZzzzState = trie.getState64();
result = trie.next(u'*');
U_ASSERT(USTRINGTRIE_HAS_VALUE(result));
defaultLsrIndex = trie.getValue();
trie.reset();
for (char16_t c = u'a'; c <= u'z'; ++c) {
result = trie.next(c);
if (result == USTRINGTRIE_NO_VALUE) {
trieFirstLetterStates[c - u'a'] = trie.getState64();
}
trie.reset();
}
}
XLikelySubtags::~XLikelySubtags() {
ures_close(langInfoBundle);
delete strings;
delete[] lsrs;
}
LSR XLikelySubtags::makeMaximizedLsrFrom(const Locale &locale,
bool returnInputIfUnmatch,
UErrorCode &errorCode) const {
if (locale.isBogus()) {
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
return LSR("", "", "", LSR::EXPLICIT_LSR);
}
const char *name = locale.getName();
if (uprv_isAtSign(name[0]) && name[1] == 'x' && name[2] == '=') { // name.startsWith("@x=")
// Private use language tag x-subtag-subtag... which CLDR changes to
// und-x-subtag-subtag...
return LSR(name, "", "", LSR::EXPLICIT_LSR);
}
LSR max = makeMaximizedLsr(locale.getLanguage(), locale.getScript(), locale.getCountry(),
locale.getVariant(), returnInputIfUnmatch, errorCode);
if (uprv_strlen(max.language) == 0 &&
uprv_strlen(max.script) == 0 &&
uprv_strlen(max.region) == 0) {
// No match. ICU API mandate us to
// If the provided ULocale instance is already in the maximal form, or
// there is no data available available for maximization, it will be
// returned.
return LSR(locale.getLanguage(), locale.getScript(), locale.getCountry(), LSR::EXPLICIT_LSR, errorCode);
}
return max;
}
namespace {
const char *getCanonical(const CharStringMap &aliases, const char *alias) {
const char *canonical = aliases.get(alias);
return canonical == nullptr ? alias : canonical;
}
} // namespace
LSR XLikelySubtags::makeMaximizedLsr(const char *language, const char *script, const char *region,
const char *variant,
bool returnInputIfUnmatch,
UErrorCode &errorCode) const {
// Handle pseudolocales like en-XA, ar-XB, fr-PSCRACK.
// They should match only themselves,
// not other locales with what looks like the same language and script subtags.
char c1;
if (region[0] == 'X' && (c1 = region[1]) != 0 && region[2] == 0) {
switch (c1) {
case 'A':
return LSR(PSEUDO_ACCENTS_PREFIX, language, script, region,
LSR::EXPLICIT_LSR, errorCode);
case 'B':
return LSR(PSEUDO_BIDI_PREFIX, language, script, region,
LSR::EXPLICIT_LSR, errorCode);
case 'C':
return LSR(PSEUDO_CRACKED_PREFIX, language, script, region,
LSR::EXPLICIT_LSR, errorCode);
default: // normal locale
break;
}
}
if (variant[0] == 'P' && variant[1] == 'S') {
int32_t lsrFlags = *region == 0 ?
LSR::EXPLICIT_LANGUAGE | LSR::EXPLICIT_SCRIPT : LSR::EXPLICIT_LSR;
if (uprv_strcmp(variant, "PSACCENT") == 0) {
return LSR(PSEUDO_ACCENTS_PREFIX, language, script,
*region == 0 ? "XA" : region, lsrFlags, errorCode);
} else if (uprv_strcmp(variant, "PSBIDI") == 0) {
return LSR(PSEUDO_BIDI_PREFIX, language, script,
*region == 0 ? "XB" : region, lsrFlags, errorCode);
} else if (uprv_strcmp(variant, "PSCRACK") == 0) {
return LSR(PSEUDO_CRACKED_PREFIX, language, script,
*region == 0 ? "XC" : region, lsrFlags, errorCode);
}
// else normal locale
}
language = getCanonical(languageAliases, language);
// (We have no script mappings.)
region = getCanonical(regionAliases, region);
return maximize(language, script, region, returnInputIfUnmatch, errorCode);
}
LSR XLikelySubtags::maximize(const char *language, const char *script, const char *region,
bool returnInputIfUnmatch,
UErrorCode &errorCode) const {
return maximize({language, (int32_t)uprv_strlen(language)},
{script, (int32_t)uprv_strlen(script)},
{region, (int32_t)uprv_strlen(region)},
returnInputIfUnmatch,
errorCode);
}
bool XLikelySubtags::isMacroregion(StringPiece& region, UErrorCode& errorCode) const {
// In Java, we use Region class. In C++, since Region is under i18n,
// we read the same data used by Region into gMacroregions avoid dependency
// from common to i18n/region.cpp
if (U_FAILURE(errorCode)) { return false; }
umtx_initOnce(gInitOnce, &XLikelySubtags::initLikelySubtags, errorCode);
if (U_FAILURE(errorCode)) { return false; }
UnicodeString str(UnicodeString::fromUTF8(region));
return gMacroregions->contains((void *)&str);
}
LSR XLikelySubtags::maximize(StringPiece language, StringPiece script, StringPiece region,
bool returnInputIfUnmatch,
UErrorCode &errorCode) const {
if (U_FAILURE(errorCode)) {
return LSR(language, script, region, LSR::EXPLICIT_LSR, errorCode);
}
if (language.compare("und") == 0) {
language = "";
}
if (script.compare("Zzzz") == 0) {
script = "";
}
if (region.compare("ZZ") == 0) {
region = "";
}
if (!script.empty() && !region.empty() && !language.empty()) {
return LSR(language, script, region, LSR::EXPLICIT_LSR, errorCode); // already maximized
}
bool retainLanguage = false;
bool retainScript = false;
bool retainRegion = false;
BytesTrie iter(trie);
uint64_t state;
int32_t value;
// Small optimization: Array lookup for first language letter.
int32_t c0;
if (0 <= (c0 = uprv_lowerOrdinal(language.data()[0])) && c0 <= 25 &&
language.length() >= 2 &&
(state = trieFirstLetterStates[c0]) != 0) {
value = trieNext(iter.resetToState64(state), language, 1);
} else {
value = trieNext(iter, language, 0);
}
bool matchLanguage = (value >= 0);
bool matchScript = false;
if (value >= 0) {
retainLanguage = !language.empty();
state = iter.getState64();
} else {
retainLanguage = true;
iter.resetToState64(trieUndState); // "und" ("*")
state = 0;
}
if (value >= 0 && !script.empty()) {
matchScript = true;
}
if (value > 0) {
// Intermediate or final value from just language.
if (value == SKIP_SCRIPT) {
value = 0;
}
retainScript = !script.empty();
} else {
value = trieNext(iter, script, 0);
if (value >= 0) {
retainScript = !script.empty();
state = iter.getState64();
} else {
retainScript = true;
if (state == 0) {
iter.resetToState64(trieUndZzzzState); // "und-Zzzz" ("**")
} else {
iter.resetToState64(state);
value = trieNext(iter, "", 0);
U_ASSERT(value >= 0);
state = iter.getState64();
}
}
}
bool matchRegion = false;
if (value > 0) {
// Final value from just language or language+script.
retainRegion = !region.empty();
} else {
value = trieNext(iter, region, 0);
if (value >= 0) {
if (!region.empty() && !isMacroregion(region, errorCode)) {
retainRegion = true;
matchRegion = true;
}
} else {
retainRegion = true;
if (state == 0) {
value = defaultLsrIndex;
} else {
iter.resetToState64(state);
value = trieNext(iter, "", 0);
U_ASSERT(value > 0);
}
}
}
U_ASSERT(value < lsrsLength);
const LSR &matched = lsrs[value];
if (returnInputIfUnmatch &&
(!(matchLanguage || matchScript || (matchRegion && language.empty())))) {
return LSR("", "", "", LSR::EXPLICIT_LSR, errorCode); // no matching.
}
if (language.empty()) {
language = StringPiece("und");
}
if (!(retainLanguage || retainScript || retainRegion)) {
// Quickly return a copy of the lookup-result LSR
// without new allocation of the subtags.
return LSR(matched.language, matched.script, matched.region, matched.flags);
}
if (!retainLanguage) {
language = matched.language;
}
if (!retainScript) {
script = matched.script;
}
if (!retainRegion) {
region = matched.region;
}
int32_t retainMask = (retainLanguage ? 4 : 0) + (retainScript ? 2 : 0) + (retainRegion ? 1 : 0);
// retainOldMask flags = LSR explicit-subtag flags
return LSR(language, script, region, retainMask, errorCode);
}
int32_t XLikelySubtags::compareLikely(const LSR &lsr, const LSR &other, int32_t likelyInfo) const {
// If likelyInfo >= 0:
// likelyInfo bit 1 is set if the previous comparison with lsr
// was for equal language and script.
// Otherwise the scripts differed.
if (uprv_strcmp(lsr.language, other.language) != 0) {
return 0xfffffffc; // negative, lsr not better than other
}
if (uprv_strcmp(lsr.script, other.script) != 0) {
int32_t index;
if (likelyInfo >= 0 && (likelyInfo & 2) == 0) {
index = likelyInfo >> 2;
} else {
index = getLikelyIndex(lsr.language, "");
likelyInfo = index << 2;
}
const LSR &likely = lsrs[index];
if (uprv_strcmp(lsr.script, likely.script) == 0) {
return likelyInfo | 1;
} else {
return likelyInfo & ~1;
}
}
if (uprv_strcmp(lsr.region, other.region) != 0) {
int32_t index;
if (likelyInfo >= 0 && (likelyInfo & 2) != 0) {
index = likelyInfo >> 2;
} else {
index = getLikelyIndex(lsr.language, lsr.region);
likelyInfo = (index << 2) | 2;
}
const LSR &likely = lsrs[index];
if (uprv_strcmp(lsr.region, likely.region) == 0) {
return likelyInfo | 1;
} else {
return likelyInfo & ~1;
}
}
return likelyInfo & ~1; // lsr not better than other
}
// Subset of maximize().
int32_t XLikelySubtags::getLikelyIndex(const char *language, const char *script) const {
if (uprv_strcmp(language, "und") == 0) {
language = "";
}
if (uprv_strcmp(script, "Zzzz") == 0) {
script = "";
}
BytesTrie iter(trie);
uint64_t state;
int32_t value;
// Small optimization: Array lookup for first language letter.
int32_t c0;
if (0 <= (c0 = uprv_lowerOrdinal(language[0])) && c0 <= 25 &&
language[1] != 0 && // language.length() >= 2
(state = trieFirstLetterStates[c0]) != 0) {
value = trieNext(iter.resetToState64(state), language, 1);
} else {
value = trieNext(iter, language, 0);
}
if (value >= 0) {
state = iter.getState64();
} else {
iter.resetToState64(trieUndState); // "und" ("*")
state = 0;
}
if (value > 0) {
// Intermediate or final value from just language.
if (value == SKIP_SCRIPT) {
value = 0;
}
} else {
value = trieNext(iter, script, 0);
if (value >= 0) {
state = iter.getState64();
} else {
if (state == 0) {
iter.resetToState64(trieUndZzzzState); // "und-Zzzz" ("**")
} else {
iter.resetToState64(state);
value = trieNext(iter, "", 0);
U_ASSERT(value >= 0);
state = iter.getState64();
}
}
}
if (value > 0) {
// Final value from just language or language+script.
} else {
value = trieNext(iter, "", 0);
U_ASSERT(value > 0);
}
U_ASSERT(value < lsrsLength);
return value;
}
int32_t XLikelySubtags::trieNext(BytesTrie &iter, const char *s, int32_t i) {
UStringTrieResult result;
uint8_t c;
if ((c = s[i]) == 0) {
result = iter.next(u'*');
} else {
for (;;) {
c = uprv_invCharToAscii(c);
// EBCDIC: If s[i] is not an invariant character,
// then c is now 0 and will simply not match anything, which is harmless.
uint8_t next = s[++i];
if (next != 0) {
if (!USTRINGTRIE_HAS_NEXT(iter.next(c))) {
return -1;
}
} else {
// last character of this subtag
result = iter.next(c | 0x80);
break;
}
c = next;
}
}
switch (result) {
case USTRINGTRIE_NO_MATCH: return -1;
case USTRINGTRIE_NO_VALUE: return 0;
case USTRINGTRIE_INTERMEDIATE_VALUE:
U_ASSERT(iter.getValue() == SKIP_SCRIPT);
return SKIP_SCRIPT;
case USTRINGTRIE_FINAL_VALUE: return iter.getValue();
default: return -1;
}
}
int32_t XLikelySubtags::trieNext(BytesTrie &iter, StringPiece s, int32_t i) {
UStringTrieResult result;
uint8_t c;
if (s.length() == i) {
result = iter.next(u'*');
} else {
c = s.data()[i];
for (;;) {
c = uprv_invCharToAscii(c);
// EBCDIC: If s[i] is not an invariant character,
// then c is now 0 and will simply not match anything, which is harmless.
if (i+1 != s.length()) {
if (!USTRINGTRIE_HAS_NEXT(iter.next(c))) {
return -1;
}
c = s.data()[++i];
} else {
// last character of this subtag
result = iter.next(c | 0x80);
break;
}
}
}
switch (result) {
case USTRINGTRIE_NO_MATCH: return -1;
case USTRINGTRIE_NO_VALUE: return 0;
case USTRINGTRIE_INTERMEDIATE_VALUE:
U_ASSERT(iter.getValue() == SKIP_SCRIPT);
return SKIP_SCRIPT;
case USTRINGTRIE_FINAL_VALUE: return iter.getValue();
default: return -1;
}
}
LSR XLikelySubtags::minimizeSubtags(StringPiece language, StringPiece script,
StringPiece region,
bool favorScript,
UErrorCode &errorCode) const {
LSR max = maximize(language, script, region, true, errorCode);
if (U_FAILURE(errorCode)) {
return max;
}
// If no match, return it.
if (uprv_strlen(max.language) == 0 &&
uprv_strlen(max.script) == 0 &&
uprv_strlen(max.region) == 0) {
// No match. ICU API mandate us to
// "If this Locale is already in the minimal form, or not valid, or
// there is no data available for minimization, the Locale will be
// unchanged."
return LSR(language, script, region, LSR::EXPLICIT_LSR, errorCode);
}
// try language
LSR test = maximize(max.language, "", "", true, errorCode);
if (U_FAILURE(errorCode)) {
return max;
}
if (test.isEquivalentTo(max)) {
return LSR(max.language, "", "", LSR::DONT_CARE_FLAGS, errorCode);
}
if (!favorScript) {
// favor Region
// try language and region
test = maximize(max.language, "", max.region, true, errorCode);
if (U_FAILURE(errorCode)) {
return max;
}
if (test.isEquivalentTo(max)) {
return LSR(max.language, "", max.region, LSR::DONT_CARE_FLAGS, errorCode);
}
}
// try language and script
test = maximize(max.language, max.script, "", true, errorCode);
if (U_FAILURE(errorCode)) {
return max;
}
if (test.isEquivalentTo(max)) {
return LSR(max.language, max.script, "", LSR::DONT_CARE_FLAGS, errorCode);
}
if (favorScript) {
// try language and region
test = maximize(max.language, "", max.region, true, errorCode);
if (U_FAILURE(errorCode)) {
return max;
}
if (test.isEquivalentTo(max)) {
return LSR(max.language, "", max.region, LSR::DONT_CARE_FLAGS, errorCode);
}
}
return LSR(max.language, max.script, max.region, LSR::DONT_CARE_FLAGS, errorCode);
}
U_NAMESPACE_END