blob: 621b3e5d4573c6092cfb8f88c54adf37165f8198 [file] [log] [blame]
diff --git a/source/common/characterproperties.cpp b/source/common/characterproperties.cpp
index 3aff85b3..b416ef52 100644
--- a/source/common/characterproperties.cpp
+++ b/source/common/characterproperties.cpp
@@ -23,6 +23,9 @@
#include "umutex.h"
#include "uprops.h"
+using icu::LocalPointer;
+using icu::Normalizer2Factory;
+using icu::Normalizer2Impl;
using icu::UInitOnce;
using icu::UnicodeSet;
@@ -30,11 +33,13 @@ namespace {
UBool U_CALLCONV characterproperties_cleanup();
+constexpr int32_t NUM_INCLUSIONS = UPROPS_SRC_COUNT + UCHAR_INT_LIMIT - UCHAR_INT_START;
+
struct Inclusion {
UnicodeSet *fSet;
UInitOnce fInitOnce;
};
-Inclusion gInclusions[UPROPS_SRC_COUNT]; // cached getInclusions()
+Inclusion gInclusions[NUM_INCLUSIONS]; // cached getInclusions()
UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {};
@@ -80,35 +85,22 @@ UBool U_CALLCONV characterproperties_cleanup() {
return TRUE;
}
-} // namespace
-
-U_NAMESPACE_BEGIN
-
-/*
-Reduce excessive reallocation, and make it easier to detect initialization problems.
-Usually you don't see smaller sets than this for Unicode 5.0.
-*/
-constexpr int32_t DEFAULT_INCLUSION_CAPACITY = 3072;
-
-void U_CALLCONV CharacterProperties::initInclusion(UPropertySource src, UErrorCode &errorCode) {
+void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
// This function is invoked only via umtx_initOnce().
- // This function is a friend of class UnicodeSet.
-
U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT);
if (src == UPROPS_SRC_NONE) {
errorCode = U_INTERNAL_PROGRAM_ERROR;
return;
}
- UnicodeSet * &incl = gInclusions[src].fSet;
- U_ASSERT(incl == nullptr);
+ U_ASSERT(gInclusions[src].fSet == nullptr);
- incl = new UnicodeSet();
- if (incl == nullptr) {
+ LocalPointer<UnicodeSet> incl(new UnicodeSet());
+ if (incl.isNull()) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return;
}
USetAdder sa = {
- (USet *)incl,
+ (USet *)incl.getAlias(),
_set_add,
_set_addRange,
_set_addString,
@@ -116,7 +108,6 @@ void U_CALLCONV CharacterProperties::initInclusion(UPropertySource src, UErrorCo
nullptr // don't need removeRange()
};
- incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, errorCode);
switch(src) {
case UPROPS_SRC_CHAR:
uchar_addPropertyStarts(&sa, &errorCode);
@@ -183,12 +174,15 @@ void U_CALLCONV CharacterProperties::initInclusion(UPropertySource src, UErrorCo
}
if (U_FAILURE(errorCode)) {
- delete incl;
- incl = nullptr;
return;
}
- // Compact for caching
+ if (incl->isBogus()) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
+ // Compact for caching.
incl->compact();
+ gInclusions[src].fSet = incl.orphan();
ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
}
@@ -199,15 +193,66 @@ const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorC
return nullptr;
}
Inclusion &i = gInclusions[src];
- umtx_initOnce(i.fInitOnce, &CharacterProperties::initInclusion, src, errorCode);
+ umtx_initOnce(i.fInitOnce, &initInclusion, src, errorCode);
return i.fSet;
}
+void U_CALLCONV initIntPropInclusion(UProperty prop, UErrorCode &errorCode) {
+ // Invoked only via umtx_initOnce(); fills the gInclusions[] slot for one int-valued property (slots after the per-source ones).
+ U_ASSERT(UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT);
+ int32_t inclIndex = UPROPS_SRC_COUNT + prop - UCHAR_INT_START;
+ U_ASSERT(gInclusions[inclIndex].fSet == nullptr);
+ UPropertySource src = uprops_getSource(prop);
+ const UnicodeSet *incl = getInclusionsForSource(src, errorCode); // only code points in this set are examined below
+ if (U_FAILURE(errorCode)) {
+ return;
+ }
+
+ LocalPointer<UnicodeSet> intPropIncl(new UnicodeSet(0, 0)); // result set, constructed from the range 0..0
+ if (intPropIncl.isNull()) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
+ int32_t numRanges = incl->getRangeCount();
+ int32_t prevValue = 0; // last property value seen; starts at 0
+ for (int32_t i = 0; i < numRanges; ++i) {
+ UChar32 rangeEnd = incl->getRangeEnd(i);
+ for (UChar32 c = incl->getRangeStart(i); c <= rangeEnd; ++c) {
+ // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
+ int32_t value = u_getIntPropertyValue(c, prop);
+ if (value != prevValue) { // the value differs from the previous one: record c
+ intPropIncl->add(c);
+ prevValue = value;
+ }
+ }
+ }
+
+ if (intPropIncl->isBogus()) { // the set became invalid while it was being built
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
+ // Compact before caching; on the early returns above the LocalPointer still owns the set.
+ intPropIncl->compact();
+ gInclusions[inclIndex].fSet = intPropIncl.orphan(); // hand the set over to the cache slot
+ ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
+}
+
+} // namespace
+
+U_NAMESPACE_BEGIN
+
const UnicodeSet *CharacterProperties::getInclusionsForProperty(
UProperty prop, UErrorCode &errorCode) {
if (U_FAILURE(errorCode)) { return nullptr; }
- UPropertySource src = uprops_getSource(prop);
- return getInclusionsForSource(src, errorCode);
+ if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
+ int32_t inclIndex = UPROPS_SRC_COUNT + prop - UCHAR_INT_START;
+ Inclusion &i = gInclusions[inclIndex];
+ umtx_initOnce(i.fInitOnce, &initIntPropInclusion, prop, errorCode);
+ return i.fSet;
+ } else {
+ UPropertySource src = uprops_getSource(prop);
+ return getInclusionsForSource(src, errorCode);
+ }
}
U_NAMESPACE_END
@@ -216,7 +261,7 @@ namespace {
UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) {
if (U_FAILURE(errorCode)) { return nullptr; }
- icu::LocalPointer<UnicodeSet> set(new UnicodeSet());
+ LocalPointer<UnicodeSet> set(new UnicodeSet());
if (set.isNull()) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return nullptr;
diff --git a/source/common/ucptrie.cpp b/source/common/ucptrie.cpp
index 13496ad5..b72e3183 100644
--- a/source/common/ucptrie.cpp
+++ b/source/common/ucptrie.cpp
@@ -280,7 +280,7 @@ UChar32 getRange(const void *t, UChar32 start,
int32_t prevI3Block = -1;
int32_t prevBlock = -1;
UChar32 c = start;
- uint32_t value;
+ uint32_t trieValue, value;
bool haveValue = false;
do {
int32_t i3Block;
@@ -319,6 +319,7 @@ UChar32 getRange(const void *t, UChar32 start,
return c - 1;
}
} else {
+ trieValue = trie->nullValue;
value = nullValue;
if (pValue != nullptr) { *pValue = nullValue; }
haveValue = true;
@@ -357,6 +358,7 @@ UChar32 getRange(const void *t, UChar32 start,
return c - 1;
}
} else {
+ trieValue = trie->nullValue;
value = nullValue;
if (pValue != nullptr) { *pValue = nullValue; }
haveValue = true;
@@ -364,23 +366,32 @@ UChar32 getRange(const void *t, UChar32 start,
c = (c + dataBlockLength) & ~dataMask;
} else {
int32_t di = block + (c & dataMask);
- uint32_t value2 = getValue(trie->data, valueWidth, di);
- value2 = maybeFilterValue(value2, trie->nullValue, nullValue,
- filter, context);
+ uint32_t trieValue2 = getValue(trie->data, valueWidth, di);
if (haveValue) {
- if (value2 != value) {
- return c - 1;
+ if (trieValue2 != trieValue) {
+ if (filter == nullptr ||
+ maybeFilterValue(trieValue2, trie->nullValue, nullValue,
+ filter, context) != value) {
+ return c - 1;
+ }
+ trieValue = trieValue2; // may or may not help
}
} else {
- value = value2;
+ trieValue = trieValue2;
+ value = maybeFilterValue(trieValue2, trie->nullValue, nullValue,
+ filter, context);
if (pValue != nullptr) { *pValue = value; }
haveValue = true;
}
while ((++c & dataMask) != 0) {
- if (maybeFilterValue(getValue(trie->data, valueWidth, ++di),
- trie->nullValue, nullValue,
- filter, context) != value) {
- return c - 1;
+ trieValue2 = getValue(trie->data, valueWidth, ++di);
+ if (trieValue2 != trieValue) {
+ if (filter == nullptr ||
+ maybeFilterValue(trieValue2, trie->nullValue, nullValue,
+ filter, context) != value) {
+ return c - 1;
+ }
+ trieValue = trieValue2; // may or may not help
}
}
}
diff --git a/source/common/umutablecptrie.cpp b/source/common/umutablecptrie.cpp
index 44af8309..926be468 100644
--- a/source/common/umutablecptrie.cpp
+++ b/source/common/umutablecptrie.cpp
@@ -304,41 +304,56 @@ UChar32 MutableCodePointTrie::getRange(
uint32_t nullValue = initialValue;
if (filter != nullptr) { nullValue = filter(context, nullValue); }
UChar32 c = start;
- uint32_t value;
+ uint32_t trieValue, value;
bool haveValue = false;
int32_t i = c >> UCPTRIE_SHIFT_3;
do {
if (flags[i] == ALL_SAME) {
- uint32_t value2 = maybeFilterValue(index[i], initialValue, nullValue,
- filter, context);
+ uint32_t trieValue2 = index[i];
if (haveValue) {
- if (value2 != value) {
- return c - 1;
+ if (trieValue2 != trieValue) {
+ if (filter == nullptr ||
+ maybeFilterValue(trieValue2, initialValue, nullValue,
+ filter, context) != value) {
+ return c - 1;
+ }
+ trieValue = trieValue2; // may or may not help
}
} else {
- value = value2;
+ trieValue = trieValue2;
+ value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context);
if (pValue != nullptr) { *pValue = value; }
haveValue = true;
}
c = (c + UCPTRIE_SMALL_DATA_BLOCK_LENGTH) & ~UCPTRIE_SMALL_DATA_MASK;
} else /* MIXED */ {
int32_t di = index[i] + (c & UCPTRIE_SMALL_DATA_MASK);
- uint32_t value2 = maybeFilterValue(data[di], initialValue, nullValue,
- filter, context);
+ uint32_t trieValue2 = data[di];
if (haveValue) {
- if (value2 != value) {
- return c - 1;
+ if (trieValue2 != trieValue) {
+ if (filter == nullptr ||
+ maybeFilterValue(trieValue2, initialValue, nullValue,
+ filter, context) != value) {
+ return c - 1;
+ }
+ trieValue = trieValue2; // may or may not help
}
} else {
- value = value2;
+ trieValue = trieValue2;
+ value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context);
if (pValue != nullptr) { *pValue = value; }
haveValue = true;
}
while ((++c & UCPTRIE_SMALL_DATA_MASK) != 0) {
- if (maybeFilterValue(data[++di], initialValue, nullValue,
- filter, context) != value) {
- return c - 1;
+ trieValue2 = data[++di];
+ if (trieValue2 != trieValue) {
+ if (filter == nullptr ||
+ maybeFilterValue(trieValue2, initialValue, nullValue,
+ filter, context) != value) {
+ return c - 1;
+ }
+ trieValue = trieValue2; // same filtered value: remember the raw value so equal neighbors skip the filter
}
}
}
++i;
diff --git a/source/common/unicode/uniset.h b/source/common/unicode/uniset.h
index 0abc7542..af56b872 100644
--- a/source/common/unicode/uniset.h
+++ b/source/common/unicode/uniset.h
@@ -27,7 +27,6 @@ U_NAMESPACE_BEGIN
// Forward Declarations.
class BMPSet;
-class CharacterProperties;
class ParsePosition;
class RBBIRuleScanner;
class SymbolTable;
@@ -276,14 +275,23 @@ class RuleCharacterIterator;
* @stable ICU 2.0
*/
class U_COMMON_API UnicodeSet U_FINAL : public UnicodeFilter {
+private:
+ /**
+ * Enough for sets with few ranges.
+ * For example, White_Space has 10 ranges, list length 21.
+ */
+ static constexpr int32_t INITIAL_CAPACITY = 25;
+ // fFlags constant
+ static constexpr uint8_t kIsBogus = 1; // This set is bogus (i.e. not valid)
+
+ UChar32* list = stackList; // MUST be terminated with HIGH
+ int32_t capacity = INITIAL_CAPACITY; // capacity of list
+ int32_t len = 1; // length of list used; 1 <= len <= capacity
+ uint8_t fFlags = 0; // Bit flag (see constants above)
- int32_t len; // length of list used; 0 <= len <= capacity
- int32_t capacity; // capacity of list
- UChar32* list; // MUST be terminated with HIGH
- BMPSet *bmpSet; // The set is frozen iff either bmpSet or stringSpan is not NULL.
- UChar32* buffer; // internal buffer, may be NULL
- int32_t bufferCapacity; // capacity of buffer
- int32_t patLen;
+ BMPSet *bmpSet = nullptr; // The set is frozen iff either bmpSet or stringSpan is not NULL.
+ UChar32* buffer = nullptr; // internal buffer, may be NULL
+ int32_t bufferCapacity = 0; // capacity of buffer
/**
* The pattern representation of this set. This may not be the
@@ -294,15 +302,19 @@ class U_COMMON_API UnicodeSet U_FINAL : public UnicodeFilter {
* indicating that toPattern() must generate a pattern
* representation from the inversion list.
*/
- char16_t *pat;
- UVector* strings; // maintained in sorted order
- UnicodeSetStringSpan *stringSpan;
+ char16_t *pat = nullptr;
+ int32_t patLen = 0;
+
+ UVector* strings = nullptr; // maintained in sorted order
+ UnicodeSetStringSpan *stringSpan = nullptr;
+
+ /**
+ * Initial list array.
+ * Avoids some heap allocations, and list is never nullptr.
+ * Increases the object size a bit.
+ */
+ UChar32 stackList[INITIAL_CAPACITY];
-private:
- enum { // constants
- kIsBogus = 1 // This set is bogus (i.e. not valid)
- };
- uint8_t fFlags; // Bit flag (see constants above)
public:
/**
* Determine if this object contains a valid set.
@@ -1480,8 +1492,6 @@ private:
friend class USetAccess;
- int32_t getStringCount() const;
-
const UnicodeString* getString(int32_t index) const;
//----------------------------------------------------------------
@@ -1528,13 +1538,18 @@ private:
// Implementation: Utility methods
//----------------------------------------------------------------
- void ensureCapacity(int32_t newLen, UErrorCode& ec);
+ static int32_t nextCapacity(int32_t minCapacity);
+
+ bool ensureCapacity(int32_t newLen);
- void ensureBufferCapacity(int32_t newLen, UErrorCode& ec);
+ bool ensureBufferCapacity(int32_t newLen);
void swapBuffers(void);
UBool allocateStrings(UErrorCode &status);
+ UBool hasStrings() const;
+ int32_t stringsSize() const;
+ UBool stringsContains(const UnicodeString &s) const;
UnicodeString& _toPattern(UnicodeString& result,
UBool escapeUnprintable) const;
@@ -1614,7 +1629,6 @@ private:
UnicodeString& rebuiltPat,
UErrorCode& ec);
- friend class CharacterProperties;
static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status);
/**
@@ -1646,7 +1660,10 @@ private:
/**
* Set the new pattern to cache.
*/
- void setPattern(const UnicodeString& newPat);
+ void setPattern(const UnicodeString& newPat) {
+ setPattern(newPat.getBuffer(), newPat.length());
+ }
+ void setPattern(const char16_t *newPat, int32_t newPatLen);
/**
* Release existing cached pattern.
*/
diff --git a/source/common/uniset.cpp b/source/common/uniset.cpp
index e8378e0a..20242776 100644
--- a/source/common/uniset.cpp
+++ b/source/common/uniset.cpp
@@ -14,6 +14,7 @@
#include "unicode/parsepos.h"
#include "unicode/symtable.h"
#include "unicode/uniset.h"
+#include "unicode/ustring.h"
#include "unicode/utf8.h"
#include "unicode/utf16.h"
#include "ruleiter.h"
@@ -53,11 +54,8 @@
// LOW <= all valid values. ZERO for codepoints
#define UNICODESET_LOW 0x000000
-// initial storage. Must be >= 0
-#define START_EXTRA 16
-
-// extra amount for growth. Must be >= 0
-#define GROW_EXTRA START_EXTRA
+/** Max list [0, 1, 2, ..., max code point, HIGH] */
+constexpr int32_t MAX_LENGTH = UNICODESET_HIGH + 1;
U_NAMESPACE_BEGIN
@@ -137,6 +135,18 @@ static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
return a.compare(b);
}
+UBool UnicodeSet::hasStrings() const {
+ return strings != nullptr && !strings->isEmpty();
+}
+
+int32_t UnicodeSet::stringsSize() const {
+ return strings == nullptr ? 0 : strings->size();
+}
+
+UBool UnicodeSet::stringsContains(const UnicodeString &s) const {
+ return strings != nullptr && strings->contains((void*) &s);
+}
+
//----------------------------------------------------------------
// Constructors &c
//----------------------------------------------------------------
@@ -144,24 +154,8 @@ static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
/**
* Constructs an empty set.
*/
-UnicodeSet::UnicodeSet() :
- len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
- bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
- fFlags(0)
-{
- UErrorCode status = U_ZERO_ERROR;
- allocateStrings(status);
- if (U_FAILURE(status)) {
- setToBogus(); // If memory allocation failed, set to bogus state.
- return;
- }
- list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
- if(list!=NULL){
- list[0] = UNICODESET_HIGH;
- } else { // If memory allocation failed, set to bogus state.
- setToBogus();
- return;
- }
+UnicodeSet::UnicodeSet() {
+ list[0] = UNICODESET_HIGH;
_dbgct(this);
}
@@ -172,89 +166,39 @@ UnicodeSet::UnicodeSet() :
* @param start first character, inclusive, of range
* @param end last character, inclusive, of range
*/
-UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) :
- len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
- bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
- fFlags(0)
-{
- UErrorCode status = U_ZERO_ERROR;
- allocateStrings(status);
- if (U_FAILURE(status)) {
- setToBogus(); // If memory allocation failed, set to bogus state.
- return;
- }
- list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
- if(list!=NULL){
- list[0] = UNICODESET_HIGH;
- complement(start, end);
- } else { // If memory allocation failed, set to bogus state.
- setToBogus();
- return;
- }
+UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) {
+ list[0] = UNICODESET_HIGH;
+ add(start, end);
_dbgct(this);
}
/**
* Constructs a set that is identical to the given UnicodeSet.
*/
-UnicodeSet::UnicodeSet(const UnicodeSet& o) :
- UnicodeFilter(o),
- len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0),
- bmpSet(0),
- buffer(0), bufferCapacity(0),
- patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
- fFlags(0)
-{
- UErrorCode status = U_ZERO_ERROR;
- allocateStrings(status);
- if (U_FAILURE(status)) {
- setToBogus(); // If memory allocation failed, set to bogus state.
- return;
- }
- list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
- if(list!=NULL){
- *this = o;
- } else { // If memory allocation failed, set to bogus state.
- setToBogus();
- return;
- }
+UnicodeSet::UnicodeSet(const UnicodeSet& o) : UnicodeFilter(o) {
+ *this = o;
_dbgct(this);
}
// Copy-construct as thawed.
-UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) :
- UnicodeFilter(o),
- len(0), capacity(o.len + GROW_EXTRA), list(0),
- bmpSet(0),
- buffer(0), bufferCapacity(0),
- patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
- fFlags(0)
-{
- UErrorCode status = U_ZERO_ERROR;
- allocateStrings(status);
- if (U_FAILURE(status)) {
- setToBogus(); // If memory allocation failed, set to bogus state.
- return;
- }
- list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
- if(list!=NULL){
+UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : UnicodeFilter(o) {
+ if (ensureCapacity(o.len)) {
// *this = o except for bmpSet and stringSpan
len = o.len;
uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
- if (strings != NULL && o.strings != NULL) {
- strings->assign(*o.strings, cloneUnicodeString, status);
- } else { // Invalid strings.
- setToBogus();
- return;
+ if (o.hasStrings()) {
+ UErrorCode status = U_ZERO_ERROR;
+ if (!allocateStrings(status) ||
+ (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) {
+ setToBogus();
+ return;
+ }
}
if (o.pat) {
- setPattern(UnicodeString(o.pat, o.patLen));
+ setPattern(o.pat, o.patLen);
}
- } else { // If memory allocation failed, set to bogus state.
- setToBogus();
- return;
+ _dbgct(this);
}
- _dbgct(this);
}
/**
@@ -262,9 +206,11 @@ UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) :
*/
UnicodeSet::~UnicodeSet() {
_dbgdt(this); // first!
- uprv_free(list);
+ if (list != stackList) {
+ uprv_free(list);
+ }
delete bmpSet;
- if (buffer) {
+ if (buffer != stackList) {
uprv_free(buffer);
}
delete strings;
@@ -290,32 +236,30 @@ UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) {
setToBogus();
return *this;
}
- UErrorCode ec = U_ZERO_ERROR;
- ensureCapacity(o.len, ec);
- if (U_FAILURE(ec)) {
+ if (!ensureCapacity(o.len)) {
// ensureCapacity will mark the UnicodeSet as Bogus if OOM failure happens.
return *this;
}
len = o.len;
uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
- if (o.bmpSet == NULL || asThawed) {
- bmpSet = NULL;
- } else {
+ if (o.bmpSet != nullptr && !asThawed) {
bmpSet = new BMPSet(*o.bmpSet, list, len);
if (bmpSet == NULL) { // Check for memory allocation error.
setToBogus();
return *this;
}
}
- if (strings != NULL && o.strings != NULL) {
- strings->assign(*o.strings, cloneUnicodeString, ec);
- } else { // Invalid strings.
- setToBogus();
- return *this;
+ if (o.hasStrings()) {
+ UErrorCode status = U_ZERO_ERROR;
+ if ((strings == nullptr && !allocateStrings(status)) ||
+ (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) {
+ setToBogus();
+ return *this;
+ }
+ } else if (hasStrings()) {
+ strings->removeAllElements();
}
- if (o.stringSpan == NULL || asThawed) {
- stringSpan = NULL;
- } else {
+ if (o.stringSpan != nullptr && !asThawed) {
stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
if (stringSpan == NULL) { // Check for memory allocation error.
setToBogus();
@@ -324,7 +268,7 @@ UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) {
}
releasePattern();
if (o.pat) {
- setPattern(UnicodeString(o.pat, o.patLen));
+ setPattern(o.pat, o.patLen);
}
return *this;
}
@@ -357,7 +301,8 @@ UBool UnicodeSet::operator==(const UnicodeSet& o) const {
for (int32_t i = 0; i < len; ++i) {
if (list[i] != o.list[i]) return FALSE;
}
- if (*strings != *o.strings) return FALSE;
+ if (hasStrings() != o.hasStrings()) { return FALSE; }
+ if (hasStrings() && *strings != *o.strings) return FALSE;
return TRUE;
}
@@ -393,7 +338,7 @@ int32_t UnicodeSet::size(void) const {
for (int32_t i = 0; i < count; ++i) {
n += getRangeEnd(i) - getRangeStart(i) + 1;
}
- return n + strings->size();
+ return n + stringsSize();
}
/**
@@ -402,7 +347,7 @@ int32_t UnicodeSet::size(void) const {
* @return <tt>true</tt> if this set contains no elements.
*/
UBool UnicodeSet::isEmpty(void) const {
- return len == 1 && strings->size() == 0;
+ return len == 1 && !hasStrings();
}
/**
@@ -502,7 +447,7 @@ UBool UnicodeSet::contains(const UnicodeString& s) const {
if (s.length() == 0) return FALSE;
int32_t cp = getSingleCP(s);
if (cp < 0) {
- return strings->contains((void*) &s);
+ return stringsContains(s);
} else {
return contains((UChar32) cp);
}
@@ -524,8 +469,7 @@ UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
return FALSE;
}
}
- if (!strings->containsAll(*c.strings)) return FALSE;
- return TRUE;
+ return !c.hasStrings() || (strings != nullptr && strings->containsAll(*c.strings));
}
/**
@@ -571,8 +515,7 @@ UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
return FALSE;
}
}
- if (!strings->containsNone(*c.strings)) return FALSE;
- return TRUE;
+ return strings == nullptr || !c.hasStrings() || strings->containsNone(*c.strings);
}
/**
@@ -613,7 +556,7 @@ UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
return TRUE;
}
}
- if (strings->size() != 0) {
+ if (hasStrings()) {
for (i=0; i<strings->size(); ++i) {
const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
//if (s.length() == 0) {
@@ -648,7 +591,7 @@ UMatchDegree UnicodeSet::matches(const Replaceable& text,
return U_MISMATCH;
}
} else {
- if (strings->size() != 0) { // try strings first
+ if (hasStrings()) { // try strings first
// might separate forward and backward loops later
// for now they are combined
@@ -849,7 +792,39 @@ UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
*/
UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
if (pinCodePoint(start) < pinCodePoint(end)) {
- UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
+ UChar32 limit = end + 1;
+ // Fast path for adding a new range after the last one.
+ // Odd list length: [..., lastStart, lastLimit, HIGH]
+ if ((len & 1) != 0) {
+ // If the list is empty, set lastLimit low enough to not be adjacent to 0.
+ UChar32 lastLimit = len == 1 ? -2 : list[len - 2];
+ if (lastLimit <= start && !isFrozen() && !isBogus()) {
+ if (lastLimit == start) {
+ // Extend the last range.
+ list[len - 2] = limit;
+ if (limit == UNICODESET_HIGH) {
+ --len;
+ }
+ } else {
+ list[len - 1] = start;
+ if (limit < UNICODESET_HIGH) {
+ if (ensureCapacity(len + 2)) {
+ list[len++] = limit;
+ list[len++] = UNICODESET_HIGH;
+ }
+ } else { // limit == UNICODESET_HIGH
+ if (ensureCapacity(len + 1)) {
+ list[len++] = UNICODESET_HIGH;
+ }
+ }
+ }
+ releasePattern();
+ return *this;
+ }
+ }
+ // This is slow. Could be much faster using findCodePoint(start)
+ // and modifying the list, dealing with adjacent & overlapping ranges.
+ UChar32 range[3] = { start, limit, UNICODESET_HIGH };
add(range, 2, 0);
} else if (start == end) {
add(start);
@@ -918,9 +893,7 @@ UnicodeSet& UnicodeSet::add(UChar32 c) {
list[i] = c;
// if we touched the HIGH mark, then add a new one
if (c == (UNICODESET_HIGH - 1)) {
- UErrorCode status = U_ZERO_ERROR;
- ensureCapacity(len+1, status);
- if (U_FAILURE(status)) {
+ if (!ensureCapacity(len+1)) {
// ensureCapacity will mark the object as Bogus if OOM failure happens.
return *this;
}
@@ -964,21 +937,13 @@ UnicodeSet& UnicodeSet::add(UChar32 c) {
// ^
// list[i]
- UErrorCode status = U_ZERO_ERROR;
- ensureCapacity(len+2, status);
- if (U_FAILURE(status)) {
+ if (!ensureCapacity(len+2)) {
// ensureCapacity will mark the object as Bogus if OOM failure happens.
return *this;
}
- //for (int32_t k=len-1; k>=i; --k) {
- // list[k+2] = list[k];
- //}
- UChar32* src = list + len;
- UChar32* dst = src + 2;
- UChar32* srclimit = list + i;
- while (src > srclimit) *(--dst) = *(--src);
-
+ UChar32 *p = list + i;
+ uprv_memmove(p + 2, p, (len - i) * sizeof(*p));
list[i] = c;
list[i+1] = c+1;
len += 2;
@@ -1014,7 +979,7 @@ UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
if (s.length() == 0 || isFrozen() || isBogus()) return *this;
int32_t cp = getSingleCP(s);
if (cp < 0) {
- if (!strings->contains((void*) &s)) {
+ if (!stringsContains(s)) {
_add(s);
releasePattern();
}
@@ -1033,12 +998,16 @@ void UnicodeSet::_add(const UnicodeString& s) {
if (isFrozen() || isBogus()) {
return;
}
+ UErrorCode ec = U_ZERO_ERROR;
+ if (strings == nullptr && !allocateStrings(ec)) {
+ setToBogus();
+ return;
+ }
UnicodeString* t = new UnicodeString(s);
if (t == NULL) { // Check for memory allocation error.
setToBogus();
return;
}
- UErrorCode ec = U_ZERO_ERROR;
strings->sortedInsert(t, compareUnicodeString, ec);
if (U_FAILURE(ec)) {
setToBogus();
@@ -1121,7 +1090,10 @@ UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
}
UnicodeSet& UnicodeSet::removeAllStrings() {
- strings->removeAllElements();
+ if (!isFrozen() && hasStrings()) {
+ strings->removeAllElements();
+ releasePattern();
+ }
return *this;
}
@@ -1217,8 +1189,9 @@ UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
if (s.length() == 0 || isFrozen() || isBogus()) return *this;
int32_t cp = getSingleCP(s);
if (cp < 0) {
- strings->removeElement((void*) &s);
- releasePattern();
+ if (strings != nullptr && strings->removeElement((void*) &s)) {
+ releasePattern();
+ }
} else {
remove((UChar32)cp, (UChar32)cp);
}
@@ -1260,24 +1233,17 @@ UnicodeSet& UnicodeSet::complement(void) {
if (isFrozen() || isBogus()) {
return *this;
}
- UErrorCode status = U_ZERO_ERROR;
if (list[0] == UNICODESET_LOW) {
- ensureBufferCapacity(len-1, status);
- if (U_FAILURE(status)) {
- return *this;
- }
- uprv_memcpy(buffer, list + 1, (size_t)(len-1)*sizeof(UChar32));
+ uprv_memmove(list, list + 1, (size_t)(len-1)*sizeof(UChar32));
--len;
} else {
- ensureBufferCapacity(len+1, status);
- if (U_FAILURE(status)) {
+ if (!ensureCapacity(len+1)) {
return *this;
}
- uprv_memcpy(buffer + 1, list, (size_t)len*sizeof(UChar32));
- buffer[0] = UNICODESET_LOW;
+ uprv_memmove(list + 1, list, (size_t)len*sizeof(UChar32));
+ list[0] = UNICODESET_LOW;
++len;
}
- swapBuffers();
releasePattern();
return *this;
}
@@ -1294,7 +1260,7 @@ UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
if (s.length() == 0 || isFrozen() || isBogus()) return *this;
int32_t cp = getSingleCP(s);
if (cp < 0) {
- if (strings->contains((void*) &s)) {
+ if (stringsContains(s)) {
strings->removeElement((void*) &s);
} else {
_add(s);
@@ -1325,7 +1291,7 @@ UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
if ( c.strings!=NULL ) {
for (int32_t i=0; i<c.strings->size(); ++i) {
const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
- if (!strings->contains((void*) s)) {
+ if (!stringsContains(*s)) {
_add(*s);
}
}
@@ -1347,7 +1313,13 @@ UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
return *this;
}
retain(c.list, c.len, 0);
- strings->retainAll(*c.strings);
+ if (hasStrings()) {
+ if (!c.hasStrings()) {
+ strings->removeAllElements();
+ } else {
+ strings->retainAll(*c.strings);
+ }
+ }
return *this;
}
@@ -1365,7 +1337,9 @@ UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
return *this;
}
retain(c.list, c.len, 2);
- strings->removeAll(*c.strings);
+ if (hasStrings() && c.hasStrings()) {
+ strings->removeAll(*c.strings);
+ }
return *this;
}
@@ -1383,10 +1357,12 @@ UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
}
exclusiveOr(c.list, c.len, 0);
- for (int32_t i=0; i<c.strings->size(); ++i) {
- void* e = c.strings->elementAt(i);
- if (!strings->removeElement(e)) {
- _add(*(const UnicodeString*)e);
+ if (c.strings != nullptr) {
+ for (int32_t i=0; i<c.strings->size(); ++i) {
+ void* e = c.strings->elementAt(i);
+ if (strings == nullptr || !strings->removeElement(e)) {
+ _add(*(const UnicodeString*)e);
+ }
}
}
return *this;
@@ -1400,18 +1376,14 @@ UnicodeSet& UnicodeSet::clear(void) {
if (isFrozen()) {
return *this;
}
- if (list != NULL) {
- list[0] = UNICODESET_HIGH;
- }
+ list[0] = UNICODESET_HIGH;
len = 1;
releasePattern();
if (strings != NULL) {
strings->removeAllElements();
}
- if (list != NULL && strings != NULL) {
- // Remove bogus
- fFlags = 0;
- }
+ // Remove bogus
+ fFlags = 0;
return *this;
}
@@ -1445,10 +1417,6 @@ UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
return list[index*2 + 1] - 1;
}
-int32_t UnicodeSet::getStringCount() const {
- return strings->size();
-}
-
const UnicodeString* UnicodeSet::getString(int32_t index) const {
return (const UnicodeString*) strings->elementAt(index);
}
@@ -1462,22 +1430,38 @@ UnicodeSet& UnicodeSet::compact() {
return *this;
}
// Delete buffer first to defragment memory less.
- if (buffer != NULL) {
+ if (buffer != stackList) {
uprv_free(buffer);
buffer = NULL;
- }
- if (len < capacity) {
- // Make the capacity equal to len or 1.
- // We don't want to realloc of 0 size.
- int32_t newCapacity = len + (len == 0);
- UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity);
+ bufferCapacity = 0;
+ }
+ if (list == stackList) {
+ // Already in the inline array: nothing to shrink.
+ } else if (len <= INITIAL_CAPACITY) {
+ uprv_memcpy(stackList, list, len * sizeof(UChar32));
+ uprv_free(list);
+ list = stackList;
+ capacity = INITIAL_CAPACITY;
+ if (buffer == stackList) {
+ // The scratch buffer was parked in stackList, which list just took over.
+ // Drop it so that list and buffer never alias the same storage.
+ buffer = nullptr;
+ bufferCapacity = 0;
+ }
+ } else if ((len + 7) < capacity) {
+ // If we have more than a little unused capacity, shrink it to len.
+ UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * len);
if (temp) {
list = temp;
- capacity = newCapacity;
+ capacity = len;
}
// else what the heck happened?! We allocated less memory!
// Oh well. We'll keep our original array.
}
+ if (strings != nullptr && strings->isEmpty()) {
+ delete strings;
+ strings = nullptr;
+ }
return *this;
}
@@ -1488,10 +1466,8 @@ UnicodeSet& UnicodeSet::compact() {
/**
* Deserialize constructor.
*/
-UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization, UErrorCode &ec)
- : len(1), capacity(1+START_EXTRA), list(0), bmpSet(0), buffer(0),
- bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
- fFlags(0) {
+UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization,
+ UErrorCode &ec) {
if(U_FAILURE(ec)) {
setToBogus();
@@ -1506,24 +1482,15 @@ UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization se
return;
}
- allocateStrings(ec);
- if (U_FAILURE(ec)) {
- setToBogus();
- return;
- }
-
// bmp?
int32_t headerSize = ((data[0]&0x8000)) ?2:1;
int32_t bmpLength = (headerSize==1)?data[0]:data[1];
- len = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength;
+ int32_t newLength = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength;
#ifdef DEBUG_SERIALIZE
- printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,len, data[0],data[1],data[2],data[3]);
+ printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,newLength, data[0],data[1],data[2],data[3]);
#endif
- capacity = len+1;
- list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
- if(!list || U_FAILURE(ec)) {
- setToBogus();
+ if(!ensureCapacity(newLength + 1)) { // +1 for HIGH
return;
}
// copy bmp
@@ -1535,15 +1502,18 @@ UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization se
#endif
}
// copy smp
- for(i=bmpLength;i<len;i++) {
+ for(i=bmpLength;i<newLength;i++) {
list[i] = ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+0] << 16) +
((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+1]);
#ifdef DEBUG_SERIALIZE
printf("<<32@%d+[%d] %lX\n", headerSize+bmpLength+i, i, list[i]);
#endif
}
- // terminator
- list[len++]=UNICODESET_HIGH;
+ U_ASSERT(i == newLength);
+ if (i == 0 || list[i - 1] != UNICODESET_HIGH) {
+ list[i++] = UNICODESET_HIGH;
+ }
+ len = i;
}
@@ -1664,33 +1634,65 @@ UBool UnicodeSet::allocateStrings(UErrorCode &status) {
return TRUE;
}
-void UnicodeSet::ensureCapacity(int32_t newLen, UErrorCode& ec) {
+int32_t UnicodeSet::nextCapacity(int32_t minCapacity) {
+ // Grow exponentially to reduce the frequency of allocations.
+ if (minCapacity < INITIAL_CAPACITY) {
+ return minCapacity + INITIAL_CAPACITY;
+ } else if (minCapacity <= 2500) {
+ return 5 * minCapacity;
+ } else {
+ int32_t newCapacity = 2 * minCapacity;
+ if (newCapacity > MAX_LENGTH) {
+ newCapacity = MAX_LENGTH;
+ }
+ return newCapacity;
+ }
+}
+
+bool UnicodeSet::ensureCapacity(int32_t newLen) {
+ if (newLen > MAX_LENGTH) {
+ newLen = MAX_LENGTH;
+ }
if (newLen <= capacity) {
- return;
+ return true;
}
- UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA));
+ int32_t newCapacity = nextCapacity(newLen);
+ UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32));
if (temp == NULL) {
- ec = U_MEMORY_ALLOCATION_ERROR;
setToBogus(); // set the object to bogus state if an OOM failure occurred.
- return;
+ return false;
+ }
+ // Copy only the actual contents.
+ uprv_memcpy(temp, list, len * sizeof(UChar32));
+ if (list != stackList) {
+ uprv_free(list);
}
list = temp;
- capacity = newLen + GROW_EXTRA;
- // else we keep the original contents on the memory failure.
+ capacity = newCapacity;
+ return true;
}
-void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) {
- if (buffer != NULL && newLen <= bufferCapacity)
- return;
- UChar32* temp = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA));
+bool UnicodeSet::ensureBufferCapacity(int32_t newLen) {
+ if (newLen > MAX_LENGTH) {
+ newLen = MAX_LENGTH;
+ }
+ if (newLen <= bufferCapacity) {
+ return true;
+ }
+ int32_t newCapacity = nextCapacity(newLen);
+ UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32));
if (temp == NULL) {
- ec = U_MEMORY_ALLOCATION_ERROR;
setToBogus();
- return;
+ return false;
+ }
+ // The buffer has no contents to be copied.
+ // It is always filled from scratch after this call.
+ if (buffer != stackList) {
+ uprv_free(buffer);
}
buffer = temp;
- bufferCapacity = newLen + GROW_EXTRA;
- // else we keep the original contents on the memory failure.
+ bufferCapacity = newCapacity;
+ return true;
}
/**
@@ -1727,9 +1729,7 @@ void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t pola
if (isFrozen() || isBogus()) {
return;
}
- UErrorCode status = U_ZERO_ERROR;
- ensureBufferCapacity(len + otherLen, status);
- if (U_FAILURE(status)) {
+ if (!ensureBufferCapacity(len + otherLen)) {
return;
}
@@ -1777,9 +1777,7 @@ void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
if (isFrozen() || isBogus() || other==NULL) {
return;
}
- UErrorCode status = U_ZERO_ERROR;
- ensureBufferCapacity(len + otherLen, status);
- if (U_FAILURE(status)) {
+ if (!ensureBufferCapacity(len + otherLen)) {
return;
}
@@ -1890,9 +1888,7 @@ void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity)
if (isFrozen() || isBogus()) {
return;
}
- UErrorCode status = U_ZERO_ERROR;
- ensureBufferCapacity(len + otherLen, status);
- if (U_FAILURE(status)) {
+ if (!ensureBufferCapacity(len + otherLen)) {
return;
}
@@ -2138,12 +2134,14 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
}
}
- for (int32_t i = 0; i<strings->size(); ++i) {
- result.append(OPEN_BRACE);
- _appendToPat(result,
- *(const UnicodeString*) strings->elementAt(i),
- escapeUnprintable);
- result.append(CLOSE_BRACE);
+ if (strings != nullptr) {
+ for (int32_t i = 0; i<strings->size(); ++i) {
+ result.append(OPEN_BRACE);
+ _appendToPat(result,
+ *(const UnicodeString*) strings->elementAt(i),
+ escapeUnprintable);
+ result.append(CLOSE_BRACE);
+ }
}
return result.append(SET_CLOSE);
}
@@ -2162,13 +2160,12 @@ void UnicodeSet::releasePattern() {
/**
* Set the new pattern to cache.
*/
-void UnicodeSet::setPattern(const UnicodeString& newPat) {
+void UnicodeSet::setPattern(const char16_t *newPat, int32_t newPatLen) {
releasePattern();
- int32_t newPatLen = newPat.length();
pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar));
if (pat) {
patLen = newPatLen;
- newPat.extractBetween(0, patLen, pat);
+ u_memcpy(pat, newPat, patLen);
pat[patLen] = 0;
}
// else we don't care if malloc failed. This was just a nice cache.
@@ -2177,30 +2174,15 @@ void UnicodeSet::setPattern(const UnicodeString& newPat) {
UnicodeFunctor *UnicodeSet::freeze() {
if(!isFrozen() && !isBogus()) {
- // Do most of what compact() does before freezing because
- // compact() will not work when the set is frozen.
- // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
-
- // Delete buffer first to defragment memory less.
- if (buffer != NULL) {
- uprv_free(buffer);
- buffer = NULL;
- }
- if (capacity > (len + GROW_EXTRA)) {
- // Make the capacity equal to len or 1.
- // We don't want to realloc of 0 size.
- capacity = len + (len == 0);
- list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity);
- if (list == NULL) { // Check for memory allocation error.
- setToBogus();
- return this;
- }
- }
+ compact();
// Optimize contains() and span() and similar functions.
- if (!strings->isEmpty()) {
+ if (hasStrings()) {
stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);
- if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) {
+ if (stringSpan == nullptr) {
+ setToBogus();
+ return this;
+ } else if (!stringSpan->needsStringSpanUTF16()) {
// All strings are irrelevant for span() etc. because
// all of each string's code points are contained in this set.
// Do not check needsStringSpanUTF8() because UTF-8 has at most as
@@ -2233,7 +2215,7 @@ int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanC
}
if(stringSpan!=NULL) {
return stringSpan->span(s, length, spanCondition);
- } else if(!strings->isEmpty()) {
+ } else if(hasStrings()) {
uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
@@ -2270,7 +2252,7 @@ int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition s
}
if(stringSpan!=NULL) {
return stringSpan->spanBack(s, length, spanCondition);
- } else if(!strings->isEmpty()) {
+ } else if(hasStrings()) {
uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
@@ -2308,7 +2290,7 @@ int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition sp
}
if(stringSpan!=NULL) {
return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
- } else if(!strings->isEmpty()) {
+ } else if(hasStrings()) {
uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
@@ -2346,7 +2328,7 @@ int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanConditio
}
if(stringSpan!=NULL) {
return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
- } else if(!strings->isEmpty()) {
+ } else if(hasStrings()) {
uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
diff --git a/source/common/uniset_closure.cpp b/source/common/uniset_closure.cpp
index 0b7da796..882231ba 100644
--- a/source/common/uniset_closure.cpp
+++ b/source/common/uniset_closure.cpp
@@ -31,10 +31,6 @@
#include "util.h"
#include "uvector.h"
-// initial storage. Must be >= 0
-// *** same as in uniset.cpp ! ***
-#define START_EXTRA 16
-
U_NAMESPACE_BEGIN
// TODO memory debugging provided inside uniset.cpp
@@ -49,42 +45,16 @@ U_NAMESPACE_BEGIN
UnicodeSet::UnicodeSet(const UnicodeString& pattern,
uint32_t options,
const SymbolTable* symbols,
- UErrorCode& status) :
- len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
- bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
- fFlags(0)
-{
- if(U_SUCCESS(status)){
- list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
- /* test for NULL */
- if(list == NULL) {
- status = U_MEMORY_ALLOCATION_ERROR;
- }else{
- allocateStrings(status);
- applyPattern(pattern, options, symbols, status);
- }
- }
+ UErrorCode& status) {
+ applyPattern(pattern, options, symbols, status);
_dbgct(this);
}
UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
uint32_t options,
const SymbolTable* symbols,
- UErrorCode& status) :
- len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
- bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
- fFlags(0)
-{
- if(U_SUCCESS(status)){
- list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
- /* test for NULL */
- if(list == NULL) {
- status = U_MEMORY_ALLOCATION_ERROR;
- }else{
- allocateStrings(status);
- applyPattern(pattern, pos, options, symbols, status);
- }
- }
+ UErrorCode& status) {
+ applyPattern(pattern, pos, options, symbols, status);
_dbgct(this);
}
@@ -199,7 +169,7 @@ UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
// start with input set to guarantee inclusion
// USET_CASE: remove strings because the strings will actually be reduced (folded);
// therefore, start with no strings and add only those needed
- if (attribute & USET_CASE_INSENSITIVE) {
+ if ((attribute & USET_CASE_INSENSITIVE) && foldSet.hasStrings()) {
foldSet.strings->removeAllElements();
}
@@ -234,7 +204,7 @@ UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
}
}
}
- if (strings != NULL && strings->size() > 0) {
+ if (hasStrings()) {
if (attribute & USET_CASE_INSENSITIVE) {
for (int32_t j=0; j<strings->size(); ++j) {
str = *(const UnicodeString *) strings->elementAt(j);
diff --git a/source/common/uniset_props.cpp b/source/common/uniset_props.cpp
index 6cfd80a7..e98c175f 100644
--- a/source/common/uniset_props.cpp
+++ b/source/common/uniset_props.cpp
@@ -47,10 +47,6 @@
U_NAMESPACE_USE
-// initial storage. Must be >= 0
-// *** same as in uniset.cpp ! ***
-#define START_EXTRA 16
-
// Define UChar constants using hex for EBCDIC compatibility
// Used #define to reduce private static exports and memory access time.
#define SET_OPEN ((UChar)0x005B) /*[*/
@@ -185,21 +181,8 @@ isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
* @param pattern a string specifying what characters are in the set
*/
UnicodeSet::UnicodeSet(const UnicodeString& pattern,
- UErrorCode& status) :
- len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
- bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
- fFlags(0)
-{
- if(U_SUCCESS(status)){
- list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
- /* test for NULL */
- if(list == NULL) {
- status = U_MEMORY_ALLOCATION_ERROR;
- }else{
- allocateStrings(status);
- applyPattern(pattern, status);
- }
- }
+ UErrorCode& status) {
+ applyPattern(pattern, status);
_dbgct(this);
}
@@ -713,6 +696,11 @@ static UBool numericValueFilter(UChar32 ch, void* context) {
return u_getNumericValue(ch) == *(double*)context;
}
+static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
+ int32_t value = *(int32_t*)context;
+ return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
+}
+
static UBool versionFilter(UChar32 ch, void* context) {
static const UVersionInfo none = { 0, 0, 0, 0 };
UVersionInfo v;
@@ -721,6 +709,16 @@ static UBool versionFilter(UChar32 ch, void* context) {
return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
}
+typedef struct {
+ UProperty prop;
+ int32_t value;
+} IntPropertyContext;
+
+static UBool intPropertyFilter(UChar32 ch, void* context) {
+ IntPropertyContext* c = (IntPropertyContext*)context;
+ return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
+}
+
static UBool scriptExtensionsFilter(UChar32 ch, void* context) {
return uscript_hasScript(ch, *(UScriptCode*)context);
}
@@ -781,43 +779,6 @@ void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
namespace {
-/** Maps map values to 1 if the mask contains their value'th bit, all others to 0. */
-uint32_t U_CALLCONV generalCategoryMaskFilter(const void *context, uint32_t value) {
- uint32_t mask = *(const uint32_t *)context;
- value = U_MASK(value) & mask;
- if (value != 0) { value = 1; }
- return value;
-}
-
-/** Maps one map value to 1, all others to 0. */
-uint32_t U_CALLCONV intValueFilter(const void *context, uint32_t value) {
- uint32_t v = *(const uint32_t *)context;
- return value == v ? 1 : 0;
-}
-
-} // namespace
-
-void UnicodeSet::applyIntPropertyValue(const UCPMap *map,
- UCPMapValueFilter *filter, const void *context,
- UErrorCode &errorCode) {
- if (U_FAILURE(errorCode)) { return; }
- clear();
- UChar32 start = 0, end;
- uint32_t value;
- while ((end = ucpmap_getRange(map, start, UCPMAP_RANGE_NORMAL, 0,
- filter, context, &value)) >= 0) {
- if (value != 0) {
- add(start, end);
- }
- start = end + 1;
- }
- if (isBogus()) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- }
-}
-
-namespace {
-
static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
/* Note: we use ' ' in compiler code page */
int32_t j = 0;
@@ -845,11 +806,10 @@ static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
UnicodeSet&
UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
- if (U_FAILURE(ec)) { return *this; }
- // All of the following check isFrozen() before modifying this set.
+ if (U_FAILURE(ec) || isFrozen()) { return *this; }
if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
- const UCPMap *map = u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY, &ec);
- applyIntPropertyValue(map, generalCategoryMaskFilter, &value, ec);
+ const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
+ applyFilter(generalCategoryMaskFilter, &value, inclusions, ec);
} else if (prop == UCHAR_SCRIPT_EXTENSIONS) {
const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
UScriptCode script = (UScriptCode)value;
@@ -866,14 +826,11 @@ UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec)
clear();
}
} else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
- const UCPMap *map = u_getIntPropertyMap(prop, &ec);
- applyIntPropertyValue(map, intValueFilter, &value, ec);
+ const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
+ IntPropertyContext c = {prop, value};
+ applyFilter(intPropertyFilter, &c, inclusions, ec);
} else {
- // This code used to always call getInclusions(property source)
- // which sets an error for an unsupported property.
ec = U_ILLEGAL_ARGUMENT_ERROR;
- // Otherwise we would just clear() this set because
- // getIntPropertyValue(c, prop) returns 0 for all code points.
}
return *this;
}
diff --git a/source/common/uprops.h b/source/common/uprops.h
index 1a8e4e84..34b3600b 100644
--- a/source/common/uprops.h
+++ b/source/common/uprops.h
@@ -462,7 +462,6 @@ class UnicodeSet;
class CharacterProperties {
public:
CharacterProperties() = delete;
- static void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode);
static const UnicodeSet *getInclusionsForProperty(UProperty prop, UErrorCode &errorCode);
};
diff --git a/source/common/uset.cpp b/source/common/uset.cpp
index 39ad0a34..eae7981d 100644
--- a/source/common/uset.cpp
+++ b/source/common/uset.cpp
@@ -249,7 +249,7 @@ class USetAccess /* not : public UObject because all methods are static */ {
public:
/* Try to have the compiler inline these*/
inline static int32_t getStringCount(const UnicodeSet& set) {
- return set.getStringCount();
+ return set.stringsSize();
}
inline static const UnicodeString* getString(const UnicodeSet& set,
int32_t i) {
diff --git a/source/common/usetiter.cpp b/source/common/usetiter.cpp
index 93048ba2..79151690 100644
--- a/source/common/usetiter.cpp
+++ b/source/common/usetiter.cpp
@@ -116,7 +116,7 @@ void UnicodeSetIterator::reset() {
stringCount = 0;
} else {
endRange = set->getRangeCount() - 1;
- stringCount = set->strings->size();
+ stringCount = set->stringsSize();
}
range = 0;
endElement = -1;