| // Copyright (C) 2006 Google Inc. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // Author: Jim Meehan |
| |
| #ifndef UNICODETEXT_H_ |
| #define UNICODETEXT_H_ |
| |
| #include <iterator> |
| #include <utility> |
| |
| namespace chrome_lang_id { |
| |
| // ***************************** UnicodeText ************************** |
| // |
| // A UnicodeText object is a wrapper around a sequence of Unicode |
| // codepoint values that allows iteration over these values. |
| // |
| // The internal representation of the text is UTF-8. Since UTF-8 is a |
| // variable-width format, UnicodeText does not provide random access |
| // to the text, and changes to the text are permitted only at the end. |
| // |
// The UnicodeText class defines a const_iterator. The dereferencing
// operator (*) returns a codepoint as a plain int. The iterator is a
// read-only iterator that can only be advanced with pre-increment (++it).
// It becomes invalid if the text is changed.
| // |
| // Codepoints are integers in the range [0, 0xD7FF] or [0xE000, |
| // 0x10FFFF], but UnicodeText has the additional restriction that it |
| // can contain only those characters that are valid for interchange on |
| // the Web. This excludes all of the control codes except for carriage |
| // return, line feed, and horizontal tab. It also excludes |
| // non-characters, but codepoints that are in the Private Use regions |
| // are allowed, as are codepoints that are unassigned. (See the |
| // Unicode reference for details.) |
| // |
| // MEMORY MANAGEMENT: |
| // |
| // PointToUTF8(buffer, size) creates an alias pointing to buffer. |
| // |
| // The purpose of an alias is to avoid making an unnecessary copy of a |
| // UTF-8 buffer while still providing access to the Unicode values |
| // within that text through iterators. The lifetime of an alias must not |
| // exceed the lifetime of the buffer from which it was constructed. |
| // |
| // Aliases should be used with care. If the source from which an alias |
| // was created is freed, or if the contents are changed, while the |
| // alias is still in use, fatal errors could result. But it can be |
| // quite useful to have a UnicodeText "window" through which to see a |
| // UTF-8 buffer without having to pay the price of making a copy. |
| |
// TODO(abakalov): Consider merging this class with the script detection
// code in the directory script_span.
//
// UnicodeText cannot be copied: its only data member, Repr, declares its
// copy operations private.
class UnicodeText {
 public:
  class const_iterator;

  UnicodeText();  // Create an empty text.
  ~UnicodeText();

  // Read-only iterator over the codepoints of a UnicodeText.  It supports
  // dereference, pre-increment and (in)equality comparison; there is no
  // post-increment, decrement or random access.
  class const_iterator {
    typedef const_iterator CI;

   public:
    // Iterators are default-constructible.
    const_iterator();

    // Iterators are copyable, so it's safe to make multiple passes over a
    // UnicodeText.
    const_iterator(const const_iterator &other);
    const_iterator &operator=(const const_iterator &other);

    int operator*() const;  // Dereference: yields the codepoint as an int.

    const_iterator &operator++();  // Advance (++iter)

    // Two iterators compare equal exactly when they hold the same byte
    // pointer.
    friend bool operator==(const CI &lhs, const CI &rhs) {
      return lhs.it_ == rhs.it_;
    }
    friend bool operator!=(const CI &lhs, const CI &rhs) {
      return !(lhs == rhs);
    }

   private:
    // Only UnicodeText may build an iterator from a raw byte pointer.
    friend class UnicodeText;
    explicit const_iterator(const char *it) : it_(it) {}

    const char *it_;  // Current position in the underlying bytes.
  };

  const_iterator begin() const;
  const_iterator end() const;

  // x.PointToUTF8(buf,len) changes x so that it points to buf
  // ("becomes an alias").  It does not take ownership or copy buf, so buf
  // must stay alive and unmodified for as long as x (or any iterator
  // obtained from x) is in use.
  // This function assumes that the input is interchange valid UTF8.
  UnicodeText &PointToUTF8(const char *utf8_buffer, int byte_length);

 private:
  friend class const_iterator;

  class Repr {  // A byte-string.
   public:
    char *data_;
    int size_;
    int capacity_;
    bool ours_;  // Do we own data_?

    // Starts out empty.  nullptr is used instead of NULL because this header
    // includes nothing that is guaranteed to define the NULL macro.
    Repr() : data_(nullptr), size_(0), capacity_(0), ours_(true) {}

    // Frees the buffer only when this object owns it; an aliased buffer
    // (ours_ == false) is left untouched.
    ~Repr() {
      if (ours_) delete[] data_;
    }

    void clear();
    void reserve(int capacity);
    void resize(int size);

    void append(const char *bytes, int byte_length);
    void Copy(const char *data, int size);
    void TakeOwnershipOf(char *data, int size, int capacity);
    void PointTo(const char *data, int size);

   private:
    // Copying is forbidden: a member-wise copy of an owning Repr would leave
    // two objects that both delete[] the same data_.
    Repr &operator=(const Repr &);
    Repr(const Repr &other);
  };

  Repr repr_;
};
| |
| } // namespace chrome_lang_id |
| |
| #endif // UNICODETEXT_H_ |