blob: b277d4c0ccbbc744d1b641bebe241a8cd569572b [file] [log] [blame]
/* Unicode Character Database API
*
* Copyright (C) 2012-2018 Reece H. Dunn
*
* This file is part of ucd-tools.
*
* ucd-tools is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* ucd-tools is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef UNICODE_CHARACTER_DATA_H
#define UNICODE_CHARACTER_DATA_H
#include <stdint.h>
#ifdef __cplusplus
extern "C"
{
#endif
/** @brief Represents a Unicode codepoint.
*/
typedef uint32_t codepoint_t;
/** @brief Unicode General Category Groups
* @see http://www.unicode.org/reports/tr44/
*/
typedef enum ucd_category_group_
{
UCD_CATEGORY_GROUP_C, /**< @brief Other */
UCD_CATEGORY_GROUP_I, /**< @brief Invalid */
UCD_CATEGORY_GROUP_L, /**< @brief Letter */
UCD_CATEGORY_GROUP_M, /**< @brief Mark */
UCD_CATEGORY_GROUP_N, /**< @brief Number */
UCD_CATEGORY_GROUP_P, /**< @brief Punctuation */
UCD_CATEGORY_GROUP_S, /**< @brief Symbol */
UCD_CATEGORY_GROUP_Z, /**< @brief Separator */
} ucd_category_group;
/** @brief Get a string representation of the category_group enumeration value.
*
* @param c The value to get the string representation for.
*
* @return The string representation, or "-" if the value is not recognized.
*/
const char *ucd_get_category_group_string(ucd_category_group c);
/** @brief Unicode General Category Values
* @see http://www.unicode.org/reports/tr44/
*/
typedef enum ucd_category_
{
UCD_CATEGORY_Cc, /**< @brief Control Character */
UCD_CATEGORY_Cf, /**< @brief Format Control Character */
UCD_CATEGORY_Cn, /**< @brief Unassigned */
UCD_CATEGORY_Co, /**< @brief Private Use */
UCD_CATEGORY_Cs, /**< @brief Surrogate Code Point */
UCD_CATEGORY_Ii, /**< @brief Invalid Unicode Codepoint */
UCD_CATEGORY_Ll, /**< @brief Lower Case Letter */
UCD_CATEGORY_Lm, /**< @brief Letter Modifier */
UCD_CATEGORY_Lo, /**< @brief Other Letter */
UCD_CATEGORY_Lt, /**< @brief Title Case Letter */
UCD_CATEGORY_Lu, /**< @brief Upper Case Letter */
UCD_CATEGORY_Mc, /**< @brief Spacing Mark */
UCD_CATEGORY_Me, /**< @brief Enclosing Mark */
UCD_CATEGORY_Mn, /**< @brief Non-Spacing Mark */
UCD_CATEGORY_Nd, /**< @brief Decimal Digit */
UCD_CATEGORY_Nl, /**< @brief Letter-Like Number */
UCD_CATEGORY_No, /**< @brief Other Number */
UCD_CATEGORY_Pc, /**< @brief Connector */
UCD_CATEGORY_Pd, /**< @brief Dash/Hyphen */
UCD_CATEGORY_Pe, /**< @brief Close Punctuation Mark */
UCD_CATEGORY_Pf, /**< @brief Final Quotation Mark */
UCD_CATEGORY_Pi, /**< @brief Initial Quotation Mark */
UCD_CATEGORY_Po, /**< @brief Other */
UCD_CATEGORY_Ps, /**< @brief Open Punctuation Mark */
UCD_CATEGORY_Sc, /**< @brief Currency Symbol */
UCD_CATEGORY_Sk, /**< @brief Modifier Symbol */
UCD_CATEGORY_Sm, /**< @brief Math Symbol */
UCD_CATEGORY_So, /**< @brief Other Symbol */
UCD_CATEGORY_Zl, /**< @brief Line Separator */
UCD_CATEGORY_Zp, /**< @brief Paragraph Separator */
UCD_CATEGORY_Zs, /**< @brief Space Separator */
} ucd_category;
/** @brief Get a string representation of the category enumeration value.
*
* @param c The value to get the string representation for.
*
* @return The string representation, or "--" if the value is not recognized.
*/
const char *ucd_get_category_string(ucd_category c);
/** @brief Lookup the General Category Group for a General Category.
*
* @param c The General Category to lookup.
* @return The General Category Group of the General Category.
*/
ucd_category_group ucd_get_category_group_for_category(ucd_category c);
/** @brief Lookup the General Category Group for a Unicode codepoint.
*
* @param c The Unicode codepoint to lookup.
* @return The General Category Group of the Unicode codepoint.
*/
ucd_category_group ucd_lookup_category_group(codepoint_t c);
/** @brief Lookup the General Category for a Unicode codepoint.
*
* @param c The Unicode codepoint to lookup.
* @return The General Category of the Unicode codepoint.
*/
ucd_category ucd_lookup_category(codepoint_t c);
/** @brief Unicode Script
* @see http://www.iana.org/assignments/language-subtag-registry
* @see http://www.unicode.org/iso15924/iso15924-codes.html
*/
typedef enum ucd_script_
{
UCD_SCRIPT_Adlm, /**< @brief Adlam Script */
UCD_SCRIPT_Afak, /**< @brief Afaka Script */
UCD_SCRIPT_Aghb, /**< @brief Caucasian Albanian Script */
UCD_SCRIPT_Ahom, /**< @brief Tai Ahom Script */
UCD_SCRIPT_Arab, /**< @brief Arabic Script */
UCD_SCRIPT_Armi, /**< @brief Imperial Aramaic Script */
UCD_SCRIPT_Armn, /**< @brief Armenian Script */
UCD_SCRIPT_Avst, /**< @brief Avestan Script */
UCD_SCRIPT_Bali, /**< @brief Balinese Script */
UCD_SCRIPT_Bamu, /**< @brief Bamum Script */
UCD_SCRIPT_Bass, /**< @brief Bassa Vah Script */
UCD_SCRIPT_Batk, /**< @brief Batak Script */
UCD_SCRIPT_Beng, /**< @brief Bengali Script */
UCD_SCRIPT_Bhks, /**< @brief Bhaiksuki Script */
UCD_SCRIPT_Blis, /**< @brief Blissymbols Script */
UCD_SCRIPT_Bopo, /**< @brief Bopomofo Script */
UCD_SCRIPT_Brah, /**< @brief Brahmi Script */
UCD_SCRIPT_Brai, /**< @brief Braille Script */
UCD_SCRIPT_Bugi, /**< @brief Buginese Script */
UCD_SCRIPT_Buhd, /**< @brief Buhid Script */
UCD_SCRIPT_Cakm, /**< @brief Chakma Script */
UCD_SCRIPT_Cans, /**< @brief Unified Canadian Aboriginal Syllabics */
UCD_SCRIPT_Cari, /**< @brief Carian Script */
UCD_SCRIPT_Cham, /**< @brief Cham Script */
UCD_SCRIPT_Cher, /**< @brief Cherokee Script */
UCD_SCRIPT_Cirt, /**< @brief Cirth Script */
UCD_SCRIPT_Copt, /**< @brief Coptic Script */
UCD_SCRIPT_Cprt, /**< @brief Cypriot Script */
UCD_SCRIPT_Cyrl, /**< @brief Cyrillic Script */
UCD_SCRIPT_Cyrs, /**< @brief Cyrillic (Old Church Slavonic variant) Script */
UCD_SCRIPT_Deva, /**< @brief Devanagari Script */
UCD_SCRIPT_Dogr, /**< @brief Dogra Script */
UCD_SCRIPT_Dsrt, /**< @brief Deseret Script */
UCD_SCRIPT_Dupl, /**< @brief Duployan Shorthand Script */
UCD_SCRIPT_Egyd, /**< @brief Egyptian Demotic Script */
UCD_SCRIPT_Egyh, /**< @brief Egyptian Hieratic Script */
UCD_SCRIPT_Egyp, /**< @brief Egyptian Hiegoglyphs */
UCD_SCRIPT_Elba, /**< @brief Elbasan Script */
UCD_SCRIPT_Ethi, /**< @brief Ethiopic Script */
UCD_SCRIPT_Geok, /**< @brief Khutsuri Script */
UCD_SCRIPT_Geor, /**< @brief Geirgian Script */
UCD_SCRIPT_Glag, /**< @brief Glagolitic Script */
UCD_SCRIPT_Gong, /**< @brief Gunjala Gondi */
UCD_SCRIPT_Gonm, /**< @brief Masaram Gondi */
UCD_SCRIPT_Goth, /**< @brief Gothic Script */
UCD_SCRIPT_Gran, /**< @brief Grantha Script */
UCD_SCRIPT_Grek, /**< @brief Greek Script */
UCD_SCRIPT_Gujr, /**< @brief Gujarati Script */
UCD_SCRIPT_Guru, /**< @brief Gurmukhi Script */
UCD_SCRIPT_Hang, /**< @brief Hangul Script */
UCD_SCRIPT_Hani, /**< @brief Han (Hanzi, Kanji, Hanja) Script */
UCD_SCRIPT_Hano, /**< @brief Hanunoo Script */
UCD_SCRIPT_Hans, /**< @brief Han (Simplified) Script */
UCD_SCRIPT_Hant, /**< @brief Han (Traditional) Script */
UCD_SCRIPT_Hatr, /**< @brief Hatran Script */
UCD_SCRIPT_Hebr, /**< @brief Hebrew Script */
UCD_SCRIPT_Hira, /**< @brief Hiragana Script */
UCD_SCRIPT_Hluw, /**< @brief Anatolian Hieroglyphs */
UCD_SCRIPT_Hmng, /**< @brief Pahawh Hmong Script */
UCD_SCRIPT_Hrkt, /**< @brief Japanese Syllabaries */
UCD_SCRIPT_Hung, /**< @brief Old Hungarian Script */
UCD_SCRIPT_Inds, /**< @brief Indus Script */
UCD_SCRIPT_Ital, /**< @brief Old Italic Script */
UCD_SCRIPT_Java, /**< @brief Javanese Script */
UCD_SCRIPT_Jpan, /**< @brief Japanese (Han + Hiragana + Katakana) Scripts */
UCD_SCRIPT_Jurc, /**< @brief Jurchen Script */
UCD_SCRIPT_Kali, /**< @brief Kayah Li Script */
UCD_SCRIPT_Kana, /**< @brief Katakana Script */
UCD_SCRIPT_Khar, /**< @brief Kharoshthi Script */
UCD_SCRIPT_Khmr, /**< @brief Khmer Script */
UCD_SCRIPT_Khoj, /**< @brief Khojki Script */
UCD_SCRIPT_Knda, /**< @brief Kannada Script */
UCD_SCRIPT_Kore, /**< @brief Korean (Hangul + Han) Scripts */
UCD_SCRIPT_Kpel, /**< @brief Kpelle Script */
UCD_SCRIPT_Kthi, /**< @brief Kaithi Script */
UCD_SCRIPT_Lana, /**< @brief Tai Tham Script */
UCD_SCRIPT_Laoo, /**< @brief Lao Script */
UCD_SCRIPT_Latf, /**< @brief Latin Script (Fractur Variant) */
UCD_SCRIPT_Latg, /**< @brief Latin Script (Gaelic Variant) */
UCD_SCRIPT_Latn, /**< @brief Latin Script */
UCD_SCRIPT_Lepc, /**< @brief Lepcha Script */
UCD_SCRIPT_Limb, /**< @brief Limbu Script */
UCD_SCRIPT_Lina, /**< @brief Linear A Script */
UCD_SCRIPT_Linb, /**< @brief Linear B Script */
UCD_SCRIPT_Lisu, /**< @brief Lisu Script */
UCD_SCRIPT_Loma, /**< @brief Loma Script */
UCD_SCRIPT_Lyci, /**< @brief Lycian Script */
UCD_SCRIPT_Lydi, /**< @brief Lydian Script */
UCD_SCRIPT_Mahj, /**< @brief Mahajani Script */
UCD_SCRIPT_Maka, /**< @brief Makasar Script */
UCD_SCRIPT_Mand, /**< @brief Mandaic Script */
UCD_SCRIPT_Mani, /**< @brief Manichaean Script */
UCD_SCRIPT_Marc, /**< @brief Marchen Script */
UCD_SCRIPT_Maya, /**< @brief Mayan Hieroglyphs */
UCD_SCRIPT_Medf, /**< @brief Medefaidrin (Oberi Okaime) Script */
UCD_SCRIPT_Mend, /**< @brief Mende Kikakui Script */
UCD_SCRIPT_Merc, /**< @brief Meroitic Cursive Script */
UCD_SCRIPT_Mero, /**< @brief Meroitic Hieroglyphs */
UCD_SCRIPT_Mlym, /**< @brief Malayalam Script */
UCD_SCRIPT_Modi, /**< @brief Modi Script */
UCD_SCRIPT_Mong, /**< @brief Mongolian Script */
UCD_SCRIPT_Moon, /**< @brief Moon Script */
UCD_SCRIPT_Mroo, /**< @brief Mro Script */
UCD_SCRIPT_Mtei, /**< @brief Meitei Mayek Script */
UCD_SCRIPT_Mult, /**< @brief Multani Script */
UCD_SCRIPT_Mymr, /**< @brief Myanmar (Burmese) Script */
UCD_SCRIPT_Narb, /**< @brief Old North Arabian Script */
UCD_SCRIPT_Nbat, /**< @brief Nabataean Script */
UCD_SCRIPT_Newa, /**< @brief Newa Script */
UCD_SCRIPT_Nkgb, /**< @brief Nakhi Geba Script */
UCD_SCRIPT_Nkoo, /**< @brief N'Ko Script */
UCD_SCRIPT_Nshu, /**< @brief Nushu Script */
UCD_SCRIPT_Ogam, /**< @brief Ogham Script */
UCD_SCRIPT_Olck, /**< @brief Ol Chiki Script */
UCD_SCRIPT_Orkh, /**< @brief Old Turkic Script */
UCD_SCRIPT_Orya, /**< @brief Oriya Script */
UCD_SCRIPT_Osge, /**< @brief Osage Script */
UCD_SCRIPT_Osma, /**< @brief Osmanya Script */
UCD_SCRIPT_Palm, /**< @brief Palmyrene Script */
UCD_SCRIPT_Pauc, /**< @brief Pau Cin Hau Script */
UCD_SCRIPT_Perm, /**< @brief Old Permic */
UCD_SCRIPT_Phag, /**< @brief Phags-Pa Script */
UCD_SCRIPT_Phli, /**< @brief Inscriptional Pahlavi Script */
UCD_SCRIPT_Phlp, /**< @brief Psalter Pahlavi Script */
UCD_SCRIPT_Phlv, /**< @brief Book Pahlavi Script */
UCD_SCRIPT_Phnx, /**< @brief Phoenician Script */
UCD_SCRIPT_Plrd, /**< @brief Miao Script */
UCD_SCRIPT_Prti, /**< @brief Inscriptional Parthian Script */
UCD_SCRIPT_Qaak, /**< @brief Klingon Script (Private Use) */
UCD_SCRIPT_Rjng, /**< @brief Rejang Script */
UCD_SCRIPT_Rohg, /**< @brief Hanifi Rohingya Script */
UCD_SCRIPT_Roro, /**< @brief Rongorongo Script */
UCD_SCRIPT_Runr, /**< @brief Runic Script */
UCD_SCRIPT_Samr, /**< @brief Samaritan Script */
UCD_SCRIPT_Sara, /**< @brief Sarati Script */
UCD_SCRIPT_Sarb, /**< @brief Old South Arabian Script */
UCD_SCRIPT_Saur, /**< @brief Saurashtra Script */
UCD_SCRIPT_Sgnw, /**< @brief Sign Writing */
UCD_SCRIPT_Shaw, /**< @brief Shavian Script */
UCD_SCRIPT_Shrd, /**< @brief Sharada Script */
UCD_SCRIPT_Sidd, /**< @brief Siddham Script */
UCD_SCRIPT_Sind, /**< @brief Sindhi Script */
UCD_SCRIPT_Sinh, /**< @brief Sinhala Script */
UCD_SCRIPT_Sogd, /**< @brief Sogdian Script */
UCD_SCRIPT_Sogo, /**< @brief Old Sogdian Script */
UCD_SCRIPT_Sora, /**< @brief Sora Sompeng Script */
UCD_SCRIPT_Soyo, /**< @brief Soyombo */
UCD_SCRIPT_Sund, /**< @brief Sundanese Script */
UCD_SCRIPT_Sylo, /**< @brief Syloti Nagri Script */
UCD_SCRIPT_Syrc, /**< @brief Syriac Script */
UCD_SCRIPT_Syre, /**< @brief Syriac Script (Estrangelo Variant) */
UCD_SCRIPT_Syrj, /**< @brief Syriac Script (Western Variant) */
UCD_SCRIPT_Syrn, /**< @brief Syriac Script (Eastern Variant) */
UCD_SCRIPT_Tagb, /**< @brief Tagbanwa Script */
UCD_SCRIPT_Takr, /**< @brief Takri Script */
UCD_SCRIPT_Tale, /**< @brief Tai Le Script */
UCD_SCRIPT_Talu, /**< @brief New Tai Lue Script */
UCD_SCRIPT_Taml, /**< @brief Tamil Script */
UCD_SCRIPT_Tang, /**< @brief Tangut Script */
UCD_SCRIPT_Tavt, /**< @brief Tai Viet Script */
UCD_SCRIPT_Telu, /**< @brief Telugu Script */
UCD_SCRIPT_Teng, /**< @brief Tengwar Script */
UCD_SCRIPT_Tfng, /**< @brief Tifinagh Script */
UCD_SCRIPT_Tglg, /**< @brief Tagalog Script */
UCD_SCRIPT_Thaa, /**< @brief Thaana Script */
UCD_SCRIPT_Thai, /**< @brief Thai Script */
UCD_SCRIPT_Tibt, /**< @brief Tibetan Script */
UCD_SCRIPT_Tirh, /**< @brief Tirhuta Script */
UCD_SCRIPT_Ugar, /**< @brief Ugaritic Script */
UCD_SCRIPT_Vaii, /**< @brief Vai Script */
UCD_SCRIPT_Visp, /**< @brief Visible Speech Script */
UCD_SCRIPT_Wara, /**< @brief Warang Citi Script */
UCD_SCRIPT_Wole, /**< @brief Woleai Script */
UCD_SCRIPT_Xpeo, /**< @brief Old Persian Script */
UCD_SCRIPT_Xsux, /**< @brief Cuneiform Script */
UCD_SCRIPT_Yiii, /**< @brief Yi Script */
UCD_SCRIPT_Zanb, /**< @brief Zanabazar Square */
UCD_SCRIPT_Zinh, /**< @brief Inherited Script */
UCD_SCRIPT_Zmth, /**< @brief Mathematical Notation */
UCD_SCRIPT_Zsym, /**< @brief Symbols */
UCD_SCRIPT_Zxxx, /**< @brief Unwritten Documents */
UCD_SCRIPT_Zyyy, /**< @brief Undetermined Script */
UCD_SCRIPT_Zzzz, /**< @brief Uncoded Script */
} ucd_script;
/** @brief Get a string representation of the script enumeration value.
*
* @param s The value to get the string representation for.
*
* @return The string representation, or "----" if the value is not recognized.
*/
const char *ucd_get_script_string(ucd_script s);
/** @brief Lookup the Script for a Unicode codepoint.
*
* @param c The Unicode codepoint to lookup.
* @return The Script of the Unicode codepoint.
*/
ucd_script ucd_lookup_script(codepoint_t c);
/** @brief Properties
*/
typedef uint64_t ucd_property;
#define UCD_PROPERTY_WHITE_SPACE 0x0000000000000001ull /**< @brief White_Space */
#define UCD_PROPERTY_BIDI_CONTROL 0x0000000000000002ull /**< @brief Bidi_Control */
#define UCD_PROPERTY_JOIN_CONTROL 0x0000000000000004ull /**< @brief Join_Control */
#define UCD_PROPERTY_DASH 0x0000000000000008ull /**< @brief Dash */
#define UCD_PROPERTY_HYPHEN 0x0000000000000010ull /**< @brief Hyphen */
#define UCD_PROPERTY_QUOTATION_MARK 0x0000000000000020ull /**< @brief Quotation_Mark */
#define UCD_PROPERTY_TERMINAL_PUNCTUATION 0x0000000000000040ull /**< @brief Terminal_Punctuation */
#define UCD_PROPERTY_OTHER_MATH 0x0000000000000080ull /**< @brief Other_Math */
#define UCD_PROPERTY_HEX_DIGIT 0x0000000000000100ull /**< @brief Hex_Digit */
#define UCD_PROPERTY_ASCII_HEX_DIGIT 0x0000000000000200ull /**< @brief ASCII_Hex_Digit */
#define UCD_PROPERTY_OTHER_ALPHABETIC 0x0000000000000400ull /**< @brief Other_Alphabetic */
#define UCD_PROPERTY_IDEOGRAPHIC 0x0000000000000800ull /**< @brief Ideographic */
#define UCD_PROPERTY_DIACRITIC 0x0000000000001000ull /**< @brief Diacritic */
#define UCD_PROPERTY_EXTENDER 0x0000000000002000ull /**< @brief Extender */
#define UCD_PROPERTY_OTHER_LOWERCASE 0x0000000000004000ull /**< @brief Other_Lowercase */
#define UCD_PROPERTY_OTHER_UPPERCASE 0x0000000000008000ull /**< @brief Other_Uppercase */
#define UCD_PROPERTY_NONCHARACTER_CODE_POINT 0x0000000000010000ull /**< @brief Noncharacter_Code_Point */
#define UCD_PROPERTY_OTHER_GRAPHEME_EXTEND 0x0000000000020000ull /**< @brief Other_Grapheme_Extend */
#define UCD_PROPERTY_IDS_BINARY_OPERATOR 0x0000000000040000ull /**< @brief IDS_Binary_Operator */
#define UCD_PROPERTY_IDS_TRINARY_OPERATOR 0x0000000000080000ull /**< @brief IDS_Trinary_Operator */
#define UCD_PROPERTY_RADICAL 0x0000000000100000ull /**< @brief Radical */
#define UCD_PROPERTY_UNIFIED_IDEOGRAPH 0x0000000000200000ull /**< @brief Unified_Ideograph */
#define UCD_PROPERTY_OTHER_DEFAULT_IGNORABLE_CODE_POINT 0x0000000000400000ull /**< @brief Other_Default_Ignorable_Code_Point */
#define UCD_PROPERTY_DEPRECATED 0x0000000000800000ull /**< @brief Deprecated */
#define UCD_PROPERTY_SOFT_DOTTED 0x0000000001000000ull /**< @brief Soft_Dotted */
#define UCD_PROPERTY_LOGICAL_ORDER_EXCEPTION 0x0000000002000000ull /**< @brief Logical_Order_Exception */
#define UCD_PROPERTY_OTHER_ID_START 0x0000000004000000ull /**< @brief Other_ID_Start */
#define UCD_PROPERTY_OTHER_ID_CONTINUE 0x0000000008000000ull /**< @brief Other_ID_Continue */
#define UCD_PROPERTY_SENTENCE_TERMINAL 0x0000000010000000ull /**< @brief Sentence_Terminal */
#define UCD_PROPERTY_VARIATION_SELECTOR 0x0000000020000000ull /**< @brief Variation_Selector */
#define UCD_PROPERTY_PATTERN_WHITE_SPACE 0x0000000040000000ull /**< @brief Pattern_White_Space */
#define UCD_PROPERTY_PATTERN_SYNTAX 0x0000000080000000ull /**< @brief Pattern_Syntax */
#define UCD_PROPERTY_PREPENDED_CONCATENATION_MARK 0x0000000100000000ull /**< @brief Prepended_Concatenation_Mark */
#define UCD_PROPERTY_EMOJI 0x0000000200000000ull /**< @brief Emoji */
#define UCD_PROPERTY_EMOJI_PRESENTATION 0x0000000400000000ull /**< @brief Emoji_Presentation */
#define UCD_PROPERTY_EMOJI_MODIFIER 0x0000000800000000ull /**< @brief Emoji_Modifier */
#define UCD_PROPERTY_EMOJI_MODIFIER_BASE 0x0000001000000000ull /**< @brief Emoji_Modifier_Base */
#define UCD_PROPERTY_REGIONAL_INDICATOR 0x0000002000000000ull /**< @brief Regional_Indicator */
#define UCD_PROPERTY_EMOJI_COMPONENT 0x0000004000000000ull /**< @brief Emoji_Component */
#define UCD_PROPERTY_EXTENDED_PICTOGRAPHIC 0x0000008000000000ull /**< @brief Extended_Pictographic */
// eSpeak NG extended properties:
#define ESPEAKNG_PROPERTY_INVERTED_TERMINAL_PUNCTUATION 0x0010000000000000ull /**< @brief Inverted_Terminal_Punctuation */
#define ESPEAKNG_PROPERTY_PUNCTUATION_IN_WORD 0x0020000000000000ull /**< @brief Punctuation_In_Word */
#define ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER 0x0040000000000000ull /**< @brief Optional_Space_After */
#define ESPEAKNG_PROPERTY_EXTENDED_DASH 0x0080000000000000ull /**< @brief Extended_Dash */
#define ESPEAKNG_PROPERTY_PARAGRAPH_SEPARATOR 0x0100000000000000ull /**< @brief Paragraph_Separator */
#define ESPEAKNG_PROPERTY_ELLIPSIS 0x0200000000000000ull /**< @brief Ellipsis */
#define ESPEAKNG_PROPERTY_SEMI_COLON 0x0400000000000000ull /**< @brief Semi_Colon */
#define ESPEAKNG_PROPERTY_COLON 0x0800000000000000ull /**< @brief Colon */
#define ESPEAKNG_PROPERTY_COMMA 0x1000000000000000ull /**< @brief Comma */
#define ESPEAKNG_PROPERTY_EXCLAMATION_MARK 0x2000000000000000ull /**< @brief Exclamation_Mark */
#define ESPEAKNG_PROPERTY_QUESTION_MARK 0x4000000000000000ull /**< @brief Question_Mark */
#define ESPEAKNG_PROPERTY_FULL_STOP 0x8000000000000000ull /**< @brief Full_Stop */
/** @brief Return the properties of the specified codepoint.
*
* @param c The Unicode codepoint to lookup.
* @param category The General Category of the codepoint.
* @return The properties associated with the codepoint.
*/
ucd_property ucd_properties(codepoint_t c, ucd_category category);
/** @brief Is the codepoint in the 'alnum' class?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is in the 'alnum' class, zero otherwise.
*/
int ucd_isalnum(codepoint_t c);
/** @brief Is the codepoint in the 'alpha' class?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is in the 'alpha' class, zero otherwise.
*/
int ucd_isalpha(codepoint_t c);
/** @brief Is the codepoint in the 'blank' class?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is in the 'blank' class, zero otherwise.
*/
int ucd_isblank(codepoint_t c);
/** @brief Is the codepoint in the 'cntrl' class?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is in the 'cntrl' class, zero otherwise.
*/
int ucd_iscntrl(codepoint_t c);
/** @brief Is the codepoint in the 'digit' class?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is in the 'digit' class, zero otherwise.
*/
int ucd_isdigit(codepoint_t c);
/** @brief Is the codepoint in the 'graph' class?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is in the 'graph' class, zero otherwise.
*/
int ucd_isgraph(codepoint_t c);
/** @brief Is the codepoint in the 'lower' class?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is in the 'lower' class, zero otherwise.
*/
int ucd_islower(codepoint_t c);
/** @brief Is the codepoint in the 'print' class?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is in the 'print' class, zero otherwise.
*/
int ucd_isprint(codepoint_t c);
/** @brief Is the codepoint in the 'punct' class?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is in the 'punct' class, zero otherwise.
*/
int ucd_ispunct(codepoint_t c);
/** @brief Is the codepoint in the 'space' class?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is in the 'space' class, zero otherwise.
*/
int ucd_isspace(codepoint_t c);
/** @brief Is the codepoint in the 'upper' class?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is in the 'upper' class, zero otherwise.
*/
int ucd_isupper(codepoint_t c);
/** @brief Is the codepoint in the 'xdigit' class?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is in the 'xdigit' class, zero otherwise.
*/
int ucd_isxdigit(codepoint_t c);
/** @brief Convert the Unicode codepoint to upper-case.
*
* This function only uses the simple case mapping present in the
* UnicodeData file. The data in SpecialCasing requires Unicode
* codepoints to be mapped to multiple codepoints.
*
* @param c The Unicode codepoint to convert.
* @return The upper-case Unicode codepoint for this codepoint, or
* this codepoint if there is no upper-case codepoint.
*/
codepoint_t ucd_toupper(codepoint_t c);
/** @brief Convert the Unicode codepoint to lower-case.
*
* This function only uses the simple case mapping present in the
* UnicodeData file. The data in SpecialCasing requires Unicode
* codepoints to be mapped to multiple codepoints.
*
* @param c The Unicode codepoint to convert.
* @return The lower-case Unicode codepoint for this codepoint, or
* this codepoint if there is no upper-case codepoint.
*/
codepoint_t ucd_tolower(codepoint_t c);
/** @brief Convert the Unicode codepoint to title-case.
*
* This function only uses the simple case mapping present in the
* UnicodeData file. The data in SpecialCasing requires Unicode
* codepoints to be mapped to multiple codepoints.
*
* @param c The Unicode codepoint to convert.
* @return The title-case Unicode codepoint for this codepoint, or
* this codepoint if there is no upper-case codepoint.
*/
codepoint_t ucd_totitle(codepoint_t c);
#ifdef __cplusplus
}
/** @brief Unicode Character Database
*/
namespace ucd
{
/** @brief Represents a Unicode codepoint.
*/
using ::codepoint_t;
/** @brief Unicode General Category Groups
* @see http://www.unicode.org/reports/tr44/
*/
enum category_group
{
C = UCD_CATEGORY_GROUP_C, /**< @brief Other */
I = UCD_CATEGORY_GROUP_I, /**< @brief Invalid */
L = UCD_CATEGORY_GROUP_L, /**< @brief Letter */
M = UCD_CATEGORY_GROUP_M, /**< @brief Mark */
N = UCD_CATEGORY_GROUP_N, /**< @brief Number */
P = UCD_CATEGORY_GROUP_P, /**< @brief Punctuation */
S = UCD_CATEGORY_GROUP_S, /**< @brief Symbol */
Z = UCD_CATEGORY_GROUP_Z, /**< @brief Separator */
};
/** @brief Get a string representation of the category_group enumeration value.
*
* @param c The value to get the string representation for.
*
* @return The string representation, or "-" if the value is not recognized.
*/
inline const char *get_category_group_string(category_group c)
{
return ucd_get_category_group_string((ucd_category_group)c);
}
/** @brief Unicode General Category Values
* @see http://www.unicode.org/reports/tr44/
*/
enum category
{
Cc = UCD_CATEGORY_Cc, /**< @brief Control Character */
Cf = UCD_CATEGORY_Cf, /**< @brief Format Control Character */
Cn = UCD_CATEGORY_Cn, /**< @brief Unassigned */
Co = UCD_CATEGORY_Co, /**< @brief Private Use */
Cs = UCD_CATEGORY_Cs, /**< @brief Surrogate Code Point */
Ii = UCD_CATEGORY_Ii, /**< @brief Invalid Unicode Codepoint */
Ll = UCD_CATEGORY_Ll, /**< @brief Lower Case Letter */
Lm = UCD_CATEGORY_Lm, /**< @brief Letter Modifier */
Lo = UCD_CATEGORY_Lo, /**< @brief Other Letter */
Lt = UCD_CATEGORY_Lt, /**< @brief Title Case Letter */
Lu = UCD_CATEGORY_Lu, /**< @brief Upper Case Letter */
Mc = UCD_CATEGORY_Mc, /**< @brief Spacing Mark */
Me = UCD_CATEGORY_Me, /**< @brief Enclosing Mark */
Mn = UCD_CATEGORY_Mn, /**< @brief Non-Spacing Mark */
Nd = UCD_CATEGORY_Nd, /**< @brief Decimal Digit */
Nl = UCD_CATEGORY_Nl, /**< @brief Letter-Like Number */
No = UCD_CATEGORY_No, /**< @brief Other Number */
Pc = UCD_CATEGORY_Pc, /**< @brief Connector */
Pd = UCD_CATEGORY_Pd, /**< @brief Dash/Hyphen */
Pe = UCD_CATEGORY_Pe, /**< @brief Close Punctuation Mark */
Pf = UCD_CATEGORY_Pf, /**< @brief Final Quotation Mark */
Pi = UCD_CATEGORY_Pi, /**< @brief Initial Quotation Mark */
Po = UCD_CATEGORY_Po, /**< @brief Other */
Ps = UCD_CATEGORY_Ps, /**< @brief Open Punctuation Mark */
Sc = UCD_CATEGORY_Sc, /**< @brief Currency Symbol */
Sk = UCD_CATEGORY_Sk, /**< @brief Modifier Symbol */
Sm = UCD_CATEGORY_Sm, /**< @brief Math Symbol */
So = UCD_CATEGORY_So, /**< @brief Other Symbol */
Zl = UCD_CATEGORY_Zl, /**< @brief Line Separator */
Zp = UCD_CATEGORY_Zp, /**< @brief Paragraph Separator */
Zs = UCD_CATEGORY_Zs, /**< @brief Space Separator */
};
/** @brief Get a string representation of the category enumeration value.
*
* @param c The value to get the string representation for.
*
* @return The string representation, or "--" if the value is not recognized.
*/
inline const char *get_category_string(category c)
{
return ucd_get_category_string((ucd_category)c);
}
/** @brief Lookup the General Category Group for a General Category.
*
* @param c The General Category to lookup.
* @return The General Category Group of the General Category.
*/
inline category_group lookup_category_group(category c)
{
return (category_group)ucd_get_category_group_for_category((ucd_category)c);
}
/** @brief Lookup the General Category Group for a Unicode codepoint.
*
* @param c The Unicode codepoint to lookup.
* @return The General Category Group of the Unicode codepoint.
*/
inline category_group lookup_category_group(codepoint_t c)
{
return (category_group)ucd_lookup_category_group(c);
}
/** @brief Lookup the General Category for a Unicode codepoint.
*
* @param c The Unicode codepoint to lookup.
* @return The General Category of the Unicode codepoint.
*/
inline category lookup_category(codepoint_t c)
{
return (category)ucd_lookup_category(c);
}
/** @brief Unicode Script
* @see http://www.iana.org/assignments/language-subtag-registry
* @see http://www.unicode.org/iso15924/iso15924-codes.html
*/
enum script
{
Adlm = UCD_SCRIPT_Adlm, /**< @brief Adlam Script */
Afak = UCD_SCRIPT_Afak, /**< @brief Afaka Script */
Aghb = UCD_SCRIPT_Aghb, /**< @brief Caucasian Albanian Script */
Ahom = UCD_SCRIPT_Ahom, /**< @brief Tai Ahom Script */
Arab = UCD_SCRIPT_Arab, /**< @brief Arabic Script */
Armi = UCD_SCRIPT_Armi, /**< @brief Imperial Aramaic Script */
Armn = UCD_SCRIPT_Armn, /**< @brief Armenian Script */
Avst = UCD_SCRIPT_Avst, /**< @brief Avestan Script */
Bali = UCD_SCRIPT_Bali, /**< @brief Balinese Script */
Bamu = UCD_SCRIPT_Bamu, /**< @brief Bamum Script */
Bass = UCD_SCRIPT_Bass, /**< @brief Bassa Vah Script */
Batk = UCD_SCRIPT_Batk, /**< @brief Batak Script */
Beng = UCD_SCRIPT_Beng, /**< @brief Bengali Script */
Bhks = UCD_SCRIPT_Bhks, /**< @brief Bhaiksuki Script */
Blis = UCD_SCRIPT_Blis, /**< @brief Blissymbols Script */
Bopo = UCD_SCRIPT_Bopo, /**< @brief Bopomofo Script */
Brah = UCD_SCRIPT_Brah, /**< @brief Brahmi Script */
Brai = UCD_SCRIPT_Brai, /**< @brief Braille Script */
Bugi = UCD_SCRIPT_Bugi, /**< @brief Buginese Script */
Buhd = UCD_SCRIPT_Buhd, /**< @brief Buhid Script */
Cakm = UCD_SCRIPT_Cakm, /**< @brief Chakma Script */
Cans = UCD_SCRIPT_Cans, /**< @brief Unified Canadian Aboriginal Syllabics */
Cari = UCD_SCRIPT_Cari, /**< @brief Carian Script */
Cham = UCD_SCRIPT_Cham, /**< @brief Cham Script */
Cher = UCD_SCRIPT_Cher, /**< @brief Cherokee Script */
Cirt = UCD_SCRIPT_Cirt, /**< @brief Cirth Script */
Copt = UCD_SCRIPT_Copt, /**< @brief Coptic Script */
Cprt = UCD_SCRIPT_Cprt, /**< @brief Cypriot Script */
Cyrl = UCD_SCRIPT_Cyrl, /**< @brief Cyrillic Script */
Cyrs = UCD_SCRIPT_Cyrs, /**< @brief Cyrillic (Old Church Slavonic variant) Script */
Deva = UCD_SCRIPT_Deva, /**< @brief Devanagari Script */
Dogr = UCD_SCRIPT_Dogr, /**< @brief Dogra Script */
Dsrt = UCD_SCRIPT_Dsrt, /**< @brief Deseret Script */
Dupl = UCD_SCRIPT_Dupl, /**< @brief Duployan Shorthand Script */
Egyd = UCD_SCRIPT_Egyd, /**< @brief Egyptian Demotic Script */
Egyh = UCD_SCRIPT_Egyh, /**< @brief Egyptian Hieratic Script */
Egyp = UCD_SCRIPT_Egyp, /**< @brief Egyptian Hiegoglyphs */
Elba = UCD_SCRIPT_Elba, /**< @brief Elbasan Script */
Ethi = UCD_SCRIPT_Ethi, /**< @brief Ethiopic Script */
Geok = UCD_SCRIPT_Geok, /**< @brief Khutsuri Script */
Geor = UCD_SCRIPT_Geor, /**< @brief Geirgian Script */
Glag = UCD_SCRIPT_Glag, /**< @brief Glagolitic Script */
Gong = UCD_SCRIPT_Gong, /**< @brief Gunjala Gondi */
Gonm = UCD_SCRIPT_Gonm, /**< @brief Masaram Gondi */
Goth = UCD_SCRIPT_Goth, /**< @brief Gothic Script */
Gran = UCD_SCRIPT_Gran, /**< @brief Grantha Script */
Grek = UCD_SCRIPT_Grek, /**< @brief Greek Script */
Gujr = UCD_SCRIPT_Gujr, /**< @brief Gujarati Script */
Guru = UCD_SCRIPT_Guru, /**< @brief Gurmukhi Script */
Hang = UCD_SCRIPT_Hang, /**< @brief Hangul Script */
Hani = UCD_SCRIPT_Hani, /**< @brief Han (Hanzi, Kanji, Hanja) Script */
Hano = UCD_SCRIPT_Hano, /**< @brief Hanunoo Script */
Hans = UCD_SCRIPT_Hans, /**< @brief Han (Simplified) Script */
Hant = UCD_SCRIPT_Hant, /**< @brief Han (Traditional) Script */
Hatr = UCD_SCRIPT_Hatr, /**< @brief Hatran Script */
Hebr = UCD_SCRIPT_Hebr, /**< @brief Hebrew Script */
Hira = UCD_SCRIPT_Hira, /**< @brief Hiragana Script */
Hluw = UCD_SCRIPT_Hluw, /**< @brief Anatolian Hieroglyphs */
Hmng = UCD_SCRIPT_Hmng, /**< @brief Pahawh Hmong Script */
Hrkt = UCD_SCRIPT_Hrkt, /**< @brief Japanese Syllabaries */
Hung = UCD_SCRIPT_Hung, /**< @brief Old Hungarian Script */
Inds = UCD_SCRIPT_Inds, /**< @brief Indus Script */
Ital = UCD_SCRIPT_Ital, /**< @brief Old Italic Script */
Java = UCD_SCRIPT_Java, /**< @brief Javanese Script */
Jpan = UCD_SCRIPT_Jpan, /**< @brief Japanese (Han + Hiragana + Katakana) Scripts */
Jurc = UCD_SCRIPT_Jurc, /**< @brief Jurchen Script */
Kali = UCD_SCRIPT_Kali, /**< @brief Kayah Li Script */
Kana = UCD_SCRIPT_Kana, /**< @brief Katakana Script */
Khar = UCD_SCRIPT_Khar, /**< @brief Kharoshthi Script */
Khmr = UCD_SCRIPT_Khmr, /**< @brief Khmer Script */
Khoj = UCD_SCRIPT_Khoj, /**< @brief Khojki Script */
Knda = UCD_SCRIPT_Knda, /**< @brief Kannada Script */
Kore = UCD_SCRIPT_Kore, /**< @brief Korean (Hangul + Han) Scripts */
Kpel = UCD_SCRIPT_Kpel, /**< @brief Kpelle Script */
Kthi = UCD_SCRIPT_Kthi, /**< @brief Kaithi Script */
Lana = UCD_SCRIPT_Lana, /**< @brief Tai Tham Script */
Laoo = UCD_SCRIPT_Laoo, /**< @brief Lao Script */
Latf = UCD_SCRIPT_Latf, /**< @brief Latin Script (Fractur Variant) */
Latg = UCD_SCRIPT_Latg, /**< @brief Latin Script (Gaelic Variant) */
Latn = UCD_SCRIPT_Latn, /**< @brief Latin Script */
Lepc = UCD_SCRIPT_Lepc, /**< @brief Lepcha Script */
Limb = UCD_SCRIPT_Limb, /**< @brief Limbu Script */
Lina = UCD_SCRIPT_Lina, /**< @brief Linear A Script */
Linb = UCD_SCRIPT_Linb, /**< @brief Linear B Script */
Lisu = UCD_SCRIPT_Lisu, /**< @brief Lisu Script */
Loma = UCD_SCRIPT_Loma, /**< @brief Loma Script */
Lyci = UCD_SCRIPT_Lyci, /**< @brief Lycian Script */
Lydi = UCD_SCRIPT_Lydi, /**< @brief Lydian Script */
Mahj = UCD_SCRIPT_Mahj, /**< @brief Mahajani Script */
Maka = UCD_SCRIPT_Maka, /**< @brief Mahajani Script */
Mand = UCD_SCRIPT_Mand, /**< @brief Mandaic Script */
Mani = UCD_SCRIPT_Mani, /**< @brief Manichaean Script */
Marc = UCD_SCRIPT_Marc, /**< @brief Marchen Script */
Maya = UCD_SCRIPT_Maya, /**< @brief Mayan Hieroglyphs */
Medf = UCD_SCRIPT_Medf, /**< @brief Medefaidrin (Oberi Okaime) Script */
Mend = UCD_SCRIPT_Mend, /**< @brief Mende Kikakui Script */
Merc = UCD_SCRIPT_Merc, /**< @brief Meroitic Cursive Script */
Mero = UCD_SCRIPT_Mero, /**< @brief Meroitic Hieroglyphs */
Mlym = UCD_SCRIPT_Mlym, /**< @brief Malayalam Script */
Modi = UCD_SCRIPT_Modi, /**< @brief Modi Script */
Mong = UCD_SCRIPT_Mong, /**< @brief Mongolian Script */
Moon = UCD_SCRIPT_Moon, /**< @brief Moon Script */
Mroo = UCD_SCRIPT_Mroo, /**< @brief Mro Script */
Mtei = UCD_SCRIPT_Mtei, /**< @brief Meitei Mayek Script */
Mult = UCD_SCRIPT_Mult, /**< @brief Multani Script */
Mymr = UCD_SCRIPT_Mymr, /**< @brief Myanmar (Burmese) Script */
Narb = UCD_SCRIPT_Narb, /**< @brief Old North Arabian Script */
Nbat = UCD_SCRIPT_Nbat, /**< @brief Nabataean Script */
Newa = UCD_SCRIPT_Newa, /**< @brief Newa Script */
Nkgb = UCD_SCRIPT_Nkgb, /**< @brief Nakhi Geba Script */
Nkoo = UCD_SCRIPT_Nkoo, /**< @brief N'Ko Script */
Nshu = UCD_SCRIPT_Nshu, /**< @brief Nushu Script */
Ogam = UCD_SCRIPT_Ogam, /**< @brief Ogham Script */
Olck = UCD_SCRIPT_Olck, /**< @brief Ol Chiki Script */
Orkh = UCD_SCRIPT_Orkh, /**< @brief Old Turkic Script */
Orya = UCD_SCRIPT_Orya, /**< @brief Oriya Script */
Osge = UCD_SCRIPT_Osge, /**< @brief Osage Script */
Osma = UCD_SCRIPT_Osma, /**< @brief Osmanya Script */
Palm = UCD_SCRIPT_Palm, /**< @brief Palmyrene Script */
Pauc = UCD_SCRIPT_Pauc, /**< @brief Pau Cin Hau Script */
Perm = UCD_SCRIPT_Perm, /**< @brief Old Permic */
Phag = UCD_SCRIPT_Phag, /**< @brief Phags-Pa Script */
Phli = UCD_SCRIPT_Phli, /**< @brief Inscriptional Pahlavi Script */
Phlp = UCD_SCRIPT_Phlp, /**< @brief Psalter Pahlavi Script */
Phlv = UCD_SCRIPT_Phlv, /**< @brief Book Pahlavi Script */
Phnx = UCD_SCRIPT_Phnx, /**< @brief Phoenician Script */
Plrd = UCD_SCRIPT_Plrd, /**< @brief Miao Script */
Prti = UCD_SCRIPT_Prti, /**< @brief Inscriptional Parthian Script */
Qaak = UCD_SCRIPT_Qaak, /**< @brief Klingon Script (Private Use) */
Rjng = UCD_SCRIPT_Rjng, /**< @brief Rejang Script */
Rohg = UCD_SCRIPT_Rohg, /**< @brief Hanifi Rohingya Script */
Roro = UCD_SCRIPT_Roro, /**< @brief Rongorongo Script */
Runr = UCD_SCRIPT_Runr, /**< @brief Runic Script */
Samr = UCD_SCRIPT_Samr, /**< @brief Samaritan Script */
Sara = UCD_SCRIPT_Sara, /**< @brief Sarati Script */
Sarb = UCD_SCRIPT_Sarb, /**< @brief Old South Arabian Script */
Saur = UCD_SCRIPT_Saur, /**< @brief Saurashtra Script */
Sgnw = UCD_SCRIPT_Sgnw, /**< @brief Sign Writing */
Shaw = UCD_SCRIPT_Shaw, /**< @brief Shavian Script */
Shrd = UCD_SCRIPT_Shrd, /**< @brief Sharada Script */
Sidd = UCD_SCRIPT_Sidd, /**< @brief Siddham Script */
Sind = UCD_SCRIPT_Sind, /**< @brief Sindhi Script */
Sinh = UCD_SCRIPT_Sinh, /**< @brief Sinhala Script */
Sogd = UCD_SCRIPT_Sogd, /**< @brief Sogdian Script */
Sogo = UCD_SCRIPT_Sogo, /**< @brief Old Sogdian Script */
Sora = UCD_SCRIPT_Sora, /**< @brief Sora Sompeng Script */
Soyo = UCD_SCRIPT_Soyo, /**< @brief Soyombo */
Sund = UCD_SCRIPT_Sund, /**< @brief Sundanese Script */
Sylo = UCD_SCRIPT_Sylo, /**< @brief Syloti Nagri Script */
Syrc = UCD_SCRIPT_Syrc, /**< @brief Syriac Script */
Syre = UCD_SCRIPT_Syre, /**< @brief Syriac Script (Estrangelo Variant) */
Syrj = UCD_SCRIPT_Syrj, /**< @brief Syriac Script (Western Variant) */
Syrn = UCD_SCRIPT_Syrn, /**< @brief Syriac Script (Eastern Variant) */
Tagb = UCD_SCRIPT_Tagb, /**< @brief Tagbanwa Script */
Takr = UCD_SCRIPT_Takr, /**< @brief Takri Script */
Tale = UCD_SCRIPT_Tale, /**< @brief Tai Le Script */
Talu = UCD_SCRIPT_Talu, /**< @brief New Tai Lue Script */
Taml = UCD_SCRIPT_Taml, /**< @brief Tamil Script */
Tang = UCD_SCRIPT_Tang, /**< @brief Tangut Script */
Tavt = UCD_SCRIPT_Tavt, /**< @brief Tai Viet Script */
Telu = UCD_SCRIPT_Telu, /**< @brief Telugu Script */
Teng = UCD_SCRIPT_Teng, /**< @brief Tengwar Script */
Tfng = UCD_SCRIPT_Tfng, /**< @brief Tifinagh Script */
Tglg = UCD_SCRIPT_Tglg, /**< @brief Tagalog Script */
Thaa = UCD_SCRIPT_Thaa, /**< @brief Thaana Script */
Thai = UCD_SCRIPT_Thai, /**< @brief Thai Script */
Tibt = UCD_SCRIPT_Tibt, /**< @brief Tibetan Script */
Tirh = UCD_SCRIPT_Tirh, /**< @brief Tirhuta Script */
Ugar = UCD_SCRIPT_Ugar, /**< @brief Ugaritic Script */
Vaii = UCD_SCRIPT_Vaii, /**< @brief Vai Script */
Visp = UCD_SCRIPT_Visp, /**< @brief Visible Speech Script */
Wara = UCD_SCRIPT_Wara, /**< @brief Warang Citi Script */
Wole = UCD_SCRIPT_Wole, /**< @brief Woleai Script */
Xpeo = UCD_SCRIPT_Xpeo, /**< @brief Old Persian Script */
Xsux = UCD_SCRIPT_Xsux, /**< @brief Cuneiform Script */
Yiii = UCD_SCRIPT_Yiii, /**< @brief Yi Script */
Zanb = UCD_SCRIPT_Zanb, /**< @brief Zanabazar Square */
Zinh = UCD_SCRIPT_Zinh, /**< @brief Inherited Script */
Zmth = UCD_SCRIPT_Zmth, /**< @brief Mathematical Notation */
Zsym = UCD_SCRIPT_Zsym, /**< @brief Symbols */
Zxxx = UCD_SCRIPT_Zxxx, /**< @brief Unwritten Documents */
Zyyy = UCD_SCRIPT_Zyyy, /**< @brief Undetermined Script */
Zzzz = UCD_SCRIPT_Zzzz, /**< @brief Uncoded Script */
};
/** @brief Get a string representation of the script enumeration value.
*
* @param s The value to get the string representation for.
*
* @return The string representation, or "----" if the value is not recognized.
*/
inline const char *get_script_string(script s)
{
return ucd_get_script_string((ucd_script)s);
}
/** @brief Lookup the Script for a Unicode codepoint.
*
* @param c The Unicode codepoint to lookup.
* @return The Script of the Unicode codepoint.
*/
inline script lookup_script(codepoint_t c)
{
return (script)ucd_lookup_script(c);
}
/** @brief Properties
*/
typedef ucd_property property;
enum
{
White_Space = UCD_PROPERTY_WHITE_SPACE, /**< @brief White_Space */
Bidi_Control = UCD_PROPERTY_BIDI_CONTROL, /**< @brief Bidi_Control */
Join_Control = UCD_PROPERTY_JOIN_CONTROL, /**< @brief Join_Control */
Dash = UCD_PROPERTY_DASH, /**< @brief Dash */
Hyphen = UCD_PROPERTY_HYPHEN, /**< @brief Hyphen */
Quotation_Mark = UCD_PROPERTY_QUOTATION_MARK, /**< @brief Quotation_Mark */
Terminal_Punctuation = UCD_PROPERTY_TERMINAL_PUNCTUATION, /**< @brief Terminal_Punctuation */
Other_Math = UCD_PROPERTY_OTHER_MATH, /**< @brief Other_Math */
Hex_Digit = UCD_PROPERTY_HEX_DIGIT, /**< @brief Hex_Digit */
ASCII_Hex_Digit = UCD_PROPERTY_ASCII_HEX_DIGIT, /**< @brief ASCII_Hex_Digit */
Other_Alphabetic = UCD_PROPERTY_OTHER_ALPHABETIC, /**< @brief Other_Alphabetic */
Ideographic = UCD_PROPERTY_IDEOGRAPHIC, /**< @brief Ideographic */
Diacritic = UCD_PROPERTY_DIACRITIC, /**< @brief Diacritic */
Extender = UCD_PROPERTY_EXTENDER, /**< @brief Extender */
Other_Lowercase = UCD_PROPERTY_OTHER_LOWERCASE, /**< @brief Other_Lowercase */
Other_Uppercase = UCD_PROPERTY_OTHER_UPPERCASE, /**< @brief Other_Uppercase */
Noncharacter_Code_Point = UCD_PROPERTY_NONCHARACTER_CODE_POINT, /**< @brief Noncharacter_Code_Point */
Other_Grapheme_Extend = UCD_PROPERTY_OTHER_GRAPHEME_EXTEND, /**< @brief Other_Grapheme_Extend */
IDS_Binary_Operator = UCD_PROPERTY_IDS_BINARY_OPERATOR, /**< @brief IDS_Binary_Operator */
IDS_Trinary_Operator = UCD_PROPERTY_IDS_TRINARY_OPERATOR, /**< @brief IDS_Trinary_Operator */
Radical = UCD_PROPERTY_RADICAL, /**< @brief Radical */
Unified_Ideograph = UCD_PROPERTY_UNIFIED_IDEOGRAPH, /**< @brief Unified_Ideograph */
Other_Default_Ignorable_Code_Point = UCD_PROPERTY_OTHER_DEFAULT_IGNORABLE_CODE_POINT, /**< @brief Other_Default_Ignorable_Code_Point */
Deprecated = UCD_PROPERTY_DEPRECATED, /**< @brief Deprecated */
Soft_Dotted = UCD_PROPERTY_SOFT_DOTTED, /**< @brief Soft_Dotted */
Logical_Order_Exception = UCD_PROPERTY_LOGICAL_ORDER_EXCEPTION, /**< @brief Logical_Order_Exception */
Other_ID_Start = UCD_PROPERTY_OTHER_ID_START, /**< @brief Other_ID_Start */
Other_ID_Continue = UCD_PROPERTY_OTHER_ID_CONTINUE, /**< @brief Other_ID_Continue */
Sentence_Terminal = UCD_PROPERTY_SENTENCE_TERMINAL, /**< @brief Sentence_Terminal */
Variation_Selector = UCD_PROPERTY_VARIATION_SELECTOR, /**< @brief Variation_Selector */
Pattern_White_Space = UCD_PROPERTY_PATTERN_WHITE_SPACE, /**< @brief Pattern_White_Space */
Pattern_Syntax = UCD_PROPERTY_PATTERN_SYNTAX, /**< @brief Pattern_Syntax */
Prepended_Concatenation_Mark = UCD_PROPERTY_PREPENDED_CONCATENATION_MARK, /**< @brief Prepended_Concatenation_Mark */
Emoji = UCD_PROPERTY_EMOJI, /**< @brief Emoji */
Emoji_Presentation = UCD_PROPERTY_EMOJI_PRESENTATION, /**< @brief Emoji_Presentation */
Emoji_Modifier = UCD_PROPERTY_EMOJI_MODIFIER, /**< @brief Emoji_Modifier */
Emoji_Modifier_Base = UCD_PROPERTY_EMOJI_MODIFIER_BASE, /**< @brief Emoji_Modifier_Base */
Regional_Indicator = UCD_PROPERTY_REGIONAL_INDICATOR, /**< @brief Regional_Indicator */
Emoji_Component = UCD_PROPERTY_EMOJI_COMPONENT, /**< @brief Emoji_Component */
Extended_Pictographic = UCD_PROPERTY_EXTENDED_PICTOGRAPHIC, /**< @brief Extended_Pictographic */
};
/** @brief Return the properties of the specified codepoint.
*
* @param c The Unicode codepoint to lookup.
* @param cat The General Category of the codepoint.
* @return The properties associated with the codepoint.
*/
inline property properties(codepoint_t c, category cat)
{
return (property)ucd_properties(c, (ucd_category)cat);
}
/** @brief Is the codepoint in the 'alnum' class?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is in the 'alnum' class, zero otherwise.
*/
inline int isalnum(codepoint_t c)
{
return ucd_isalnum(c);
}
/** @brief Is the codepoint in the 'alpha' class?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is in the 'alpha' class, zero otherwise.
*/
inline int isalpha(codepoint_t c)
{
return ucd_isalpha(c);
}
/** @brief Is the codepoint in the 'blank' class?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is in the 'blank' class, zero otherwise.
*/
inline int isblank(codepoint_t c)
{
return ucd_isblank(c);
}
/** @brief Is the codepoint in the 'cntrl' class?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is in the 'cntrl' class, zero otherwise.
*/
inline int iscntrl(codepoint_t c)
{
return ucd_iscntrl(c);
}
/** @brief Is the codepoint in the 'digit' class?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is in the 'digit' class, zero otherwise.
*/
inline int isdigit(codepoint_t c)
{
return ucd_isdigit(c);
}
/** @brief Is the codepoint in the 'graph' class?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is in the 'graph' class, zero otherwise.
*/
inline int isgraph(codepoint_t c)
{
return ucd_isgraph(c);
}
/** @brief Is the codepoint in the 'lower' class?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is in the 'lower' class, zero otherwise.
*/
inline int islower(codepoint_t c)
{
return ucd_islower(c);
}
/** @brief Is the codepoint in the 'print' class?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is in the 'print' class, zero otherwise.
*/
inline int isprint(codepoint_t c)
{
return ucd_isprint(c);
}
/** @brief Is the codepoint in the 'punct' class?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is in the 'punct' class, zero otherwise.
*/
inline int ispunct(codepoint_t c)
{
return ucd_ispunct(c);
}
/** @brief Is the codepoint in the 'space' class?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is in the 'space' class, zero otherwise.
*/
inline int isspace(codepoint_t c)
{
return ucd_isspace(c);
}
/** @brief Is the codepoint in the 'upper' class?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is in the 'upper' class, zero otherwise.
*/
inline int isupper(codepoint_t c)
{
return ucd_isupper(c);
}
/** @brief Is the codepoint in the 'xdigit' class?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is in the 'xdigit' class, zero otherwise.
*/
inline int isxdigit(codepoint_t c)
{
return ucd_isxdigit(c);
}
/** @brief Convert the Unicode codepoint to upper-case.
*
* This function only uses the simple case mapping present in the
* UnicodeData file. The data in SpecialCasing requires Unicode
* codepoints to be mapped to multiple codepoints.
*
* @param c The Unicode codepoint to convert.
* @return The upper-case Unicode codepoint for this codepoint, or
* this codepoint if there is no upper-case codepoint.
*/
inline codepoint_t toupper(codepoint_t c)
{
return ucd_toupper(c);
}
/** @brief Convert the Unicode codepoint to lower-case.
*
* This function only uses the simple case mapping present in the
* UnicodeData file. The data in SpecialCasing requires Unicode
* codepoints to be mapped to multiple codepoints.
*
* @param c The Unicode codepoint to convert.
* @return The lower-case Unicode codepoint for this codepoint, or
* this codepoint if there is no upper-case codepoint.
*/
inline codepoint_t tolower(codepoint_t c)
{
return ucd_tolower(c);
}
/** @brief Convert the Unicode codepoint to title-case.
*
* This function only uses the simple case mapping present in the
* UnicodeData file. The data in SpecialCasing requires Unicode
* codepoints to be mapped to multiple codepoints.
*
* @param c The Unicode codepoint to convert.
* @return The title-case Unicode codepoint for this codepoint, or
* this codepoint if there is no upper-case codepoint.
*/
inline codepoint_t totitle(codepoint_t c)
{
return ucd_totitle(c);
}
}
#endif
#endif