...

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 1999-2011, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  utf.h
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 1999sep09
*   created by: Markus W. Scherer
*/

/**
 * \file
 * \brief C API: Code point macros
 *
 * This file defines macros for checking whether a code point is
 * a surrogate or a non-character etc.
 *
 * If U_NO_DEFAULT_INCLUDE_UTF_HEADERS is 0 then utf.h is included by utypes.h
 * and itself includes utf8.h and utf16.h after some
 * common definitions.
 * If U_NO_DEFAULT_INCLUDE_UTF_HEADERS is 1 then each of these headers must be
 * included explicitly if their definitions are used.
 *
 * utf8.h and utf16.h define macros for efficiently getting code points
 * in and out of UTF-8/16 strings.
 * utf16.h macros have "U16_" prefixes.
 * utf8.h defines similar macros with "U8_" prefixes for UTF-8 string handling.
 *
 * ICU mostly processes 16-bit Unicode strings.
 * Most of the time, such strings are well-formed UTF-16.
 * Single, unpaired surrogates must be handled as well, and are treated in ICU
 * like regular code points where possible.
 * (Pairs of surrogate code points are indistinguishable from supplementary
 * code points encoded as pairs of surrogate code units.)
 *
 * In fact, almost all Unicode code points in normal text (>99%)
 * are on the BMP (<=U+ffff) and even <=U+d7ff.
 * ICU functions handle supplementary code points (U+10000..U+10ffff)
 * but are optimized for the much more frequently occurring BMP code points.
 *
 * umachine.h defines UChar to be an unsigned 16-bit integer.
 * Since ICU 59, ICU uses char16_t in C++, UChar only in C,
 * and defines UChar=char16_t by default. See the UChar API docs for details.
 *
 * UChar32 is defined to be a signed 32-bit integer (int32_t), large enough for a 21-bit
 * Unicode code point (Unicode scalar value, 0..0x10ffff) and U_SENTINEL (-1).
 * Before ICU 2.4, the definition of UChar32 was platform-dependent, like
 * the definition of UChar. For details see the documentation for UChar32 itself.
 *
 * utf.h defines a small number of C macros for single Unicode code points.
 * These are simple checks for surrogates and non-characters.
 * For actual Unicode character properties see uchar.h.
 *
 * By default, string operations must be done with error checking in case
 * a string is not well-formed UTF-16 or UTF-8.
 *
 * The U16_ macros detect if a surrogate code unit is unpaired
 * (lead unit without trail unit or vice versa) and just return the unit itself
 * as the code point.
 *
 * The U8_ macros detect illegal byte sequences and return a negative value.
 * Starting with ICU 60, the observable length of a single illegal byte sequence
 * skipped by one of these macros follows the Unicode 6+ recommendation
 * which is consistent with the W3C Encoding Standard.
 *
 * There are ..._OR_FFFD versions of both U16_ and U8_ macros
 * that return U+FFFD for illegal code unit sequences.
 *
 * The regular "safe" macros require that the initial, passed-in string index
 * is within bounds. They only check the index when they read more than one
 * code unit. This is usually done with code similar to the following loop:
 * <pre>while(i<length) {
 *   U16_NEXT(s, i, length, c);
 *   // use c
 * }</pre>
 *
 * When it is safe to assume that text is well-formed UTF-16
 * (does not contain single, unpaired surrogates), then one can use
 * U16_..._UNSAFE macros.
 * These do not check for proper code unit sequences or truncated text and may
 * yield wrong results or even cause a crash if they are used with "malformed"
 * text.
 * In practice, U16_..._UNSAFE macros will produce slightly less code but
 * should not be faster because the processing is only different when a
 * surrogate code unit is detected, which will be rare.
 *
 * Similarly for UTF-8, there are "safe" macros without a suffix,
 * and U8_..._UNSAFE versions.
 * The performance differences are much larger here because UTF-8 provides so
 * many opportunities for malformed sequences.
 * The unsafe UTF-8 macros are entirely implemented inside the macro definitions
 * and are fast, while the safe UTF-8 macros call functions for some complicated cases.
 *
 * Unlike with UTF-16, malformed sequences cannot be expressed with distinct
 * code point values (0..U+10ffff). They are indicated with negative values instead.
 *
 * For more information see the ICU User Guide Strings chapter
 * (https://unicode-org.github.io/icu/userguide/strings).
 *
 * <em>Usage:</em>
 * ICU coding guidelines for if() statements should be followed when using these macros.
 * Compound statements (curly braces {}) must be used for if-else-while...
 * bodies, and all macro statements should be terminated with a semicolon.
 *
 * @stable ICU 2.4
 */

#ifndef __UTF_H__
#define __UTF_H__

#include "unicode/umachine.h"
/* include the utfXX.h after the following definitions */

/* single-code point definitions -------------------------------------------- */

/**
 * Tests whether a code point is one of the Unicode noncharacters:
 * U+fdd0..U+fdef, or a value up to U+10ffff whose low 16 bits are
 * fffe or ffff.
 *
 * The argument is expanded several times, so it should not have side effects.
 *
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U_IS_UNICODE_NONCHAR(c) \
    (0xfdd0<=(c) && (c)<=0x10ffff && \
     ((c)<=0xfdef || ((c)&0xfffe)==0xfffe))

/**
 * Tests whether c is a Unicode code point value (0..U+10ffff)
 * to which a character can be assigned.
 *
 * The following values are rejected:
 * - single surrogate code points (U+d800..U+dfff, 2048 code points)
 * - the last two code points on each plane (U+__fffe and U+__ffff, 34 code points)
 * - U+fdd0..U+fdef (new with Unicode 3.1, 32 code points)
 * - anything outside 0..U+10ffff (negative values or values above U+10ffff)
 *
 * Every code point below U+d800 is a character code point, so that
 * boundary is checked first for performance.
 *
 * The argument is expanded several times, so it should not have side effects.
 *
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U_IS_UNICODE_CHAR(c) \
    ((uint32_t)(c)<=0xd7ff || \
        ((c)>0xdfff && (c)<=0x10ffff && !U_IS_UNICODE_NONCHAR(c)))

/**
 * Tests whether a code point lies on the Basic Multilingual Plane
 * (U+0000..U+ffff).
 * The value is compared as an unsigned 32-bit integer, so negative
 * inputs yield FALSE.
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.8
 */
#define U_IS_BMP(c) ((uint32_t)(c)<0x10000)

/**
 * Is this code point a supplementary code point (U+10000..U+10ffff)?
 *
 * The argument is converted to uint32_t before 0x10000 is subtracted, so the
 * subtraction cannot overflow a signed type. Values below U+10000
 * (including negative ones) wrap around to large unsigned numbers and
 * fail the single range comparison.
 *
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.8
 */
#define U_IS_SUPPLEMENTARY(c) ((uint32_t)((uint32_t)(c)-0x10000)<=0xfffff)
 
/**
 * Tests whether a code point is a lead (high) surrogate, U+d800..U+dbff.
 * All bits above the low ten must match those of 0xd800.
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U_IS_LEAD(c) (((uint32_t)(c)>>10)==0x36)

/**
 * Tests whether a code point is a trail (low) surrogate, U+dc00..U+dfff.
 * All bits above the low ten must match those of 0xdc00.
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U_IS_TRAIL(c) (((uint32_t)(c)>>10)==0x37)

/**
 * Tests whether a code point is any surrogate, lead or trail
 * (U+d800..U+dfff).
 * All bits above the low eleven must match those of 0xd800.
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U_IS_SURROGATE(c) (((uint32_t)(c)>>11)==0x1b)

/**
 * Given a code point already known to be a surrogate (U_IS_SURROGATE(c)),
 * tests whether it is a lead surrogate.
 * Only bit 10 (0x400) is examined, so the result is meaningful only
 * for surrogate input.
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U_IS_SURROGATE_LEAD(c) (!((c)&0x400))

/**
 * Given a code point already known to be a surrogate (U_IS_SURROGATE(c)),
 * tests whether it is a trail surrogate.
 * Only bit 10 (0x400) is examined, so the result is meaningful only
 * for surrogate input.
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 4.2
 */
#define U_IS_SURROGATE_TRAIL(c) (((c)&0x400)==0x400)

/* include the utfXX.h ------------------------------------------------------ */

#if !U_NO_DEFAULT_INCLUDE_UTF_HEADERS

#include "unicode/utf8.h"
#include "unicode/utf16.h"

/* utf_old.h contains deprecated, pre-ICU 2.4 definitions */
#include "unicode/utf_old.h"

#endif  /* !U_NO_DEFAULT_INCLUDE_UTF_HEADERS */

#endif  /* __UTF_H__ */

.	Edit
..	Edit
alphaindex.h	Edit
appendable.h	Edit
basictz.h	Edit
brkiter.h	Edit
bytestream.h	Edit
bytestrie.h	Edit
bytestriebuilder.h	Edit
calendar.h	Edit
caniter.h	Edit
casemap.h	Edit
char16ptr.h	Edit
chariter.h	Edit
choicfmt.h	Edit
coleitr.h	Edit
coll.h	Edit
compactdecimalformat.h	Edit
curramt.h	Edit
currpinf.h	Edit
currunit.h	Edit
datefmt.h	Edit
dbbi.h	Edit
dcfmtsym.h	Edit
decimfmt.h	Edit
docmain.h	Edit
dtfmtsym.h	Edit
dtintrv.h	Edit
dtitvfmt.h	Edit
dtitvinf.h	Edit
dtptngen.h	Edit
dtrule.h	Edit
edits.h	Edit
enumset.h	Edit
errorcode.h	Edit
fieldpos.h	Edit
filteredbrk.h	Edit
fmtable.h	Edit
format.h	Edit
formattedvalue.h	Edit
fpositer.h	Edit
gender.h	Edit
gregocal.h	Edit
icudataver.h	Edit
icuplug.h	Edit
idna.h	Edit
listformatter.h	Edit
localebuilder.h	Edit
localematcher.h	Edit
localpointer.h	Edit
locdspnm.h	Edit
locid.h	Edit
measfmt.h	Edit
measunit.h	Edit
measure.h	Edit
messagepattern.h	Edit
msgfmt.h	Edit
normalizer2.h	Edit
normlzr.h	Edit
nounit.h	Edit
numberformatter.h	Edit
numberrangeformatter.h	Edit
numfmt.h	Edit
numsys.h	Edit
parseerr.h	Edit
parsepos.h	Edit
platform.h	Edit
plurfmt.h	Edit
plurrule.h	Edit
ptypes.h	Edit
putil.h	Edit
rbbi.h	Edit
rbnf.h	Edit
rbtz.h	Edit
regex.h	Edit
region.h	Edit
reldatefmt.h	Edit
rep.h	Edit
resbund.h	Edit
schriter.h	Edit
scientificnumberformatter.h	Edit
search.h	Edit
selfmt.h	Edit
simpleformatter.h	Edit
simpletz.h	Edit
smpdtfmt.h	Edit
sortkey.h	Edit
std_string.h	Edit
strenum.h	Edit
stringoptions.h	Edit
stringpiece.h	Edit
stringtriebuilder.h	Edit
stsearch.h	Edit
symtable.h	Edit
tblcoll.h	Edit
timezone.h	Edit
tmunit.h	Edit
tmutamt.h	Edit
tmutfmt.h	Edit
translit.h	Edit
tzfmt.h	Edit
tznames.h	Edit
tzrule.h	Edit
tztrans.h	Edit
ubidi.h	Edit
ubiditransform.h	Edit
ubrk.h	Edit
ucal.h	Edit
ucasemap.h	Edit
ucat.h	Edit
uchar.h	Edit
ucharstrie.h	Edit
ucharstriebuilder.h	Edit
uchriter.h	Edit
uclean.h	Edit
ucnv.h	Edit
ucnv_cb.h	Edit
ucnv_err.h	Edit
ucnvsel.h	Edit
ucol.h	Edit
ucoleitr.h	Edit
uconfig.h	Edit
ucpmap.h	Edit
ucptrie.h	Edit
ucsdet.h	Edit
ucurr.h	Edit
udat.h	Edit
udata.h	Edit
udateintervalformat.h	Edit
udatpg.h	Edit
udisplaycontext.h	Edit
uenum.h	Edit
ufieldpositer.h	Edit
uformattable.h	Edit
uformattedvalue.h	Edit
ugender.h	Edit
uidna.h	Edit
uiter.h	Edit
uldnames.h	Edit
ulistformatter.h	Edit
uloc.h	Edit
ulocdata.h	Edit
umachine.h	Edit
umisc.h	Edit
umsg.h	Edit
umutablecptrie.h	Edit
unifilt.h	Edit
unifunct.h	Edit
unimatch.h	Edit
unirepl.h	Edit
uniset.h	Edit
unistr.h	Edit
unorm.h	Edit
unorm2.h	Edit
unum.h	Edit
unumberformatter.h	Edit
unumsys.h	Edit
uobject.h	Edit
upluralrules.h	Edit
uregex.h	Edit
uregion.h	Edit
ureldatefmt.h	Edit
urename.h	Edit
urep.h	Edit
ures.h	Edit
uscript.h	Edit
usearch.h	Edit
uset.h	Edit
usetiter.h	Edit
ushape.h	Edit
uspoof.h	Edit
usprep.h	Edit
ustdio.h	Edit
ustream.h	Edit
ustring.h	Edit
ustringtrie.h	Edit
utext.h	Edit
utf.h	Edit
utf16.h	Edit
utf32.h	Edit
utf8.h	Edit
utf_old.h	Edit
utmscale.h	Edit
utrace.h	Edit
utrans.h	Edit
utypes.h	Edit
uvernum.h	Edit
uversion.h	Edit
vtzone.h	Edit