ICU 72.1
normalizer2.h
Go to the documentation of this file.
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6* Copyright (C) 2009-2013, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: normalizer2.h
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2009nov22
16* created by: Markus W. Scherer
17*/
18
19#ifndef __NORMALIZER2_H__
20#define __NORMALIZER2_H__
21
27#include "unicode/utypes.h"
28
29#if U_SHOW_CPLUSPLUS_API
30
31#if !UCONFIG_NO_NORMALIZATION
32
33#include "unicode/stringpiece.h"
34#include "unicode/uniset.h"
35#include "unicode/unistr.h"
36#include "unicode/unorm2.h"
37
38U_NAMESPACE_BEGIN
39
40class ByteSink;
41
85class U_COMMON_API Normalizer2 : public UObject {
86public:
91 ~Normalizer2();
92
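104 static const Normalizer2 *
105 getNFCInstance(UErrorCode &errorCode);
106
118 static const Normalizer2 *
119 getNFDInstance(UErrorCode &errorCode);
120
132 static const Normalizer2 *
133 getNFKCInstance(UErrorCode &errorCode);
134
146 static const Normalizer2 *
147 getNFKDInstance(UErrorCode &errorCode);
148
160 static const Normalizer2 *
161 getNFKCCasefoldInstance(UErrorCode &errorCode);
162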
184 static const Normalizer2 *
185 getInstance(const char *packageName,
186 const char *name,
187 UNormalization2Mode mode,
188 UErrorCode &errorCode);
189
200 UnicodeString
201 normalize(const UnicodeString &src, UErrorCode &errorCode) const {
202 UnicodeString result;
203 normalize(src, result, errorCode);
204 return result;
205 }
219 virtual UnicodeString &
220 normalize(const UnicodeString &src,
221 UnicodeString &dest,
222 UErrorCode &errorCode) const = 0;
223
246 virtual void
247 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
248 Edits *edits, UErrorCode &errorCode) const;
249
264 virtual UnicodeString &
265 normalizeSecondAndAppend(UnicodeString &first,
266 const UnicodeString &second,
267 UErrorCode &errorCode) const = 0;
282 virtual UnicodeString &
283 append(UnicodeString &first,
284 const UnicodeString &second,
285 UErrorCode &errorCode) const = 0;
286
300 virtual UBool
301 getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
302
327 virtual UBool
328 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
329
345 virtual UChar32
346 composePair(UChar32 a, UChar32 b) const;
347
356 virtual uint8_t
357 getCombiningClass(UChar32 c) const;
358
373 virtual UBool
374 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
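394 virtual UBool
395 isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;
396
397
413 virtual UNormalizationCheckResult
414 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;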
415
438 virtual int32_t
439 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
440
454 virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
455
470 virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
471
485 virtual UBool isInert(UChar32 c) const = 0;
486};
487
499class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
500public:
511 FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
512 norm2(n2), set(filterSet) {}
513
518 ~FilteredNormalizer2();
519
533 virtual UnicodeString &
534 normalize(const UnicodeString &src,
535 UnicodeString &dest,
536 UErrorCode &errorCode) const U_OVERRIDE;
537
560 virtual void
561 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
562 Edits *edits, UErrorCode &errorCode) const U_OVERRIDE;
563
578 virtual UnicodeString &
579 normalizeSecondAndAppend(UnicodeString &first,
580 const UnicodeString &second,
581 UErrorCode &errorCode) const U_OVERRIDE;
596 virtual UnicodeString &
597 append(UnicodeString &first,
598 const UnicodeString &second,
599 UErrorCode &errorCode) const U_OVERRIDE;
600
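612 virtual UBool
613 getDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;
614
626 virtual UBool
627 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;
628
639 virtual UChar32
640 composePair(UChar32 a, UChar32 b) const U_OVERRIDE;
641
650 virtual uint8_t
651 getCombiningClass(UChar32 c) const U_OVERRIDE;
652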
664 virtual UBool
665 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
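685 virtual UBool
686 isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const U_OVERRIDE;
698 virtual UNormalizationCheckResult
699 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
711 virtual int32_t
712 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
713
722 virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE;
723
732 virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE;
733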
741 virtual UBool isInert(UChar32 c) const U_OVERRIDE;
742private:
743 UnicodeString &
744 normalize(const UnicodeString &src,
745 UnicodeString &dest,
746 USetSpanCondition spanCondition,
747 UErrorCode &errorCode) const;
748
749 void
750 normalizeUTF8(uint32_t options, const char *src, int32_t length,
751 ByteSink &sink, Edits *edits,
752 USetSpanCondition spanCondition,
753 UErrorCode &errorCode) const;
754
755 UnicodeString &
756 normalizeSecondAndAppend(UnicodeString &first,
757 const UnicodeString &second,
758 UBool doNormalize,
759 UErrorCode &errorCode) const;
760
761 const Normalizer2 &norm2;
762 const UnicodeSet &set;
763};
764
765U_NAMESPACE_END
766
767#endif // !UCONFIG_NO_NORMALIZATION
768
769#endif /* U_SHOW_CPLUSPLUS_API */
770
771#endif // __NORMALIZER2_H__
A ByteSink can be filled with bytes.
Definition: bytestream.h:53
Records lengths of string edits but not replacement text.
Definition: edits.h:80
Normalization filtered by a UnicodeSet.
Definition: normalizer2.h:499
virtual UnicodeString & normalize(const UnicodeString &src, UnicodeString &dest, UErrorCode &errorCode) const U_OVERRIDE
Writes the normalized form of the source string to the destination string (replacing its contents) and returns the destination string.
virtual UBool isInert(UChar32 c) const U_OVERRIDE
Tests if the character is normalization-inert.
virtual UBool getRawDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE
Gets the raw decomposition mapping of c.
~FilteredNormalizer2()
Destructor.
virtual void normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, Edits *edits, UErrorCode &errorCode) const U_OVERRIDE
Normalizes a UTF-8 string and optionally records how source substrings relate to changed and unchanged result substrings.
virtual UBool isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const U_OVERRIDE
Tests if the UTF-8 string is normalized.
virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE
Tests if the character always has a normalization boundary before it, regardless of context.
virtual UBool getDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE
Gets the decomposition mapping of c.
virtual int32_t spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE
Returns the end of the normalized substring of the input string.
virtual uint8_t getCombiningClass(UChar32 c) const U_OVERRIDE
Gets the combining class of c.
virtual UChar32 composePair(UChar32 a, UChar32 b) const U_OVERRIDE
Performs pairwise composition of a & b and returns the composite if there is one.
virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE
Tests if the character always has a normalization boundary after it, regardless of context.
virtual UNormalizationCheckResult quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE
Tests if the string is normalized.
virtual UBool isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE
Tests if the string is normalized.
FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet)
Constructs a filtered normalizer wrapping any Normalizer2 instance and a filter set.
Definition: normalizer2.h:511
virtual UnicodeString & normalizeSecondAndAppend(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const U_OVERRIDE
Appends the normalized form of the second string to the first string (merging them at the boundary) and returns the first string.
virtual UnicodeString & append(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const U_OVERRIDE
Appends the second string to the first string (merging them at the boundary) and returns the first string.
Unicode normalization functionality for standard Unicode normalization or for using custom mapping tables.
Definition: normalizer2.h:85
static const Normalizer2 * getNFDInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for Unicode NFD normalization.
virtual UBool getDecomposition(UChar32 c, UnicodeString &decomposition) const =0
Gets the decomposition mapping of c.
virtual int32_t spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const =0
Returns the end of the normalized substring of the input string.
virtual UBool isNormalized(const UnicodeString &s, UErrorCode &errorCode) const =0
Tests if the string is normalized.
virtual UBool hasBoundaryBefore(UChar32 c) const =0
Tests if the character always has a normalization boundary before it, regardless of context.
~Normalizer2()
Destructor.
virtual UChar32 composePair(UChar32 a, UChar32 b) const
Performs pairwise composition of a & b and returns the composite if there is one.
static const Normalizer2 * getInstance(const char *packageName, const char *name, UNormalization2Mode mode, UErrorCode &errorCode)
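Returns a Normalizer2 instance which uses the specified data file (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle) and which composes or decomposes text according to the specified mode.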
virtual UnicodeString & append(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const =0
Appends the second string to the first string (merging them at the boundary) and returns the first string.
static const Normalizer2 * getNFKDInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for Unicode NFKD normalization.
virtual UnicodeString & normalize(const UnicodeString &src, UnicodeString &dest, UErrorCode &errorCode) const =0
Writes the normalized form of the source string to the destination string (replacing its contents) and returns the destination string.
static const Normalizer2 * getNFKCCasefoldInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization.
UnicodeString normalize(const UnicodeString &src, UErrorCode &errorCode) const
Returns the normalized form of the source string.
Definition: normalizer2.h:201
virtual UNormalizationCheckResult quickCheck(const UnicodeString &s, UErrorCode &errorCode) const =0
Tests if the string is normalized.
static const Normalizer2 * getNFCInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for Unicode NFC normalization.
virtual UBool hasBoundaryAfter(UChar32 c) const =0
Tests if the character always has a normalization boundary after it, regardless of context.
virtual uint8_t getCombiningClass(UChar32 c) const
Gets the combining class of c.
virtual UBool isInert(UChar32 c) const =0
Tests if the character is normalization-inert.
static const Normalizer2 * getNFKCInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for Unicode NFKC normalization.
virtual void normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, Edits *edits, UErrorCode &errorCode) const
Normalizes a UTF-8 string and optionally records how source substrings relate to changed and unchanged result substrings.
virtual UBool isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const
Tests if the UTF-8 string is normalized.
virtual UBool getRawDecomposition(UChar32 c, UnicodeString &decomposition) const
Gets the raw decomposition mapping of c.
virtual UnicodeString & normalizeSecondAndAppend(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const =0
Appends the normalized form of the second string to the first string (merging them at the boundary) and returns the first string.
A string-like object that points to a sized piece of memory.
Definition: stringpiece.h:60
UObject is the common ICU "boilerplate" class.
Definition: uobject.h:223
A mutable set of Unicode characters and multicharacter strings.
Definition: uniset.h:285
UnicodeString is a string class that stores Unicode characters directly and provides similar functionality as the Java String and StringBuffer/StringBuilder classes.
Definition: unistr.h:296
C++ API: StringPiece: Read-only byte string wrapper class.
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:461
int8_t UBool
The ICU boolean type, a signed-byte integer.
Definition: umachine.h:269
#define U_OVERRIDE
Defined to the C++11 "override" keyword if available.
Definition: umachine.h:130
C++ API: Unicode Set.
C++ API: Unicode String.
C API: New API for Unicode Normalization.
UNormalizationCheckResult
Result values for normalization quick check functions.
Definition: unorm2.h:97
UNormalization2Mode
Constants for normalization modes.
Definition: unorm2.h:48
USetSpanCondition
Argument values for whether span() and similar functions continue while the current character is contained vs. not contained in the set.
Definition: uset.h:159
Basic definitions for ICU, for both C and C++ APIs.
UErrorCode
Standard ICU4C error code type, a substitute for exceptions.
Definition: utypes.h:415
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside.
Definition: utypes.h:300