ICU 72.1  72.1
uniset.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ***************************************************************************
5 * Copyright (C) 1999-2016, International Business Machines Corporation
6 * and others. All Rights Reserved.
7 ***************************************************************************
8 * Date Name Description
9 * 10/20/99 alan Creation.
10 ***************************************************************************
11 */
12 
13 #ifndef UNICODESET_H
14 #define UNICODESET_H
15 
16 #include "unicode/utypes.h"
17 
18 #if U_SHOW_CPLUSPLUS_API
19 
20 #include "unicode/ucpmap.h"
21 #include "unicode/unifilt.h"
22 #include "unicode/unistr.h"
23 #include "unicode/uset.h"
24 
30 U_NAMESPACE_BEGIN
31 
32 // Forward Declarations.
33 class BMPSet;
34 class ParsePosition;
35 class RBBIRuleScanner;
36 class SymbolTable;
37 class UnicodeSetStringSpan;
38 class UVector;
39 class RuleCharacterIterator;
40 
286 private:
291  static constexpr int32_t INITIAL_CAPACITY = 25;
292  // fFlags constant
293  static constexpr uint8_t kIsBogus = 1; // This set is bogus (i.e. not valid)
294 
295  UChar32* list = stackList; // MUST be terminated with HIGH
296  int32_t capacity = INITIAL_CAPACITY; // capacity of list
297  int32_t len = 1; // length of list used; 1 <= len <= capacity
298  uint8_t fFlags = 0; // Bit flag (see constants above)
299 
300  BMPSet *bmpSet = nullptr; // The set is frozen iff either bmpSet or stringSpan is not NULL.
301  UChar32* buffer = nullptr; // internal buffer, may be NULL
302  int32_t bufferCapacity = 0; // capacity of buffer
303 
313  char16_t *pat = nullptr;
314  int32_t patLen = 0;
315 
316  UVector* strings = nullptr; // maintained in sorted order
317  UnicodeSetStringSpan *stringSpan = nullptr;
318 
324  UChar32 stackList[INITIAL_CAPACITY];
325 
326 public:
336  inline UBool isBogus(void) const;
337 
354  void setToBogus();
355 
356 public:
357 
358  enum {
363  MIN_VALUE = 0,
364 
369  MAX_VALUE = 0x10ffff
370  };
371 
372  //----------------------------------------------------------------
373  // Constructors &c
374  //----------------------------------------------------------------
375 
376 public:
377 
382  UnicodeSet();
383 
392  UnicodeSet(UChar32 start, UChar32 end);
393 
394 #ifndef U_HIDE_INTERNAL_API
395 
399  kSerialized /* result of serialize() */
400  };
401 
412  UnicodeSet(const uint16_t buffer[], int32_t bufferLen,
413  ESerialization serialization, UErrorCode &status);
414 #endif /* U_HIDE_INTERNAL_API */
415 
424  UnicodeSet(const UnicodeString& pattern,
425  UErrorCode& status);
426 
427 #ifndef U_HIDE_INTERNAL_API
428 
440  UnicodeSet(const UnicodeString& pattern,
441  uint32_t options,
442  const SymbolTable* symbols,
443  UErrorCode& status);
444 #endif /* U_HIDE_INTERNAL_API */
445 
459  UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
460  uint32_t options,
461  const SymbolTable* symbols,
462  UErrorCode& status);
463 
468  UnicodeSet(const UnicodeSet& o);
469 
474  virtual ~UnicodeSet();
475 
481  UnicodeSet& operator=(const UnicodeSet& o);
482 
494  virtual bool operator==(const UnicodeSet& o) const;
495 
501  inline bool operator!=(const UnicodeSet& o) const;
502 
512  virtual UnicodeSet* clone() const override;
513 
521  virtual int32_t hashCode(void) const;
522 
531  inline static UnicodeSet *fromUSet(USet *uset);
532 
541  inline static const UnicodeSet *fromUSet(const USet *uset);
542 
550  inline USet *toUSet();
551 
552 
560  inline const USet * toUSet() const;
561 
562 
563  //----------------------------------------------------------------
564  // Freezable API
565  //----------------------------------------------------------------
566 
575  inline UBool isFrozen() const;
576 
590  UnicodeSet *freeze();
591 
600  UnicodeSet *cloneAsThawed() const;
601 
602  //----------------------------------------------------------------
603  // Public API
604  //----------------------------------------------------------------
605 
615  UnicodeSet& set(UChar32 start, UChar32 end);
616 
622  static UBool resemblesPattern(const UnicodeString& pattern,
623  int32_t pos);
624 
637  UnicodeSet& applyPattern(const UnicodeString& pattern,
638  UErrorCode& status);
639 
640 #ifndef U_HIDE_INTERNAL_API
641 
657  UnicodeSet& applyPattern(const UnicodeString& pattern,
658  uint32_t options,
659  const SymbolTable* symbols,
660  UErrorCode& status);
661 #endif /* U_HIDE_INTERNAL_API */
662 
694  UnicodeSet& applyPattern(const UnicodeString& pattern,
695  ParsePosition& pos,
696  uint32_t options,
697  const SymbolTable* symbols,
698  UErrorCode& status);
699 
713  virtual UnicodeString& toPattern(UnicodeString& result,
714  UBool escapeUnprintable = false) const override;
715 
738  UnicodeSet& applyIntPropertyValue(UProperty prop,
739  int32_t value,
740  UErrorCode& ec);
741 
771  UnicodeSet& applyPropertyAlias(const UnicodeString& prop,
772  const UnicodeString& value,
773  UErrorCode& ec);
774 
787  virtual int32_t size(void) const;
788 
795  virtual UBool isEmpty(void) const;
796 
801  UBool hasStrings() const;
802 
810  virtual UBool contains(UChar32 c) const override;
811 
820  virtual UBool contains(UChar32 start, UChar32 end) const;
821 
829  UBool contains(const UnicodeString& s) const;
830 
838  virtual UBool containsAll(const UnicodeSet& c) const;
839 
847  UBool containsAll(const UnicodeString& s) const;
848 
857  UBool containsNone(UChar32 start, UChar32 end) const;
858 
866  UBool containsNone(const UnicodeSet& c) const;
867 
875  UBool containsNone(const UnicodeString& s) const;
876 
885  inline UBool containsSome(UChar32 start, UChar32 end) const;
886 
894  inline UBool containsSome(const UnicodeSet& s) const;
895 
903  inline UBool containsSome(const UnicodeString& s) const;
904 
923  int32_t span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const;
924 
937  inline int32_t span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const;
938 
956  int32_t spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const;
957 
971  inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const;
972 
991  int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
992 
1010  int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
1011 
1016  virtual UMatchDegree matches(const Replaceable& text,
1017  int32_t& offset,
1018  int32_t limit,
1019  UBool incremental) override;
1020 
1021 private:
1044  static int32_t matchRest(const Replaceable& text,
1045  int32_t start, int32_t limit,
1046  const UnicodeString& s);
1047 
1057  int32_t findCodePoint(UChar32 c) const;
1058 
1059 public:
1060 
1068  virtual void addMatchSetTo(UnicodeSet& toUnionTo) const override;
1069 
1078  int32_t indexOf(UChar32 c) const;
1079 
1095  UChar32 charAt(int32_t index) const;
1096 
1111  virtual UnicodeSet& add(UChar32 start, UChar32 end);
1112 
1123  UnicodeSet& add(UChar32 c);
1124 
1136  UnicodeSet& add(const UnicodeString& s);
1137 
1138  private:
1144  static int32_t getSingleCP(const UnicodeString& s);
1145 
1146  void _add(const UnicodeString& s);
1147 
1148  public:
1157  UnicodeSet& addAll(const UnicodeString& s);
1158 
1166  UnicodeSet& retainAll(const UnicodeString& s);
1167 
1175  UnicodeSet& complementAll(const UnicodeString& s);
1176 
1184  UnicodeSet& removeAll(const UnicodeString& s);
1185 
1194  static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s);
1195 
1196 
1204  static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s);
1205 
1217  virtual UnicodeSet& retain(UChar32 start, UChar32 end);
1218 
1219 
1228  UnicodeSet& retain(UChar32 c);
1229 
1240  UnicodeSet& retain(const UnicodeString &s);
1241 
1255  virtual UnicodeSet& remove(UChar32 start, UChar32 end);
1256 
1267  UnicodeSet& remove(UChar32 c);
1268 
1278  UnicodeSet& remove(const UnicodeString& s);
1279 
1292  virtual UnicodeSet& complement();
1293 
1306  virtual UnicodeSet& complement(UChar32 start, UChar32 end);
1307 
1318  UnicodeSet& complement(UChar32 c);
1319 
1329  UnicodeSet& complement(const UnicodeString& s);
1330 
1343  virtual UnicodeSet& addAll(const UnicodeSet& c);
1344 
1356  virtual UnicodeSet& retainAll(const UnicodeSet& c);
1357 
1369  virtual UnicodeSet& removeAll(const UnicodeSet& c);
1370 
1381  virtual UnicodeSet& complementAll(const UnicodeSet& c);
1382 
1389  virtual UnicodeSet& clear(void);
1390 
1416  UnicodeSet& closeOver(int32_t attribute);
1417 
1424  virtual UnicodeSet &removeAllStrings();
1425 
1433  virtual int32_t getRangeCount(void) const;
1434 
1442  virtual UChar32 getRangeStart(int32_t index) const;
1443 
1451  virtual UChar32 getRangeEnd(int32_t index) const;
1452 
1501  int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const;
1502 
1509  virtual UnicodeSet& compact();
1510 
1522  static UClassID U_EXPORT2 getStaticClassID(void);
1523 
1532  virtual UClassID getDynamicClassID(void) const override;
1533 
1534 private:
1535 
1536  // Private API for the USet API
1537 
1538  friend class USetAccess;
1539 
1540  const UnicodeString* getString(int32_t index) const;
1541 
1542  //----------------------------------------------------------------
1543  // RuleBasedTransliterator support
1544  //----------------------------------------------------------------
1545 
1546 private:
1547 
1553  virtual UBool matchesIndexValue(uint8_t v) const override;
1554 
1555 private:
1556  friend class RBBIRuleScanner;
1557 
1558  //----------------------------------------------------------------
1559  // Implementation: Clone as thawed (see ICU4J Freezable)
1560  //----------------------------------------------------------------
1561 
1562  UnicodeSet(const UnicodeSet& o, UBool /* asThawed */);
1563  UnicodeSet& copyFrom(const UnicodeSet& o, UBool asThawed);
1564 
1565  //----------------------------------------------------------------
1566  // Implementation: Pattern parsing
1567  //----------------------------------------------------------------
1568 
1569  void applyPatternIgnoreSpace(const UnicodeString& pattern,
1570  ParsePosition& pos,
1571  const SymbolTable* symbols,
1572  UErrorCode& status);
1573 
1574  void applyPattern(RuleCharacterIterator& chars,
1575  const SymbolTable* symbols,
1576  UnicodeString& rebuiltPat,
1577  uint32_t options,
1578  UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
1579  int32_t depth,
1580  UErrorCode& ec);
1581 
1582  //----------------------------------------------------------------
1583  // Implementation: Utility methods
1584  //----------------------------------------------------------------
1585 
1586  static int32_t nextCapacity(int32_t minCapacity);
1587 
1588  bool ensureCapacity(int32_t newLen);
1589 
1590  bool ensureBufferCapacity(int32_t newLen);
1591 
1592  void swapBuffers(void);
1593 
1594  UBool allocateStrings(UErrorCode &status);
1595  int32_t stringsSize() const;
1596  UBool stringsContains(const UnicodeString &s) const;
1597 
1598  UnicodeString& _toPattern(UnicodeString& result,
1599  UBool escapeUnprintable) const;
1600 
1601  UnicodeString& _generatePattern(UnicodeString& result,
1602  UBool escapeUnprintable) const;
1603 
1604  static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable);
1605 
1606  static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
1607 
1608  static void _appendToPat(UnicodeString &result, UChar32 start, UChar32 end,
1609  UBool escapeUnprintable);
1610 
1611  //----------------------------------------------------------------
1612  // Implementation: Fundamental operators
1613  //----------------------------------------------------------------
1614 
1615  void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity);
1616 
1617  void add(const UChar32* other, int32_t otherLen, int8_t polarity);
1618 
1619  void retain(const UChar32* other, int32_t otherLen, int8_t polarity);
1620 
1626  static UBool resemblesPropertyPattern(const UnicodeString& pattern,
1627  int32_t pos);
1628 
1629  static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
1630  int32_t iterOpts);
1631 
1671  UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,
1672  ParsePosition& ppos,
1673  UErrorCode &ec);
1674 
1675  void applyPropertyPattern(RuleCharacterIterator& chars,
1676  UnicodeString& rebuiltPat,
1677  UErrorCode& ec);
1678 
1683  typedef UBool (*Filter)(UChar32 codePoint, void* context);
1684 
1694  void applyFilter(Filter filter,
1695  void* context,
1696  const UnicodeSet* inclusions,
1697  UErrorCode &status);
1698 
1702  void setPattern(const UnicodeString& newPat) {
1703  setPattern(newPat.getBuffer(), newPat.length());
1704  }
1705  void setPattern(const char16_t *newPat, int32_t newPatLen);
1709  void releasePattern();
1710 
1711  friend class UnicodeSetIterator;
1712 };
1713 
1714 
1715 
1716 inline bool UnicodeSet::operator!=(const UnicodeSet& o) const {
1717  return !operator==(o);
1718 }
1719 
1720 inline UBool UnicodeSet::isFrozen() const {
1721  return (UBool)(bmpSet!=NULL || stringSpan!=NULL);
1722 }
1723 
1724 inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
1725  return !containsNone(start, end);
1726 }
1727 
1728 inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const {
1729  return !containsNone(s);
1730 }
1731 
1732 inline UBool UnicodeSet::containsSome(const UnicodeString& s) const {
1733  return !containsNone(s);
1734 }
1735 
1736 inline UBool UnicodeSet::isBogus() const {
1737  return (UBool)(fFlags & kIsBogus);
1738 }
1739 
1740 inline UnicodeSet *UnicodeSet::fromUSet(USet *uset) {
1741  return reinterpret_cast<UnicodeSet *>(uset);
1742 }
1743 
1744 inline const UnicodeSet *UnicodeSet::fromUSet(const USet *uset) {
1745  return reinterpret_cast<const UnicodeSet *>(uset);
1746 }
1747 
1748 inline USet *UnicodeSet::toUSet() {
1749  return reinterpret_cast<USet *>(this);
1750 }
1751 
1752 inline const USet *UnicodeSet::toUSet() const {
1753  return reinterpret_cast<const USet *>(this);
1754 }
1755 
1756 inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const {
1757  int32_t sLength=s.length();
1758  if(start<0) {
1759  start=0;
1760  } else if(start>sLength) {
1761  start=sLength;
1762  }
1763  return start+span(s.getBuffer()+start, sLength-start, spanCondition);
1764 }
1765 
1766 inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const {
1767  int32_t sLength=s.length();
1768  if(limit<0) {
1769  limit=0;
1770  } else if(limit>sLength) {
1771  limit=sLength;
1772  }
1773  return spanBack(s.getBuffer(), limit, spanCondition);
1774 }
1775 
1776 U_NAMESPACE_END
1777 
1778 #endif /* U_SHOW_CPLUSPLUS_API */
1779 
1780 #endif
#define INITIAL_CAPACITY
The initial size of an array if it is unspecified.
Definition: RunArrays.h:32
bool operator!=(const StringPiece &x, const StringPiece &y)
Global operator != for StringPiece.
Definition: stringpiece.h:335
virtual UMatchDegree matches(const Replaceable &text, int32_t &offset, int32_t limit, UBool incremental) override
Implement UnicodeMatcher API.
static UClassID getStaticClassID()
ICU "poor man's RTTI", returns a UClassID for this class.
UMatchDegree
Constants returned by UnicodeMatcher::matches() indicating the degree of match.
Definition: unimatch.h:33
C++ API: Unicode String.
U_EXPORT UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
void * UClassID
UClassID is used to identify classes without using the compiler's RTTI.
Definition: uobject.h:96
C API: This file defines an abstract map from Unicode code points to integer values.
virtual UBool matchesIndexValue(uint8_t v) const =0
Returns true if this matcher will match a character c, where c & 0xFF == v, at offset, in the forward direction (with limit > offset).
C API: Unicode Set.
An interface that defines both lookup protocol and parsing of symbolic names.
Definition: symtable.h:59
virtual UnicodeString & toPattern(UnicodeString &result, UBool escapeUnprintable=false) const =0
Returns a string representation of this matcher.
Replaceable is an abstract base class representing a string of characters that supports the replaceme...
Definition: rep.h:77
UnicodeFilter defines a protocol for selecting a subset of the full range (U+0000 to U+10FFFF) of Uni...
Definition: unifilt.h:65
virtual void addMatchSetTo(UnicodeSet &toUnionTo) const =0
Union the set of all characters that may be matched by this object into the given set...
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:461
#define NULL
Define NULL if necessary, to nullptr for C++ and to ((void *)0) for C.
Definition: utypes.h:188
A mutable set of Unicode characters and multicharacter strings.
Definition: uniset.h:285
USetSpanCondition
Argument values for whether span() and similar functions continue while the current character is cont...
Definition: uset.h:159
UProperty
Selection constants for Unicode properties.
Definition: uchar.h:195
UErrorCode
Standard ICU4C error code type, a substitute for exceptions.
Definition: utypes.h:415
struct USet USet
USet is the C API type corresponding to C++ class UnicodeSet.
Definition: uset.h:50
ParsePosition is a simple class used by Format and its subclasses to keep track of the current positi...
Definition: parsepos.h:52
#define U_FINAL
Defined to the C++11 "final" keyword if available.
Definition: umachine.h:141
char16_t * getBuffer(int32_t minCapacity)
Get a read/write pointer to the internal buffer.
virtual UnicodeFilter * clone() const override=0
Clones this object polymorphically.
Basic definitions for ICU, for both C and C++ APIs.
virtual UBool contains(UChar32 c) const =0
Returns true for characters that are in the selected subset.
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside...
Definition: utypes.h:300
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:295
C++ API: Unicode Filter.
int32_t length(void) const
Return the length of the UnicodeString object.
Definition: unistr.h:3890
int8_t UBool
The ICU boolean type, a signed-byte integer.
Definition: umachine.h:269
virtual UClassID getDynamicClassID(void) const override=0
Returns a unique class ID polymorphically.