package uniseg
| 2 | |
// Property identifiers used by the various parsers of this package. Only the
// properties which this package needs are included.
//
// The values are positional (assigned via iota), so entries must not be
// reordered, inserted, or removed without checking every user of these
// constants. The names appear to follow the property value names of the
// Unicode segmentation, line breaking, and East Asian Width data (presumably
// UAX #29, UAX #14, and UAX #11 -- verify against the code point tables).
const (
	// prXX has the value 0. This is also what the property function yields for
	// a code point that is not contained in the searched table. The original
	// author describes prXX as being the same as prAny; see the note on prAny.
	prXX = 0

	// prAny is described by the original author as having to be 0.
	// NOTE(review): iota is the index of a constant specification within its
	// const block. This is the second specification, so prAny evaluates to 1
	// and therefore differs from prXX. Confirm what is intended before
	// touching this line: every value below depends on its position.
	prAny = iota

	// The grapheme properties start here. They must come first (and thus get
	// the smallest values) to reduce the number of bits stored in the state
	// vector.
	prPrepend
	prCR
	prLF
	prControl
	prExtend
	prRegionalIndicator
	prSpacingMark
	prL
	prV
	prT
	prLV
	prLVT
	prZWJ
	prExtendedPictographic
	prNewline
	prWSegSpace
	prDoubleQuote
	prSingleQuote
	prMidNumLet
	prNumeric
	prMidLetter
	prMidNum
	prExtendNumLet
	prALetter
	prFormat
	prHebrewLetter
	prKatakana
	prSp
	prSTerm
	prClose
	prSContinue
	prATerm
	prUpper
	prLower
	prSep
	prOLetter
	prCM
	prBA
	prBK
	prSP
	prEX
	prQU
	prAL
	prPR
	prPO
	prOP
	prCP
	prIS
	prHY
	prSY
	prNU
	prCL
	prNL
	prGL
	prAI
	prBB
	prHL
	prSA
	prJL
	prJV
	prJT
	prNS
	prZW
	prB2
	prIN
	prWJ
	prID
	prEB
	prCJ
	prH2
	prH3
	prSG
	prCB
	prRI
	prEM
	prN
	prNa
	prA
	prW
	prH
	prF
	prEmojiPresentation
)
| 91 | |
// Identifiers for Unicode General Categories. Only the categories which this
// package needs are included. The values are positional (assigned via iota),
// so the order of the entries must be preserved.
const (
	// gcNone has to evaluate to 0 and must therefore remain the first entry.
	gcNone = iota

	gcCc
	gcZs
	gcPo
	gcSc
	gcPs
	gcPe
	gcSm
	gcPd
	gcNd
	gcLu
	gcSk
	gcPc
	gcLl
	gcSo
	gcLo
	gcPi
	gcCf
	gcNo
	gcPf
	gcLC
	gcLm
	gcMn
	gcMe
	gcMc
	gcNl
	gcZl
	gcZp
	gcCn
	gcCs
	gcCo
)
| 127 | |
// Code points with a special meaning.
const (
	vs15 = 0xFE0E // U+FE0E, Variation Selector-15 (text presentation).
	vs16 = 0xFE0F // U+FE0F, Variation Selector-16 (emoji presentation).
)
| 133 | |
// propertySearch looks up the code point r in dictionary by means of a binary
// search. Every entry describes an inclusive code point range: its first
// element is the start of the range and its second element is the end. The
// entry whose range contains r is returned. If no entry contains r (which
// includes an empty dictionary), an array consisting of zeros is returned.
//
// Because this is a binary search, dictionary is expected to be sorted in
// ascending order and its ranges are expected not to overlap.
func propertySearch[E interface{ [3]int | [4]int }](dictionary []E, r rune) (result E) {
	codePoint := int(r)

	// The candidates are the entries in the half-open index interval [lo, hi).
	lo, hi := 0, len(dictionary)
	for lo < hi {
		mid := lo + (hi-lo)/2
		entry := dictionary[mid]
		switch {
		case codePoint < entry[0]:
			// r lies before this range: continue in the lower half.
			hi = mid
		case codePoint > entry[1]:
			// r lies behind this range: continue in the upper half.
			lo = mid + 1
		default:
			return entry
		}
	}

	// Nothing found, result still holds its zero value.
	return result
}
| 156 | |
| 157 | // property returns the Unicode property value (see constants above) of the |
| 158 | // given code point. |
| 159 | func property(dictionary [][3]int, r rune) int { |
| 160 | return propertySearch(dictionary, r)[2] |
| 161 | } |
| 162 | |
| 163 | // propertyLineBreak returns the Unicode property value and General Category |
| 164 | // (see constants above) of the given code point, as listed in the line break |
| 165 | // code points table, while fast tracking ASCII digits and letters. |
| 166 | func propertyLineBreak(r rune) (property, generalCategory int) { |
| 167 | if r >= 'a' && r <= 'z' { |
| 168 | return prAL, gcLl |
| 169 | } |
| 170 | if r >= 'A' && r <= 'Z' { |
| 171 | return prAL, gcLu |
| 172 | } |
| 173 | if r >= '0' && r <= '9' { |
| 174 | return prNU, gcNd |
| 175 | } |
| 176 | entry := propertySearch(lineBreakCodePoints, r) |
| 177 | return entry[2], entry[3] |
| 178 | } |
| 179 | |
| 180 | // propertyGraphemes returns the Unicode grapheme cluster property value of the |
| 181 | // given code point while fast tracking ASCII characters. |
| 182 | func propertyGraphemes(r rune) int { |
| 183 | if r >= 0x20 && r <= 0x7e { |
| 184 | return prAny |
| 185 | } |
| 186 | if r == 0x0a { |
| 187 | return prLF |
| 188 | } |
| 189 | if r == 0x0d { |
| 190 | return prCR |
| 191 | } |
| 192 | if r >= 0 && r <= 0x1f || r == 0x7f { |
| 193 | return prControl |
| 194 | } |
| 195 | return property(graphemeCodePoints, r) |
| 196 | } |
| 197 | |
| 198 | // propertyEastAsianWidth returns the Unicode East Asian Width property value of |
| 199 | // the given code point while fast tracking ASCII characters. |
| 200 | func propertyEastAsianWidth(r rune) int { |
| 201 | if r >= 0x20 && r <= 0x7e { |
| 202 | return prNa |
| 203 | } |
| 204 | if r >= 0 && r <= 0x1f || r == 0x7f { |
| 205 | return prN |
| 206 | } |
| 207 | return property(eastAsianWidth, r) |
| 208 | } |