| Abhay Kumar | a61c522 | 2025-11-10 07:32:50 +0000 | [diff] [blame] | 1 | package uniseg |
| 2 | |
| 3 | import "unicode/utf8" |
| 4 | |
| 5 | // Graphemes implements an iterator over Unicode grapheme clusters, or |
| 6 | // user-perceived characters. While iterating, it also provides information |
| 7 | // about word boundaries, sentence boundaries, line breaks, and monospace |
| 8 | // character widths. |
| 9 | // |
| 10 | // After constructing the class via [NewGraphemes] for a given string "str", |
| 11 | // [Graphemes.Next] is called for every grapheme cluster in a loop until it |
| 12 | // returns false. Inside the loop, information about the grapheme cluster as |
| 13 | // well as boundary information and character width is available via the various |
| 14 | // methods (see examples below). |
| 15 | // |
| 16 | // This class basically wraps the [StepString] parser and provides a convenient |
| 17 | // interface to it. If you are only interested in some parts of this package's |
| 18 | // functionality, using the specialized functions starting with "First" is |
| 19 | // almost always faster. |
| 20 | type Graphemes struct { |
| 21 | // The original string. |
| 22 | original string |
| 23 | |
| 24 | // The remaining string to be parsed. |
| 25 | remaining string |
| 26 | |
| 27 | // The current grapheme cluster. |
| 28 | cluster string |
| 29 | |
| 30 | // The byte offset of the current grapheme cluster relative to the original |
| 31 | // string. |
| 32 | offset int |
| 33 | |
| 34 | // The current boundary information of the [Step] parser. |
| 35 | boundaries int |
| 36 | |
| 37 | // The current state of the [Step] parser. |
| 38 | state int |
| 39 | } |
| 40 | |
| 41 | // NewGraphemes returns a new grapheme cluster iterator. |
| 42 | func NewGraphemes(str string) *Graphemes { |
| 43 | return &Graphemes{ |
| 44 | original: str, |
| 45 | remaining: str, |
| 46 | state: -1, |
| 47 | } |
| 48 | } |
| 49 | |
| 50 | // Next advances the iterator by one grapheme cluster and returns false if no |
| 51 | // clusters are left. This function must be called before the first cluster is |
| 52 | // accessed. |
| 53 | func (g *Graphemes) Next() bool { |
| 54 | if len(g.remaining) == 0 { |
| 55 | // We're already past the end. |
| 56 | g.state = -2 |
| 57 | g.cluster = "" |
| 58 | return false |
| 59 | } |
| 60 | g.offset += len(g.cluster) |
| 61 | g.cluster, g.remaining, g.boundaries, g.state = StepString(g.remaining, g.state) |
| 62 | return true |
| 63 | } |
| 64 | |
| 65 | // Runes returns a slice of runes (code points) which corresponds to the current |
| 66 | // grapheme cluster. If the iterator is already past the end or [Graphemes.Next] |
| 67 | // has not yet been called, nil is returned. |
| 68 | func (g *Graphemes) Runes() []rune { |
| 69 | if g.state < 0 { |
| 70 | return nil |
| 71 | } |
| 72 | return []rune(g.cluster) |
| 73 | } |
| 74 | |
| 75 | // Str returns a substring of the original string which corresponds to the |
| 76 | // current grapheme cluster. If the iterator is already past the end or |
| 77 | // [Graphemes.Next] has not yet been called, an empty string is returned. |
| 78 | func (g *Graphemes) Str() string { |
| 79 | return g.cluster |
| 80 | } |
| 81 | |
| 82 | // Bytes returns a byte slice which corresponds to the current grapheme cluster. |
| 83 | // If the iterator is already past the end or [Graphemes.Next] has not yet been |
| 84 | // called, nil is returned. |
| 85 | func (g *Graphemes) Bytes() []byte { |
| 86 | if g.state < 0 { |
| 87 | return nil |
| 88 | } |
| 89 | return []byte(g.cluster) |
| 90 | } |
| 91 | |
| 92 | // Positions returns the interval of the current grapheme cluster as byte |
| 93 | // positions into the original string. The first returned value "from" indexes |
| 94 | // the first byte and the second returned value "to" indexes the first byte that |
| 95 | // is not included anymore, i.e. str[from:to] is the current grapheme cluster of |
| 96 | // the original string "str". If [Graphemes.Next] has not yet been called, both |
| 97 | // values are 0. If the iterator is already past the end, both values are 1. |
| 98 | func (g *Graphemes) Positions() (int, int) { |
| 99 | if g.state == -1 { |
| 100 | return 0, 0 |
| 101 | } else if g.state == -2 { |
| 102 | return 1, 1 |
| 103 | } |
| 104 | return g.offset, g.offset + len(g.cluster) |
| 105 | } |
| 106 | |
| 107 | // IsWordBoundary returns true if a word ends after the current grapheme |
| 108 | // cluster. |
| 109 | func (g *Graphemes) IsWordBoundary() bool { |
| 110 | if g.state < 0 { |
| 111 | return true |
| 112 | } |
| 113 | return g.boundaries&MaskWord != 0 |
| 114 | } |
| 115 | |
| 116 | // IsSentenceBoundary returns true if a sentence ends after the current |
| 117 | // grapheme cluster. |
| 118 | func (g *Graphemes) IsSentenceBoundary() bool { |
| 119 | if g.state < 0 { |
| 120 | return true |
| 121 | } |
| 122 | return g.boundaries&MaskSentence != 0 |
| 123 | } |
| 124 | |
| 125 | // LineBreak returns whether the line can be broken after the current grapheme |
| 126 | // cluster. A value of [LineDontBreak] means the line may not be broken, a value |
| 127 | // of [LineMustBreak] means the line must be broken, and a value of |
| 128 | // [LineCanBreak] means the line may or may not be broken. |
| 129 | func (g *Graphemes) LineBreak() int { |
| 130 | if g.state == -1 { |
| 131 | return LineDontBreak |
| 132 | } |
| 133 | if g.state == -2 { |
| 134 | return LineMustBreak |
| 135 | } |
| 136 | return g.boundaries & MaskLine |
| 137 | } |
| 138 | |
| 139 | // Width returns the monospace width of the current grapheme cluster. |
| 140 | func (g *Graphemes) Width() int { |
| 141 | if g.state < 0 { |
| 142 | return 0 |
| 143 | } |
| 144 | return g.boundaries >> ShiftWidth |
| 145 | } |
| 146 | |
| 147 | // Reset puts the iterator into its initial state such that the next call to |
| 148 | // [Graphemes.Next] sets it to the first grapheme cluster again. |
| 149 | func (g *Graphemes) Reset() { |
| 150 | g.state = -1 |
| 151 | g.offset = 0 |
| 152 | g.cluster = "" |
| 153 | g.remaining = g.original |
| 154 | } |
| 155 | |
| 156 | // GraphemeClusterCount returns the number of user-perceived characters |
| 157 | // (grapheme clusters) for the given string. |
| 158 | func GraphemeClusterCount(s string) (n int) { |
| 159 | state := -1 |
| 160 | for len(s) > 0 { |
| 161 | _, s, _, state = FirstGraphemeClusterInString(s, state) |
| 162 | n++ |
| 163 | } |
| 164 | return |
| 165 | } |
| 166 | |
| 167 | // ReverseString reverses the given string while observing grapheme cluster |
| 168 | // boundaries. |
| 169 | func ReverseString(s string) string { |
| 170 | str := []byte(s) |
| 171 | reversed := make([]byte, len(str)) |
| 172 | state := -1 |
| 173 | index := len(str) |
| 174 | for len(str) > 0 { |
| 175 | var cluster []byte |
| 176 | cluster, str, _, state = FirstGraphemeCluster(str, state) |
| 177 | index -= len(cluster) |
| 178 | copy(reversed[index:], cluster) |
| 179 | if index <= len(str)/2 { |
| 180 | break |
| 181 | } |
| 182 | } |
| 183 | return string(reversed) |
| 184 | } |
| 185 | |
| 186 | // The number of bits the grapheme property must be shifted to make place for |
| 187 | // grapheme states. |
| 188 | const shiftGraphemePropState = 4 |
| 189 | |
| 190 | // FirstGraphemeCluster returns the first grapheme cluster found in the given |
| 191 | // byte slice according to the rules of [Unicode Standard Annex #29, Grapheme |
| 192 | // Cluster Boundaries]. This function can be called continuously to extract all |
| 193 | // grapheme clusters from a byte slice, as illustrated in the example below. |
| 194 | // |
| 195 | // If you don't know the current state, for example when calling the function |
| 196 | // for the first time, you must pass -1. For consecutive calls, pass the state |
| 197 | // and rest slice returned by the previous call. |
| 198 | // |
| 199 | // The "rest" slice is the sub-slice of the original byte slice "b" starting |
| 200 | // after the last byte of the identified grapheme cluster. If the length of the |
| 201 | // "rest" slice is 0, the entire byte slice "b" has been processed. The |
| 202 | // "cluster" byte slice is the sub-slice of the input slice containing the |
| 203 | // identified grapheme cluster. |
| 204 | // |
| 205 | // The returned width is the width of the grapheme cluster for most monospace |
| 206 | // fonts where a value of 1 represents one character cell. |
| 207 | // |
| 208 | // Given an empty byte slice "b", the function returns nil values. |
| 209 | // |
| 210 | // While slightly less convenient than using the Graphemes class, this function |
| 211 | // has much better performance and makes no allocations. It lends itself well to |
| 212 | // large byte slices. |
| 213 | // |
| 214 | // [Unicode Standard Annex #29, Grapheme Cluster Boundaries]: http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries |
| 215 | func FirstGraphemeCluster(b []byte, state int) (cluster, rest []byte, width, newState int) { |
| 216 | // An empty byte slice returns nothing. |
| 217 | if len(b) == 0 { |
| 218 | return |
| 219 | } |
| 220 | |
| 221 | // Extract the first rune. |
| 222 | r, length := utf8.DecodeRune(b) |
| 223 | if len(b) <= length { // If we're already past the end, there is nothing else to parse. |
| 224 | var prop int |
| 225 | if state < 0 { |
| 226 | prop = propertyGraphemes(r) |
| 227 | } else { |
| 228 | prop = state >> shiftGraphemePropState |
| 229 | } |
| 230 | return b, nil, runeWidth(r, prop), grAny | (prop << shiftGraphemePropState) |
| 231 | } |
| 232 | |
| 233 | // If we don't know the state, determine it now. |
| 234 | var firstProp int |
| 235 | if state < 0 { |
| 236 | state, firstProp, _ = transitionGraphemeState(state, r) |
| 237 | } else { |
| 238 | firstProp = state >> shiftGraphemePropState |
| 239 | } |
| 240 | width += runeWidth(r, firstProp) |
| 241 | |
| 242 | // Transition until we find a boundary. |
| 243 | for { |
| 244 | var ( |
| 245 | prop int |
| 246 | boundary bool |
| 247 | ) |
| 248 | |
| 249 | r, l := utf8.DecodeRune(b[length:]) |
| 250 | state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r) |
| 251 | |
| 252 | if boundary { |
| 253 | return b[:length], b[length:], width, state | (prop << shiftGraphemePropState) |
| 254 | } |
| 255 | |
| 256 | if firstProp == prExtendedPictographic { |
| 257 | if r == vs15 { |
| 258 | width = 1 |
| 259 | } else if r == vs16 { |
| 260 | width = 2 |
| 261 | } |
| 262 | } else if firstProp != prRegionalIndicator && firstProp != prL { |
| 263 | width += runeWidth(r, prop) |
| 264 | } |
| 265 | |
| 266 | length += l |
| 267 | if len(b) <= length { |
| 268 | return b, nil, width, grAny | (prop << shiftGraphemePropState) |
| 269 | } |
| 270 | } |
| 271 | } |
| 272 | |
| 273 | // FirstGraphemeClusterInString is like [FirstGraphemeCluster] but its input and |
| 274 | // outputs are strings. |
| 275 | func FirstGraphemeClusterInString(str string, state int) (cluster, rest string, width, newState int) { |
| 276 | // An empty string returns nothing. |
| 277 | if len(str) == 0 { |
| 278 | return |
| 279 | } |
| 280 | |
| 281 | // Extract the first rune. |
| 282 | r, length := utf8.DecodeRuneInString(str) |
| 283 | if len(str) <= length { // If we're already past the end, there is nothing else to parse. |
| 284 | var prop int |
| 285 | if state < 0 { |
| 286 | prop = propertyGraphemes(r) |
| 287 | } else { |
| 288 | prop = state >> shiftGraphemePropState |
| 289 | } |
| 290 | return str, "", runeWidth(r, prop), grAny | (prop << shiftGraphemePropState) |
| 291 | } |
| 292 | |
| 293 | // If we don't know the state, determine it now. |
| 294 | var firstProp int |
| 295 | if state < 0 { |
| 296 | state, firstProp, _ = transitionGraphemeState(state, r) |
| 297 | } else { |
| 298 | firstProp = state >> shiftGraphemePropState |
| 299 | } |
| 300 | width += runeWidth(r, firstProp) |
| 301 | |
| 302 | // Transition until we find a boundary. |
| 303 | for { |
| 304 | var ( |
| 305 | prop int |
| 306 | boundary bool |
| 307 | ) |
| 308 | |
| 309 | r, l := utf8.DecodeRuneInString(str[length:]) |
| 310 | state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r) |
| 311 | |
| 312 | if boundary { |
| 313 | return str[:length], str[length:], width, state | (prop << shiftGraphemePropState) |
| 314 | } |
| 315 | |
| 316 | if firstProp == prExtendedPictographic { |
| 317 | if r == vs15 { |
| 318 | width = 1 |
| 319 | } else if r == vs16 { |
| 320 | width = 2 |
| 321 | } |
| 322 | } else if firstProp != prRegionalIndicator && firstProp != prL { |
| 323 | width += runeWidth(r, prop) |
| 324 | } |
| 325 | |
| 326 | length += l |
| 327 | if len(str) <= length { |
| 328 | return str, "", width, grAny | (prop << shiftGraphemePropState) |
| 329 | } |
| 330 | } |
| 331 | } |