| package uniseg |
| |
| import "unicode/utf8" |
| |
// Graphemes is an iterator over the Unicode grapheme clusters (user-perceived
// characters) of a string. For every cluster it also exposes word boundary,
// sentence boundary, and line break information as well as the cluster's
// monospace width.
//
// Create an iterator for a string with [NewGraphemes], then call
// [Graphemes.Next] in a loop until it returns false. After each successful
// call, the accessor methods describe the grapheme cluster the iterator is
// currently positioned on (see examples below).
//
// Graphemes is a convenience wrapper around the [StepString] parser. If only
// parts of this package's functionality are needed, the specialized functions
// whose names start with "First" are almost always faster.
type Graphemes struct {
	// original is the complete input string; remaining is the part of it
	// that has not been parsed yet.
	original, remaining string

	// cluster is the grapheme cluster the iterator currently points to.
	cluster string

	// offset is the byte position of cluster within original.
	offset int

	// boundaries and state are the boundary information and the parser state
	// most recently returned by [StepString]. Two negative state values are
	// used as markers: -1 before the first call to Next and -2 once the
	// iterator has moved past the end.
	boundaries, state int
}
| |
| // NewGraphemes returns a new grapheme cluster iterator. |
| func NewGraphemes(str string) *Graphemes { |
| return &Graphemes{ |
| original: str, |
| remaining: str, |
| state: -1, |
| } |
| } |
| |
| // Next advances the iterator by one grapheme cluster and returns false if no |
| // clusters are left. This function must be called before the first cluster is |
| // accessed. |
| func (g *Graphemes) Next() bool { |
| if len(g.remaining) == 0 { |
| // We're already past the end. |
| g.state = -2 |
| g.cluster = "" |
| return false |
| } |
| g.offset += len(g.cluster) |
| g.cluster, g.remaining, g.boundaries, g.state = StepString(g.remaining, g.state) |
| return true |
| } |
| |
| // Runes returns a slice of runes (code points) which corresponds to the current |
| // grapheme cluster. If the iterator is already past the end or [Graphemes.Next] |
| // has not yet been called, nil is returned. |
| func (g *Graphemes) Runes() []rune { |
| if g.state < 0 { |
| return nil |
| } |
| return []rune(g.cluster) |
| } |
| |
| // Str returns a substring of the original string which corresponds to the |
| // current grapheme cluster. If the iterator is already past the end or |
| // [Graphemes.Next] has not yet been called, an empty string is returned. |
| func (g *Graphemes) Str() string { |
| return g.cluster |
| } |
| |
| // Bytes returns a byte slice which corresponds to the current grapheme cluster. |
| // If the iterator is already past the end or [Graphemes.Next] has not yet been |
| // called, nil is returned. |
| func (g *Graphemes) Bytes() []byte { |
| if g.state < 0 { |
| return nil |
| } |
| return []byte(g.cluster) |
| } |
| |
| // Positions returns the interval of the current grapheme cluster as byte |
| // positions into the original string. The first returned value "from" indexes |
| // the first byte and the second returned value "to" indexes the first byte that |
| // is not included anymore, i.e. str[from:to] is the current grapheme cluster of |
| // the original string "str". If [Graphemes.Next] has not yet been called, both |
| // values are 0. If the iterator is already past the end, both values are 1. |
| func (g *Graphemes) Positions() (int, int) { |
| if g.state == -1 { |
| return 0, 0 |
| } else if g.state == -2 { |
| return 1, 1 |
| } |
| return g.offset, g.offset + len(g.cluster) |
| } |
| |
| // IsWordBoundary returns true if a word ends after the current grapheme |
| // cluster. |
| func (g *Graphemes) IsWordBoundary() bool { |
| if g.state < 0 { |
| return true |
| } |
| return g.boundaries&MaskWord != 0 |
| } |
| |
| // IsSentenceBoundary returns true if a sentence ends after the current |
| // grapheme cluster. |
| func (g *Graphemes) IsSentenceBoundary() bool { |
| if g.state < 0 { |
| return true |
| } |
| return g.boundaries&MaskSentence != 0 |
| } |
| |
| // LineBreak returns whether the line can be broken after the current grapheme |
| // cluster. A value of [LineDontBreak] means the line may not be broken, a value |
| // of [LineMustBreak] means the line must be broken, and a value of |
| // [LineCanBreak] means the line may or may not be broken. |
| func (g *Graphemes) LineBreak() int { |
| if g.state == -1 { |
| return LineDontBreak |
| } |
| if g.state == -2 { |
| return LineMustBreak |
| } |
| return g.boundaries & MaskLine |
| } |
| |
| // Width returns the monospace width of the current grapheme cluster. |
| func (g *Graphemes) Width() int { |
| if g.state < 0 { |
| return 0 |
| } |
| return g.boundaries >> ShiftWidth |
| } |
| |
| // Reset puts the iterator into its initial state such that the next call to |
| // [Graphemes.Next] sets it to the first grapheme cluster again. |
| func (g *Graphemes) Reset() { |
| g.state = -1 |
| g.offset = 0 |
| g.cluster = "" |
| g.remaining = g.original |
| } |
| |
| // GraphemeClusterCount returns the number of user-perceived characters |
| // (grapheme clusters) for the given string. |
| func GraphemeClusterCount(s string) (n int) { |
| state := -1 |
| for len(s) > 0 { |
| _, s, _, state = FirstGraphemeClusterInString(s, state) |
| n++ |
| } |
| return |
| } |
| |
| // ReverseString reverses the given string while observing grapheme cluster |
| // boundaries. |
| func ReverseString(s string) string { |
| str := []byte(s) |
| reversed := make([]byte, len(str)) |
| state := -1 |
| index := len(str) |
| for len(str) > 0 { |
| var cluster []byte |
| cluster, str, _, state = FirstGraphemeCluster(str, state) |
| index -= len(cluster) |
| copy(reversed[index:], cluster) |
| if index <= len(str)/2 { |
| break |
| } |
| } |
| return string(reversed) |
| } |
| |
// shiftGraphemePropState is the number of bits by which a grapheme property
// is shifted to the left to make room for the grapheme parser state in the
// lower bits of a combined state value (state | prop<<shiftGraphemePropState).
const shiftGraphemePropState = 4
| |
| // FirstGraphemeCluster returns the first grapheme cluster found in the given |
| // byte slice according to the rules of [Unicode Standard Annex #29, Grapheme |
| // Cluster Boundaries]. This function can be called continuously to extract all |
| // grapheme clusters from a byte slice, as illustrated in the example below. |
| // |
| // If you don't know the current state, for example when calling the function |
| // for the first time, you must pass -1. For consecutive calls, pass the state |
| // and rest slice returned by the previous call. |
| // |
| // The "rest" slice is the sub-slice of the original byte slice "b" starting |
| // after the last byte of the identified grapheme cluster. If the length of the |
| // "rest" slice is 0, the entire byte slice "b" has been processed. The |
| // "cluster" byte slice is the sub-slice of the input slice containing the |
| // identified grapheme cluster. |
| // |
| // The returned width is the width of the grapheme cluster for most monospace |
| // fonts where a value of 1 represents one character cell. |
| // |
| // Given an empty byte slice "b", the function returns nil values. |
| // |
| // While slightly less convenient than using the Graphemes class, this function |
| // has much better performance and makes no allocations. It lends itself well to |
| // large byte slices. |
| // |
| // [Unicode Standard Annex #29, Grapheme Cluster Boundaries]: http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries |
| func FirstGraphemeCluster(b []byte, state int) (cluster, rest []byte, width, newState int) { |
| // An empty byte slice returns nothing. |
| if len(b) == 0 { |
| return |
| } |
| |
| // Extract the first rune. |
| r, length := utf8.DecodeRune(b) |
| if len(b) <= length { // If we're already past the end, there is nothing else to parse. |
| var prop int |
| if state < 0 { |
| prop = propertyGraphemes(r) |
| } else { |
| prop = state >> shiftGraphemePropState |
| } |
| return b, nil, runeWidth(r, prop), grAny | (prop << shiftGraphemePropState) |
| } |
| |
| // If we don't know the state, determine it now. |
| var firstProp int |
| if state < 0 { |
| state, firstProp, _ = transitionGraphemeState(state, r) |
| } else { |
| firstProp = state >> shiftGraphemePropState |
| } |
| width += runeWidth(r, firstProp) |
| |
| // Transition until we find a boundary. |
| for { |
| var ( |
| prop int |
| boundary bool |
| ) |
| |
| r, l := utf8.DecodeRune(b[length:]) |
| state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r) |
| |
| if boundary { |
| return b[:length], b[length:], width, state | (prop << shiftGraphemePropState) |
| } |
| |
| if firstProp == prExtendedPictographic { |
| if r == vs15 { |
| width = 1 |
| } else if r == vs16 { |
| width = 2 |
| } |
| } else if firstProp != prRegionalIndicator && firstProp != prL { |
| width += runeWidth(r, prop) |
| } |
| |
| length += l |
| if len(b) <= length { |
| return b, nil, width, grAny | (prop << shiftGraphemePropState) |
| } |
| } |
| } |
| |
| // FirstGraphemeClusterInString is like [FirstGraphemeCluster] but its input and |
| // outputs are strings. |
| func FirstGraphemeClusterInString(str string, state int) (cluster, rest string, width, newState int) { |
| // An empty string returns nothing. |
| if len(str) == 0 { |
| return |
| } |
| |
| // Extract the first rune. |
| r, length := utf8.DecodeRuneInString(str) |
| if len(str) <= length { // If we're already past the end, there is nothing else to parse. |
| var prop int |
| if state < 0 { |
| prop = propertyGraphemes(r) |
| } else { |
| prop = state >> shiftGraphemePropState |
| } |
| return str, "", runeWidth(r, prop), grAny | (prop << shiftGraphemePropState) |
| } |
| |
| // If we don't know the state, determine it now. |
| var firstProp int |
| if state < 0 { |
| state, firstProp, _ = transitionGraphemeState(state, r) |
| } else { |
| firstProp = state >> shiftGraphemePropState |
| } |
| width += runeWidth(r, firstProp) |
| |
| // Transition until we find a boundary. |
| for { |
| var ( |
| prop int |
| boundary bool |
| ) |
| |
| r, l := utf8.DecodeRuneInString(str[length:]) |
| state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r) |
| |
| if boundary { |
| return str[:length], str[length:], width, state | (prop << shiftGraphemePropState) |
| } |
| |
| if firstProp == prExtendedPictographic { |
| if r == vs15 { |
| width = 1 |
| } else if r == vs16 { |
| width = 2 |
| } |
| } else if firstProp != prRegionalIndicator && firstProp != prL { |
| width += runeWidth(r, prop) |
| } |
| |
| length += l |
| if len(str) <= length { |
| return str, "", width, grAny | (prop << shiftGraphemePropState) |
| } |
| } |
| } |