| Abhay Kumar | a61c522 | 2025-11-10 07:32:50 +0000 | [diff] [blame] | 1 | package uniseg |
| 2 | |
| 3 | import "unicode/utf8" |
| 4 | |
| 5 | // The bit masks used to extract boundary information returned by [Step]. |
| 6 | const ( |
| 7 | MaskLine = 3 |
| 8 | MaskWord = 4 |
| 9 | MaskSentence = 8 |
| 10 | ) |
| 11 | |
| 12 | // The number of bits to shift the boundary information returned by [Step] to |
| 13 | // obtain the monospace width of the grapheme cluster. |
| 14 | const ShiftWidth = 4 |
| 15 | |
| 16 | // The bit positions by which boundary flags are shifted by the [Step] function. |
| 17 | // These must correspond to the Mask constants. |
| 18 | const ( |
| 19 | shiftWord = 2 |
| 20 | shiftSentence = 3 |
| 21 | // shiftwWidth is ShiftWidth above. No mask as these are always the remaining bits. |
| 22 | ) |
| 23 | |
| 24 | // The bit positions by which states are shifted by the [Step] function. These |
| 25 | // values must ensure state values defined for each of the boundary algorithms |
| 26 | // don't overlap (and that they all still fit in a single int). These must |
| 27 | // correspond to the Mask constants. |
| 28 | const ( |
| 29 | shiftWordState = 4 |
| 30 | shiftSentenceState = 9 |
| 31 | shiftLineState = 13 |
| 32 | shiftPropState = 21 // No mask as these are always the remaining bits. |
| 33 | ) |
| 34 | |
| 35 | // The bit mask used to extract the state returned by the [Step] function, after |
| 36 | // shifting. These values must correspond to the shift constants. |
| 37 | const ( |
| 38 | maskGraphemeState = 0xf |
| 39 | maskWordState = 0x1f |
| 40 | maskSentenceState = 0xf |
| 41 | maskLineState = 0xff |
| 42 | ) |
| 43 | |
| 44 | // Step returns the first grapheme cluster (user-perceived character) found in |
| 45 | // the given byte slice. It also returns information about the boundary between |
| 46 | // that grapheme cluster and the one following it as well as the monospace width |
| 47 | // of the grapheme cluster. There are three types of boundary information: word |
| 48 | // boundaries, sentence boundaries, and line breaks. This function is therefore |
| 49 | // a combination of [FirstGraphemeCluster], [FirstWord], [FirstSentence], and |
| 50 | // [FirstLineSegment]. |
| 51 | // |
| 52 | // The "boundaries" return value can be evaluated as follows: |
| 53 | // |
| 54 | // - boundaries&MaskWord != 0: The boundary is a word boundary. |
| 55 | // - boundaries&MaskWord == 0: The boundary is not a word boundary. |
| 56 | // - boundaries&MaskSentence != 0: The boundary is a sentence boundary. |
| 57 | // - boundaries&MaskSentence == 0: The boundary is not a sentence boundary. |
| 58 | // - boundaries&MaskLine == LineDontBreak: You must not break the line at the |
| 59 | // boundary. |
| 60 | // - boundaries&MaskLine == LineMustBreak: You must break the line at the |
| 61 | // boundary. |
| 62 | // - boundaries&MaskLine == LineCanBreak: You may or may not break the line at |
| 63 | // the boundary. |
| 64 | // - boundaries >> ShiftWidth: The width of the grapheme cluster for most |
| 65 | // monospace fonts where a value of 1 represents one character cell. |
| 66 | // |
| 67 | // This function can be called continuously to extract all grapheme clusters |
| 68 | // from a byte slice, as illustrated in the examples below. |
| 69 | // |
| 70 | // If you don't know which state to pass, for example when calling the function |
| 71 | // for the first time, you must pass -1. For consecutive calls, pass the state |
| 72 | // and rest slice returned by the previous call. |
| 73 | // |
| 74 | // The "rest" slice is the sub-slice of the original byte slice "b" starting |
| 75 | // after the last byte of the identified grapheme cluster. If the length of the |
| 76 | // "rest" slice is 0, the entire byte slice "b" has been processed. The |
| 77 | // "cluster" byte slice is the sub-slice of the input slice containing the |
| 78 | // first identified grapheme cluster. |
| 79 | // |
| 80 | // Given an empty byte slice "b", the function returns nil values. |
| 81 | // |
| 82 | // While slightly less convenient than using the Graphemes class, this function |
| 83 | // has much better performance and makes no allocations. It lends itself well to |
| 84 | // large byte slices. |
| 85 | // |
| 86 | // Note that in accordance with [UAX #14 LB3], the final segment will end with |
| 87 | // a mandatory line break (boundaries&MaskLine == LineMustBreak). You can choose |
| 88 | // to ignore this by checking if the length of the "rest" slice is 0 and calling |
| 89 | // [HasTrailingLineBreak] or [HasTrailingLineBreakInString] on the last rune. |
| 90 | // |
| 91 | // [UAX #14 LB3]: https://www.unicode.org/reports/tr14/#Algorithm |
| 92 | func Step(b []byte, state int) (cluster, rest []byte, boundaries int, newState int) { |
| 93 | // An empty byte slice returns nothing. |
| 94 | if len(b) == 0 { |
| 95 | return |
| 96 | } |
| 97 | |
| 98 | // Extract the first rune. |
| 99 | r, length := utf8.DecodeRune(b) |
| 100 | if len(b) <= length { // If we're already past the end, there is nothing else to parse. |
| 101 | var prop int |
| 102 | if state < 0 { |
| 103 | prop = propertyGraphemes(r) |
| 104 | } else { |
| 105 | prop = state >> shiftPropState |
| 106 | } |
| 107 | return b, nil, LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (runeWidth(r, prop) << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState) | (prop << shiftPropState) |
| 108 | } |
| 109 | |
| 110 | // If we don't know the state, determine it now. |
| 111 | var graphemeState, wordState, sentenceState, lineState, firstProp int |
| 112 | remainder := b[length:] |
| 113 | if state < 0 { |
| 114 | graphemeState, firstProp, _ = transitionGraphemeState(state, r) |
| 115 | wordState, _ = transitionWordBreakState(state, r, remainder, "") |
| 116 | sentenceState, _ = transitionSentenceBreakState(state, r, remainder, "") |
| 117 | lineState, _ = transitionLineBreakState(state, r, remainder, "") |
| 118 | } else { |
| 119 | graphemeState = state & maskGraphemeState |
| 120 | wordState = (state >> shiftWordState) & maskWordState |
| 121 | sentenceState = (state >> shiftSentenceState) & maskSentenceState |
| 122 | lineState = (state >> shiftLineState) & maskLineState |
| 123 | firstProp = state >> shiftPropState |
| 124 | } |
| 125 | |
| 126 | // Transition until we find a grapheme cluster boundary. |
| 127 | width := runeWidth(r, firstProp) |
| 128 | for { |
| 129 | var ( |
| 130 | graphemeBoundary, wordBoundary, sentenceBoundary bool |
| 131 | lineBreak, prop int |
| 132 | ) |
| 133 | |
| 134 | r, l := utf8.DecodeRune(remainder) |
| 135 | remainder = b[length+l:] |
| 136 | |
| 137 | graphemeState, prop, graphemeBoundary = transitionGraphemeState(graphemeState, r) |
| 138 | wordState, wordBoundary = transitionWordBreakState(wordState, r, remainder, "") |
| 139 | sentenceState, sentenceBoundary = transitionSentenceBreakState(sentenceState, r, remainder, "") |
| 140 | lineState, lineBreak = transitionLineBreakState(lineState, r, remainder, "") |
| 141 | |
| 142 | if graphemeBoundary { |
| 143 | boundary := lineBreak | (width << ShiftWidth) |
| 144 | if wordBoundary { |
| 145 | boundary |= 1 << shiftWord |
| 146 | } |
| 147 | if sentenceBoundary { |
| 148 | boundary |= 1 << shiftSentence |
| 149 | } |
| 150 | return b[:length], b[length:], boundary, graphemeState | (wordState << shiftWordState) | (sentenceState << shiftSentenceState) | (lineState << shiftLineState) | (prop << shiftPropState) |
| 151 | } |
| 152 | |
| 153 | if firstProp == prExtendedPictographic { |
| 154 | if r == vs15 { |
| 155 | width = 1 |
| 156 | } else if r == vs16 { |
| 157 | width = 2 |
| 158 | } |
| 159 | } else if firstProp != prRegionalIndicator && firstProp != prL { |
| 160 | width += runeWidth(r, prop) |
| 161 | } |
| 162 | |
| 163 | length += l |
| 164 | if len(b) <= length { |
| 165 | return b, nil, LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (width << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState) | (prop << shiftPropState) |
| 166 | } |
| 167 | } |
| 168 | } |
| 169 | |
| 170 | // StepString is like [Step] but its input and outputs are strings. |
| 171 | func StepString(str string, state int) (cluster, rest string, boundaries int, newState int) { |
| 172 | // An empty byte slice returns nothing. |
| 173 | if len(str) == 0 { |
| 174 | return |
| 175 | } |
| 176 | |
| 177 | // Extract the first rune. |
| 178 | r, length := utf8.DecodeRuneInString(str) |
| 179 | if len(str) <= length { // If we're already past the end, there is nothing else to parse. |
| 180 | prop := propertyGraphemes(r) |
| 181 | return str, "", LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (runeWidth(r, prop) << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState) |
| 182 | } |
| 183 | |
| 184 | // If we don't know the state, determine it now. |
| 185 | var graphemeState, wordState, sentenceState, lineState, firstProp int |
| 186 | remainder := str[length:] |
| 187 | if state < 0 { |
| 188 | graphemeState, firstProp, _ = transitionGraphemeState(state, r) |
| 189 | wordState, _ = transitionWordBreakState(state, r, nil, remainder) |
| 190 | sentenceState, _ = transitionSentenceBreakState(state, r, nil, remainder) |
| 191 | lineState, _ = transitionLineBreakState(state, r, nil, remainder) |
| 192 | } else { |
| 193 | graphemeState = state & maskGraphemeState |
| 194 | wordState = (state >> shiftWordState) & maskWordState |
| 195 | sentenceState = (state >> shiftSentenceState) & maskSentenceState |
| 196 | lineState = (state >> shiftLineState) & maskLineState |
| 197 | firstProp = state >> shiftPropState |
| 198 | } |
| 199 | |
| 200 | // Transition until we find a grapheme cluster boundary. |
| 201 | width := runeWidth(r, firstProp) |
| 202 | for { |
| 203 | var ( |
| 204 | graphemeBoundary, wordBoundary, sentenceBoundary bool |
| 205 | lineBreak, prop int |
| 206 | ) |
| 207 | |
| 208 | r, l := utf8.DecodeRuneInString(remainder) |
| 209 | remainder = str[length+l:] |
| 210 | |
| 211 | graphemeState, prop, graphemeBoundary = transitionGraphemeState(graphemeState, r) |
| 212 | wordState, wordBoundary = transitionWordBreakState(wordState, r, nil, remainder) |
| 213 | sentenceState, sentenceBoundary = transitionSentenceBreakState(sentenceState, r, nil, remainder) |
| 214 | lineState, lineBreak = transitionLineBreakState(lineState, r, nil, remainder) |
| 215 | |
| 216 | if graphemeBoundary { |
| 217 | boundary := lineBreak | (width << ShiftWidth) |
| 218 | if wordBoundary { |
| 219 | boundary |= 1 << shiftWord |
| 220 | } |
| 221 | if sentenceBoundary { |
| 222 | boundary |= 1 << shiftSentence |
| 223 | } |
| 224 | return str[:length], str[length:], boundary, graphemeState | (wordState << shiftWordState) | (sentenceState << shiftSentenceState) | (lineState << shiftLineState) | (prop << shiftPropState) |
| 225 | } |
| 226 | |
| 227 | if firstProp == prExtendedPictographic { |
| 228 | if r == vs15 { |
| 229 | width = 1 |
| 230 | } else if r == vs16 { |
| 231 | width = 2 |
| 232 | } |
| 233 | } else if firstProp != prRegionalIndicator && firstProp != prL { |
| 234 | width += runeWidth(r, prop) |
| 235 | } |
| 236 | |
| 237 | length += l |
| 238 | if len(str) <= length { |
| 239 | return str, "", LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (width << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState) | (prop << shiftPropState) |
| 240 | } |
| 241 | } |
| 242 | } |