| Abhay Kumar | a61c522 | 2025-11-10 07:32:50 +0000 | [diff] [blame^] | 1 | package uniseg |
| 2 | |
| 3 | import "unicode/utf8" |
| 4 | |
// Word break parser states. They are untyped integer constants numbered
// consecutively from 0 (wbAny) to 14 (wbEvenRI), which lets wbTransitions pack
// a state and a property into one switch key and leaves bit 4 free for
// wbZWJBit.
const (
	// wbAny is the default state; it is also the state used when no known
	// transition applies.
	wbAny = iota
	// wbCR, wbLF and wbNewline are entered from wbAny on a CR, LF or Newline
	// property (WB3b) and always break before whatever follows (WB3a), except
	// for CR followed by LF (WB3).
	wbCR
	wbLF
	wbNewline
	// wbWSegSpace is entered on a WSegSpace property (WB3d).
	wbWSegSpace
	// wbHebrewLetter and wbALetter are entered on the respective letter
	// properties (WB5 and following).
	wbHebrewLetter
	wbALetter
	// wbWB7 and wbWB7c are entered by transitionWordBreakState() after looking
	// ahead in the text (WB6 and WB7b); the table then applies WB7 and WB7c.
	wbWB7
	wbWB7c
	// wbNumeric is entered on a Numeric property (WB8 and following).
	wbNumeric
	// wbWB11 is entered by transitionWordBreakState() after looking ahead in
	// the text (WB12); the table then applies WB11.
	wbWB11
	// wbKatakana is entered on a Katakana property (WB13).
	wbKatakana
	// wbExtendNumLet is entered on an ExtendNumLet property (WB13a, WB13b).
	wbExtendNumLet
	// wbOddRI and wbEvenRI track Regional Indicator pairs (WB15, WB16); they
	// are only set by transitionWordBreakState().
	wbOddRI
	wbEvenRI
)

// wbZWJBit is set for any states followed by at least one zero-width joiner
// (see WB4 and WB3c). It is a flag OR-ed onto one of the states above.
const wbZWJBit = 1 << 4
| 24 | |
| 25 | // wbTransitions implements the word break parser's state transitions. It's |
| 26 | // anologous to [grTransitions], see comments there for details. |
| 27 | // |
| 28 | // Unicode version 15.0.0. |
| 29 | func wbTransitions(state, prop int) (newState int, wordBreak bool, rule int) { |
| 30 | switch uint64(state) | uint64(prop)<<32 { |
| 31 | // WB3b. |
| 32 | case wbAny | prNewline<<32: |
| 33 | return wbNewline, true, 32 |
| 34 | case wbAny | prCR<<32: |
| 35 | return wbCR, true, 32 |
| 36 | case wbAny | prLF<<32: |
| 37 | return wbLF, true, 32 |
| 38 | |
| 39 | // WB3a. |
| 40 | case wbNewline | prAny<<32: |
| 41 | return wbAny, true, 31 |
| 42 | case wbCR | prAny<<32: |
| 43 | return wbAny, true, 31 |
| 44 | case wbLF | prAny<<32: |
| 45 | return wbAny, true, 31 |
| 46 | |
| 47 | // WB3. |
| 48 | case wbCR | prLF<<32: |
| 49 | return wbLF, false, 30 |
| 50 | |
| 51 | // WB3d. |
| 52 | case wbAny | prWSegSpace<<32: |
| 53 | return wbWSegSpace, true, 9990 |
| 54 | case wbWSegSpace | prWSegSpace<<32: |
| 55 | return wbWSegSpace, false, 34 |
| 56 | |
| 57 | // WB5. |
| 58 | case wbAny | prALetter<<32: |
| 59 | return wbALetter, true, 9990 |
| 60 | case wbAny | prHebrewLetter<<32: |
| 61 | return wbHebrewLetter, true, 9990 |
| 62 | case wbALetter | prALetter<<32: |
| 63 | return wbALetter, false, 50 |
| 64 | case wbALetter | prHebrewLetter<<32: |
| 65 | return wbHebrewLetter, false, 50 |
| 66 | case wbHebrewLetter | prALetter<<32: |
| 67 | return wbALetter, false, 50 |
| 68 | case wbHebrewLetter | prHebrewLetter<<32: |
| 69 | return wbHebrewLetter, false, 50 |
| 70 | |
| 71 | // WB7. Transitions to wbWB7 handled by transitionWordBreakState(). |
| 72 | case wbWB7 | prALetter<<32: |
| 73 | return wbALetter, false, 70 |
| 74 | case wbWB7 | prHebrewLetter<<32: |
| 75 | return wbHebrewLetter, false, 70 |
| 76 | |
| 77 | // WB7a. |
| 78 | case wbHebrewLetter | prSingleQuote<<32: |
| 79 | return wbAny, false, 71 |
| 80 | |
| 81 | // WB7c. Transitions to wbWB7c handled by transitionWordBreakState(). |
| 82 | case wbWB7c | prHebrewLetter<<32: |
| 83 | return wbHebrewLetter, false, 73 |
| 84 | |
| 85 | // WB8. |
| 86 | case wbAny | prNumeric<<32: |
| 87 | return wbNumeric, true, 9990 |
| 88 | case wbNumeric | prNumeric<<32: |
| 89 | return wbNumeric, false, 80 |
| 90 | |
| 91 | // WB9. |
| 92 | case wbALetter | prNumeric<<32: |
| 93 | return wbNumeric, false, 90 |
| 94 | case wbHebrewLetter | prNumeric<<32: |
| 95 | return wbNumeric, false, 90 |
| 96 | |
| 97 | // WB10. |
| 98 | case wbNumeric | prALetter<<32: |
| 99 | return wbALetter, false, 100 |
| 100 | case wbNumeric | prHebrewLetter<<32: |
| 101 | return wbHebrewLetter, false, 100 |
| 102 | |
| 103 | // WB11. Transitions to wbWB11 handled by transitionWordBreakState(). |
| 104 | case wbWB11 | prNumeric<<32: |
| 105 | return wbNumeric, false, 110 |
| 106 | |
| 107 | // WB13. |
| 108 | case wbAny | prKatakana<<32: |
| 109 | return wbKatakana, true, 9990 |
| 110 | case wbKatakana | prKatakana<<32: |
| 111 | return wbKatakana, false, 130 |
| 112 | |
| 113 | // WB13a. |
| 114 | case wbAny | prExtendNumLet<<32: |
| 115 | return wbExtendNumLet, true, 9990 |
| 116 | case wbALetter | prExtendNumLet<<32: |
| 117 | return wbExtendNumLet, false, 131 |
| 118 | case wbHebrewLetter | prExtendNumLet<<32: |
| 119 | return wbExtendNumLet, false, 131 |
| 120 | case wbNumeric | prExtendNumLet<<32: |
| 121 | return wbExtendNumLet, false, 131 |
| 122 | case wbKatakana | prExtendNumLet<<32: |
| 123 | return wbExtendNumLet, false, 131 |
| 124 | case wbExtendNumLet | prExtendNumLet<<32: |
| 125 | return wbExtendNumLet, false, 131 |
| 126 | |
| 127 | // WB13b. |
| 128 | case wbExtendNumLet | prALetter<<32: |
| 129 | return wbALetter, false, 132 |
| 130 | case wbExtendNumLet | prHebrewLetter<<32: |
| 131 | return wbHebrewLetter, false, 132 |
| 132 | case wbExtendNumLet | prNumeric<<32: |
| 133 | return wbNumeric, false, 132 |
| 134 | case wbExtendNumLet | prKatakana<<32: |
| 135 | return wbKatakana, false, 132 |
| 136 | |
| 137 | default: |
| 138 | return -1, false, -1 |
| 139 | } |
| 140 | } |
| 141 | |
| 142 | // transitionWordBreakState determines the new state of the word break parser |
| 143 | // given the current state and the next code point. It also returns whether a |
| 144 | // word boundary was detected. If more than one code point is needed to |
| 145 | // determine the new state, the byte slice or the string starting after rune "r" |
| 146 | // can be used (whichever is not nil or empty) for further lookups. |
| 147 | func transitionWordBreakState(state int, r rune, b []byte, str string) (newState int, wordBreak bool) { |
| 148 | // Determine the property of the next character. |
| 149 | nextProperty := property(workBreakCodePoints, r) |
| 150 | |
| 151 | // "Replacing Ignore Rules". |
| 152 | if nextProperty == prZWJ { |
| 153 | // WB4 (for zero-width joiners). |
| 154 | if state == wbNewline || state == wbCR || state == wbLF { |
| 155 | return wbAny | wbZWJBit, true // Make sure we don't apply WB4 to WB3a. |
| 156 | } |
| 157 | if state < 0 { |
| 158 | return wbAny | wbZWJBit, false |
| 159 | } |
| 160 | return state | wbZWJBit, false |
| 161 | } else if nextProperty == prExtend || nextProperty == prFormat { |
| 162 | // WB4 (for Extend and Format). |
| 163 | if state == wbNewline || state == wbCR || state == wbLF { |
| 164 | return wbAny, true // Make sure we don't apply WB4 to WB3a. |
| 165 | } |
| 166 | if state == wbWSegSpace || state == wbAny|wbZWJBit { |
| 167 | return wbAny, false // We don't break but this is also not WB3d or WB3c. |
| 168 | } |
| 169 | if state < 0 { |
| 170 | return wbAny, false |
| 171 | } |
| 172 | return state, false |
| 173 | } else if nextProperty == prExtendedPictographic && state >= 0 && state&wbZWJBit != 0 { |
| 174 | // WB3c. |
| 175 | return wbAny, false |
| 176 | } |
| 177 | if state >= 0 { |
| 178 | state = state &^ wbZWJBit |
| 179 | } |
| 180 | |
| 181 | // Find the applicable transition in the table. |
| 182 | var rule int |
| 183 | newState, wordBreak, rule = wbTransitions(state, nextProperty) |
| 184 | if newState < 0 { |
| 185 | // No specific transition found. Try the less specific ones. |
| 186 | anyPropState, anyPropWordBreak, anyPropRule := wbTransitions(state, prAny) |
| 187 | anyStateState, anyStateWordBreak, anyStateRule := wbTransitions(wbAny, nextProperty) |
| 188 | if anyPropState >= 0 && anyStateState >= 0 { |
| 189 | // Both apply. We'll use a mix (see comments for grTransitions). |
| 190 | newState, wordBreak, rule = anyStateState, anyStateWordBreak, anyStateRule |
| 191 | if anyPropRule < anyStateRule { |
| 192 | wordBreak, rule = anyPropWordBreak, anyPropRule |
| 193 | } |
| 194 | } else if anyPropState >= 0 { |
| 195 | // We only have a specific state. |
| 196 | newState, wordBreak, rule = anyPropState, anyPropWordBreak, anyPropRule |
| 197 | // This branch will probably never be reached because okAnyState will |
| 198 | // always be true given the current transition map. But we keep it here |
| 199 | // for future modifications to the transition map where this may not be |
| 200 | // true anymore. |
| 201 | } else if anyStateState >= 0 { |
| 202 | // We only have a specific property. |
| 203 | newState, wordBreak, rule = anyStateState, anyStateWordBreak, anyStateRule |
| 204 | } else { |
| 205 | // No known transition. WB999: Any รท Any. |
| 206 | newState, wordBreak, rule = wbAny, true, 9990 |
| 207 | } |
| 208 | } |
| 209 | |
| 210 | // For those rules that need to look up runes further in the string, we |
| 211 | // determine the property after nextProperty, skipping over Format, Extend, |
| 212 | // and ZWJ (according to WB4). It's -1 if not needed, if such a rune cannot |
| 213 | // be determined (because the text ends or the rune is faulty). |
| 214 | farProperty := -1 |
| 215 | if rule > 60 && |
| 216 | (state == wbALetter || state == wbHebrewLetter || state == wbNumeric) && |
| 217 | (nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote || // WB6. |
| 218 | nextProperty == prDoubleQuote || // WB7b. |
| 219 | nextProperty == prMidNum) { // WB12. |
| 220 | for { |
| 221 | var ( |
| 222 | r rune |
| 223 | length int |
| 224 | ) |
| 225 | if b != nil { // Byte slice version. |
| 226 | r, length = utf8.DecodeRune(b) |
| 227 | b = b[length:] |
| 228 | } else { // String version. |
| 229 | r, length = utf8.DecodeRuneInString(str) |
| 230 | str = str[length:] |
| 231 | } |
| 232 | if r == utf8.RuneError { |
| 233 | break |
| 234 | } |
| 235 | prop := property(workBreakCodePoints, r) |
| 236 | if prop == prExtend || prop == prFormat || prop == prZWJ { |
| 237 | continue |
| 238 | } |
| 239 | farProperty = prop |
| 240 | break |
| 241 | } |
| 242 | } |
| 243 | |
| 244 | // WB6. |
| 245 | if rule > 60 && |
| 246 | (state == wbALetter || state == wbHebrewLetter) && |
| 247 | (nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote) && |
| 248 | (farProperty == prALetter || farProperty == prHebrewLetter) { |
| 249 | return wbWB7, false |
| 250 | } |
| 251 | |
| 252 | // WB7b. |
| 253 | if rule > 72 && |
| 254 | state == wbHebrewLetter && |
| 255 | nextProperty == prDoubleQuote && |
| 256 | farProperty == prHebrewLetter { |
| 257 | return wbWB7c, false |
| 258 | } |
| 259 | |
| 260 | // WB12. |
| 261 | if rule > 120 && |
| 262 | state == wbNumeric && |
| 263 | (nextProperty == prMidNum || nextProperty == prMidNumLet || nextProperty == prSingleQuote) && |
| 264 | farProperty == prNumeric { |
| 265 | return wbWB11, false |
| 266 | } |
| 267 | |
| 268 | // WB15 and WB16. |
| 269 | if newState == wbAny && nextProperty == prRegionalIndicator { |
| 270 | if state != wbOddRI && state != wbEvenRI { // Includes state == -1. |
| 271 | // Transition into the first RI. |
| 272 | return wbOddRI, true |
| 273 | } |
| 274 | if state == wbOddRI { |
| 275 | // Don't break pairs of Regional Indicators. |
| 276 | return wbEvenRI, false |
| 277 | } |
| 278 | return wbOddRI, true // We can break after a pair. |
| 279 | } |
| 280 | |
| 281 | return |
| 282 | } |