| Abhay Kumar | a61c522 | 2025-11-10 07:32:50 +0000 | [diff] [blame] | 1 | package uniseg |
| 2 | |
| 3 | import "unicode/utf8" |
| 4 | |
// The states of the sentence break parser.
//
// Each state summarizes the part of the text to the left of the current
// position that is relevant for the sentence boundary rules. The values are
// consecutive integers starting at zero; sbTransitions combines them with a
// property value into a single switch key, so they must stay small and
// non-negative.
const (
	sbAny       = iota // Nothing relevant seen; also the wildcard state for table lookups.
	sbCR               // A CR was consumed; a directly following LF must stay attached (SB3).
	sbParaSep          // A paragraph separator was consumed; SB4 breaks after it.
	sbATerm            // An ATerm was consumed.
	sbUpper            // An Upper letter was consumed.
	sbLower            // A Lower letter was consumed (or found by the SB8 look-ahead).
	sbSB7              // (Upper | Lower) ATerm, the left-hand side of SB7.
	sbSB8Close         // ATerm followed by one or more Close.
	sbSB8Sp            // ATerm Close* followed by one or more Sp.
	sbSTerm            // An STerm was consumed.
	sbSB8aClose        // STerm followed by one or more Close.
	sbSB8aSp           // STerm Close* followed by one or more Sp.
)
| 20 | |
| 21 | // sbTransitions implements the sentence break parser's state transitions. It's |
| 22 | // anologous to [grTransitions], see comments there for details. |
| 23 | // |
| 24 | // Unicode version 15.0.0. |
| 25 | func sbTransitions(state, prop int) (newState int, sentenceBreak bool, rule int) { |
| 26 | switch uint64(state) | uint64(prop)<<32 { |
| 27 | // SB3. |
| 28 | case sbAny | prCR<<32: |
| 29 | return sbCR, false, 9990 |
| 30 | case sbCR | prLF<<32: |
| 31 | return sbParaSep, false, 30 |
| 32 | |
| 33 | // SB4. |
| 34 | case sbAny | prSep<<32: |
| 35 | return sbParaSep, false, 9990 |
| 36 | case sbAny | prLF<<32: |
| 37 | return sbParaSep, false, 9990 |
| 38 | case sbParaSep | prAny<<32: |
| 39 | return sbAny, true, 40 |
| 40 | case sbCR | prAny<<32: |
| 41 | return sbAny, true, 40 |
| 42 | |
| 43 | // SB6. |
| 44 | case sbAny | prATerm<<32: |
| 45 | return sbATerm, false, 9990 |
| 46 | case sbATerm | prNumeric<<32: |
| 47 | return sbAny, false, 60 |
| 48 | case sbSB7 | prNumeric<<32: |
| 49 | return sbAny, false, 60 // Because ATerm also appears in SB7. |
| 50 | |
| 51 | // SB7. |
| 52 | case sbAny | prUpper<<32: |
| 53 | return sbUpper, false, 9990 |
| 54 | case sbAny | prLower<<32: |
| 55 | return sbLower, false, 9990 |
| 56 | case sbUpper | prATerm<<32: |
| 57 | return sbSB7, false, 70 |
| 58 | case sbLower | prATerm<<32: |
| 59 | return sbSB7, false, 70 |
| 60 | case sbSB7 | prUpper<<32: |
| 61 | return sbUpper, false, 70 |
| 62 | |
| 63 | // SB8a. |
| 64 | case sbAny | prSTerm<<32: |
| 65 | return sbSTerm, false, 9990 |
| 66 | case sbATerm | prSContinue<<32: |
| 67 | return sbAny, false, 81 |
| 68 | case sbATerm | prATerm<<32: |
| 69 | return sbATerm, false, 81 |
| 70 | case sbATerm | prSTerm<<32: |
| 71 | return sbSTerm, false, 81 |
| 72 | case sbSB7 | prSContinue<<32: |
| 73 | return sbAny, false, 81 |
| 74 | case sbSB7 | prATerm<<32: |
| 75 | return sbATerm, false, 81 |
| 76 | case sbSB7 | prSTerm<<32: |
| 77 | return sbSTerm, false, 81 |
| 78 | case sbSB8Close | prSContinue<<32: |
| 79 | return sbAny, false, 81 |
| 80 | case sbSB8Close | prATerm<<32: |
| 81 | return sbATerm, false, 81 |
| 82 | case sbSB8Close | prSTerm<<32: |
| 83 | return sbSTerm, false, 81 |
| 84 | case sbSB8Sp | prSContinue<<32: |
| 85 | return sbAny, false, 81 |
| 86 | case sbSB8Sp | prATerm<<32: |
| 87 | return sbATerm, false, 81 |
| 88 | case sbSB8Sp | prSTerm<<32: |
| 89 | return sbSTerm, false, 81 |
| 90 | case sbSTerm | prSContinue<<32: |
| 91 | return sbAny, false, 81 |
| 92 | case sbSTerm | prATerm<<32: |
| 93 | return sbATerm, false, 81 |
| 94 | case sbSTerm | prSTerm<<32: |
| 95 | return sbSTerm, false, 81 |
| 96 | case sbSB8aClose | prSContinue<<32: |
| 97 | return sbAny, false, 81 |
| 98 | case sbSB8aClose | prATerm<<32: |
| 99 | return sbATerm, false, 81 |
| 100 | case sbSB8aClose | prSTerm<<32: |
| 101 | return sbSTerm, false, 81 |
| 102 | case sbSB8aSp | prSContinue<<32: |
| 103 | return sbAny, false, 81 |
| 104 | case sbSB8aSp | prATerm<<32: |
| 105 | return sbATerm, false, 81 |
| 106 | case sbSB8aSp | prSTerm<<32: |
| 107 | return sbSTerm, false, 81 |
| 108 | |
| 109 | // SB9. |
| 110 | case sbATerm | prClose<<32: |
| 111 | return sbSB8Close, false, 90 |
| 112 | case sbSB7 | prClose<<32: |
| 113 | return sbSB8Close, false, 90 |
| 114 | case sbSB8Close | prClose<<32: |
| 115 | return sbSB8Close, false, 90 |
| 116 | case sbATerm | prSp<<32: |
| 117 | return sbSB8Sp, false, 90 |
| 118 | case sbSB7 | prSp<<32: |
| 119 | return sbSB8Sp, false, 90 |
| 120 | case sbSB8Close | prSp<<32: |
| 121 | return sbSB8Sp, false, 90 |
| 122 | case sbSTerm | prClose<<32: |
| 123 | return sbSB8aClose, false, 90 |
| 124 | case sbSB8aClose | prClose<<32: |
| 125 | return sbSB8aClose, false, 90 |
| 126 | case sbSTerm | prSp<<32: |
| 127 | return sbSB8aSp, false, 90 |
| 128 | case sbSB8aClose | prSp<<32: |
| 129 | return sbSB8aSp, false, 90 |
| 130 | case sbATerm | prSep<<32: |
| 131 | return sbParaSep, false, 90 |
| 132 | case sbATerm | prCR<<32: |
| 133 | return sbParaSep, false, 90 |
| 134 | case sbATerm | prLF<<32: |
| 135 | return sbParaSep, false, 90 |
| 136 | case sbSB7 | prSep<<32: |
| 137 | return sbParaSep, false, 90 |
| 138 | case sbSB7 | prCR<<32: |
| 139 | return sbParaSep, false, 90 |
| 140 | case sbSB7 | prLF<<32: |
| 141 | return sbParaSep, false, 90 |
| 142 | case sbSB8Close | prSep<<32: |
| 143 | return sbParaSep, false, 90 |
| 144 | case sbSB8Close | prCR<<32: |
| 145 | return sbParaSep, false, 90 |
| 146 | case sbSB8Close | prLF<<32: |
| 147 | return sbParaSep, false, 90 |
| 148 | case sbSTerm | prSep<<32: |
| 149 | return sbParaSep, false, 90 |
| 150 | case sbSTerm | prCR<<32: |
| 151 | return sbParaSep, false, 90 |
| 152 | case sbSTerm | prLF<<32: |
| 153 | return sbParaSep, false, 90 |
| 154 | case sbSB8aClose | prSep<<32: |
| 155 | return sbParaSep, false, 90 |
| 156 | case sbSB8aClose | prCR<<32: |
| 157 | return sbParaSep, false, 90 |
| 158 | case sbSB8aClose | prLF<<32: |
| 159 | return sbParaSep, false, 90 |
| 160 | |
| 161 | // SB10. |
| 162 | case sbSB8Sp | prSp<<32: |
| 163 | return sbSB8Sp, false, 100 |
| 164 | case sbSB8aSp | prSp<<32: |
| 165 | return sbSB8aSp, false, 100 |
| 166 | case sbSB8Sp | prSep<<32: |
| 167 | return sbParaSep, false, 100 |
| 168 | case sbSB8Sp | prCR<<32: |
| 169 | return sbParaSep, false, 100 |
| 170 | case sbSB8Sp | prLF<<32: |
| 171 | return sbParaSep, false, 100 |
| 172 | |
| 173 | // SB11. |
| 174 | case sbATerm | prAny<<32: |
| 175 | return sbAny, true, 110 |
| 176 | case sbSB7 | prAny<<32: |
| 177 | return sbAny, true, 110 |
| 178 | case sbSB8Close | prAny<<32: |
| 179 | return sbAny, true, 110 |
| 180 | case sbSB8Sp | prAny<<32: |
| 181 | return sbAny, true, 110 |
| 182 | case sbSTerm | prAny<<32: |
| 183 | return sbAny, true, 110 |
| 184 | case sbSB8aClose | prAny<<32: |
| 185 | return sbAny, true, 110 |
| 186 | case sbSB8aSp | prAny<<32: |
| 187 | return sbAny, true, 110 |
| 188 | // We'll always break after ParaSep due to SB4. |
| 189 | |
| 190 | default: |
| 191 | return -1, false, -1 |
| 192 | } |
| 193 | } |
| 194 | |
| 195 | // transitionSentenceBreakState determines the new state of the sentence break |
| 196 | // parser given the current state and the next code point. It also returns |
| 197 | // whether a sentence boundary was detected. If more than one code point is |
| 198 | // needed to determine the new state, the byte slice or the string starting |
| 199 | // after rune "r" can be used (whichever is not nil or empty) for further |
| 200 | // lookups. |
| 201 | func transitionSentenceBreakState(state int, r rune, b []byte, str string) (newState int, sentenceBreak bool) { |
| 202 | // Determine the property of the next character. |
| 203 | nextProperty := property(sentenceBreakCodePoints, r) |
| 204 | |
| 205 | // SB5 (Replacing Ignore Rules). |
| 206 | if nextProperty == prExtend || nextProperty == prFormat { |
| 207 | if state == sbParaSep || state == sbCR { |
| 208 | return sbAny, true // Make sure we don't apply SB5 to SB3 or SB4. |
| 209 | } |
| 210 | if state < 0 { |
| 211 | return sbAny, true // SB1. |
| 212 | } |
| 213 | return state, false |
| 214 | } |
| 215 | |
| 216 | // Find the applicable transition in the table. |
| 217 | var rule int |
| 218 | newState, sentenceBreak, rule = sbTransitions(state, nextProperty) |
| 219 | if newState < 0 { |
| 220 | // No specific transition found. Try the less specific ones. |
| 221 | anyPropState, anyPropProp, anyPropRule := sbTransitions(state, prAny) |
| 222 | anyStateState, anyStateProp, anyStateRule := sbTransitions(sbAny, nextProperty) |
| 223 | if anyPropState >= 0 && anyStateState >= 0 { |
| 224 | // Both apply. We'll use a mix (see comments for grTransitions). |
| 225 | newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule |
| 226 | if anyPropRule < anyStateRule { |
| 227 | sentenceBreak, rule = anyPropProp, anyPropRule |
| 228 | } |
| 229 | } else if anyPropState >= 0 { |
| 230 | // We only have a specific state. |
| 231 | newState, sentenceBreak, rule = anyPropState, anyPropProp, anyPropRule |
| 232 | // This branch will probably never be reached because okAnyState will |
| 233 | // always be true given the current transition map. But we keep it here |
| 234 | // for future modifications to the transition map where this may not be |
| 235 | // true anymore. |
| 236 | } else if anyStateState >= 0 { |
| 237 | // We only have a specific property. |
| 238 | newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule |
| 239 | } else { |
| 240 | // No known transition. SB999: Any × Any. |
| 241 | newState, sentenceBreak, rule = sbAny, false, 9990 |
| 242 | } |
| 243 | } |
| 244 | |
| 245 | // SB8. |
| 246 | if rule > 80 && (state == sbATerm || state == sbSB8Close || state == sbSB8Sp || state == sbSB7) { |
| 247 | // Check the right side of the rule. |
| 248 | var length int |
| 249 | for nextProperty != prOLetter && |
| 250 | nextProperty != prUpper && |
| 251 | nextProperty != prLower && |
| 252 | nextProperty != prSep && |
| 253 | nextProperty != prCR && |
| 254 | nextProperty != prLF && |
| 255 | nextProperty != prATerm && |
| 256 | nextProperty != prSTerm { |
| 257 | // Move on to the next rune. |
| 258 | if b != nil { // Byte slice version. |
| 259 | r, length = utf8.DecodeRune(b) |
| 260 | b = b[length:] |
| 261 | } else { // String version. |
| 262 | r, length = utf8.DecodeRuneInString(str) |
| 263 | str = str[length:] |
| 264 | } |
| 265 | if r == utf8.RuneError { |
| 266 | break |
| 267 | } |
| 268 | nextProperty = property(sentenceBreakCodePoints, r) |
| 269 | } |
| 270 | if nextProperty == prLower { |
| 271 | return sbLower, false |
| 272 | } |
| 273 | } |
| 274 | |
| 275 | return |
| 276 | } |