| Abhay Kumar | a61c522 | 2025-11-10 07:32:50 +0000 | [diff] [blame] | 1 | package uniseg |
| 2 | |
// The states of the grapheme cluster parser. A state summarizes what the
// parser has seen so far in the current cluster; grTransitions defines how
// each state is entered and left.
const (
	// grAny is the default state. It also serves as the wildcard when
	// grTransitions is queried for "any state + specific property".
	grAny = iota
	// grCR is entered after a CR.
	grCR
	// grControlLF is entered after an LF or Control code point, including
	// the LF of a CR LF pair.
	grControlLF
	// grL is entered after a Hangul L.
	grL
	// grLVV is entered after a Hangul LV or V.
	grLVV
	// grLVTT is entered after a Hangul LVT or T.
	grLVTT
	// grPrepend is entered after a Prepend code point.
	grPrepend
	// grExtendedPictographic is entered after an Extended_Pictographic code
	// point and is kept while Extend code points follow it.
	grExtendedPictographic
	// grExtendedPictographicZWJ is entered when a ZWJ follows the
	// grExtendedPictographic state.
	grExtendedPictographicZWJ
	// grRIOdd is entered after the 1st, 3rd, 5th, ... Regional_Indicator of
	// a run.
	grRIOdd
	// grRIEven is entered after the 2nd, 4th, 6th, ... Regional_Indicator of
	// a run.
	grRIEven
)
| 17 | |
// The grapheme cluster parser's breaking instructions. They describe the
// position between the last code point and the next one.
const (
	// grNoBoundary means the cluster continues across that position.
	grNoBoundary = iota
	// grBoundary means a cluster boundary lies at that position.
	grBoundary
)
| 23 | |
| 24 | // grTransitions implements the grapheme cluster parser's state transitions. |
| 25 | // Maps state and property to a new state, a breaking instruction, and rule |
| 26 | // number. The breaking instruction always refers to the boundary between the |
| 27 | // last and next code point. Returns negative values if no transition is found. |
| 28 | // |
| 29 | // This function is used as follows: |
| 30 | // |
| 31 | // 1. Find specific state + specific property. Stop if found. |
| 32 | // 2. Find specific state + any property. |
| 33 | // 3. Find any state + specific property. |
| 34 | // 4. If only (2) or (3) (but not both) was found, stop. |
| 35 | // 5. If both (2) and (3) were found, use state from (3) and breaking instruction |
| 36 | // from the transition with the lower rule number, prefer (3) if rule numbers |
| 37 | // are equal. Stop. |
| 38 | // 6. Assume grAny and grBoundary. |
| 39 | // |
| 40 | // Unicode version 15.0.0. |
| 41 | func grTransitions(state, prop int) (newState int, newProp int, boundary int) { |
| 42 | // It turns out that using a big switch statement is much faster than using |
| 43 | // a map. |
| 44 | |
| 45 | switch uint64(state) | uint64(prop)<<32 { |
| 46 | // GB5 |
| 47 | case grAny | prCR<<32: |
| 48 | return grCR, grBoundary, 50 |
| 49 | case grAny | prLF<<32: |
| 50 | return grControlLF, grBoundary, 50 |
| 51 | case grAny | prControl<<32: |
| 52 | return grControlLF, grBoundary, 50 |
| 53 | |
| 54 | // GB4 |
| 55 | case grCR | prAny<<32: |
| 56 | return grAny, grBoundary, 40 |
| 57 | case grControlLF | prAny<<32: |
| 58 | return grAny, grBoundary, 40 |
| 59 | |
| 60 | // GB3 |
| 61 | case grCR | prLF<<32: |
| 62 | return grControlLF, grNoBoundary, 30 |
| 63 | |
| 64 | // GB6 |
| 65 | case grAny | prL<<32: |
| 66 | return grL, grBoundary, 9990 |
| 67 | case grL | prL<<32: |
| 68 | return grL, grNoBoundary, 60 |
| 69 | case grL | prV<<32: |
| 70 | return grLVV, grNoBoundary, 60 |
| 71 | case grL | prLV<<32: |
| 72 | return grLVV, grNoBoundary, 60 |
| 73 | case grL | prLVT<<32: |
| 74 | return grLVTT, grNoBoundary, 60 |
| 75 | |
| 76 | // GB7 |
| 77 | case grAny | prLV<<32: |
| 78 | return grLVV, grBoundary, 9990 |
| 79 | case grAny | prV<<32: |
| 80 | return grLVV, grBoundary, 9990 |
| 81 | case grLVV | prV<<32: |
| 82 | return grLVV, grNoBoundary, 70 |
| 83 | case grLVV | prT<<32: |
| 84 | return grLVTT, grNoBoundary, 70 |
| 85 | |
| 86 | // GB8 |
| 87 | case grAny | prLVT<<32: |
| 88 | return grLVTT, grBoundary, 9990 |
| 89 | case grAny | prT<<32: |
| 90 | return grLVTT, grBoundary, 9990 |
| 91 | case grLVTT | prT<<32: |
| 92 | return grLVTT, grNoBoundary, 80 |
| 93 | |
| 94 | // GB9 |
| 95 | case grAny | prExtend<<32: |
| 96 | return grAny, grNoBoundary, 90 |
| 97 | case grAny | prZWJ<<32: |
| 98 | return grAny, grNoBoundary, 90 |
| 99 | |
| 100 | // GB9a |
| 101 | case grAny | prSpacingMark<<32: |
| 102 | return grAny, grNoBoundary, 91 |
| 103 | |
| 104 | // GB9b |
| 105 | case grAny | prPrepend<<32: |
| 106 | return grPrepend, grBoundary, 9990 |
| 107 | case grPrepend | prAny<<32: |
| 108 | return grAny, grNoBoundary, 92 |
| 109 | |
| 110 | // GB11 |
| 111 | case grAny | prExtendedPictographic<<32: |
| 112 | return grExtendedPictographic, grBoundary, 9990 |
| 113 | case grExtendedPictographic | prExtend<<32: |
| 114 | return grExtendedPictographic, grNoBoundary, 110 |
| 115 | case grExtendedPictographic | prZWJ<<32: |
| 116 | return grExtendedPictographicZWJ, grNoBoundary, 110 |
| 117 | case grExtendedPictographicZWJ | prExtendedPictographic<<32: |
| 118 | return grExtendedPictographic, grNoBoundary, 110 |
| 119 | |
| 120 | // GB12 / GB13 |
| 121 | case grAny | prRegionalIndicator<<32: |
| 122 | return grRIOdd, grBoundary, 9990 |
| 123 | case grRIOdd | prRegionalIndicator<<32: |
| 124 | return grRIEven, grNoBoundary, 120 |
| 125 | case grRIEven | prRegionalIndicator<<32: |
| 126 | return grRIOdd, grBoundary, 120 |
| 127 | default: |
| 128 | return -1, -1, -1 |
| 129 | } |
| 130 | } |
| 131 | |
| 132 | // transitionGraphemeState determines the new state of the grapheme cluster |
| 133 | // parser given the current state and the next code point. It also returns the |
| 134 | // code point's grapheme property (the value mapped by the [graphemeCodePoints] |
| 135 | // table) and whether a cluster boundary was detected. |
| 136 | func transitionGraphemeState(state int, r rune) (newState, prop int, boundary bool) { |
| 137 | // Determine the property of the next character. |
| 138 | prop = propertyGraphemes(r) |
| 139 | |
| 140 | // Find the applicable transition. |
| 141 | nextState, nextProp, _ := grTransitions(state, prop) |
| 142 | if nextState >= 0 { |
| 143 | // We have a specific transition. We'll use it. |
| 144 | return nextState, prop, nextProp == grBoundary |
| 145 | } |
| 146 | |
| 147 | // No specific transition found. Try the less specific ones. |
| 148 | anyPropState, anyPropProp, anyPropRule := grTransitions(state, prAny) |
| 149 | anyStateState, anyStateProp, anyStateRule := grTransitions(grAny, prop) |
| 150 | if anyPropState >= 0 && anyStateState >= 0 { |
| 151 | // Both apply. We'll use a mix (see comments for grTransitions). |
| 152 | newState = anyStateState |
| 153 | boundary = anyStateProp == grBoundary |
| 154 | if anyPropRule < anyStateRule { |
| 155 | boundary = anyPropProp == grBoundary |
| 156 | } |
| 157 | return |
| 158 | } |
| 159 | |
| 160 | if anyPropState >= 0 { |
| 161 | // We only have a specific state. |
| 162 | return anyPropState, prop, anyPropProp == grBoundary |
| 163 | // This branch will probably never be reached because okAnyState will |
| 164 | // always be true given the current transition map. But we keep it here |
| 165 | // for future modifications to the transition map where this may not be |
| 166 | // true anymore. |
| 167 | } |
| 168 | |
| 169 | if anyStateState >= 0 { |
| 170 | // We only have a specific property. |
| 171 | return anyStateState, prop, anyStateProp == grBoundary |
| 172 | } |
| 173 | |
| 174 | // No known transition. GB999: Any รท Any. |
| 175 | return grAny, prop, true |
| 176 | } |