| package uniseg |
| |
// States of the grapheme cluster parser. Each state summarizes what the
// parser has seen up to the last code point; see grTransitions for the
// transitions between them.
const (
	// grAny is the neutral state. It also serves as the wildcard state when
	// looking up "any state + specific property" transitions.
	grAny = iota
	// grCR is entered on a CR code point (GB5).
	grCR
	// grControlLF is entered on an LF or Control code point (GB5), including
	// the LF of a CR LF pair (GB3).
	grControlLF
	// grL is entered on an L code point (GB6).
	grL
	// grLVV is entered on an LV or V code point (GB6, GB7).
	grLVV
	// grLVTT is entered on an LVT or T code point (GB6, GB7, GB8).
	grLVTT
	// grPrepend is entered on a Prepend code point (GB9b).
	grPrepend
	// grExtendedPictographic is entered on an Extended_Pictographic code
	// point and is kept across subsequent Extend code points (GB11).
	grExtendedPictographic
	// grExtendedPictographicZWJ is entered when a ZWJ follows the
	// grExtendedPictographic state (GB11).
	grExtendedPictographicZWJ
	// grRIOdd is entered after an odd number of consecutive Regional
	// Indicator code points (GB12 / GB13).
	grRIOdd
	// grRIEven is entered after an even number of consecutive Regional
	// Indicator code points (GB12 / GB13).
	grRIEven
)
| |
// Breaking instructions produced by the grapheme cluster parser. An
// instruction always refers to the position between the last and the next
// code point.
const (
	// grNoBoundary means the two code points belong to the same cluster.
	grNoBoundary = iota
	// grBoundary means a cluster boundary lies between the two code points.
	grBoundary
)
| |
| // grTransitions implements the grapheme cluster parser's state transitions. |
| // Maps state and property to a new state, a breaking instruction, and rule |
| // number. The breaking instruction always refers to the boundary between the |
| // last and next code point. Returns negative values if no transition is found. |
| // |
| // This function is used as follows: |
| // |
| // 1. Find specific state + specific property. Stop if found. |
| // 2. Find specific state + any property. |
| // 3. Find any state + specific property. |
| // 4. If only (2) or (3) (but not both) was found, stop. |
| // 5. If both (2) and (3) were found, use state from (3) and breaking instruction |
| // from the transition with the lower rule number, prefer (3) if rule numbers |
| // are equal. Stop. |
| // 6. Assume grAny and grBoundary. |
| // |
| // Unicode version 15.0.0. |
| func grTransitions(state, prop int) (newState int, newProp int, boundary int) { |
| // It turns out that using a big switch statement is much faster than using |
| // a map. |
| |
| switch uint64(state) | uint64(prop)<<32 { |
| // GB5 |
| case grAny | prCR<<32: |
| return grCR, grBoundary, 50 |
| case grAny | prLF<<32: |
| return grControlLF, grBoundary, 50 |
| case grAny | prControl<<32: |
| return grControlLF, grBoundary, 50 |
| |
| // GB4 |
| case grCR | prAny<<32: |
| return grAny, grBoundary, 40 |
| case grControlLF | prAny<<32: |
| return grAny, grBoundary, 40 |
| |
| // GB3 |
| case grCR | prLF<<32: |
| return grControlLF, grNoBoundary, 30 |
| |
| // GB6 |
| case grAny | prL<<32: |
| return grL, grBoundary, 9990 |
| case grL | prL<<32: |
| return grL, grNoBoundary, 60 |
| case grL | prV<<32: |
| return grLVV, grNoBoundary, 60 |
| case grL | prLV<<32: |
| return grLVV, grNoBoundary, 60 |
| case grL | prLVT<<32: |
| return grLVTT, grNoBoundary, 60 |
| |
| // GB7 |
| case grAny | prLV<<32: |
| return grLVV, grBoundary, 9990 |
| case grAny | prV<<32: |
| return grLVV, grBoundary, 9990 |
| case grLVV | prV<<32: |
| return grLVV, grNoBoundary, 70 |
| case grLVV | prT<<32: |
| return grLVTT, grNoBoundary, 70 |
| |
| // GB8 |
| case grAny | prLVT<<32: |
| return grLVTT, grBoundary, 9990 |
| case grAny | prT<<32: |
| return grLVTT, grBoundary, 9990 |
| case grLVTT | prT<<32: |
| return grLVTT, grNoBoundary, 80 |
| |
| // GB9 |
| case grAny | prExtend<<32: |
| return grAny, grNoBoundary, 90 |
| case grAny | prZWJ<<32: |
| return grAny, grNoBoundary, 90 |
| |
| // GB9a |
| case grAny | prSpacingMark<<32: |
| return grAny, grNoBoundary, 91 |
| |
| // GB9b |
| case grAny | prPrepend<<32: |
| return grPrepend, grBoundary, 9990 |
| case grPrepend | prAny<<32: |
| return grAny, grNoBoundary, 92 |
| |
| // GB11 |
| case grAny | prExtendedPictographic<<32: |
| return grExtendedPictographic, grBoundary, 9990 |
| case grExtendedPictographic | prExtend<<32: |
| return grExtendedPictographic, grNoBoundary, 110 |
| case grExtendedPictographic | prZWJ<<32: |
| return grExtendedPictographicZWJ, grNoBoundary, 110 |
| case grExtendedPictographicZWJ | prExtendedPictographic<<32: |
| return grExtendedPictographic, grNoBoundary, 110 |
| |
| // GB12 / GB13 |
| case grAny | prRegionalIndicator<<32: |
| return grRIOdd, grBoundary, 9990 |
| case grRIOdd | prRegionalIndicator<<32: |
| return grRIEven, grNoBoundary, 120 |
| case grRIEven | prRegionalIndicator<<32: |
| return grRIOdd, grBoundary, 120 |
| default: |
| return -1, -1, -1 |
| } |
| } |
| |
| // transitionGraphemeState determines the new state of the grapheme cluster |
| // parser given the current state and the next code point. It also returns the |
| // code point's grapheme property (the value mapped by the [graphemeCodePoints] |
| // table) and whether a cluster boundary was detected. |
| func transitionGraphemeState(state int, r rune) (newState, prop int, boundary bool) { |
| // Determine the property of the next character. |
| prop = propertyGraphemes(r) |
| |
| // Find the applicable transition. |
| nextState, nextProp, _ := grTransitions(state, prop) |
| if nextState >= 0 { |
| // We have a specific transition. We'll use it. |
| return nextState, prop, nextProp == grBoundary |
| } |
| |
| // No specific transition found. Try the less specific ones. |
| anyPropState, anyPropProp, anyPropRule := grTransitions(state, prAny) |
| anyStateState, anyStateProp, anyStateRule := grTransitions(grAny, prop) |
| if anyPropState >= 0 && anyStateState >= 0 { |
| // Both apply. We'll use a mix (see comments for grTransitions). |
| newState = anyStateState |
| boundary = anyStateProp == grBoundary |
| if anyPropRule < anyStateRule { |
| boundary = anyPropProp == grBoundary |
| } |
| return |
| } |
| |
| if anyPropState >= 0 { |
| // We only have a specific state. |
| return anyPropState, prop, anyPropProp == grBoundary |
| // This branch will probably never be reached because okAnyState will |
| // always be true given the current transition map. But we keep it here |
| // for future modifications to the transition map where this may not be |
| // true anymore. |
| } |
| |
| if anyStateState >= 0 { |
| // We only have a specific property. |
| return anyStateState, prop, anyStateProp == grBoundary |
| } |
| |
| // No known transition. GB999: Any รท Any. |
| return grAny, prop, true |
| } |