| package uniseg |
| |
| import "unicode/utf8" |
| |
| // The states of the word break parser. |
| const ( |
| wbAny = iota |
| wbCR |
| wbLF |
| wbNewline |
| wbWSegSpace |
| wbHebrewLetter |
| wbALetter |
| wbWB7 |
| wbWB7c |
| wbNumeric |
| wbWB11 |
| wbKatakana |
| wbExtendNumLet |
| wbOddRI |
| wbEvenRI |
| wbZWJBit = 16 // This bit is set for any states followed by at least one zero-width joiner (see WB4 and WB3c). |
| ) |
| |
| // wbTransitions implements the word break parser's state transitions. It's |
| // anologous to [grTransitions], see comments there for details. |
| // |
| // Unicode version 15.0.0. |
| func wbTransitions(state, prop int) (newState int, wordBreak bool, rule int) { |
| switch uint64(state) | uint64(prop)<<32 { |
| // WB3b. |
| case wbAny | prNewline<<32: |
| return wbNewline, true, 32 |
| case wbAny | prCR<<32: |
| return wbCR, true, 32 |
| case wbAny | prLF<<32: |
| return wbLF, true, 32 |
| |
| // WB3a. |
| case wbNewline | prAny<<32: |
| return wbAny, true, 31 |
| case wbCR | prAny<<32: |
| return wbAny, true, 31 |
| case wbLF | prAny<<32: |
| return wbAny, true, 31 |
| |
| // WB3. |
| case wbCR | prLF<<32: |
| return wbLF, false, 30 |
| |
| // WB3d. |
| case wbAny | prWSegSpace<<32: |
| return wbWSegSpace, true, 9990 |
| case wbWSegSpace | prWSegSpace<<32: |
| return wbWSegSpace, false, 34 |
| |
| // WB5. |
| case wbAny | prALetter<<32: |
| return wbALetter, true, 9990 |
| case wbAny | prHebrewLetter<<32: |
| return wbHebrewLetter, true, 9990 |
| case wbALetter | prALetter<<32: |
| return wbALetter, false, 50 |
| case wbALetter | prHebrewLetter<<32: |
| return wbHebrewLetter, false, 50 |
| case wbHebrewLetter | prALetter<<32: |
| return wbALetter, false, 50 |
| case wbHebrewLetter | prHebrewLetter<<32: |
| return wbHebrewLetter, false, 50 |
| |
| // WB7. Transitions to wbWB7 handled by transitionWordBreakState(). |
| case wbWB7 | prALetter<<32: |
| return wbALetter, false, 70 |
| case wbWB7 | prHebrewLetter<<32: |
| return wbHebrewLetter, false, 70 |
| |
| // WB7a. |
| case wbHebrewLetter | prSingleQuote<<32: |
| return wbAny, false, 71 |
| |
| // WB7c. Transitions to wbWB7c handled by transitionWordBreakState(). |
| case wbWB7c | prHebrewLetter<<32: |
| return wbHebrewLetter, false, 73 |
| |
| // WB8. |
| case wbAny | prNumeric<<32: |
| return wbNumeric, true, 9990 |
| case wbNumeric | prNumeric<<32: |
| return wbNumeric, false, 80 |
| |
| // WB9. |
| case wbALetter | prNumeric<<32: |
| return wbNumeric, false, 90 |
| case wbHebrewLetter | prNumeric<<32: |
| return wbNumeric, false, 90 |
| |
| // WB10. |
| case wbNumeric | prALetter<<32: |
| return wbALetter, false, 100 |
| case wbNumeric | prHebrewLetter<<32: |
| return wbHebrewLetter, false, 100 |
| |
| // WB11. Transitions to wbWB11 handled by transitionWordBreakState(). |
| case wbWB11 | prNumeric<<32: |
| return wbNumeric, false, 110 |
| |
| // WB13. |
| case wbAny | prKatakana<<32: |
| return wbKatakana, true, 9990 |
| case wbKatakana | prKatakana<<32: |
| return wbKatakana, false, 130 |
| |
| // WB13a. |
| case wbAny | prExtendNumLet<<32: |
| return wbExtendNumLet, true, 9990 |
| case wbALetter | prExtendNumLet<<32: |
| return wbExtendNumLet, false, 131 |
| case wbHebrewLetter | prExtendNumLet<<32: |
| return wbExtendNumLet, false, 131 |
| case wbNumeric | prExtendNumLet<<32: |
| return wbExtendNumLet, false, 131 |
| case wbKatakana | prExtendNumLet<<32: |
| return wbExtendNumLet, false, 131 |
| case wbExtendNumLet | prExtendNumLet<<32: |
| return wbExtendNumLet, false, 131 |
| |
| // WB13b. |
| case wbExtendNumLet | prALetter<<32: |
| return wbALetter, false, 132 |
| case wbExtendNumLet | prHebrewLetter<<32: |
| return wbHebrewLetter, false, 132 |
| case wbExtendNumLet | prNumeric<<32: |
| return wbNumeric, false, 132 |
| case wbExtendNumLet | prKatakana<<32: |
| return wbKatakana, false, 132 |
| |
| default: |
| return -1, false, -1 |
| } |
| } |
| |
| // transitionWordBreakState determines the new state of the word break parser |
| // given the current state and the next code point. It also returns whether a |
| // word boundary was detected. If more than one code point is needed to |
| // determine the new state, the byte slice or the string starting after rune "r" |
| // can be used (whichever is not nil or empty) for further lookups. |
| func transitionWordBreakState(state int, r rune, b []byte, str string) (newState int, wordBreak bool) { |
| // Determine the property of the next character. |
| nextProperty := property(workBreakCodePoints, r) |
| |
| // "Replacing Ignore Rules". |
| if nextProperty == prZWJ { |
| // WB4 (for zero-width joiners). |
| if state == wbNewline || state == wbCR || state == wbLF { |
| return wbAny | wbZWJBit, true // Make sure we don't apply WB4 to WB3a. |
| } |
| if state < 0 { |
| return wbAny | wbZWJBit, false |
| } |
| return state | wbZWJBit, false |
| } else if nextProperty == prExtend || nextProperty == prFormat { |
| // WB4 (for Extend and Format). |
| if state == wbNewline || state == wbCR || state == wbLF { |
| return wbAny, true // Make sure we don't apply WB4 to WB3a. |
| } |
| if state == wbWSegSpace || state == wbAny|wbZWJBit { |
| return wbAny, false // We don't break but this is also not WB3d or WB3c. |
| } |
| if state < 0 { |
| return wbAny, false |
| } |
| return state, false |
| } else if nextProperty == prExtendedPictographic && state >= 0 && state&wbZWJBit != 0 { |
| // WB3c. |
| return wbAny, false |
| } |
| if state >= 0 { |
| state = state &^ wbZWJBit |
| } |
| |
| // Find the applicable transition in the table. |
| var rule int |
| newState, wordBreak, rule = wbTransitions(state, nextProperty) |
| if newState < 0 { |
| // No specific transition found. Try the less specific ones. |
| anyPropState, anyPropWordBreak, anyPropRule := wbTransitions(state, prAny) |
| anyStateState, anyStateWordBreak, anyStateRule := wbTransitions(wbAny, nextProperty) |
| if anyPropState >= 0 && anyStateState >= 0 { |
| // Both apply. We'll use a mix (see comments for grTransitions). |
| newState, wordBreak, rule = anyStateState, anyStateWordBreak, anyStateRule |
| if anyPropRule < anyStateRule { |
| wordBreak, rule = anyPropWordBreak, anyPropRule |
| } |
| } else if anyPropState >= 0 { |
| // We only have a specific state. |
| newState, wordBreak, rule = anyPropState, anyPropWordBreak, anyPropRule |
| // This branch will probably never be reached because okAnyState will |
| // always be true given the current transition map. But we keep it here |
| // for future modifications to the transition map where this may not be |
| // true anymore. |
| } else if anyStateState >= 0 { |
| // We only have a specific property. |
| newState, wordBreak, rule = anyStateState, anyStateWordBreak, anyStateRule |
| } else { |
| // No known transition. WB999: Any รท Any. |
| newState, wordBreak, rule = wbAny, true, 9990 |
| } |
| } |
| |
| // For those rules that need to look up runes further in the string, we |
| // determine the property after nextProperty, skipping over Format, Extend, |
| // and ZWJ (according to WB4). It's -1 if not needed, if such a rune cannot |
| // be determined (because the text ends or the rune is faulty). |
| farProperty := -1 |
| if rule > 60 && |
| (state == wbALetter || state == wbHebrewLetter || state == wbNumeric) && |
| (nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote || // WB6. |
| nextProperty == prDoubleQuote || // WB7b. |
| nextProperty == prMidNum) { // WB12. |
| for { |
| var ( |
| r rune |
| length int |
| ) |
| if b != nil { // Byte slice version. |
| r, length = utf8.DecodeRune(b) |
| b = b[length:] |
| } else { // String version. |
| r, length = utf8.DecodeRuneInString(str) |
| str = str[length:] |
| } |
| if r == utf8.RuneError { |
| break |
| } |
| prop := property(workBreakCodePoints, r) |
| if prop == prExtend || prop == prFormat || prop == prZWJ { |
| continue |
| } |
| farProperty = prop |
| break |
| } |
| } |
| |
| // WB6. |
| if rule > 60 && |
| (state == wbALetter || state == wbHebrewLetter) && |
| (nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote) && |
| (farProperty == prALetter || farProperty == prHebrewLetter) { |
| return wbWB7, false |
| } |
| |
| // WB7b. |
| if rule > 72 && |
| state == wbHebrewLetter && |
| nextProperty == prDoubleQuote && |
| farProperty == prHebrewLetter { |
| return wbWB7c, false |
| } |
| |
| // WB12. |
| if rule > 120 && |
| state == wbNumeric && |
| (nextProperty == prMidNum || nextProperty == prMidNumLet || nextProperty == prSingleQuote) && |
| farProperty == prNumeric { |
| return wbWB11, false |
| } |
| |
| // WB15 and WB16. |
| if newState == wbAny && nextProperty == prRegionalIndicator { |
| if state != wbOddRI && state != wbEvenRI { // Includes state == -1. |
| // Transition into the first RI. |
| return wbOddRI, true |
| } |
| if state == wbOddRI { |
| // Don't break pairs of Regional Indicators. |
| return wbEvenRI, false |
| } |
| return wbOddRI, true // We can break after a pair. |
| } |
| |
| return |
| } |