| package uniseg |
| |
| import "unicode/utf8" |
| |
| // The states of the sentence break parser. |
| const ( |
| sbAny = iota |
| sbCR |
| sbParaSep |
| sbATerm |
| sbUpper |
| sbLower |
| sbSB7 |
| sbSB8Close |
| sbSB8Sp |
| sbSTerm |
| sbSB8aClose |
| sbSB8aSp |
| ) |
| |
| // sbTransitions implements the sentence break parser's state transitions. It's |
| // anologous to [grTransitions], see comments there for details. |
| // |
| // Unicode version 15.0.0. |
| func sbTransitions(state, prop int) (newState int, sentenceBreak bool, rule int) { |
| switch uint64(state) | uint64(prop)<<32 { |
| // SB3. |
| case sbAny | prCR<<32: |
| return sbCR, false, 9990 |
| case sbCR | prLF<<32: |
| return sbParaSep, false, 30 |
| |
| // SB4. |
| case sbAny | prSep<<32: |
| return sbParaSep, false, 9990 |
| case sbAny | prLF<<32: |
| return sbParaSep, false, 9990 |
| case sbParaSep | prAny<<32: |
| return sbAny, true, 40 |
| case sbCR | prAny<<32: |
| return sbAny, true, 40 |
| |
| // SB6. |
| case sbAny | prATerm<<32: |
| return sbATerm, false, 9990 |
| case sbATerm | prNumeric<<32: |
| return sbAny, false, 60 |
| case sbSB7 | prNumeric<<32: |
| return sbAny, false, 60 // Because ATerm also appears in SB7. |
| |
| // SB7. |
| case sbAny | prUpper<<32: |
| return sbUpper, false, 9990 |
| case sbAny | prLower<<32: |
| return sbLower, false, 9990 |
| case sbUpper | prATerm<<32: |
| return sbSB7, false, 70 |
| case sbLower | prATerm<<32: |
| return sbSB7, false, 70 |
| case sbSB7 | prUpper<<32: |
| return sbUpper, false, 70 |
| |
| // SB8a. |
| case sbAny | prSTerm<<32: |
| return sbSTerm, false, 9990 |
| case sbATerm | prSContinue<<32: |
| return sbAny, false, 81 |
| case sbATerm | prATerm<<32: |
| return sbATerm, false, 81 |
| case sbATerm | prSTerm<<32: |
| return sbSTerm, false, 81 |
| case sbSB7 | prSContinue<<32: |
| return sbAny, false, 81 |
| case sbSB7 | prATerm<<32: |
| return sbATerm, false, 81 |
| case sbSB7 | prSTerm<<32: |
| return sbSTerm, false, 81 |
| case sbSB8Close | prSContinue<<32: |
| return sbAny, false, 81 |
| case sbSB8Close | prATerm<<32: |
| return sbATerm, false, 81 |
| case sbSB8Close | prSTerm<<32: |
| return sbSTerm, false, 81 |
| case sbSB8Sp | prSContinue<<32: |
| return sbAny, false, 81 |
| case sbSB8Sp | prATerm<<32: |
| return sbATerm, false, 81 |
| case sbSB8Sp | prSTerm<<32: |
| return sbSTerm, false, 81 |
| case sbSTerm | prSContinue<<32: |
| return sbAny, false, 81 |
| case sbSTerm | prATerm<<32: |
| return sbATerm, false, 81 |
| case sbSTerm | prSTerm<<32: |
| return sbSTerm, false, 81 |
| case sbSB8aClose | prSContinue<<32: |
| return sbAny, false, 81 |
| case sbSB8aClose | prATerm<<32: |
| return sbATerm, false, 81 |
| case sbSB8aClose | prSTerm<<32: |
| return sbSTerm, false, 81 |
| case sbSB8aSp | prSContinue<<32: |
| return sbAny, false, 81 |
| case sbSB8aSp | prATerm<<32: |
| return sbATerm, false, 81 |
| case sbSB8aSp | prSTerm<<32: |
| return sbSTerm, false, 81 |
| |
| // SB9. |
| case sbATerm | prClose<<32: |
| return sbSB8Close, false, 90 |
| case sbSB7 | prClose<<32: |
| return sbSB8Close, false, 90 |
| case sbSB8Close | prClose<<32: |
| return sbSB8Close, false, 90 |
| case sbATerm | prSp<<32: |
| return sbSB8Sp, false, 90 |
| case sbSB7 | prSp<<32: |
| return sbSB8Sp, false, 90 |
| case sbSB8Close | prSp<<32: |
| return sbSB8Sp, false, 90 |
| case sbSTerm | prClose<<32: |
| return sbSB8aClose, false, 90 |
| case sbSB8aClose | prClose<<32: |
| return sbSB8aClose, false, 90 |
| case sbSTerm | prSp<<32: |
| return sbSB8aSp, false, 90 |
| case sbSB8aClose | prSp<<32: |
| return sbSB8aSp, false, 90 |
| case sbATerm | prSep<<32: |
| return sbParaSep, false, 90 |
| case sbATerm | prCR<<32: |
| return sbParaSep, false, 90 |
| case sbATerm | prLF<<32: |
| return sbParaSep, false, 90 |
| case sbSB7 | prSep<<32: |
| return sbParaSep, false, 90 |
| case sbSB7 | prCR<<32: |
| return sbParaSep, false, 90 |
| case sbSB7 | prLF<<32: |
| return sbParaSep, false, 90 |
| case sbSB8Close | prSep<<32: |
| return sbParaSep, false, 90 |
| case sbSB8Close | prCR<<32: |
| return sbParaSep, false, 90 |
| case sbSB8Close | prLF<<32: |
| return sbParaSep, false, 90 |
| case sbSTerm | prSep<<32: |
| return sbParaSep, false, 90 |
| case sbSTerm | prCR<<32: |
| return sbParaSep, false, 90 |
| case sbSTerm | prLF<<32: |
| return sbParaSep, false, 90 |
| case sbSB8aClose | prSep<<32: |
| return sbParaSep, false, 90 |
| case sbSB8aClose | prCR<<32: |
| return sbParaSep, false, 90 |
| case sbSB8aClose | prLF<<32: |
| return sbParaSep, false, 90 |
| |
| // SB10. |
| case sbSB8Sp | prSp<<32: |
| return sbSB8Sp, false, 100 |
| case sbSB8aSp | prSp<<32: |
| return sbSB8aSp, false, 100 |
| case sbSB8Sp | prSep<<32: |
| return sbParaSep, false, 100 |
| case sbSB8Sp | prCR<<32: |
| return sbParaSep, false, 100 |
| case sbSB8Sp | prLF<<32: |
| return sbParaSep, false, 100 |
| |
| // SB11. |
| case sbATerm | prAny<<32: |
| return sbAny, true, 110 |
| case sbSB7 | prAny<<32: |
| return sbAny, true, 110 |
| case sbSB8Close | prAny<<32: |
| return sbAny, true, 110 |
| case sbSB8Sp | prAny<<32: |
| return sbAny, true, 110 |
| case sbSTerm | prAny<<32: |
| return sbAny, true, 110 |
| case sbSB8aClose | prAny<<32: |
| return sbAny, true, 110 |
| case sbSB8aSp | prAny<<32: |
| return sbAny, true, 110 |
| // We'll always break after ParaSep due to SB4. |
| |
| default: |
| return -1, false, -1 |
| } |
| } |
| |
| // transitionSentenceBreakState determines the new state of the sentence break |
| // parser given the current state and the next code point. It also returns |
| // whether a sentence boundary was detected. If more than one code point is |
| // needed to determine the new state, the byte slice or the string starting |
| // after rune "r" can be used (whichever is not nil or empty) for further |
| // lookups. |
| func transitionSentenceBreakState(state int, r rune, b []byte, str string) (newState int, sentenceBreak bool) { |
| // Determine the property of the next character. |
| nextProperty := property(sentenceBreakCodePoints, r) |
| |
| // SB5 (Replacing Ignore Rules). |
| if nextProperty == prExtend || nextProperty == prFormat { |
| if state == sbParaSep || state == sbCR { |
| return sbAny, true // Make sure we don't apply SB5 to SB3 or SB4. |
| } |
| if state < 0 { |
| return sbAny, true // SB1. |
| } |
| return state, false |
| } |
| |
| // Find the applicable transition in the table. |
| var rule int |
| newState, sentenceBreak, rule = sbTransitions(state, nextProperty) |
| if newState < 0 { |
| // No specific transition found. Try the less specific ones. |
| anyPropState, anyPropProp, anyPropRule := sbTransitions(state, prAny) |
| anyStateState, anyStateProp, anyStateRule := sbTransitions(sbAny, nextProperty) |
| if anyPropState >= 0 && anyStateState >= 0 { |
| // Both apply. We'll use a mix (see comments for grTransitions). |
| newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule |
| if anyPropRule < anyStateRule { |
| sentenceBreak, rule = anyPropProp, anyPropRule |
| } |
| } else if anyPropState >= 0 { |
| // We only have a specific state. |
| newState, sentenceBreak, rule = anyPropState, anyPropProp, anyPropRule |
| // This branch will probably never be reached because okAnyState will |
| // always be true given the current transition map. But we keep it here |
| // for future modifications to the transition map where this may not be |
| // true anymore. |
| } else if anyStateState >= 0 { |
| // We only have a specific property. |
| newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule |
| } else { |
| // No known transition. SB999: Any × Any. |
| newState, sentenceBreak, rule = sbAny, false, 9990 |
| } |
| } |
| |
| // SB8. |
| if rule > 80 && (state == sbATerm || state == sbSB8Close || state == sbSB8Sp || state == sbSB7) { |
| // Check the right side of the rule. |
| var length int |
| for nextProperty != prOLetter && |
| nextProperty != prUpper && |
| nextProperty != prLower && |
| nextProperty != prSep && |
| nextProperty != prCR && |
| nextProperty != prLF && |
| nextProperty != prATerm && |
| nextProperty != prSTerm { |
| // Move on to the next rune. |
| if b != nil { // Byte slice version. |
| r, length = utf8.DecodeRune(b) |
| b = b[length:] |
| } else { // String version. |
| r, length = utf8.DecodeRuneInString(str) |
| str = str[length:] |
| } |
| if r == utf8.RuneError { |
| break |
| } |
| nextProperty = property(sentenceBreakCodePoints, r) |
| } |
| if nextProperty == prLower { |
| return sbLower, false |
| } |
| } |
| |
| return |
| } |