| //go:build generate |
| |
| // This program generates a Go file containing a slice of test cases based on the |
| // Unicode Character Database auxiliary data files. The command line arguments |
| // are as follows: |
| // |
| // 1. The name of the Unicode data file (just the filename, without extension). |
| // 2. The name of the locally generated Go file. |
| // 3. The name of the slice containing the test cases. |
| // 4. The name of the generator, for logging purposes. |
| // |
| //go:generate go run gen_breaktest.go GraphemeBreakTest graphemebreak_test.go graphemeBreakTestCases graphemes |
| //go:generate go run gen_breaktest.go WordBreakTest wordbreak_test.go wordBreakTestCases words |
| //go:generate go run gen_breaktest.go SentenceBreakTest sentencebreak_test.go sentenceBreakTestCases sentences |
| //go:generate go run gen_breaktest.go LineBreakTest linebreak_test.go lineBreakTestCases lines |
| |
| package main |
| |
| import ( |
| "bufio" |
| "bytes" |
| "errors" |
| "fmt" |
| "go/format" |
| "io/ioutil" |
| "log" |
| "net/http" |
| "os" |
| "time" |
| ) |
| |
// testCaseURL is the URL template of the Unicode Character Database auxiliary
// test files; its %s verb is filled in with the base name of the data file.
// The URL is pinned to a specific Unicode version rather than the latest one.
// When the package is upgraded to a new version, change the version in this
// URL and regenerate the tests.
const testCaseURL = `https://www.unicode.org/Public/15.0.0/ucd/auxiliary/%s.txt`
| |
| func main() { |
| if len(os.Args) < 5 { |
| fmt.Println("Not enough arguments, see code for details") |
| os.Exit(1) |
| } |
| |
| log.SetPrefix("gen_breaktest (" + os.Args[4] + "): ") |
| log.SetFlags(0) |
| |
| // Read text of testcases and parse into Go source code. |
| src, err := parse(fmt.Sprintf(testCaseURL, os.Args[1])) |
| if err != nil { |
| log.Fatal(err) |
| } |
| |
| // Format the Go code. |
| formatted, err := format.Source(src) |
| if err != nil { |
| log.Fatalln("gofmt:", err) |
| } |
| |
| // Write it out. |
| log.Print("Writing to ", os.Args[2]) |
| if err := ioutil.WriteFile(os.Args[2], formatted, 0644); err != nil { |
| log.Fatal(err) |
| } |
| } |
| |
| // parse reads a break text file, either from a local file or from a URL. It |
| // parses the file data into Go source code representing the test cases. |
| func parse(url string) ([]byte, error) { |
| log.Printf("Parsing %s", url) |
| res, err := http.Get(url) |
| if err != nil { |
| return nil, err |
| } |
| body := res.Body |
| defer body.Close() |
| |
| buf := new(bytes.Buffer) |
| buf.Grow(120 << 10) |
| buf.WriteString(`// Code generated via go generate from gen_breaktest.go. DO NOT EDIT. |
| |
| package uniseg |
| |
| // ` + os.Args[3] + ` are Grapheme testcases taken from |
| // ` + url + ` |
| // on ` + time.Now().Format("January 2, 2006") + `. See |
| // https://www.unicode.org/license.html for the Unicode license agreement. |
| var ` + os.Args[3] + ` = []testCase { |
| `) |
| |
| sc := bufio.NewScanner(body) |
| num := 1 |
| var line []byte |
| original := make([]byte, 0, 64) |
| expected := make([]byte, 0, 64) |
| for sc.Scan() { |
| num++ |
| line = sc.Bytes() |
| if len(line) == 0 || line[0] == '#' { |
| continue |
| } |
| var comment []byte |
| if i := bytes.IndexByte(line, '#'); i >= 0 { |
| comment = bytes.TrimSpace(line[i+1:]) |
| line = bytes.TrimSpace(line[:i]) |
| } |
| original, expected, err := parseRuneSequence(line, original[:0], expected[:0]) |
| if err != nil { |
| return nil, fmt.Errorf(`line %d: %v: %q`, num, err, line) |
| } |
| fmt.Fprintf(buf, "\t{original: \"%s\", expected: %s}, // %s\n", original, expected, comment) |
| } |
| if err := sc.Err(); err != nil { |
| return nil, err |
| } |
| |
| // Check for final "# EOF", useful check if we're streaming via HTTP |
| if !bytes.Equal(line, []byte("# EOF")) { |
| return nil, fmt.Errorf(`line %d: exected "# EOF" as final line, got %q`, num, line) |
| } |
| buf.WriteString("}\n") |
| return buf.Bytes(), nil |
| } |
| |
// Boundary markers used by parseRuneSequence to match input via
// bytes.HasPrefix.
var (
	prefixBreak     = []byte("÷ ") // leading "break" marker plus the space after it
	prefixDontBreak = []byte("× ") // leading "no break" marker plus the space after it
	breakOk         = []byte("÷")  // a break is allowed at this position
	breakNo         = []byte("×")  // no break is allowed at this position
)

// parseRuneSequence parses a rune + breaking opportunity sequence from b
// and appends the Go code for testcase.original to orig
// and appends the Go code for testcase.expected to exp.
// It returns the new orig and exp slices.
//
// E.g. for the input b="÷ 0020 × 0308 ÷ 1F1E6 ÷"
// it will append
//
//	"\u0020\u0308\U0001F1E6"
//
// and "[][]rune{{0x0020,0x0308},{0x1F1E6},}"
// to orig and exp respectively.
//
// Code points must be written with 4, 5 or 6 hexadecimal digits. On any
// syntax error the returned slices are nil and the error is non-nil.
//
// The formatting of exp is expected to be cleaned up by gofmt or format.Source.
// Note we explicitly require the sequence to start with ÷ or × (followed by a
// space) and we implicitly require it to end with ÷: a sequence ending in ×
// leaves the last rune group unclosed in exp.
func parseRuneSequence(b, orig, exp []byte) ([]byte, []byte, error) {
	// Check for and remove the leading ÷ or ×.
	switch {
	case bytes.HasPrefix(b, prefixBreak):
		b = b[len(prefixBreak):]
	case bytes.HasPrefix(b, prefixDontBreak):
		b = b[len(prefixDontBreak):]
	default:
		return nil, nil, errors.New("expected ÷ or × as first character")
	}

	boundary := true
	exp = append(exp, "[][]rune{"...)
	for len(b) > 0 {
		// A preceding break starts a new rune group.
		if boundary {
			exp = append(exp, '{')
		}
		exp = append(exp, "0x"...)

		// Find the end of the hex digits, rejecting anything that is not one.
		n := 0
		for ; n < len(b) && b[n] != ' '; n++ {
			d := b[n]
			isHex := ('0' <= d && d <= '9') ||
				('A' <= d && d <= 'F') ||
				('a' <= d && d <= 'f')
			if !isHex {
				return nil, nil, errors.New("bad hex digit")
			}
		}

		// Go string escapes need exactly 4 (\u) or 8 (\U) hex digits, so
		// longer code points are left-padded with zeros.
		switch n {
		case 4:
			orig = append(orig, `\u`...)
		case 5:
			orig = append(orig, `\U000`...)
		case 6:
			orig = append(orig, `\U00`...)
		default:
			return nil, nil, errors.New("unsupported code point hex length")
		}
		orig = append(orig, b[:n]...)
		exp = append(exp, b[:n]...)
		b = b[n:]

		// Check for space between hex and ÷ or ×.
		if len(b) < 1 || b[0] != ' ' {
			return nil, nil, errors.New("bad input")
		}
		b = b[1:]

		// Check for next boundary.
		switch {
		case bytes.HasPrefix(b, breakOk):
			boundary = true
			b = b[len(breakOk):]
		case bytes.HasPrefix(b, breakNo):
			boundary = false
			b = b[len(breakNo):]
		default:
			return nil, nil, errors.New("missing ÷ or ×")
		}
		// A break closes the current rune group.
		if boundary {
			exp = append(exp, '}')
		}
		exp = append(exp, ',')
		if len(b) > 0 && b[0] == ' ' {
			b = b[1:]
		}
	}
	exp = append(exp, '}')
	return orig, exp, nil
}