| Abhay Kumar | a61c522 | 2025-11-10 07:32:50 +0000 | [diff] [blame^] | 1 | //go:build generate |
| 2 | |
| 3 | // This program generates a Go file containing a slice of test cases based on the |
| 4 | // Unicode Character Database auxiliary data files. The command line arguments |
| 5 | // are as follows: |
| 6 | // |
| 7 | // 1. The name of the Unicode data file (just the filename, without extension). |
| 8 | // 2. The name of the locally generated Go file. |
| 9 | // 3. The name of the slice containing the test cases. |
| 10 | // 4. The name of the generator, for logging purposes. |
| 11 | // |
| 12 | //go:generate go run gen_breaktest.go GraphemeBreakTest graphemebreak_test.go graphemeBreakTestCases graphemes |
| 13 | //go:generate go run gen_breaktest.go WordBreakTest wordbreak_test.go wordBreakTestCases words |
| 14 | //go:generate go run gen_breaktest.go SentenceBreakTest sentencebreak_test.go sentenceBreakTestCases sentences |
| 15 | //go:generate go run gen_breaktest.go LineBreakTest linebreak_test.go lineBreakTestCases lines |
| 16 | |
| 17 | package main |
| 18 | |
| 19 | import ( |
| 20 | "bufio" |
| 21 | "bytes" |
| 22 | "errors" |
| 23 | "fmt" |
| 24 | "go/format" |
| 25 | "io/ioutil" |
| 26 | "log" |
| 27 | "net/http" |
| 28 | "os" |
| 29 | "time" |
| 30 | ) |
| 31 | |
// testCaseURL is the URL template of the Unicode break test files; the %s verb
// is replaced by the name of the data file (without extension). It points at a
// specific Unicode version rather than the latest one. When the package is
// upgraded to a new version, change the version in this URL to generate new
// tests.
const testCaseURL = `https://www.unicode.org/Public/15.0.0/ucd/auxiliary/%s.txt`
| 37 | |
| 38 | func main() { |
| 39 | if len(os.Args) < 5 { |
| 40 | fmt.Println("Not enough arguments, see code for details") |
| 41 | os.Exit(1) |
| 42 | } |
| 43 | |
| 44 | log.SetPrefix("gen_breaktest (" + os.Args[4] + "): ") |
| 45 | log.SetFlags(0) |
| 46 | |
| 47 | // Read text of testcases and parse into Go source code. |
| 48 | src, err := parse(fmt.Sprintf(testCaseURL, os.Args[1])) |
| 49 | if err != nil { |
| 50 | log.Fatal(err) |
| 51 | } |
| 52 | |
| 53 | // Format the Go code. |
| 54 | formatted, err := format.Source(src) |
| 55 | if err != nil { |
| 56 | log.Fatalln("gofmt:", err) |
| 57 | } |
| 58 | |
| 59 | // Write it out. |
| 60 | log.Print("Writing to ", os.Args[2]) |
| 61 | if err := ioutil.WriteFile(os.Args[2], formatted, 0644); err != nil { |
| 62 | log.Fatal(err) |
| 63 | } |
| 64 | } |
| 65 | |
| 66 | // parse reads a break text file, either from a local file or from a URL. It |
| 67 | // parses the file data into Go source code representing the test cases. |
| 68 | func parse(url string) ([]byte, error) { |
| 69 | log.Printf("Parsing %s", url) |
| 70 | res, err := http.Get(url) |
| 71 | if err != nil { |
| 72 | return nil, err |
| 73 | } |
| 74 | body := res.Body |
| 75 | defer body.Close() |
| 76 | |
| 77 | buf := new(bytes.Buffer) |
| 78 | buf.Grow(120 << 10) |
| 79 | buf.WriteString(`// Code generated via go generate from gen_breaktest.go. DO NOT EDIT. |
| 80 | |
| 81 | package uniseg |
| 82 | |
| 83 | // ` + os.Args[3] + ` are Grapheme testcases taken from |
| 84 | // ` + url + ` |
| 85 | // on ` + time.Now().Format("January 2, 2006") + `. See |
| 86 | // https://www.unicode.org/license.html for the Unicode license agreement. |
| 87 | var ` + os.Args[3] + ` = []testCase { |
| 88 | `) |
| 89 | |
| 90 | sc := bufio.NewScanner(body) |
| 91 | num := 1 |
| 92 | var line []byte |
| 93 | original := make([]byte, 0, 64) |
| 94 | expected := make([]byte, 0, 64) |
| 95 | for sc.Scan() { |
| 96 | num++ |
| 97 | line = sc.Bytes() |
| 98 | if len(line) == 0 || line[0] == '#' { |
| 99 | continue |
| 100 | } |
| 101 | var comment []byte |
| 102 | if i := bytes.IndexByte(line, '#'); i >= 0 { |
| 103 | comment = bytes.TrimSpace(line[i+1:]) |
| 104 | line = bytes.TrimSpace(line[:i]) |
| 105 | } |
| 106 | original, expected, err := parseRuneSequence(line, original[:0], expected[:0]) |
| 107 | if err != nil { |
| 108 | return nil, fmt.Errorf(`line %d: %v: %q`, num, err, line) |
| 109 | } |
| 110 | fmt.Fprintf(buf, "\t{original: \"%s\", expected: %s}, // %s\n", original, expected, comment) |
| 111 | } |
| 112 | if err := sc.Err(); err != nil { |
| 113 | return nil, err |
| 114 | } |
| 115 | |
| 116 | // Check for final "# EOF", useful check if we're streaming via HTTP |
| 117 | if !bytes.Equal(line, []byte("# EOF")) { |
| 118 | return nil, fmt.Errorf(`line %d: exected "# EOF" as final line, got %q`, num, line) |
| 119 | } |
| 120 | buf.WriteString("}\n") |
| 121 | return buf.Bytes(), nil |
| 122 | } |
| 123 | |
// Boundary markers used by parseRuneSequence to match input via
// bytes.HasPrefix.
var (
	prefixBreak     = []byte("÷ ") // leading "break" marker plus separating space
	prefixDontBreak = []byte("× ") // leading "no break" marker plus separating space
	breakOk         = []byte("÷")  // a break is allowed at this position
	breakNo         = []byte("×")  // no break is allowed at this position
)

// parseRuneSequence parses a rune + breaking opportunity sequence from b
// and appends the Go code for testcase.original to orig
// and appends the Go code for testcase.expected to exp.
// It returns the new orig and exp slices.
//
// E.g. for the input b="÷ 0020 × 0308 ÷ 1F1E6 ÷"
// it will append
//
//	"\u0020\u0308\U0001F1E6"
//
// and "[][]rune{{0x0020,0x0308},{0x1F1E6},}"
// to orig and exp respectively.
//
// The formatting of exp is expected to be cleaned up by gofmt or format.Source.
//
// The sequence must start with "÷ " or "× " (either way the first code point
// opens a new group) and must end with ÷. Code points must be written as 4, 5
// or 6 hexadecimal digits. On any violation, nil slices and a non-nil error
// are returned.
func parseRuneSequence(b, orig, exp []byte) ([]byte, []byte, error) {
	// Check for and remove first ÷ or ×.
	switch {
	case bytes.HasPrefix(b, prefixBreak):
		b = b[len(prefixBreak):]
	case bytes.HasPrefix(b, prefixDontBreak):
		b = b[len(prefixDontBreak):]
	default:
		return nil, nil, errors.New("expected ÷ or × as first character")
	}

	// boundary reports whether the previous marker allowed a break, i.e.
	// whether the next code point starts a new inner slice.
	boundary := true
	exp = append(exp, "[][]rune{"...)
	for len(b) > 0 {
		if boundary {
			exp = append(exp, '{')
		}
		exp = append(exp, "0x"...)

		// Find end of hex digits, rejecting anything that is not one.
		i := 0
		for ; i < len(b) && b[i] != ' '; i++ {
			switch d := b[i]; {
			case '0' <= d && d <= '9', 'A' <= d && d <= 'F', 'a' <= d && d <= 'f':
				// Valid hex digit.
			default:
				return nil, nil, errors.New("bad hex digit")
			}
		}

		// Emit the escape prefix: \u takes exactly 4 hex digits, \U exactly 8,
		// so longer code points are zero-padded.
		switch i {
		case 4:
			orig = append(orig, "\\u"...)
		case 5:
			orig = append(orig, "\\U000"...)
		case 6:
			orig = append(orig, "\\U00"...)
		default:
			return nil, nil, errors.New("unsupported code point hex length")
		}
		orig = append(orig, b[:i]...)
		exp = append(exp, b[:i]...)
		b = b[i:]

		// Check for space between hex and ÷ or ×.
		if len(b) < 1 || b[0] != ' ' {
			return nil, nil, errors.New("bad input")
		}
		b = b[1:]

		// Check for next boundary.
		switch {
		case bytes.HasPrefix(b, breakOk):
			boundary = true
			b = b[len(breakOk):]
		case bytes.HasPrefix(b, breakNo):
			boundary = false
			b = b[len(breakNo):]
		default:
			return nil, nil, errors.New("missing ÷ or ×")
		}
		if boundary {
			exp = append(exp, '}')
		}
		exp = append(exp, ',')
		if len(b) > 0 && b[0] == ' ' {
			b = b[1:]
		}
	}

	// A trailing × would leave the last inner slice unclosed and produce
	// unbalanced Go source; report it here instead.
	if !boundary {
		return nil, nil, errors.New("expected ÷ as last character")
	}
	exp = append(exp, '}')
	return orig, exp, nil
}