Thu Dec 28 03:49:35 2023 UTC (158d)
mklocale: XXX: Neglect TODIGIT at the moment

PR lib/57798

TODIGIT was implemented on the assumption that every digit character
maps to a numerical value <= 255.

This is no longer true for Unicode, and results in, e.g., wrong
return values of wcwidth(3) for U+5146 or U+16B60.

As a workaround, ignore TODIGIT for now, as OpenBSD does:
https://github.com/OpenBSD/src/commit/4efe9bdeb34

XXX
At least netbsd-10 should also receive this fix, but that requires some testing first.


(rin)
diff -r1.17 -r1.18 src/usr.bin/mklocale/mklocale.1
diff -r1.34 -r1.35 src/usr.bin/mklocale/yacc.y

cvs diff -r1.17 -r1.18 src/usr.bin/mklocale/mklocale.1 (expand / switch to unified diff)

--- src/usr.bin/mklocale/mklocale.1 2017/07/03 21:34:20 1.17
+++ src/usr.bin/mklocale/mklocale.1 2023/12/28 03:49:35 1.18
@@ -1,14 +1,14 @@ @@ -1,14 +1,14 @@
1.\" $NetBSD: mklocale.1,v 1.17 2017/07/03 21:34:20 wiz Exp $ 1.\" $NetBSD: mklocale.1,v 1.18 2023/12/28 03:49:35 rin Exp $
2.\" FreeBSD: src/usr.bin/mklocale/mklocale.1,v 1.6 1999/09/20 09:15:21 phantom Exp 2.\" FreeBSD: src/usr.bin/mklocale/mklocale.1,v 1.6 1999/09/20 09:15:21 phantom Exp
3.\" 3.\"
4.\" Copyright (c) 1993, 1994 4.\" Copyright (c) 1993, 1994
5.\" The Regents of the University of California. All rights reserved. 5.\" The Regents of the University of California. All rights reserved.
6.\" 6.\"
7.\" This code is derived from software contributed to Berkeley by 7.\" This code is derived from software contributed to Berkeley by
8.\" Paul Borman at Krystal Technologies. 8.\" Paul Borman at Krystal Technologies.
9.\" 9.\"
10.\" Redistribution and use in source and binary forms, with or without 10.\" Redistribution and use in source and binary forms, with or without
11.\" modification, are permitted provided that the following conditions 11.\" modification, are permitted provided that the following conditions
12.\" are met: 12.\" are met:
13.\" 1. Redistributions of source code must retain the above copyright 13.\" 1. Redistributions of source code must retain the above copyright
14.\" notice, this list of conditions and the following disclaimer. 14.\" notice, this list of conditions and the following disclaimer.
@@ -23,27 +23,27 @@ @@ -23,27 +23,27 @@
23.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32.\" SUCH DAMAGE. 32.\" SUCH DAMAGE.
33.\" 33.\"
34.\" @(#)mklocale.1 8.2 (Berkeley) 4/18/94 34.\" @(#)mklocale.1 8.2 (Berkeley) 4/18/94
35.\" 35.\"
36.Dd July 15, 2013 36.Dd December 28, 2023
37.Dt MKLOCALE 1 37.Dt MKLOCALE 1
38.Os 38.Os
39.Sh NAME 39.Sh NAME
40.Nm mklocale 40.Nm mklocale
41.Nd make LC_CTYPE locale files 41.Nd make LC_CTYPE locale files
42.Sh SYNOPSIS 42.Sh SYNOPSIS
43.Nm mklocale 43.Nm mklocale
44.Op Fl d 44.Op Fl d
45.Op Fl t Ar type 45.Op Fl t Ar type
46< 46<
47.Ar source 47.Ar source
48> 48>
49.Ar language/LC_CTYPE 49.Ar language/LC_CTYPE
@@ -200,27 +200,29 @@ is the lower case representation of @@ -200,27 +200,29 @@ is the lower case representation of
200.It Dv MAPUPPER 200.It Dv MAPUPPER
201Defines the toupper mappings. 201Defines the toupper mappings.
202.Dv RUNE2 202.Dv RUNE2
203is the upper case representation of 203is the upper case representation of
204.Dv RUNE1 . 204.Dv RUNE1 .
205.It Dv TODIGIT 205.It Dv TODIGIT
206Defines a map from runes to their digit value. 206Defines a map from runes to their digit value.
207.Dv RUNE2 207.Dv RUNE2
208is the integer value represented by 208is the integer value represented by
209.Dv RUNE1 . 209.Dv RUNE1 .
210For example, the ASCII character 210For example, the ASCII character
211.Sq 0 211.Sq 0
212would map to the decimal value 0. 212would map to the decimal value 0.
213Only values up to 255 are allowed. 213On
 214.Nx ,
 215this information is ignored and not put into the binary output file.
214.El 216.El
215.Pp 217.Pp
216The following keywords may appear multiple times and have the following 218The following keywords may appear multiple times and have the following
217format for data: 219format for data:
218.Bl -tag -width "RUNE1 THRU RUNEn" 220.Bl -tag -width "RUNE1 THRU RUNEn"
219.It Dv RUNE 221.It Dv RUNE
220This rune has the property defined by the keyword. 222This rune has the property defined by the keyword.
221.It Dv "RUNE1 THRU RUNEn" 223.It Dv "RUNE1 THRU RUNEn"
222All the runes between and including 224All the runes between and including
223.Dv RUNE1 225.Dv RUNE1
224and 226and
225.Dv RUNEn 227.Dv RUNEn
226have the property defined by the keyword. 228have the property defined by the keyword.

cvs diff -r1.34 -r1.35 src/usr.bin/mklocale/yacc.y (expand / switch to unified diff)

--- src/usr.bin/mklocale/yacc.y 2019/10/13 21:12:32 1.34
+++ src/usr.bin/mklocale/yacc.y 2023/12/28 03:49:35 1.35
@@ -1,14 +1,14 @@ @@ -1,14 +1,14 @@
1/* $NetBSD: yacc.y,v 1.34 2019/10/13 21:12:32 christos Exp $ */ 1/* $NetBSD: yacc.y,v 1.35 2023/12/28 03:49:35 rin Exp $ */
2 2
3%{ 3%{
4/*- 4/*-
5 * Copyright (c) 1993 5 * Copyright (c) 1993
6 * The Regents of the University of California. All rights reserved. 6 * The Regents of the University of California. All rights reserved.
7 * 7 *
8 * This code is derived from software contributed to Berkeley by 8 * This code is derived from software contributed to Berkeley by
9 * Paul Borman at Krystal Technologies. 9 * Paul Borman at Krystal Technologies.
10 * 10 *
11 * Redistribution and use in source and binary forms, with or without 11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions 12 * modification, are permitted provided that the following conditions
13 * are met: 13 * are met:
14 * 1. Redistributions of source code must retain the above copyright 14 * 1. Redistributions of source code must retain the above copyright
@@ -33,27 +33,27 @@ @@ -33,27 +33,27 @@
33 * SUCH DAMAGE. 33 * SUCH DAMAGE.
34 */ 34 */
35 35
36#if HAVE_NBTOOL_CONFIG_H 36#if HAVE_NBTOOL_CONFIG_H
37#include "nbtool_config.h" 37#include "nbtool_config.h"
38#endif 38#endif
39 39
40#include <sys/cdefs.h> 40#include <sys/cdefs.h>
41#ifndef lint 41#ifndef lint
42#if 0 42#if 0
43static char sccsid[] = "@(#)yacc.y 8.1 (Berkeley) 6/6/93"; 43static char sccsid[] = "@(#)yacc.y 8.1 (Berkeley) 6/6/93";
44static char rcsid[] = "$FreeBSD$"; 44static char rcsid[] = "$FreeBSD$";
45#else 45#else
46__RCSID("$NetBSD: yacc.y,v 1.34 2019/10/13 21:12:32 christos Exp $"); 46__RCSID("$NetBSD: yacc.y,v 1.35 2023/12/28 03:49:35 rin Exp $");
47#endif 47#endif
48#endif /* not lint */ 48#endif /* not lint */
49 49
50#include <sys/types.h> 50#include <sys/types.h>
51#include <netinet/in.h> /* Needed by <arpa/inet.h> on NetBSD 1.5. */ 51#include <netinet/in.h> /* Needed by <arpa/inet.h> on NetBSD 1.5. */
52#include <arpa/inet.h> /* Needed for htonl on POSIX systems. */ 52#include <arpa/inet.h> /* Needed for htonl on POSIX systems. */
53 53
54#include <err.h> 54#include <err.h>
55#include <locale.h> 55#include <locale.h>
56#include <stddef.h> 56#include <stddef.h>
57#include <stdio.h> 57#include <stdio.h>
58#include <stdlib.h> 58#include <stdlib.h>
59#include <string.h> 59#include <string.h>
@@ -72,27 +72,29 @@ rune_map types = { { 0, }, }; @@ -72,27 +72,29 @@ rune_map types = { { 0, }, };
72 72
73_FileRuneLocale new_locale = { { 0, }, }; 73_FileRuneLocale new_locale = { { 0, }, };
74 74
75size_t rl_variable_len = (size_t)0;  75size_t rl_variable_len = (size_t)0;
76void *rl_variable = NULL; 76void *rl_variable = NULL;
77 77
78__nbrune_t charsetbits = (__nbrune_t)0x00000000; 78__nbrune_t charsetbits = (__nbrune_t)0x00000000;
79#if 0 79#if 0
80__nbrune_t charsetmask = (__nbrune_t)0x0000007f; 80__nbrune_t charsetmask = (__nbrune_t)0x0000007f;
81#endif 81#endif
82__nbrune_t charsetmask = (__nbrune_t)0xffffffff; 82__nbrune_t charsetmask = (__nbrune_t)0xffffffff;
83 83
84void set_map(rune_map *, rune_list *, u_int32_t); 84void set_map(rune_map *, rune_list *, u_int32_t);
 85#if 0
85void set_digitmap(rune_map *, rune_list *); 86void set_digitmap(rune_map *, rune_list *);
 87#endif
86void add_map(rune_map *, rune_list *, u_int32_t); 88void add_map(rune_map *, rune_list *, u_int32_t);
87 89
88__dead void usage(void); 90__dead void usage(void);
89int yyerror(const char *s); 91int yyerror(const char *s);
90void *xmalloc(unsigned int sz); 92void *xmalloc(unsigned int sz);
91u_int32_t *xlalloc(unsigned int sz); 93u_int32_t *xlalloc(unsigned int sz);
92u_int32_t *xrelalloc(u_int32_t *old, unsigned int sz); 94u_int32_t *xrelalloc(u_int32_t *old, unsigned int sz);
93void dump_tables(void); 95void dump_tables(void);
94int yyparse(void); 96int yyparse(void);
95extern int yylex(void); 97extern int yylex(void);
96 98
97/* mklocaledb.c */ 99/* mklocaledb.c */
98extern void mklocaledb(const char *, FILE *, FILE *); 100extern void mklocaledb(const char *, FILE *, FILE *);
@@ -177,28 +179,39 @@ entry : ENCODING STRING @@ -177,28 +179,39 @@ entry : ENCODING STRING
177 /*Latin1: 96A*/ 179 /*Latin1: 96A*/
178 charsetbits = 0x80; 180 charsetbits = 0x80;
179 charsetmask = 0x0000007f; 181 charsetmask = 0x0000007f;
180 } 182 }
181 } 183 }
182 | INVALID RUNE 184 | INVALID RUNE
183 { new_locale.frl_invalid_rune = htonl((u_int32_t)$2); } 185 { new_locale.frl_invalid_rune = htonl((u_int32_t)$2); }
184 | LIST list 186 | LIST list
185 { set_map(&types, $2, $1); } 187 { set_map(&types, $2, $1); }
186 | MAPLOWER map 188 | MAPLOWER map
187 { set_map(&maplower, $2, 0); } 189 { set_map(&maplower, $2, 0); }
188 | MAPUPPER map 190 | MAPUPPER map
189 { set_map(&mapupper, $2, 0); } 191 { set_map(&mapupper, $2, 0); }
190 | DIGITMAP map 192/*
191 { set_digitmap(&types, $2); } 193 * XXX PR lib/57798
 194 * set_digitmap() was implemented with an assumption that
 195 * all characters are mapped to numerical values <= 255.
 196 * This is no longer true for Unicode, and results in, e.g.,
 197 * wrong return values of wcwidth(3) for U+5146 or U+16B60.
 198 *
 199 * | DIGITMAP map
 200 * { set_digitmap(&types, $2); }
 201 *
 202 */
 203 | DIGITMAP mapignore
 204 { }
192 ; 205 ;
193 206
194list : RUNE 207list : RUNE
195 { 208 {
196 $$ = (rune_list *)malloc(sizeof(rune_list)); 209 $$ = (rune_list *)malloc(sizeof(rune_list));
197 $$->min = ($1 & charsetmask) | charsetbits; 210 $$->min = ($1 & charsetmask) | charsetbits;
198 $$->max = ($1 & charsetmask) | charsetbits; 211 $$->max = ($1 & charsetmask) | charsetbits;
199 $$->next = 0; 212 $$->next = 0;
200 } 213 }
201 | RUNE THRU RUNE 214 | RUNE THRU RUNE
202 { 215 {
203 $$ = (rune_list *)malloc(sizeof(rune_list)); 216 $$ = (rune_list *)malloc(sizeof(rune_list));
204 $$->min = ($1 & charsetmask) | charsetbits; 217 $$->min = ($1 & charsetmask) | charsetbits;
@@ -244,26 +257,32 @@ map : LBRK RUNE RUNE RBRK @@ -244,26 +257,32 @@ map : LBRK RUNE RUNE RBRK
244 $$->max = ($4 & charsetmask) | charsetbits; 257 $$->max = ($4 & charsetmask) | charsetbits;
245 $$->map = $6; 258 $$->map = $6;
246 $$->next = 0; 259 $$->next = 0;
247 } 260 }
248 | map LBRK RUNE THRU RUNE ':' RUNE RBRK 261 | map LBRK RUNE THRU RUNE ':' RUNE RBRK
249 { 262 {
250 $$ = (rune_list *)malloc(sizeof(rune_list)); 263 $$ = (rune_list *)malloc(sizeof(rune_list));
251 $$->min = ($3 & charsetmask) | charsetbits; 264 $$->min = ($3 & charsetmask) | charsetbits;
252 $$->max = ($5 & charsetmask) | charsetbits; 265 $$->max = ($5 & charsetmask) | charsetbits;
253 $$->map = $7; 266 $$->map = $7;
254 $$->next = $1; 267 $$->next = $1;
255 } 268 }
256 ; 269 ;
 270
 271mapignore : LBRK RUNE RUNE RBRK { }
 272 | map LBRK RUNE RUNE RBRK { }
 273 | LBRK RUNE THRU RUNE ':' RUNE RBRK { }
 274 | map LBRK RUNE THRU RUNE ':' RUNE RBRK { }
 275 ;
257%% 276%%
258 277
259int debug = 0; 278int debug = 0;
260FILE *ofile; 279FILE *ofile;
261 280
262int 281int
263main(int ac, char *av[]) 282main(int ac, char *av[])
264{ 283{
265 int x; 284 int x;
266 const char *locale_type; 285 const char *locale_type;
267 286
268 extern char *optarg; 287 extern char *optarg;
269 extern int optind; 288 extern int optind;
@@ -372,45 +391,47 @@ xrelalloc(u_int32_t *old, unsigned int s @@ -372,45 +391,47 @@ xrelalloc(u_int32_t *old, unsigned int s
372 391
373void 392void
374set_map(rune_map *map, rune_list *list, u_int32_t flag) 393set_map(rune_map *map, rune_list *list, u_int32_t flag)
375{ 394{
376 list->map &= charsetmask; 395 list->map &= charsetmask;
377 list->map |= charsetbits; 396 list->map |= charsetbits;
378 while (list) { 397 while (list) {
379 rune_list *nlist = list->next; 398 rune_list *nlist = list->next;
380 add_map(map, list, flag); 399 add_map(map, list, flag);
381 list = nlist; 400 list = nlist;
382 } 401 }
383} 402}
384 403
 404#if 0
385void 405void
386set_digitmap(rune_map *map, rune_list *list) 406set_digitmap(rune_map *map, rune_list *list)
387{ 407{
388 __nbrune_t i; 408 __nbrune_t i;
389 409
390 while (list) { 410 while (list) {
391 rune_list *nlist = list->next; 411 rune_list *nlist = list->next;
392 for (i = list->min; i <= list->max; ++i) { 412 for (i = list->min; i <= list->max; ++i) {
393 if (list->map + (i - list->min)) { 413 if (list->map + (i - list->min)) {
394 rune_list *tmp = (rune_list *)xmalloc(sizeof(rune_list)); 414 rune_list *tmp = (rune_list *)xmalloc(sizeof(rune_list));
395 tmp->min = i; 415 tmp->min = i;
396 tmp->max = i; 416 tmp->max = i;
397 add_map(map, tmp, list->map + (i - list->min)); 417 add_map(map, tmp, list->map + (i - list->min));
398 } 418 }
399 } 419 }
400 free(list); 420 free(list);
401 list = nlist; 421 list = nlist;
402 } 422 }
403} 423}
 424#endif
404 425
405void 426void
406add_map(rune_map *map, rune_list *list, u_int32_t flag) 427add_map(rune_map *map, rune_list *list, u_int32_t flag)
407{ 428{
408 __nbrune_t i; 429 __nbrune_t i;
409 rune_list *lr = 0; 430 rune_list *lr = 0;
410 rune_list *r; 431 rune_list *r;
411 __nbrune_t run; 432 __nbrune_t run;
412 433
413 while (list->min < _CTYPE_CACHE_SIZE && list->min <= list->max) { 434 while (list->min < _CTYPE_CACHE_SIZE && list->min <= list->max) {
414 if (flag) 435 if (flag)
415 map->map[list->min++] |= flag; 436 map->map[list->min++] |= flag;
416 else 437 else