Sun Apr 14 15:21:20 2024 UTC (25d)
make: make string matching platform-independent

Previously, whether the character range '[a-ä]' matched depended on the
signedness of the plain 'char' type.  Since make operates on byte
strings and does not support UTF-8 or other multi-byte character
encodings, this edge case is not expected to occur in practice.

No change in the unit tests as this edge case is not covered by tests.


(rillig)
diff -r1.102 -r1.103 src/usr.bin/make/str.c

cvs diff -r1.102 -r1.103 src/usr.bin/make/str.c (expand / switch to unified diff)

--- src/usr.bin/make/str.c 2024/01/05 23:22:06 1.102
+++ src/usr.bin/make/str.c 2024/04/14 15:21:20 1.103
@@ -1,14 +1,14 @@ @@ -1,14 +1,14 @@
1/* $NetBSD: str.c,v 1.102 2024/01/05 23:22:06 rillig Exp $ */ 1/* $NetBSD: str.c,v 1.103 2024/04/14 15:21:20 rillig Exp $ */
2 2
3/* 3/*
4 * Copyright (c) 1988, 1989, 1990, 1993 4 * Copyright (c) 1988, 1989, 1990, 1993
5 * The Regents of the University of California. All rights reserved. 5 * The Regents of the University of California. All rights reserved.
6 * 6 *
7 * This code is derived from software contributed to Berkeley by 7 * This code is derived from software contributed to Berkeley by
8 * Adam de Boor. 8 * Adam de Boor.
9 * 9 *
10 * Redistribution and use in source and binary forms, with or without 10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions 11 * modification, are permitted provided that the following conditions
12 * are met: 12 * are met:
13 * 1. Redistributions of source code must retain the above copyright 13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer. 14 * notice, this list of conditions and the following disclaimer.
@@ -61,27 +61,27 @@ @@ -61,27 +61,27 @@
61 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 61 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
62 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 62 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
63 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 63 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
64 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 64 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 65 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 66 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 67 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68 * SUCH DAMAGE. 68 * SUCH DAMAGE.
69 */ 69 */
70 70
71#include "make.h" 71#include "make.h"
72 72
73/* "@(#)str.c 5.8 (Berkeley) 6/1/90" */ 73/* "@(#)str.c 5.8 (Berkeley) 6/1/90" */
74MAKE_RCSID("$NetBSD: str.c,v 1.102 2024/01/05 23:22:06 rillig Exp $"); 74MAKE_RCSID("$NetBSD: str.c,v 1.103 2024/04/14 15:21:20 rillig Exp $");
75 75
76 76
77static HashTable interned_strings; 77static HashTable interned_strings;
78 78
79 79
80/* Return the concatenation of s1 and s2, freshly allocated. */ 80/* Return the concatenation of s1 and s2, freshly allocated. */
81char * 81char *
82str_concat2(const char *s1, const char *s2) 82str_concat2(const char *s1, const char *s2)
83{ 83{
84 size_t len1 = strlen(s1); 84 size_t len1 = strlen(s1);
85 size_t len2 = strlen(s2); 85 size_t len2 = strlen(s2);
86 char *result = bmake_malloc(len1 + len2 + 1); 86 char *result = bmake_malloc(len1 + len2 + 1);
87 memcpy(result, s1, len1); 87 memcpy(result, s1, len1);
@@ -287,46 +287,26 @@ Str_Words(const char *str, bool expand) @@ -287,46 +287,26 @@ Str_Words(const char *str, bool expand)
287 return words; 287 return words;
288 } 288 }
289 289
290 words.words = bmake_malloc((swords.len + 1) * sizeof(words.words[0])); 290 words.words = bmake_malloc((swords.len + 1) * sizeof(words.words[0]));
291 words.len = swords.len; 291 words.len = swords.len;
292 words.freeIt = swords.freeIt; 292 words.freeIt = swords.freeIt;
293 for (i = 0; i < swords.len + 1; i++) 293 for (i = 0; i < swords.len + 1; i++)
294 words.words[i] = UNCONST(swords.words[i].start); 294 words.words[i] = UNCONST(swords.words[i].start);
295 free(swords.words); 295 free(swords.words);
296 return words; 296 return words;
297} 297}
298 298
299/* 299/*
300 * XXX: In the extreme edge case that one of the characters is from the basic 
301 * execution character set and the other isn't, the result of the comparison 
302 * differs depending on whether plain char is signed or unsigned. 
303 * 
304 * An example is the character range from \xE4 to 'a', where \xE4 may come 
305 * from U+00E4 'Latin small letter A with diaeresis'. 
306 * 
307 * If char is signed, \xE4 evaluates to -28, the first half of the condition 
308 * becomes -28 <= '0' && '0' <= 'a', which evaluates to true. 
309 * 
310 * If char is unsigned, \xE4 evaluates to 228, the second half of the 
311 * condition becomes 'a' <= '0' && '0' <= 228, which evaluates to false. 
312 */ 
313static bool 
314in_range(char e1, char c, char e2) 
315{ 
316 return (e1 <= c && c <= e2) || (e2 <= c && c <= e1); 
317} 
318 
319/* 
320 * Test if a string matches a pattern like "*.[ch]". The pattern matching 300 * Test if a string matches a pattern like "*.[ch]". The pattern matching
321 * characters are '*', '?' and '[]', as in fnmatch(3). 301 * characters are '*', '?' and '[]', as in fnmatch(3).
322 * 302 *
323 * See varmod-match.mk for examples and edge cases. 303 * See varmod-match.mk for examples and edge cases.
324 */ 304 */
325StrMatchResult 305StrMatchResult
326Str_Match(const char *str, const char *pat) 306Str_Match(const char *str, const char *pat)
327{ 307{
328 StrMatchResult res = { NULL, false }; 308 StrMatchResult res = { NULL, false };
329 bool asterisk = false; 309 bool asterisk = false;
330 const char *fixed_str = str; 310 const char *fixed_str = str;
331 const char *fixed_pat = pat; 311 const char *fixed_pat = pat;
332 312
@@ -350,27 +330,31 @@ match_fixed_length: @@ -350,27 +330,31 @@ match_fixed_length:
350 if (*pat == ']' || *pat == '\0') { 330 if (*pat == ']' || *pat == '\0') {
351 if (neg) 331 if (neg)
352 goto end_of_char_list; 332 goto end_of_char_list;
353 goto no_match; 333 goto no_match;
354 } 334 }
355 if (*pat == *str) 335 if (*pat == *str)
356 goto end_of_char_list; 336 goto end_of_char_list;
357 if (pat[1] == '-' && pat[2] == '\0') { 337 if (pat[1] == '-' && pat[2] == '\0') {
358 res.error = "Unfinished character range"; 338 res.error = "Unfinished character range";
359 res.matched = neg; 339 res.matched = neg;
360 return res; 340 return res;
361 } 341 }
362 if (pat[1] == '-') { 342 if (pat[1] == '-') {
363 if (in_range(pat[0], *str, pat[2])) 343 unsigned char e1 = (unsigned char)pat[0];
 344 unsigned char c = (unsigned char)*str;
 345 unsigned char e2 = (unsigned char)pat[2];
 346 if ((e1 <= c && c <= e2)
 347 || (e2 <= c && c <= e1))
364 goto end_of_char_list; 348 goto end_of_char_list;
365 pat += 2; 349 pat += 2;
366 } 350 }
367 pat++; 351 pat++;
368 goto next_char_in_list; 352 goto next_char_in_list;
369 353
370 end_of_char_list: 354 end_of_char_list:
371 if (neg && *pat != ']' && *pat != '\0') 355 if (neg && *pat != ']' && *pat != '\0')
372 goto no_match; 356 goto no_match;
373 while (*pat != ']' && *pat != '\0') 357 while (*pat != ']' && *pat != '\0')
374 pat++; 358 pat++;
375 if (*pat == '\0') 359 if (*pat == '\0')
376 pat--; 360 pat--;