@@ -1,115 +1,111 @@
-/* $NetBSD: memchr.S,v 1.2 2009/07/18 18:06:56 dsl Exp $ */
-
-/*-
- * Copyright (c) 2009 The NetBSD Foundation, Inc.
- * All rights reserved.
- *
- * This code is derived from software contributed to The NetBSD Foundation
- * by David Laight.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
- * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
+/*
+ * Written by J.T. Conklin <jtc@acorntoolworks.com>
+ * Public domain.
*/
#include <machine/asm.h>
#if defined(LIBC_SCCS)
- RCSID("$NetBSD: memchr.S,v 1.2 2009/07/18 18:06:56 dsl Exp $")
+ RCSID("$NetBSD: memchr.S,v 1.3 2009/07/19 23:45:29 christos Exp $")
#endif
-/*
- * The instruction sequences used try to avoid data dependencies
- * between adjacent instructions (to allow parallel execution).
- * The 'imul' for %r9 could be put into the delay following the
- * memory read (ie inside the loop) at no obvious cost - except
- * that the loop is currently exactly 32 bytes - 2 fetch blocks!.
- *
- * I don't think aligning any of the other branch targets is useful.
- */
-
ENTRY(memchr)
- movabsq $0x0101010101010101,%r8
- lea (%rdi,%rdx),%r10 /* limit of buffer to scan */
- movzbq %sil,%rsi /* mask high bits! */
+ movzbq %sil,%rcx
- /* 'directpath' imuls can execute 3 at a time ... (amd) */
- imul %r8,%rsi /* search byte replicated in word */
- imul $0x80,%r8,%r9 /* 0x8080808080808080 */
- test $7,%dil
- jnz 20f /* jump if misaligned */
- jmp 1f /* jump to avoid 4 nops (13 bytes) in gap */
+ /*
+ * Align to word boundary.
+ * Consider unrolling loop?
+ */
+ testq %rdx,%rdx /* nbytes == 0? */
+ je .Lzero
+.Lalign:
+ testb $7,%dil
+ je .Lword_aligned
+ movq %rdi,%rax
+ cmpb (%rdi),%cl
+ je .Ldone
+ incq %rdi
+ decq %rdx
+ jnz .Lalign
+ jmp .Lzero
- _ALIGN_TEXT /* entire loop now in 32 aligned bytes */
-1:
- cmpq %r10,%rdi /* end of buffer ? */
- jae 30f /* jump if so */
+.Lword_aligned:
+ /* copy char to all bytes in word */
+ movb %cl,%ch
+ movq %rcx,%rsi
+ salq $16,%rcx
+ orq %rsi,%rcx
+ movq %rcx,%rsi
+ salq $32,%rcx
+ orq %rsi,%rcx
- movq (%rdi),%rax /* value to check */
-2:
+ movabsq $0x0101010101010101,%r8
+ movabsq $0x8080808080808080,%r9
+
+ _ALIGN_TEXT
+.Lloop:
+ cmpq $7,%rdx /* nbytes > 8 */
+ jbe .Lbyte
+ movq (%rdi),%rsi
addq $8,%rdi
- xorq %rsi,%rax /* now looking for zeros */
- mov %rax,%rcx
- subq %r8,%rax /* x - 0x10 */
- not %rcx
- andq %r9,%rax /* (x - 0x10) & 0x80 */
- andq %rcx,%rax /* ((x - 0x10) & 0x80) ^ ~x */
- je 1b /* jump if not found */
+ xorq %rcx,%rsi
+ subq $8,%rdx
+ subq %r8,%rsi
+ testq %r9,%rsi
+ je .Lloop
-/* Found byte in word, get its address */
- bsf %rax,%rax
- shr $3,%eax
- lea -8(%rax,%rdi),%rax
- cmpq %r10,%rax /* need to check not beyond buffer */
- jae 30f
- rep
- ret /* amd - no ret after jmp */
+ /*
+ * In rare cases, the above loop may exit prematurely. We must
+ * return to the loop if none of the bytes in the word are
+ * equal to ch.
+ */
-/* Input misaligned, read aligned and kill low bits */
-/* (Getting a -1 is surprisingly hard work!) */
-20:
- xor %eax,%eax /* zeros all 64 bits */
- mov %dil,%cl /* misalignment amount 1..7 (+high bits )*/
- test %rdx,%rdx /* zero length, don't read */
- jz 30f
+ leaq -8(%rdi),%rax
+ cmpb -8(%rdi),%cl /* 1st byte == ch? */
+ je .Ldone
- and $~7,%dil /* %rdi now start of word */
- lea -1(%rax),%r11 /* all 0xff */
- and $7,%cl /* 1..7 */
+ leaq -7(%rdi),%rax
+ cmpb -7(%rdi),%cl /* 2nd byte == ch? */
+ je .Ldone
- mov (%rdi),%rax /* word containing first byte */
- shl $3,%cl /* 8..56 */
- test %r11,%rsi /* searching for 0xff */
- jz 25f
+ leaq -6(%rdi),%rax
+ cmpb -6(%rdi),%cl /* 3rd byte == ch? */
+ je .Ldone
- /* Searching for other than 0xff, set low bytes */
- shl %cl,%r11 /* 0xff in high (wanted) bytes */
- not %r11 /* 0xff in low (unwanted) bytes */
- or %r11,%rax /* low bytes now set */
- jmp 2b
+ leaq -5(%rdi),%rax
+ cmpb -5(%rdi),%cl /* 4th byte == ch? */
+ je .Ldone
-25: /* Searching for 0xff, clear low bytes */
- shl %cl,%r11 /* 0xff in high (wanted) bytes */
- and %r11,%rax /* low bytes now zero */
- jmp 2b
+ leaq -4(%rdi),%rax
+ cmpb -4(%rdi),%cl /* 5th byte == ch? */
+ je .Ldone
-/* Not found */
-30: xorq %rax,%rax
+ leaq -3(%rdi),%rax
+ cmpb -3(%rdi),%cl /* 6th byte == ch? */
+ je .Ldone
+
+ leaq -2(%rdi),%rax
+ cmpb -2(%rdi),%cl /* 7th byte == ch? */
+ je .Ldone
+
+ leaq -1(%rdi),%rax
+ cmpb -1(%rdi),%cl /* 7th byte == ch? */
+ jne .Lloop
+ ret
+
+.Lbyte:
+ testq %rdx,%rdx
+ je .Lzero
+.Lbyte_loop:
+ movq %rdi,%rax
+ cmpb (%rdi),%cl
+ je .Ldone
+ incq %rdi
+ decq %rdx
+ jnz .Lbyte_loop
+
+.Lzero:
+ xorq %rax,%rax
+
+.Ldone:
ret
@@ -1,153 +1,133 @@
-/* $NetBSD: strchr.S,v 1.4 2009/07/18 16:40:31 dsl Exp $ */
-
-/*-
- * Copyright (c) 2009 The NetBSD Foundation, Inc.
- * All rights reserved.
- *
- * This code is derived from software contributed to The NetBSD Foundation
- * by David Laight.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
- * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
+/*
+ * Written by J.T. Conklin <jtc@acorntoolworks.com>
+ * Public domain.
*/
-/* See comments in strlen.S about checking words for byte values */
-
#include <machine/asm.h>
#if defined(LIBC_SCCS)
- RCSID("$NetBSD: strchr.S,v 1.4 2009/07/18 16:40:31 dsl Exp $")
+ RCSID("$NetBSD: strchr.S,v 1.5 2009/07/19 23:45:29 christos Exp $")
#endif
-/*
- * On entry %rdi is the buffer and the low byte of %rsi (%sil) the
- * character to search for.
- *
- * Registers %rdx, %rcx, %r8-%r11 and %rax are also usable
- */
+ENTRY(strchr)
+ movzbq %sil,%rcx
-/* Uncomment below to get regression test to run this version but
- * have everything else use the trivial one below. */
-/* #define TEST_STRCHR */
+ /*
+ * Align to word boundary.
+ * Consider unrolling loop?
+ */
+.Lalign:
+ testb $7,%dil
+ je .Lword_aligned
+ movb (%rdi),%dl
+ cmpb %cl,%dl
+ je .Ldone
+ incq %rdi
+ testb %dl,%dl
+ jne .Lalign
+ jmp .Lzero
-#ifdef TEST_STRCHR
-ENTRY(test_strchr)
-#else
-ENTRY(strchr)
-#endif
+.Lword_aligned:
+ /* copy char to all bytes in word */
+ movb %cl,%ch
+ movq %rcx,%rdx
+ salq $16,%rcx
+ orq %rdx,%rcx
+ movq %rcx,%rdx
+ salq $32,%rcx
+ orq %rdx,%rcx
+
movabsq $0x0101010101010101,%r8
+ movabsq $0x8080808080808080,%r9
- movzbq %sil,%rdx /* value to search for (c) */
- /* These imul are 'directpath' on athlons, so are fast */
- imul $0x80,%r8,%r9 /* 0x8080808080808080 */
- imul %r8,%rdx /* (c) copied to all bytes */
- test $7,%dil
- jnz 20f /* jump if misaligned */
-
- _ALIGN_TEXT /* one byte nop */
-1:
- movq (%rdi),%rax /* bytes to check (x) */
-2:
+ /* Check whether any byte in the word is equal to ch or 0. */
+ _ALIGN_TEXT
+.Lloop:
+ movq (%rdi),%rdx
addq $8,%rdi
- mov %rax,%r10
- mov %rax,%r11 /* for 'char' check */
- not %r10 /* invert of data (~x) */
+ movq %rdx,%rsi
+ subq %r8,%rdx
+ xorq %rcx,%rsi
+ subq %r8,%rsi
+ orq %rsi,%rdx
+ testq %r9,%rdx
+ je .Lloop
- xorq %rdx,%r11 /* convert 'char' test to one for NUL */
- subq %r8,%rax /* x - 0x10 */
- movq %r10,%rsi /* ~x */
- subq %r8,%r11 /* (x ^ c) - 0x10 */
-/*
- * Here we could check ((x - 0x10) | ((x ^ c) - 0x10)) & 0x80
- * and short-circuit the case where no top bits are set, and
- * we continue the loop.
- * However it needs 3 more clocks that are difficult to interleave
- * in the existing dependency chain ...
- */
- andq %r9,%rax /* (x - 0x10) & 0x80 */
- xorq %rdx,%rsi /* c ^ ~x == ~(c ^ x) */
- andq %r9,%r11 /* ((x ^ c) - 0x10) & 0x80 */
- andq %r10,%rax /* (x - 0x10) & 0x80 & ~x */
- jne 10f /* jump if string ends */
- andq %rsi,%r11 /* ((x ^ c) - 0x10) & 0x80 & ~(x ^ c) */
- je 1b /* jump if no match */
+ /*
+ * In rare cases, the above loop may exit prematurely. We must
+ * return to the loop if none of the bytes in the word match
+ * ch or are equal to 0.
+ */
- /* Found char, since LE can use bit scan */
- bsf %r11,%r11 /* 7, 15, 23 ... 63 */
-8: shr $3,%r11 /* 0, 1, 2 .. 7 */
- lea -8(%r11,%rdi),%rax
- ret
+ movb -8(%rdi),%dl
+ cmpb %cl,%dl /* 1st byte == ch? */
+ jne 1f
+ subq $8,%rdi
+ jmp .Ldone
+1: testb %dl,%dl /* 1st byte == 0? */
+ je .Lzero
-/* End of string, check whether char is before NUL */
- _ALIGN_TEXT /* adds three byte nop */
-10:
- bsf %rax,%rax /* count to NUL */
- andq %rsi,%r11 /* check for char in last 8 bytes */
- je 11f
- bsf %r11,%r11 /* NUL and char - see which was first */
- cmp %r11,%rax
- jae 8b /* return 'found' if same - searching for NUL */
-11: xor %eax,%eax /* char not found */
- ret
+ movb -7(%rdi),%dl
+ cmpb %cl,%dl /* 2nd byte == ch? */
+ jne 1f
+ subq $7,%rdi
+ jmp .Ldone
+1: testb %dl,%dl /* 2nd byte == 0? */
+ je .Lzero
-/* Source misaligned: read aligned word and make low bytes invalid */
-/* I (dsl) think a _ALIGN_TEXT here will slow things down! */
-20:
- xor %rcx,%rcx
- sub %dil,%cl /* Convert low address values 1..7 ... */
- sbb %rsi,%rsi /* carry was set, so %rsi now ~0u! */
- and $7,%cl /* ... to 7..1 */
- and $~7,%dil /* move address to start of word */
- shl $3,%cl /* now 56, 48 ... 16, 8 */
- movq (%rdi),%rax /* aligned word containing first data */
- xor %rdx,%rsi /* invert of search pattern (~c) */
- je 22f /* searching for 0xff */
-21: shr %cl,%rsi /* ~c in low bytes */
- or %rsi,%rax /* set some bits making low bytes invalid */
- jmp 2b
+ movb -6(%rdi),%dl
+ cmpb %cl,%dl /* 3rd byte == ch? */
+ jne 1f
+ subq $6,%rdi
+ jmp .Ldone
+1: testb %dl,%dl /* 3rd byte == 0? */
+ je .Lzero
-/* We are searching for 0xff, so can't use ~pattern for invalid value */
-22:
- mov %r8,%r10 /* 0x01 pattern */
- lea (%r8,%r8),%rsi /* 0x02 - bits gets set (above) */
- not %r10 /* now 0xfe */
- sar %cl,%r10 /* top bytes 0xff */
- and %r10,%rax /* clear lsb from unwanted low bytes */
- jmp 21b
+ movb -5(%rdi),%dl
+ cmpb %cl,%dl /* 4th byte == ch? */
+ jne 1f
+ subq $5,%rdi
+ jmp .Ldone
+1: testb %dl,%dl /* 4th byte == 0? */
+ je .Lzero
-#ifdef TEST_STRCHR
-/* Trivial version for bug-fixing above */
-ENTRY(strchr)
- movq %rsi,%rdx
- movq %rdi,%rsi
-1:
- lodsb
- cmp %al,%dl
- je 2f
- test %al,%al
- jne 1b
- xor %eax,%eax
- ret
-2: lea -1(%rsi),%rax
- ret
-#endif
+ movb -4(%rdi),%dl
+ cmpb %cl,%dl /* 5th byte == ch? */
+ jne 1f
+ subq $4,%rdi
+ jmp .Ldone
+1: testb %dl,%dl /* 5th byte == 0? */
+ je .Lzero
+ movb -3(%rdi),%dl
+ cmpb %cl,%dl /* 6th byte == ch? */
+ jne 1f
+ subq $3,%rdi
+ jmp .Ldone
+1: testb %dl,%dl /* 6th byte == 0? */
+ je .Lzero
+
+ movb -2(%rdi),%dl
+ cmpb %cl,%dl /* 7th byte == ch? */
+ jne 1f
+ subq $2,%rdi
+ jmp .Ldone
+1: testb %dl,%dl /* 7th byte == 0? */
+ je .Lzero
+
+ movb -1(%rdi),%dl
+ cmpb %cl,%dl /* 8th byte == ch? */
+ jne 1f
+ subq $1,%rdi
+ jmp .Ldone
+1: testb %dl,%dl /* 8th byte == 0? */
+ jne .Lloop
+
+.Lzero:
+ /* If a ch wasn't found, return 0. */
+ xorq %rdi,%rdi
+
+.Ldone:
+ movq %rdi,%rax
+ ret
STRONG_ALIAS(index,strchr)