2 Copyright (c) 2014, Intel Corporation
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
8 * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
11 * Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
15 * Neither the name of Intel Corporation nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 /* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
33 if the new counter > the old one or is 0. */
34 #define UPDATE_STRNCMP_COUNTER \
35 /* calculate the remaining number of bytes to compare */ \
36 lea -16(%rcx, %r11), %r9; \
44 #define UPDATE_STRNCMP_COUNTER
51 # define L(label) .L##label
55 # define cfi_startproc .cfi_startproc
59 # define cfi_endproc .cfi_endproc
63 # define ENTRY(name) \
64 .type name, @function; \
77 .section .text.ssse3,"ax",@progbits
80 * This implementation uses SSE to compare up to 16 bytes at a time.
91 /* Use 64bit AND here to avoid long NOP padding. */
92 and $0x3f, %rcx /* rsi alignment in cache line */
93 and $0x3f, %rax /* rdi alignment in cache line */
95 ja L(crosscache) /* rsi: 16-byte load will cross cache line */
97 ja L(crosscache) /* rdi: 16-byte load will cross cache line */
100 movhpd 8(%rdi), %xmm1 /* high 8 bytes of %xmm1 = bytes 8..15 at %rdi */
101 movhpd 8(%rsi), %xmm2 /* high 8 bytes of %xmm2 = bytes 8..15 at %rsi */
102 pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
103 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
104 pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
105 psubb %xmm0, %xmm1 /* packed sub of comparison results */
107 sub $0xffff, %edx /* if first 16 bytes are the same, edx == 0xffff */
108 jnz L(less16bytes) /* If not, find different value or null char */
109 #ifdef USE_AS_STRNCMP
111 jbe L(strcmp_exitz) /* finish comparison */
113 add $16, %rsi /* prepare to search next 16 bytes */
114 add $16, %rdi /* prepare to search next 16 bytes */
117 * Determine source and destination string offsets from 16-byte alignment.
118 * Use relative offset difference between the two to determine which case
123 and $0xfffffffffffffff0, %rsi /* round %rsi down to a 16-byte boundary */
124 and $0xfffffffffffffff0, %rdi /* round %rdi down to a 16-byte boundary */
125 mov $0xffff, %edx /* for equivalent offset */
127 and $0xf, %ecx /* offset of rsi */
128 and $0xf, %eax /* offset of rdi */
130 je L(ashr_0) /* rsi and rdi have the same relative offset */
132 mov %edx, %r8d /* r8d is offset flag for exit tail */
138 lea L(unaligned_table)(%rip), %r10 /* %r10 = base address of the jump table */
139 movslq (%r10, %r9,4), %r9 /* load the signed 32-bit entry indexed by %r9 */
140 lea (%r10, %r9), %r10 /* entries are offsets relative to the table base */
141 jmp *%r10 /* jump to corresponding case */
144 * The following cases will be handled by ashr_0
145 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
146 * n(0~15) n(0~15) 15(15+ n-n) ashr_0
152 pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
153 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
154 pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
155 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
157 shr %cl, %edx /* adjust 0xffff for offset */
158 shr %cl, %r9d /* adjust for 16-byte offset */
161 * edx must be the same as r9d if the remaining (16-rcx) bytes are equal to
162 * the bytes starting from (16-rax) and no null char was seen.
164 jne L(less32bytes) /* mismatch or null char */
165 UPDATE_STRNCMP_COUNTER
168 pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */
171 * Now both strings are aligned at 16-byte boundary. Loop over strings
172 * checking 32-bytes per iteration.
176 movdqa (%rsi, %rcx), %xmm1
177 movdqa (%rdi, %rcx), %xmm2
184 jnz L(exit) /* mismatch or null char seen */
186 #ifdef USE_AS_STRNCMP
191 movdqa (%rsi, %rcx), %xmm1
192 movdqa (%rdi, %rcx), %xmm2
200 #ifdef USE_AS_STRNCMP
208 * The following cases will be handled by ashr_1
209 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
210 * n(15) n -15 0(15 +(n-15) - n) ashr_1
217 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
218 pslldq $15, %xmm2 /* shift first string to align with second */
219 pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
220 psubb %xmm0, %xmm2 /* packed sub of comparison results*/
222 shr %cl, %edx /* adjust 0xffff for offset */
223 shr %cl, %r9d /* adjust for 16-byte offset */
225 jnz L(less32bytes) /* mismatch or null char seen */
227 UPDATE_STRNCMP_COUNTER
230 mov $16, %rcx /* index for loads*/
231 mov $1, %r9d /* byte position left over from less32bytes case */
233 * Set up %r10 so that we can detect crossing a page boundary.
234 * When %r10 goes positive we have crossed a page boundary and
235 * need to do a nibble.
238 and $0xfff, %r10 /* offset into 4K page */
239 sub $0x1000, %r10 /* subtract 4K pagesize */
244 jg L(nibble_ashr_1) /* cross page boundary */
247 movdqa (%rsi, %rcx), %xmm1
248 movdqa (%rdi, %rcx), %xmm2
249 movdqa %xmm2, %xmm4 /* store for next cycle */
251 palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
260 #ifdef USE_AS_STRNCMP
268 jg L(nibble_ashr_1) /* cross page boundary */
270 movdqa (%rsi, %rcx), %xmm1
271 movdqa (%rdi, %rcx), %xmm2
272 movdqa %xmm2, %xmm4 /* store for next cycle */
274 palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
283 #ifdef USE_AS_STRNCMP
292 * Nibble avoids loads across page boundary. This is to avoid a potential
293 * access into unmapped memory.
297 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
300 jnz L(ashr_1_exittail) /* found a null char */
302 #ifdef USE_AS_STRNCMP
304 jbe L(ashr_1_exittail)
308 sub $0x1000, %r10 /* subtract 4K from %r10 */
312 * Once a null char is found, determine if there is a string mismatch
313 * before the null char.
317 movdqa (%rsi, %rcx), %xmm1
323 * The following cases will be handled by ashr_2
324 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
325 * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
342 UPDATE_STRNCMP_COUNTER
345 mov $16, %rcx /* index for loads */
346 mov $2, %r9d /* byte position left over from less32bytes case */
348 * Set up %r10 so that we can detect crossing a page boundary.
349 * When %r10 goes positive we have crossed a page boundary and
350 * need to do a nibble.
353 and $0xfff, %r10 /* offset into 4K page */
354 sub $0x1000, %r10 /* subtract 4K pagesize */
362 movdqa (%rsi, %rcx), %xmm1
363 movdqa (%rdi, %rcx), %xmm2
366 palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
375 #ifdef USE_AS_STRNCMP
384 jg L(nibble_ashr_2) /* cross page boundary */
386 movdqa (%rsi, %rcx), %xmm1
387 movdqa (%rdi, %rcx), %xmm2
390 palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
399 #ifdef USE_AS_STRNCMP
410 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
413 jnz L(ashr_2_exittail)
415 #ifdef USE_AS_STRNCMP
417 jbe L(ashr_2_exittail)
426 movdqa (%rsi, %rcx), %xmm1
432 * The following cases will be handled by ashr_3
433 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
434 * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
452 UPDATE_STRNCMP_COUNTER
455 mov $16, %rcx /* index for loads */
456 mov $3, %r9d /* byte position left over from less32bytes case */
458 * Set up %r10 so that we can detect crossing a page boundary.
459 * When %r10 goes positive we have crossed a page boundary and
460 * need to do a nibble.
463 and $0xfff, %r10 /* offset into 4K page */
464 sub $0x1000, %r10 /* subtract 4K pagesize */
472 movdqa (%rsi, %rcx), %xmm1
473 movdqa (%rdi, %rcx), %xmm2
476 palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
485 #ifdef USE_AS_STRNCMP
494 jg L(nibble_ashr_3) /* cross page boundary */
496 movdqa (%rsi, %rcx), %xmm1
497 movdqa (%rdi, %rcx), %xmm2
500 palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
509 #ifdef USE_AS_STRNCMP
520 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
523 jnz L(ashr_3_exittail)
525 #ifdef USE_AS_STRNCMP
527 jbe L(ashr_3_exittail)
536 movdqa (%rsi, %rcx), %xmm1
542 * The following cases will be handled by ashr_4
543 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
544 * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
562 UPDATE_STRNCMP_COUNTER
565 mov $16, %rcx /* index for loads */
566 mov $4, %r9d /* byte position left over from less32bytes case */
568 * Set up %r10 so that we can detect crossing a page boundary.
569 * When %r10 goes positive we have crossed a page boundary and
570 * need to do a nibble.
573 and $0xfff, %r10 /* offset into 4K page */
574 sub $0x1000, %r10 /* subtract 4K pagesize */
582 movdqa (%rsi, %rcx), %xmm1
583 movdqa (%rdi, %rcx), %xmm2
586 palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
595 #ifdef USE_AS_STRNCMP
604 jg L(nibble_ashr_4) /* cross page boundary */
606 movdqa (%rsi, %rcx), %xmm1
607 movdqa (%rdi, %rcx), %xmm2
610 palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
619 #ifdef USE_AS_STRNCMP
630 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
633 jnz L(ashr_4_exittail)
635 #ifdef USE_AS_STRNCMP
637 jbe L(ashr_4_exittail)
646 movdqa (%rsi, %rcx), %xmm1
652 * The following cases will be handled by ashr_5
653 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
654 * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
672 UPDATE_STRNCMP_COUNTER
675 mov $16, %rcx /* index for loads */
676 mov $5, %r9d /* byte position left over from less32bytes case */
678 * Set up %r10 so that we can detect crossing a page boundary.
679 * When %r10 goes positive we have crossed a page boundary and
680 * need to do a nibble.
683 and $0xfff, %r10 /* offset into 4K page */
684 sub $0x1000, %r10 /* subtract 4K pagesize */
692 movdqa (%rsi, %rcx), %xmm1
693 movdqa (%rdi, %rcx), %xmm2
696 palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
705 #ifdef USE_AS_STRNCMP
714 jg L(nibble_ashr_5) /* cross page boundary */
716 movdqa (%rsi, %rcx), %xmm1
717 movdqa (%rdi, %rcx), %xmm2
720 palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
729 #ifdef USE_AS_STRNCMP
740 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
743 jnz L(ashr_5_exittail)
745 #ifdef USE_AS_STRNCMP
747 jbe L(ashr_5_exittail)
756 movdqa (%rsi, %rcx), %xmm1
762 * The following cases will be handled by ashr_6
763 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
764 * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
782 UPDATE_STRNCMP_COUNTER
785 mov $16, %rcx /* index for loads */
786 mov $6, %r9d /* byte position left over from less32bytes case */
788 * Set up %r10 so that we can detect crossing a page boundary.
789 * When %r10 goes positive we have crossed a page boundary and
790 * need to do a nibble.
793 and $0xfff, %r10 /* offset into 4K page */
794 sub $0x1000, %r10 /* subtract 4K pagesize */
802 movdqa (%rsi, %rcx), %xmm1
803 movdqa (%rdi, %rcx), %xmm2
806 palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
815 #ifdef USE_AS_STRNCMP
824 jg L(nibble_ashr_6) /* cross page boundary */
826 movdqa (%rsi, %rcx), %xmm1
827 movdqa (%rdi, %rcx), %xmm2
830 palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
839 #ifdef USE_AS_STRNCMP
850 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
853 jnz L(ashr_6_exittail)
855 #ifdef USE_AS_STRNCMP
857 jbe L(ashr_6_exittail)
866 movdqa (%rsi, %rcx), %xmm1
872 * The following cases will be handled by ashr_7
873 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
874 * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
892 UPDATE_STRNCMP_COUNTER
895 mov $16, %rcx /* index for loads */
896 mov $7, %r9d /* byte position left over from less32bytes case */
898 * Set up %r10 so that we can detect crossing a page boundary.
899 * When %r10 goes positive we have crossed a page boundary and
900 * need to do a nibble.
903 and $0xfff, %r10 /* offset into 4K page */
904 sub $0x1000, %r10 /* subtract 4K pagesize */
912 movdqa (%rsi, %rcx), %xmm1
913 movdqa (%rdi, %rcx), %xmm2
916 palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
925 #ifdef USE_AS_STRNCMP
934 jg L(nibble_ashr_7) /* cross page boundary */
936 movdqa (%rsi, %rcx), %xmm1
937 movdqa (%rdi, %rcx), %xmm2
940 palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
949 #ifdef USE_AS_STRNCMP
960 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
963 jnz L(ashr_7_exittail)
965 #ifdef USE_AS_STRNCMP
967 jbe L(ashr_7_exittail)
976 movdqa (%rsi, %rcx), %xmm1
982 * The following cases will be handled by ashr_8
983 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
984 * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
1000 movdqa (%rdi), %xmm3
1002 UPDATE_STRNCMP_COUNTER
1005 mov $16, %rcx /* index for loads */
1006 mov $8, %r9d /* byte position left over from less32bytes case */
1008 * Set up %r10 so that we can detect crossing a page boundary.
1009 * When %r10 goes positive we have crossed a page boundary and
1010 * need to do a nibble.
1013 and $0xfff, %r10 /* offset into 4K page */
1014 sub $0x1000, %r10 /* subtract 4K pagesize */
1022 movdqa (%rsi, %rcx), %xmm1
1023 movdqa (%rdi, %rcx), %xmm2
1026 palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
1028 pcmpeqb %xmm1, %xmm0
1029 pcmpeqb %xmm2, %xmm1
1031 pmovmskb %xmm1, %edx
1035 #ifdef USE_AS_STRNCMP
1044 jg L(nibble_ashr_8) /* cross page boundary */
1046 movdqa (%rsi, %rcx), %xmm1
1047 movdqa (%rdi, %rcx), %xmm2
1050 palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
1052 pcmpeqb %xmm1, %xmm0
1053 pcmpeqb %xmm2, %xmm1
1055 pmovmskb %xmm1, %edx
1059 #ifdef USE_AS_STRNCMP
1070 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1071 pmovmskb %xmm0, %edx
1073 jnz L(ashr_8_exittail)
1075 #ifdef USE_AS_STRNCMP
1077 jbe L(ashr_8_exittail)
1082 jmp L(gobble_ashr_8)
1086 movdqa (%rsi, %rcx), %xmm1
1092 * The following cases will be handled by ashr_9
1093 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1094 * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
1099 movdqa (%rdi), %xmm2
1100 movdqa (%rsi), %xmm1
1101 pcmpeqb %xmm1, %xmm0
1103 pcmpeqb %xmm1, %xmm2
1105 pmovmskb %xmm2, %r9d
1110 movdqa (%rdi), %xmm3
1112 UPDATE_STRNCMP_COUNTER
1115 mov $16, %rcx /* index for loads */
1116 mov $9, %r9d /* byte position left over from less32bytes case */
1118 * Set up %r10 so that we can detect crossing a page boundary.
1119 * When %r10 goes positive we have crossed a page boundary and
1120 * need to do a nibble.
1123 and $0xfff, %r10 /* offset into 4K page */
1124 sub $0x1000, %r10 /* subtract 4K pagesize */
1132 movdqa (%rsi, %rcx), %xmm1
1133 movdqa (%rdi, %rcx), %xmm2
1136 palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
1138 pcmpeqb %xmm1, %xmm0
1139 pcmpeqb %xmm2, %xmm1
1141 pmovmskb %xmm1, %edx
1145 #ifdef USE_AS_STRNCMP
1154 jg L(nibble_ashr_9) /* cross page boundary */
1156 movdqa (%rsi, %rcx), %xmm1
1157 movdqa (%rdi, %rcx), %xmm2
1160 palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
1162 pcmpeqb %xmm1, %xmm0
1163 pcmpeqb %xmm2, %xmm1
1165 pmovmskb %xmm1, %edx
1169 #ifdef USE_AS_STRNCMP
1175 movdqa %xmm4, %xmm3 /* store for next cycle */
1180 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1181 pmovmskb %xmm0, %edx
1183 jnz L(ashr_9_exittail)
1185 #ifdef USE_AS_STRNCMP
1187 jbe L(ashr_9_exittail)
1192 jmp L(gobble_ashr_9)
1196 movdqa (%rsi, %rcx), %xmm1
1202 * The following cases will be handled by ashr_10
1203 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1204 * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
1209 movdqa (%rdi), %xmm2
1210 movdqa (%rsi), %xmm1
1211 pcmpeqb %xmm1, %xmm0
1213 pcmpeqb %xmm1, %xmm2
1215 pmovmskb %xmm2, %r9d
1220 movdqa (%rdi), %xmm3
1222 UPDATE_STRNCMP_COUNTER
1225 mov $16, %rcx /* index for loads */
1226 mov $10, %r9d /* byte position left over from less32bytes case */
1228 * Set up %r10 so that we can detect crossing a page boundary.
1229 * When %r10 goes positive we have crossed a page boundary and
1230 * need to do a nibble.
1233 and $0xfff, %r10 /* offset into 4K page */
1234 sub $0x1000, %r10 /* subtract 4K pagesize */
1239 jg L(nibble_ashr_10)
1242 movdqa (%rsi, %rcx), %xmm1
1243 movdqa (%rdi, %rcx), %xmm2
1246 palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
1248 pcmpeqb %xmm1, %xmm0
1249 pcmpeqb %xmm2, %xmm1
1251 pmovmskb %xmm1, %edx
1255 #ifdef USE_AS_STRNCMP
1264 jg L(nibble_ashr_10) /* cross page boundary */
1266 movdqa (%rsi, %rcx), %xmm1
1267 movdqa (%rdi, %rcx), %xmm2
1270 palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
1272 pcmpeqb %xmm1, %xmm0
1273 pcmpeqb %xmm2, %xmm1
1275 pmovmskb %xmm1, %edx
1279 #ifdef USE_AS_STRNCMP
1290 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1291 pmovmskb %xmm0, %edx
1293 jnz L(ashr_10_exittail)
1295 #ifdef USE_AS_STRNCMP
1297 jbe L(ashr_10_exittail)
1302 jmp L(gobble_ashr_10)
1305 L(ashr_10_exittail):
1306 movdqa (%rsi, %rcx), %xmm1
1312 * The following cases will be handled by ashr_11
1313 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1314 * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
1319 movdqa (%rdi), %xmm2
1320 movdqa (%rsi), %xmm1
1321 pcmpeqb %xmm1, %xmm0
1323 pcmpeqb %xmm1, %xmm2
1325 pmovmskb %xmm2, %r9d
1330 movdqa (%rdi), %xmm3
1332 UPDATE_STRNCMP_COUNTER
1335 mov $16, %rcx /* index for loads */
1336 mov $11, %r9d /* byte position left over from less32bytes case */
1338 * Set up %r10 so that we can detect crossing a page boundary.
1339 * When %r10 goes positive we have crossed a page boundary and
1340 * need to do a nibble.
1343 and $0xfff, %r10 /* offset into 4K page */
1344 sub $0x1000, %r10 /* subtract 4K pagesize */
1349 jg L(nibble_ashr_11)
1352 movdqa (%rsi, %rcx), %xmm1
1353 movdqa (%rdi, %rcx), %xmm2
1356 palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
1358 pcmpeqb %xmm1, %xmm0
1359 pcmpeqb %xmm2, %xmm1
1361 pmovmskb %xmm1, %edx
1365 #ifdef USE_AS_STRNCMP
1374 jg L(nibble_ashr_11) /* cross page boundary */
1376 movdqa (%rsi, %rcx), %xmm1
1377 movdqa (%rdi, %rcx), %xmm2
1380 palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
1382 pcmpeqb %xmm1, %xmm0
1383 pcmpeqb %xmm2, %xmm1
1385 pmovmskb %xmm1, %edx
1389 #ifdef USE_AS_STRNCMP
1400 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1401 pmovmskb %xmm0, %edx
1403 jnz L(ashr_11_exittail)
1405 #ifdef USE_AS_STRNCMP
1407 jbe L(ashr_11_exittail)
1412 jmp L(gobble_ashr_11)
1415 L(ashr_11_exittail):
1416 movdqa (%rsi, %rcx), %xmm1
1422 * The following cases will be handled by ashr_12
1423 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1424 * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
1429 movdqa (%rdi), %xmm2
1430 movdqa (%rsi), %xmm1
1431 pcmpeqb %xmm1, %xmm0
1433 pcmpeqb %xmm1, %xmm2
1435 pmovmskb %xmm2, %r9d
1440 movdqa (%rdi), %xmm3
1442 UPDATE_STRNCMP_COUNTER
1445 mov $16, %rcx /* index for loads */
1446 mov $12, %r9d /* byte position left over from less32bytes case */
1448 * Set up %r10 so that we can detect crossing a page boundary.
1449 * When %r10 goes positive we have crossed a page boundary and
1450 * need to do a nibble.
1453 and $0xfff, %r10 /* offset into 4K page */
1454 sub $0x1000, %r10 /* subtract 4K pagesize */
1459 jg L(nibble_ashr_12)
1462 movdqa (%rsi, %rcx), %xmm1
1463 movdqa (%rdi, %rcx), %xmm2
1466 palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
1468 pcmpeqb %xmm1, %xmm0
1469 pcmpeqb %xmm2, %xmm1
1471 pmovmskb %xmm1, %edx
1475 #ifdef USE_AS_STRNCMP
1484 jg L(nibble_ashr_12) /* cross page boundary */
1486 movdqa (%rsi, %rcx), %xmm1
1487 movdqa (%rdi, %rcx), %xmm2
1490 palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
1492 pcmpeqb %xmm1, %xmm0
1493 pcmpeqb %xmm2, %xmm1
1495 pmovmskb %xmm1, %edx
1499 #ifdef USE_AS_STRNCMP
1510 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1511 pmovmskb %xmm0, %edx
1513 jnz L(ashr_12_exittail)
1515 #ifdef USE_AS_STRNCMP
1517 jbe L(ashr_12_exittail)
1522 jmp L(gobble_ashr_12)
1525 L(ashr_12_exittail):
1526 movdqa (%rsi, %rcx), %xmm1
1532 * The following cases will be handled by ashr_13
1533 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1534 * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
1539 movdqa (%rdi), %xmm2
1540 movdqa (%rsi), %xmm1
1541 pcmpeqb %xmm1, %xmm0
1543 pcmpeqb %xmm1, %xmm2
1545 pmovmskb %xmm2, %r9d
1550 movdqa (%rdi), %xmm3
1552 UPDATE_STRNCMP_COUNTER
1555 mov $16, %rcx /* index for loads */
1556 mov $13, %r9d /* byte position left over from less32bytes case */
1558 * Set up %r10 so that we can detect crossing a page boundary.
1559 * When %r10 goes positive we have crossed a page boundary and
1560 * need to do a nibble.
1563 and $0xfff, %r10 /* offset into 4K page */
1564 sub $0x1000, %r10 /* subtract 4K pagesize */
1569 jg L(nibble_ashr_13)
1572 movdqa (%rsi, %rcx), %xmm1
1573 movdqa (%rdi, %rcx), %xmm2
1576 palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
1578 pcmpeqb %xmm1, %xmm0
1579 pcmpeqb %xmm2, %xmm1
1581 pmovmskb %xmm1, %edx
1585 #ifdef USE_AS_STRNCMP
1594 jg L(nibble_ashr_13) /* cross page boundary */
1596 movdqa (%rsi, %rcx), %xmm1
1597 movdqa (%rdi, %rcx), %xmm2
1600 palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
1602 pcmpeqb %xmm1, %xmm0
1603 pcmpeqb %xmm2, %xmm1
1605 pmovmskb %xmm1, %edx
1609 #ifdef USE_AS_STRNCMP
1620 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1621 pmovmskb %xmm0, %edx
1623 jnz L(ashr_13_exittail)
1625 #ifdef USE_AS_STRNCMP
1627 jbe L(ashr_13_exittail)
1632 jmp L(gobble_ashr_13)
1635 L(ashr_13_exittail):
1636 movdqa (%rsi, %rcx), %xmm1
1642 * The following cases will be handled by ashr_14
1643 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1644 * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
1649 movdqa (%rdi), %xmm2
1650 movdqa (%rsi), %xmm1
1651 pcmpeqb %xmm1, %xmm0
1653 pcmpeqb %xmm1, %xmm2
1655 pmovmskb %xmm2, %r9d
1660 movdqa (%rdi), %xmm3
1662 UPDATE_STRNCMP_COUNTER
1665 mov $16, %rcx /* index for loads */
1666 mov $14, %r9d /* byte position left over from less32bytes case */
1668 * Set up %r10 so that we can detect crossing a page boundary.
1669 * When %r10 goes positive we have crossed a page boundary and
1670 * need to do a nibble.
1673 and $0xfff, %r10 /* offset into 4K page */
1674 sub $0x1000, %r10 /* subtract 4K pagesize */
1679 jg L(nibble_ashr_14)
1682 movdqa (%rsi, %rcx), %xmm1
1683 movdqa (%rdi, %rcx), %xmm2
1686 palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
1688 pcmpeqb %xmm1, %xmm0
1689 pcmpeqb %xmm2, %xmm1
1691 pmovmskb %xmm1, %edx
1695 #ifdef USE_AS_STRNCMP
1704 jg L(nibble_ashr_14) /* cross page boundary */
1706 movdqa (%rsi, %rcx), %xmm1
1707 movdqa (%rdi, %rcx), %xmm2
1710 palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
1712 pcmpeqb %xmm1, %xmm0
1713 pcmpeqb %xmm2, %xmm1
1715 pmovmskb %xmm1, %edx
1719 #ifdef USE_AS_STRNCMP
1730 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1731 pmovmskb %xmm0, %edx
1733 jnz L(ashr_14_exittail)
1735 #ifdef USE_AS_STRNCMP
1737 jbe L(ashr_14_exittail)
1742 jmp L(gobble_ashr_14)
1745 L(ashr_14_exittail):
1746 movdqa (%rsi, %rcx), %xmm1
1752 * The following cases will be handled by ashr_15
1753 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1754 * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
1759 movdqa (%rdi), %xmm2
1760 movdqa (%rsi), %xmm1
1761 pcmpeqb %xmm1, %xmm0
1763 pcmpeqb %xmm1, %xmm2
1765 pmovmskb %xmm2, %r9d
1771 movdqa (%rdi), %xmm3
1773 UPDATE_STRNCMP_COUNTER
1776 mov $16, %rcx /* index for loads */
1777 mov $15, %r9d /* byte position left over from less32bytes case */
1779 * Set up %r10 so that we can detect crossing a page boundary.
1780 * When %r10 goes positive we have crossed a page boundary and
1781 * need to do a nibble.
1784 and $0xfff, %r10 /* offset into 4K page */
1786 sub $0x1000, %r10 /* subtract 4K pagesize */
1791 jg L(nibble_ashr_15)
1794 movdqa (%rsi, %rcx), %xmm1
1795 movdqa (%rdi, %rcx), %xmm2
1798 palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
1800 pcmpeqb %xmm1, %xmm0
1801 pcmpeqb %xmm2, %xmm1
1803 pmovmskb %xmm1, %edx
1807 #ifdef USE_AS_STRNCMP
1816 jg L(nibble_ashr_15) /* cross page boundary */
1818 movdqa (%rsi, %rcx), %xmm1
1819 movdqa (%rdi, %rcx), %xmm2
1822 palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
1824 pcmpeqb %xmm1, %xmm0
1825 pcmpeqb %xmm2, %xmm1
1827 pmovmskb %xmm1, %edx
1831 #ifdef USE_AS_STRNCMP
1842 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1843 pmovmskb %xmm0, %edx
1845 jnz L(ashr_15_exittail)
1847 #ifdef USE_AS_STRNCMP
1849 je L(ashr_15_exittail)
1854 jmp L(gobble_ashr_15)
1857 L(ashr_15_exittail):
1858 movdqa (%rsi, %rcx), %xmm1
1864 pcmpeqb %xmm3, %xmm1
1866 pmovmskb %xmm1, %edx
1871 lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */
1873 lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
1874 lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
1877 xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
1882 bsf %rdx, %rdx /* find and store bit index in %rdx */
1884 #ifdef USE_AS_STRNCMP
1888 movzbl (%rsi, %rdx), %ecx
1889 movzbl (%rdi, %rdx), %eax
1907 .section .rodata,"a",@progbits
1910 .int L(ashr_1) - L(unaligned_table)
1911 .int L(ashr_2) - L(unaligned_table)
1912 .int L(ashr_3) - L(unaligned_table)
1913 .int L(ashr_4) - L(unaligned_table)
1914 .int L(ashr_5) - L(unaligned_table)
1915 .int L(ashr_6) - L(unaligned_table)
1916 .int L(ashr_7) - L(unaligned_table)
1917 .int L(ashr_8) - L(unaligned_table)
1918 .int L(ashr_9) - L(unaligned_table)
1919 .int L(ashr_10) - L(unaligned_table)
1920 .int L(ashr_11) - L(unaligned_table)
1921 .int L(ashr_12) - L(unaligned_table)
1922 .int L(ashr_13) - L(unaligned_table)
1923 .int L(ashr_14) - L(unaligned_table)
1924 .int L(ashr_15) - L(unaligned_table)
1925 .int L(ashr_0) - L(unaligned_table)