reference/newlib/strcmp.S

   1 /*
   2  * Copyright (c) 2012 ARM Ltd
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  * 3. The name of the company may not be used to endorse or promote
  14  *    products derived from this software without specific prior written
  15  *    permission.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
  18  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  19  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  20  * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  21  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
  22  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  23  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  24  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  25  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27  */
  28
  29 #include "arm_asm.h"
  30
  31 #ifdef __ARMEB__ /* big-endian: the byte at the lowest address sits in bits 31:24 */
  32 #define S2LOMEM lsl /* shift that moves a word's bytes towards the lower memory address */
  33 #define S2LOMEMEQ lsleq /* S2LOMEM executed only when the EQ condition holds */
  34 #define S2HIMEM lsr /* shift that moves a word's bytes towards the higher memory address */
  35 #define MSB 0x000000ff /* despite the name: mask of the byte loaded from the highest address */
  36 #define LSB 0xff000000 /* despite the name: mask of the byte loaded from the lowest address */
  37 #define BYTE0_OFFSET 24 /* BYTEn_OFFSET: bit position of memory byte n inside a loaded word */
  38 #define BYTE1_OFFSET 16
  39 #define BYTE2_OFFSET 8
  40 #define BYTE3_OFFSET 0
  41 #else /* not  __ARMEB__ */
  42 #define S2LOMEM lsr /* shift that moves a word's bytes towards the lower memory address */
  43 #define S2LOMEMEQ lsreq /* S2LOMEM executed only when the EQ condition holds */
  44 #define S2HIMEM lsl /* shift that moves a word's bytes towards the higher memory address */
  45 #define BYTE0_OFFSET 0 /* BYTEn_OFFSET: bit position of memory byte n inside a loaded word */
  46 #define BYTE1_OFFSET 8
  47 #define BYTE2_OFFSET 16
  48 #define BYTE3_OFFSET 24
  49 #define MSB 0xff000000 /* mask of the byte loaded from the highest address */
  50 #define LSB 0x000000ff /* mask of the byte loaded from the lowest address */
  51 #endif /* not  __ARMEB__ */
  52
  53 .syntax         unified
  54 /* strcmp entry: first string pointer in r0, second in r1, result returned in r0; one of the four bodies below is selected by the preprocessor.  */
  55 #if defined (__thumb__)
  56         .thumb
  57         .thumb_func
  58 #endif
  59         .global strcmp
  60         .type   strcmp, %function
  61 strcmp:
  62
  63 #if (defined (__thumb__) && !defined (__thumb2__))
  64 .Lthumb1_next_byte:
  65         ldrb    r2, [r0]                /* r2 = current byte of the first string */
  66         adds    r0, r0, #1              /* advance the first pointer */
  67         ldrb    r3, [r1]                /* r3 = current byte of the second string */
  68         adds    r1, r1, #1              /* advance the second pointer */
  69         cmp     r2, #0                  /* terminator reached in the first string? */
  70         beq     .Lthumb1_result
  71         cmp     r2, r3
  72         beq     .Lthumb1_next_byte      /* bytes match and are non-zero: keep scanning */
  73 .Lthumb1_result:
  74         subs    r0, r2, r3              /* result = difference of the deciding bytes */
  75         bx      lr
  76 #elif (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
  77 1:
  78         ldrb    r2, [r0], #1            /* r2 = next byte of the first string, pointer post-incremented */
  79         ldrb    r3, [r1], #1            /* r3 = next byte of the second string, pointer post-incremented */
  80         cmp     r2, #1                  /* C is set only when r2 is non-zero; Z stays clear when r2 is zero */
  81         it      cs
  82         cmpcs   r2, r3                  /* non-zero byte: Z now tells whether the two bytes match */
  83         beq     1b                      /* loop while the bytes are equal and not the terminator */
  84         subs    r0, r2, r3              /* result = difference of the deciding bytes */
  85         RETURN                          /* not defined in this file; presumably supplied by arm_asm.h */
  86
  87
  88 #elif (defined (_ISA_THUMB_2) || defined (_ISA_ARM_6))
  89         /* Use LDRD whenever possible.  */
  90
  91 /* The main thing to look out for when comparing large blocks is that
  92    the loads do not cross a page boundary when loading past the index
  93    of the byte with the first difference or the first string-terminator.
  94
  95    For example, if the strings are identical and the string-terminator
  96    is at index k, byte by byte comparison will not load beyond address
  97    s1+k and s2+k; word by word comparison may load up to 3 bytes beyond
  98    k; double word - up to 7 bytes.  If the load of these bytes crosses
  99    a page boundary, it might cause a memory fault (if the page is not mapped)
 100    that would not have happened in byte by byte comparison.
 101
 102    If an address is (double) word aligned, then a load of a (double) word
 103    from that address will not cross a page boundary.
 104    Therefore, the algorithm below considers word and double-word alignment
 105    of strings separately.  */
 106
 107 /* High-level description of the algorithm.
 108
 109    * The fast path: if both strings are double-word aligned,
 110      use LDRD to load two words from each string in every loop iteration.
 111    * If the strings have the same offset from a word boundary,
 112      use LDRB to load and compare byte by byte until
 113      the first string is aligned to a word boundary (at most 3 bytes).
 114      This is optimized for quick return on short unaligned strings.
 115    * If the strings have the same offset from a double-word boundary,
 116      use LDRD to load two words from each string in every loop iteration, as in the fast path.
 117    * If the strings do not have the same offset from a double-word boundary,
 118      load a word from the second string before the loop to initialize the queue.
 119      Use LDRD to load two words from every string in every loop iteration.
 120      Inside the loop, load the second word from the second string only after comparing
 121      the first word, using the queued value, to guarantee safety across page boundaries.
 122    * If the strings do not have the same offset from a word boundary,
 123      use LDR and a shift queue. Order of loads and comparisons matters,
 124      similarly to the previous case.
 125
 126    * Use UADD8 and SEL to compare words, and use REV and CLZ to compute the return value.
 127    * The only difference between ARM and Thumb modes is the use of CBZ instruction.
 128    * The only difference between big and little endian is the use of REV in little endian
 129      to compute the return value, instead of MOV.
 130    * No preload. [TODO.]
 131 */
 132
 133         .macro m_cbz reg label /* Branch to label when reg is zero.  */
 134 #ifdef __thumb2__
 135         cbz     \reg, \label    /* Thumb-2: single compare-and-branch instruction */
 136 #else   /* not defined __thumb2__ */
 137         cmp     \reg, #0        /* fallback pair; the cmp overwrites the condition flags */
 138         beq     \label
 139 #endif /* not defined __thumb2__ */
 140         .endm /* m_cbz */
 141
 142         .macro m_cbnz reg label /* Branch to label when reg is not zero.  */
 143 #ifdef __thumb2__
 144         cbnz    \reg, \label    /* Thumb-2: single compare-and-branch instruction */
 145 #else   /* not defined __thumb2__ */
 146         cmp     \reg, #0        /* fallback pair; the cmp overwrites the condition flags */
 147         bne     \label
 148 #endif /* not defined __thumb2__ */
 149         .endm /* m_cbnz */
 150
 151         .macro  init
 152         /* Save r4-r7 in a 16-byte stack frame and load the constants that magic_find_zero_bytes reads from r6 and r7.  */
 153         subs    sp, sp, #16             /* frame is released again in do_return */
 154         strd    r4, r5, [sp, #8]        /* r4 at sp+8, r5 at sp+12 */
 155         strd    r6, r7, [sp]            /* r6 at sp, r7 at sp+4 */
 156         mvn     r6, #0  /* all F */
 157         mov     r7, #0  /* all 0 */
 158         .endm   /* init */
 159
 160         .macro  magic_compare_and_branch w1 w2 label
 161         /* Branch to label when w1 differs from w2 or w1 contains a zero byte.  Leaves the zero-byte mask of w1 in ip and changes the flags.  */
 162         cmp     \w1, \w2        /* Are w1 and w2 the same?  */
 163         magic_find_zero_bytes \w1 /* ip = zero-byte mask; the EQ result of the cmp above is still consumed below */
 164         it      eq
 165         cmpeq   ip, #0          /* Is there a zero byte in w1?  */
 166         bne     \label          /* taken on a mismatch or on a terminator */
 167         .endm /* magic_compare_and_branch */
 168
 169         .macro  magic_find_zero_bytes w1
 170         /* Macro to find all-zero bytes in w1, result is in ip.  Expects r6 = 0xffffffff and r7 = 0 as set up by init.  */
 171 #if (defined (__ARM_FEATURE_DSP))
 172         uadd8   ip, \w1, r6     /* adding 0xff to each byte sets that byte's GE flag only when the byte is non-zero */
 173         sel     ip, r7, r6      /* ip byte = 0x00 where GE is set, 0xff where the byte of w1 is zero */
 174 #else /* not defined (__ARM_FEATURE_DSP) */
 175         /* __ARM_FEATURE_DSP is not defined for some Cortex-M processors.
 176         Coincidentally, these processors only have Thumb-2 mode, where we can use
 177         the (large) magic constant available directly as an immediate in instructions.
 178         Note that we cannot use the magic constant in ARM mode, where we need
 179         to create the constant in a register.  */
 180         sub     ip, \w1, #0x01010101
 181         bic     ip, ip, \w1
 182         and     ip, ip, #0x80808080 /* ip byte = 0x80 where flagged.  NOTE(review): a 0x01 byte just above a zero byte is flagged too (borrow); on big-endian that byte is earlier in memory - confirm this cannot truncate a comparison */
 183 #endif /* not defined (__ARM_FEATURE_DSP) */
 184         .endm /* magic_find_zero_bytes */
 185
 186         .macro  setup_return w1 w2 /* Copy w1 into r1 and w2 into r2 so that earlier string bytes are more significant; w2 must not be r1 because r1 is written first.  */
 187 #ifdef __ARMEB__
 188         mov     r1, \w1         /* big-endian words already have the lowest-addressed byte on top */
 189         mov     r2, \w2
 190 #else /* not  __ARMEB__ */
 191         rev     r1, \w1         /* little-endian: byte-reverse so the lowest-addressed byte becomes the most significant */
 192         rev     r2, \w2
 193 #endif /* not  __ARMEB__ */
 194         .endm /* setup_return */
 195
 196         /*
 197         optpld r0, #0
 198         optpld r1, #0
 199         */
 200
 201         /* Are both strings double-word aligned?  */
 202         orr     ip, r0, r1              /* a low bit set in either pointer shows up in ip */
 203         tst     ip, #7
 204         bne     do_align
 205
 206         /* Fast path.  */
 207         init                            /* r4-r7 saved; the return_* exits restore them in do_return */
 208
 209 doubleword_aligned:
 210
 211         /* Get here when the strings to compare are double-word aligned.  */
 212         /* Compare two words in every iteration.  */
 213         .p2align        2
 214 2:
 215         /*
 216         optpld r0, #16
 217         optpld r1, #16
 218         */
 219
 220         /* Load the next double-word from each string.  */
 221         ldrd    r2, r3, [r0], #8        /* r2, r3 = next two words of the first string */
 222         ldrd    r4, r5, [r1], #8        /* r4, r5 = next two words of the second string */
 223
 224         magic_compare_and_branch w1=r2, w2=r4, label=return_24
 225         magic_compare_and_branch w1=r3, w2=r5, label=return_35
 226         b       2b                      /* both word pairs equal with no terminator: next double-word */
 227
 228 do_align:
 229         /* Is the first string word-aligned?  */
 230         ands    ip, r0, #3
 231         beq     word_aligned_r0
 232
 233         /* Fast compare byte by byte until the first string is word-aligned.  */
 234         /* The offset of r0 from a word boundary is in ip. Thus, the number of bytes
 235         to read until the next word boundary is 4-ip.  */
 236         bic     r0, r0, #3              /* back up to the enclosing word boundary */
 237         ldr     r2, [r0], #4            /* aligned load covering the leading bytes; r0 now points at the next word */
 238         lsls    ip, ip, #31             /* ip=1: Z=0, C=0;  ip=2: Z=1;  ip=3: Z=0, C=1 */
 239         beq     byte2                   /* offset 2: two bytes left in this word */
 240         bcs     byte3                   /* offset 3: one byte left in this word */
 241
 242 byte1:
 243         ldrb    ip, [r1], #1            /* next byte of the second string */
 244         uxtb    r3, r2, ror #BYTE1_OFFSET /* r3 = memory byte 1 of the first string's word */
 245         subs    ip, r3, ip              /* ip = byte difference, the value fast_return hands back */
 246         bne     fast_return
 247         m_cbz   reg=r3, label=fast_return /* equal and zero: both strings end here, ip is 0 */
 248
 249 byte2:
 250         ldrb    ip, [r1], #1
 251         uxtb    r3, r2, ror #BYTE2_OFFSET /* r3 = memory byte 2 of the first string's word */
 252         subs    ip, r3, ip
 253         bne     fast_return
 254         m_cbz   reg=r3, label=fast_return
 255
 256 byte3:
 257         ldrb    ip, [r1], #1
 258         uxtb    r3, r2, ror #BYTE3_OFFSET /* r3 = memory byte 3 of the first string's word */
 259         subs    ip, r3, ip
 260         bne     fast_return
 261         m_cbnz  reg=r3, label=word_aligned_r0 /* non-zero match: r0 is word-aligned now, continue word-wise */
 262
 263 fast_return:
 264         mov     r0, ip                  /* init has not run on this path, so nothing needs restoring */
 265         bx      lr
 266
 267 word_aligned_r0:
 268         init                            /* r4-r7 saved; exits below go through the return_* labels */
 269         /* The first string is word-aligned.  */
 270         /* Is the second string word-aligned?  */
 271         ands    ip, r1, #3              /* ip = byte offset of r1, consumed by strcmp_unaligned */
 272         bne     strcmp_unaligned
 273
 274 word_aligned:
 275         /* The strings are word-aligned. */
 276         /* Is the first string double-word aligned?  */
 277         tst     r0, #4
 278         beq     doubleword_aligned_r0
 279
 280         /* If r0 is not double-word aligned yet, align it by loading
 281         and comparing the next word from each string.  */
 282         ldr     r2, [r0], #4
 283         ldr     r4, [r1], #4
 284         magic_compare_and_branch w1=r2 w2=r4 label=return_24
 285
 286 doubleword_aligned_r0:
 287         /* Get here when r0 is double-word aligned.  */
 288         /* Is r1 doubleword_aligned?  */
 289         tst     r1, #4
 290         beq     doubleword_aligned
 291
 292         /* Get here when the strings to compare are word-aligned,
 293         r0 is double-word aligned, but r1 is not double-word aligned.  */
 294
 295         /* Initialize the queue.  */
 296         ldr     r5, [r1], #4            /* r5 = queued word of the second string; r1 becomes double-word aligned */
 297
 298         /* Compare two words in every iteration.  */
 299         .p2align        2
 300 3:
 301         /*
 302         optpld r0, #16
 303         optpld r1, #16
 304         */
 305
 306         /* Load the next double-word from each string and compare.  */
 307         ldrd    r2, r3, [r0], #8
 308         magic_compare_and_branch w1=r2 w2=r5 label=return_25 /* first word against the queued word, before touching more of the second string */
 309         ldrd    r4, r5, [r1], #8        /* r4 pairs with r3 now; r5 is queued for the next iteration */
 310         magic_compare_and_branch w1=r3 w2=r4 label=return_34
 311         b       3b
 312
 313         .macro miscmp_word offsetlo offsethi
 314         /* Macro to compare misaligned strings.  */
 315         /* r0, r1 are word-aligned, and at least one of the strings
 316         is not double-word aligned.  */
 317         /* Compare one word in every loop iteration.  */
 318         /* OFFSETLO is the original bit-offset of r1 from a word-boundary,
 319         OFFSETHI is 32 - OFFSETLO (i.e., offset from the next word).  */
 320
 321         /* Initialize the shift queue.  */
 322         ldr     r5, [r1], #4            /* r5 = word holding the first bytes of the second string */
 323
 324         /* Compare one word from each string in every loop iteration.  */
 325         .p2align        2
 326 7:
 327         ldr     r3, [r0], #4            /* r3 = next word of the first string */
 328         S2LOMEM r5, r5, #\offsetlo      /* drop the queued bytes that precede the data still to be compared */
 329         magic_find_zero_bytes w1=r3     /* ip = zero-byte mask of the whole first-string word */
 330         cmp     r7, ip, S2HIMEM #\offsetlo /* r7 is 0: EQ when no terminator lies in the part matched against r5 */
 331         and     r2, r3, r6, S2LOMEM #\offsetlo /* r2 = that leading part of r3 (r6 is all ones) */
 332         it      eq
 333         cmpeq   r2, r5
 334         bne     return_25               /* terminator or mismatch in the leading part */
 335         ldr     r5, [r1], #4            /* only now read the next word of the second string */
 336         cmp     ip, #0                  /* any terminator left in r3 is in its trailing part */
 337         eor     r3, r2, r3              /* r3 = trailing part of the first-string word, in place */
 338         S2HIMEM r2, r5, #\offsethi      /* r2 = leading bytes of the new word, moved to the trailing position */
 339         it      eq
 340         cmpeq   r3, r2
 341         bne     return_32               /* terminator or mismatch in the trailing part */
 342         b       7b
 343         .endm /* miscmp_word */
 344
 345 strcmp_unaligned:
 346         /* r0 is word-aligned, r1 is at offset ip from a word.  */
 347         /* Align r1 to the (previous) word-boundary.  */
 348         bic     r1, r1, #3
 349
 350         /* Unaligned comparison word by word using LDRs. */
 351         cmp     ip, #2                  /* ip is 1, 2 or 3 here: it was produced by ands ip, r1, #3 followed by bne */
 352         beq     miscmp_word_16                    /* If ip == 2.  */
 353         bge     miscmp_word_24                    /* If ip == 3.  */
 354         miscmp_word offsetlo=8 offsethi=24        /* If ip == 1.  */
 355 miscmp_word_16:  miscmp_word offsetlo=16 offsethi=16
 356 miscmp_word_24:  miscmp_word offsetlo=24 offsethi=8
 357
 358
 359 return_32:
 360         setup_return w1=r3, w2=r2       /* return_XY: first-string word is in rX, second-string word in rY */
 361         b       do_return
 362 return_34:
 363         setup_return w1=r3, w2=r4
 364         b       do_return
 365 return_25:
 366         setup_return w1=r2, w2=r5       /* r2 is read into r1 before r2 itself is overwritten */
 367         b       do_return
 368 return_35:
 369         setup_return w1=r3, w2=r5
 370         b       do_return
 371 return_24:
 372         setup_return w1=r2, w2=r4       /* falls through into do_return */
 373
 374 do_return:
 375         /* In: r1 and r2 hold the two words to compare, earlier string bytes more
 376            significant; ip is the zero-byte mask of the first string's word.  */
 377 #ifdef __ARMEB__
 378         mov     r0, ip
 379 #else /* not  __ARMEB__ */
 380         rev     r0, ip
 381 #endif /* not  __ARMEB__ */
 382         /* Restore temporaries early, before computing the return value.  */
 383         ldrd    r6, r7, [sp]
 384         ldrd    r4, r5, [sp, #8]
 385         adds    sp, sp, #16
 386         /* An empty mask means the words differ with no terminator: compare them whole.  */
 387         /* Using r0 and not ip here because cbz requires low register.  */
 388         m_cbz   reg=r0, label=compute_return_value
 389         clz     r0, r0          /* number of bits on the left of the first all-zero byte */
 390         rsb     r0, r0, #24     /* number of bits on the right of the first all-zero byte */
 391         lsr     r1, r1, r0      /* drop everything that follows the terminator */
 392         lsr     r2, r2, r0
 393
 394 compute_return_value:
 395         /* r1 and r2 are unsigned and may differ in bit 31, so r1 - r2 can overflow and
 396            report the wrong sign.  Derive 1, 0 or -1 from an unsigned compare instead.  */
 397         movs    r0, #1          /* provisional result for r1 > r2 */
 398         cmp     r1, r2
 399         it      ls
 400         sbcls   r0, r0, r0      /* r1 == r2 (C=1): r0 = 0;  r1 < r2 (C=0): r0 = -1 */
 401         bx      lr
 402
 403 #else   /* !(defined (_ISA_THUMB_2) || defined (_ISA_ARM_6) ||
 404              defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) ||
 405              (defined (__thumb__) && !defined (__thumb2__))) */
 406
 407         /* Use LDR whenever possible. */
 408
 409 #ifdef __thumb2__
 410 #define magic1(REG) 0x01010101 /* Thumb-2: the constant is written inline and REG is ignored */
 411 #define magic2(REG) 0x80808080
 412 #else
 413 #define magic1(REG) REG /* otherwise REG must already hold 0x01010101 */
 414 #define magic2(REG) REG, lsl #7 /* 0x01010101 shifted left by 7 is 0x80808080 */
 415 #endif
 416
 417         optpld  r0                      /* optpld is not defined in this file; presumably a preload helper from arm_asm.h */
 418         optpld  r1
 419         eor     r2, r0, r1
 420         tst     r2, #3
 421         /* Strings not at same byte offset from a word boundary.  */
 422         bne     strcmp_unaligned
 423         ands    r2, r0, #3              /* r2 = shared byte offset; Z set when already word-aligned */
 424         bic     r0, r0, #3
 425         bic     r1, r1, #3
 426         ldr     ip, [r0], #4            /* ip = first word of the first string */
 427         it      eq
 428         ldreq   r3, [r1], #4            /* aligned case: r3 = first word of the second string */
 429         beq     1f
 430         /* Although s1 and s2 have identical initial alignment, they are
 431         not currently word aligned.  Rather than comparing bytes,
 432         make sure that any bytes fetched from before the addressed
 433         bytes are forced to 0xff.  Then they will always compare
 434         equal.  */
 435         eor     r2, r2, #3              /* r2 = 3 - offset */
 436         lsl     r2, r2, #3              /* ... converted to a bit count */
 437         mvn     r3, MSB                 /* ones in the three lowest-addressed byte lanes */
 438         S2LOMEM        r2, r3, r2       /* r2 = ones covering exactly the bytes before the strings start */
 439         ldr     r3, [r1], #4
 440         orr     ip, ip, r2
 441         orr     r3, r3, r2
 442 1:
 443 #ifndef __thumb2__
 444               /* Load the 'magic' constant 0x01010101.  */
 445         str     r4, [sp, #-4]!          /* r4 is pushed here and popped just before RETURN */
 446         mov     r4, #1
 447         orr     r4, r4, r4, lsl #8
 448         orr     r4, r4, r4, lsl #16
 449 #endif
 450         .p2align        2
 451 4:
 452         optpld  r0, #8
 453         optpld  r1, #8
 454         sub     r2, ip, magic1(r4)
 455         cmp     ip, r3
 456         itttt   eq
 457         /* check for any zero bytes in first word */
 458         biceq   r2, r2, ip
 459         tsteq   r2, magic2(r4)
 460         ldreq   ip, [r0], #4            /* the two loads run only when the words matched and held no zero byte */
 461         ldreq   r3, [r1], #4
 462         beq     4b
 463 2:
 464         /* There's a zero or a different byte in the word */
 465         S2HIMEM  r0, ip, #24            /* r0 = lowest-addressed remaining byte of the first word, isolated */
 466         S2LOMEM  ip, ip, #8             /* advance the first word to its next byte */
 467         cmp     r0, #1                  /* C is set only when that byte is non-zero */
 468         it      cs
 469         cmpcs   r0, r3, S2HIMEM #24
 470         it      eq
 471         S2LOMEMEQ r3, r3, #8
 472         beq     2b                      /* equal and non-zero: look at the next byte */
 473         /* On a big-endian machine, r0 contains the desired byte in bits
 474         0-7; on a little-endian machine they are in bits 24-31.  In
 475         both cases the other bits in r0 are all zero.  For r3 the
 476         interesting byte is at the other end of the word, but the
 477         other bits are not necessarily zero.  We need a signed result
 478         representing the difference in the unsigned bytes, so for the
 479         little-endian case we can't just shift the interesting bits
 480         up.  */
 481 #ifdef __ARMEB__
 482         sub     r0, r0, r3, lsr #24
 483 #else
 484         and     r3, r3, #255
 485 #ifdef __thumb2__
 486         /* No RSB instruction in Thumb2 */
 487         lsr     r0, r0, #24
 488         sub     r0, r0, r3
 489 #else
 490         rsb     r0, r3, r0, lsr #24     /* r0 = (r0 >> 24) - r3 */
 491 #endif
 492 #endif
 493 #ifndef __thumb2__
 494         ldr     r4, [sp], #4            /* pop the r4 pushed at label 1 */
 495 #endif
 496         RETURN
 497
 498 /* Reached from the LDR variant above when r0 and r1 have different byte offsets within a word.  */
 499 strcmp_unaligned:
 500
 501 #if 0
 502         /* The assembly code below is based on the following algorithm.  */
 503 #ifdef __ARMEB__
 504 #define RSHIFT <<
 505 #define LSHIFT >>
 506 #else
 507 #define RSHIFT >>
 508 #define LSHIFT <<
 509 #endif
 510
 511 #define body(shift)                                                     \
 512   mask = 0xffffffffU RSHIFT shift;                                      \
 513   w1 = *wp1++;                                                          \
 514   w2 = *wp2++;                                                          \
 515   do                                                                    \
 516     {                                                                   \
 517       t1 = w1 & mask;                                                   \
 518       if (__builtin_expect(t1 != w2 RSHIFT shift, 0))                   \
 519         {                                                               \
 520           w2 RSHIFT= shift;                                             \
 521           break;                                                        \
 522         }                                                               \
 523       if (__builtin_expect(((w1 - b1) & ~w1) & (b1 << 7), 0))           \
 524         {                                                               \
 525           /* See comment in assembler below re syndrome on big-endian */\
 526           if ((((w1 - b1) & ~w1) & (b1 << 7)) & mask)                   \
 527             w2 RSHIFT= shift;                                           \
 528           else                                                          \
 529             {                                                           \
 530               w2 = *wp2;                                                \
 531               t1 = w1 RSHIFT (32 - shift);                              \
 532               w2 = (w2 LSHIFT (32 - shift)) RSHIFT (32 - shift);        \
 533             }                                                           \
 534           break;                                                        \
 535         }                                                               \
 536       w2 = *wp2++;                                                      \
 537       t1 ^= w1;                                                         \
 538       if (__builtin_expect(t1 != w2 LSHIFT (32 - shift), 0))            \
 539         {                                                               \
 540           t1 = w1 >> (32 - shift);                                      \
 541           w2 = (w2 << (32 - shift)) RSHIFT (32 - shift);                \
 542           break;                                                        \
 543         }                                                               \
 544       w1 = *wp1++;                                                      \
 545     } while (1)
 546
 547   const unsigned* wp1;
 548   const unsigned* wp2;
 549   unsigned w1, w2;
 550   unsigned mask;
 551   unsigned shift;
 552   unsigned b1 = 0x01010101;
 553   char c1, c2;
 554   unsigned t1;
 555
 556   while (((unsigned) s1) & 3)
 557     {
 558       c1 = *s1++;
 559       c2 = *s2++;
 560       if (c1 == 0 || c1 != c2)
 561         return c1 - (int)c2;
 562     }
 563   wp1 = (unsigned*) (((unsigned)s1) & ~3);
 564   wp2 = (unsigned*) (((unsigned)s2) & ~3);
 565   t1 = ((unsigned) s2) & 3;
 566   if (t1 == 1)
 567     {
 568       body(8);
 569     }
 570   else if (t1 == 2)
 571     {
 572       body(16);
 573     }
 574   else
 575     {
 576       body (24);
 577     }
 578
 579   do
 580     {
 581 #ifdef __ARMEB__
 582       c1 = (char) t1 >> 24;
 583       c2 = (char) w2 >> 24;
 584 #else /* not  __ARMEB__ */
 585       c1 = (char) t1;
 586       c2 = (char) w2;
 587 #endif /* not  __ARMEB__ */
 588       t1 RSHIFT= 8;
 589       w2 RSHIFT= 8;
 590     } while (c1 != 0 && c1 == c2);
 591   return c1 - c2;
 592 #endif /* 0 */
 593
 594
 595         wp1 .req r0     /* word pointer into the first string */
 596         wp2 .req r1     /* word pointer into the second string */
 597         b1  .req r2     /* later holds 0x01010101; used as a plain byte register in the loop below */
 598         w1  .req r4     /* current word of the first string */
 599         w2  .req r5     /* current word of the second string */
 600         t1  .req ip     /* temporary */
 601         @ r3 is scratch
 602
 603         /* First of all, compare bytes until wp1 (s1) is word-aligned. */
 604 1:
 605         tst     wp1, #3
 606         beq     2f
 607         ldrb    r2, [wp1], #1
 608         ldrb    r3, [wp2], #1
 609         cmp     r2, #1                  /* C is set only when the first string's byte is non-zero */
 610         it      cs
 611         cmpcs   r2, r3
 612         beq     1b                      /* equal and non-zero: next byte */
 613         sub     r0, r2, r3              /* r4 and r5 are not yet pushed, so return directly */
 614         RETURN
 615
 616 2:
 617         str     r5, [sp, #-4]!          /* push r5, then r4: r4 ends up at the lower address */
 618         str     r4, [sp, #-4]!
 619         //stmfd   sp!, {r4, r5}
 620         mov     b1, #1                  /* build b1 = 0x01010101 */
 621         orr     b1, b1, b1, lsl #8
 622         orr     b1, b1, b1, lsl #16
 623
 624         and     t1, wp2, #3             /* t1 = byte offset of the second string within its word */
 625         bic     wp2, wp2, #3            /* round wp2 down to that word */
 626         ldr     w1, [wp1], #4
 627         ldr     w2, [wp2], #4
 628         cmp     t1, #2
 629         beq     2f                      /* offset 2: two bytes of w2 overlap w1 */
 630         bhi     3f                      /* offset 3: one byte of w2 overlaps w1 */
 631
 632         /* Critical inner Loop: Block with 3 bytes initial overlap */
 633         .p2align        2
 634 1:
 635         bic     t1, w1, MSB             /* t1 = the three lowest-addressed bytes of w1 */
 636         cmp     t1, w2, S2LOMEM #8      /* against the three usable bytes of w2 */
 637         sub     r3, w1, b1
 638         bic     r3, r3, w1
 639         bne     4f                      /* mismatch inside the overlapping bytes */
 640         ands    r3, r3, b1, lsl #7      /* r3 = zero-byte syndrome of w1 */
 641         it      eq
 642         ldreq   w2, [wp2], #4           /* fetch the next second-string word only when w1 holds no terminator */
 643         bne     5f
 644         eor     t1, t1, w1              /* t1 = remaining highest-addressed byte of w1 */
 645         cmp     t1, w2, S2HIMEM #24     /* against the first byte of the new w2 */
 646         bne     6f
 647         ldr     w1, [wp1], #4
 648         b       1b
 649 4:
 650         S2LOMEM        w2, w2, #8       /* line w2 up with t1 for the byte-wise tail at 8 */
 651         b       8f
 652
 653 5:
 654 #ifdef __ARMEB__
 655         /* The syndrome value may contain false ones if the string ends
 656         with the bytes 0x01 0x00 */
 657         tst     w1, #0xff000000
 658         itt     ne
 659         tstne   w1, #0x00ff0000
 660         tstne   w1, #0x0000ff00
 661         beq     7f
 662 #else
 663         bics    r3, r3, #0xff000000     /* terminator among the three bytes that already matched? */
 664         bne     7f                      /* yes: the strings are equal */
 665 #endif
 666         ldrb    w2, [wp2]               /* terminator is the last byte of w1: fetch just one more second-string byte */
 667         S2LOMEM  t1, w1, #24
 668 #ifdef __ARMEB__
 669         lsl     w2, w2, #24
 670 #endif
 671         b       8f
 672
 673 6:
 674         S2LOMEM  t1, w1, #24            /* t1 = last byte of w1 in the lane that label 8 inspects */
 675         and     w2, w2, LSB
 676         b       8f
 677
 678         /* Critical inner Loop: Block with 2 bytes initial overlap */
 679         .p2align        2
 680 2:
 681         S2HIMEM  t1, w1, #16
 682         sub     r3, w1, b1
 683         S2LOMEM  t1, t1, #16            /* t1 = the two lowest-addressed bytes of w1, upper half cleared */
 684         bic     r3, r3, w1
 685         cmp     t1, w2, S2LOMEM #16     /* against the two usable bytes of w2 */
 686         bne     4f
 687         ands    r3, r3, b1, lsl #7      /* r3 = zero-byte syndrome of w1 */
 688         it      eq
 689         ldreq   w2, [wp2], #4           /* fetch the next second-string word only when w1 holds no terminator */
 690         bne     5f
 691         eor     t1, t1, w1              /* t1 = the two highest-addressed bytes of w1 */
 692         cmp     t1, w2, S2HIMEM #16     /* against the first two bytes of the new w2 */
 693         bne     6f
 694         ldr     w1, [wp1], #4
 695         b       2b
 696
 697 5:
 698 #ifdef __ARMEB__
 699         /* The syndrome value may contain false ones if the string ends
 700         with the bytes 0x01 0x00 */
 701         tst     w1, #0xff000000
 702         it      ne
 703         tstne   w1, #0x00ff0000
 704         beq     7f
 705 #else
 706         lsls    r3, r3, #16             /* terminator among the two bytes that already matched? */
 707         bne     7f                      /* yes: the strings are equal */
 708 #endif
 709         ldrh    w2, [wp2]               /* terminator is in the second half of w1: fetch only two more second-string bytes */
 710         S2LOMEM  t1, w1, #16
 711 #ifdef __ARMEB__
 712         lsl     w2, w2, #16
 713 #endif
 714         b       8f
 715
 716 6:
 717         S2HIMEM  w2, w2, #16            /* with the shift at label 4 below, this keeps only the first two bytes of w2 */
 718         S2LOMEM  t1, w1, #16
 719 4:
 720         S2LOMEM  w2, w2, #16            /* line w2 up with t1 for the byte-wise tail at 8 */
 721         b       8f
 722
 723         /* Critical inner Loop: Block with 1 byte initial overlap */
 724         .p2align        2
 725 3:
 726         and     t1, w1, LSB             /* t1 = lowest-addressed byte of w1 */
 727         cmp     t1, w2, S2LOMEM #24     /* against the single usable byte of w2 */
 728         sub     r3, w1, b1
 729         bic     r3, r3, w1
 730         bne     4f
 731         ands    r3, r3, b1, lsl #7      /* r3 = zero-byte syndrome of w1 */
 732         it      eq
 733         ldreq   w2, [wp2], #4           /* fetch the next second-string word only when w1 holds no terminator */
 734         bne     5f
 735         eor     t1, t1, w1              /* t1 = the three highest-addressed bytes of w1 */
 736         cmp     t1, w2, S2HIMEM #8      /* against the first three bytes of the new w2 */
 737         bne     6f
 738         ldr     w1, [wp1], #4
 739         b       3b
 740 4:
 741         S2LOMEM  w2, w2, #24            /* line w2 up with t1 for the byte-wise tail at 8 */
 742         b       8f
 743 5:
 744         /* The syndrome value may contain false ones if the string ends
 745         with the bytes 0x01 0x00 */
 746         tst     w1, LSB                 /* is the byte that already matched the terminator? */
 747         beq     7f                      /* yes: the strings are equal */
 748         ldr     w2, [wp2], #4
 749 6:
 750         S2LOMEM  t1, w1, #8             /* t1 = remaining three bytes of w1, lined up for label 8 */
 751         bic     w2, w2, MSB
 752         b       8f
 753 7:
 754         mov     r0, #0                  /* strings are equal */
 755         //ldmfd   sp!, {r4, r5}
 756         ldr     r4, [sp], #4            /* pop r4, then r5, mirroring the pushes after the byte prologue */
 757         ldr     r5, [sp], #4
 758         RETURN
 759 8:
 760         and     r2, t1, LSB             /* r2 = current byte of the first string (b1 is no longer needed) */
 761         and     r0, w2, LSB             /* r0 = current byte of the second string */
 762         cmp     r0, #1                  /* C is set only when the second string's byte is non-zero */
 763         it      cs
 764         cmpcs   r0, r2
 765         itt     eq
 766         S2LOMEMEQ        t1, t1, #8     /* equal and non-zero: advance both words by one byte */
 767         S2LOMEMEQ        w2, w2, #8
 768         beq     8b
 769         sub     r0, r2, r0              /* NOTE(review): with __ARMEB__ LSB is 0xff000000, so both bytes sit in bits 31:24 and this difference looks able to wrap to the wrong sign when they differ by 0x80 or more - confirm */
 770         //ldmfd   sp!, {r4, r5}
 771         ldr     r4, [sp], #4
 772         ldr     r5, [sp], #4
 773         RETURN
 774
 775 #endif /* !(defined (_ISA_THUMB_2) || defined (_ISA_ARM_6) ||
 776             defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) ||
 777             (defined (__thumb__) && !defined (__thumb2__))) */