| // Copyright 2020 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // +build !appengine |
| // +build gc |
| // +build !noasm |
| |
| #include "textflag.h" |
| |
| // The asm code generally follows the pure Go code in encode_other.go, except |
| // where marked with a "!!!". |
| |
| // ---------------------------------------------------------------------------- |
| |
| // func emitLiteral(dst, lit []byte) int |
| // |
| // All local variables fit into registers. The register allocation: |
| // - R3 len(lit) |
| // - R4 n |
| // - R6 return value |
| // - R8 &dst[i] |
| // - R10 &lit[0] |
| // |
| // The 32 bytes of stack space is to call runtime·memmove. |
| // |
| // The unusual register allocation of local variables, such as R10 for the |
| // source pointer, matches the allocation used at the call site in encodeBlock, |
| // which makes it easier to manually inline this function. |
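//
// For reference, the branches below roughly mirror the shape of the pure Go
// emitLiteral in encode_other.go (a paraphrase, not a verbatim copy). The tag
// byte encodes len(lit)-1 either inline (n < 60) or spilled into 1 or 2
// trailing little-endian bytes:
//
//	i, n := 0, uint32(len(lit)-1)
//	switch {
//	case n < 60:
//		dst[0] = uint8(n)<<2 | tagLiteral
//		i = 1
//	case n < 1<<8:
//		dst[0] = 60<<2 | tagLiteral // 0xf0
//		dst[1] = uint8(n)
//		i = 2
//	default:
//		dst[0] = 61<<2 | tagLiteral // 0xf4
//		dst[1] = uint8(n)
//		dst[2] = uint8(n >> 8)
//		i = 3
//	}
//	return i + copy(dst[i:], lit)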
| TEXT ·emitLiteral(SB), NOSPLIT, $32-56 |
| MOVD dst_base+0(FP), R8 |
| MOVD lit_base+24(FP), R10 |
| MOVD lit_len+32(FP), R3 |
| MOVD R3, R6 |
| MOVW R3, R4 |
| SUBW $1, R4, R4 |
| |
| CMPW $60, R4 |
| BLT oneByte |
| CMPW $256, R4 |
| BLT twoBytes |
| |
| threeBytes: |
| MOVD $0xf4, R2 |
| MOVB R2, 0(R8) |
| MOVW R4, 1(R8) |
| ADD $3, R8, R8 |
| ADD $3, R6, R6 |
| B memmove |
| |
| twoBytes: |
| MOVD $0xf0, R2 |
| MOVB R2, 0(R8) |
| MOVB R4, 1(R8) |
| ADD $2, R8, R8 |
| ADD $2, R6, R6 |
| B memmove |
| |
| oneByte: |
| LSLW $2, R4, R4 |
| MOVB R4, 0(R8) |
| ADD $1, R8, R8 |
| ADD $1, R6, R6 |
| |
| memmove: |
| MOVD R6, ret+48(FP) |
| |
| // copy(dst[i:], lit) |
| // |
| // This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push |
| // R8, R10 and R3 as arguments. |
| MOVD R8, 8(RSP) |
| MOVD R10, 16(RSP) |
| MOVD R3, 24(RSP) |
| CALL runtime·memmove(SB) |
| RET |
| |
| // ---------------------------------------------------------------------------- |
| |
| // func emitCopy(dst []byte, offset, length int) int |
| // |
| // All local variables fit into registers. The register allocation: |
| // - R3 length |
| // - R7 &dst[0] |
| // - R8 &dst[i] |
| // - R11 offset |
| // |
| // The unusual register allocation of local variables, such as R11 for the |
| // offset, matches the allocation used at the call site in encodeBlock, which |
| // makes it easier to manually inline this function. |
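//
// For reference, the steps below roughly mirror the shape of the pure Go
// emitCopy in encode_other.go (a paraphrase, not a verbatim copy). A tagCopy2
// element is 3 bytes (a 6-bit length-1 plus a 16-bit little-endian offset) and
// a tagCopy1 element is 2 bytes (a 3-bit length-4 plus an 11-bit offset):
//
//	i := 0
//	for length >= 68 {
//		dst[i+0] = 63<<2 | tagCopy2 // 0xfe: a length 64 copy.
//		dst[i+1] = uint8(offset)
//		dst[i+2] = uint8(offset >> 8)
//		i, length = i+3, length-64
//	}
//	if length > 64 {
//		dst[i+0] = 59<<2 | tagCopy2 // 0xee: a length 60 copy.
//		dst[i+1] = uint8(offset)
//		dst[i+2] = uint8(offset >> 8)
//		i, length = i+3, length-60
//	}
//	if length >= 12 || offset >= 2048 {
//		dst[i+0] = uint8(length-1)<<2 | tagCopy2
//		dst[i+1] = uint8(offset)
//		dst[i+2] = uint8(offset >> 8)
//		return i + 3
//	}
//	dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
//	dst[i+1] = uint8(offset)
//	return i + 2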
| TEXT ·emitCopy(SB), NOSPLIT, $0-48 |
| MOVD dst_base+0(FP), R8 |
| MOVD R8, R7 |
| MOVD offset+24(FP), R11 |
| MOVD length+32(FP), R3 |
| |
| loop0: |
| // for length >= 68 { etc } |
| CMPW $68, R3 |
| BLT step1 |
| |
| // Emit a length 64 copy, encoded as 3 bytes. |
| MOVD $0xfe, R2 |
| MOVB R2, 0(R8) |
| MOVW R11, 1(R8) |
| ADD $3, R8, R8 |
| SUB $64, R3, R3 |
| B loop0 |
| |
| step1: |
| // if length > 64 { etc } |
| CMP $64, R3 |
| BLE step2 |
| |
| // Emit a length 60 copy, encoded as 3 bytes. |
| MOVD $0xee, R2 |
| MOVB R2, 0(R8) |
| MOVW R11, 1(R8) |
| ADD $3, R8, R8 |
| SUB $60, R3, R3 |
| |
| step2: |
| // if length >= 12 || offset >= 2048 { goto step3 } |
| CMP $12, R3 |
| BGE step3 |
| CMPW $2048, R11 |
| BGE step3 |
| |
| // Emit the remaining copy, encoded as 2 bytes. |
| MOVB R11, 1(R8) |
| LSRW $3, R11, R11 |
| AND $0xe0, R11, R11 |
| SUB $4, R3, R3 |
LSLW $2, R3, R3
| AND $0xff, R3, R3 |
| ORRW R3, R11, R11 |
| ORRW $1, R11, R11 |
| MOVB R11, 0(R8) |
| ADD $2, R8, R8 |
| |
| // Return the number of bytes written. |
| SUB R7, R8, R8 |
| MOVD R8, ret+40(FP) |
| RET |
| |
| step3: |
| // Emit the remaining copy, encoded as 3 bytes. |
| SUB $1, R3, R3 |
| AND $0xff, R3, R3 |
| LSLW $2, R3, R3 |
| ORRW $2, R3, R3 |
| MOVB R3, 0(R8) |
| MOVW R11, 1(R8) |
| ADD $3, R8, R8 |
| |
| // Return the number of bytes written. |
| SUB R7, R8, R8 |
| MOVD R8, ret+40(FP) |
| RET |
| |
| // ---------------------------------------------------------------------------- |
| |
| // func extendMatch(src []byte, i, j int) int |
| // |
| // All local variables fit into registers. The register allocation: |
| // - R6 &src[0] |
| // - R7 &src[j] |
| // - R13 &src[len(src) - 8] |
| // - R14 &src[len(src)] |
| // - R15 &src[i] |
| // |
| // The unusual register allocation of local variables, such as R15 for a source |
| // pointer, matches the allocation used at the call site in encodeBlock, which |
| // makes it easier to manually inline this function. |
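//
// For reference, the pure Go extendMatch in encode_other.go is roughly:
//
//	func extendMatch(src []byte, i, j int) int {
//		for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
//		}
//		return j
//	}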
| TEXT ·extendMatch(SB), NOSPLIT, $0-48 |
| MOVD src_base+0(FP), R6 |
| MOVD src_len+8(FP), R14 |
| MOVD i+24(FP), R15 |
| MOVD j+32(FP), R7 |
| ADD R6, R14, R14 |
| ADD R6, R15, R15 |
| ADD R6, R7, R7 |
| MOVD R14, R13 |
| SUB $8, R13, R13 |
| |
| cmp8: |
| // As long as we are 8 or more bytes before the end of src, we can load and |
| // compare 8 bytes at a time. If those 8 bytes are equal, repeat. |
| CMP R13, R7 |
| BHI cmp1 |
| MOVD (R15), R3 |
| MOVD (R7), R4 |
| CMP R4, R3 |
| BNE bsf |
| ADD $8, R15, R15 |
| ADD $8, R7, R7 |
| B cmp8 |
| |
| bsf: |
// If those 8 bytes were not equal, XOR the two 8-byte values, and return
// the index of the first byte that differs.
// RBIT reverses the bit order and CLZ then counts the leading zeros; the
// combination finds the least significant set bit.
// The arm64 architecture is little-endian, and the shift by 3 converts
// a bit index to a byte index.
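// For example, if the two 8-byte words differ only in their third byte
// (byte index 2), the XOR's lowest set bit lies in bits 16..23, so
// RBIT+CLZ yields a value in [16, 23] and the shift by 3 yields 2.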
| EOR R3, R4, R4 |
| RBIT R4, R4 |
| CLZ R4, R4 |
| ADD R4>>3, R7, R7 |
| |
| // Convert from &src[ret] to ret. |
| SUB R6, R7, R7 |
| MOVD R7, ret+40(FP) |
| RET |
| |
| cmp1: |
| // In src's tail, compare 1 byte at a time. |
| CMP R7, R14 |
| BLS extendMatchEnd |
| MOVB (R15), R3 |
| MOVB (R7), R4 |
| CMP R4, R3 |
| BNE extendMatchEnd |
| ADD $1, R15, R15 |
| ADD $1, R7, R7 |
| B cmp1 |
| |
| extendMatchEnd: |
| // Convert from &src[ret] to ret. |
| SUB R6, R7, R7 |
| MOVD R7, ret+40(FP) |
| RET |
| |
| // ---------------------------------------------------------------------------- |
| |
| // func encodeBlock(dst, src []byte) (d int) |
| // |
| // All local variables fit into registers, other than "var table". The register |
| // allocation: |
| // - R3 . . |
| // - R4 . . |
| // - R5 64 shift |
| // - R6 72 &src[0], tableSize |
| // - R7 80 &src[s] |
| // - R8 88 &dst[d] |
| // - R9 96 sLimit |
| // - R10 . &src[nextEmit] |
| // - R11 104 prevHash, currHash, nextHash, offset |
| // - R12 112 &src[base], skip |
| // - R13 . &src[nextS], &src[len(src) - 8] |
| // - R14 . len(src), bytesBetweenHashLookups, &src[len(src)], x |
| // - R15 120 candidate |
| // - R16 . hash constant, 0x1e35a7bd |
| // - R17 . &table |
| // - . 128 table |
| // |
| // The second column (64, 72, etc) is the stack offset to spill the registers |
| // when calling other functions. We could pack this slightly tighter, but it's |
| // simpler to have a dedicated spill map independent of the function called. |
| // |
| // "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An |
| // extra 64 bytes, to call other functions, and an extra 64 bytes, to spill |
| // local variables (registers) during calls gives 32768 + 64 + 64 = 32896. |
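//
// For reference, the hash function used by the pure Go encodeBlock is roughly
// (the 0x1e35a7bd multiplier below is kept in R16):
//
//	func hash(u, shift uint32) uint32 {
//		return (u * 0x1e35a7bd) >> shift
//	}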
| TEXT ·encodeBlock(SB), 0, $32896-56 |
| MOVD dst_base+0(FP), R8 |
| MOVD src_base+24(FP), R7 |
| MOVD src_len+32(FP), R14 |
| |
| // shift, tableSize := uint32(32-8), 1<<8 |
| MOVD $24, R5 |
| MOVD $256, R6 |
| MOVW $0xa7bd, R16 |
| MOVKW $(0x1e35<<16), R16 |
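// R16 = 0x1e35a7bd, the hash multiplier: MOVW loads the low 16 bits and
// MOVKW inserts the high 16 bits, keeping the low bits.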
| |
| calcShift: |
| // for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 { |
| // shift-- |
| // } |
| MOVD $16384, R2 |
| CMP R2, R6 |
| BGE varTable |
| CMP R14, R6 |
| BGE varTable |
| SUB $1, R5, R5 |
| LSL $1, R6, R6 |
| B calcShift |
| |
| varTable: |
| // var table [maxTableSize]uint16 |
| // |
| // In the asm code, unlike the Go code, we can zero-initialize only the |
// first tableSize elements. Each uint16 element is 2 bytes and each
// iteration of the memclr loop below writes 64 bytes, so we need only
// tableSize/32 iterations instead of the 2048 16-byte writes that would
// zero-initialize all of table's 32768 bytes. This clearing may write past
// the first tableSize elements, but it won't overrun the allocated stack
// size.
| ADD $128, RSP, R17 |
| MOVD R17, R4 |
| |
// !!! R6 = &table[tableSize]
| ADD R6<<1, R17, R6 |
| |
| memclr: |
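// STP.P stores a pair of zero registers at R4 (bytes 0-15 of a 64-byte
// chunk) and post-increments R4 by 64; the following three STPs fill in
// bytes 16-63 of the same chunk via negative offsets from the updated R4.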
| STP.P (ZR, ZR), 64(R4) |
| STP (ZR, ZR), -48(R4) |
| STP (ZR, ZR), -32(R4) |
| STP (ZR, ZR), -16(R4) |
| CMP R4, R6 |
| BHI memclr |
| |
| // !!! R6 = &src[0] |
| MOVD R7, R6 |
| |
| // sLimit := len(src) - inputMargin |
| MOVD R14, R9 |
| SUB $15, R9, R9 |
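// (inputMargin is 16 - 1 = 15; see the comment on inputMargin in encode.go.)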
| |
| // !!! Pre-emptively spill R5, R6 and R9 to the stack. Their values don't |
| // change for the rest of the function. |
| MOVD R5, 64(RSP) |
| MOVD R6, 72(RSP) |
| MOVD R9, 96(RSP) |
| |
| // nextEmit := 0 |
| MOVD R6, R10 |
| |
| // s := 1 |
| ADD $1, R7, R7 |
| |
| // nextHash := hash(load32(src, s), shift) |
| MOVW 0(R7), R11 |
| MULW R16, R11, R11 |
| LSRW R5, R11, R11 |
| |
| outer: |
| // for { etc } |
| |
| // skip := 32 |
| MOVD $32, R12 |
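// (As in the pure Go code, skip implements heuristic match skipping: once
// 32 bytes have been scanned without finding a match, the step between
// hash lookups, skip>>5, starts growing, so incompressible input is
// stepped over increasingly quickly.)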
| |
| // nextS := s |
| MOVD R7, R13 |
| |
| // candidate := 0 |
| MOVD $0, R15 |
| |
| inner0: |
| // for { etc } |
| |
| // s := nextS |
| MOVD R13, R7 |
| |
| // bytesBetweenHashLookups := skip >> 5 |
| MOVD R12, R14 |
| LSR $5, R14, R14 |
| |
| // nextS = s + bytesBetweenHashLookups |
| ADD R14, R13, R13 |
| |
| // skip += bytesBetweenHashLookups |
| ADD R14, R12, R12 |
| |
| // if nextS > sLimit { goto emitRemainder } |
| MOVD R13, R3 |
| SUB R6, R3, R3 |
| CMP R9, R3 |
| BHI emitRemainder |
| |
| // candidate = int(table[nextHash]) |
| MOVHU 0(R17)(R11<<1), R15 |
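// (Each table element is a uint16, so the index R11 is scaled by 2 via the
// R11<<1 in the addressing mode; R17 holds &table[0].)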
| |
| // table[nextHash] = uint16(s) |
| MOVD R7, R3 |
| SUB R6, R3, R3 |
| |
| MOVH R3, 0(R17)(R11<<1) |
| |
| // nextHash = hash(load32(src, nextS), shift) |
| MOVW 0(R13), R11 |
MULW R16, R11, R11
| LSRW R5, R11, R11 |
| |
| // if load32(src, s) != load32(src, candidate) { continue } break |
| MOVW 0(R7), R3 |
MOVW (R6)(R15), R4
| CMPW R4, R3 |
| BNE inner0 |
| |
| fourByteMatch: |
| // As per the encode_other.go code: |
| // |
| // A 4-byte match has been found. We'll later see etc. |
| |
| // !!! Jump to a fast path for short (<= 16 byte) literals. See the comment |
| // on inputMargin in encode.go. |
| MOVD R7, R3 |
| SUB R10, R3, R3 |
| CMP $16, R3 |
| BLE emitLiteralFastPath |
| |
| // ---------------------------------------- |
| // Begin inline of the emitLiteral call. |
| // |
| // d += emitLiteral(dst[d:], src[nextEmit:s]) |
| |
| MOVW R3, R4 |
| SUBW $1, R4, R4 |
| |
| MOVW $60, R2 |
| CMPW R2, R4 |
| BLT inlineEmitLiteralOneByte |
| MOVW $256, R2 |
| CMPW R2, R4 |
| BLT inlineEmitLiteralTwoBytes |
| |
| inlineEmitLiteralThreeBytes: |
| MOVD $0xf4, R1 |
| MOVB R1, 0(R8) |
| MOVW R4, 1(R8) |
| ADD $3, R8, R8 |
| B inlineEmitLiteralMemmove |
| |
| inlineEmitLiteralTwoBytes: |
| MOVD $0xf0, R1 |
| MOVB R1, 0(R8) |
| MOVB R4, 1(R8) |
| ADD $2, R8, R8 |
| B inlineEmitLiteralMemmove |
| |
| inlineEmitLiteralOneByte: |
| LSLW $2, R4, R4 |
| MOVB R4, 0(R8) |
| ADD $1, R8, R8 |
| |
| inlineEmitLiteralMemmove: |
| // Spill local variables (registers) onto the stack; call; unspill. |
| // |
| // copy(dst[i:], lit) |
| // |
| // This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push |
| // R8, R10 and R3 as arguments. |
| MOVD R8, 8(RSP) |
| MOVD R10, 16(RSP) |
| MOVD R3, 24(RSP) |
| |
| // Finish the "d +=" part of "d += emitLiteral(etc)". |
| ADD R3, R8, R8 |
| MOVD R7, 80(RSP) |
| MOVD R8, 88(RSP) |
| MOVD R15, 120(RSP) |
| CALL runtime·memmove(SB) |
| MOVD 64(RSP), R5 |
| MOVD 72(RSP), R6 |
| MOVD 80(RSP), R7 |
| MOVD 88(RSP), R8 |
| MOVD 96(RSP), R9 |
| MOVD 120(RSP), R15 |
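// !!! R16 and R17 are rebuilt rather than spilled and reloaded: R17 from
// RSP and R16 from its immediate constant.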
| ADD $128, RSP, R17 |
| MOVW $0xa7bd, R16 |
| MOVKW $(0x1e35<<16), R16 |
| B inner1 |
| |
| inlineEmitLiteralEnd: |
| // End inline of the emitLiteral call. |
| // ---------------------------------------- |
| |
| emitLiteralFastPath: |
| // !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2". |
| MOVB R3, R4 |
| SUBW $1, R4, R4 |
| AND $0xff, R4, R4 |
| LSLW $2, R4, R4 |
| MOVB R4, (R8) |
| ADD $1, R8, R8 |
| |
| // !!! Implement the copy from lit to dst as a 16-byte load and store. |
| // (Encode's documentation says that dst and src must not overlap.) |
| // |
| // This always copies 16 bytes, instead of only len(lit) bytes, but that's |
| // OK. Subsequent iterations will fix up the overrun. |
| // |
| // Note that on arm64, it is legal and cheap to issue unaligned 8-byte or |
| // 16-byte loads and stores. This technique probably wouldn't be as |
| // effective on architectures that are fussier about alignment. |
| LDP 0(R10), (R0, R1) |
| STP (R0, R1), 0(R8) |
| ADD R3, R8, R8 |
| |
| inner1: |
| // for { etc } |
| |
| // base := s |
| MOVD R7, R12 |
| |
| // !!! offset := base - candidate |
| MOVD R12, R11 |
| SUB R15, R11, R11 |
| SUB R6, R11, R11 |
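// (R12 holds the pointer &src[base] while R15 holds the integer index
// candidate, so subtracting both R15 and &src[0] from R12 yields
// base - candidate.)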
| |
| // ---------------------------------------- |
| // Begin inline of the extendMatch call. |
| // |
| // s = extendMatch(src, candidate+4, s+4) |
| |
| // !!! R14 = &src[len(src)] |
| MOVD src_len+32(FP), R14 |
| ADD R6, R14, R14 |
| |
| // !!! R13 = &src[len(src) - 8] |
| MOVD R14, R13 |
| SUB $8, R13, R13 |
| |
| // !!! R15 = &src[candidate + 4] |
| ADD $4, R15, R15 |
| ADD R6, R15, R15 |
| |
| // !!! s += 4 |
| ADD $4, R7, R7 |
| |
| inlineExtendMatchCmp8: |
| // As long as we are 8 or more bytes before the end of src, we can load and |
| // compare 8 bytes at a time. If those 8 bytes are equal, repeat. |
| CMP R13, R7 |
| BHI inlineExtendMatchCmp1 |
| MOVD (R15), R3 |
| MOVD (R7), R4 |
| CMP R4, R3 |
| BNE inlineExtendMatchBSF |
| ADD $8, R15, R15 |
| ADD $8, R7, R7 |
| B inlineExtendMatchCmp8 |
| |
| inlineExtendMatchBSF: |
// If those 8 bytes were not equal, XOR the two 8-byte values, and return
// the index of the first byte that differs.
// RBIT reverses the bit order and CLZ then counts the leading zeros; the
// combination finds the least significant set bit.
// The arm64 architecture is little-endian, and the shift by 3 converts
// a bit index to a byte index.
| EOR R3, R4, R4 |
| RBIT R4, R4 |
| CLZ R4, R4 |
| ADD R4>>3, R7, R7 |
| B inlineExtendMatchEnd |
| |
| inlineExtendMatchCmp1: |
| // In src's tail, compare 1 byte at a time. |
| CMP R7, R14 |
| BLS inlineExtendMatchEnd |
| MOVB (R15), R3 |
| MOVB (R7), R4 |
| CMP R4, R3 |
| BNE inlineExtendMatchEnd |
| ADD $1, R15, R15 |
| ADD $1, R7, R7 |
| B inlineExtendMatchCmp1 |
| |
| inlineExtendMatchEnd: |
| // End inline of the extendMatch call. |
| // ---------------------------------------- |
| |
| // ---------------------------------------- |
| // Begin inline of the emitCopy call. |
| // |
| // d += emitCopy(dst[d:], base-candidate, s-base) |
| |
| // !!! length := s - base |
| MOVD R7, R3 |
| SUB R12, R3, R3 |
| |
| inlineEmitCopyLoop0: |
| // for length >= 68 { etc } |
| MOVW $68, R2 |
| CMPW R2, R3 |
| BLT inlineEmitCopyStep1 |
| |
| // Emit a length 64 copy, encoded as 3 bytes. |
| MOVD $0xfe, R1 |
| MOVB R1, 0(R8) |
| MOVW R11, 1(R8) |
| ADD $3, R8, R8 |
| SUBW $64, R3, R3 |
| B inlineEmitCopyLoop0 |
| |
| inlineEmitCopyStep1: |
| // if length > 64 { etc } |
| MOVW $64, R2 |
| CMPW R2, R3 |
| BLE inlineEmitCopyStep2 |
| |
| // Emit a length 60 copy, encoded as 3 bytes. |
| MOVD $0xee, R1 |
| MOVB R1, 0(R8) |
| MOVW R11, 1(R8) |
| ADD $3, R8, R8 |
| SUBW $60, R3, R3 |
| |
| inlineEmitCopyStep2: |
| // if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 } |
| MOVW $12, R2 |
| CMPW R2, R3 |
| BGE inlineEmitCopyStep3 |
| MOVW $2048, R2 |
| CMPW R2, R11 |
| BGE inlineEmitCopyStep3 |
| |
| // Emit the remaining copy, encoded as 2 bytes. |
| MOVB R11, 1(R8) |
| LSRW $8, R11, R11 |
| LSLW $5, R11, R11 |
| SUBW $4, R3, R3 |
| AND $0xff, R3, R3 |
| LSLW $2, R3, R3 |
| ORRW R3, R11, R11 |
| ORRW $1, R11, R11 |
| MOVB R11, 0(R8) |
| ADD $2, R8, R8 |
| B inlineEmitCopyEnd |
| |
| inlineEmitCopyStep3: |
| // Emit the remaining copy, encoded as 3 bytes. |
| SUBW $1, R3, R3 |
| LSLW $2, R3, R3 |
| ORRW $2, R3, R3 |
| MOVB R3, 0(R8) |
| MOVW R11, 1(R8) |
| ADD $3, R8, R8 |
| |
| inlineEmitCopyEnd: |
| // End inline of the emitCopy call. |
| // ---------------------------------------- |
| |
| // nextEmit = s |
| MOVD R7, R10 |
| |
| // if s >= sLimit { goto emitRemainder } |
| MOVD R7, R3 |
| SUB R6, R3, R3 |
| CMP R3, R9 |
| BLS emitRemainder |
| |
| // As per the encode_other.go code: |
| // |
| // We could immediately etc. |
| |
| // x := load64(src, s-1) |
| MOVD -1(R7), R14 |
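// (With x loaded from &src[s-1], uint32(x), uint32(x>>8) and uint32(x>>16)
// are load32(src, s-1), load32(src, s) and load32(src, s+1) respectively,
// feeding prevHash, currHash and nextHash below.)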
| |
| // prevHash := hash(uint32(x>>0), shift) |
| MOVW R14, R11 |
| MULW R16, R11, R11 |
| LSRW R5, R11, R11 |
| |
| // table[prevHash] = uint16(s-1) |
| MOVD R7, R3 |
| SUB R6, R3, R3 |
| SUB $1, R3, R3 |
| |
| MOVHU R3, 0(R17)(R11<<1) |
| |
| // currHash := hash(uint32(x>>8), shift) |
| LSR $8, R14, R14 |
| MOVW R14, R11 |
| MULW R16, R11, R11 |
| LSRW R5, R11, R11 |
| |
| // candidate = int(table[currHash]) |
| MOVHU 0(R17)(R11<<1), R15 |
| |
| // table[currHash] = uint16(s) |
| ADD $1, R3, R3 |
| MOVHU R3, 0(R17)(R11<<1) |
| |
| // if uint32(x>>8) == load32(src, candidate) { continue } |
MOVW (R6)(R15), R4
| CMPW R4, R14 |
| BEQ inner1 |
| |
| // nextHash = hash(uint32(x>>16), shift) |
| LSR $8, R14, R14 |
| MOVW R14, R11 |
| MULW R16, R11, R11 |
| LSRW R5, R11, R11 |
| |
| // s++ |
| ADD $1, R7, R7 |
| |
| // break out of the inner1 for loop, i.e. continue the outer loop. |
| B outer |
| |
| emitRemainder: |
| // if nextEmit < len(src) { etc } |
| MOVD src_len+32(FP), R3 |
| ADD R6, R3, R3 |
| CMP R3, R10 |
| BEQ encodeBlockEnd |
| |
| // d += emitLiteral(dst[d:], src[nextEmit:]) |
| // |
| // Push args. |
| MOVD R8, 8(RSP) |
| MOVD $0, 16(RSP) // Unnecessary, as the callee ignores it, but conservative. |
| MOVD $0, 24(RSP) // Unnecessary, as the callee ignores it, but conservative. |
| MOVD R10, 32(RSP) |
| SUB R10, R3, R3 |
| MOVD R3, 40(RSP) |
| MOVD R3, 48(RSP) // Unnecessary, as the callee ignores it, but conservative. |
| |
| // Spill local variables (registers) onto the stack; call; unspill. |
| MOVD R8, 88(RSP) |
| CALL ·emitLiteral(SB) |
| MOVD 88(RSP), R8 |
| |
| // Finish the "d +=" part of "d += emitLiteral(etc)". |
| MOVD 56(RSP), R1 |
| ADD R1, R8, R8 |
| |
| encodeBlockEnd: |
| MOVD dst_base+0(FP), R3 |
| SUB R3, R8, R8 |
| MOVD R8, d+48(FP) |
| RET |