| // Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT. |
| |
| //go:build amd64 && !appengine && !noasm && gc |
| // +build amd64,!appengine,!noasm,gc |
| |
| // func decompress4x_main_loop_amd64(ctx *decompress4xContext) |
| // |
| // Main loop of the four-stream table decoder. Every iteration decodes two |
| // symbols from each of the four bit readers and stores them with one 16-bit |
| // write per stream; stream id writes at cursor + id*dstEvery. The shared |
| // output cursor then advances by 2. |
| // |
| // Fields of ctx as accessed here (field names are presumed from the |
| // pseudo-code comments in the body; confirm against the Go struct): |
| //    0(ctx)  pointer to the four bit readers, laid out 48 bytes apart |
| //    8(ctx)  peekBits, one byte: right-shift count that turns value into a table index |
| //   16(ctx)  initial output pointer |
| //   24(ctx)  dstEvery: distance in bytes between the outputs of adjacent streams |
| //   32(ctx)  table base; entries are 16 bits wide |
| //   40(ctx)  result, written on return |
| //   48(ctx)  output limit |
| // |
| // Bit reader fields, relative to each reader's base: |
| //    0  base pointer of the input bytes |
| //   24  off |
| //   32  value |
| //   40  bitsRead (one byte) |
| // |
| // Table entry: the low byte is the number of bits to consume, the high byte |
| // is the decoded symbol. |
| // |
| // The loop stops when the cursor had already reached the limit at the top of |
| // an iteration (signed compare) or when a refill left any reader with off < 4. |
| // The flag is only tested at the bottom, so the iteration that sets it still |
| // decodes and stores its symbols. |
| // NOTE(review): presumably the caller keeps enough slack past the limit for |
| // that last iteration - confirm. |
| // |
| // On return 40(ctx) = (cursor - 16(ctx)) * 4. The function uses no stack frame. |
| TEXT ·decompress4x_main_loop_amd64(SB), $0-8 |
| // DL accumulates the "exhausted" flag that is tested at the bottom of the loop. |
| XORQ DX, DX |
| |
| // Preload values |
| // DI = peekBits, SI = output cursor, BX = output limit, R9 = dstEvery, |
| // R10 = table base, R11 = base of the four bit readers. |
| MOVQ ctx+0(FP), AX |
| MOVBQZX 8(AX), DI |
| MOVQ 16(AX), SI |
| MOVQ 48(AX), BX |
| MOVQ 24(AX), R9 |
| MOVQ 32(AX), R10 |
| MOVQ (AX), R11 |
| |
| // Main loop |
| main_loop: |
| // R8 is the store address of the current stream; it starts at the shared cursor. |
| // DL is (re)initialised to "cursor >= limit". |
| MOVQ SI, R8 |
| CMPQ R8, BX |
| SETGE DL |
| |
| // br0.fillFast32() |
| // R12 = br0.value, R13 = br0.bitsRead. The refill is skipped while bitsRead <= 32. |
| MOVQ 32(R11), R12 |
| MOVBQZX 40(R11), R13 |
| CMPQ R13, $0x20 |
| JBE skip_fill0 |
| // AX = br0.off - 4, R14 = base pointer of the input bytes. |
| MOVQ 24(R11), AX |
| SUBQ $0x20, R13 |
| SUBQ $0x04, AX |
| MOVQ (R11), R14 |
| |
| // b.value |= uint64(low) << (b.bitsRead & 63) |
| // low is the 32-bit word at in[off]; MOVL zero-extends it into R14. |
| MOVL (AX)(R14*1), R14 |
| MOVQ R13, CX |
| SHLQ CL, R14 |
| MOVQ AX, 24(R11) |
| ORQ R14, R12 |
| |
| // exhausted = exhausted || (br0.off < 4) |
| // off has already been stored, so AL can be reused for the comparison result. |
| CMPQ AX, $0x04 |
| SETLT AL |
| ORB AL, DL |
| |
| skip_fill0: |
| // val0 := br0.peekTopBits(peekBits) |
| MOVQ R12, R14 |
| MOVQ DI, CX |
| SHRQ CL, R14 |
| |
| // v0 := table[val0&mask] |
| // After the load CL = bits to consume and CH = decoded symbol. |
| MOVW (R10)(R14*2), CX |
| |
| // br0.advance(uint8(v0.entry)) |
| // The symbol is saved in AL because CX is overwritten by the next peek. |
| MOVB CH, AL |
| SHLQ CL, R12 |
| ADDB CL, R13 |
| |
| // val1 := br0.peekTopBits(peekBits) |
| MOVQ DI, CX |
| MOVQ R12, R14 |
| SHRQ CL, R14 |
| |
| // v1 := table[val1&mask] |
| MOVW (R10)(R14*2), CX |
| |
| // br0.advance(uint8(v1.entry)) |
| // The second symbol goes to AH so that AX holds both output bytes in store order. |
| MOVB CH, AH |
| SHLQ CL, R12 |
| ADDB CL, R13 |
| |
| // these two writes get coalesced |
| // out[id * dstEvery + 0] = uint8(v0.entry >> 8) |
| // out[id * dstEvery + 1] = uint8(v1.entry >> 8) |
| MOVW AX, (R8) |
| |
| // update the bitreader structure |
| // value and bitsRead are written back; R8 then moves on to the next stream's output. |
| MOVQ R12, 32(R11) |
| MOVB R13, 40(R11) |
| ADDQ R9, R8 |
| |
| // br1.fillFast32() |
| // Same sequence as for br0. Reader 1 fields: in = 48, off = 72, value = 80, bitsRead = 88. |
| MOVQ 80(R11), R12 |
| MOVBQZX 88(R11), R13 |
| CMPQ R13, $0x20 |
| JBE skip_fill1 |
| MOVQ 72(R11), AX |
| SUBQ $0x20, R13 |
| SUBQ $0x04, AX |
| MOVQ 48(R11), R14 |
| |
| // b.value |= uint64(low) << (b.bitsRead & 63) |
| MOVL (AX)(R14*1), R14 |
| MOVQ R13, CX |
| SHLQ CL, R14 |
| MOVQ AX, 72(R11) |
| ORQ R14, R12 |
| |
| // exhausted = exhausted || (br1.off < 4) |
| CMPQ AX, $0x04 |
| SETLT AL |
| ORB AL, DL |
| |
| skip_fill1: |
| // val0 := br1.peekTopBits(peekBits) |
| MOVQ R12, R14 |
| MOVQ DI, CX |
| SHRQ CL, R14 |
| |
| // v0 := table[val0&mask] |
| MOVW (R10)(R14*2), CX |
| |
| // br1.advance(uint8(v0.entry)) |
| MOVB CH, AL |
| SHLQ CL, R12 |
| ADDB CL, R13 |
| |
| // val1 := br1.peekTopBits(peekBits) |
| MOVQ DI, CX |
| MOVQ R12, R14 |
| SHRQ CL, R14 |
| |
| // v1 := table[val1&mask] |
| MOVW (R10)(R14*2), CX |
| |
| // br1.advance(uint8(v1.entry)) |
| MOVB CH, AH |
| SHLQ CL, R12 |
| ADDB CL, R13 |
| |
| // these two writes get coalesced |
| // out[id * dstEvery + 0] = uint8(v0.entry >> 8) |
| // out[id * dstEvery + 1] = uint8(v1.entry >> 8) |
| MOVW AX, (R8) |
| |
| // update the bitreader structure |
| MOVQ R12, 80(R11) |
| MOVB R13, 88(R11) |
| ADDQ R9, R8 |
| |
| // br2.fillFast32() |
| // Same sequence as for br0. Reader 2 fields: in = 96, off = 120, value = 128, bitsRead = 136. |
| MOVQ 128(R11), R12 |
| MOVBQZX 136(R11), R13 |
| CMPQ R13, $0x20 |
| JBE skip_fill2 |
| MOVQ 120(R11), AX |
| SUBQ $0x20, R13 |
| SUBQ $0x04, AX |
| MOVQ 96(R11), R14 |
| |
| // b.value |= uint64(low) << (b.bitsRead & 63) |
| MOVL (AX)(R14*1), R14 |
| MOVQ R13, CX |
| SHLQ CL, R14 |
| MOVQ AX, 120(R11) |
| ORQ R14, R12 |
| |
| // exhausted = exhausted || (br2.off < 4) |
| CMPQ AX, $0x04 |
| SETLT AL |
| ORB AL, DL |
| |
| skip_fill2: |
| // val0 := br2.peekTopBits(peekBits) |
| MOVQ R12, R14 |
| MOVQ DI, CX |
| SHRQ CL, R14 |
| |
| // v0 := table[val0&mask] |
| MOVW (R10)(R14*2), CX |
| |
| // br2.advance(uint8(v0.entry)) |
| MOVB CH, AL |
| SHLQ CL, R12 |
| ADDB CL, R13 |
| |
| // val1 := br2.peekTopBits(peekBits) |
| MOVQ DI, CX |
| MOVQ R12, R14 |
| SHRQ CL, R14 |
| |
| // v1 := table[val1&mask] |
| MOVW (R10)(R14*2), CX |
| |
| // br2.advance(uint8(v1.entry)) |
| MOVB CH, AH |
| SHLQ CL, R12 |
| ADDB CL, R13 |
| |
| // these two writes get coalesced |
| // out[id * dstEvery + 0] = uint8(v0.entry >> 8) |
| // out[id * dstEvery + 1] = uint8(v1.entry >> 8) |
| MOVW AX, (R8) |
| |
| // update the bitreader structure |
| MOVQ R12, 128(R11) |
| MOVB R13, 136(R11) |
| ADDQ R9, R8 |
| |
| // br3.fillFast32() |
| // Same sequence as for br0. Reader 3 fields: in = 144, off = 168, value = 176, bitsRead = 184. |
| MOVQ 176(R11), R12 |
| MOVBQZX 184(R11), R13 |
| CMPQ R13, $0x20 |
| JBE skip_fill3 |
| MOVQ 168(R11), AX |
| SUBQ $0x20, R13 |
| SUBQ $0x04, AX |
| MOVQ 144(R11), R14 |
| |
| // b.value |= uint64(low) << (b.bitsRead & 63) |
| MOVL (AX)(R14*1), R14 |
| MOVQ R13, CX |
| SHLQ CL, R14 |
| MOVQ AX, 168(R11) |
| ORQ R14, R12 |
| |
| // exhausted = exhausted || (br3.off < 4) |
| CMPQ AX, $0x04 |
| SETLT AL |
| ORB AL, DL |
| |
| skip_fill3: |
| // val0 := br3.peekTopBits(peekBits) |
| MOVQ R12, R14 |
| MOVQ DI, CX |
| SHRQ CL, R14 |
| |
| // v0 := table[val0&mask] |
| MOVW (R10)(R14*2), CX |
| |
| // br3.advance(uint8(v0.entry)) |
| MOVB CH, AL |
| SHLQ CL, R12 |
| ADDB CL, R13 |
| |
| // val1 := br3.peekTopBits(peekBits) |
| MOVQ DI, CX |
| MOVQ R12, R14 |
| SHRQ CL, R14 |
| |
| // v1 := table[val1&mask] |
| MOVW (R10)(R14*2), CX |
| |
| // br3.advance(uint8(v1.entry)) |
| MOVB CH, AH |
| SHLQ CL, R12 |
| ADDB CL, R13 |
| |
| // these two writes get coalesced |
| // out[id * dstEvery + 0] = uint8(v0.entry >> 8) |
| // out[id * dstEvery + 1] = uint8(v1.entry >> 8) |
| MOVW AX, (R8) |
| |
| // update the bitreader structure |
| // After the last stream the shared cursor advances by the two bytes written |
| // per stream, and the loop repeats unless the exhausted flag was set. |
| MOVQ R12, 176(R11) |
| MOVB R13, 184(R11) |
| ADDQ $0x02, SI |
| TESTB DL, DL |
| JZ main_loop |
| // Store the result: (cursor - initial output pointer) * 4 goes to 40(ctx). |
| // AX is reloaded because it was used as scratch inside the loop. |
| MOVQ ctx+0(FP), AX |
| SUBQ 16(AX), SI |
| SHLQ $0x02, SI |
| MOVQ SI, 40(AX) |
| RET |
| |
| // func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext) |
| // |
| // Main loop of the four-stream table decoder, four symbols per stream per |
| // iteration. The four symbols of a stream are packed into EAX and stored |
| // with one 32-bit write at cursor + id*dstEvery; the shared output cursor |
| // then advances by 4. |
| // |
| // Fields of ctx as accessed here (field names are presumed from the |
| // pseudo-code comments in the body; confirm against the Go struct): |
| //    0(ctx)  pointer to the four bit readers, laid out 48 bytes apart |
| //    8(ctx)  peekBits, one byte: right-shift count that turns value into a table index |
| //   16(ctx)  initial output pointer |
| //   24(ctx)  dstEvery: distance in bytes between the outputs of adjacent streams |
| //   32(ctx)  table base; entries are 16 bits wide |
| //   40(ctx)  result, written on return |
| //   48(ctx)  output limit |
| // |
| // Bit reader fields, relative to each reader's base: |
| //    0  base pointer of the input bytes |
| //   24  off |
| //   32  value |
| //   40  bitsRead (one byte) |
| // |
| // Table entry: the low byte is the number of bits to consume, the high byte |
| // is the decoded symbol. |
| // |
| // Each reader is refilled at most once per iteration, before its four symbols. |
| // NOTE(review): this presumably relies on every table entry consuming at most |
| // 8 bits, so that four symbols fit in the 32 bits guaranteed after the refill |
| // check - confirm against the caller. |
| // |
| // The loop stops when the cursor had already reached the limit at the top of |
| // an iteration (signed compare) or when a refill left any reader with off < 4. |
| // The flag is only tested at the bottom, so the iteration that sets it still |
| // decodes and stores its symbols. |
| // |
| // On return 40(ctx) = (cursor - 16(ctx)) * 4. The function uses no stack frame. |
| TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8 |
| // DL accumulates the "exhausted" flag that is tested at the bottom of the loop. |
| XORQ DX, DX |
| |
| // Preload values |
| // DI = peekBits, BX = output cursor, SI = output limit, R9 = dstEvery, |
| // R10 = table base, R11 = base of the four bit readers. |
| MOVQ ctx+0(FP), CX |
| MOVBQZX 8(CX), DI |
| MOVQ 16(CX), BX |
| MOVQ 48(CX), SI |
| MOVQ 24(CX), R9 |
| MOVQ 32(CX), R10 |
| MOVQ (CX), R11 |
| |
| // Main loop |
| main_loop: |
| // R8 is the store address of the current stream; it starts at the shared cursor. |
| // DL is (re)initialised to "cursor >= limit". |
| MOVQ BX, R8 |
| CMPQ R8, SI |
| SETGE DL |
| |
| // br0.fillFast32() |
| // R12 = br0.value, R13 = br0.bitsRead. The refill is skipped while bitsRead <= 32. |
| MOVQ 32(R11), R12 |
| MOVBQZX 40(R11), R13 |
| CMPQ R13, $0x20 |
| JBE skip_fill0 |
| // R14 = br0.off - 4, R15 = base pointer of the input bytes. |
| MOVQ 24(R11), R14 |
| SUBQ $0x20, R13 |
| SUBQ $0x04, R14 |
| MOVQ (R11), R15 |
| |
| // b.value |= uint64(low) << (b.bitsRead & 63) |
| // low is the 32-bit word at in[off]; MOVL zero-extends it into R15. |
| MOVL (R14)(R15*1), R15 |
| MOVQ R13, CX |
| SHLQ CL, R15 |
| MOVQ R14, 24(R11) |
| ORQ R15, R12 |
| |
| // exhausted = exhausted || (br0.off < 4) |
| CMPQ R14, $0x04 |
| SETLT AL |
| ORB AL, DL |
| |
| skip_fill0: |
| // val0 := br0.peekTopBits(peekBits) |
| MOVQ R12, R14 |
| MOVQ DI, CX |
| SHRQ CL, R14 |
| |
| // v0 := table[val0&mask] |
| // After the load CL = bits to consume and CH = decoded symbol. |
| MOVW (R10)(R14*2), CX |
| |
| // br0.advance(uint8(v0.entry)) |
| // The symbol is saved in AL because CX is overwritten by the next peek. |
| MOVB CH, AL |
| SHLQ CL, R12 |
| ADDB CL, R13 |
| |
| // val1 := br0.peekTopBits(peekBits) |
| MOVQ R12, R14 |
| MOVQ DI, CX |
| SHRQ CL, R14 |
| |
| // v1 := table[val1&mask] |
| MOVW (R10)(R14*2), CX |
| |
| // br0.advance(uint8(v1.entry)) |
| // BSWAPL moves v0/v1 from AL/AH into the two upper bytes of EAX, freeing |
| // AL/AH for the next two symbols. |
| MOVB CH, AH |
| SHLQ CL, R12 |
| ADDB CL, R13 |
| BSWAPL AX |
| |
| // val2 := br0.peekTopBits(peekBits) |
| MOVQ R12, R14 |
| MOVQ DI, CX |
| SHRQ CL, R14 |
| |
| // v2 := table[val2&mask] |
| MOVW (R10)(R14*2), CX |
| |
| // br0.advance(uint8(v2.entry)) |
| // v2 goes to AH and v3 to AL (reversed), because the second BSWAPL below |
| // reverses the bytes once more. |
| MOVB CH, AH |
| SHLQ CL, R12 |
| ADDB CL, R13 |
| |
| // val3 := br0.peekTopBits(peekBits) |
| MOVQ R12, R14 |
| MOVQ DI, CX |
| SHRQ CL, R14 |
| |
| // v3 := table[val3&mask] |
| MOVW (R10)(R14*2), CX |
| |
| // br0.advance(uint8(v3.entry)) |
| // After this BSWAPL the bytes of EAX are v0, v1, v2, v3 from lowest to highest. |
| MOVB CH, AL |
| SHLQ CL, R12 |
| ADDB CL, R13 |
| BSWAPL AX |
| |
| // these four writes get coalesced |
| // out[id * dstEvery + 0] = uint8(v0.entry >> 8) |
| // out[id * dstEvery + 1] = uint8(v1.entry >> 8) |
| // out[id * dstEvery + 2] = uint8(v2.entry >> 8) |
| // out[id * dstEvery + 3] = uint8(v3.entry >> 8) |
| MOVL AX, (R8) |
| |
| // update the bitreader structure |
| // value and bitsRead are written back; R8 then moves on to the next stream's output. |
| MOVQ R12, 32(R11) |
| MOVB R13, 40(R11) |
| ADDQ R9, R8 |
| |
| // br1.fillFast32() |
| // Same sequence as for br0. Reader 1 fields: in = 48, off = 72, value = 80, bitsRead = 88. |
| MOVQ 80(R11), R12 |
| MOVBQZX 88(R11), R13 |
| CMPQ R13, $0x20 |
| JBE skip_fill1 |
| MOVQ 72(R11), R14 |
| SUBQ $0x20, R13 |
| SUBQ $0x04, R14 |
| MOVQ 48(R11), R15 |
| |
| // b.value |= uint64(low) << (b.bitsRead & 63) |
| MOVL (R14)(R15*1), R15 |
| MOVQ R13, CX |
| SHLQ CL, R15 |
| MOVQ R14, 72(R11) |
| ORQ R15, R12 |
| |
| // exhausted = exhausted || (br1.off < 4) |
| CMPQ R14, $0x04 |
| SETLT AL |
| ORB AL, DL |
| |
| skip_fill1: |
| // val0 := br1.peekTopBits(peekBits) |
| MOVQ R12, R14 |
| MOVQ DI, CX |
| SHRQ CL, R14 |
| |
| // v0 := table[val0&mask] |
| MOVW (R10)(R14*2), CX |
| |
| // br1.advance(uint8(v0.entry)) |
| MOVB CH, AL |
| SHLQ CL, R12 |
| ADDB CL, R13 |
| |
| // val1 := br1.peekTopBits(peekBits) |
| MOVQ R12, R14 |
| MOVQ DI, CX |
| SHRQ CL, R14 |
| |
| // v1 := table[val1&mask] |
| MOVW (R10)(R14*2), CX |
| |
| // br1.advance(uint8(v1.entry)) |
| MOVB CH, AH |
| SHLQ CL, R12 |
| ADDB CL, R13 |
| BSWAPL AX |
| |
| // val2 := br1.peekTopBits(peekBits) |
| MOVQ R12, R14 |
| MOVQ DI, CX |
| SHRQ CL, R14 |
| |
| // v2 := table[val2&mask] |
| MOVW (R10)(R14*2), CX |
| |
| // br1.advance(uint8(v2.entry)) |
| MOVB CH, AH |
| SHLQ CL, R12 |
| ADDB CL, R13 |
| |
| // val3 := br1.peekTopBits(peekBits) |
| MOVQ R12, R14 |
| MOVQ DI, CX |
| SHRQ CL, R14 |
| |
| // v3 := table[val3&mask] |
| MOVW (R10)(R14*2), CX |
| |
| // br1.advance(uint8(v3.entry)) |
| MOVB CH, AL |
| SHLQ CL, R12 |
| ADDB CL, R13 |
| BSWAPL AX |
| |
| // these four writes get coalesced |
| // out[id * dstEvery + 0] = uint8(v0.entry >> 8) |
| // out[id * dstEvery + 1] = uint8(v1.entry >> 8) |
| // out[id * dstEvery + 2] = uint8(v2.entry >> 8) |
| // out[id * dstEvery + 3] = uint8(v3.entry >> 8) |
| MOVL AX, (R8) |
| |
| // update the bitreader structure |
| MOVQ R12, 80(R11) |
| MOVB R13, 88(R11) |
| ADDQ R9, R8 |
| |
| // br2.fillFast32() |
| // Same sequence as for br0. Reader 2 fields: in = 96, off = 120, value = 128, bitsRead = 136. |
| MOVQ 128(R11), R12 |
| MOVBQZX 136(R11), R13 |
| CMPQ R13, $0x20 |
| JBE skip_fill2 |
| MOVQ 120(R11), R14 |
| SUBQ $0x20, R13 |
| SUBQ $0x04, R14 |
| MOVQ 96(R11), R15 |
| |
| // b.value |= uint64(low) << (b.bitsRead & 63) |
| MOVL (R14)(R15*1), R15 |
| MOVQ R13, CX |
| SHLQ CL, R15 |
| MOVQ R14, 120(R11) |
| ORQ R15, R12 |
| |
| // exhausted = exhausted || (br2.off < 4) |
| CMPQ R14, $0x04 |
| SETLT AL |
| ORB AL, DL |
| |
| skip_fill2: |
| // val0 := br2.peekTopBits(peekBits) |
| MOVQ R12, R14 |
| MOVQ DI, CX |
| SHRQ CL, R14 |
| |
| // v0 := table[val0&mask] |
| MOVW (R10)(R14*2), CX |
| |
| // br2.advance(uint8(v0.entry)) |
| MOVB CH, AL |
| SHLQ CL, R12 |
| ADDB CL, R13 |
| |
| // val1 := br2.peekTopBits(peekBits) |
| MOVQ R12, R14 |
| MOVQ DI, CX |
| SHRQ CL, R14 |
| |
| // v1 := table[val1&mask] |
| MOVW (R10)(R14*2), CX |
| |
| // br2.advance(uint8(v1.entry)) |
| MOVB CH, AH |
| SHLQ CL, R12 |
| ADDB CL, R13 |
| BSWAPL AX |
| |
| // val2 := br2.peekTopBits(peekBits) |
| MOVQ R12, R14 |
| MOVQ DI, CX |
| SHRQ CL, R14 |
| |
| // v2 := table[val2&mask] |
| MOVW (R10)(R14*2), CX |
| |
| // br2.advance(uint8(v2.entry)) |
| MOVB CH, AH |
| SHLQ CL, R12 |
| ADDB CL, R13 |
| |
| // val3 := br2.peekTopBits(peekBits) |
| MOVQ R12, R14 |
| MOVQ DI, CX |
| SHRQ CL, R14 |
| |
| // v3 := table[val3&mask] |
| MOVW (R10)(R14*2), CX |
| |
| // br2.advance(uint8(v3.entry)) |
| MOVB CH, AL |
| SHLQ CL, R12 |
| ADDB CL, R13 |
| BSWAPL AX |
| |
| // these four writes get coalesced |
| // out[id * dstEvery + 0] = uint8(v0.entry >> 8) |
| // out[id * dstEvery + 1] = uint8(v1.entry >> 8) |
| // out[id * dstEvery + 2] = uint8(v2.entry >> 8) |
| // out[id * dstEvery + 3] = uint8(v3.entry >> 8) |
| MOVL AX, (R8) |
| |
| // update the bitreader structure |
| MOVQ R12, 128(R11) |
| MOVB R13, 136(R11) |
| ADDQ R9, R8 |
| |
| // br3.fillFast32() |
| // Same sequence as for br0. Reader 3 fields: in = 144, off = 168, value = 176, bitsRead = 184. |
| MOVQ 176(R11), R12 |
| MOVBQZX 184(R11), R13 |
| CMPQ R13, $0x20 |
| JBE skip_fill3 |
| MOVQ 168(R11), R14 |
| SUBQ $0x20, R13 |
| SUBQ $0x04, R14 |
| MOVQ 144(R11), R15 |
| |
| // b.value |= uint64(low) << (b.bitsRead & 63) |
| MOVL (R14)(R15*1), R15 |
| MOVQ R13, CX |
| SHLQ CL, R15 |
| MOVQ R14, 168(R11) |
| ORQ R15, R12 |
| |
| // exhausted = exhausted || (br3.off < 4) |
| CMPQ R14, $0x04 |
| SETLT AL |
| ORB AL, DL |
| |
| skip_fill3: |
| // val0 := br3.peekTopBits(peekBits) |
| MOVQ R12, R14 |
| MOVQ DI, CX |
| SHRQ CL, R14 |
| |
| // v0 := table[val0&mask] |
| MOVW (R10)(R14*2), CX |
| |
| // br3.advance(uint8(v0.entry)) |
| MOVB CH, AL |
| SHLQ CL, R12 |
| ADDB CL, R13 |
| |
| // val1 := br3.peekTopBits(peekBits) |
| MOVQ R12, R14 |
| MOVQ DI, CX |
| SHRQ CL, R14 |
| |
| // v1 := table[val1&mask] |
| MOVW (R10)(R14*2), CX |
| |
| // br3.advance(uint8(v1.entry)) |
| MOVB CH, AH |
| SHLQ CL, R12 |
| ADDB CL, R13 |
| BSWAPL AX |
| |
| // val2 := br3.peekTopBits(peekBits) |
| MOVQ R12, R14 |
| MOVQ DI, CX |
| SHRQ CL, R14 |
| |
| // v2 := table[val2&mask] |
| MOVW (R10)(R14*2), CX |
| |
| // br3.advance(uint8(v2.entry)) |
| MOVB CH, AH |
| SHLQ CL, R12 |
| ADDB CL, R13 |
| |
| // val3 := br3.peekTopBits(peekBits) |
| MOVQ R12, R14 |
| MOVQ DI, CX |
| SHRQ CL, R14 |
| |
| // v3 := table[val3&mask] |
| MOVW (R10)(R14*2), CX |
| |
| // br3.advance(uint8(v3.entry)) |
| MOVB CH, AL |
| SHLQ CL, R12 |
| ADDB CL, R13 |
| BSWAPL AX |
| |
| // these four writes get coalesced |
| // out[id * dstEvery + 0] = uint8(v0.entry >> 8) |
| // out[id * dstEvery + 1] = uint8(v1.entry >> 8) |
| // out[id * dstEvery + 2] = uint8(v2.entry >> 8) |
| // out[id * dstEvery + 3] = uint8(v3.entry >> 8) |
| MOVL AX, (R8) |
| |
| // update the bitreader structure |
| // After the last stream the shared cursor advances by the four bytes written |
| // per stream, and the loop repeats unless the exhausted flag was set. |
| MOVQ R12, 176(R11) |
| MOVB R13, 184(R11) |
| ADDQ $0x04, BX |
| TESTB DL, DL |
| JZ main_loop |
| // Store the result: (cursor - initial output pointer) * 4 goes to 40(ctx). |
| MOVQ ctx+0(FP), AX |
| SUBQ 16(AX), BX |
| SHLQ $0x02, BX |
| MOVQ BX, 40(AX) |
| RET |
| |
| // func decompress1x_main_loop_amd64(ctx *decompress1xContext) |
| // |
| // Main loop of the single-stream table decoder. Every iteration decodes four |
| // symbols, packs them into EAX and stores them with one 32-bit write. |
| // |
| // Fields of ctx as accessed here (field names are not visible in this file; |
| // the roles below are what the code does with each offset): |
| //    0(ctx)  pointer to the bit reader |
| //    8(ctx)  one byte: right-shift count that turns the bit buffer into a table index |
| //   16(ctx)  output pointer |
| //   24(ctx)  output size in bytes; added to the output pointer to form the end pointer |
| //   32(ctx)  table base; entries are 16 bits wide |
| //   40(ctx)  result: number of bytes written, or -1 on error |
| // |
| // Bit reader fields (presumably in, off, value and bitsRead - confirm |
| // against the Go struct): |
| //    0  base pointer of the input bytes |
| //   24  read offset into the input, decremented by 4 on every refill |
| //   32  64-bit bit buffer |
| //   40  one byte: count of consumed bits |
| // The reader state is kept in registers during the loop and written back |
| // only on the normal exit. |
| // |
| // Table entry: the low byte is the number of bits to consume, the high byte |
| // is the decoded symbol. |
| // |
| // The loop runs while the read offset is >= 8 (signed compare). |
| // |
| // Error exit (40(ctx) = -1, reader state in memory left untouched): the |
| // output size is below 4 on entry (unsigned compare), or cursor + 4 >= end at |
| // the top of an iteration (signed compare). |
| TEXT ·decompress1x_main_loop_amd64(SB), $0-8 |
| // DX = output cursor, BX = output size and, after the LEAQ, the end pointer. |
| MOVQ ctx+0(FP), CX |
| MOVQ 16(CX), DX |
| MOVQ 24(CX), BX |
| CMPQ BX, $0x04 |
| JB error_max_decoded_size_exeeded |
| LEAQ (DX)(BX*1), BX |
| // Load the bit reader: R8 = input base, R9 = read offset, R10 = bit buffer, |
| // R11 = consumed-bit count. SI is then reused for the table base; DI = peek shift. |
| MOVQ (CX), SI |
| MOVQ (SI), R8 |
| MOVQ 24(SI), R9 |
| MOVQ 32(SI), R10 |
| MOVBQZX 40(SI), R11 |
| MOVQ 32(CX), SI |
| MOVBQZX 8(CX), DI |
| JMP loop_condition |
| |
| main_loop: |
| // Check if we have room for 4 bytes in the output buffer |
| // NOTE(review): the branch is taken when cursor + 4 >= end, so exactly four |
| // remaining bytes are also rejected - confirm that this is intended. |
| LEAQ 4(DX), CX |
| CMPQ CX, BX |
| JGE error_max_decoded_size_exeeded |
| |
| // Decode 4 values |
| // First refill: skipped while fewer than 32 bits have been consumed. |
| // Otherwise the 32-bit word at in[off-4] is shifted left by (count - 32) |
| // and ORed into the bit buffer. |
| CMPQ R11, $0x20 |
| JL bitReader_fillFast_1_end |
| SUBQ $0x20, R11 |
| SUBQ $0x04, R9 |
| MOVL (R8)(R9*1), R12 |
| MOVQ R11, CX |
| SHLQ CL, R12 |
| ORQ R12, R10 |
| |
| bitReader_fillFast_1_end: |
| // Symbol 0: index = buffer >> DI; the entry is loaded into CX. The symbol (CH) |
| // is saved in AL before CX is zero-extended from CL to become the bit count. |
| MOVQ DI, CX |
| MOVQ R10, R12 |
| SHRQ CL, R12 |
| MOVW (SI)(R12*2), CX |
| MOVB CH, AL |
| MOVBQZX CL, CX |
| ADDQ CX, R11 |
| SHLQ CL, R10 |
| // Symbol 1 goes to AH. BSWAPL then moves both symbols into the two upper |
| // bytes of EAX, freeing AL/AH for the next pair. |
| MOVQ DI, CX |
| MOVQ R10, R12 |
| SHRQ CL, R12 |
| MOVW (SI)(R12*2), CX |
| MOVB CH, AH |
| MOVBQZX CL, CX |
| ADDQ CX, R11 |
| SHLQ CL, R10 |
| BSWAPL AX |
| // Second refill, same rule as the first. |
| CMPQ R11, $0x20 |
| JL bitReader_fillFast_2_end |
| SUBQ $0x20, R11 |
| SUBQ $0x04, R9 |
| MOVL (R8)(R9*1), R12 |
| MOVQ R11, CX |
| SHLQ CL, R12 |
| ORQ R12, R10 |
| |
| bitReader_fillFast_2_end: |
| // Symbol 2 goes to AH and symbol 3 to AL (reversed), because the final BSWAPL |
| // reverses the bytes once more. |
| MOVQ DI, CX |
| MOVQ R10, R12 |
| SHRQ CL, R12 |
| MOVW (SI)(R12*2), CX |
| MOVB CH, AH |
| MOVBQZX CL, CX |
| ADDQ CX, R11 |
| SHLQ CL, R10 |
| MOVQ DI, CX |
| MOVQ R10, R12 |
| SHRQ CL, R12 |
| MOVW (SI)(R12*2), CX |
| MOVB CH, AL |
| MOVBQZX CL, CX |
| ADDQ CX, R11 |
| SHLQ CL, R10 |
| BSWAPL AX |
| |
| // Store the decoded values |
| // The bytes of EAX are now symbols 0, 1, 2, 3 from lowest to highest. |
| MOVL AX, (DX) |
| ADDQ $0x04, DX |
| |
| loop_condition: |
| // Continue while the read offset is at least 8 (signed compare). |
| CMPQ R9, $0x08 |
| JGE main_loop |
| |
| // Update ctx structure |
| // 40(ctx) = cursor - initial output pointer; then the read offset, bit buffer |
| // and consumed-bit count are written back to the bit reader. |
| MOVQ ctx+0(FP), AX |
| SUBQ 16(AX), DX |
| MOVQ DX, 40(AX) |
| MOVQ (AX), AX |
| MOVQ R9, 24(AX) |
| MOVQ R10, 32(AX) |
| MOVB R11, 40(AX) |
| RET |
| |
| // Report error |
| // 40(ctx) is set to -1; nothing else is written back. |
| error_max_decoded_size_exeeded: |
| MOVQ ctx+0(FP), AX |
| MOVQ $-1, CX |
| MOVQ CX, 40(AX) |
| RET |
| |
| // func decompress1x_main_loop_bmi2(ctx *decompress1xContext) |
| // Requires: BMI2 |
| // |
| // Main loop of the single-stream table decoder, written with the BMI2 shifts |
| // SHLXQ/SHRXQ, which take the shift count from any register instead of CL. |
| // Every iteration decodes four symbols, packs them into EAX and stores them |
| // with one 32-bit write. |
| // |
| // Fields of ctx as accessed here (field names are not visible in this file; |
| // the roles below are what the code does with each offset): |
| //    0(ctx)  pointer to the bit reader |
| //    8(ctx)  one byte: right-shift count that turns the bit buffer into a table index |
| //   16(ctx)  output pointer |
| //   24(ctx)  output size in bytes; added to the output pointer to form the end pointer |
| //   32(ctx)  table base; entries are 16 bits wide |
| //   40(ctx)  result: number of bytes written, or -1 on error |
| // |
| // Bit reader fields (presumably in, off, value and bitsRead - confirm |
| // against the Go struct): |
| //    0  base pointer of the input bytes |
| //   24  read offset into the input, decremented by 4 on every refill |
| //   32  64-bit bit buffer |
| //   40  one byte: count of consumed bits |
| // The reader state is kept in registers during the loop and written back |
| // only on the normal exit. |
| // |
| // Table entry: the low byte is the number of bits to consume, the high byte |
| // is the decoded symbol. |
| // |
| // The loop runs while the read offset is >= 8 (signed compare). |
| // |
| // Error exit (40(ctx) = -1, reader state in memory left untouched): the |
| // output size is below 4 on entry (unsigned compare), or cursor + 4 >= end at |
| // the top of an iteration (signed compare). |
| TEXT ·decompress1x_main_loop_bmi2(SB), $0-8 |
| // DX = output cursor, BX = output size and, after the LEAQ, the end pointer. |
| MOVQ ctx+0(FP), CX |
| MOVQ 16(CX), DX |
| MOVQ 24(CX), BX |
| CMPQ BX, $0x04 |
| JB error_max_decoded_size_exeeded |
| LEAQ (DX)(BX*1), BX |
| // Load the bit reader: R8 = input base, R9 = read offset, R10 = bit buffer, |
| // R11 = consumed-bit count. SI is then reused for the table base; DI = peek shift. |
| MOVQ (CX), SI |
| MOVQ (SI), R8 |
| MOVQ 24(SI), R9 |
| MOVQ 32(SI), R10 |
| MOVBQZX 40(SI), R11 |
| MOVQ 32(CX), SI |
| MOVBQZX 8(CX), DI |
| JMP loop_condition |
| |
| main_loop: |
| // Check if we have room for 4 bytes in the output buffer |
| // NOTE(review): the branch is taken when cursor + 4 >= end, so exactly four |
| // remaining bytes are also rejected - confirm that this is intended. |
| LEAQ 4(DX), CX |
| CMPQ CX, BX |
| JGE error_max_decoded_size_exeeded |
| |
| // Decode 4 values |
| // First refill: skipped while fewer than 32 bits have been consumed. |
| // Otherwise the 32-bit word at in[off-4] is shifted left by (count - 32) |
| // and ORed into the bit buffer. |
| CMPQ R11, $0x20 |
| JL bitReader_fillFast_1_end |
| SUBQ $0x20, R11 |
| SUBQ $0x04, R9 |
| MOVL (R8)(R9*1), CX |
| SHLXQ R11, CX, CX |
| ORQ CX, R10 |
| |
| bitReader_fillFast_1_end: |
| // Symbol 0: CX = buffer >> DI is the table index; the entry is loaded into CX. |
| // The symbol (CH) is saved in AL, then CX is zero-extended from CL to become |
| // the bit count that is added to R11 and shifted out of the buffer. |
| SHRXQ DI, R10, CX |
| MOVW (SI)(CX*2), CX |
| MOVB CH, AL |
| MOVBQZX CL, CX |
| ADDQ CX, R11 |
| SHLXQ CX, R10, R10 |
| // Symbol 1 goes to AH. BSWAPL then moves both symbols into the two upper |
| // bytes of EAX, freeing AL/AH for the next pair. |
| SHRXQ DI, R10, CX |
| MOVW (SI)(CX*2), CX |
| MOVB CH, AH |
| MOVBQZX CL, CX |
| ADDQ CX, R11 |
| SHLXQ CX, R10, R10 |
| BSWAPL AX |
| // Second refill, same rule as the first. |
| CMPQ R11, $0x20 |
| JL bitReader_fillFast_2_end |
| SUBQ $0x20, R11 |
| SUBQ $0x04, R9 |
| MOVL (R8)(R9*1), CX |
| SHLXQ R11, CX, CX |
| ORQ CX, R10 |
| |
| bitReader_fillFast_2_end: |
| // Symbol 2 goes to AH and symbol 3 to AL (reversed), because the final BSWAPL |
| // reverses the bytes once more. |
| SHRXQ DI, R10, CX |
| MOVW (SI)(CX*2), CX |
| MOVB CH, AH |
| MOVBQZX CL, CX |
| ADDQ CX, R11 |
| SHLXQ CX, R10, R10 |
| SHRXQ DI, R10, CX |
| MOVW (SI)(CX*2), CX |
| MOVB CH, AL |
| MOVBQZX CL, CX |
| ADDQ CX, R11 |
| SHLXQ CX, R10, R10 |
| BSWAPL AX |
| |
| // Store the decoded values |
| // The bytes of EAX are now symbols 0, 1, 2, 3 from lowest to highest. |
| MOVL AX, (DX) |
| ADDQ $0x04, DX |
| |
| loop_condition: |
| // Continue while the read offset is at least 8 (signed compare). |
| CMPQ R9, $0x08 |
| JGE main_loop |
| |
| // Update ctx structure |
| // 40(ctx) = cursor - initial output pointer; then the read offset, bit buffer |
| // and consumed-bit count are written back to the bit reader. |
| MOVQ ctx+0(FP), AX |
| SUBQ 16(AX), DX |
| MOVQ DX, 40(AX) |
| MOVQ (AX), AX |
| MOVQ R9, 24(AX) |
| MOVQ R10, 32(AX) |
| MOVB R11, 40(AX) |
| RET |
| |
| // Report error |
| // 40(ctx) is set to -1; nothing else is written back. |
| error_max_decoded_size_exeeded: |
| MOVQ ctx+0(FP), AX |
| MOVQ $-1, CX |
| MOVQ CX, 40(AX) |
| RET |