// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

// AX scratch
// BX scratch
// CX scratch
// DX token
//
// DI &dst (current output position)
// SI &src (current input position)
// R8 &dst + len(dst)
// R9 &src + len(src)
// R11 &dst (start of dst, used to compute the return value)
// R12 short output end (R8 - 32)
// R13 short input end (R9 - 16)
// func decodeBlock(dst, src []byte) int
// Uses 49 bytes of the 64-byte frame: 0(SP)-16(SP) hold the memmove
// arguments, 24(SP)-48(SP) spill DI, SI, CX and the token byte.
TEXT ·decodeBlock(SB), NOSPLIT, $64-56
	MOVQ dst_base+0(FP), DI
	MOVQ DI, R11
	MOVQ dst_len+8(FP), R8
	ADDQ DI, R8

	MOVQ src_base+24(FP), SI
	MOVQ src_len+32(FP), R9
	ADDQ SI, R9

	// shortcut ends
	// short output end
	MOVQ R8, R12
	SUBQ $32, R12
	// short input end
	MOVQ R9, R13
	SUBQ $16, R13
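	// These margins guard the blind copies in the shortcut below: with
	// di <= len(dst)-32, the 16-byte literal store, an advance of up to
	// 14, and the 18-byte match store all stay within dst; with
	// si <= len(src)-16, the 16-byte literal load, the same advance, and
	// the 2-byte offset load all stay within src.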

loop:
	// for si < len(src)
	CMPQ SI, R9
	JGE end

	// token := uint32(src[si])
	MOVBQZX (SI), DX
	INCQ SI
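	// LZ4 token layout: the high nibble is the literal length, the low
	// nibble is the match length minus minMatch (4); a nibble of 0xF
	// means the length continues in the following bytes.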

	// lit_len := token >> 4
	MOVQ DX, CX
	SHRQ $4, CX

	// the shortcut below requires lit_len != 0xF and room for the blind
	// copies in both buffers; otherwise take the generic path
	CMPQ CX, $0xF
	JEQ lit_len_loop_pre
	// if di >= len(dst)-32
	CMPQ DI, R12
	JGE lit_len_loop_pre
	// if si >= len(src)-16
	CMPQ SI, R13
	JGE lit_len_loop_pre

	// copy shortcut

	// A two-stage shortcut for the most common case:
	// 1) If the literal length is 0..14, and there is enough space,
	// enter the shortcut and copy 16 bytes on behalf of the literals
	// (in the fast mode, only 8 bytes can be safely copied this way).
	// 2) Further, if the match length is 4..18, copy 18 bytes in a similar
	// manner; but we ensure that there's enough space in the output for
	// those 18 bytes earlier, upon entering the shortcut (in other words,
	// there is a combined check for both stages).
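	//
	// A rough Go sketch of the shortcut (illustrative only; copy16/copy8/
	// copy2 stand for single unaligned load+store pairs, executed in
	// order, so a later load may observe an earlier store):
	//
	//	litLen := int(token >> 4)   // 0..14 on this path
	//	copy16(dst[di:], src[si:])  // blind 16-byte literal copy
	//	di, si = di+litLen, si+litLen
	//	offset := int(src[si]) | int(src[si+1])<<8
	//	si += 2
	//	match := di - offset        // offset >= 8 on this path
	//	copy8(dst[di:], dst[match:])
	//	copy8(dst[di+8:], dst[match+8:])
	//	copy2(dst[di+16:], dst[match+16:])
	//	di += int(token&0xF) + 4    // match_len is 4..18 on this path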

	// copy literal: one blind 16-byte copy covers lit_len <= 14 bytes
	MOVOU (SI), X0
	MOVOU X0, (DI)
	ADDQ CX, DI
	ADDQ CX, SI

	// CX := match_len nibble (token & 0xF)
	MOVQ DX, CX
	ANDQ $0xF, CX

	// The second stage: prepare for match copying, decode full info.
	// If it doesn't work out, the info won't be wasted.
	// offset := int(src[si]) | int(src[si+1])<<8
	MOVWQZX (SI), DX
	ADDQ $2, SI

	// AX := &dst[di-offset], the start of the match
	MOVQ DI, AX
	SUBQ DX, AX
	// if the match starts past di (pointer wrapped), fail
	CMPQ AX, DI
	JGT err_short_buf

	// if we can't do the second stage then jump straight to read the
	// match length; we already have the offset.
	CMPQ CX, $0xF
	JEQ match_len_loop_pre
	// offsets smaller than 8 can't use the blind 8-byte copies
	CMPQ DX, $8
	JLT match_len_loop_pre
	// the match must not start before dst
	CMPQ AX, R11
	JLT err_short_buf

	// memcpy(op + 0, match + 0, 8);
	MOVQ (AX), BX
	MOVQ BX, (DI)
	// memcpy(op + 8, match + 8, 8);
	MOVQ 8(AX), BX
	MOVQ BX, 8(DI)
	// memcpy(op +16, match +16, 2);
	MOVW 16(AX), BX
	MOVW BX, 16(DI)

	// di += match_len (minMatch of 4 plus the token nibble)
	ADDQ $4, DI
	ADDQ CX, DI

	// shortcut complete, load next token
	JMP loop

lit_len_loop_pre:
	// if lit_len > 0
	CMPQ CX, $0
	JEQ offset
	CMPQ CX, $0xF
	JNE copy_literal

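	// Lengths of 15 and over are encoded as a 0xF nibble plus extra
	// bytes: each 0xFF byte adds 255 and the first non-0xFF byte
	// terminates, e.g. lit_len 284 = 15 (nibble) + 255 (0xFF) + 14 (0x0E).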
lit_len_loop:
	// for src[si] == 0xFF
	CMPB (SI), $0xFF
	JNE lit_len_finalise

	// bounds check src[si+1]
	MOVQ SI, AX
	ADDQ $1, AX
	CMPQ AX, R9
	JGT err_short_buf

	// lit_len += 0xFF
	ADDQ $0xFF, CX
	INCQ SI
	JMP lit_len_loop

lit_len_finalise:
	// lit_len += int(src[si])
	// si++
	MOVBQZX (SI), AX
	ADDQ AX, CX
	INCQ SI

copy_literal:
	// bounds check src and dst
	MOVQ SI, AX
	ADDQ CX, AX
	CMPQ AX, R9
	JGT err_short_buf

	MOVQ DI, AX
	ADDQ CX, AX
	CMPQ AX, R8
	JGT err_short_buf

	// what's a good cutoff to call memmove?
	CMPQ CX, $16
	JGT memmove_lit

	// if len(dst[di:]) < 16
	MOVQ R8, AX
	SUBQ DI, AX
	CMPQ AX, $16
	JLT memmove_lit

	// if len(src[si:]) < 16
	MOVQ R9, AX
	SUBQ SI, AX
	CMPQ AX, $16
	JLT memmove_lit

	// copy the literal with one blind 16-byte load/store
	MOVOU (SI), X0
	MOVOU X0, (DI)

	JMP finish_lit_copy

memmove_lit:
	// memmove(to, from, len)
	MOVQ DI, 0(SP)
	MOVQ SI, 8(SP)
	MOVQ CX, 16(SP)
	// spill
	MOVQ DI, 24(SP)
	MOVQ SI, 32(SP)
	MOVQ CX, 40(SP) // need len to inc SI, DI after
	MOVB DX, 48(SP)
	CALL runtime·memmove(SB)

	// restore registers
	MOVQ 24(SP), DI
	MOVQ 32(SP), SI
	MOVQ 40(SP), CX
	// zero-extend the token: a plain MOVB would keep whatever memmove
	// left in the high bits of DX, corrupting the match length later
	MOVBQZX 48(SP), DX

	// recalc initial values
	MOVQ dst_base+0(FP), R8
	MOVQ R8, R11
	ADDQ dst_len+8(FP), R8
	MOVQ src_base+24(FP), R9
	ADDQ src_len+32(FP), R9
	MOVQ R8, R12
	SUBQ $32, R12
	MOVQ R9, R13
	SUBQ $16, R13

finish_lit_copy:
	ADDQ CX, SI
	ADDQ CX, DI

	CMPQ SI, R9
	JGE end

offset:
	// CX := token, freeing DX to hold the offset;
	// the match_len nibble is extracted below
	MOVQ DX, CX

	// bounds check the two offset bytes
	MOVQ SI, AX
	ADDQ $2, AX
	CMPQ AX, R9
	JGT err_short_buf

	// offset
	// DX := int(src[si]) | int(src[si+1])<<8
	MOVWQZX (SI), DX
	ADDQ $2, SI

	// 0 offset is invalid
	CMPQ DX, $0
	JEQ err_corrupt

	// match_len := token & 0xF
	ANDB $0xF, CX

match_len_loop_pre:
	// if match_len != 0xF
	CMPB CX, $0xF
	JNE copy_match

match_len_loop:
	// for src[si] == 0xFF
	// match_len += 0xFF
	CMPB (SI), $0xFF
	JNE match_len_finalise

	// bounds check src[si+1]
	MOVQ SI, AX
	ADDQ $1, AX
	CMPQ AX, R9
	JGT err_short_buf

	ADDQ $0xFF, CX
	INCQ SI
	JMP match_len_loop

match_len_finalise:
	// match_len += int(src[si])
	// si++
	MOVBQZX (SI), AX
	ADDQ AX, CX
	INCQ SI

copy_match:
	// match_len += minMatch
	ADDQ $4, CX

	// check we have match_len bytes left in dst
	// di+match_len < len(dst)
	MOVQ DI, AX
	ADDQ CX, AX
	CMPQ AX, R8
	JGT err_short_buf

	// DX = offset
	// CX = match_len
	// BX = &dst + (di - offset)
	MOVQ DI, BX
	SUBQ DX, BX

	// check BX is within dst
	// if BX < &dst
	CMPQ BX, R11
	JLT err_short_buf

	// if di-offset+match_len < di, the match ends before di, the regions
	// don't overlap, and the faster bulk copy is safe
	MOVQ BX, AX
	ADDQ CX, AX
	CMPQ DI, AX
	JGT copy_interior_match

	// AX := len(dst[:di])
	// MOVQ DI, AX
	// SUBQ R11, AX

	// copy 16 bytes at a time
	// if di-offset < 16 copy 16-(di-offset) bytes to di
	// then do the remaining

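	// Overlapping matches (offset < match_len) must re-read bytes just
	// written, e.g. offset 1 repeats one byte match_len times, so they
	// are copied a byte at a time.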
copy_match_loop:
	// for match_len > 0
	// dst[di] = dst[i]
	// di++
	// i++
	MOVB (BX), AX
	MOVB AX, (DI)
	INCQ DI
	INCQ BX
	DECQ CX

	CMPQ CX, $0
	JGT copy_match_loop

	JMP loop

copy_interior_match:
	// long copies go through memmove
	CMPQ CX, $16
	JGT memmove_match

	// if len(dst[di:]) < 16
	MOVQ R8, AX
	SUBQ DI, AX
	CMPQ AX, $16
	JLT memmove_match

	// short non-overlapping match: one blind 16-byte copy
	MOVOU (BX), X0
	MOVOU X0, (DI)

	ADDQ CX, DI
	JMP loop

memmove_match:
	// memmove(to, from, len)
	MOVQ DI, 0(SP)
	MOVQ BX, 8(SP)
	MOVQ CX, 16(SP)
	// spill; the token in DX is not needed again before the next load
	MOVQ DI, 24(SP)
	MOVQ SI, 32(SP)
	MOVQ CX, 40(SP) // need len to inc SI, DI after
	CALL runtime·memmove(SB)

	// restore registers
	MOVQ 24(SP), DI
	MOVQ 32(SP), SI
	MOVQ 40(SP), CX

	// recalc initial values
	MOVQ dst_base+0(FP), R8
	MOVQ R8, R11 // TODO: make these sensible numbers
	ADDQ dst_len+8(FP), R8
	MOVQ src_base+24(FP), R9
	ADDQ src_len+32(FP), R9
	MOVQ R8, R12
	SUBQ $32, R12
	MOVQ R9, R13
	SUBQ $16, R13

	ADDQ CX, DI
	JMP loop

err_corrupt:
	// zero offset or malformed stream: return -1; the caller is expected
	// to map negative results to errors
	MOVQ $-1, ret+48(FP)
	RET

err_short_buf:
	// not enough room in dst (or truncated src): return -2
	MOVQ $-2, ret+48(FP)
	RET

end:
	// return the number of bytes written, di - &dst
	SUBQ R11, DI
	MOVQ DI, ret+48(FP)
	RET
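
// The Go side is assumed to declare a matching stub in a sibling file,
// e.g. (name and build tags mirroring this one):
//
//	//go:noescape
//	func decodeBlock(dst, src []byte) int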