// Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.

//go:build !appengine && !noasm && gc && !noasm
// +build !appengine,!noasm,gc,!noasm

// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: CMOV
//
// Decodes sequences in a loop. Each iteration writes one 24-byte record
// through R10: literal length at +0, match length at +8, match offset at +16.
// The bit buffer is refilled twice per iteration: once before the offset and
// match length are read, and once more before the literal length is read.
//
// Result (ret+24(FP)): 0 = success, 1 = non-zero match length with zero
// offset, 2 = match length too big, 4 = not enough literals.
//
// Register roles inside the loop:
//   DX        64-bit bit buffer (loaded from 32(br), stored back on success)
//   BX        number of bits already consumed from DX (byte at 40(br))
//   SI        byte count loaded from 24(br); decremented per byte consumed
//   R14       input read pointer, kept in the local (SP) between iterations
//   DI/R8/R9  literal-length / match-length / offset state words (72/80/88(ctx))
//   R10       output record pointer (104(ctx))
//   R11-R13   offset history, R11 most recent (144/152/160(s))
//   BP        scratch for bit masks
//
// State word layout as consumed below: bits 0-7 = bit count for the
// next-state read, bits 8-15 = extra-bit count for the value,
// bits 16-31 = next-state base, bits 32-63 = value baseline.
TEXT ·sequenceDecs_decode_amd64(SB), $8-32
	MOVQ br+8(FP), AX
	MOVQ 32(AX), DX
	MOVBQZX 40(AX), BX
	MOVQ 24(AX), SI
	MOVQ (AX), AX

	// Input pointer = base pointer at 0(br) + count at 24(br); bytes are consumed backwards.
	ADDQ SI, AX
	MOVQ AX, (SP)
	MOVQ ctx+16(FP), AX
	MOVQ 72(AX), DI
	MOVQ 80(AX), R8
	MOVQ 88(AX), R9
	MOVQ 104(AX), R10
	MOVQ s+0(FP), AX
	MOVQ 144(AX), R11
	MOVQ 152(AX), R12
	MOVQ 160(AX), R13

sequenceDecs_decode_amd64_main_loop:
	MOVQ (SP), R14

	// Fill bitreader to have enough for the offset and match length.
	// Fast path (>= 8 bytes left): step back BX/8 whole bytes and reload 8 bytes.
	CMPQ SI, $0x08
	JL sequenceDecs_decode_amd64_fill_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R14
	MOVQ (R14), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP sequenceDecs_decode_amd64_fill_end

sequenceDecs_decode_amd64_fill_byte_by_byte:
	// Slow path: shift in one byte at a time while bytes remain and
	// more than 7 bits have been consumed.
	CMPQ SI, $0x00
	JLE sequenceDecs_decode_amd64_fill_end
	CMPQ BX, $0x07
	JLE sequenceDecs_decode_amd64_fill_end
	SHLQ $0x08, DX
	SUBQ $0x01, R14
	SUBQ $0x01, SI
	SUBQ $0x08, BX
	MOVBQZX (R14), AX
	ORQ AX, DX
	JMP sequenceDecs_decode_amd64_fill_byte_by_byte

sequenceDecs_decode_amd64_fill_end:
	// Update offset
	// R15 = buffer with consumed bits shifted out; CL = extra-bit count;
	// AX = baseline. The extra bits are added only when the count is
	// non-zero and the read stays within the 64-bit buffer.
	MOVQ R9, AX
	MOVQ BX, CX
	MOVQ DX, R15
	SHLQ CL, R15
	MOVB AH, CL
	SHRQ $0x20, AX
	TESTQ CX, CX
	JZ sequenceDecs_decode_amd64_of_update_zero
	ADDQ CX, BX
	CMPQ BX, $0x40
	JA sequenceDecs_decode_amd64_of_update_zero
	CMPQ CX, $0x40
	JAE sequenceDecs_decode_amd64_of_update_zero
	NEGQ CX
	SHRQ CL, R15
	ADDQ R15, AX

sequenceDecs_decode_amd64_of_update_zero:
	MOVQ AX, 16(R10)

	// Update match length
	MOVQ R8, AX
	MOVQ BX, CX
	MOVQ DX, R15
	SHLQ CL, R15
	MOVB AH, CL
	SHRQ $0x20, AX
	TESTQ CX, CX
	JZ sequenceDecs_decode_amd64_ml_update_zero
	ADDQ CX, BX
	CMPQ BX, $0x40
	JA sequenceDecs_decode_amd64_ml_update_zero
	CMPQ CX, $0x40
	JAE sequenceDecs_decode_amd64_ml_update_zero
	NEGQ CX
	SHRQ CL, R15
	ADDQ R15, AX

sequenceDecs_decode_amd64_ml_update_zero:
	MOVQ AX, 8(R10)

	// Fill bitreader to have enough for the remaining
	CMPQ SI, $0x08
	JL sequenceDecs_decode_amd64_fill_2_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R14
	MOVQ (R14), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP sequenceDecs_decode_amd64_fill_2_end

sequenceDecs_decode_amd64_fill_2_byte_by_byte:
	CMPQ SI, $0x00
	JLE sequenceDecs_decode_amd64_fill_2_end
	CMPQ BX, $0x07
	JLE sequenceDecs_decode_amd64_fill_2_end
	SHLQ $0x08, DX
	SUBQ $0x01, R14
	SUBQ $0x01, SI
	SUBQ $0x08, BX
	MOVBQZX (R14), AX
	ORQ AX, DX
	JMP sequenceDecs_decode_amd64_fill_2_byte_by_byte

sequenceDecs_decode_amd64_fill_2_end:
	// Update literal length
	MOVQ DI, AX
	MOVQ BX, CX
	MOVQ DX, R15
	SHLQ CL, R15
	MOVB AH, CL
	SHRQ $0x20, AX
	TESTQ CX, CX
	JZ sequenceDecs_decode_amd64_ll_update_zero
	ADDQ CX, BX
	CMPQ BX, $0x40
	JA sequenceDecs_decode_amd64_ll_update_zero
	CMPQ CX, $0x40
	JAE sequenceDecs_decode_amd64_ll_update_zero
	NEGQ CX
	SHRQ CL, R15
	ADDQ R15, AX

sequenceDecs_decode_amd64_ll_update_zero:
	MOVQ AX, (R10)

	// Fill bitreader for state updates
	// Save the input pointer, then capture the offset state's extra-bit
	// count (bits 8-15) in AX before R9 is overwritten; it drives the
	// offset adjustment below. States are not advanced when the counter
	// at 96(ctx) is already zero.
	MOVQ R14, (SP)
	MOVQ R9, AX
	SHRQ $0x08, AX
	MOVBQZX AL, AX
	MOVQ ctx+16(FP), CX
	CMPQ 96(CX), $0x00
	JZ sequenceDecs_decode_amd64_skip_update

	// Update Literal Length State
	// R14 = bit count; rotate the buffer so the new bits sit in the low
	// R14 bits, mask with (1<<R14)-1 and add to the next-state base.
	MOVBQZX DI, R14
	SHRQ $0x10, DI
	MOVWQZX DI, DI
	LEAQ (BX)(R14*1), CX
	MOVQ DX, R15
	MOVQ CX, BX
	ROLQ CL, R15
	MOVL $0x00000001, BP
	MOVB R14, CL
	SHLL CL, BP
	DECL BP
	ANDQ BP, R15
	ADDQ R15, DI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Match Length State
	MOVBQZX R8, R14
	SHRQ $0x10, R8
	MOVWQZX R8, R8
	LEAQ (BX)(R14*1), CX
	MOVQ DX, R15
	MOVQ CX, BX
	ROLQ CL, R15
	MOVL $0x00000001, BP
	MOVB R14, CL
	SHLL CL, BP
	DECL BP
	ANDQ BP, R15
	ADDQ R15, R8

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Offset State
	MOVBQZX R9, R14
	SHRQ $0x10, R9
	MOVWQZX R9, R9
	LEAQ (BX)(R14*1), CX
	MOVQ DX, R15
	MOVQ CX, BX
	ROLQ CL, R15
	MOVL $0x00000001, BP
	MOVB R14, CL
	SHLL CL, BP
	DECL BP
	ANDQ BP, R15
	ADDQ R15, R9

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R9*8), R9

sequenceDecs_decode_amd64_skip_update:
	// Adjust offset
	// More than one extra bit: the decoded value is a new offset and is
	// pushed onto the history (R11 <- value, R12 <- R11, R13 <- R12).
	MOVQ 16(R10), CX
	CMPQ AX, $0x01
	JBE sequenceDecs_decode_amd64_adjust_offsetB_1_or_0
	MOVQ R12, R13
	MOVQ R11, R12
	MOVQ CX, R11
	JMP sequenceDecs_decode_amd64_after_adjust

sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
	// A zero literal length bumps the decoded value by one.
	CMPQ (R10), $0x00000000
	JNE sequenceDecs_decode_amd64_adjust_offset_maybezero
	INCQ CX
	JMP sequenceDecs_decode_amd64_adjust_offset_nonzero

sequenceDecs_decode_amd64_adjust_offset_maybezero:
	// Value 0: reuse the most recent offset, history unchanged.
	TESTQ CX, CX
	JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero
	MOVQ R11, CX
	JMP sequenceDecs_decode_amd64_after_adjust

sequenceDecs_decode_amd64_adjust_offset_nonzero:
	// Select temp in AX: 0 -> R11, 1 -> R12, 2 -> R13, above 2 -> R11-1.
	CMPQ CX, $0x01
	JB sequenceDecs_decode_amd64_adjust_zero
	JEQ sequenceDecs_decode_amd64_adjust_one
	CMPQ CX, $0x02
	JA sequenceDecs_decode_amd64_adjust_three
	JMP sequenceDecs_decode_amd64_adjust_two

sequenceDecs_decode_amd64_adjust_zero:
	MOVQ R11, AX
	JMP sequenceDecs_decode_amd64_adjust_test_temp_valid

sequenceDecs_decode_amd64_adjust_one:
	MOVQ R12, AX
	JMP sequenceDecs_decode_amd64_adjust_test_temp_valid

sequenceDecs_decode_amd64_adjust_two:
	MOVQ R13, AX
	JMP sequenceDecs_decode_amd64_adjust_test_temp_valid

sequenceDecs_decode_amd64_adjust_three:
	LEAQ -1(R11), AX

sequenceDecs_decode_amd64_adjust_test_temp_valid:
	// A zero temp is replaced by 1.
	TESTQ AX, AX
	JNZ sequenceDecs_decode_amd64_adjust_temp_valid
	MOVQ $0x00000001, AX

sequenceDecs_decode_amd64_adjust_temp_valid:
	// R13 keeps its value only when the selector was 1; temp becomes the
	// most recent offset and the emitted offset.
	CMPQ CX, $0x01
	CMOVQNE R12, R13
	MOVQ R11, R12
	MOVQ AX, R11
	MOVQ AX, CX

sequenceDecs_decode_amd64_after_adjust:
	MOVQ CX, 16(R10)

	// Check values
	// 256(s) accumulates match length + literal length; 128(ctx) is
	// reduced by the literal length and must not go negative.
	MOVQ 8(R10), AX
	MOVQ (R10), R14
	LEAQ (AX)(R14*1), R15
	MOVQ s+0(FP), BP
	ADDQ R15, 256(BP)
	MOVQ ctx+16(FP), R15
	SUBQ R14, 128(R15)
	JS error_not_enough_literals
	CMPQ AX, $0x00020002
	JA sequenceDecs_decode_amd64_error_match_len_too_big
	TESTQ CX, CX
	JNZ sequenceDecs_decode_amd64_match_len_ofs_ok
	TESTQ AX, AX
	JNZ sequenceDecs_decode_amd64_error_match_len_ofs_mismatch

sequenceDecs_decode_amd64_match_len_ofs_ok:
	// Advance to the next 24-byte record; loop until the counter at
	// 96(ctx) goes negative, then write back offsets and bit reader state.
	ADDQ $0x18, R10
	MOVQ ctx+16(FP), AX
	DECQ 96(AX)
	JNS sequenceDecs_decode_amd64_main_loop
	MOVQ s+0(FP), AX
	MOVQ R11, 144(AX)
	MOVQ R12, 152(AX)
	MOVQ R13, 160(AX)
	MOVQ br+8(FP), AX
	MOVQ DX, 32(AX)
	MOVB BL, 40(AX)
	MOVQ SI, 24(AX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decode_amd64_error_match_len_ofs_mismatch:
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decode_amd64_error_match_len_too_big:
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
	// (no label precedes this stub, so it is unreachable in this function)
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with not enough output space error
	// (no label precedes this stub, so it is unreachable in this function)
	MOVQ $0x00000005, ret+24(FP)
	RET

// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: CMOV
//
// Decodes sequences in a loop with a single bit-buffer refill per iteration:
// offset, match length and literal length are all read from the same fill.
// Each iteration writes one 24-byte record through R10: literal length at +0,
// match length at +8, match offset at +16.
//
// Result (ret+24(FP)): 0 = success, 1 = non-zero match length with zero
// offset, 2 = match length too big, 4 = not enough literals.
//
// Register roles inside the loop:
//   DX        64-bit bit buffer (loaded from 32(br), stored back on success)
//   BX        number of bits already consumed from DX (byte at 40(br))
//   SI        byte count loaded from 24(br); decremented per byte consumed
//   R14       input read pointer, kept in the local (SP) between iterations
//   DI/R8/R9  literal-length / match-length / offset state words (72/80/88(ctx))
//   R10       output record pointer (104(ctx))
//   R11-R13   offset history, R11 most recent (144/152/160(s))
//   BP        scratch for bit masks
//
// State word layout as consumed below: bits 0-7 = bit count for the
// next-state read, bits 8-15 = extra-bit count for the value,
// bits 16-31 = next-state base, bits 32-63 = value baseline.
TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
	MOVQ br+8(FP), AX
	MOVQ 32(AX), DX
	MOVBQZX 40(AX), BX
	MOVQ 24(AX), SI
	MOVQ (AX), AX

	// Input pointer = base pointer at 0(br) + count at 24(br); bytes are consumed backwards.
	ADDQ SI, AX
	MOVQ AX, (SP)
	MOVQ ctx+16(FP), AX
	MOVQ 72(AX), DI
	MOVQ 80(AX), R8
	MOVQ 88(AX), R9
	MOVQ 104(AX), R10
	MOVQ s+0(FP), AX
	MOVQ 144(AX), R11
	MOVQ 152(AX), R12
	MOVQ 160(AX), R13

sequenceDecs_decode_56_amd64_main_loop:
	MOVQ (SP), R14

	// Fill bitreader to have enough for the offset and match length.
	// Fast path (>= 8 bytes left): step back BX/8 whole bytes and reload 8 bytes.
	CMPQ SI, $0x08
	JL sequenceDecs_decode_56_amd64_fill_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R14
	MOVQ (R14), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP sequenceDecs_decode_56_amd64_fill_end

sequenceDecs_decode_56_amd64_fill_byte_by_byte:
	// Slow path: shift in one byte at a time while bytes remain and
	// more than 7 bits have been consumed.
	CMPQ SI, $0x00
	JLE sequenceDecs_decode_56_amd64_fill_end
	CMPQ BX, $0x07
	JLE sequenceDecs_decode_56_amd64_fill_end
	SHLQ $0x08, DX
	SUBQ $0x01, R14
	SUBQ $0x01, SI
	SUBQ $0x08, BX
	MOVBQZX (R14), AX
	ORQ AX, DX
	JMP sequenceDecs_decode_56_amd64_fill_byte_by_byte

sequenceDecs_decode_56_amd64_fill_end:
	// Update offset
	// R15 = buffer with consumed bits shifted out; CL = extra-bit count;
	// AX = baseline. The extra bits are added only when the count is
	// non-zero and the read stays within the 64-bit buffer.
	MOVQ R9, AX
	MOVQ BX, CX
	MOVQ DX, R15
	SHLQ CL, R15
	MOVB AH, CL
	SHRQ $0x20, AX
	TESTQ CX, CX
	JZ sequenceDecs_decode_56_amd64_of_update_zero
	ADDQ CX, BX
	CMPQ BX, $0x40
	JA sequenceDecs_decode_56_amd64_of_update_zero
	CMPQ CX, $0x40
	JAE sequenceDecs_decode_56_amd64_of_update_zero
	NEGQ CX
	SHRQ CL, R15
	ADDQ R15, AX

sequenceDecs_decode_56_amd64_of_update_zero:
	MOVQ AX, 16(R10)

	// Update match length
	MOVQ R8, AX
	MOVQ BX, CX
	MOVQ DX, R15
	SHLQ CL, R15
	MOVB AH, CL
	SHRQ $0x20, AX
	TESTQ CX, CX
	JZ sequenceDecs_decode_56_amd64_ml_update_zero
	ADDQ CX, BX
	CMPQ BX, $0x40
	JA sequenceDecs_decode_56_amd64_ml_update_zero
	CMPQ CX, $0x40
	JAE sequenceDecs_decode_56_amd64_ml_update_zero
	NEGQ CX
	SHRQ CL, R15
	ADDQ R15, AX

sequenceDecs_decode_56_amd64_ml_update_zero:
	MOVQ AX, 8(R10)

	// Update literal length
	MOVQ DI, AX
	MOVQ BX, CX
	MOVQ DX, R15
	SHLQ CL, R15
	MOVB AH, CL
	SHRQ $0x20, AX
	TESTQ CX, CX
	JZ sequenceDecs_decode_56_amd64_ll_update_zero
	ADDQ CX, BX
	CMPQ BX, $0x40
	JA sequenceDecs_decode_56_amd64_ll_update_zero
	CMPQ CX, $0x40
	JAE sequenceDecs_decode_56_amd64_ll_update_zero
	NEGQ CX
	SHRQ CL, R15
	ADDQ R15, AX

sequenceDecs_decode_56_amd64_ll_update_zero:
	MOVQ AX, (R10)

	// Fill bitreader for state updates
	// Save the input pointer, then capture the offset state's extra-bit
	// count (bits 8-15) in AX before R9 is overwritten; it drives the
	// offset adjustment below. States are not advanced when the counter
	// at 96(ctx) is already zero.
	MOVQ R14, (SP)
	MOVQ R9, AX
	SHRQ $0x08, AX
	MOVBQZX AL, AX
	MOVQ ctx+16(FP), CX
	CMPQ 96(CX), $0x00
	JZ sequenceDecs_decode_56_amd64_skip_update

	// Update Literal Length State
	// R14 = bit count; rotate the buffer so the new bits sit in the low
	// R14 bits, mask with (1<<R14)-1 and add to the next-state base.
	MOVBQZX DI, R14
	SHRQ $0x10, DI
	MOVWQZX DI, DI
	LEAQ (BX)(R14*1), CX
	MOVQ DX, R15
	MOVQ CX, BX
	ROLQ CL, R15
	MOVL $0x00000001, BP
	MOVB R14, CL
	SHLL CL, BP
	DECL BP
	ANDQ BP, R15
	ADDQ R15, DI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Match Length State
	MOVBQZX R8, R14
	SHRQ $0x10, R8
	MOVWQZX R8, R8
	LEAQ (BX)(R14*1), CX
	MOVQ DX, R15
	MOVQ CX, BX
	ROLQ CL, R15
	MOVL $0x00000001, BP
	MOVB R14, CL
	SHLL CL, BP
	DECL BP
	ANDQ BP, R15
	ADDQ R15, R8

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Offset State
	MOVBQZX R9, R14
	SHRQ $0x10, R9
	MOVWQZX R9, R9
	LEAQ (BX)(R14*1), CX
	MOVQ DX, R15
	MOVQ CX, BX
	ROLQ CL, R15
	MOVL $0x00000001, BP
	MOVB R14, CL
	SHLL CL, BP
	DECL BP
	ANDQ BP, R15
	ADDQ R15, R9

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R9*8), R9

sequenceDecs_decode_56_amd64_skip_update:
	// Adjust offset
	// More than one extra bit: the decoded value is a new offset and is
	// pushed onto the history (R11 <- value, R12 <- R11, R13 <- R12).
	MOVQ 16(R10), CX
	CMPQ AX, $0x01
	JBE sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0
	MOVQ R12, R13
	MOVQ R11, R12
	MOVQ CX, R11
	JMP sequenceDecs_decode_56_amd64_after_adjust

sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
	// A zero literal length bumps the decoded value by one.
	CMPQ (R10), $0x00000000
	JNE sequenceDecs_decode_56_amd64_adjust_offset_maybezero
	INCQ CX
	JMP sequenceDecs_decode_56_amd64_adjust_offset_nonzero

sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
	// Value 0: reuse the most recent offset, history unchanged.
	TESTQ CX, CX
	JNZ sequenceDecs_decode_56_amd64_adjust_offset_nonzero
	MOVQ R11, CX
	JMP sequenceDecs_decode_56_amd64_after_adjust

sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
	// Select temp in AX: 0 -> R11, 1 -> R12, 2 -> R13, above 2 -> R11-1.
	CMPQ CX, $0x01
	JB sequenceDecs_decode_56_amd64_adjust_zero
	JEQ sequenceDecs_decode_56_amd64_adjust_one
	CMPQ CX, $0x02
	JA sequenceDecs_decode_56_amd64_adjust_three
	JMP sequenceDecs_decode_56_amd64_adjust_two

sequenceDecs_decode_56_amd64_adjust_zero:
	MOVQ R11, AX
	JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid

sequenceDecs_decode_56_amd64_adjust_one:
	MOVQ R12, AX
	JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid

sequenceDecs_decode_56_amd64_adjust_two:
	MOVQ R13, AX
	JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid

sequenceDecs_decode_56_amd64_adjust_three:
	LEAQ -1(R11), AX

sequenceDecs_decode_56_amd64_adjust_test_temp_valid:
	// A zero temp is replaced by 1.
	TESTQ AX, AX
	JNZ sequenceDecs_decode_56_amd64_adjust_temp_valid
	MOVQ $0x00000001, AX

sequenceDecs_decode_56_amd64_adjust_temp_valid:
	// R13 keeps its value only when the selector was 1; temp becomes the
	// most recent offset and the emitted offset.
	CMPQ CX, $0x01
	CMOVQNE R12, R13
	MOVQ R11, R12
	MOVQ AX, R11
	MOVQ AX, CX

sequenceDecs_decode_56_amd64_after_adjust:
	MOVQ CX, 16(R10)

	// Check values
	// 256(s) accumulates match length + literal length; 128(ctx) is
	// reduced by the literal length and must not go negative.
	MOVQ 8(R10), AX
	MOVQ (R10), R14
	LEAQ (AX)(R14*1), R15
	MOVQ s+0(FP), BP
	ADDQ R15, 256(BP)
	MOVQ ctx+16(FP), R15
	SUBQ R14, 128(R15)
	JS error_not_enough_literals
	CMPQ AX, $0x00020002
	JA sequenceDecs_decode_56_amd64_error_match_len_too_big
	TESTQ CX, CX
	JNZ sequenceDecs_decode_56_amd64_match_len_ofs_ok
	TESTQ AX, AX
	JNZ sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch

sequenceDecs_decode_56_amd64_match_len_ofs_ok:
	// Advance to the next 24-byte record; loop until the counter at
	// 96(ctx) goes negative, then write back offsets and bit reader state.
	ADDQ $0x18, R10
	MOVQ ctx+16(FP), AX
	DECQ 96(AX)
	JNS sequenceDecs_decode_56_amd64_main_loop
	MOVQ s+0(FP), AX
	MOVQ R11, 144(AX)
	MOVQ R12, 152(AX)
	MOVQ R13, 160(AX)
	MOVQ br+8(FP), AX
	MOVQ DX, 32(AX)
	MOVB BL, 40(AX)
	MOVQ SI, 24(AX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch:
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decode_56_amd64_error_match_len_too_big:
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
	// (no label precedes this stub, so it is unreachable in this function)
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with not enough output space error
	// (no label precedes this stub, so it is unreachable in this function)
	MOVQ $0x00000005, ret+24(FP)
	RET

// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: BMI, BMI2, CMOV
//
// BMI/BMI2 sequence decoder. Each iteration writes one 24-byte record through
// R9: literal length at +0, match length at +8, match offset at +16. The bit
// buffer is refilled twice per iteration: before the offset and match length
// are read, and again before the literal length is read.
//
// Result (ret+24(FP)): 0 = success, 1 = non-zero match length with zero
// offset, 2 = match length too big, 4 = not enough literals.
//
// Register roles inside the loop:
//   AX        64-bit bit buffer (loaded from 32(br), stored back on success)
//   DX        number of bits already consumed from AX (byte at 40(br))
//   BX        byte count loaded from 24(br); decremented per byte consumed
//   R13       input read pointer, kept in the local (SP) between iterations;
//             reused afterwards for the offset extra-bit count and as temp
//   SI/DI/R8  literal-length / match-length / offset state words (72/80/88(ctx))
//   R9        output record pointer (104(ctx))
//   R10-R12   offset history, R10 most recent (144/152/160(s))
//
// State word layout as consumed below: bits 0-7 = bit count for the
// next-state read, bits 8-15 = extra-bit count for the value (BEXTR control
// 0x0808), bits 16-31 = next-state base (BEXTR control 0x1010),
// bits 32-63 = value baseline.
TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
	MOVQ br+8(FP), CX
	MOVQ 32(CX), AX
	MOVBQZX 40(CX), DX
	MOVQ 24(CX), BX
	MOVQ (CX), CX

	// Input pointer = base pointer at 0(br) + count at 24(br); bytes are consumed backwards.
	ADDQ BX, CX
	MOVQ CX, (SP)
	MOVQ ctx+16(FP), CX
	MOVQ 72(CX), SI
	MOVQ 80(CX), DI
	MOVQ 88(CX), R8
	MOVQ 104(CX), R9
	MOVQ s+0(FP), CX
	MOVQ 144(CX), R10
	MOVQ 152(CX), R11
	MOVQ 160(CX), R12

sequenceDecs_decode_bmi2_main_loop:
	MOVQ (SP), R13

	// Fill bitreader to have enough for the offset and match length.
	// Fast path (>= 8 bytes left): step back DX/8 whole bytes and reload 8 bytes.
	CMPQ BX, $0x08
	JL sequenceDecs_decode_bmi2_fill_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R13
	MOVQ (R13), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP sequenceDecs_decode_bmi2_fill_end

sequenceDecs_decode_bmi2_fill_byte_by_byte:
	// Slow path: shift in one byte at a time while bytes remain and
	// more than 7 bits have been consumed.
	CMPQ BX, $0x00
	JLE sequenceDecs_decode_bmi2_fill_end
	CMPQ DX, $0x07
	JLE sequenceDecs_decode_bmi2_fill_end
	SHLQ $0x08, AX
	SUBQ $0x01, R13
	SUBQ $0x01, BX
	SUBQ $0x08, DX
	MOVBQZX (R13), CX
	ORQ CX, AX
	JMP sequenceDecs_decode_bmi2_fill_byte_by_byte

sequenceDecs_decode_bmi2_fill_end:
	// Update offset
	// R14 = extra-bit count; rotating the buffer left by DX+R14 puts the
	// newly consumed bits in the low R14 bits, BZHI keeps only those.
	MOVQ $0x00000808, CX
	BEXTRQ CX, R8, R14
	MOVQ AX, R15
	LEAQ (DX)(R14*1), CX
	ROLQ CL, R15
	BZHIQ R14, R15, R15
	MOVQ CX, DX
	MOVQ R8, CX
	SHRQ $0x20, CX
	ADDQ R15, CX
	MOVQ CX, 16(R9)

	// Update match length
	MOVQ $0x00000808, CX
	BEXTRQ CX, DI, R14
	MOVQ AX, R15
	LEAQ (DX)(R14*1), CX
	ROLQ CL, R15
	BZHIQ R14, R15, R15
	MOVQ CX, DX
	MOVQ DI, CX
	SHRQ $0x20, CX
	ADDQ R15, CX
	MOVQ CX, 8(R9)

	// Fill bitreader to have enough for the remaining
	CMPQ BX, $0x08
	JL sequenceDecs_decode_bmi2_fill_2_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R13
	MOVQ (R13), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP sequenceDecs_decode_bmi2_fill_2_end

sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
	CMPQ BX, $0x00
	JLE sequenceDecs_decode_bmi2_fill_2_end
	CMPQ DX, $0x07
	JLE sequenceDecs_decode_bmi2_fill_2_end
	SHLQ $0x08, AX
	SUBQ $0x01, R13
	SUBQ $0x01, BX
	SUBQ $0x08, DX
	MOVBQZX (R13), CX
	ORQ CX, AX
	JMP sequenceDecs_decode_bmi2_fill_2_byte_by_byte

sequenceDecs_decode_bmi2_fill_2_end:
	// Update literal length
	MOVQ $0x00000808, CX
	BEXTRQ CX, SI, R14
	MOVQ AX, R15
	LEAQ (DX)(R14*1), CX
	ROLQ CL, R15
	BZHIQ R14, R15, R15
	MOVQ CX, DX
	MOVQ SI, CX
	SHRQ $0x20, CX
	ADDQ R15, CX
	MOVQ CX, (R9)

	// Fill bitreader for state updates
	// Save the input pointer, then capture the offset state's extra-bit
	// count in R13 before R8 is overwritten; it drives the offset
	// adjustment below. States are not advanced when the counter at
	// 96(ctx) is already zero.
	MOVQ R13, (SP)
	MOVQ $0x00000808, CX
	BEXTRQ CX, R8, R13
	MOVQ ctx+16(FP), CX
	CMPQ 96(CX), $0x00
	JZ sequenceDecs_decode_bmi2_skip_update

	// Read the bits for all three state updates at once: R14 = low byte of
	// the sum of the three state words (the summed bit counts), R15 = that
	// many bits from the buffer. They are then peeled off per state.
	LEAQ (SI)(DI*1), R14
	ADDQ R8, R14
	MOVBQZX R14, R14
	LEAQ (DX)(R14*1), CX
	MOVQ AX, R15
	MOVQ CX, DX
	ROLQ CL, R15
	BZHIQ R14, R15, R15

	// Update Offset State
	BZHIQ R8, R15, CX
	SHRXQ R8, R15, R15
	MOVQ $0x00001010, R14
	BEXTRQ R14, R8, R8
	ADDQ CX, R8

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Match Length State
	BZHIQ DI, R15, CX
	SHRXQ DI, R15, R15
	MOVQ $0x00001010, R14
	BEXTRQ R14, DI, DI
	ADDQ CX, DI

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Literal Length State
	BZHIQ SI, R15, CX
	MOVQ $0x00001010, R14
	BEXTRQ R14, SI, SI
	ADDQ CX, SI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(SI*8), SI

sequenceDecs_decode_bmi2_skip_update:
	// Adjust offset
	// More than one extra bit: the decoded value is a new offset and is
	// pushed onto the history (R10 <- value, R11 <- R10, R12 <- R11).
	MOVQ 16(R9), CX
	CMPQ R13, $0x01
	JBE sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0
	MOVQ R11, R12
	MOVQ R10, R11
	MOVQ CX, R10
	JMP sequenceDecs_decode_bmi2_after_adjust

sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
	// A zero literal length bumps the decoded value by one.
	CMPQ (R9), $0x00000000
	JNE sequenceDecs_decode_bmi2_adjust_offset_maybezero
	INCQ CX
	JMP sequenceDecs_decode_bmi2_adjust_offset_nonzero

sequenceDecs_decode_bmi2_adjust_offset_maybezero:
	// Value 0: reuse the most recent offset, history unchanged.
	TESTQ CX, CX
	JNZ sequenceDecs_decode_bmi2_adjust_offset_nonzero
	MOVQ R10, CX
	JMP sequenceDecs_decode_bmi2_after_adjust

sequenceDecs_decode_bmi2_adjust_offset_nonzero:
	// Select temp in R13: 0 -> R10, 1 -> R11, 2 -> R12, above 2 -> R10-1.
	CMPQ CX, $0x01
	JB sequenceDecs_decode_bmi2_adjust_zero
	JEQ sequenceDecs_decode_bmi2_adjust_one
	CMPQ CX, $0x02
	JA sequenceDecs_decode_bmi2_adjust_three
	JMP sequenceDecs_decode_bmi2_adjust_two

sequenceDecs_decode_bmi2_adjust_zero:
	MOVQ R10, R13
	JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid

sequenceDecs_decode_bmi2_adjust_one:
	MOVQ R11, R13
	JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid

sequenceDecs_decode_bmi2_adjust_two:
	MOVQ R12, R13
	JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid

sequenceDecs_decode_bmi2_adjust_three:
	LEAQ -1(R10), R13

sequenceDecs_decode_bmi2_adjust_test_temp_valid:
	// A zero temp is replaced by 1.
	TESTQ R13, R13
	JNZ sequenceDecs_decode_bmi2_adjust_temp_valid
	MOVQ $0x00000001, R13

sequenceDecs_decode_bmi2_adjust_temp_valid:
	// R12 keeps its value only when the selector was 1; temp becomes the
	// most recent offset and the emitted offset.
	CMPQ CX, $0x01
	CMOVQNE R11, R12
	MOVQ R10, R11
	MOVQ R13, R10
	MOVQ R13, CX

sequenceDecs_decode_bmi2_after_adjust:
	MOVQ CX, 16(R9)

	// Check values
	// 256(s) accumulates match length + literal length; 128(ctx) is
	// reduced by the literal length and must not go negative.
	MOVQ 8(R9), R13
	MOVQ (R9), R14
	LEAQ (R13)(R14*1), R15
	MOVQ s+0(FP), BP
	ADDQ R15, 256(BP)
	MOVQ ctx+16(FP), R15
	SUBQ R14, 128(R15)
	JS error_not_enough_literals
	CMPQ R13, $0x00020002
	JA sequenceDecs_decode_bmi2_error_match_len_too_big
	TESTQ CX, CX
	JNZ sequenceDecs_decode_bmi2_match_len_ofs_ok
	TESTQ R13, R13
	JNZ sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch

sequenceDecs_decode_bmi2_match_len_ofs_ok:
	// Advance to the next 24-byte record; loop until the counter at
	// 96(ctx) goes negative, then write back offsets and bit reader state.
	ADDQ $0x18, R9
	MOVQ ctx+16(FP), CX
	DECQ 96(CX)
	JNS sequenceDecs_decode_bmi2_main_loop
	MOVQ s+0(FP), CX
	MOVQ R10, 144(CX)
	MOVQ R11, 152(CX)
	MOVQ R12, 160(CX)
	MOVQ br+8(FP), CX
	MOVQ AX, 32(CX)
	MOVB DL, 40(CX)
	MOVQ BX, 24(CX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch:
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decode_bmi2_error_match_len_too_big:
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
	// (no label precedes this stub, so it is unreachable in this function)
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with not enough output space error
	// (no label precedes this stub, so it is unreachable in this function)
	MOVQ $0x00000005, ret+24(FP)
	RET

// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: BMI, BMI2, CMOV
//
// BMI/BMI2 sequence decoder with a single bit-buffer refill per iteration:
// offset, match length and literal length are all read from the same fill.
// Each iteration writes one 24-byte record through R9: literal length at +0,
// match length at +8, match offset at +16.
//
// Result (ret+24(FP)): 0 = success, 1 = non-zero match length with zero
// offset, 2 = match length too big, 4 = not enough literals.
//
// Register roles inside the loop:
//   AX        64-bit bit buffer (loaded from 32(br), stored back on success)
//   DX        number of bits already consumed from AX (byte at 40(br))
//   BX        byte count loaded from 24(br); decremented per byte consumed
//   R13       input read pointer, kept in the local (SP) between iterations;
//             reused afterwards for the offset extra-bit count and as temp
//   SI/DI/R8  literal-length / match-length / offset state words (72/80/88(ctx))
//   R9        output record pointer (104(ctx))
//   R10-R12   offset history, R10 most recent (144/152/160(s))
//
// State word layout as consumed below: bits 0-7 = bit count for the
// next-state read, bits 8-15 = extra-bit count for the value (BEXTR control
// 0x0808), bits 16-31 = next-state base (BEXTR control 0x1010),
// bits 32-63 = value baseline.
TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
	MOVQ br+8(FP), CX
	MOVQ 32(CX), AX
	MOVBQZX 40(CX), DX
	MOVQ 24(CX), BX
	MOVQ (CX), CX

	// Input pointer = base pointer at 0(br) + count at 24(br); bytes are consumed backwards.
	ADDQ BX, CX
	MOVQ CX, (SP)
	MOVQ ctx+16(FP), CX
	MOVQ 72(CX), SI
	MOVQ 80(CX), DI
	MOVQ 88(CX), R8
	MOVQ 104(CX), R9
	MOVQ s+0(FP), CX
	MOVQ 144(CX), R10
	MOVQ 152(CX), R11
	MOVQ 160(CX), R12

sequenceDecs_decode_56_bmi2_main_loop:
	MOVQ (SP), R13

	// Fill bitreader to have enough for the offset and match length.
	// Fast path (>= 8 bytes left): step back DX/8 whole bytes and reload 8 bytes.
	CMPQ BX, $0x08
	JL sequenceDecs_decode_56_bmi2_fill_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R13
	MOVQ (R13), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP sequenceDecs_decode_56_bmi2_fill_end

sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
	// Slow path: shift in one byte at a time while bytes remain and
	// more than 7 bits have been consumed.
	CMPQ BX, $0x00
	JLE sequenceDecs_decode_56_bmi2_fill_end
	CMPQ DX, $0x07
	JLE sequenceDecs_decode_56_bmi2_fill_end
	SHLQ $0x08, AX
	SUBQ $0x01, R13
	SUBQ $0x01, BX
	SUBQ $0x08, DX
	MOVBQZX (R13), CX
	ORQ CX, AX
	JMP sequenceDecs_decode_56_bmi2_fill_byte_by_byte

sequenceDecs_decode_56_bmi2_fill_end:
	// Update offset
	// R14 = extra-bit count; rotating the buffer left by DX+R14 puts the
	// newly consumed bits in the low R14 bits, BZHI keeps only those.
	MOVQ $0x00000808, CX
	BEXTRQ CX, R8, R14
	MOVQ AX, R15
	LEAQ (DX)(R14*1), CX
	ROLQ CL, R15
	BZHIQ R14, R15, R15
	MOVQ CX, DX
	MOVQ R8, CX
	SHRQ $0x20, CX
	ADDQ R15, CX
	MOVQ CX, 16(R9)

	// Update match length
	MOVQ $0x00000808, CX
	BEXTRQ CX, DI, R14
	MOVQ AX, R15
	LEAQ (DX)(R14*1), CX
	ROLQ CL, R15
	BZHIQ R14, R15, R15
	MOVQ CX, DX
	MOVQ DI, CX
	SHRQ $0x20, CX
	ADDQ R15, CX
	MOVQ CX, 8(R9)

	// Update literal length
	MOVQ $0x00000808, CX
	BEXTRQ CX, SI, R14
	MOVQ AX, R15
	LEAQ (DX)(R14*1), CX
	ROLQ CL, R15
	BZHIQ R14, R15, R15
	MOVQ CX, DX
	MOVQ SI, CX
	SHRQ $0x20, CX
	ADDQ R15, CX
	MOVQ CX, (R9)

	// Fill bitreader for state updates
	// Save the input pointer, then capture the offset state's extra-bit
	// count in R13 before R8 is overwritten; it drives the offset
	// adjustment below. States are not advanced when the counter at
	// 96(ctx) is already zero.
	MOVQ R13, (SP)
	MOVQ $0x00000808, CX
	BEXTRQ CX, R8, R13
	MOVQ ctx+16(FP), CX
	CMPQ 96(CX), $0x00
	JZ sequenceDecs_decode_56_bmi2_skip_update

	// Read the bits for all three state updates at once: R14 = low byte of
	// the sum of the three state words (the summed bit counts), R15 = that
	// many bits from the buffer. They are then peeled off per state.
	LEAQ (SI)(DI*1), R14
	ADDQ R8, R14
	MOVBQZX R14, R14
	LEAQ (DX)(R14*1), CX
	MOVQ AX, R15
	MOVQ CX, DX
	ROLQ CL, R15
	BZHIQ R14, R15, R15

	// Update Offset State
	BZHIQ R8, R15, CX
	SHRXQ R8, R15, R15
	MOVQ $0x00001010, R14
	BEXTRQ R14, R8, R8
	ADDQ CX, R8

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Match Length State
	BZHIQ DI, R15, CX
	SHRXQ DI, R15, R15
	MOVQ $0x00001010, R14
	BEXTRQ R14, DI, DI
	ADDQ CX, DI

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Literal Length State
	BZHIQ SI, R15, CX
	MOVQ $0x00001010, R14
	BEXTRQ R14, SI, SI
	ADDQ CX, SI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(SI*8), SI

sequenceDecs_decode_56_bmi2_skip_update:
	// Adjust offset
	// More than one extra bit: the decoded value is a new offset and is
	// pushed onto the history (R10 <- value, R11 <- R10, R12 <- R11).
	MOVQ 16(R9), CX
	CMPQ R13, $0x01
	JBE sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0
	MOVQ R11, R12
	MOVQ R10, R11
	MOVQ CX, R10
	JMP sequenceDecs_decode_56_bmi2_after_adjust

sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
	// A zero literal length bumps the decoded value by one.
	CMPQ (R9), $0x00000000
	JNE sequenceDecs_decode_56_bmi2_adjust_offset_maybezero
	INCQ CX
	JMP sequenceDecs_decode_56_bmi2_adjust_offset_nonzero

sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
	// Value 0: reuse the most recent offset, history unchanged.
	TESTQ CX, CX
	JNZ sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
	MOVQ R10, CX
	JMP sequenceDecs_decode_56_bmi2_after_adjust

sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
	// Select temp in R13: 0 -> R10, 1 -> R11, 2 -> R12, above 2 -> R10-1.
	CMPQ CX, $0x01
	JB sequenceDecs_decode_56_bmi2_adjust_zero
	JEQ sequenceDecs_decode_56_bmi2_adjust_one
	CMPQ CX, $0x02
	JA sequenceDecs_decode_56_bmi2_adjust_three
	JMP sequenceDecs_decode_56_bmi2_adjust_two

sequenceDecs_decode_56_bmi2_adjust_zero:
	MOVQ R10, R13
	JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid

sequenceDecs_decode_56_bmi2_adjust_one:
	MOVQ R11, R13
	JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid

sequenceDecs_decode_56_bmi2_adjust_two:
	MOVQ R12, R13
	JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid

sequenceDecs_decode_56_bmi2_adjust_three:
	LEAQ -1(R10), R13

sequenceDecs_decode_56_bmi2_adjust_test_temp_valid:
	// A zero temp is replaced by 1.
	TESTQ R13, R13
	JNZ sequenceDecs_decode_56_bmi2_adjust_temp_valid
	MOVQ $0x00000001, R13

sequenceDecs_decode_56_bmi2_adjust_temp_valid:
	// R12 keeps its value only when the selector was 1; temp becomes the
	// most recent offset and the emitted offset.
	CMPQ CX, $0x01
	CMOVQNE R11, R12
	MOVQ R10, R11
	MOVQ R13, R10
	MOVQ R13, CX

sequenceDecs_decode_56_bmi2_after_adjust:
	MOVQ CX, 16(R9)

	// Check values
	// 256(s) accumulates match length + literal length; 128(ctx) is
	// reduced by the literal length and must not go negative.
	MOVQ 8(R9), R13
	MOVQ (R9), R14
	LEAQ (R13)(R14*1), R15
	MOVQ s+0(FP), BP
	ADDQ R15, 256(BP)
	MOVQ ctx+16(FP), R15
	SUBQ R14, 128(R15)
	JS error_not_enough_literals
	CMPQ R13, $0x00020002
	JA sequenceDecs_decode_56_bmi2_error_match_len_too_big
	TESTQ CX, CX
	JNZ sequenceDecs_decode_56_bmi2_match_len_ofs_ok
	TESTQ R13, R13
	JNZ sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch

sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
	// Advance to the next 24-byte record; loop until the counter at
	// 96(ctx) goes negative, then write back offsets and bit reader state.
	ADDQ $0x18, R9
	MOVQ ctx+16(FP), CX
	DECQ 96(CX)
	JNS sequenceDecs_decode_56_bmi2_main_loop
	MOVQ s+0(FP), CX
	MOVQ R10, 144(CX)
	MOVQ R11, 152(CX)
	MOVQ R12, 160(CX)
	MOVQ br+8(FP), CX
	MOVQ AX, 32(CX)
	MOVB DL, 40(CX)
	MOVQ BX, 24(CX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch:
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decode_56_bmi2_error_match_len_too_big:
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
	// (no label precedes this stub, so it is unreachable in this function)
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with not enough output space error
	// (no label precedes this stub, so it is unreachable in this function)
	MOVQ $0x00000005, ret+24(FP)
	RET

// func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
// Requires: SSE
//
// Executes decoded sequences. For every 24-byte record (literal length at
// +0, match length at +8, match offset at +16) it copies the literals to the
// output and then copies the match, taking bytes from the history buffer
// and/or from output already written.
//
// Returns true (ret+8(FP) = 1) when all sequences were executed or when the
// sequence count at 8(ctx) is zero, and false when a match offset exceeds
// outPosition+len(hist) or the window size. Except on the empty-input path,
// both exits write back 24(ctx) = sequence index, 104(ctx) = output position
// and 112(ctx) = literal bytes consumed (SI minus the pointer at 80(ctx)).
//
// The literal copy (copy_1) and the non-overlapping match copy (copy_2) move
// whole 16-byte blocks, so they can read and write up to 15 bytes past the
// requested length. NOTE(review): callers presumably guarantee that slack in
// both buffers - confirm before reuse.
//
// Register roles inside the loop:
//   AX   pointer to the current sequence record
//   CX   number of sequences (8(ctx))
//   DX   sequence index (24(ctx))
//   BX   output write pointer (32(ctx) + output position)
//   SI   literals read pointer (80(ctx))
//   DI   output position (104(ctx))
//   R8   window size (120(ctx))
//   R9   end of history (56(ctx) + 64(ctx))
//   R10  history length (64(ctx))
//   R11  literal length, later scratch; R12 match offset; R13 match length
TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
	MOVQ ctx+0(FP), R10
	MOVQ 8(R10), CX
	TESTQ CX, CX
	JZ empty_seqs
	MOVQ (R10), AX
	MOVQ 24(R10), DX
	MOVQ 32(R10), BX
	MOVQ 80(R10), SI
	MOVQ 104(R10), DI
	MOVQ 120(R10), R8
	MOVQ 56(R10), R9
	MOVQ 64(R10), R10
	ADDQ R10, R9

	// seqsBase += 24 * seqIndex
	LEAQ (DX)(DX*2), R11
	SHLQ $0x03, R11
	ADDQ R11, AX

	// outBase += outPosition
	ADDQ DI, BX

main_loop:
	MOVQ (AX), R11
	MOVQ 16(AX), R12
	MOVQ 8(AX), R13

	// Copy literals
	TESTQ R11, R11
	JZ check_offset
	XORQ R14, R14

copy_1:
	// 16 bytes per step until at least R11 bytes are copied; the pointers
	// below advance by exactly R11, so any overshoot is overwritten later.
	MOVUPS (SI)(R14*1), X0
	MOVUPS X0, (BX)(R14*1)
	ADDQ $0x10, R14
	CMPQ R14, R11
	JB copy_1
	ADDQ R11, SI
	ADDQ R11, BX
	ADDQ R11, DI

	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
check_offset:
	LEAQ (DI)(R10*1), R11
	CMPQ R12, R11
	JG error_match_off_too_big
	CMPQ R12, R8
	JG error_match_off_too_big

	// Copy match from history
	// R11 = mo - outPosition = bytes that lie in history; when that is not
	// above zero the whole match is in the current buffer. R14 = source
	// pointer inside history.
	MOVQ R12, R11
	SUBQ DI, R11
	JLS copy_match
	MOVQ R9, R14
	SUBQ R11, R14
	CMPQ R13, R11
	JG copy_all_from_history

	// The whole match (R13 bytes) comes from history.
	MOVQ R13, R11
	SUBQ $0x10, R11
	JB copy_4_small

copy_4_loop:
	// 16-byte blocks, then one final 16-byte copy ending exactly at the
	// end of the range, overlapping the previous block.
	MOVUPS (R14), X0
	MOVUPS X0, (BX)
	ADDQ $0x10, R14
	ADDQ $0x10, BX
	SUBQ $0x10, R11
	JAE copy_4_loop
	LEAQ 16(R14)(R11*1), R14
	LEAQ 16(BX)(R11*1), BX
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(BX)
	JMP copy_4_end

copy_4_small:
	// Lengths below 16: exact-size copies using two possibly overlapping moves.
	CMPQ R13, $0x03
	JE copy_4_move_3
	CMPQ R13, $0x08
	JB copy_4_move_4through7
	JMP copy_4_move_8through16

copy_4_move_3:
	MOVW (R14), R11
	MOVB 2(R14), R12
	MOVW R11, (BX)
	MOVB R12, 2(BX)
	ADDQ R13, R14
	ADDQ R13, BX
	JMP copy_4_end

copy_4_move_4through7:
	MOVL (R14), R11
	MOVL -4(R14)(R13*1), R12
	MOVL R11, (BX)
	MOVL R12, -4(BX)(R13*1)
	ADDQ R13, R14
	ADDQ R13, BX
	JMP copy_4_end

copy_4_move_8through16:
	MOVQ (R14), R11
	MOVQ -8(R14)(R13*1), R12
	MOVQ R11, (BX)
	MOVQ R12, -8(BX)(R13*1)
	ADDQ R13, R14
	ADDQ R13, BX

copy_4_end:
	// Sequence finished entirely from history; advance to the next record.
	ADDQ R13, DI
	ADDQ $0x18, AX
	INCQ DX
	CMPQ DX, CX
	JB main_loop
	JMP loop_finished

copy_all_from_history:
	// Match is longer than what history holds: copy the R11 history bytes
	// here, then fall through to copy_match for the remainder.
	MOVQ R11, R15
	SUBQ $0x10, R15
	JB copy_5_small

copy_5_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (BX)
	ADDQ $0x10, R14
	ADDQ $0x10, BX
	SUBQ $0x10, R15
	JAE copy_5_loop
	LEAQ 16(R14)(R15*1), R14
	LEAQ 16(BX)(R15*1), BX
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(BX)
	JMP copy_5_end

copy_5_small:
	CMPQ R11, $0x03
	JE copy_5_move_3
	JB copy_5_move_1or2
	CMPQ R11, $0x08
	JB copy_5_move_4through7
	JMP copy_5_move_8through16

copy_5_move_1or2:
	MOVB (R14), R15
	MOVB -1(R14)(R11*1), BP
	MOVB R15, (BX)
	MOVB BP, -1(BX)(R11*1)
	ADDQ R11, R14
	ADDQ R11, BX
	JMP copy_5_end

copy_5_move_3:
	MOVW (R14), R15
	MOVB 2(R14), BP
	MOVW R15, (BX)
	MOVB BP, 2(BX)
	ADDQ R11, R14
	ADDQ R11, BX
	JMP copy_5_end

copy_5_move_4through7:
	MOVL (R14), R15
	MOVL -4(R14)(R11*1), BP
	MOVL R15, (BX)
	MOVL BP, -4(BX)(R11*1)
	ADDQ R11, R14
	ADDQ R11, BX
	JMP copy_5_end

copy_5_move_8through16:
	MOVQ (R14), R15
	MOVQ -8(R14)(R11*1), BP
	MOVQ R15, (BX)
	MOVQ BP, -8(BX)(R11*1)
	ADDQ R11, R14
	ADDQ R11, BX

copy_5_end:
	// Account for the history bytes; R13 = match bytes still to copy.
	ADDQ R11, DI
	SUBQ R11, R13

	// Copy match from the current buffer
copy_match:
	// R11 = source pointer = write pointer - match offset.
	MOVQ BX, R11
	SUBQ R12, R11

	// ml <= mo
	CMPQ R13, R12
	JA copy_overlapping_match

	// Copy non-overlapping match
	ADDQ R13, DI
	MOVQ BX, R12
	ADDQ R13, BX

copy_2:
	// 16-byte blocks via the temporary destination R12; BX was already
	// advanced by the exact length above.
	MOVUPS (R11), X0
	MOVUPS X0, (R12)
	ADDQ $0x10, R11
	ADDQ $0x10, R12
	SUBQ $0x10, R13
	JHI copy_2
	JMP handle_loop

	// Copy overlapping match
copy_overlapping_match:
	ADDQ R13, DI

copy_slow_3:
	// Byte-by-byte so that bytes written earlier in this copy are re-read.
	MOVB (R11), R12
	MOVB R12, (BX)
	INCQ R11
	INCQ BX
	DECQ R13
	JNZ copy_slow_3

handle_loop:
	ADDQ $0x18, AX
	INCQ DX
	CMPQ DX, CX
	JB main_loop

loop_finished:
	// Return value
	MOVB $0x01, ret+8(FP)

	// Update the context
	MOVQ ctx+0(FP), AX
	MOVQ DX, 24(AX)
	MOVQ DI, 104(AX)
	MOVQ 80(AX), CX
	SUBQ CX, SI
	MOVQ SI, 112(AX)
	RET

error_match_off_too_big:
	// Return value
	MOVB $0x00, ret+8(FP)

	// Update the context
	MOVQ ctx+0(FP), AX
	MOVQ DX, 24(AX)
	MOVQ DI, 104(AX)
	MOVQ 80(AX), CX
	SUBQ CX, SI
	MOVQ SI, 112(AX)
	RET

empty_seqs:
	// Return value
	MOVB $0x01, ret+8(FP)
	RET

1416// func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
1417// Requires: SSE
// Executes already-decoded sequences: for each 24-byte sequence record it
// copies the literals and then the match (from history and/or from the output
// written so far). Returns true when all sequences were processed (or there
// were none) and false when a match offset fails the range check.
//
// Register roles, as set up by the code below:
//   AX  = pointer to the current sequence record (advanced by 24 per sequence)
//   CX  = value of 8(ctx); loop bound for DX (zero means nothing to do)
//   DX  = current sequence index (written back to 24(ctx) on exit)
//   BX  = output write pointer (32(ctx) + outPosition)
//   SI  = literals read pointer (starts at 80(ctx))
//   DI  = outPosition (written back to 104(ctx) on exit)
//   R8  = value of 120(ctx); upper bound for the match offset
//   R9  = 56(ctx) + 64(ctx); presumably history base + len(hist), i.e. the end
//         of the history buffer - confirm against executeAsmContext
//   R10 = value of 64(ctx); added to outPosition in the offset check
//   R11..R15, BP, X0 = scratch
1418TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9
1419 MOVQ ctx+0(FP), R10
1420 MOVQ 8(R10), CX
1421 TESTQ CX, CX
1422 JZ empty_seqs
1423 MOVQ (R10), AX
1424 MOVQ 24(R10), DX
1425 MOVQ 32(R10), BX
1426 MOVQ 80(R10), SI
1427 MOVQ 104(R10), DI
1428 MOVQ 120(R10), R8
1429 MOVQ 56(R10), R9
// R10 stops holding ctx after the next load; ctx is reloaded from the
// argument frame on the exit paths.
1430 MOVQ 64(R10), R10
1431 ADDQ R10, R9
1432
1433 // seqsBase += 24 * seqIndex
1434 LEAQ (DX)(DX*2), R11
1435 SHLQ $0x03, R11
1436 ADDQ R11, AX
1437
1438 // outBase += outPosition
1439 ADDQ DI, BX
1440
1441main_loop:
// Load the sequence fields: R11 = literal length, R12 = match offset (mo),
// R13 = match length (ml).
1442 MOVQ (AX), R11
1443 MOVQ 16(AX), R12
1444 MOVQ 8(AX), R13
1445
1446 // Copy literals
1447 TESTQ R11, R11
1448 JZ check_offset
1449 MOVQ R11, R14
1450 SUBQ $0x10, R14
1451 JB copy_1_small
1452
// 16 bytes per iteration while at least 16 remain; R14 goes negative once
// fewer than 16 bytes are left.
1453copy_1_loop:
1454 MOVUPS (SI), X0
1455 MOVUPS X0, (BX)
1456 ADDQ $0x10, SI
1457 ADDQ $0x10, BX
1458 SUBQ $0x10, R14
1459 JAE copy_1_loop
// Move both pointers to the exact end of the copy (R14 is in [-16,-1]) and
// copy the final 16 bytes, overlapping bytes that were already copied.
1460 LEAQ 16(SI)(R14*1), SI
1461 LEAQ 16(BX)(R14*1), BX
1462 MOVUPS -16(SI), X0
1463 MOVUPS X0, -16(BX)
1464 JMP copy_1_end
1465
// Fewer than 16 literal bytes (and at least 1, see the TESTQ above): dispatch
// on the length. Each case uses two possibly overlapping moves.
1466copy_1_small:
1467 CMPQ R11, $0x03
1468 JE copy_1_move_3
1469 JB copy_1_move_1or2
1470 CMPQ R11, $0x08
1471 JB copy_1_move_4through7
1472 JMP copy_1_move_8through16
1473
1474copy_1_move_1or2:
1475 MOVB (SI), R14
1476 MOVB -1(SI)(R11*1), R15
1477 MOVB R14, (BX)
1478 MOVB R15, -1(BX)(R11*1)
1479 ADDQ R11, SI
1480 ADDQ R11, BX
1481 JMP copy_1_end
1482
1483copy_1_move_3:
1484 MOVW (SI), R14
1485 MOVB 2(SI), R15
1486 MOVW R14, (BX)
1487 MOVB R15, 2(BX)
1488 ADDQ R11, SI
1489 ADDQ R11, BX
1490 JMP copy_1_end
1491
1492copy_1_move_4through7:
1493 MOVL (SI), R14
1494 MOVL -4(SI)(R11*1), R15
1495 MOVL R14, (BX)
1496 MOVL R15, -4(BX)(R11*1)
1497 ADDQ R11, SI
1498 ADDQ R11, BX
1499 JMP copy_1_end
1500
1501copy_1_move_8through16:
1502 MOVQ (SI), R14
1503 MOVQ -8(SI)(R11*1), R15
1504 MOVQ R14, (BX)
1505 MOVQ R15, -8(BX)(R11*1)
1506 ADDQ R11, SI
1507 ADDQ R11, BX
1508
1509copy_1_end:
// outPosition += literal length
1510 ADDQ R11, DI
1511
1512 // Malformed input if (seq.mo > t+len(hist) || seq.mo > s.windowSize)
1513check_offset:
1514 LEAQ (DI)(R10*1), R11
1515 CMPQ R12, R11
1516 JG error_match_off_too_big
1517 CMPQ R12, R8
1518 JG error_match_off_too_big
1519
1520 // Copy match from history
// R11 = mo - outPosition. If mo <= outPosition (unsigned) the whole match
// lies in the output written so far, so skip the history copy.
1521 MOVQ R12, R11
1522 SUBQ DI, R11
1523 JLS copy_match
// R14 = source pointer, R11 bytes before the end of history (R9).
1524 MOVQ R9, R14
1525 SUBQ R11, R14
// ml > R11: the match starts in history and continues in the current output.
1526 CMPQ R13, R11
1527 JG copy_all_from_history
// Otherwise all ml bytes come from history.
1528 MOVQ R13, R11
1529 SUBQ $0x10, R11
1530 JB copy_4_small
1531
1532copy_4_loop:
1533 MOVUPS (R14), X0
1534 MOVUPS X0, (BX)
1535 ADDQ $0x10, R14
1536 ADDQ $0x10, BX
1537 SUBQ $0x10, R11
1538 JAE copy_4_loop
1539 LEAQ 16(R14)(R11*1), R14
1540 LEAQ 16(BX)(R11*1), BX
1541 MOVUPS -16(R14), X0
1542 MOVUPS X0, -16(BX)
1543 JMP copy_4_end
1544
// NOTE(review): unlike copy_1/copy_5/copy_2 there is no 1-or-2 byte case
// here; lengths below 3 would take the 4through7 path. Presumably ml >= 3
// whenever this path is reached - confirm against the decoder.
1545copy_4_small:
1546 CMPQ R13, $0x03
1547 JE copy_4_move_3
1548 CMPQ R13, $0x08
1549 JB copy_4_move_4through7
1550 JMP copy_4_move_8through16
1551
1552copy_4_move_3:
1553 MOVW (R14), R11
1554 MOVB 2(R14), R12
1555 MOVW R11, (BX)
1556 MOVB R12, 2(BX)
1557 ADDQ R13, R14
1558 ADDQ R13, BX
1559 JMP copy_4_end
1560
1561copy_4_move_4through7:
1562 MOVL (R14), R11
1563 MOVL -4(R14)(R13*1), R12
1564 MOVL R11, (BX)
1565 MOVL R12, -4(BX)(R13*1)
1566 ADDQ R13, R14
1567 ADDQ R13, BX
1568 JMP copy_4_end
1569
1570copy_4_move_8through16:
1571 MOVQ (R14), R11
1572 MOVQ -8(R14)(R13*1), R12
1573 MOVQ R11, (BX)
1574 MOVQ R12, -8(BX)(R13*1)
1575 ADDQ R13, R14
1576 ADDQ R13, BX
1577
// The match came entirely from history: outPosition += ml, then step to the
// next sequence record (same loop step as handle_loop).
1578copy_4_end:
1579 ADDQ R13, DI
1580 ADDQ $0x18, AX
1581 INCQ DX
1582 CMPQ DX, CX
1583 JB main_loop
1584 JMP loop_finished
1585
// Copy the R11 bytes that are available in history; the remaining ml - R11
// bytes are copied from the current output in copy_match.
1586copy_all_from_history:
1587 MOVQ R11, R15
1588 SUBQ $0x10, R15
1589 JB copy_5_small
1590
1591copy_5_loop:
1592 MOVUPS (R14), X0
1593 MOVUPS X0, (BX)
1594 ADDQ $0x10, R14
1595 ADDQ $0x10, BX
1596 SUBQ $0x10, R15
1597 JAE copy_5_loop
1598 LEAQ 16(R14)(R15*1), R14
1599 LEAQ 16(BX)(R15*1), BX
1600 MOVUPS -16(R14), X0
1601 MOVUPS X0, -16(BX)
1602 JMP copy_5_end
1603
1604copy_5_small:
1605 CMPQ R11, $0x03
1606 JE copy_5_move_3
1607 JB copy_5_move_1or2
1608 CMPQ R11, $0x08
1609 JB copy_5_move_4through7
1610 JMP copy_5_move_8through16
1611
1612copy_5_move_1or2:
1613 MOVB (R14), R15
1614 MOVB -1(R14)(R11*1), BP
1615 MOVB R15, (BX)
1616 MOVB BP, -1(BX)(R11*1)
1617 ADDQ R11, R14
1618 ADDQ R11, BX
1619 JMP copy_5_end
1620
1621copy_5_move_3:
1622 MOVW (R14), R15
1623 MOVB 2(R14), BP
1624 MOVW R15, (BX)
1625 MOVB BP, 2(BX)
1626 ADDQ R11, R14
1627 ADDQ R11, BX
1628 JMP copy_5_end
1629
1630copy_5_move_4through7:
1631 MOVL (R14), R15
1632 MOVL -4(R14)(R11*1), BP
1633 MOVL R15, (BX)
1634 MOVL BP, -4(BX)(R11*1)
1635 ADDQ R11, R14
1636 ADDQ R11, BX
1637 JMP copy_5_end
1638
1639copy_5_move_8through16:
1640 MOVQ (R14), R15
1641 MOVQ -8(R14)(R11*1), BP
1642 MOVQ R15, (BX)
1643 MOVQ BP, -8(BX)(R11*1)
1644 ADDQ R11, R14
1645 ADDQ R11, BX
1646
// Account for the bytes taken from history: outPosition += R11, ml -= R11.
1647copy_5_end:
1648 ADDQ R11, DI
1649 SUBQ R11, R13
1650
1651 // Copy match from the current buffer
1652copy_match:
// R11 = source pointer, mo bytes behind the output write pointer.
1653 MOVQ BX, R11
1654 SUBQ R12, R11
1655
1656 // ml <= mo
1657 CMPQ R13, R12
1658 JA copy_overlapping_match
1659
1660 // Copy non-overlapping match
1661 ADDQ R13, DI
1662 MOVQ R13, R12
1663 SUBQ $0x10, R12
1664 JB copy_2_small
1665
1666copy_2_loop:
1667 MOVUPS (R11), X0
1668 MOVUPS X0, (BX)
1669 ADDQ $0x10, R11
1670 ADDQ $0x10, BX
1671 SUBQ $0x10, R12
1672 JAE copy_2_loop
1673 LEAQ 16(R11)(R12*1), R11
1674 LEAQ 16(BX)(R12*1), BX
1675 MOVUPS -16(R11), X0
1676 MOVUPS X0, -16(BX)
1677 JMP copy_2_end
1678
1679copy_2_small:
1680 CMPQ R13, $0x03
1681 JE copy_2_move_3
1682 JB copy_2_move_1or2
1683 CMPQ R13, $0x08
1684 JB copy_2_move_4through7
1685 JMP copy_2_move_8through16
1686
1687copy_2_move_1or2:
1688 MOVB (R11), R12
1689 MOVB -1(R11)(R13*1), R14
1690 MOVB R12, (BX)
1691 MOVB R14, -1(BX)(R13*1)
1692 ADDQ R13, R11
1693 ADDQ R13, BX
1694 JMP copy_2_end
1695
1696copy_2_move_3:
1697 MOVW (R11), R12
1698 MOVB 2(R11), R14
1699 MOVW R12, (BX)
1700 MOVB R14, 2(BX)
1701 ADDQ R13, R11
1702 ADDQ R13, BX
1703 JMP copy_2_end
1704
1705copy_2_move_4through7:
1706 MOVL (R11), R12
1707 MOVL -4(R11)(R13*1), R14
1708 MOVL R12, (BX)
1709 MOVL R14, -4(BX)(R13*1)
1710 ADDQ R13, R11
1711 ADDQ R13, BX
1712 JMP copy_2_end
1713
1714copy_2_move_8through16:
1715 MOVQ (R11), R12
1716 MOVQ -8(R11)(R13*1), R14
1717 MOVQ R12, (BX)
1718 MOVQ R14, -8(BX)(R13*1)
1719 ADDQ R13, R11
1720 ADDQ R13, BX
1721
1722copy_2_end:
1723 JMP handle_loop
1724
1725 // Copy overlapping match
// ml > mo: source and destination ranges overlap, so the copy goes one byte
// at a time in ascending address order.
1726copy_overlapping_match:
1727 ADDQ R13, DI
1728
1729copy_slow_3:
1730 MOVB (R11), R12
1731 MOVB R12, (BX)
1732 INCQ R11
1733 INCQ BX
1734 DECQ R13
1735 JNZ copy_slow_3
1736
// Step to the next 24-byte sequence record; continue while DX < CX.
1737handle_loop:
1738 ADDQ $0x18, AX
1739 INCQ DX
1740 CMPQ DX, CX
1741 JB main_loop
1742
1743loop_finished:
1744 // Return value
1745 MOVB $0x01, ret+8(FP)
1746
1747 // Update the context
// 24(ctx) = sequence index, 104(ctx) = outPosition,
// 112(ctx) = SI - 80(ctx), i.e. how far the literals pointer advanced.
1748 MOVQ ctx+0(FP), AX
1749 MOVQ DX, 24(AX)
1750 MOVQ DI, 104(AX)
1751 MOVQ 80(AX), CX
1752 SUBQ CX, SI
1753 MOVQ SI, 112(AX)
1754 RET
1755
// Failure exit: same context write-back as loop_finished, but returns false.
1756error_match_off_too_big:
1757 // Return value
1758 MOVB $0x00, ret+8(FP)
1759
1760 // Update the context
1761 MOVQ ctx+0(FP), AX
1762 MOVQ DX, 24(AX)
1763 MOVQ DI, 104(AX)
1764 MOVQ 80(AX), CX
1765 SUBQ CX, SI
1766 MOVQ SI, 112(AX)
1767 RET
1768
// Nothing to execute: return true without touching the context.
1769empty_seqs:
1770 // Return value
1771 MOVB $0x01, ret+8(FP)
1772 RET
1773
1774// func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
1775// Requires: CMOV, SSE
// Decodes sequences from the bit reader and executes each one immediately
// (literal copy, then match copy). Returns 0 on success, or one of the error
// codes 1-5 stored on the error exits at the end of the function.
//
// Register roles, as set up by the code below:
//   DX  = bit buffer (32(br)), BX = bits consumed from it (byte at 40(br))
//   SI  = value of 24(br); bytes of input still available to the refill code
//   DI / R8 / R9 = literal-length / match-length / offset decoder state words
//   R10 = output write pointer, R11 = literals read pointer,
//   R12 = outPosition
//   AX, CX, R13, R14, R15, BP, X0 = scratch
// Stack slots:
//   0(SP)  = current input read pointer
//   8(SP)  = match offset, 16(SP) = match length, 24(SP) = literal length
//   32(SP) = past-end pointer of the output buffer
//   40(SP) = value of 184(ctx); added to outPosition in the offset check
//   48(SP) = 176(ctx) + 184(ctx); used as the end of the history buffer
//   56(SP) = value of 200(ctx); upper bound for the match offset
1776TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
1777 MOVQ br+8(FP), AX
1778 MOVQ 32(AX), DX
1779 MOVBQZX 40(AX), BX
1780 MOVQ 24(AX), SI
1781 MOVQ (AX), AX
1782 ADDQ SI, AX
1783 MOVQ AX, (SP)
1784 MOVQ ctx+16(FP), AX
1785 MOVQ 72(AX), DI
1786 MOVQ 80(AX), R8
1787 MOVQ 88(AX), R9
1788 XORQ CX, CX
1789 MOVQ CX, 8(SP)
1790 MOVQ CX, 16(SP)
1791 MOVQ CX, 24(SP)
1792 MOVQ 112(AX), R10
1793 MOVQ 128(AX), CX
1794 MOVQ CX, 32(SP)
1795 MOVQ 144(AX), R11
1796 MOVQ 136(AX), R12
1797 MOVQ 200(AX), CX
1798 MOVQ CX, 56(SP)
1799 MOVQ 176(AX), CX
1800 MOVQ CX, 48(SP)
1801 MOVQ 184(AX), AX
1802 MOVQ AX, 40(SP)
1803 MOVQ 40(SP), AX
1804 ADDQ AX, 48(SP)
1805
1806 // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
1807 ADDQ R10, 32(SP)
1808
1809 // outBase += outPosition
1810 ADDQ R12, R10
1811
1812sequenceDecs_decodeSync_amd64_main_loop:
1813 MOVQ (SP), R13
1814
1815 // Fill bitreader to have enough for the offset and match length.
// Fast path (SI >= 8): step the input pointer back by the whole bytes already
// consumed (BX/8), reload 64 bits and keep only the sub-byte part of BX.
1816 CMPQ SI, $0x08
1817 JL sequenceDecs_decodeSync_amd64_fill_byte_by_byte
1818 MOVQ BX, AX
1819 SHRQ $0x03, AX
1820 SUBQ AX, R13
1821 MOVQ (R13), DX
1822 SUBQ AX, SI
1823 ANDQ $0x07, BX
1824 JMP sequenceDecs_decodeSync_amd64_fill_end
1825
// Slow path: shift in one byte at a time while input remains (SI > 0) and at
// least 8 bits have been consumed (BX > 7).
1826sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
1827 CMPQ SI, $0x00
1828 JLE sequenceDecs_decodeSync_amd64_fill_end
1829 CMPQ BX, $0x07
1830 JLE sequenceDecs_decodeSync_amd64_fill_end
1831 SHLQ $0x08, DX
1832 SUBQ $0x01, R13
1833 SUBQ $0x01, SI
1834 SUBQ $0x08, BX
1835 MOVBQZX (R13), AX
1836 ORQ AX, DX
1837 JMP sequenceDecs_decodeSync_amd64_fill_byte_by_byte
1838
1839sequenceDecs_decodeSync_amd64_fill_end:
1840 // Update offset
// State word layout used here: bits 8..15 (AH) = number of bits to read,
// upper 32 bits = base value. The bits are taken from the top of the bit
// buffer after shifting out the BX bits already consumed. The add is skipped
// when the bit count is 0, when the updated BX exceeds 64, or when the bit
// count is >= 64.
1841 MOVQ R9, AX
1842 MOVQ BX, CX
1843 MOVQ DX, R14
1844 SHLQ CL, R14
1845 MOVB AH, CL
1846 SHRQ $0x20, AX
1847 TESTQ CX, CX
1848 JZ sequenceDecs_decodeSync_amd64_of_update_zero
1849 ADDQ CX, BX
1850 CMPQ BX, $0x40
1851 JA sequenceDecs_decodeSync_amd64_of_update_zero
1852 CMPQ CX, $0x40
1853 JAE sequenceDecs_decodeSync_amd64_of_update_zero
1854 NEGQ CX
1855 SHRQ CL, R14
1856 ADDQ R14, AX
1857
1858sequenceDecs_decodeSync_amd64_of_update_zero:
1859 MOVQ AX, 8(SP)
1860
1861 // Update match length
1862 MOVQ R8, AX
1863 MOVQ BX, CX
1864 MOVQ DX, R14
1865 SHLQ CL, R14
1866 MOVB AH, CL
1867 SHRQ $0x20, AX
1868 TESTQ CX, CX
1869 JZ sequenceDecs_decodeSync_amd64_ml_update_zero
1870 ADDQ CX, BX
1871 CMPQ BX, $0x40
1872 JA sequenceDecs_decodeSync_amd64_ml_update_zero
1873 CMPQ CX, $0x40
1874 JAE sequenceDecs_decodeSync_amd64_ml_update_zero
1875 NEGQ CX
1876 SHRQ CL, R14
1877 ADDQ R14, AX
1878
1879sequenceDecs_decodeSync_amd64_ml_update_zero:
1880 MOVQ AX, 16(SP)
1881
1882 // Fill bitreader to have enough for the remaining
1883 CMPQ SI, $0x08
1884 JL sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
1885 MOVQ BX, AX
1886 SHRQ $0x03, AX
1887 SUBQ AX, R13
1888 MOVQ (R13), DX
1889 SUBQ AX, SI
1890 ANDQ $0x07, BX
1891 JMP sequenceDecs_decodeSync_amd64_fill_2_end
1892
1893sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
1894 CMPQ SI, $0x00
1895 JLE sequenceDecs_decodeSync_amd64_fill_2_end
1896 CMPQ BX, $0x07
1897 JLE sequenceDecs_decodeSync_amd64_fill_2_end
1898 SHLQ $0x08, DX
1899 SUBQ $0x01, R13
1900 SUBQ $0x01, SI
1901 SUBQ $0x08, BX
1902 MOVBQZX (R13), AX
1903 ORQ AX, DX
1904 JMP sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
1905
1906sequenceDecs_decodeSync_amd64_fill_2_end:
1907 // Update literal length
1908 MOVQ DI, AX
1909 MOVQ BX, CX
1910 MOVQ DX, R14
1911 SHLQ CL, R14
1912 MOVB AH, CL
1913 SHRQ $0x20, AX
1914 TESTQ CX, CX
1915 JZ sequenceDecs_decodeSync_amd64_ll_update_zero
1916 ADDQ CX, BX
1917 CMPQ BX, $0x40
1918 JA sequenceDecs_decodeSync_amd64_ll_update_zero
1919 CMPQ CX, $0x40
1920 JAE sequenceDecs_decodeSync_amd64_ll_update_zero
1921 NEGQ CX
1922 SHRQ CL, R14
1923 ADDQ R14, AX
1924
1925sequenceDecs_decodeSync_amd64_ll_update_zero:
1926 MOVQ AX, 24(SP)
1927
1928 // Fill bitreader for state updates
// Save the input pointer. AX = bits 8..15 of the offset state (its bit
// count), captured before R9 is replaced; it is tested in "Adjust offset".
// The state update is skipped when the counter at 96(ctx) is zero; that
// counter is decremented in handle_loop.
1929 MOVQ R13, (SP)
1930 MOVQ R9, AX
1931 SHRQ $0x08, AX
1932 MOVBQZX AL, AX
1933 MOVQ ctx+16(FP), CX
1934 CMPQ 96(CX), $0x00
1935 JZ sequenceDecs_decodeSync_amd64_skip_update
1936
1937 // Update Literal Length State
// R13 = low byte of the state (bits to read for the next state), DI = bits
// 16..31 of the state. The next R13 bits of the bit buffer are rotated into
// the low bits, masked, and added to DI to form the table index.
1938 MOVBQZX DI, R13
1939 SHRQ $0x10, DI
1940 MOVWQZX DI, DI
1941 LEAQ (BX)(R13*1), CX
1942 MOVQ DX, R14
1943 MOVQ CX, BX
1944 ROLQ CL, R14
1945 MOVL $0x00000001, R15
1946 MOVB R13, CL
1947 SHLL CL, R15
1948 DECL R15
1949 ANDQ R15, R14
1950 ADDQ R14, DI
1951
1952 // Load ctx.llTable
1953 MOVQ ctx+16(FP), CX
1954 MOVQ (CX), CX
1955 MOVQ (CX)(DI*8), DI
1956
1957 // Update Match Length State
1958 MOVBQZX R8, R13
1959 SHRQ $0x10, R8
1960 MOVWQZX R8, R8
1961 LEAQ (BX)(R13*1), CX
1962 MOVQ DX, R14
1963 MOVQ CX, BX
1964 ROLQ CL, R14
1965 MOVL $0x00000001, R15
1966 MOVB R13, CL
1967 SHLL CL, R15
1968 DECL R15
1969 ANDQ R15, R14
1970 ADDQ R14, R8
1971
1972 // Load ctx.mlTable
1973 MOVQ ctx+16(FP), CX
1974 MOVQ 24(CX), CX
1975 MOVQ (CX)(R8*8), R8
1976
1977 // Update Offset State
1978 MOVBQZX R9, R13
1979 SHRQ $0x10, R9
1980 MOVWQZX R9, R9
1981 LEAQ (BX)(R13*1), CX
1982 MOVQ DX, R14
1983 MOVQ CX, BX
1984 ROLQ CL, R14
1985 MOVL $0x00000001, R15
1986 MOVB R13, CL
1987 SHLL CL, R15
1988 DECL R15
1989 ANDQ R15, R14
1990 ADDQ R14, R9
1991
1992 // Load ctx.ofTable
1993 MOVQ ctx+16(FP), CX
1994 MOVQ 48(CX), CX
1995 MOVQ (CX)(R9*8), R9
1996
1997sequenceDecs_decodeSync_amd64_skip_update:
1998 // Adjust offset
// CX = s. The three quadwords at 144/152/160(s) hold previously used offsets
// (presumably s.prevOffset - confirm). With more than one offset bit the
// decoded value is used directly: it is stored at 144(s) and the old values
// at 144/152(s) move to 152/160(s).
1999 MOVQ s+0(FP), CX
2000 MOVQ 8(SP), R13
2001 CMPQ AX, $0x01
2002 JBE sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0
2003 MOVUPS 144(CX), X0
2004 MOVQ R13, 144(CX)
2005 MOVUPS X0, 152(CX)
2006 JMP sequenceDecs_decodeSync_amd64_after_adjust
2007
// Offset bit count is 0 or 1: the decoded value selects one of the stored
// offsets. When the literal length is zero the selector is incremented first.
2008sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
2009 CMPQ 24(SP), $0x00000000
2010 JNE sequenceDecs_decodeSync_amd64_adjust_offset_maybezero
2011 INCQ R13
2012 JMP sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
2013
// Selector 0 with literals present: reuse the value at 144(s) unchanged.
2014sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
2015 TESTQ R13, R13
2016 JNZ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
2017 MOVQ 144(CX), R13
2018 JMP sequenceDecs_decodeSync_amd64_after_adjust
2019
// R14 = stored offset number R13, except that selector 3 yields
// (offset at 144(s)) - 1. A zero result is replaced by 1.
2020sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
2021 MOVQ R13, AX
2022 XORQ R14, R14
2023 MOVQ $-1, R15
2024 CMPQ R13, $0x03
2025 CMOVQEQ R14, AX
2026 CMOVQEQ R15, R14
2027 ADDQ 144(CX)(AX*8), R14
2028 JNZ sequenceDecs_decodeSync_amd64_adjust_temp_valid
2029 MOVQ $0x00000001, R14
2030
// Rotate the stored offsets: unless the selector is 1, 152(s) moves to
// 160(s); then 144(s) moves to 152(s) and the new offset goes to 144(s).
2031sequenceDecs_decodeSync_amd64_adjust_temp_valid:
2032 CMPQ R13, $0x01
2033 JZ sequenceDecs_decodeSync_amd64_adjust_skip
2034 MOVQ 152(CX), AX
2035 MOVQ AX, 160(CX)
2036
2037sequenceDecs_decodeSync_amd64_adjust_skip:
2038 MOVQ 144(CX), AX
2039 MOVQ AX, 152(CX)
2040 MOVQ R14, 144(CX)
2041 MOVQ R14, R13
2042
2043sequenceDecs_decodeSync_amd64_after_adjust:
2044 MOVQ R13, 8(SP)
2045
2046 // Check values
// AX = match length, CX = literal length. Their sum is added to 256(s); the
// literal length is subtracted from 104(ctx), and a negative result means
// the sequence asks for more literals than remain.
2047 MOVQ 16(SP), AX
2048 MOVQ 24(SP), CX
2049 LEAQ (AX)(CX*1), R14
2050 MOVQ s+0(FP), R15
2051 ADDQ R14, 256(R15)
2052 MOVQ ctx+16(FP), R14
2053 SUBQ CX, 104(R14)
2054 JS error_not_enough_literals
2055 CMPQ AX, $0x00020002
2056 JA sequenceDecs_decodeSync_amd64_error_match_len_too_big
// A zero offset is only accepted together with a zero match length.
2057 TESTQ R13, R13
2058 JNZ sequenceDecs_decodeSync_amd64_match_len_ofs_ok
2059 TESTQ AX, AX
2060 JNZ sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch
2061
// From here: AX = literal length, CX = match offset (mo), R13 = match length.
2062sequenceDecs_decodeSync_amd64_match_len_ofs_ok:
2063 MOVQ 24(SP), AX
2064 MOVQ 8(SP), CX
2065 MOVQ 16(SP), R13
2066
2067 // Check if we have enough space in s.out
2068 LEAQ (AX)(R13*1), R14
2069 ADDQ R10, R14
2070 CMPQ R14, 32(SP)
2071 JA error_not_enough_space
2072
2073 // Copy literals
2074 TESTQ AX, AX
2075 JZ check_offset
2076 XORQ R14, R14
2077
// Copies whole 16-byte chunks, so up to 15 bytes past the literal length are
// read and written; the pointers are then advanced by the exact length only.
2078copy_1:
2079 MOVUPS (R11)(R14*1), X0
2080 MOVUPS X0, (R10)(R14*1)
2081 ADDQ $0x10, R14
2082 CMPQ R14, AX
2083 JB copy_1
2084 ADDQ AX, R11
2085 ADDQ AX, R10
2086 ADDQ AX, R12
2087
2088 // Malformed input if (seq.mo > t+len(hist) || seq.mo > s.windowSize)
2089check_offset:
2090 MOVQ R12, AX
2091 ADDQ 40(SP), AX
2092 CMPQ CX, AX
2093 JG error_match_off_too_big
2094 CMPQ CX, 56(SP)
2095 JG error_match_off_too_big
2096
2097 // Copy match from history
// AX = mo - outPosition. If mo <= outPosition (unsigned) the whole match lies
// in the output written so far. Otherwise R14 points AX bytes before the end
// of history (48(SP)).
2098 MOVQ CX, AX
2099 SUBQ R12, AX
2100 JLS copy_match
2101 MOVQ 48(SP), R14
2102 SUBQ AX, R14
// ml > AX: the match starts in history and continues in the current output.
2103 CMPQ R13, AX
2104 JG copy_all_from_history
2105 MOVQ R13, AX
2106 SUBQ $0x10, AX
2107 JB copy_4_small
2108
// 16 bytes per iteration while at least 16 remain, then one final 16-byte
// move that ends exactly at the requested length.
2109copy_4_loop:
2110 MOVUPS (R14), X0
2111 MOVUPS X0, (R10)
2112 ADDQ $0x10, R14
2113 ADDQ $0x10, R10
2114 SUBQ $0x10, AX
2115 JAE copy_4_loop
2116 LEAQ 16(R14)(AX*1), R14
2117 LEAQ 16(R10)(AX*1), R10
2118 MOVUPS -16(R14), X0
2119 MOVUPS X0, -16(R10)
2120 JMP copy_4_end
2121
// NOTE(review): there is no 1-or-2 byte case here (copy_5 has one);
// presumably ml >= 3 whenever this path is reached - confirm.
2122copy_4_small:
2123 CMPQ R13, $0x03
2124 JE copy_4_move_3
2125 CMPQ R13, $0x08
2126 JB copy_4_move_4through7
2127 JMP copy_4_move_8through16
2128
2129copy_4_move_3:
2130 MOVW (R14), AX
2131 MOVB 2(R14), CL
2132 MOVW AX, (R10)
2133 MOVB CL, 2(R10)
2134 ADDQ R13, R14
2135 ADDQ R13, R10
2136 JMP copy_4_end
2137
2138copy_4_move_4through7:
2139 MOVL (R14), AX
2140 MOVL -4(R14)(R13*1), CX
2141 MOVL AX, (R10)
2142 MOVL CX, -4(R10)(R13*1)
2143 ADDQ R13, R14
2144 ADDQ R13, R10
2145 JMP copy_4_end
2146
2147copy_4_move_8through16:
2148 MOVQ (R14), AX
2149 MOVQ -8(R14)(R13*1), CX
2150 MOVQ AX, (R10)
2151 MOVQ CX, -8(R10)(R13*1)
2152 ADDQ R13, R14
2153 ADDQ R13, R10
2154
2155copy_4_end:
2156 ADDQ R13, R12
2157 JMP handle_loop
// NOTE(review): the next jump is unreachable (it follows an unconditional
// JMP and has no label); it comes from the code generator.
2158 JMP loop_finished
2159
// Copy the AX bytes available in history; the remaining ml - AX bytes are
// copied from the current output in copy_match.
2160copy_all_from_history:
2161 MOVQ AX, R15
2162 SUBQ $0x10, R15
2163 JB copy_5_small
2164
2165copy_5_loop:
2166 MOVUPS (R14), X0
2167 MOVUPS X0, (R10)
2168 ADDQ $0x10, R14
2169 ADDQ $0x10, R10
2170 SUBQ $0x10, R15
2171 JAE copy_5_loop
2172 LEAQ 16(R14)(R15*1), R14
2173 LEAQ 16(R10)(R15*1), R10
2174 MOVUPS -16(R14), X0
2175 MOVUPS X0, -16(R10)
2176 JMP copy_5_end
2177
2178copy_5_small:
2179 CMPQ AX, $0x03
2180 JE copy_5_move_3
2181 JB copy_5_move_1or2
2182 CMPQ AX, $0x08
2183 JB copy_5_move_4through7
2184 JMP copy_5_move_8through16
2185
2186copy_5_move_1or2:
2187 MOVB (R14), R15
2188 MOVB -1(R14)(AX*1), BP
2189 MOVB R15, (R10)
2190 MOVB BP, -1(R10)(AX*1)
2191 ADDQ AX, R14
2192 ADDQ AX, R10
2193 JMP copy_5_end
2194
2195copy_5_move_3:
2196 MOVW (R14), R15
2197 MOVB 2(R14), BP
2198 MOVW R15, (R10)
2199 MOVB BP, 2(R10)
2200 ADDQ AX, R14
2201 ADDQ AX, R10
2202 JMP copy_5_end
2203
2204copy_5_move_4through7:
2205 MOVL (R14), R15
2206 MOVL -4(R14)(AX*1), BP
2207 MOVL R15, (R10)
2208 MOVL BP, -4(R10)(AX*1)
2209 ADDQ AX, R14
2210 ADDQ AX, R10
2211 JMP copy_5_end
2212
2213copy_5_move_8through16:
2214 MOVQ (R14), R15
2215 MOVQ -8(R14)(AX*1), BP
2216 MOVQ R15, (R10)
2217 MOVQ BP, -8(R10)(AX*1)
2218 ADDQ AX, R14
2219 ADDQ AX, R10
2220
// Account for the bytes taken from history: outPosition += AX, ml -= AX.
2221copy_5_end:
2222 ADDQ AX, R12
2223 SUBQ AX, R13
2224
2225 // Copy match from the current buffer
2226copy_match:
// AX = source pointer, mo bytes behind the output write pointer.
2227 MOVQ R10, AX
2228 SUBQ CX, AX
2229
2230 // ml <= mo
2231 CMPQ R13, CX
2232 JA copy_overlapping_match
2233
2234 // Copy non-overlapping match
// The write pointer is advanced by the exact length up front; the loop below
// then copies whole 16-byte chunks through CX and may write up to 15 bytes
// past the match length.
2235 ADDQ R13, R12
2236 MOVQ R10, CX
2237 ADDQ R13, R10
2238
2239copy_2:
2240 MOVUPS (AX), X0
2241 MOVUPS X0, (CX)
2242 ADDQ $0x10, AX
2243 ADDQ $0x10, CX
2244 SUBQ $0x10, R13
2245 JHI copy_2
2246 JMP handle_loop
2247
2248 // Copy overlapping match
// ml > mo: source and destination ranges overlap, so the copy goes one byte
// at a time in ascending address order.
2249copy_overlapping_match:
2250 ADDQ R13, R12
2251
2252copy_slow_3:
2253 MOVB (AX), CL
2254 MOVB CL, (R10)
2255 INCQ AX
2256 INCQ R10
2257 DECQ R13
2258 JNZ copy_slow_3
2259
// Decrement the counter at 96(ctx); keep decoding while it stays >= 0.
2260handle_loop:
2261 MOVQ ctx+16(FP), AX
2262 DECQ 96(AX)
2263 JNS sequenceDecs_decodeSync_amd64_main_loop
2264
// Write the bit reader state back: bit buffer, bits consumed, and SI.
2265loop_finished:
2266 MOVQ br+8(FP), AX
2267 MOVQ DX, 32(AX)
2268 MOVB BL, 40(AX)
2269 MOVQ SI, 24(AX)
2270
2271 // Update the context
// 136(ctx) = outPosition; 168(ctx) = R11 - 144(ctx), i.e. how far the
// literals pointer advanced.
2272 MOVQ ctx+16(FP), AX
2273 MOVQ R12, 136(AX)
2274 MOVQ 144(AX), CX
2275 SUBQ CX, R11
2276 MOVQ R11, 168(AX)
2277
2278 // Return success
2279 MOVQ $0x00000000, ret+24(FP)
2280 RET
2281
2282 // Return with match length error
2283sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch:
2284 MOVQ 16(SP), AX
2285 MOVQ ctx+16(FP), CX
2286 MOVQ AX, 216(CX)
2287 MOVQ $0x00000001, ret+24(FP)
2288 RET
2289
2290 // Return with match too long error
2291sequenceDecs_decodeSync_amd64_error_match_len_too_big:
2292 MOVQ ctx+16(FP), AX
2293 MOVQ 16(SP), CX
2294 MOVQ CX, 216(AX)
2295 MOVQ $0x00000002, ret+24(FP)
2296 RET
2297
2298 // Return with match offset too long error
2299error_match_off_too_big:
2300 MOVQ ctx+16(FP), AX
2301 MOVQ 8(SP), CX
2302 MOVQ CX, 224(AX)
2303 MOVQ R12, 136(AX)
2304 MOVQ $0x00000003, ret+24(FP)
2305 RET
2306
2307 // Return with not enough literals error
2308error_not_enough_literals:
2309 MOVQ ctx+16(FP), AX
2310 MOVQ 24(SP), CX
2311 MOVQ CX, 208(AX)
2312 MOVQ $0x00000004, ret+24(FP)
2313 RET
2314
2315 // Return with not enough output space error
2316error_not_enough_space:
2317 MOVQ ctx+16(FP), AX
2318 MOVQ 24(SP), CX
2319 MOVQ CX, 208(AX)
2320 MOVQ 16(SP), CX
2321 MOVQ CX, 216(AX)
2322 MOVQ R12, 136(AX)
2323 MOVQ $0x00000005, ret+24(FP)
2324 RET
2325
2326// func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
2327// Requires: BMI, BMI2, CMOV, SSE
// BMI/BMI2 variant of the combined decode-and-execute loop: decodes each
// sequence from the bit reader and executes it immediately (literal copy,
// then match copy). Returns 0 on success, or one of the error codes 1-5
// stored on the error exits at the end of the function.
//
// Register roles, as set up by the code below:
//   AX  = bit buffer (32(br)), DX = bits consumed from it (byte at 40(br))
//   BX  = value of 24(br); bytes of input still available to the refill code
//   SI / DI / R8 = literal-length / match-length / offset decoder state words
//   R9  = output write pointer, R10 = literals read pointer,
//   R11 = outPosition
//   CX, R12, R13, R14, R15, BP, X0 = scratch
// Stack slots:
//   0(SP)  = current input read pointer
//   8(SP)  = match offset, 16(SP) = match length, 24(SP) = literal length
//   32(SP) = past-end pointer of the output buffer
//   40(SP) = value of 184(ctx); added to outPosition in the offset check
//   48(SP) = 176(ctx) + 184(ctx); used as the end of the history buffer
//   56(SP) = value of 200(ctx); upper bound for the match offset
2328TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
2329 MOVQ br+8(FP), CX
2330 MOVQ 32(CX), AX
2331 MOVBQZX 40(CX), DX
2332 MOVQ 24(CX), BX
2333 MOVQ (CX), CX
2334 ADDQ BX, CX
2335 MOVQ CX, (SP)
2336 MOVQ ctx+16(FP), CX
2337 MOVQ 72(CX), SI
2338 MOVQ 80(CX), DI
2339 MOVQ 88(CX), R8
2340 XORQ R9, R9
2341 MOVQ R9, 8(SP)
2342 MOVQ R9, 16(SP)
2343 MOVQ R9, 24(SP)
2344 MOVQ 112(CX), R9
2345 MOVQ 128(CX), R10
2346 MOVQ R10, 32(SP)
2347 MOVQ 144(CX), R10
2348 MOVQ 136(CX), R11
2349 MOVQ 200(CX), R12
2350 MOVQ R12, 56(SP)
2351 MOVQ 176(CX), R12
2352 MOVQ R12, 48(SP)
2353 MOVQ 184(CX), CX
2354 MOVQ CX, 40(SP)
2355 MOVQ 40(SP), CX
2356 ADDQ CX, 48(SP)
2357
2358 // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
2359 ADDQ R9, 32(SP)
2360
2361 // outBase += outPosition
2362 ADDQ R11, R9
2363
2364sequenceDecs_decodeSync_bmi2_main_loop:
2365 MOVQ (SP), R12
2366
2367 // Fill bitreader to have enough for the offset and match length.
// Fast path (BX >= 8): step the input pointer back by the whole bytes already
// consumed (DX/8), reload 64 bits and keep only the sub-byte part of DX.
2368 CMPQ BX, $0x08
2369 JL sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
2370 MOVQ DX, CX
2371 SHRQ $0x03, CX
2372 SUBQ CX, R12
2373 MOVQ (R12), AX
2374 SUBQ CX, BX
2375 ANDQ $0x07, DX
2376 JMP sequenceDecs_decodeSync_bmi2_fill_end
2377
// Slow path: shift in one byte at a time while input remains (BX > 0) and at
// least 8 bits have been consumed (DX > 7).
2378sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
2379 CMPQ BX, $0x00
2380 JLE sequenceDecs_decodeSync_bmi2_fill_end
2381 CMPQ DX, $0x07
2382 JLE sequenceDecs_decodeSync_bmi2_fill_end
2383 SHLQ $0x08, AX
2384 SUBQ $0x01, R12
2385 SUBQ $0x01, BX
2386 SUBQ $0x08, DX
2387 MOVBQZX (R12), CX
2388 ORQ CX, AX
2389 JMP sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
2390
2391sequenceDecs_decodeSync_bmi2_fill_end:
2392 // Update offset
// R13 = bits 8..15 of the state (BEXTR control 0x0808: start bit 8, length
// 8) = number of bits to read. The bit buffer is rotated left by
// (bits consumed + R13) so those bits land in the low end, BZHI keeps only
// the low R13 bits, and the result is added to the upper 32 bits of the
// state (the base value).
2393 MOVQ $0x00000808, CX
2394 BEXTRQ CX, R8, R13
2395 MOVQ AX, R14
2396 LEAQ (DX)(R13*1), CX
2397 ROLQ CL, R14
2398 BZHIQ R13, R14, R14
2399 MOVQ CX, DX
2400 MOVQ R8, CX
2401 SHRQ $0x20, CX
2402 ADDQ R14, CX
2403 MOVQ CX, 8(SP)
2404
2405 // Update match length
2406 MOVQ $0x00000808, CX
2407 BEXTRQ CX, DI, R13
2408 MOVQ AX, R14
2409 LEAQ (DX)(R13*1), CX
2410 ROLQ CL, R14
2411 BZHIQ R13, R14, R14
2412 MOVQ CX, DX
2413 MOVQ DI, CX
2414 SHRQ $0x20, CX
2415 ADDQ R14, CX
2416 MOVQ CX, 16(SP)
2417
2418 // Fill bitreader to have enough for the remaining
2419 CMPQ BX, $0x08
2420 JL sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
2421 MOVQ DX, CX
2422 SHRQ $0x03, CX
2423 SUBQ CX, R12
2424 MOVQ (R12), AX
2425 SUBQ CX, BX
2426 ANDQ $0x07, DX
2427 JMP sequenceDecs_decodeSync_bmi2_fill_2_end
2428
2429sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
2430 CMPQ BX, $0x00
2431 JLE sequenceDecs_decodeSync_bmi2_fill_2_end
2432 CMPQ DX, $0x07
2433 JLE sequenceDecs_decodeSync_bmi2_fill_2_end
2434 SHLQ $0x08, AX
2435 SUBQ $0x01, R12
2436 SUBQ $0x01, BX
2437 SUBQ $0x08, DX
2438 MOVBQZX (R12), CX
2439 ORQ CX, AX
2440 JMP sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
2441
2442sequenceDecs_decodeSync_bmi2_fill_2_end:
2443 // Update literal length
2444 MOVQ $0x00000808, CX
2445 BEXTRQ CX, SI, R13
2446 MOVQ AX, R14
2447 LEAQ (DX)(R13*1), CX
2448 ROLQ CL, R14
2449 BZHIQ R13, R14, R14
2450 MOVQ CX, DX
2451 MOVQ SI, CX
2452 SHRQ $0x20, CX
2453 ADDQ R14, CX
2454 MOVQ CX, 24(SP)
2455
2456 // Fill bitreader for state updates
// Save the input pointer. R12 = bits 8..15 of the offset state (its bit
// count), captured before R8 is replaced; it is tested in "Adjust offset".
// The state update is skipped when the counter at 96(ctx) is zero; that
// counter is decremented in handle_loop.
2457 MOVQ R12, (SP)
2458 MOVQ $0x00000808, CX
2459 BEXTRQ CX, R8, R12
2460 MOVQ ctx+16(FP), CX
2461 CMPQ 96(CX), $0x00
2462 JZ sequenceDecs_decodeSync_bmi2_skip_update
// R13 = low byte of (SI + DI + R8), used as the total number of bits needed
// by the three state updates; those bits are read in one go into R14 and
// then split per state below.
2463 LEAQ (SI)(DI*1), R13
2464 ADDQ R8, R13
2465 MOVBQZX R13, R13
2466 LEAQ (DX)(R13*1), CX
2467 MOVQ AX, R14
2468 MOVQ CX, DX
2469 ROLQ CL, R14
2470 BZHIQ R13, R14, R14
2471
2472 // Update Offset State
// The low byte of the state is the bit count: BZHI takes that many low bits
// of R14 and SHRX drops them. Bits 16..31 of the state (BEXTR control
// 0x1010) are added to form the table index.
2473 BZHIQ R8, R14, CX
2474 SHRXQ R8, R14, R14
2475 MOVQ $0x00001010, R13
2476 BEXTRQ R13, R8, R8
2477 ADDQ CX, R8
2478
2479 // Load ctx.ofTable
2480 MOVQ ctx+16(FP), CX
2481 MOVQ 48(CX), CX
2482 MOVQ (CX)(R8*8), R8
2483
2484 // Update Match Length State
2485 BZHIQ DI, R14, CX
2486 SHRXQ DI, R14, R14
2487 MOVQ $0x00001010, R13
2488 BEXTRQ R13, DI, DI
2489 ADDQ CX, DI
2490
2491 // Load ctx.mlTable
2492 MOVQ ctx+16(FP), CX
2493 MOVQ 24(CX), CX
2494 MOVQ (CX)(DI*8), DI
2495
2496 // Update Literal Length State
2497 BZHIQ SI, R14, CX
2498 MOVQ $0x00001010, R13
2499 BEXTRQ R13, SI, SI
2500 ADDQ CX, SI
2501
2502 // Load ctx.llTable
2503 MOVQ ctx+16(FP), CX
2504 MOVQ (CX), CX
2505 MOVQ (CX)(SI*8), SI
2506
2507sequenceDecs_decodeSync_bmi2_skip_update:
2508 // Adjust offset
// CX = s. The three quadwords at 144/152/160(s) hold previously used offsets
// (presumably s.prevOffset - confirm). With more than one offset bit the
// decoded value is used directly: it is stored at 144(s) and the old values
// at 144/152(s) move to 152/160(s).
2509 MOVQ s+0(FP), CX
2510 MOVQ 8(SP), R13
2511 CMPQ R12, $0x01
2512 JBE sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0
2513 MOVUPS 144(CX), X0
2514 MOVQ R13, 144(CX)
2515 MOVUPS X0, 152(CX)
2516 JMP sequenceDecs_decodeSync_bmi2_after_adjust
2517
// Offset bit count is 0 or 1: the decoded value selects one of the stored
// offsets. When the literal length is zero the selector is incremented first.
2518sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
2519 CMPQ 24(SP), $0x00000000
2520 JNE sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero
2521 INCQ R13
2522 JMP sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
2523
// Selector 0 with literals present: reuse the value at 144(s) unchanged.
2524sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
2525 TESTQ R13, R13
2526 JNZ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
2527 MOVQ 144(CX), R13
2528 JMP sequenceDecs_decodeSync_bmi2_after_adjust
2529
// R14 = stored offset number R13, except that selector 3 yields
// (offset at 144(s)) - 1. A zero result is replaced by 1.
2530sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
2531 MOVQ R13, R12
2532 XORQ R14, R14
2533 MOVQ $-1, R15
2534 CMPQ R13, $0x03
2535 CMOVQEQ R14, R12
2536 CMOVQEQ R15, R14
2537 ADDQ 144(CX)(R12*8), R14
2538 JNZ sequenceDecs_decodeSync_bmi2_adjust_temp_valid
2539 MOVQ $0x00000001, R14
2540
// Rotate the stored offsets: unless the selector is 1, 152(s) moves to
// 160(s); then 144(s) moves to 152(s) and the new offset goes to 144(s).
2541sequenceDecs_decodeSync_bmi2_adjust_temp_valid:
2542 CMPQ R13, $0x01
2543 JZ sequenceDecs_decodeSync_bmi2_adjust_skip
2544 MOVQ 152(CX), R12
2545 MOVQ R12, 160(CX)
2546
2547sequenceDecs_decodeSync_bmi2_adjust_skip:
2548 MOVQ 144(CX), R12
2549 MOVQ R12, 152(CX)
2550 MOVQ R14, 144(CX)
2551 MOVQ R14, R13
2552
2553sequenceDecs_decodeSync_bmi2_after_adjust:
2554 MOVQ R13, 8(SP)
2555
2556 // Check values
// CX = match length, R12 = literal length. Their sum is added to 256(s); the
// literal length is subtracted from 104(ctx), and a negative result means
// the sequence asks for more literals than remain.
2557 MOVQ 16(SP), CX
2558 MOVQ 24(SP), R12
2559 LEAQ (CX)(R12*1), R14
2560 MOVQ s+0(FP), R15
2561 ADDQ R14, 256(R15)
2562 MOVQ ctx+16(FP), R14
2563 SUBQ R12, 104(R14)
2564 JS error_not_enough_literals
2565 CMPQ CX, $0x00020002
2566 JA sequenceDecs_decodeSync_bmi2_error_match_len_too_big
// A zero offset is only accepted together with a zero match length.
2567 TESTQ R13, R13
2568 JNZ sequenceDecs_decodeSync_bmi2_match_len_ofs_ok
2569 TESTQ CX, CX
2570 JNZ sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch
2571
// From here: CX = literal length, R12 = match offset (mo), R13 = match length.
2572sequenceDecs_decodeSync_bmi2_match_len_ofs_ok:
2573 MOVQ 24(SP), CX
2574 MOVQ 8(SP), R12
2575 MOVQ 16(SP), R13
2576
2577 // Check if we have enough space in s.out
2578 LEAQ (CX)(R13*1), R14
2579 ADDQ R9, R14
2580 CMPQ R14, 32(SP)
2581 JA error_not_enough_space
2582
2583 // Copy literals
2584 TESTQ CX, CX
2585 JZ check_offset
2586 XORQ R14, R14
2587
// Copies whole 16-byte chunks, so up to 15 bytes past the literal length are
// read and written; the pointers are then advanced by the exact length only.
2588copy_1:
2589 MOVUPS (R10)(R14*1), X0
2590 MOVUPS X0, (R9)(R14*1)
2591 ADDQ $0x10, R14
2592 CMPQ R14, CX
2593 JB copy_1
2594 ADDQ CX, R10
2595 ADDQ CX, R9
2596 ADDQ CX, R11
2597
2598 // Malformed input if (seq.mo > t+len(hist) || seq.mo > s.windowSize)
2599check_offset:
2600 MOVQ R11, CX
2601 ADDQ 40(SP), CX
2602 CMPQ R12, CX
2603 JG error_match_off_too_big
2604 CMPQ R12, 56(SP)
2605 JG error_match_off_too_big
2606
2607 // Copy match from history
// CX = mo - outPosition. If mo <= outPosition (unsigned) the whole match lies
// in the output written so far. Otherwise R14 points CX bytes before the end
// of history (48(SP)).
2608 MOVQ R12, CX
2609 SUBQ R11, CX
2610 JLS copy_match
2611 MOVQ 48(SP), R14
2612 SUBQ CX, R14
// ml > CX: the match starts in history and continues in the current output.
2613 CMPQ R13, CX
2614 JG copy_all_from_history
2615 MOVQ R13, CX
2616 SUBQ $0x10, CX
2617 JB copy_4_small
2618
// 16 bytes per iteration while at least 16 remain, then one final 16-byte
// move that ends exactly at the requested length.
2619copy_4_loop:
2620 MOVUPS (R14), X0
2621 MOVUPS X0, (R9)
2622 ADDQ $0x10, R14
2623 ADDQ $0x10, R9
2624 SUBQ $0x10, CX
2625 JAE copy_4_loop
2626 LEAQ 16(R14)(CX*1), R14
2627 LEAQ 16(R9)(CX*1), R9
2628 MOVUPS -16(R14), X0
2629 MOVUPS X0, -16(R9)
2630 JMP copy_4_end
2631
// NOTE(review): there is no 1-or-2 byte case here (copy_5 has one);
// presumably ml >= 3 whenever this path is reached - confirm.
2632copy_4_small:
2633 CMPQ R13, $0x03
2634 JE copy_4_move_3
2635 CMPQ R13, $0x08
2636 JB copy_4_move_4through7
2637 JMP copy_4_move_8through16
2638
2639copy_4_move_3:
2640 MOVW (R14), CX
2641 MOVB 2(R14), R12
2642 MOVW CX, (R9)
2643 MOVB R12, 2(R9)
2644 ADDQ R13, R14
2645 ADDQ R13, R9
2646 JMP copy_4_end
2647
2648copy_4_move_4through7:
2649 MOVL (R14), CX
2650 MOVL -4(R14)(R13*1), R12
2651 MOVL CX, (R9)
2652 MOVL R12, -4(R9)(R13*1)
2653 ADDQ R13, R14
2654 ADDQ R13, R9
2655 JMP copy_4_end
2656
2657copy_4_move_8through16:
2658 MOVQ (R14), CX
2659 MOVQ -8(R14)(R13*1), R12
2660 MOVQ CX, (R9)
2661 MOVQ R12, -8(R9)(R13*1)
2662 ADDQ R13, R14
2663 ADDQ R13, R9
2664
2665copy_4_end:
2666 ADDQ R13, R11
2667 JMP handle_loop
// NOTE(review): the next jump is unreachable (it follows an unconditional
// JMP and has no label); it comes from the code generator.
2668 JMP loop_finished
2669
// Copy the CX bytes available in history; the remaining ml - CX bytes are
// copied from the current output in copy_match.
2670copy_all_from_history:
2671 MOVQ CX, R15
2672 SUBQ $0x10, R15
2673 JB copy_5_small
2674
2675copy_5_loop:
2676 MOVUPS (R14), X0
2677 MOVUPS X0, (R9)
2678 ADDQ $0x10, R14
2679 ADDQ $0x10, R9
2680 SUBQ $0x10, R15
2681 JAE copy_5_loop
2682 LEAQ 16(R14)(R15*1), R14
2683 LEAQ 16(R9)(R15*1), R9
2684 MOVUPS -16(R14), X0
2685 MOVUPS X0, -16(R9)
2686 JMP copy_5_end
2687
2688copy_5_small:
2689 CMPQ CX, $0x03
2690 JE copy_5_move_3
2691 JB copy_5_move_1or2
2692 CMPQ CX, $0x08
2693 JB copy_5_move_4through7
2694 JMP copy_5_move_8through16
2695
2696copy_5_move_1or2:
2697 MOVB (R14), R15
2698 MOVB -1(R14)(CX*1), BP
2699 MOVB R15, (R9)
2700 MOVB BP, -1(R9)(CX*1)
2701 ADDQ CX, R14
2702 ADDQ CX, R9
2703 JMP copy_5_end
2704
2705copy_5_move_3:
2706 MOVW (R14), R15
2707 MOVB 2(R14), BP
2708 MOVW R15, (R9)
2709 MOVB BP, 2(R9)
2710 ADDQ CX, R14
2711 ADDQ CX, R9
2712 JMP copy_5_end
2713
2714copy_5_move_4through7:
2715 MOVL (R14), R15
2716 MOVL -4(R14)(CX*1), BP
2717 MOVL R15, (R9)
2718 MOVL BP, -4(R9)(CX*1)
2719 ADDQ CX, R14
2720 ADDQ CX, R9
2721 JMP copy_5_end
2722
2723copy_5_move_8through16:
2724 MOVQ (R14), R15
2725 MOVQ -8(R14)(CX*1), BP
2726 MOVQ R15, (R9)
2727 MOVQ BP, -8(R9)(CX*1)
2728 ADDQ CX, R14
2729 ADDQ CX, R9
2730
// Account for the bytes taken from history: outPosition += CX, ml -= CX.
2731copy_5_end:
2732 ADDQ CX, R11
2733 SUBQ CX, R13
2734
2735 // Copy match from the current buffer
2736copy_match:
// CX = source pointer, mo bytes behind the output write pointer.
2737 MOVQ R9, CX
2738 SUBQ R12, CX
2739
2740 // ml <= mo
2741 CMPQ R13, R12
2742 JA copy_overlapping_match
2743
2744 // Copy non-overlapping match
// The write pointer is advanced by the exact length up front; the loop below
// then copies whole 16-byte chunks through R12 and may write up to 15 bytes
// past the match length.
2745 ADDQ R13, R11
2746 MOVQ R9, R12
2747 ADDQ R13, R9
2748
2749copy_2:
2750 MOVUPS (CX), X0
2751 MOVUPS X0, (R12)
2752 ADDQ $0x10, CX
2753 ADDQ $0x10, R12
2754 SUBQ $0x10, R13
2755 JHI copy_2
2756 JMP handle_loop
2757
2758 // Copy overlapping match
// ml > mo: source and destination ranges overlap, so the copy goes one byte
// at a time in ascending address order.
2759copy_overlapping_match:
2760 ADDQ R13, R11
2761
2762copy_slow_3:
2763 MOVB (CX), R12
2764 MOVB R12, (R9)
2765 INCQ CX
2766 INCQ R9
2767 DECQ R13
2768 JNZ copy_slow_3
2769
// Decrement the counter at 96(ctx); keep decoding while it stays >= 0.
2770handle_loop:
2771 MOVQ ctx+16(FP), CX
2772 DECQ 96(CX)
2773 JNS sequenceDecs_decodeSync_bmi2_main_loop
2774
// Write the bit reader state back: bit buffer, bits consumed, and BX.
2775loop_finished:
2776 MOVQ br+8(FP), CX
2777 MOVQ AX, 32(CX)
2778 MOVB DL, 40(CX)
2779 MOVQ BX, 24(CX)
2780
2781 // Update the context
// 136(ctx) = outPosition; 168(ctx) = R10 - 144(ctx), i.e. how far the
// literals pointer advanced.
2782 MOVQ ctx+16(FP), AX
2783 MOVQ R11, 136(AX)
2784 MOVQ 144(AX), CX
2785 SUBQ CX, R10
2786 MOVQ R10, 168(AX)
2787
2788 // Return success
2789 MOVQ $0x00000000, ret+24(FP)
2790 RET
2791
2792 // Return with match length error
2793sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch:
2794 MOVQ 16(SP), AX
2795 MOVQ ctx+16(FP), CX
2796 MOVQ AX, 216(CX)
2797 MOVQ $0x00000001, ret+24(FP)
2798 RET
2799
2800 // Return with match too long error
2801sequenceDecs_decodeSync_bmi2_error_match_len_too_big:
2802 MOVQ ctx+16(FP), AX
2803 MOVQ 16(SP), CX
2804 MOVQ CX, 216(AX)
2805 MOVQ $0x00000002, ret+24(FP)
2806 RET
2807
2808 // Return with match offset too long error
2809error_match_off_too_big:
2810 MOVQ ctx+16(FP), AX
2811 MOVQ 8(SP), CX
2812 MOVQ CX, 224(AX)
2813 MOVQ R11, 136(AX)
2814 MOVQ $0x00000003, ret+24(FP)
2815 RET
2816
2817 // Return with not enough literals error
2818error_not_enough_literals:
2819 MOVQ ctx+16(FP), AX
2820 MOVQ 24(SP), CX
2821 MOVQ CX, 208(AX)
2822 MOVQ $0x00000004, ret+24(FP)
2823 RET
2824
2825 // Return with not enough output space error
2826error_not_enough_space:
2827 MOVQ ctx+16(FP), AX
2828 MOVQ 24(SP), CX
2829 MOVQ CX, 208(AX)
2830 MOVQ 16(SP), CX
2831 MOVQ CX, 216(AX)
2832 MOVQ R11, 136(AX)
2833 MOVQ $0x00000005, ret+24(FP)
2834 RET
2835
2836// func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
2837// Requires: CMOV, SSE
//
// For every sequence this routine decodes the match offset, match length and
// literal length from the bit stream, validates them, copies the literals to
// the output and then copies the match (from history and/or from the output
// written so far). It returns 0 on success or one of the error codes 1-5
// written by the return blocks at the end of the function.
//
// Register use inside the main loop:
//   DX         bit container (loaded from 32(br), stored back on success)
//   BX         number of bits already consumed from DX (byte at 40(br))
//   SI         remaining input offset (24(br))
//   DI, R8, R9 literal-length, match-length and offset decoder states
//   R10        current output write pointer
//   R11        current literals read pointer
//   R12        output position
// Stack slots:
//   0(SP)  input read pointer      8(SP)  match offset
//   16(SP) match length            24(SP) literal length
//   32(SP) past-end output pointer 40(SP) history length
//   48(SP) past-end history ptr    56(SP) window size
// NOTE(review): the Go field names behind the numeric br/ctx/s offsets are not
// visible in this file; confirm against the struct definitions.
2838TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
 // Load the bit reader: 32(br) -> DX, byte 40(br) -> BX, 24(br) -> SI.
 // (SP) keeps pointer (br) + SI, the current read position in the input.
2839 MOVQ br+8(FP), AX
2840 MOVQ 32(AX), DX
2841 MOVBQZX 40(AX), BX
2842 MOVQ 24(AX), SI
2843 MOVQ (AX), AX
2844 ADDQ SI, AX
2845 MOVQ AX, (SP)
 // Load the three decoder states from the context.
2846 MOVQ ctx+16(FP), AX
2847 MOVQ 72(AX), DI
2848 MOVQ 80(AX), R8
2849 MOVQ 88(AX), R9
 // Zero the per-sequence slots (match offset, match length, literal length).
2850 XORQ CX, CX
2851 MOVQ CX, 8(SP)
2852 MOVQ CX, 16(SP)
2853 MOVQ CX, 24(SP)
 // R10 = output base; 32(SP) = value at 128(ctx), made a past-end pointer below.
2854 MOVQ 112(AX), R10
2855 MOVQ 128(AX), CX
2856 MOVQ CX, 32(SP)
 // R11 = literals pointer, R12 = output position, 56(SP) = window size.
2857 MOVQ 144(AX), R11
2858 MOVQ 136(AX), R12
2859 MOVQ 200(AX), CX
2860 MOVQ CX, 56(SP)
 // 40(SP) = history length, 48(SP) = history base + history length.
2861 MOVQ 176(AX), CX
2862 MOVQ CX, 48(SP)
2863 MOVQ 184(AX), AX
2864 MOVQ AX, 40(SP)
2865 MOVQ 40(SP), AX
2866 ADDQ AX, 48(SP)
2867
2868 // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
2869 ADDQ R10, 32(SP)
2870
2871 // outBase += outPosition
2872 ADDQ R12, R10
2873
2874sequenceDecs_decodeSync_safe_amd64_main_loop:
2875 MOVQ (SP), R13
2876
2877 // Fill bitreader to have enough for the offset and match length.
 // Fast path (at least 8 input bytes left): step the input pointer back by the
 // whole bytes already consumed and reload a full 64-bit container.
2878 CMPQ SI, $0x08
2879 JL sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
2880 MOVQ BX, AX
2881 SHRQ $0x03, AX
2882 SUBQ AX, R13
2883 MOVQ (R13), DX
2884 SUBQ AX, SI
2885 ANDQ $0x07, BX
2886 JMP sequenceDecs_decodeSync_safe_amd64_fill_end
2887
 // Slow path: shift in one byte at a time while input remains and at least
 // 8 bits of the container have been consumed.
2888sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
2889 CMPQ SI, $0x00
2890 JLE sequenceDecs_decodeSync_safe_amd64_fill_end
2891 CMPQ BX, $0x07
2892 JLE sequenceDecs_decodeSync_safe_amd64_fill_end
2893 SHLQ $0x08, DX
2894 SUBQ $0x01, R13
2895 SUBQ $0x01, SI
2896 SUBQ $0x08, BX
2897 MOVBQZX (R13), AX
2898 ORQ AX, DX
2899 JMP sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
2900
2901sequenceDecs_decodeSync_safe_amd64_fill_end:
2902 // Update offset
 // State word as used here: bits 8-15 = number of extra bits, bits 32-63 = base value.
 // R14 = DX with the consumed bits shifted out; the extra bits are then taken
 // from its top. The base value is used alone when no extra bits are needed or
 // when the consumed-bit count would exceed 64.
2903 MOVQ R9, AX
2904 MOVQ BX, CX
2905 MOVQ DX, R14
2906 SHLQ CL, R14
2907 MOVB AH, CL
2908 SHRQ $0x20, AX
2909 TESTQ CX, CX
2910 JZ sequenceDecs_decodeSync_safe_amd64_of_update_zero
2911 ADDQ CX, BX
2912 CMPQ BX, $0x40
2913 JA sequenceDecs_decodeSync_safe_amd64_of_update_zero
2914 CMPQ CX, $0x40
2915 JAE sequenceDecs_decodeSync_safe_amd64_of_update_zero
2916 NEGQ CX
2917 SHRQ CL, R14
2918 ADDQ R14, AX
2919
2920sequenceDecs_decodeSync_safe_amd64_of_update_zero:
2921 MOVQ AX, 8(SP)
2922
2923 // Update match length
 // Same extraction as for the offset, driven by the match-length state in R8.
2924 MOVQ R8, AX
2925 MOVQ BX, CX
2926 MOVQ DX, R14
2927 SHLQ CL, R14
2928 MOVB AH, CL
2929 SHRQ $0x20, AX
2930 TESTQ CX, CX
2931 JZ sequenceDecs_decodeSync_safe_amd64_ml_update_zero
2932 ADDQ CX, BX
2933 CMPQ BX, $0x40
2934 JA sequenceDecs_decodeSync_safe_amd64_ml_update_zero
2935 CMPQ CX, $0x40
2936 JAE sequenceDecs_decodeSync_safe_amd64_ml_update_zero
2937 NEGQ CX
2938 SHRQ CL, R14
2939 ADDQ R14, AX
2940
2941sequenceDecs_decodeSync_safe_amd64_ml_update_zero:
2942 MOVQ AX, 16(SP)
2943
2944 // Fill bitreader to have enough for the remaining
2945 CMPQ SI, $0x08
2946 JL sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
2947 MOVQ BX, AX
2948 SHRQ $0x03, AX
2949 SUBQ AX, R13
2950 MOVQ (R13), DX
2951 SUBQ AX, SI
2952 ANDQ $0x07, BX
2953 JMP sequenceDecs_decodeSync_safe_amd64_fill_2_end
2954
2955sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
2956 CMPQ SI, $0x00
2957 JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end
2958 CMPQ BX, $0x07
2959 JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end
2960 SHLQ $0x08, DX
2961 SUBQ $0x01, R13
2962 SUBQ $0x01, SI
2963 SUBQ $0x08, BX
2964 MOVBQZX (R13), AX
2965 ORQ AX, DX
2966 JMP sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
2967
2968sequenceDecs_decodeSync_safe_amd64_fill_2_end:
2969 // Update literal length
 // Same extraction again, driven by the literal-length state in DI.
2970 MOVQ DI, AX
2971 MOVQ BX, CX
2972 MOVQ DX, R14
2973 SHLQ CL, R14
2974 MOVB AH, CL
2975 SHRQ $0x20, AX
2976 TESTQ CX, CX
2977 JZ sequenceDecs_decodeSync_safe_amd64_ll_update_zero
2978 ADDQ CX, BX
2979 CMPQ BX, $0x40
2980 JA sequenceDecs_decodeSync_safe_amd64_ll_update_zero
2981 CMPQ CX, $0x40
2982 JAE sequenceDecs_decodeSync_safe_amd64_ll_update_zero
2983 NEGQ CX
2984 SHRQ CL, R14
2985 ADDQ R14, AX
2986
2987sequenceDecs_decodeSync_safe_amd64_ll_update_zero:
2988 MOVQ AX, 24(SP)
2989
2990 // Fill bitreader for state updates
 // Save the input pointer. AX = bits 8-15 of the offset state (its extra-bit
 // count), kept for the offset adjustment below. When the counter at 96(ctx)
 // is zero this is the last sequence, so the state updates are skipped.
2991 MOVQ R13, (SP)
2992 MOVQ R9, AX
2993 SHRQ $0x08, AX
2994 MOVBQZX AL, AX
2995 MOVQ ctx+16(FP), CX
2996 CMPQ 96(CX), $0x00
2997 JZ sequenceDecs_decodeSync_safe_amd64_skip_update
2998
2999 // Update Literal Length State
 // Low byte of the state = number of bits to read (R13); bits 16-31 = base of
 // the next table index. The bits are read by rotating DX left by the new
 // consumed-bit count and masking with (1 << R13) - 1.
3000 MOVBQZX DI, R13
3001 SHRQ $0x10, DI
3002 MOVWQZX DI, DI
3003 LEAQ (BX)(R13*1), CX
3004 MOVQ DX, R14
3005 MOVQ CX, BX
3006 ROLQ CL, R14
3007 MOVL $0x00000001, R15
3008 MOVB R13, CL
3009 SHLL CL, R15
3010 DECL R15
3011 ANDQ R15, R14
3012 ADDQ R14, DI
3013
3014 // Load ctx.llTable
3015 MOVQ ctx+16(FP), CX
3016 MOVQ (CX), CX
3017 MOVQ (CX)(DI*8), DI
3018
3019 // Update Match Length State
3020 MOVBQZX R8, R13
3021 SHRQ $0x10, R8
3022 MOVWQZX R8, R8
3023 LEAQ (BX)(R13*1), CX
3024 MOVQ DX, R14
3025 MOVQ CX, BX
3026 ROLQ CL, R14
3027 MOVL $0x00000001, R15
3028 MOVB R13, CL
3029 SHLL CL, R15
3030 DECL R15
3031 ANDQ R15, R14
3032 ADDQ R14, R8
3033
3034 // Load ctx.mlTable
3035 MOVQ ctx+16(FP), CX
3036 MOVQ 24(CX), CX
3037 MOVQ (CX)(R8*8), R8
3038
3039 // Update Offset State
3040 MOVBQZX R9, R13
3041 SHRQ $0x10, R9
3042 MOVWQZX R9, R9
3043 LEAQ (BX)(R13*1), CX
3044 MOVQ DX, R14
3045 MOVQ CX, BX
3046 ROLQ CL, R14
3047 MOVL $0x00000001, R15
3048 MOVB R13, CL
3049 SHLL CL, R15
3050 DECL R15
3051 ANDQ R15, R14
3052 ADDQ R14, R9
3053
3054 // Load ctx.ofTable
3055 MOVQ ctx+16(FP), CX
3056 MOVQ 48(CX), CX
3057 MOVQ (CX)(R9*8), R9
3058
3059sequenceDecs_decodeSync_safe_amd64_skip_update:
3060 // Adjust offset
 // R13 = decoded offset value. With more than one extra offset bit (AX > 1) the
 // value is used as-is and pushed onto the three stored offsets at
 // 144/152/160(s): the old 144 and 152 move to 152 and 160.
3061 MOVQ s+0(FP), CX
3062 MOVQ 8(SP), R13
3063 CMPQ AX, $0x01
3064 JBE sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0
3065 MOVUPS 144(CX), X0
3066 MOVQ R13, 144(CX)
3067 MOVUPS X0, 152(CX)
3068 JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
3069
 // Otherwise the value selects one of the stored offsets; it is incremented
 // first when the literal length is zero.
3070sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
3071 CMPQ 24(SP), $0x00000000
3072 JNE sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero
3073 INCQ R13
3074 JMP sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
3075
 // Value 0 with a non-zero literal length: reuse the offset at 144(s) unchanged.
3076sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
3077 TESTQ R13, R13
3078 JNZ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
3079 MOVQ 144(CX), R13
3080 JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
3081
 // R14 = stored offset number R13, except that value 3 yields (offset at
 // 144(s)) - 1. The ADDQ sets ZF, and a zero result is replaced by 1.
3082sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
3083 MOVQ R13, AX
3084 XORQ R14, R14
3085 MOVQ $-1, R15
3086 CMPQ R13, $0x03
3087 CMOVQEQ R14, AX
3088 CMOVQEQ R15, R14
3089 ADDQ 144(CX)(AX*8), R14
3090 JNZ sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
3091 MOVQ $0x00000001, R14
3092
 // Rotate the stored offsets: 160 <- 152 (skipped when R13 == 1), 152 <- 144,
 // 144 <- R14. R14 also becomes the offset of this sequence.
3093sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid:
3094 CMPQ R13, $0x01
3095 JZ sequenceDecs_decodeSync_safe_amd64_adjust_skip
3096 MOVQ 152(CX), AX
3097 MOVQ AX, 160(CX)
3098
3099sequenceDecs_decodeSync_safe_amd64_adjust_skip:
3100 MOVQ 144(CX), AX
3101 MOVQ AX, 152(CX)
3102 MOVQ R14, 144(CX)
3103 MOVQ R14, R13
3104
3105sequenceDecs_decodeSync_safe_amd64_after_adjust:
3106 MOVQ R13, 8(SP)
3107
3108 // Check values
 // AX = match length, CX = literal length. 256(s) accumulates ml + ll.
 // 104(ctx) is decremented by ll and must not become negative. A match length
 // above 0x20002 is rejected, as is a non-zero match length with a zero offset.
3109 MOVQ 16(SP), AX
3110 MOVQ 24(SP), CX
3111 LEAQ (AX)(CX*1), R14
3112 MOVQ s+0(FP), R15
3113 ADDQ R14, 256(R15)
3114 MOVQ ctx+16(FP), R14
3115 SUBQ CX, 104(R14)
3116 JS error_not_enough_literals
3117 CMPQ AX, $0x00020002
3118 JA sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big
3119 TESTQ R13, R13
3120 JNZ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok
3121 TESTQ AX, AX
3122 JNZ sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch
3123
 // From here on: AX = literal length, CX = match offset, R13 = match length.
3124sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
3125 MOVQ 24(SP), AX
3126 MOVQ 8(SP), CX
3127 MOVQ 16(SP), R13
3128
3129 // Check if we have enough space in s.out
3130 LEAQ (AX)(R13*1), R14
3131 ADDQ R10, R14
3132 CMPQ R14, 32(SP)
3133 JA error_not_enough_space
3134
3135 // Copy literals
 // Lengths of 16 or more are copied in 16-byte chunks; the tail is covered by
 // one more 16-byte copy that ends exactly at the last byte.
3136 TESTQ AX, AX
3137 JZ check_offset
3138 MOVQ AX, R14
3139 SUBQ $0x10, R14
3140 JB copy_1_small
3141
3142copy_1_loop:
3143 MOVUPS (R11), X0
3144 MOVUPS X0, (R10)
3145 ADDQ $0x10, R11
3146 ADDQ $0x10, R10
3147 SUBQ $0x10, R14
3148 JAE copy_1_loop
3149 LEAQ 16(R11)(R14*1), R11
3150 LEAQ 16(R10)(R14*1), R10
3151 MOVUPS -16(R11), X0
3152 MOVUPS X0, -16(R10)
3153 JMP copy_1_end
3154
 // Fewer than 16 bytes: dispatch on the length class; each case uses two
 // possibly overlapping moves (first bytes and last bytes) and never touches
 // memory outside the AX-byte range.
3155copy_1_small:
3156 CMPQ AX, $0x03
3157 JE copy_1_move_3
3158 JB copy_1_move_1or2
3159 CMPQ AX, $0x08
3160 JB copy_1_move_4through7
3161 JMP copy_1_move_8through16
3162
3163copy_1_move_1or2:
3164 MOVB (R11), R14
3165 MOVB -1(R11)(AX*1), R15
3166 MOVB R14, (R10)
3167 MOVB R15, -1(R10)(AX*1)
3168 ADDQ AX, R11
3169 ADDQ AX, R10
3170 JMP copy_1_end
3171
3172copy_1_move_3:
3173 MOVW (R11), R14
3174 MOVB 2(R11), R15
3175 MOVW R14, (R10)
3176 MOVB R15, 2(R10)
3177 ADDQ AX, R11
3178 ADDQ AX, R10
3179 JMP copy_1_end
3180
3181copy_1_move_4through7:
3182 MOVL (R11), R14
3183 MOVL -4(R11)(AX*1), R15
3184 MOVL R14, (R10)
3185 MOVL R15, -4(R10)(AX*1)
3186 ADDQ AX, R11
3187 ADDQ AX, R10
3188 JMP copy_1_end
3189
3190copy_1_move_8through16:
3191 MOVQ (R11), R14
3192 MOVQ -8(R11)(AX*1), R15
3193 MOVQ R14, (R10)
3194 MOVQ R15, -8(R10)(AX*1)
3195 ADDQ AX, R11
3196 ADDQ AX, R10
3197
 // outPosition += literal length
3198copy_1_end:
3199 ADDQ AX, R12
3200
3201 // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
3202check_offset:
3203 MOVQ R12, AX
3204 ADDQ 40(SP), AX
3205 CMPQ CX, AX
3206 JG error_match_off_too_big
3207 CMPQ CX, 56(SP)
3208 JG error_match_off_too_big
3209
3210 // Copy match from history
 // AX = match offset - output position. If the subtraction borrows or yields
 // zero, the whole match lies in the current output buffer. Otherwise
 // R14 = past-end history pointer - AX, the start of the match in history.
 // When the match is longer than AX, only AX bytes come from history and the
 // remainder is copied from the output buffer afterwards.
3211 MOVQ CX, AX
3212 SUBQ R12, AX
3213 JLS copy_match
3214 MOVQ 48(SP), R14
3215 SUBQ AX, R14
3216 CMPQ R13, AX
3217 JG copy_all_from_history
3218 MOVQ R13, AX
3219 SUBQ $0x10, AX
3220 JB copy_4_small
3221
3222copy_4_loop:
3223 MOVUPS (R14), X0
3224 MOVUPS X0, (R10)
3225 ADDQ $0x10, R14
3226 ADDQ $0x10, R10
3227 SUBQ $0x10, AX
3228 JAE copy_4_loop
3229 LEAQ 16(R14)(AX*1), R14
3230 LEAQ 16(R10)(AX*1), R10
3231 MOVUPS -16(R14), X0
3232 MOVUPS X0, -16(R10)
3233 JMP copy_4_end
3234
 // NOTE(review): unlike the other small-copy dispatchers there is no 1-or-2
 // byte case here; presumably the match length is always >= 3 on this path -
 // confirm in the generator.
3235copy_4_small:
3236 CMPQ R13, $0x03
3237 JE copy_4_move_3
3238 CMPQ R13, $0x08
3239 JB copy_4_move_4through7
3240 JMP copy_4_move_8through16
3241
3242copy_4_move_3:
3243 MOVW (R14), AX
3244 MOVB 2(R14), CL
3245 MOVW AX, (R10)
3246 MOVB CL, 2(R10)
3247 ADDQ R13, R14
3248 ADDQ R13, R10
3249 JMP copy_4_end
3250
3251copy_4_move_4through7:
3252 MOVL (R14), AX
3253 MOVL -4(R14)(R13*1), CX
3254 MOVL AX, (R10)
3255 MOVL CX, -4(R10)(R13*1)
3256 ADDQ R13, R14
3257 ADDQ R13, R10
3258 JMP copy_4_end
3259
3260copy_4_move_8through16:
3261 MOVQ (R14), AX
3262 MOVQ -8(R14)(R13*1), CX
3263 MOVQ AX, (R10)
3264 MOVQ CX, -8(R10)(R13*1)
3265 ADDQ R13, R14
3266 ADDQ R13, R10
3267
 // outPosition += match length. The second JMP directly follows an
 // unconditional JMP and can never execute.
3268copy_4_end:
3269 ADDQ R13, R12
3270 JMP handle_loop
3271 JMP loop_finished
3272
 // The match starts in history and continues in the output buffer: copy the AX
 // bytes that remain in history first.
3273copy_all_from_history:
3274 MOVQ AX, R15
3275 SUBQ $0x10, R15
3276 JB copy_5_small
3277
3278copy_5_loop:
3279 MOVUPS (R14), X0
3280 MOVUPS X0, (R10)
3281 ADDQ $0x10, R14
3282 ADDQ $0x10, R10
3283 SUBQ $0x10, R15
3284 JAE copy_5_loop
3285 LEAQ 16(R14)(R15*1), R14
3286 LEAQ 16(R10)(R15*1), R10
3287 MOVUPS -16(R14), X0
3288 MOVUPS X0, -16(R10)
3289 JMP copy_5_end
3290
3291copy_5_small:
3292 CMPQ AX, $0x03
3293 JE copy_5_move_3
3294 JB copy_5_move_1or2
3295 CMPQ AX, $0x08
3296 JB copy_5_move_4through7
3297 JMP copy_5_move_8through16
3298
3299copy_5_move_1or2:
3300 MOVB (R14), R15
3301 MOVB -1(R14)(AX*1), BP
3302 MOVB R15, (R10)
3303 MOVB BP, -1(R10)(AX*1)
3304 ADDQ AX, R14
3305 ADDQ AX, R10
3306 JMP copy_5_end
3307
3308copy_5_move_3:
3309 MOVW (R14), R15
3310 MOVB 2(R14), BP
3311 MOVW R15, (R10)
3312 MOVB BP, 2(R10)
3313 ADDQ AX, R14
3314 ADDQ AX, R10
3315 JMP copy_5_end
3316
3317copy_5_move_4through7:
3318 MOVL (R14), R15
3319 MOVL -4(R14)(AX*1), BP
3320 MOVL R15, (R10)
3321 MOVL BP, -4(R10)(AX*1)
3322 ADDQ AX, R14
3323 ADDQ AX, R10
3324 JMP copy_5_end
3325
3326copy_5_move_8through16:
3327 MOVQ (R14), R15
3328 MOVQ -8(R14)(AX*1), BP
3329 MOVQ R15, (R10)
3330 MOVQ BP, -8(R10)(AX*1)
3331 ADDQ AX, R14
3332 ADDQ AX, R10
3333
 // AX bytes came from history: advance the output position and keep the
 // remaining match length in R13.
3334copy_5_end:
3335 ADDQ AX, R12
3336 SUBQ AX, R13
3337
3338 // Copy match from the current buffer
 // AX = source pointer = output write pointer - match offset.
3339copy_match:
3340 MOVQ R10, AX
3341 SUBQ CX, AX
3342
3343 // ml <= mo
3344 CMPQ R13, CX
3345 JA copy_overlapping_match
3346
3347 // Copy non-overlapping match
3348 ADDQ R13, R12
3349 MOVQ R13, CX
3350 SUBQ $0x10, CX
3351 JB copy_2_small
3352
3353copy_2_loop:
3354 MOVUPS (AX), X0
3355 MOVUPS X0, (R10)
3356 ADDQ $0x10, AX
3357 ADDQ $0x10, R10
3358 SUBQ $0x10, CX
3359 JAE copy_2_loop
3360 LEAQ 16(AX)(CX*1), AX
3361 LEAQ 16(R10)(CX*1), R10
3362 MOVUPS -16(AX), X0
3363 MOVUPS X0, -16(R10)
3364 JMP copy_2_end
3365
3366copy_2_small:
3367 CMPQ R13, $0x03
3368 JE copy_2_move_3
3369 JB copy_2_move_1or2
3370 CMPQ R13, $0x08
3371 JB copy_2_move_4through7
3372 JMP copy_2_move_8through16
3373
3374copy_2_move_1or2:
3375 MOVB (AX), CL
3376 MOVB -1(AX)(R13*1), R14
3377 MOVB CL, (R10)
3378 MOVB R14, -1(R10)(R13*1)
3379 ADDQ R13, AX
3380 ADDQ R13, R10
3381 JMP copy_2_end
3382
3383copy_2_move_3:
3384 MOVW (AX), CX
3385 MOVB 2(AX), R14
3386 MOVW CX, (R10)
3387 MOVB R14, 2(R10)
3388 ADDQ R13, AX
3389 ADDQ R13, R10
3390 JMP copy_2_end
3391
3392copy_2_move_4through7:
3393 MOVL (AX), CX
3394 MOVL -4(AX)(R13*1), R14
3395 MOVL CX, (R10)
3396 MOVL R14, -4(R10)(R13*1)
3397 ADDQ R13, AX
3398 ADDQ R13, R10
3399 JMP copy_2_end
3400
3401copy_2_move_8through16:
3402 MOVQ (AX), CX
3403 MOVQ -8(AX)(R13*1), R14
3404 MOVQ CX, (R10)
3405 MOVQ R14, -8(R10)(R13*1)
3406 ADDQ R13, AX
3407 ADDQ R13, R10
3408
3409copy_2_end:
3410 JMP handle_loop
3411
3412 // Copy overlapping match
 // The match is longer than its offset, so source and destination overlap:
 // copy one byte at a time in ascending order so that freshly written bytes
 // are read back as later source bytes.
3413copy_overlapping_match:
3414 ADDQ R13, R12
3415
3416copy_slow_3:
3417 MOVB (AX), CL
3418 MOVB CL, (R10)
3419 INCQ AX
3420 INCQ R10
3421 DECQ R13
3422 JNZ copy_slow_3
3423
 // Decrement the counter at 96(ctx) and loop while it is still non-negative.
3424handle_loop:
3425 MOVQ ctx+16(FP), AX
3426 DECQ 96(AX)
3427 JNS sequenceDecs_decodeSync_safe_amd64_main_loop
3428
 // Store the bit reader state (container, consumed bits, offset) back into br.
3429loop_finished:
3430 MOVQ br+8(FP), AX
3431 MOVQ DX, 32(AX)
3432 MOVB BL, 40(AX)
3433 MOVQ SI, 24(AX)
3434
3435 // Update the context
 // 136(ctx) = output position; 168(ctx) = literals pointer minus the value at 144(ctx).
3436 MOVQ ctx+16(FP), AX
3437 MOVQ R12, 136(AX)
3438 MOVQ 144(AX), CX
3439 SUBQ CX, R11
3440 MOVQ R11, 168(AX)
3441
3442 // Return success
3443 MOVQ $0x00000000, ret+24(FP)
3444 RET
3445
3446 // Return with match length error
 // Stores the match length at 216(ctx) and returns 1.
3447sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch:
3448 MOVQ 16(SP), AX
3449 MOVQ ctx+16(FP), CX
3450 MOVQ AX, 216(CX)
3451 MOVQ $0x00000001, ret+24(FP)
3452 RET
3453
3454 // Return with match too long error
 // Stores the match length at 216(ctx) and returns 2.
3455sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big:
3456 MOVQ ctx+16(FP), AX
3457 MOVQ 16(SP), CX
3458 MOVQ CX, 216(AX)
3459 MOVQ $0x00000002, ret+24(FP)
3460 RET
3461
3462 // Return with match offset too long error
 // Stores the match offset at 224(ctx) and the output position at 136(ctx); returns 3.
3463error_match_off_too_big:
3464 MOVQ ctx+16(FP), AX
3465 MOVQ 8(SP), CX
3466 MOVQ CX, 224(AX)
3467 MOVQ R12, 136(AX)
3468 MOVQ $0x00000003, ret+24(FP)
3469 RET
3470
3471 // Return with not enough literals error
 // Stores the literal length at 208(ctx) and returns 4.
3472error_not_enough_literals:
3473 MOVQ ctx+16(FP), AX
3474 MOVQ 24(SP), CX
3475 MOVQ CX, 208(AX)
3476 MOVQ $0x00000004, ret+24(FP)
3477 RET
3478
3479 // Return with not enough output space error
 // Stores literal length (208), match length (216) and output position (136) in ctx; returns 5.
3480error_not_enough_space:
3481 MOVQ ctx+16(FP), AX
3482 MOVQ 24(SP), CX
3483 MOVQ CX, 208(AX)
3484 MOVQ 16(SP), CX
3485 MOVQ CX, 216(AX)
3486 MOVQ R12, 136(AX)
3487 MOVQ $0x00000005, ret+24(FP)
3488 RET
3489
3490// func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
3491// Requires: BMI, BMI2, CMOV, SSE
//
// For every sequence this routine decodes the match offset, match length and
// literal length from the bit stream (using BEXTR/BZHI/SHRX for the bit field
// extraction), validates them, copies the literals to the output and then
// copies the match (from history and/or from the output written so far). It
// returns 0 on success or one of the error codes 1-5 written by the return
// blocks at the end of the function.
//
// Register use inside the main loop:
//   AX         bit container (loaded from 32(br), stored back on success)
//   DX         number of bits already consumed from AX (byte at 40(br))
//   BX         remaining input offset (24(br))
//   SI, DI, R8 literal-length, match-length and offset decoder states
//   R9         current output write pointer
//   R10        current literals read pointer
//   R11        output position
// Stack slots:
//   0(SP)  input read pointer      8(SP)  match offset
//   16(SP) match length            24(SP) literal length
//   32(SP) past-end output pointer 40(SP) history length
//   48(SP) past-end history ptr    56(SP) window size
// NOTE(review): the Go field names behind the numeric br/ctx/s offsets are not
// visible in this file; confirm against the struct definitions.
3492TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
 // Load the bit reader: 32(br) -> AX, byte 40(br) -> DX, 24(br) -> BX.
 // (SP) keeps pointer (br) + BX, the current read position in the input.
3493 MOVQ br+8(FP), CX
3494 MOVQ 32(CX), AX
3495 MOVBQZX 40(CX), DX
3496 MOVQ 24(CX), BX
3497 MOVQ (CX), CX
3498 ADDQ BX, CX
3499 MOVQ CX, (SP)
 // Load the three decoder states from the context.
3500 MOVQ ctx+16(FP), CX
3501 MOVQ 72(CX), SI
3502 MOVQ 80(CX), DI
3503 MOVQ 88(CX), R8
 // Zero the per-sequence slots (match offset, match length, literal length).
3504 XORQ R9, R9
3505 MOVQ R9, 8(SP)
3506 MOVQ R9, 16(SP)
3507 MOVQ R9, 24(SP)
 // R9 = output base; 32(SP) = value at 128(ctx), made a past-end pointer below.
3508 MOVQ 112(CX), R9
3509 MOVQ 128(CX), R10
3510 MOVQ R10, 32(SP)
 // R10 = literals pointer, R11 = output position, 56(SP) = window size.
3511 MOVQ 144(CX), R10
3512 MOVQ 136(CX), R11
3513 MOVQ 200(CX), R12
3514 MOVQ R12, 56(SP)
 // 40(SP) = history length, 48(SP) = history base + history length.
3515 MOVQ 176(CX), R12
3516 MOVQ R12, 48(SP)
3517 MOVQ 184(CX), CX
3518 MOVQ CX, 40(SP)
3519 MOVQ 40(SP), CX
3520 ADDQ CX, 48(SP)
3521
3522 // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
3523 ADDQ R9, 32(SP)
3524
3525 // outBase += outPosition
3526 ADDQ R11, R9
3527
3528sequenceDecs_decodeSync_safe_bmi2_main_loop:
3529 MOVQ (SP), R12
3530
3531 // Fill bitreader to have enough for the offset and match length.
 // Fast path (at least 8 input bytes left): step the input pointer back by the
 // whole bytes already consumed and reload a full 64-bit container.
3532 CMPQ BX, $0x08
3533 JL sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
3534 MOVQ DX, CX
3535 SHRQ $0x03, CX
3536 SUBQ CX, R12
3537 MOVQ (R12), AX
3538 SUBQ CX, BX
3539 ANDQ $0x07, DX
3540 JMP sequenceDecs_decodeSync_safe_bmi2_fill_end
3541
 // Slow path: shift in one byte at a time while input remains and at least
 // 8 bits of the container have been consumed.
3542sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
3543 CMPQ BX, $0x00
3544 JLE sequenceDecs_decodeSync_safe_bmi2_fill_end
3545 CMPQ DX, $0x07
3546 JLE sequenceDecs_decodeSync_safe_bmi2_fill_end
3547 SHLQ $0x08, AX
3548 SUBQ $0x01, R12
3549 SUBQ $0x01, BX
3550 SUBQ $0x08, DX
3551 MOVBQZX (R12), CX
3552 ORQ CX, AX
3553 JMP sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
3554
3555sequenceDecs_decodeSync_safe_bmi2_fill_end:
3556 // Update offset
 // BEXTR control 0x0808 = start bit 8, length 8: R13 = number of extra bits
 // held in bits 8-15 of the state. The container is rotated left by
 // (consumed bits + R13) and BZHI keeps the low R13 bits; DX becomes the new
 // consumed-bit count. Result = bits 32-63 of the state (base value) + extra bits.
3557 MOVQ $0x00000808, CX
3558 BEXTRQ CX, R8, R13
3559 MOVQ AX, R14
3560 LEAQ (DX)(R13*1), CX
3561 ROLQ CL, R14
3562 BZHIQ R13, R14, R14
3563 MOVQ CX, DX
3564 MOVQ R8, CX
3565 SHRQ $0x20, CX
3566 ADDQ R14, CX
3567 MOVQ CX, 8(SP)
3568
3569 // Update match length
 // Same extraction as for the offset, driven by the match-length state in DI.
3570 MOVQ $0x00000808, CX
3571 BEXTRQ CX, DI, R13
3572 MOVQ AX, R14
3573 LEAQ (DX)(R13*1), CX
3574 ROLQ CL, R14
3575 BZHIQ R13, R14, R14
3576 MOVQ CX, DX
3577 MOVQ DI, CX
3578 SHRQ $0x20, CX
3579 ADDQ R14, CX
3580 MOVQ CX, 16(SP)
3581
3582 // Fill bitreader to have enough for the remaining
3583 CMPQ BX, $0x08
3584 JL sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
3585 MOVQ DX, CX
3586 SHRQ $0x03, CX
3587 SUBQ CX, R12
3588 MOVQ (R12), AX
3589 SUBQ CX, BX
3590 ANDQ $0x07, DX
3591 JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_end
3592
3593sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
3594 CMPQ BX, $0x00
3595 JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end
3596 CMPQ DX, $0x07
3597 JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end
3598 SHLQ $0x08, AX
3599 SUBQ $0x01, R12
3600 SUBQ $0x01, BX
3601 SUBQ $0x08, DX
3602 MOVBQZX (R12), CX
3603 ORQ CX, AX
3604 JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
3605
3606sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
3607 // Update literal length
 // Same extraction again, driven by the literal-length state in SI.
3608 MOVQ $0x00000808, CX
3609 BEXTRQ CX, SI, R13
3610 MOVQ AX, R14
3611 LEAQ (DX)(R13*1), CX
3612 ROLQ CL, R14
3613 BZHIQ R13, R14, R14
3614 MOVQ CX, DX
3615 MOVQ SI, CX
3616 SHRQ $0x20, CX
3617 ADDQ R14, CX
3618 MOVQ CX, 24(SP)
3619
3620 // Fill bitreader for state updates
 // Save the input pointer. R12 = bits 8-15 of the offset state (its extra-bit
 // count), kept for the offset adjustment below. When the counter at 96(ctx)
 // is zero this is the last sequence, so the state updates are skipped.
 // Otherwise R13 = low byte of (SI + DI + R8), the combined bit count for all
 // three state updates, and R14 receives those bits in a single read.
3621 MOVQ R12, (SP)
3622 MOVQ $0x00000808, CX
3623 BEXTRQ CX, R8, R12
3624 MOVQ ctx+16(FP), CX
3625 CMPQ 96(CX), $0x00
3626 JZ sequenceDecs_decodeSync_safe_bmi2_skip_update
3627 LEAQ (SI)(DI*1), R13
3628 ADDQ R8, R13
3629 MOVBQZX R13, R13
3630 LEAQ (DX)(R13*1), CX
3631 MOVQ AX, R14
3632 MOVQ CX, DX
3633 ROLQ CL, R14
3634 BZHIQ R13, R14, R14
3635
3636 // Update Offset State
 // The low byte of the state is the bit count: BZHI takes that many low bits
 // of R14 and SHRX then drops them. BEXTR control 0x1010 = start bit 16,
 // length 16: the base of the next table index.
3637 BZHIQ R8, R14, CX
3638 SHRXQ R8, R14, R14
3639 MOVQ $0x00001010, R13
3640 BEXTRQ R13, R8, R8
3641 ADDQ CX, R8
3642
3643 // Load ctx.ofTable
3644 MOVQ ctx+16(FP), CX
3645 MOVQ 48(CX), CX
3646 MOVQ (CX)(R8*8), R8
3647
3648 // Update Match Length State
3649 BZHIQ DI, R14, CX
3650 SHRXQ DI, R14, R14
3651 MOVQ $0x00001010, R13
3652 BEXTRQ R13, DI, DI
3653 ADDQ CX, DI
3654
3655 // Load ctx.mlTable
3656 MOVQ ctx+16(FP), CX
3657 MOVQ 24(CX), CX
3658 MOVQ (CX)(DI*8), DI
3659
3660 // Update Literal Length State
 // Last consumer of R14, so no SHRX is needed afterwards.
3661 BZHIQ SI, R14, CX
3662 MOVQ $0x00001010, R13
3663 BEXTRQ R13, SI, SI
3664 ADDQ CX, SI
3665
3666 // Load ctx.llTable
3667 MOVQ ctx+16(FP), CX
3668 MOVQ (CX), CX
3669 MOVQ (CX)(SI*8), SI
3670
3671sequenceDecs_decodeSync_safe_bmi2_skip_update:
3672 // Adjust offset
 // R13 = decoded offset value. With more than one extra offset bit (R12 > 1)
 // the value is used as-is and pushed onto the three stored offsets at
 // 144/152/160(s): the old 144 and 152 move to 152 and 160.
3673 MOVQ s+0(FP), CX
3674 MOVQ 8(SP), R13
3675 CMPQ R12, $0x01
3676 JBE sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0
3677 MOVUPS 144(CX), X0
3678 MOVQ R13, 144(CX)
3679 MOVUPS X0, 152(CX)
3680 JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust
3681
 // Otherwise the value selects one of the stored offsets; it is incremented
 // first when the literal length is zero.
3682sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
3683 CMPQ 24(SP), $0x00000000
3684 JNE sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero
3685 INCQ R13
3686 JMP sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
3687
 // Value 0 with a non-zero literal length: reuse the offset at 144(s) unchanged.
3688sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
3689 TESTQ R13, R13
3690 JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
3691 MOVQ 144(CX), R13
3692 JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust
3693
 // R14 = stored offset number R13, except that value 3 yields (offset at
 // 144(s)) - 1. The ADDQ sets ZF, and a zero result is replaced by 1.
3694sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
3695 MOVQ R13, R12
3696 XORQ R14, R14
3697 MOVQ $-1, R15
3698 CMPQ R13, $0x03
3699 CMOVQEQ R14, R12
3700 CMOVQEQ R15, R14
3701 ADDQ 144(CX)(R12*8), R14
3702 JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
3703 MOVQ $0x00000001, R14
3704
 // Rotate the stored offsets: 160 <- 152 (skipped when R13 == 1), 152 <- 144,
 // 144 <- R14. R14 also becomes the offset of this sequence.
3705sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid:
3706 CMPQ R13, $0x01
3707 JZ sequenceDecs_decodeSync_safe_bmi2_adjust_skip
3708 MOVQ 152(CX), R12
3709 MOVQ R12, 160(CX)
3710
3711sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
3712 MOVQ 144(CX), R12
3713 MOVQ R12, 152(CX)
3714 MOVQ R14, 144(CX)
3715 MOVQ R14, R13
3716
3717sequenceDecs_decodeSync_safe_bmi2_after_adjust:
3718 MOVQ R13, 8(SP)
3719
3720 // Check values
 // CX = match length, R12 = literal length. 256(s) accumulates ml + ll.
 // 104(ctx) is decremented by ll and must not become negative. A match length
 // above 0x20002 is rejected, as is a non-zero match length with a zero offset.
3721 MOVQ 16(SP), CX
3722 MOVQ 24(SP), R12
3723 LEAQ (CX)(R12*1), R14
3724 MOVQ s+0(FP), R15
3725 ADDQ R14, 256(R15)
3726 MOVQ ctx+16(FP), R14
3727 SUBQ R12, 104(R14)
3728 JS error_not_enough_literals
3729 CMPQ CX, $0x00020002
3730 JA sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big
3731 TESTQ R13, R13
3732 JNZ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok
3733 TESTQ CX, CX
3734 JNZ sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch
3735
 // From here on: CX = literal length, R12 = match offset, R13 = match length.
3736sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
3737 MOVQ 24(SP), CX
3738 MOVQ 8(SP), R12
3739 MOVQ 16(SP), R13
3740
3741 // Check if we have enough space in s.out
3742 LEAQ (CX)(R13*1), R14
3743 ADDQ R9, R14
3744 CMPQ R14, 32(SP)
3745 JA error_not_enough_space
3746
3747 // Copy literals
 // Lengths of 16 or more are copied in 16-byte chunks; the tail is covered by
 // one more 16-byte copy that ends exactly at the last byte.
3748 TESTQ CX, CX
3749 JZ check_offset
3750 MOVQ CX, R14
3751 SUBQ $0x10, R14
3752 JB copy_1_small
3753
3754copy_1_loop:
3755 MOVUPS (R10), X0
3756 MOVUPS X0, (R9)
3757 ADDQ $0x10, R10
3758 ADDQ $0x10, R9
3759 SUBQ $0x10, R14
3760 JAE copy_1_loop
3761 LEAQ 16(R10)(R14*1), R10
3762 LEAQ 16(R9)(R14*1), R9
3763 MOVUPS -16(R10), X0
3764 MOVUPS X0, -16(R9)
3765 JMP copy_1_end
3766
 // Fewer than 16 bytes: dispatch on the length class; each case uses two
 // possibly overlapping moves (first bytes and last bytes) and never touches
 // memory outside the CX-byte range.
3767copy_1_small:
3768 CMPQ CX, $0x03
3769 JE copy_1_move_3
3770 JB copy_1_move_1or2
3771 CMPQ CX, $0x08
3772 JB copy_1_move_4through7
3773 JMP copy_1_move_8through16
3774
3775copy_1_move_1or2:
3776 MOVB (R10), R14
3777 MOVB -1(R10)(CX*1), R15
3778 MOVB R14, (R9)
3779 MOVB R15, -1(R9)(CX*1)
3780 ADDQ CX, R10
3781 ADDQ CX, R9
3782 JMP copy_1_end
3783
3784copy_1_move_3:
3785 MOVW (R10), R14
3786 MOVB 2(R10), R15
3787 MOVW R14, (R9)
3788 MOVB R15, 2(R9)
3789 ADDQ CX, R10
3790 ADDQ CX, R9
3791 JMP copy_1_end
3792
3793copy_1_move_4through7:
3794 MOVL (R10), R14
3795 MOVL -4(R10)(CX*1), R15
3796 MOVL R14, (R9)
3797 MOVL R15, -4(R9)(CX*1)
3798 ADDQ CX, R10
3799 ADDQ CX, R9
3800 JMP copy_1_end
3801
3802copy_1_move_8through16:
3803 MOVQ (R10), R14
3804 MOVQ -8(R10)(CX*1), R15
3805 MOVQ R14, (R9)
3806 MOVQ R15, -8(R9)(CX*1)
3807 ADDQ CX, R10
3808 ADDQ CX, R9
3809
 // outPosition += literal length
3810copy_1_end:
3811 ADDQ CX, R11
3812
3813 // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
3814check_offset:
3815 MOVQ R11, CX
3816 ADDQ 40(SP), CX
3817 CMPQ R12, CX
3818 JG error_match_off_too_big
3819 CMPQ R12, 56(SP)
3820 JG error_match_off_too_big
3821
3822 // Copy match from history
 // CX = match offset - output position. If the subtraction borrows or yields
 // zero, the whole match lies in the current output buffer. Otherwise
 // R14 = past-end history pointer - CX, the start of the match in history.
 // When the match is longer than CX, only CX bytes come from history and the
 // remainder is copied from the output buffer afterwards.
3823 MOVQ R12, CX
3824 SUBQ R11, CX
3825 JLS copy_match
3826 MOVQ 48(SP), R14
3827 SUBQ CX, R14
3828 CMPQ R13, CX
3829 JG copy_all_from_history
3830 MOVQ R13, CX
3831 SUBQ $0x10, CX
3832 JB copy_4_small
3833
3834copy_4_loop:
3835 MOVUPS (R14), X0
3836 MOVUPS X0, (R9)
3837 ADDQ $0x10, R14
3838 ADDQ $0x10, R9
3839 SUBQ $0x10, CX
3840 JAE copy_4_loop
3841 LEAQ 16(R14)(CX*1), R14
3842 LEAQ 16(R9)(CX*1), R9
3843 MOVUPS -16(R14), X0
3844 MOVUPS X0, -16(R9)
3845 JMP copy_4_end
3846
 // NOTE(review): unlike the other small-copy dispatchers there is no 1-or-2
 // byte case here; presumably the match length is always >= 3 on this path -
 // confirm in the generator.
3847copy_4_small:
3848 CMPQ R13, $0x03
3849 JE copy_4_move_3
3850 CMPQ R13, $0x08
3851 JB copy_4_move_4through7
3852 JMP copy_4_move_8through16
3853
3854copy_4_move_3:
3855 MOVW (R14), CX
3856 MOVB 2(R14), R12
3857 MOVW CX, (R9)
3858 MOVB R12, 2(R9)
3859 ADDQ R13, R14
3860 ADDQ R13, R9
3861 JMP copy_4_end
3862
3863copy_4_move_4through7:
3864 MOVL (R14), CX
3865 MOVL -4(R14)(R13*1), R12
3866 MOVL CX, (R9)
3867 MOVL R12, -4(R9)(R13*1)
3868 ADDQ R13, R14
3869 ADDQ R13, R9
3870 JMP copy_4_end
3871
3872copy_4_move_8through16:
3873 MOVQ (R14), CX
3874 MOVQ -8(R14)(R13*1), R12
3875 MOVQ CX, (R9)
3876 MOVQ R12, -8(R9)(R13*1)
3877 ADDQ R13, R14
3878 ADDQ R13, R9
3879
 // outPosition += match length. The second JMP directly follows an
 // unconditional JMP and can never execute.
3880copy_4_end:
3881 ADDQ R13, R11
3882 JMP handle_loop
3883 JMP loop_finished
3884
 // The match starts in history and continues in the output buffer: copy the CX
 // bytes that remain in history first.
3885copy_all_from_history:
3886 MOVQ CX, R15
3887 SUBQ $0x10, R15
3888 JB copy_5_small
3889
3890copy_5_loop:
3891 MOVUPS (R14), X0
3892 MOVUPS X0, (R9)
3893 ADDQ $0x10, R14
3894 ADDQ $0x10, R9
3895 SUBQ $0x10, R15
3896 JAE copy_5_loop
3897 LEAQ 16(R14)(R15*1), R14
3898 LEAQ 16(R9)(R15*1), R9
3899 MOVUPS -16(R14), X0
3900 MOVUPS X0, -16(R9)
3901 JMP copy_5_end
3902
3903copy_5_small:
3904 CMPQ CX, $0x03
3905 JE copy_5_move_3
3906 JB copy_5_move_1or2
3907 CMPQ CX, $0x08
3908 JB copy_5_move_4through7
3909 JMP copy_5_move_8through16
3910
3911copy_5_move_1or2:
3912 MOVB (R14), R15
3913 MOVB -1(R14)(CX*1), BP
3914 MOVB R15, (R9)
3915 MOVB BP, -1(R9)(CX*1)
3916 ADDQ CX, R14
3917 ADDQ CX, R9
3918 JMP copy_5_end
3919
3920copy_5_move_3:
3921 MOVW (R14), R15
3922 MOVB 2(R14), BP
3923 MOVW R15, (R9)
3924 MOVB BP, 2(R9)
3925 ADDQ CX, R14
3926 ADDQ CX, R9
3927 JMP copy_5_end
3928
3929copy_5_move_4through7:
3930 MOVL (R14), R15
3931 MOVL -4(R14)(CX*1), BP
3932 MOVL R15, (R9)
3933 MOVL BP, -4(R9)(CX*1)
3934 ADDQ CX, R14
3935 ADDQ CX, R9
3936 JMP copy_5_end
3937
3938copy_5_move_8through16:
3939 MOVQ (R14), R15
3940 MOVQ -8(R14)(CX*1), BP
3941 MOVQ R15, (R9)
3942 MOVQ BP, -8(R9)(CX*1)
3943 ADDQ CX, R14
3944 ADDQ CX, R9
3945
 // CX bytes came from history: advance the output position and keep the
 // remaining match length in R13.
3946copy_5_end:
3947 ADDQ CX, R11
3948 SUBQ CX, R13
3949
3950 // Copy match from the current buffer
 // CX = source pointer = output write pointer - match offset.
3951copy_match:
3952 MOVQ R9, CX
3953 SUBQ R12, CX
3954
3955 // ml <= mo
3956 CMPQ R13, R12
3957 JA copy_overlapping_match
3958
3959 // Copy non-overlapping match
3960 ADDQ R13, R11
3961 MOVQ R13, R12
3962 SUBQ $0x10, R12
3963 JB copy_2_small
3964
3965copy_2_loop:
3966 MOVUPS (CX), X0
3967 MOVUPS X0, (R9)
3968 ADDQ $0x10, CX
3969 ADDQ $0x10, R9
3970 SUBQ $0x10, R12
3971 JAE copy_2_loop
3972 LEAQ 16(CX)(R12*1), CX
3973 LEAQ 16(R9)(R12*1), R9
3974 MOVUPS -16(CX), X0
3975 MOVUPS X0, -16(R9)
3976 JMP copy_2_end
3977
3978copy_2_small:
3979 CMPQ R13, $0x03
3980 JE copy_2_move_3
3981 JB copy_2_move_1or2
3982 CMPQ R13, $0x08
3983 JB copy_2_move_4through7
3984 JMP copy_2_move_8through16
3985
3986copy_2_move_1or2:
3987 MOVB (CX), R12
3988 MOVB -1(CX)(R13*1), R14
3989 MOVB R12, (R9)
3990 MOVB R14, -1(R9)(R13*1)
3991 ADDQ R13, CX
3992 ADDQ R13, R9
3993 JMP copy_2_end
3994
3995copy_2_move_3:
3996 MOVW (CX), R12
3997 MOVB 2(CX), R14
3998 MOVW R12, (R9)
3999 MOVB R14, 2(R9)
4000 ADDQ R13, CX
4001 ADDQ R13, R9
4002 JMP copy_2_end
4003
4004copy_2_move_4through7:
4005 MOVL (CX), R12
4006 MOVL -4(CX)(R13*1), R14
4007 MOVL R12, (R9)
4008 MOVL R14, -4(R9)(R13*1)
4009 ADDQ R13, CX
4010 ADDQ R13, R9
4011 JMP copy_2_end
4012
4013copy_2_move_8through16:
4014 MOVQ (CX), R12
4015 MOVQ -8(CX)(R13*1), R14
4016 MOVQ R12, (R9)
4017 MOVQ R14, -8(R9)(R13*1)
4018 ADDQ R13, CX
4019 ADDQ R13, R9
4020
4021copy_2_end:
4022 JMP handle_loop
4023
4024 // Copy overlapping match
 // The match is longer than its offset, so source and destination overlap:
 // copy one byte at a time in ascending order so that freshly written bytes
 // are read back as later source bytes.
4025copy_overlapping_match:
4026 ADDQ R13, R11
4027
4028copy_slow_3:
4029 MOVB (CX), R12
4030 MOVB R12, (R9)
4031 INCQ CX
4032 INCQ R9
4033 DECQ R13
4034 JNZ copy_slow_3
4035
 // Decrement the counter at 96(ctx) and loop while it is still non-negative.
4036handle_loop:
4037 MOVQ ctx+16(FP), CX
4038 DECQ 96(CX)
4039 JNS sequenceDecs_decodeSync_safe_bmi2_main_loop
4040
 // Store the bit reader state (container, consumed bits, offset) back into br.
4041loop_finished:
4042 MOVQ br+8(FP), CX
4043 MOVQ AX, 32(CX)
4044 MOVB DL, 40(CX)
4045 MOVQ BX, 24(CX)
4046
4047 // Update the context
 // 136(ctx) = output position; 168(ctx) = literals pointer minus the value at 144(ctx).
4048 MOVQ ctx+16(FP), AX
4049 MOVQ R11, 136(AX)
4050 MOVQ 144(AX), CX
4051 SUBQ CX, R10
4052 MOVQ R10, 168(AX)
4053
4054 // Return success
4055 MOVQ $0x00000000, ret+24(FP)
4056 RET
4057
4058 // Return with match length error
 // Stores the match length at 216(ctx) and returns 1.
4059sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch:
4060 MOVQ 16(SP), AX
4061 MOVQ ctx+16(FP), CX
4062 MOVQ AX, 216(CX)
4063 MOVQ $0x00000001, ret+24(FP)
4064 RET
4065
4066 // Return with match too long error
 // Stores the match length at 216(ctx) and returns 2.
4067sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big:
4068 MOVQ ctx+16(FP), AX
4069 MOVQ 16(SP), CX
4070 MOVQ CX, 216(AX)
4071 MOVQ $0x00000002, ret+24(FP)
4072 RET
4073
4074 // Return with match offset too long error
 // Stores the match offset at 224(ctx) and the output position at 136(ctx); returns 3.
4075error_match_off_too_big:
4076 MOVQ ctx+16(FP), AX
4077 MOVQ 8(SP), CX
4078 MOVQ CX, 224(AX)
4079 MOVQ R11, 136(AX)
4080 MOVQ $0x00000003, ret+24(FP)
4081 RET
4082
4083 // Return with not enough literals error
 // Stores the literal length at 208(ctx) and returns 4.
4084error_not_enough_literals:
4085 MOVQ ctx+16(FP), AX
4086 MOVQ 24(SP), CX
4087 MOVQ CX, 208(AX)
4088 MOVQ $0x00000004, ret+24(FP)
4089 RET
4090
4091 // Return with not enough output space error
 // Stores literal length (208), match length (216) and output position (136) in ctx; returns 5.
4092error_not_enough_space:
4093 MOVQ ctx+16(FP), AX
4094 MOVQ 24(SP), CX
4095 MOVQ CX, 208(AX)
4096 MOVQ 16(SP), CX
4097 MOVQ CX, 216(AX)
4098 MOVQ R11, 136(AX)
4099 MOVQ $0x00000005, ret+24(FP)
4100 RET