blob: dd1a5aecd6518ff5b9b49a4a7799db0b752a7c92 [file] [log] [blame]
Akash Reddy Kankanalac0014632025-05-21 17:12:20 +05301// Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT.
2
3//go:build amd64 && !appengine && !noasm && gc
4// +build amd64,!appengine,!noasm,gc
5
6// func decompress4x_main_loop_amd64(ctx *decompress4xContext)
//
// Decodes four bit streams in lock-step. Every loop iteration refills each
// stream's bit reader if needed, decodes two table entries per stream and
// stores the two resulting bytes with a single 16-bit write.
//
// Register roles (names follow the inline comments; the Go struct layouts are
// not visible in this file):
//   DI  = byte at ctx+8, the right-shift count used for every table lookup (peekBits)
//   SI  = ctx+16, write position of stream 0; advanced by 2 per iteration
//   BX  = ctx+48, limit compared against the write position each iteration
//   R9  = ctx+24, distance added to the write pointer between streams (dstEvery)
//   R10 = ctx+32, base address of the 2-byte table entries
//   R11 = ctx+0, base of four bit-reader records spaced 48 bytes apart;
//         per record: +0 input base, +24 off, +32 value, +40 bitsRead
//   DL  = "exhausted" flag; the loop ends once it is non-zero
//
// Table entry layout as used here: low byte = number of bits to consume,
// high byte = decoded output byte.
//
// On exit ctx+40 receives (SI - ctx+16) * 4.
7TEXT ·decompress4x_main_loop_amd64(SB), $0-8
8 XORQ DX, DX // clear all of DX; only DL is written afterwards
9
10 // Preload values
11 MOVQ ctx+0(FP), AX
12 MOVBQZX 8(AX), DI // DI = peekBits shift count
13 MOVQ 16(AX), SI // SI = write position of stream 0
14 MOVQ 48(AX), BX // BX = write-position limit
15 MOVQ 24(AX), R9 // R9 = per-stream output distance
16 MOVQ 32(AX), R10 // R10 = table base
17 MOVQ (AX), R11 // R11 = base of the bit-reader records
18
19 // Main loop
20main_loop:
21 MOVQ SI, R8 // R8 walks the four per-stream write positions
22 CMPQ R8, BX
23 SETGE DL // exhausted = write position >= limit (signed); overwrites last iteration's flag
24
25 // br0.fillFast32()
26 MOVQ 32(R11), R12 // R12 = br0.value
27 MOVBQZX 40(R11), R13 // R13 = br0.bitsRead
28 CMPQ R13, $0x20
29 JBE skip_fill0 // refill only when bitsRead > 32
30 MOVQ 24(R11), AX // AX = br0.off
31 SUBQ $0x20, R13 // bitsRead -= 32
32 SUBQ $0x04, AX // off -= 4
33 MOVQ (R11), R14 // R14 = input base of br0
34
35 // b.value |= uint64(low) << (b.bitsRead & 63)
36 MOVL (AX)(R14*1), R14 // load 4 input bytes at base+off
37 MOVQ R13, CX // shift count must be in CL
38 SHLQ CL, R14
39 MOVQ AX, 24(R11) // store the new off
40 ORQ R14, R12
41
42 // exhausted = exhausted || (br0.off < 4)
43 CMPQ AX, $0x04
44 SETLT AL // AL is free here; it is overwritten by the first decoded byte below
45 ORB AL, DL
46
47skip_fill0:
48 // val0 := br0.peekTopBits(peekBits)
49 MOVQ R12, R14
50 MOVQ DI, CX
51 SHRQ CL, R14 // R14 = value >> peekBits = table index
52
53 // v0 := table[val0&mask]
54 MOVW (R10)(R14*2), CX // CL = bits to consume, CH = decoded byte
55
56 // br0.advance(uint8(v0.entry))
57 MOVB CH, AL // first output byte
58 SHLQ CL, R12 // drop the consumed bits from value
59 ADDB CL, R13 // bitsRead += consumed bits
60
61 // val1 := br0.peekTopBits(peekBits)
62 MOVQ DI, CX
63 MOVQ R12, R14
64 SHRQ CL, R14
65
66 // v1 := table[val1&mask]
67 MOVW (R10)(R14*2), CX
68
69 // br0.advance(uint8(v1.entry))
70 MOVB CH, AH // second output byte
71 SHLQ CL, R12
72 ADDB CL, R13
73
74 // these two writes get coalesced
75 // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
76 // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
77 MOVW AX, (R8) // AL then AH are stored at (R8) and 1(R8)
78
79 // update the bitreader structure
80 MOVQ R12, 32(R11)
81 MOVB R13, 40(R11)
82 ADDQ R9, R8 // move to the next stream's write position
83
84 // br1.fillFast32()
// Same sequence as br0, using the record at R11+48.
85 MOVQ 80(R11), R12
86 MOVBQZX 88(R11), R13
87 CMPQ R13, $0x20
88 JBE skip_fill1
89 MOVQ 72(R11), AX
90 SUBQ $0x20, R13
91 SUBQ $0x04, AX
92 MOVQ 48(R11), R14
93
94 // b.value |= uint64(low) << (b.bitsRead & 63)
95 MOVL (AX)(R14*1), R14
96 MOVQ R13, CX
97 SHLQ CL, R14
98 MOVQ AX, 72(R11)
99 ORQ R14, R12
100
101 // exhausted = exhausted || (br1.off < 4)
102 CMPQ AX, $0x04
103 SETLT AL
104 ORB AL, DL
105
106skip_fill1:
107 // val0 := br1.peekTopBits(peekBits)
108 MOVQ R12, R14
109 MOVQ DI, CX
110 SHRQ CL, R14
111
112 // v0 := table[val0&mask]
113 MOVW (R10)(R14*2), CX
114
115 // br1.advance(uint8(v0.entry))
116 MOVB CH, AL
117 SHLQ CL, R12
118 ADDB CL, R13
119
120 // val1 := br1.peekTopBits(peekBits)
121 MOVQ DI, CX
122 MOVQ R12, R14
123 SHRQ CL, R14
124
125 // v1 := table[val1&mask]
126 MOVW (R10)(R14*2), CX
127
128 // br1.advance(uint8(v1.entry))
129 MOVB CH, AH
130 SHLQ CL, R12
131 ADDB CL, R13
132
133 // these two writes get coalesced
134 // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
135 // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
136 MOVW AX, (R8)
137
138 // update the bitreader structure
139 MOVQ R12, 80(R11)
140 MOVB R13, 88(R11)
141 ADDQ R9, R8
142
143 // br2.fillFast32()
// Same sequence as br0, using the record at R11+96.
144 MOVQ 128(R11), R12
145 MOVBQZX 136(R11), R13
146 CMPQ R13, $0x20
147 JBE skip_fill2
148 MOVQ 120(R11), AX
149 SUBQ $0x20, R13
150 SUBQ $0x04, AX
151 MOVQ 96(R11), R14
152
153 // b.value |= uint64(low) << (b.bitsRead & 63)
154 MOVL (AX)(R14*1), R14
155 MOVQ R13, CX
156 SHLQ CL, R14
157 MOVQ AX, 120(R11)
158 ORQ R14, R12
159
160 // exhausted = exhausted || (br2.off < 4)
161 CMPQ AX, $0x04
162 SETLT AL
163 ORB AL, DL
164
165skip_fill2:
166 // val0 := br2.peekTopBits(peekBits)
167 MOVQ R12, R14
168 MOVQ DI, CX
169 SHRQ CL, R14
170
171 // v0 := table[val0&mask]
172 MOVW (R10)(R14*2), CX
173
174 // br2.advance(uint8(v0.entry))
175 MOVB CH, AL
176 SHLQ CL, R12
177 ADDB CL, R13
178
179 // val1 := br2.peekTopBits(peekBits)
180 MOVQ DI, CX
181 MOVQ R12, R14
182 SHRQ CL, R14
183
184 // v1 := table[val1&mask]
185 MOVW (R10)(R14*2), CX
186
187 // br2.advance(uint8(v1.entry))
188 MOVB CH, AH
189 SHLQ CL, R12
190 ADDB CL, R13
191
192 // these two writes get coalesced
193 // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
194 // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
195 MOVW AX, (R8)
196
197 // update the bitreader structure
198 MOVQ R12, 128(R11)
199 MOVB R13, 136(R11)
200 ADDQ R9, R8
201
202 // br3.fillFast32()
// Same sequence as br0, using the record at R11+144.
203 MOVQ 176(R11), R12
204 MOVBQZX 184(R11), R13
205 CMPQ R13, $0x20
206 JBE skip_fill3
207 MOVQ 168(R11), AX
208 SUBQ $0x20, R13
209 SUBQ $0x04, AX
210 MOVQ 144(R11), R14
211
212 // b.value |= uint64(low) << (b.bitsRead & 63)
213 MOVL (AX)(R14*1), R14
214 MOVQ R13, CX
215 SHLQ CL, R14
216 MOVQ AX, 168(R11)
217 ORQ R14, R12
218
219 // exhausted = exhausted || (br3.off < 4)
220 CMPQ AX, $0x04
221 SETLT AL
222 ORB AL, DL
223
224skip_fill3:
225 // val0 := br3.peekTopBits(peekBits)
226 MOVQ R12, R14
227 MOVQ DI, CX
228 SHRQ CL, R14
229
230 // v0 := table[val0&mask]
231 MOVW (R10)(R14*2), CX
232
233 // br3.advance(uint8(v0.entry))
234 MOVB CH, AL
235 SHLQ CL, R12
236 ADDB CL, R13
237
238 // val1 := br3.peekTopBits(peekBits)
239 MOVQ DI, CX
240 MOVQ R12, R14
241 SHRQ CL, R14
242
243 // v1 := table[val1&mask]
244 MOVW (R10)(R14*2), CX
245
246 // br3.advance(uint8(v1.entry))
247 MOVB CH, AH
248 SHLQ CL, R12
249 ADDB CL, R13
250
251 // these two writes get coalesced
252 // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
253 // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
254 MOVW AX, (R8)
255
256 // update the bitreader structure
257 MOVQ R12, 176(R11)
258 MOVB R13, 184(R11)
259 ADDQ $0x02, SI // two bytes were written per stream
260 TESTB DL, DL
261 JZ main_loop // keep going while nothing is exhausted
262 MOVQ ctx+0(FP), AX // AX was used as scratch; reload ctx
263 SUBQ 16(AX), SI // SI = bytes advanced per stream
264 SHLQ $0x02, SI // times four streams
265 MOVQ SI, 40(AX) // report the total through ctx+40
266 RET
267
268// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
//
// Variant of the four-stream loop that decodes four table entries per stream
// per iteration and stores them with a single 32-bit write.
//
// Register roles (names follow the inline comments; the Go struct layouts are
// not visible in this file):
//   DI  = byte at ctx+8, the right-shift count used for every table lookup (peekBits)
//   BX  = ctx+16, write position of stream 0; advanced by 4 per iteration
//   SI  = ctx+48, limit compared against the write position each iteration
//   R9  = ctx+24, distance added to the write pointer between streams (dstEvery)
//   R10 = ctx+32, base address of the 2-byte table entries
//   R11 = ctx+0, base of four bit-reader records spaced 48 bytes apart;
//         per record: +0 input base, +24 off, +32 value, +40 bitsRead
//   DL  = "exhausted" flag; the loop ends once it is non-zero
//
// Byte packing: AL=v0, AH=v1, BSWAPL, AH=v2, AL=v3, BSWAPL leaves EAX holding
// v0..v3 in ascending byte order, so MOVL stores them at offsets 0..3.
//
// On exit ctx+40 receives (BX - ctx+16) * 4.
269TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8
270 XORQ DX, DX // clear all of DX; only DL is written afterwards
271
272 // Preload values
273 MOVQ ctx+0(FP), CX
274 MOVBQZX 8(CX), DI // DI = peekBits shift count
275 MOVQ 16(CX), BX // BX = write position of stream 0
276 MOVQ 48(CX), SI // SI = write-position limit
277 MOVQ 24(CX), R9 // R9 = per-stream output distance
278 MOVQ 32(CX), R10 // R10 = table base
279 MOVQ (CX), R11 // R11 = base of the bit-reader records
280
281 // Main loop
282main_loop:
283 MOVQ BX, R8 // R8 walks the four per-stream write positions
284 CMPQ R8, SI
285 SETGE DL // exhausted = write position >= limit (signed); overwrites last iteration's flag
286
287 // br0.fillFast32()
288 MOVQ 32(R11), R12 // R12 = br0.value
289 MOVBQZX 40(R11), R13 // R13 = br0.bitsRead
290 CMPQ R13, $0x20
291 JBE skip_fill0 // refill only when bitsRead > 32
292 MOVQ 24(R11), R14 // R14 = br0.off
293 SUBQ $0x20, R13 // bitsRead -= 32
294 SUBQ $0x04, R14 // off -= 4
295 MOVQ (R11), R15 // R15 = input base of br0
296
297 // b.value |= uint64(low) << (b.bitsRead & 63)
298 MOVL (R14)(R15*1), R15 // load 4 input bytes at base+off
299 MOVQ R13, CX // shift count must be in CL
300 SHLQ CL, R15
301 MOVQ R14, 24(R11) // store the new off
302 ORQ R15, R12
303
304 // exhausted = exhausted || (br0.off < 4)
305 CMPQ R14, $0x04
306 SETLT AL // AL is free here; it is overwritten by the first decoded byte below
307 ORB AL, DL
308
309skip_fill0:
310 // val0 := br0.peekTopBits(peekBits)
311 MOVQ R12, R14
312 MOVQ DI, CX
313 SHRQ CL, R14 // R14 = value >> peekBits = table index
314
315 // v0 := table[val0&mask]
316 MOVW (R10)(R14*2), CX // CL = bits to consume, CH = decoded byte
317
318 // br0.advance(uint8(v0.entry))
319 MOVB CH, AL
320 SHLQ CL, R12 // drop the consumed bits from value
321 ADDB CL, R13 // bitsRead += consumed bits
322
323 // val1 := br0.peekTopBits(peekBits)
324 MOVQ R12, R14
325 MOVQ DI, CX
326 SHRQ CL, R14
327
328 // v1 := table[val1&mask]
329 MOVW (R10)(R14*2), CX
330
331 // br0.advance(uint8(v1.entry))
332 MOVB CH, AH
333 SHLQ CL, R12
334 ADDB CL, R13
335 BSWAPL AX // move v0,v1 to the top two bytes of EAX, freeing AL/AH
336
337 // val2 := br0.peekTopBits(peekBits)
338 MOVQ R12, R14
339 MOVQ DI, CX
340 SHRQ CL, R14
341
342 // v2 := table[val2&mask]
343 MOVW (R10)(R14*2), CX
344
345 // br0.advance(uint8(v2.entry))
346 MOVB CH, AH
347 SHLQ CL, R12
348 ADDB CL, R13
349
350 // val3 := br0.peekTopBits(peekBits)
351 MOVQ R12, R14
352 MOVQ DI, CX
353 SHRQ CL, R14
354
355 // v3 := table[val3&mask]
356 MOVW (R10)(R14*2), CX
357
358 // br0.advance(uint8(v3.entry))
359 MOVB CH, AL
360 SHLQ CL, R12
361 ADDB CL, R13
362 BSWAPL AX // restore ascending order: bytes 0..3 = v0..v3
363
364 // these four writes get coalesced
365 // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
366 // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
367 // out[id * dstEvery + 2] = uint8(v2.entry >> 8)
368 // out[id * dstEvery + 3] = uint8(v3.entry >> 8)
369 MOVL AX, (R8)
370
371 // update the bitreader structure
372 MOVQ R12, 32(R11)
373 MOVB R13, 40(R11)
374 ADDQ R9, R8 // move to the next stream's write position
375
376 // br1.fillFast32()
// Same sequence as br0, using the record at R11+48.
377 MOVQ 80(R11), R12
378 MOVBQZX 88(R11), R13
379 CMPQ R13, $0x20
380 JBE skip_fill1
381 MOVQ 72(R11), R14
382 SUBQ $0x20, R13
383 SUBQ $0x04, R14
384 MOVQ 48(R11), R15
385
386 // b.value |= uint64(low) << (b.bitsRead & 63)
387 MOVL (R14)(R15*1), R15
388 MOVQ R13, CX
389 SHLQ CL, R15
390 MOVQ R14, 72(R11)
391 ORQ R15, R12
392
393 // exhausted = exhausted || (br1.off < 4)
394 CMPQ R14, $0x04
395 SETLT AL
396 ORB AL, DL
397
398skip_fill1:
399 // val0 := br1.peekTopBits(peekBits)
400 MOVQ R12, R14
401 MOVQ DI, CX
402 SHRQ CL, R14
403
404 // v0 := table[val0&mask]
405 MOVW (R10)(R14*2), CX
406
407 // br1.advance(uint8(v0.entry))
408 MOVB CH, AL
409 SHLQ CL, R12
410 ADDB CL, R13
411
412 // val1 := br1.peekTopBits(peekBits)
413 MOVQ R12, R14
414 MOVQ DI, CX
415 SHRQ CL, R14
416
417 // v1 := table[val1&mask]
418 MOVW (R10)(R14*2), CX
419
420 // br1.advance(uint8(v1.entry))
421 MOVB CH, AH
422 SHLQ CL, R12
423 ADDB CL, R13
424 BSWAPL AX
425
426 // val2 := br1.peekTopBits(peekBits)
427 MOVQ R12, R14
428 MOVQ DI, CX
429 SHRQ CL, R14
430
431 // v2 := table[val2&mask]
432 MOVW (R10)(R14*2), CX
433
434 // br1.advance(uint8(v2.entry))
435 MOVB CH, AH
436 SHLQ CL, R12
437 ADDB CL, R13
438
439 // val3 := br1.peekTopBits(peekBits)
440 MOVQ R12, R14
441 MOVQ DI, CX
442 SHRQ CL, R14
443
444 // v3 := table[val3&mask]
445 MOVW (R10)(R14*2), CX
446
447 // br1.advance(uint8(v3.entry))
448 MOVB CH, AL
449 SHLQ CL, R12
450 ADDB CL, R13
451 BSWAPL AX
452
453 // these four writes get coalesced
454 // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
455 // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
456 // out[id * dstEvery + 2] = uint8(v2.entry >> 8)
457 // out[id * dstEvery + 3] = uint8(v3.entry >> 8)
458 MOVL AX, (R8)
459
460 // update the bitreader structure
461 MOVQ R12, 80(R11)
462 MOVB R13, 88(R11)
463 ADDQ R9, R8
464
465 // br2.fillFast32()
// Same sequence as br0, using the record at R11+96.
466 MOVQ 128(R11), R12
467 MOVBQZX 136(R11), R13
468 CMPQ R13, $0x20
469 JBE skip_fill2
470 MOVQ 120(R11), R14
471 SUBQ $0x20, R13
472 SUBQ $0x04, R14
473 MOVQ 96(R11), R15
474
475 // b.value |= uint64(low) << (b.bitsRead & 63)
476 MOVL (R14)(R15*1), R15
477 MOVQ R13, CX
478 SHLQ CL, R15
479 MOVQ R14, 120(R11)
480 ORQ R15, R12
481
482 // exhausted = exhausted || (br2.off < 4)
483 CMPQ R14, $0x04
484 SETLT AL
485 ORB AL, DL
486
487skip_fill2:
488 // val0 := br2.peekTopBits(peekBits)
489 MOVQ R12, R14
490 MOVQ DI, CX
491 SHRQ CL, R14
492
493 // v0 := table[val0&mask]
494 MOVW (R10)(R14*2), CX
495
496 // br2.advance(uint8(v0.entry))
497 MOVB CH, AL
498 SHLQ CL, R12
499 ADDB CL, R13
500
501 // val1 := br2.peekTopBits(peekBits)
502 MOVQ R12, R14
503 MOVQ DI, CX
504 SHRQ CL, R14
505
506 // v1 := table[val1&mask]
507 MOVW (R10)(R14*2), CX
508
509 // br2.advance(uint8(v1.entry))
510 MOVB CH, AH
511 SHLQ CL, R12
512 ADDB CL, R13
513 BSWAPL AX
514
515 // val2 := br2.peekTopBits(peekBits)
516 MOVQ R12, R14
517 MOVQ DI, CX
518 SHRQ CL, R14
519
520 // v2 := table[val2&mask]
521 MOVW (R10)(R14*2), CX
522
523 // br2.advance(uint8(v2.entry))
524 MOVB CH, AH
525 SHLQ CL, R12
526 ADDB CL, R13
527
528 // val3 := br2.peekTopBits(peekBits)
529 MOVQ R12, R14
530 MOVQ DI, CX
531 SHRQ CL, R14
532
533 // v3 := table[val3&mask]
534 MOVW (R10)(R14*2), CX
535
536 // br2.advance(uint8(v3.entry))
537 MOVB CH, AL
538 SHLQ CL, R12
539 ADDB CL, R13
540 BSWAPL AX
541
542 // these four writes get coalesced
543 // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
544 // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
545 // out[id * dstEvery + 2] = uint8(v2.entry >> 8)
546 // out[id * dstEvery + 3] = uint8(v3.entry >> 8)
547 MOVL AX, (R8)
548
549 // update the bitreader structure
550 MOVQ R12, 128(R11)
551 MOVB R13, 136(R11)
552 ADDQ R9, R8
553
554 // br3.fillFast32()
// Same sequence as br0, using the record at R11+144.
555 MOVQ 176(R11), R12
556 MOVBQZX 184(R11), R13
557 CMPQ R13, $0x20
558 JBE skip_fill3
559 MOVQ 168(R11), R14
560 SUBQ $0x20, R13
561 SUBQ $0x04, R14
562 MOVQ 144(R11), R15
563
564 // b.value |= uint64(low) << (b.bitsRead & 63)
565 MOVL (R14)(R15*1), R15
566 MOVQ R13, CX
567 SHLQ CL, R15
568 MOVQ R14, 168(R11)
569 ORQ R15, R12
570
571 // exhausted = exhausted || (br3.off < 4)
572 CMPQ R14, $0x04
573 SETLT AL
574 ORB AL, DL
575
576skip_fill3:
577 // val0 := br3.peekTopBits(peekBits)
578 MOVQ R12, R14
579 MOVQ DI, CX
580 SHRQ CL, R14
581
582 // v0 := table[val0&mask]
583 MOVW (R10)(R14*2), CX
584
585 // br3.advance(uint8(v0.entry))
586 MOVB CH, AL
587 SHLQ CL, R12
588 ADDB CL, R13
589
590 // val1 := br3.peekTopBits(peekBits)
591 MOVQ R12, R14
592 MOVQ DI, CX
593 SHRQ CL, R14
594
595 // v1 := table[val1&mask]
596 MOVW (R10)(R14*2), CX
597
598 // br3.advance(uint8(v1.entry))
599 MOVB CH, AH
600 SHLQ CL, R12
601 ADDB CL, R13
602 BSWAPL AX
603
604 // val2 := br3.peekTopBits(peekBits)
605 MOVQ R12, R14
606 MOVQ DI, CX
607 SHRQ CL, R14
608
609 // v2 := table[val2&mask]
610 MOVW (R10)(R14*2), CX
611
612 // br3.advance(uint8(v2.entry))
613 MOVB CH, AH
614 SHLQ CL, R12
615 ADDB CL, R13
616
617 // val3 := br3.peekTopBits(peekBits)
618 MOVQ R12, R14
619 MOVQ DI, CX
620 SHRQ CL, R14
621
622 // v3 := table[val3&mask]
623 MOVW (R10)(R14*2), CX
624
625 // br3.advance(uint8(v3.entry))
626 MOVB CH, AL
627 SHLQ CL, R12
628 ADDB CL, R13
629 BSWAPL AX
630
631 // these four writes get coalesced
632 // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
633 // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
634 // out[id * dstEvery + 2] = uint8(v2.entry >> 8)
635 // out[id * dstEvery + 3] = uint8(v3.entry >> 8)
636 MOVL AX, (R8)
637
638 // update the bitreader structure
639 MOVQ R12, 176(R11)
640 MOVB R13, 184(R11)
641 ADDQ $0x04, BX // four bytes were written per stream
642 TESTB DL, DL
643 JZ main_loop // keep going while nothing is exhausted
644 MOVQ ctx+0(FP), AX // AX was used as scratch; reload ctx
645 SUBQ 16(AX), BX // BX = bytes advanced per stream
646 SHLQ $0x02, BX // times four streams
647 MOVQ BX, 40(AX) // report the total through ctx+40
648 RET
649
650// func decompress1x_main_loop_amd64(ctx *decompress1xContext)
//
// Decodes a single bit stream, four output bytes per loop iteration, for as
// long as the bit reader's offset is at least 8.
//
// Register roles (the Go struct layouts are not visible in this file):
//   DX  = ctx+16, current write position
//   BX  = ctx+16 + ctx+24, end of the output area
//   R8  = input base, R9 = off, R10 = value, R11 = bitsRead; loaded from
//         offsets +0, +24, +32, +40 of the record that ctx+0 points to
//   SI  = ctx+32, base address of the 2-byte table entries
//   DI  = byte at ctx+8, the right-shift count used for every table lookup
//
// Result: ctx+40 receives the number of bytes written, and off/value/bitsRead
// are stored back into the bit-reader record. On the error path ctx+40 is set
// to -1 and the bit-reader record is not updated.
651TEXT ·decompress1x_main_loop_amd64(SB), $0-8
652 MOVQ ctx+0(FP), CX
653 MOVQ 16(CX), DX // DX = write position
654 MOVQ 24(CX), BX // BX = size of the output area
655 CMPQ BX, $0x04
656 JB error_max_decoded_size_exeeded // fewer than 4 bytes of room (unsigned) is reported as an error
657 LEAQ (DX)(BX*1), BX // BX = end of the output area
658 MOVQ (CX), SI // SI = bit-reader record (temporarily)
659 MOVQ (SI), R8 // R8 = input base
660 MOVQ 24(SI), R9 // R9 = off
661 MOVQ 32(SI), R10 // R10 = value
662 MOVBQZX 40(SI), R11 // R11 = bitsRead
663 MOVQ 32(CX), SI // SI = table base from here on
664 MOVBQZX 8(CX), DI // DI = lookup shift count
665 JMP loop_condition
666
667main_loop:
668 // Check if we have room for 4 bytes in the output buffer
669 LEAQ 4(DX), CX
670 CMPQ CX, BX
671 JGE error_max_decoded_size_exeeded // taken when DX+4 >= end (signed compare)
672
673 // Decode 4 values
674 CMPQ R11, $0x20
675 JL bitReader_fillFast_1_end // refill only when bitsRead >= 32
676 SUBQ $0x20, R11 // bitsRead -= 32
677 SUBQ $0x04, R9 // off -= 4
678 MOVL (R8)(R9*1), R12 // load 4 input bytes at base+off
679 MOVQ R11, CX // shift count must be in CL
680 SHLQ CL, R12
681 ORQ R12, R10 // merge the new bits into value
682
683bitReader_fillFast_1_end:
684 MOVQ DI, CX
685 MOVQ R10, R12
686 SHRQ CL, R12 // R12 = value >> shift = table index
687 MOVW (SI)(R12*2), CX // CL = bits to consume, CH = decoded byte
688 MOVB CH, AL // byte 0
689 MOVBQZX CL, CX // widen the bit count so it can be added to R11
690 ADDQ CX, R11 // bitsRead += consumed bits
691 SHLQ CL, R10 // drop the consumed bits from value
692 MOVQ DI, CX
693 MOVQ R10, R12
694 SHRQ CL, R12
695 MOVW (SI)(R12*2), CX
696 MOVB CH, AH // byte 1
697 MOVBQZX CL, CX
698 ADDQ CX, R11
699 SHLQ CL, R10
700 BSWAPL AX // move bytes 0,1 to the top of EAX, freeing AL/AH
701 CMPQ R11, $0x20
702 JL bitReader_fillFast_2_end // second refill check, same rule as above
703 SUBQ $0x20, R11
704 SUBQ $0x04, R9
705 MOVL (R8)(R9*1), R12
706 MOVQ R11, CX
707 SHLQ CL, R12
708 ORQ R12, R10
709
710bitReader_fillFast_2_end:
711 MOVQ DI, CX
712 MOVQ R10, R12
713 SHRQ CL, R12
714 MOVW (SI)(R12*2), CX
715 MOVB CH, AH // byte 2
716 MOVBQZX CL, CX
717 ADDQ CX, R11
718 SHLQ CL, R10
719 MOVQ DI, CX
720 MOVQ R10, R12
721 SHRQ CL, R12
722 MOVW (SI)(R12*2), CX
723 MOVB CH, AL // byte 3
724 MOVBQZX CL, CX
725 ADDQ CX, R11
726 SHLQ CL, R10
727 BSWAPL AX // restore ascending order: bytes 0..3 in EAX
728
729 // Store the decoded values
730 MOVL AX, (DX)
731 ADDQ $0x04, DX
732
733loop_condition:
734 CMPQ R9, $0x08
735 JGE main_loop // continue while off >= 8 (signed)
736
737 // Update ctx structure
738 MOVQ ctx+0(FP), AX
739 SUBQ 16(AX), DX // DX = number of bytes written
740 MOVQ DX, 40(AX)
741 MOVQ (AX), AX // AX = bit-reader record
742 MOVQ R9, 24(AX) // write back off
743 MOVQ R10, 32(AX) // write back value
744 MOVB R11, 40(AX) // write back bitsRead (low byte only)
745 RET
746
747 // Report error
748error_max_decoded_size_exeeded:
749 MOVQ ctx+0(FP), AX
750 MOVQ $-1, CX
751 MOVQ CX, 40(AX) // ctx+40 = -1 marks the failure
752 RET
753
754// func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
755// Requires: BMI2
//
// Single-stream decoder with the same structure as the non-BMI2 loop, but
// using SHLXQ/SHRXQ, which take the shift count from an arbitrary register
// instead of CL. Four output bytes are produced per loop iteration, for as
// long as the bit reader's offset is at least 8.
//
// Register roles (the Go struct layouts are not visible in this file):
//   DX  = ctx+16, current write position
//   BX  = ctx+16 + ctx+24, end of the output area
//   R8  = input base, R9 = off, R10 = value, R11 = bitsRead; loaded from
//         offsets +0, +24, +32, +40 of the record that ctx+0 points to
//   SI  = ctx+32, base address of the 2-byte table entries
//   DI  = byte at ctx+8, the right-shift count used for every table lookup
//
// Result: ctx+40 receives the number of bytes written, and off/value/bitsRead
// are stored back into the bit-reader record. On the error path ctx+40 is set
// to -1 and the bit-reader record is not updated.
756TEXT ·decompress1x_main_loop_bmi2(SB), $0-8
757 MOVQ ctx+0(FP), CX
758 MOVQ 16(CX), DX // DX = write position
759 MOVQ 24(CX), BX // BX = size of the output area
760 CMPQ BX, $0x04
761 JB error_max_decoded_size_exeeded // fewer than 4 bytes of room (unsigned) is reported as an error
762 LEAQ (DX)(BX*1), BX // BX = end of the output area
763 MOVQ (CX), SI // SI = bit-reader record (temporarily)
764 MOVQ (SI), R8 // R8 = input base
765 MOVQ 24(SI), R9 // R9 = off
766 MOVQ 32(SI), R10 // R10 = value
767 MOVBQZX 40(SI), R11 // R11 = bitsRead
768 MOVQ 32(CX), SI // SI = table base from here on
769 MOVBQZX 8(CX), DI // DI = lookup shift count
770 JMP loop_condition
771
772main_loop:
773 // Check if we have room for 4 bytes in the output buffer
774 LEAQ 4(DX), CX
775 CMPQ CX, BX
776 JGE error_max_decoded_size_exeeded // taken when DX+4 >= end (signed compare)
777
778 // Decode 4 values
779 CMPQ R11, $0x20
780 JL bitReader_fillFast_1_end // refill only when bitsRead >= 32
781 SUBQ $0x20, R11 // bitsRead -= 32
782 SUBQ $0x04, R9 // off -= 4
783 MOVL (R8)(R9*1), CX // load 4 input bytes at base+off
784 SHLXQ R11, CX, CX // CX <<= bitsRead
785 ORQ CX, R10 // merge the new bits into value
786
787bitReader_fillFast_1_end:
788 SHRXQ DI, R10, CX // CX = value >> shift = table index
789 MOVW (SI)(CX*2), CX // CL = bits to consume, CH = decoded byte
790 MOVB CH, AL // byte 0
791 MOVBQZX CL, CX // isolate the bit count
792 ADDQ CX, R11 // bitsRead += consumed bits
793 SHLXQ CX, R10, R10 // drop the consumed bits from value
794 SHRXQ DI, R10, CX
795 MOVW (SI)(CX*2), CX
796 MOVB CH, AH // byte 1
797 MOVBQZX CL, CX
798 ADDQ CX, R11
799 SHLXQ CX, R10, R10
800 BSWAPL AX // move bytes 0,1 to the top of EAX, freeing AL/AH
801 CMPQ R11, $0x20
802 JL bitReader_fillFast_2_end // second refill check, same rule as above
803 SUBQ $0x20, R11
804 SUBQ $0x04, R9
805 MOVL (R8)(R9*1), CX
806 SHLXQ R11, CX, CX
807 ORQ CX, R10
808
809bitReader_fillFast_2_end:
810 SHRXQ DI, R10, CX
811 MOVW (SI)(CX*2), CX
812 MOVB CH, AH // byte 2
813 MOVBQZX CL, CX
814 ADDQ CX, R11
815 SHLXQ CX, R10, R10
816 SHRXQ DI, R10, CX
817 MOVW (SI)(CX*2), CX
818 MOVB CH, AL // byte 3
819 MOVBQZX CL, CX
820 ADDQ CX, R11
821 SHLXQ CX, R10, R10
822 BSWAPL AX // restore ascending order: bytes 0..3 in EAX
823
824 // Store the decoded values
825 MOVL AX, (DX)
826 ADDQ $0x04, DX
827
828loop_condition:
829 CMPQ R9, $0x08
830 JGE main_loop // continue while off >= 8 (signed)
831
832 // Update ctx structure
833 MOVQ ctx+0(FP), AX
834 SUBQ 16(AX), DX // DX = number of bytes written
835 MOVQ DX, 40(AX)
836 MOVQ (AX), AX // AX = bit-reader record
837 MOVQ R9, 24(AX) // write back off
838 MOVQ R10, 32(AX) // write back value
839 MOVB R11, 40(AX) // write back bitsRead (low byte only)
840 RET
841
842 // Report error
843error_max_decoded_size_exeeded:
844 MOVQ ctx+0(FP), AX
845 MOVQ $-1, CX
846 MOVQ CX, 40(AX) // ctx+40 = -1 marks the failure
847 RET