 	JBE	move_129through256
 	// TODO: use branch table and BSR to make this just a single dispatch

+	TESTB	$1, runtime·useRepMovs(SB)
+	JZ	avxUnaligned
+
 	/*
 	 * check and set for backwards
 	 */
@@ -108,7 +111,6 @@ back:
 	ADDQ	BX, CX
 	CMPQ	CX, DI
 	JLS	forward
-
 	/*
 	 * whole thing backwards has
 	 * adjusted addresses
@@ -273,3 +275,242 @@ move_256through2048:
 	LEAQ	256(DI), DI
 	JGE	move_256through2048
 	JMP	tail
+
+avxUnaligned:
+	// There are two implementations of the move algorithm.
+	// The first one is for non-overlapped memory regions; it uses forward copying.
+	// The second one is for overlapped regions; it uses backward copying.
+	MOVQ	DI, CX
+	SUBQ	SI, CX
+	// Now CX contains the distance between SRC and DEST.
+	CMPQ	CX, BX
+	// If the distance is less than the region length, the regions overlap.
+	JC	copy_backward
+
+	// Non-temporal copy would be better for big sizes.
+	CMPQ	BX, $0x100000
+	JAE	gobble_big_data_fwd
+
+	// Memory layout on the source side
+	// SI                                       CX
+	// |<---------BX before correction--------->|
+	// |       |<--BX corrected-->|             |
+	// |       |                  |<--- AX  --->|
+	// |<-R11->|                  |<-128 bytes->|
+	// +----------------------------------------+
+	// | Head  | Body             | Tail        |
+	// +-------+------------------+-------------+
+	// ^       ^                  ^
+	// |       |                  |
+	// Save head into Y4          Save tail into X5..X12
+	//         |
+	//         SI+R11, where R11 = ((DI & -32) + 32) - DI
+	// Algorithm:
+	// 1. Unaligned save of the tail's 128 bytes
+	// 2. Unaligned save of the head's 32 bytes
+	// 3. Destination-aligned copying of body (128 bytes per iteration)
+	// 4. Put the head into its new place
+	// 5. Put the tail into its new place
+	// It can be important to satisfy the processor's pipeline requirements for
+	// small sizes, as the cost of copying the unaligned parts is comparable
+	// with the cost of the main loop, so the code is slightly interleaved here.
+	// There is a cleaner implementation of this algorithm for bigger sizes,
+	// where the cost of copying the unaligned parts is negligible.
+	// You can see it after the gobble_big_data_fwd label.
+	LEAQ	(SI)(BX*1), CX
+	MOVQ	DI, R10
+	// CX points to the end of the buffer, so we need to go back slightly. We will use negative offsets here.
+	MOVOU	-0x80(CX), X5
+	MOVOU	-0x70(CX), X6
+	MOVQ	$0x80, AX
+	// Align the destination address.
+	ANDQ	$-32, DI
+	ADDQ	$32, DI
+	// Continue tail saving.
+	MOVOU	-0x60(CX), X7
+	MOVOU	-0x50(CX), X8
+	// Make R11 the delta between the aligned and unaligned destination addresses.
+	MOVQ	DI, R11
+	SUBQ	R10, R11
+	// Continue tail saving.
+	MOVOU	-0x40(CX), X9
+	MOVOU	-0x30(CX), X10
+	// Adjust the bytes-to-copy value, since the unaligned parts have been prepared for copying.
+	SUBQ	R11, BX
+	// Continue tail saving.
+	MOVOU	-0x20(CX), X11
+	MOVOU	-0x10(CX), X12
+	// The tail will be put into its place after the main body copy.
+	// It's time for the unaligned head part.
+	VMOVDQU	(SI), Y4
+	// Adjust the source address to point past the head.
+	ADDQ	R11, SI
+	SUBQ	AX, BX
+	// Aligned memory copying here.
+gobble_128_loop:
+	VMOVDQU	(SI), Y0
+	VMOVDQU	0x20(SI), Y1
+	VMOVDQU	0x40(SI), Y2
+	VMOVDQU	0x60(SI), Y3
+	ADDQ	AX, SI
+	VMOVDQA	Y0, (DI)
+	VMOVDQA	Y1, 0x20(DI)
+	VMOVDQA	Y2, 0x40(DI)
+	VMOVDQA	Y3, 0x60(DI)
+	ADDQ	AX, DI
+	SUBQ	AX, BX
+	JA	gobble_128_loop
+	// Now we can store unaligned parts.
+	ADDQ	AX, BX
+	ADDQ	DI, BX
+	VMOVDQU	Y4, (R10)
+	VZEROUPPER
+	MOVOU	X5, -0x80(BX)
+	MOVOU	X6, -0x70(BX)
+	MOVOU	X7, -0x60(BX)
+	MOVOU	X8, -0x50(BX)
+	MOVOU	X9, -0x40(BX)
+	MOVOU	X10, -0x30(BX)
+	MOVOU	X11, -0x20(BX)
+	MOVOU	X12, -0x10(BX)
+	RET
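
For reference, here is a minimal Go sketch of the head/body/tail scheme used by the path above. It is not part of the patch; the fixed 32-byte head stands in for the R11-byte head the assembly uses to reach a 32-byte-aligned destination, and the function name is illustrative. It assumes dst and src do not overlap and that len(src) is at least 256.

// forwardCopySketch models the forward path: save the head and tail
// unaligned, copy the body in 128-byte chunks, then drop the saved
// head and tail into place last.
func forwardCopySketch(dst, src []byte) {
	n := len(src)
	var head [32]byte
	var tail [128]byte
	copy(head[:], src[:32])    // like VMOVDQU (SI), Y4
	copy(tail[:], src[n-128:]) // like MOVOU -0x80(CX)..-0x10(CX), X5..X12
	// Body: in the assembly this loop stores with VMOVDQA to a
	// 32-byte-aligned destination, 128 bytes per iteration.
	for off := 32; off < n-128; off += 128 {
		end := off + 128
		if end > n-128 {
			end = n - 128 // the saved tail covers whatever remains
		}
		copy(dst[off:end], src[off:end])
	}
	copy(dst[:32], head[:])    // put the head in place
	copy(dst[n-128:], tail[:]) // put the tail in place
}
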
+
+gobble_big_data_fwd:
+	// This is the forward copy for big regions.
+	// It uses non-temporal move instructions.
+	// The details of the algorithm are commented above for the small-size case.
+	LEAQ	(SI)(BX*1), CX
+	MOVOU	-0x80(SI)(BX*1), X5
+	MOVOU	-0x70(CX), X6
+	MOVOU	-0x60(CX), X7
+	MOVOU	-0x50(CX), X8
+	MOVOU	-0x40(CX), X9
+	MOVOU	-0x30(CX), X10
+	MOVOU	-0x20(CX), X11
+	MOVOU	-0x10(CX), X12
+	VMOVDQU	(SI), Y4
+	MOVQ	DI, R8
+	ANDQ	$-32, DI
+	ADDQ	$32, DI
+	MOVQ	DI, R10
+	SUBQ	R8, R10
+	SUBQ	R10, BX
+	ADDQ	R10, SI
+	LEAQ	(DI)(BX*1), CX
+	SUBQ	$0x80, BX
+gobble_mem_fwd_loop:
+	PREFETCHNTA 0x1C0(SI)
+	PREFETCHNTA 0x280(SI)
+	// Prefetch distances were chosen empirically.
+	// Prefetch usage follows the approach described in section 7.6.6 of [1].
+	// [1] 64-ia-32-architectures-optimization-manual.pdf
+	// http://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
+	VMOVDQU	(SI), Y0
+	VMOVDQU	0x20(SI), Y1
+	VMOVDQU	0x40(SI), Y2
+	VMOVDQU	0x60(SI), Y3
+	ADDQ	$0x80, SI
+	VMOVNTDQ	Y0, (DI)
+	VMOVNTDQ	Y1, 0x20(DI)
+	VMOVNTDQ	Y2, 0x40(DI)
+	VMOVNTDQ	Y3, 0x60(DI)
+	ADDQ	$0x80, DI
+	SUBQ	$0x80, BX
+	JA	gobble_mem_fwd_loop
+	// Non-temporal instructions don't follow the normal cache-coherency rules.
+	// We need an SFENCE here to make the copied data available in a timely manner.
+	SFENCE
+	VMOVDQU	Y4, (R8)
+	VZEROUPPER
+	MOVOU	X5, -0x80(CX)
+	MOVOU	X6, -0x70(CX)
+	MOVOU	X7, -0x60(CX)
+	MOVOU	X8, -0x50(CX)
+	MOVOU	X9, -0x40(CX)
+	MOVOU	X10, -0x30(CX)
+	MOVOU	X11, -0x20(CX)
+	MOVOU	X12, -0x10(CX)
+	RET
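
The choice between the three paths is made at the top of avxUnaligned by a single unsigned comparison plus a size threshold. Below is a minimal Go sketch of that dispatch; it is not part of the patch, the function and return labels are illustrative, and Go has no non-temporal stores, so the two forward paths are distinguished only by name here.

// dispatchSketch mirrors the branch structure at the top of avxUnaligned.
// dst-src is computed in uintptr arithmetic, so it wraps around when
// dst < src; the single unsigned comparison dist < n is therefore true
// exactly when dst lies inside [src, src+n), i.e. when a forward copy
// would overwrite source bytes before they are read.
func dispatchSketch(dst, src, n uintptr) string {
	dist := dst - src // SUBQ SI, CX
	switch {
	case dist < n: // CMPQ CX, BX; JC copy_backward
		return "copy_backward"
	case n >= 0x100000: // CMPQ BX, $0x100000; JAE gobble_big_data_fwd
		return "gobble_big_data_fwd (non-temporal stores)"
	default:
		return "head/body/tail forward copy"
	}
}
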
+
+copy_backward:
+	MOVQ	DI, AX
+	// Backward copying is much the same as the forward one.
+	// First we load the unaligned tail from the beginning of the region.
+	MOVOU	(SI), X5
+	MOVOU	0x10(SI), X6
+	ADDQ	BX, DI
+	MOVOU	0x20(SI), X7
+	MOVOU	0x30(SI), X8
+	LEAQ	-0x20(DI), R10
+	MOVQ	DI, R11
+	MOVOU	0x40(SI), X9
+	MOVOU	0x50(SI), X10
+	ANDQ	$0x1F, R11
+	MOVOU	0x60(SI), X11
+	MOVOU	0x70(SI), X12
+	XORQ	R11, DI
+	// Let's point SI to the end of the region
+	ADDQ	BX, SI
+	// and load the unaligned head into Y4.
+	VMOVDQU	-0x20(SI), Y4
+	SUBQ	R11, SI
+	SUBQ	R11, BX
+	// If there is enough data for non-temporal moves, go to the special loop.
+	CMPQ	BX, $0x100000
+	JA	gobble_big_data_bwd
+	SUBQ	$0x80, BX
+gobble_mem_bwd_loop:
+	VMOVDQU	-0x20(SI), Y0
+	VMOVDQU	-0x40(SI), Y1
+	VMOVDQU	-0x60(SI), Y2
+	VMOVDQU	-0x80(SI), Y3
+	SUBQ	$0x80, SI
+	VMOVDQA	Y0, -0x20(DI)
+	VMOVDQA	Y1, -0x40(DI)
+	VMOVDQA	Y2, -0x60(DI)
+	VMOVDQA	Y3, -0x80(DI)
+	SUBQ	$0x80, DI
+	SUBQ	$0x80, BX
+	JA	gobble_mem_bwd_loop
+	// Let's store unaligned data.
+	VMOVDQU	Y4, (R10)
+	VZEROUPPER
+	MOVOU	X5, (AX)
+	MOVOU	X6, 0x10(AX)
+	MOVOU	X7, 0x20(AX)
+	MOVOU	X8, 0x30(AX)
+	MOVOU	X9, 0x40(AX)
+	MOVOU	X10, 0x50(AX)
+	MOVOU	X11, 0x60(AX)
+	MOVOU	X12, 0x70(AX)
+	RET
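
A minimal Go sketch of the backward path above (not part of the patch; the name is illustrative). It assumes the regions overlap with dst above src and len(src) of at least 256, and it ignores the destination alignment the assembly maintains.

// backwardCopySketch saves the 128 low bytes and the 32 high bytes of the
// source first, copies the body in 128-byte chunks from high to low
// addresses so source bytes are read before the destination overwrites
// them, and writes the saved pieces last.
func backwardCopySketch(dst, src []byte) {
	n := len(src)
	var low [128]byte
	var high [32]byte
	copy(low[:], src[:128])   // like MOVOU (SI)..0x70(SI), X5..X12
	copy(high[:], src[n-32:]) // like VMOVDQU -0x20(SI), Y4
	for end := n - 32; end > 128; end -= 128 {
		start := end - 128
		if start < 128 {
			start = 128 // the saved low block covers the rest
		}
		copy(dst[start:end], src[start:end])
	}
	copy(dst[n-32:], high[:]) // put the high block in place
	copy(dst[:128], low[:])   // put the low block in place
}
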
+
+gobble_big_data_bwd:
+	SUBQ	$0x80, BX
+gobble_big_mem_bwd_loop:
+	PREFETCHNTA -0x1C0(SI)
+	PREFETCHNTA -0x280(SI)
+	VMOVDQU	-0x20(SI), Y0
+	VMOVDQU	-0x40(SI), Y1
+	VMOVDQU	-0x60(SI), Y2
+	VMOVDQU	-0x80(SI), Y3
+	SUBQ	$0x80, SI
+	VMOVNTDQ	Y0, -0x20(DI)
+	VMOVNTDQ	Y1, -0x40(DI)
+	VMOVNTDQ	Y2, -0x60(DI)
+	VMOVNTDQ	Y3, -0x80(DI)
+	SUBQ	$0x80, DI
+	SUBQ	$0x80, BX
+	JA	gobble_big_mem_bwd_loop
+	SFENCE
+	VMOVDQU	Y4, (R10)
+	VZEROUPPER
+	MOVOU	X5, (AX)
+	MOVOU	X6, 0x10(AX)
+	MOVOU	X7, 0x20(AX)
+	MOVOU	X8, 0x30(AX)
+	MOVOU	X9, 0x40(AX)
+	MOVOU	X10, 0x50(AX)
+	MOVOU	X11, 0x60(AX)
+	MOVOU	X12, 0x70(AX)
+	RET
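
All three paths round the destination up to a 32-byte boundary and treat the bytes before that boundary as the unaligned head, using the delta R11 = ((DI & -32) + 32) - DI. A quick Go check of that arithmetic (not part of the patch; in Go, dst &^ 31 is the assembly's DI & -32):

// alignDelta returns how many bytes must be handled unaligned before the
// destination reaches the next 32-byte boundary. The result is always in
// 1..32; an already-aligned destination still gets a full 32-byte head,
// matching the assembly.
func alignDelta(dst uintptr) uintptr {
	return (dst &^ 31) + 32 - dst
}

// For example: alignDelta(0x1005) == 27 and alignDelta(0x1000) == 32.
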