@@ -64,9 +64,6 @@
 	JBE	move_129through256
 	// TODO: use branch table and BSR to make this just a single dispatch

-	TESTB	$1, runtime·useRepMovs(SB)
-	JZ	avxUnaligned
-
 	/*
 	 * check and set for backwards
 	 */
@@ -111,6 +108,7 @@ back:
 	ADDQ	BX, CX
 	CMPQ	CX, DI
 	JLS	forward
+
 	/*
 	 * whole thing backwards has
 	 * adjusted addresses
@@ -275,242 +273,3 @@ move_256through2048:
 	LEAQ	256(DI), DI
 	JGE	move_256through2048
 	JMP	tail
-
-avxUnaligned:
-	// There are two implementations of the move algorithm.
-	// The first one is for non-overlapping memory regions; it uses forward copying.
-	// The second one is for overlapping regions; it uses backward copying.
-	MOVQ	DI, CX
-	SUBQ	SI, CX
-	// Now CX contains the distance between SRC and DEST.
-	CMPQ	CX, BX
-	// If the distance is less than the region length, the regions overlap.
-	JC	copy_backward
-
-	// A non-temporal copy is better for big sizes.
-	CMPQ	BX, $0x100000
-	JAE	gobble_big_data_fwd
-
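In Go terms, the dispatch above amounts to a single unsigned compare of the pointer distance against the length, plus a size threshold for the non-temporal path. A minimal sketch with illustrative names (the runtime works on raw registers, not a helper like this):

package sketch

import "unsafe"

// moveKind mirrors the dispatch above. The unsigned wraparound of
// uintptr(dst)-uintptr(src) lets one compare cover both "dst below src"
// and "dst inside [src, src+n)", exactly like the SUBQ/CMPQ/JC sequence.
func moveKind(dst, src unsafe.Pointer, n uintptr) string {
	if uintptr(dst)-uintptr(src) < n {
		return "copy_backward" // overlapping, dst above src
	}
	if n >= 0x100000 { // the 1 MB threshold from the CMPQ above
		return "gobble_big_data_fwd" // forward, non-temporal stores
	}
	return "avx head/body/tail forward copy"
}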
-	// Memory layout on the source side
-	// SI                                       CX
-	// |<---------BX before correction--------->|
-	// |       |<--BX corrected-->|             |
-	// |       |                  |<--- AX --->|
-	// |<-R11->|                  |<-128 bytes->|
-	// +----------------------------------------+
-	// | Head  | Body             | Tail        |
-	// +-------+------------------+-------------+
-	// ^       ^                  ^
-	// |       |                  |
-	// Save head into Y4          Save tail into X5..X12
-	//         |
-	//         SI+R11, where R11 = ((DI & -32) + 32) - DI
-	// Algorithm:
-	// 1. Unaligned save of the tail's 128 bytes
-	// 2. Unaligned save of the head's 32 bytes
-	// 3. Destination-aligned copying of the body (128 bytes per iteration)
-	// 4. Put the head in its new place
-	// 5. Put the tail in its new place
-	// For small sizes it is important to satisfy the processor's pipeline
-	// requirements, as the cost of copying the unaligned parts is comparable
-	// to the cost of the main loop, so the code below is slightly interleaved.
-	// There is a cleaner implementation of this algorithm for bigger sizes,
-	// where the cost of copying the unaligned parts is negligible.
-	// You can see it after the gobble_big_data_fwd label.
-	LEAQ	(SI)(BX*1), CX
-	MOVQ	DI, R10
-	// CX points to the end of the buffer, so negative offsets are used below.
-	MOVOU	-0x80(CX), X5
-	MOVOU	-0x70(CX), X6
-	MOVQ	$0x80, AX
-	// Align the destination address.
-	ANDQ	$-32, DI
-	ADDQ	$32, DI
-	// Continue tail saving.
-	MOVOU	-0x60(CX), X7
-	MOVOU	-0x50(CX), X8
-	// Make R11 the delta between the aligned and unaligned destination addresses.
-	MOVQ	DI, R11
-	SUBQ	R10, R11
-	// Continue tail saving.
-	MOVOU	-0x40(CX), X9
-	MOVOU	-0x30(CX), X10
-	// Adjust the bytes-to-copy value, since the unaligned head is handled separately.
-	SUBQ	R11, BX
-	// Continue tail saving.
-	MOVOU	-0x20(CX), X11
-	MOVOU	-0x10(CX), X12
-	// The tail will be put in its place after the main body is copied.
-	// Now save the unaligned head.
-	VMOVDQU	(SI), Y4
-	// Adjust the source address to point past the head.
-	ADDQ	R11, SI
-	SUBQ	AX, BX
-	// Destination-aligned copying of the body follows.
-gobble_128_loop:
-	VMOVDQU	(SI), Y0
-	VMOVDQU	0x20(SI), Y1
-	VMOVDQU	0x40(SI), Y2
-	VMOVDQU	0x60(SI), Y3
-	ADDQ	AX, SI
-	VMOVDQA	Y0, (DI)
-	VMOVDQA	Y1, 0x20(DI)
-	VMOVDQA	Y2, 0x40(DI)
-	VMOVDQA	Y3, 0x60(DI)
-	ADDQ	AX, DI
-	SUBQ	AX, BX
-	JA	gobble_128_loop
-	// Now store the unaligned parts.
-	ADDQ	AX, BX
-	ADDQ	DI, BX
-	VMOVDQU	Y4, (R10)
-	VZEROUPPER
-	MOVOU	X5, -0x80(BX)
-	MOVOU	X6, -0x70(BX)
-	MOVOU	X7, -0x60(BX)
-	MOVOU	X8, -0x50(BX)
-	MOVOU	X9, -0x40(BX)
-	MOVOU	X10, -0x30(BX)
-	MOVOU	X11, -0x20(BX)
-	MOVOU	X12, -0x10(BX)
-	RET
-
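For reference, the head/body/tail scheme the comments describe can be sketched in Go. This is a minimal illustration, assuming non-overlapping byte slices with len(src) == len(dst) >= 256; copy stands in for the AVX loads and stores, and the function name is invented:

package sketch

import "unsafe"

func headBodyTailCopy(dst, src []byte) {
	n := uintptr(len(src))
	var head [32]byte
	var tail [128]byte
	copy(tail[:], src[n-128:]) // 1. unaligned save of the tail's 128 bytes
	copy(head[:], src[:32])    // 2. unaligned save of the head's 32 bytes

	d := uintptr(unsafe.Pointer(&dst[0]))
	r11 := ((d &^ 31) + 32) - d // R11 = ((DI & -32) + 32) - DI, always in 1..32

	// 3. destination-aligned body, 128 bytes per iteration
	for off := r11; off+128 <= n; off += 128 {
		copy(dst[off:off+128], src[off:off+128])
	}
	copy(dst[:32], head[:])    // 4. put the head in its new place
	copy(dst[n-128:], tail[:]) // 5. put the tail in its new place
}

The head and tail writes deliberately overlap the body, which is why the saves happen before any copying starts.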
-gobble_big_data_fwd:
-	// Forward copying for big regions, using non-temporal move instructions.
-	// The details of this algorithm are described above for the small-size path.
-	LEAQ	(SI)(BX*1), CX
-	MOVOU	-0x80(SI)(BX*1), X5
-	MOVOU	-0x70(CX), X6
-	MOVOU	-0x60(CX), X7
-	MOVOU	-0x50(CX), X8
-	MOVOU	-0x40(CX), X9
-	MOVOU	-0x30(CX), X10
-	MOVOU	-0x20(CX), X11
-	MOVOU	-0x10(CX), X12
-	VMOVDQU	(SI), Y4
-	MOVQ	DI, R8
-	ANDQ	$-32, DI
-	ADDQ	$32, DI
-	MOVQ	DI, R10
-	SUBQ	R8, R10
-	SUBQ	R10, BX
-	ADDQ	R10, SI
-	LEAQ	(DI)(BX*1), CX
-	SUBQ	$0x80, BX
-gobble_mem_fwd_loop:
-	PREFETCHNTA	0x1C0(SI)
-	PREFETCHNTA	0x280(SI)
-	// The prefetch distances were chosen empirically.
-	// Prefetching is used as described in section 7.6.6 of [1].
-	// [1] 64-ia-32-architectures-optimization-manual.pdf
-	// http://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
-	VMOVDQU	(SI), Y0
-	VMOVDQU	0x20(SI), Y1
-	VMOVDQU	0x40(SI), Y2
-	VMOVDQU	0x60(SI), Y3
-	ADDQ	$0x80, SI
-	VMOVNTDQ	Y0, (DI)
-	VMOVNTDQ	Y1, 0x20(DI)
-	VMOVNTDQ	Y2, 0x40(DI)
-	VMOVNTDQ	Y3, 0x60(DI)
-	ADDQ	$0x80, DI
-	SUBQ	$0x80, BX
-	JA	gobble_mem_fwd_loop
-	// Non-temporal instructions don't follow the normal cache-coherency rules,
-	// so an SFENCE is needed to make the copied data visible in a timely manner.
-	SFENCE
-	VMOVDQU	Y4, (R8)
-	VZEROUPPER
-	MOVOU	X5, -0x80(CX)
-	MOVOU	X6, -0x70(CX)
-	MOVOU	X7, -0x60(CX)
-	MOVOU	X8, -0x50(CX)
-	MOVOU	X9, -0x40(CX)
-	MOVOU	X10, -0x30(CX)
-	MOVOU	X11, -0x20(CX)
-	MOVOU	X12, -0x10(CX)
-	RET
-
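Since the Go compiler lowers the built-in copy on byte slices to runtime·memmove, this non-temporal path can be exercised from ordinary Go code once the size crosses the 1 MB threshold. A minimal benchmark sketch; the name and sizes are illustrative:

package sketch

import "testing"

// BenchmarkCopy2M moves 2 MB per iteration, above the 0x100000 cutoff,
// so an AVX-capable machine would take the gobble_big_data_fwd path.
func BenchmarkCopy2M(b *testing.B) {
	src := make([]byte, 2<<20)
	dst := make([]byte, 2<<20)
	b.SetBytes(2 << 20)
	for i := 0; i < b.N; i++ {
		copy(dst, src)
	}
}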
-copy_backward:
-	MOVQ	DI, AX
-	// Backward copying works much like the forward case.
-	// First, save the unaligned start of the region (the tail of a backward copy).
-	MOVOU	(SI), X5
-	MOVOU	0x10(SI), X6
-	ADDQ	BX, DI
-	MOVOU	0x20(SI), X7
-	MOVOU	0x30(SI), X8
-	LEAQ	-0x20(DI), R10
-	MOVQ	DI, R11
-	MOVOU	0x40(SI), X9
-	MOVOU	0x50(SI), X10
-	ANDQ	$0x1F, R11
-	MOVOU	0x60(SI), X11
-	MOVOU	0x70(SI), X12
-	// Align DI down to a 32-byte boundary by clearing the bits saved in R11.
-	XORQ	R11, DI
-	// Point SI to the end of the region
-	ADDQ	BX, SI
-	// and load the unaligned head into Y4.
-	VMOVDQU	-0x20(SI), Y4
-	SUBQ	R11, SI
-	SUBQ	R11, BX
-	// If there is enough data for non-temporal moves, go to the special loop.
-	CMPQ	BX, $0x100000
-	JA	gobble_big_data_bwd
-	SUBQ	$0x80, BX
-gobble_mem_bwd_loop:
-	VMOVDQU	-0x20(SI), Y0
-	VMOVDQU	-0x40(SI), Y1
-	VMOVDQU	-0x60(SI), Y2
-	VMOVDQU	-0x80(SI), Y3
-	SUBQ	$0x80, SI
-	VMOVDQA	Y0, -0x20(DI)
-	VMOVDQA	Y1, -0x40(DI)
-	VMOVDQA	Y2, -0x60(DI)
-	VMOVDQA	Y3, -0x80(DI)
-	SUBQ	$0x80, DI
-	SUBQ	$0x80, BX
-	JA	gobble_mem_bwd_loop
-	// Store the unaligned parts.
-	VMOVDQU	Y4, (R10)
-	VZEROUPPER
-	MOVOU	X5, (AX)
-	MOVOU	X6, 0x10(AX)
-	MOVOU	X7, 0x20(AX)
-	MOVOU	X8, 0x30(AX)
-	MOVOU	X9, 0x40(AX)
-	MOVOU	X10, 0x50(AX)
-	MOVOU	X11, 0x60(AX)
-	MOVOU	X12, 0x70(AX)
-	RET
-
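The backward scheme reads in Go roughly as follows: save the first 128 bytes up front (the X5..X12 saves above), walk the body from the end so overlapping bytes are read before they are overwritten, then drop the saved head in place last. A minimal sketch with an invented name, assuming len(src) == len(dst) >= 256 and dst above src; the Y4/R10 alignment handling is omitted:

package sketch

func backwardCopy(dst, src []byte) {
	n := len(src)
	var head [128]byte
	copy(head[:], src[:128]) // mirror of the X5..X12 saves

	// Body: 128-byte chunks, highest first, so each chunk is read before
	// any lower chunk's write can clobber it through the overlap.
	for off := n; off > 128; {
		lo := off - 128
		if lo < 128 {
			lo = 128
		}
		copy(dst[lo:off], src[lo:off])
		off = lo
	}
	copy(dst[:128], head[:]) // finally restore the saved head
}

A strictly descending chunk loop would not need the save in pure Go; the assembly needs it because its aligned loop does not cover the residues exactly.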
-gobble_big_data_bwd:
-	SUBQ	$0x80, BX
-gobble_big_mem_bwd_loop:
-	PREFETCHNTA	-0x1C0(SI)
-	PREFETCHNTA	-0x280(SI)
-	VMOVDQU	-0x20(SI), Y0
-	VMOVDQU	-0x40(SI), Y1
-	VMOVDQU	-0x60(SI), Y2
-	VMOVDQU	-0x80(SI), Y3
-	SUBQ	$0x80, SI
-	VMOVNTDQ	Y0, -0x20(DI)
-	VMOVNTDQ	Y1, -0x40(DI)
-	VMOVNTDQ	Y2, -0x60(DI)
-	VMOVNTDQ	Y3, -0x80(DI)
-	SUBQ	$0x80, DI
-	SUBQ	$0x80, BX
-	JA	gobble_big_mem_bwd_loop
-	SFENCE
-	VMOVDQU	Y4, (R10)
-	VZEROUPPER
-	MOVOU	X5, (AX)
-	MOVOU	X6, 0x10(AX)
-	MOVOU	X7, 0x20(AX)
-	MOVOU	X8, 0x30(AX)
-	MOVOU	X9, 0x40(AX)
-	MOVOU	X10, 0x50(AX)
-	MOVOU	X11, 0x60(AX)
-	MOVOU	X12, 0x70(AX)
-	RET
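Why the overlap check matters can be seen from pure Go: a naive forward loop corrupts an overlapping move, while the built-in copy (which goes through memmove) handles it. A small runnable demonstration:

package main

import "fmt"

// forwardCopy copies byte-by-byte, front to back, with no overlap handling.
func forwardCopy(dst, src []byte) {
	for i := range src {
		dst[i] = src[i]
	}
}

func main() {
	buf := []byte("abcdefgh")
	forwardCopy(buf[2:7], buf[0:5]) // overlapping, dst above src
	fmt.Println(string(buf))        // "abababah": source bytes clobbered mid-copy

	buf = []byte("abcdefgh")
	copy(buf[2:7], buf[0:5]) // built-in copy is overlap-safe
	fmt.Println(string(buf)) // "ababcdeh"
}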