
Commit 6fb4b15

Revert "runtime: improve memmove for amd64"
This reverts commit 3607c5f. This was causing failures on amd64
machines without AVX.

Fixes #16939

Change-Id: I70080fbb4e7ae791857334f2bffd847d08cb25fa
Reviewed-on: https://go-review.googlesource.com/28274
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
1 parent cc0248a commit 6fb4b15

File tree

4 files changed: +1, -443 lines

src/runtime/cpuflags_amd64.go

Lines changed: 0 additions & 75 deletions
This file was deleted.

src/runtime/cpuidlow_amd64.s

Lines changed: 0 additions & 22 deletions
This file was deleted.
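
Taken together, these two deleted files implemented the startup CPU probe behind the runtime·useRepMovs flag tested in the memmove diff below: cpuidlow_amd64.s exposed the CPUID instruction, and cpuflags_amd64.go used it to decide whether the AVX path was safe. Outside the runtime, the same decision is normally made with a feature-detection library; the sketch below uses golang.org/x/sys/cpu and a deliberately simplified policy, so it illustrates the idea rather than reproducing the reverted code.

package main

import (
	"fmt"

	"golang.org/x/sys/cpu"
)

func main() {
	// Machines without AVX must keep the REP MOVS based copy; the
	// reverted change failed on exactly such machines (#16939), which
	// is the situation a gate like this has to handle.
	useRepMovs := !cpu.X86.HasAVX
	fmt.Printf("HasAVX=%v useRepMovs=%v\n", cpu.X86.HasAVX, useRepMovs)
}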

src/runtime/memmove_amd64.s

Lines changed: 1 addition & 242 deletions
@@ -64,9 +64,6 @@ tail:
 	JBE	move_129through256
 	// TODO: use branch table and BSR to make this just a single dispatch
 
-	TESTB	$1, runtime·useRepMovs(SB)
-	JZ	avxUnaligned
-
 	/*
 	 * check and set for backwards
 	 */
@@ -111,6 +108,7 @@ back:
 	ADDQ	BX, CX
 	CMPQ	CX, DI
 	JLS	forward
+
 	/*
 	 * whole thing backwards has
 	 * adjusted addresses
@@ -275,242 +273,3 @@ move_256through2048:
 	LEAQ	256(DI), DI
 	JGE	move_256through2048
 	JMP	tail
-
-avxUnaligned:
-	// There are two implementations of the move algorithm.
-	// The first one is for non-overlapping memory regions; it uses forward copying.
-	// The second one is for overlapping regions; it uses backward copying.
-	MOVQ	DI, CX
-	SUBQ	SI, CX
-	// Now CX contains the distance between SRC and DEST.
-	CMPQ	CX, BX
-	// If the distance is less than the region length, the regions overlap.
-	JC	copy_backward
-
-	// A non-temporal copy would be better for big sizes.
-	CMPQ	BX, $0x100000
-	JAE	gobble_big_data_fwd
-
-	// Memory layout on the source side
-	// SI                                       CX
-	// |<---------BX before correction--------->|
-	// |       |<--BX corrected-->|             |
-	// |       |                  |<--- AX  --->|
-	// |<-R11->|                  |<-128 bytes->|
-	// +----------------------------------------+
-	// | Head  | Body             | Tail        |
-	// +-------+------------------+-------------+
-	// ^       ^                  ^
-	// |       |                  |
-	// Save head into Y4          Save tail into X5..X12
-	//         |
-	//         SI+R11, where R11 = ((DI & -32) + 32) - DI
-	// Algorithm:
-	// 1. Unaligned save of the tail's 128 bytes
-	// 2. Unaligned save of the head's 32 bytes
-	// 3. Destination-aligned copying of body (128 bytes per iteration)
-	// 4. Put the head in its new place
-	// 5. Put the tail in its new place
-	// It can be important to satisfy the processor's pipeline requirements for
-	// small sizes, as the cost of copying the unaligned regions is
-	// comparable with the cost of the main loop, so the code is slightly tangled here.
-	// There is a cleaner implementation of this algorithm for bigger sizes,
-	// where the cost of copying the unaligned parts is negligible.
-	// You can see it after the gobble_big_data_fwd label.
-	LEAQ	(SI)(BX*1), CX
-	MOVQ	DI, R10
-	// CX points to the end of the buffer, so we need to go back slightly. We will use negative offsets there.
-	MOVOU	-0x80(CX), X5
-	MOVOU	-0x70(CX), X6
-	MOVQ	$0x80, AX
-	// Align the destination address.
-	ANDQ	$-32, DI
-	ADDQ	$32, DI
-	// Continue tail saving.
-	MOVOU	-0x60(CX), X7
-	MOVOU	-0x50(CX), X8
-	// Make R11 the delta between the aligned and unaligned destination addresses.
-	MOVQ	DI, R11
-	SUBQ	R10, R11
-	// Continue tail saving.
-	MOVOU	-0x40(CX), X9
-	MOVOU	-0x30(CX), X10
-	// Adjust the bytes-to-copy value, as we've prepared the unaligned part for copying.
-	SUBQ	R11, BX
-	// Continue tail saving.
-	MOVOU	-0x20(CX), X11
-	MOVOU	-0x10(CX), X12
-	// The tail will be put in its place after the main body copying.
-	// It's time for the unaligned heading part.
-	VMOVDQU	(SI), Y4
-	// Adjust the source address to point past the head.
-	ADDQ	R11, SI
-	SUBQ	AX, BX
-	// Aligned memory copying happens here.
-gobble_128_loop:
-	VMOVDQU	(SI), Y0
-	VMOVDQU	0x20(SI), Y1
-	VMOVDQU	0x40(SI), Y2
-	VMOVDQU	0x60(SI), Y3
-	ADDQ	AX, SI
-	VMOVDQA	Y0, (DI)
-	VMOVDQA	Y1, 0x20(DI)
-	VMOVDQA	Y2, 0x40(DI)
-	VMOVDQA	Y3, 0x60(DI)
-	ADDQ	AX, DI
-	SUBQ	AX, BX
-	JA	gobble_128_loop
-	// Now we can store the unaligned parts.
-	ADDQ	AX, BX
-	ADDQ	DI, BX
-	VMOVDQU	Y4, (R10)
-	VZEROUPPER
-	MOVOU	X5, -0x80(BX)
-	MOVOU	X6, -0x70(BX)
-	MOVOU	X7, -0x60(BX)
-	MOVOU	X8, -0x50(BX)
-	MOVOU	X9, -0x40(BX)
-	MOVOU	X10, -0x30(BX)
-	MOVOU	X11, -0x20(BX)
-	MOVOU	X12, -0x10(BX)
-	RET
-
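The deleted comments above describe the small-size forward path: an unsigned dst-src < len comparison (the SUBQ/CMPQ/JC sequence) detects overlap, and the copy saves the unaligned 128-byte tail and 32-byte head in registers, runs a destination-aligned 128-byte main loop, then writes the saved fringes back. The plain-Go sketch below mirrors that structure; slices stand in for raw pointers and the skip parameter stands in for R11, so it is an illustration, not the runtime's code.

// overlaps reports whether a forward copy would read bytes it has already
// written: with unsigned wraparound, dst-src < n holds exactly when the
// destination starts inside the source region (the JC after CMPQ CX, BX).
func overlaps(dst, src, n uintptr) bool {
	return dst-src < n
}

// forwardCopySketch assumes len(dst) == len(src) >= 256 and 0 <= skip <= 32,
// where skip models R11 = ((DI & -32) + 32) - DI, the distance from dst to
// the next 32-byte boundary.
func forwardCopySketch(dst, src []byte, skip int) {
	n := len(src)
	var tail [128]byte
	copy(tail[:], src[n-128:]) // 1. unaligned save of the tail's 128 bytes
	var head [32]byte
	copy(head[:], src[:32]) // 2. unaligned save of the head's 32 bytes
	// 3. body copy; in the assembly dst+skip is 32-byte aligned, which is
	// what lets gobble_128_loop use the aligned VMOVDQA stores.
	off := skip
	for off+128 <= n-128 {
		copy(dst[off:off+128], src[off:off+128])
		off += 128
	}
	copy(dst[off:n-128], src[off:n-128]) // leftover the asm folds into the tail
	copy(dst[:32], head[:])    // 4. put the head in its place
	copy(dst[n-128:], tail[:]) // 5. put the tail in its place
}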
-gobble_big_data_fwd:
-	// This is forward copying for big regions.
-	// It uses non-temporal mov instructions.
-	// Details of this algorithm are commented previously for small sizes.
-	LEAQ	(SI)(BX*1), CX
-	MOVOU	-0x80(SI)(BX*1), X5
-	MOVOU	-0x70(CX), X6
-	MOVOU	-0x60(CX), X7
-	MOVOU	-0x50(CX), X8
-	MOVOU	-0x40(CX), X9
-	MOVOU	-0x30(CX), X10
-	MOVOU	-0x20(CX), X11
-	MOVOU	-0x10(CX), X12
-	VMOVDQU	(SI), Y4
-	MOVQ	DI, R8
-	ANDQ	$-32, DI
-	ADDQ	$32, DI
-	MOVQ	DI, R10
-	SUBQ	R8, R10
-	SUBQ	R10, BX
-	ADDQ	R10, SI
-	LEAQ	(DI)(BX*1), CX
-	SUBQ	$0x80, BX
-gobble_mem_fwd_loop:
-	PREFETCHNTA 0x1C0(SI)
-	PREFETCHNTA 0x280(SI)
-	// Prefetch values were chosen empirically.
-	// Approach for prefetch usage as in 7.6.6 of [1]
-	// [1] 64-ia-32-architectures-optimization-manual.pdf
-	// http://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
-	VMOVDQU	(SI), Y0
-	VMOVDQU	0x20(SI), Y1
-	VMOVDQU	0x40(SI), Y2
-	VMOVDQU	0x60(SI), Y3
-	ADDQ	$0x80, SI
-	VMOVNTDQ	Y0, (DI)
-	VMOVNTDQ	Y1, 0x20(DI)
-	VMOVNTDQ	Y2, 0x40(DI)
-	VMOVNTDQ	Y3, 0x60(DI)
-	ADDQ	$0x80, DI
-	SUBQ	$0x80, BX
-	JA	gobble_mem_fwd_loop
-	// NT instructions don't follow the normal cache-coherency rules.
-	// We need an SFENCE here to make the copied data visible in a timely fashion.
-	SFENCE
-	VMOVDQU	Y4, (R8)
-	VZEROUPPER
-	MOVOU	X5, -0x80(CX)
-	MOVOU	X6, -0x70(CX)
-	MOVOU	X7, -0x60(CX)
-	MOVOU	X8, -0x50(CX)
-	MOVOU	X9, -0x40(CX)
-	MOVOU	X10, -0x30(CX)
-	MOVOU	X11, -0x20(CX)
-	MOVOU	X12, -0x10(CX)
-	RET
-
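Both big-copy loops sit behind the same $0x100000 (1 MiB) threshold: above it a cached copy would mostly evict useful data, so the code switches to VMOVNTDQ non-temporal stores plus PREFETCHNTA, and the SFENCE at the end orders those weakly-ordered stores before the copy is treated as complete. Go cannot express NT stores without assembly, so the sketch below only mirrors the size dispatch; the constant comes from the code above, the names are mine.

// nonTemporalThreshold mirrors the CMPQ BX, $0x100000 cutoff.
const nonTemporalThreshold = 1 << 20 // 1 MiB

// useNonTemporal reports whether an n-byte forward copy would take the
// gobble_big_data_fwd path (JAE, i.e. >=) in the deleted code.
func useNonTemporal(n int) bool {
	return n >= nonTemporalThreshold
}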
-copy_backward:
-	MOVQ	DI, AX
-	// Backward copying is about the same as the forward one.
-	// First we load the unaligned tail at the beginning of the region.
-	MOVOU	(SI), X5
-	MOVOU	0x10(SI), X6
-	ADDQ	BX, DI
-	MOVOU	0x20(SI), X7
-	MOVOU	0x30(SI), X8
-	LEAQ	-0x20(DI), R10
-	MOVQ	DI, R11
-	MOVOU	0x40(SI), X9
-	MOVOU	0x50(SI), X10
-	ANDQ	$0x1F, R11
-	MOVOU	0x60(SI), X11
-	MOVOU	0x70(SI), X12
-	XORQ	R11, DI
-	// Let's point SI to the end of the region
-	ADDQ	BX, SI
-	// and load the unaligned head into Y4.
-	VMOVDQU	-0x20(SI), Y4
-	SUBQ	R11, SI
-	SUBQ	R11, BX
-	// If there is enough data for non-temporal moves, go to the special loop.
-	CMPQ	BX, $0x100000
-	JA	gobble_big_data_bwd
-	SUBQ	$0x80, BX
-gobble_mem_bwd_loop:
-	VMOVDQU	-0x20(SI), Y0
-	VMOVDQU	-0x40(SI), Y1
-	VMOVDQU	-0x60(SI), Y2
-	VMOVDQU	-0x80(SI), Y3
-	SUBQ	$0x80, SI
-	VMOVDQA	Y0, -0x20(DI)
-	VMOVDQA	Y1, -0x40(DI)
-	VMOVDQA	Y2, -0x60(DI)
-	VMOVDQA	Y3, -0x80(DI)
-	SUBQ	$0x80, DI
-	SUBQ	$0x80, BX
-	JA	gobble_mem_bwd_loop
-	// Let's store the unaligned data.
-	VMOVDQU	Y4, (R10)
-	VZEROUPPER
-	MOVOU	X5, (AX)
-	MOVOU	X6, 0x10(AX)
-	MOVOU	X7, 0x20(AX)
-	MOVOU	X8, 0x30(AX)
-	MOVOU	X9, 0x40(AX)
-	MOVOU	X10, 0x50(AX)
-	MOVOU	X11, 0x60(AX)
-	MOVOU	X12, 0x70(AX)
-	RET
-
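copy_backward is the mirror image of the forward path: the 128 unaligned bytes at the start of the region and the 32 bytes at its end are saved up front, the destination end pointer is rounded down to a 32-byte boundary (the ANDQ $0x1F / XORQ pair), and the body is walked from high addresses to low so an overlapping dst > src copy never reads bytes it has already overwritten. In the same illustrative slice form as before, with skip modeling R11 = end & 31:

// backwardCopySketch assumes len(dst) == len(src) >= 256 and 0 <= skip <= 32.
func backwardCopySketch(dst, src []byte, skip int) {
	n := len(src)
	var start [128]byte
	copy(start[:], src[:128]) // X5..X12: unaligned save at the region start
	var end [32]byte
	copy(end[:], src[n-32:]) // Y4: unaligned save at the region end
	// The body runs downward in 128-byte blocks; in the assembly the
	// store address stays 32-byte aligned, allowing VMOVDQA.
	off := n - skip
	for off-128 >= 128 {
		copy(dst[off-128:off], src[off-128:off])
		off -= 128
	}
	copy(dst[128:off], src[128:off]) // leftover above the saved start block
	copy(dst[n-32:], end[:])
	copy(dst[:128], start[:])
}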
-gobble_big_data_bwd:
-	SUBQ	$0x80, BX
-gobble_big_mem_bwd_loop:
-	PREFETCHNTA -0x1C0(SI)
-	PREFETCHNTA -0x280(SI)
-	VMOVDQU	-0x20(SI), Y0
-	VMOVDQU	-0x40(SI), Y1
-	VMOVDQU	-0x60(SI), Y2
-	VMOVDQU	-0x80(SI), Y3
-	SUBQ	$0x80, SI
-	VMOVNTDQ	Y0, -0x20(DI)
-	VMOVNTDQ	Y1, -0x40(DI)
-	VMOVNTDQ	Y2, -0x60(DI)
-	VMOVNTDQ	Y3, -0x80(DI)
-	SUBQ	$0x80, DI
-	SUBQ	$0x80, BX
-	JA	gobble_big_mem_bwd_loop
-	SFENCE
-	VMOVDQU	Y4, (R10)
-	VZEROUPPER
-	MOVOU	X5, (AX)
-	MOVOU	X6, 0x10(AX)
-	MOVOU	X7, 0x20(AX)
-	MOVOU	X8, 0x30(AX)
-	MOVOU	X9, 0x40(AX)
-	MOVOU	X10, 0x50(AX)
-	MOVOU	X11, 0x60(AX)
-	MOVOU	X12, 0x70(AX)
-	RET
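
As a quick check that the two sketches above implement a correct move, one can compare them against the built-in copy for a range of sizes and skips. This is a hypothetical harness under the same assumptions as the sketches, and it expects both sketch functions to be compiled into the same package:

package main

import (
	"bytes"
	"fmt"
)

func main() {
	for _, n := range []int{256, 257, 300, 1024, 4096} {
		for skip := 0; skip <= 32; skip++ {
			src := make([]byte, n)
			for i := range src {
				src[i] = byte(i * 7)
			}
			want := make([]byte, n)
			copy(want, src)

			got := make([]byte, n)
			forwardCopySketch(got, src, skip)
			if !bytes.Equal(got, want) {
				fmt.Println("forward mismatch at", n, skip)
			}
			got = make([]byte, n)
			backwardCopySketch(got, src, skip)
			if !bytes.Equal(got, want) {
				fmt.Println("backward mismatch at", n, skip)
			}
		}
	}
	fmt.Println("done")
}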
