memcpy alignment

#include <stdio.h>
#include <stdlib.h>
#include <memory.h> // memcpy
#include <time.h>

#define N 787654321
typedef unsigned int  word; /* 假定 4bytes */
typedef unsigned char byte;

void cpy1(void *target, void *source, size_t size)
{
    memcpy(target, source, size);
}

void cpy2(void* target, void* source, size_t size)
{
    register byte* Dest = (byte*)target;
    register byte* Sour = (byte*)source;
    register byte* End = Sour + size;
    while( Sour!=End ) 
        *Dest++ = *Sour++;
}

void cpy3(void* target, void* source, size_t size)
{
    register word *Wsrc_s  = (word*)source;
    register word *Wdes_s  = (word*)target; 
    register byte *Bsrc_s;
    register byte *Bdes_s;
    // 處理 4 bytes 
    while(size>4){
        *Wdes_s++ = *Wsrc_s++;
        size-=4;
    }
    // 處理 < 4 bytes
    Bsrc_s = (byte*)(Wsrc_s);
    Bdes_s = (byte*)(Wdes_s);
    while(size) {
        *Bdes_s++ = *Bsrc_s++;
        --size;
    }
}

// 
typedef struct tagT256{ byte trunk[256];}T256;
void cpy4(void* target, void* source, size_t size)
{
    register T256 *Tsrc_s  = (T256*)source;
    register T256 *Tdes_s  = (T256*)target;
    register word *Wsrc_s;
    register word *Wdes_s;
    register byte *Bsrc_s;
    register byte *Bdes_s;
    // 處理 256 bytes 
    while(size>256){
        *Tdes_s++ = *Tsrc_s++;
        size-=256;
    }
    // 處理 4 bytes
    Wsrc_s = (word*)(Tsrc_s);
    Wdes_s = (word*)(Tdes_s);
    while(size>4){
        *Wdes_s++ = *Wsrc_s++;
        size-=4;
    }
    // 處理 < 4 bytes
    Bsrc_s = (byte*)(Wsrc_s);
    Bdes_s = (byte*)(Wdes_s);
    while(size) {
        *Bdes_s++ = *Bsrc_s++;
        --size;
    }
}
// 
typedef struct tagT1024{    byte trunk[1024];}T1024;
void cpy5(void* target, void* source, size_t size)
{
    register T1024 *TTsrc_s = (T1024*)source;
    register T1024 *TTdes_s = (T1024*)target;
    register T256 *Tsrc_s;
    register T256 *Tdes_s;
    register word *Wsrc_s;
    register word *Wdes_s;
    register byte *Bsrc_s;
    register byte *Bdes_s;
    // 處理 1024 bytes
    while(size>1024){
        *TTdes_s++ = *TTsrc_s++;
        size-=1024;
    }
    // 處理 256 bytes 
    Tsrc_s = (T256*)(TTsrc_s);
    Tdes_s = (T256*)(TTdes_s);
    while(size>256){
        *Tdes_s++ = *Tsrc_s++;
        size-=256;
    }
    // 處理 4 bytes
    Wsrc_s = (word*)(Tsrc_s);
    Wdes_s = (word*)(Tdes_s);
    while(size>4){
        *Wdes_s++ = *Wsrc_s++;
        size-=4;
    }
    // 處理 < 4 bytes
    Bsrc_s = (byte*)(Wsrc_s);
    Bdes_s = (byte*)(Wdes_s);
    while(size) {
        *Bdes_s++ = *Bsrc_s++;
        --size;
    }
}

void cpy6(void* target, void* source, size_t size)
{
    register byte* to=(byte*)target;
    register byte* from=(byte*)source;
    register size_t n=(size+7)>>3;
    switch (n&7){
        case 0: do {    *to++ = *from++;
        case 7:         *to++ = *from++;
        case 6:         *to++ = *from++;
        case 5:         *to++ = *from++;
        case 4:         *to++ = *from++;
        case 3:         *to++ = *from++;
        case 2:         *to++ = *from++;
        case 1:         *to++ = *from++;
                } while(--n > 0);
    }               
}

void cpy7(void* target, void* source, size_t size) 
{
    register byte* to=(byte*)target;
    register byte* from=(byte*)source;
    register size_t n=size;
    while(n--)
        *to++=*from++;
}

int main()
{
    int i;
    clock_t t;
    const size_t size = sizeof(byte)*N;
    byte* arr1 = (byte*)malloc(size);
    byte* arr2 = (byte*)malloc(size);
    // setter
    for(i=0; i<N; ++i) arr1[i] =i;

    // memcpy test
    memset(arr2,-1,size);
    t = clock(), cpy1(arr2, arr1, size);
    printf("cpy1 : %-5ld  ", clock()-t);
    puts((!memcmp(arr1, arr2, size))? "arr2=arr1" : "arr2!=arr1");

    memset(arr2,-1,size);
    t = clock(), cpy2(arr2, arr1, size);
    printf("cpy2 : %-5ld  ", clock()-t);
    puts((!memcmp(arr1, arr2, size))? "arr2=arr1" : "arr2!=arr1");

    memset(arr2,-1,size);
    t = clock(), cpy3(arr2, arr1, size);
    printf("cpy3 : %-5ld  ", clock()-t);
    puts((!memcmp(arr1, arr2, size))? "arr2=arr1" : "arr2!=arr1");

    memset(arr2,-1,size);
    t = clock(), cpy4(arr2, arr1, size);
    printf("cpy4 : %-5ld  ", clock()-t);
    puts((!memcmp(arr1, arr2, size))? "arr2=arr1" : "arr2!=arr1");

    memset(arr2,-1,size);
    t = clock(), cpy5(arr2, arr1, size);
    printf("cpy5 : %-5ld  ", clock()-t);
    puts((!memcmp(arr1, arr2, size))? "arr2=arr1" : "arr2!=arr1");

    memset(arr2,-1,size);
    t = clock(), cpy6(arr2, arr1, size);
    printf("cpy6 : %-5ld  ", clock()-t);
    puts((!memcmp(arr1, arr2, size))? "arr2=arr1" : "arr2!=arr1");

    memset(arr2,-1,size);
    t = clock(), cpy7(arr2, arr1, size);
    printf("cpy7 : %-5ld  ", clock()-t);
    puts((!memcmp(arr1, arr2, size))? "arr2=arr1" : "arr2!=arr1");

    // release memory
    free(arr1), free(arr2);
    getchar();
    return 0;
}

這比memcpy快 可以快個三到四倍,調用方法與memcpy一樣

不做restricted保護

sse 128-bit暫存器

struct 其實compiler都還是用32 bit暫存器copy 同樣copy 16個byte用128 bit站存器就是隻要1個指令

#include <emmintrin.h> /*SSE2*/

#define DIV16(VAL)              ((VAL)>>4)
#define MUL16(VAL)              ((VAL)<<4)

#define MEMCPY(PDST, PSRC, SIZE) sse2_memcpy( (PDST), (PSRC), (SIZE))


void sse2_memcpy(void* pDst, void* pSrc, size_t size)
{
    if (pDst == pSrc) {
        return ;
    }

    unsigned int i;
    unsigned int nLoops;

    char* pcDst, *pcSrc;
    __m128i* pMovSrc, *pMovDst;


    pMovDst = (__m128i*)(pDst);
    pMovSrc = (__m128i*)(pSrc);

    nLoops = DIV16((unsigned int)size);

    __m128i _miTemp;

    for (i = 0; i < nLoops; i++) {
        _miTemp = _mm_loadu_si128(pMovSrc);
        _mm_storeu_si128(pMovDst, _miTemp);
        pMovDst++;
        pMovSrc++;
    }


    pcDst = (char*)(pMovDst);
    pcSrc = (char*)(pMovSrc);

    for (i = MUL16(nLoops); i < size; i++) {
        *pcDst++ = *pcSrc++;
    }

}/*sse_memcpy*/