typedef enum { PRE_READ, // prefetch assuming that buffer is used for reading only PRE_WRITE, // prefetch assuming that buffer is used for writing only PRE_READ_WRITE // prefetch assuming that buffer is used for both reading and writing } e_prefetch; void Q_Prefetch (const void *s, const unsigned int bytes, e_prefetch type) { // write buffer prefetching is performed only if // the processor benefits from it. Read and read/write // prefetching is always performed. switch (type) { case PRE_WRITE : break; case PRE_READ: case PRE_READ_WRITE: __asm { mov ebx,s mov ecx,bytes cmp ecx,4096 // clamp to 4kB jle skipClamp mov ecx,4096 skipClamp: add ecx,0x1f shr ecx,5 // number of cache lines jz skip jmp loopie align 16 loopie: test byte ptr [ebx],al add ebx,32 dec ecx jnz loopie skip: } break; } } // optimized memory copy routine that handles all alignment // cases and block sizes efficiently void Q_memcpy (void *dest, const void *src, const size_t count) { Q_Prefetch (src, count, PRE_READ); __asm { push edi push esi mov ecx,count cmp ecx,0 // count = 0 check (just to be on the safe side) je outta mov edx,dest mov ebx,src cmp ecx,32 // padding only? jl padding mov edi,ecx and edi,~31 // edi = count&~31 sub edi,32 align 16 loopMisAligned: mov eax,[ebx + edi + 0 + 0*8] mov esi,[ebx + edi + 4 + 0*8] mov [edx+edi+0 + 0*8],eax mov [edx+edi+4 + 0*8],esi mov eax,[ebx + edi + 0 + 1*8] mov esi,[ebx + edi + 4 + 1*8] mov [edx+edi+0 + 1*8],eax mov [edx+edi+4 + 1*8],esi mov eax,[ebx + edi + 0 + 2*8] mov esi,[ebx + edi + 4 + 2*8] mov [edx+edi+0 + 2*8],eax mov [edx+edi+4 + 2*8],esi mov eax,[ebx + edi + 0 + 3*8] mov esi,[ebx + edi + 4 + 3*8] mov [edx+edi+0 + 3*8],eax mov [edx+edi+4 + 3*8],esi sub edi,32 jge loopMisAligned mov edi,ecx and edi,~31 add ebx,edi // increase src pointer add edx,edi // increase dst pointer and ecx,31 // new count jz outta // if count = 0, get outta here padding: cmp ecx,16 jl skip16 mov eax,dword ptr [ebx] mov dword ptr [edx],eax mov eax,dword ptr [ebx+4] mov dword ptr [edx+4],eax mov eax,dword ptr [ebx+8] mov dword ptr [edx+8],eax mov eax,dword ptr [ebx+12] mov dword ptr [edx+12],eax sub ecx,16 add ebx,16 add edx,16 skip16: cmp ecx,8 jl skip8 mov eax,dword ptr [ebx] mov dword ptr [edx],eax mov eax,dword ptr [ebx+4] sub ecx,8 mov dword ptr [edx+4],eax add ebx,8 add edx,8 skip8: cmp ecx,4 jl skip4 mov eax,dword ptr [ebx] // here 4-7 bytes add ebx,4 sub ecx,4 mov dword ptr [edx],eax add edx,4 skip4: // 0-3 remaining bytes cmp ecx,2 jl skip2 mov ax,word ptr [ebx] // two bytes cmp ecx,3 // less than 3? mov word ptr [edx],ax jl outta mov al,byte ptr [ebx+2] // last byte mov byte ptr [edx+2],al jmp outta skip2: cmp ecx,1 jl outta mov al,byte ptr [ebx] mov byte ptr [edx],al outta: pop esi pop edi } } void *Q_memset(void* dest0, int val, size_t count0) { union { byte bytes[8]; unsigned short words[4]; unsigned int dwords[2]; } dat; byte *dest = (byte *)dest0; int count = count0; while( count > 0 && (((int)dest) & 7) ) { *dest = val; dest++; count--; } if ( !count ) { return dest0; } dat.bytes[0] = val; dat.bytes[1] = val; dat.words[1] = dat.words[0]; dat.dwords[1] = dat.dwords[0]; if ( count >= 64 ) { __asm { mov edi, dest mov ecx, count shr ecx, 6 // 64 bytes per iteration movq mm1, dat // Read in source data movq mm2, mm1 movq mm3, mm1 movq mm4, mm1 movq mm5, mm1 movq mm6, mm1 movq mm7, mm1 movq mm0, mm1 loop1: movntq 0[edi], mm1 // Non-temporal stores movntq 8[edi], mm2 movntq 16[edi], mm3 movntq 24[edi], mm4 movntq 32[edi], mm5 movntq 40[edi], mm6 movntq 48[edi], mm7 movntq 56[edi], mm0 add edi, 64 dec ecx jnz loop1 } dest += ( count & ~63 ); count &= 63; } if ( count >= 8 ) { __asm { mov edi, dest mov ecx, count shr ecx, 3 // 8 bytes per iteration movq mm1, dat // Read in source data loop2: movntq 0[edi], mm1 // Non-temporal stores add edi, 8 dec ecx jnz loop2 } dest += (count & ~7); count &= 7; } while( count > 0 ) { *dest = val; dest++; count--; } __asm emms return dest0; }