#ifndef SCRYPT
#define SCRYPT 1

/*
 * halfsalsa - permute four uint4 vectors in place.
 *
 * w: array of four uint4 values, overwritten with the result.
 *
 * Runs four passes of an eight-step add / rotate-left / xor sequence using
 * the rotation amounts 7, 9, 13 and 18. The first four steps of a pass
 * combine the vectors lane-for-lane; the last four combine lane-rotated
 * views (.wxyz and .zwxy) of them. The input is not added back to the
 * result here; salsa() performs that addition.
 *
 * NOTE(review): the rotation constants and pass count look like the
 * Salsa20/8 core on a diagonal-ordered state - confirm against the host
 * side layout.
 */
void halfsalsa(uint4 *w)
{
	for (uint pass = 4; pass != 0; --pass) {
		/* Steps 1-4: operands are used without lane reordering. */
		w[0] ^= rotl(w[3] + w[2], 7U);
		w[1] ^= rotl(w[0] + w[3], 9U);
		w[2] ^= rotl(w[1] + w[0], 13U);
		w[3] ^= rotl(w[2] + w[1], 18U);

		/* Steps 5-8: operands are read through lane-rotated swizzles. */
		w[2] ^= rotl(w[3].wxyz + w[0].zwxy, 7U);
		w[1] ^= rotl(w[2].wxyz + w[3].zwxy, 9U);
		w[0] ^= rotl(w[1].wxyz + w[2].zwxy, 13U);
		w[3] ^= rotl(w[0].wxyz + w[1].zwxy, 18U);
	}
}

/*
 * salsa - mix the eight-vector block B in place.
 *
 * B:  eight uint4 values, treated as a lower half B[0..3] and an upper
 *     half B[4..7]; both halves are overwritten.
 * db: only present when LOOKUP_GAP == 2; when true the two-half mix is
 *     performed a second time before the final addition.
 *
 * Each half is xor-ed with the other, permuted by halfsalsa(), and the
 * permuted value is added back onto the half it was derived from. The
 * addition for the upper half is deferred to the last loop so that the
 * optional second pass can fold it into its first step.
 */
#if (LOOKUP_GAP == 2)
void salsa(uint4 *B, bool db)
#else
void salsa(uint4 *B)
#endif
{
	uint4 w[4];
	uint k;

	/* Lower half: B[lo] ^= B[hi], then permute a copy of it. */
	for (k = 0; k < 4; ++k) {
		B[k] ^= B[k + 4];
		w[k] = B[k];
	}
	halfsalsa(w);

	/* Finish the lower half (add permuted copy), then start the upper. */
	for (k = 0; k < 4; ++k) {
		B[k] += w[k];
		B[k + 4] ^= B[k];
		w[k] = B[k + 4];
	}
	halfsalsa(w);

#if (LOOKUP_GAP == 2)
	if (db) {
		/* Finish the upper half and immediately begin a second mix. */
		for (k = 0; k < 4; ++k) {
			B[k + 4] += w[k];
			B[k] ^= B[k + 4];
			w[k] = B[k];
		}
		halfsalsa(w);

		for (k = 0; k < 4; ++k) {
			B[k] += w[k];
			B[k + 4] ^= B[k];
			w[k] = B[k + 4];
		}
		halfsalsa(w);
	}
#endif

	/* Deferred addition that completes the upper half. */
	for (k = 0; k < 4; ++k)
		B[k + 4] += w[k];
}

// NOTE(review): scrypt_core continues past this point in the file; its
// visible head is kept token-for-token, including the block comment that
// is opened at the end and closed further down.
//void scrypt_core(uint4 *X, __global uint4 *restrict lookup){
void scrypt_core(uint *XA, uint *XB, uint *XC, uint *XD, uint *XE, uint *XF, uint *XG, uint *XH, __global uint4 *restrict lookup){
	DEFNFACTOR(nfact)
	const uint zSIZE = 8;
	//const uint ySIZE = (nfact/LOOKUP_GAP+(nfact%LOOKUP_GAP>0));
	const uint xSIZE = CONCURRENT_THREADS;
	uint x = get_global_id(0)%xSIZE;
	//uint4 tmp[4];
	uint4 X[8];
	uint4 tmpa = (uint4)(XB[0], XC[1], XD[2], XA[3]);
	uint4 tmpb = (uint4)(XC[0], XD[1], XA[2], XB[3]);
	uint4 tmpc = (uint4)(XD[0], XA[1], XB[2], XC[3]);
	uint4 tmpd = (uint4)(XA[0], XB[1], XC[2], XD[3]);
	X[0] = EndianSwapa(tmpa);
	X[1] = EndianSwapb(tmpb);
	X[2] = EndianSwapb(tmpc);
	X[3] = EndianSwapb(tmpd);
	tmpa = (uint4)(XF[0], XG[1], XH[2], XE[3]);
	tmpb = (uint4)(XG[0], XH[1], XE[2], XF[3]);
	tmpc = (uint4)(XH[0], XE[1], XF[2], XG[3]);
	tmpd = (uint4)(XE[0], XF[1], XG[2], XH[3]);
	X[4] = EndianSwapa(tmpa);
	X[5] = EndianSwapb(tmpb);
	X[6] = EndianSwapb(tmpc);
	X[7] = EndianSwapb(tmpd);
	/*
	uint4 tmpa = (uint4)(XB[0], XC[1], XD[2], XA[3]);
	uint4 tmpb = (uint4)(XC[0], XD[1], XA[2], XB[3]);
	uint4 tmpc = (uint4)(XD[0], XA[1], XB[2], XC[3]);
	uint4 tmpd = (uint4)(XA[0], XB[1], XC[2], XD[3]);
tmpa = EndianSwapa(tmpa); XB[0] = tmpa.x; XC[1] = tmpa.y; XD[2] = tmpa.z; XA[3] = tmpa.w; tmpb = EndianSwapb(tmpb); XC[0] = tmpb.x; XD[1] = tmpb.y; XA[2] = tmpb.z; XB[3] = tmpb.w; tmpc = EndianSwapb(tmpc); XD[0] = tmpc.x; XA[1] = tmpc.y; XB[2] = tmpc.z; XC[3] = tmpc.w; tmpd = EndianSwapb(tmpd); XA[0] = tmpd.x; XB[1] = tmpd.y; XC[2] = tmpd.z; XD[3] = tmpd.w; tmpa = (uint4)(XF[0], XG[1], XH[2], XE[3]); tmpb = (uint4)(XG[0], XH[1], XE[2], XF[3]); tmpc = (uint4)(XH[0], XE[1], XF[2], XG[3]); tmpd = (uint4)(XE[0], XF[1], XG[2], XH[3]); tmpa = EndianSwapa(tmpa); XF[0] = tmpa.x; XG[1] = tmpa.y; XH[2] = tmpa.z; XE[3] = tmpa.w; tmpb = EndianSwapb(tmpb); XG[0] = tmpb.x; XH[1] = tmpb.y; XE[2] = tmpb.z; XF[3] = tmpb.w; tmpc = EndianSwapb(tmpc); XH[0] = tmpc.x; XE[1] = tmpc.y; XF[2] = tmpc.z; XG[3] = tmpc.w; tmpd = EndianSwapb(tmpd); XE[0] = tmpd.x; XF[1] = tmpd.y; XG[2] = tmpd.z; XH[3] = tmpd.w; */ // tmp[0] = (uint4)(X[1].x,X[2].y,X[3].z,X[0].w); // tmp[1] = (uint4)(X[2].x,X[3].y,X[0].z,X[1].w); // tmp[2] = (uint4)(X[3].x,X[0].y,X[1].z,X[2].w); // tmp[3] = (uint4)(X[0].x,X[1].y,X[2].z,X[3].w); // X[0] = EndianSwapa(tmp[0]); // X[1] = EndianSwapb(tmp[1]); // X[2] = EndianSwapb(tmp[2]); // X[3] = EndianSwapb(tmp[3]); // tmp[0] = (uint4)(X[5].x,X[6].y,X[7].z,X[4].w); // tmp[1] = (uint4)(X[6].x,X[7].y,X[4].z,X[5].w); // tmp[2] = (uint4)(X[7].x,X[4].y,X[5].z,X[6].w); // tmp[3] = (uint4)(X[4].x,X[5].y,X[6].z,X[7].w); // X[4] = EndianSwapa(tmp[0]); // X[5] = EndianSwapb(tmp[1]); // X[6] = EndianSwapb(tmp[2]); // X[7] = EndianSwapb(tmp[3]); for(uint y=0; y<(nfact/LOOKUP_GAP); ++y){ for(uint z=0; z>1); #elif (LOOKUP_GAP == 4) uint y = (j>>2); #elif (LOOKUP_GAP == 8) uint y = (j>>3); #else uint y = (j/LOOKUP_GAP); #endif #if (LOOKUP_GAP != 2) && (LOOKUP_GAP != 1) uint4 V[8]; for(uint z=0; z