/*-
 * Copyright 2009 Colin Percival, 2011 ArtForz, 2011 pooler, 2012 mtrlt,
 * 2012-2013 Con Kolivas.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * This file was originally written by Colin Percival as part of the Tarsnap
 * online backup system.
 *
 * V1.6 modified by sterling pickens linuxsociety.org 2014
 */

#ifndef NFACTOR
#define NFACTOR 10
#endif

#if NFACTOR == 1
#  define __NF__ 2u
#elif NFACTOR == 2
# define __NF__ 4u
#elif NFACTOR == 3
#  define __NF__ 8u
#elif NFACTOR == 4
#  define __NF__ 16u
#elif NFACTOR == 5
#  define __NF__ 32u
#elif NFACTOR == 6
#  define __NF__ 64u
#elif NFACTOR == 7
#  define __NF__ 128u
#elif NFACTOR == 8
#  define __NF__ 256u
#elif NFACTOR == 9
#  define __NF__ 512u
#elif NFACTOR == 10
#  define __NF__ 1024u
#elif NFACTOR == 11
#  define __NF__ 2048u
#elif NFACTOR == 12
#  define __NF__ 4096u
#elif NFACTOR == 13
#  define __NF__ 8192u
#elif NFACTOR == 14
#  define __NF__ 16384u
#elif NFACTOR == 15
#  define __NF__ 32768u
#elif NFACTOR == 16
#  define __NF__ 65536u
#elif NFACTOR == 17
#  define __NF__ 131072u
#elif NFACTOR == 18
#  define __NF__ 262144u
#elif NFACTOR == 19
#  define __NF__ 524288u
#elif NFACTOR == 20
#  define __NF__ 1048576u
#else
# define __NF__ 1024u
#endif

#define E0 0x00FF00FFU
#define E1 0xFF00FF00U

#define SK00 0x80000000U
#define SK01 0x00000280U
#define SK02 0x5C5C5C5CU
#define SK03 0x36363636U
#define SK04 0x000004a0U
#define SK05 0x00000300U

#define ZERO   0x0U
#define ONE    0x1U
#define TWO    0x2U
#define THREE  0x3U

__constant uint16 Kc[4] = {(uint16)(0x428a2f98U, 0x71374491U, 0xb5c0fbcfU, 0xe9b5dba5U, 0x3956c25bU, 0x59f111f1U, 0x923f82a4U, 0xab1c5ed5U,
									0xd807aa98U, 0x12835b01U, 0x243185beU, 0x550c7dc3U, 0x72be5d74U, 0x80deb1feU, 0x9bdc06a7U, 0xc19bf174U),
							(uint16)(0xe49b69c1U, 0xefbe4786U, 0x0fc19dc6U, 0x240ca1ccU, 0x2de92c6fU, 0x4a7484aaU, 0x5cb0a9dcU, 0x76f988daU,
									0x983e5152U, 0xa831c66dU, 0xb00327c8U, 0xbf597fc7U, 0xc6e00bf3U, 0xd5a79147U, 0x06ca6351U, 0x14292967U),
							(uint16)(0x27b70a85U, 0x2e1b2138U, 0x4d2c6dfcU, 0x53380d13U, 0x650a7354U, 0x766a0abbU, 0x81c2c92eU, 0x92722c85U,
									0xa2bfe8a1U, 0xa81a664bU, 0xc24b8b70U, 0xc76c51a3U, 0xd192e819U, 0xd6990624U, 0xf40e3585U, 0x106aa070U),
							(uint16)(0x19a4c116U, 0x1e376c08U, 0x2748774cU, 0x34b0bcb5U, 0x391c0cb3U, 0x4ed8aa4aU, 0x5b9cca4fU, 0x682e6ff3U,
									0x748f82eeU, 0x78a5636fU, 0x84c87814U, 0x8cc70208U, 0x90befffaU, 0xa4506cebU, 0xbef9a3f7U, 0xc67178f2U)};

typedef struct INPUT_t{
	uint8 a;
	uint8 b;
	uint c;
	uint d;
	uint e;
	uint f;
}input_t;

#define rotl(x,y) rotate(x,y)
#define Ch(x,y,z) bitselect(z,y,x)
#define Maj(x,y,z) Ch((x^z),y,z)

#define EndianSwapa(n) (Ch(E0, rotl(n, 8U), rotl(n, 24U)))
#define EndianSwapb(n) (rotl(n & E0, 24U)|rotl(n & E1, 8U))

#define Tr2(x)		(rotl(x, 30U) ^ rotl(x, 19U) ^ rotl(x, 10U))
#define Tr1(x)		(rotl(x, 26U) ^ rotl(x, 21U) ^ rotl(x, 7U))

#define Wr2(x)		(rotl(x, 25U) ^ rotl(x, 14U) ^ (x>>3U))
#define Wr1(x)		(rotl(x, 15U) ^ rotl(x, 13U) ^ (x>>10U))

#define RND(a, b, c, d, e, f, g, h, k)  \
	h += Tr1(e);       \
	h += Ch(e, f, g);  \
	h += k;            \
	d += h;            \
	h += Tr2(a);       \
	h += Maj(a, b, c);

//# define CO_0 ((y*CONCURRENT_THREADS+x)<<1)
//# define CO_1 ((y*CONCURRENT_THREADS+x)<<1)+1
# define CO_0 (idx=(y*CONCURRENT_THREADS+x)<<1)
# define CO_1 idx+1
# define CO_W0 (idx=(y+x)<<1)
# define CO_W1 idx+1

#define FOUND (0xFF)
#define SETFOUND(Xnonce) output[output[FOUND]++] = Xnonce

void SHA256(uint8 *digest, uint8 *block0, uint8 *block1){
	uint A = (*digest).s0;
	uint B = (*digest).s1;
	uint C = (*digest).s2;
	uint D = (*digest).s3;
	uint E = (*digest).s4;
	uint F = (*digest).s5;
	uint G = (*digest).s6;
	uint H = (*digest).s7;

	uint B00 = (*block0).s0;
	uint B01 = (*block0).s1;
	uint B02 = (*block0).s2;
	uint B03 = (*block0).s3;
	uint B04 = (*block0).s4;
	uint B05 = (*block0).s5;
	uint B06 = (*block0).s6;
	uint B07 = (*block0).s7;

	uint B08 = (*block1).s0;
	uint B09 = (*block1).s1;
	uint B10 = (*block1).s2;
	uint B11 = (*block1).s3;
	uint B12 = (*block1).s4;
	uint B13 = (*block1).s5;
	uint B14 = (*block1).s6;
	uint B15 = (*block1).s7;

	for(uint i=0; i<4; i++){
		RND(A,B,C,D,E,F,G,H, B00 + Kc[i].s0);
		RND(H,A,B,C,D,E,F,G, B01 + Kc[i].s1);
		RND(G,H,A,B,C,D,E,F, B02 + Kc[i].s2);
		RND(F,G,H,A,B,C,D,E, B03 + Kc[i].s3);
		RND(E,F,G,H,A,B,C,D, B04 + Kc[i].s4);
		RND(D,E,F,G,H,A,B,C, B05 + Kc[i].s5);
		RND(C,D,E,F,G,H,A,B, B06 + Kc[i].s6);
		RND(B,C,D,E,F,G,H,A, B07 + Kc[i].s7);

		RND(A,B,C,D,E,F,G,H, B08 + Kc[i].s8);
		RND(H,A,B,C,D,E,F,G, B09 + Kc[i].s9);
		RND(G,H,A,B,C,D,E,F, B10 + Kc[i].sa);
		RND(F,G,H,A,B,C,D,E, B11 + Kc[i].sb);
		RND(E,F,G,H,A,B,C,D, B12 + Kc[i].sc);
		RND(D,E,F,G,H,A,B,C, B13 + Kc[i].sd);
		RND(C,D,E,F,G,H,A,B, B14 + Kc[i].se);
		RND(B,C,D,E,F,G,H,A, B15 + Kc[i].sf);

		if(i==3)
    		break;

		B00 += Wr1(B14) + B09 + Wr2(B01);
		B01 += Wr1(B15) + B10 + Wr2(B02);
		B02 += Wr1(B00) + B11 + Wr2(B03);
		B03 += Wr1(B01) + B12 + Wr2(B04);
		B04 += Wr1(B02) + B13 + Wr2(B05);
		B05 += Wr1(B03) + B14 + Wr2(B06);
		B06 += Wr1(B04) + B15 + Wr2(B07);
		B07 += Wr1(B05) + B00 + Wr2(B08);

		B08 += Wr1(B06) + B01 + Wr2(B09);
		B09 += Wr1(B07) + B02 + Wr2(B10);
		B10 += Wr1(B08) + B03 + Wr2(B11);
		B11 += Wr1(B09) + B04 + Wr2(B12);
		B12 += Wr1(B10) + B05 + Wr2(B13);
		B13 += Wr1(B11) + B06 + Wr2(B14);
		B14 += Wr1(B12) + B07 + Wr2(B15);
		B15 += Wr1(B13) + B08 + Wr2(B00);
	}

	*digest += (uint8)(A, B, C, D, E, F, G, H);
}

void scrypt_key(__global uint16 *pad0, uint pady, const __global input_t *input, uint gid, uint8 tstate, uint8 ostate){
	uint8 block0 = {input->c, input->d, input->e, gid, ZERO, SK00, ZERO, ZERO};
	uint8 tmpa = input->a;
	uint8 block1a = {0,0,0,0,0,0,0,SK04};
	uint8 block1b = {SK00,0,0,0,0,0,0,SK05};
	uint8 tmpb = input->b;
	uint16 key[2];

	SHA256(&tstate, &tmpa, &tmpb);

	for(uint i=0; i<4; i++){
		block0.s4++;
		tmpb = tstate;
		SHA256(&tmpb, &block0, &block1a);
		tmpa = ostate;
		SHA256(&tmpa, &tmpb, &block1b);
		if(i&ONE)
			key[(i>>1)].hi = tmpa;
		else
			key[(i>>1)].lo = tmpa;
	}
	pad0[pady] = key[0];
	pad0[pady+1] = key[1];
}

void final_SHA(uint8 *tstate, uint8 *ostate){
	uint8 tmpa = {0x00000001U, 0x80000000U, 0, 0, 0, 0, 0, 0};
	uint8 tmpb = {0, 0, 0, 0, 0, 0, 0, 0x00000620U};
	uint8 tmpc = {SK00, 0, 0, 0, 0, 0, 0, SK05};

	SHA256(tstate, &tmpa, &tmpb);
	SHA256(ostate, tstate, &tmpc);

}

void halfsalsa(uint16 *B){
	uint16 w = *B;
    for(uint i=0; i<4; ++i){
        w.s0123 ^= rotl(w.scdef + w.s89ab, 7U);
        w.s4567 ^= rotl(w.s0123 + w.scdef, 9U);
        w.s89ab ^= rotl(w.s4567 + w.s0123, 13U);
        w.scdef ^= rotl(w.s89ab + w.s4567, 18U);
        w.s89ab ^= rotl(w.sfcde + w.s2301, 7U);
        w.s4567 ^= rotl(w.sb89a + w.sefcd, 9U);
        w.s0123 ^= rotl(w.s7456 + w.sab89, 13U);
        w.scdef ^= rotl(w.s3012 + w.s6745, 18U);

    }
	*B += w;
}

# if (LOOKUP_GAP == 2)
//void salsa(uint16 *B, bool db){
void salsa(uint16 *B){
	B[0] ^= B[1];
	halfsalsa(B);
	B[1] ^= B[0];
	halfsalsa(B+1);

/*
	if(db){
		B[0] ^= B[1];
		halfsalsa(B);
		B[1] ^= B[0];
		halfsalsa(B+1);
	}
*/

}
# elif (LOOKUP_GAP == 3)
void salsa(uint16 *B, ushort times){
    for(uint i=0; i<times+1; i++){
		B[0] ^= B[1];
		halfsalsa(B);
		B[1] ^= B[0];
		halfsalsa(B+1);
    }
}
# elif (LOOKUP_GAP != 1)
void salsa(uint16 *B, ushort times){
	for(uint i=0; i<times+1; i++){
		B[0] ^= B[1];
		halfsalsa(B);
		B[1] ^= B[0];
		halfsalsa(B+1);
	}
}
# else
void salsa(uint16 *B){
	B[0] ^= B[1];
	halfsalsa(B);
	B[1] ^= B[0];
	halfsalsa(B+1);
}
# endif

void Shittify(__global uint16 *pad0, __global uint16 *pad1, bool opt){
	uint16 tmpa = *pad0;
	uint16 tmpb = *pad1;
	if(opt){
		tmpa = tmpa.s49e38d27c16b05af;
		tmpb = tmpb.s49e38d27c16b05af;
	}else{
		tmpa = tmpa.sc9630da741eb852f;
		tmpb = tmpb.sc9630da741eb852f;
	}
	*pad0 = EndianSwapa(tmpa);
	*pad1 = EndianSwapa(tmpb);

}

void scrypt_core(__global uint16 *lookup, uint x){
	uint idx = x<<1;
	uint16 X[2] = {lookup[idx], lookup[idx+1]};

#if (LOOKUP_GAP == 2)
	uint16 V[2];
	uint savet = 0;
	uchar cond = 0;
#endif

	uint j = ((__NF__/LOOKUP_GAP)+(__NF__%LOOKUP_GAP));
	uint y;

//write portion
	for(y=0; y<__NF__; y++){

		//if(!cond){
		salsa(X);
		cond++;
		//}
		//if(savet == j )
		//	break;

		if(cond == LOOKUP_GAP){
			//idx=((savet*CONCURRENT_THREADS+x)<<1);
			savet++;
			if(savet == j ){
				break;
			}
			//idx=((savet*CONCURRENT_THREADS+x)<<1);
			lookup[(idx=(savet*CONCURRENT_THREADS+x)<<1)] = X[0];
			lookup[idx+1] = X[1];
			//savet++;
			cond = 0;
		}
	}

/*
	for(y=CONCURRENT_THREADS; y<(CONCURRENT_THREADS*(__NF__/LOOKUP_GAP)); y+=CONCURRENT_THREADS){
#if (LOOKUP_GAP == 2)
		salsa(X, ONE);
#elif (LOOKUP_GAP == 1)
		salsa(X);
#else
		salsa(X, LOOKUP_GAP-ONE);
#endif
		lookup[CO_W0] = X[0];
		lookup[CO_W1] = X[1];
	}

#if (LOOKUP_GAP == 2)
	salsa(X, ONE);
#elif (LOOKUP_GAP == 1)
	salsa(X);
#else
	salsa(X, LOOKUP_GAP-ONE);
#endif



#if (LOOKUP_GAP != 1) && (LOOKUP_GAP != 2) && (LOOKUP_GAP != 4) && (LOOKUP_GAP != 8)
	{
		y = (__NF__/LOOKUP_GAP);
		lookup[CO_0] = X[0];
		lookup[CO_1] = X[1];
# if (LOOKUP_GAP == 3)
		salsa(X, ZERO);
# elif (LOOKUP_GAP == 5)
		salsa(X, THREE);
# elif (LOOKUP_GAP == 7)
		salsa(X, ONE);
# else
		for(uint i=0; i<__NF__%LOOKUP_GAP; ++i)
			salsa(X, ZERO);
# endif
	}

#endif
*/
cond = 0;
savet = 0;

// end write portion

// read portion
# if (LOOKUP_GAP == 2)
	for(uint i=0; i<__NF__<<1; i++){
# elif (LOOKUP_GAP == 4)
	for(uint i=0; i<__NF__; i++){
		j = X[1].sc & (__NF__-1);
		y = (j>>2);
# elif (LOOKUP_GAP == 8)
	for(uint i=0; i<__NF__; i++){
		j = X[1].sc & (__NF__-1);
		y = (j>>3);
# elif (LOOKUP_GAP != 1)
	for(uint i=0; i<__NF__; i++){
		j = X[1].sc & (__NF__-1);
		y = (j/LOOKUP_GAP);
# else
	for(uint i=0; i<__NF__; i++){
		y = X[1].sc & (__NF__-1);
# endif

#if (LOOKUP_GAP == 2)

		if(!cond){
			j = X[1].sc & (__NF__-1);
			y = (j>>1);
			cond = (j&1) ? 1 : 2;
			V[0] = lookup[CO_0];
			V[1] = lookup[CO_1];
		}

		if(cond==2){
			V[0] ^= X[0];
			V[1] ^= X[1];
			cond++;
		}

		if(cond != 4){
			//salsa(V, ZERO);
			salsa(V);
			cond++;
		}

		if(cond==4){
			X[0] = V[0];
			X[1] = V[1];
			savet++;
			if(savet == __NF__)
				break;
			cond=0;
		}

#elif (LOOKUP_GAP != 1)
		j -= y*LOOKUP_GAP;
		if(j){
			uint16 V[2] = {lookup[CO_0], lookup[CO_1]};
			salsa(V, j-ONE);
			X[0] ^= V[0];
			X[1] ^= V[1];
		}else{
			X[0] ^= lookup[CO_0];
			X[1] ^= lookup[CO_1];
		}
		salsa(X, ZERO);
#else
		X[0] ^= lookup[CO_0];
		X[1] ^= lookup[CO_1];

		salsa(X);
#endif

	}
// end read portion

	lookup[x<<1] = X[0];
	lookup[(x<<1)+1] = X[1];
}

__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(const __global input_t *input, volatile __global uint *output,
__global uint16 *padcache, volatile uint4 midstate0, volatile uint4 midstate16, const uint target){

#ifdef GOFFSET
	uint gid = get_global_id(0);
#else
	uint gid = input->f + get_global_id(0);
#endif

	uint8 ostate = {0x6a09e667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU, 0x510e527fU, 0x9b05688cU, 0x1F83D9ABU, 0x5BE0CD19U};
	uint8 tstate = {0x6a09e667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU, 0x510e527fU, 0x9b05688cU, 0x1F83D9ABU, 0x5BE0CD19U};
	uint8 tmpa = {input->c, input->d, input->e, gid, SK00, ZERO, ZERO, ZERO};
	uint8 tmpb = {midstate0.s0, midstate0.s1, midstate0.s2, midstate0.s3, midstate16.s0, midstate16.s1, midstate16.s2, midstate16.s3};
	uint8 tmpc = {ZERO, ZERO, ZERO, ZERO, ZERO, ZERO, ZERO, SK01};
	uint padx = gid%CONCURRENT_THREADS;
	uint pady = padx<<1;
	uint padz = pady+1;

	SHA256(&tmpb, &tmpa, &tmpc);
	tmpa = tmpb^SK02;
	tmpc = SK02;
	SHA256(&ostate, &tmpa, &tmpc);
	tmpa = tmpb^SK03;
	tmpc = SK03;
	SHA256(&tstate, &tmpa, &tmpc);

	scrypt_key(padcache, pady, input, gid, tstate, ostate);

	Shittify(&padcache[pady], &padcache[padz], 1);
	scrypt_core(padcache, padx);
	Shittify(&padcache[pady], &padcache[padz], 0);

	for(uint i=0; i<2; i++){
		tmpa = padcache[pady].lo;
		tmpb = padcache[pady++].hi;
		SHA256(&tstate, &tmpa, &tmpb);
	}

	final_SHA(&tstate, &ostate);

	if( (EndianSwapa((ostate.s7)) <= target) )
		SETFOUND(gid);
}