/******************************************************************************* * * MIT License * * Copyright (c) 2017 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * *******************************************************************************/ #ifndef GUARD_MIOPEN_ACTIV_DRIVER_HPP #define GUARD_MIOPEN_ACTIV_DRIVER_HPP #include "InputFlags.hpp" #include "driver.hpp" #include "mloNeuronHost.hpp" #include "tensor_driver.hpp" #include #include #include #include #include #include #include #include #include "random.hpp" #include "timer.hpp" #ifdef MIOPEN_BACKEND_HIP #ifndef CL_SUCCESS #define CL_SUCCESS 0 #endif #endif template class ActivationDriver : public Driver { public: ActivationDriver() : Driver() { miopenCreateTensorDescriptor(&inputTensor); miopenCreateTensorDescriptor(&outputTensor); miopenCreateTensorDescriptor(&dInputTensor); miopenCreateTensorDescriptor(&dOutputTensor); miopenCreateActivationDescriptor(&activDesc); data_type = (sizeof(Tgpu) == 4) ? miopenFloat : miopenHalf; } int AddCmdLineArgs(); int ParseCmdLineArgs(int argc, char* argv[]); InputFlags& GetInputFlags() { return inflags; } int GetandSetData(); std::vector GetInputTensorLengthsFromCmdLine(); int SetActivationDescriptorFromCmdLineArgs(); int AllocateBuffersAndCopy(); int RunForwardGPU(); int RunForwardCPU(); // Verify implements it int RunBackwardGPU(); int RunBackwardCPU(); // Verify implements it int VerifyBackward(); int VerifyForward(); ~ActivationDriver() { miopenDestroyTensorDescriptor(outputTensor); miopenDestroyTensorDescriptor(inputTensor); miopenDestroyActivationDescriptor(activDesc); } private: InputFlags inflags; miopenTensorDescriptor_t inputTensor; miopenTensorDescriptor_t outputTensor; std::unique_ptr in_dev; std::unique_ptr out_dev; std::unique_ptr scale_dev; std::vector in; std::vector out; std::vector outhost; miopenActivationDescriptor_t activDesc; miopenTensorDescriptor_t dInputTensor; miopenTensorDescriptor_t dOutputTensor; std::unique_ptr din_dev; std::unique_ptr dout_dev; std::vector din; std::vector dout; std::vector dinhost; }; template int ActivationDriver::ParseCmdLineArgs(int argc, char* argv[]) { inflags.Parse(argc, argv); if(inflags.GetValueInt("time") == 1) { miopenEnableProfiling(GetHandle(), true); } return miopenStatusSuccess; } template int ActivationDriver::GetandSetData() { std::vector in_len = GetInputTensorLengthsFromCmdLine(); SetTensor4d(inputTensor, in_len, data_type); SetActivationDescriptorFromCmdLineArgs(); SetTensor4d(outputTensor, in_len, data_type); SetTensor4d(dInputTensor, in_len, data_type); SetTensor4d(dOutputTensor, in_len, data_type); return (0); } template int ActivationDriver::AddCmdLineArgs() { inflags.AddInputFlag("forw", 'F', "0", "Run only Forward LRN Normalization (Default=0)", "int"); inflags.AddInputFlag("batchsize", 'n', "100", "Mini-batch size (Default=100)", "int"); inflags.AddInputFlag("in_channels", 'c', "3", "Number of Input Channels (Default=3)", "int"); inflags.AddInputFlag("in_h", 'H', "32", "Input Height (Default=32)", "int"); inflags.AddInputFlag("in_w", 'W', "32", "Input Width (Default=32)", "int"); inflags.AddInputFlag( "mode", 'm', "3", "Activation Mode (relu,..., see spec) (Default=3(relu))", "int"); inflags.AddInputFlag("alpha", 'A', "1", "Activation alpha (Default=1)", "double"); inflags.AddInputFlag("beta", 'B', "1", "Activation beta (Default=1)", "double"); inflags.AddInputFlag("gamma", 'G', "1", "Activation gamma (Default=1)", "double"); inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); inflags.AddInputFlag("verify", 'V', "1", "Verify Each Layer (Default=1)", "int"); inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int"); inflags.AddInputFlag( "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int"); return miopenStatusSuccess; } template std::vector ActivationDriver::GetInputTensorLengthsFromCmdLine() { int in_n = inflags.GetValueInt("batchsize"); int in_c = inflags.GetValueInt("in_channels"); int in_h = inflags.GetValueInt("in_h"); int in_w = inflags.GetValueInt("in_w"); return std::vector({in_n, in_c, in_h, in_w}); } template int ActivationDriver::SetActivationDescriptorFromCmdLineArgs() { miopenActivationMode_t mode; double Alpha = inflags.GetValueDouble("alpha"); double Beta = inflags.GetValueDouble("beta"); double Gamma = inflags.GetValueDouble("gamma"); mode = static_cast(inflags.GetValueInt("mode")); return (miopenSetActivationDescriptor(activDesc, mode, Alpha, Beta, Gamma)); } template int ActivationDriver::AllocateBuffersAndCopy() { size_t in_sz = GetTensorSize(inputTensor); size_t out_sz = GetTensorSize(outputTensor); #if MIOPEN_BACKEND_OPENCL cl_context ctx; clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); #elif MIOPEN_BACKEND_HIP uint32_t ctx = 0; #endif in_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); out_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); din_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); dout_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); in = std::vector(in_sz, static_cast(0)); out = std::vector(out_sz, static_cast(0)); outhost = std::vector(out_sz, static_cast(0)); din = std::vector(in_sz, static_cast(0)); dout = std::vector(out_sz, static_cast(0)); dinhost = std::vector(in_sz, static_cast(0)); miopenActivationMode_t activation_mode; double alpha, beta, gamma; miopenGetActivationDescriptor(activDesc, &activation_mode, &alpha, &beta, &gamma); for(int i = 0; i < in_sz; i++) { switch(activation_mode) { case MIOPEN_NEURON_PASTHRU: in[i] = RAN_GEN(static_cast(-2.0), static_cast(2.0)); break; case MIOPEN_NEURON_LOGISTIC: in[i] = RAN_GEN(static_cast(-2.0), static_cast(2.0)); break; case MIOPEN_NEURON_TANH: in[i] = RAN_GEN(static_cast(-2.0), static_cast(2.0)); break; case MIOPEN_NEURON_RELU: in[i] = RAN_GEN(static_cast(-2.0), static_cast(2.0)); break; case MIOPEN_NEURON_SOFTRELU: in[i] = RAN_GEN(static_cast(-2.0), static_cast(2.0)); break; case MIOPEN_NEURON_ABS: in[i] = RAN_GEN(static_cast(-2.0), static_cast(2.0)); break; case MIOPEN_NEURON_POWER: { double v = -alpha / beta; in[i] = i % 2 ? RAN_GEN(static_cast((v + 0.005) / beta), static_cast((v + 2.0) / beta)) : RAN_GEN(static_cast((v - 2.0) / beta), static_cast((v - 0.005) / beta)); break; } case MIOPEN_NEURON_CLIPPED_RELU: if(i % 3 == 0) in[i] = RAN_GEN(static_cast(-1.0 * alpha), static_cast(-0.005 * alpha)); else if(i % 3 == 1) in[i] = RAN_GEN(static_cast(0.005 * alpha), static_cast(0.995 * alpha)); else in[i] = RAN_GEN(static_cast(1.005 * alpha), static_cast(2.0 * alpha)); break; case MIOPEN_NEURON_LEAKY_RELU: in[i] = i % 2 ? RAN_GEN(static_cast(-1.0), static_cast(-0.005)) : RAN_GEN(static_cast(-0.005), static_cast(1.0)); break; case MIOPEN_NEURON_ELU: in[i] = i % 2 ? RAN_GEN(static_cast(0.005), static_cast(2.0)) : RAN_GEN(static_cast(-2.0), static_cast(-0.005)); break; } } for(int i = 0; i < out_sz; i++) { dout[i] = RAN_GEN(static_cast(-0.5), static_cast(0.5)); } #if MIOPEN_BACKEND_OPENCL cl_int status; #elif MIOPEN_BACKEND_HIP int status; #endif status = in_dev->ToGPU(q, in.data()); status |= out_dev->ToGPU(q, out.data()); status = din_dev->ToGPU(q, din.data()); status |= dout_dev->ToGPU(q, dout.data()); if(status != CL_SUCCESS) printf("Error copying data to GPU\n"); return miopenStatusSuccess; } template int ActivationDriver::RunForwardGPU() { float alpha = 1, beta = 0; double fulltime = 0.; float avgtime = 0.0f; float lowtime = 100000000.0f; int iters = inflags.GetValueInt("iter"); Timer t; for(int i = 0; i < iters; i++) { START_TIME miopenActivationForward(GetHandle(), activDesc, &alpha, inputTensor, in_dev->GetMem(), &beta, outputTensor, out_dev->GetMem()); miopen::deref(GetHandle()).Finish(); STOP_TIME if(WALL_CLOCK) { if(iters > 1 && i > 0) fulltime += t.gettime_ms(); else if(iters == 1) fulltime = t.gettime_ms(); // else do nothing, drop the first iteration } if(inflags.GetValueInt("time") == 1) { float time = 0.0; miopenGetKernelTime(GetHandle(), &time); lowtime = (time < lowtime) ? time : lowtime; if(iters > 1 && i > 0) avgtime += time; } } if(WALL_CLOCK) { printf("Wall-clock Time Forward GPU Activation Elapsed: %f ms, for %d iterations.\n", (iters == 1) ? t.gettime_ms() : (fulltime / float(iters - 1)), (iters > 1) ? iters - 1 : 1); } if(inflags.GetValueInt("time") == 1) { printf("GPU Kernel Min Time Forward Activation Elapsed: %f ms\n", lowtime); if(iters > 1) printf("GPU Kernel Avg Time Forward Activation Elapsed: %f ms, for %d iterations.\n", avgtime / (iters - 1), iters - 1); int in_n, in_c, in_h, in_w; std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(miopen::deref(inputTensor).GetLengths()); size_t dataSz = in_n * in_c * in_h * in_w * miopen::GetTypeSize(miopen::deref(inputTensor).GetType()); // layer, readbytes, writebytes, BG/s, timeMS printf("stats: name, bytesRead, bytesWritten, GB/s, timeMs\n"); printf("stats: fwd-activ, %zu, %zu, %f, %f\n", dataSz, dataSz, 2 * dataSz / lowtime / 1e6, avgtime / (iters - 1)); } out_dev->FromGPU(GetStream(), out.data()); return miopenStatusSuccess; } template int ActivationDriver::RunForwardCPU() { return (0); } template int ActivationDriver::RunBackwardGPU() { float alpha = 1, beta = 0; double fulltime = 0.; float avgtime = 0.0f; float lowtime = 100000000.0f; int iters = inflags.GetValueInt("iter"); Timer t; for(int i = 0; i < iters; i++) { START_TIME miopenActivationBackward(GetHandle(), activDesc, &alpha, outputTensor, out_dev->GetMem(), dOutputTensor, dout_dev->GetMem(), inputTensor, in_dev->GetMem(), &beta, dInputTensor, din_dev->GetMem()); miopen::deref(GetHandle()).Finish(); STOP_TIME if(WALL_CLOCK) { if(iters > 1 && i > 0) fulltime += t.gettime_ms(); else if(iters == 1) fulltime = t.gettime_ms(); // else do nothing, drop the first iteration } if(inflags.GetValueInt("time") == 1) { float time = 0.0; miopenGetKernelTime(GetHandle(), &time); lowtime = (time < lowtime) ? time : lowtime; if(iters > 1 && i > 0) avgtime += time; } } if(WALL_CLOCK) { printf("Wall-clock Time Backward GPU Activation Elapsed: %f ms, for %d iterations.\n", (iters == 1) ? t.gettime_ms() : (fulltime / float(iters - 1)), (iters > 1) ? iters - 1 : 1); } if(inflags.GetValueInt("time") == 1) { printf("GPU Kernel Min Time Backward Activation Elapsed: %f ms\n", lowtime); if(iters > 1) printf("GPU Kernel Avg Time Backward Activation Elapsed: %f ms, for %d iterations.\n", avgtime / (iters - 1), iters - 1); int in_n, in_c, in_h, in_w; std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(miopen::deref(inputTensor).GetLengths()); size_t dataSz = in_n * in_c * in_h * in_w * miopen::GetTypeSize(miopen::deref(inputTensor).GetType()); // layer, readbytes, writebytes, BG/s, timeMS printf("stats: name, bytesRead, bytesWritten, GB/s, timeMs\n"); printf("stats: bwd-activ, %zu, %zu, %f, %f\n", dataSz, dataSz, 2 * dataSz / lowtime / 1e6, avgtime / (iters - 1)); } din_dev->FromGPU(GetStream(), din.data()); return miopenStatusSuccess; } template int ActivationDriver::VerifyForward() { double allowedEps = std::numeric_limits::epsilon() * 80; miopenActivationMode_t v_mode; double v_Alpha; double v_Beta; double v_Gamma; miopenGetActivationDescriptor(activDesc, &v_mode, &v_Alpha, &v_Beta, &v_Gamma); int match = 1; match = mloNeuronForwardRunHostAndVerify(v_mode, static_cast(v_Gamma), static_cast(v_Beta), static_cast(v_Alpha), in.size(), in.data(), out.data(), static_cast(allowedEps)); if(match) printf("Forward Activation Verifies on CPU and GPU\n"); return miopenStatusSuccess; } template int ActivationDriver::RunBackwardCPU() { return miopenStatusSuccess; } template int ActivationDriver::VerifyBackward() { double allowedEps = std::numeric_limits::epsilon() * 80; miopenActivationMode_t v_mode; double v_Alpha; double v_Beta; double v_Gamma; miopenGetActivationDescriptor(activDesc, &v_mode, &v_Alpha, &v_Beta, &v_Gamma); int match = 1; match = mloNeuronBackwardRunHostAndVerify(v_mode, static_cast(v_Gamma), static_cast(v_Beta), static_cast(v_Alpha), dinhost.size(), in.data(), out.data(), din.data(), dout.data(), static_cast(allowedEps)); if(match) printf("Backward Activation Verifies on CPU and GPU\n"); return miopenStatusSuccess; } #endif // GUARD_MIOPEN_ACTIV_DRIVER_HPP