/******************************************************************************* * * MIT License * * Copyright (c) 2017 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * *******************************************************************************/ #ifndef GUARD_MIOPEN_SOFTMAX_DRIVER_HPP #define GUARD_MIOPEN_SOFTMAX_DRIVER_HPP #include "InputFlags.hpp" #include "driver.hpp" #include "mloSoftmaxHost.hpp" #include "tensor_driver.hpp" #include "timer.hpp" #include <../test/verify.hpp> #include #include #include #include #include #include #include #include #include "random.hpp" template class SoftmaxDriver : public Driver { public: SoftmaxDriver() : Driver() { miopenCreateTensorDescriptor(&inputTensor); miopenCreateTensorDescriptor(&outputTensor); miopenCreateTensorDescriptor(&dInputTensor); miopenCreateTensorDescriptor(&dOutputTensor); data_type = (sizeof(Tgpu) == 4) ? miopenFloat : miopenHalf; } int AddCmdLineArgs(); int ParseCmdLineArgs(int argc, char* argv[]); InputFlags& GetInputFlags() { return inflags; } int GetandSetData(); std::vector GetInputTensorLengthsFromCmdLine(); int AllocateBuffersAndCopy(); int RunForwardGPU(); int RunForwardCPU(); int RunBackwardGPU(); int RunBackwardCPU(); int VerifyBackward(); int VerifyForward(); ~SoftmaxDriver() { miopenDestroyTensorDescriptor(outputTensor); miopenDestroyTensorDescriptor(inputTensor); miopenDestroyTensorDescriptor(dOutputTensor); miopenDestroyTensorDescriptor(dInputTensor); } private: InputFlags inflags; miopenTensorDescriptor_t inputTensor; miopenTensorDescriptor_t outputTensor; std::unique_ptr in_dev; std::unique_ptr out_dev; std::vector in; std::vector out; std::vector outhost; miopenTensorDescriptor_t dInputTensor; miopenTensorDescriptor_t dOutputTensor; std::unique_ptr din_dev; std::unique_ptr dout_dev; std::vector din; std::vector dout; std::vector dinhost; float alpha; float beta; miopenSoftmaxAlgorithm_t algo; miopenSoftmaxMode_t mode; }; template int SoftmaxDriver::ParseCmdLineArgs(int argc, char* argv[]) { inflags.Parse(argc, argv); if(inflags.GetValueInt("time") == 1) { miopenEnableProfiling(GetHandle(), true); } return miopenStatusSuccess; } template int SoftmaxDriver::GetandSetData() { std::vector in_len = GetInputTensorLengthsFromCmdLine(); SetTensor4d(inputTensor, in_len, data_type); SetTensor4d(outputTensor, in_len, data_type); SetTensor4d(dInputTensor, in_len, data_type); SetTensor4d(dOutputTensor, in_len, data_type); alpha = static_cast(inflags.GetValueDouble("alpha")); beta = static_cast(inflags.GetValueDouble("beta")); algo = miopenSoftmaxAlgorithm_t(inflags.GetValueInt("algorithm")); mode = miopenSoftmaxMode_t(inflags.GetValueInt("mode")); return (0); } template int SoftmaxDriver::AddCmdLineArgs() { inflags.AddInputFlag("forw", 'F', "0", "Run only Forward Softmax (Default=0)", "int"); inflags.AddInputFlag("batchsize", 'n', "100", "Mini-batch size (Default=100)", "int"); inflags.AddInputFlag("in_channels", 'c', "3", "Number of Input Channels (Default=3)", "int"); inflags.AddInputFlag("in_h", 'H', "32", "Input Height (Default=32)", "int"); inflags.AddInputFlag("in_w", 'W', "32", "Input Width (Default=32)", "int"); inflags.AddInputFlag("alpha", 'A', "1.0", "Softmax shift (Default=1.0)", "float"); inflags.AddInputFlag("beta", 'B', "0.0", "Softmax scale (Default=0.0)", "float"); inflags.AddInputFlag("algorithm", 'a', "1", "softmax algorithms: fast (0), accurate (1), logsoftmax (2) (Default=1)", "int"); inflags.AddInputFlag( "mode", 'm', "1", "instance mode (0), channel mode (1) (Default=1)", "int"); inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); inflags.AddInputFlag("verify", 'V', "1", "Verify Each Layer (Default=1)", "int"); inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int"); inflags.AddInputFlag( "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int"); return miopenStatusSuccess; } template std::vector SoftmaxDriver::GetInputTensorLengthsFromCmdLine() { int in_n = inflags.GetValueInt("batchsize"); int in_c = inflags.GetValueInt("in_channels"); int in_h = inflags.GetValueInt("in_h"); int in_w = inflags.GetValueInt("in_w"); return std::vector({in_n, in_c, in_h, in_w}); } template int SoftmaxDriver::AllocateBuffersAndCopy() { size_t in_sz = GetTensorSize(inputTensor); size_t out_sz = GetTensorSize(outputTensor); #if MIOPEN_BACKEND_OPENCL cl_context ctx; clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); #elif MIOPEN_BACKEND_HIP uint32_t ctx = 0; #endif in_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); out_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); din_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); dout_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); in = std::vector(in_sz, static_cast(0)); out = std::vector(out_sz, static_cast(0)); outhost = std::vector(out_sz, static_cast(0)); din = std::vector(in_sz, static_cast(0)); dout = std::vector(out_sz, static_cast(0)); dinhost = std::vector(in_sz, static_cast(0)); for(int i = 0; i < in_sz; i++) { in[i] = RAN_GEN(static_cast(0.0), static_cast(1.0)); } Tgpu Data_scale = static_cast(0.001); for(int i = 0; i < out_sz; i++) { dout[i] = Data_scale * RAN_GEN(static_cast(-0.5), static_cast(0.5)); } #if MIOPEN_BACKEND_OPENCL cl_int status; #elif MIOPEN_BACKEND_HIP int status; #endif status = in_dev->ToGPU(q, in.data()); status |= out_dev->ToGPU(q, out.data()); status |= din_dev->ToGPU(q, din.data()); status |= dout_dev->ToGPU(q, dout.data()); if(status != CL_SUCCESS) printf("Error copying data to GPU\n"); return miopenStatusSuccess; } template int SoftmaxDriver::RunForwardGPU() { float kernel_total_time = 0.0; float kernel_first_time = 0.0; Timer t; START_TIME for(int i = 0; i < inflags.GetValueInt("iter"); i++) { miopenSoftmaxForward_V2(GetHandle(), &alpha, inputTensor, in_dev->GetMem(), &beta, outputTensor, out_dev->GetMem(), algo, mode); float time = 0.0; miopenGetKernelTime(GetHandle(), &time); kernel_total_time += time; if(i == 0) kernel_first_time = time; } if(inflags.GetValueInt("time") == 1) { STOP_TIME int iter = inflags.GetValueInt("iter"); if(WALL_CLOCK) printf("Wall-clock Time Forward Softmax Elapsed: %f ms\n", t.gettime_ms() / iter); float kernel_average_time = iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; printf("GPU Kernel Time Forward Softmax Elapsed: %f ms\n", kernel_average_time); } out_dev->FromGPU(GetStream(), out.data()); return miopenStatusSuccess; } template int SoftmaxDriver::RunForwardCPU() { return (0); } template int SoftmaxDriver::RunBackwardGPU() { float kernel_total_time = 0.0; float kernel_first_time = 0.0; Timer t; START_TIME for(int i = 0; i < inflags.GetValueInt("iter"); i++) { miopenSoftmaxBackward_V2(GetHandle(), &alpha, outputTensor, out_dev->GetMem(), dOutputTensor, dout_dev->GetMem(), &beta, dInputTensor, din_dev->GetMem(), algo, mode); float time = 0.0; miopenGetKernelTime(GetHandle(), &time); kernel_total_time += time; if(i == 0) kernel_first_time = time; } if(inflags.GetValueInt("time") == 1) { STOP_TIME int iter = inflags.GetValueInt("iter"); if(WALL_CLOCK) printf("Wall-clock Time Backward Softmax Elapsed: %f ms\n", t.gettime_ms() / iter); float kernel_average_time = iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; printf("GPU Kernel Time Backward Softmax Elapsed: %f ms\n", kernel_average_time); } din_dev->FromGPU(GetStream(), din.data()); return miopenStatusSuccess; } template int SoftmaxDriver::VerifyForward() { mloSoftmaxForwardRunHost( inputTensor, outputTensor, in.data(), outhost.data(), alpha, beta, algo, mode); auto error = miopen::rms_range(outhost, out); const Tref tolerance = data_type == miopenHalf ? 5e-2 : 1e-3; // 1e-6; if(error > tolerance) { std::cout << "Forward Softmax Failed: " << error << "\n"; } else { printf("Forward Softmax Verifies on CPU and GPU (err=%f)\n", error); } return miopenStatusSuccess; } template int SoftmaxDriver::RunBackwardCPU() { return miopenStatusSuccess; } template int SoftmaxDriver::VerifyBackward() { mloSoftmaxBackwardRunHost(inputTensor, outputTensor, out.data(), dout.data(), dinhost.data(), alpha, beta, algo, mode); auto error = miopen::rms_range(dinhost, din); const Tref tolerance = data_type == miopenHalf ? 5e-2 : 1e-3; // 1e-6; if(error > tolerance) { std::cout << "Backward Softmax Failed: " << error << "\n"; } else { printf("Backward Softmax Verifies on CPU and GPU (err=%f)\n", error); } return miopenStatusSuccess; } #endif // GUARD_MIOPEN_SOFTMAX_DRIVER_HPP