/******************************************************************************* * * MIT License * * Copyright (c) 2017 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * *******************************************************************************/ #ifndef GUARD_MIOPEN_CONV_DRIVER_HPP #define GUARD_MIOPEN_CONV_DRIVER_HPP /*#ifdef MIOPEN_NEURON_SOFTRELU /// \todo This needs to be explained or rewritten in clear manner. #undef MIOPEN_NEURON_SOFTRELU #endif #ifdef MIOPEN_NEURON_POWER /// \todo This needs to be explained or rewritten in clear manner. #undef MIOPEN_NEURON_POWER #endif */ #include "InputFlags.hpp" #include "conv_verify.hpp" #include "driver.hpp" #include "mloConvHost.hpp" #include "tensor_driver.hpp" #include "timer.hpp" #include "util_driver.hpp" #include #include #include #include #include #include #include #include #include #include #include #include #include #include "random.hpp" #include #include #include #include #include #include <../test/verify.hpp> #include <../test/serialize.hpp> #include <../test/tensor_holder.hpp> #include <../test/cpu_conv.hpp> #include <../test/cpu_bias.hpp> #include MIOPEN_DECLARE_ENV_VAR(MIOPEN_DRIVER_PAD_BUFFERS_2M) #if MIOPEN_BACKEND_OPENCL #define STATUS_SUCCESS CL_SUCCESS typedef cl_int status_t; #else // MIOPEN_BACKEND_HIP #define STATUS_SUCCESS 0 typedef int status_t; #endif // Use values which are distinctively greater then miopenStatus_t, // so that these can be ORed with any miopen status code // without loss of information. typedef enum { // These four codes could be returned together, ORed: EC_VerifyFwd = 0x100, EC_VerifyBwd = 0x200, EC_VerifyWrw = 0x400, EC_VerifyBwdBias = 0x800, } errorCode_t; template void dumpBufferToFile(const char* fileName, T* data, size_t dataNumItems) { std::ofstream outFile(fileName, std::ios::binary); if(outFile) { outFile.write(reinterpret_cast(data), dataNumItems * sizeof(T)); outFile.close(); printf("Wrote output to file %s\n", fileName); } else { printf("Could not open file %s for writing\n", fileName); } } template bool readBufferFromFile(T* data, size_t dataNumItems, const char* fileName) { std::ifstream infile(fileName, std::ios::binary); if(infile) { infile.read(reinterpret_cast(data), dataNumItems * sizeof(T)); infile.close(); printf("Read data from input file %s\n", fileName); return true; } else { printf("Could not open file %s for reading\n", fileName); return false; } } // Tgpu and Tref are the data-type in GPU memory and CPU memory respectively. // They are not necessarily the same as the computation type on GPU or CPU template class ConvDriver : public Driver { public: ConvDriver() : Driver() { miopenCreateTensorDescriptor(&inputTensor); miopenCreateTensorDescriptor(&weightTensor); miopenCreateTensorDescriptor(&outputTensor); miopenCreateTensorDescriptor(&biasTensor); miopenCreateTensorDescriptor(&inputTensor_vect4); miopenCreateTensorDescriptor(&weightTensor_vect4); miopenCreateConvolutionDescriptor(&convDesc); workspace_dev = nullptr; // the variable name is implementation dependent, checking size instead InitDataType(); } int AddCmdLineArgs(); int ParseCmdLineArgs(int argc, char* argv[]); InputFlags& GetInputFlags() { return inflags; } int GetandSetData(); std::vector GetInputTensorLengthsFromCmdLine(); std::vector GetWeightTensorLengthsFromCmdLine(); std::vector GetBiasTensorLengthsFromCmdLine(); int SetConvDescriptorFromCmdLineArgs(); std::vector GetOutputTensorLengths(); int AllocateBuffersAndCopy(); int FindForward(int& ret_algo_count, int request_algo_count, std::vector& perf_results); int RunForwardGPU(); int RunForwardCPU(); int FindBackwardData(int& ret_algo_count, int request_algo_count, std::vector& perf_results); int FindBackwardWeights(int& ret_algo_count, int request_algo_count, std::vector& perf_results); int RunBackwardGPU(); int RunBackwardDataCPU(); int RunBackwardWeightsCPU(); int RunBackwardBiasCPU(); int VerifyBackward(); int VerifyForward(); ~ConvDriver() { miopenDestroyTensorDescriptor(biasTensor); miopenDestroyTensorDescriptor(outputTensor); miopenDestroyTensorDescriptor(weightTensor); miopenDestroyTensorDescriptor(inputTensor); miopenDestroyTensorDescriptor(inputTensor_vect4); miopenDestroyTensorDescriptor(weightTensor_vect4); miopenDestroyConvolutionDescriptor(convDesc); } private: InputFlags inflags; boost::optional immediate_solution; miopenTensorDescriptor_t inputTensor; miopenTensorDescriptor_t weightTensor; miopenTensorDescriptor_t outputTensor; miopenTensorDescriptor_t biasTensor; miopenTensorDescriptor_t inputTensor_vect4; miopenTensorDescriptor_t weightTensor_vect4; std::unique_ptr in_dev; std::unique_ptr in_vect4_dev; std::unique_ptr din_dev; std::unique_ptr wei_dev; std::unique_ptr wei_vect4_dev; std::unique_ptr dwei_dev; std::unique_ptr out_dev; std::unique_ptr dout_dev; std::unique_ptr b_dev; std::unique_ptr db_dev; std::unique_ptr workspace_dev; std::size_t ws_sizeof_find_fwd; std::size_t ws_sizeof_find_bwd; std::size_t ws_sizeof_find_wrw; tensor in; tensor wei; tensor out; tensor dout; tensor b; tensor outhost; tensor dwei_host; tensor din_host; tensor db_host; std::vector din; std::vector dwei; std::vector out_int8; std::vector db; std::vector b_int8; miopenConvolutionDescriptor_t convDesc; bool is_wrw = true, is_bwd = true, is_fwd = true; bool is_wrw_winograd = false; bool time_enabled = false; bool wall_enabled = false; int num_iterations = 1; Timer wall; Timer2 fwd_auxiliary; Timer2 bwd_auxiliary; Timer2 wrw_auxiliary; void PrintForwardTime(float kernel_total_time, float kernel_first_time) const; int RunForwardGpuImmed(bool is_transform); int RunForwardGpuFind(bool is_transform); void PrintBackwardDataTime(float kernel_total_time, float kernel_first_time); int RunBackwardDataGpuImmed(); int RunBackwardDataGpuFind(); void PrintBackwardWrwTime(float kernel_total_time, float kernel_first_time); int RunBackwardWrwGpuImmed(); int RunBackwardWrwGpuFind(); std::string GetVerificationCacheFileName() const; std::string GetVCacheFwdOutBasename() const; std::string GetVCacheBwdDataBasename() const; std::string GetVCacheBwdWeightBasename() const; std::string GetVCacheBiasBwdDataBasename() const; bool IsInputTensorTransform() const; bool TryReadVerificationCache(const std::string& file_name, miopenTensorDescriptor_t& tensorDesc, Tref* data) const; void TrySaveVerificationCache(const std::string& file_name, std::vector& data) const; }; // Check if int8 type tensor x and w need to be transformed to a pack of 4 elements along channel // (NCHW_VECT_C format) template bool ConvDriver::IsInputTensorTransform() const { return (data_type == miopenInt8 && inflags.GetValueInt("in_channels") % 4 != 0) || data_type == miopenInt8x4; } template int ConvDriver::ParseCmdLineArgs(int argc, char* argv[]) { inflags.Parse(argc, argv); num_iterations = inflags.GetValueInt("iter"); if(num_iterations < 1) { std::cout << "Fatal: Number of iterations must be > 0: " << num_iterations << std::endl; return 1; } time_enabled = (inflags.GetValueInt("time") != 0); wall_enabled = time_enabled && (inflags.GetValueInt("wall") != 0); if(time_enabled) { miopenEnableProfiling(GetHandle(), true); } is_fwd = (inflags.GetValueInt("forw") == 0 || inflags.GetValueInt("forw") & 1); is_bwd = (inflags.GetValueInt("forw") == 0 || inflags.GetValueInt("forw") & 2); is_wrw = (inflags.GetValueInt("forw") == 0 || inflags.GetValueInt("forw") & 4); const auto solution_value = inflags.GetValueInt("solution"); if(solution_value >= 0) immediate_solution = solution_value; return 0; } template int ConvDriver::GetandSetData() { std::vector in_len = GetInputTensorLengthsFromCmdLine(); std::vector wei_len = GetWeightTensorLengthsFromCmdLine(); SetTensorNd(inputTensor, in_len, data_type); SetTensorNd(weightTensor, wei_len, data_type); if(inflags.GetValueInt("tensor_vect") == 1 && data_type == miopenInt8) { data_type = miopenInt8x4; } if(IsInputTensorTransform()) { std::vector in_len_vect4(in_len.begin(), in_len.end()), wei_len_vect4(wei_len.begin(), wei_len.end()); in_len_vect4[1] = ((in_len[1] + 3) / 4) * 4; SetTensorNd(inputTensor_vect4, in_len_vect4, data_type); wei_len_vect4[1] = ((wei_len[1] + 3) / 4) * 4; SetTensorNd(weightTensor_vect4, wei_len_vect4, data_type); } SetConvDescriptorFromCmdLineArgs(); std::vector out_len = GetOutputTensorLengths(); miopenDataType_t y_type = (data_type == miopenInt8 || data_type == miopenInt8x4) ? miopenFloat : data_type; SetTensorNd(outputTensor, out_len, y_type); if(inflags.GetValueInt("bias") != 0) { std::vector bias_len = GetBiasTensorLengthsFromCmdLine(); SetTensorNd(biasTensor, bias_len, data_type); } return (0); } template int ConvDriver::AddCmdLineArgs() { inflags.AddInputFlag( "spatial_dim", '_', "2", "convolution spatial dimension (Default-2)", "int"); inflags.AddInputFlag("forw", 'F', "0", "Flag enables fwd, bwd, wrw convolutions" "\n0 fwd+bwd+wrw (default)" "\n1 fwd only" "\n2 bwd only" "\n4 wrw only" "\n3 fwd+bwd" "\n5 fwd+wrw" "\n6 bwd+wrw", "int"); inflags.AddInputFlag("batchsize", 'n', "100", "Mini-batch size (Default=100)", "int"); inflags.AddInputFlag("in_channels", 'c', "3", "Number of Input Channels (Default=3)", "int"); inflags.AddInputFlag("in_d", '!', "32", "Input Depth (Default=32)", "int"); inflags.AddInputFlag("in_h", 'H', "32", "Input Height (Default=32)", "int"); inflags.AddInputFlag("in_w", 'W', "32", "Input Width (Default=32)", "int"); inflags.AddInputFlag( "out_channels", 'k', "32", "Number of Output Channels (Default=32)", "int"); inflags.AddInputFlag("fil_d", '@', "3", "Filter Depth (Default=3)", "int"); inflags.AddInputFlag("fil_h", 'y', "3", "Filter Height (Default=3)", "int"); inflags.AddInputFlag("fil_w", 'x', "3", "Filter Width (Default=3)", "int"); inflags.AddInputFlag( "conv_stride_d", '#', "1", "Convolution Stride for Depth (Default=1)", "int"); inflags.AddInputFlag( "conv_stride_h", 'u', "1", "Convolution Stride for Height (Default=1)", "int"); inflags.AddInputFlag( "conv_stride_w", 'v', "1", "Convolution Stride for Width (Default=1)", "int"); inflags.AddInputFlag("pad_d", '$', "0", "Zero Padding for Depth (Default=0)", "int"); inflags.AddInputFlag("pad_h", 'p', "0", "Zero Padding for Height (Default=0)", "int"); inflags.AddInputFlag("pad_w", 'q', "0", "Zero Padding for Width (Default=0)", "int"); inflags.AddInputFlag("pad_val", 'r', "0", "Padding Value (Default=0)", "int"); inflags.AddInputFlag( "trans_output_pad_d", '%', "0", "Zero Padding Output for Depth (Default=0)", "int"); inflags.AddInputFlag( "trans_output_pad_h", 'Y', "0", "Zero Padding Output for Height (Default=0)", "int"); inflags.AddInputFlag( "trans_output_pad_w", 'X', "0", "Zero Padding Output for Width (Default=0)", "int"); inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); inflags.AddInputFlag("verify", 'V', "1", "Verify Each Layer (Default=1)", "int"); inflags.AddInputFlag("verification_cache", 'C', "", "Use specified directory to cache verification data. Off by default.", "string"); inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int"); inflags.AddInputFlag( "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int"); inflags.AddInputFlag("search", 's', "0", "Search Kernel Config (Default=0)", "int"); inflags.AddInputFlag("printconv", 'P', "1", "Print Convolution Dimensions (Default=1)", "int"); inflags.AddInputFlag("dump_output", 'o', "0", "Dumps the output buffers (Default=0)", "int"); inflags.AddInputFlag("in_data", 'd', "", "Input data filename (Default=)", "string"); inflags.AddInputFlag("weights", 'e', "", "Input weights filename (Default=)", "string"); inflags.AddInputFlag("bias", 'b', "", "Use Bias (Default=0)", "int"); inflags.AddInputFlag( "mode", 'm', "conv", "Convolution Mode (conv, trans) (Default=conv)", "str"); inflags.AddInputFlag( "pad_mode", 'z', "default", "Padding Mode (same, valid, default) (Default=default)", "str"); inflags.AddInputFlag("tensor_vect", 'Z', "0", "tensor vectorization type (none, vect_c, vect_n) (Default=0)", "int"); inflags.AddInputFlag("dilation_d", '^', "1", "Dilation of Filter Depth (Default=1)", "int"); inflags.AddInputFlag("dilation_h", 'l', "1", "Dilation of Filter Height (Default=1)", "int"); inflags.AddInputFlag("dilation_w", 'j', "1", "Dilation of Filter Width (Default=1)", "int"); inflags.AddInputFlag("in_bias", 'a', "", "Input bias filename (Default=)", "string"); inflags.AddInputFlag("group_count", 'g', "1", "Number of Groups (Default=1)", "int"); inflags.AddInputFlag("dout_data", 'D', "", "dy data filename for backward weight computation (Default=)", "string"); inflags.AddInputFlag("solution", 'S', "-1", "Use immediate mode, run solution with specified id." "\nAccepts integer argument N:" "\n=0 Immediate mode, build and run fastest solution" "\n>0 Immediate mode, build and run solution_id = N" "\n<0 Use Find() API (Default=-1)", "int"); return 0; } template std::vector ConvDriver::GetInputTensorLengthsFromCmdLine() { std::vector in_lens; int spatial_dim = inflags.GetValueInt("spatial_dim"); in_lens.resize(2 + spatial_dim); in_lens[0] = inflags.GetValueInt("batchsize"); in_lens[1] = inflags.GetValueInt("in_channels"); auto in_spatial_lens = boost::adaptors::slice(in_lens, 2, 2 + spatial_dim); if(spatial_dim == 2) { in_spatial_lens[0] = inflags.GetValueInt("in_h"); in_spatial_lens[1] = inflags.GetValueInt("in_w"); } else if(spatial_dim == 3) { in_spatial_lens[0] = inflags.GetValueInt("in_d"); in_spatial_lens[1] = inflags.GetValueInt("in_h"); in_spatial_lens[2] = inflags.GetValueInt("in_w"); } else { MIOPEN_THROW("unsupported convolution dimension"); } return in_lens; } template std::vector ConvDriver::GetWeightTensorLengthsFromCmdLine() { std::vector wei_lens; int spatial_dim = inflags.GetValueInt("spatial_dim"); wei_lens.resize(2 + spatial_dim); auto wei_spatial_lens = boost::adaptors::slice(wei_lens, 2, 2 + spatial_dim); int group_count = std::max(inflags.GetValueInt("group_count"), 1); int wei_k_len = inflags.GetValueInt("out_channels"); int wei_c_len = inflags.GetValueInt("in_channels"); if(spatial_dim == 2) { wei_spatial_lens[0] = inflags.GetValueInt("fil_h"); wei_spatial_lens[1] = inflags.GetValueInt("fil_w"); } else if(spatial_dim == 3) { wei_spatial_lens[0] = inflags.GetValueInt("fil_d"); wei_spatial_lens[1] = inflags.GetValueInt("fil_h"); wei_spatial_lens[2] = inflags.GetValueInt("fil_w"); } else { MIOPEN_THROW("unsupported convolution dimension"); } if(group_count > 1) { if(wei_c_len % group_count != 0 || wei_k_len % group_count != 0 || group_count > wei_c_len || group_count > wei_k_len) { MIOPEN_THROW("Invalid group number\n"); } } miopenConvolutionMode_t mode; if((inflags.GetValueStr("mode")) == "conv") { mode = miopenConvolution; } else if((inflags.GetValueStr("mode")) == "trans") { mode = miopenTranspose; } else { MIOPEN_THROW("Incorrect Convolution Mode\n"); } if(mode == miopenTranspose) { wei_lens[0] = wei_c_len; wei_lens[1] = wei_k_len / group_count; } else { wei_lens[0] = wei_k_len; wei_lens[1] = wei_c_len / group_count; } return wei_lens; } template std::vector ConvDriver::GetBiasTensorLengthsFromCmdLine() { int spatial_dim = inflags.GetValueInt("spatial_dim"); std::vector bias_lens(2 + spatial_dim, 1); bias_lens[1] = inflags.GetValueInt("out_channels"); return bias_lens; } template int ConvDriver::SetConvDescriptorFromCmdLineArgs() { int spatial_dim = inflags.GetValueInt("spatial_dim"); std::vector in_spatial_lens(spatial_dim); std::vector wei_spatial_lens(spatial_dim); std::vector pads(spatial_dim); std::vector conv_strides(spatial_dim); std::vector conv_dilations(spatial_dim); std::vector trans_output_pads(spatial_dim); if(spatial_dim == 2) { in_spatial_lens[0] = inflags.GetValueInt("in_h"); in_spatial_lens[1] = inflags.GetValueInt("in_w"); wei_spatial_lens[0] = inflags.GetValueInt("fil_h"); wei_spatial_lens[1] = inflags.GetValueInt("fil_w"); pads[0] = inflags.GetValueInt("pad_h"); pads[1] = inflags.GetValueInt("pad_w"); conv_strides[0] = inflags.GetValueInt("conv_stride_h"); conv_strides[1] = inflags.GetValueInt("conv_stride_w"); conv_dilations[0] = inflags.GetValueInt("dilation_h"); conv_dilations[1] = inflags.GetValueInt("dilation_w"); trans_output_pads[0] = inflags.GetValueInt("trans_output_pad_h"); trans_output_pads[1] = inflags.GetValueInt("trans_output_pad_w"); } else if(spatial_dim == 3) { in_spatial_lens[0] = inflags.GetValueInt("in_d"); in_spatial_lens[1] = inflags.GetValueInt("in_h"); in_spatial_lens[2] = inflags.GetValueInt("in_w"); wei_spatial_lens[0] = inflags.GetValueInt("fil_d"); wei_spatial_lens[1] = inflags.GetValueInt("fil_h"); wei_spatial_lens[2] = inflags.GetValueInt("fil_w"); pads[0] = inflags.GetValueInt("pad_d"); pads[1] = inflags.GetValueInt("pad_h"); pads[2] = inflags.GetValueInt("pad_w"); conv_strides[0] = inflags.GetValueInt("conv_stride_d"); conv_strides[1] = inflags.GetValueInt("conv_stride_h"); conv_strides[2] = inflags.GetValueInt("conv_stride_w"); conv_dilations[0] = inflags.GetValueInt("dilation_d"); conv_dilations[1] = inflags.GetValueInt("dilation_h"); conv_dilations[2] = inflags.GetValueInt("dilation_w"); trans_output_pads[0] = inflags.GetValueInt("trans_output_pad_d"); trans_output_pads[1] = inflags.GetValueInt("trans_output_pad_h"); trans_output_pads[2] = inflags.GetValueInt("trans_output_pad_w"); } else { MIOPEN_THROW("unsupported convolution dimension"); } int out_c = inflags.GetValueInt("out_channels"); int in_c = inflags.GetValueInt("in_channels"); int group_count = std::max(inflags.GetValueInt("group_count"), 1); if(group_count > 1) { if(in_c % group_count != 0 || out_c % group_count != 0 || group_count > in_c || group_count > out_c) { printf("Invalid group number\n"); exit(0); } } miopenConvolutionMode_t mode; if((inflags.GetValueStr("mode")) == "conv") { mode = miopenConvolution; } else if((inflags.GetValueStr("mode")) == "trans") { mode = miopenTranspose; } else { printf("Incorrect Convolution Mode\n"); exit(0); } // adjust padding based on user-defined padding mode if(mode == miopenConvolution && (miopen::all_of(conv_dilations, [](auto v) { return v == 1; }) || miopen::all_of(wei_spatial_lens, [](auto v) { return v == 1; }))) { if((inflags.GetValueStr("pad_mode")) == "same") { for(int i = 0; i < spatial_dim; ++i) { pads[i] = (in_spatial_lens[i] % conv_strides[i] == 0) ? (std::max((wei_spatial_lens[i] - conv_strides[i]), 0)) : (std::max((wei_spatial_lens[i] - (in_spatial_lens[i] % conv_strides[i])), 0)); pads[i] /= 2; } } else if((inflags.GetValueStr("pad_mode")) == "valid") { for(int i = 0; i < spatial_dim; ++i) { pads[i] = 0; } } } miopenInitConvolutionNdDescriptor( convDesc, spatial_dim, pads.data(), conv_strides.data(), conv_dilations.data(), mode); miopenSetConvolutionGroupCount(convDesc, group_count); if(mode == miopenTranspose) { miopenSetTransposeConvNdOutputPadding(convDesc, spatial_dim, trans_output_pads.data()); } return miopenStatusSuccess; } template std::vector ConvDriver::GetOutputTensorLengths() { int ndim = miopen::deref(inputTensor).GetSize(); std::vector out_lens(ndim); miopenGetConvolutionNdForwardOutputDim( convDesc, inputTensor, weightTensor, &ndim, out_lens.data()); return out_lens; } namespace detail { template T RanGenWeights() { return RAN_GEN(static_cast(-0.5), static_cast(0.5)); } // Shift FP16 distribution towards positive numbers, // otherwise Winograd FP16 validation fails. template <> float16 RanGenWeights() { return RAN_GEN(static_cast(-1.0 / 3.0), static_cast(0.5)); } } // namespace detail template int ConvDriver::AllocateBuffersAndCopy() { if(wall_enabled) { fwd_auxiliary.start(); fwd_auxiliary.pause(); bwd_auxiliary.start(); bwd_auxiliary.pause(); wrw_auxiliary.start(); wrw_auxiliary.pause(); } bool is_transform = IsInputTensorTransform(); bool is_int8 = data_type == miopenInt8 || data_type == miopenInt8x4; size_t in_sz = GetTensorSize(inputTensor); size_t wei_sz = GetTensorSize(weightTensor); size_t out_sz = GetTensorSize(outputTensor); // Workaround: Pad buffers allocations to be a multiple of 2M if(miopen::IsEnabled(MIOPEN_DRIVER_PAD_BUFFERS_2M{})) { // PadBufferSize(in_sz, sizeof(Tgpu)); PadBufferSize(wei_sz, sizeof(Tgpu)); PadBufferSize(out_sz, sizeof(Tgpu)); } #if MIOPEN_BACKEND_OPENCL cl_context ctx; clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); #elif MIOPEN_BACKEND_HIP uint32_t ctx = 0; #endif ws_sizeof_find_fwd = 0; ws_sizeof_find_wrw = 0; ws_sizeof_find_bwd = 0; if(!immediate_solution) { miopenStatus_t rc = miopenStatusSuccess; if(is_wrw && rc == miopenStatusSuccess) { wrw_auxiliary.resume(wall_enabled); rc = miopenConvolutionBackwardWeightsGetWorkSpaceSize(GetHandle(), outputTensor, inputTensor, convDesc, weightTensor, &ws_sizeof_find_wrw); wrw_auxiliary.pause(wall_enabled); } if(is_bwd && rc == miopenStatusSuccess) { bwd_auxiliary.resume(wall_enabled); rc = miopenConvolutionBackwardDataGetWorkSpaceSize(GetHandle(), outputTensor, weightTensor, convDesc, inputTensor, &ws_sizeof_find_bwd); bwd_auxiliary.pause(wall_enabled); } if(is_fwd && rc == miopenStatusSuccess) { fwd_auxiliary.resume(wall_enabled); rc = miopenConvolutionForwardGetWorkSpaceSize( GetHandle(), (is_transform ? weightTensor_vect4 : weightTensor), (is_transform ? inputTensor_vect4 : inputTensor), convDesc, outputTensor, &ws_sizeof_find_fwd); fwd_auxiliary.pause(wall_enabled); } if(rc != miopenStatusSuccess) { std::cout << "Error getting workspace size, status = " << rc << std::endl; return rc; } const auto wsSizeof = std::max(std::max(ws_sizeof_find_bwd, ws_sizeof_find_wrw), ws_sizeof_find_fwd); if(wsSizeof != 0) { workspace_dev = std::unique_ptr(new GPUMem(ctx, wsSizeof, 1)); } } if(is_fwd || is_wrw) in = tensor(miopen::deref(inputTensor).GetLengths()); if(is_fwd || is_bwd) wei = tensor(miopen::deref(weightTensor).GetLengths()); if(is_fwd) out = tensor(miopen::deref(outputTensor).GetLengths()); if(is_bwd || is_wrw) dout = tensor(miopen::deref(outputTensor).GetLengths()); if(is_bwd) din = std::vector(in_sz, static_cast(0)); if(is_wrw) dwei = std::vector(wei_sz, static_cast(0)); if(is_int8) out_int8 = std::vector(out_sz, static_cast(0)); if(is_transform) { in_vect4_dev = std::unique_ptr( new GPUMem(ctx, GetTensorSize(inputTensor_vect4), sizeof(Tgpu))); wei_vect4_dev = std::unique_ptr( new GPUMem(ctx, GetTensorSize(weightTensor_vect4), sizeof(Tgpu))); } outhost = tensor(miopen::deref(outputTensor).GetLengths()); din_host = tensor(miopen::deref(inputTensor).GetLengths()); dwei_host = tensor(miopen::deref(weightTensor).GetLengths()); std::string inFileName = inflags.GetValueStr("in_data"); std::string weiFileName = inflags.GetValueStr("weights"); std::string biasFileName = inflags.GetValueStr("in_bias"); std::string doutFileName = inflags.GetValueStr("dout_data"); /* Unless seed is persistent between runs validation using cache stored in file is impossible. */ srand(0); bool dataRead = false; if(is_fwd || is_wrw) if(!inFileName.empty()) dataRead = readBufferFromFile(in.data.data(), in_sz, inFileName.c_str()); bool weiRead = false; if(is_fwd || is_bwd) if(!weiFileName.empty()) weiRead = readBufferFromFile(wei.data.data(), wei_sz, weiFileName.c_str()); if(is_int8) { float Data_scale = 127.0; if(!dataRead) { for(int i = 0; i < in_sz; i++) { if(is_fwd || is_wrw) in.data[i] = static_cast(Data_scale * RAN_GEN(static_cast(0.0), static_cast(1.0))); else /// \anchor move_rand /// Move rand() forward, even if buffer is unused. This provides the same /// initialization of input buffers regardless of which kinds of /// convolutions are currently selectedfor testing (see the "-F" option). /// Verification cache would be broken otherwise. rand(); } } if(inflags.GetValueInt("bias") != 0) { size_t b_sz = GetTensorSize(biasTensor); b_dev = std::unique_ptr(new GPUMem(ctx, b_sz, sizeof(float))); b_int8 = std::vector(b_sz, static_cast(0)); for(int i = 0; i < b_sz; i++) { b_int8[i] = static_cast(i % 8) + RAN_GEN(static_cast(0.0), static_cast(1.0)); } if(!biasFileName.empty()) { readBufferFromFile(b_int8.data(), b_sz, biasFileName.c_str()); } b_dev->ToGPU(q, b_int8.data()); } if(!weiRead) { for(int i = 0; i < wei_sz; i++) if(is_fwd || is_bwd) wei.data[i] = static_cast(Data_scale * 2 * detail::RanGenWeights()); else /// \ref move_rand rand(); } } else { Tgpu Data_scale = static_cast(0.01); bool doutRead = false; if(is_bwd || is_wrw) if(!doutFileName.empty()) doutRead = readBufferFromFile(dout.data.data(), out_sz, doutFileName.c_str()); if(!dataRead) { for(int i = 0; i < in_sz; i++) { if(is_fwd || is_wrw) in.data[i] = Data_scale * RAN_GEN(static_cast(0.0), static_cast(1.0)); else /// \ref move_rand rand(); } } if(!doutRead) { for(int i = 0; i < out_sz; i++) if(is_bwd || is_wrw) dout.data[i] = Data_scale * RAN_GEN(static_cast(0.0), static_cast(1.0)); else /// \ref move_rand rand(); } if(inflags.GetValueInt("bias") != 0) { size_t b_sz = GetTensorSize(biasTensor); b_dev = std::unique_ptr(new GPUMem(ctx, b_sz, sizeof(Tgpu))); db_dev = std::unique_ptr(new GPUMem(ctx, b_sz, sizeof(Tgpu))); b = tensor(miopen::deref(biasTensor).GetLengths()); db = std::vector(b_sz, static_cast(0)); db_host = tensor(miopen::deref(biasTensor).GetLengths()); for(int i = 0; i < b_sz; i++) { b.data[i] = static_cast(i % 8) + RAN_GEN(static_cast(0.0), static_cast(1.0)); db[i] = static_cast(i % 8) + RAN_GEN(static_cast(0.0), static_cast(1.0)); } if(!biasFileName.empty()) { readBufferFromFile(b.data.data(), b_sz, biasFileName.c_str()); } b_dev->ToGPU(q, b.data.data()); db_dev->ToGPU(q, db.data()); } if(!weiRead) { for(int i = 0; i < wei_sz; i++) if(is_fwd || is_bwd) wei.data[i] = Data_scale * detail::RanGenWeights(); else /// \ref move_rand rand(); } } if(inflags.GetValueInt("dump_output")) { if(is_fwd || is_wrw) dumpBufferToFile("dump_in.bin", in.data.data(), in_sz); if(is_fwd || is_bwd) dumpBufferToFile("dump_wei.bin", wei.data.data(), wei_sz); if(inflags.GetValueInt("bias") != 0) dumpBufferToFile("dump_bias.bin", b.data.data(), b.data.size()); if(is_bwd || is_wrw) dumpBufferToFile("dump_dout.bin", dout.data.data(), out_sz); } status_t status = STATUS_SUCCESS; if(is_fwd || is_wrw) { in_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); status |= in_dev->ToGPU(q, in.data.data()); } if(is_bwd) { din_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); status |= din_dev->ToGPU(q, din.data()); } if(is_fwd || is_bwd) { wei_dev = std::unique_ptr(new GPUMem(ctx, wei_sz, sizeof(Tgpu))); status |= wei_dev->ToGPU(q, wei.data.data()); } if(is_wrw) { dwei_dev = std::unique_ptr(new GPUMem(ctx, wei_sz, sizeof(Tgpu))); status |= dwei_dev->ToGPU(q, dwei.data()); } if(is_bwd || is_wrw) { dout_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); status |= dout_dev->ToGPU(q, dout.data.data()); } if(is_fwd) { out_dev = std::unique_ptr( new GPUMem(ctx, out_sz, is_int8 ? sizeof(float) : sizeof(Tgpu))); status |= (is_int8 ? out_dev->ToGPU(q, out_int8.data()) : out_dev->ToGPU(q, out.data.data())); } if(status != STATUS_SUCCESS) { std::cout << "Error copying data to GPU, status = " << status << std::endl; return miopenStatusNotInitialized; } return miopenStatusSuccess; } template int ConvDriver::FindForward(int& ret_algo_count, int request_algo_count, std::vector& perf_results) { bool is_transform = IsInputTensorTransform(); fwd_auxiliary.resume(wall_enabled); const auto rc = miopenFindConvolutionForwardAlgorithm( GetHandle(), (is_transform ? inputTensor_vect4 : inputTensor), (is_transform ? in_vect4_dev->GetMem() : in_dev->GetMem()), (is_transform ? weightTensor_vect4 : weightTensor), (is_transform ? wei_vect4_dev->GetMem() : wei_dev->GetMem()), convDesc, outputTensor, out_dev->GetMem(), request_algo_count, &ret_algo_count, perf_results.data(), workspace_dev != nullptr ? workspace_dev->GetMem() : nullptr, ws_sizeof_find_fwd, (inflags.GetValueInt("search") == 1) ? true : false); fwd_auxiliary.pause(wall_enabled); return rc; } template void ConvDriver::PrintForwardTime(const float kernel_total_time, const float kernel_first_time) const { float kernel_average_time = num_iterations > 1 ? (kernel_total_time - kernel_first_time) / (num_iterations - 1) : kernel_first_time; printf("GPU Kernel Time Forward Conv. Elapsed: %f ms (average)\n", kernel_average_time); const auto num_dim = miopen::deref(inputTensor).GetSize() - 2; if(num_dim != 2) { printf("stats: for conv%dd\n", num_dim); return; } int in_n, in_c, in_h, in_w; std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(miopen::deref(inputTensor).GetLengths()); int wei_c, wei_n, wei_h, wei_w; std::tie(wei_c, wei_n, wei_h, wei_w) = miopen::tien<4>(miopen::deref(weightTensor).GetLengths()); int out_n, out_c, out_h, out_w; std::tie(out_n, out_c, out_h, out_w) = miopen::tien<4>(miopen::deref(outputTensor).GetLengths()); size_t flopCnt = 2L * in_n * in_c * wei_h * wei_w * out_c * out_h * out_w; size_t inputBytes = in_n * in_c * in_h * in_w * miopen::GetTypeSize(miopen::deref(inputTensor).GetType()); size_t weightBytes = wei_n * wei_c * wei_h * wei_w * miopen::GetTypeSize(miopen::deref(weightTensor).GetType()); size_t readBytes = inputBytes + weightBytes; size_t outputBytes = 1.0 * out_n * out_c * out_h * out_w * miopen::GetTypeSize(miopen::deref(outputTensor).GetType()); printf("stats: name, n, c, ho, wo, x, y, k, flopCnt, bytesRead, bytesWritten, GFLOPs, " "GB/s, timeMs\n"); printf("stats: %s%dx%du%d, %u, %u, %u, %u, %u, %u, %u, %zu, %zu, %zu, %.0f, %.0f, %f\n", "fwd-conv", wei_h, wei_w, miopen::deref(convDesc).GetConvStrides()[0], in_n, in_c, out_h, out_w, wei_h, wei_w, out_c, flopCnt, readBytes, outputBytes, flopCnt / kernel_average_time / 1e6, (readBytes + outputBytes) / kernel_average_time / 1e6, kernel_average_time); } template int ConvDriver::RunForwardGPU() { if(!is_fwd) return 0; int rc; bool is_transform = IsInputTensorTransform(); if(is_transform) { float aph = 1.0; float bta = 0.0; miopenTransformTensor(GetHandle(), &aph, inputTensor, in_dev->GetMem(), &bta, inputTensor_vect4, in_vect4_dev->GetMem()); miopenTransformTensor(GetHandle(), &aph, weightTensor, wei_dev->GetMem(), &bta, weightTensor_vect4, wei_vect4_dev->GetMem()); } if(immediate_solution) rc = RunForwardGpuImmed(is_transform); else rc = RunForwardGpuFind(is_transform); if(rc != miopenStatusSuccess) return rc; if(inflags.GetValueInt("bias") != 0) { float alpha = static_cast(1), beta = static_cast(0); miopenConvolutionForwardBias(GetHandle(), &alpha, biasTensor, b_dev->GetMem(), &beta, outputTensor, out_dev->GetMem()); if(time_enabled) { float time = 0.0; miopenGetKernelTime(GetHandle(), &time); printf("GPU Kernel Time Forward Conv. Bias Elapsed: %f ms\n", time); } } bool is_int8 = data_type == miopenInt8 || data_type == miopenInt8x4; if(is_int8) out_dev->FromGPU(GetStream(), out_int8.data()); else out_dev->FromGPU(GetStream(), out.data.data()); if(inflags.GetValueInt("dump_output")) { if(is_int8) dumpBufferToFile("dump_fwd_out_gpu.bin", out_int8.data(), out_int8.size()); else dumpBufferToFile("dump_fwd_out_gpu.bin", out.data.data(), out.data.size()); } return rc; } template int ConvDriver::RunForwardGpuFind(const bool is_transform) { int ret_algo_count; int request_algo_count = 2; // The library returns `request_algo_count` algorithms to the caller. However this does // not affect auto-tuning and find-db updates. Internally, the library searches for // *all* available algorithms during Find(), -- regardless of how many algorithms // requested, -- so perf-db and find-db are fully updated. std::vector perf_results(request_algo_count); auto rc = FindForward(ret_algo_count, request_algo_count, perf_results); if(rc != miopenStatusSuccess) return rc; if(ret_algo_count == 0) throw std::runtime_error("Find Forward Conv. ret_algo_count == 0"); float alpha = static_cast(1), beta = static_cast(0); float kernel_total_time = 0.0; float kernel_first_time = 0.0; wall.start(wall_enabled); for(int i = 0; i < num_iterations; i++) { rc = miopenConvolutionForward(GetHandle(), &alpha, (is_transform ? inputTensor_vect4 : inputTensor), (is_transform ? in_vect4_dev->GetMem() : in_dev->GetMem()), (is_transform ? weightTensor_vect4 : weightTensor), (is_transform ? wei_vect4_dev->GetMem() : wei_dev->GetMem()), convDesc, perf_results[0].fwd_algo, // use the fastest algo &beta, outputTensor, out_dev->GetMem(), workspace_dev != nullptr ? workspace_dev->GetMem() : nullptr, perf_results[0].memory); if(rc != miopenStatusSuccess) return rc; if(time_enabled) { float time = 0.0; miopenGetKernelTime(GetHandle(), &time); kernel_total_time += time; if(i == 0) kernel_first_time = time; } } if(wall_enabled) { wall.stop(); fwd_auxiliary.stop(); std::cout << "Wall-clock Time Forward Conv. Elapsed: " << (wall.gettime_ms() / num_iterations) << " ms" << ", Auxiliary API calls: " << fwd_auxiliary.gettime_ms() << " ms" << std::endl; } if(time_enabled) { printf("MIOpen Forward Conv. Algorithm: %d\n", perf_results[0].fwd_algo); PrintForwardTime(kernel_total_time, kernel_first_time); } return rc; } template int ConvDriver::RunForwardGpuImmed(const bool is_transform) { std::size_t count; fwd_auxiliary.resume(wall_enabled); auto rc = miopenConvolutionForwardGetSolutionCount(handle, (is_transform ? weightTensor_vect4 : weightTensor), (is_transform ? inputTensor_vect4 : inputTensor), convDesc, outputTensor, &count); fwd_auxiliary.pause(wall_enabled); if(rc != miopenStatusSuccess) return rc; if(count < 1) return miopenStatusNotImplemented; auto solutions = std::vector(count); fwd_auxiliary.resume(wall_enabled); rc = miopenConvolutionForwardGetSolution(handle, (is_transform ? weightTensor_vect4 : weightTensor), (is_transform ? inputTensor_vect4 : inputTensor), convDesc, outputTensor, count, &count, solutions.data()); fwd_auxiliary.pause(wall_enabled); if(rc != miopenStatusSuccess) return rc; std::cout << "Forward Conv solutions available: " << count << std::endl; if(count < 1) return miopenStatusNotImplemented; solutions.resize(count); const miopenConvSolution_t* selected = nullptr; for(const auto& s : solutions) std::cout << "Solution[" << s.solution_id << "] - algo: " << miopen::ConvolutionAlgoToString(s.algorithm) << ", time: " << s.time << " ms, ws: " << s.workspace_size << std::endl; if(*immediate_solution == 0) selected = &solutions.front(); else for(const auto& s : solutions) if(*immediate_solution == s.solution_id) { selected = &s; break; } if(selected == nullptr) { std::cout << "Invalid solution id: " << *immediate_solution << std::endl; return miopenStatusBadParm; } std::size_t ws_size; fwd_auxiliary.resume(wall_enabled); rc = miopenConvolutionForwardGetSolutionWorkspaceSize( handle, (is_transform ? weightTensor_vect4 : weightTensor), (is_transform ? inputTensor_vect4 : inputTensor), convDesc, outputTensor, selected->solution_id, &ws_size); fwd_auxiliary.pause(wall_enabled); if(rc != miopenStatusSuccess) return rc; #if MIOPEN_BACKEND_OPENCL cl_context ctx; clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); #elif MIOPEN_BACKEND_HIP uint32_t ctx = 0; #endif auto ws = std::unique_ptr{ws_size > 0 ? new GPUMem{ctx, ws_size, 1} : nullptr}; fwd_auxiliary.resume(wall_enabled); rc = miopenConvolutionForwardCompileSolution(handle, (is_transform ? weightTensor_vect4 : weightTensor), (is_transform ? inputTensor_vect4 : inputTensor), convDesc, outputTensor, selected->solution_id); fwd_auxiliary.pause(wall_enabled); if(rc != miopenStatusSuccess) return rc; float kernel_total_time = 0.0; float kernel_first_time = 0.0; wall.start(wall_enabled); for(int i = 0; i < num_iterations; i++) { rc = miopenConvolutionForwardImmediate( handle, (is_transform ? weightTensor_vect4 : weightTensor), (is_transform ? wei_vect4_dev->GetMem() : wei_dev->GetMem()), (is_transform ? inputTensor_vect4 : inputTensor), (is_transform ? in_vect4_dev->GetMem() : in_dev->GetMem()), convDesc, outputTensor, out_dev->GetMem(), ws ? ws->GetMem() : nullptr, ws_size, selected->solution_id); if(rc != miopenStatusSuccess) return rc; if(time_enabled) { float time = 0.0; miopenGetKernelTime(GetHandle(), &time); kernel_total_time += time; if(i == 0) { kernel_first_time = time; if(wall_enabled && num_iterations > 1) wall.start(); // The 1st is warm-up. Disregard it in wall time. } } } if(wall_enabled) { wall.stop(); fwd_auxiliary.stop(); const auto wall_iterations = (num_iterations > 1 ? num_iterations - 1 : 1); std::cout << "Wall-clock Time Forward Conv. Elapsed: " << (wall.gettime_ms() / wall_iterations) << " ms" << ", Auxiliary API calls: " << fwd_auxiliary.gettime_ms() << " ms" << std::endl; } if(time_enabled) { std::cout << "MIOpen Forward Conv. Algorithm: " << miopen::ConvolutionAlgoToString(selected->algorithm) << "[" << selected->solution_id << "]" << std::endl; PrintForwardTime(kernel_total_time, kernel_first_time); } return miopenStatusSuccess; } template int ConvDriver::RunForwardCPU() { if(miopen::deref(convDesc).mode == miopenTranspose) { cpu_convolution_backward_data(miopen::deref(convDesc).GetSpatialDimension(), outhost, wei, in, miopen::deref(convDesc).GetConvPads(), miopen::deref(convDesc).GetConvStrides(), miopen::deref(convDesc).GetConvDilations(), miopen::deref(convDesc).GetGroupCount()); if(inflags.GetValueInt("bias") != 0) { cpu_bias_forward(outhost, b); } } else { cpu_convolution_forward(miopen::deref(convDesc).GetSpatialDimension(), in, wei, outhost, miopen::deref(convDesc).GetConvPads(), miopen::deref(convDesc).GetConvStrides(), miopen::deref(convDesc).GetConvDilations(), miopen::deref(convDesc).GetGroupCount()); if(inflags.GetValueInt("bias") != 0) { outhost.par_for_each([&](auto out_n_id, auto out_k_id, auto... out_spatial_id_pack) { outhost(out_n_id, out_k_id, out_spatial_id_pack...) = double(outhost(out_n_id, out_k_id, out_spatial_id_pack...)) + double(b.data[out_k_id]); }); } } if(inflags.GetValueInt("dump_output")) { dumpBufferToFile("dump_fwd_out_cpu.bin", outhost.data.data(), outhost.data.size()); } TrySaveVerificationCache(GetVCacheFwdOutBasename(), outhost.data); return 0; } template int ConvDriver::FindBackwardData(int& ret_algo_count, int request_algo_count, std::vector& perf_results) { bwd_auxiliary.resume(wall_enabled); const auto rc = miopenFindConvolutionBackwardDataAlgorithm( GetHandle(), outputTensor, dout_dev->GetMem(), weightTensor, wei_dev->GetMem(), convDesc, inputTensor, din_dev->GetMem(), request_algo_count, &ret_algo_count, perf_results.data(), workspace_dev != nullptr ? workspace_dev->GetMem() : nullptr, ws_sizeof_find_bwd, (inflags.GetValueInt("search") == 1) ? true : false); bwd_auxiliary.pause(wall_enabled); return rc; } template int ConvDriver::FindBackwardWeights(int& ret_algo_count, int request_algo_count, std::vector& perf_results) { wrw_auxiliary.resume(wall_enabled); const auto rc = miopenFindConvolutionBackwardWeightsAlgorithm( GetHandle(), outputTensor, dout_dev->GetMem(), inputTensor, in_dev->GetMem(), convDesc, weightTensor, dwei_dev->GetMem(), request_algo_count, &ret_algo_count, perf_results.data(), workspace_dev != nullptr ? workspace_dev->GetMem() : nullptr, ws_sizeof_find_wrw, (inflags.GetValueInt("search") == 1) ? true : false); wrw_auxiliary.pause(wall_enabled); return rc; } template int ConvDriver::RunBackwardGPU() { if(data_type == miopenInt8 || data_type == miopenInt8x4) { std::cout << "Int8 Backward Convolution is not supported" << std::endl; return 0; } if(!(is_bwd || is_wrw)) return 0; int ret = 0; if(is_bwd) { if(immediate_solution) ret |= RunBackwardDataGpuImmed(); else ret |= RunBackwardDataGpuFind(); } if(is_wrw) { if(immediate_solution) ret |= RunBackwardWrwGpuImmed(); else ret |= RunBackwardWrwGpuFind(); } if(inflags.GetValueInt("dump_output")) { if(is_bwd) dumpBufferToFile("dump_bwd_din_gpu.bin", din.data(), din.size()); if(is_wrw) dumpBufferToFile("dump_bwd_dwei_gpu.bin", dwei.data(), dwei.size()); } if(inflags.GetValueInt("bias") != 0) { float alpha = static_cast(1), beta = static_cast(0); ret |= miopenConvolutionBackwardBias(GetHandle(), &alpha, outputTensor, dout_dev->GetMem(), &beta, biasTensor, db_dev->GetMem()); if(time_enabled) { float time = 0.0; miopenGetKernelTime(GetHandle(), &time); printf("GPU Kernel Time Backward Bias Conv. Elapsed: %f ms\n", time); } db_dev->FromGPU(GetStream(), db.data()); if(inflags.GetValueInt("dump_output")) { dumpBufferToFile("dump_bwd_db_gpu.bin", db.data(), db.size()); } } return ret; } template int ConvDriver::RunBackwardDataGpuFind() { int ret_algo_count; int request_algo_count = 2; std::vector perf_results_data(request_algo_count); auto rc = FindBackwardData(ret_algo_count, request_algo_count, perf_results_data); if(rc != miopenStatusSuccess) return rc; if(ret_algo_count == 0) throw std::runtime_error("Find Backward Data Conv. ret_algo_count == 0"); float kernel_total_time = 0.0; float kernel_first_time = 0.0; float alpha = static_cast(1), beta = static_cast(0); wall.start(wall_enabled); for(int i = 0; i < num_iterations; i++) { rc = miopenConvolutionBackwardData(GetHandle(), &alpha, outputTensor, dout_dev->GetMem(), weightTensor, wei_dev->GetMem(), convDesc, perf_results_data[0].bwd_data_algo, &beta, inputTensor, din_dev->GetMem(), workspace_dev != nullptr ? workspace_dev->GetMem() : nullptr, perf_results_data[0].memory); if(rc != miopenStatusSuccess) return rc; if(time_enabled) { float time = 0.0; miopenGetKernelTime(GetHandle(), &time); kernel_total_time += time; if(i == 0) kernel_first_time = time; } } if(wall_enabled) { wall.stop(); bwd_auxiliary.stop(); std::cout << "Wall-clock Time Backward Data Conv. Elapsed: " << (wall.gettime_ms() / num_iterations) << " ms" << ", Auxiliary API calls: " << bwd_auxiliary.gettime_ms() << " ms" << std::endl; } if(time_enabled) { printf("MIOpen Backward Data Conv. Algorithm: %d\n", perf_results_data[0].bwd_data_algo); PrintBackwardDataTime(kernel_total_time, kernel_first_time); } din_dev->FromGPU(GetStream(), din.data()); return rc; } template void ConvDriver::PrintBackwardDataTime(float kernel_total_time, float kernel_first_time) { float kernel_average_time = num_iterations > 1 ? (kernel_total_time - kernel_first_time) / (num_iterations - 1) : kernel_first_time; printf("GPU Kernel Time Backward Data Conv. Elapsed: %f ms (average)\n", kernel_average_time); const auto num_dim = miopen::deref(inputTensor).GetSize() - 2; if(num_dim != 2) { printf("stats: for conv%dd\n", num_dim); return; } int in_n, in_c, in_h, in_w; std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(miopen::deref(inputTensor).GetLengths()); int wei_c, wei_n, wei_h, wei_w; std::tie(wei_c, wei_n, wei_h, wei_w) = miopen::tien<4>(miopen::deref(weightTensor).GetLengths()); int out_n, out_c, out_h, out_w; std::tie(out_n, out_c, out_h, out_w) = miopen::tien<4>(miopen::deref(outputTensor).GetLengths()); size_t flopCnt = 2L * in_n * in_c * out_h * out_w * wei_h * wei_w * out_c; size_t weightBytes = wei_n * wei_c * wei_h * wei_w * miopen::GetTypeSize(miopen::deref(weightTensor).GetType()); size_t inputBytes = in_n * in_c * out_c * miopen::GetTypeSize(miopen::deref(inputTensor).GetType()); size_t readBytes = inputBytes + weightBytes; size_t outputBytes = 1.0 * out_n * out_c * out_h * out_w * miopen::GetTypeSize(miopen::deref(outputTensor).GetType()); printf("stats: name, n, c, ho, wo, x, y, k, flopCnt, bytesRead, bytesWritten, GFLOPs, " "GB/s, timeMs\n"); printf("stats: %s%dx%du%d, %u, %u, %u, %u, %u, %u, %u, %zu, %zu, %zu, %.0f, %.0f, %f\n", "bwdd-conv", wei_h, wei_w, miopen::deref(convDesc).GetConvStrides()[0], in_n, in_c, wei_h, wei_w, out_c, out_h, out_w, flopCnt, readBytes, outputBytes, flopCnt / kernel_average_time / 1e6, (readBytes + outputBytes) / kernel_average_time / 1e6, kernel_average_time); } template int ConvDriver::RunBackwardWrwGpuFind() { int ret_algo_count; int request_algo_count = 2; float kernel_total_time = 0.0; float kernel_first_time = 0.0; float alpha = static_cast(1), beta = static_cast(0); std::vector perf_results_weights(request_algo_count); auto rc = FindBackwardWeights(ret_algo_count, request_algo_count, perf_results_weights); if(rc != miopenStatusSuccess) return rc; if(ret_algo_count == 0) throw std::runtime_error("Find Backward Weights Conv. ret_algo_count == 0"); kernel_total_time = 0.0; kernel_first_time = 0.0; const auto algo = perf_results_weights[0].bwd_weights_algo; const auto ws_size = perf_results_weights[0].memory; is_wrw_winograd = (algo == miopenConvolutionBwdWeightsAlgoWinograd); wall.start(wall_enabled); for(int i = 0; i < num_iterations; i++) { rc = miopenConvolutionBackwardWeights(GetHandle(), &alpha, outputTensor, dout_dev->GetMem(), inputTensor, in_dev->GetMem(), convDesc, algo, &beta, weightTensor, dwei_dev->GetMem(), workspace_dev != nullptr ? workspace_dev->GetMem() : nullptr, ws_size); if(rc != miopenStatusSuccess) return rc; if(time_enabled) { float time = 0.0; miopenGetKernelTime(GetHandle(), &time); kernel_total_time += time; if(i == 0) kernel_first_time = time; } } if(wall_enabled) { wall.stop(); wrw_auxiliary.stop(); std::cout << "Wall-clock Time Backward Weights Conv. Elapsed: " << (wall.gettime_ms() / num_iterations) << " ms" << ", Auxiliary API calls: " << wrw_auxiliary.gettime_ms() << " ms" << std::endl; } if(time_enabled) { printf("MIOpen Backward Weights Conv. Algorithm: %d\n", algo); PrintBackwardWrwTime(kernel_total_time, kernel_first_time); } dwei_dev->FromGPU(GetStream(), dwei.data()); return rc; } template void ConvDriver::PrintBackwardWrwTime(float kernel_total_time, float kernel_first_time) { float time = 0.0; miopenGetKernelTime(GetHandle(), &time); float kernel_average_time = num_iterations > 1 ? (kernel_total_time - kernel_first_time) / (num_iterations - 1) : kernel_first_time; printf("GPU Kernel Time Backward Weights Conv. Elapsed: %f ms (average)\n", kernel_average_time); const auto num_dim = miopen::deref(inputTensor).GetSize() - 2; if(num_dim != 2) { printf("stats: for conv%dd\n", num_dim); return; } int in_n, in_c, in_h, in_w; std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(miopen::deref(inputTensor).GetLengths()); int wei_c, wei_n, wei_h, wei_w; std::tie(wei_c, wei_n, wei_h, wei_w) = miopen::tien<4>(miopen::deref(weightTensor).GetLengths()); int out_n, out_c, out_h, out_w; std::tie(out_n, out_c, out_h, out_w) = miopen::tien<4>(miopen::deref(outputTensor).GetLengths()); size_t flopCnt = 2L * in_n * in_c * out_h * out_w * wei_h * wei_w * out_c; size_t readBytes = 0; size_t outputBytes = 0; printf("stats: name, n, c, ho, wo, x, y, k, flopCnt, bytesRead, bytesWritten, GFLOPs, " "GB/s, timeMs\n"); printf("stats: %s%dx%du%d, %u, %u, %u, %u, %u, %u, %u, %zu, %zu, %zu, %.0f, %.0f, %f\n", "bwdw-conv", wei_h, wei_w, miopen::deref(convDesc).GetConvStrides()[0], in_n, in_c, out_h, out_w, wei_h, wei_w, out_c, flopCnt, readBytes, outputBytes, flopCnt / kernel_average_time / 1e6, (readBytes + outputBytes) / kernel_average_time / 1e6, kernel_average_time); } template int ConvDriver::RunBackwardDataGpuImmed() { std::size_t count; bwd_auxiliary.resume(wall_enabled); auto rc = miopenConvolutionBackwardDataGetSolutionCount( handle, outputTensor, weightTensor, convDesc, inputTensor, &count); bwd_auxiliary.pause(wall_enabled); if(rc != miopenStatusSuccess) return rc; if(count < 1) return miopenStatusNotImplemented; auto solutions = std::vector(count); bwd_auxiliary.resume(wall_enabled); rc = miopenConvolutionBackwardDataGetSolution( handle, outputTensor, weightTensor, convDesc, inputTensor, count, &count, solutions.data()); bwd_auxiliary.pause(wall_enabled); if(rc != miopenStatusSuccess) return rc; std::cout << "Backward Data Conv solutions available: " << count << std::endl; if(count < 1) return miopenStatusNotImplemented; solutions.resize(count); const miopenConvSolution_t* selected = nullptr; for(const auto& s : solutions) std::cout << "Solution[" << s.solution_id << "] - algo: " << miopen::ConvolutionAlgoToString(s.algorithm) << ", time: " << s.time << " ms, ws: " << s.workspace_size << std::endl; if(*immediate_solution == 0) selected = &solutions.front(); else for(const auto& s : solutions) if(*immediate_solution == s.solution_id) { selected = &s; break; } if(selected == nullptr) { std::cout << "Invalid solution id: " << *immediate_solution << std::endl; return miopenStatusBadParm; } std::size_t ws_size; bwd_auxiliary.resume(wall_enabled); rc = miopenConvolutionBackwardDataGetSolutionWorkspaceSize( handle, outputTensor, weightTensor, convDesc, inputTensor, selected->solution_id, &ws_size); bwd_auxiliary.pause(wall_enabled); #if MIOPEN_BACKEND_OPENCL cl_context ctx; clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); #elif MIOPEN_BACKEND_HIP uint32_t ctx = 0; #endif auto ws = std::unique_ptr{ws_size > 0 ? new GPUMem{ctx, ws_size, 1} : nullptr}; bwd_auxiliary.resume(wall_enabled); rc = miopenConvolutionBackwardDataCompileSolution( handle, outputTensor, weightTensor, convDesc, inputTensor, selected->solution_id); bwd_auxiliary.pause(wall_enabled); float kernel_total_time = 0.0; float kernel_first_time = 0.0; wall.start(wall_enabled); for(int i = 0; i < num_iterations; i++) { rc = miopenConvolutionBackwardDataImmediate(handle, outputTensor, dout_dev->GetMem(), weightTensor, wei_dev->GetMem(), convDesc, inputTensor, din_dev->GetMem(), ws ? ws->GetMem() : nullptr, ws_size, selected->solution_id); if(rc != miopenStatusSuccess) return rc; if(time_enabled) { float time = 0.0; miopenGetKernelTime(GetHandle(), &time); kernel_total_time += time; if(i == 0) { kernel_first_time = time; if(wall_enabled && num_iterations > 1) wall.start(); } } } if(wall_enabled) { wall.stop(); bwd_auxiliary.stop(); const auto wall_iterations = (num_iterations > 1 ? num_iterations - 1 : 1); std::cout << "Wall-clock Time Backward Data Conv. Elapsed: " << (wall.gettime_ms() / wall_iterations) << " ms" << ", Auxiliary API calls: " << bwd_auxiliary.gettime_ms() << " ms" << std::endl; } if(time_enabled) { std::cout << "MIOpen Backward Data Conv. Algorithm: " << miopen::ConvolutionAlgoToString(selected->algorithm) << "[" << selected->solution_id << "]" << std::endl; PrintBackwardDataTime(kernel_total_time, kernel_first_time); } din_dev->FromGPU(GetStream(), din.data()); return rc; } template int ConvDriver::RunBackwardWrwGpuImmed() { std::size_t count; wrw_auxiliary.resume(wall_enabled); auto rc = miopenConvolutionBackwardWeightsGetSolutionCount( handle, outputTensor, inputTensor, convDesc, weightTensor, &count); wrw_auxiliary.pause(wall_enabled); if(rc != miopenStatusSuccess) return rc; if(count < 1) return miopenStatusNotImplemented; auto solutions = std::vector(count); wrw_auxiliary.resume(wall_enabled); rc = miopenConvolutionBackwardWeightsGetSolution( handle, outputTensor, inputTensor, convDesc, weightTensor, count, &count, solutions.data()); wrw_auxiliary.pause(wall_enabled); if(rc != miopenStatusSuccess) return rc; std::cout << "Backward Weights Conv solutions available: " << count << std::endl; if(count < 1) return miopenStatusNotImplemented; solutions.resize(count); const miopenConvSolution_t* selected = nullptr; for(const auto& s : solutions) std::cout << "Solution[" << s.solution_id << "] - algo: " << miopen::ConvolutionAlgoToString(s.algorithm) << ", time: " << s.time << " ms, ws: " << s.workspace_size << std::endl; if(*immediate_solution == 0) selected = &solutions.front(); else for(const auto& s : solutions) if(*immediate_solution == s.solution_id) { selected = &s; break; } if(selected == nullptr) { std::cout << "Invalid solution id: " << *immediate_solution << std::endl; return miopenStatusBadParm; } std::size_t ws_size; wrw_auxiliary.resume(wall_enabled); rc = miopenConvolutionBackwardWeightsGetSolutionWorkspaceSize( handle, outputTensor, inputTensor, convDesc, weightTensor, selected->solution_id, &ws_size); wrw_auxiliary.pause(wall_enabled); #if MIOPEN_BACKEND_OPENCL cl_context ctx; clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); #elif MIOPEN_BACKEND_HIP uint32_t ctx = 0; #endif auto ws = std::unique_ptr{ws_size > 0 ? new GPUMem{ctx, ws_size, 1} : nullptr}; wrw_auxiliary.resume(wall_enabled); rc = miopenConvolutionBackwardWeightsCompileSolution( handle, outputTensor, inputTensor, convDesc, weightTensor, selected->solution_id); wrw_auxiliary.pause(wall_enabled); float kernel_total_time = 0.0; float kernel_first_time = 0.0; wall.start(wall_enabled); for(int i = 0; i < num_iterations; i++) { rc = miopenConvolutionBackwardWeightsImmediate(handle, outputTensor, dout_dev->GetMem(), inputTensor, in_dev->GetMem(), convDesc, weightTensor, dwei_dev->GetMem(), ws ? ws->GetMem() : nullptr, ws_size, selected->solution_id); if(rc != miopenStatusSuccess) return rc; if(time_enabled) { float time = 0.0; miopenGetKernelTime(GetHandle(), &time); kernel_total_time += time; if(i == 0) { kernel_first_time = time; if(wall_enabled && num_iterations > 1) wall.start(); } } } if(wall_enabled) { wall.stop(); wrw_auxiliary.stop(); const auto wall_iterations = (num_iterations > 1 ? num_iterations - 1 : 1); std::cout << "Wall-clock Time Backward Weights Conv. Elapsed: " << (wall.gettime_ms() / wall_iterations) << " ms" << ", Auxiliary API calls: " << wrw_auxiliary.gettime_ms() << " ms" << std::endl; } if(time_enabled) { std::cout << "MIOpen Backward Weights Conv. Algorithm: " << miopen::ConvolutionAlgoToString(selected->algorithm) << "[" << selected->solution_id << "]" << std::endl; PrintBackwardWrwTime(kernel_total_time, kernel_first_time); } is_wrw_winograd = (selected->algorithm == miopenConvolutionAlgoWinograd); dwei_dev->FromGPU(GetStream(), dwei.data()); return rc; } template int ConvDriver::RunBackwardWeightsCPU() { if(miopen::deref(convDesc).mode == miopenTranspose) { cpu_convolution_backward_weight(miopen::deref(convDesc).GetSpatialDimension(), dout, dwei_host, in, miopen::deref(convDesc).GetConvPads(), miopen::deref(convDesc).GetConvStrides(), miopen::deref(convDesc).GetConvDilations(), miopen::deref(convDesc).GetGroupCount()); } else { cpu_convolution_backward_weight(miopen::deref(convDesc).GetSpatialDimension(), in, dwei_host, dout, miopen::deref(convDesc).GetConvPads(), miopen::deref(convDesc).GetConvStrides(), miopen::deref(convDesc).GetConvDilations(), miopen::deref(convDesc).GetGroupCount()); } if(inflags.GetValueInt("dump_output")) { dumpBufferToFile( "dump_bwd_dwei_cpu.bin", dwei_host.data.data(), dwei_host.data.size()); } TrySaveVerificationCache(GetVCacheBwdWeightBasename(), dwei_host.data); return 0; } template int ConvDriver::RunBackwardDataCPU() { if(miopen::deref(convDesc).mode == miopenTranspose) { cpu_convolution_forward(miopen::deref(convDesc).GetSpatialDimension(), dout, wei, din_host, miopen::deref(convDesc).GetConvPads(), miopen::deref(convDesc).GetConvStrides(), miopen::deref(convDesc).GetConvDilations(), miopen::deref(convDesc).GetGroupCount()); } else { cpu_convolution_backward_data(miopen::deref(convDesc).GetSpatialDimension(), din_host, wei, dout, miopen::deref(convDesc).GetConvPads(), miopen::deref(convDesc).GetConvStrides(), miopen::deref(convDesc).GetConvDilations(), miopen::deref(convDesc).GetGroupCount()); } if(inflags.GetValueInt("dump_output")) { dumpBufferToFile("dump_bwd_din_cpu.bin", din_host.data.data(), din_host.data.size()); } TrySaveVerificationCache(GetVCacheBwdDataBasename(), din_host.data); return 0; } template int ConvDriver::RunBackwardBiasCPU() { cpu_bias_backward_data(dout, db_host); if(inflags.GetValueInt("dump_output")) { dumpBufferToFile("dump_bwd_db_cpu.bin", db_host.data.data(), db_host.data.size()); } TrySaveVerificationCache(GetVCacheBiasBwdDataBasename(), db_host.data); return 0; } template std::string ConvDriver::GetVerificationCacheFileName() const { std::ostringstream ss; miopenConvolutionMode_t mode; int spatial_dim = inflags.GetValueInt("spatial_dim"); std::vector pads(spatial_dim); std::vector conv_strides(spatial_dim); std::vector conv_dilations(spatial_dim); std::vector trans_output_pads(spatial_dim); miopenGetConvolutionNdDescriptor(convDesc, spatial_dim, &spatial_dim, pads.data(), conv_strides.data(), conv_dilations.data(), &mode); auto get_datatype_string = [](auto type) { if(std::is_same::value) { return "int8"; } else if(std::is_same::value) { return "float16"; } else if(std::is_same::value) { return "float"; } else if(std::is_same::value) { return "double"; } else if(std::is_same::value) { return "bfloat16"; } else { MIOPEN_THROW("unknown data type"); } }; ss << mode; ss << "_" << spatial_dim; ss << "_" << miopen::deref(convDesc).paddingMode; ss << "_" << miopen::deref(convDesc).GetGroupCount(); miopen::LogRange(ss << "_", miopen::deref(inputTensor).GetLengths(), "x"); miopen::LogRange(ss << "_", miopen::deref(weightTensor).GetLengths(), "x"); miopen::LogRange(ss << "_", pads, "x"); miopen::LogRange(ss << "_", conv_strides, "x"); miopen::LogRange(ss << "_", conv_dilations, "x"); miopen::LogRange(ss << "_", trans_output_pads, "x"); ss << "_" << inflags.GetValueInt("pad_val"); ss << "_" << inflags.GetValueInt("bias"); ss << "_" << "GPU" << get_datatype_string(Tgpu{}); ss << "_" << "REF" << get_datatype_string(Tref{}); return ss.str(); } template bool ConvDriver::TryReadVerificationCache(const std::string& file_name, miopenTensorDescriptor_t& tensorDesc, Tref* data) const { const auto verification_cache_path = inflags.GetValueStr("verification_cache"); if(!verification_cache_path.empty()) { const auto file_path = verification_cache_path + "/" + file_name + "_" + GetVerificationCacheFileName(); if(std::ifstream(file_path).good()) { if(readBufferFromFile(data, GetTensorSize(tensorDesc), file_path.c_str())) { return true; } } } return false; } template void ConvDriver::TrySaveVerificationCache(const std::string& file_name, std::vector& data) const { const auto verification_cache_path = inflags.GetValueStr("verification_cache"); if(!verification_cache_path.empty()) { const auto file_path = verification_cache_path + "/" + file_name + "_" + GetVerificationCacheFileName(); dumpBufferToFile(file_path.c_str(), data.data(), data.size()); } } template std::string ConvDriver::GetVCacheFwdOutBasename() const { return "conv_fwd_out"; } template std::string ConvDriver::GetVCacheBwdDataBasename() const { return "conv_bwd_dat"; } template std::string ConvDriver::GetVCacheBwdWeightBasename() const { return "conv_bwd_wei"; } template std::string ConvDriver::GetVCacheBiasBwdDataBasename() const { return "bias_bwd_dat"; } template int ConvDriver::VerifyForward() { if(!is_fwd) return 0; if(!TryReadVerificationCache(GetVCacheFwdOutBasename(), outputTensor, outhost.data.data())) { RunForwardCPU(); } auto error = miopen::rms_range(outhost.data, out.data); if(data_type == miopenInt8 || data_type == miopenInt8x4) error = miopen::rms_range(outhost.data, out_int8); const Tref tolerance = ((sizeof(Tgpu) == 4 || sizeof(Tgpu) == 1) ? static_cast(1e-6) : static_cast(7e-2)); if(!(error < tolerance)) { std::cout << "Forward Convolution Failed: " << error << std::endl; return EC_VerifyFwd; } std::cout << "Forward Convolution Verifies on CPU and GPU (" << error << ')' << std::endl; return 0; } template int ConvDriver::VerifyBackward() { if(data_type == miopenInt8 || data_type == miopenInt8x4) { std::cout << "Int8 Backward Convolution is not supported" << std::endl; return 0; } if(!(is_bwd || is_wrw)) return 0; const double tolerance = sizeof(Tgpu) == 4 ? 1e-6 : 7e-2; int cumulative_rc = 0; if(is_bwd) { if(!TryReadVerificationCache(GetVCacheBwdDataBasename(), inputTensor, din_host.data.data())) { RunBackwardDataCPU(); } auto error_data = miopen::rms_range(din_host.data, din); if(!(error_data < tolerance)) { std::cout << "Backward Convolution Data Failed: " << error_data << std::endl; cumulative_rc |= EC_VerifyBwd; } else { std::cout << "Backward Convolution Data Verifies on CPU and GPU (" << error_data << ')' << std::endl; } } if(is_wrw) { if(!TryReadVerificationCache( GetVCacheBwdWeightBasename(), weightTensor, dwei_host.data.data())) { RunBackwardWeightsCPU(); } // Winograd algorithm has worse precision than Direct and Gemm. // Winograd-specific precision loss is roughly 2+2 bits. // Affects only WrW FP32 for now. auto tolerance_wrw = tolerance; if(is_wrw_winograd && std::is_same::value) tolerance_wrw *= 16.0; auto error_weights = miopen::rms_range(dwei_host.data, dwei); if(!(error_weights < tolerance_wrw)) { std::cout << "Backward Convolution Weights Failed: " << error_weights << std::endl; cumulative_rc |= EC_VerifyWrw; } else { std::cout << "Backward Convolution Weights Verifies on CPU and GPU (" << error_weights << ')' << std::endl; } } if(inflags.GetValueInt("bias") != 0) { if(!TryReadVerificationCache( GetVCacheBiasBwdDataBasename(), biasTensor, db_host.data.data())) { RunBackwardBiasCPU(); } auto error_bias = miopen::rms_range(db_host.data, db); if(!(error_bias < tolerance)) { std::cout << "Backward Convolution Bias Failed: " << error_bias << std::endl; cumulative_rc |= EC_VerifyBwdBias; } else { std::cout << "Backward Convolution Bias Verifies on CPU and GPU (" << error_bias << ')' << std::endl; } } return cumulative_rc; } #endif // GUARD_MIOPEN_CONV_DRIVER_HPP