项目中的模型一直都是直接操作NV12的yuv格式数据,这次的模型只支持RGB格式的输入,正好来自己实现对应的算子。
这里记录一下对应算子的实现过程,主要涉及到NV12到RGB的变换,RGB的crop/resize操作,对于数据的Norm/ToFloat操作,调整Layout等等。
cu文件是要nvcc来进行编译的,但是其头文件可以供外部的cpp文件调用,另外这里的核函数并没有涉及到stream的考虑,因为这个涉及到之后的性能优化环节,要有先来后到。实际stream也就是在核函数调用前的<<<>>>中传入stream而已,然后之后要跟着同步stream的操作。与函数实现逻辑无关。
cuda_transformation.cu
在这里实现真正的核函数,
NV12toRGB
这里的坑点在于 BT.601/709 FULL/非FULL的yuv格式,如果出了差错会导致图像看起来色度不对,遇到过的问题就是红色很不明显,原因就是转换公式写的有问题。
__global__ void NV12toRGB(uint8_t *yuv, uint8_t *rgb, int width, int height) { const int nv_start = width * height; int i, j, nv_index = 0; uint8_t y, u, v; int r, g, b; j = blockIdx.x * blockDim.x + threadIdx.x; i = blockIdx.y * blockDim.y + threadIdx.y; if (i >= height || j >= width) return; nv_index = i / 2 * width + j - j % 2; int rgb_index = i * width + j; y = yuv[rgb_index]; u = yuv[nv_start + nv_index]; v = yuv[nv_start + nv_index + 1]; r = y + (140 * (v - 128)) / 100; // r g = y - (34 * (u - 128)) / 100 - (71 * (v - 128)) / 100; // g b = y + (177 * (u - 128)) / 100; // b if (r > 255) r = 255; if (g > 255) g = 255; if (b > 255) b = 255; if (r < 0) r = 0; if (g < 0) g = 0; if (b < 0) b = 0; rgb[rgb_index * 3 + 0] = b; rgb[rgb_index * 3 + 1] = g; rgb[rgb_index * 3 + 2] = r; } int cudaNV12toRGB(uint8_t *input, uint8_t *output, size_t width, size_t height) { if (!input || !output) return cudaErrorInvalidDevicePointer; const dim3 blockDim(32, 32, 1); const dim3 gridDim((width + blockDim.x - 1) / blockDim.x, (height + blockDim.y - 1) / blockDim.y, 1); NV12toRGB<<<gridDim, blockDim>>>(input, output, width, height); return cudaDeviceSynchronize(); }
RGBBilinearResize
__global__ void RGBBilinearResize(uint8_t *input, uint8_t *output, int inputWidth, int inputHeight, int outputWidth, int outputHeight) { // 计算线程的全局索引 int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (x >= outputWidth || y >= outputHeight) return; // gx,gy是相对于resize后的图中的点,这里计算对应的原图中的浮点位置,确定要从哪里采样 float gx = ((float)x) / outputWidth * (inputWidth - 1); float gy = ((float)y) / outputHeight * (inputHeight - 1); // 对应的整数位置及其偏移量 int gxi = (int)gx; int gyi = (int)gy; float dx = gx - gxi; float dy = gy - gyi; // 读取四个最近的像素值 uint8_t topLeft[3] = {input[(gyi * inputWidth + gxi) * 3 + 0], input[(gyi * inputWidth + gxi) * 3 + 1], input[(gyi * inputWidth + gxi) * 3 + 2]}; uint8_t topRight[3] = {input[(gyi * inputWidth + gxi + 1) * 3 + 0], input[(gyi * inputWidth + gxi + 1) * 3 + 1], input[(gyi * inputWidth + gxi + 1) * 3 + 2]}; uint8_t bottomLeft[3] = {input[((gyi + 1) * inputWidth + gxi) * 3 + 0], input[((gyi + 1) * inputWidth + gxi) * 3 + 1], input[((gyi + 1) * inputWidth + gxi) * 3 + 2]}; uint8_t bottomRight[3] = { input[((gyi + 1) * inputWidth + gxi + 1) * 3 + 0], input[((gyi + 1) * inputWidth + gxi + 1) * 3 + 1], input[((gyi + 1) * inputWidth + gxi + 1) * 3 + 2]}; // 对每个通道进行双线性插值 for (int i = 0; i < 3; i++) { float top = topLeft[i] * (1 - dx) + topRight[i] * dx; float bottom = bottomLeft[i] * (1 - dx) + bottomRight[i] * dx; output[(y * outputWidth + x) * 3 + i] = top * (1 - dy) + bottom * dy; } } int cudaRGBBilinearResize(uint8_t *input, uint8_t *output, size_t width, size_t height, size_t resize_width, size_t resize_height) { if (!input || !output) return cudaErrorInvalidDevicePointer; const dim3 blockDim(32, 32, 1); const dim3 gridDim((width + blockDim.x - 1) / blockDim.x, (height + blockDim.y - 1) / blockDim.y, 1); RGBBilinearResize<<<gridDim, blockDim>>>(input, output, width, height, resize_width, resize_height); return cudaDeviceSynchronize(); }
RGBToFloat
这里的实现要额外记录下,因为涉及到debug中的opencv-dump所以在传入模型前的数据都是BGR格式的,在转浮点这里重新调整成模型需要的RGB格式。
__global__ void RGBToFloat(uint8_t *input, float *output, int width, int height) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (x >= width || y >= height) return; int idx = y * width + x; output[idx * 3 + 0] = input[idx * 3 + 2] / 255.0f; // R output[idx * 3 + 1] = input[idx * 3 + 1] / 255.0f; // G output[idx * 3 + 2] = input[idx * 3 + 0] / 255.0f; // B } int cudaRGBToFloat(uint8_t *input, float *output, int width, int height) { dim3 blockDim(16, 16); dim3 gridDim((width + blockDim.x - 1) / blockDim.x, (height + blockDim.y - 1) / blockDim.y); RGBToFloat<<<gridDim, blockDim>>>(input, output, width, height); return cudaDeviceSynchronize(); }
RGBNormalize
__global__ void RGBNormalize(float *image, int width, int height, float mean[], float std[]) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (x >= width || y >= height) { return; } int idx = y * width + x; if (std[0] < 1e-6 || std[1] < 1e-6 || std[2] < 1e-6) { printf("Error: std values are too small for safe division. "); return; } image[idx * 3 + 0] = (image[idx * 3 + 0] - mean[0]) / std[0]; // B image[idx * 3 + 1] = (image[idx * 3 + 1] - mean[1]) / std[1]; // G image[idx * 3 + 2] = (image[idx * 3 + 2] - mean[2]) / std[2]; // R } int cudaRGBNormalize(float *d_image, int width, int height, float mean[], float std[]) { dim3 blockDim(16, 16); dim3 gridDim((width + blockDim.x - 1) / blockDim.x, (height + blockDim.y - 1) / blockDim.y); RGBNormalize<<<gridDim, blockDim>>>(d_image, width, height, mean, std); cudaError_t cudaStatus = cudaDeviceSynchronize(); if (cudaStatus != cudaSuccess) { fprintf(stderr, "CUDA error: %s ", cudaGetErrorString(cudaStatus)); return -1; } return 0; }
HWC2CHW
template <typename T> __global__ void HWC2CHW(const T* input, T* output, int height, int width) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (x >= width || y >= height) return; int channelSize = width * height; int hwcIndex = y * width + x; int chwIndex; for (int c = 0; c < 3; ++c) { chwIndex = c * channelSize + y * width + x; output[chwIndex] = input[hwcIndex * 3 + c]; } } template <typename T> int cudaHWC2CHW(const T* input, T* output, int height, int width) { dim3 blockDim(16, 16); dim3 gridDim((width + blockDim.x - 1) / blockDim.x, (height + blockDim.y - 1) / blockDim.y); HWC2CHW<<<gridDim, blockDim>>>(input, output, height, width); return cudaDeviceSynchronize(); } template int cudaHWC2CHW<float>(const float* input, float* output, int height, int width);
cuda_transformation.h
void convertNV12toYUV444withActions_cuda(uint8_t *src_img, uint8_t *src_imgcuda, uint8_t *tmpImagecuda, ImageTransParam &trans_param, uint8_t *dst_imgcuda, uint8_t *dst_img, cudaStream_t stream); void convertNV12toYUV444withActions_cuda1(uint8_t *src_imgcuda, ImageTransParam &trans_param, uint8_t *dst_imgcuda); int cudaNV12toRGB(uint8_t* input, uint8_t* output, size_t width, size_t height); int cudaRGBBilinearResize(uint8_t *input, uint8_t *output, size_t width, size_t height, size_t resize_width, size_t resize_height); int cudaRGBToFloat(uint8_t *input, float *output, int width, int height); int cudaRGBNormalize(float *d_image, int width, int height, float mean[], float std[]); template <typename T> int cudaHWC2CHW(const T* input, T* output, int height, int width);
image_transformation.h
这里也是对该变换进行封装,虽然项目是面向对象的抽象出了类似Transformer这个类,但是出于逻辑清晰和方便调试,我这里提供的都是面向过程的代码,另外附上了cpu中算子的实现。实际上一个图像处理算子的实现,一般过程是先生成cpu的,基于NCHW的循环版本,再对其改装成gpu上的算子,毕竟gpu的算子调试相较cpu不是很方便。虽然有cuda-gdb这种东西。可以看到cpu和gpu的版本基本上只在循环方式上有差别,因此核函数也是可以称为 for_each_pixel_func
void TransformNV12toRGB(uint8_t *input, uint8_t *output, int width, int height) { int ret = cudaNV12toRGB(input, output, width, height); if (ret != 0){ HSLOG_E << "cudaNV12toRGB FAILED"; } } void CpuTransformNV12toRGB(uint8_t *yuv, uint8_t *rgb, int width, int height) { const int nv_start = width * height; uint32_t i, j, index = 0, rgb_index = 0; uint8_t y, u, v; int r, g, b, nv_index = 0; for (i = 0; i < height; i++) { for (j = 0; j < width; j++) { // nv_index = (rgb_index / 2 - width / 2 * ((i + 1) / 2)) * 2; nv_index = i / 2 * width + j - j % 2; y = yuv[rgb_index]; u = yuv[nv_start + nv_index]; v = yuv[nv_start + nv_index + 1]; r = y + (140 * (v - 128)) / 100; // r g = y - (34 * (u - 128)) / 100 - (71 * (v - 128)) / 100; // g b = y + (177 * (u - 128)) / 100; // b if (r > 255) r = 255; if (g > 255) g = 255; if (b > 255) b = 255; if (r < 0) r = 0; if (g < 0) g = 0; if (b < 0) b = 0; // index = rgb_index % width + (height - i - 1) * width; index = rgb_index % width + i * width; rgb[index * 3 + 0] = b; rgb[index * 3 + 1] = g; rgb[index * 3 + 2] = r; rgb_index++; } } } void TransformRGBResize(uint8_t *input, uint8_t *output, size_t width, size_t height, size_t resize_width, size_t resize_height) { int ret = cudaRGBBilinearResize(input, output, width, height, resize_width, resize_height); if (ret != 0){ HSLOG_E << "cudaRGBBilinearResize FAILED: " << ret; } } void CPURGBBilinearResize(uint8_t *input, uint8_t *output, int inputWidth, int inputHeight, int outputWidth, int outputHeight) { for (int y = 0; y < outputHeight; y++) { for (int x = 0; x < outputWidth; x++) { // 计算对应的原图中的浮点位置 float gx = ((float)x) / outputWidth * (inputWidth - 1); float gy = ((float)y) / outputHeight * (inputHeight - 1); // 对应的整数位置及其偏移量 int gxi = (int)gx; int gyi = (int)gy; float dx = gx - gxi; float dy = gy - gyi; // 读取四个最近的像素值 uint8_t topLeft[3] = {input[(gyi * inputWidth + gxi) * 3 + 0], input[(gyi * inputWidth + gxi) * 3 + 1], input[(gyi * inputWidth + gxi) * 3 + 2]}; uint8_t topRight[3] = { input[(gyi * inputWidth + gxi + 1) * 3 + 0], input[(gyi * inputWidth + gxi + 1) * 3 + 1], input[(gyi * inputWidth + gxi + 1) * 3 + 2]}; uint8_t bottomLeft[3] = { input[((gyi + 1) * inputWidth + gxi) * 3 + 0], input[((gyi + 1) * inputWidth + gxi) * 3 + 1], input[((gyi + 1) * inputWidth + gxi) * 3 + 2]}; uint8_t bottomRight[3] = { input[((gyi + 1) * inputWidth + gxi + 1) * 3 + 0], input[((gyi + 1) * inputWidth + gxi + 1) * 3 + 1], input[((gyi + 1) * inputWidth + gxi + 1) * 3 + 2]}; // 对每个通道进行双线性插值 for (int i = 0; i < 3; i++) { float top = topLeft[i] * (1 - dx) + topRight[i] * dx; float bottom = bottomLeft[i] * (1 - dx) + bottomRight[i] * dx; output[(y * outputWidth + x) * 3 + i] = static_cast<uint8_t>(top * (1 - dy) + bottom * dy); } } } } void TransfromConvertRGBToFloat(uint8_t *input, float *output, int width, int height){ int ret = cudaRGBToFloat(input, output, width, height); if (ret != 0){ HSLOG_E << "cudaRGBToFloat FAILED: " << ret; } } void TransfromRGBNormalize(float *input, int width, int height, float* mean, float* std){ int ret = cudaRGBNormalize(input, width, height, mean, std); if (ret != 0){ HSLOG_E << "cudaRGBNormalize FAILED: " << ret; } } template <typename T> int TransfromHWC2CHW(const T* input, T* output, int height, int width){ int ret = cudaHWC2CHW<T>(input, output, height, width); if (ret != 0){ HSLOG_E << "cudaHWC2CHW FAILED: " << ret; } }
pre_process_module.cpp
这里额外加入一些dump的操作,以及debuggpu前N个字节的操作,方便调试
void PreProcessModule::Transform21dImage(hobot::dataflow::spMsgResourceProc proc, const hobot::dataflow::MessageLists &msgs){ UNUSED(proc); auto &input_img_batch_msgs = msgs[0]; std::shared_ptr<ImageBatchMsg<GPUImageMsg>> batch_image_msg = std::static_pointer_cast<ImageBatchMsg<GPUImageMsg>>( input_img_batch_msgs->at(0)); for (int i = 0; i < batch_image_msg->batch_size_; ++i) { auto image_msg = batch_image_msg->batch_img_msg_[i]; int height = image_msg->img_trans_param_.src_height; int width = image_msg->img_trans_param_.src_width; image_transformation_[i].TransformNV12toRGB(image_msg->cuda_nv12_, image_transformation_[i].cuda_image_out_, width, height); static int cnt = 0; if (true) { std::string input_file_path= "/home/yuxuan03.zhang/utils_code/lcc/query/" + std::to_string(cnt) + ".jpg"; cv::Mat bgrImage = cv::imread(input_file_path); if (bgrImage.empty()) { std::cerr << "Error: Image cannot be loaded!" << std::endl; } size_t size = bgrImage.total() * bgrImage.elemSize(); // 计算需要复制的内存大小 HSLOG_E << "height: " << height << "width: " << width << "size: " << size << "file" << input_file_path; // 将数据从 cv::Mat 复制到 GPU 内存 cudaMemcpy(image_transformation_[i].cuda_image_out_, bgrImage.ptr(), size, cudaMemcpyHostToDevice); image_msg->SetDoneTimestamp(cnt); cnt++; } // int size = width * height * 3 / 2; // uint8_t* cpu_nv12 = new uint8_t[size]; // cudaMemcpy(cpu_nv12, image_msg->cuda_nv12_, size, cudaMemcpyDeviceToHost); // cv::Mat nv12Img(height + height / 2, width, CV_8UC1, cpu_nv12); // cv::Mat bgrImg; // cv::cvtColor(nv12Img, bgrImg, cv::COLOR_YUV2BGR_NV12); // std::string file = std::to_string(image_msg->GetGenTimestamp()) + "_nv12.png"; // cv::imwrite(file, bgrImg); // delete[] cpu_nv12; // int dataSize = width * height * 3; // uint8_t* cpu_rgb = new uint8_t[dataSize]; // cudaMemcpy(cpu_rgb, image_transformation_[i].cuda_image_out_, dataSize, cudaMemcpyDeviceToHost); // cv::Mat rgb_img(height, width, CV_8UC3, cpu_rgb); // std::string file1 = std::to_string(image_msg->GetGenTimestamp()) + "_rgb.png"; // cv::imwrite(file1, rgb_img); // delete[] cpu_rgb; image_transformation_[i].TransformRGBResize(image_transformation_[i].cuda_image_out_, image_transformation_[i].cuda_image_trans_buffer_, width, height, 910, 512); HSLOG_E <<"Resize: " << PrintFirstNUint8Bytes((uint8_t*)image_transformation_[i].cuda_image_trans_buffer_); // uint8_t* cpu_rgb_resize = new uint8_t[910*512*3]; // cudaMemcpy(cpu_rgb_resize, image_transformation_[i].cuda_image_trans_buffer_, 910*512*3, cudaMemcpyDeviceToHost); // cv::Mat rgb_resize_img(512, 910, CV_8UC3, cpu_rgb_resize); // std::string file2 = std::to_string(image_msg->GetGenTimestamp()) + "_rgb_resize.png"; // cv::imwrite(file2, rgb_resize_img); // delete[] cpu_rgb_resize; image_transformation_[i].TransfromConvertRGBToFloat(image_transformation_[i].cuda_image_trans_buffer_, (float*)image_transformation_[i].cuda_image_out_, 910, 512); HSLOG_E <<"BRGToRGBFloat: " << PrintFirstNFloatBytes((float*)image_transformation_[i].cuda_image_out_); std::vector<float> mean = {0.485, 0.456, 0.406}; std::vector<float> std = {0.229, 0.224, 0.225}; float* mean_gpu = (float*)image_transformation_[i].cuda_image_trans_buffer_; float* std_gpu = mean_gpu+3; cudaMemcpy(mean_gpu, mean.data(), 3 * sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(std_gpu, std.data(), 3 * sizeof(float), cudaMemcpyHostToDevice); image_transformation_[i].TransfromRGBNormalize((float*)image_transformation_[i].cuda_image_out_, 910, 512, mean_gpu, std_gpu); HSLOG_E <<"Norm: " << PrintFirstNFloatBytes((float*)image_transformation_[i].cuda_image_out_); image_transformation_[i].TransfromHWC2CHW((float*)image_transformation_[i].cuda_image_out_, (float*)image_msg->cuda_yuv444_, 512, 910); HSLOG_E <<"HWC2CHW: " << PrintFirstNFloatBytes((float*)image_msg->cuda_yuv444_); if (true) { float *cuda_image_out_ = (float*)image_msg->cuda_yuv444_; size_t dataSize = 3 * 512 * 910 * sizeof(float); float *hostData = new float[dataSize / sizeof(float)]; cudaMemcpy(hostData, cuda_image_out_, dataSize, cudaMemcpyDeviceToHost); std::string input_file_path= "./dump_bin/" + std::to_string(cnt) + ".bin"; std::ofstream outFile(input_file_path, std::ios::out | std::ios::binary); outFile.write(reinterpret_cast<char *>(hostData), dataSize); outFile.close(); delete[] hostData; } } SEND_DATA(SLOT_OUT_BATCH_TRANS_IMAGE, batch_image_msg); }