关于编辑器:SkeyeRTSPLive高效转码之SkeyeVideoDecoder采用Nvidia独立显卡高效硬件解码解决方案2

在我之前写的一篇文章《SkeyeRTSPLive 传统视频监控互联网 + 实现利器解决方案》中提到 RTSP 转 RTMP 的转流过程，简化流程就是通过 SkeyeRTSPClient 拉 RTSP 流，获取音视频编码数据，然后再通过 SkeyeRTMPPusher 推出去，流程非常简单；然而在实际开发过程中，我们发现其实这个过程并没有想象中那么简单；首先，RTSP 协议支持多种音视频编码格式，如音频支持 AAC、G711、G726 等，视频支持 H264、H265、MJPEG、MPEG 等各种格式，而 SkeyeRTMPPusher 推流只支持 H264（已扩展支持 H265）格式，这时，音频我们可以通过 SkeyeAACEncoder 将音频转码成 AAC 格式，而视频我们可以通过 SkeyeVideoDecoder 解码成原始数据，然后再通过 SkeyeVideoEncoder 将原始数据转码成 RTMP 推送指定的格式。本文我们将重点讲述 SkeyeVideoDecoder 基于 Nvidia（英伟达）独立显卡的解码流程。

SkeyeNvDecoder 库是基于 Nvidia 独立显卡驱动的硬件解码程序，该解码程序效率非常高，且具备强大的并行解码能力，其解码效率比 ffmpeg 软件解码效率提高至少 5 - 6 倍，最新的 RTX 系列显卡其解码效率甚至比软解码高 10-12 倍，轻松解码多路 4K 乃至 8K 高清视频无压力。本文采用的是截至目前（20190714）最新的显卡驱动，CUDA 版本需要 10.0 或者以上版本支持。

1. 接口声明如下：

#ifndef SKEYENVDECODERAPI_H
#define SKEYENVDECODERAPI_H

#include <cstdint>
#include <string>

//++ type definitions start
// Opaque decoder handle: returned by SKEYENvDecoder_Create and passed back to
// the decode/release calls. It is a macro wrapped in #ifndef, so a definition
// made before this header is included takes precedence.
#ifndef SKEYENVDECODER_HANDLE
#define SKEYENVDECODER_HANDLE void*
#endif//SKEYENVDECODER_HANDLE

// Output pixel format requested from the decoder.
// `native` (value 0) is the default: the decoder output is left as it is,
// which the original note describes as NV12.
typedef enum _OutputFormat
{
    native = 0,
    bgrp,
    rgbp,
    bgra,
    rgba,
    bgra64,
    rgba64
} OutputFormat;


// Video codecs selectable when creating a decoder.
// NOTE(review): the create call casts the codec value straight to
// cudaVideoCodec, so these numbers presumably have to stay aligned with that
// enum - confirm against the NVIDIA header before reordering.
typedef enum _SKEYENvDecoder_CodecType {
    SKEYENvDecoder_Codec_MPEG1     = 0,   /**<  MPEG1             */
    SKEYENvDecoder_Codec_MPEG2     = 1,   /**<  MPEG2             */
    SKEYENvDecoder_Codec_MPEG4     = 2,   /**<  MPEG4             */
    SKEYENvDecoder_Codec_VC1       = 3,   /**<  VC1               */
    SKEYENvDecoder_Codec_H264      = 4,   /**<  H264              */
    SKEYENvDecoder_Codec_JPEG      = 5,   /**<  JPEG              */
    SKEYENvDecoder_Codec_H264_SVC  = 6,   /**<  H264-SVC          */
    SKEYENvDecoder_Codec_H264_MVC  = 7,   /**<  H264-MVC          */
    SKEYENvDecoder_Codec_HEVC      = 8,   /**<  HEVC              */
    SKEYENvDecoder_Codec_VP8       = 9,   /**<  VP8               */
    SKEYENvDecoder_Codec_VP9       = 10,  /**<  VP9               */
    SKEYENvDecoder_Codec_NumCodecs = 11   /**<  Max codecs        */
} SKEYENvDecoder_CodecType;

// Packs four ASCII characters into a FourCC code, first character in the
// most significant byte. Only needed while the enum below is being defined.
#define SKEYENVDECODER_FOURCC(a, b, c, d) (((a) << 24) | ((b) << 16) | ((c) << 8) | (d))

// Uncompressed YUV layouts, each identified by its FourCC code.
typedef enum _SKEYENvDecoder_YUVType {
    SKEYENvDecoder_YUV420 = SKEYENVDECODER_FOURCC('I', 'Y', 'U', 'V'),   /**< Y,U,V (4:2:0)      */
    SKEYENvDecoder_YV12   = SKEYENVDECODER_FOURCC('Y', 'V', '1', '2'),   /**< Y,V,U (4:2:0)      */
    SKEYENvDecoder_NV12   = SKEYENVDECODER_FOURCC('N', 'V', '1', '2'),   /**< Y,UV  (4:2:0)      */
    SKEYENvDecoder_YUYV   = SKEYENVDECODER_FOURCC('Y', 'U', 'Y', 'V'),   /**< YUYV/YUY2 (4:2:2)  */
    SKEYENvDecoder_UYVY   = SKEYENVDECODER_FOURCC('U', 'Y', 'V', 'Y')    /**< UYVY (4:2:2)       */
} SKEYENvDecoder_YUVType;

#undef SKEYENVDECODER_FOURCC

#ifdef __cplusplus
extern "C"
{
#endif

int SKEYENvDecoder_Initsize(std::string &erroStr);

// 除非应用低提早模式，否则请不要应用此标记 bLowLatency，然而应用此标记很难取得硬件解码器 100% 的利用率。SKEYENVDECODER_HANDLE NvDecoder_Create(NvDecoder_CodecType codec, int videoW, int videoH, bool bLowLatency, OutputFormat eOutputFormat, int& errCode, std::string &erroStr);
int NvDecoder_Decode(NVDECODER_HANDLE handle, const uint8_t *pData, int nSize, uint8_t ***pppFrame, int* pnFrameLen, int *pnFrameReturned);
void SKEYENvDecoder_Release(NVDECODER_HANDLE handle) ;
int NvDecoder_Uninitsize();


#ifdef __cplusplus
}
#endif

#endif // SKEYENVDECODERAPI_H

2. SkeyeNvDecoder 解码库调用流程

第一步，初始化注册解码器
注意，注册解码器函数全局只需调用一次；

/**
 * Global set-up: on the first call, creates one CUDA context per GPU and
 * stores it (with the device name) in m_ctxV. Later calls skip the
 * enumeration and only re-check that at least one context exists.
 *
 * @param erroStr  Receives the exception message when a std::exception is caught.
 * @return 1 on success, -1 when m_ctxV is empty, -2 when an exception was caught.
 */
int SKEYENvDecoder_Initsize(string &erroStr)
{
    try
    {
        // The GPUs are enumerated only once.
        if (!isInitsized)
        {
            ck(cuInit(0));

            int deviceCount = 0;
            ck(cuDeviceGetCount(&deviceCount));

            for (int ordinal = 0; ordinal < deviceCount; ++ordinal)
            {
                CUdevice device = 0;
                ck(cuDeviceGet(&device, ordinal));

                char deviceName[128];
                ck(cuDeviceGetName(deviceName, sizeof(deviceName), device));
                LOG(INFO) << "Find Gpu:" << deviceName << std::endl;

                CUcontext context = NULL;
                ck(cuCtxCreate(&context, CU_CTX_SCHED_BLOCKING_SYNC, device));

                m_ctxV.push_back({context, deviceName});
            }

            isInitsized = true;
            m_curIndex = 0;
        }

        // Without any context there is nothing to decode on.
        if (m_ctxV.empty())
        {
            return -1;
        }
    }
    catch (const std::exception& ex)
    {
        erroStr = ex.what();
        std::cout << ex.what();
        return -2;
    }

    return 1;
}

第二步，创建解码器实例

/**
 * Creates a hardware decoder instance on one of the installed GPUs.
 *
 * GPUs are picked in rotation through m_curIndex. A new CUDA context is
 * created on the chosen GPU, the decode capabilities are queried for 8-bit
 * 4:2:0 content, and a decoder is constructed and started when the codec and
 * the requested resolution are supported.
 *
 * @param codec          Codec of the incoming stream (cast to cudaVideoCodec).
 * @param videoW         Video width in pixels.
 * @param videoH         Video height in pixels.
 * @param bLowLatency    Forwarded to the NvDecoder constructor.
 * @param eOutputFormat  Requested output pixel format; anything other than
 *                       `native` turns on the constructor's fourth argument.
 * @param errCode        0 (CUDA_SUCCESS) on success; the cuvidGetDecoderCaps
 *                       result when that query fails; -1 codec not supported;
 *                       -2 resolution not supported; -3 no GPU found;
 *                       -4 an exception was caught.
 * @param erroStr        Receives a human-readable failure reason.
 * @return The decoder handle, or NULL on failure.
 */
SKEYENVDECODER_HANDLE SKEYENvDecoder_Create(SKEYENvDecoder_CodecType codec, int videoW, int videoH, bool bLowLatency, OutputFormat eOutputFormat, int& errCode, string &erroStr)
{
    errCode = 0;
    CUcontext cuContext = NULL;
    // Once a decoder object exists it may hold the context, so the context is
    // only destroyed on failures that happen before this flag is set.
    bool bDecoderConstructed = false;

    try
    {
        ck(cuInit(0));
        int nGpu = 0;
        ck(cuDeviceGetCount(&nGpu));
        if (nGpu <= 0)
        {
            // Guards the modulo below against division by zero.
            erroStr = "No CUDA capable GPU found";
            errCode = -3;
            return NULL;
        }

        // Rotate to the next GPU.
        m_curIndex++;
        m_curIndex = (m_curIndex) % nGpu;
        if (m_curIndex < 0)
            m_curIndex = 0;

        CUdevice cuDevice = 0;
        ck(cuDeviceGet(&cuDevice, m_curIndex));
        char szDeviceName[128];
        ck(cuDeviceGetName(szDeviceName, sizeof(szDeviceName), cuDevice));
        LOG(INFO) << "Find Gpu:" << szDeviceName << std::endl;
        ck(cuCtxCreate(&cuContext, CU_CTX_SCHED_BLOCKING_SYNC, cuDevice));

        // Capabilities are probed for 8-bit 4:2:0 content only.
        CUVIDDECODECAPS videoDecodeCaps = {};
        videoDecodeCaps.eCodecType = static_cast<cudaVideoCodec>(codec);
        videoDecodeCaps.eChromaFormat = cudaVideoChromaFormat_420;
        videoDecodeCaps.nBitDepthMinus8 = 0;
        errCode = cuvidGetDecoderCaps(&videoDecodeCaps);

        if (CUDA_SUCCESS != errCode)
        {
            erroStr = "cuvidGetDecoderCaps failed";
        }
        else
        {
            LOG(INFO) << "cuvid Decoder Caps nMaxWidth" << videoDecodeCaps.nMaxWidth << "nMaxHeigth" << videoDecodeCaps.nMaxHeight << std::endl;
            if (!videoDecodeCaps.bIsSupported)
            {
                erroStr = "Codec not supported on this GPU Decoder";
                errCode = -1;
            }
            else if (videoDecodeCaps.nMaxWidth >= videoW && videoDecodeCaps.nMaxHeight >= videoH)
            {
                // The requested resolution fits within the decoder limits.
                NvDecoder* pDecoder = new NvDecoder(cuContext, videoW, videoH, eOutputFormat != native,
                    static_cast<cudaVideoCodec>(codec), NULL, bLowLatency, eOutputFormat);
                bDecoderConstructed = true;
                // NOTE(review): if Start() throws, pDecoder is leaked, as in
                // the original code; the safe tear-down order is not visible here.
                pDecoder->Start();
                // NOTE(review): the context stays alive with the decoder and is
                // not destroyed by SKEYENvDecoder_Release - confirm who owns it.
                return pDecoder;
            }
            else
            {
                erroStr = "Width and height not supported on this GPU Decoder";
                errCode = -2;
            }
        }
    }
    catch (const std::exception &e)
    {
        erroStr = e.what();
        errCode = -4;
    }

    // Failure before a decoder existed: release the context created above.
    if (cuContext != NULL && !bDecoderConstructed)
    {
        cuCtxDestroy(cuContext);
    }
    return NULL;
}

第三步，调用解码函数解码

/**
 * Decodes one chunk of encoded data and hands back the frames it produced.
 *
 * For `native` output the frame pointers returned by the decoder are passed
 * through unchanged. For every other output format each 8-bit frame is
 * converted on the device and copied into a buffer from the decoder's frame
 * buffer vector; that vector grows on demand.
 *
 * @param handle           Handle returned by SKEYENvDecoder_Create.
 * @param pData            Encoded input data.
 * @param nSize            Size of pData in bytes.
 * @param pppFrame         Optional; receives the array of frame pointers when
 *                         at least one frame was produced.
 * @param pnFrameLen       Optional; receives the decoder's output frame size.
 * @param pnFrameReturned  Optional; receives the number of frames produced
 *                         (0 when none).
 * @return Number of frames produced (>= 0), -1 for a null handle, -2 when the
 *         decoder reports failure.
 */
int SKEYENvDecoder_Decode(SKEYENVDECODER_HANDLE handle, const uint8_t *pData, int nSize, uint8_t ***pppFrame, int* pnFrameLen, int *pnFrameReturned)
{
    if (!handle)
        return -1;
    NvDecoder* pDecoder = static_cast<NvDecoder*>(handle);

    // Start from "no frames" so the caller never reads a stale count.
    if (pnFrameReturned)
        *pnFrameReturned = 0;

    std::vector<uint8_t *>* vecOutBuffer = pDecoder->GetFrameBufferVector();
    const size_t nFrameSize = pDecoder->GetOutFrameSize();
    if (pnFrameLen)
        *pnFrameLen = static_cast<int>(nFrameSize);

    int nFrameReturned = 0;
    uint8_t **ppFrame = NULL;
    if (!pDecoder->Decode(pData, nSize, &ppFrame, &nFrameReturned, CUVID_PKT_ENDOFPICTURE))
        return -2;

    const bool bConvert = (native != pDecoder->GetSetOutputFormat());

    // Native output needs no per-frame work.
    for (int i = 0; bConvert && i < nFrameReturned; i++)
    {
        // Make sure a host buffer exists for frame i.
        if (static_cast<size_t>(i) >= vecOutBuffer->size())
        {
            vecOutBuffer->push_back(new uint8_t[nFrameSize]);
        }

        // NOTE(review): frames with a bit depth other than 8 are not converted,
        // so their host buffer keeps its previous contents - confirm whether
        // high-bit-depth streams are meant to be supported here.
        if (pDecoder->GetBitDepth() != 8)
            continue;

        const auto nWidth = pDecoder->GetWidth();
        const auto nHeight = pDecoder->GetHeight();
        const auto dpImage = pDecoder->GetDeviceImagePtr();
        uint8_t* const dpDst = reinterpret_cast<uint8_t*>(dpImage);
        uint8_t* const dpSrc = ppFrame[i];
        uint8_t* const pHost = (*vecOutBuffer)[i];
        const bool bYuv444 = (pDecoder->GetOutputFormat() == cudaVideoSurfaceFormat_YUV444);

        switch (pDecoder->GetSetOutputFormat())
        {
        case native:
            break;
        case bgrp:
            if (bYuv444)
                YUV444ToColorPlanar<BGRA32>(dpSrc, nWidth, dpDst, nWidth, nWidth, nHeight);
            else
                Nv12ToColorPlanar<BGRA32>(dpSrc, nWidth, dpDst, nWidth, nWidth, nHeight);
            GetImage(dpImage, pHost, nWidth, 3 * nHeight);
            break;
        case rgbp:
            if (bYuv444)
                YUV444ToColorPlanar<RGBA32>(dpSrc, nWidth, dpDst, nWidth, nWidth, nHeight);
            else
                Nv12ToColorPlanar<RGBA32>(dpSrc, nWidth, dpDst, nWidth, nWidth, nHeight);
            GetImage(dpImage, pHost, nWidth, 3 * nHeight);
            break;
        case bgra:
            if (bYuv444)
                YUV444ToColor32<BGRA32>(dpSrc, nWidth, dpDst, 4 * nWidth, nWidth, nHeight);
            else
                Nv12ToColor32<BGRA32>(dpSrc, nWidth, dpDst, 4 * nWidth, nWidth, nHeight);
            GetImage(dpImage, pHost, 4 * nWidth, nHeight);
            break;
        case rgba:
            if (bYuv444)
                YUV444ToColor32<RGBA32>(dpSrc, nWidth, dpDst, 4 * nWidth, nWidth, nHeight);
            else
                Nv12ToColor32<RGBA32>(dpSrc, nWidth, dpDst, 4 * nWidth, nWidth, nHeight);
            GetImage(dpImage, pHost, 4 * nWidth, nHeight);
            break;
        case bgra64:
            if (bYuv444)
                YUV444ToColor64<BGRA64>(dpSrc, nWidth, dpDst, 8 * nWidth, nWidth, nHeight);
            else
                Nv12ToColor64<BGRA64>(dpSrc, nWidth, dpDst, 8 * nWidth, nWidth, nHeight);
            GetImage(dpImage, pHost, 8 * nWidth, nHeight);
            break;
        case rgba64:
            if (bYuv444)
                YUV444ToColor64<RGBA64>(dpSrc, nWidth, dpDst, 8 * nWidth, nWidth, nHeight);
            else
                Nv12ToColor64<RGBA64>(dpSrc, nWidth, dpDst, 8 * nWidth, nWidth, nHeight);
            GetImage(dpImage, pHost, 8 * nWidth, nHeight);
            break;
        }
    }

    if (nFrameReturned > 0)
    {
        if (pnFrameReturned)
            *pnFrameReturned = nFrameReturned;

        if (pppFrame)
        {
            if (bConvert)
            {
                // Converted frames live in the decoder's frame buffer vector.
                if (!vecOutBuffer->empty())
                    *pppFrame = &(*vecOutBuffer)[0];
            }
            else if (ppFrame)
            {
                // Native output: pass the decoder's own frame array through.
                *pppFrame = ppFrame;
            }
        }
    }

    // The original fell off the end of this non-void function (undefined
    // behaviour); report the frame count, which never collides with the
    // negative error codes above.
    return nFrameReturned;
}

第四步，停止解码，销毁解码器

/**
 * Stops and deletes a decoder created by SKEYENvDecoder_Create, then steps
 * the GPU rotation index m_curIndex back by one, never below zero.
 * A null handle is ignored.
 *
 * @param handle  Decoder handle, or NULL.
 */
void SKEYENvDecoder_Release(SKEYENVDECODER_HANDLE handle)
{
    NvDecoder* const decoder = (NvDecoder*)handle;
    if (decoder == NULL)
    {
        return;
    }

    decoder->Stop();
    delete decoder;

    // Step the rotation index back, clamped at zero.
    m_curIndex--;
    if (m_curIndex < 0)
    {
        m_curIndex = 0;
    }
}

第五步，注销解码器，释放资源

/**
 * Global tear-down: clears the initialised flag, destroys every CUDA context
 * stored in m_ctxV, empties the list and resets the GPU rotation index.
 *
 * @return Always 1.
 */
int SKEYENvDecoder_Uninitsize()
{
    isInitsized = false;

    // Destroy the contexts in the order they were stored.
    int ctxIdx = 0;
    while (ctxIdx < m_ctxV.size())
    {
        cuCtxDestroy(m_ctxV[ctxIdx].first);
        ctxIdx++;
    }
    m_ctxV.clear();

    m_curIndex = 0;
    return 1;
}

至此，SKEYENvDecoder 的封装就完成了，我们可以通过其接口调用 Nvidia 的显卡进行硬件解码测试。以下为实际应用效果：硬解 12 路效果图，CPU（I5）占比 11%，730 显卡占 75-80%，如下图所示：

有任何技术问题，欢迎大家和我技术交流：
295222688@qq.com

大家也可以加入 SkeyePlayer 流媒体播放器 QQ 群进行讨论：
102644504

SkeyeVideoDecoder 基于 Nvidia 独立显卡的硬解码库 SkeyeNvDecoder

1. 接口声明如下：

2. SkeyeNvDecoder 解码库调用流程