关于编辑器:SkeyeRTSPLive高效转码之SkeyeVideoDecoder采用Nvidia独立显卡高效硬件解码解决方案2

19次阅读

共计 11061 个字符,预计需要花费 28 分钟才能阅读完成。

在我之前写的一篇文章《SkeyeRTSPLive 传统视频监控互联网 + 实现利器解决方案》中提到 RTSP 转 RTMP 的转流过程,简化流程就是通过 SkeyeRTSPClient 拉 RTSP 流,获取音视频编码数据,然后再通过 SkeyeRTMPPusher 推出去,流程非常简单;然而在实际开发过程中,我们发现其实这个过程并没有设想中那么简单;首先,RTSP 协议支持多种音视频编码格式,如音频支持 AAC、G711、G726 等,视频支持 H264、H265、MJPEG、MPEG 等各种格式,而 SkeyeRTMPPusher 推流只支持 H264(已扩展支持 H265)格式,这时,音频我们可以通过 SkeyeAACEncoder 将音频转码成 AAC 格式,而视频我们可以通过 SkeyeVideoDecoder 解码成原始数据,然后再通过 SkeyeVideoEncoder 将原始数据转码成 RTMP 推送指定的格式。本文我们将重点讲述 SkeyeVideoDecoder 基于 Nvidia(英伟达)独立显卡的解码流程。

SkeyeVideoDecoder 基于 Nvidia 独立显卡的硬解码库 SkeyeNvDecoder

SkeyeNvDecoder 库是基于 Nvidia 独立显卡驱动的硬件解码程序,该解码程序效率非常高,且具备强大的并行解码能力,其解码效率比 ffmpeg 软件解码效率提高至少 5-6 倍,最新的 RTX 系列显卡其解码效率甚至比软解码高 10-12 倍,轻松解码多路 4K 乃至 8K 高清视频无压力。本文采用的是截止目前(20190714)最新的显卡驱动,CUDA 版本需要 10.0 或者以上版本支持。

1. 接口声明如下:
#ifndef SKEYENVDECODERAPI_H
#define SKEYENVDECODERAPI_H

#include <cstdint>
#include <string>

// Opaque handle to one decoder instance. It is returned by
// SKEYENvDecoder_Create and must be passed back to SKEYENvDecoder_Decode and
// SKEYENvDecoder_Release.
#ifndef SKEYENVDECODER_HANDLE
#define SKEYENVDECODER_HANDLE void*
#endif//SKEYENVDECODER_HANDLE

// Pixel layout of the frames handed back by SKEYENvDecoder_Decode.
// native: frames are returned exactly as the decoder produced them (NV12 by
// default), without any colour conversion.
typedef enum _OutputFormat
{native = 0, bgrp, rgbp, bgra, rgba, bgra64, rgba64}OutputFormat;


// Codec of the compressed input stream. The implementation casts these values
// directly to cudaVideoCodec, so the order must not be changed.
typedef enum _SKEYENvDecoder_CodecType {
    SKEYENvDecoder_Codec_MPEG1 = 0,                                         /**<  MPEG1             */
    SKEYENvDecoder_Codec_MPEG2,                                           /**<  MPEG2             */
    SKEYENvDecoder_Codec_MPEG4,                                           /**<  MPEG4             */
    SKEYENvDecoder_Codec_VC1,                                             /**<  VC1               */
    SKEYENvDecoder_Codec_H264,                                            /**<  H264              */
    SKEYENvDecoder_Codec_JPEG,                                            /**<  JPEG              */
    SKEYENvDecoder_Codec_H264_SVC,                                        /**<  H264-SVC          */
    SKEYENvDecoder_Codec_H264_MVC,                                        /**<  H264-MVC          */
    SKEYENvDecoder_Codec_HEVC,                                            /**<  HEVC              */
    SKEYENvDecoder_Codec_VP8,                                             /**<  VP8               */
    SKEYENvDecoder_Codec_VP9,                                             /**<  VP9               */
    SKEYENvDecoder_Codec_NumCodecs,                                       /**<  Max codecs        */
} SKEYENvDecoder_CodecType;

// Legacy spelling of the codec enum; the SKEYENvDecoder_Create definition
// still names its first parameter type this way.
typedef SKEYENvDecoder_CodecType NvDecoder_CodecType;

// Uncompressed YUV layouts, each encoded as a big-endian FourCC code.
typedef enum _SKEYENvDecoder_YUVType {

    // Uncompressed YUV
    SKEYENvDecoder_YUV420 = (('I' << 24) | ('Y' << 16) | ('U' << 8) | ('V')),   /**< Y,U,V (4:2:0)      */
    SKEYENvDecoder_YV12 = (('Y' << 24) | ('V' << 16) | ('1' << 8) | ('2')),   /**< Y,V,U (4:2:0)      */
    SKEYENvDecoder_NV12 = (('N' << 24) | ('V' << 16) | ('1' << 8) | ('2')),   /**< Y,UV  (4:2:0)      */
    SKEYENvDecoder_YUYV = (('Y' << 24) | ('U' << 16) | ('Y' << 8) | ('V')),   /**< YUYV/YUY2 (4:2:2)  */
    SKEYENvDecoder_UYVY = (('U' << 24) | ('Y' << 16) | ('V' << 8) | ('Y'))    /**< UYVY (4:2:2)       */
} SKEYENvDecoder_YUVType;

#ifdef __cplusplus
extern "C"
{
#endif

// Registers the decoder library; it only needs to be called once per process.
// Returns 1 on success, -1 when no GPU context is available and -2 when an
// exception was caught (its message is copied into erroStr).
int SKEYENvDecoder_Initsize(std::string &erroStr);

// Creates a decoder for `codec` at the given picture size.
// Do not set bLowLatency unless low-latency mode is really required: with the
// flag set it is hard to reach 100% utilisation of the hardware decoder.
// Returns NULL on failure; errCode then holds -1 (codec not supported),
// -2 (resolution not supported) or the result of the capability query, and
// erroStr holds a description when one is available.
SKEYENVDECODER_HANDLE SKEYENvDecoder_Create(SKEYENvDecoder_CodecType codec, int videoW, int videoH, bool bLowLatency, OutputFormat eOutputFormat, int& errCode, std::string &erroStr);

// Feeds nSize bytes of compressed data to the decoder.
// *pppFrame receives an array of frame pointers, *pnFrameReturned the number of
// entries in it and *pnFrameLen the size of one frame in bytes.
// Returns -1 for a NULL handle and -2 when the decoder reports a failure.
int SKEYENvDecoder_Decode(SKEYENVDECODER_HANDLE handle, const uint8_t *pData, int nSize, uint8_t ***pppFrame, int* pnFrameLen, int *pnFrameReturned);

// Stops and destroys a decoder created by SKEYENvDecoder_Create.
// A NULL handle is ignored.
void SKEYENvDecoder_Release(SKEYENVDECODER_HANDLE handle);

// Unregisters the library and destroys the contexts created by
// SKEYENvDecoder_Initsize. Always returns 1.
int SKEYENvDecoder_Uninitsize();


#ifdef __cplusplus
}
#endif

#endif // SKEYENVDECODERAPI_H
2. SkeyeNvDecoder 解码库调用流程
  • 第一步,初始化注册解码器
    注意,注册解码器函数全局只需调用一次;

    /**
     * Registers the decoder library: on the first call it initialises the CUDA
     * driver, creates one context per detected GPU and stores each context
     * together with the device name in m_ctxV. Later calls skip the enumeration.
     *
     * @param erroStr receives the exception message when an exception is caught.
     * @return 1 on success, -1 when no context is stored in m_ctxV, -2 when an
     *         exception was caught.
     */
    int SKEYENvDecoder_Initsize(string &erroStr)
    {
        try
        {
            // The GPUs are enumerated only once.
            if (!isInitsized)
            {
                ck(cuInit(0));
                int gpuCount = 0;
                ck(cuDeviceGetCount(&gpuCount));

                int gpuIndex = 0;
                while (gpuIndex < gpuCount)
                {
                    CUdevice device = 0;
                    ck(cuDeviceGet(&device, gpuIndex));

                    char deviceName[128];
                    ck(cuDeviceGetName(deviceName, sizeof(deviceName), device));
                    LOG(INFO) << "Find Gpu:" << deviceName << std::endl;

                    CUcontext context = NULL;
                    ck(cuCtxCreate(&context, CU_CTX_SCHED_BLOCKING_SYNC, device));

                    m_ctxV.push_back({context, deviceName});
                    ++gpuIndex;
                }
                m_curIndex = 0;
                isInitsized = true;
            }

            // Without at least one context nothing can be decoded.
            return m_ctxV.empty() ? -1 : 1;
        }
        catch (const std::exception& failure)
        {
            erroStr = failure.what();
            std::cout << failure.what();
            return -2;
        }
    }
  • 第二步,创建解码器实例
/**
 * Creates and starts a decoder instance.
 *
 * Each call advances m_curIndex to the next GPU (round-robin over the devices
 * reported by cuDeviceGetCount), creates a CUDA context on that GPU, checks
 * through cuvidGetDecoderCaps that the GPU can decode `codec` at the requested
 * size and, if so, constructs and starts an NvDecoder.
 *
 * @param codec         codec of the input stream; cast directly to cudaVideoCodec.
 * @param videoW        picture width, compared against the GPU's nMaxWidth.
 * @param videoH        picture height, compared against the GPU's nMaxHeight.
 * @param bLowLatency   forwarded to the NvDecoder constructor.
 * @param eOutputFormat output pixel layout; anything other than `native`
 *                      enables the flag passed as the fourth constructor argument.
 * @param errCode       result of cuvidGetDecoderCaps, or -1 (codec not supported),
 *                      -2 (resolution not supported), -3 (no GPU / no context),
 *                      -4 (exception caught).
 * @param erroStr       description of the failure.
 * @return the decoder handle, or NULL on failure.
 *
 * NOTE(review): on success the CUDA context created here is handed to the
 * NvDecoder and is not destroyed by this function; SKEYENvDecoder_Release, as
 * visible in this file, does not destroy it either. Confirm that NvDecoder
 * takes ownership, otherwise one context leaks per decoder.
 */
SKEYENVDECODER_HANDLE SKEYENvDecoder_Create(NvDecoder_CodecType codec, int videoW, int videoH, bool bLowLatency, OutputFormat eOutputFormat, int& errCode, string &erroStr)
{
    CUcontext cuContext = NULL;
    NvDecoder* pDecoder = NULL;

    try {
        ck(cuInit(0));
        int nGpu = 0;
        ck(cuDeviceGetCount(&nGpu));
        if (nGpu <= 0) {
            // Without this guard the modulo below would divide by zero.
            erroStr = "No CUDA capable GPU found";
            errCode = -3;
            return NULL;
        }

        // Pick the next GPU in round-robin order.
        m_curIndex = (m_curIndex + 1) % nGpu;
        if (m_curIndex < 0)
            m_curIndex = 0;

        CUdevice cuDevice = 0;
        ck(cuDeviceGet(&cuDevice, m_curIndex));
        char szDeviceName[128];
        ck(cuDeviceGetName(szDeviceName, sizeof(szDeviceName), cuDevice));
        LOG(INFO) << "Find Gpu:" << szDeviceName << std::endl;
        ck(cuCtxCreate(&cuContext, CU_CTX_SCHED_BLOCKING_SYNC, cuDevice));
        if (!cuContext) {
            erroStr = "Failed to create a CUDA context";
            errCode = -3;
            return NULL;
        }

        // Capabilities are queried for 8-bit 4:2:0 content only.
        CUVIDDECODECAPS videoDecodeCaps = {};
        videoDecodeCaps.eCodecType = (cudaVideoCodec)codec;
        videoDecodeCaps.eChromaFormat = cudaVideoChromaFormat_420;
        videoDecodeCaps.nBitDepthMinus8 = 0;
        errCode = cuvidGetDecoderCaps(&videoDecodeCaps);

        if (CUDA_SUCCESS != errCode) {
            erroStr = "cuvidGetDecoderCaps failed";
        }
        else {
            LOG(INFO) << "cuvid Decoder Caps nMaxWidth" << videoDecodeCaps.nMaxWidth << "nMaxHeigth" << videoDecodeCaps.nMaxHeight << std::endl;
            if (!videoDecodeCaps.bIsSupported) {
                erroStr = "Codec not supported on this GPU Decoder";
                errCode = -1;
            }
            // Check that the GPU can decode video of the requested resolution.
            else if (videoDecodeCaps.nMaxWidth >= videoW && videoDecodeCaps.nMaxHeight >= videoH) {
                pDecoder = new NvDecoder(cuContext, videoW, videoH, eOutputFormat != native,
                    (cudaVideoCodec)codec, NULL, bLowLatency, eOutputFormat);
                pDecoder->Start();
                return pDecoder;
            }
            else {
                erroStr = "Width and height not supported on this GPU Decoder";
                errCode = -2;
            }
        }
    }
    catch (const std::exception &e)
    {
        erroStr = e.what();
        errCode = -4;
    }

    // Failure: release whatever was created so that nothing leaks. The decoder
    // goes first because it was constructed on top of the context.
    delete pDecoder;
    if (cuContext)
        cuCtxDestroy(cuContext);
    return NULL;
}
  • 第三步,调用解码函数解码
/**
 * Decodes one packet and hands back the frames it produced.
 *
 * For the `native` output format the frame pointers returned by the decoder
 * are passed through unchanged. For every other format each 8-bit frame is
 * colour-converted into the decoder's device image and then copied with
 * GetImage into a buffer kept in the decoder's frame-buffer vector, which
 * grows on demand.
 *
 * @param handle          decoder created by SKEYENvDecoder_Create.
 * @param pData           compressed input data.
 * @param nSize           size of pData in bytes.
 * @param pppFrame        optional; receives the array of frame pointers when at
 *                        least one frame was returned.
 * @param pnFrameLen      optional; receives GetOutFrameSize().
 * @param pnFrameReturned optional; receives the number of frames (0 when none).
 * @return 0 on success, -1 for a NULL handle, -2 when the decoder fails.
 *
 * NOTE(review): frames whose bit depth is not 8 are not converted; for a
 * non-native output format their buffers are returned unfilled.
 */
int SKEYENvDecoder_Decode(SKEYENVDECODER_HANDLE handle, const uint8_t *pData, int nSize, uint8_t ***pppFrame, int* pnFrameLen, int *pnFrameReturned)
{
    if (!handle)
        return -1;
    NvDecoder* pDecoder = (NvDecoder*)handle;

    // Give the caller a defined frame count on every path from here on.
    if (pnFrameReturned)
        *pnFrameReturned = 0;

    std::vector<uint8_t *>* vecOutBuffer = pDecoder->GetFrameBufferVector();
    const size_t nFrameSize = pDecoder->GetOutFrameSize();
    if (pnFrameLen)
        *pnFrameLen = (int)nFrameSize;

    int nFrameReturned = 0;
    uint8_t **ppFrame = NULL;
    if (!pDecoder->Decode(pData, nSize, &ppFrame, &nFrameReturned, CUVID_PKT_ENDOFPICTURE))
        return -2;

    const auto eFormat = pDecoder->GetSetOutputFormat();
    const bool bNative = (native == eFormat);

    if (!bNative)
    {
        const bool b8Bit = (pDecoder->GetBitDepth() == 8);
        const bool bYuv444 = (pDecoder->GetOutputFormat() == cudaVideoSurfaceFormat_YUV444);
        const auto nWidth = pDecoder->GetWidth();
        const auto nHeight = pDecoder->GetHeight();

        for (int i = 0; i < nFrameReturned; i++)
        {
            // Make sure a host buffer exists for frame i.
            if ((size_t)i >= vecOutBuffer->size())
                vecOutBuffer->push_back(new uint8_t[nFrameSize]);
            if (!b8Bit)
                continue;

            uint8_t *const pSrc = ppFrame[i];
            uint8_t *const pDst = (*vecOutBuffer)[i];
            const auto dpImage = pDecoder->GetDeviceImagePtr();

            switch (eFormat)
            {
            case bgrp:
                if (bYuv444)
                    YUV444ToColorPlanar<BGRA32>(pSrc, nWidth, (uint8_t*)dpImage, nWidth, nWidth, nHeight);
                else
                    Nv12ToColorPlanar<BGRA32>(pSrc, nWidth, (uint8_t*)dpImage, nWidth, nWidth, nHeight);
                GetImage(dpImage, pDst, nWidth, 3 * nHeight);
                break;
            case rgbp:
                if (bYuv444)
                    YUV444ToColorPlanar<RGBA32>(pSrc, nWidth, (uint8_t*)dpImage, nWidth, nWidth, nHeight);
                else
                    Nv12ToColorPlanar<RGBA32>(pSrc, nWidth, (uint8_t*)dpImage, nWidth, nWidth, nHeight);
                GetImage(dpImage, pDst, nWidth, 3 * nHeight);
                break;
            case bgra:
                if (bYuv444)
                    YUV444ToColor32<BGRA32>(pSrc, nWidth, (uint8_t*)dpImage, 4 * nWidth, nWidth, nHeight);
                else
                    Nv12ToColor32<BGRA32>(pSrc, nWidth, (uint8_t*)dpImage, 4 * nWidth, nWidth, nHeight);
                GetImage(dpImage, pDst, 4 * nWidth, nHeight);
                break;
            case rgba:
                if (bYuv444)
                    YUV444ToColor32<RGBA32>(pSrc, nWidth, (uint8_t*)dpImage, 4 * nWidth, nWidth, nHeight);
                else
                    Nv12ToColor32<RGBA32>(pSrc, nWidth, (uint8_t*)dpImage, 4 * nWidth, nWidth, nHeight);
                GetImage(dpImage, pDst, 4 * nWidth, nHeight);
                break;
            case bgra64:
                if (bYuv444)
                    YUV444ToColor64<BGRA64>(pSrc, nWidth, (uint8_t*)dpImage, 8 * nWidth, nWidth, nHeight);
                else
                    Nv12ToColor64<BGRA64>(pSrc, nWidth, (uint8_t*)dpImage, 8 * nWidth, nWidth, nHeight);
                GetImage(dpImage, pDst, 8 * nWidth, nHeight);
                break;
            case rgba64:
                if (bYuv444)
                    YUV444ToColor64<RGBA64>(pSrc, nWidth, (uint8_t*)dpImage, 8 * nWidth, nWidth, nHeight);
                else
                    Nv12ToColor64<RGBA64>(pSrc, nWidth, (uint8_t*)dpImage, 8 * nWidth, nWidth, nHeight);
                GetImage(dpImage, pDst, 8 * nWidth, nHeight);
                break;
            default:
                break;
            }
        }
    }

    if (pnFrameReturned)
        *pnFrameReturned = nFrameReturned;

    if (nFrameReturned > 0 && pppFrame)
    {
        if (!bNative)
        {
            if (!vecOutBuffer->empty())
                *pppFrame = &(*vecOutBuffer)[0];
        }
        else if (ppFrame)
        {
            *pppFrame = ppFrame;
        }
    }

    // The success path must return a value; falling off the end of a non-void
    // function is undefined behaviour.
    return 0;
}
  • 第四步,停止解码,销毁解码器
/**
 * Stops and destroys a decoder, then steps m_curIndex back by one without
 * letting it drop below zero. A NULL handle is ignored.
 *
 * @param handle decoder handle to release; may be NULL.
 */
void SKEYENvDecoder_Release(SKEYENVDECODER_HANDLE handle)
{
    if (handle)
    {
        NvDecoder* const decoder = (NvDecoder*)handle;
        decoder->Stop();
        delete decoder;

        // Decrement, clamped at zero.
        m_curIndex = (m_curIndex > 0) ? m_curIndex - 1 : 0;
    }
}
  • 第五步,注销解码器,释放资源
/**
 * Unregisters the library: clears the initialised flag, destroys every context
 * stored in m_ctxV, empties the list and resets m_curIndex.
 *
 * @return always 1.
 */
int SKEYENvDecoder_Uninitsize()
{
    isInitsized = false;

    // The result of cuCtxDestroy is not checked.
    for (auto &entry : m_ctxV)
        cuCtxDestroy(entry.first);
    m_ctxV.clear();

    m_curIndex = 0;
    return 1;
}

至此,SKEYENvDecoder 的封装就完成了,我们可以通过其接口调用 Nvidia 的显卡进行硬件解码测试,以下为实际应用效果:硬解 12 路,CPU(I5)占比 11%,730 显卡占 75-80%,如下图所示:

有任何技术问题,欢迎大家和我技术交流:
295222688@qq.com

大家也可以加入 SkeyePlayer 流媒体播放器 QQ 群进行讨论:
102644504

正文完
 0