NCCL Source Code Analysis (3): Intra-Node Topology Detection


Author | KIDGINBROOK
Updated by | 潘丽晨

The previous section described how all nodes establish their bootstrap network connections; this section covers topology detection.

GPU machine architectures vary widely: a single machine may have multiple NICs and multiple GPUs, and the links between them differ from machine to machine. NCCL therefore has to analyze the intra-node device interconnect topology so that performance is as good as possible on every kind of topology.

Continuing from the previous part, let's look at initTransportsRank.

static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
  // We use 3 AllGathers
  // 1. {peerInfo, comm}
  // 2. ConnectTransport[nranks], ConnectValue[nranks]
  // 3. {nThreads, nrings, compCap, prev[MAXCHANNELS], next[MAXCHANNELS] }
 
  int rank = comm->rank;
  int nranks = comm->nRanks;
  uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
  TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
  NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap));
 
  // AllGather1 - begin
  struct {
    struct ncclPeerInfo peerInfo;
    struct ncclComm* comm;
  } *allGather1Data;
 
  NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
  allGather1Data[rank].comm = comm;
  struct ncclPeerInfo* myInfo = &allGather1Data[rank].peerInfo;
  NCCLCHECK(fillInfo(comm, myInfo, commHash));
  ...
}

This allocates nranks allGather1Data entries and then fills in the current rank's peerInfo via fillInfo. ncclPeerInfo holds a rank's basic information, such as its rank number and which process on which machine it lives in.

struct ncclPeerInfo {
  int rank;
  int cudaDev;
  int gdrSupport;
  uint64_t hostHash;
  uint64_t pidHash;
  dev_t shmDev;
  int64_t busId;
};
 
static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash) {
  info->rank = comm->rank;
  CUDACHECK(cudaGetDevice(&info->cudaDev));
  info->hostHash=getHostHash()+commHash;
  info->pidHash=getPidHash()+commHash;
 
  // Get the device MAJOR:MINOR of /dev/shm so we can use that
  // information to decide whether we can use SHM for inter-process
  // communication in a container environment
  struct stat statbuf;
  SYSCHECK(stat("/dev/shm", &statbuf), "stat");
  info->shmDev = statbuf.st_dev;
 
  info->busId = comm->busId;
 
  NCCLCHECK(ncclGpuGdrSupport(&info->gdrSupport));
  return ncclSuccess;
}

fillInfo records the current GPU's rank, PCIe busId, and the device number of /dev/shm into ncclPeerInfo, then calls ncclGpuGdrSupport to check whether GDR is supported. RDMA requires registering a memory region before communication so that the NIC knows the virtual-to-physical address mapping; if every transfer had to copy data from device memory to host memory first, efficiency would be low.

IB, however, provides a peer memory interface that lets the IB NIC access other PCIe address spaces. NVIDIA implemented its own driver (nv_peermem) on top of peer memory so that RDMA can register GPU memory directly; communication then avoids host/device copies and the NIC can DMA GPU memory directly, which is GPUDirect RDMA (GDR).
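As a concrete illustration (this is not NCCL code), the sketch below registers a cudaMalloc'd buffer with an IB NIC through the verbs API; with nv_peermem loaded the registration succeeds and the NIC can DMA the GPU buffer directly. Device selection and most error handling are omitted, and the program needs to be linked against libibverbs and the CUDA runtime.

#include <cuda_runtime.h>
#include <infiniband/verbs.h>
#include <stdio.h>
 
// Minimal GDR sketch: try to register GPU memory with the first IB HCA.
// This only succeeds if the peer-memory module (nv_peermem) is loaded.
int main() {
  int num = 0;
  struct ibv_device** devs = ibv_get_device_list(&num);
  if (devs == NULL || num == 0) { fprintf(stderr, "no IB device\n"); return 1; }
  struct ibv_context* ctx = ibv_open_device(devs[0]);   // first HCA
  struct ibv_pd* pd = ibv_alloc_pd(ctx);                // protection domain
 
  void* gpuPtr = NULL;
  cudaMalloc(&gpuPtr, 1 << 20);                         // 1 MiB of device memory
 
  // With GDR the device pointer can be passed to ibv_reg_mr directly,
  // so later RDMA operations avoid any host bounce buffer.
  struct ibv_mr* mr = ibv_reg_mr(pd, gpuPtr, 1 << 20,
      IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ);
  printf("GPU memory registration %s\n", mr ? "succeeded (GDR usable)" : "failed");
 
  if (mr) ibv_dereg_mr(mr);
  cudaFree(gpuPtr);
  ibv_dealloc_pd(pd);
  ibv_close_device(ctx);
  ibv_free_device_list(devs);
  return 0;
}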

static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
  int netDevs;
  NCCLCHECK(ncclNetDevices(&netDevs));
  *gdrSupport = 0;
  for (int dev=0; dev<netDevs; dev++) {
    // Find a net device which is GDR-capable
    ncclNetProperties_t props;
    NCCLCHECK(ncclNet->getProperties(dev, &props));
    if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
 
    // Allocate memory on the GPU and try to register it on the NIC.
    void *lComm = NULL, *sComm = NULL, *rComm = NULL;
    ncclNetHandle_t handle;
    void* gpuPtr = NULL;
    void* mHandle = NULL;
    NCCLCHECK(ncclNetListen(dev, &handle, &lComm));
    NCCLCHECK(ncclNetConnect(dev, &handle, &sComm));
    NCCLCHECK(ncclNetAccept(lComm, &rComm));
    CUDACHECK(cudaMalloc(&gpuPtr, GPU_BUF_SIZE));
    ncclDebugNoWarn = NCCL_NET;
    if (ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
      NCCLCHECK(ncclNetDeregMr(sComm, mHandle));
      NCCLCHECK(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
      NCCLCHECK(ncclNetDeregMr(rComm, mHandle));
      *gdrSupport = 1;
    }
    ncclDebugNoWarn = 0;
    CUDACHECK(cudaFree(gpuPtr));
    NCCLCHECK(ncclNetCloseRecv(rComm));
    NCCLCHECK(ncclNetCloseSend(sComm));
    NCCLCHECK(ncclNetCloseListen(lComm));
    break;
  }
  return ncclSuccess;
}

ncclGpuGdrSupport iterates over every NIC and queries its properties; as shown in part 1, the ncclNet here is ncclNetIb.

ncclResult_t ncclIbGdrSupport(int ibDev) {
  static int moduleLoaded = -1; 
  if (moduleLoaded == -1) {
    moduleLoaded = (access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) ? 0 : 1;
  }
  if (moduleLoaded == 0) return ncclSystemError;
  return ncclSuccess;
}
 
ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) {
  props->name = ncclIbDevs[dev].devName;
  props->pciPath = ncclIbDevs[dev].pciPath;
  props->guid = ncclIbDevs[dev].guid;
  props->ptrSupport = NCCL_PTR_HOST;
  if (ncclIbGdrSupport(dev) != ncclSuccess) {
    INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for HCA %d '%s' (no module)", dev, ncclIbDevs[dev].devName);
  } else {
    props->ptrSupport |= NCCL_PTR_CUDA;
  }
  props->speed = ncclIbDevs[dev].speed;
  props->port = ncclIbDevs[dev].port + ncclIbDevs[dev].realPort;
  props->maxComms = ncclIbDevs[dev].maxQp;
  return ncclSuccess;
}

This mainly retrieves the NIC name, PCIe path, guid, and so on, and then checks whether /sys/kernel/mm/memory_peers/nv_mem/version exists to determine whether nv_peermem (NVIDIA's peer memory driver) is installed. If it is, props->ptrSupport |= NCCL_PTR_CUDA is set, meaning GPU memory can be registered.

ncclGpuGdrSupport then tries to register GPU memory; if registration succeeds, gdrSupport is set to 1. This actually sets up an RDMA connection, which will be covered separately later, so we skip it for now.

static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
  ...
  NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data)));
 
  NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks+1)); // Extra rank to represent CollNet root
  for (int i = 0; i < nranks; i++) {
    memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo));
    if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) {
      WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %x", rank, i, myInfo->busId);
      return ncclInvalidUsage;
    }
  }
  // AllGather1 data is used again below
  // AllGather1 - end
 
  // Topo detection / System graph creation
  NCCLCHECK(ncclTopoGetSystem(comm, &comm->topo));
  ...
}

bootstrapAllGather then all-gathers allGather1Data, and the peerInfo received from the other ranks is copied into comm.

Before walking through the topology detection flow, let's briefly go over a few PCIe concepts. A simple PCIe system looks like the following.

Each CPU has its own root complex (RC for short). The RC mediates communication between the CPU and the rest of the system, such as memory and the PCIe hierarchy: when the CPU issues a physical address that falls within PCIe space, the RC converts it into a PCIe request.

A switch's job is to fan out PCIe ports: devices or other switches can be attached below it, and it forwards the requests that pass through it. A PCIe device can hang either off the RC or off a switch. Internally, a switch looks like the following.

Inside there is a PCIe bus, from which multiple bridges fan out multiple ports; the one on top is the upstream port and the others are downstream ports.

As mentioned earlier, busId is a very common variable name in NCCL, used for GPUs and IB NICs alike. Note that NCCL's busId is not the bus number; it is actually the identifier used to locate a PCIe device, the BDF (bus + device + function). A bus holds multiple devices and a device has multiple functions, so a BDF uniquely identifies a device function. After the machine boots and PCIe enumeration completes, the related information is exposed to user space through sysfs, and sysfs is exactly what NCCL uses for topology detection.
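For example (not NCCL code, and the BDF below is just a placeholder), a device's attributes can be read straight out of sysfs once you know its BDF:

#include <stdio.h>
 
// Read a PCI device's class from sysfs, given its BDF.
// "0000:17:00.0" is a made-up example; substitute a device present on your machine.
int main() {
  const char* bdf = "0000:17:00.0";
  char path[128];
  snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/class", bdf);
  FILE* f = fopen(path, "r");
  if (f == NULL) { perror("fopen"); return 1; }
  char cls[32];
  if (fgets(cls, sizeof(cls), f)) printf("%s class: %s", bdf, cls);   // e.g. 0x030200 for a 3D controller
  fclose(f);
  return 0;
}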

Now let's look at ncclTopoGetSystem, the focus of this section. It builds the PCI tree for the current rank in two steps: first it represents the whole PCI tree as xml, and then it converts the xml into ncclTopoNode structures. The xml types are defined below; one ncclXmlNode represents one node of the PCI tree.

struct ncclXmlNode {
  char name[MAX_STR_LEN];
  struct {
    char key[MAX_STR_LEN];
    char value[MAX_STR_LEN];
  } attrs[MAX_ATTR_COUNT+1]; // Need an extra one to consume extra params
  int nAttrs;
  int type;
  struct ncclXmlNode* parent;
  struct ncclXmlNode* subs[MAX_SUBS];
  int nSubs;
};
 
struct ncclXml {
  struct ncclXmlNode nodes[MAX_NODES];
  int maxIndex;
};

An ncclXmlNode represents one node and records its parent and all of its children; a node has a name and attributes, which are set via xmlSetAttr.
ncclXml preallocates all of the nodes, and maxIndex indicates how many have been handed out so far. Below is a brief look at a few xml-related APIs.

static ncclResult_t xmlAddNode(struct ncclXml* xml, struct ncclXmlNode* parent, const char* subName, struct ncclXmlNode** sub);

xmlAddNode allocates a node: it requests a new node sub from xml, sets sub's name to subName, and sets its parent to parent.

static ncclResult_t xmlFindTagKv(struct ncclXml* xml, const char* tagName, struct ncclXmlNode** node, const char* attrName, const char* attrValue)

xmlFindTagKv walks the nodes already allocated in xml, finds a node n whose name is tagName, and then checks whether n["attrName"] equals attrValue; if so, node is set to n.

static ncclResult_t xmlGetAttrIndex(struct ncclXmlNode* node, const char* attrName, int* index)

xmlGetAttrIndex looks up which attribute slot of node holds attrName (index is -1 if the attribute is not present).
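Putting these helpers together, a small fragment (meant to live inside NCCL's src/graph code, where the declarations above plus xmlSetAttr/xmlSetAttrInt are visible; error checking omitted) might look like the following sketch:

// Build a tiny tree: <system version="1"><pci busid="0000:17:00.0"/></system>
struct ncclXml* xml;
ncclCalloc(&xml, 1);
 
struct ncclXmlNode* top;
xmlAddNode(xml, NULL, "system", &top);          // root node "system"
xmlSetAttrInt(top, "version", 1);
 
struct ncclXmlNode* pci;
xmlAddNode(xml, top, "pci", &pci);              // child "pci" under "system"
xmlSetAttr(pci, "busid", "0000:17:00.0");
 
// Later the node can be found again by (tag, attribute) pair:
struct ncclXmlNode* found;
xmlFindTagKv(xml, "pci", &found, "busid", "0000:17:00.0");   // found == pci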

Now for the topology detection process itself.

ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
  struct ncclXml* xml;
  NCCLCHECK(ncclCalloc(&xml, 1));
  char* xmlTopoFile = getenv("NCCL_TOPO_FILE");
  if (xmlTopoFile) {
    INFO(NCCL_ENV, "NCCL_TOPO_FILE set by environment to %s", xmlTopoFile);
    NCCLCHECK(ncclTopoGetXmlFromFile(xmlTopoFile, xml));
  }
  if (xml->maxIndex == 0) {
    // Create top tag
    struct ncclXmlNode* top;
    NCCLCHECK(xmlAddNode(xml, NULL, "system", &top));
    NCCLCHECK(xmlSetAttrInt(top, "version", NCCL_TOPO_XML_VERSION));
  }
 
  // Auto-detect GPUs if needed
  for (int r=0; r<comm->nRanks; r++) {
    if (comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) {
      char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
      NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId));
      struct ncclXmlNode* node;
      NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
      if (node == NULL) continue;
      NCCLCHECK(xmlSetAttrInt(node, "rank", r));
      NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport));
    }   
  }
  ...
}

First, xmlAddNode creates the root node "system" (double quotes are used below to denote xml tree nodes) and sets the root attribute "system"["version"] = NCCL_TOPO_XML_VERSION. Then we iterate over every rank's hostHash; if it matches ours, that rank is on the same machine, and ncclTopoFillGpu is called to add its GPU to the xml tree.

ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct ncclXmlNode** gpuNode) {
  struct ncclXmlNode* node;
  NCCLCHECK(ncclTopoGetPciNode(xml, busId, &node));
  NCCLCHECK(ncclTopoGetXmlFromSys(node, xml));
  ...
}
 
ncclResult_t ncclTopoGetPciNode(struct ncclXml* xml, const char* busId, struct ncclXmlNode** pciNode) {
  NCCLCHECK(xmlFindTagKv(xml, "pci", pciNode, "busid", busId));
  if (*pciNode == NULL) {
    NCCLCHECK(xmlAddNode(xml, NULL, "pci", pciNode));
  }
  NCCLCHECK(xmlSetAttr(*pciNode, "busid", busId));
  return ncclSuccess;
}

ncclTopoGetPciNode checks whether an xml node for the current GPU already exists; at this point it does not, so a new xml node named "pci" is created to represent the GPU, and "pci"["busid"] is set to busId.

Next comes ncclTopoGetXmlFromSys. Its main job is to obtain, from sysfs, the path from the GPU up to the CPU, convert that path into the xml tree, and read the relevant attributes along the path into the xml.

ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml* xml) {
  // Fill info, then parent
  const char* busId;
  NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId));
  char* path = NULL;
  int index;
  NCCLCHECK(xmlGetAttrIndex(pciNode, "class", &index));
  if (index == -1) {
    if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
    NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "class", "class"));
  }
  NCCLCHECK(xmlGetAttrIndex(pciNode, "link_speed", &index));
  if (index == -1) {
    if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
    char deviceSpeedStr[MAX_STR_LEN];
    float deviceSpeed;
    NCCLCHECK(ncclTopoGetStrFromSys(path, "max_link_speed", deviceSpeedStr));
    sscanf(deviceSpeedStr, "%f GT/s", &deviceSpeed);
    char portSpeedStr[MAX_STR_LEN];
    float portSpeed;
    NCCLCHECK(ncclTopoGetStrFromSys(path, "../max_link_speed", portSpeedStr));
    sscanf(portSpeedStr, "%f GT/s", &portSpeed);
    NCCLCHECK(xmlSetAttr(pciNode, "link_speed", portSpeed < deviceSpeed ? portSpeedStr : deviceSpeedStr));
  }
  NCCLCHECK(xmlGetAttrIndex(pciNode, "link_width", &index));
  if (index == -1) {
    if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
    char strValue[MAX_STR_LEN];
    NCCLCHECK(ncclTopoGetStrFromSys(path, "max_link_width", strValue));
    int deviceWidth = strtol(strValue, NULL, 0); 
    NCCLCHECK(ncclTopoGetStrFromSys(path, "../max_link_width", strValue));
    int portWidth = strtol(strValue, NULL, 0); 
    NCCLCHECK(xmlSetAttrInt(pciNode, "link_width", std::min(deviceWidth,portWidth)));
  }
  ...
}

First the attributes of pciNode are filled in. getPciPath returns the sysfs path corresponding to busId; this path is exactly the root-to-leaf path in the PCI tree.

static ncclResult_t getPciPath(const char* busId, char** path) {
  char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
  memcpylower(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
  memcpylower(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1);
  *path = realpath(busPath, NULL);
  if (*path == NULL) {
    WARN("Could not find real path of %s", busPath);
    return ncclSystemError;
  }
  return ncclSuccess;
}

For example, suppose path is /sys/devices/pci0000:10/0000:10:00.0/0000:11:00.0/0000:12:00.0/0000:13:00.0/0000:14:00.0/0000:15:00.0/0000:16:00.0/0000:17:00.0, where the GPU's busId is 0000:17:00.0. This path corresponds to the figure below (note that the figure omits the switch corresponding to 15:00.0).

The attributes under path are then read: class (the PCI device type), link_speed, link_width, and so on are stored into the xml pciNode. ncclTopoGetStrFromSys simply reads a kernel file under path into strValue.

ncclResult_t ncclTopoGetStrFromSys(const char* path, const char* fileName, char* strValue) {
  char filePath[PATH_MAX];
  sprintf(filePath, "%s/%s", path, fileName);
  int offset = 0;
  FILE* file;
  if ((file = fopen(filePath, "r")) != NULL) {
    while (feof(file) == 0 && ferror(file) == 0 && offset < MAX_STR_LEN) {
      int len = fread(strValue+offset, 1, MAX_STR_LEN-offset, file);
      offset += len;
    }
    fclose(file);
  }
  if (offset == 0) {
    strValue[0] = '\0';
    INFO(NCCL_GRAPH, "Topology detection : could not read %s, ignoring", filePath);
  } else {
    strValue[offset-1] = '\0';
  }
  return ncclSuccess;
}
 
ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml* xml) {
  // Fill info, then parent
  ...
  struct ncclXmlNode* parent = pciNode->parent;
  if (parent == NULL) {
    if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
 
    // Save that for later in case next step is a CPU
    char numaIdStr[MAX_STR_LEN];
    NCCLCHECK(ncclTopoGetStrFromSys(path, "numa_node", numaIdStr));
 
    // Go up one level in the PCI tree. Rewind two "/" and follow the upper PCI
    // switch, or stop if we reach a CPU root complex.
    int slashCount = 0;
    int parentOffset;
    for (parentOffset = strlen(path)-1; parentOffset>0; parentOffset--) {
      if (path[parentOffset] == '/') {
        slashCount++;
        path[parentOffset] = '\0';
        int start = parentOffset - 1;
        while (start>0 && path[start] != '/') start--;
        // Check whether the parent path looks like "BBBB:BB:DD.F" or not.
        if (checkBDFFormat(path+start+1) == 0) {
          // This a CPU root complex. Create a CPU tag and stop there.
          struct ncclXmlNode* topNode;
          NCCLCHECK(xmlFindTag(xml, "system", &topNode));
          NCCLCHECK(xmlGetSubKv(topNode, "cpu", &parent, "numaid", numaIdStr));
          if (parent == NULL) {
            NCCLCHECK(xmlAddNode(xml, topNode, "cpu", &parent));
            NCCLCHECK(xmlSetAttr(parent, "numaid", numaIdStr));
          }
        } else if (slashCount == 2) {
          // Continue on the upper PCI switch
          for (int i = strlen(path)-1; i>0; i--) {
            if (path[i] == '/') {
              NCCLCHECK(xmlFindTagKv(xml, "pci", &parent, "busid", path+i+1));
              if (parent == NULL) {
                NCCLCHECK(xmlAddNode(xml, NULL, "pci", &parent));
                NCCLCHECK(xmlSetAttr(parent, "busid", path+i+1));
              }
              break;
            }
          }
        }
      }
      if (parent) break;
    }
    pciNode->parent = parent;
    parent->subs[parent->nSubs++] = pciNode;
  }
  if (strcmp(parent->name, "pci") == 0) {
    NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml));
  } else if (strcmp(parent->name, "cpu") == 0) {
    NCCLCHECK(ncclTopoGetXmlFromCpu(parent, xml));
  }
  free(path);
  return ncclSuccess;
}

The code then walks up from pciNode. Because a switch's upstream port and downstream port each correspond to a bridge, and NCCL uses the busId of the upstream port's bridge to represent the switch, we have to jump up two levels before creating an xml node for the switch; slashCount is incremented for each PCI device found on the way up.

When slashCount == 2 we have reached a switch's upstream port. At that point a new xml pci node parent is created to represent this switch, and the current pciNode is linked under parent; note that parent is still an xml pci node.

ncclTopoGetXmlFromSys therefore keeps recursing until it reaches the RC. At that point a child node "cpu" is created under "system", the recursion stops, and ncclTopoGetXmlFromCpu is executed to set the attributes of "cpu", such as arch (e.g. x86 or arm), affinity (which CPU cores belong to this CPU's NUMA node), numaid, and so on.
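To make the two-level jump concrete, here is a small standalone sketch (not NCCL code) applied to the example path from earlier: the last component is the device itself, the next one up is the switch's downstream bridge (skipped), and the component two levels up, the upstream bridge's BDF, becomes the busid of the switch node.

#include <iostream>
#include <sstream>
#include <string>
#include <vector>
 
// Split the sysfs path into its BDF components and show which one NCCL
// would use as the parent switch of the GPU at 0000:17:00.0.
int main() {
  std::string path =
      "/sys/devices/pci0000:10/0000:10:00.0/0000:11:00.0/0000:12:00.0/"
      "0000:13:00.0/0000:14:00.0/0000:15:00.0/0000:16:00.0/0000:17:00.0";
  std::vector<std::string> parts;
  std::stringstream ss(path);
  std::string item;
  while (std::getline(ss, item, '/')) if (!item.empty()) parts.push_back(item);
 
  size_t n = parts.size();
  std::cout << "device            : " << parts[n-1] << "\n";  // 0000:17:00.0
  std::cout << "downstream bridge : " << parts[n-2] << "\n";  // 0000:16:00.0 (skipped)
  std::cout << "switch busid      : " << parts[n-3] << "\n";  // 0000:15:00.0 (parent pci node)
  return 0;
}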

That completes ncclTopoGetXmlFromSys; back to ncclTopoFillGpu.

ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct ncclXmlNode** gpuNode) {
  ...
  NCCLCHECK(wrapNvmlSymbols());
  NCCLCHECK(wrapNvmlInit());
  nvmlDevice_t nvmlDev;
  if (wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev) != ncclSuccess) nvmlDev = NULL;
  NCCLCHECK(ncclTopoGetXmlFromGpu(node, nvmlDev, xml, gpuNode));
  return ncclSuccess;
}

wrapNvmlSymbols then loads the dynamic library libnvidia-ml.so.1, which is used to query GPU information.
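wrapNvmlSymbols follows the usual dlopen/dlsym pattern. A simplified standalone sketch of that pattern is shown below (nvmlInit_v2 is a standard NVML entry point, but the wrapper itself is illustrative, not NCCL's actual code); link with -ldl.

#include <dlfcn.h>
#include <stdio.h>
 
// Load libnvidia-ml.so.1 at runtime and resolve one NVML symbol, the same
// pattern wrapNvmlSymbols applies to a whole table of NVML functions.
typedef int (*nvmlInit_fn)(void);   // nvmlReturn_t is an int-sized enum
 
int main() {
  void* handle = dlopen("libnvidia-ml.so.1", RTLD_NOW);
  if (handle == NULL) { fprintf(stderr, "dlopen failed: %s\n", dlerror()); return 1; }
 
  nvmlInit_fn nvmlInitFn = (nvmlInit_fn)dlsym(handle, "nvmlInit_v2");
  if (nvmlInitFn == NULL) { fprintf(stderr, "dlsym failed: %s\n", dlerror()); return 1; }
 
  int ret = nvmlInitFn();           // 0 == NVML_SUCCESS
  printf("nvmlInit_v2 returned %d\n", ret);
  dlclose(handle);
  return 0;
}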

ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvmlDev, struct ncclXml* xml, struct ncclXmlNode** gpuNodeRet) {
  struct ncclXmlNode* gpuNode = NULL;
  NCCLCHECK(xmlGetSub(pciNode, "gpu", &gpuNode));
  if (gpuNode == NULL) NCCLCHECK(xmlAddNode(xml, pciNode, "gpu", &gpuNode));
 
  int index = -1;
 
  int dev = -1;
  NCCLCHECK(xmlGetAttrIndex(gpuNode, "dev", &index));
  if (index == -1) {
    if (nvmlDev == NULL) {
      WARN("No NVML, trying to use CUDA instead");
      const char* busId;
      NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId));
      if (busId == NULL || cudaDeviceGetByPCIBusId(&dev, busId) != cudaSuccess) dev = -1;
    } else {
      NCCLCHECK(wrapNvmlDeviceGetIndex(nvmlDev, (unsigned int*)&dev));
    }
    NCCLCHECK(xmlSetAttrInt(gpuNode, "dev", dev));
  }
  NCCLCHECK(xmlGetAttrInt(gpuNode, "dev", &dev));
  if (dev == -1) {*gpuNodeRet = NULL; return ncclSuccess;}
 
  NCCLCHECK(xmlGetAttrIndex(gpuNode, "sm", &index));
  if (index == -1) {
    int cudaMajor, cudaMinor;
    if (nvmlDev == NULL) {
      cudaDeviceProp devProp;
      CUDACHECK(cudaGetDeviceProperties(&devProp, dev));
      cudaMajor = devProp.major; cudaMinor = devProp.minor;
    } else {
      NCCLCHECK(wrapNvmlDeviceGetCudaComputeCapability(nvmlDev, &cudaMajor, &cudaMinor));
    }
    NCCLCHECK(xmlSetAttrInt(gpuNode, "sm", cudaMajor*10+cudaMinor));
  }
  int sm;
  NCCLCHECK(xmlGetAttrInt(gpuNode, "sm", &sm));
 
  struct ncclXmlNode* nvlNode = NULL;
  NCCLCHECK(xmlGetSub(pciNode, "nvlink", &nvlNode));
  if (nvlNode == NULL) {
    // NVML NVLink detection
    int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : (sm < 80) ? 6 : 12;
 
    if (maxNvLinks > 0 && nvmlDev == NULL) {
      WARN("No NVML device handle. Skipping nvlink detection.\n");
      maxNvLinks = 0;
    }
 
    for (int l=0; l<maxNvLinks; ++l) {
      // Check whether we can use this NVLink for P2P
      unsigned canP2P;
      if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
 
      // Make sure the Nvlink is up. The previous call should have trained the link.
      nvmlEnableState_t isActive;
      if ((wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;
 
      // Try to figure out what's on the other side of the NVLink
      nvmlPciInfo_t remoteProc;
      if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
 
      // Make a lower case copy of the bus ID for calling ncclDeviceType
      // PCI system path is in lower case
      char* p = remoteProc.busId;
      char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
      for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
        lowerId[c] = tolower(p[c]);
        if (p[c] == 0) break;
      }
 
      NCCLCHECK(xmlGetSubKv(gpuNode, "nvlink", &nvlNode, "target", lowerId));
      if (nvlNode == NULL) {
        NCCLCHECK(xmlAddNode(xml, gpuNode, "nvlink", &nvlNode));
        NCCLCHECK(xmlSetAttr(nvlNode, "target", lowerId));
        NCCLCHECK(xmlSetAttrInt(nvlNode, "count", 1));
      } else {
        int count;
        NCCLCHECK(xmlGetAttrInt(nvlNode, "count", &count));
        NCCLCHECK(xmlSetAttrInt(nvlNode, "count", count+1));
      }
    }
  }
  // Fill target classes
  for (int s=0; s<gpuNode->nSubs; s++) {
    struct ncclXmlNode* sub = gpuNode->subs[s];
    if (strcmp(sub->name, "nvlink") != 0) continue;
    int index;
    NCCLCHECK(xmlGetAttrIndex(sub, "tclass", &index));
    if (index == -1) {
      const char* busId;
      NCCLCHECK(xmlGetAttr(sub, "target", &busId));
      if (strcmp(busId, "fffffff:ffff:ff") == 0) {
        // Remote NVLink device is not visible inside this VM. Assume NVSwitch.
        NCCLCHECK(xmlSetAttr(sub, "tclass", "0x068000"));
      } else {
        char* path;
        NCCLCHECK(getPciPath(busId, &path));
        NCCLCHECK(ncclTopoSetAttrFromSys(sub, path, "class", "tclass"));
      }
    }
  }
  *gpuNodeRet = gpuNode;
  return ncclSuccess;
}

First a node "gpu" is created under the GPU's xml "pci" node and its attributes are set, such as dev and the compute capability sm. Then NVLink information is queried: all possible NVLinks are enumerated, and nvmlDeviceGetNvLinkCapability is used to query each link.

If a link is up, a new "nvlink" node is created under "gpu", with a "target" attribute giving the PCIe busId of the link's remote end. "nvlink" nodes with the same "target" are merged into one, with "count" recording how many links connect the two endpoints, and "tclass" is then set to indicate what kind of PCI device the "target" is.

That completes ncclTopoFillGpu. At this point the xml looks as shown (only one card is depicted); note that "gpu" and its parent node both refer to the same physical GPU.

Back in ncclTopoGetSystem, the "gpu" node's rank and gdr attributes are then set.

Then, for every NIC, a process similar to the GPU case above builds up the xml tree via ncclTopoGetXmlFromSys. The result is shown below (only one NIC is shown); "net", "nic", and "nic"'s parent node all refer to the same NIC.

<system version="1">
  <cpu numaid="0" affinity="00000000,0000000f,ffff0000,00000000,000fffff" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="85">
    <pci busid="0000:11:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
      <pci busid="0000:13:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
        <pci busid="0000:15:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
          <pci busid="0000:17:00.0" class="0x030200" link_speed="16 GT/s" link_width="16">
            <gpu dev="0" sm="80" rank="0" gdr="1">
              <nvlink target="0000:e7:00.0" count="2" tclass="0x068000"/>
              <nvlink target="0000:e4:00.0" count="2" tclass="0x068000"/>
              <nvlink target="0000:e6:00.0" count="2" tclass="0x068000"/>
              <nvlink target="0000:e9:00.0" count="2" tclass="0x068000"/>
              <nvlink target="0000:e5:00.0" count="2" tclass="0x068000"/>
              <nvlink target="0000:e8:00.0" count="2" tclass="0x068000"/>
            </gpu>
          </pci>
        </pci>
      </pci>
      <pci busid="0000:1c:00.0" class="0x020000" link_speed="8 GT/s" link_width="16">
        <nic>
          <net name="mlx5_0" dev="0" speed="100000" port="1" guid="0x82d0c0003f6ceb8" maxconn="262144" gdr="1"/>
        </nic>
      </pci>
    </pci>
  </cpu>
</system>

To summarize, this section covered NCCL's topology detection process: using sysfs, the PCI tree structure containing the GPUs and NICs is built up as an xml tree.

(Original article: https://blog.csdn.net/KIDGIN7439/article/details/126990961)

Feel free to star and try out the latest version of OneFlow: https://github.com/Oneflow-Inc/oneflow/
