关于pytorch:PyTorch之具体显存占用分析

78次阅读

共计 3474 个字符，预计需要花费 9 分钟才能阅读完成。

原始文档：https://www.yuque.com/lart/ug…

PyTorch 使用中，由于显卡显存是固定的，并且短期内难以进一步提升，所以掌握显存具体占用的细节有助于我们写出更加高效的代码，甚至跑出更好的结果。

所以本文结合 Connolly 的文章《PyTorch 显存机制分析》，按照自己的需求进行了修改，同时梳理了 checkpoint 机制使用过程中的显存变化情况。

直接看代码。注释中标明了特定的显存占用和参数数量。

首先导入相关的包：

import torch
from torch.utils.checkpoint import checkpoint

初始化必要的数据和结构：

# Baseline reading of torch.cuda.memory_allocated() before anything is created.
# Every later print reports the change in allocated bytes since the previous
# measurement; the trailing comment on each print is the author's recorded output.
initial_usage = torch.cuda.memory_allocated()
print("0", initial_usage)  # 0

# Model initialization
# linear1: a single 1024x1024 weight without bias. The recorded output shows
# 4194304 bytes for 1048576 elements, i.e. 4 bytes per element.
linear1 = torch.nn.Linear(1024, 1024, bias=False).cuda()
after_init_linear1 = torch.cuda.memory_allocated()
print("1", after_init_linear1 - initial_usage, linear1.weight.numel())  # 4194304 1048576

# linear2: two stacked bias-free layers (1024 -> 1024 -> 1).
# Recorded: 4198400 bytes = 4194304 + 4096, for 1049600 = 1048576 + 1024 elements.
linear2 = torch.nn.Sequential(torch.nn.Linear(1024, 1024, bias=False), torch.nn.Linear(1024, 1, bias=False)).cuda()
after_init_linear2 = torch.cuda.memory_allocated()
print("2", after_init_linear2 - after_init_linear1, sum([m.weight.numel() for m in linear2]))  # 4198400 1049600

# Input definition
# A 1024x1024 random tensor created directly on device "cuda:0".
inputs = torch.randn(size=(1024, 1024), device="cuda:0")
after_init_inputs = torch.cuda.memory_allocated()
print("3", after_init_inputs - after_init_linear2, inputs.numel())  # 4194304 1048576

第一次迭代：

print("Iter: 0")

# Forward pass
o = linear1(inputs)
after_linear1 = torch.cuda.memory_allocated()
print("4", after_linear1 - after_init_inputs, o.numel())  # 4194304 1048576

o = checkpoint(linear2, o)
after_linear2 = torch.cuda.memory_allocated()
# 4096 1024
# checkpoint is used here, so the results computed inside linear2 are not
# stored; the growth only covers the output o.
print("5", after_linear2 - after_linear1, o.numel())

# Author's note: PyTorch allocates GPU memory in units of pages, which may be a
# limitation of the CUDA device. Even when only 4 bytes are requested, PyTorch
# first requests 2 MB from the CUDA device into its own cache and then hands
# out 512 or 1024 bytes of it. torch.cuda.memory_allocated() shows the
# 512 bytes; torch.cuda.memory_cached() shows the 2 MB requested from CUDA.
# NOTE: newer PyTorch releases rename memory_cached() to memory_reserved().
loss = sum(o)
after_loss = torch.cuda.memory_allocated()
# 16785920 512
print("6", after_loss, after_loss - after_linear2)

# Backward pass
# Author's note: backward consumes and frees the model's intermediate
# activations and computes a gradient for every model parameter. On the first
# call it also allocates the storage for the gradients of the parameters
# (the leaf nodes), so from the second iteration on only the
# intermediate-activation memory changes.
loss.backward()
after_backward = torch.cuda.memory_allocated()
# 20984320 4198400
#   4198400 = -4194304 (free linear1's output o)
#             +4194304 (allocate the gradient of linear1's weight)
#             +4198400 (allocate the gradients of linear2's weights)
# Because checkpoint is used, linear2 stored no intermediate activations, but
# its final activation is kept: the variable o still references it, so the
# output of linear2 is not freed.
# linear1 itself has no intermediate activations, and its output is reclaimed
# automatically because the variable o now points to new memory.
# This print had been fused into the comment above and never ran; it is
# restored here as a real statement.
print("7", after_backward, after_backward - after_loss)

第二次迭代：

print("Iter: 1")

# Forward pass
# Recorded delta is 4190208 rather than 4194304.
# NOTE(review): presumably 4194304 for the new output minus the 4096 bytes
# freed when `o` is rebound away from the previous linear2 output - confirm.
o = linear1(inputs)
after_linear1 = torch.cuda.memory_allocated()
print("8", after_linear1 - after_backward, o.numel())  # 4190208 1048576

# linear2 runs under checkpoint again.
o = checkpoint(linear2, o)
after_linear2 = torch.cuda.memory_allocated()
# 4096 1024
print("9", after_linear2 - after_linear1, o.numel())

# Author's note (English rendering of the string below): the loss from the
# previous iteration is still referenced, so no new space is requested here.
"""因为前一次计算的 loss 的援用还在，所以这里没有再新申请空间。"""
loss = sum(o)
after_loss = torch.cuda.memory_allocated()
print("10", after_loss, after_loss - after_linear2)  # 25178624 0

# Backward pass
loss.backward()
after_backward = torch.cuda.memory_allocated()
# 20984320 -4194304
# The amount released equals the intermediate activations:
#   -4190208 (linear1's output o) - 4096 (linear2's output o)
# linear2 uses checkpoint here, so linear2's intermediate features add no extra
# footprint: that memory is requested and released inside the operation.
print("11", after_backward, after_backward - after_loss)

第三次迭代：

del loss  # Used to check how the memory held by loss is reclaimed

print("Iter: 2")

# Forward pass
o = linear1(inputs)
after_linear1 = torch.cuda.memory_allocated()
print("12", after_linear1 - after_backward, o.numel())  # 4190208 1048576

# Unlike the two earlier iterations, linear2 is called directly here,
# without checkpoint.
o = linear2(o)
after_linear2 = torch.cuda.memory_allocated()
# 4198400 1024
#   4198400 = 1024*1024*4 (linear2's intermediate features)
#             + 1024*4 (linear2's output o)
print("13", after_linear2 - after_linear1, o.numel())

# Author's note (English rendering of the string below): since loss was
# deleted after the previous iteration, 512 bytes are requested again here.
"""在前一次计算后，del loss 的话，能够看到这里会申请 512 字节的空间"""
loss = sum(o)
after_loss = torch.cuda.memory_allocated()
print("14", after_loss, after_loss - after_linear2)  # 29372928 512

# Backward pass
loss.backward()
after_backward = torch.cuda.memory_allocated()
# 20984320 -8388608
# The amount released equals the intermediate activations:
#   -4190208 (linear1's output o)
#   -4194304 (1024*1024*4, linear2's intermediate features)
#   -4096 (linear2's output o)
print("15", after_backward, after_backward - after_loss)

正文完