Install the NVIDIA driver on the host
# Disable the nouveau graphics driver that ships with the host
lsmod | grep nouveau
nouveau             1949696  0
mxm_wmi               16384  1 nouveau
wmi                   32768  2 mxm_wmi,nouveau
video                 49152  1 nouveau
i2c_algo_bit          16384  1 nouveau
ttm                  106496  2 qxl,nouveau
drm_kms_helper       184320  4 qxl,nouveau
drm                  491520  5 drm_kms_helper,qxl,ttm,nouveau

# First, blacklist nouveau
vi /etc/modprobe.d/blacklist.conf
# Append the following line at the end:
blacklist nouveau

# Or do the same non-interactively:
modprobe_path='/etc/modprobe.d/blacklist.conf'
sed -i "s/blacklist nouveau//g" ${modprobe_path}
echo -e '\nblacklist nouveau' >> ${modprobe_path}
sudo update-initramfs -u

# Disable the graphical interface (boot into text mode) and reboot
systemctl set-default multi-user.target
reboot

lsmod | grep nouveau
# No output means nouveau was disabled successfully

# Switch to a domestic DNS server and the Huawei Cloud apt mirror, then install the NVIDIA driver
echo "nameserver 114.114.114.114" > /etc/resolv.conf
sudo sed -i "s@http.*archive.ubuntu.com@http://repo.huaweicloud.com@g" /etc/apt/sources.list
sudo sed -i "s@http.*security.ubuntu.com@http://repo.huaweicloud.com@g" /etc/apt/sources.list
apt update -y
apt install nvidia-driver-460-server -y

# Install nvidia-docker2 so containers can access the GPU
distribution=$(. /etc/os-release; echo $ID$VERSION_ID) \
  && curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - \
  && curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
sudo apt-get update -y
sudo apt-get install -y nvidia-docker2
sudo systemctl restart docker
sudo systemctl enable docker
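Not part of the original notes: after the reboot it is worth confirming that the proprietary driver and the Docker integration are actually in place. A minimal sanity check, assuming the packages above installed cleanly:

nvidia-smi                       # should report driver version 460.xx and list both GT 730 cards
lsmod | grep nvidia              # the nvidia kernel modules should now be loaded instead of nouveau
docker info | grep -i runtime    # nvidia-docker2 registers an "nvidia" runtime with the Docker daemon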
Test the driver with nvidia-smi
sudo mkdir -p /etc/docker
sudo tee /etc/docker/daemon.json <<-'EOF'
{
  "registry-mirrors": ["https://wm12hkla.mirror.aliyuncs.com"]
}
EOF
sudo systemctl daemon-reload
sudo systemctl restart docker
echo "nameserver 114.114.114.114" > /etc/resolv.conf

# Pull the CUDA base image from a domestic (Aliyun) mirror for faster downloads in China,
# then run nvidia-smi inside a container
sudo docker run --rm --gpus all registry.cn-hangzhou.aliyuncs.com/mkmk/all:nvidia-cuda-11-base nvidia-smi
Thu Apr  8 16:52:50 2021
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  GeForce GT 730      Off  | 00000000:00:03.0 N/A |                  N/A |
| 30%   30C    P0    N/A /  N/A |      0MiB /  2002MiB |     N/A      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  GeForce GT 730      Off  | 00000000:00:04.0 N/A |                  N/A |
| 30%   27C    P0    N/A /  N/A |      0MiB /  2002MiB |     N/A      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+
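An aside that is not in the original write-up: because this host has two GT 730s, the --gpus flag can also expose just one of them to a container, which is handy for pinning a workload to a specific card:

# expose only GPU 0 to the container (the extra quoting keeps the shell from mangling the device= form)
sudo docker run --rm --gpus '"device=0"' registry.cn-hangzhou.aliyuncs.com/mkmk/all:nvidia-cuda-11-base nvidia-smi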
Use gpu-burn to test the GPU
# Run in the foreground
docker run -it --gpus=all --name gpu-jupyter1 -p 8888:8888 ${register_url}/tensorflow/tensorflow:2.4.1-gpu-jupyter

# Stop and remove it
docker stop gpu-jupyter1 && docker rm gpu-jupyter1

# Run in the background; in that case check the logs for the Jupyter access token
docker run -d --gpus=all --name gpu-jupyter1 -p 8888:8888 ${register_url}/tensorflow/tensorflow:2.4.1-gpu-jupyter
docker logs gpu-jupyter1

# Then open http://<host-ip>:8888 in a browser
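Not shown in the original: once the background container is running, a quick way to confirm it actually sees both GPUs is to run nvidia-smi inside it:

docker exec gpu-jupyter1 nvidia-smi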
The gpu-burn Dockerfile
docker run -it --gpus=all --name gpu-jupyter1 -p 8888:8888 ${register_url}/tensorflow/tensorflow:2.4.1-gpu-jupyter

# Push the image to a remote registry
register_url='192.168.170.100:5000'
docker tag ${register_url}/tensorflow/tensorflow:2.4.1-gpu-jupyter registry.cn-hangzhou.aliyuncs.com/mkmk/all:tensorflow-2.4.1-gpu-jupyter
docker push registry.cn-hangzhou.aliyuncs.com/mkmk/all:tensorflow-2.4.1-gpu-jupyter
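Despite the heading, the gpu-burn Dockerfile itself did not survive in the commands above, which only cover the TensorFlow Jupyter image. A minimal sketch of what such a Dockerfile could look like, assuming the upstream https://github.com/wilicc/gpu-burn sources and an nvidia/cuda devel base image (the exact tag and the build step are assumptions, not the author's original file), written with the same heredoc style used earlier:

cat > Dockerfile <<'EOF'
# hypothetical gpu-burn image: build gpu_burn from source inside a CUDA devel image
FROM nvidia/cuda:11.2.2-devel-ubuntu20.04
RUN apt-get update && apt-get install -y --no-install-recommends git build-essential \
    && rm -rf /var/lib/apt/lists/*
RUN git clone https://github.com/wilicc/gpu-burn /gpu-burn
WORKDIR /gpu-burn
RUN make
ENTRYPOINT ["./gpu_burn"]
CMD ["60"]
EOF

docker build -t ${register_url}/gpu-burn:latest .
# stress both cards for 60 seconds
docker run --rm --gpus all ${register_url}/gpu-burn:latest 60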
Bonus: some simple TensorFlow snippets
# Check whether TensorFlow is using the GPU
import tensorflow as tf
tf.test.is_gpu_available(cuda_only=False, min_cuda_compute_capability=None)
print("is_gpu: ", tf.test.is_gpu_available())

# List all available compute devices
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

# Element-wise addition benchmark

# CPU
%%time
with tf.device("/device:CPU:0"):
    a = tf.zeros([1000, 1000])
    print("a on gpu:", a.device.endswith('GPU:0'))
    for i in range(10000):
        b = tf.add(a, a)
-->
a on gpu: False
CPU times: user 7.74 s, sys: 1.2 s, total: 8.94 s
Wall time: 3.39 s

# GPU
%%time
with tf.device("/device:GPU:0"):
    a = tf.zeros([1000, 1000])
    print("a on gpu:", a.device.endswith('GPU:0'))
    for i in range(10000):
        b = tf.add(a, a)
-->
a on gpu: True
CPU times: user 900 ms, sys: 1.22 s, total: 2.12 s
Wall time: 2.12 s

Explanation: the number to compare is the user-mode compute time, not the wall-clock time; the 7.74 s is not elapsed (physical) time but CPU time spent in user space:
CPU times: user 7.74 s  (CPU run)
CPU times: user 900 ms  (GPU run)
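If you would rather not open a notebook for this check, the same test can be run from the host (not in the original notes; tf.config.list_physical_devices is the non-deprecated counterpart of tf.test.is_gpu_available in TensorFlow 2.4):

# run a one-line GPU check inside the running Jupyter container
docker exec gpu-jupyter1 python -c "import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))"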
Come chat in the comments!