-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprepare_dl.sh
338 lines (294 loc) · 11.9 KB
/
prepare_dl.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
# Prepare GPUs for deep learning
# Note: This script is not meant to be run directly. .sh is just for syntax highlighting.
# Please copy the command you need and run it in the shell one by one.
# Set the system timezone.
sudo timedatectl set-timezone "Asia/Shanghai"
# change locale
# Generate the en_US.UTF-8 locale, then choose the system default in the
# interactive dpkg-reconfigure dialog.
sudo locale-gen "en_US.UTF-8"
sudo dpkg-reconfigure locales
# or directly edit /etc/default/locale and then !reboot!
# The assignments below are presumably the intended contents of
# /etc/default/locale, not commands to type at the prompt.
LANG=en_US.UTF-8
LANGUAGE=en_US.UTF-8
LC_CTYPE=en_US.UTF-8
LC_NUMERIC=en_US.UTF-8
LC_TIME=en_US.UTF-8
LC_COLLATE=en_US.UTF-8
LC_MONETARY=en_US.UTF-8
LC_MESSAGES=en_US.UTF-8
LC_PAPER=en_US.UTF-8
LC_NAME=en_US.UTF-8
LC_ADDRESS=en_US.UTF-8
LC_TELEPHONE=en_US.UTF-8
LC_MEASUREMENT=en_US.UTF-8
LC_IDENTIFICATION=en_US.UTF-8
LC_ALL=en_US.UTF-8
# Add user
# "xxx" is a placeholder for the new account name.
sudo adduser xxx
# (Re)set the account's password.
sudo passwd xxx
# Append the account to the sudo group (-a keeps its existing groups).
sudo usermod -aG sudo xxx
# Basic packages for Ubuntu
# https://mirrors.tuna.tsinghua.edu.cn/help/ubuntu/
# Back up the current apt source list before replacing it with the TUNA mirror.
sudo cp /etc/apt/sources.list /etc/apt/sources.list.bak
sudo vim /etc/apt/sources.list
# The triple-quoted text below is the new content for sources.list on
# Ubuntu 20.04 (focal); the quote marks only delimit the snippet and are not
# part of the file.
""" Ubuntu 20.04
# 默认注释了源码镜像以提高 apt update 速度,如有需要可自行取消注释
deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal main restricted universe multiverse
# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal main restricted universe multiverse
deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-updates main restricted universe multiverse
# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-updates main restricted universe multiverse
deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-backports main restricted universe multiverse
# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-backports main restricted universe multiverse
deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-security main restricted universe multiverse
# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-security main restricted universe multiverse
# 预发布软件源,不建议启用
# deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-proposed main restricted universe multiverse
# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-proposed main restricted universe multiverse
"""
# Issue: Certificate verification failed, https://github.com/tuna/issues/issues/1342
# Solution 1: change https to http
# Solution 2: update ca-certificates first:
# --only-upgrade upgrades the package only if it is already installed.
sudo apt-get install --only-upgrade ca-certificates
# Refresh the package index and install the base toolchain and utilities in a
# single apt transaction.
# Every package line except the last must end with a backslash: a missing one
# ends the apt-get command early and the remaining names are executed as a
# separate command instead of being installed.
sudo apt-get update && sudo apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
  build-essential \
  cmake \
  g++-7 \
  git \
  curl \
  vim \
  wget \
  ca-certificates \
  libjpeg-dev \
  libpng-dev \
  librdmacm1 \
  libibverbs1 \
  ibverbs-providers \
  zsh \
  tzdata \
  libgl1-mesa-glx \
  libglib2.0-0 \
  zip \
  unzip \
  rsync \
  htop \
  language-pack-en \
  nethogs \
  sysstat \
  gnupg \
  lsb-release
# Install Nvidia Driver - Ubuntu 20.04
# https://developer.nvidia.com/cuda-downloads
# Download the CUDA 11.6.2 runfile installer (driver 510.47.03, per the file
# name) and run it as root.
wget https://developer.download.nvidia.com/compute/cuda/11.6.2/local_installers/cuda_11.6.2_510.47.03_linux.run
sudo sh cuda_11.6.2_510.47.03_linux.run
# Installer summary kept for reference (output, not a command); note the PATH
# and LD_LIBRARY_PATH entries it asks for.
"""
===========
= Summary =
===========
Driver: Installed
Toolkit: Installed in /usr/local/cuda-11.6/
Please make sure that
- PATH includes /usr/local/cuda-11.6/bin
- LD_LIBRARY_PATH includes /usr/local/cuda-11.6/lib64, or, add /usr/local/cuda-11.6/lib64 to /etc/ld.so.conf and run ldconfig as root
To uninstall the CUDA Toolkit, run cuda-uninstaller in /usr/local/cuda-11.6/bin
To uninstall the NVIDIA Driver, run nvidia-uninstall
Logfile is /var/log/cuda-installer.log
"""
# Issue: WARNING: An NVIDIA kernel module 'nvidia' appears to be already loaded in your kernel.
# List the processes that hold the NVIDIA device nodes open so they can be
# stopped before re-running the installer.
sudo lsof /dev/nvidia*
# Install Nvidia Driver - Ubuntu 18.04
# https://developer.nvidia.com/cuda-downloads
# Install the apt pin file for the CUDA repository, import the repository
# signing key, register the repository, then install the `cuda` package.
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin
sudo mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600
sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /"
sudo apt-get update
sudo apt-get -y install cuda
# Install v2ray as you may need :)
# https://github.com/v2fly/v2ray-core/releases/latest
# Download the v4.44.0 release archive (through the fastgit mirror of GitHub),
# unpack it into ./v2ray and copy the two binaries into /usr/local/bin.
wget https://hub.fastgit.xyz/v2fly/v2ray-core/releases/download/v4.44.0/v2ray-linux-64.zip
unzip v2ray-linux-64.zip -d v2ray && sudo cp v2ray/v2ray v2ray/v2ctl /usr/local/bin
# Start v2ray with a config.json from the current directory (you must supply
# this file; it is not created by the steps above).
v2ray -c config.json
# Route HTTP(S) traffic of this shell through the local proxy; port 8001 is
# presumably the HTTP inbound defined in config.json — confirm.
export http_proxy="http://127.0.0.1:8001" https_proxy="http://127.0.0.1:8001"
# Tmux & Tmux configuration
# Build tmux 2.8 from source and install it under /usr/local.
# The download/build steps run inside a subshell so the calling shell's working
# directory is left untouched. Without it the shell ends up inside
# /tmp/tmux-2.8 after that directory has been removed, and later commands that
# need a valid working directory (e.g. git) fail.
sudo apt-get install -y libevent-dev ncurses-dev automake pkg-config \
  && (
    cd /tmp \
      && wget -O tmux-2.8.tar.gz https://github.com/tmux/tmux/archive/2.8.tar.gz \
      && tar zxvf tmux-2.8.tar.gz \
      && cd tmux-2.8 \
      && ./autogen.sh \
      && ./configure --prefix=/usr/local \
      && make \
      && sudo make install
  ) \
  && rm -rf /tmp/tmux-2.8 /tmp/tmux-2.8.tar.gz
# Install the tmux configuration from hughplay/tmux-config, remove the clone,
# then make zsh the default shell for new tmux sessions.
# `command -v` is the portable replacement for `which`, and $(...) replaces
# the legacy backtick substitution.
git clone https://github.com/hughplay/tmux-config.git /tmp/tmux-config \
  && bash /tmp/tmux-config/install.sh \
  && rm -rf /tmp/tmux-config \
  && echo "set -g default-shell $(command -v zsh)" >> ~/.tmux.conf
# ZSH configuration
# Install oh-my-zsh non-interactively.
# -f makes curl fail on HTTP errors so an error page is never piped into bash;
# -sS hides the progress meter but keeps error messages; -L follows redirects.
curl -fsSL https://raw.githubusercontent.com/robbyrussell/oh-my-zsh/master/tools/install.sh | bash -s -- --unattended
# Install the honukai theme and the zsh-autosuggestions plugin, then enable
# both in ~/.zshrc.
# - TLS verification stays enabled for the theme download because zsh sources
#   that file; if it fails with a certificate error, upgrade ca-certificates
#   rather than disabling the check.
# - $HOME is used instead of ~ so the fallback path can be safely quoted.
# - Both substitutions run in ONE sed call so ~/.zshrc.bak keeps the pristine
#   file (a second `sed -i.bak` would overwrite that backup).
# - The addresses are anchored so only the real ZSH_THEME= / plugins= settings
#   are rewritten, not comment lines that mention them.
wget https://raw.githubusercontent.com/oskarkrawczyk/honukai-iterm/master/honukai.zsh-theme -O "${ZSH:-$HOME/.oh-my-zsh}/themes/honukai.zsh-theme" \
  && git clone https://github.com/zsh-users/zsh-autosuggestions "${ZSH:-$HOME/.oh-my-zsh}/custom/plugins/zsh-autosuggestions" \
  && sed -i.bak \
    -e '/^ZSH_THEME=/s/".*"/"honukai"/' \
    -e '/^plugins=/s/(.*)/(git zsh-autosuggestions)/' \
    ~/.zshrc
# ZSH as default shell
chsh -s "$(command -v zsh)" "${USER}"
"""
# Install Miniconda
# Download the installer from the TUNA mirror, install it in batch mode (-b)
# into ~/.miniconda, put conda on PATH for this shell, hook it into the shell
# start-up files, and append the TUNA mirror settings to ~/.condarc.
# NOTE(review): plain `conda init` presumably targets bash only; run
# `conda init zsh` as well if zsh is your login shell — confirm.
# The heredoc delimiter is quoted so the YAML is written literally, and the
# YAML nesting is significant: list items and the custom_channels entries must
# be indented under their parent keys.
wget https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/install-conda.sh \
  && chmod +x /tmp/install-conda.sh \
  && /tmp/install-conda.sh -b -p "$HOME/.miniconda" \
  && rm -f /tmp/install-conda.sh \
  && export PATH="$HOME/.miniconda/bin:$PATH" \
  && conda init \
  && cat <<'EOT' >> ~/.condarc
channels:
  - defaults
show_channel_urls: true
channel_alias: https://mirrors.tuna.tsinghua.edu.cn/anaconda
default_channels:
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/r
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/pro
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/msys2
custom_channels:
  conda-forge: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
  msys2: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
  bioconda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
  menpo: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
  pytorch: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
  simpleitk: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
EOT
"""
# Pin the conda environment's Python to 3.8.
conda install -y python=3.8
# Use the TUNA PyPI mirror as pip's default index.
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
# GPU monitoring: install gpustat and refresh its view every 0.3 s.
pip install gpustat
# NOTE(review): presumably started so GPU queries stay responsive — confirm
# that this nvidia-smi subcommand is still supported by your driver.
sudo nvidia-smi daemon
gpustat -i 0.3
# PyTorch stable wheels mirror: https://mirror.sjtu.edu.cn/pytorch-wheels/torch_stable.html
# Install the CUDA 11.1 builds (+cu111) of torch/torchvision/torchaudio, using
# the SJTU mirror as the wheel index (-f).
pip install torch==1.10.1+cu111 torchvision==0.11.2+cu111 torchaudio==0.10.1+cu111 -f https://mirror.sjtu.edu.cn/pytorch-wheels/torch_stable.html
# Install docker
# Run Docker's convenience install script, then enable and start the service.
# -f makes curl fail on HTTP errors instead of piping an error page into sh;
# -sS hides the progress meter but keeps error messages; -L follows redirects.
curl -fsSL https://get.docker.com | sh \
  && sudo systemctl --now enable docker
# https://docs.docker.com/engine/install/linux-postinstall/
# Add the current user to the docker group so docker can be used without sudo.
sudo usermod -aG docker "$USER"
# relogin or run:
# (newgrp starts a new shell in which the docker group membership is active)
newgrp docker
# test
docker run hello-world
# Install nvidia-docker2
# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html
# Build the distribution id from /etc/os-release ($ID$VERSION_ID), store
# NVIDIA's repository key in a dedicated keyring, and write the
# libnvidia-container apt list with a signed-by clause that points at it.
distribution=$(. /etc/os-release;echo $ID$VERSION_ID) \
&& curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
&& curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | \
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
sudo apt-get update && sudo apt-get install -y nvidia-docker2
# Restart the daemon so it picks up the newly installed package's configuration.
sudo systemctl restart docker
# test
# NOTE(review): the nvidia/cuda:11.0-base tag may no longer be published on
# Docker Hub; if the pull fails, try a current tag such as
# nvidia/cuda:11.0.3-base-ubuntu20.04 — confirm.
sudo docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi
# add proxy for docker
## docker pull through proxy: method 1 (verified)
# Configure the proxy through a systemd drop-in for docker.service.
# -p: do not fail when the drop-in directory already exists.
sudo mkdir -p /etc/systemd/system/docker.service.d
sudo vim /etc/systemd/system/docker.service.d/http-proxy.conf
# Contents of http-proxy.conf (replace the example address with your proxy):
[Service]
Environment="HTTP_PROXY=http://proxy.example.com:8001/"
Environment="HTTPS_PROXY=http://proxy.example.com:8001/"
# Reload unit files so systemd sees the drop-in, then restart the daemon.
sudo systemctl daemon-reload
sudo systemctl restart docker
## docker pull through proxy: method 2
# Configure the proxy in the daemon configuration file (needs a Docker Engine
# release that supports the "proxies" key in daemon.json — 23.0 or newer).
# The daemon reads the keys "http-proxy" / "https-proxy" / "no-proxy"; the
# {"default": {"httpProxy": ...}} layout belongs to the client-side
# ~/.docker/config.json and does not affect `docker pull`.
# "https-proxy" is the proxy used FOR https traffic; the local proxy itself
# speaks plain HTTP, so its URL keeps the http:// scheme.
# -p: /etc/docker usually exists already.
sudo mkdir -p /etc/docker
sudo vim /etc/docker/daemon.json
# Contents of daemon.json — if the file already has settings, merge the
# "proxies" object into the existing top-level object instead of replacing it:
{
  "proxies": {
    "http-proxy": "http://127.0.0.1:8001",
    "https-proxy": "http://127.0.0.1:8001",
    "no-proxy": "localhost,127.0.0.1"
  }
}
sudo service docker restart
# check the disk
lsblk
# Format & Mount disk
# /dev/sdx and <mount_point> are placeholders. mkfs.ext4 creates a new
# filesystem on the device, destroying its current contents — double-check
# the device name first.
sudo mkfs.ext4 /dev/sdx
# Show filesystems with their UUIDs; the UUID is needed for the fstab entry.
lsblk -f
sudo mkdir <mount_point>
sudo vim /etc/fstab
# Line to add to /etc/fstab:
# UUID=<uuid> <mount_point> ext4 defaults 0 0
# Mount everything listed in /etc/fstab (also validates the new entry).
sudo mount -a
# Merge two disks into one logical volume: the disks are striped into a
# RAID 0 array with mdadm, and LVM is layered on top of that array.
# RAID 0 has no redundancy — losing either disk loses the whole volume.
# <vg-name>, <lv-name> and <mount-point> are placeholders.
# Install necessary software packages
sudo apt-get install mdadm lvm2
# Create a RAID array
sudo mdadm --create /dev/md0 --level=0 --raid-devices=2 /dev/nvme1n1 /dev/nvme0n1
# Verify RAID array creation
cat /proc/mdstat
# Initialize LVM on the RAID array
sudo pvcreate /dev/md0
# Create a volume group (VG)
sudo vgcreate <vg-name> /dev/md0
# Create a logical volume (LV) within the volume group
# (-l 100%FREE uses all remaining space of the volume group)
sudo lvcreate -n <lv-name> -l 100%FREE <vg-name>
# Format the logical volume with a file system
sudo mkfs.ext4 /dev/<vg-name>/<lv-name>
# Create a mount point for the logical volume
sudo mkdir /mnt/<mount-point>
# Mount the logical volume
sudo mount /dev/<vg-name>/<lv-name> /mnt/<mount-point>
# Update /etc/fstab to automatically mount the logical volume
sudo nano /etc/fstab
# Add the following line to the end of the file:
# /dev/<vg-name>/<lv-name> /mnt/<mount-point> ext4 defaults 0 0
# Save the changes and exit the text editor
# Reboot the server to apply the changes and verify the mount
sudo reboot
# setup a group shared disk
# <share_grp> and <username> are placeholders.
## create a new group
sudo groupadd <share_grp>
## create a shared folder with group owner as <share_grp>
sudo mkdir -p /data/share
sudo chgrp <share_grp> /data/share
# g+w lets group members create entries: a directory created by root is
# typically mode 755, so without it the test mkdir below is denied.
# g+s (setgid) makes new entries inherit the directory's group.
sudo chmod g+ws /data/share
## add user to the group
sudo usermod -aG <share_grp> <username>
## user test (user with <username>)
# newgrp activates the new group membership without logging out and in again.
newgrp <share_grp>
mkdir /data/share/test
ls -alh /data/share # folder test's owner should be <username>:<share_grp>
# nvidia-smi hangs issue
# Mask the sleep/suspend/hibernate targets so the machine can no longer enter
# those states, then reboot.
sudo systemctl mask sleep.target suspend.target hibernate.target hybrid-sleep.target
# -f forces an immediate reboot; save your work and stop running jobs first.
sudo reboot -f
## server monitoring with netdata
# install netdata
# Download the kickstart script, then run it with automatic updates and
# telemetry disabled, on the stable release channel.
wget -O /tmp/netdata-kickstart.sh https://my-netdata.io/kickstart.sh && sh /tmp/netdata-kickstart.sh --no-updates --stable-channel --disable-telemetry
# change hostname
# Enter the netdata config directory (the location depends on how netdata was
# installed); the ./edit-config commands below are run from there.
cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata
sudo update-alternatives --config editor # select vim if you need
# Setting to change — presumably in netdata.conf, opened with
# `sudo ./edit-config netdata.conf` (the edit command is not shown here):
# [global]
# hostname: xxx
sudo systemctl restart netdata
# enable nvidia-smi
sudo ./edit-config go.d.conf
# Setting to enable in go.d.conf:
# nvidia_smi: yes
sudo systemctl restart netdata
# collect data from multiple servers to the center server
# https://learn.netdata.cloud/docs/streaming/understanding-how-streaming-works#enable-streaming-on-the-parent-node
# (parent)
# Generate an API key on the parent, then edit stream.conf. The bracketed
# section below is file content for the parent's stream.conf, not a command.
uuidgen
sudo ./edit-config stream.conf
# replace [API_KEY] with [<uuidgen result>]
[<uuidgen result>]
enabled = yes
default memory mode = dbengine
# (child)
# File content for the child's stream.conf (open it with
# `sudo ./edit-config stream.conf` on the child), not commands:
[stream]
enabled = yes
destination = <parent IP>
api key = <uuidgen of parent>
# umount busy disks: https://stackoverflow.com/questions/7878707/how-to-unmount-a-busy-device
# repair xfs disk when it hangs when mounting
# WARNING: -L zeroes the XFS journal even if it holds unreplayed metadata
# changes, which can lose recent data. Use it only as a last resort, after a
# normal mount (which replays the log) has failed. /dev/sdb1 is an example
# device; the filesystem must be unmounted.
sudo xfs_repair -L /dev/sdb1
# prevent OOM
# Install the earlyoom package, then enable its service at boot and start it
# right away (--now combines `enable` and `start`).
sudo apt install earlyoom
sudo systemctl enable --now earlyoom