ucx, openmp, openmpi notes
parent
da5f1a33bc
commit
55cbdd2c66
|
@ -0,0 +1,37 @@
|
|||
# XXX Fail
|
||||
mkdir ucx # extra dir, dpkg writes to parent dir
|
||||
cd ucx/
|
||||
git clone https://github.com/openucx/ucx
|
||||
cd ucx/
|
||||
git checkout v1.15.0
|
||||
# XXX cruft
|
||||
cat > debian/rules.in <<EOF
|
||||
%:
|
||||
dh \$@
|
||||
|
||||
override_dh_auto_configure:
|
||||
./contrib/configure-release-mt --prefix=/usr \
|
||||
--enable-examples --with-java=no \
|
||||
--enable-mt --with-rocm=/opt/rocm
|
||||
chmod +x debian/rules
|
||||
|
||||
override_dh_auto_install:
|
||||
dh_auto_install --destdir=debian/tmp
|
||||
|
||||
override_dh_shlibdeps:
|
||||
dh_shlibdeps --dpkg-shlibdeps-params=--ignore-missing-info
|
||||
if [ -e debian/ucx-cuda.substvars ]; then \
|
||||
sed -i -e 's/libnvidia-compute-\([0-9]\+\)/& | libnvidia-ml1/' \
|
||||
debian/ucx-cuda.substvars \
|
||||
; fi
|
||||
EOF
|
||||
|
||||
CC="clang" CXX="clang++" CFLAGS="-Wno-error" ./autogen.sh
|
||||
CC="clang" CXX="clang++" CFLAGS="-Wno-error" ./configure --enable-mt --with-rocm=/opt/rocm
|
||||
CC="clang" CXX="clang++" CFLAGS="-Wno-error" dpkg-buildpackage -rfakeroot -b -uc
|
||||
|
||||
# sudo apt purge --autoremove libucx-dev libucx0 ucx-utils
|
||||
sudo dpkg -i ucx_1.15.348d14f_amd64.deb
|
||||
exit
|
||||
# configure: WARNING: HIP Runtime not found
|
||||
|
|
@ -47,7 +47,8 @@ Install dependencies from Debian repositories.
|
|||
environment-modules python3-numpy pybind11-dev libopengl-dev zip zsh \
|
||||
hpcc gawk googletest libdw-dev libgtest-dev libsigsegv2 \
|
||||
libbabeltrace-dev libbabeltrace1 libbison-dev libncurses5-dev \
|
||||
libtext-unidecode-perl tex-common texinfo
|
||||
libtext-unidecode-perl tex-common texinfo ucx-utils libucx-dev \
|
||||
librdmacm-dev
|
||||
|
||||
# Packages like this aren't used from Debian's repository.
|
||||
# Make sure they are gone.
|
|
@ -16,7 +16,7 @@ tinyrocs is currently working with AMD 7900 XTX GPUs and ``tinygrad``!
|
|||
about
|
||||
hardware
|
||||
firmware
|
||||
os
|
||||
debian
|
||||
kernel
|
||||
toolchain-6.0.2
|
||||
benchmarks
|
||||
|
|
|
@ -0,0 +1,123 @@
|
|||
# SOME DESCRIPTIVE TITLE.
|
||||
# Copyright (C) 2023, 2024 Jeff Moe
|
||||
# This file is distributed under the same license as the tinyrocs: Direct to
|
||||
# Chip Liquid Cooled GPU AI Cluster package.
|
||||
# FIRST AUTHOR <EMAIL@ADDRESS>, 2024.
|
||||
#
|
||||
#, fuzzy
|
||||
msgid ""
|
||||
msgstr ""
|
||||
"Project-Id-Version: tinyrocs: Direct to Chip Liquid Cooled GPU AI Cluster 0\n"
|
||||
"Report-Msgid-Bugs-To: \n"
|
||||
"POT-Creation-Date: 2024-02-07 14:45-0700\n"
|
||||
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||
"Language: en\n"
|
||||
"Language-Team: en <LL@li.org>\n"
|
||||
"Plural-Forms: nplurals=2; plural=(n != 1);\n"
|
||||
"MIME-Version: 1.0\n"
|
||||
"Content-Type: text/plain; charset=utf-8\n"
|
||||
"Content-Transfer-Encoding: 8bit\n"
|
||||
"Generated-By: Babel 2.14.0\n"
|
||||
|
||||
#: ../../../_source/debian.rst:3
|
||||
msgid "Debian Operating System"
|
||||
msgstr ""
|
||||
|
||||
#: ../../../_source/debian.rst:4
|
||||
msgid "The tinyrocs operating system (OS) is based on Debian GNU/Linux."
|
||||
msgstr ""
|
||||
|
||||
#: ../../../_source/debian.rst:8
|
||||
msgid "Debian"
|
||||
msgstr ""
|
||||
|
||||
#: ../../../_source/debian.rst:9
|
||||
msgid "Debian stable (bookworm/12) is the base installation."
|
||||
msgstr ""
|
||||
|
||||
#: ../../../_source/debian.rst:13
|
||||
msgid "Installation"
|
||||
msgstr ""
|
||||
|
||||
#: ../../../_source/debian.rst:14
|
||||
msgid "Do a minimal Debian server installation, no GUI, just ssh."
|
||||
msgstr ""
|
||||
|
||||
#: ../../../_source/debian.rst:18
|
||||
msgid "Dependencies"
|
||||
msgstr ""
|
||||
|
||||
#: ../../../_source/debian.rst:19
|
||||
msgid "Install dependencies from Debian repositories."
|
||||
msgstr ""
|
||||
|
||||
#: ../../../_source/debian.rst:62
|
||||
msgid "OS Configuration"
|
||||
msgstr ""
|
||||
|
||||
#: ../../../_source/debian.rst:63
|
||||
msgid "Operating system configuration."
|
||||
msgstr ""
|
||||
|
||||
#: ../../../_source/debian.rst:79
|
||||
msgid "User Configuration"
|
||||
msgstr ""
|
||||
|
||||
#: ../../../_source/debian.rst:80
|
||||
msgid ""
|
||||
"Set up the user account. Configure to use various caching services already "
|
||||
"available in the cluster."
|
||||
msgstr ""
|
||||
|
||||
#: ../../../_source/debian.rst:85
|
||||
msgid "ccache"
|
||||
msgstr ""
|
||||
|
||||
#: ../../../_source/debian.rst:87
|
||||
msgid ""
|
||||
"There is a ``redis`` ``ccache`` server on the tinyrocs network. Edit ``~/."
|
||||
"config/ccache/ccache.conf`` thusly:"
|
||||
msgstr ""
|
||||
|
||||
#: ../../../_source/debian.rst:98
|
||||
msgid "PATH"
|
||||
msgstr ""
|
||||
|
||||
#: ../../../_source/debian.rst:100
|
||||
msgid "Add the ROCm binary path and ccache (XXX) to ``~/.bashrc``:"
|
||||
msgstr ""
|
||||
|
||||
#: ../../../_source/debian.rst:108
|
||||
msgid "Python pip cache"
|
||||
msgstr ""
|
||||
|
||||
#: ../../../_source/debian.rst:110
|
||||
msgid ""
|
||||
"Set up to use LAN ``pip`` cache ``pydev`` if available, by editing ``~/."
|
||||
"config/pip/pip.conf``, such as:"
|
||||
msgstr ""
|
||||
|
||||
#: ../../../_source/debian.rst:124
|
||||
msgid "Monitoring and Control"
|
||||
msgstr ""
|
||||
|
||||
#: ../../../_source/debian.rst:125
|
||||
msgid "Applications to monitor and control the hardware."
|
||||
msgstr ""
|
||||
|
||||
#: ../../../_source/debian.rst:127
|
||||
msgid "Most require dependencies built first."
|
||||
msgstr ""
|
||||
|
||||
#: ../../../_source/debian.rst:131
|
||||
msgid "nvtop"
|
||||
msgstr ""
|
||||
|
||||
#: ../../../_source/debian.rst:132
|
||||
msgid "``nvtop`` is nice to quickly visualize the GPUs in a text console."
|
||||
msgstr ""
|
||||
|
||||
#: ../../../_source/debian.rst:134
|
||||
msgid "Something like:"
|
||||
msgstr ""
|
|
@ -8,7 +8,7 @@ msgid ""
|
|||
msgstr ""
|
||||
"Project-Id-Version: tinyrocs 0\n"
|
||||
"Report-Msgid-Bugs-To: \n"
|
||||
"POT-Creation-Date: 2024-02-07 13:20-0700\n"
|
||||
"POT-Creation-Date: 2024-02-07 14:44-0700\n"
|
||||
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||
"Language: en\n"
|
||||
|
@ -347,3 +347,30 @@ msgstr ""
|
|||
#: ../../../_source/toolchain-6.0.2.rst:270
|
||||
msgid "``hipBLAS`` plz."
|
||||
msgstr ""
|
||||
|
||||
#: ../../../_source/toolchain-6.0.2.rst:287
|
||||
msgid "OpenMP"
|
||||
msgstr ""
|
||||
|
||||
#: ../../../_source/toolchain-6.0.2.rst:288
|
||||
msgid ""
|
||||
"OpenMP can be built as a part of LLVM, but it fails in a first pass build. "
|
||||
"It can (perhaps) be built by rebuilding LLVM with LLVM."
|
||||
msgstr ""
|
||||
|
||||
#: ../../../_source/toolchain-6.0.2.rst:291
|
||||
msgid "OpenMP repos, check rebuild."
|
||||
msgstr ""
|
||||
|
||||
#: ../../../_source/toolchain-6.0.2.rst:295
|
||||
msgid "OpenMPI"
|
||||
msgstr ""
|
||||
|
||||
#: ../../../_source/toolchain-6.0.2.rst:296
|
||||
msgid ""
|
||||
"OpenMPI, not to be confused with OpenMP. They can be used independently of "
|
||||
"each other, or used together. Briefly, OpenMP parallelizes across one "
|
||||
"machine (GPUs/CPUs), and OpenMPI parallelizes across multiple machines "
|
||||
"(network). Both rebuilt for ROCm is ideal. Then applications need to be "
|
||||
"built against them."
|
||||
msgstr ""
|
||||
|
|
|
@ -273,6 +273,39 @@ hipBLAS
|
|||
:language: bash
|
||||
|
||||
|
||||
.. UCX
|
||||
.. ---
|
||||
.. UCX is in Debian, but it doesn't support ROCm for ``gfx1100``,
|
||||
.. so it needs re-building.
|
||||
.. OpenMPI needs UCX.
|
||||
..
|
||||
.. .. literalinclude:: _static/toolchain/rocm-6.0.2/build-ucx.sh
|
||||
.. :language: bash
|
||||
|
||||
|
||||
OpenMP
|
||||
------
|
||||
OpenMP can be built as a part of LLVM, but it fails in a first pass build.
|
||||
It can (perhaps) be built by rebuilding LLVM with LLVM.
|
||||
|
||||
OpenMP repos, check rebuild.
|
||||
|
||||
|
||||
OpenMPI
|
||||
-------
|
||||
OpenMPI, not to be confused with OpenMP.
|
||||
They can be used independently of each other, or used together.
|
||||
Briefly, OpenMP parallelizes across one machine (GPUs/CPUs),
|
||||
and OpenMPI parallelizes across multiple machines (network).
|
||||
Both rebuilt for ROCm is ideal. Then applications need to be
|
||||
built against them.
|
||||
|
||||
|
||||
.. rccl
|
||||
.. ----
|
||||
.. rccl, TODO.
|
||||
|
||||
|
||||
.. aomp
|
||||
.. ----
|
||||
.. ``aomp``.
|
||||
|
|
Loading…
Reference in New Issue