%
% LLM.tex
%
% AI Documentation
%
% Copyright (C) 2022, 2023, Jeff Moe
%
% This document is licensed under the Creative Commons
% Attribution-ShareAlike 4.0 International Public License (CC BY-SA 4.0) by Jeff Moe.
%
\section{Introduction}
Large Language Model, aka LLM. Some notable projects and resources:
\begin{mdframed}[backgroundcolor=blue!10,linecolor=blue!30]
\begin{itemize}
\item BLOOM --- \url{https://huggingface.co/bigscience/bloom}
\url{https://bigscience.huggingface.co/blog/bloom}
\item PolyGlot --- \url{https://github.com/EleutherAI/polyglot}
\item Maxtext --- \url{https://github.com/EleutherAI/maxtext}
\item H2O.ai --- \url{https://h2o.ai/}
\end{itemize}
\end{mdframed}
\section{LoLLMS}
LoLLMS WebUI is a web GUI for LLMs.
\begin{minted}{sh}
mkdir -p ~/devel/ParisNeo
cd ~/devel/ParisNeo
git clone --recursive https://github.com/ParisNeo/lollms-webui
cd lollms-webui/
pyenv local 3.11.6
# it expects the virtualenv directory to be named `env`
virtualenv -p 3.11.6 env
source env/bin/activate
pip install -U setuptools wheel pip
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
~/devel/pytorch/verify.py
pip install -r requirements.txt
pip install exllamav2
bash webui.sh
# * When script prompts, put dir here:
# `~/devel/ParisNeo/lollms`
#
# * In `~/devel/ParisNeo/lollms/config` set IP to 0.0.0.0.
#
# * Open firewall.
\end{minted}
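If the \verb|verify.py| helper referenced above is not at hand, a quick
inline check does roughly the same job (it only confirms that torch was
built with CUDA and can see a device):
\begin{minted}{sh}
# Run inside the activated env from above.
python -c 'import torch; print(torch.__version__, torch.version.cuda, torch.cuda.is_available())'
python -c 'import torch; print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "no CUDA device")'
\end{minted}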
\section{Llama}
The llama is a very nice creature. It is also an LLM.
\section{llama.cpp}
llama.cpp is a widely used C/C++ inference engine for LLMs.
\begin{minted}{sh}
# Dependencies; not all are strictly required, and some are useful elsewhere too. XXX
sudo apt install libgloo-dev gds-tools libcufile-dev nvidia-fs-dkms libclblast-dev clblast-utils
\end{minted}
\begin{minted}{sh}
mkdir -p ~/devel/ggerganov
cd ~/devel/ggerganov
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp/
mkdir build
cd build/
cmake \
-DCMAKE_AR=/usr/bin/gcc-ar-11 \
-DCMAKE_CXX_COMPILER=/usr/lib/ccache/g++-11 \
-DCMAKE_CXX_COMPILER_AR=/usr/bin/gcc-ar-11 \
-DCMAKE_CXX_COMPILER_RANLIB=/usr/bin/gcc-ranlib-11 \
-DCMAKE_C_COMPILER=/usr/lib/ccache/gcc-11 \
-DCMAKE_C_COMPILER_AR=/usr/bin/gcc-ar-11 \
-DCMAKE_C_COMPILER_RANLIB=/usr/bin/gcc-ranlib-11 \
-DCUDAToolkit_INCLUDE_DIR=/usr/include \
-DCUDA_cuFile_LIBRARY=/usr/lib/x86_64-linux-gnu/libcufile.so.0 \
-DCUDA_cuFile_rdma_LIBRARY=/usr/lib/x86_64-linux-gnu/libcufile_rdma.so.1 \
-DLLAMA_MPI=ON \
-DLLAMA_NATIVE=OFF \
-DLLAMA_CUBLAS=ON \
-DLLAMA_AVX=ON \
..
# FAIL with the AVX-512 options below on a
# 13th Gen Intel(R) Core(TM) i9-13900KS:
# /proc/cpuinfo shows avx, avx2, avx_vnni, f16c, and fma, but no avx512*
# flags (AVX-512 is disabled on consumer Raptor Lake), so leave these off
# here; they can be enabled on CPUs that do have AVX-512:
#   -DLLAMA_AVX512=ON \
#   -DLLAMA_AVX512_VBMI=ON \
#   -DLLAMA_AVX512_VNNI=ON \
# Maybe also:
#   -DLLAMA_F16C=ON -DLLAMA_FMA=ON -DLLAMA_LTO=ON -DLLAMA_QKK_64=ON
# Then build
make -j`nproc`
\end{minted}
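Once built, a quick smoke test. This is a sketch: the GGUF model path is a
placeholder for whatever model you have converted or downloaded, and
\verb|-ngl| only matters for the CUBLAS build:
\begin{minted}{sh}
cd ~/devel/ggerganov/llama.cpp/build/
# -m: GGUF model (placeholder path), -p: prompt, -n: tokens to generate,
# -ngl: layers to offload to the GPU.
./bin/main -m ~/models/example-model.Q4_K_M.gguf -p "Hello" -n 128 -ngl 35
\end{minted}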
\section{ollama}
% XXX mv
ollama runs LLMs locally with a simple CLI and a model registry.
\begin{minted}{sh}
# misc
apt update
apt -y install git git-lfs cmake vim-tiny cmake-curses-gui libncurses-dev
# nvtop
git clone https://github.com/Syllo/nvtop
cd nvtop
mkdir build
cd build
cmake ..
ccmake ..
make -j`nproc`
./src/nvtop
# go
wget https://go.dev/dl/go1.21.2.linux-amd64.tar.gz
#sudo rm -rf /usr/local/go && sudo tar -C /usr/local -xzf go1.21.2.linux-amd64.tar.gz
rm -rf /usr/local/go && tar -C /usr/local -xzf go1.21.2.linux-amd64.tar.gz
echo 'export PATH=/usr/local/go/bin:$PATH' >> ~/.bashrc
# $ XXX formatting
# Build
# Maybe for ppc64le, but doesn't work:
export CMAKE_C_COMPILER=/usr/lib/ccache/gcc-11
export CMAKE_CXX_COMPILER=/usr/lib/ccache/g++-11
export CUDAToolkit_INCLUDE_DIR=/usr/include
# Total crap that does work (on morvolta):
cd /usr/lib
ln -s /usr/include .
git clone --recursive https://github.com/jmorganca/ollama
cd ollama/
go clean
go generate ./...
go build .
# Run
# set up podrun dirs:
mkdir /workspace/ollama
ln -s /workspace/ollama .ollama
./ollama serve
# Then, in another terminal:
./ollama pull wizardcoder:34b-python
# ollama pull wizardcoder:7b-python
# ollama pull wizardcoder:13b-python
# ollama pull wizardcoder:34b-python
# ollama pull wizardlm:70b-llama2-q4_0
./ollama run wizardcoder:34b-python
ollama pull wizardcoder:34b-python
ollama pull wizardcoder:34b-python-q2_K
ollama pull wizardcoder:34b-python-q3_K_L
ollama pull wizardcoder:34b-python-q3_K_M
ollama pull wizardcoder:34b-python-q3_K_S
ollama pull wizardcoder:34b-python-q4_0
ollama pull wizardcoder:34b-python-q4_1
ollama pull wizardcoder:34b-python-q4_K_M
ollama pull wizardcoder:34b-python-q4_K_S
ollama pull wizardcoder:34b-python-q5_0
ollama pull wizardcoder:34b-python-q5_1
ollama pull wizardcoder:34b-python-q5_K_M
ollama pull wizardcoder:34b-python-q5_K_S
ollama pull wizardcoder:34b-python-q6_K
ollama pull wizardcoder:34b-python-q8_0
ollama run wizardcoder:34b-python
ollama run wizardcoder:34b-python-q2_K
ollama run wizardcoder:34b-python-q3_K_L
ollama run wizardcoder:34b-python-q3_K_M
ollama run wizardcoder:34b-python-q3_K_S
ollama run wizardcoder:34b-python-q4_0
ollama run wizardcoder:34b-python-q4_1
ollama run wizardcoder:34b-python-q4_K_M
ollama run wizardcoder:34b-python-q4_K_S
ollama run wizardcoder:34b-python-q5_0
ollama run wizardcoder:34b-python-q5_1
ollama run wizardcoder:34b-python-q5_K_M
ollama run wizardcoder:34b-python-q5_K_S
ollama run wizardcoder:34b-python-q6_K
ollama run wizardcoder:34b-python-q8_0
ollama pull orca-mini
ollama list
NAME ID SIZE MODIFIED
codellama:34b-code d78387764871 19 GB 3 days ago
codellama:34b-code-q8_0 2b6690e05081 36 GB 4 days ago
codellama:34b-instruct-q8_0 bf3f9995e93b 36 GB 3 days ago
codellama:34b-python 5ede7f0ac6c4 19 GB 3 days ago
codellama:34b-python-q8_0 73563ada07a6 36 GB 4 days ago
everythinglm:13b-16k-q8_0 9ef6e6d7446f 14 GB 4 days ago
falcon:180b 90d369418a4f 102 GB 3 days ago
falcon:180b-chat-q4_0 90d369418a4f 102 GB 3 days ago
falcon:180b-text-q4_0 aca441c3e642 102 GB 3 days ago
falcon:180b-text-q8_0 ad08feb8304f 191 GB 3 days ago
falcon:40b 2d9a4bfc8555 24 GB 3 days ago
falcon:40b-instruct 2d9a4bfc8555 24 GB 3 days ago
falcon:40b-instruct-fp16 7cbd92dfea70 84 GB 3 days ago
falcon:40b-text 77ecf2f4218a 24 GB 3 days ago
falcon:40b-text-fp16 c42691ec8c94 84 GB 3 days ago
llama2:70b f60ae38a353b 39 GB 3 days ago
mistral:7b-instruct-q8_0 f97a185cacf4 7.7 GB 4 days ago
mistral:7b-text-q8_0 e99f29355cda 7.7 GB 4 days ago
nexusraven:13b 336957c1d527 7.4 GB 3 days ago
nexusraven:13b-q8_0 25fba36ef0af 14 GB 3 days ago
orca-mini:13b-v3 5bc199b18569 7.4 GB 43 hours ago
orca-mini:70b-v3 179d05395377 39 GB 43 hours ago
orca-mini:7b-v3 de4ca17ad6a7 3.8 GB 44 hours ago
orca-mini:latest 5e9bc249c869 1.9 GB 4 days ago
phind-codellama:34b-python 4f719aca701f 19 GB 3 days ago
phind-codellama:34b-python-q8_0 9e8121c0c614 36 GB 3 days ago
phind-codellama:34b-v2 e2b45b77c8bf 19 GB 4 days ago
phind-codellama:34b-v2-q2_K fa4f2f18144d 14 GB 3 days ago
phind-codellama:34b-v2-q4_0 e2b45b77c8bf 19 GB 3 days ago
phind-codellama:34b-v2-q4_1 8870ba145794 21 GB 3 days ago
phind-codellama:34b-v2-q4_K_S 9c3bbb7e9ad4 19 GB 3 days ago
phind-codellama:34b-v2-q6_K b20c5fb7a66d 28 GB 3 days ago
phind-codellama:34b-v2-q8_0 1f6f3dca7bbc 36 GB 3 days ago
sqlcoder:15b c06a24fb83df 9.0 GB 3 days ago
sqlcoder:15b-fp16 c5e9fd3852f9 32 GB 3 days ago
sqlcoder:15b-q8_0 7cab292fc701 17 GB 3 days ago
starcoder:15b-base-q8_0 dbe41581594a 17 GB 4 days ago
vicuna:33b 1a7295496c4f 18 GB 3 days ago
vicuna:33b-q8_0 3c52edc13a77 35 GB 3 days ago
wizard-math:70b ec65b71e5de1 39 GB 3 days ago
wizard-math:70b-q5_K_M c14e6fd7faea 49 GB 3 days ago
wizardcoder:13b-python 767adb2cd4cc 7.3 GB 5 days ago
wizardcoder:34b-python 7bd03d2d0b8d 19 GB 8 days ago
wizardcoder:34b-python-q2_K 36a635134a0a 14 GB 5 days ago
wizardcoder:34b-python-q6_K 0c1c3994fb50 28 GB 5 days ago
wizardcoder:34b-python-q8_0 86d30a8b017c 36 GB 4 days ago
wizardcoder:7b-python ae6c60a3e5ce 3.8 GB 4 days ago
\end{minted}
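Besides the CLI, the server exposes a small REST API (port 11434 by
default). A minimal generation request, assuming \verb|ollama serve| is
running and the model has already been pulled:
\begin{minted}{sh}
# Ask the local ollama server for a completion (streams JSON lines back).
curl http://localhost:11434/api/generate -d '{
  "model": "wizardcoder:34b-python",
  "prompt": "Write a python function that reverses a string."
}'
\end{minted}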
\section{Axolotl}
Best beast ever. Axolotl is a tool for fine-tuning LLMs (e.g. LoRA and QLoRA adapters).
\begin{minted}{sh}
mkdir -p ~/devel/OpenAccess-AI-Collective
cd ~/devel/OpenAccess-AI-Collective/
git clone --recursive https://github.com/OpenAccess-AI-Collective/axolotl
cd axolotl/
pyenv local 3.10
virtualenv -p 3.10 venv
source venv/bin/activate
pip install -U setuptools pip wheel
pip install torch==2.0.1+cu118 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
pip3 install packaging
pip3 install -e '.[flash-attn,deepspeed]'
pip3 install -U git+https://github.com/huggingface/peft.git
accelerate launch -m axolotl.cli.train examples/openllama-3b/lora.yml
accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml --lora_model_dir="./lora-out"
\end{minted}
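After training, the LoRA adapter in \verb|./lora-out| can be merged back
into the base model. A sketch using axolotl's bundled merge helper with the
same example config (check the axolotl docs if the module path has changed):
\begin{minted}{sh}
# Merge the trained LoRA weights into the base model.
python3 -m axolotl.cli.merge_lora examples/openllama-3b/lora.yml --lora_model_dir="./lora-out"
\end{minted}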
\section{VLLM}
Install.
\begin{minted}{sh}
mkdir -p ~/devel/vllm-project
cd ~/devel/vllm-project
git clone https://github.com/vllm-project/vllm
cd vllm/
pyenv local 3.11.6
virtualenv -p 3.11.6 venv
source venv/bin/activate
pip install -U setuptools wheel pip
pip install torch==2.0.1+cu118 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
pip install -r requirements.txt
pip install -e .
\end{minted}
Run server.
\begin{minted}{sh}
python -m \
vllm.entrypoints.openai.api_server \
--model lmsys/vicuna-7b-v1.3 \
--host 0.0.0.0 \
--port 8080
\end{minted}
Other models are available, such as:
\begin{minted}{sh}
--model facebook/opt-125m \
--model lmsys/vicuna-7b-v1.3 \
--model tiiuae/falcon-40b \
--model tiiuae/falcon-7b \
--model lmsys/vicuna-13b-v1.3 \
--model openlm-research/open_llama_13b \
--model mistralai/Mistral-7B-v0.1 \
--model mistralai/Mistral-7B-Instruct-v0.1 \
--model bigcode/starcoder \
--model WizardLM/WizardCoder-15B-V1.0 \
--model bigscience/bloom \
--model OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5 \
--model meta-llama/Llama-2-70b-hf \
\end{minted}
List models.
\begin{minted}{sh}
curl http://localhost:8080/v1/models
\end{minted}
A simple query, such as:
\begin{minted}{sh}
#!/bin/bash
curl http://localhost:8080/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "lmsys/vicuna-7b-v1.3",
"prompt": "Write python code please.",
"max_tokens": 7,
"temperature": 0
}'
\end{minted}
Python scriptlet example:
\begin{minted}{python}
#!/usr/bin/env python3
from vllm import LLM
prompts = ["Write some code.", "Rewrite it in rust."]
llm = LLM(model="lmsys/vicuna-7b-v1.3")
outputs = llm.generate(prompts)
print("OUTPUTS:")
print(outputs)
\end{minted}
\section{oobabooga}
oobabooga's text-generation-webui is a web GUI for LLMs.
Its bundled startup script uses Conda, so meh; the manual setup below avoids it.
\begin{minted}{sh}
mkdir -p ~/devel/oobabooga
cd ~/devel/oobabooga
git clone --recursive https://github.com/oobabooga/text-generation-webui
cd text-generation-webui/
# XXX
export CMAKE_C_COMPILER=/usr/lib/ccache/gcc-11
export CMAKE_CXX_COMPILER=/usr/lib/ccache/g++-11
export CUDAToolkit_INCLUDE_DIR=/usr/include
export CUDACXX=/usr/lib/ccache/nvcc
export GCC=/usr/lib/ccache/gcc-11
export GXX=/usr/lib/ccache/g++-11
export CMAKE_CUDA_COMPILER=/usr/lib/ccache/nvcc
# XXX
pyenv local 3.10
virtualenv -p 3.10 env
source env/bin/activate
pip install -U setuptools wheel pip
pip install py-cpuinfo==9.0.0
pip install torch==2.0.1+cu118 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
#pip install -r requirements.txt
# Perhaps
pip install -r requirements_nowheels.txt
# Other versions available, such as requirements_amd.txt
# Maybe
pip install exllama
# Might need GCC 11 (12 is default in Debian bookworm)
# Get models:
python download-model.py
# Run server:
python server.py --help
python server.py --listen
# Open port 7860 on firewall
\end{minted}
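A sketch of a typical session; the model name is just an example
(\texttt{download-model.py} takes a Hugging Face \texttt{org/model}
argument), and flags should be checked against \verb|python server.py --help|:
\begin{minted}{sh}
# Download an example model from Hugging Face.
python download-model.py facebook/opt-1.3b
# Serve on all interfaces; pick the model in the web UI on port 7860.
python server.py --listen
\end{minted}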
\section{TabbyML}
TabbyML code completion, self-hosted.
All this crap phones home by default, including TabbyML.
Firewall your vscode host! :)
\begin{minted}{sh}
# Add to ~/.bashrc to limit some telemetry:
export TABBY_DISABLE_USAGE_COLLECTION=1
\end{minted}
Building tabby requires a more recent version of cmake, but it's available
in backports:
\begin{minted}{sh}
apt install cmake -t bookworm-backports
# Also need deps:
apt install protobuf-compiler
\end{minted}
Build etc:
\begin{minted}{sh}
mkdir -p ~/devel/TabbyML/
cd ~/devel/TabbyML/
git clone --recursive https://github.com/TabbyML/tabby
cd tabby/
# Assumes you have latest rust installed...
cargo update
cargo build
# Perhaps put the thing somewhere
sudo cp -p ./target/debug/tabby /usr/local/bin/
\end{minted}
Then once it is built, get models.
But there's some screwiness: the tabby config dir can end up owned by
root (!!! XXX), in which case downloads fail. Fix the ownership, meh:
\begin{minted}{sh}
sudo chown -R $USER:$USER ~/.tabby
\end{minted}
Tabby can download models from Huggingface (the default, USA) or
Modelscope (China).
Take your pick of none, or one of the below, and perhaps add it to \verb|~/.bashrc|:
\begin{minted}{sh}
export TABBY_REGISTRY="huggingface"
export TABBY_REGISTRY="modelscope"
\end{minted}
Download models thusly:
\begin{minted}{sh}
tabby download --model TabbyML/Codegen-2B
tabby download --model TabbyML/Codegen2-4B
tabby download --model TabbyML/Codegen25-7B
tabby download --model TabbyML/CodeLlama-7B
tabby download --model TabbyML/CodeLlama-13B
tabby download --model TabbyML/J-1B
tabby download --model TabbyML/J-350M
tabby download --model TabbyML/Mistral-7B
tabby download --model TabbyML/NeoX-70M
tabby download --model TabbyML/NeoX-1.3B
tabby download --model TabbyML/Phi-1_5B
tabby download --model TabbyML/SantaCoder-1B
tabby download --model TabbyML/StableCode-3B
tabby download --model TabbyML/StarCoder-1B
tabby download --model TabbyML/StarCoder-3B
tabby download --model TabbyML/StarCoder-7B
tabby download --model TabbyML/T5P-220M
tabby download --model TabbyML/Vicuna-7B
tabby download --model TabbyML/Vicuna-13B
tabby download --model TabbyML/WizardCoder-1B
tabby download --model TabbyML/WizardCoder-3B
tabby download --model TabbyML/WizardCoder-15B
\end{minted}
Models get stored in \verb|~/.tabby/models|.
Completion models include: CodeLlama, StarCoder.
Chat models include: Mistral, WizardCoder.
Run the server, such as:
\begin{minted}{sh}
# GPU
tabby serve --model TabbyML/StarCoder-7B --chat-model TabbyML/WizardCoder-3B --device cuda --device-indices 0
# CPU
tabby serve --model TabbyML/SantaCoder-1B --chat-model TabbyML/WizardCoder-1B
\end{minted}
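If the server comes up (it listens on port 8080 by default), a quick smoke
test; the \texttt{/v1/health} route is an assumption about this tabby
version, so check the startup log or its Swagger UI for the exact paths:
\begin{minted}{sh}
# Assumed health-check route; adjust if your tabby version differs.
curl http://localhost:8080/v1/health
\end{minted}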
Meh, with \verb|--device cuda| it may not come up at all; the bundled
CTranslate2 needs to be rebuilt with CUDA support (see the next section):
\begin{minted}{sh}
what(): This CTranslate2 package was not compiled with CUDA support
\end{minted}
\section{CTranslate2}
Perhaps something like this to get CUDA support built in. Note this doesn't
include cuDNN though, due to the need to figure out licensing.
\begin{minted}{sh}
mkdir -p ~/devel/OpenNMT
cd ~/devel/OpenNMT/
git clone --recursive https://github.com/OpenNMT/CTranslate2
cd CTranslate2/
mkdir build
cd build/
cmake \
-DCUDA_HOST_COMPILER=/usr/lib/ccache/gcc-11 \
-DCMAKE_AR=/usr/bin/gcc-ar-11 \
-DCMAKE_CXX_COMPILER=/usr/lib/ccache/g++-11 \
-DCMAKE_CXX_COMPILER_AR=/usr/bin/gcc-ar-11 \
-DCMAKE_CXX_COMPILER_RANLIB=/usr/bin/gcc-ranlib-11 \
-DCMAKE_C_COMPILER=/usr/lib/ccache/gcc-11 \
-DCMAKE_C_COMPILER_AR=/usr/bin/gcc-ar-11 \
-DCMAKE_C_COMPILER_RANLIB=/usr/bin/gcc-ranlib-11 \
-DWITH_CUDA=ON \
-DWITH_MKL=OFF \
-DOPENMP_RUNTIME=NONE \
..
make -j`nproc`
sudo make install
sudo ldconfig
\end{minted}
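With the CUDA-enabled library installed system-wide, rebuild tabby so it
links against it. A sketch only: whether a plain rebuild actually picks up
the system library depends on how tabby's build vendors CTranslate2, so
treat this as a starting point:
\begin{minted}{sh}
cd ~/devel/TabbyML/tabby/
# Force a clean rebuild so the new libctranslate2 gets linked.
cargo clean
cargo build
sudo cp -p ./target/debug/tabby /usr/local/bin/
\end{minted}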
\section{vscodium}
Using an IDE with an AI assistant.
\subsection{Continue}
The ``Continue'' extension in vscodium sometimes works.
If Google is firewalled, the extension won't load, even using
a locally hosted server.
\subsection{TabbyML}
This half works for a self-hosted server.
\subsection{Huggingface}
An option for running a self-hosted server:
\begin{minted}{sh}
mkdir -p ~/devel/LucienShui/
cd ~/devel/LucienShui/
git clone https://github.com/LucienShui/huggingface-vscode-endpoint-server
cd huggingface-vscode-endpoint-server
virtualenv venv
source venv/bin/activate
pip install -U setuptools wheel pip
# It wants an old torch.
pip install torch==1.13.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116
pip install -r requirements.txt
python main.py --port 8080 --host 0.0.0.0 --pretrained "Phind/Phind-CodeLlama-34B-v2"
\end{minted}
\subsection{llm-vscode Inference Server}
Perhaps another option.
Uses vLLM (the vllm-project from above) under the hood.
\begin{minted}{sh}
mkdir -p ~/devel/wangcx18
cd ~/devel/wangcx18/
git clone --recursive https://github.com/wangcx18/llm-vscode-inference-server
cd llm-vscode-inference-server/
virtualenv venv
source venv/bin/activate
pip install -U pip setuptools wheel
#pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
pip install torch torchvision torchaudio
pip install -r requirements.txt
python api_server.py --host 0.0.0.0 --port 8080
\end{minted}
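Both endpoint servers above are meant to be pointed at from the editor
extension's endpoint setting. Route names differ between them; assuming
they are FastAPI-based (both appear to be), the auto-generated API docs
list the exact paths:
\begin{minted}{sh}
# FastAPI serves an OpenAPI schema and interactive docs by default.
curl http://localhost:8080/openapi.json
# or browse to http://<host>:8080/docs
\end{minted}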