%
% LLM.tex
%
% AI Documentation
%
% Copyright (C) 2022, 2023, Jeff Moe
%
% This document is licensed under the Creative Commons
% Attribution-ShareAlike 4.0 International Public License (CC BY-SA 4.0) by Jeff Moe.
%
\section{Introduction}
Large Language Model, aka LLM. Some notable projects and resources:
\begin{mdframed}[backgroundcolor=blue!10,linecolor=blue!30]
\begin{itemize}
\item BLOOM --- \url{https://huggingface.co/bigscience/bloom}
\url{https://bigscience.huggingface.co/blog/bloom}
\item PolyGlot --- \url{https://github.com/EleutherAI/polyglot}
\item Maxtext --- \url{https://github.com/EleutherAI/maxtext}
\item H2O.ai --- \url{https://h2o.ai/}
\end{itemize}
\end{mdframed}
\section{LoLLMS}
LoLLMS WebUI is a web GUI for LLMs.
\begin{minted}{sh}
mkdir -p ~/devel/ParisNeo
cd ~/devel/ParisNeo
git clone --recursive https://github.com/ParisNeo/lollms-webui
cd lollms-webui/
pyenv local 3.11.6
# it expects the virtualenv directory to be named `env`
virtualenv -p 3.11.6 env
source env/bin/activate
pip install -U setuptools wheel pip
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
~/devel/pytorch/verify.py
pip install -r requirements.txt
pip install exllamav2
bash webui.sh
# * When script prompts, put dir here:
# `~/devel/ParisNeo/lollms`
#
# * In `~/devel/ParisNeo/lollms/config` set IP to 0.0.0.0.
#
# * Open firewall.
\end{minted}
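If the \verb|verify.py| helper referenced above is not at hand, a quick
inline check does roughly the same job (it only confirms that torch was
built with CUDA and can see a device):
\begin{minted}{sh}
# Run inside the activated env from above.
python -c 'import torch; print(torch.__version__, torch.version.cuda, torch.cuda.is_available())'
python -c 'import torch; print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "no CUDA device")'
\end{minted}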
\section{Llama}
The llama is a very nice creature. It is also an LLM.
\section{llama.cpp}
llama.cpp is a widely used C/C++ inference engine for LLMs.
\begin{minted}{sh}
# Dependencies; not all are strictly required, and some are useful elsewhere too. XXX
sudo apt install libgloo-dev gds-tools libcufile-dev nvidia-fs-dkms libclblast-dev clblast-utils
\end{minted}
\begin{minted}{sh}
mkdir -p ~/devel/ggerganov
cd ~/devel/ggerganov
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp/
mkdir build
cd build/
cmake \
-DCMAKE_AR=/usr/bin/gcc-ar-11 \
-DCMAKE_CXX_COMPILER=/usr/lib/ccache/g++-11 \
-DCMAKE_CXX_COMPILER_AR=/usr/bin/gcc-ar-11 \
-DCMAKE_CXX_COMPILER_RANLIB=/usr/bin/gcc-ranlib-11 \
-DCMAKE_C_COMPILER=/usr/lib/ccache/gcc-11 \
-DCMAKE_C_COMPILER_AR=/usr/bin/gcc-ar-11 \
-DCMAKE_C_COMPILER_RANLIB=/usr/bin/gcc-ranlib-11 \
-DCUDAToolkit_INCLUDE_DIR=/usr/include \
-DCUDA_cuFile_LIBRARY=/usr/lib/x86_64-linux-gnu/libcufile.so.0 \
-DCUDA_cuFile_rdma_LIBRARY=/usr/lib/x86_64-linux-gnu/libcufile_rdma.so.1 \
-DLLAMA_MPI=ON \
-DLLAMA_NATIVE=OFF \
-DLLAMA_CUBLAS=ON \
-DLLAMA_AVX=ON \
..
# FAIL with the AVX-512 options below on a
# 13th Gen Intel(R) Core(TM) i9-13900KS:
# /proc/cpuinfo shows avx, avx2, avx_vnni, f16c, and fma, but no avx512*
# flags (AVX-512 is disabled on consumer Raptor Lake), so leave these off
# here; they can be enabled on CPUs that do have AVX-512:
#   -DLLAMA_AVX512=ON \
#   -DLLAMA_AVX512_VBMI=ON \
#   -DLLAMA_AVX512_VNNI=ON \
# Maybe also:
#   -DLLAMA_F16C=ON -DLLAMA_FMA=ON -DLLAMA_LTO=ON -DLLAMA_QKK_64=ON
# Then build
make -j`nproc`
\end{minted}
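Once built, a quick smoke test. This is a sketch: the GGUF model path is a
placeholder for whatever model you have converted or downloaded, and
\verb|-ngl| only matters for the CUBLAS build:
\begin{minted}{sh}
cd ~/devel/ggerganov/llama.cpp/build/
# -m: GGUF model (placeholder path), -p: prompt, -n: tokens to generate,
# -ngl: layers to offload to the GPU.
./bin/main -m ~/models/example-model.Q4_K_M.gguf -p "Hello" -n 128 -ngl 35
\end{minted}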
\section{ollama}
% XXX mv
ollama runs LLMs locally with a simple CLI and a model registry.
\begin{minted}{sh}
# misc
apt update
apt -y install git git-lfs cmake vim-tiny cmake-curses-gui libncurses-dev
# nvtop
git clone https://github.com/Syllo/nvtop
cd nvtop
mkdir build
cd build
cmake ..
ccmake ..
make -j`nproc`
./src/nvtop
# go
wget https://go.dev/dl/go1.21.2.linux-amd64.tar.gz
#sudo rm -rf /usr/local/go && sudo tar -C /usr/local -xzf go1.21.2.linux-amd64.tar.gz
rm -rf /usr/local/go && tar -C /usr/local -xzf go1.21.2.linux-amd64.tar.gz
echo 'export PATH=/usr/local/go/bin:$PATH' >> ~/.bashrc
# $ XXX formatting
# Build
# Maybe for ppc64le, but doesn't work:
export CMAKE_C_COMPILER=/usr/lib/ccache/gcc-11
export CMAKE_CXX_COMPILER=/usr/lib/ccache/g++-11
export CUDAToolkit_INCLUDE_DIR=/usr/include
# Total crap that does work (on morvolta):
cd /usr/lib
ln -s /usr/include .
git clone --recursive https://github.com/jmorganca/ollama
cd ollama/
go clean
go generate ./...
go build .
# Run
# set up podrun dirs:
mkdir /workspace/ollama
ln -s /workspace/ollama .ollama
./ollama serve
# Then, in another terminal:
./ollama pull wizardcoder:34b-python
# ollama pull wizardcoder:7b-python
# ollama pull wizardcoder:13b-python
# ollama pull wizardcoder:34b-python
# ollama pull wizardlm:70b-llama2-q4_0
./ollama run wizardcoder:34b-python
ollama pull wizardcoder:34b-python
ollama pull wizardcoder:34b-python-q2_K
ollama pull wizardcoder:34b-python-q3_K_L
ollama pull wizardcoder:34b-python-q3_K_M
ollama pull wizardcoder:34b-python-q3_K_S
ollama pull wizardcoder:34b-python-q4_0
ollama pull wizardcoder:34b-python-q4_1
ollama pull wizardcoder:34b-python-q4_K_M
ollama pull wizardcoder:34b-python-q4_K_S
ollama pull wizardcoder:34b-python-q5_0
ollama pull wizardcoder:34b-python-q5_1
ollama pull wizardcoder:34b-python-q5_K_M
ollama pull wizardcoder:34b-python-q5_K_S
ollama pull wizardcoder:34b-python-q6_K
ollama pull wizardcoder:34b-python-q8_0
ollama run wizardcoder:34b-python
ollama run wizardcoder:34b-python-q2_K
ollama run wizardcoder:34b-python-q3_K_L
ollama run wizardcoder:34b-python-q3_K_M
ollama run wizardcoder:34b-python-q3_K_S
ollama run wizardcoder:34b-python-q4_0
ollama run wizardcoder:34b-python-q4_1
ollama run wizardcoder:34b-python-q4_K_M
ollama run wizardcoder:34b-python-q4_K_S
ollama run wizardcoder:34b-python-q5_0
ollama run wizardcoder:34b-python-q5_1
ollama run wizardcoder:34b-python-q5_K_M
ollama run wizardcoder:34b-python-q5_K_S
ollama run wizardcoder:34b-python-q6_K
ollama run wizardcoder:34b-python-q8_0
ollama pull orca-mini
ollama list
NAME ID SIZE MODIFIED
codellama:34b-code d78387764871 19 GB 3 days ago
codellama:34b-code-q8_0 2b6690e05081 36 GB 4 days ago
codellama:34b-instruct-q8_0 bf3f9995e93b 36 GB 3 days ago
codellama:34b-python 5ede7f0ac6c4 19 GB 3 days ago
codellama:34b-python-q8_0 73563ada07a6 36 GB 4 days ago
everythinglm:13b-16k-q8_0 9ef6e6d7446f 14 GB 4 days ago
falcon:180b 90d369418a4f 102 GB 3 days ago
falcon:180b-chat-q4_0 90d369418a4f 102 GB 3 days ago
falcon:180b-text-q4_0 aca441c3e642 102 GB 3 days ago
falcon:180b-text-q8_0 ad08feb8304f 191 GB 3 days ago
falcon:40b 2d9a4bfc8555 24 GB 3 days ago
falcon:40b-instruct 2d9a4bfc8555 24 GB 3 days ago
falcon:40b-instruct-fp16 7cbd92dfea70 84 GB 3 days ago
falcon:40b-text 77ecf2f4218a 24 GB 3 days ago
falcon:40b-text-fp16 c42691ec8c94 84 GB 3 days ago
llama2:70b f60ae38a353b 39 GB 3 days ago
mistral:7b-instruct-q8_0 f97a185cacf4 7.7 GB 4 days ago
mistral:7b-text-q8_0 e99f29355cda 7.7 GB 4 days ago
nexusraven:13b 336957c1d527 7.4 GB 3 days ago
nexusraven:13b-q8_0 25fba36ef0af 14 GB 3 days ago
orca-mini:13b-v3 5bc199b18569 7.4 GB 43 hours ago
orca-mini:70b-v3 179d05395377 39 GB 43 hours ago
orca-mini:7b-v3 de4ca17ad6a7 3.8 GB 44 hours ago
orca-mini:latest 5e9bc249c869 1.9 GB 4 days ago
phind-codellama:34b-python 4f719aca701f 19 GB 3 days ago
phind-codellama:34b-python-q8_0 9e8121c0c614 36 GB 3 days ago
phind-codellama:34b-v2 e2b45b77c8bf 19 GB 4 days ago
phind-codellama:34b-v2-q2_K fa4f2f18144d 14 GB 3 days ago
phind-codellama:34b-v2-q4_0 e2b45b77c8bf 19 GB 3 days ago
phind-codellama:34b-v2-q4_1 8870ba145794 21 GB 3 days ago
phind-codellama:34b-v2-q4_K_S 9c3bbb7e9ad4 19 GB 3 days ago
phind-codellama:34b-v2-q6_K b20c5fb7a66d 28 GB 3 days ago
phind-codellama:34b-v2-q8_0 1f6f3dca7bbc 36 GB 3 days ago
sqlcoder:15b c06a24fb83df 9.0 GB 3 days ago
sqlcoder:15b-fp16 c5e9fd3852f9 32 GB 3 days ago
sqlcoder:15b-q8_0 7cab292fc701 17 GB 3 days ago
starcoder:15b-base-q8_0 dbe41581594a 17 GB 4 days ago
vicuna:33b 1a7295496c4f 18 GB 3 days ago
vicuna:33b-q8_0 3c52edc13a77 35 GB 3 days ago
wizard-math:70b ec65b71e5de1 39 GB 3 days ago
wizard-math:70b-q5_K_M c14e6fd7faea 49 GB 3 days ago
wizardcoder:13b-python 767adb2cd4cc 7.3 GB 5 days ago
wizardcoder:34b-python 7bd03d2d0b8d 19 GB 8 days ago
wizardcoder:34b-python-q2_K 36a635134a0a 14 GB 5 days ago
wizardcoder:34b-python-q6_K 0c1c3994fb50 28 GB 5 days ago
wizardcoder:34b-python-q8_0 86d30a8b017c 36 GB 4 days ago
wizardcoder:7b-python ae6c60a3e5ce 3.8 GB 4 days ago
\end{minted}
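Besides the CLI, the server exposes a small REST API (port 11434 by
default). A minimal generation request, assuming \verb|ollama serve| is
running and the model has already been pulled:
\begin{minted}{sh}
# Ask the local ollama server for a completion (streams JSON lines back).
curl http://localhost:11434/api/generate -d '{
  "model": "wizardcoder:34b-python",
  "prompt": "Write a python function that reverses a string."
}'
\end{minted}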
\section{Axolotl}
Best beast ever. Axolotl is a tool for fine-tuning LLMs (e.g. LoRA and QLoRA adapters).
\begin{minted}{sh}
mkdir -p ~/devel/OpenAccess-AI-Collective
cd ~/devel/OpenAccess-AI-Collective/
git clone --recursive https://github.com/OpenAccess-AI-Collective/axolotl
cd axolotl/
pyenv local 3.10
virtualenv -p 3.10 venv
source venv/bin/activate
pip install -U setuptools pip wheel
pip install torch==2.0.1+cu118 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
pip3 install packaging
pip3 install -e '.[flash-attn,deepspeed]'
pip3 install -U git+https://github.com/huggingface/peft.git
accelerate launch -m axolotl.cli.train examples/openllama-3b/lora.yml
accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml --lora_model_dir="./lora-out"
\end{minted}
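After training, the LoRA adapter in \verb|./lora-out| can be merged back
into the base model. A sketch using axolotl's bundled merge helper with the
same example config (check the axolotl docs if the module path has changed):
\begin{minted}{sh}
# Merge the trained LoRA weights into the base model.
python3 -m axolotl.cli.merge_lora examples/openllama-3b/lora.yml --lora_model_dir="./lora-out"
\end{minted}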
\section{VLLM}
Install.
\begin{minted}{sh}
mkdir -p ~/devel/vllm-project
cd ~/devel/vllm-project
git clone https://github.com/vllm-project/vllm
cd vllm/
pyenv local 3.11.6
virtualenv -p 3.11.6 venv
source venv/bin/activate
pip install -U setuptools wheel pip
pip install torch==2.0.1+cu118 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
pip install -r requirements.txt
pip install -e .
\end{minted}
Run server.
\begin{minted}{sh}
python -m \
vllm.entrypoints.openai.api_server \
--model lmsys/vicuna-7b-v1.3 \
--host 0.0.0.0 \
--port 8080
\end{minted}
Other models are available, such as:
\begin{minted}{sh}
--model facebook/opt-125m \
--model lmsys/vicuna-7b-v1.3 \
--model tiiuae/falcon-40b \
--model tiiuae/falcon-7b \
--model lmsys/vicuna-13b-v1.3 \
--model openlm-research/open_llama_13b \
--model mistralai/Mistral-7B-v0.1 \
--model mistralai/Mistral-7B-Instruct-v0.1 \
--model bigcode/starcoder \
--model WizardLM/WizardCoder-15B-V1.0 \
--model bigscience/bloom \
--model OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5 \
--model meta-llama/Llama-2-70b-hf \
\end{minted}
List models.
\begin{minted}{sh}
curl http://localhost:8080/v1/models
\end{minted}
A simple query, such as:
\begin{minted}{sh}
#!/bin/bash
curl http://localhost:8080/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "lmsys/vicuna-7b-v1.3",
"prompt": "Write python code please.",
"max_tokens": 7,
"temperature": 0
}'
\end{minted}
Python scriptlet example:
\begin{minted}{python}
#!/usr/bin/env python3
from vllm import LLM
prompts = ["Write some code.", "Rewrite it in rust."]
llm = LLM(model="lmsys/vicuna-7b-v1.3")
outputs = llm.generate(prompts)
print("OUTPUTS:")
print(outputs)
\end{minted}
\section{oobabooga}
oobabooga's text-generation-webui is a web GUI for LLMs.
Its bundled startup script uses Conda, so meh; the manual setup below avoids it.
\begin{minted}{sh}
mkdir -p ~/devel/oobabooga
cd ~/devel/oobabooga
git clone --recursive https://github.com/oobabooga/text-generation-webui
cd text-generation-webui/
# XXX
export CMAKE_C_COMPILER=/usr/lib/ccache/gcc-11
export CMAKE_CXX_COMPILER=/usr/lib/ccache/g++-11
export CUDAToolkit_INCLUDE_DIR=/usr/include
export CUDACXX=/usr/lib/ccache/nvcc
export GCC=/usr/lib/ccache/gcc-11
export GXX=/usr/lib/ccache/g++-11
export CMAKE_CUDA_COMPILER=/usr/lib/ccache/nvcc
# XXX
pyenv local 3.10
virtualenv -p 3.10 env
source env/bin/activate
pip install -U setuptools wheel pip
pip install py-cpuinfo==9.0.0
pip install torch==2.0.1+cu118 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
#pip install -r requirements.txt
# Perhaps
pip install -r requirements_nowheels.txt
# Other versions available, such as requirements_amd.txt
# Maybe
pip install exllama
# Might need GCC 11 (12 is default in Debian bookworm)
# Get models:
python download-model.py
# Run server:
python server.py --help
python server.py --listen
# Open port 7860 on firewall
\end{minted}
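A sketch of a typical session; the model name is just an example
(\texttt{download-model.py} takes a Hugging Face \texttt{org/model}
argument), and flags should be checked against \verb|python server.py --help|:
\begin{minted}{sh}
# Download an example model from Hugging Face.
python download-model.py facebook/opt-1.3b
# Serve on all interfaces; pick the model in the web UI on port 7860.
python server.py --listen
\end{minted}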
\section{TabbyML}
TabbyML code completion, self-hosted.
All this crap phones home by default, including TabbyML.
Firewall your vscode host! :)
\begin{minted}{sh}
# Add to ~/.bashrc to limit some telemetry:
export TABBY_DISABLE_USAGE_COLLECTION=1
\end{minted}
Building tabby requires a more recent version of cmake, but it's available
in backports:
\begin{minted}{sh}
apt install cmake -t bookworm-backports
# Also need deps:
apt install protobuf-compiler
\end{minted}
Build etc:
\begin{minted}{sh}
mkdir -p ~/devel/TabbyML/
cd ~/devel/TabbyML/
git clone --recursive https://github.com/TabbyML/tabby
cd tabby/
# Assumes you have latest rust installed...
cargo update
cargo build
# Perhaps put the thing somewhere
sudo cp -p ./target/debug/tabby /usr/local/bin/
\end{minted}
Then once it is built, get models.
But there's some screwiness: the tabby config dir can end up owned by
root (!!! XXX), in which case downloads fail. Fix the ownership, meh:
\begin{minted}{sh}
sudo chown -R $USER:$USER ~/.tabby
\end{minted}
Tabby can download models from Huggingface (the default, USA) or
Modelscope (China).
Take your pick of none, or one of the below, and perhaps add it to \verb|~/.bashrc|:
\begin{minted}{sh}
export TABBY_REGISTRY="huggingface"
export TABBY_REGISTRY="modelscope"
\end{minted}
Download models thusly:
\begin{minted}{sh}
tabby download --model TabbyML/Codegen-2B
tabby download --model TabbyML/Codegen2-4B
tabby download --model TabbyML/Codegen25-7B
tabby download --model TabbyML/CodeLlama-7B
tabby download --model TabbyML/CodeLlama-13B
tabby download --model TabbyML/J-1B
tabby download --model TabbyML/J-350M
tabby download --model TabbyML/Mistral-7B
tabby download --model TabbyML/NeoX-70M
tabby download --model TabbyML/NeoX-1.3B
tabby download --model TabbyML/Phi-1_5B
tabby download --model TabbyML/SantaCoder-1B
tabby download --model TabbyML/StableCode-3B
tabby download --model TabbyML/StarCoder-1B
tabby download --model TabbyML/StarCoder-3B
tabby download --model TabbyML/StarCoder-7B
tabby download --model TabbyML/T5P-220M
tabby download --model TabbyML/Vicuna-7B
tabby download --model TabbyML/Vicuna-13B
tabby download --model TabbyML/WizardCoder-1B
tabby download --model TabbyML/WizardCoder-3B
tabby download --model TabbyML/WizardCoder-15B
\end{minted}
Models get stored in \verb|~/.tabby/models|.
Completion models include: CodeLlama, StarCoder.
Chat models include: Mistral, WizardCoder.
Run the server, such as:
\begin{minted}{sh}
# GPU
tabby serve --model TabbyML/StarCoder-7B --chat-model TabbyML/WizardCoder-3B --device cuda --device-indices 0
# CPU
tabby serve --model TabbyML/SantaCoder-1B --chat-model TabbyML/WizardCoder-1B
\end{minted}
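If the server comes up (it listens on port 8080 by default), a quick smoke
test; the \texttt{/v1/health} route is an assumption about this tabby
version, so check the startup log or its Swagger UI for the exact paths:
\begin{minted}{sh}
# Assumed health-check route; adjust if your tabby version differs.
curl http://localhost:8080/v1/health
\end{minted}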
Meh, with \verb|--device cuda| it may not come up at all; the bundled
CTranslate2 needs to be rebuilt with CUDA support (see the next section):
\begin{minted}{sh}
what(): This CTranslate2 package was not compiled with CUDA support
\end{minted}
\section{CTranslate2}
Perhaps something like this to get CUDA support built in. Note this doesn't
include cuDNN though, due to the need to figure out licensing.
\begin{minted}{sh}
mkdir -p ~/devel/OpenNMT
cd ~/devel/OpenNMT/
git clone --recursive https://github.com/OpenNMT/CTranslate2
cd CTranslate2/
mkdir build
cd build/
cmake \
-DCUDA_HOST_COMPILER=/usr/lib/ccache/gcc-11 \
-DCMAKE_AR=/usr/bin/gcc-ar-11 \
-DCMAKE_CXX_COMPILER=/usr/lib/ccache/g++-11 \
-DCMAKE_CXX_COMPILER_AR=/usr/bin/gcc-ar-11 \
-DCMAKE_CXX_COMPILER_RANLIB=/usr/bin/gcc-ranlib-11 \
-DCMAKE_C_COMPILER=/usr/lib/ccache/gcc-11 \
-DCMAKE_C_COMPILER_AR=/usr/bin/gcc-ar-11 \
-DCMAKE_C_COMPILER_RANLIB=/usr/bin/gcc-ranlib-11 \
-DWITH_CUDA=ON \
-DWITH_MKL=OFF \
-DOPENMP_RUNTIME=NONE \
..
make -j`nproc`
sudo make install
sudo ldconfig
\end{minted}
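With the CUDA-enabled library installed system-wide, rebuild tabby so it
links against it. A sketch only: whether a plain rebuild actually picks up
the system library depends on how tabby's build vendors CTranslate2, so
treat this as a starting point:
\begin{minted}{sh}
cd ~/devel/TabbyML/tabby/
# Force a clean rebuild so the new libctranslate2 gets linked.
cargo clean
cargo build
sudo cp -p ./target/debug/tabby /usr/local/bin/
\end{minted}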
\section{vscodium}
Using an IDE with an AI assistant.
\subsection{Continue}
The ``Continue'' extension in vscodium sometimes works.
If Google is firewalled, the extension won't load, even using
a locally hosted server.
\subsection{TabbyML}
This half works for a self-hosted server.
\subsection{Huggingface}
An option for running a self-hosted server:
\begin{minted}{sh}
mkdir -p ~/devel/LucienShui/
cd ~/devel/LucienShui/
git clone https://github.com/LucienShui/huggingface-vscode-endpoint-server
cd huggingface-vscode-endpoint-server
virtualenv venv
source venv/bin/activate
pip install -U setuptools wheel pip
# It wants an old torch.
pip install torch==1.13.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116
pip install -r requirements.txt
python main.py --port 8080 --host 0.0.0.0 --pretrained "Phind/Phind-CodeLlama-34B-v2"
\end{minted}
\subsection{llm-vscode Inference Server}
Perhaps another option.
Uses vLLM (the vllm-project from above) under the hood.
\begin{minted}{sh}
mkdir -p ~/devel/wangcx18
cd ~/devel/wangcx18/
git clone --recursive https://github.com/wangcx18/llm-vscode-inference-server
cd llm-vscode-inference-server/
virtualenv venv
source venv/bin/activate
pip install -U pip setuptools wheel
#pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
pip install torch torchvision torchaudio
pip install -r requirements.txt
python api_server.py --host 0.0.0.0 --port 8080
\end{minted}
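Both endpoint servers above are meant to be pointed at from the editor
extension's endpoint setting. Route names differ between them; assuming
they are FastAPI-based (both appear to be), the auto-generated API docs
list the exact paths:
\begin{minted}{sh}
# FastAPI serves an OpenAPI schema and interactive docs by default.
curl http://localhost:8080/openapi.json
# or browse to http://<host>:8080/docs
\end{minted}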