561 lines
18 KiB
TeX
561 lines
18 KiB
TeX
%
|
|
% LLM.tex
|
|
%
|
|
% AI Documentation
|
|
%
|
|
% Copyright (C) 2022, 2023, Jeff Moe
|
|
%
|
|
% This document is licensed under the Creative Commons Attribution 4.0
|
|
% International Public License (CC BY-SA 4.0) by Jeff Moe.
|
|
%
|
|
|
|
\section{Introduction}
|
|
Large Language Model, aka LLM.
|
|
|
|
\begin{mdframed}[backgroundcolor=blue!10,linecolor=blue!30]
|
|
\begin{itemize}
|
|
\item BLOOM --- \url{https://huggingface.co/bigscience/bloom}
|
|
\url{https://bigscience.huggingface.co/blog/bloom}
|
|
\item PolyGlot --- \url{https://github.com/EleutherAI/polyglot}
|
|
\item Maxtext --- \url{https://github.com/EleutherAI/maxtext}
|
|
\item H2O.ai --- \url{https://h2o.ai/}
|
|
\end{itemize}
|
|
\end{mdframed}
|
|
|
|
|
|
\section{LoLLMS}
|
|
Web GUI for LLMs.
|
|
|
|
\begin{minted}{sh}
|
|
mkdir -p ~/devel/ParisNeo
|
|
cd ~/devel/ParisNeo
|
|
git clone --recursive https://github.com/ParisNeo/lollms-webui
|
|
cd lollms-webui/
|
|
pyenv local 3.11.6
|
|
# It likes the virtualenv to be named env
|
|
virtualenv -p 3.11.6 env
|
|
source env/bin/activate
|
|
pip install -U setuptools wheel pip
|
|
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
|
~/devel/pytorch/verify.py
|
|
pip install -r requirements.txt
|
|
pip install exllamav2
|
|
bash webui.sh
|
|
|
|
# * When script prompts, put dir here:
|
|
# `~/devel/ParisNeo/lollms`
|
|
#
|
|
# * In `~/devel/ParisNeo/lollms/config` set IP to 0.0.0.0.
|
|
#
|
|
# * Open firewall.
|
|
\end{minted}
|
|
|
|
|
|
\section{Llama}
|
|
Llama very nice creature. Is also LLM.
|
|
|
|
|
|
\section{llama.cpp}
|
|
Widely used.
|
|
|
|
\begin{minted}{sh}
|
|
# deps, probably useful elsewhere too... Not all needed... XXX
|
|
sudo apt install libgloo-dev gds-tools libcufile-dev nvidia-fs-dkms libclblast-dev clblast-utils
|
|
\end{minted}
|
|
|
|
\begin{minted}{sh}
|
|
mkdir -p ~/devel/ggerganov
|
|
cd ~/devel/ggerganov
|
|
git clone https://github.com/ggerganov/llama.cpp
|
|
cd llama.cpp/
mkdir build
cd build
|
|
|
|
cmake \
|
|
-DCMAKE_AR=/usr/bin/gcc-ar-11 \
|
|
-DCMAKE_CXX_COMPILER=/usr/lib/ccache/g++-11 \
|
|
-DCMAKE_CXX_COMPILER_AR=/usr/bin/gcc-ar-11 \
|
|
-DCMAKE_CXX_COMPILER_RANLIB=/usr/bin/gcc-ranlib-11 \
|
|
-DCMAKE_C_COMPILER=/usr/lib/ccache/gcc-11 \
|
|
-DCMAKE_C_COMPILER_AR=/usr/bin/gcc-ar-11 \
|
|
-DCMAKE_C_COMPILER_RANLIB=/usr/bin/gcc-ranlib-11 \
|
|
-DCUDAToolkit_INCLUDE_DIR=/usr/include \
|
|
-DCUDA_cuFile_LIBRARY=/usr/lib/x86_64-linux-gnu/libcufile.so.0 \
|
|
-DCUDA_cuFile_rdma_LIBRARY=/usr/lib/x86_64-linux-gnu/libcufile_rdma.so.1 \
|
|
-DLLAMA_MPI=ON \
|
|
-DLLAMA_NATIVE=OFF \
|
|
-DLLAMA_CUBLAS=ON \
|
|
-DLLAMA_AVX=ON \
|
|
..
|
|
|
|
# FAIL
|
|
# 13th Gen Intel(R) Core(TM) i9-13900KS
|
|
# flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault ssbd ibrs ibpb stibp ibrs_enhanced tpr_shadow flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid rdseed adx smap clflushopt clwb intel_pt sha_ni xsaveopt xsavec xgetbv1 xsaves split_lock_detect avx_vnni dtherm ida arat pln pts hwp hwp_notify hwp_act_window hwp_epp hwp_pkg_req hfi vnmi umip pku ospke waitpkg gfni vaes vpclmulqdq tme rdpid movdiri movdir64b fsrm md_clear serialize pconfig arch_lbr ibt flush_l1d arch_capabilities
|
|
-DLLAMA_AVX512=ON \
|
|
-DLLAMA_AVX512_VBMI=ON \
|
|
-DLLAMA_AVX512_VNNI=ON \
|
|
|
|
# maybe
|
|
-DLLAMA_F16C=ON -DLLAMA_FMA=ON -DLLAMA_LTO=ON -DLLAMA_QKK_64=ON
|
|
|
|
|
|
# Then build
|
|
make -j`nproc`
|
|
\end{minted}
|
|
|
|
|
|
\section{ollama}
|
|
% XXX mv
|
|
\begin{minted}{sh}
|
|
# misc
|
|
apt update
|
|
apt -y install git git-lfs cmake vim-tiny cmake-curses-gui libncurses-dev
|
|
|
|
# nvtop
|
|
git clone https://github.com/Syllo/nvtop
|
|
cd nvtop
|
|
mkdir build
|
|
cd build
|
|
cmake ..
|
|
ccmake ..
|
|
make -j`nproc`
|
|
./src/nvtop
|
|
|
|
|
|
# go
|
|
wget https://go.dev/dl/go1.21.2.linux-amd64.tar.gz
|
|
#sudo rm -rf /usr/local/go && sudo tar -C /usr/local -xzf go1.21.2.linux-amd64.tar.gz
|
|
rm -rf /usr/local/go && tar -C /usr/local -xzf go1.21.2.linux-amd64.tar.gz
|
|
|
|
echo 'export PATH=/usr/local/go/bin:$PATH' >> ~/.bashrc
|
|
# $ XXX formatting
|
|
|
|
# Build
|
|
|
|
# Maybe for ppc64le, but doesn't work:
|
|
export CMAKE_C_COMPILER=/usr/lib/ccache/gcc-11
|
|
export CMAKE_CXX_COMPILER=/usr/lib/ccache/g++-11
|
|
export CUDAToolkit_INCLUDE_DIR=/usr/include
|
|
|
|
# Total crap that does work (on morvolta):
|
|
cd /usr/lib
|
|
ln -s /usr/include .
|
|
|
|
git clone --recursive https://github.com/jmorganca/ollama
|
|
cd ollama/
|
|
go clean
|
|
go generate ./...
|
|
go build .
|
|
|
|
# Run
|
|
|
|
# set up podrun dirs:
|
|
mkdir /workspace/ollama
|
|
ln -s /workspace/ollama .ollama
|
|
|
|
./ollama serve
|
|
|
|
./ollama pull wizardcoder:34b-python
|
|
|
|
# ollama pull wizardcoder:7b-python
|
|
# ollama pull wizardcoder:13b-python
|
|
# ollama pull wizardcoder:34b-python
|
|
# ollama pull wizardlm:70b-llama2-q4_0
|
|
|
|
./ollama run wizardcoder:34b-python
|
|
|
|
ollama pull wizardcoder:34b-python
|
|
ollama pull wizardcoder:34b-python-q2_K
|
|
ollama pull wizardcoder:34b-python-q3_K_L
|
|
ollama pull wizardcoder:34b-python-q3_K_M
|
|
ollama pull wizardcoder:34b-python-q3_K_S
|
|
ollama pull wizardcoder:34b-python-q4_0
|
|
ollama pull wizardcoder:34b-python-q4_1
|
|
ollama pull wizardcoder:34b-python-q4_K_M
|
|
ollama pull wizardcoder:34b-python-q4_K_S
|
|
ollama pull wizardcoder:34b-python-q5_0
|
|
ollama pull wizardcoder:34b-python-q5_1
|
|
ollama pull wizardcoder:34b-python-q5_K_M
|
|
ollama pull wizardcoder:34b-python-q5_K_S
|
|
ollama pull wizardcoder:34b-python-q6_K
|
|
ollama pull wizardcoder:34b-python-q8_0
|
|
|
|
ollama run wizardcoder:34b-python
|
|
ollama run wizardcoder:34b-python-q2_K
|
|
ollama run wizardcoder:34b-python-q3_K_L
|
|
ollama run wizardcoder:34b-python-q3_K_M
|
|
ollama run wizardcoder:34b-python-q3_K_S
|
|
ollama run wizardcoder:34b-python-q4_0
|
|
ollama run wizardcoder:34b-python-q4_1
|
|
ollama run wizardcoder:34b-python-q4_K_M
|
|
ollama run wizardcoder:34b-python-q4_K_S
|
|
ollama run wizardcoder:34b-python-q5_0
|
|
ollama run wizardcoder:34b-python-q5_1
|
|
ollama run wizardcoder:34b-python-q5_K_M
|
|
ollama run wizardcoder:34b-python-q5_K_S
|
|
ollama run wizardcoder:34b-python-q6_K
|
|
ollama run wizardcoder:34b-python-q8_0
|
|
ollama pull orca-mini
|
|
|
|
ollama list
|
|
NAME ID SIZE MODIFIED
|
|
codellama:34b-code d78387764871 19 GB 3 days ago
|
|
codellama:34b-code-q8_0 2b6690e05081 36 GB 4 days ago
|
|
codellama:34b-instruct-q8_0 bf3f9995e93b 36 GB 3 days ago
|
|
codellama:34b-python 5ede7f0ac6c4 19 GB 3 days ago
|
|
codellama:34b-python-q8_0 73563ada07a6 36 GB 4 days ago
|
|
everythinglm:13b-16k-q8_0 9ef6e6d7446f 14 GB 4 days ago
|
|
falcon:180b 90d369418a4f 102 GB 3 days ago
|
|
falcon:180b-chat-q4_0 90d369418a4f 102 GB 3 days ago
|
|
falcon:180b-text-q4_0 aca441c3e642 102 GB 3 days ago
|
|
falcon:180b-text-q8_0 ad08feb8304f 191 GB 3 days ago
|
|
falcon:40b 2d9a4bfc8555 24 GB 3 days ago
|
|
falcon:40b-instruct 2d9a4bfc8555 24 GB 3 days ago
|
|
falcon:40b-instruct-fp16 7cbd92dfea70 84 GB 3 days ago
|
|
falcon:40b-text 77ecf2f4218a 24 GB 3 days ago
|
|
falcon:40b-text-fp16 c42691ec8c94 84 GB 3 days ago
|
|
llama2:70b f60ae38a353b 39 GB 3 days ago
|
|
mistral:7b-instruct-q8_0 f97a185cacf4 7.7 GB 4 days ago
|
|
mistral:7b-text-q8_0 e99f29355cda 7.7 GB 4 days ago
|
|
nexusraven:13b 336957c1d527 7.4 GB 3 days ago
|
|
nexusraven:13b-q8_0 25fba36ef0af 14 GB 3 days ago
|
|
orca-mini:13b-v3 5bc199b18569 7.4 GB 43 hours ago
|
|
orca-mini:70b-v3 179d05395377 39 GB 43 hours ago
|
|
orca-mini:7b-v3 de4ca17ad6a7 3.8 GB 44 hours ago
|
|
orca-mini:latest 5e9bc249c869 1.9 GB 4 days ago
|
|
phind-codellama:34b-python 4f719aca701f 19 GB 3 days ago
|
|
phind-codellama:34b-python-q8_0 9e8121c0c614 36 GB 3 days ago
|
|
phind-codellama:34b-v2 e2b45b77c8bf 19 GB 4 days ago
|
|
phind-codellama:34b-v2-q2_K fa4f2f18144d 14 GB 3 days ago
|
|
phind-codellama:34b-v2-q4_0 e2b45b77c8bf 19 GB 3 days ago
|
|
phind-codellama:34b-v2-q4_1 8870ba145794 21 GB 3 days ago
|
|
phind-codellama:34b-v2-q4_K_S 9c3bbb7e9ad4 19 GB 3 days ago
|
|
phind-codellama:34b-v2-q6_K b20c5fb7a66d 28 GB 3 days ago
|
|
phind-codellama:34b-v2-q8_0 1f6f3dca7bbc 36 GB 3 days ago
|
|
sqlcoder:15b c06a24fb83df 9.0 GB 3 days ago
|
|
sqlcoder:15b-fp16 c5e9fd3852f9 32 GB 3 days ago
|
|
sqlcoder:15b-q8_0 7cab292fc701 17 GB 3 days ago
|
|
starcoder:15b-base-q8_0 dbe41581594a 17 GB 4 days ago
|
|
vicuna:33b 1a7295496c4f 18 GB 3 days ago
|
|
vicuna:33b-q8_0 3c52edc13a77 35 GB 3 days ago
|
|
wizard-math:70b ec65b71e5de1 39 GB 3 days ago
|
|
wizard-math:70b-q5_K_M c14e6fd7faea 49 GB 3 days ago
|
|
wizardcoder:13b-python 767adb2cd4cc 7.3 GB 5 days ago
|
|
wizardcoder:34b-python 7bd03d2d0b8d 19 GB 8 days ago
|
|
wizardcoder:34b-python-q2_K 36a635134a0a 14 GB 5 days ago
|
|
wizardcoder:34b-python-q6_K 0c1c3994fb50 28 GB 5 days ago
|
|
wizardcoder:34b-python-q8_0 86d30a8b017c 36 GB 4 days ago
|
|
wizardcoder:7b-python ae6c60a3e5ce 3.8 GB 4 days ago
|
|
\end{minted}
|
|
|
|
\section{Axolotl}
|
|
Best beast ever.
|
|
|
|
\begin{minted}{sh}
|
|
mkdir -p ~/devel/OpenAccess-AI-Collective
|
|
cd ~/devel/OpenAccess-AI-Collective/
|
|
git clone --recursive https://github.com/OpenAccess-AI-Collective/axolotl
|
|
cd axolotl/
|
|
pyenv local 3.10
|
|
virtualenv -p 3.10 venv
|
|
source venv/bin/activate
|
|
pip install -U setuptools pip wheel
|
|
pip install torch==2.0.1+cu118 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
|
pip3 install packaging
|
|
pip3 install -e '.[flash-attn,deepspeed]'
|
|
pip3 install -U git+https://github.com/huggingface/peft.git
|
|
accelerate launch -m axolotl.cli.train examples/openllama-3b/lora.yml
|
|
accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml --lora_model_dir="./lora-out"
|
|
\end{minted}
|
|
|
|
|
|
\section{VLLM}
|
|
Install.
|
|
|
|
\begin{minted}{sh}
|
|
mkdir -p ~/devel/vllm-project
|
|
cd ~/devel/vllm-project
|
|
git clone https://github.com/vllm-project/vllm
|
|
cd vllm/
|
|
pyenv local 3.11.6
|
|
virtualenv -p 3.11.6 venv
|
|
source venv/bin/activate
|
|
pip install -U setuptools wheel pip
|
|
pip install torch==2.0.1+cu118 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
|
pip install -r requirements.txt
|
|
pip install -e .
|
|
\end{minted}
|
|
|
|
Run server.
|
|
|
|
\begin{minted}{sh}
|
|
python -m \
|
|
vllm.entrypoints.openai.api_server \
|
|
--model lmsys/vicuna-7b-v1.3 \
|
|
--host 0.0.0.0 \
|
|
--port 8080
|
|
\end{minted}
|
|
|
|
Other models available, such as:
|
|
|
|
\begin{minted}{sh}
|
|
--model facebook/opt-125m \
|
|
--model lmsys/vicuna-7b-v1.3 \
|
|
--model tiiuae/falcon-40b \
|
|
--model tiiuae/falcon-7b \
|
|
--model lmsys/vicuna-13b-v1.3 \
|
|
--model openlm-research/open_llama_13b \
|
|
--model mistralai/Mistral-7B-v0.1 \
|
|
--model mistralai/Mistral-7B-Instruct-v0.1 \
|
|
--model bigcode/starcoder \
|
|
--model WizardLM/WizardCoder-15B-V1.0 \
|
|
--model bigscience/bloom \
|
|
--model OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5 \
|
|
--model meta-llama/Llama-2-70b-hf \
|
|
\end{minted}
|
|
|
|
List models.
|
|
|
|
\begin{minted}{sh}
|
|
curl http://localhost:8080/v1/models
|
|
\end{minted}
|
|
|
|
Simple query, such as:
|
|
|
|
\begin{minted}{sh}
|
|
#!/bin/bash
|
|
|
|
curl http://localhost:8080/v1/completions \
|
|
-H "Content-Type: application/json" \
|
|
-d '{
|
|
"model": "lmsys/vicuna-7b-v1.3",
|
|
"prompt": "Write python code please.",
|
|
"max_tokens": 7,
|
|
"temperature": 0
|
|
}'
|
|
\end{minted}
|
|
|
|
Python scriptlet example:
|
|
|
|
\begin{minted}{python}
|
|
#!/usr/bin/env python3
|
|
|
|
from vllm import LLM
|
|
|
|
prompts = ["Write some code.", "Rewrite it in rust."]
|
|
llm = LLM(model="lmsys/vicuna-7b-v1.3")
|
|
outputs = llm.generate(prompts)
|
|
print("OUTPUTS:")
|
|
print(outputs)
|
|
\end{minted}
|
|
|
|
|
|
\section{oobabooga}
|
|
oobabooga text GUI for LLM.
|
|
Its startup script uses Conda, so meh.
|
|
|
|
\begin{minted}{sh}
|
|
mkdir -p ~/devel/oobabooga
|
|
cd ~/devel/oobabooga
|
|
git clone --recursive https://github.com/oobabooga/text-generation-webui
|
|
cd text-generation-webui/
|
|
# XXX
|
|
export CMAKE_C_COMPILER=/usr/lib/ccache/gcc-11
|
|
export CMAKE_CXX_COMPILER=/usr/lib/ccache/g++-11
|
|
export CUDAToolkit_INCLUDE_DIR=/usr/include
|
|
export CUDACXX=/usr/lib/ccache/nvcc
|
|
export GCC=/usr/lib/ccache/gcc-11
|
|
export GXX=/usr/lib/ccache/g++-11
|
|
export CMAKE_CUDA_COMPILER=/usr/lib/ccache/nvcc
|
|
# XXX
|
|
|
|
|
|
pyenv local 3.10
|
|
virtualenv -p 3.10 env
|
|
source env/bin/activate
|
|
pip install -U setuptools wheel pip
|
|
pip install py-cpuinfo==9.0.0
|
|
pip install torch==2.0.1+cu118 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
|
#pip install -r requirements.txt
|
|
# Perhaps
|
|
pip install -r requirements_nowheels.txt
|
|
# Other versions available, such as requirements_amd.txt
|
|
# Maybe
|
|
pip install exllama
|
|
# Might need GCC 11 (12 is default in Debian bookworm)
|
|
# Get models:
|
|
python download-model.py
|
|
# Run server:
|
|
python server.py --help
|
|
python server.py --listen
|
|
# Open port 7860 on firewall
|
|
\end{minted}
|
|
|
|
|
|
\section{TabbyML}
|
|
TabbyML code completion, self-hosted.
|
|
|
|
All this crap phones home by default, including TabbyML.
|
|
Firewall your vscode host! :)
|
|
|
|
\begin{minted}{sh}
|
|
# Add to ~/.bashrc to limit some telemetry:
|
|
export TABBY_DISABLE_USAGE_COLLECTION=1
|
|
\end{minted}
|
|
|
|
Building tabby requires a more recent version of cmake, but it's available
|
|
in backports:
|
|
|
|
\begin{minted}{sh}
|
|
apt install cmake -t bookworm-backports
|
|
# Also need deps:
|
|
apt install protobuf-compiler
|
|
\end{minted}
|
|
|
|
Build etc:
|
|
|
|
\begin{minted}{sh}
|
|
mkdir -p ~/devel/TabbyML/
|
|
cd ~/devel/TabbyML/
|
|
git clone --recursive https://github.com/TabbyML/tabby
|
|
cd tabby/
|
|
# Assumes you have latest rust installed...
|
|
cargo update
|
|
cargo build
|
|
# Perhaps put the thing somewhere
|
|
sudo cp -p ./target/debug/tabby /usr/local/bin/
|
|
\end{minted}
|
|
|
|
Then once it is built, get models.
|
|
But there's some screwiness, as perhaps the tabby config dir gets
|
|
owned by root (!!! XXX). If so, downloads fail. Meh:
|
|
|
|
|
|
\begin{minted}{sh}
|
|
chown -R user:user ~/.tabby
|
|
\end{minted}
|
|
|
|
Tabby can download from Huggingface (default, USA) or
|
|
Modelscope (China).
|
|
Take your pick of none, or one of the below, perhaps add to \texttt{\textasciitilde/.bashrc}:
|
|
|
|
\begin{minted}{sh}
|
|
export TABBY_REGISTRY="huggingface"
|
|
export TABBY_REGISTRY="modelscope"
|
|
\end{minted}
|
|
|
|
Download models thusly:
|
|
|
|
\begin{minted}{sh}
|
|
tabby download --model TabbyML/Codegen-2B
|
|
tabby download --model TabbyML/Codegen2-4B
|
|
tabby download --model TabbyML/Codegen25-7B
|
|
tabby download --model TabbyML/CodeLlama-7B
|
|
tabby download --model TabbyML/CodeLlama-13B
|
|
tabby download --model TabbyML/J-1B
|
|
tabby download --model TabbyML/J-350M
|
|
tabby download --model TabbyML/Mistral-7B
|
|
tabby download --model TabbyML/NeoX-70M
|
|
tabby download --model TabbyML/NeoX-1.3B
|
|
tabby download --model TabbyML/Phi-1_5B
|
|
tabby download --model TabbyML/SantaCoder-1B
|
|
tabby download --model TabbyML/StableCode-3B
|
|
tabby download --model TabbyML/StarCoder-1B
|
|
tabby download --model TabbyML/StarCoder-3B
|
|
tabby download --model TabbyML/StarCoder-7B
|
|
tabby download --model TabbyML/T5P-220M
|
|
tabby download --model TabbyML/Vicuna-7B
|
|
tabby download --model TabbyML/Vicuna-13B
|
|
tabby download --model TabbyML/WizardCoder-1B
|
|
tabby download --model TabbyML/WizardCoder-3B
|
|
tabby download --model TabbyML/WizardCoder-15B
|
|
\end{minted}
|
|
|
|
Models get stored to \texttt{\textasciitilde/.tabby/models}.
|
|
|
|
Completion models include: CodeLlama, StarCoder.
|
|
Chat models include: Mistral, WizardCoder.
|
|
|
|
|
|
Run server, such as:
|
|
|
|
\begin{minted}{sh}
|
|
# GPU
|
|
tabby serve --model TabbyML/StarCoder-7B --chat-model TabbyML/WizardCoder-3B --device cuda --device-indices 0
|
|
# CPU
|
|
tabby serve --model TabbyML/SantaCoder-1B --chat-model TabbyML/WizardCoder-1B
|
|
\end{minted}
|
|
|
|
Meh, need to recompile:
|
|
|
|
\begin{minted}{text}
|
|
what(): This CTranslate2 package was not compiled with CUDA support
|
|
\end{minted}
|
|
|
|
|
|
\section{CTranslate2}
|
|
Perhaps something like this to get CUDA built. Note this doesn't
|
|
include cuDNN though, due to the need to figure out licensing.
|
|
|
|
\begin{minted}{sh}
|
|
mkdir -p ~/devel/OpenNMT
|
|
cd ~/devel/OpenNMT/
|
|
git clone --recursive https://github.com/OpenNMT/CTranslate2
|
|
cd CTranslate2/
|
|
mkdir build
|
|
cd build/
|
|
|
|
cmake -DCUDA_HOST_COMPILER=/usr/lib/ccache/gcc-11 -DCMAKE_AR=/usr/bin/gcc-ar-11 -DCMAKE_CXX_COMPILER=/usr/lib/ccache/g++-11 -DCMAKE_CXX_COMPILER_AR=/usr/bin/gcc-ar-11 -DCMAKE_CXX_COMPILER_RANLIB=/usr/bin/gcc-ranlib-11 -DCMAKE_C_COMPILER=/usr/lib/ccache/gcc-11 -DCMAKE_C_COMPILER_AR=/usr/bin/gcc-ar-11 -DCMAKE_C_COMPILER_RANLIB=/usr/bin/gcc-ranlib-11 -DWITH_CUDA=ON -DWITH_MKL=OFF -DOPENMP_RUNTIME=NONE ..
|
|
|
|
make -j`nproc`
|
|
|
|
sudo make install
|
|
sudo ldconfig
|
|
|
|
\end{minted}
|
|
|
|
|
|
\section{vscodium}
|
|
Using an IDE with an AI assistant.
|
|
|
|
\subsection{Continue}
|
|
The ``Continue'' extension in vscodium sometimes works.
|
|
If Google is firewalled, the extension won't load, even using
|
|
a locally hosted server.
|
|
|
|
\subsection{TabbyML}
|
|
This half works for a self-hosted server.
|
|
|
|
\subsection{Huggingface}
|
|
An option for running a self-hosted server:
|
|
|
|
\begin{minted}{sh}
|
|
mkdir -p ~/devel/LucienShui/
|
|
cd ~/devel/LucienShui/
|
|
git clone https://github.com/LucienShui/huggingface-vscode-endpoint-server
|
|
cd huggingface-vscode-endpoint-server
|
|
virtualenv venv
|
|
source venv/bin/activate
|
|
pip install -U setuptools wheel pip
|
|
# It wants an old torch.
|
|
pip install torch==1.13.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116
|
|
pip install -r requirements.txt
|
|
python main.py --port 8080 --host 0.0.0.0 --pretrained "Phind/Phind-CodeLlama-34B-v2"
|
|
\end{minted}
|
|
|
|
\subsection{llm-vscode Inference Server}
|
|
Perhaps.
|
|
Uses vllm-project.
|
|
|
|
\begin{minted}{sh}
|
|
mkdir -p ~/devel/wangcx18
|
|
cd ~/devel/wangcx18/
|
|
git clone --recursive https://github.com/wangcx18/llm-vscode-inference-server
|
|
cd llm-vscode-inference-server/
|
|
virtualenv venv
|
|
source venv/bin/activate
|
|
pip install -U pip setuptools wheel
|
|
#pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
|
pip install torch torchvision torchaudio
|
|
pip install -r requirements.txt
|
|
python api_server.py --host 0.0.0.0 --port 8080
|
|
\end{minted}
|