diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..599dbe44 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,21 @@ +[submodule "smallthinker/ggml/src/ggml-kompute/kompute"] + path = smallthinker/ggml/src/ggml-kompute/kompute + url = https://github.com/nomic-ai/kompute.git +[submodule "smallthinker/powerinfer/third_part/perfetto"] + path = smallthinker/powerinfer/third_part/perfetto + url = https://github.com/google/perfetto.git +[submodule "smallthinker/powerinfer/third_part/benchmark"] + path = smallthinker/powerinfer/third_part/benchmark + url = https://github.com/google/benchmark.git +[submodule "smallthinker/powerinfer/third_part/googletest"] + path = smallthinker/powerinfer/third_part/googletest + url = https://github.com/google/googletest.git +[submodule "smallthinker/powerinfer/third_part/libaio"] + path = smallthinker/powerinfer/third_part/libaio + url = https://github.com/crossbuild/libaio.git +[submodule "smallthinker/powerinfer/third_part/liburing"] + path = smallthinker/powerinfer/third_part/liburing + url = https://github.com/axboe/liburing.git +[submodule "smallthinker/powerinfer/third_part/fmt"] + path = smallthinker/powerinfer/third_part/fmt + url = https://github.com/fmtlib/fmt.git diff --git a/smallthinker/README.md b/smallthinker/README.md index f7e43d5b..d868c4c2 100644 --- a/smallthinker/README.md +++ b/smallthinker/README.md @@ -1,18 +1,22 @@ ## Intro -- SmallThinker is a family of on-device native Mixture-of-Experts (MoE) language models specially designed for local deployment, co-developed by the IPADS and School of AI at Shanghai Jiao Tong University and Zenergize AI. Designed from the ground up for resource-constrained environments, SmallThinker brings powerful, private, and low-latency AI directly to your personal devices, without relying on the cloud. 
+- SmallThinker ([SmallThinker-21BA3B-Instruct](https://huggingface.co/PowerInfer/SmallThinker-21BA3B-Instruct) and [SmallThinker-4BA0.6B-Instruct](https://huggingface.co/PowerInfer/SmallThinker-4BA0.6B-Instruct)) is a family of on-device native Mixture-of-Experts (MoE) language models specially designed for local deployment, co-developed by the IPADS and School of AI at Shanghai Jiao Tong University and Zenergize AI. Designed from the ground up for resource-constrained environments, SmallThinker brings powerful, private, and low-latency AI directly to your personal devices, without relying on the cloud. - This inference framework is specifically optimized for sparse model inference to achieve faster speeds, leveraging the router's pre-selection mechanism to enable efficient inference even in memory-constrained scenarios. ## Demo + +https://github.com/user-attachments/assets/cefd466e-3b1f-47a9-8dc3-f1cf5119045e + + ## Speed ### SmallThinker 21B -| Model | Memory(GiB) | i9 14900 | 1+13 8ge4 | rk3588 (16G) | Raspberry PI 5 | +| Model | Memory(GiB) | i9 14900 | 1+13 8ge4 | rk3588 (16G) | Raspberry PI 5 | |--------------------------------------|---------------------|----------|-----------|--------------|----------------| | SmallThinker 21B+sparse | 11.47 | 30.19 | 23.03 | 10.84 | 6.61 | -| SmallThinker 21B+sparse +limited memory | 84 | limit 8G | 20.30 | 15.50 | 8.56 | +| SmallThinker 21B+sparse +limited memory | limit 8G | 20.30 | 15.50 | 8.56 | - | | Qwen3 30B A3B | 16.20 | 33.52 | 20.18 | 9.07 | - | -| Qwen3 30B A3Blimited memory | 81.38 | limit 8G | 10.11 | 0.18 | 6.32 | +| Qwen3 30B A3Blimited memory | limit 8G | 10.11 | 0.18 | 6.32 | - | | Gemma 3n E2B | 1G, theoretically | 36.88 | 27.06 | 12.50 | 6.66 | | Gemma 3n E4B | 2G, theoretically | 21.93 | 16.58 | 7.37 | 4.01 | @@ -31,12 +35,21 @@ Note:i9 14900、1+13 8ge4 use 4 threads,others use the number of threads that can achieve the maximum speed ## Setup +1. init submodule: -1. 
install clang-21 and mold: +```bash +git submodule update --init --recursive +``` +2. install clang-21 and mold: ```bash sudo apt install clang-21 mold ``` +3. cd into smallthinker before compiling: +```bash +cd smallthinker +``` + ## Convert Model ```bash @@ -140,11 +153,12 @@ python get_no_moe_weights_ffn.py /path/to/gguf_q4_0 /path/to/no_moe_gguf_q4_0 ```bash EXPERT_BUNDLE_PATH=/path/to/bundle ./llama-cli -m /path/to/no_moe_gguf_q4_0 --no-cnv --temp 0.6 --top-k 20 --top-p 0.95 --samplers "temperature;top_k;top_p" -p "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nCalculate the integral of f(x) = sin(x) from 0 to 3pi/4.<|im_end|>\n<|im_start|>assistant" -t 4 -n 256 -ub 4 ``` -### LM Head Sparsity: -1. The 4B model uses a sparse lm_head which may lead to some loss in precision. If you want to disable it, change the condition at src/llama-model.cpp:7580 to false.But the speed is slower. +### Note: +1. The models use a sparse lm_head which may lead to some loss in precision. If you want to disable it, change the condition at src/llama-model.cpp:7580 to false. But the speed will be slower. +2. Running the Memory-Efficient Version in Termux may require root privileges. 
## Acknowledgements We would like to thank the following projects: -- [llama.cpp](https://github.com/ggml-org/llama.cpp) \ No newline at end of file +- [llama.cpp](https://github.com/ggml-org/llama.cpp) diff --git a/smallthinker/ggml/src/ggml-kompute/kompute b/smallthinker/ggml/src/ggml-kompute/kompute new file mode 160000 index 00000000..7c20efa3 --- /dev/null +++ b/smallthinker/ggml/src/ggml-kompute/kompute @@ -0,0 +1 @@ +Subproject commit 7c20efa30bb53d08bf04f84e510275766ebe9923 diff --git a/smallthinker/powerinfer/third_part/benchmark b/smallthinker/powerinfer/third_part/benchmark new file mode 160000 index 00000000..77c03fbc --- /dev/null +++ b/smallthinker/powerinfer/third_part/benchmark @@ -0,0 +1 @@ +Subproject commit 77c03fbcdcb7f28cd1f65d0e222542ef08ffd277 diff --git a/smallthinker/powerinfer/third_part/fmt b/smallthinker/powerinfer/third_part/fmt new file mode 160000 index 00000000..35dcc582 --- /dev/null +++ b/smallthinker/powerinfer/third_part/fmt @@ -0,0 +1 @@ +Subproject commit 35dcc58263d6b55419a5932bd6b0b3029a0a8c00 diff --git a/smallthinker/powerinfer/third_part/googletest b/smallthinker/powerinfer/third_part/googletest new file mode 160000 index 00000000..32f9f4c8 --- /dev/null +++ b/smallthinker/powerinfer/third_part/googletest @@ -0,0 +1 @@ +Subproject commit 32f9f4c82afa4249af66b55278df15c16b3031ea diff --git a/smallthinker/powerinfer/third_part/libaio b/smallthinker/powerinfer/third_part/libaio new file mode 160000 index 00000000..5a546a83 --- /dev/null +++ b/smallthinker/powerinfer/third_part/libaio @@ -0,0 +1 @@ +Subproject commit 5a546a834c36070648158d19dd564762d59f8eb8 diff --git a/smallthinker/powerinfer/third_part/liburing b/smallthinker/powerinfer/third_part/liburing new file mode 160000 index 00000000..f2b6fb85 --- /dev/null +++ b/smallthinker/powerinfer/third_part/liburing @@ -0,0 +1 @@ +Subproject commit f2b6fb85b79baf17f2c0ea24a357c652caa2d7ba diff --git a/smallthinker/powerinfer/third_part/perfetto 
b/smallthinker/powerinfer/third_part/perfetto new file mode 160000 index 00000000..967c5777 --- /dev/null +++ b/smallthinker/powerinfer/third_part/perfetto @@ -0,0 +1 @@ +Subproject commit 967c577748320170af142088787e64c36790d7b3