From 6629a93cdbb4847169ec87d7d97e9a780e5ed0d3 Mon Sep 17 00:00:00 2001 From: wwwisman <120352666+wisman-tccr@users.noreply.github.com> Date: Sun, 27 Jul 2025 17:11:14 +0800 Subject: [PATCH 1/4] Update README.md --- smallthinker/README.md | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/smallthinker/README.md b/smallthinker/README.md index f7e43d5b..4b93e2ff 100644 --- a/smallthinker/README.md +++ b/smallthinker/README.md @@ -5,14 +5,22 @@ ## Demo + +https://github.com/user-attachments/assets/cefd466e-3b1f-47a9-8dc3-f1cf5119045e + + ## Speed ### SmallThinker 21B -| Model | Memory(GiB) | i9 14900 | 1+13 8ge4 | rk3588 (16G) | Raspberry PI 5 | +| Model + +https://github.com/user-attachments/assets/37079e94-599b-4e7f-8000-0c095ebe0d59 + + | Memory(GiB) | i9 14900 | 1+13 8ge4 | rk3588 (16G) | Raspberry PI 5 | |--------------------------------------|---------------------|----------|-----------|--------------|----------------| | SmallThinker 21B+sparse | 11.47 | 30.19 | 23.03 | 10.84 | 6.61 | -| SmallThinker 21B+sparse +limited memory | 84 | limit 8G | 20.30 | 15.50 | 8.56 | +| SmallThinker 21B+sparse +limited memory | limit 8G | 20.30 | 15.50 | 8.56 | - | | Qwen3 30B A3B | 16.20 | 33.52 | 20.18 | 9.07 | - | -| Qwen3 30B A3Blimited memory | 81.38 | limit 8G | 10.11 | 0.18 | 6.32 | +| Qwen3 30B A3Blimited memory | limit 8G | 10.11 | 0.18 | 6.32 | - | | Gemma 3n E2B | 1G, theoretically | 36.88 | 27.06 | 12.50 | 6.66 | | Gemma 3n E4B | 2G, theoretically | 21.93 | 16.58 | 7.37 | 4.01 | @@ -31,12 +39,20 @@ Note:i9 14900、1+13 8ge4 use 4 threads,others use the number of threads that can achieve the maximum speed ## Setup - -1. install clang-21 and mold: +1. cd smallthinker before compiling +```bash +cd smallthinker +``` +2. install clang-21 and mold: ```bash sudo apt install clang-21 mold ``` +3. 
init submodule: + +```bash +git submodule update --init --recursive +``` ## Convert Model ```bash @@ -140,11 +156,12 @@ python get_no_moe_weights_ffn.py /path/to/gguf_q4_0 /path/to/no_moe_gguf_q4_0 ```bash EXPERT_BUNDLE_PATH=/path/to/bundle ./llama-cli -m /path/to/no_moe_gguf_q4_0 --no-cnv --temp 0.6 --top-k 20 --top-p 0.95 --samplers "temperature;top_k;top_p" -p "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nCalculate the integral of f(x) = sin(x) from 0 to 3pi/4.<|im_end|>\n<|im_start|>assistant" -t 4 -n 256 -ub 4 ``` -### LM Head Sparsity: -1. The 4B model uses a sparse lm_head which may lead to some loss in precision. If you want to disable it, change the condition at src/llama-model.cpp:7580 to false.But the speed is slower. +### Note: +1. The models use a sparse lm_head which may lead to some loss in precision. If you want to disable it, change the condition at src/llama-model.cpp:7580 to false, but note that inference will then be slower. +2. Running the Memory-Efficient Version in Termux may require root privileges. 
## Acknowledgements We would like to thank the following projects: -- [llama.cpp](https://github.com/ggml-org/llama.cpp) \ No newline at end of file +- [llama.cpp](https://github.com/ggml-org/llama.cpp) From bf5b965df699c8fa4837cec4a19bdf631266793e Mon Sep 17 00:00:00 2001 From: wisman <2659530589@qq.com> Date: Sun, 27 Jul 2025 09:28:51 +0000 Subject: [PATCH 2/4] update --- .gitmodules | 21 +++++++++++++++++++ smallthinker/ggml/src/ggml-kompute/kompute | 1 + smallthinker/powerinfer/third_part/benchmark | 1 + smallthinker/powerinfer/third_part/fmt | 1 + smallthinker/powerinfer/third_part/googletest | 1 + smallthinker/powerinfer/third_part/libaio | 1 + smallthinker/powerinfer/third_part/liburing | 1 + smallthinker/powerinfer/third_part/perfetto | 1 + 8 files changed, 28 insertions(+) create mode 100644 .gitmodules create mode 160000 smallthinker/ggml/src/ggml-kompute/kompute create mode 160000 smallthinker/powerinfer/third_part/benchmark create mode 160000 smallthinker/powerinfer/third_part/fmt create mode 160000 smallthinker/powerinfer/third_part/googletest create mode 160000 smallthinker/powerinfer/third_part/libaio create mode 160000 smallthinker/powerinfer/third_part/liburing create mode 160000 smallthinker/powerinfer/third_part/perfetto diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..599dbe44 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,21 @@ +[submodule "smallthinker/ggml/src/ggml-kompute/kompute"] + path = smallthinker/ggml/src/ggml-kompute/kompute + url = https://github.com/nomic-ai/kompute.git +[submodule "smallthinker/powerinfer/third_part/perfetto"] + path = smallthinker/powerinfer/third_part/perfetto + url = https://github.com/google/perfetto.git +[submodule "smallthinker/powerinfer/third_part/benchmark"] + path = smallthinker/powerinfer/third_part/benchmark + url = https://github.com/google/benchmark.git +[submodule "smallthinker/powerinfer/third_part/googletest"] + path = smallthinker/powerinfer/third_part/googletest + url = 
https://github.com/google/googletest.git +[submodule "smallthinker/powerinfer/third_part/libaio"] + path = smallthinker/powerinfer/third_part/libaio + url = https://github.com/crossbuild/libaio.git +[submodule "smallthinker/powerinfer/third_part/liburing"] + path = smallthinker/powerinfer/third_part/liburing + url = https://github.com/axboe/liburing.git +[submodule "smallthinker/powerinfer/third_part/fmt"] + path = smallthinker/powerinfer/third_part/fmt + url = https://github.com/fmtlib/fmt.git diff --git a/smallthinker/ggml/src/ggml-kompute/kompute b/smallthinker/ggml/src/ggml-kompute/kompute new file mode 160000 index 00000000..7c20efa3 --- /dev/null +++ b/smallthinker/ggml/src/ggml-kompute/kompute @@ -0,0 +1 @@ +Subproject commit 7c20efa30bb53d08bf04f84e510275766ebe9923 diff --git a/smallthinker/powerinfer/third_part/benchmark b/smallthinker/powerinfer/third_part/benchmark new file mode 160000 index 00000000..77c03fbc --- /dev/null +++ b/smallthinker/powerinfer/third_part/benchmark @@ -0,0 +1 @@ +Subproject commit 77c03fbcdcb7f28cd1f65d0e222542ef08ffd277 diff --git a/smallthinker/powerinfer/third_part/fmt b/smallthinker/powerinfer/third_part/fmt new file mode 160000 index 00000000..35dcc582 --- /dev/null +++ b/smallthinker/powerinfer/third_part/fmt @@ -0,0 +1 @@ +Subproject commit 35dcc58263d6b55419a5932bd6b0b3029a0a8c00 diff --git a/smallthinker/powerinfer/third_part/googletest b/smallthinker/powerinfer/third_part/googletest new file mode 160000 index 00000000..32f9f4c8 --- /dev/null +++ b/smallthinker/powerinfer/third_part/googletest @@ -0,0 +1 @@ +Subproject commit 32f9f4c82afa4249af66b55278df15c16b3031ea diff --git a/smallthinker/powerinfer/third_part/libaio b/smallthinker/powerinfer/third_part/libaio new file mode 160000 index 00000000..5a546a83 --- /dev/null +++ b/smallthinker/powerinfer/third_part/libaio @@ -0,0 +1 @@ +Subproject commit 5a546a834c36070648158d19dd564762d59f8eb8 diff --git a/smallthinker/powerinfer/third_part/liburing 
b/smallthinker/powerinfer/third_part/liburing new file mode 160000 index 00000000..f2b6fb85 --- /dev/null +++ b/smallthinker/powerinfer/third_part/liburing @@ -0,0 +1 @@ +Subproject commit f2b6fb85b79baf17f2c0ea24a357c652caa2d7ba diff --git a/smallthinker/powerinfer/third_part/perfetto b/smallthinker/powerinfer/third_part/perfetto new file mode 160000 index 00000000..967c5777 --- /dev/null +++ b/smallthinker/powerinfer/third_part/perfetto @@ -0,0 +1 @@ +Subproject commit 967c577748320170af142088787e64c36790d7b3 From cc25b21c9aa303c92affc7d4e6d8d7d60c03977c Mon Sep 17 00:00:00 2001 From: wwwisman <120352666+wisman-tccr@users.noreply.github.com> Date: Sun, 27 Jul 2025 17:38:44 +0800 Subject: [PATCH 3/4] Update README.md --- smallthinker/README.md | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/smallthinker/README.md b/smallthinker/README.md index 4b93e2ff..0fe9d06b 100644 --- a/smallthinker/README.md +++ b/smallthinker/README.md @@ -1,5 +1,5 @@ ## Intro -- SmallThinker is a family of on-device native Mixture-of-Experts (MoE) language models specially designed for local deployment, co-developed by the IPADS and School of AI at Shanghai Jiao Tong University and Zenergize AI. Designed from the ground up for resource-constrained environments, SmallThinker brings powerful, private, and low-latency AI directly to your personal devices, without relying on the cloud. +- SmallThinker ([SmallThinker-21BA3B-Instruct](https://huggingface.co/PowerInfer/SmallThinker-21BA3B-Instruct) and [SmallThinker-4BA0.6B-Instruct](https://huggingface.co/PowerInfer/SmallThinker-4BA0.6B-Instruct)) is a family of on-device native Mixture-of-Experts (MoE) language models specially designed for local deployment, co-developed by the IPADS and School of AI at Shanghai Jiao Tong University and Zenergize AI. 
Designed from the ground up for resource-constrained environments, SmallThinker brings powerful, private, and low-latency AI directly to your personal devices, without relying on the cloud. - This inference framework is specifically optimized for sparse model inference to achieve faster speeds, leveraging the router's pre-selection mechanism to enable efficient inference even in memory-constrained scenarios. @@ -11,11 +11,7 @@ https://github.com/user-attachments/assets/cefd466e-3b1f-47a9-8dc3-f1cf5119045e ## Speed ### SmallThinker 21B -| Model - -https://github.com/user-attachments/assets/37079e94-599b-4e7f-8000-0c095ebe0d59 - - | Memory(GiB) | i9 14900 | 1+13 8ge4 | rk3588 (16G) | Raspberry PI 5 | +| Model | Memory(GiB) | i9 14900 | 1+13 8ge4 | rk3588 (16G) | Raspberry PI 5 | |--------------------------------------|---------------------|----------|-----------|--------------|----------------| | SmallThinker 21B+sparse | 11.47 | 30.19 | 23.03 | 10.84 | 6.61 | | SmallThinker 21B+sparse +limited memory | limit 8G | 20.30 | 15.50 | 8.56 | - | From 4eda77147730923a1c770649991ea88be153c6e3 Mon Sep 17 00:00:00 2001 From: wwwisman <120352666+wisman-tccr@users.noreply.github.com> Date: Sun, 27 Jul 2025 17:40:29 +0800 Subject: [PATCH 4/4] Update README.md --- smallthinker/README.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/smallthinker/README.md b/smallthinker/README.md index 0fe9d06b..d868c4c2 100644 --- a/smallthinker/README.md +++ b/smallthinker/README.md @@ -35,21 +35,22 @@ https://github.com/user-attachments/assets/cefd466e-3b1f-47a9-8dc3-f1cf5119045e Note:i9 14900、1+13 8ge4 use 4 threads,others use the number of threads that can achieve the maximum speed ## Setup -1. cd smallthinker before compiling +1. init submodule: + ```bash -cd smallthinker +git submodule update --init --recursive ``` 2. install clang-21 and mold: ```bash sudo apt install clang-21 mold ``` -3. init submodule: - +3. 
cd smallthinker before compiling ```bash -git submodule update --init --recursive +cd smallthinker ``` + ## Convert Model ```bash python3 convert_hf_to_gguf.py /path/to/safetensors_model --outtype f16 --outfile /path/to/gguf_fp16 --transpose-down all