Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ cc_library(
"//data/config",
"//data/dictionary:binary_dictionaries",
"//data/dictionary:text_dictionaries",
"//data/jieba_dict",
],
strip_include_prefix = "src",
deps = [
Expand Down
7 changes: 7 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ option(BUILD_SHARED_LIBS "Build opencc as shared library" ON)
# User-facing build switches. Each option() declares a boolean cache entry with
# the default shown as its last argument.
option(ENABLE_GTEST "Build all tests." OFF)
option(ENABLE_BENCHMARK "Build benchmark tests." OFF)
option(ENABLE_DARTS "Build DartsDict (ocd format)." ON)
# Jieba-based segmentation is flagged experimental, so it is opt-in (OFF by default).
option(ENABLE_JIEBA "Build JiebaSegmentation (experimental)." OFF)
option(BUILD_PYTHON "Build python library" OFF)
# USE_SYSTEM_* switches select a system-provided copy of the named dependency.
# NOTE(review): presumably the alternative is a bundled copy -- confirm in the dependency section.
option(USE_SYSTEM_DARTS "Use system version of Darts" OFF)
option(USE_SYSTEM_GOOGLE_BENCHMARK "Use system version of Google Benchmark" OFF)
Expand Down Expand Up @@ -208,6 +209,12 @@ if (ENABLE_DARTS)
)
endif()

# When the ENABLE_JIEBA option is on, pass -DENABLE_JIEBA on the compile line.
# add_definitions() is directory-scoped: it affects targets defined in this
# directory and in subdirectories added after this point.
if (ENABLE_JIEBA)
  add_definitions(-DENABLE_JIEBA)
endif()


######## Dependencies

Expand Down
31 changes: 31 additions & 0 deletions data/config/s2twp_jieba.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
{
"name": "Simplified Chinese to Traditional Chinese (Taiwan standard, with phrases, Jieba Segmentation - Experimental)",
"segmentation": {
"type": "jieba",
"dict_path": "jieba_dict/jieba.dict.utf8",
"model_path": "jieba_dict/hmm_model.utf8",
"user_dict_path": "jieba_dict/user.dict.utf8"
},
"conversion_chain": [{
"dict": {
"type": "group",
"dicts": [{
"type": "ocd2",
"file": "STPhrases.ocd2"
}, {
"type": "ocd2",
"file": "STCharacters.ocd2"
}]
}
}, {
"dict": {
"type": "ocd2",
"file": "TWPhrases.ocd2"
}
}, {
"dict": {
"type": "ocd2",
"file": "TWVariants.ocd2"
}
}]
}
35 changes: 35 additions & 0 deletions data/config/tw2sp_jieba.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{
"name": "Traditional Chinese (Taiwan standard) to Simplified Chinese (with phrases, Jieba Segmentation - Experimental)",
"segmentation": {
"type": "jieba",
"dict_path": "jieba_dict/jieba.dict.utf8",
"model_path": "jieba_dict/hmm_model.utf8",
"user_dict_path": "jieba_dict/user.dict.utf8"
},
"conversion_chain": [{
"dict": {
"type": "group",
"dicts": [{
"type": "ocd2",
"file": "TWPhrasesRev.ocd2"
}, {
"type": "ocd2",
"file": "TWVariantsRevPhrases.ocd2"
}, {
"type": "ocd2",
"file": "TWVariantsRev.ocd2"
}]
}
}, {
"dict": {
"type": "group",
"dicts": [{
"type": "ocd2",
"file": "TSPhrases.ocd2"
}, {
"type": "ocd2",
"file": "TSCharacters.ocd2"
}]
}
}]
}
6 changes: 6 additions & 0 deletions data/jieba_dict/BUILD.bazel
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Every target in this package is visible to all other packages by default.
package(default_visibility = ["//visibility:public"])

# Groups the *.utf8 files in this directory together with README.md under a
# single label.
filegroup(
    name = "jieba_dict",
    srcs = glob([
        "*.utf8",
        "README.md",
    ]),
)
45 changes: 45 additions & 0 deletions data/jieba_dict/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Jieba 分词词典

此目录包含 Jieba 中文分词所需的词典文件,来源于 [libcppjieba](https://github.com/yanyiwu/libcppjieba)。

## 文件说明

- **jieba.dict.utf8** (4.9 MB) - 主词典文件,包含词语及其词频
- **hmm_model.utf8** (508 KB) - 隐马尔可夫模型(HMM)文件,用于识别未登录词
- **user.dict.utf8** (33 B) - 用户自定义词典(可选)

## 许可证

这些词典文件继承自 jieba 项目,遵循 MIT 许可证。

## 使用方式

在 OpenCC 配置文件的 `segmentation` 字段中指定这些词典的路径(见下方示例)。IDF 和停用词数据会从 `deps/libcppjieba/dict/` 自动解析,无需复制到此目录。

```json
{
"segmentation": {
"type": "jieba",
"dict_path": "jieba_dict/jieba.dict.utf8",
"model_path": "jieba_dict/hmm_model.utf8",
"user_dict_path": "jieba_dict/user.dict.utf8"
}
}
```

## 自定义用户词典

您可以编辑 `user.dict.utf8` 添加自定义词语,格式为:

```
词语 词频 词性
```

例如:

```
云计算 5 n
机器学习 8 n
```

每行一个词语,词频和词性可选。
34 changes: 34 additions & 0 deletions data/jieba_dict/hmm_model.utf8

Large diffs are not rendered by default.

Loading
Loading