中文搜索
列出索引
curl 'http://localhost:9200/_cat/indices?v'
结果如下:
health status index uuid pri rep docs.count docs.deleted store.size pri.store.size dataset.size
yellow open book_idx c1a4fmDyRkyLQoKMcDlvLw 1 1 10 0 21.9kb 21.9kb 21.9kb
现有的默认索引 book_idx 对英文的处理效果不错,但是对中文的效果不理想。可以尝试使用上一节安装的 smartcn 分词器。
创建基于 smartcn 的索引
curl -X PUT "http://localhost:9200/book_idx_cn?pretty" -H 'Content-Type: application/json' -d'
{
"settings": {
"analysis": {
"analyzer": {
"default": {
"type": "smartcn"
}
}
}
}
}'
创建结果:
{
"acknowledged" : true,
"shards_acknowledged" : true,
"index" : "book_idx_cn"
}
更新代码
修改 src/infrastructure/config/mod.rs:
@@ -11,6 +11,7 @@ pub struct Config {
#[derive(Debug, Deserialize, Serialize)]
pub struct SearchConfig {
pub address: String,
+ pub index: String,
}
#[derive(Debug, Deserialize, Serialize)]
修改 config.toml:
@@ -4,3 +4,4 @@ page_size = 10
[search]
address = "http://localhost:9200"
+index = "book_idx_cn"
修改 src/infrastructure/search/es.rs:
@@ -8,18 +8,21 @@ use serde_json::{json, Value};
use crate::domain::gateway::BookManager;
use crate::domain::model;
-const INDEX_BOOK: &str = "book_idx";
-
pub struct ElasticSearchEngine {
client: Elasticsearch,
+ index: String,
page_size: u32,
}
impl ElasticSearchEngine {
- pub fn new(address: &str, page_size: u32) -> Result<Self, Box<dyn Error>> {
+ pub fn new(address: &str, index: &str, page_size: u32) -> Result<Self, Box<dyn Error>> {
let transport = Transport::single_node(address)?;
let client = Elasticsearch::new(transport);
- Ok(ElasticSearchEngine { client, page_size })
+ Ok(ElasticSearchEngine {
+ client,
+ index: index.to_string(),
+ page_size,
+ })
}
}
@@ -28,7 +31,7 @@ impl BookManager for ElasticSearchEngine {
async fn index_book(&self, b: &model::Book) -> Result<String, Box<dyn Error>> {
let response = self
.client
- .index(IndexParts::Index(INDEX_BOOK))
+ .index(IndexParts::Index(&self.index))
.body(b)
.send()
.await?;
@@ -39,7 +42,7 @@ impl BookManager for ElasticSearchEngine {
async fn search_books(&self, q: &str) -> Result<Vec<model::Book>, Box<dyn Error>> {
let response = self
.client
- .search(SearchParts::Index(&[INDEX_BOOK]))
+ .search(SearchParts::Index(&[&self.index]))
.from(0)
.size(self.page_size as i64)
.body(json!({
修改 src/application/wire_helper.rs:
@@ -12,6 +12,7 @@ impl WireHelper {
pub fn new(c: &Config) -> Result<Self, Box<dyn std::error::Error>> {
let engine = Arc::new(search::ElasticSearchEngine::new(
&c.search.address,
+ &c.search.index,
c.app.page_size,
)?);
Ok(WireHelper { engine })
编辑完成后,重新启动服务器。
填入中文测试数据
curl -X POST \
-H "Content-Type: application/json" \
-d '{"title":"哈利·波特与魔法石","author":"J.K. 罗琳","published_at":"1997-06-26","content":"一个年轻男孩发现自己是一名巫师,并开始在霍格沃茨魔法学校接受教育,在那里他揭开了“魔法石”的秘密。"}' \
http://localhost:3000/books
curl -X POST \
-H "Content-Type: application/json" \
-d '{"title":"杀死一只知更鸟","author":"哈珀·李","published_at":"1960-07-11","content":"小说设置在大萧条期间的美国南部,通过年轻的斯考特·芬奇的眼睛探索了种族不义和道德成长的主题。"}' \
http://localhost:3000/books
curl -X POST \
-H "Content-Type: application/json" \
-d '{"title":"魔戒","author":"J.R.R. 托尔金","published_at":"1954-07-29","content":"一个名叫弗罗多·巴金斯的霍比特人踏上了一场艰险的旅程,摧毁一枚强大的戒指,并拯救中土世界,免受黑暗领主索伦的侵害。"}' \
http://localhost:3000/books
curl -X POST \
-H "Content-Type: application/json" \
-d '{"title":"麦田里的守望者","author":"J.D. 赛林格","published_at":"1951-07-16","content":"霍尔顿·考菲尔德在被驱逐出预备学校后,讲述了他在纽约市的经历,探讨了疏远、身份和纯真等主题。"}' \
http://localhost:3000/books
curl -X POST \
-H "Content-Type: application/json" \
-d '{"title":"牧羊少年奇幻之旅","author":"保罗·柯艾略","published_at":"1988-01-01","content":"牧羊人圣地亚哥从西班牙前往埃及,寻找埋在金字塔附近的宝藏。沿途,他学到了追随梦想的重要性。"}' \
http://localhost:3000/books
curl -X POST \
-H "Content-Type: application/json" \
-d '{"title":"饥饿游戏","author":"苏珊·柯林斯","published_at":"2008-09-14","content":"在一个反乌托邦的未来,青少年被迫参加名为饥饿游戏的电视死斗。凯特尼斯·艾弗丁自愿代替妹妹参加,并成为叛乱的象征。"}' \
http://localhost:3000/books
curl -X POST \
-H "Content-Type: application/json" \
-d '{"title":"1984","author":"乔治·奥威尔","published_at":"1949-06-08","content":"温斯顿·史密斯生活在一个由党领导的极权社会中。他反抗压迫性的政权,但最终屈服于其控制。"}' \
http://localhost:3000/books
curl -X POST \
-H "Content-Type: application/json" \
-d '{"title":"龙纹身的女孩","author":"斯蒂格·拉尔森","published_at":"2005-08-01","content":"记者米卡埃尔·布隆奎斯特和黑客丽斯贝特·萨兰德调查了一个富裕家庭的年轻女子失踪案,揭示了黑暗的秘密和腐败。"}' \
http://localhost:3000/books
curl -X POST \
-H "Content-Type: application/json" \
-d '{"title":"消失的女孩","author":"吉莲·弗林","published_at":"2012-06-05","content":"在他们的第五个结婚纪念日上,尼克·邓恩的妻子艾米失踪了。随着媒体的狂热报道和怀疑的增加,尼克发现自己陷入了一场欺骗和背叛的旋涡中。"}' \
http://localhost:3000/books
验证效果
用“记者牧羊人”进行检索验证。
Query 的分词结果
curl -X POST 'http://localhost:9200/_analyze' -H 'Content-Type: application/json' -d '{
"analyzer": "smartcn",
"text": "记者牧羊人"
}'
{"tokens":[{"token":"记者","start_offset":0,"end_offset":2,"type":"word","position":0},{"token":"牧羊人","start_offset":2,"end_offset":5,"type":"word","position":1}]}
Query 的查询结果
curl 'http://localhost:3000/books?q=%E8%AE%B0%E8%80%85%E7%89%A7%E7%BE%8A%E4%BA%BA'
%E8%AE%B0%E8%80%85%E7%89%A7%E7%BE%8A%E4%BA%BA 是“记者牧羊人”经 URL 编码后的值。
[
{
"title": "牧羊少年奇幻之旅",
"author": "保罗·柯艾略",
"published_at": "1988-01-01",
"content": "牧羊人圣地亚哥从西班牙前往埃及,寻找埋在金字塔附近的宝藏。沿途,他学到了追随梦想的重要性。"
},
{
"title": "龙纹身的女孩",
"author": "斯蒂格·拉尔森",
"published_at": "2005-08-01",
"content": "记者米卡埃尔·布隆奎斯特和黑客丽斯贝特·萨兰德调查了一个富裕家庭的年轻女子失踪案,揭示了黑暗的秘密和腐败。"
}
]
两个结果的匹配得分及其原因解释。
2.2704897
{"description":"max of:","details":[{"description":"sum of:","details":[{"description":"weight(content:牧羊人 in 4) [PerFieldSimilarity], result of:","details":[{"description":"score(freq=1.0), computed as boost * idf * tf from:","details":[{"description":"boost","details":[],"value":2.2},{"description":"idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:","details":[{"description":"n, number of documents containing term","details":[],"value":1},{"description":"N, total number of documents with field","details":[],"value":10}],"value":1.9924302},{"description":"tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:","details":[{"description":"freq, occurrences of term within document","details":[],"value":1.0},{"description":"k1, term saturation parameter","details":[],"value":1.2},{"description":"b, length normalization parameter","details":[],"value":0.75},{"description":"dl, length of field","details":[],"value":22.0},{"description":"avgdl, average length of field","details":[],"value":31.4}],"value":0.5179809}],"value":2.2704897}],"value":2.2704897}],"value":2.2704897}],"value":2.2704897}
1.8797743
{"description":"max of:","details":[{"description":"sum of:","details":[{"description":"weight(content:记者 in 7) [PerFieldSimilarity], result of:","details":[{"description":"score(freq=1.0), computed as boost * idf * tf from:","details":[{"description":"boost","details":[],"value":2.2},{"description":"idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:","details":[{"description":"n, number of documents containing term","details":[],"value":1},{"description":"N, total number of documents with field","details":[],"value":10}],"value":1.9924302},{"description":"tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:","details":[{"description":"freq, occurrences of term within document","details":[],"value":1.0},{"description":"k1, term saturation parameter","details":[],"value":1.2},{"description":"b, length normalization parameter","details":[],"value":0.75},{"description":"dl, length of field","details":[],"value":36.0},{"description":"avgdl, average length of field","details":[],"value":31.4}],"value":0.42884457}],"value":1.8797743}],"value":1.8797743}],"value":1.8797743}],"value":1.8797743}
再用“麦田龙女孩”进行检索验证。
Query 的分词结果
curl -X POST 'http://localhost:9200/_analyze' -H 'Content-Type: application/json' -d '{
"analyzer": "smartcn",
"text": "麦田龙女孩"
}'
{"tokens":[{"token":"麦田","start_offset":0,"end_offset":2,"type":"word","position":0},{"token":"龙","start_offset":2,"end_offset":3,"type":"word","position":1},{"token":"女孩","start_offset":3,"end_offset":5,"type":"word","position":2}]}
Query 的查询结果
curl 'http://localhost:3000/books?q=%E9%BA%A6%E7%94%B0%E9%BE%99%E5%A5%B3%E5%AD%A9'
[
{
"title": "龙纹身的女孩",
"author": "斯蒂格·拉尔森",
"published_at": "2005-08-01",
"content": "记者米卡埃尔·布隆奎斯特和黑客丽斯贝特·萨兰德调查了一个富裕家庭的年轻女子失踪案,揭示了黑暗的秘密和腐败。"
},
{
"title": "麦田里的守望者",
"author": "J.D. 赛林格",
"published_at": "1951-07-16",
"content": "霍尔顿·考菲尔德在被驱逐出预备学校后,讲述了他在纽约市的经历,探讨了疏远、身份和纯真等主题。"
},
{
"title": "消失的女孩",
"author": "吉莲·弗林",
"published_at": "2012-06-05",
"content": "在他们的第五个结婚纪念日上,尼克·邓恩的妻子艾米失踪了。随着媒体的狂热报道和怀疑的增加,尼克发现自己陷入了一场欺骗和背叛的旋涡中。"
}
]
三个结果的匹配得分及其原因解释。
3.1877713
{"description":"max of:","details":[{"description":"sum of:","details":[{"description":"weight(title:龙 in 7) [PerFieldSimilarity], result of:","details":[{"description":"score(freq=1.0), computed as boost * idf * tf from:","details":[{"description":"boost","details":[],"value":2.2},{"description":"idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:","details":[{"description":"n, number of documents containing term","details":[],"value":1},{"description":"N, total number of documents with field","details":[],"value":10}],"value":1.9924302},{"description":"tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:","details":[{"description":"freq, occurrences of term within document","details":[],"value":1.0},{"description":"k1, term saturation parameter","details":[],"value":1.2},{"description":"b, length normalization parameter","details":[],"value":0.75},{"description":"dl, length of field","details":[],"value":5.0},{"description":"avgdl, average length of field","details":[],"value":4.1}],"value":0.41709054}],"value":1.8282523}],"value":1.8282523},{"description":"weight(title:女孩 in 7) [PerFieldSimilarity], result of:","details":[{"description":"score(freq=1.0), computed as boost * idf * tf from:","details":[{"description":"boost","details":[],"value":2.2},{"description":"idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:","details":[{"description":"n, number of documents containing term","details":[],"value":2},{"description":"N, total number of documents with field","details":[],"value":10}],"value":1.4816046},{"description":"tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:","details":[{"description":"freq, occurrences of term within document","details":[],"value":1.0},{"description":"k1, term saturation parameter","details":[],"value":1.2},{"description":"b, length normalization parameter","details":[],"value":0.75},{"description":"dl, length of field","details":[],"value":5.0},{"description":"avgdl, average length of field","details":[],"value":4.1}],"value":0.41709054}],"value":1.3595191}],"value":1.3595191}],"value":3.1877713}],"value":3.1877713}
1.8282523
{"description":"max of:","details":[{"description":"sum of:","details":[{"description":"weight(title:麦田 in 3) [PerFieldSimilarity], result of:","details":[{"description":"score(freq=1.0), computed as boost * idf * tf from:","details":[{"description":"boost","details":[],"value":2.2},{"description":"idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:","details":[{"description":"n, number of documents containing term","details":[],"value":1},{"description":"N, total number of documents with field","details":[],"value":10}],"value":1.9924302},{"description":"tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:","details":[{"description":"freq, occurrences of term within document","details":[],"value":1.0},{"description":"k1, term saturation parameter","details":[],"value":1.2},{"description":"b, length normalization parameter","details":[],"value":0.75},{"description":"dl, length of field","details":[],"value":5.0},{"description":"avgdl, average length of field","details":[],"value":4.1}],"value":0.41709054}],"value":1.8282523}],"value":1.8282523}],"value":1.8282523}],"value":1.8282523}
1.6642681
{"description":"max of:","details":[{"description":"sum of:","details":[{"description":"weight(title:女孩 in 8) [PerFieldSimilarity], result of:","details":[{"description":"score(freq=1.0), computed as boost * idf * tf from:","details":[{"description":"boost","details":[],"value":2.2},{"description":"idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:","details":[{"description":"n, number of documents containing term","details":[],"value":2},{"description":"N, total number of documents with field","details":[],"value":10}],"value":1.4816046},{"description":"tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:","details":[{"description":"freq, occurrences of term within document","details":[],"value":1.0},{"description":"k1, term saturation parameter","details":[],"value":1.2},{"description":"b, length normalization parameter","details":[],"value":0.75},{"description":"dl, length of field","details":[],"value":3.0},{"description":"avgdl, average length of field","details":[],"value":4.1}],"value":0.5105853}],"value":1.6642681}],"value":1.6642681}],"value":1.6642681}],"value":1.6642681}
Loading...
> 此处输出代码运行结果