riot 引擎入门
引擎的原理
引擎中处理用户请求、分词、索引和排序分别由不同的协程(goroutines)完成。
1. 主协程,用于收发用户请求
2. 分词器(segmenter)协程,负责分词
3. 索引器(indexer)协程,负责建立和查找索引表
4. 排序器(ranker)协程,负责对文档评分排序
索引流程
搜索流程
文档抓取
type Weibo struct {
Id uint64
Timestamp uint64
UserName string
RepostsCount uint64
Text string
}
索引
import (
"github.com/go-ego/riot"
"github.com/go-ego/riot/types"
)
var searcher riot.Engine
searcher.Init(types.EngineOpts{
GseDict: "data/dict/dictionary.txt",
StopTokenFile: "data/dict/stop_tokens.txt",
IndexerOpts: &types.IndexerOpts{
IndexType: types.LocsIndex,
},
})
searcher.Index(docId, types.DocData{
Content: weibo.Text, // Weibo结构体见上文的定义。必须是UTF-8格式。
Fields: WeiboScoringFields{
Timestamp: weibo.Timestamp,
RepostsCount: weibo.RepostsCount,
},
})
// WeiboScoringFields holds the per-document values that are passed to the
// engine as DocData.Fields and read back by WeiboScoringCriteria.Score.
type WeiboScoringFields struct {
	// Timestamp and RepostsCount are copied from the Weibo being indexed.
	Timestamp, RepostsCount uint64
}
// WeiboScoringCriteria is the custom scoring rule for Weibo documents.
// It carries no state; its behavior lives in the Score method.
type WeiboScoringCriteria struct{}
func (criteria WeiboScoringCriteria) Score {
doc types.IndexedDoc, fields interface{}) []float32 {
if reflect.TypeOf(fields) != reflect.TypeOf(WeiboScoringFields{}) {
return []float32{}
}
wsf := fields.(WeiboScoringFields)
output := make([]float32, 3)
if doc.TokenProximity > MaxTokenProximity { // 第一步
output[0] = 1.0 / float32(doc.TokenProximity)
} else {
output[0] = 1.0
}
output[1] = float32(wsf.Timestamp / (SecondsInADay * 3)) // 第二步
output[2] = float32(doc.BM25 * (1 + float32(wsf.RepostsCount)/10000)) // 第三步
return output
}
response := searcher.Search(types.SearchReq{
Text: "自行车运动",
RankOpts: &types.RankOpts{
ScoringCriteria: &WeiboScoringCriteria{},
OutputOffset: 0,
MaxOutputs: 100,
},
})
显示 总结
文档抓取
type Weibo struct {
Id uint64
Timestamp uint64
UserName string
RepostsCount uint64
Text string
}
索引
import (
"github.com/go-ego/riot"
"github.com/go-ego/riot/types"
)
var searcher riot.Engine
searcher.Init(types.EngineOpts{
GseDict: "data/dict/dictionary.txt",
StopTokenFile: "data/dict/stop_tokens.txt",
IndexerOpts: &types.IndexerOpts{
IndexType: types.LocsIndex,
},
})
searcher.Index(docId, types.DocData{
Content: weibo.Text, // Weibo结构体见上文的定义。必须是UTF-8格式。
Fields: WeiboScoringFields{
Timestamp: weibo.Timestamp,
RepostsCount: weibo.RepostsCount,
},
})
// WeiboScoringFields holds the per-document values that are passed to the
// engine as DocData.Fields and read back by WeiboScoringCriteria.Score.
type WeiboScoringFields struct {
	// Timestamp and RepostsCount are copied from the Weibo being indexed.
	Timestamp, RepostsCount uint64
}
// WeiboScoringCriteria is the custom scoring rule for Weibo documents.
// It carries no state; its behavior lives in the Score method.
type WeiboScoringCriteria struct{}
func (criteria WeiboScoringCriteria) Score {
doc types.IndexedDoc, fields interface{}) []float32 {
if reflect.TypeOf(fields) != reflect.TypeOf(WeiboScoringFields{}) {
return []float32{}
}
wsf := fields.(WeiboScoringFields)
output := make([]float32, 3)
if doc.TokenProximity > MaxTokenProximity { // 第一步
output[0] = 1.0 / float32(doc.TokenProximity)
} else {
output[0] = 1.0
}
output[1] = float32(wsf.Timestamp / (SecondsInADay * 3)) // 第二步
output[2] = float32(doc.BM25 * (1 + float32(wsf.RepostsCount)/10000)) // 第三步
return output
}
response := searcher.Search(types.SearchReq{
Text: "自行车运动",
RankOpts: &types.RankOpts{
ScoringCriteria: &WeiboScoringCriteria{},
OutputOffset: 0,
MaxOutputs: 100,
},
})
显示 总结
import ( "github.com/go-ego/riot" "github.com/go-ego/riot/types" )
var searcher riot.Engine searcher.Init(types.EngineOpts{ GseDict: "data/dict/dictionary.txt", StopTokenFile: "data/dict/stop_tokens.txt", IndexerOpts: &types.IndexerOpts{ IndexType: types.LocsIndex, }, })
// Add one document to the index. The Fields value is what Score receives
// later as its fields argument.
searcher.Index(docId, types.DocData{
	// weibo is a Weibo value (see the struct defined earlier in this
	// tutorial). The content must be UTF-8 encoded.
	Content: weibo.Text,
	Fields: WeiboScoringFields{
		Timestamp:    weibo.Timestamp,
		RepostsCount: weibo.RepostsCount,
	},
})
// WeiboScoringFields holds the per-document values that are passed to the
// engine as DocData.Fields and read back by WeiboScoringCriteria.Score.
type WeiboScoringFields struct {
	// Timestamp is copied from Weibo.Timestamp when the document is indexed.
	Timestamp uint64
	// RepostsCount is copied from Weibo.RepostsCount when the document is indexed.
	RepostsCount uint64
}
type WeiboScoringCriteria struct { } func (criteria WeiboScoringCriteria) Score { doc types.IndexedDoc, fields interface{}) []float32 { if reflect.TypeOf(fields) != reflect.TypeOf(WeiboScoringFields{}) { return []float32{} } wsf := fields.(WeiboScoringFields) output := make([]float32, 3) if doc.TokenProximity > MaxTokenProximity { // 第一步 output[0] = 1.0 / float32(doc.TokenProximity) } else { output[0] = 1.0 } output[1] = float32(wsf.Timestamp / (SecondsInADay * 3)) // 第二步 output[2] = float32(doc.BM25 * (1 + float32(wsf.RepostsCount)/10000)) // 第三步 return output }
// Run the query, scoring with the custom criteria and returning at most the
// first 100 results, starting at offset 0.
response := searcher.Search(types.SearchReq{
	Text: "自行车运动",
	RankOpts: &types.RankOpts{
		ScoringCriteria: &WeiboScoringCriteria{},
		OutputOffset:    0,
		MaxOutputs:      100,
	},
})
显示 总结
读到这里,你应该对使用 riot 引擎进行微博搜索有了基本了解,建议你自己动手将其完成。如果你没有耐心,可以看看已经完成的代码,见 examples/codelab/search_server.go:
examples/codelab/search_server.go// 一个微博搜索的例子。 package main import ( "bufio" "flag" "io" "log" "os" "reflect" "strconv" "strings" "encoding/gob" "encoding/json" "net/http" "os/signal" "github.com/go-ego/riot" "github.com/go-ego/riot/types" ) const ( // SecondsInADay seconds in a day SecondsInADay = 86400 // MaxTokenProximity max token proximity MaxTokenProximity = 2 ) var ( searcher = riot.Engine{} wbs = map[string]Weibo{} weiboData = flag.String("weibo_data", "testdata/weibo_data.txt", "微博数据文件") dictFile = flag.String("dict_file", "data/dict/dictionary.txt", "词典文件") stopTokenFile = flag.String("stop_token_file", "data/dict/stop_tokens.txt", "停用词文件") staticFolder = flag.String("static_folder", "static", "静态文件目录") ) // Weibo weibo json struct type Weibo struct { // Id uint64 `json:"id"` Id string `json:"id"` Timestamp uint64 `json:"timestamp"` UserName string `json:"user_name"` RepostsCount uint64 `json:"reposts_count"` Text string `json:"text"` } /******************************************************************************* 索引 *******************************************************************************/ func indexWeibo() { // 读入微博数据 file, err := os.Open(*weiboData) if err != nil { log.Fatal(err) } defer file.Close() scanner := bufio.NewScanner(file) for scanner.Scan() { data := strings.Split(scanner.Text(), "||||") if len(data) != 10 { continue } wb := Weibo{} // wb.Id, _ = strconv.ParseUint(data[0], 10, 64) wb.Id = data[0] wb.Timestamp, _ = strconv.ParseUint(data[1], 10, 64) wb.UserName = data[3] wb.RepostsCount, _ = strconv.ParseUint(data[4], 10, 64) wb.Text = data[9] wbs[wb.Id] = wb } log.Print("添加索引") for docId, weibo := range wbs { searcher.Index(docId, types.DocData{ Content: weibo.Text, Fields: WeiboScoringFields{ Timestamp: weibo.Timestamp, RepostsCount: weibo.RepostsCount, }, }) } searcher.Flush() log.Printf("索引了%d条微博\n", len(wbs)) } /******************************************************************************* 评分 
*******************************************************************************/ // WeiboScoringFields weibo scoring fields type WeiboScoringFields struct { Timestamp uint64 RepostsCount uint64 } // WeiboScoringCriteria custom weibo scoring criteria type WeiboScoringCriteria struct { } // Score score and sort func (criteria WeiboScoringCriteria) Score( doc types.IndexedDoc, fields interface{}) []float32 { if reflect.TypeOf(fields) != reflect.TypeOf(WeiboScoringFields{}) { return []float32{} } wsf := fields.(WeiboScoringFields) output := make([]float32, 3) if doc.TokenProximity > MaxTokenProximity { output[0] = 1.0 / float32(doc.TokenProximity) } else { output[0] = 1.0 } output[1] = float32(wsf.Timestamp / (SecondsInADay * 3)) output[2] = float32(doc.BM25 * (1 + float32(wsf.RepostsCount)/10000)) return output } /******************************************************************************* JSON-RPC *******************************************************************************/ // JsonResponse json response type JsonResponse struct { Docs []*Weibo `json:"docs"` } // JsonRpcServer json rpc server func JsonRpcServer(w http.ResponseWriter, req *http.Request) { query := req.URL.Query().Get("query") output := searcher.SearchDoc(types.SearchReq{ Text: query, RankOpts: &types.RankOpts{ ScoringCriteria: &WeiboScoringCriteria{}, OutputOffset: 0, MaxOutputs: 100, }, }) // 整理为输出格式 docs := []*Weibo{} for _, doc := range output.Docs { wb := wbs[doc.DocId] wb.Text = doc.Content // for _, t := range output.Tokens { // wb.Text = strings.Replace(wb.Text, t, "<font color=red>"+t+"</font>", -1) // } docs = append(docs, &wb) } response, _ := json.Marshal(&JsonResponse{Docs: docs}) // fmt.Println("response...", response) w.Header().Set("Content-Type", "application/json") io.WriteString(w, string(response)) } /******************************************************************************* 主函数 *******************************************************************************/ func main() { 
// 解析命令行参数 flag.Parse() // 初始化 gob.Register(WeiboScoringFields{}) log.Print("引擎开始初始化") searcher.Init(types.EngineOpts{ Using: 1, GseDict: *dictFile, StopTokenFile: *stopTokenFile, IndexerOpts: &types.IndexerOpts{ IndexType: types.LocsIndex, }, // 如果你希望使用持久存储,启用下面的选项 // 默认使用leveldb持久化,如果你希望修改数据库类型 // 请用 StoreEngine: " " 或者修改 Riot_Store_Engine 环境变量 // UseStore: true, // StoreFolder: "weibo_search", // StoreEngine: "bg", }) log.Println("引擎初始化完毕") wbs = make(map[string]Weibo) // 索引 log.Println("建索引开始") go indexWeibo() log.Println("建索引完毕") // 捕获 ctrl-c c := make(chan os.Signal, 1) signal.Notify(c, os.Interrupt) go func() { for range c { log.Println("捕获Ctrl-c,退出服务器") searcher.Close() os.Exit(0) } }() http.HandleFunc("/json", JsonRpcServer) http.Handle("/", http.FileServer(http.Dir(*staticFolder))) log.Println("服务器启动") log.Fatal(http.ListenAndServe(":8080", nil)) }
如果你想进一步了解 riot 引擎,建议你直接阅读代码。代码的目录结构如下:
riot/        引擎,包括主协程、分词协程、索引器协程和排序器协程的实现
  /core      核心部件,包括索引器和排序器
  /data      字典文件和停用词文件
  /docs      文档
  /engine    引擎,包括主协程、分词协程、索引器协程和排序器协程的实现(已移到主目录)
  /examples  例子和性能测试程序
  /testdata  测试数据
  /types     常用结构体
  /utils     常用函数
声明:本文来自网络,不代表【好得很程序员自学网】立场,转载请注明出处:http://www.haodehen.cn/did18493