/* * ===================================================================================== * * Filename: split_tool.h * * Description: split_tool class definition. * * Version: 1.0 * Created: 09/08/2018 * Revision: none * Compiler: gcc * * Author: zhulin, shzhulin3@jd.com * Company: JD.com, Inc. * * ===================================================================================== */ #ifndef __SPLIT_TOOL_H__ #define __SPLIT_TOOL_H__ #include #include #include #include #include #include "utf8_str.h" #include "trainCorpus.h" #include "dtcapi.h" using namespace std; typedef struct { string szIpadrr; unsigned uBid; unsigned uPort; unsigned uWeight; unsigned uStatus; }SDTCroute; typedef struct { string szTablename; string szAccesskey; unsigned uTimeout; unsigned uKeytype; std::vector vecRoute; }SDTCHost; struct WordInfo { WordInfo() { word_id = 0; word_freq = 0; appid = 0; } uint32_t word_id; uint32_t word_freq; uint32_t appid; }; struct RouteValue { double max_route; uint32_t idx; RouteValue() { max_route = 0; idx = 0; } }; class FBSegment { public: FBSegment(); ~FBSegment(){} bool Init(); bool Init2(string train_path); bool Init3(string train_path, string word_path); vector segment(iutf8string &phrase, uint32_t appid); void segment2(iutf8string &phrase, uint32_t appid, vector &vec, string mode = "PrePostNGram", bool Hmm_flag = false); void cut_for_search(iutf8string &phrase, uint32_t appid, vector > &search_res_all, string mode = "PrePostNGram"); void cut_ngram(iutf8string &phrase, vector &search_res, uint32_t n); bool WordValid(string word, uint32_t appid); bool GetWordInfo(string word, uint32_t appid, WordInfo &word_info); bool GetWordInfoFromDictOnly(string word, uint32_t appid, WordInfo &word_info); private: void __cut_DAG_NO_HMM(string senstece, uint32_t appid, vector &vec); void get_DAG(string sentence, uint32_t appid, map > &DAG); void calc(string sentence, const map > &DAG, map &route, uint32_t appid); vector viterbi(string sentence); vector HMM_split(string sentence); double CalSegProbability(const vector &vec); void FMM2(iutf8string &phrase, uint32_t appid, vector &vec); void BMM2(iutf8string &phrase, uint32_t appid, vector &vec); void BMM(iutf8string &phrase, uint32_t appid, vector &vec); vector segment_part(iutf8string &phrase, uint32_t appid); bool isAlphaOrDigit(string str); set common_dict; map > custom_dict; set punct_set; set alpha_set; map > next_dict; uint32_t train_cnt; TrainCorpus train_corpus; map > word_dict; }; #endif