diff --git a/src/search_local/index_read/comm.h b/src/search_local/index_read/comm.h index 86c8923..d256828 100644 --- a/src/search_local/index_read/comm.h +++ b/src/search_local/index_read/comm.h @@ -25,6 +25,7 @@ #include #include #include +#include #define DOC_CNT 10000 #define MAX_DOCID_LENGTH 32 @@ -37,6 +38,8 @@ const uint32_t MAX_SEARCH_LEN = 60; const uint32_t SINGLE_WORD_LEN = 18; const uint32_t MAX_VALUE_LEN = 51200; +const double DOUBLE_EPS = 1e-3; + typedef std::tr1::unordered_map hash_double_map; typedef std::tr1::unordered_map hash_string_map; @@ -91,20 +94,20 @@ enum SORTTYPE { }; enum FieldType{ - FIELD_INT = 1, - FIELD_STRING, - FIELD_TEXT, - FIELD_IP, - FIELD_LNG, - FIELD_LAT, - FIELD_GIS, - FIELD_DISTANCE, - FIELD_DOUBLE, - FIELD_LONG, - FIELD_INDEX = 11, - FIELD_LNG_ARRAY, - FIELD_LAT_ARRAY, - FIELD_WKT, + FIELD_INT = 1, + FIELD_STRING = 2, + FIELD_TEXT = 3, + FIELD_IP = 4, + FIELD_GEO_POINT = 5, + FIELD_LAT = 6, + FIELD_GIS = 7, + FIELD_DISTANCE = 8, + FIELD_DOUBLE = 9, + FIELD_LONG = 10, + FIELD_INDEX = 11, + FIELD_LNG_ARRAY = 12, + FIELD_LAT_ARRAY = 13, + FIELD_GEO_SHAPE = 14 }; enum SEGMENTTAG { @@ -251,6 +254,23 @@ enum KeyType INVERTKEY, }; +struct ScoreDocIdNode{ + double d_score; + std::string s_docid; + + ScoreDocIdNode(double score , std::string docid) + : d_score(score) + , s_docid(docid) + { } + + bool operator<(const ScoreDocIdNode& score_docid_node) const { + if (fabs(d_score - score_docid_node.d_score) < DOUBLE_EPS){ + return s_docid.compare(score_docid_node.s_docid) < 0; + } + return (d_score + DOUBLE_EPS) < score_docid_node.d_score; + } +}; + struct IndexInfo { uint32_t appid; std::string doc_id; diff --git a/src/search_local/index_read/process/bool_query_process.cc b/src/search_local/index_read/process/bool_query_process.cc index f7be184..cacee70 100644 --- a/src/search_local/index_read/process/bool_query_process.cc +++ b/src/search_local/index_read/process/bool_query_process.cc @@ -18,9 +18,9 @@ BoolQueryProcess::BoolQueryProcess(const Json::Value& value) query_process_map_.insert(std::make_pair(E_INDEX_READ_TERM , new TermQueryProcess(parse_value_ ))); query_process_map_.insert(std::make_pair(E_INDEX_READ_RANGE - , RangeQueryGenerator::Instance()->GetRangeQueryProcess(E_INDEX_READ_RANGE , parse_value_))); - query_process_map_.insert(std::make_pair(E_INDEX_READ_RANGE_PRE_TERM - , RangeQueryGenerator::Instance()->GetRangeQueryProcess(E_INDEX_READ_RANGE_PRE_TERM , parse_value_))); + , new RangeQueryProcess(parse_value_ ))); + query_process_map_.insert(std::make_pair(E_INDEX_READ_PRE_TERM + , new PreTerminal(parse_value_ ))); } BoolQueryProcess::~BoolQueryProcess() @@ -122,7 +122,7 @@ int BoolQueryProcess::ParseContent(int logic_type){ int BoolQueryProcess::GetValidDoc(){ bool bRet = false; if (component_->TerminalTag()){ - range_query_pre_term_ = dynamic_cast(query_process_map_[E_INDEX_READ_RANGE_PRE_TERM]); + range_query_pre_term_ = dynamic_cast(query_process_map_[E_INDEX_READ_PRE_TERM]); if (range_query_pre_term_ != NULL){ return range_query_pre_term_->GetValidDoc(); } @@ -230,7 +230,7 @@ int BoolQueryProcess::InitQueryProcess(uint32_t type , const Json::Value& value) } else if(value.isMember(RANGE)){ parse_value = parse_value_[RANGE]; if (component_->TerminalTag()){ - query_type = E_INDEX_READ_RANGE_PRE_TERM; + query_type = E_INDEX_READ_PRE_TERM; }else{ query_type = E_INDEX_READ_RANGE; diff --git a/src/search_local/index_read/process/bool_query_process.h b/src/search_local/index_read/process/bool_query_process.h index c20d141..7d14344 100644 --- a/src/search_local/index_read/process/bool_query_process.h +++ b/src/search_local/index_read/process/bool_query_process.h @@ -4,7 +4,7 @@ #include "query_process.h" class RangeQueryProcess; -class RangeQueryPreTerminal; +class PreTerminal; class GeoDistanceQueryProcess; class BoolQueryProcess : public QueryProcess{ @@ -31,7 +31,7 @@ private: private: std::map query_process_map_; RangeQueryProcess* range_query_; - RangeQueryPreTerminal* range_query_pre_term_; + PreTerminal* range_query_pre_term_; GeoDistanceQueryProcess* geo_distance_query_; }; diff --git a/src/search_local/index_read/process/geo_distance_query_process.cc b/src/search_local/index_read/process/geo_distance_query_process.cc index cb8f320..cd0d9b5 100644 --- a/src/search_local/index_read/process/geo_distance_query_process.cc +++ b/src/search_local/index_read/process/geo_distance_query_process.cc @@ -69,7 +69,7 @@ int GeoDistanceQueryProcess::ParseContent(int logic_type){ int GeoDistanceQueryProcess::GetValidDoc(){ std::vector index_info_vet; - int iret = ValidDocFilter::Instance()->MixTextInvertIndexSearch(component_->AndKeys(), index_info_vet + int iret = ValidDocFilter::Instance()->TextInvertIndexSearch(component_->AndKeys(), index_info_vet , high_light_word_, docid_keyinfovet_map_ , key_doccount_map_); if (iret != 0) { return iret; } @@ -86,6 +86,8 @@ int GeoDistanceQueryProcess::GetScore(){ { case SORT_RELEVANCE: case SORT_TIMESTAMP: + case SORT_FIELD_ASC: + case SORT_FIELD_DESC: { hash_double_map::iterator dis_iter = o_distance_.begin(); for(; dis_iter != o_distance_.end(); ++dis_iter){ @@ -93,17 +95,23 @@ int GeoDistanceQueryProcess::GetScore(){ double score = dis_iter->second; if ((o_geo_point_.d_distance > -0.0001 && o_geo_point_.d_distance < 0.0001) || (score + 1e-6 <= o_geo_point_.d_distance)){ - skipList_.InsertNode(score, doc_id.c_str()); + scoredocid_set_.insert(ScoreDocIdNode(score , doc_id)); } } } break; case DONT_SORT: - case SORT_FIELD_ASC: - case SORT_FIELD_DESC: { - // do nothing + hash_double_map::iterator dis_iter = o_distance_.begin(); + for(; dis_iter != o_distance_.end(); ++dis_iter){ + std::string doc_id = dis_iter->first; + if ((o_geo_point_.d_distance > -0.0001 && o_geo_point_.d_distance < 0.0001) + || (dis_iter->second + 1e-6 <= o_geo_point_.d_distance)){ + scoredocid_set_.insert(ScoreDocIdNode(1 , doc_id)); + } + } } + break; default: break; } @@ -113,5 +121,10 @@ int GeoDistanceQueryProcess::GetScore(){ void GeoDistanceQueryProcess::SortScore(int& i_sequence , int& i_rank) { - SortForwardBySkipList(i_sequence , i_rank); + // 默认升序,距离近在前 + if (SORT_FIELD_ASC == component_->SortType()){ + AscSort(i_sequence , i_rank); + }else { // 降序和不排序处理 + DescSort(i_sequence , i_rank); + } } \ No newline at end of file diff --git a/src/search_local/index_read/process/match_query_process.cc b/src/search_local/index_read/process/match_query_process.cc index c2ea5e1..c9352f5 100644 --- a/src/search_local/index_read/process/match_query_process.cc +++ b/src/search_local/index_read/process/match_query_process.cc @@ -62,8 +62,24 @@ int MatchQueryProcess::ParseContent(int logic_type){ int MatchQueryProcess::GetValidDoc(){ std::vector index_info_vet; - int iret = ValidDocFilter::Instance()->PureTextInvertIndexSearch(component_->OrKeys() - , index_info_vet , high_light_word_, docid_keyinfovet_map_); + if (component_->OrKeys().empty()){ + return -RT_GET_FIELD_ERROR; + } + + int iret = 0; + if (SEGMENT_DEFAULT == component_->OrKeys()[FIRST_TEST_INDEX][FIRST_SPLIT_WORD_INDEX].segment_tag){ + iret = ValidDocFilter::Instance()->TextInvertIndexSearch(component_->OrKeys() + , index_info_vet + , high_light_word_ + , docid_keyinfovet_map_ + , key_doccount_map_); + }else{ + iret = ValidDocFilter::Instance()->HanPinTextInvertIndexSearch(component_->OrKeys() + , index_info_vet + , high_light_word_ + , docid_keyinfovet_map_); + } + if (iret != 0) { return iret; } bool bRet = doc_manager_->GetDocContent(index_info_vet , valid_docs_); diff --git a/src/search_local/index_read/process/match_query_process.h b/src/search_local/index_read/process/match_query_process.h index ffbdacf..7c9487f 100644 --- a/src/search_local/index_read/process/match_query_process.h +++ b/src/search_local/index_read/process/match_query_process.h @@ -21,6 +21,9 @@ #include "query_process.h" +#define FIRST_TEST_INDEX 0 +#define FIRST_SPLIT_WORD_INDEX 0 + class MatchQueryProcess: public QueryProcess{ public: MatchQueryProcess(const Json::Value& value); diff --git a/src/search_local/index_read/process/query_process.cc b/src/search_local/index_read/process/query_process.cc index 393b46d..14f25e3 100644 --- a/src/search_local/index_read/process/query_process.cc +++ b/src/search_local/index_read/process/query_process.cc @@ -2,26 +2,20 @@ #include #include "../valid_doc_filter.h" #include "../order_op.h" -#include "../result_cache.h" -#include "cachelist_unit.h" - -extern CCacheListUnit* cachelist; QueryProcess::QueryProcess(const Json::Value& value) : component_(NULL) , doc_manager_(NULL) , request_(NULL) , parse_value_(value) - , skipList_() + , scoredocid_set_() , response_() , valid_docs_() , high_light_word_() , docid_keyinfovet_map_() , key_doccount_map_() , sort_field_type_() -{ - skipList_.InitList(); -} +{ } QueryProcess::~QueryProcess() { } @@ -73,13 +67,13 @@ int QueryProcess::GetScore() { switch (component_->SortType()) { - case SORT_RELEVANCE: + case SORT_RELEVANCE: // 按照相关度得分,并以此排序 { // 范围查的时候如果不指定排序类型,需要在这里对skipList进行赋值 - if (docid_keyinfovet_map_.empty() && skipList_.GetSize() == 0) { + if (docid_keyinfovet_map_.empty() && scoredocid_set_.empty()) { std::set::iterator iter = valid_docs_.begin(); for(; iter != valid_docs_.end(); iter++){ - skipList_.InsertNode(1, (*iter).c_str()); + scoredocid_set_.insert(ScoreDocIdNode(1,*iter)); } break; } @@ -101,11 +95,11 @@ int QueryProcess::GetScore() score += log((DOC_CNT - ui_doc_count + 0.5) / (ui_doc_count + 0.5)) * ((D_BM25_K1 + 1)*ui_word_freq) \ / (D_BM25_K + ui_word_freq) * (D_BM25_K2 + 1) * 1 / (D_BM25_K2 + 1); } - skipList_.InsertNode(score, doc_id.c_str()); + scoredocid_set_.insert(ScoreDocIdNode(score , doc_id)); } } break; - case SORT_TIMESTAMP: + case SORT_TIMESTAMP: // 按照时间戳得分,并以此排序 { std::map::iterator docid_keyinfovet_iter = docid_keyinfovet_map_.begin(); for (; docid_keyinfovet_iter != docid_keyinfovet_map_.end(); ++ docid_keyinfovet_iter){ @@ -121,20 +115,20 @@ int QueryProcess::GetScore() } double score = (double)key_info[0].created_time; - skipList_.InsertNode(score, doc_id.c_str()); + scoredocid_set_.insert(ScoreDocIdNode(score , doc_id)); } } break; - case DONT_SORT: + case DONT_SORT: // 不排序,docid有序 { std::set::iterator valid_docs_iter = valid_docs_.begin(); for(; valid_docs_iter != valid_docs_.end(); valid_docs_iter++){ std::string doc_id = *valid_docs_iter; - skipList_.InsertNode(1, doc_id.c_str()); + scoredocid_set_.insert(ScoreDocIdNode(1 , doc_id)); } } break; - case SORT_FIELD_ASC: + case SORT_FIELD_ASC: // 按照指定字段进行升降排序 case SORT_FIELD_DESC: { std::set::iterator valid_docs_iter = valid_docs_.begin(); @@ -155,12 +149,10 @@ int QueryProcess::GetScore() void QueryProcess::SortScore(int& i_sequence , int& i_rank) { if ((SORT_FIELD_DESC == component_->SortType() || SORT_FIELD_ASC == component_->SortType()) - && 0 == skipList_.GetSize()){ + && scoredocid_set_.empty()){ SortByCOrderOp(i_rank); - }else if(SORT_FIELD_ASC == component_->SortType()){ - SortForwardBySkipList(i_sequence , i_rank); - }else{ - SortBackwardBySkipList(i_sequence, i_rank); + }else{ // 默认降序,分高的在前(地理位置查询除外) + DescSort(i_sequence, i_rank); } } @@ -190,61 +182,54 @@ void QueryProcess::SortByCOrderOp(int& i_rank) } } -void QueryProcess::SortForwardBySkipList(int& i_sequence , int& i_rank) +void QueryProcess::AscSort(int& i_sequence , int& i_rank) { - log_debug("m_has_gis, size:%d ", skipList_.GetSize()); - SkipListNode* tmp = skipList_.GetHeader()->level[0].forward; - + log_debug("m_has_gis, size:%d ", scoredocid_set_.size()); int i_limit_start = component_->PageSize() * (component_->PageIndex() - 1); int i_limit_end = component_->PageSize() * component_->PageIndex() - 1; - while (tmp->level[0].forward != NULL) { + std::set::iterator iter = scoredocid_set_.begin(); + for( ;iter != scoredocid_set_.end(); ++iter){ // 通过extra_filter_keys进行额外过滤(针对区分度不高的字段) - if(doc_manager_->CheckDocByExtraFilterKey(tmp->value) == false){ - log_debug("CheckDocByExtraFilterKey failed, %s", tmp->value); - tmp = tmp->level[0].forward; + if(doc_manager_->CheckDocByExtraFilterKey(iter->s_docid) == false){ + log_debug("CheckDocByExtraFilterKey failed, %s", iter->s_docid); continue; } i_sequence ++; i_rank ++; if(component_->ReturnAll() == 0){ if (i_sequence < i_limit_start || i_sequence > i_limit_end) { - tmp = tmp->level[0].forward; continue; } } Json::Value doc_info; - doc_info["doc_id"] = Json::Value(tmp->value); - doc_info["score"] = Json::Value(tmp->key); + doc_info["doc_id"] = Json::Value(iter->s_docid); + doc_info["score"] = Json::Value(iter->d_score); response_["result"].append(doc_info); - tmp = tmp->level[0].forward; } } -void QueryProcess::SortBackwardBySkipList(int& i_sequence , int& i_rank) +void QueryProcess::DescSort(int& i_sequence , int& i_rank) { int i_limit_start = component_->PageSize() * (component_->PageIndex() - 1); int i_limit_end = component_->PageSize() * component_->PageIndex() - 1; - SkipListNode *tmp = skipList_.GetFooter()->backward; - while(tmp->backward != NULL) { - if(doc_manager_->CheckDocByExtraFilterKey(tmp->value) == false){ - tmp = tmp->backward; + std::set::iterator riter = scoredocid_set_.rbegin(); + for( ;riter != scoredocid_set_.rend(); ++riter){ + if(doc_manager_->CheckDocByExtraFilterKey(riter->s_docid) == false){ continue; } i_sequence++; i_rank++; if (component_->ReturnAll() == 0){ if (i_sequence < i_limit_start || i_sequence > i_limit_end) { - tmp = tmp->backward; continue; } } Json::Value doc_info; - doc_info["doc_id"] = Json::Value(tmp->value); - doc_info["score"] = Json::Value(tmp->key); + doc_info["doc_id"] = Json::Value(riter->s_docid); + doc_info["score"] = Json::Value(riter->d_score); response_["result"].append(doc_info); - tmp = tmp->backward; } } diff --git a/src/search_local/index_read/process/query_process.h b/src/search_local/index_read/process/query_process.h index d197ad4..a1092ff 100644 --- a/src/search_local/index_read/process/query_process.h +++ b/src/search_local/index_read/process/query_process.h @@ -47,7 +47,7 @@ enum E_INDEX_READ_QUERY_PROCESS{ E_INDEX_READ_MATCH, E_INDEX_READ_TERM, E_INDEX_READ_RANGE, - E_INDEX_READ_RANGE_PRE_TERM + E_INDEX_READ_PRE_TERM }; class QueryProcess{ @@ -81,8 +81,8 @@ protected: protected: void SortByCOrderOp(int& i_rank); - void SortForwardBySkipList(int& i_sequence , int& i_rank); - void SortBackwardBySkipList(int& i_sequence , int& i_rank); + void AscSort(int& i_sequence , int& i_rank); + void DescSort(int& i_sequence , int& i_rank); void AppendHighLightWord(); protected: @@ -91,7 +91,7 @@ protected: CTaskRequest* request_; Json::Value parse_value_; - SkipList skipList_; + std::set scoredocid_set_; Json::Value response_; ValidDocSet valid_docs_; diff --git a/src/search_local/index_read/process/range_query_process.cc b/src/search_local/index_read/process/range_query_process.cc index 4b47260..4d7afeb 100644 --- a/src/search_local/index_read/process/range_query_process.cc +++ b/src/search_local/index_read/process/range_query_process.cc @@ -106,15 +106,25 @@ int RangeQueryProcess::GetValidDoc() -RangeQueryPreTerminal::RangeQueryPreTerminal(const Json::Value& value) - : RangeQueryProcess(value) +PreTerminal::PreTerminal(const Json::Value& value) + : QueryProcess(value) , candidate_doc_() {} -RangeQueryPreTerminal::~RangeQueryPreTerminal() +PreTerminal::~PreTerminal() {} -int RangeQueryPreTerminal::GetValidDoc(){ +int PreTerminal::ParseContent(int logic_type){ + log_info("PreTerminal do not need parse content"); + return 0; + } + +int PreTerminal::ParseContent(){ + log_info("PreTerminal do not need parse content"); + return 0; + } + +int PreTerminal::GetValidDoc(){ uint32_t count = 0; uint32_t N = 2; uint32_t limit_start = 0; @@ -162,12 +172,12 @@ int RangeQueryPreTerminal::GetValidDoc(){ return 0; } -int RangeQueryPreTerminal::GetScore(){ +int PreTerminal::GetScore(){ log_info("RangeQueryPreTerminal do not need get score"); return 0; } -void RangeQueryPreTerminal::SetResponse(){ +void PreTerminal::SetResponse(){ response_["code"] = 0; int sequence = -1; int rank = 0; diff --git a/src/search_local/index_read/process/range_query_process.h b/src/search_local/index_read/process/range_query_process.h index 346885f..e68ff0a 100644 --- a/src/search_local/index_read/process/range_query_process.h +++ b/src/search_local/index_read/process/range_query_process.h @@ -25,12 +25,16 @@ private: friend class BoolQueryProcess; }; -class RangeQueryPreTerminal : public RangeQueryProcess{ +class PreTerminal : public QueryProcess{ public: - RangeQueryPreTerminal(const Json::Value& value); - virtual~ RangeQueryPreTerminal(); + PreTerminal(const Json::Value& value); + virtual~ PreTerminal(); + +public: + virtual int ParseContent(int logic_type); private: + virtual int ParseContent(); virtual int GetValidDoc(); virtual int GetScore(); virtual void SetResponse(); @@ -42,39 +46,39 @@ private: friend class BoolQueryProcess; }; -class RangeQueryGenerator : private noncopyable{ -public: - RangeQueryGenerator() { }; - virtual~ RangeQueryGenerator() { }; +// class RangeQueryGenerator : private noncopyable{ +// public: +// RangeQueryGenerator() { }; +// virtual~ RangeQueryGenerator() { }; -public: - static RangeQueryGenerator* Instance(){ - return CSingleton::Instance(); - }; +// public: +// static RangeQueryGenerator* Instance(){ +// return CSingleton::Instance(); +// }; - static void Destroy(){ - CSingleton::Destroy(); - }; +// static void Destroy(){ +// CSingleton::Destroy(); +// }; -public: - // 内存释放由调用方处理 - QueryProcess* GetRangeQueryProcess(int iType , const Json::Value& parse_value){ - QueryProcess* current_range_query = NULL; - switch (iType){ - case E_INDEX_READ_RANGE:{ - current_range_query = new RangeQueryProcess(parse_value); - } - break; - case E_INDEX_READ_RANGE_PRE_TERM:{ - current_range_query = new RangeQueryPreTerminal(parse_value); - } - break; - default: - break; - } +// public: +// // 内存释放由调用方处理 +// QueryProcess* GetRangeQueryProcess(int iType , const Json::Value& parse_value){ +// QueryProcess* current_range_query = NULL; +// switch (iType){ +// case E_INDEX_READ_RANGE:{ +// current_range_query = new RangeQueryProcess(parse_value); +// } +// break; +// case E_INDEX_READ_PRE_TERM:{ +// current_range_query = new PreTerminal(parse_value); +// } +// break; +// default: +// break; +// } - return current_range_query; - } -}; +// return current_range_query; +// } +// }; #endif \ No newline at end of file diff --git a/src/search_local/index_read/process/term_query_process.cc b/src/search_local/index_read/process/term_query_process.cc index 144eb06..14dde1e 100644 --- a/src/search_local/index_read/process/term_query_process.cc +++ b/src/search_local/index_read/process/term_query_process.cc @@ -48,7 +48,7 @@ int TermQueryProcess::ParseContent(int logic_type){ int TermQueryProcess::GetValidDoc(){ std::vector index_info_vet; - int iret = ValidDocFilter::Instance()->MixTextInvertIndexSearch(component_->OrKeys() + int iret = ValidDocFilter::Instance()->TextInvertIndexSearch(component_->OrKeys() , index_info_vet , high_light_word_, docid_keyinfovet_map_ , key_doccount_map_); if (iret != 0) { return iret; } diff --git a/src/search_local/index_read/search_task.cc b/src/search_local/index_read/search_task.cc index a847cf7..5f35ab2 100644 --- a/src/search_local/index_read/search_task.cc +++ b/src/search_local/index_read/search_task.cc @@ -76,13 +76,7 @@ int SearchTask::Process(CTaskRequest *request) }else if (query.isMember(GEOSHAPE)){ query_process_ = new GeoShapeQueryProcess(query[GEOSHAPE]); }else if (query.isMember(RANGE)){ - if (component_->TerminalTag()){ - query_process_ = RangeQueryGenerator::Instance()->GetRangeQueryProcess(E_INDEX_READ_RANGE_PRE_TERM - , query[RANGE]); - }else{ - query_process_ = RangeQueryGenerator::Instance()->GetRangeQueryProcess(E_INDEX_READ_RANGE - , query[RANGE]); - } + query_process_ = new RangeQueryProcess(query[RANGE]); }else if (query.isMember(BOOL)){ query_process_ = new BoolQueryProcess(query[BOOL]); }else{ diff --git a/src/search_local/index_read/search_util.cc b/src/search_local/index_read/search_util.cc index 2f8556a..07ca71b 100644 --- a/src/search_local/index_read/search_util.cc +++ b/src/search_local/index_read/search_util.cc @@ -40,1490 +40,1453 @@ using namespace std; string initial_table[] = { "b","p","m","f","d","t","n","l","g","k","h","j","q","x","zh","ch","sh","r","z","c","s","y","w" }; typedef struct { - char syllable[8]; - char small_syllable[2][8]; + char syllable[8]; + char small_syllable[2][8]; }SMALLSYLLABLEITEM; static SMALLSYLLABLEITEM small_syllable_items[] = { - {"ao", {"a", "o"}}, - {"bian", {"bi", "an"}}, - {"yue", {"yu", "e"}}, - {"biao", {"bi", "ao"}}, - {"cuan", {"cu", "an"}}, - {"dia", {"di", "a"}}, - {"dian", {"di", "an"}}, - {"diao", {"di", "ao"}}, - {"die", {"di", "e"}}, - {"duan", {"du", "an"}}, - {"gua", {"gu", "a"}}, - {"guai", {"gu", "ai"}}, - {"guan", {"gu", "an"}}, - {"hua", {"hu", "a"}}, - {"huai", {"hu", "ai"}}, - {"huan", {"hu", "an"}}, - {"huo", {"hu", "o"}}, - {"jia", {"ji", "a"}}, - {"jian", {"ji", "an"}}, - {"jiang", {"ji", "ang"}}, - {"jiao", {"ji", "ao"}}, - {"jie", {"ji", "e"}}, - {"jue", {"ju", "e"}}, - {"juan", {"ju", "an"}}, - {"kua", {"ku", "a"}}, - {"kuai", {"ku", "ai"}}, - {"kuo", {"ku", "o"}}, - {"lia", {"li", "a"}}, - {"lian", {"li", "an"}}, - {"liang", {"li", "ang"}}, - {"liao", {"li", "ao"}}, - {"lie", {"li", "e"}}, - {"luan", {"lu", "an"}}, - {"mian", {"mi", "an"}}, - {"miao", {"mi", "ao"}}, - {"mie", {"mi", "e"}}, - {"nao", {"na", "o"}}, - {"nian", {"ni", "an"}}, - {"niao", {"ni", "ao"}}, - {"nie", {"ni", "e"}}, - {"pian", {"pi", "an"}}, - {"piao", {"pi", "ao"}}, - {"pie", {"pi", "e"}}, - {"qia", {"qi", "a"}}, - {"qian", {"qi", "an"}}, - {"qiang", {"qi", "ang"}}, - {"qiao", {"qi", "ao"}}, - {"qie", {"qi", "e"}}, - {"quan", {"qu", "an"}}, - {"que", {"qu", "e"}}, - {"suan", {"su", "an"}}, - {"shuan", {"shu", "an"}}, - {"shuang", {"shu", "ang"}}, - {"shuo", {"shu", "o"}}, - {"tian", {"ti", "an"}}, - {"tie", {"ti", "e"}}, - {"tuan", {"tu", "an"}}, - {"tuo", {"tu", "o"}}, - {"xia", {"xi", "a"}}, - {"xian", {"xi", "an"}}, - {"xiao", {"xi", "ao"}}, - {"xiang", {"xi", "ang"}}, - {"xie", {"xi", "e"}}, - {"xuan", {"xu", "an"}}, - {"xue", {"xu", "e"}}, - {"yao", {"ya", "o"}}, - {"yuan", {"yu", "an"}}, - {"yue", {"yu", "e"}}, - {"zao", {"za", "o"}}, - {"zuan", {"zu", "an"}}, - {"zhao", {"zha", "o"}}, - {"zhua", {"zhu", "a"}}, - {"zhuan", {"zhu", "an"}} + {"ao", {"a", "o"}}, + {"bian", {"bi", "an"}}, + {"yue", {"yu", "e"}}, + {"biao", {"bi", "ao"}}, + {"cuan", {"cu", "an"}}, + {"dia", {"di", "a"}}, + {"dian", {"di", "an"}}, + {"diao", {"di", "ao"}}, + {"die", {"di", "e"}}, + {"duan", {"du", "an"}}, + {"gua", {"gu", "a"}}, + {"guai", {"gu", "ai"}}, + {"guan", {"gu", "an"}}, + {"hua", {"hu", "a"}}, + {"huai", {"hu", "ai"}}, + {"huan", {"hu", "an"}}, + {"huo", {"hu", "o"}}, + {"jia", {"ji", "a"}}, + {"jian", {"ji", "an"}}, + {"jiang", {"ji", "ang"}}, + {"jiao", {"ji", "ao"}}, + {"jie", {"ji", "e"}}, + {"jue", {"ju", "e"}}, + {"juan", {"ju", "an"}}, + {"kua", {"ku", "a"}}, + {"kuai", {"ku", "ai"}}, + {"kuo", {"ku", "o"}}, + {"lia", {"li", "a"}}, + {"lian", {"li", "an"}}, + {"liang", {"li", "ang"}}, + {"liao", {"li", "ao"}}, + {"lie", {"li", "e"}}, + {"luan", {"lu", "an"}}, + {"mian", {"mi", "an"}}, + {"miao", {"mi", "ao"}}, + {"mie", {"mi", "e"}}, + {"nao", {"na", "o"}}, + {"nian", {"ni", "an"}}, + {"niao", {"ni", "ao"}}, + {"nie", {"ni", "e"}}, + {"pian", {"pi", "an"}}, + {"piao", {"pi", "ao"}}, + {"pie", {"pi", "e"}}, + {"qia", {"qi", "a"}}, + {"qian", {"qi", "an"}}, + {"qiang", {"qi", "ang"}}, + {"qiao", {"qi", "ao"}}, + {"qie", {"qi", "e"}}, + {"quan", {"qu", "an"}}, + {"que", {"qu", "e"}}, + {"suan", {"su", "an"}}, + {"shuan", {"shu", "an"}}, + {"shuang", {"shu", "ang"}}, + {"shuo", {"shu", "o"}}, + {"tian", {"ti", "an"}}, + {"tie", {"ti", "e"}}, + {"tuan", {"tu", "an"}}, + {"tuo", {"tu", "o"}}, + {"xia", {"xi", "a"}}, + {"xian", {"xi", "an"}}, + {"xiao", {"xi", "ao"}}, + {"xiang", {"xi", "ang"}}, + {"xie", {"xi", "e"}}, + {"xuan", {"xu", "an"}}, + {"xue", {"xu", "e"}}, + {"yao", {"ya", "o"}}, + {"yuan", {"yu", "an"}}, + {"yue", {"yu", "e"}}, + {"zao", {"za", "o"}}, + {"zuan", {"zu", "an"}}, + {"zhao", {"zha", "o"}}, + {"zhua", {"zhu", "a"}}, + {"zhuan", {"zhu", "an"}} }; bool IsSmallSyllable(string str, string &syllable1, string &syllable2) { - int num = sizeof(small_syllable_items) / sizeof(SMALLSYLLABLEITEM); - for (int i = 0; i < num; i++) { - if (strcmp(small_syllable_items[i].syllable, str.c_str()) == 0) { - syllable1 = (small_syllable_items[i].small_syllable)[0]; - syllable2 = (small_syllable_items[i].small_syllable)[1]; - return true; - } - } - return false; + int num = sizeof(small_syllable_items) / sizeof(SMALLSYLLABLEITEM); + for (int i = 0; i < num; i++) { + if (strcmp(small_syllable_items[i].syllable, str.c_str()) == 0) { + syllable1 = (small_syllable_items[i].small_syllable)[0]; + syllable2 = (small_syllable_items[i].small_syllable)[1]; + return true; + } + } + return false; } bool operator<(const Content &a, const Content &b) { - if (a.type > b.type) { - return false; - } - else { - return a.str < b.str; - } + if (a.type > b.type) { + return false; + } + else { + return a.str < b.str; + } } string readFileIntoString(char * filename) { - ifstream ifile(filename); - //将文件读入到ostringstream对象buf中 - ostringstream buf; - char ch; - while (buf && ifile.get(ch)) - buf.put(ch); - //返回与流对象buf关联的字符串 - return buf.str(); + ifstream ifile(filename); + //将文件读入到ostringstream对象buf中 + ostringstream buf; + char ch; + while (buf && ifile.get(ch)) + buf.put(ch); + //返回与流对象buf关联的字符串 + return buf.str(); } vector splitInt(const string& src, string separate_character) { - vector strs; + vector strs; - //分割字符串的长度,这样就可以支持如“,,”多字符串的分隔符 - int separate_characterLen = separate_character.size(); - int lastPosition = 0, index = -1; - string str; - int pos = 0; - while (-1 != (index = src.find(separate_character, lastPosition))) - { - if (src.substr(lastPosition, index - lastPosition) != " ") { - str = src.substr(lastPosition, index - lastPosition); - pos = atoi(str.c_str()); - strs.push_back(pos); - } - lastPosition = index + separate_characterLen; - } - string lastString = src.substr(lastPosition);//截取最后一个分隔符后的内容 - if (!lastString.empty() && lastString != " ") - pos = atoi(lastString.c_str()); - strs.push_back(pos);//如果最后一个分隔符后还有内容就入队 - return strs; + //分割字符串的长度,这样就可以支持如“,,”多字符串的分隔符 + int separate_characterLen = separate_character.size(); + int lastPosition = 0, index = -1; + string str; + int pos = 0; + while (-1 != (index = src.find(separate_character, lastPosition))) + { + if (src.substr(lastPosition, index - lastPosition) != " ") { + str = src.substr(lastPosition, index - lastPosition); + pos = atoi(str.c_str()); + strs.push_back(pos); + } + lastPosition = index + separate_characterLen; + } + string lastString = src.substr(lastPosition);//截取最后一个分隔符后的内容 + if (!lastString.empty() && lastString != " ") + pos = atoi(lastString.c_str()); + strs.push_back(pos);//如果最后一个分隔符后还有内容就入队 + return strs; } set splitStr(const string& src, string separate_character) { - set strs; + set strs; - //分割字符串的长度,这样就可以支持如“,,”多字符串的分隔符 - int separate_characterLen = separate_character.size(); - int lastPosition = 0, index = -1; - while (-1 != (index = src.find(separate_character, lastPosition))) - { - if (src.substr(lastPosition, index - lastPosition) != " ") { - strs.insert(src.substr(lastPosition, index - lastPosition)); - } - lastPosition = index + separate_characterLen; - } - string lastString = src.substr(lastPosition);//截取最后一个分隔符后的内容 - if (!lastString.empty() && lastString != " ") - strs.insert(lastString);//如果最后一个分隔符后还有内容就入队 - return strs; + //分割字符串的长度,这样就可以支持如“,,”多字符串的分隔符 + int separate_characterLen = separate_character.size(); + int lastPosition = 0, index = -1; + while (-1 != (index = src.find(separate_character, lastPosition))) + { + if (src.substr(lastPosition, index - lastPosition) != " ") { + strs.insert(src.substr(lastPosition, index - lastPosition)); + } + lastPosition = index + separate_characterLen; + } + string lastString = src.substr(lastPosition);//截取最后一个分隔符后的内容 + if (!lastString.empty() && lastString != " ") + strs.insert(lastString);//如果最后一个分隔符后还有内容就入队 + return strs; } bool CheckWordContinus(const vector &word_vec, map > &pos_map) { - if (word_vec.size() < 2) { - return true; - } + if (word_vec.size() < 2) { + return true; + } - size_t i = 0; - string word_before = ""; - string word_after = ""; - bool all_match = true; - for (; i < word_vec.size() - 1; i++) { - word_before = word_vec[i]; - word_after = word_vec[i + 1]; - vector before_vec; - vector after_vec; - if (pos_map.find(word_before) != pos_map.end()) { - before_vec = pos_map[word_before]; - } - else { - all_match = false; - break; - } - if (pos_map.find(word_after) != pos_map.end()) { - after_vec = pos_map[word_after]; - } - else { - all_match = false; - break; - } + size_t i = 0; + string word_before = ""; + string word_after = ""; + bool all_match = true; + for (; i < word_vec.size() - 1; i++) { + word_before = word_vec[i]; + word_after = word_vec[i + 1]; + vector before_vec; + vector after_vec; + if (pos_map.find(word_before) != pos_map.end()) { + before_vec = pos_map[word_before]; + } + else { + all_match = false; + break; + } + if (pos_map.find(word_after) != pos_map.end()) { + after_vec = pos_map[word_after]; + } + else { + all_match = false; + break; + } - size_t j = 0; - size_t k = 0; - int before_pos = 0; - int after_pos = 0; - bool match_flag = false; - for (j = 0; j < before_vec.size(); j++) { - if (match_flag == true) { - break; - } - before_pos = before_vec[j]; - for (k = 0; k < after_vec.size(); k++) { - after_pos = after_vec[k]; - if (after_pos - before_pos <= 2) { - match_flag = true; - break; - } - } - } + size_t j = 0; + size_t k = 0; + int before_pos = 0; + int after_pos = 0; + bool match_flag = false; + for (j = 0; j < before_vec.size(); j++) { + if (match_flag == true) { + break; + } + before_pos = before_vec[j]; + for (k = 0; k < after_vec.size(); k++) { + after_pos = after_vec[k]; + if (after_pos - before_pos <= 2) { + match_flag = true; + break; + } + } + } - if (match_flag == false) { - all_match = false; - break; - } - } + if (match_flag == false) { + all_match = false; + break; + } + } - if (all_match == true) { - return true; - } - else { - return false; - } + if (all_match == true) { + return true; + } + else { + return false; + } } void split_func(string pinyin, string &split_str, int type) { - int i = 0; - stringstream result; - for (i = 0; i<(int)pinyin.size(); i++) - { - if (strchr("aeiouv", pinyin.at(i))) - { - result << pinyin.at(i); - continue; - } - else - { - if (pinyin.at(i) != 'n') //不是n从该辅音前分开 - { - if (i == 0) - { - result << pinyin.at(i); - } - else - { - result << ' ' << pinyin.at(i); - } - if ((i + 1) < (int)pinyin.size() && (pinyin.at(i) == 'z' || pinyin.at(i) == 'c' || pinyin.at(i) == 's') && - (pinyin.at(i + 1) == 'h')) - { - if (type == 1) { - result << 'h'; - } - else { - result << " h"; - } - i++; - } - continue; - } - else //是n,继续向后 - { - if (i == (int)pinyin.size() - 1) - { - result << pinyin.at(i); - continue; - } - else - i++; //继续向后 + int i = 0; + stringstream result; + for (i = 0; i<(int)pinyin.size(); i++) + { + if (strchr("aeiouv", pinyin.at(i))) + { + result << pinyin.at(i); + continue; + } + else + { + if (pinyin.at(i) != 'n') //不是n从该辅音前分开 + { + if (i == 0) + { + result << pinyin.at(i); + } + else + { + result << ' ' << pinyin.at(i); + } + if ((i + 1) < (int)pinyin.size() && (pinyin.at(i) == 'z' || pinyin.at(i) == 'c' || pinyin.at(i) == 's') && + (pinyin.at(i + 1) == 'h')) + { + if (type == 1) { + result << 'h'; + } + else { + result << " h"; + } + i++; + } + continue; + } + else //是n,继续向后 + { + if (i == (int)pinyin.size() - 1) + { + result << pinyin.at(i); + continue; + } + else + i++; //继续向后 - if (strchr("aeiouv", pinyin.at(i))) //如果是元音,从n前分开 - { - if (i == 1) - { - result << 'n' << pinyin.at(i); - continue; - } - else - { - result << ' ' << 'n' << pinyin.at(i); - continue; - } - } - //如果是辅音字母 - else - { - if (pinyin.at(i) == 'g') - { - if (i == (int)pinyin.size() - 1) - { - result << 'n' << pinyin.at(i); - continue; - } - else - i++; //继续向后 + if (strchr("aeiouv", pinyin.at(i))) //如果是元音,从n前分开 + { + if (i == 1) + { + result << 'n' << pinyin.at(i); + continue; + } + else + { + result << ' ' << 'n' << pinyin.at(i); + continue; + } + } + //如果是辅音字母 + else + { + if (pinyin.at(i) == 'g') + { + if (i == (int)pinyin.size() - 1) + { + result << 'n' << pinyin.at(i); + continue; + } + else + i++; //继续向后 - if (strchr("aeiouv", pinyin.at(i))) - { - result << 'n' << ' ' << 'g' << pinyin.at(i); - continue; - } - else - { - result << 'n' << 'g' << ' ' << pinyin.at(i); - if ((i + 1) < (int)pinyin.size() && (pinyin.at(i) == 'z' || pinyin.at(i) == 'c' || pinyin.at(i) == 's') && - (pinyin.at(i + 1) == 'h')) - { - if (type == 1) { - result << 'h'; - } - else { - result <<" h"; - } - i++; - } - continue; - } - } - else //不是g的辅音字母,从n后分开 - { - result << 'n' << ' ' << pinyin.at(i); - if ((i + 1) < (int)pinyin.size() && (pinyin.at(i) == 'z' || pinyin.at(i) == 'c' || pinyin.at(i) == 's') && - (pinyin.at(i + 1) == 'h')) - { - if (type == 1) { - result << 'h'; - } - else { - result << " h"; - } - i++; - } - continue; - } - } - } - } - } - split_str = result.str(); + if (strchr("aeiouv", pinyin.at(i))) + { + result << 'n' << ' ' << 'g' << pinyin.at(i); + continue; + } + else + { + result << 'n' << 'g' << ' ' << pinyin.at(i); + if ((i + 1) < (int)pinyin.size() && (pinyin.at(i) == 'z' || pinyin.at(i) == 'c' || pinyin.at(i) == 's') && + (pinyin.at(i + 1) == 'h')) + { + if (type == 1) { + result << 'h'; + } + else { + result <<" h"; + } + i++; + } + continue; + } + } + else //不是g的辅音字母,从n后分开 + { + result << 'n' << ' ' << pinyin.at(i); + if ((i + 1) < (int)pinyin.size() && (pinyin.at(i) == 'z' || pinyin.at(i) == 'c' || pinyin.at(i) == 's') && + (pinyin.at(i + 1) == 'h')) + { + if (type == 1) { + result << 'h'; + } + else { + result << " h"; + } + i++; + } + continue; + } + } + } + } + } + split_str = result.str(); } void FMM(string &str, vector &vec) { - int maxLen = 6; - int len_phrase = str.length(); - int i = 0, j = 0; + int maxLen = 6; + int len_phrase = str.length(); + int i = 0, j = 0; - while (i < len_phrase) { - int end = i + maxLen; - if (end >= len_phrase) { - end = len_phrase; - } - string sub = str.substr(i, end - i); - for (j = sub.length(); j >= 0; j--) { - if (j == 1) - break; - string key = sub.substr(0, j); - if (DataManager::Instance()->IsPhonetic(key) == true) { - vec.push_back(key); - i += key.length() - 1; - break; - } - } - if (j == 1) { - vec.push_back(string(1, sub[0])); - } - i += 1; - } - return; + while (i < len_phrase) { + int end = i + maxLen; + if (end >= len_phrase) { + end = len_phrase; + } + string sub = str.substr(i, end - i); + for (j = sub.length(); j >= 0; j--) { + if (j == 1) + break; + string key = sub.substr(0, j); + if (DataManager::Instance()->IsPhonetic(key) == true) { + vec.push_back(key); + i += key.length() - 1; + break; + } + } + if (j == 1) { + vec.push_back(string(1, sub[0])); + } + i += 1; + } + return; } set sets_intersection(set v1, set v2) { - set v; - set_intersection(v1.begin(), v1.end(), v2.begin(), v2.end(), inserter(v, v.begin()));//求交集 - return v; + set v; + set_intersection(v1.begin(), v1.end(), v2.begin(), v2.end(), inserter(v, v.begin()));//求交集 + return v; } set sets_intersection(set v1, set v2) { - set v; - set_intersection(v1.begin(), v1.end(), v2.begin(), v2.end(), inserter(v, v.begin()));//求交集 - return v; + set v; + set_intersection(v1.begin(), v1.end(), v2.begin(), v2.end(), inserter(v, v.begin()));//求交集 + return v; } set sets_union(set v1, set v2) { - set v; - set_union(v1.begin(), v1.end(), v2.begin(), v2.end(), inserter(v, v.begin()));//求并集 - return v; + set v; + set_union(v1.begin(), v1.end(), v2.begin(), v2.end(), inserter(v, v.begin()));//求并集 + return v; } set sets_difference(set v1, set v2) { - set v; - set_difference(v1.begin(), v1.end(), v2.begin(), v2.end(), inserter(v, v.begin()));//求差集 - return v; + set v; + set_difference(v1.begin(), v1.end(), v2.begin(), v2.end(), inserter(v, v.begin()));//求差集 + return v; } int GetMultipleWords(string m_Data, set >& cvset) { - vector result; - set initial_vec(initial_table, initial_table + 23); - iutf8string utf8_str(m_Data); - int i = 0; - bool isAllChinese = true; - for (; i < utf8_str.length(); ) { - if (utf8_str[i].size() > 1) { - Content content; - content.type = CHINESE; - content.str = utf8_str[i]; - result.push_back(content); - i++; - } - else { - isAllChinese = false; - string tmp = utf8_str[i]; - int j = 1; - for (; j < utf8_str.length() - i; j++) { - if (utf8_str[i + j].size() > 1) { - break; - } - tmp += utf8_str[i + j]; - } - i += j; - string split_str; - split_func(tmp, split_str); - vector vec = splitEx(split_str, " "); - vector::iterator iter = vec.begin(); - for (; iter != vec.end(); iter++) { - Content content; - content.str = *iter; - if (initial_vec.find(*iter) != initial_vec.end()) { - content.type = INITIAL; - } - else if ((*iter).size() == 1) { - content.type = INITIAL; - } - else { - content.type = WHOLE_SPELL; - } - result.push_back(content); - } - } - } - cvset.insert(result); - if (isAllChinese == false) { - // 歧义拼音继续拆分 yue --> yu e - vector syllableVec; - vector::iterator iter = result.begin(); - for (; iter != result.end(); iter++) { - Content content = *iter; - string syllable1; - string syllable2; - if (content.type == WHOLE_SPELL) { - bool flag = IsSmallSyllable(content.str, syllable1, syllable2); - if (flag == true) { - Content content1; - Content content2; - content1.str = syllable1; - content1.type = WHOLE_SPELL; - content2.str = syllable2; - content2.type = WHOLE_SPELL; - syllableVec.push_back(content1); - syllableVec.push_back(content2); - } - else { - syllableVec.push_back(content); - } - } - else { - syllableVec.push_back(content); - } - } - cvset.insert(syllableVec); + vector result; + set initial_vec(initial_table, initial_table + 23); + iutf8string utf8_str(m_Data); + int i = 0; + bool isAllChinese = true; + for (; i < utf8_str.length(); ) { + if (utf8_str[i].size() > 1) { + Content content; + content.type = CHINESE; + content.str = utf8_str[i]; + result.push_back(content); + i++; + } + else { + isAllChinese = false; + string tmp = utf8_str[i]; + int j = 1; + for (; j < utf8_str.length() - i; j++) { + if (utf8_str[i + j].size() > 1) { + break; + } + tmp += utf8_str[i + j]; + } + i += j; + string split_str; + split_func(tmp, split_str); + vector vec = splitEx(split_str, " "); + vector::iterator iter = vec.begin(); + for (; iter != vec.end(); iter++) { + Content content; + content.str = *iter; + if (initial_vec.find(*iter) != initial_vec.end()) { + content.type = INITIAL; + } + else if ((*iter).size() == 1) { + content.type = INITIAL; + } + else { + content.type = WHOLE_SPELL; + } + result.push_back(content); + } + } + } + cvset.insert(result); + if (isAllChinese == false) { + // 歧义拼音继续拆分 yue --> yu e + vector syllableVec; + vector::iterator iter = result.begin(); + for (; iter != result.end(); iter++) { + Content content = *iter; + string syllable1; + string syllable2; + if (content.type == WHOLE_SPELL) { + bool flag = IsSmallSyllable(content.str, syllable1, syllable2); + if (flag == true) { + Content content1; + Content content2; + content1.str = syllable1; + content1.type = WHOLE_SPELL; + content2.str = syllable2; + content2.type = WHOLE_SPELL; + syllableVec.push_back(content1); + syllableVec.push_back(content2); + } + else { + syllableVec.push_back(content); + } + } + else { + syllableVec.push_back(content); + } + } + cvset.insert(syllableVec); - // 将zh ch sh切分开 - vector cvec; - for (i = 0; i < utf8_str.length(); ) { - if (utf8_str[i].size() > 1) { - Content content; - content.type = CHINESE; - content.str = utf8_str[i]; - cvec.push_back(content); - i++; - } - else { - isAllChinese = false; - string tmp = utf8_str[i]; - int j = 1; - for (; j < utf8_str.length() - i; j++) { - if (utf8_str[i + j].size() > 1) { - break; - } - tmp += utf8_str[i + j]; - } - i += j; - string split_str; - split_func(tmp, split_str, 2); - vector vec = splitEx(split_str, " "); - vector::iterator iter = vec.begin(); - for (; iter != vec.end(); iter++) { - Content content; - content.str = *iter; - if (initial_vec.find(*iter) != initial_vec.end()) { - content.type = INITIAL; - } - else if ((*iter).size() == 1) { - content.type = INITIAL; - } - else { - content.type = WHOLE_SPELL; - } - cvec.push_back(content); - } - } - } - cvset.insert(cvec); + // 将zh ch sh切分开 + vector cvec; + for (i = 0; i < utf8_str.length(); ) { + if (utf8_str[i].size() > 1) { + Content content; + content.type = CHINESE; + content.str = utf8_str[i]; + cvec.push_back(content); + i++; + } + else { + isAllChinese = false; + string tmp = utf8_str[i]; + int j = 1; + for (; j < utf8_str.length() - i; j++) { + if (utf8_str[i + j].size() > 1) { + break; + } + tmp += utf8_str[i + j]; + } + i += j; + string split_str; + split_func(tmp, split_str, 2); + vector vec = splitEx(split_str, " "); + vector::iterator iter = vec.begin(); + for (; iter != vec.end(); iter++) { + Content content; + content.str = *iter; + if (initial_vec.find(*iter) != initial_vec.end()) { + content.type = INITIAL; + } + else if ((*iter).size() == 1) { + content.type = INITIAL; + } + else { + content.type = WHOLE_SPELL; + } + cvec.push_back(content); + } + } + } + cvset.insert(cvec); - // 最大匹配进行拼音拆分,对于wuenda这种可以有效支持 - vector fmmVec; - for (i = 0; i < utf8_str.length(); ) { - if (utf8_str[i].size() > 1) { - Content content; - content.type = CHINESE; - content.str = utf8_str[i]; - fmmVec.push_back(content); - i++; - } - else { - isAllChinese = false; - string tmp = utf8_str[i]; - int j = 1; - for (; j < utf8_str.length() - i; j++) { - if (utf8_str[i + j].size() > 1) { - break; - } - tmp += utf8_str[i + j]; - } - i += j; - vector vec; - FMM(tmp, vec); - vector::iterator iter = vec.begin(); - for (; iter != vec.end(); iter++) { - Content content; - content.str = *iter; - if (initial_vec.find(*iter) != initial_vec.end()) { - content.type = INITIAL; - } - else if ((*iter).size() == 1) { - content.type = INITIAL; - } - else { - content.type = WHOLE_SPELL; - } - fmmVec.push_back(content); - } - } - } - cvset.insert(fmmVec); - } + // 最大匹配进行拼音拆分,对于wuenda这种可以有效支持 + vector fmmVec; + for (i = 0; i < utf8_str.length(); ) { + if (utf8_str[i].size() > 1) { + Content content; + content.type = CHINESE; + content.str = utf8_str[i]; + fmmVec.push_back(content); + i++; + } + else { + isAllChinese = false; + string tmp = utf8_str[i]; + int j = 1; + for (; j < utf8_str.length() - i; j++) { + if (utf8_str[i + j].size() > 1) { + break; + } + tmp += utf8_str[i + j]; + } + i += j; + vector vec; + FMM(tmp, vec); + vector::iterator iter = vec.begin(); + for (; iter != vec.end(); iter++) { + Content content; + content.str = *iter; + if (initial_vec.find(*iter) != initial_vec.end()) { + content.type = INITIAL; + } + else if ((*iter).size() == 1) { + content.type = INITIAL; + } + else { + content.type = WHOLE_SPELL; + } + fmmVec.push_back(content); + } + } + } + cvset.insert(fmmVec); + } - return 0; + return 0; } void ConvertCharIntelligent(const string word, IntelligentInfo &info, int &len) { - int i = 1; - int index = 0; - int length = word.size(); + int i = 1; + int index = 0; + int length = word.size(); - for (; index < length; index++, i++) { - if (index > 15){ - break; - } + for (; index < length; index++, i++) { + if (index > 15){ + break; + } - if (i == 1) { - info.initial_char_01 = word[index]; - } - if (i == 2) { - info.initial_char_02 = word[index]; - } - if (i == 3) { - info.initial_char_03 = word[index]; - } - if (i == 4) { - info.initial_char_04 = word[index]; - } - if (i == 5) { - info.initial_char_05 = word[index]; - } - if (i == 6) { - info.initial_char_06 = word[index]; - } - if (i == 7) { - info.initial_char_07 = word[index]; - } - if (i == 8) { - info.initial_char_08 = word[index]; - } - if (i == 9) { - info.initial_char_09 = word[index]; - } - if (i == 10) { - info.initial_char_10 = word[index]; - } - if (i == 11) { - info.initial_char_11 = word[index]; - } - if (i == 12) { - info.initial_char_12 = word[index]; - } - if (i == 13) { - info.initial_char_13 = word[index]; - } - if (i == 14) { - info.initial_char_14 = word[index]; - } - if (i == 15) { - info.initial_char_15 = word[index]; - } - if (i == 16) { - info.initial_char_16 = word[index]; - } - } - len = index; - return ; + if (i == 1) { + info.initial_char_01 = word[index]; + } + if (i == 2) { + info.initial_char_02 = word[index]; + } + if (i == 3) { + info.initial_char_03 = word[index]; + } + if (i == 4) { + info.initial_char_04 = word[index]; + } + if (i == 5) { + info.initial_char_05 = word[index]; + } + if (i == 6) { + info.initial_char_06 = word[index]; + } + if (i == 7) { + info.initial_char_07 = word[index]; + } + if (i == 8) { + info.initial_char_08 = word[index]; + } + if (i == 9) { + info.initial_char_09 = word[index]; + } + if (i == 10) { + info.initial_char_10 = word[index]; + } + if (i == 11) { + info.initial_char_11 = word[index]; + } + if (i == 12) { + info.initial_char_12 = word[index]; + } + if (i == 13) { + info.initial_char_13 = word[index]; + } + if (i == 14) { + info.initial_char_14 = word[index]; + } + if (i == 15) { + info.initial_char_15 = word[index]; + } + if (i == 16) { + info.initial_char_16 = word[index]; + } + } + len = index; + return ; } void ConvertIntelligent(const vector &result, IntelligentInfo &info, bool &flag) { - int i = 1; - flag = true; - vector::const_iterator content_iter = result.begin(); - for (; content_iter != result.end(); content_iter++, i++) { - if (i > 8) - break; + int i = 1; + flag = true; + vector::const_iterator content_iter = result.begin(); + for (; content_iter != result.end(); content_iter++, i++) { + if (i > 8) + break; - uint32_t charact_id = 0; - uint32_t phonetic_id = 0; + uint32_t charact_id = 0; + uint32_t phonetic_id = 0; - if (i == 1) { - if ((*content_iter).type == CHINESE) { // 查找字id - DataManager::Instance()->GetCharactId((*content_iter).str, charact_id); - info.charact_id_01 = charact_id; - } - else if ((*content_iter).type == WHOLE_SPELL) { // 查找拼音id - DataManager::Instance()->GetPhoneticId((*content_iter).str, phonetic_id); - info.phonetic_id_01 = phonetic_id; - } - else { - info.initial_char_01 = (*content_iter).str; - } - } - if (i == 2) { - if ((*content_iter).type == CHINESE) { // 查找字id - DataManager::Instance()->GetCharactId((*content_iter).str, charact_id); - info.charact_id_02 = charact_id; - } - else if ((*content_iter).type == WHOLE_SPELL) { // 查找拼音id - DataManager::Instance()->GetPhoneticId((*content_iter).str, phonetic_id); - info.phonetic_id_02 = phonetic_id; - } - else { - info.initial_char_02 = (*content_iter).str; - } - } - if (i == 3) { - if ((*content_iter).type == CHINESE) { // 查找字id - DataManager::Instance()->GetCharactId((*content_iter).str, charact_id); - info.charact_id_03 = charact_id; - } - else if ((*content_iter).type == WHOLE_SPELL) { // 查找拼音id - DataManager::Instance()->GetPhoneticId((*content_iter).str, phonetic_id); - info.phonetic_id_03 = phonetic_id; - } - else { - info.initial_char_03 = (*content_iter).str; - } - } - if (i == 4) { - if ((*content_iter).type == CHINESE) { // 查找字id - DataManager::Instance()->GetCharactId((*content_iter).str, charact_id); - info.charact_id_04 = charact_id; - } - else if ((*content_iter).type == WHOLE_SPELL) { // 查找拼音id - DataManager::Instance()->GetPhoneticId((*content_iter).str, phonetic_id); - info.phonetic_id_04 = phonetic_id; - } - else { - info.initial_char_04 = (*content_iter).str; - } - } - if (i == 5) { - if ((*content_iter).type == CHINESE) { // 查找字id - DataManager::Instance()->GetCharactId((*content_iter).str, charact_id); - info.charact_id_05 = charact_id; - } - else if ((*content_iter).type == WHOLE_SPELL) { // 查找拼音id - DataManager::Instance()->GetPhoneticId((*content_iter).str, phonetic_id); - info.phonetic_id_05 = phonetic_id; - } - else { - info.initial_char_05 = (*content_iter).str; - } - } - if (i == 6) { - if ((*content_iter).type == CHINESE) { // 查找字id - DataManager::Instance()->GetCharactId((*content_iter).str, charact_id); - info.charact_id_06 = charact_id; - } - else if ((*content_iter).type == WHOLE_SPELL) { // 查找拼音id - DataManager::Instance()->GetPhoneticId((*content_iter).str, phonetic_id); - info.phonetic_id_06 = phonetic_id; - } - else { - info.initial_char_06 = (*content_iter).str; - } - } - if (i == 7) { - if ((*content_iter).type == CHINESE) { // 查找字id - DataManager::Instance()->GetCharactId((*content_iter).str, charact_id); - info.charact_id_07 = charact_id; - } - else if ((*content_iter).type == WHOLE_SPELL) { // 查找拼音id - DataManager::Instance()->GetPhoneticId((*content_iter).str, phonetic_id); - info.phonetic_id_07 = phonetic_id; - } - else { - info.initial_char_07 = (*content_iter).str; - } - } - if (i == 8) { - if ((*content_iter).type == CHINESE) { // 查找字id - DataManager::Instance()->GetCharactId((*content_iter).str, charact_id); - info.charact_id_08 = charact_id; - } - else if ((*content_iter).type == WHOLE_SPELL) { // 查找拼音id - DataManager::Instance()->GetPhoneticId((*content_iter).str, phonetic_id); - info.phonetic_id_08 = phonetic_id; - } - else { - info.initial_char_08 = (*content_iter).str; - } - } - // 如果查找id为0,则视为无效 - if (charact_id == 0 && phonetic_id == 0 && (*content_iter).type != INITIAL) { - flag = false; - break; - } - } + if (i == 1) { + if ((*content_iter).type == CHINESE) { // 查找字id + DataManager::Instance()->GetCharactId((*content_iter).str, charact_id); + info.charact_id_01 = charact_id; + } + else if ((*content_iter).type == WHOLE_SPELL) { // 查找拼音id + DataManager::Instance()->GetPhoneticId((*content_iter).str, phonetic_id); + info.phonetic_id_01 = phonetic_id; + } + else { + info.initial_char_01 = (*content_iter).str; + } + } + if (i == 2) { + if ((*content_iter).type == CHINESE) { // 查找字id + DataManager::Instance()->GetCharactId((*content_iter).str, charact_id); + info.charact_id_02 = charact_id; + } + else if ((*content_iter).type == WHOLE_SPELL) { // 查找拼音id + DataManager::Instance()->GetPhoneticId((*content_iter).str, phonetic_id); + info.phonetic_id_02 = phonetic_id; + } + else { + info.initial_char_02 = (*content_iter).str; + } + } + if (i == 3) { + if ((*content_iter).type == CHINESE) { // 查找字id + DataManager::Instance()->GetCharactId((*content_iter).str, charact_id); + info.charact_id_03 = charact_id; + } + else if ((*content_iter).type == WHOLE_SPELL) { // 查找拼音id + DataManager::Instance()->GetPhoneticId((*content_iter).str, phonetic_id); + info.phonetic_id_03 = phonetic_id; + } + else { + info.initial_char_03 = (*content_iter).str; + } + } + if (i == 4) { + if ((*content_iter).type == CHINESE) { // 查找字id + DataManager::Instance()->GetCharactId((*content_iter).str, charact_id); + info.charact_id_04 = charact_id; + } + else if ((*content_iter).type == WHOLE_SPELL) { // 查找拼音id + DataManager::Instance()->GetPhoneticId((*content_iter).str, phonetic_id); + info.phonetic_id_04 = phonetic_id; + } + else { + info.initial_char_04 = (*content_iter).str; + } + } + if (i == 5) { + if ((*content_iter).type == CHINESE) { // 查找字id + DataManager::Instance()->GetCharactId((*content_iter).str, charact_id); + info.charact_id_05 = charact_id; + } + else if ((*content_iter).type == WHOLE_SPELL) { // 查找拼音id + DataManager::Instance()->GetPhoneticId((*content_iter).str, phonetic_id); + info.phonetic_id_05 = phonetic_id; + } + else { + info.initial_char_05 = (*content_iter).str; + } + } + if (i == 6) { + if ((*content_iter).type == CHINESE) { // 查找字id + DataManager::Instance()->GetCharactId((*content_iter).str, charact_id); + info.charact_id_06 = charact_id; + } + else if ((*content_iter).type == WHOLE_SPELL) { // 查找拼音id + DataManager::Instance()->GetPhoneticId((*content_iter).str, phonetic_id); + info.phonetic_id_06 = phonetic_id; + } + else { + info.initial_char_06 = (*content_iter).str; + } + } + if (i == 7) { + if ((*content_iter).type == CHINESE) { // 查找字id + DataManager::Instance()->GetCharactId((*content_iter).str, charact_id); + info.charact_id_07 = charact_id; + } + else if ((*content_iter).type == WHOLE_SPELL) { // 查找拼音id + DataManager::Instance()->GetPhoneticId((*content_iter).str, phonetic_id); + info.phonetic_id_07 = phonetic_id; + } + else { + info.initial_char_07 = (*content_iter).str; + } + } + if (i == 8) { + if ((*content_iter).type == CHINESE) { // 查找字id + DataManager::Instance()->GetCharactId((*content_iter).str, charact_id); + info.charact_id_08 = charact_id; + } + else if ((*content_iter).type == WHOLE_SPELL) { // 查找拼音id + DataManager::Instance()->GetPhoneticId((*content_iter).str, phonetic_id); + info.phonetic_id_08 = phonetic_id; + } + else { + info.initial_char_08 = (*content_iter).str; + } + } + // 如果查找id为0,则视为无效 + if (charact_id == 0 && phonetic_id == 0 && (*content_iter).type != INITIAL) { + flag = false; + break; + } + } } vector GetSingleWord(string m_Data) { - vector result; - set initial_vec(initial_table, initial_table + 23); - iutf8string utf8_str(m_Data); - int i = 0; - for (; i < utf8_str.length(); ) { - if (utf8_str[i].size() > 1) { - Content content; - content.type = CHINESE; - content.str = utf8_str[i]; - result.push_back(content); - i++; - } - else { - string tmp = utf8_str[i]; - int j = 1; - for (; j < utf8_str.length() - i; j++) { - if (utf8_str[i + j].size() > 1) { - break; - } - tmp += utf8_str[i + j]; - } - i += j; - string split_str; - split_func(tmp, split_str); - vector vec = splitEx(split_str, " "); - vector::iterator iter = vec.begin(); - for (; iter != vec.end(); iter++) { - Content content; - content.str = *iter; - if (initial_vec.find(*iter) != initial_vec.end()) { - content.type = INITIAL; - } else if ((*iter).size() == 1) { - content.type = INITIAL; - } - else { - content.type = WHOLE_SPELL; - } - result.push_back(content); - } - } - } + vector result; + set initial_vec(initial_table, initial_table + 23); + iutf8string utf8_str(m_Data); + int i = 0; + for (; i < utf8_str.length(); ) { + if (utf8_str[i].size() > 1) { + Content content; + content.type = CHINESE; + content.str = utf8_str[i]; + result.push_back(content); + i++; + } + else { + string tmp = utf8_str[i]; + int j = 1; + for (; j < utf8_str.length() - i; j++) { + if (utf8_str[i + j].size() > 1) { + break; + } + tmp += utf8_str[i + j]; + } + i += j; + string split_str; + split_func(tmp, split_str); + vector vec = splitEx(split_str, " "); + vector::iterator iter = vec.begin(); + for (; iter != vec.end(); iter++) { + Content content; + content.str = *iter; + if (initial_vec.find(*iter) != initial_vec.end()) { + content.type = INITIAL; + } else if ((*iter).size() == 1) { + content.type = INITIAL; + } + else { + content.type = WHOLE_SPELL; + } + result.push_back(content); + } + } + } - return result; + return result; } int GetSuggestWord(string m_Data, vector &word_vec, uint32_t suggest_cnt) { - vector result = GetSingleWord(m_Data); - IntelligentInfo info; - bool flag = true; - ConvertIntelligent(result, info, flag); - if (flag == false) { - log_debug("ConvertIntelligent invalid."); - return 0; - } - if (info.initial_char_01 == "" && info.charact_id_01 == 0) { - if (m_Data.length() != 1) { - uint32_t charact_id = 0; - DataManager::Instance()->GetCharactId(m_Data, charact_id); - info.charact_id_01 = charact_id; - } - else { - info.initial_char_01 = m_Data; - } - } - log_debug("charact_id_01: %d, charact_id_02: %d,charact_id_03: %d, charact_id_04: %d, phonetic_id_01: %d, phonetic_id_02: %d, \ - phonetic_id_03: %d, phonetic_id_04: %d, initial_char_01: %s, initial_char_02: %s, initial_char_03: %s, initial_char_04: %s", - info.charact_id_01, info.charact_id_02, info.charact_id_03, info.charact_id_04, info.phonetic_id_01, info.phonetic_id_02, - info.phonetic_id_03, info.phonetic_id_04, info.initial_char_01.c_str(), info.initial_char_02.c_str(), info.initial_char_03.c_str(), - info.initial_char_04.c_str()); + vector result = GetSingleWord(m_Data); + IntelligentInfo info; + bool flag = true; + ConvertIntelligent(result, info, flag); + if (flag == false) { + log_debug("ConvertIntelligent invalid."); + return 0; + } + if (info.initial_char_01 == "" && info.charact_id_01 == 0) { + if (m_Data.length() != 1) { + uint32_t charact_id = 0; + DataManager::Instance()->GetCharactId(m_Data, charact_id); + info.charact_id_01 = charact_id; + } + else { + info.initial_char_01 = m_Data; + } + } + log_debug("charact_id_01: %d, charact_id_02: %d,charact_id_03: %d, charact_id_04: %d, phonetic_id_01: %d, phonetic_id_02: %d, \ + phonetic_id_03: %d, phonetic_id_04: %d, initial_char_01: %s, initial_char_02: %s, initial_char_03: %s, initial_char_04: %s", + info.charact_id_01, info.charact_id_02, info.charact_id_03, info.charact_id_04, info.phonetic_id_01, info.phonetic_id_02, + info.phonetic_id_03, info.phonetic_id_04, info.initial_char_01.c_str(), info.initial_char_02.c_str(), info.initial_char_03.c_str(), + info.initial_char_04.c_str()); - if (DataManager::Instance()->GetSuggestWord(info, word_vec, suggest_cnt) == false) { - return -RT_GET_SUGGEST_ERR; - } + if (DataManager::Instance()->GetSuggestWord(info, word_vec, suggest_cnt) == false) { + return -RT_GET_SUGGEST_ERR; + } - return 0; + return 0; } int GetEnSuggestWord(string m_Data, vector &word_vec, uint32_t suggest_cnt) { - vector result; - size_t i = 0; - for (; i < m_Data.length(); i++) { - Content content; - content.str = m_Data[i]; - content.type = INITIAL; - result.push_back(content); - } + vector result; + size_t i = 0; + for (; i < m_Data.length(); i++) { + Content content; + content.str = m_Data[i]; + content.type = INITIAL; + result.push_back(content); + } - IntelligentInfo info; - bool flag = true; - ConvertIntelligent(result, info, flag); - if (flag == false) { - log_debug("ConvertIntelligent invalid."); - return 0; - } - log_debug("initial_char_01: %s, initial_char_02: %s, initial_char_03: %s, initial_char_04: %s", - info.initial_char_01.c_str(), info.initial_char_02.c_str(), info.initial_char_03.c_str(), info.initial_char_04.c_str()); + IntelligentInfo info; + bool flag = true; + ConvertIntelligent(result, info, flag); + if (flag == false) { + log_debug("ConvertIntelligent invalid."); + return 0; + } + log_debug("initial_char_01: %s, initial_char_02: %s, initial_char_03: %s, initial_char_04: %s", + info.initial_char_01.c_str(), info.initial_char_02.c_str(), info.initial_char_03.c_str(), info.initial_char_04.c_str()); - if (DataManager::Instance()->GetEnSuggestWord(info, word_vec, suggest_cnt) == false) { - return -RT_GET_SUGGEST_ERR; - } + if (DataManager::Instance()->GetEnSuggestWord(info, word_vec, suggest_cnt) == false) { + return -RT_GET_SUGGEST_ERR; + } - return 0; + return 0; } bool isAllNumber(string str) { - bool flag = true; - size_t i = 0; - for (; i < str.size(); i++) { - if ((str[i] < '0') || (str[i] > '9')) { - flag = false; - break; - } - } - return flag; + bool flag = true; + size_t i = 0; + for (; i < str.size(); i++) { + if ((str[i] < '0') || (str[i] > '9')) { + flag = false; + break; + } + } + return flag; } bool isAllAlpha(string str) { - bool flag = true; - size_t i = 0; - for (; i < str.size(); i++) { - if (!isupper(str[i]) && !islower(str[i])) { - flag = false; - break; - } - } - return flag; + bool flag = true; + size_t i = 0; + for (; i < str.size(); i++) { + if (!isupper(str[i]) && !islower(str[i])) { + flag = false; + break; + } + } + return flag; } bool isContainCharacter(string str) { - bool flag = false; - size_t i = 0; - for (; i < str.size(); i++) { - if (str[i] < 0) { - flag = true; - break; - } - } - return flag; + bool flag = false; + size_t i = 0; + for (; i < str.size(); i++) { + if (str[i] < 0) { + flag = true; + break; + } + } + return flag; } bool isAllChinese(string str) { - bool flag = true; - iutf8string utf8_str(str); - int i = 0; - for (; i < utf8_str.length(); i++) { - if (utf8_str[i].size() == 1) { // 如果有长度为1的情况,则不是全汉字 - flag = false; - break; - } - } + bool flag = true; + iutf8string utf8_str(str); + int i = 0; + for (; i < utf8_str.length(); i++) { + if (utf8_str[i].size() == 1) { // 如果有长度为1的情况,则不是全汉字 + flag = false; + break; + } + } - return flag; + return flag; } int judgeDataType(string str) { - int data_type = 0; - if (isAllAlpha(str)) { - if (DataManager::Instance()->IsEnWord(str)) { - data_type = DATA_ENGLISH; - } - else { - data_type = DATA_PHONETIC; - } - } - else { - if (isAllChinese(str)) { - data_type = DATA_CHINESE; - } - else { - data_type = DATA_HYBRID; - } - } - return data_type; + int data_type = 0; + if (isAllAlpha(str)) { + if (DataManager::Instance()->IsEnWord(str)) { + data_type = DATA_ENGLISH; + } + else { + data_type = DATA_PHONETIC; + } + } + else { + if (isAllChinese(str)) { + data_type = DATA_CHINESE; + } + else { + data_type = DATA_HYBRID; + } + } + return data_type; } int Convert2Phonetic(string str, string &phonetic) { - phonetic = ""; - iutf8string utf8_str(str); - int i = 0; - for (; i < utf8_str.length(); i++) { - phonetic.append(DataManager::Instance()->GetPhonetic(utf8_str[i])); - } - return 0; + phonetic = ""; + iutf8string utf8_str(str); + int i = 0; + for (; i < utf8_str.length(); i++) { + phonetic.append(DataManager::Instance()->GetPhonetic(utf8_str[i])); + } + return 0; } int JudgeWord(uint32_t appid, string str, bool &is_correct, string &probably_key) { - is_correct = true; - probably_key = ""; - if (DataManager::Instance()->IsChineseWord(appid, str)) { - log_debug("correct Chinese word"); - return 0; - } - if (DataManager::Instance()->IsEnWord(str)) { - log_debug("correct English word"); - return 0; - } - int data_type = 0; - data_type = judgeDataType(str); - log_debug("data_type: %d.", data_type); - is_correct = false; - vector word_str_vec; - int ret = GetSuggestWord(str, word_str_vec, 1); - if (ret != 0) { - log_error("GetSuggestWord error."); - return -RT_DB_ERR; - } + is_correct = true; + probably_key = ""; + if (DataManager::Instance()->IsChineseWord(appid, str)) { + log_debug("correct Chinese word"); + return 0; + } + if (DataManager::Instance()->IsEnWord(str)) { + log_debug("correct English word"); + return 0; + } + int data_type = 0; + data_type = judgeDataType(str); + log_debug("data_type: %d.", data_type); + is_correct = false; + vector word_str_vec; + int ret = GetSuggestWord(str, word_str_vec, 1); + if (ret != 0) { + log_error("GetSuggestWord error."); + return -RT_DB_ERR; + } - if (word_str_vec.size() > 0) { - probably_key = word_str_vec[0]; - } - else { - if (data_type == DATA_PHONETIC) { - ret = GetEnSuggestWord(str, word_str_vec, 1); - if (ret != 0) { - log_error("GetEnSuggestWord error."); - return -RT_DB_ERR; - } - if (word_str_vec.size() > 0) - probably_key = word_str_vec[0]; - else { - probably_key = SplitManager::Instance()->correction(str); - } - } - else if (data_type == DATA_CHINESE) { - vector keys; - string split_data = SplitManager::Instance()->split(str, 0); - log_debug("split_data: %s", split_data.c_str()); - keys = splitEx(split_data, "|"); - size_t single = 0; - vector::const_iterator iter = keys.begin(); - for (; iter != keys.end(); iter++) { - if ((*iter).length() == 3) { - single++; - } - } - if (single > keys.size() - 1) { - string data_new = ""; - Convert2Phonetic(str, data_new); - word_str_vec.clear(); - ret = GetSuggestWord(data_new, word_str_vec, 1); - if (ret == 0) { - if (word_str_vec.size() > 0) { - probably_key = word_str_vec[0]; - } - else { - probably_key = SplitManager::Instance()->ch_correction(str); - } - } - else { - log_error("GetSuggestWord error."); - return -RT_DB_ERR; - } - } - else { - is_correct = true; - } - } - else { - is_correct = true; - } - } + if (word_str_vec.size() > 0) { + probably_key = word_str_vec[0]; + } + else { + if (data_type == DATA_PHONETIC) { + ret = GetEnSuggestWord(str, word_str_vec, 1); + if (ret != 0) { + log_error("GetEnSuggestWord error."); + return -RT_DB_ERR; + } + if (word_str_vec.size() > 0) + probably_key = word_str_vec[0]; + else { + probably_key = SplitManager::Instance()->correction(str); + } + } + else if (data_type == DATA_CHINESE) { + vector keys; + string split_data = SplitManager::Instance()->split(str, 0); + log_debug("split_data: %s", split_data.c_str()); + keys = splitEx(split_data, "|"); + size_t single = 0; + vector::const_iterator iter = keys.begin(); + for (; iter != keys.end(); iter++) { + if ((*iter).length() == 3) { + single++; + } + } + if (single > keys.size() - 1) { + string data_new = ""; + Convert2Phonetic(str, data_new); + word_str_vec.clear(); + ret = GetSuggestWord(data_new, word_str_vec, 1); + if (ret == 0) { + if (word_str_vec.size() > 0) { + probably_key = word_str_vec[0]; + } + else { + probably_key = SplitManager::Instance()->ch_correction(str); + } + } + else { + log_error("GetSuggestWord error."); + return -RT_DB_ERR; + } + } + else { + is_correct = true; + } + } + else { + is_correct = true; + } + } - if (is_correct == true) { // not sure - log_debug("not sure, maybe multi words."); - } - else { - if (probably_key != str) { - log_debug("not correct, probably word is: %s", probably_key.c_str()); - } - else { - log_debug("not correct, and seems no similar word."); - } - } - is_correct = false; - return 0; + if (is_correct == true) { // not sure + log_debug("not sure, maybe multi words."); + } + else { + if (probably_key != str) { + log_debug("not correct, probably word is: %s", probably_key.c_str()); + } + else { + log_debug("not correct, and seems no similar word."); + } + } + is_correct = false; + return 0; } int GetInitialVec(IntelligentInfo &info, int len) { - int count = 0; - vector initials(8, ""); + int count = 0; + vector initials(8, ""); - if (info.initial_char_01 != "") - initials[count++] = info.initial_char_01; - if (info.initial_char_02 != "") - initials[count++] = info.initial_char_02; - if (info.initial_char_03 != "") - initials[count++] = info.initial_char_03; - if (info.initial_char_04 != "") - initials[count++] = info.initial_char_04; - if (info.initial_char_05 != "") - initials[count++] = info.initial_char_05; - if (info.initial_char_06 != "") - initials[count++] = info.initial_char_06; - if (info.initial_char_07 != "") - initials[count++] = info.initial_char_07; - if (info.initial_char_08 != "") - initials[count++] = info.initial_char_08; + if (info.initial_char_01 != "") + initials[count++] = info.initial_char_01; + if (info.initial_char_02 != "") + initials[count++] = info.initial_char_02; + if (info.initial_char_03 != "") + initials[count++] = info.initial_char_03; + if (info.initial_char_04 != "") + initials[count++] = info.initial_char_04; + if (info.initial_char_05 != "") + initials[count++] = info.initial_char_05; + if (info.initial_char_06 != "") + initials[count++] = info.initial_char_06; + if (info.initial_char_07 != "") + initials[count++] = info.initial_char_07; + if (info.initial_char_08 != "") + initials[count++] = info.initial_char_08; - if (count == 0 || count == 8) - return -1; + if (count == 0 || count == 8) + return -1; - for (int i = count - 1; i >= 0; i--) - { - if (i + len >= 8) - return -1; - initials[i + len] = initials[i]; - } + for (int i = count - 1; i >= 0; i--) + { + if (i + len >= 8) + return -1; + initials[i + len] = initials[i]; + } - for (int i = 0; i < len; i++) - { - initials[i] = ""; - } + for (int i = 0; i < len; i++) + { + initials[i] = ""; + } - info.initial_char_01 = initials[0]; - info.initial_char_02 = initials[1]; - info.initial_char_03 = initials[2]; - info.initial_char_04 = initials[3]; - info.initial_char_05 = initials[4]; - info.initial_char_06 = initials[5]; - info.initial_char_07 = initials[6]; - info.initial_char_08 = initials[7]; + info.initial_char_01 = initials[0]; + info.initial_char_02 = initials[1]; + info.initial_char_03 = initials[2]; + info.initial_char_04 = initials[3]; + info.initial_char_05 = initials[4]; + info.initial_char_06 = initials[5]; + info.initial_char_07 = initials[6]; + info.initial_char_08 = initials[7]; - return 0; + return 0; } int ShiftIntelligentInfoWithoutCharacter(IntelligentInfo &info, int len) { - int count = 0; - vector initials(16, ""); + int count = 0; + vector initials(16, ""); - if (info.initial_char_01 != "") - initials[count++] = info.initial_char_01; - if (info.initial_char_02 != "") - initials[count++] = info.initial_char_02; - if (info.initial_char_03 != "") - initials[count++] = info.initial_char_03; - if (info.initial_char_04 != "") - initials[count++] = info.initial_char_04; - if (info.initial_char_05 != "") - initials[count++] = info.initial_char_05; - if (info.initial_char_06 != "") - initials[count++] = info.initial_char_06; - if (info.initial_char_07 != "") - initials[count++] = info.initial_char_07; - if (info.initial_char_08 != "") - initials[count++] = info.initial_char_08; - if (info.initial_char_09 != "") - initials[count++] = info.initial_char_09; - if (info.initial_char_10 != "") - initials[count++] = info.initial_char_10; - if (info.initial_char_11 != "") - initials[count++] = info.initial_char_11; - if (info.initial_char_12 != "") - initials[count++] = info.initial_char_12; - if (info.initial_char_13 != "") - initials[count++] = info.initial_char_13; - if (info.initial_char_14 != "") - initials[count++] = info.initial_char_14; - if (info.initial_char_15 != "") - initials[count++] = info.initial_char_15; - if (info.initial_char_16 != "") - initials[count++] = info.initial_char_16; + if (info.initial_char_01 != "") + initials[count++] = info.initial_char_01; + if (info.initial_char_02 != "") + initials[count++] = info.initial_char_02; + if (info.initial_char_03 != "") + initials[count++] = info.initial_char_03; + if (info.initial_char_04 != "") + initials[count++] = info.initial_char_04; + if (info.initial_char_05 != "") + initials[count++] = info.initial_char_05; + if (info.initial_char_06 != "") + initials[count++] = info.initial_char_06; + if (info.initial_char_07 != "") + initials[count++] = info.initial_char_07; + if (info.initial_char_08 != "") + initials[count++] = info.initial_char_08; + if (info.initial_char_09 != "") + initials[count++] = info.initial_char_09; + if (info.initial_char_10 != "") + initials[count++] = info.initial_char_10; + if (info.initial_char_11 != "") + initials[count++] = info.initial_char_11; + if (info.initial_char_12 != "") + initials[count++] = info.initial_char_12; + if (info.initial_char_13 != "") + initials[count++] = info.initial_char_13; + if (info.initial_char_14 != "") + initials[count++] = info.initial_char_14; + if (info.initial_char_15 != "") + initials[count++] = info.initial_char_15; + if (info.initial_char_16 != "") + initials[count++] = info.initial_char_16; - if (count <= 0 || count >= 16) - return -1; + if (count <= 0 || count >= 16) + return -1; - for (int i = count - 1; i >= 0; i--) - { - if (i + len >= 16) - return -1; - initials[i + len] = initials[i]; - } + for (int i = count - 1; i >= 0; i--) + { + if (i + len >= 16) + return -1; + initials[i + len] = initials[i]; + } - for (int i = 0; i < len; i++) - { - initials[i] = ""; - } + for (int i = 0; i < len; i++) + { + initials[i] = ""; + } - info.initial_char_01 = initials[0]; - info.initial_char_02 = initials[1]; - info.initial_char_03 = initials[2]; - info.initial_char_04 = initials[3]; - info.initial_char_05 = initials[4]; - info.initial_char_06 = initials[5]; - info.initial_char_07 = initials[6]; - info.initial_char_08 = initials[7]; - info.initial_char_09 = initials[8]; - info.initial_char_10 = initials[9]; - info.initial_char_11 = initials[10]; - info.initial_char_12 = initials[11]; - info.initial_char_13 = initials[12]; - info.initial_char_14 = initials[13]; - info.initial_char_15 = initials[14]; - info.initial_char_16 = initials[15]; + info.initial_char_01 = initials[0]; + info.initial_char_02 = initials[1]; + info.initial_char_03 = initials[2]; + info.initial_char_04 = initials[3]; + info.initial_char_05 = initials[4]; + info.initial_char_06 = initials[5]; + info.initial_char_07 = initials[6]; + info.initial_char_08 = initials[7]; + info.initial_char_09 = initials[8]; + info.initial_char_10 = initials[9]; + info.initial_char_11 = initials[10]; + info.initial_char_12 = initials[11]; + info.initial_char_13 = initials[12]; + info.initial_char_14 = initials[13]; + info.initial_char_15 = initials[14]; + info.initial_char_16 = initials[15]; - return 0; + return 0; } const char *GetFormatTimeStr(uint32_t ulTime) { - static char acTimeStr[64]; - struct tm stTm; + static char acTimeStr[64]; + struct tm stTm; - time_t myTime = ulTime;//time_t 在64位是8字节 + time_t myTime = ulTime;//time_t 在64位是8字节 - memset(acTimeStr, 0, sizeof(acTimeStr)); + memset(acTimeStr, 0, sizeof(acTimeStr)); - localtime_r(&myTime, &stTm); + localtime_r(&myTime, &stTm); - snprintf(acTimeStr, sizeof(acTimeStr) - 1, "%04d-%02d-%02d %02d:%02d:%02d", - stTm.tm_year + 1900, stTm.tm_mon + 1, stTm.tm_mday, stTm.tm_hour, stTm.tm_min, stTm.tm_sec); + snprintf(acTimeStr, sizeof(acTimeStr) - 1, "%04d-%02d-%02d %02d:%02d:%02d", + stTm.tm_year + 1900, stTm.tm_mon + 1, stTm.tm_mday, stTm.tm_hour, stTm.tm_min, stTm.tm_sec); - return acTimeStr; + return acTimeStr; } void ToLower(string &str) { - size_t i = 0; - for (; i < str.size(); i++) { - if (str[i]>='A' && str[i]<='Z') { - str[i] = str[i]+32; - } - } + size_t i = 0; + for (; i < str.size(); i++) { + if (str[i]>='A' && str[i]<='Z') { + str[i] = str[i]+32; + } + } } string CharToString(char c) { - stringstream stream; - stream << c; - return stream.str(); + stringstream stream; + stream << c; + return stream.str(); } string ToString(uint32_t appid) { - ostringstream oss; - oss<& gisCode) { - if ((lng != "") && (lat != "")) { - GeoPoint geo1; - geo1.lon = atof(lng.c_str()); - geo1.lat = atof(lat.c_str()); - gisCode = GetArroundGeoHash(geo1, distance, 6); - } else if(ip != ""){ - return true; - } else { - return false; - } - return true; + if ((lng != "") && (lat != "")) { + GeoPoint geo1; + geo1.lon = atof(lng.c_str()); + geo1.lat = atof(lat.c_str()); + gisCode = GetArroundGeoHash(geo1, distance, 6); + } else if(ip != ""){ + return true; + } else { + return false; + } + return true; } bool GetGisCode(const vector& lng_arr, const vector& lat_arr, vector& gisCode){ - double lng_max = atof(lng_arr[0].c_str()); - double lng_min = atof(lng_arr[0].c_str()); - double lat_max = atof(lat_arr[0].c_str()); - double lat_min = atof(lat_arr[0].c_str()); - for(size_t i = 1; i < lng_arr.size(); i++){ - if(lng_max < atof(lng_arr[i].c_str())){ - lng_max = atof(lng_arr[i].c_str()); - } - if(lng_min > atof(lng_arr[i].c_str())){ - lng_min = atof(lng_arr[i].c_str()); - } - } - for(size_t i = 1; i < lat_arr.size(); i++){ - if(lat_max < atof(lat_arr[i].c_str())){ - lat_max = atof(lat_arr[i].c_str()); - } - if(lat_min > atof(lat_arr[i].c_str())){ - lat_min = atof(lat_arr[i].c_str()); - } - } - EnclosingRectangle enclose_rectangle(lng_max , lng_min , lat_max , lat_min); + double lng_max = atof(lng_arr[0].c_str()); + double lng_min = atof(lng_arr[0].c_str()); + double lat_max = atof(lat_arr[0].c_str()); + double lat_min = atof(lat_arr[0].c_str()); + for(size_t i = 1; i < lng_arr.size(); i++){ + if(lng_max < atof(lng_arr[i].c_str())){ + lng_max = atof(lng_arr[i].c_str()); + } + if(lng_min > atof(lng_arr[i].c_str())){ + lng_min = atof(lng_arr[i].c_str()); + } + } + for(size_t i = 1; i < lat_arr.size(); i++){ + if(lat_max < atof(lat_arr[i].c_str())){ + lat_max = atof(lat_arr[i].c_str()); + } + if(lat_min > atof(lat_arr[i].c_str())){ + lat_min = atof(lat_arr[i].c_str()); + } + } + EnclosingRectangle enclose_rectangle(lng_max , lng_min , lat_max , lat_min); - gisCode = GetArroundGeoHash(enclose_rectangle, 6); - return true; + gisCode = GetArroundGeoHash(enclose_rectangle, 6); + return true; } double strToDouble(const string& str) { - double num; - istringstream iss(str); - iss >> num; - return num; + double num; + istringstream iss(str); + iss >> num; + return num; } double toRadians(double d){ - return d * 3.1415 / 180.0; + return d * 3.1415 / 180.0; } double distanceSimplify(double lat1, double lng1, double lat2, double lng2) { - double dx = lng1 - lng2; // 经度差值 - double dy = lat1 - lat2; // 纬度差值 - double b = (lat1 + lat2) / 2.0; // 平均纬度 - double Lx = toRadians(dx) * 6378.137 * cos(toRadians(b)); // 东西距离 - double Ly = 6378.137 * toRadians(dy); // 南北距离 - return sqrt(Lx * Lx + Ly * Ly); // 用平面的矩形对角距离公式计算总距离 + double dx = lng1 - lng2; // 经度差值 + double dy = lat1 - lat2; // 纬度差值 + double b = (lat1 + lat2) / 2.0; // 平均纬度 + double Lx = toRadians(dx) * 6378.137 * cos(toRadians(b)); // 东西距离 + double Ly = 6378.137 * toRadians(dy); // 南北距离 + return sqrt(Lx * Lx + Ly * Ly); // 用平面的矩形对角距离公式计算总距离 } -bool GetGisDistance(uint32_t appid, const string& latLeft, const string& lngLeft, hash_double_map& distances, hash_string_map& doc_content) +bool GetGisDistance(uint32_t appid, const string& Latitude, const string& Longtitude, hash_double_map& distances, hash_string_map& doc_content) { - double lat1 = strToDouble(latLeft); - double lng1 = strToDouble(lngLeft); - int lon_len = strlen("longitude\":\""); - int lat_len = strlen("latitude\":\""); + double d_query_lat = strToDouble(Latitude); + double d_query_lng = strToDouble(Longtitude); - hash_string_map::iterator doc_it; - for (doc_it = doc_content.begin(); doc_it != doc_content.end(); doc_it++) { - if (doc_it->second == "") { - log_error("content is invalid, appid:%d, doc_id:%s, content:%s.",appid, (doc_it->first).c_str(), (doc_it->second).c_str()); - continue; - } - // 如果是aoi数据则改为计算查询点到aoi的最短距离 - if(doc_it->second.find("longitude_list") != string::npos){ - Json::Reader reader; - Json::Value data; - bool result = reader.parse(doc_it->second, data); - if (result && data.isMember("latitude_list") && data["latitude_list"].isArray() - && data.isMember("longitude_list") && data["longitude_list"].isArray()) { - if(data["latitude_list"].size() != data["longitude_list"].size()){ - log_error("latitude_list size not equal longitude_list size, appid:%d, doc_id:%s, content:%s",appid, (doc_it->first).c_str(), (doc_it->second).c_str()); - continue; - } - Polygon polygon; - for (uint32_t idx = 0; idx < data["longitude_list"].size(); ++idx) { - vector point; - if(!data["longitude_list"][idx].isString() || !data["latitude_list"][idx].isString()){ - log_error("longitude or latitude is not string!"); - continue; - } - point.push_back(strToDouble(data["longitude_list"][idx].asString())); - point.push_back(strToDouble(data["latitude_list"][idx].asString())); - polygon.push_back(point); - } - distances[doc_it->first] = GetShortestDistance(lng1, lat1, polygon) / 1000; - } - } else { - double lat2; - double lng2; - /* - rapidjson::Document document; - bool result = document.Parse(doc_it->second.data()).HasParseError(); - if(!result && document.HasMember("latitude") && document["latitude"].IsString() && document.HasMember("longitude") && document["longitude"].IsString()){ - lat2 = strToDouble(document["latitude"].GetString()); - lng2 = strToDouble(document["longitude"].GetString()); - } else { - log_error("content hasn't gis info, appid:%d, doc_id:%s, content:%s.",appid, (doc_it->first).c_str(), (doc_it->second).c_str()); - continue; - }*/ + hash_string_map::iterator doc_it = doc_content.begin(); + for ( ; doc_it != doc_content.end(); doc_it++) { + if (doc_it->second == "") { + log_error("content is invalid, appid:%d, doc_id:%s, content:%s.",appid, (doc_it->first).c_str(), (doc_it->second).c_str()); + continue; + } - size_t pos1 = doc_it->second.find("longitude"); - size_t pos2 = doc_it->second.find_first_of(",", pos1); - if(pos1 != string::npos && pos2 != string::npos){ - string longitude = doc_it->second.substr(pos1+lon_len, pos2-pos1-lon_len-1); - lng2 = strToDouble(longitude); - } else { - log_debug("content has no longitude info, appid:%d, doc_id:%s, content:%s.",appid, (doc_it->first).c_str(), (doc_it->second).c_str()); - distances[doc_it->first] = 1; - continue; - } - size_t pos3 = doc_it->second.find("latitude"); - size_t pos4 = doc_it->second.find_first_of(",", pos3); - if(pos3 != string::npos && pos4 != string::npos){ - string latitude = doc_it->second.substr(pos3+lat_len, pos4-pos3-lat_len-1); - lat2 = strToDouble(latitude); - } else { - log_debug("content has no latitude info, appid:%d, doc_id:%s, content:%s.",appid, (doc_it->first).c_str(), (doc_it->second).c_str()); - continue; - } - - double dis = distanceSimplify(lat1, lng1, lat2, lng2); - distances[doc_it->first] = round(dis * 1000)/1000; - } - } - return true; + Json::Reader read(Json::Features::strictMode()); + Json::Value snap_json; + int ret = read.parse(doc_it->second , snap_json); + if (0 == ret){ + log_error("parse json error [%s], errmsg : %s", doc_it->second.c_str(), read.getFormattedErrorMessages().c_str()); + return false; + } + + Json::Value::Members member = snap_json.getMemberNames(); + Json::Value::Members::iterator iter = member.begin(); + for (; iter != member.end(); ++iter){ + uint32_t segment_tag = 0; + FieldInfo field_info; + uint32_t uiret = DBManager::Instance()->GetWordField(segment_tag, appid, *iter, field_info); + if (FIELD_GEO_POINT == field_info.field_type){ + GeoPointContext geo_point(snap_json[*iter]); + double d_target_lat = strToDouble(geo_point.sLatitude); + double d_target_lng = strToDouble(geo_point.sLongtitude); + double dis = distanceSimplify(d_query_lat, d_query_lng, d_target_lat, d_target_lng); + distances[doc_it->first] = round(dis * 1000)/1000; + }else if (FIELD_GEO_SHAPE == field_info.field_type){ + // temp no handle ,latter add + distances[doc_it->first] = 1; + } + } + } + return true; } uint32_t GetIpNum(string ip) { - uint32_t s; - if (inet_pton(AF_INET, ip.c_str(), (void *)&s) == 0){ - log_error("resolve ip error. ip : %s.", ip.c_str()); - return 0; - } - return ntohl(s); + uint32_t s; + if (inet_pton(AF_INET, ip.c_str(), (void *)&s) == 0){ + log_error("resolve ip error. ip : %s.", ip.c_str()); + return 0; + } + return ntohl(s); } int ShiftIntelligentInfo(IntelligentInfo &info, int len) { - uint16_t *p = &info.charact_id_01; + uint16_t *p = &info.charact_id_01; - if (len >= 8) { - return -1; - } + if (len >= 8) { + return -1; + } - if (len < 1) { - return -2; - } + if (len < 1) { + return -2; + } - memmove(&p[len], &p[0], sizeof(uint16_t) * (8 - len)); + memmove(&p[len], &p[0], sizeof(uint16_t) * (8 - len)); - for (int i = 0; i < len; i++) { - p[i] = 0; - } + for (int i = 0; i < len; i++) { + p[i] = 0; + } - p = &info.phonetic_id_01; + p = &info.phonetic_id_01; - memmove(&p[len], &p[0], sizeof(uint16_t) * (8 - len)); + memmove(&p[len], &p[0], sizeof(uint16_t) * (8 - len)); - for (int i = 0; i < len; i++) { - p[i] = 0; - } + for (int i = 0; i < len; i++) { + p[i] = 0; + } - GetInitialVec(info, len); + GetInitialVec(info, len); - return 0; + return 0; } bool GetSuggestDoc(FieldInfo& fieldInfo, uint32_t len, const IntelligentInfo &info, vector &doc_id_set, uint32_t appid, set& highlightWord) { - bool bRet; - int index = 0; - uint32_t field = fieldInfo.field; - uint32_t segment_feature = fieldInfo.segment_feature; - bRet = g_hanpinIndexInstance.GetSuggestDoc(appid, index, len, field, info, doc_id_set, highlightWord); - if (bRet == false) - goto resError; + bool bRet; + int index = 0; + uint32_t field = fieldInfo.field; + uint32_t segment_feature = fieldInfo.segment_feature; + bRet = g_hanpinIndexInstance.GetSuggestDoc(appid, index, len, field, info, doc_id_set, highlightWord); + if (bRet == false) + goto resError; - if (segment_feature == 0) { - return true; - } + if (segment_feature == 0) { + return true; + } - for (uint32_t i = 1; i <= 8 - len; i++) { - index = index + 1; - IntelligentInfo t2 = info; - if (ShiftIntelligentInfo(t2, i) < 0) { - continue; - } - bRet = g_hanpinIndexInstance.GetSuggestDoc(appid, index, len, field, t2, doc_id_set, highlightWord); - if (bRet == false) - goto resError; - } - return true; + for (uint32_t i = 1; i <= 8 - len; i++) { + index = index + 1; + IntelligentInfo t2 = info; + if (ShiftIntelligentInfo(t2, i) < 0) { + continue; + } + bRet = g_hanpinIndexInstance.GetSuggestDoc(appid, index, len, field, t2, doc_id_set, highlightWord); + if (bRet == false) + goto resError; + } + return true; resError: - log_error("GetSuggestDoc invalid."); - return false; + log_error("GetSuggestDoc invalid."); + return false; } bool GetSuggestDocWithoutCharacter(FieldInfo& fieldInfo, uint32_t len, const IntelligentInfo &info, vector &doc_id_set, uint32_t appid, set& highlightWord) { - bool bRet; - int index = 0; - uint32_t field = fieldInfo.field; - uint32_t segment_feature = fieldInfo.segment_feature; - bRet = g_hanpinIndexInstance.GetSuggestDocWithoutCharacter(appid, index, len, field, info, doc_id_set, highlightWord); - if (bRet == false) - goto resError; + bool bRet; + int index = 0; + uint32_t field = fieldInfo.field; + uint32_t segment_feature = fieldInfo.segment_feature; + bRet = g_hanpinIndexInstance.GetSuggestDocWithoutCharacter(appid, index, len, field, info, doc_id_set, highlightWord); + if (bRet == false) + goto resError; - if (segment_feature == 0) { - return true; - } + if (segment_feature == 0) { + return true; + } - for (uint32_t i = 1; i <= 16 - len; i++) { - index = index + 1; - IntelligentInfo t2 = info; - if (ShiftIntelligentInfoWithoutCharacter(t2, i) < 0) { - continue; - } - bRet = g_hanpinIndexInstance.GetSuggestDocWithoutCharacter(appid, index, len, field, t2, doc_id_set, highlightWord); - if (bRet == false) - goto resError; - } - return true; + for (uint32_t i = 1; i <= 16 - len; i++) { + index = index + 1; + IntelligentInfo t2 = info; + if (ShiftIntelligentInfoWithoutCharacter(t2, i) < 0) { + continue; + } + bRet = g_hanpinIndexInstance.GetSuggestDocWithoutCharacter(appid, index, len, field, t2, doc_id_set, highlightWord); + if (bRet == false) + goto resError; + } + return true; resError: - log_error("GetSuggestDoc invalid."); - return false; + log_error("GetSuggestDoc invalid."); + return false; } int GetDocByShiftWord(FieldInfo fieldInfo, vector &doc_id_set, uint32_t appid, set& highlightWord) { - log_debug("GetDocByShiftWord start"); - bool bRet = true; - set > result; - GetMultipleWords(fieldInfo.word, result); + log_debug("GetDocByShiftWord start"); + bool bRet = true; + set > result; + GetMultipleWords(fieldInfo.word, result); - int len = result.size(); - if (len <= 0) { - log_error("get shift word error."); - return -RT_GET_SUGGEST_ERR; - } + int len = result.size(); + if (len <= 0) { + log_error("get shift word error."); + return -RT_GET_SUGGEST_ERR; + } - set >::iterator iter; - for (iter = result.begin(); iter != result.end(); iter++) - { - IntelligentInfo info; - vector tmp = *iter; - ConvertIntelligent(tmp, info, bRet); - int length = tmp.size(); - if (bRet) { - bRet = GetSuggestDoc(fieldInfo, length, info, doc_id_set, appid, highlightWord); - if (!bRet) { - log_error("GetSuggestDocInfo error."); - return -RT_DTC_ERR; - } - } - } + set >::iterator iter; + for (iter = result.begin(); iter != result.end(); iter++) + { + IntelligentInfo info; + vector tmp = *iter; + ConvertIntelligent(tmp, info, bRet); + int length = tmp.size(); + if (bRet) { + bRet = GetSuggestDoc(fieldInfo, length, info, doc_id_set, appid, highlightWord); + if (!bRet) { + log_error("GetSuggestDocInfo error."); + return -RT_DTC_ERR; + } + } + } - return 0; + return 0; } int GetDocByShiftEnWord(FieldInfo fieldInfo, vector &doc_id_set, uint32_t appid, set& highlightWord) { - log_debug("GetDocByShiftEnWord start"); - bool bRet = true; + log_debug("GetDocByShiftEnWord start"); + bool bRet = true; - int length = 0; - IntelligentInfo enInfo; - ConvertCharIntelligent(fieldInfo.word, enInfo, length); - bRet = GetSuggestDocWithoutCharacter(fieldInfo, length, enInfo, doc_id_set, appid, highlightWord); - if (!bRet) { - log_error("GetEnSuggestDocInfo error."); - return -RT_DTC_ERR; - } - return 0; + int length = 0; + IntelligentInfo enInfo; + ConvertCharIntelligent(fieldInfo.word, enInfo, length); + bRet = GetSuggestDocWithoutCharacter(fieldInfo, length, enInfo, doc_id_set, appid, highlightWord); + if (!bRet) { + log_error("GetEnSuggestDocInfo error."); + return -RT_DTC_ERR; + } + return 0; } uint64_t GetSysTimeMicros(){ - timeval tv; - memset(&tv, 0, sizeof(tv)); + timeval tv; + memset(&tv, 0, sizeof(tv)); - if (0 != gettimeofday(&tv, NULL)){ - return 0; - } + if (0 != gettimeofday(&tv, NULL)){ + return 0; + } - uint64_t ullTime = (uint64_t)tv.tv_sec * (uint64_t)1000000 + tv.tv_usec; - return ullTime; + uint64_t ullTime = (uint64_t)tv.tv_sec * (uint64_t)1000000 + tv.tv_usec; + return ullTime; } string trim(string& str) @@ -1534,11 +1497,11 @@ string trim(string& str) } string delPrefix(string& str){ - size_t pos1 = str.find_first_of("(("); - size_t pos2 = str.find_last_of("))"); - string res = str; - if(pos1 != string::npos && pos2 != string::npos){ - res = str.substr(pos1+2, pos2-pos1-3); - } - return res; + size_t pos1 = str.find_first_of("(("); + size_t pos2 = str.find_last_of("))"); + string res = str; + if(pos1 != string::npos && pos2 != string::npos){ + res = str.substr(pos1+2, pos2-pos1-3); + } + return res; } \ No newline at end of file diff --git a/src/search_local/index_read/search_util.h b/src/search_local/index_read/search_util.h index 4d8cfec..1376865 100644 --- a/src/search_local/index_read/search_util.h +++ b/src/search_local/index_read/search_util.h @@ -57,7 +57,7 @@ set sets_intersection(set v1, set v2); // 集合求交 set sets_union(set v1, set v2); // 集合求并集 set sets_difference(set v1, set v2); // 集合求差集 double strToDouble(const string& str); -bool GetGisDistance(uint32_t appid, const string& latLeft, const string& lngLeft, hash_double_map& distances, hash_string_map& doc_content); +bool GetGisDistance(uint32_t appid, const string& Latitude, const string& Longtitude, hash_double_map& distances, hash_string_map& doc_content); void ConvertCharIntelligent(const string word, IntelligentInfo &info, int &len); void ConvertIntelligent(const vector &result, IntelligentInfo &info, bool &flag); bool GetGisCode(string lng, string lat, string ip, double distance, vector& gisCode); diff --git a/src/search_local/index_read/valid_doc_filter.cc b/src/search_local/index_read/valid_doc_filter.cc index 074081a..958681f 100644 --- a/src/search_local/index_read/valid_doc_filter.cc +++ b/src/search_local/index_read/valid_doc_filter.cc @@ -275,12 +275,12 @@ int ValidDocFilter::Process(const std::vector >& keys, st return 0; } -int ValidDocFilter::PureTextInvertIndexSearch(const std::vector >& keys +int ValidDocFilter::HanPinTextInvertIndexSearch(const std::vector >& keys , std::vector& index_info_vet , std::set& highlightWord , std::map& docid_keyinfo_map){ if (keys.empty() || keys.size() > 1){ - return -RT_GET_DOC_ERR; + return -RT_GET_FIELD_ERROR; } const std::vector& key_field_info_vet = keys[0]; std::vector::const_iterator iter = key_field_info_vet.cbegin(); @@ -350,13 +350,13 @@ int ValidDocFilter::RangeQueryInvertIndexSearch(const std::vector >& keys +int ValidDocFilter::TextInvertIndexSearch(const std::vector >& keys , std::vector& index_info_vet , std::set& highlightWord , std::map& docid_keyinfo_map , std::map& key_doccount_map){ if (keys.empty() || keys.size() > 1){ - return -RT_GET_DOC_ERR; + return -RT_GET_FIELD_ERROR; } const std::vector& key_field_info_vet = keys[0]; std::vector::const_iterator iter = key_field_info_vet.cbegin(); diff --git a/src/search_local/index_read/valid_doc_filter.h b/src/search_local/index_read/valid_doc_filter.h index 09018ab..1445c6a 100644 --- a/src/search_local/index_read/valid_doc_filter.h +++ b/src/search_local/index_read/valid_doc_filter.h @@ -52,7 +52,7 @@ public: , std::set& highlightWord, std::map& docid_keyinfo_map , std::map& key_doccount_map); - int PureTextInvertIndexSearch(const std::vector >& keys + int HanPinTextInvertIndexSearch(const std::vector >& keys , std::vector& index_info_vet , std::set& highlightWord , std::map& docid_keyinfo_map); @@ -60,7 +60,7 @@ public: int RangeQueryInvertIndexSearch(const std::vector >& keys , std::vector& index_info_vet); - int MixTextInvertIndexSearch(const std::vector >& keys + int TextInvertIndexSearch(const std::vector >& keys , std::vector& index_info_vet , std::set& highlightWord , std::map& docid_keyinfo_map