isearch/comm/split_tool.cc
2021-03-19 18:38:34 +08:00

872 lines
25 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include "split_tool.h"
#include <fstream>
#include <vector>
#include <stdio.h>
#include <stdint.h>
#include <math.h>
#include <stdlib.h>
#include <float.h>
#include "log.h"
using namespace std;
#define MAX_WORD_LEN 8
#define TOTAL 8000000
#define ALPHA_DIGIT "0123456789\
abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
/**
 * Reports whether every byte of `str` is an ASCII letter or digit.
 *
 * @param str  string to inspect (may contain arbitrary UTF-8 bytes)
 * @return true when all bytes are letters/digits; an empty string yields true.
 *
 * Each byte is converted to unsigned char before it is handed to the
 * <ctype.h> classifiers: passing a negative char value (which is what a
 * UTF-8 lead/continuation byte becomes when char is signed) is undefined.
 */
bool isAllAlphaOrDigit(string str) {
    for (size_t i = 0; i < str.size(); i++) {
        const unsigned char ch = static_cast<unsigned char>(str[i]);
        if (!isupper(ch) && !islower(ch) && !isdigit(ch)) {
            return false;
        }
    }
    return true;
}
// Default constructor: zeroes the training-word counter and makes sure the
// punctuation set starts out empty.
FBSegment::FBSegment() {
    train_cnt = 0;
    punct_set.clear();
}
// Returns true when `str` is one of the entries stored in alpha_set
// (filled by Init() from the characters of ALPHA_DIGIT).
bool FBSegment::isAlphaOrDigit(string str) {
    return alpha_set.find(str) != alpha_set.end();
}
bool FBSegment::Init() {
string en_punct = ",.!?/'\"<>\\:;\n";
string punct = ",。!?、;:“”‘’()《》 ";
punct = punct.append(en_punct);
iutf8string utf8_punct(punct);
for (int i = 0; i < utf8_punct.length(); i++) {
punct_set.insert(utf8_punct[i]);
}
string alphadigit = ALPHA_DIGIT;
iutf8string utf8_alpha(alphadigit);
for (int i = 0; i < utf8_alpha.length(); i++) {
alpha_set.insert(utf8_alpha[i]);
}
return true;
}
/**
 * Runs Init(), then builds the bigram table next_dict from a training file
 * and initialises train_corpus from the same file.
 *
 * Each line of the file is split on single spaces; empty tokens and tokens
 * found in punct_set are dropped. For the remaining words w1..wn the pairs
 * (<BEG>,w1), (w1,w2), ..., (wn,<END>) are counted in next_dict and n is
 * added to train_cnt.
 *
 * @param train_path  path of the training corpus
 * @return false when Init() fails, the file cannot be opened, or
 *         train_corpus.Init() fails; true otherwise.
 */
bool FBSegment::Init2(string train_path) {
    if (Init() == false) {
        return false;
    }

    ifstream train_infile;
    train_infile.open(train_path.c_str());
    if (train_infile.is_open() == false) {
        log_error("open file error: %s.\n", train_path.c_str());
        return false;
    }

    string beg_tag = "<BEG>";
    string end_tag = "<END>";
    string str;
    while (getline(train_infile, str)) {
        vector<string> str_vec = splitEx(str, " ");
        vector<string> line_list;
        for (vector<string>::iterator iter = str_vec.begin(); iter != str_vec.end(); ++iter) {
            if (punct_set.find(*iter) == punct_set.end() && *iter != "") {
                line_list.push_back(*iter);
            }
        }
        // A blank or punctuation-only line has no words. The pair loop below
        // would otherwise read line_list[0] of an empty vector.
        if (line_list.empty()) {
            continue;
        }

        train_cnt += line_list.size();
        const int last = (int)line_list.size() - 1;
        for (int i = -1; i <= last; i++) {
            string &word1 = (i == -1) ? beg_tag : line_list[i];
            string &word2 = (i == last) ? end_tag : line_list[i + 1];
            // operator[] value-initialises a missing count to 0.
            next_dict[word1][word2] += 1;
        }
    }
    train_infile.close();

    if (train_corpus.Init(train_path) == false) {
        log_error("train_corpus init error.");
        return false;
    }
    log_info("total training words length is: %u, next_dict count: %d.", train_cnt, (int)next_dict.size());
    return true;
}
bool FBSegment::Init3(string train_path, string word_path) {
bool ret = Init2(train_path);
if (ret == false) {
return ret;
}
string str;
ifstream word_infile;
word_infile.open(word_path.c_str());
if (word_infile.is_open() == false) {
log_error("open file error: %s.\n", word_path.c_str());
return false;
}
uint32_t word_id = 0;
uint32_t appid = 0;
string word;
uint32_t word_freq = 0;
while (getline(word_infile, str))
{
vector<string> str_vec = splitEx(str, "\t");
word_id = atoi(str_vec[0].c_str());
word = str_vec[1];
appid = atoi(str_vec[2].c_str());
word_freq = atoi(str_vec[3].c_str());
WordInfo word_info;
word_info.appid = appid;
word_info.word_freq = word_freq;
word_info.word_id = word_id;
word_dict[word][appid] = word_info;
}
log_info("word_dict count: %d", (int)word_dict.size());
return true;
}
bool FBSegment::WordValid(string word, uint32_t appid) {
if(punct_set.find(word) != punct_set.end()){
return false;
}
if (word_dict.find(word) != word_dict.end()) {
map<uint32_t, WordInfo> wordInfo = word_dict[word];
if (wordInfo.find(0) != wordInfo.end() || wordInfo.find(appid) != wordInfo.end()) {
return true;
}
}
return false;
}
bool FBSegment::GetWordInfo(string word, uint32_t appid, WordInfo &word_info) {
if (word_dict.find(word) != word_dict.end()) {
map<uint32_t, WordInfo> wordInfo = word_dict[word];
if (wordInfo.find(0) != wordInfo.end()) {
word_info = wordInfo[0];
return true;
}
if (wordInfo.find(appid) != wordInfo.end()) {
word_info = wordInfo[appid];
return true;
}
}
return false;
}
bool FBSegment::GetWordInfoFromDictOnly(string word, uint32_t appid, WordInfo &word_info) {
if (word_dict.find(word) != word_dict.end()) {
map<uint32_t, WordInfo> wordInfo = word_dict[word];
if (wordInfo.find(0) != wordInfo.end()) {
word_info = wordInfo[0];
return true;
}
if (wordInfo.find(appid) != wordInfo.end()) {
word_info = wordInfo[appid];
return true;
}
}
return false;
}
// Forward maximum matching. Walks `phrase` left to right; at each position
// it tries the longest window (up to MAX_WORD_LEN characters) first and
// shrinks it until WordValid() accepts the prefix. When no prefix of two or
// more characters is valid, the single leading character is emitted.
// Results are appended to fmm_list.
void FBSegment::FMM2(iutf8string &phrase, uint32_t appid, vector<string> &fmm_list) {
    const int window_max = MAX_WORD_LEN;
    const int total = phrase.length();
    int cursor = 0;
    while (cursor < total) {
        int window_end = cursor + window_max;
        if (window_end >= total) {
            window_end = total;
        }
        iutf8string window = phrase.utf8substr(cursor, window_end - cursor);

        // `take` is the candidate length; stop shrinking at 1.
        int take = window.length();
        while (take >= 0 && take != 1) {
            iutf8string candidate = window.utf8substr(0, take);
            const string word = candidate.stlstring();
            if (WordValid(word, appid) == true) {
                fmm_list.push_back(word);
                // Skip the matched word; the trailing +1 below covers the
                // last character.
                cursor += candidate.length() - 1;
                break;
            }
            --take;
        }
        if (take == 1) {
            fmm_list.push_back(window[0]);
        }
        cursor += 1;
    }
}
// Backward maximum matching that does not query DTC.
// Same scan as BMM2 but with a window of MAX_WORD_LEN - 2 characters: the
// phrase is consumed right to left, and at each step the longest valid
// suffix of the window is prepended to bmm_list. When no suffix of two or
// more characters is valid, the last character of the window is prepended.
void FBSegment::BMM(iutf8string &phrase, uint32_t appid, vector<string> &bmm_list) {
    const int window_max = MAX_WORD_LEN - 2;
    int right = phrase.length();
    while (right > 0) {
        int left = right - window_max;
        if (left < 0) {
            left = 0;
        }
        iutf8string window = phrase.utf8substr(left, right - left);
        const int window_len = window.length();

        int offset = 0;
        while (offset < window_len && offset != window_len - 1) {
            iutf8string candidate = window.utf8substr(offset, window_len - offset);
            const string word = candidate.stlstring();
            if (WordValid(word, appid) == true) {
                bmm_list.insert(bmm_list.begin(), word);
                right -= candidate.length() - 1;
                break;
            }
            ++offset;
        }
        // Reaching the last character means nothing longer matched.
        if (offset == window_len - 1) {
            bmm_list.insert(bmm_list.begin(), "" + window[offset]);
        }
        right -= 1;
    }
}
// Backward maximum matching with a window of MAX_WORD_LEN characters.
// Consumes `phrase` from its end; every step prepends to bmm_list either the
// longest suffix of the current window accepted by WordValid(), or - when no
// suffix of two or more characters is accepted - the window's last character.
void FBSegment::BMM2(iutf8string &phrase, uint32_t appid, vector<string> &bmm_list) {
    const int window_max = MAX_WORD_LEN;
    for (int tail = phrase.length(); tail > 0; tail -= 1) {
        const int head = (tail - window_max < 0) ? 0 : tail - window_max;
        iutf8string window = phrase.utf8substr(head, tail - head);
        const int window_len = window.length();

        bool matched = false;
        for (int from = 0; from + 1 < window_len; ++from) {
            iutf8string suffix = window.utf8substr(from, window_len - from);
            if (WordValid(suffix.stlstring(), appid) == true) {
                bmm_list.insert(bmm_list.begin(), suffix.stlstring());
                // The loop's own decrement accounts for the final character.
                tail -= suffix.length() - 1;
                matched = true;
                break;
            }
        }
        if (!matched && window_len > 0) {
            bmm_list.insert(bmm_list.begin(), "" + window[window_len - 1]);
        }
    }
}
/**
 * Segments `phrase` with both forward (FMM2) and backward (BMM2) maximum
 * matching and returns the better of the two results:
 *   1. different word counts  -> the result with fewer words;
 *   2. identical results      -> either one (the forward result is returned);
 *   3. same count, different  -> the result with fewer single-character
 *      words, the backward result winning ties.
 *
 * Single-character words are counted in UTF-8 characters. Counting
 * std::string bytes, as was done before, never matches a single CJK
 * character (those are multi-byte), which left rule 3 without effect.
 */
vector<string> FBSegment::segment_part(iutf8string &phrase, uint32_t appid) {
    vector<string> fmm_list;
    FMM2(phrase, appid, fmm_list);  // forward maximum matching
    vector<string> bmm_list;
    BMM2(phrase, appid, bmm_list);  // backward maximum matching

    if (fmm_list.size() != bmm_list.size()) {
        return (fmm_list.size() > bmm_list.size()) ? bmm_list : fmm_list;
    }

    int f_single = 0;
    int b_single = 0;
    bool is_same = true;
    for (size_t i = 0; i < fmm_list.size(); i++) {
        if (fmm_list[i] != bmm_list[i]) {
            is_same = false;
        }
        iutf8string f_word(fmm_list[i]);
        if (f_word.length() == 1) {
            f_single += 1;
        }
        iutf8string b_word(bmm_list[i]);
        if (b_word.length() == 1) {
            b_single += 1;
        }
    }
    if (is_same) {
        return fmm_list;
    }
    return (b_single > f_single) ? fmm_list : bmm_list;
}
// Segments `phrase` into words and writes them to new_res_all.
//
// Step 1: the phrase is cut into pieces: runs of characters accepted by
//         isAlphaOrDigit(), punctuation characters (punct_set), and runs of
//         everything else. Alphanumeric runs are remembered in special_set.
// Step 2: every piece that is neither an alphanumeric run nor punctuation is
//         segmented according to `mode`:
//           "Pre"   - FMM2 (forward maximum matching)
//           "Post"  - BMM2 (backward maximum matching)
//           "DAG"   - __cut_DAG_NO_HMM
//           "Cache" - BMM
//           other   - FMM2 and BMM2 are both run and, where they disagree,
//                     the variant with the higher CalSegProbability() wins.
//         Alphanumeric runs are kept whole; punctuation is dropped.
// Step 3: when Hmm_flag is true, consecutive single multi-byte characters in
//         the result are joined and re-split with HMM_split(); otherwise the
//         result of step 2 is copied to new_res_all as is.
void FBSegment::segment2(iutf8string &phrase, uint32_t appid, vector<string> &new_res_all, string mode, bool Hmm_flag) {
vector<string> sen_list;
set<string> special_set; // records the letter/digit runs
string tmp_words = "";
bool flag = false; // true while tmp_words holds a letter/digit run
for (int i = 0; i < phrase.length(); i++) {
if (isAlphaOrDigit(phrase[i])) {
// a non-alphanumeric run ends here: flush it before starting the new run
if (tmp_words != "" and flag == false) {
sen_list.push_back(tmp_words);
tmp_words = "";
}
flag = true;
tmp_words += phrase[i];
}
else if(punct_set.find(phrase[i]) != punct_set.end()){
// punctuation closes the pending run; it is recorded only when a run was pending
if (tmp_words != "") {
sen_list.push_back(tmp_words);
sen_list.push_back(phrase[i]);
if (flag == true) {
special_set.insert(tmp_words);
flag = false;
}
tmp_words = "";
}
}
else {
if (flag == true) {
// an alphanumeric run ends at this character, which starts a new run
sen_list.push_back(tmp_words);
special_set.insert(tmp_words);
flag = false;
tmp_words = phrase[i];
}
else {
tmp_words += phrase[i];
}
}
}
// flush whatever run is still pending at the end of the phrase
if (tmp_words != "") {
sen_list.push_back(tmp_words);
if (flag == true) {
special_set.insert(tmp_words);
}
}
tmp_words = "";
vector<string> res_all;
for (int i = 0; i < (int)sen_list.size(); i++) {
// special_set holds the consecutive letter/digit runs, which are not segmented
if (special_set.find(sen_list[i]) == special_set.end() && punct_set.find(sen_list[i]) == punct_set.end()) {
iutf8string utf8_str(sen_list[i]);
vector<string> parse_list;
if (mode == "Pre") {
FMM2(utf8_str, appid, parse_list);
}
else if (mode == "Post") {
BMM2(utf8_str, appid, parse_list);
}
else if (mode == "DAG") {
__cut_DAG_NO_HMM(sen_list[i], appid, parse_list);
}
else if (mode == "Cache") { // word dict in cache, not from DTC
BMM(utf8_str, appid, parse_list);
}
else { // PrePostNGram
vector<string> parse_list1;
vector<string> parse_list2;
FMM2(utf8_str, appid, parse_list1);
BMM2(utf8_str, appid, parse_list2);
// both sequences are wrapped in <BEG>/<END>; the markers are removed again after merging
parse_list1.insert(parse_list1.begin(), "<BEG>");
parse_list1.push_back("<END>");
parse_list2.insert(parse_list2.begin(), "<BEG>");
parse_list2.push_back("<END>");
// cal_list1 and cal_list2 collect the parts where the two word sequences differ
vector<string> cal_list1;
vector<string> cal_list2;
// pos1/pos2 track the current position inside each sequence (advanced by
// std::string::size() of each word); cur1/cur2 are the current word indexes
uint32_t pos1 = 0;
uint32_t pos2 = 0;
uint32_t cur1 = 0;
uint32_t cur2 = 0;
while (1) {
if (cur1 == parse_list1.size() && cur2 == parse_list2.size()) {
break;
}
// the current positions are equal
if (pos1 == pos2) {
// positions are equal and the current words have the same size
// (only the sizes are compared here, not the contents)
if (parse_list1[cur1].size() == parse_list2[cur2].size()) {
pos1 += parse_list1[cur1].size();
pos2 += parse_list2[cur2].size();
// two differing word sequences have been collected: pick the more probable one by bigram.
// Note: the word before and the word after are added for scoring and removed again when splicing
if (cal_list1.size() > 0) {
cal_list1.insert(cal_list1.begin(), parse_list[parse_list.size() - 1]);
cal_list2.insert(cal_list2.begin(), parse_list[parse_list.size() - 1]);
if (cur1 < parse_list1.size()-1) {
cal_list1.push_back(parse_list1[cur1]);
cal_list2.push_back(parse_list2[cur2]);
}
double p1 = CalSegProbability(cal_list1);
double p2 = CalSegProbability(cal_list2);
vector<string> cal_list = (p1 > p2) ? cal_list1 : cal_list2;
cal_list.erase(cal_list.begin());
if (cur1 < parse_list1.size() - 1) {
cal_list.pop_back();
}
parse_list.insert(parse_list.end(), cal_list.begin(), cal_list.end());
cal_list1.clear();
cal_list2.clear();
}
parse_list.push_back(parse_list1[cur1]);
cur1++;
cur2++;
}
// same position but different word sizes: advance the sequence with the shorter word and record that word
else if (parse_list1[cur1].size() > parse_list2[cur2].size()) {
cal_list2.push_back(parse_list2[cur2]);
pos2 += parse_list2[cur2].size();
cur2++;
}
else {
cal_list1.push_back(parse_list1[cur1]);
pos1 += parse_list1[cur1].size();
cur1++;
}
}
else {
// positions differ but the current words end at the same place: advance both
if (pos1 + parse_list1[cur1].size() == pos2 + parse_list2[cur2].size()) {
cal_list1.push_back(parse_list1[cur1]);
cal_list2.push_back(parse_list2[cur2]);
pos1 += parse_list1[cur1].size();
pos2 += parse_list2[cur2].size();
cur1++;
cur2++;
}
else if (pos1 + parse_list1[cur1].size() > pos2 + parse_list2[cur2].size()) {
cal_list2.push_back(parse_list2[cur2]);
pos2 += parse_list2[cur2].size();
cur2++;
}
else {
cal_list1.push_back(parse_list1[cur1]);
pos1 += parse_list1[cur1].size();
cur1++;
}
}
}
// drop the <BEG> and <END> markers
parse_list.erase(parse_list.begin());
parse_list.pop_back();
}
res_all.insert(res_all.end(), parse_list.begin(), parse_list.end());
}
else {
// alphanumeric runs are kept whole, punctuation is dropped
if(punct_set.find(sen_list[i]) == punct_set.end()){
res_all.push_back(sen_list[i]);
}
}
}
if (Hmm_flag == false) {
new_res_all.assign(res_all.begin(), res_all.end());
}
else {
// use the HMM to discover new words
string buf = "";
for (size_t i = 0; i < res_all.size(); i++) {
iutf8string utf8_str(res_all[i]);
if (utf8_str.length() == 1 && punct_set.find(res_all[i]) == punct_set.end() && res_all[i].length() > 1) { // res_all[i] is one multi-byte character (intended: a Chinese character)
buf += res_all[i];
}
else {
if (buf.length() > 0) {
iutf8string utf8_buf(buf);
if (utf8_buf.length() == 1) {
new_res_all.push_back(buf);
}
else if (WordValid(buf, appid) == false) { // consecutive single characters are joined and segmented with the HMM
vector<string> vec = HMM_split(buf);
new_res_all.insert(new_res_all.end(), vec.begin(), vec.end());
}
else { // can this case actually occur?
new_res_all.push_back(buf);
}
}
buf = "";
new_res_all.push_back(res_all[i]);
}
}
// flush the single-character run still buffered at the end
if (buf.length() > 0) {
iutf8string utf8_buf(buf);
if (utf8_buf.length() == 1) {
new_res_all.push_back(buf);
}
else if (WordValid(buf, appid) == false) { // consecutive single characters are joined and segmented with the HMM
vector<string> vec = HMM_split(buf);
new_res_all.insert(new_res_all.end(), vec.begin(), vec.end());
}
else { // can this case actually occur?
new_res_all.push_back(buf);
}
buf = "";
}
}
return;
}
// Search-engine mode. Segments `phrase` with segment2() and, for every
// resulting token that is not purely letters/digits, additionally emits the
// valid 2-character sub-words (tokens longer than 2 characters) and the
// valid 3-character sub-words (tokens longer than 3 characters).
// One vector per token is appended to search_res_all: sub-words first
// (all 2-grams, then all 3-grams), the token itself last.
void FBSegment::cut_for_search(iutf8string &phrase, uint32_t appid, vector<vector<string> > &search_res_all, string mode) {
    vector<string> tokens;
    segment2(phrase, appid, tokens, mode);

    for (size_t t = 0; t < tokens.size(); ++t) {
        iutf8string token_chars(tokens[t]);
        const int char_count = token_chars.length();
        vector<string> expanded;

        if (char_count > 2 && isAllAlphaOrDigit(tokens[t]) == false) {
            for (int width = 2; width <= 3; ++width) {
                if (char_count <= width) {
                    continue;
                }
                const int last_start = char_count - width;
                for (int start = 0; start <= last_start; ++start) {
                    string piece = token_chars.substr(start, width);
                    if (WordValid(piece, appid) == true) {
                        expanded.push_back(piece);
                    }
                }
            }
        }
        expanded.push_back(tokens[t]);
        search_res_all.push_back(expanded);
    }
}
// Appends to search_res every character n-gram of `phrase` for widths
// 1..min(n, phrase length): all 1-grams left to right, then all 2-grams,
// and so on.
void FBSegment::cut_ngram(iutf8string &phrase, vector<string> &search_res, uint32_t n) {
    const uint32_t char_count = (uint32_t)phrase.length();
    const uint32_t max_width = (n > char_count) ? char_count : n;
    for (size_t width = 1; width <= max_width; ++width) {
        const size_t window_count = (size_t)phrase.length() - width + 1;
        for (size_t start = 0; start < window_count; ++start) {
            search_res.push_back(phrase.substr(start, width));
        }
    }
}
/**
 * Viterbi decoding over the four tags 'B', 'M', 'E', 'S' using the
 * start/transition/emission tables of train_corpus. Characters missing
 * from an emission table score train_corpus.MinEmit().
 *
 * @param sentence  UTF-8 text to tag
 * @return one tag per character; an empty vector for an empty sentence.
 *
 * Robustness notes:
 *  - an empty sentence returns early instead of indexing utf8_str[0] and
 *    V[-1];
 *  - when every candidate probability at a step is 0 (zero transition
 *    probability or floating-point underflow on long inputs), the first
 *    state is used as predecessor / final state. Previously the sentinel
 *    ' ' was used as a map key, which produced paths shorter than the
 *    sentence, or an empty path.
 */
vector<char> FBSegment::viterbi(string sentence) {
    iutf8string utf8_str(sentence);
    const int char_count = utf8_str.length();
    if (char_count <= 0) {
        return vector<char>();
    }

    vector< map<char, double> > V;
    map<char, vector<char> > path;
    char states[4] = { 'B','M','E','S' };
    map<char, double> prob_map;

    for (size_t i = 0; i < sizeof(states); i++) {
        char y = states[i];
        double emit_value = train_corpus.MinEmit();
        if (train_corpus.emit_dict[y].find(utf8_str[0]) != train_corpus.emit_dict[y].end()) {
            emit_value = train_corpus.emit_dict[y].at(utf8_str[0]);
        }
        // best probability of a tag sequence ending in y at position 0
        prob_map[y] = train_corpus.start_dict[y] * emit_value;
        path[y].push_back(y);
    }
    V.push_back(prob_map);

    for (int j = 1; j < char_count; j++) {
        map<char, vector<char> > new_path;
        prob_map.clear();
        for (size_t k = 0; k < sizeof(states); k++) {
            char y = states[k];
            // The emission score depends only on (y, character j), so it is
            // computed once per target state.
            double emit_value = train_corpus.MinEmit();
            if (train_corpus.emit_dict[y].find(utf8_str[j]) != train_corpus.emit_dict[y].end()) {
                emit_value = train_corpus.emit_dict[y].at(utf8_str[j]);
            }
            double max_prob = 0.0;
            char state = ' ';
            for (size_t m = 0; m < sizeof(states); m++) {
                char y0 = states[m];  // transition y0 -> y
                double prob = V[j - 1][y0] * train_corpus.trans_dict[y0][y] * emit_value;
                if (prob > max_prob) {
                    max_prob = prob;
                    state = y0;
                }
            }
            if (state == ' ') {
                state = states[0];  // no positive candidate: keep the path length intact
            }
            prob_map[y] = max_prob;
            new_path[y] = path[state];
            new_path[y].push_back(y);
        }
        V.push_back(prob_map);
        path = new_path;
    }

    double max_prob = 0.0;
    char state = ' ';
    for (size_t i = 0; i < sizeof(states); i++) {
        char y = states[i];
        if (V[char_count - 1][y] > max_prob) {
            max_prob = V[char_count - 1][y];
            state = y;
        }
    }
    if (state == ' ') {
        state = states[0];  // all final scores are 0: still return a full-length path
    }
    return path[state];
}
/**
 * Splits `sentence` into words according to the tags returned by
 * viterbi(): a word ends after every character tagged 'E' or 'S'.
 *
 * The characters are joined into one string with a space after each word
 * end; a trailing space is removed and the string is split with
 * splitEx(result, " ").
 *
 * @return the words produced by splitEx().
 */
vector<string> FBSegment::HMM_split(string sentence) {
    vector<char> pos_list = viterbi(sentence);
    iutf8string utf8_str(sentence);

    string result;
    for (size_t i = 0; i < pos_list.size(); i++) {
        result += utf8_str[i];
        if (pos_list[i] == 'E' || pos_list[i] == 'S') {
            result += ' ';
        }
    }
    // Guard the empty case: result.size() - 1 would wrap around and index
    // far outside the string.
    if (!result.empty() && result[result.size() - 1] == ' ') {
        result.erase(result.size() - 1);
    }
    return splitEx(result, " ");
}
/**
 * Scores a word sequence with an add-one-smoothed bigram model.
 * Probabilities are accumulated as a sum of logarithms because the
 * individual values are very small.
 *
 *  - For each adjacent pair (w1, w2):
 *      w1 unseen in next_dict -> log(1 / train_cnt)
 *      otherwise              -> log((count(w1,w2) + 1) /
 *                                    (train_cnt + sum of counts after w1))
 *  - For the first real word (index 0, or index 1 when the sequence starts
 *    with "<BEG>"):
 *      log((word_freq + 1) / (next_dict.size() + train_cnt))
 *    where word_freq is the appid-0 dictionary frequency, or 0 when the
 *    word is not in the dictionary.
 *
 * The first-word term used to be written without parentheses
 * (`word_freq + 1.0 / next_dict.size() + train_cnt`), which evaluates to
 * word_freq + 1/size + train_cnt - a value far above 1, not a probability.
 *
 * @param vec  word sequence, optionally wrapped in "<BEG>" / "<END>"
 * @return the log-probability of the sequence (0 for an empty sequence).
 */
double FBSegment::CalSegProbability(const vector<string> &vec) {
    double p = 0;
    const int word_count = (int)vec.size();
    for (int pos = 0; pos < word_count; pos++) {
        if (pos != word_count - 1) {
            // conditional probability of the following word
            const string &word1 = vec[pos];
            const string &word2 = vec[pos + 1];
            if (next_dict.find(word1) == next_dict.end()) {
                // add-one smoothing
                p += log(1.0 / train_cnt);
            }
            else {
                double numerator = 1.0;
                uint32_t denominator = train_cnt;
                map<string, int> &followers = next_dict[word1];
                for (map<string, int>::iterator iter = followers.begin(); iter != followers.end(); ++iter) {
                    if (iter->first == word2) {
                        numerator += iter->second;
                    }
                    denominator += iter->second;
                }
                p += log(numerator / denominator);
            }
        }
        // probability of the first word
        if ((pos == 0 && vec[pos] != "<BEG>") || (pos == 1 && vec[0] == "<BEG>")) {
            uint32_t word_freq = 0;
            WordInfo word_info;
            if (GetWordInfo(vec[pos], 0, word_info)) {
                word_freq = word_info.word_freq;
            }
            p += log((word_freq + 1.0) / (next_dict.size() + train_cnt));
        }
    }
    return p;
}
// Segments `phrase` into words for `appid`.
//
// Pass 1: the phrase is cut at every character found in punct_set. Each
//         piece in front of a punctuation mark is further split into runs
//         that start with a single-byte character and runs that start with
//         a multi-byte character. The text after the last punctuation mark
//         is added as one piece without that run splitting.
// Pass 2: pieces made only of letters/digits are kept whole; every other
//         piece is segmented with segment_part().
// Punctuation characters themselves are not part of the result.
vector<string> FBSegment::segment(iutf8string &phrase, uint32_t appid) {
vector<string> res;
int last = 0; // index of the first character after the previous punctuation mark
for (int i = 0; i < phrase.length(); i++) {
if (punct_set.find(phrase[i]) != punct_set.end()) {
iutf8string fragment = phrase.utf8substr(last, i - last);
for (int j = 0; j < fragment.length(); ) { // keep splitting: merge consecutive Chinese or consecutive English characters
if (fragment[j].length() == 1) {
// the run starts with a single-byte character
string tmp = fragment[j];
int k = 1;
for (; k < fragment.length() - j; k++) {
if (fragment[j + k].size() > 1) { // run of English: stop at the first multi-byte character
res.push_back(tmp);
break;
}
else { // not a letter or digit: cut here
char frag_char = fragment[j + k][0];
if (!isupper(frag_char) && !islower(frag_char) && !isdigit(frag_char)) {
res.push_back(tmp);
break;
}
}
tmp += fragment[j + k];
}
if (k == fragment.length() - j) { // the loop finished without a break: append the run to res
res.push_back(tmp);
}
j = j + k;
}
else {
// the run starts with a multi-byte character
string tmp = fragment[j];
int k = 1;
for (; k < fragment.length() - j; k++) {
if (fragment[j + k].size() == 1) { // run of Chinese: stop at the first single-byte character
res.push_back(tmp);
break;
}
tmp += fragment[j + k];
}
if (k == fragment.length() - j) {
res.push_back(tmp);
}
j = j + k;
}
}
last = i + 1;
}
}
// text after the last punctuation mark is taken as a single piece
if (last < phrase.length()) {
string fragment = phrase.substr(last, phrase.length() - last);
res.push_back(fragment);
}
vector<string> res_all;
vector<string>::iterator iter = res.begin();
for (; iter != res.end(); iter++) {
// NOTE: this local `res` shadows the outer `res` that `iter` walks over
vector<string> res;
string str = *iter;
if (isAllAlphaOrDigit(str)) { // English (letters/digits) is not segmented
res_all.push_back(str);
}
else {
iutf8string utf8_str(*iter);
res = segment_part(utf8_str, appid);
res_all.insert(res_all.end(), res.begin(), res.end());
}
}
return res_all;
}
/**
 * Builds the word graph of `sentence`.
 *
 * For every start index k, DAG[k] lists the end indexes i (inclusive, in
 * characters) for which the characters k..i form a word accepted by
 * WordValid(). When no word starts at k, DAG[k] is {k}, so the single
 * character is always available.
 *
 * The next fragment is only built while it still lies inside the
 * sentence. The last pass of the inner loop used to request
 * substr(k, N + 1 - k), one character beyond the end, for a fragment that
 * was never used.
 */
void FBSegment::get_DAG(string sentence, uint32_t appid, map<uint32_t, vector<uint32_t> > &DAG) {
    iutf8string utf8_str(sentence);
    uint32_t N = utf8_str.length();
    for (uint32_t k = 0; k < N; k++) {
        uint32_t i = k;
        vector<uint32_t> tmplist;
        string frag = utf8_str[k];
        while (i < N) {
            if (WordValid(frag, appid) == true) {
                tmplist.push_back(i);
            }
            i++;
            if (i < N) {
                frag = utf8_str.substr(k, i + 1 - k);
            }
        }
        if (tmplist.empty()) {
            tmplist.push_back(k);
        }
        DAG[k] = tmplist;
    }
}
void FBSegment::calc(string sentence, const map<uint32_t, vector<uint32_t> > &DAG, map<uint32_t, RouteValue> &route, uint32_t appid) {
iutf8string utf8_str(sentence);
uint32_t N = utf8_str.length();
RouteValue route_N;
route[N] = route_N;
double logtotal = log(TOTAL);
for (int i = N - 1; i > -1; i--) {
vector<uint32_t> vec = DAG.at(i);
double max_route = -DBL_MAX;
uint32_t max_idx = 0;
for (size_t t = 0; t < vec.size(); t++) {
string word = utf8_str.substr(i, vec[t] + 1 - i);
WordInfo word_info;
uint32_t word_freq = 1;
/* 不查DTC改为从本地词库查询
if (word_manager.WordValid(word, appid, word_info) == true) {
word_freq = word_info.word_freq;
}
*/
if (word_dict.find(word) != word_dict.end()) {
map<uint32_t, WordInfo> wordInfo = word_dict[word];
if (wordInfo.find(0) != wordInfo.end()) {
word_info = wordInfo[0];
word_freq = word_info.word_freq;
}
if (wordInfo.find(appid) != wordInfo.end()) {
word_info = wordInfo[appid];
word_freq = word_info.word_freq;
}
}
double route_value = log(word_freq) - logtotal + route[vec[t] + 1].max_route;
if (route_value > max_route) {
max_route = route_value;
max_idx = vec[t];
}
}
RouteValue route_value;
route_value.max_route = max_route;
route_value.idx = max_idx;
route[i] = route_value;
}
}
// Segments `sentence` along the best path computed by get_DAG() + calc()
// and appends the words to `vec`. Consecutive path words that consist only
// of letters/digits are glued together and emitted as one token.
void FBSegment::__cut_DAG_NO_HMM(string sentence, uint32_t appid, vector<string> &vec) {
    map<uint32_t, vector<uint32_t> > DAG;
    get_DAG(sentence, appid, DAG);
    map<uint32_t, RouteValue> route;
    calc(sentence, DAG, route, appid);

    iutf8string utf8_str(sentence);
    const uint32_t char_count = utf8_str.length();
    string pending;  // letter/digit pieces collected so far
    for (uint32_t pos = 0; pos < char_count; ) {
        const uint32_t next = route[pos].idx + 1;
        const string piece = utf8_str.substr(pos, next - pos);
        pos = next;

        if (isAllAlphaOrDigit(piece)) {
            pending += piece;
            continue;
        }
        if (!pending.empty()) {
            vec.push_back(pending);
            pending = "";
        }
        vec.push_back(piece);
    }
    if (!pending.empty()) {
        vec.push_back(pending);
    }
}