isearch/comm/split_tool.cc

#include "split_tool.h"
#include <fstream>
#include <vector>
#include <stdio.h>
#include <stdint.h>
#include <math.h>
#include <stdlib.h>
#include <float.h>
#include "log.h"
using namespace std;
// Window size, in characters, used by the maximum-matching passes.
#define MAX_WORD_LEN 8
// Constant used by calc() as the normaliser for word frequencies.
#define TOTAL 8000000
// Every character treated as "letter or digit": ASCII and full-width digits,
// ASCII letters, then the complete full-width lowercase and uppercase alphabets.
// (The full-width runs previously had a duplicated 'ｇ' in place of 'ｊ' and no 'Ｔ'.)
#define ALPHA_DIGIT "0123456789１２３４５６７８９０" \
	"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" \
	"ａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ" \
	"ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺ"


// Returns true when every byte of `str` is an ASCII letter or digit.
// An empty string yields true because no byte fails the check.
bool isAllAlphaOrDigit(string str) {
	for (size_t i = 0; i < str.size(); i++) {
		// The <ctype.h> classifiers are only defined for values representable as
		// unsigned char (or EOF); a UTF-8 byte held in a signed char is negative.
		const unsigned char ch = static_cast<unsigned char>(str[i]);
		if (!isupper(ch) && !islower(ch) && !isdigit(ch)) {
			return false;
		}
	}
	return true;
}

// Default constructor: zero training-token counter and an emptied punctuation set.
FBSegment::FBSegment() : train_cnt(0) {
	punct_set.clear();
}

// Returns true when `str` is one of the entries stored in alpha_set.
bool FBSegment::isAlphaOrDigit(string str) {
	return alpha_set.find(str) != alpha_set.end();
}


bool FBSegment::Init() {

	string en_punct = ",.!?/'\"<>\\:;\n";
	string punct = "，。！？、；：“”‘’（）《》 ";
	punct = punct.append(en_punct);
	iutf8string utf8_punct(punct);
	for (int i = 0; i < utf8_punct.length(); i++) {
		punct_set.insert(utf8_punct[i]);
	}

	string alphadigit = ALPHA_DIGIT;
	iutf8string utf8_alpha(alphadigit);
	for (int i = 0; i < utf8_alpha.length(); i++) {
		alpha_set.insert(utf8_alpha[i]);
	}

	return true;
}

// Runs Init(), then reads the training corpus at `train_path` (one sentence per
// line, tokens separated by single spaces) and accumulates bigram counts into
// next_dict, with "<BEG>" / "<END>" as sentence boundary markers. Finally
// initialises train_corpus from the same file.
// Returns false if Init() fails, the file cannot be opened, or
// train_corpus.Init() fails.
bool FBSegment::Init2(string train_path) {
	if (!Init()) {
		return false;
	}

	ifstream train_infile(train_path.c_str());
	if (!train_infile.is_open()) {
		log_error("open file error: %s.\n", train_path.c_str());
		return false;
	}

	const string beg_tag = "<BEG>";
	const string end_tag = "<END>";
	string line;
	while (getline(train_infile, line)) {
		// Keep only non-empty tokens that are not punctuation.
		vector<string> tokens = splitEx(line, " ");
		vector<string> words;
		for (vector<string>::iterator tok = tokens.begin(); tok != tokens.end(); ++tok) {
			if (!tok->empty() && punct_set.find(*tok) == punct_set.end()) {
				words.push_back(*tok);
			}
		}
		// A blank or punctuation-only line has no words; without this guard the
		// "<BEG> -> first word" pair would index an empty vector.
		if (words.empty()) {
			continue;
		}
		train_cnt += words.size();

		// Count <BEG>->w0, w0->w1, ..., w(n-1)-><END>.
		// map::operator[] value-initialises a missing counter to 0.
		string prev = beg_tag;
		for (size_t i = 0; i < words.size(); i++) {
			next_dict[prev][words[i]] += 1;
			prev = words[i];
		}
		next_dict[prev][end_tag] += 1;
	}
	train_infile.close();

	if (!train_corpus.Init(train_path)) {
		log_error("train_corpus init error.");
		return false;
	}
	log_info("total training words length is: %u, next_dict count: %d.", train_cnt, (int)next_dict.size());

	return true;
}

bool FBSegment::Init3(string train_path, string word_path) {
	bool ret = Init2(train_path);
	if (ret == false) {
		return ret;
	}

	string str;
	ifstream word_infile;
	word_infile.open(word_path.c_str());
	if (word_infile.is_open() == false) {
		log_error("open file error: %s.\n", word_path.c_str());
		return false;
	}

	uint32_t word_id = 0;
	uint32_t appid = 0;
	string word;
	uint32_t word_freq = 0;
	while (getline(word_infile, str))
	{
		vector<string> str_vec = splitEx(str, "\t");
		word_id = atoi(str_vec[0].c_str());
		word = str_vec[1];
		appid = atoi(str_vec[2].c_str());
		word_freq = atoi(str_vec[3].c_str());
		WordInfo word_info;
		word_info.appid = appid;
		word_info.word_freq = word_freq;
		word_info.word_id = word_id;
		word_dict[word][appid] = word_info;
	}
	log_info("word_dict count: %d", (int)word_dict.size());

	return true;
}

bool FBSegment::WordValid(string word, uint32_t appid) {
	if(punct_set.find(word) != punct_set.end()){
		return false;
	}
	if (word_dict.find(word) != word_dict.end()) {
		map<uint32_t, WordInfo> wordInfo = word_dict[word];
		if (wordInfo.find(0) != wordInfo.end() || wordInfo.find(appid) != wordInfo.end()) {
			return true;
		}
	}

	return false;
}

bool FBSegment::GetWordInfo(string word, uint32_t appid, WordInfo &word_info) {
	if (word_dict.find(word) != word_dict.end()) {
		map<uint32_t, WordInfo> wordInfo = word_dict[word];
		if (wordInfo.find(0) != wordInfo.end()) {
			word_info = wordInfo[0];
			return true;
		}
		if (wordInfo.find(appid) != wordInfo.end()) {
			word_info = wordInfo[appid];
			return true;
		}
	}

	return false;
}

bool FBSegment::GetWordInfoFromDictOnly(string word, uint32_t appid, WordInfo &word_info) {
	if (word_dict.find(word) != word_dict.end()) {
		map<uint32_t, WordInfo> wordInfo = word_dict[word];
		if (wordInfo.find(0) != wordInfo.end()) {
			word_info = wordInfo[0];
			return true;
		}
		if (wordInfo.find(appid) != wordInfo.end()) {
			word_info = wordInfo[appid];
			return true;
		}
	}
	return false;
}

// Forward maximum matching: scans `phrase` left to right and, at each position,
// appends to `fmm_list` the longest prefix (at most MAX_WORD_LEN characters,
// at least two) accepted by WordValid; when none is accepted the single
// character at that position is appended instead.
void FBSegment::FMM2(iutf8string  &phrase, uint32_t appid, vector<string> &fmm_list) {
	const int total_chars = phrase.length();
	int cursor = 0;

	while (cursor < total_chars) {
		// Window of up to MAX_WORD_LEN characters starting at the cursor.
		int window_end = cursor + MAX_WORD_LEN;
		if (window_end >= total_chars)
			window_end = total_chars;
		iutf8string window = phrase.utf8substr(cursor, window_end - cursor);

		int advance = 1;
		bool matched = false;
		// Try the longest candidate first; widths below two are never looked up.
		for (int width = window.length(); width > 1; --width) {
			iutf8string candidate = window.utf8substr(0, width);
			if (WordValid(candidate.stlstring(), appid) == true) {
				fmm_list.push_back(candidate.stlstring());
				advance = candidate.length();
				matched = true;
				break;
			}
		}
		if (!matched && window.length() >= 1) {
			fmm_list.push_back(window[0]);
		}
		cursor += advance;
	}
}

// Backward maximum matching with a window of MAX_WORD_LEN - 2 characters
// (dictionary lookups do not query DTC). Scans `phrase` right to left and, for
// each position, prepends to `bmm_list` the longest suffix of the window (at
// least two characters) accepted by WordValid, or else the single last character.
void FBSegment::BMM(iutf8string  &phrase, uint32_t appid, vector<string> &bmm_list) {
	const int window_max = MAX_WORD_LEN - 2;
	int right = phrase.length();

	while (right > 0) {
		const int left = (right - window_max < 0) ? 0 : right - window_max;
		iutf8string window = phrase.utf8substr(left, right - left);
		const int window_len = window.length();

		int consumed = 1;
		bool matched = false;
		// offset 0 is the longest suffix; the one-character suffix is never looked up.
		for (int offset = 0; offset < window_len - 1; ++offset) {
			iutf8string candidate = window.utf8substr(offset, window_len - offset);
			string word = candidate.stlstring();
			if (WordValid(word, appid) == true) {
				bmm_list.insert(bmm_list.begin(), candidate.stlstring());
				consumed = candidate.length();
				matched = true;
				break;
			}
		}
		if (!matched && window_len >= 1) {
			bmm_list.insert(bmm_list.begin(), "" + window[window_len - 1]);
		}
		right -= consumed;
	}
}

// Backward maximum matching with a window of MAX_WORD_LEN characters. Scans
// `phrase` right to left and, for each position, prepends to `bmm_list` the
// longest suffix of the window (at least two characters) accepted by WordValid,
// or else the single last character of the window.
void FBSegment::BMM2(iutf8string  &phrase, uint32_t appid, vector<string> &bmm_list) {
	int right = phrase.length();

	while (right > 0) {
		int left = right - MAX_WORD_LEN;
		if (left < 0) {
			left = 0;
		}
		iutf8string window = phrase.utf8substr(left, right - left);
		const int window_len = window.length();

		int consumed = 1;
		bool matched = false;
		// offset 0 is the longest suffix; the one-character suffix is never looked up.
		for (int offset = 0; offset < window_len - 1; ++offset) {
			iutf8string candidate = window.utf8substr(offset, window_len - offset);
			if (WordValid(candidate.stlstring(), appid) == true) {
				bmm_list.insert(bmm_list.begin(), candidate.stlstring());
				consumed = candidate.length();
				matched = true;
				break;
			}
		}
		if (!matched && window_len >= 1) {
			bmm_list.insert(bmm_list.begin(), "" + window[window_len - 1]);
		}
		right -= consumed;
	}
}

 // Segments `phrase` with both FMM2 (forward) and BMM2 (backward) and picks one:
 //  - different token counts: the result with fewer tokens;
 //  - identical results: the forward one;
 //  - otherwise: the forward result only when the backward result has more
 //    tokens whose std::string length is 1, else the backward result.
 vector<string> FBSegment::segment_part(iutf8string &phrase, uint32_t appid) {
	 vector<string> forward;
	 FMM2(phrase, appid, forward);
	 vector<string> backward;
	 BMM2(phrase, appid, backward);

	 if (forward.size() < backward.size()) {
		 return forward;
	 }
	 if (forward.size() > backward.size()) {
		 return backward;
	 }

	 // Same token count: compare token by token and count length-1 tokens.
	 bool identical = true;
	 int forward_single = 0;
	 int backward_single = 0;
	 for (size_t idx = 0; idx < forward.size(); ++idx) {
		 const string &fw = forward.at(idx);
		 const string &bw = backward.at(idx);
		 if (fw != bw) {
			 identical = false;
		 }
		 if (fw.length() == 1) {
			 ++forward_single;
		 }
		 if (bw.length() == 1) {
			 ++backward_single;
		 }
	 }
	 if (identical || backward_single > forward_single) {
		 return forward;
	 }
	 return backward;
 }

 // Segments `phrase` into words and writes them to `new_res_all`.
 //  1. The phrase is split into runs: alphanumeric runs (kept whole, remembered
 //     in special_set), punctuation characters, and runs of everything else.
 //  2. Each non-special, non-punctuation run is segmented according to `mode`:
 //     "Pre" -> FMM2, "Post" -> BMM2, "DAG" -> __cut_DAG_NO_HMM, "Cache" -> BMM,
 //     anything else -> FMM2 and BMM2 merged, with CalSegProbability choosing
 //     between the stretches where the two results differ.
 //     Punctuation is dropped from the output.
 //  3. When `Hmm_flag` is true, consecutive single multi-byte characters are
 //     joined and, unless the joined string is a valid word, re-split by HMM_split.
 void FBSegment::segment2(iutf8string &phrase, uint32_t appid, vector<string> &new_res_all, string mode, bool Hmm_flag) {
	 vector<string> sen_list;
	 set<string> special_set;  // records the alphanumeric runs found in the phrase
	 string tmp_words = "";
	 bool flag = false; // true while tmp_words holds an alphanumeric run
	 for (int i = 0; i < phrase.length(); i++) {
		 if (isAlphaOrDigit(phrase[i])) {
			 if (tmp_words != "" and flag == false) {
				 sen_list.push_back(tmp_words);
				 tmp_words = "";
			 }
			 flag = true;
			 tmp_words += phrase[i];
		 }
		 else if(punct_set.find(phrase[i]) != punct_set.end()){
			 if (tmp_words != "") {
				 sen_list.push_back(tmp_words);
				 sen_list.push_back(phrase[i]);
				 if (flag == true) {
					 special_set.insert(tmp_words);
					 flag = false;
				 }
				 tmp_words = "";
			 }
		 }
		 else {
			 if (flag == true) {
				 sen_list.push_back(tmp_words);
				 special_set.insert(tmp_words);
				 flag = false;
				 tmp_words = phrase[i];
			 }
			 else {
				 tmp_words += phrase[i];
			 }
		 }
	 }
	 if (tmp_words != "") {
		 sen_list.push_back(tmp_words);
		 if (flag == true) {
			 special_set.insert(tmp_words);
		 }
	 }
	 tmp_words = "";
	 vector<string> res_all;
	 for (int i = 0; i < (int)sen_list.size(); i++) {
		 // special_set holds the consecutive alphanumeric strings; they are not segmented
		 if (special_set.find(sen_list[i]) == special_set.end() && punct_set.find(sen_list[i]) == punct_set.end()) {
			 iutf8string utf8_str(sen_list[i]);
			 vector<string> parse_list;
			 if (mode == "Pre") {
				 FMM2(utf8_str, appid, parse_list);
			 }
			 else if (mode == "Post") {
				 BMM2(utf8_str, appid, parse_list);
			 }
			 else if (mode == "DAG") {
				 __cut_DAG_NO_HMM(sen_list[i], appid, parse_list);
			 }
			 else if (mode == "Cache") {  // word dict in cache, not from DTC
				 BMM(utf8_str, appid, parse_list);
			 }
			 else { // PrePostNGram
				 vector<string> parse_list1;
				 vector<string> parse_list2;
				 FMM2(utf8_str, appid, parse_list1);
				 BMM2(utf8_str, appid, parse_list2);
				 parse_list1.insert(parse_list1.begin(), "<BEG>");
				 parse_list1.push_back("<END>");
				 parse_list2.insert(parse_list2.begin(), "<BEG>");
				 parse_list2.push_back("<END>");
				 
				 // cal_list1 and cal_list2 collect the stretches where the two word sequences differ
				 vector<string> cal_list1;
				 vector<string> cal_list2;
				 // pos1/pos2: current offset (std::string size units) in each sequence;
				 // cur1/cur2: index of the current word in each sequence
				 uint32_t pos1 = 0;
				 uint32_t pos2 = 0;
				 uint32_t cur1 = 0;
				 uint32_t cur2 = 0;
				 while (1) {
					 if (cur1 == parse_list1.size() && cur2 == parse_list2.size()) {
						 break;
					 }
					 // both sequences are at the same offset
					 if (pos1 == pos2) {
						 // same offset and same word length, i.e. the same word
						 if (parse_list1[cur1].size() == parse_list2[cur2].size()) {
							 pos1 += parse_list1[cur1].size();
							 pos2 += parse_list2[cur2].size();
							 // Two differing word stretches have been collected: keep the one with the higher bigram score.
							 // The preceding and following words are included for scoring and stripped again before splicing.
							 if (cal_list1.size() > 0) {
								 cal_list1.insert(cal_list1.begin(), parse_list[parse_list.size() - 1]);
								 cal_list2.insert(cal_list2.begin(), parse_list[parse_list.size() - 1]);
								 if (cur1 < parse_list1.size()-1) {
									 cal_list1.push_back(parse_list1[cur1]);
									 cal_list2.push_back(parse_list2[cur2]);
								 }
								 double p1 = CalSegProbability(cal_list1);
								 double p2 = CalSegProbability(cal_list2);

								 vector<string> cal_list = (p1 > p2) ? cal_list1 : cal_list2;
								 cal_list.erase(cal_list.begin());
								 if (cur1 < parse_list1.size() - 1) {
									 cal_list.pop_back();
								 }
								 parse_list.insert(parse_list.end(), cal_list.begin(), cal_list.end());
								 cal_list1.clear();
								 cal_list2.clear();
							 }
							 parse_list.push_back(parse_list1[cur1]);
							 cur1++;
							 cur2++;
						 }
						 // same offset but different word lengths: advance the shorter side and record its word as differing
						 else if (parse_list1[cur1].size() > parse_list2[cur2].size()) {
							 cal_list2.push_back(parse_list2[cur2]);
							 pos2 += parse_list2[cur2].size();
							 cur2++;
						 }
						 else {
							 cal_list1.push_back(parse_list1[cur1]);
							 pos1 += parse_list1[cur1].size();
							 cur1++;
						 }
					 }
					 else { 
						 // different offsets but the current words end at the same place: advance both
						 if (pos1 + parse_list1[cur1].size() == pos2 + parse_list2[cur2].size()) {
							 cal_list1.push_back(parse_list1[cur1]);
							 cal_list2.push_back(parse_list2[cur2]);
							 pos1 += parse_list1[cur1].size();
							 pos2 += parse_list2[cur2].size();
							 cur1++;
							 cur2++;
						 }
						 else if (pos1 + parse_list1[cur1].size() > pos2 + parse_list2[cur2].size()) {
							 cal_list2.push_back(parse_list2[cur2]);
							 pos2 += parse_list2[cur2].size();
							 cur2++;
						 }
						 else {
							 cal_list1.push_back(parse_list1[cur1]);
							 pos1 += parse_list1[cur1].size();
							 cur1++;
						 }
					 }
				 }
				 // drop the "<BEG>" / "<END>" markers again
				 parse_list.erase(parse_list.begin());
				 parse_list.pop_back();
			 }
			 res_all.insert(res_all.end(), parse_list.begin(), parse_list.end());
		 }
		 else {
		 	if(punct_set.find(sen_list[i]) == punct_set.end()){
			 	res_all.push_back(sen_list[i]);
		 	}
		 }
	 }

	 if (Hmm_flag == false) {
		 new_res_all.assign(res_all.begin(), res_all.end());
	 }
	 else {
		 // use the HMM to discover new words
		 string buf = "";
		 for (size_t i = 0; i < res_all.size(); i++) {
			 iutf8string utf8_str(res_all[i]);
			 if (utf8_str.length() == 1 && punct_set.find(res_all[i]) == punct_set.end() && res_all[i].length() > 1) { // one multi-byte character (intended: a Chinese character)
				 buf += res_all[i];
			 }
			 else {
				 if (buf.length() > 0) {
					 iutf8string utf8_buf(buf);
					 if (utf8_buf.length() == 1) {
						 new_res_all.push_back(buf);
					 }
					 else if (WordValid(buf, appid) == false) { // join the consecutive single characters and split them with the HMM
						 vector<string> vec = HMM_split(buf);
						 new_res_all.insert(new_res_all.end(), vec.begin(), vec.end());
					 }
					 else { // NOTE(review): original author was unsure whether this case can occur
						 new_res_all.push_back(buf);
					 }
				 }
				 buf = "";
				 new_res_all.push_back(res_all[i]);
			 }
		 }

		 // flush a trailing run of single characters
		 if (buf.length() > 0) {
			 iutf8string utf8_buf(buf);
			 if (utf8_buf.length() == 1) {
				 new_res_all.push_back(buf);
			 }
			 else if (WordValid(buf, appid) == false) { // join the consecutive single characters and split them with the HMM
				 vector<string> vec = HMM_split(buf);
				 new_res_all.insert(new_res_all.end(), vec.begin(), vec.end());
			 }
			 else { // NOTE(review): original author was unsure whether this case can occur
				 new_res_all.push_back(buf);
			 }
			 buf = "";
		 }
	 }
	 
	 return;
 }

// Search-engine mode: segments `phrase` with segment2 and, for every resulting
// token, appends one vector to `search_res_all` containing the valid
// 2-character sub-words (tokens longer than 2 characters), then the valid
// 3-character sub-words (tokens longer than 3 characters), then the token
// itself. Tokens made only of ASCII letters/digits are not expanded.
void FBSegment::cut_for_search(iutf8string &phrase, uint32_t appid, vector<vector<string> > &search_res_all, string mode) {
	vector<string> tokens;
	segment2(phrase, appid, tokens, mode);

	for (vector<string>::iterator tok = tokens.begin(); tok != tokens.end(); ++tok) {
		vector<string> expansions;
		iutf8string utf8_tok(*tok);
		if (isAllAlphaOrDigit(*tok) == false) {
			// gram = 2 first, then gram = 3, preserving the output order.
			for (int gram = 2; gram <= 3; ++gram) {
				if (utf8_tok.length() <= gram) {
					continue;
				}
				for (int start = 0; start < utf8_tok.length() - gram + 1; ++start) {
					string piece = utf8_tok.substr(start, gram);
					if (WordValid(piece, appid) == true) {
						expansions.push_back(piece);
					}
				}
			}
		}
		expansions.push_back(*tok);
		search_res_all.push_back(expansions);
	}
}

// Appends to `search_res` every character n-gram of `phrase` for widths
// 1..min(n, phrase length), shorter widths first and left to right per width.
void FBSegment::cut_ngram(iutf8string &phrase, vector<string> &search_res, uint32_t n) {
	const uint32_t char_count = (uint32_t)phrase.length();
	const uint32_t widest = (n > char_count) ? char_count : n;

	size_t width = 1;
	while (width <= widest) {
		const size_t window_count = (size_t)phrase.length() - width + 1;
		for (size_t start = 0; start < window_count; ++start) {
			search_res.push_back(phrase.substr(start, width));
		}
		++width;
	}
}

 // Viterbi decoding over the four tags B/M/E/S using train_corpus.start_dict,
 // trans_dict and emit_dict (train_corpus.MinEmit() is used for characters with
 // no emission entry). Returns one tag per character of `sentence`; an empty
 // sentence yields an empty vector.
 // Scores are products of the stored values and are assumed to be non-negative
 // (probabilities) — NOTE(review): confirm against train_corpus.
 vector<char> FBSegment::viterbi(string sentence) {
	 iutf8string utf8_str(sentence);
	 const int char_count = utf8_str.length();
	 if (char_count <= 0) {
		 // Nothing to decode; avoids indexing character 0 and V[-1] below.
		 return vector<char>();
	 }

	 const char states[4] = { 'B','M','E','S' };
	 const size_t state_count = sizeof(states);
	 vector< map<char, double> > V;
	 map<char, vector<char> > path;
	 map<char, double> prob_map;
	 for (size_t i = 0; i < state_count; i++) {
		 char y = states[i];
		 double emit_value = train_corpus.MinEmit();
		 if (train_corpus.emit_dict[y].find(utf8_str[0]) != train_corpus.emit_dict[y].end()) {
			 emit_value = train_corpus.emit_dict[y].at(utf8_str[0]);
		 }
		 prob_map[y] = train_corpus.start_dict[y] * emit_value;  // best score of a tag sequence ending in y at position 0
		 path[y].push_back(y);
	 }
	 V.push_back(prob_map);

	 for (int j = 1; j < char_count; j++) {
		 map<char, vector<char> > new_path;
		 prob_map.clear();
		 for (size_t k = 0; k < state_count; k++) {
			 char y = states[k];
			 // The emission term depends only on (y, j), so look it up once per target state.
			 double emit_value = train_corpus.MinEmit();
			 if (train_corpus.emit_dict[y].find(utf8_str[j]) != train_corpus.emit_dict[y].end()) {
				 emit_value = train_corpus.emit_dict[y].at(utf8_str[j]);
			 }
			 // Start below any non-negative score so a predecessor is always chosen,
			 // even when every product is 0 (underflow or zero probabilities).
			 // Previously that left no predecessor and the whole path was lost.
			 double max_prob = -1.0;
			 char state = states[0];
			 for (size_t m = 0; m < state_count; m++) {
				 char y0 = states[m];  // transition y0 -> y
				 double prob = V[j - 1][y0] * train_corpus.trans_dict[y0][y] * emit_value;
				 if (prob > max_prob) {
					 max_prob = prob;
					 state = y0;
				 }
			 }
			 prob_map[y] = max_prob;
			 new_path[y] = path[state];
			 new_path[y].push_back(y);
		 }
		 V.push_back(prob_map);
		 path = new_path;
	 }

	 // Pick the best final tag; same "always choose one" initialisation as above.
	 double max_prob = -1.0;
	 char state = states[0];
	 for (size_t i = 0; i < state_count; i++) {
		 char y = states[i];
		 if (V[char_count - 1][y] > max_prob) {
			 max_prob = V[char_count - 1][y];
			 state = y;
		 }
	 }
	 return path[state];
 }

 // Splits `sentence` into words using the tag sequence from viterbi(): a word
 // boundary is placed after every character tagged 'E' or 'S'. The pieces are
 // joined with single spaces and handed to splitEx, whose result is returned.
 vector<string> FBSegment::HMM_split(string sentence) {
	 vector<char> pos_list = viterbi(sentence);
	 iutf8string utf8_str(sentence);

	 string result;
	 for (size_t i = 0; i < pos_list.size(); i++) {
		 result += utf8_str[i];
		 if (pos_list[i] == 'E' || pos_list[i] == 'S') {
			 result += ' ';
		 }
	 }
	 // Drop the trailing separator. The emptiness check matters: with an empty
	 // tag list, result.size() - 1 would wrap around.
	 if (!result.empty() && result[result.size() - 1] == ' ') {
		 result.erase(result.size() - 1);
	 }

	 return splitEx(result, " ");
 }

 // Returns the log-probability score of the word sequence `vec` under the
 // bigram counts in next_dict, with add-one smoothing:
 //   - for each adjacent pair (w1, w2):
 //       log((count(w1,w2) + 1) / (train_cnt + sum of counts following w1)),
 //       or log(1 / train_cnt) when w1 was never seen;
 //   - for the first real word (vec[0], or vec[1] when vec[0] is "<BEG>"):
 //       log((word_freq + 1) / (next_dict.size() + train_cnt)).
 // Logs are summed instead of multiplying the tiny probabilities.
 double FBSegment::CalSegProbability(const vector<string> &vec) {
	 // Never divide by zero when no training data was loaded.
	 const double corpus_total = (train_cnt > 0) ? static_cast<double>(train_cnt) : 1.0;
	 const double vocab_total = static_cast<double>(next_dict.size()) + corpus_total;

	 double p = 0;
	 const size_t word_count = vec.size();
	 for (size_t pos = 0; pos < word_count; pos++) {
		 if (pos + 1 < word_count) {
			 // conditional probability of the following word
			 const string &word1 = vec[pos];
			 const string &word2 = vec[pos + 1];
			 if (next_dict.find(word1) == next_dict.end()) {
				 // add-one smoothing for an unseen first word
				 p += log(1.0 / corpus_total);
			 }
			 else {
				 const map<string, int> &followers = next_dict[word1];
				 double numerator = 1.0;
				 double denominator = corpus_total;
				 for (map<string, int>::const_iterator it = followers.begin(); it != followers.end(); ++it) {
					 if (it->first == word2) {
						 numerator += it->second;
					 }
					 denominator += it->second;
				 }
				 p += log(numerator / denominator);
			 }
		 }
		 // probability of the first real word
		 if ((pos == 0 && vec[pos] != "<BEG>") || (pos == 1 && vec[0] == "<BEG>")) {
			 double word_freq = 0.0;
			 WordInfo word_info;
			 if (GetWordInfo(vec[pos], 0, word_info)) {
				 word_freq = word_info.word_freq;
			 }
			 // Parenthesised so the ratio is a probability. The previous
			 // "freq + 1.0 / size + train_cnt" parsed as freq + (1.0/size) + train_cnt.
			 p += log((word_freq + 1.0) / vocab_total);
		 }
	 }

	 return p;
 }
 
 // Segments `phrase` into words.
 //  1. The phrase is cut at every punctuation character (punct_set); each
 //     fragment before a punctuation mark is further split into runs of
 //     single-byte characters and runs of multi-byte characters.
 //     The text after the last punctuation mark is kept as one fragment.
 //  2. Fragments made only of ASCII letters/digits are emitted unchanged;
 //     every other fragment is segmented with segment_part().
 vector<string> FBSegment::segment(iutf8string &phrase, uint32_t appid) {
	 vector<string> res;
	 int last = 0;  // index of the first character after the previous punctuation mark
	 for (int i = 0; i < phrase.length(); i++) {
		 if (punct_set.find(phrase[i]) != punct_set.end()) {
			 iutf8string fragment = phrase.utf8substr(last, i - last);
			 for (int j = 0; j < fragment.length(); ) { // split further: group consecutive multi-byte characters and consecutive single-byte characters
				 if (fragment[j].length() == 1) {
					 string tmp = fragment[j];
					 int k = 1;
					 for (; k < fragment.length() - j; k++) {
						 if (fragment[j + k].size() > 1) { // single-byte run: stop at the first multi-byte character
							 res.push_back(tmp);
							 break;
						 }
						 else { // single-byte character that is not a letter or digit: end the run here
							 char frag_char = fragment[j + k][0];
							 if (!isupper(frag_char) && !islower(frag_char) && !isdigit(frag_char)) {
								 res.push_back(tmp);
								 break;
							 }
						 }
						 tmp += fragment[j + k];
					 }
					 if (k == fragment.length() - j) { // loop ended without break: the run reaches the end of the fragment
						 res.push_back(tmp);
					 }
					 j = j + k;
				 }
				 else {
					 string tmp = fragment[j];
					 int k = 1;
					 for (; k < fragment.length() - j; k++) {
						 if (fragment[j + k].size() == 1) { // multi-byte run: stop at the first single-byte character
							 res.push_back(tmp);
							 break;
						 }
						 tmp += fragment[j + k];
					 }
					 if (k == fragment.length() - j) {
						 res.push_back(tmp);
					 }
					 j = j + k;
				 }
			 }
			 last = i + 1;
		 }
	 }
	 // NOTE(review): the trailing fragment is pushed whole, without the run
	 // splitting applied to the fragments above — confirm this is intended.
	 if (last < phrase.length()) {
		 string fragment = phrase.substr(last, phrase.length() - last);
		 res.push_back(fragment);
	 }
	 vector<string> res_all;
	 vector<string>::iterator iter = res.begin();
	 for (; iter != res.end(); iter++) {
		 vector<string> res;  // shadows the outer `res`; holds the words of one fragment
		 string str = *iter;
		 if (isAllAlphaOrDigit(str)) { // ASCII letter/digit fragments are not segmented
			 res_all.push_back(str);
		 }
		 else {
			 iutf8string utf8_str(*iter);
			 res = segment_part(utf8_str, appid);
			 res_all.insert(res_all.end(), res.begin(), res.end());
		 }
	 }
	 return res_all;
 }
 
// Builds the word graph of `sentence`: for every start character k, DAG[k]
// lists each end index i (k <= i < N) for which characters k..i form a word
// accepted by WordValid. When no such word exists, DAG[k] = {k}, i.e. the
// single character on its own.
void FBSegment::get_DAG(string sentence, uint32_t appid, map<uint32_t, vector<uint32_t> > &DAG) {
	iutf8string utf8_str(sentence);
	const uint32_t N = utf8_str.length();
	for (uint32_t k = 0; k < N; k++) {
		vector<uint32_t> tmplist;
		uint32_t i = k;
		string frag = utf8_str[k];
		while (true) {
			if (WordValid(frag, appid) == true) {
				tmplist.push_back(i);
			}
			// Stop before building the next fragment, so substr is never asked
			// for a range running one character past the end of the sentence.
			if (++i >= N) {
				break;
			}
			frag = utf8_str.substr(k, i + 1 - k);
		}
		if (tmplist.empty()) {
			tmplist.push_back(k);
		}
		DAG[k] = tmplist;
	}
}

 void FBSegment::calc(string sentence, const map<uint32_t, vector<uint32_t> > &DAG, map<uint32_t, RouteValue> &route, uint32_t appid) {
	 iutf8string utf8_str(sentence);
	 uint32_t N = utf8_str.length();
	 RouteValue route_N;
	 route[N] = route_N;
	 double logtotal = log(TOTAL);
	 for (int i = N - 1; i > -1; i--) {
		 vector<uint32_t> vec = DAG.at(i);
		 double max_route = -DBL_MAX;
		 uint32_t max_idx = 0;
		 for (size_t t = 0; t < vec.size(); t++) {
			 string word = utf8_str.substr(i, vec[t] + 1 - i);
			 WordInfo word_info;
			 uint32_t word_freq = 1;
			 /* 不查DTC，改为从本地词库查询
			 if (word_manager.WordValid(word, appid, word_info) == true) {
				 word_freq = word_info.word_freq;
			 }
			 */
			 if (word_dict.find(word) != word_dict.end()) {
				 map<uint32_t, WordInfo> wordInfo = word_dict[word];
				 if (wordInfo.find(0) != wordInfo.end()) {
					 word_info = wordInfo[0];
					 word_freq = word_info.word_freq;
				 }
				 if (wordInfo.find(appid) != wordInfo.end()) {
					 word_info = wordInfo[appid];
					 word_freq = word_info.word_freq;
				 }
			 }
			 double route_value = log(word_freq) - logtotal + route[vec[t] + 1].max_route;
			 if (route_value > max_route) {
				 max_route = route_value;
				 max_idx = vec[t];
			 }
		 }
		 RouteValue route_value;
		 route_value.max_route = max_route;
		 route_value.idx = max_idx;
		 route[i] = route_value;
	 }
 }

// Segments `sentence` along the best route computed by get_DAG() + calc() and
// appends the words to `vec`. Consecutive route words made only of ASCII
// letters/digits are merged into one token.
void FBSegment::__cut_DAG_NO_HMM(string sentence, uint32_t appid, vector<string> &vec) {
	map<uint32_t, vector<uint32_t> > DAG;
	get_DAG(sentence, appid, DAG);
	map<uint32_t, RouteValue> route;
	calc(sentence, DAG, route, appid);

	iutf8string utf8_str(sentence);
	const uint32_t N = utf8_str.length();
	string buf = "";  // pending run of ASCII letter/digit words
	uint32_t i = 0;
	while (i < N) {
		uint32_t j = route[i].idx + 1;
		// A route entry that does not move forward, or points past the end,
		// would underflow j - i and could loop forever; take one character instead.
		if (j <= i || j > N) {
			j = i + 1;
		}
		string l_word = utf8_str.substr(i, j - i);
		if (isAllAlphaOrDigit(l_word)) {
			buf += l_word;
		}
		else {
			if (!buf.empty()) {
				vec.push_back(buf);
				buf = "";
			}
			vec.push_back(l_word);
		}
		i = j;
	}
	if (!buf.empty()) {
		vec.push_back(buf);
	}
}