mirror of
https://gitee.com/TheAlgorithms/Statistical-Learning-Method_Code.git
synced 2024-12-22 20:54:21 +08:00
208 lines
9.8 KiB
Plaintext
208 lines
9.8 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 55,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Original Topics:\n",
|
||
"['tech', 'business', 'sport', 'entertainment', 'politics']\n",
|
||
"1/10\n",
|
||
"2/10\n",
|
||
"3/10\n",
|
||
"4/10\n",
|
||
"5/10\n",
|
||
"6/10\n",
|
||
"7/10\n",
|
||
"8/10\n",
|
||
"9/10\n",
|
||
"10/10\n",
|
||
"Topic 1: said year government people mobile last number growth phone market\n",
|
||
"Topic 2: said people film could would also technology made make government\n",
|
||
"Topic 3: said would could best music also world election labour people\n",
|
||
"Topic 4: said first england also time game players wales would team\n",
|
||
"Topic 5: said also would company year world sales firm market last\n",
|
||
"Time: 531.1292963027954\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import numpy as np\n",
|
||
"import pandas as pd\n",
|
||
"import string\n",
|
||
"from nltk.corpus import stopwords\n",
|
||
"import time\n",
|
||
"\n",
|
||
"\n",
|
# Load the corpus and build the raw vocabulary.
def load_data(file):
    '''
    Read a CSV corpus and tokenize every document.

    INPUT:
    file - (str) path of the data file; it is read with pandas and must
           provide a 'category' column and a 'text' column

    OUTPUT:
    org_topics - (list) distinct values of the 'category' column
                 (pandas returns them in order of first appearance)
    text - (list) one list of kept tokens per document
    words - (list) distinct tokens over all documents, sorted so that the
            vocabulary order is reproducible from run to run

    '''
    df = pd.read_csv(file)
    org_topics = df['category'].unique().tolist()

    # Fetch the stop-word list once and keep it as a set. The previous code
    # evaluated stopwords.words('english') again for every single token.
    stop_words = set(stopwords.words('english'))
    # One translation table that deletes all ASCII punctuation characters.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    text = []
    vocabulary = set()
    for raw in df['text'].values:
        # Drop punctuation, then stop words, then tokens of length <= 3
        # (the original author treats those as mostly meaningless).
        tokens = [
            tok for tok in raw.translate(strip_punctuation).split()
            if tok not in stop_words and len(tok) > 3
        ]
        text.append(tokens)
        vocabulary.update(tokens)

    words = sorted(vocabulary)
    return org_topics, text, words
|
||
"\n",
|
||
"\n",
|
# Build the word-document matrix. Each entry is the raw count of a word in a
# document (TF-IDF weights could be used instead).
def frequency_counter(text, words, top_n=1000):
    '''
    Keep the most frequent words and count them per document.

    INPUT:
    text - (list) list of documents, each a list of tokens
    words - (list) vocabulary; tokens that are not in it are ignored
    top_n - (int) maximum number of most frequent words to keep
            (defaults to 1000 to bound the running time of later steps)

    OUTPUT:
    words - (list) the kept words, ordered by decreasing corpus frequency
    X - (array) word-document count matrix of shape (len(words), len(text));
        row i belongs to words[i], column j to text[j]

    '''
    def _first_index(items):
        # Map each item to the position of its first occurrence, which is
        # what list.index() would return, but with O(1) lookups.
        lookup = {}
        for pos, item in enumerate(items):
            lookup.setdefault(item, pos)
        return lookup

    # Pass 1: corpus-wide frequency of every vocabulary word.
    vocab_index = _first_index(words)
    words_cnt = np.zeros(len(words))
    for doc in text:
        for w in doc:
            ind = vocab_index.get(w)
            if ind is not None:
                words_cnt[ind] += 1

    # Indices of the vocabulary sorted by decreasing frequency.
    sort_inds = np.argsort(words_cnt)[::-1]
    words = [words[ind] for ind in sort_inds[:top_n]]

    # Pass 2: per-document counts for the kept words only. The matrix gets
    # exactly one row per kept word, so a vocabulary smaller than top_n does
    # not leave rows that correspond to no word.
    row_of = _first_index(words)
    X = np.zeros((len(words), len(text)))
    for j, doc in enumerate(text):
        for w in doc:
            ind = row_of.get(w)
            if ind is not None:
                X[ind, j] += 1
    return words, X
|
||
"\n",
|
||
"\n",
|
# Probabilistic latent semantic analysis; parameters are estimated with EM.
def do_plsa(X, K, words, iters = 10):
    '''
    Fit a PLSA model to a word-document count matrix with the EM algorithm.

    INPUT:
    X - (array) word-document matrix of shape (M, N): M words, N documents
    K - (int) number of topics
    words - (list) word list belonging to the rows of X (not used by the
            computation; kept so existing callers keep working)
    iters - (int) number of EM iterations

    OUTPUT:
    P_wi_zk - (array) shape (K, M); entry [k, i] is P(wi|zk)
    P_zk_dj - (array) shape (N, K); entry [j, k] is P(zk|dj)

    '''
    M, N = X.shape

    # Random start; every topic's word distribution is normalized to sum to 1.
    P_wi_zk = np.random.rand(K, M)
    P_wi_zk /= P_wi_zk.sum(axis=1, keepdims=True)
    # Random start; every document's topic distribution sums to 1.
    P_zk_dj = np.random.rand(N, K)
    P_zk_dj /= P_zk_dj.sum(axis=1, keepdims=True)

    # n(dj): number of counted words in each document. Documents without any
    # counted word keep their current topic distribution; dividing by their
    # zero length would produce NaN that spreads to every parameter on the
    # following iteration.
    doc_len = X.sum(axis=0)
    has_words = doc_len > 0

    for i in range(iters):
        print('{}/{}'.format(i+1, iters))

        # E step. The posterior is
        #   P(zk|wi,dj) = P(wi|zk) * P(zk|dj) / sum_k P(wi|zk) * P(zk|dj).
        # Only its denominator is materialized (an M x N matrix); the
        # M x N x K posterior itself is folded into the products below.
        mix = P_wi_zk.T @ P_zk_dj.T
        ratio = np.divide(X, mix, out=np.zeros((M, N)), where=mix > 0)

        # M step numerators, both computed from the parameters of the
        # previous iteration before either of them is overwritten:
        #   numer_w[k, i] = sum_j n(wi,dj) * P(zk|wi,dj)
        #   numer_d[j, k] = sum_i n(wi,dj) * P(zk|wi,dj)
        numer_w = P_wi_zk * (ratio @ P_zk_dj).T
        numer_d = P_zk_dj * (ratio.T @ P_wi_zk.T)

        # P(wi|zk): normalize over words; topics that received no mass are
        # left unchanged instead of being divided by zero.
        topic_mass = numer_w.sum(axis=1)
        live = topic_mass > 0
        P_wi_zk[live] = numer_w[live] / topic_mass[live, None]

        # P(zk|dj): divide by n(dj), counted over the rows of X only.
        P_zk_dj[has_words] = numer_d[has_words] / doc_len[has_words, None]

    return P_wi_zk, P_zk_dj
|
||
"\n",
|
||
"\n",
|
if __name__ == "__main__":
    # Load the corpus and show the label set it was annotated with.
    org_topics, text, words = load_data('bbc_text.csv')
    print('Original Topics:')
    print(org_topics)

    # The timer covers matrix construction, EM fitting and topic printing.
    start = time.time()

    # Rebuild the word list from the most frequent words and count them
    # per document.
    words, X = frequency_counter(text, words)

    K = 5  # number of topics to fit
    P_wi_zk, P_zk_dj = do_plsa(X, K, words, iters = 10)

    # Describe every topic by its 10 most probable words, i.e. the words
    # with the largest P(wi|zk) in that topic's row.
    for number, word_probs in enumerate(P_wi_zk, start=1):
        descending = np.argsort(word_probs)[::-1]
        topic = ' '.join([words[descending[rank]] for rank in range(10)])
        print('Topic {}: {}'.format(number, topic))

    end = time.time()
    print('Time:', end-start)
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.7.3"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|