{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Original Topics:\n",
      "['tech', 'business', 'sport', 'entertainment', 'politics']\n",
      "1/10\n",
      "2/10\n",
      "3/10\n",
      "4/10\n",
      "5/10\n",
      "6/10\n",
      "7/10\n",
      "8/10\n",
      "9/10\n",
      "10/10\n",
      "Topic 1: said game england would time first back play last good\n",
      "Topic 2: said year would economy growth also economic bank government could\n",
      "Topic 3: said year games sales company also market last firm 2004\n",
      "Topic 4: film said music best also people year show number digital\n",
      "Topic 5: said would people government labour election party blair could also\n",
      "Time: 7620.509902954102\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import string\n",
    "from nltk.corpus import stopwords  #requires the NLTK stopword corpus, e.g. nltk.download('stopwords')\n",
    "import time\n",
    "\n",
    "\n",
    "#Load the data\n",
    "def load_data(file, K):\n",
    "    '''\n",
    "    INPUT:\n",
    "    file - (str) path of the data file\n",
    "    K - (int) number of topics\n",
    "    \n",
    "    OUTPUT:\n",
    "    org_topics - (list) original topic labels\n",
    "    text - (list) list of documents\n",
    "    words - (list) vocabulary list\n",
    "    alpha - (list) topic probability distribution, a model hyperparameter\n",
    "    beta - (list) word probability distribution, a model hyperparameter\n",
    "    \n",
    "    '''\n",
    "    df = pd.read_csv(file)  #read the file\n",
    "    org_topics = df['category'].unique().tolist()  #save the original topic labels of the documents\n",
    "    M = df.shape[0]  #number of documents\n",
    "    alpha = np.zeros(K)  #alpha is one hyperparameter of the LDA model, a prior estimate of the topic probabilities; here each topic's share of the corpus serves as its alpha value, although alpha could also be learned in training\n",
    "    beta = np.zeros(1000)  #beta is the other hyperparameter, a probability distribution over the vocabulary; here each word's share of all documents serves as its beta value, although beta could also be learned in training\n",
    "    #use each topic's proportion of the corpus as its alpha value\n",
    "    for k, topic in enumerate(org_topics):\n",
    "        alpha[k] = df[df['category'] == topic].shape[0] / M\n",
    "    df.drop('category', axis=1, inplace=True)\n",
    "    text = []\n",
    "    words = []\n",
    "    stop_words = set(stopwords.words('english'))  #build the stopword set once instead of once per document\n",
    "    for i in df['text'].values:\n",
    "        t = i.translate(str.maketrans('', '', string.punctuation))  #strip punctuation from the document\n",
    "        t = [j for j in t.split() if j not in stop_words]  #remove stopwords\n",
    "        t = [j for j in t if len(j) > 3]  #words of length <= 3 are mostly uninformative, so drop them\n",
    "        text.append(t)  #save the processed document to the document list\n",
    "        words.extend(set(t))  #add the document's distinct words to the vocabulary list\n",
    "    words = list(set(words))  #deduplicate the vocabulary list\n",
    "    words_cnt = np.zeros(len(words))  #occurrence count of each word\n",
    "    #count how often each word in the words list occurs\n",
    "    for i in range(len(text)):\n",
    "        t = text[i]  #take the i-th document\n",
    "        for w in t:\n",
    "            ind = words.index(w)  #index of the current word in the vocabulary list\n",
    "            words_cnt[ind] += 1  #increment the count at that position\n",
    "    sort_inds = np.argsort(words_cnt)[::-1]  #indices of the words sorted by frequency, descending\n",
    "    words = [words[ind] for ind in sort_inds[:1000]]  #keep the 1000 most frequent words as the vocabulary\n",
    "    #remove words that are not in the vocabulary from the documents\n",
    "    for i in range(len(text)):\n",
    "        t = []\n",
    "        for w in text[i]:\n",
    "            if w in words:\n",
    "                ind = words.index(w)\n",
    "                t.append(w)\n",
    "                beta[ind] += 1  #count each vocabulary word's occurrences over all documents\n",
    "        text[i] = t\n",
    "    beta /= np.sum(beta)  #divide by the total word count so each entry is the word's proportion, used as the beta values\n",
    "    return org_topics, text, words, alpha, beta\n",
    "\n",
    "\n",
    "#Latent Dirichlet allocation: estimate the model parameters theta and phi with collapsed Gibbs sampling\n",
    "def do_lda(text, words, alpha, beta, K, iters):\n",
    "    '''\n",
    "    INPUT:\n",
    "    text - (list) list of documents\n",
    "    words - (list) vocabulary list\n",
    "    alpha - (list) topic probability distribution, a model hyperparameter\n",
    "    beta - (list) word probability distribution, a model hyperparameter\n",
    "    K - (int) number of topics\n",
    "    iters - (int) number of iterations\n",
    "    \n",
    "    OUTPUT:\n",
    "    theta - (array) conditional topic distribution p(zk|dj), written as p(zk|dj) to match the notation of the PLSA chapter for side-by-side comparison\n",
    "    phi - (array) conditional word distribution p(wi|zk)\n",
    "    \n",
    "    '''\n",
    "    M = len(text)  #number of documents\n",
    "    V = len(words)  #number of words\n",
    "    N_MK = np.zeros((M, K))  #document-topic count matrix\n",
    "    N_KV = np.zeros((K, V))  #topic-word count matrix\n",
    "    N_M = np.zeros(M)  #document count vector\n",
    "    N_K = np.zeros(K)  #topic count vector\n",
    "    Z_MN = []  #holds the topic sampled at each word position of every document\n",
    "    #step (2) of Algorithm 20.2: sample an initial topic for every word of every document and update the counts\n",
    "    for m in range(M):\n",
    "        zm = []\n",
    "        t = text[m]\n",
    "        for n, w in enumerate(t):\n",
    "            v = words.index(w)\n",
    "            z = np.random.randint(K)\n",
    "            zm.append(z)\n",
    "            N_MK[m, z] += 1\n",
    "            N_M[m] += 1\n",
    "            N_KV[z, v] += 1\n",
    "            N_K[z] += 1\n",
    "        Z_MN.append(zm)\n",
    "    #step (3) of Algorithm 20.2: iterate the Gibbs sampler\n",
    "    for i in range(iters):\n",
    "        print('{}/{}'.format(i+1, iters))\n",
    "        for m in range(M):\n",
    "            t = text[m]\n",
    "            for n, w in enumerate(t):\n",
    "                v = words.index(w)\n",
    "                z = Z_MN[m][n]\n",
    "                #remove the current position from the counts before resampling its topic\n",
    "                N_MK[m, z] -= 1\n",
    "                N_M[m] -= 1\n",
    "                N_KV[z, v] -= 1\n",
    "                N_K[z] -= 1\n",
    "                p = []  #holds the conditional distribution p(zi|z_-i,w,alpha,beta) over the K topics\n",
    "                sums_k = 0\n",
    "                for k in range(K):\n",
    "                    p_zk = (N_KV[k, v] + beta[v]) * (N_MK[m, k] + alpha[k])  #numerator of the conditional distribution for topic zi=k\n",
    "                    sums_v = 0\n",
    "                    sums_k += N_MK[m, k] + alpha[k]  #accumulate (nmk + alpha_k) over the K topics\n",
    "                    for u in range(V):\n",
    "                        sums_v += N_KV[k, u] + beta[u]  #accumulate (nkv + beta_v) over the V words\n",
    "                    p_zk /= sums_v\n",
    "                    p.append(p_zk)\n",
    "                p = p / sums_k  #the denominator sum_k(nmk + alpha_k) is constant in k; the normalization below would absorb it anyway\n",
    "                p = p / np.sum(p)  #normalize so the conditional distribution sums exactly to 1\n",
    "                new_z = np.random.choice(a=K, p=p)  #sample a new topic from the distribution computed above\n",
    "                Z_MN[m][n] = new_z  #record the newly sampled topic at the current position\n",
    "                #add the current position back into the counts under the new topic\n",
    "                N_MK[m, new_z] += 1\n",
    "                N_M[m] += 1\n",
    "                N_KV[new_z, v] += 1\n",
    "                N_K[new_z] += 1\n",
    "    #step (4) of Algorithm 20.2: estimate the parameters theta and phi from the accumulated counts\n",
    "    theta = np.zeros((M, K))\n",
    "    phi = np.zeros((K, V))\n",
    "    for m in range(M):\n",
    "        sums_k = 0\n",
    "        for k in range(K):\n",
    "            theta[m, k] = N_MK[m, k] + alpha[k]  #numerator of theta\n",
    "            sums_k += theta[m, k]  #accumulate (nmk + alpha_k) over the K topics, the denominator of theta\n",
    "        theta[m] /= sums_k  #compute theta\n",
    "    for k in range(K):\n",
    "        sums_v = 0\n",
    "        for v in range(V):\n",
    "            phi[k, v] = N_KV[k, v] + beta[v]  #numerator of phi\n",
    "            sums_v += phi[k, v]  #accumulate (nkv + beta_v) over the V words, the denominator of phi\n",
    "        phi[k] /= sums_v  #compute phi\n",
    "    return theta, phi\n",
    "\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    K = 5  #set the number of topics to 5\n",
    "    org_topics, text, words, alpha, beta = load_data('bbc_text.csv', K)  #load the data\n",
    "    print('Original Topics:')\n",
    "    print(org_topics)  #print the original topic labels\n",
    "    start = time.time()  #record the start time\n",
    "    iters = 10  #only 10 iterations here to keep the running time down; in practice this is not enough, and the sampler should run long enough to get past its burn-in period so that the estimated parameters approach the actual distribution of the samples\n",
    "    theta, phi = do_lda(text, words, alpha, beta, K, iters)  #Gibbs sampling for LDA\n",
    "    #for each topic zk, print the 10 words with the largest P(wi|zk) as a textual description of the topic\n",
    "    for k in range(K):\n",
    "        sort_inds = np.argsort(phi[k])[::-1]  #indices of P(wi|zk) for topic zk, sorted in descending order\n",
    "        topic = []  #holds the 10 most probable words of topic zk\n",
    "        for i in range(10):\n",
    "            topic.append(words[sort_inds[i]])\n",
    "        topic = ' '.join(topic)  #join the 10 words with spaces to describe topic zk\n",
    "        print('Topic {}: {}'.format(k+1, topic))  #print topic zk\n",
    "    end = time.time()\n",
    "    print('Time:', end-start)"
   ]
  },
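  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "At every word position the sampler above draws a topic from the collapsed full conditional of Algorithm 20.2, $p(z_i = k \\mid \\mathbf{z}_{-i}, \\mathbf{w}, \\alpha, \\beta) \\propto \\frac{n_{kv} + \\beta_v}{\\sum_{v'}(n_{kv'} + \\beta_{v'})}\\,(n_{mk} + \\alpha_k)$, where the counts $n_{kv}$ and $n_{mk}$ exclude the current position.\n",
    "\n",
    "Each row of `theta` below is one document's distribution over the $K = 5$ topics, and each row of `phi` is one topic's distribution over the 1000 vocabulary words. The next cell is a minimal usage sketch (assuming `theta` and `K` from the run above; `doc_topics` is just an illustrative name): the dominant topic of each document can be read off with `argmax`. Note that the sampled topic indices are arbitrary and are not aligned with the labels in `org_topics`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#minimal sketch: assign each document its most probable topic under theta\n",
    "doc_topics = theta.argmax(axis=1)  #index of the largest entry in each row of theta\n",
    "print(doc_topics[:10])  #dominant topic of the first 10 documents\n",
    "print(np.bincount(doc_topics, minlength=K))  #number of documents per dominant topic"
   ]
  },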
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.04935605, 0.10338287, 0.06575088, 0.46867464, 0.31283557],\n",
       "       [0.18828892, 0.33184591, 0.36042376, 0.00247833, 0.11696308],\n",
       "       [0.64543178, 0.13184591, 0.06042376, 0.15962119, 0.00267737],\n",
       "       ...,\n",
       "       [0.41026611, 0.05564755, 0.31881135, 0.21280899, 0.002466  ],\n",
       "       [0.34581233, 0.01506225, 0.12993015, 0.06198299, 0.44721227],\n",
       "       [0.74515492, 0.00347293, 0.15499489, 0.09353762, 0.00283963]])"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "theta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
"array([[1.99768905e-02, 1.06570534e-02, 5.88055310e-03, ...,\n", " 1.18881649e-03, 8.27178858e-09, 1.50724726e-03],\n", " [3.50778918e-02, 1.05511733e-02, 8.79264523e-03, ...,\n", " 3.00802648e-04, 9.01573088e-09, 9.01573088e-09],\n", " [3.44183618e-02, 7.06465729e-03, 1.14162405e-02, ...,\n", " 4.52128900e-04, 1.69555219e-04, 1.10105081e-08],\n", " [1.81454758e-02, 3.13112016e-03, 1.39941969e-02, ...,\n", " 1.18602254e-04, 1.99237185e-03, 9.24197417e-09],\n", " [4.45921371e-02, 1.96021164e-02, 8.00255656e-03, ...,\n", " 6.17454557e-09, 6.17454557e-09, 3.01086896e-04]])" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "phi" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }