{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Time: 0.14439177513122559\n"
     ]
    }
   ],
   "source": [
    "import time\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "\n",
    "# Load the data set\n",
    "def load_data(file):\n",
    "    '''\n",
    "    Read the CSV file and build the feature array used by the PCA routines.\n",
    "\n",
    "    INPUT:\n",
    "    file - (str) path to the CSV data file\n",
    "\n",
    "    OUTPUT:\n",
    "    df - (dataframe) the loaded table without the 'Sports' column\n",
    "    X - (array) float feature array with one row per column of df\n",
    "        and one column per record\n",
    "\n",
    "    '''\n",
    "    # The 'Sports' column is treated as categorical data and removed.\n",
    "    df = pd.read_csv(file).drop(columns='Sports')\n",
    "    # astype(float) always returns a fresh float copy, so normalizing X in\n",
    "    # place later can neither truncate values nor write through to df.\n",
    "    X = df.values.astype(float).T\n",
    "    return df, X\n",
    "\n",
    "\n",
    "# Standardize every feature to zero mean and unit sample variance\n",
    "def Normalize(X):\n",
    "    '''\n",
    "    Standardize each feature (each row of X).\n",
    "\n",
    "    INPUT:\n",
    "    X - (array) feature array, one row per feature\n",
    "\n",
    "    OUTPUT:\n",
    "    X - (array) standardized feature array; a float input array is\n",
    "        updated in place and returned, any other input is converted\n",
    "        to a new float array first\n",
    "\n",
    "    '''\n",
    "    X = np.asarray(X)\n",
    "    if not np.issubdtype(X.dtype, np.floating):\n",
    "        # Integer storage would silently truncate the standardized values.\n",
    "        X = X.astype(float)\n",
    "    mean = X.mean(axis=1, keepdims=True)  # per-feature mean\n",
    "    std = X.std(axis=1, ddof=1, keepdims=True)  # per-feature sample std (n - 1)\n",
    "    # A constant feature has std 0 and therefore becomes NaN.\n",
    "    X[...] = (X - mean) / std\n",
    "    return X\n",
    "\n",
    "\n",
    "# Eigendecomposition of the covariance matrix: eigenvalues and the V matrix\n",
    "def cal_V(X):\n",
    "    '''\n",
    "    Compute the eigenvalues and eigenvectors of X X.T / (n - 1).\n",
    "\n",
    "    INPUT:\n",
    "    X - (array) feature array, one row per feature; it is expected to be\n",
    "        centered (e.g. by Normalize) so that the matrix is a covariance\n",
    "\n",
    "    OUTPUT:\n",
    "    eigvalues - (list) eigenvalues sorted from largest to smallest\n",
    "    V - (array) V matrix; column i is the unit eigenvector of eigvalues[i]\n",
    "\n",
    "    '''\n",
    "    newX = X.T / np.sqrt(X.shape[1] - 1)  # X' = X.T / sqrt(n - 1)\n",
    "    Sx = np.matmul(newX.T, newX)  # Sx = X'.T X'\n",
    "    # Sx is symmetric, so eigh applies: it returns real eigenvalues and\n",
    "    # orthonormal eigenvectors stored as the COLUMNS of its second result.\n",
    "    w, v = np.linalg.eigh(Sx)\n",
    "    order = np.argsort(w)[::-1]  # eigh sorts ascending; put the largest first\n",
    "    # Ordering by index instead of keying a dict on the eigenvalue keeps\n",
    "    # repeated eigenvalues. Round-off can make zero eigenvalues slightly\n",
    "    # negative, so they are clipped before anyone takes a square root.\n",
    "    eigvalues = np.clip(w[order], 0, None).tolist()\n",
    "    V = v[:, order]\n",
    "    return eigvalues, V\n",
    "\n",
    "\n",
    "# Principal component analysis\n",
    "def do_pca(X, k):\n",
    "    '''\n",
    "    Project X onto its first k principal components.\n",
    "\n",
    "    INPUT:\n",
    "    X - (array) feature array, one row per feature\n",
    "    k - (int) number of principal components to keep\n",
    "\n",
    "    OUTPUT:\n",
    "    fac_load - (array) factor loadings, one row per component and one\n",
    "               column per feature\n",
    "    dimrates - (list) share of the eigenvalue total explained by each\n",
    "               of the k components\n",
    "    Y - (array) principal component matrix, one row per component\n",
    "\n",
    "    RAISES:\n",
    "    ValueError - if k is negative or larger than the number of features\n",
    "\n",
    "    '''\n",
    "    if not 0 <= k <= X.shape[0]:\n",
    "        raise ValueError('k must be between 0 and the number of features')\n",
    "    eigvalues, V = cal_V(X)\n",
    "    Vk = V[:, :k]  # first k columns of V\n",
    "    Y = np.matmul(Vk.T, X)  # maps the m x n sample matrix to k x n\n",
    "    total = sum(eigvalues)  # computed once instead of once per component\n",
    "    dimrates = [value / total for value in eigvalues[:k]]\n",
    "    # loading(i, j) = sqrt(lambda_i) * v_ji / sqrt(var_j). The variance uses\n",
    "    # ddof=1 so that it matches the (n - 1) scaling used to build Sx.\n",
    "    std = np.sqrt(np.var(X, axis=1, ddof=1))\n",
    "    fac_load = np.sqrt(eigvalues[:k])[:, np.newaxis] * Vk.T / std\n",
    "    return fac_load, dimrates, Y\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    df, X = load_data('cars.csv')  # load the data\n",
    "    start = time.perf_counter()  # monotonic clock meant for measuring durations\n",
    "    X = Normalize(X)  # standardize the samples\n",
    "    k = 3  # number of principal components\n",
    "    fac_load, dimrates, Y = do_pca(X, k)  # run the analysis\n",
    "    # Row labels follow k instead of being hard-coded for three components.\n",
    "    index = ['Dimension{}'.format(i + 1) for i in range(k)]\n",
    "    pca_result = pd.DataFrame(fac_load, index=index, columns=df.columns)\n",
    "    pca_result['Explained Variance'] = dimrates\n",
    "    end = time.perf_counter()\n",
    "    print('Time:', end - start)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SUVWagonMinivanPickupAWDRWDRetailDealerEngineCylindersHorsepowerCityMPGHighwayMPGWeightWheelbaseLengthWidthExplained Variance
Dimension10.093402-1.203972-0.2384520.744765-0.8177940.628960-1.410562-0.913081-0.3540610.306548-0.718787-0.0120870.7761560.3065250.024460-0.1657100.0049590.435236
Dimension20.2188850.381160-0.8257740.2881590.3514360.299775-0.5313480.8514090.3885870.181236-0.198039-0.0064270.286177-0.519626-0.205063-0.214403-0.0094510.166736
Dimension3-0.0383480.014097-0.065819-1.162422-0.4582300.171052-0.3346200.0875110.181597-0.024812-0.0540030.0012390.060208-0.069595-0.023274-0.027595-0.0261040.103441
\n", "
" ], "text/plain": [ " SUV Wagon Minivan Pickup AWD RWD \\\n", "Dimension1 0.093402 -1.203972 -0.238452 0.744765 -0.817794 0.628960 \n", "Dimension2 0.218885 0.381160 -0.825774 0.288159 0.351436 0.299775 \n", "Dimension3 -0.038348 0.014097 -0.065819 -1.162422 -0.458230 0.171052 \n", "\n", " Retail Dealer Engine Cylinders Horsepower CityMPG \\\n", "Dimension1 -1.410562 -0.913081 -0.354061 0.306548 -0.718787 -0.012087 \n", "Dimension2 -0.531348 0.851409 0.388587 0.181236 -0.198039 -0.006427 \n", "Dimension3 -0.334620 0.087511 0.181597 -0.024812 -0.054003 0.001239 \n", "\n", " HighwayMPG Weight Wheelbase Length Width \\\n", "Dimension1 0.776156 0.306525 0.024460 -0.165710 0.004959 \n", "Dimension2 0.286177 -0.519626 -0.205063 -0.214403 -0.009451 \n", "Dimension3 0.060208 -0.069595 -0.023274 -0.027595 -0.026104 \n", "\n", " Explained Variance \n", "Dimension1 0.435236 \n", "Dimension2 0.166736 \n", "Dimension3 0.103441 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pca_result" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }