Statistical-Learning-Method.../PCA/PCA.ipynb
2021-01-26 16:48:23 +08:00

290 lines
10 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Time: 0.14439177513122559\n"
]
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import time \n",
"\n",
"\n",
"#定义加载数据的函数\n",
"def load_data(file):\n",
" '''\n",
" INPUT:\n",
" file - (str) 数据文件的路径\n",
" \n",
" OUTPUT:\n",
" df - (dataframe) 读取的数据表格\n",
" X - (array) 特征数据数组\n",
" \n",
" '''\n",
" df = pd.read_csv(file) #读取csv文件\n",
" df.drop('Sports', axis=1, inplace=True) #去掉类别数据\n",
" X = np.asarray(df.values).T #将数据转换成数组\n",
" return df, X\n",
"\n",
"\n",
"#定义规范化函数对每一列特征进行规范化处理使其成为期望为0方差为1的标准分布\n",
"def Normalize(X):\n",
" '''\n",
" INPUT:\n",
" X - (array) 特征数据数组\n",
" \n",
" OUTPUT:\n",
" X - (array) 规范化处理后的特征数据数组\n",
" \n",
" '''\n",
" m, n = X.shape\n",
" for i in range(m):\n",
" E_xi = np.mean(X[i]) #第i列特征的期望\n",
" Var_xi = np.var(X[i], ddof=1) #第i列特征的方差\n",
" for j in range(n):\n",
" X[i][j] = (X[i][j] - E_xi) / np.sqrt(Var_xi) #对第i列特征的第j条数据进行规范化处理\n",
" return X\n",
"\n",
"\n",
"#定义奇异值分解函数计算V矩阵和特征值\n",
"def cal_V(X):\n",
" '''\n",
" INPUT:\n",
" X - (array) 特征数据数组\n",
" \n",
" OUTPUT:\n",
" eigvalues - (list) 特征值列表,其中特征值按从大到小排列\n",
" V - (array) V矩阵\n",
" \n",
" '''\n",
" newX = X.T / np.sqrt(X.shape[1]-1) #构造新矩阵X'\n",
" Sx = np.matmul(newX.T, newX) #计算X的协方差矩阵Sx = X'.T * X'\n",
" V_T = [] #用于保存V的转置\n",
" w, v = np.linalg.eig(Sx) #计算Sx的特征值和对应的特征向量即为X的奇异值和奇异向量\n",
" tmp = {} #定义一个字典用于保存特征值和特征向量,字典的键为特征值,值为对应的特征向量\n",
" for i in range(len(w)):\n",
" tmp[w[i]] = v[i]\n",
" eigvalues = sorted(tmp, reverse=True) #将特征值逆序排列后保存到eigvalues列表中\n",
" for i in eigvalues:\n",
" d = 0\n",
" for j in range(len(tmp[i])):\n",
" d += tmp[i][j] ** 2\n",
" V_T.append(tmp[i] / np.sqrt(d)) #计算特征值i的单位特征向量即为V矩阵的列向量将其保存到V_T中\n",
" V = np.array(V_T).T #对V_T进行转置得到V矩阵\n",
" return eigvalues, V\n",
"\n",
"\n",
"#定义主成分分析函数\n",
"def do_pca(X, k):\n",
" '''\n",
" INPUT:\n",
" X - (array) 特征数据数组\n",
" k - (int) 设定的主成分个数\n",
" \n",
" OUTPUT:\n",
" fac_load - (array) 因子负荷量数组\n",
" dimrates - (list) 可解释偏差列表\n",
" Y - (array) 主成分矩阵\n",
" \n",
" '''\n",
" eigvalues, V = cal_V(X) #计算特征值和V矩阵\n",
" Vk = V[:, :k] #取V矩阵的前k列\n",
" Y = np.matmul(Vk.T, X) #计算主成分矩阵将m*n的样本矩阵X转换成k*n的样本主成分矩阵\n",
" dimrates = [i / sum(eigvalues) for i in eigvalues[:k]] #计算可解释偏差即前k个奇异值中每个奇异值占奇异值总和的比例这个比例表示主成分i可解释原始数据中的可变性的比例\n",
" fac_load = np.zeros((k, X.shape[0])) #用来保存主成分的因子负荷量\n",
" for i in range(k): \n",
" for j in range(X.shape[0]):\n",
" fac_load[i][j] = np.sqrt(eigvalues[i]) * Vk[j][i] / np.sqrt(np.var(X[j])) #计算主成分i对应原始特征j的因子负荷量保存到fac_load中\n",
" return fac_load, dimrates, Y\n",
"\n",
"\n",
"if __name__ == \"__main__\":\n",
" df, X = load_data('cars.csv') #加载数据\n",
" start = time.time() #保存开始时间\n",
" X = Normalize(X) #对样本数据进行规范化处理\n",
" k = 3 #设定主成分个数为3\n",
" fac_load, dimrates, Y = do_pca(X, k) #进行主成分分析\n",
" pca_result = pd.DataFrame(fac_load, index=['Dimension1', 'Dimension2', 'Dimension3'], columns=df.columns) #将结果保存为dataframe格式\n",
" pca_result['Explained Variance'] = dimrates #将可解释偏差保存到pca_result的'Explained Variance'列\n",
" end = time.time() #保存结束时间\n",
" print('Time:', end-start)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>SUV</th>\n",
" <th>Wagon</th>\n",
" <th>Minivan</th>\n",
" <th>Pickup</th>\n",
" <th>AWD</th>\n",
" <th>RWD</th>\n",
" <th>Retail</th>\n",
" <th>Dealer</th>\n",
" <th>Engine</th>\n",
" <th>Cylinders</th>\n",
" <th>Horsepower</th>\n",
" <th>CityMPG</th>\n",
" <th>HighwayMPG</th>\n",
" <th>Weight</th>\n",
" <th>Wheelbase</th>\n",
" <th>Length</th>\n",
" <th>Width</th>\n",
" <th>Explained Variance</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Dimension1</th>\n",
" <td>0.093402</td>\n",
" <td>-1.203972</td>\n",
" <td>-0.238452</td>\n",
" <td>0.744765</td>\n",
" <td>-0.817794</td>\n",
" <td>0.628960</td>\n",
" <td>-1.410562</td>\n",
" <td>-0.913081</td>\n",
" <td>-0.354061</td>\n",
" <td>0.306548</td>\n",
" <td>-0.718787</td>\n",
" <td>-0.012087</td>\n",
" <td>0.776156</td>\n",
" <td>0.306525</td>\n",
" <td>0.024460</td>\n",
" <td>-0.165710</td>\n",
" <td>0.004959</td>\n",
" <td>0.435236</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Dimension2</th>\n",
" <td>0.218885</td>\n",
" <td>0.381160</td>\n",
" <td>-0.825774</td>\n",
" <td>0.288159</td>\n",
" <td>0.351436</td>\n",
" <td>0.299775</td>\n",
" <td>-0.531348</td>\n",
" <td>0.851409</td>\n",
" <td>0.388587</td>\n",
" <td>0.181236</td>\n",
" <td>-0.198039</td>\n",
" <td>-0.006427</td>\n",
" <td>0.286177</td>\n",
" <td>-0.519626</td>\n",
" <td>-0.205063</td>\n",
" <td>-0.214403</td>\n",
" <td>-0.009451</td>\n",
" <td>0.166736</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Dimension3</th>\n",
" <td>-0.038348</td>\n",
" <td>0.014097</td>\n",
" <td>-0.065819</td>\n",
" <td>-1.162422</td>\n",
" <td>-0.458230</td>\n",
" <td>0.171052</td>\n",
" <td>-0.334620</td>\n",
" <td>0.087511</td>\n",
" <td>0.181597</td>\n",
" <td>-0.024812</td>\n",
" <td>-0.054003</td>\n",
" <td>0.001239</td>\n",
" <td>0.060208</td>\n",
" <td>-0.069595</td>\n",
" <td>-0.023274</td>\n",
" <td>-0.027595</td>\n",
" <td>-0.026104</td>\n",
" <td>0.103441</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" SUV Wagon Minivan Pickup AWD RWD \\\n",
"Dimension1 0.093402 -1.203972 -0.238452 0.744765 -0.817794 0.628960 \n",
"Dimension2 0.218885 0.381160 -0.825774 0.288159 0.351436 0.299775 \n",
"Dimension3 -0.038348 0.014097 -0.065819 -1.162422 -0.458230 0.171052 \n",
"\n",
" Retail Dealer Engine Cylinders Horsepower CityMPG \\\n",
"Dimension1 -1.410562 -0.913081 -0.354061 0.306548 -0.718787 -0.012087 \n",
"Dimension2 -0.531348 0.851409 0.388587 0.181236 -0.198039 -0.006427 \n",
"Dimension3 -0.334620 0.087511 0.181597 -0.024812 -0.054003 0.001239 \n",
"\n",
" HighwayMPG Weight Wheelbase Length Width \\\n",
"Dimension1 0.776156 0.306525 0.024460 -0.165710 0.004959 \n",
"Dimension2 0.286177 -0.519626 -0.205063 -0.214403 -0.009451 \n",
"Dimension3 0.060208 -0.069595 -0.023274 -0.027595 -0.026104 \n",
"\n",
" Explained Variance \n",
"Dimension1 0.435236 \n",
"Dimension2 0.166736 \n",
"Dimension3 0.103441 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pca_result"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}