mirror of
https://gitee.com/TheAlgorithms/Statistical-Learning-Method_Code.git
synced 2024-12-22 20:54:21 +08:00
290 lines
10 KiB
Plaintext
290 lines
10 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Time: 0.14439177513122559\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import numpy as np\n",
|
||
"import pandas as pd\n",
|
||
"import time \n",
|
||
"\n",
|
||
"\n",
|
||
"#定义加载数据的函数\n",
|
||
"def load_data(file):\n",
|
||
" '''\n",
|
||
" INPUT:\n",
|
||
" file - (str) 数据文件的路径\n",
|
||
" \n",
|
||
" OUTPUT:\n",
|
||
" df - (dataframe) 读取的数据表格\n",
|
||
" X - (array) 特征数据数组\n",
|
||
" \n",
|
||
" '''\n",
|
||
" df = pd.read_csv(file) #读取csv文件\n",
|
||
" df.drop('Sports', axis=1, inplace=True) #去掉类别数据\n",
|
||
" X = np.asarray(df.values).T #将数据转换成数组\n",
|
||
" return df, X\n",
|
||
"\n",
|
||
"\n",
|
||
"#定义规范化函数,对每一列特征进行规范化处理,使其成为期望为0方差为1的标准分布\n",
|
||
"def Normalize(X):\n",
|
||
" '''\n",
|
||
" INPUT:\n",
|
||
" X - (array) 特征数据数组\n",
|
||
" \n",
|
||
" OUTPUT:\n",
|
||
" X - (array) 规范化处理后的特征数据数组\n",
|
||
" \n",
|
||
" '''\n",
|
||
" m, n = X.shape\n",
|
||
" for i in range(m):\n",
|
||
" E_xi = np.mean(X[i]) #第i列特征的期望\n",
|
||
" Var_xi = np.var(X[i], ddof=1) #第i列特征的方差\n",
|
||
" for j in range(n):\n",
|
||
" X[i][j] = (X[i][j] - E_xi) / np.sqrt(Var_xi) #对第i列特征的第j条数据进行规范化处理\n",
|
||
" return X\n",
|
||
"\n",
|
||
"\n",
|
||
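"# Eigendecomposition of the sample covariance matrix: returns the eigenvalues (in descending order) and the matrix V whose columns are the unit eigenvectors\n",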
|
||
"def cal_V(X):\n",
|
||
" '''\n",
|
||
" INPUT:\n",
|
||
" X - (array) 特征数据数组\n",
|
||
" \n",
|
||
" OUTPUT:\n",
|
||
" eigvalues - (list) 特征值列表,其中特征值按从大到小排列\n",
|
||
" V - (array) V矩阵\n",
|
||
" \n",
|
||
" '''\n",
|
||
" newX = X.T / np.sqrt(X.shape[1]-1) #构造新矩阵X'\n",
|
||
" Sx = np.matmul(newX.T, newX) #计算X的协方差矩阵Sx = X'.T * X'\n",
|
||
" V_T = [] #用于保存V的转置\n",
|
||
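" w, v = np.linalg.eig(Sx) # eigenvalues w[i] of Sx with the matching eigenvectors in the COLUMNS v[:, i]; w[i] is the squared singular value of X' and v[:, i] its right singular vector\n",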
|
||
" tmp = {} #定义一个字典用于保存特征值和特征向量,字典的键为特征值,值为对应的特征向量\n",
|
||
" for i in range(len(w)):\n",
|
||
" tmp[w[i]] = v[:, i]\n",
|
||
" eigvalues = sorted(tmp, reverse=True) #将特征值逆序排列后保存到eigvalues列表中\n",
|
||
" for i in eigvalues:\n",
|
||
" d = 0\n",
|
||
" for j in range(len(tmp[i])):\n",
|
||
" d += tmp[i][j] ** 2\n",
|
||
" V_T.append(tmp[i] / np.sqrt(d)) #计算特征值i的单位特征向量,即为V矩阵的列向量,将其保存到V_T中\n",
|
||
" V = np.array(V_T).T #对V_T进行转置得到V矩阵\n",
|
||
" return eigvalues, V\n",
|
||
"\n",
|
||
"\n",
|
||
"#定义主成分分析函数\n",
|
||
"def do_pca(X, k):\n",
|
||
" '''\n",
|
||
" INPUT:\n",
|
||
" X - (array) 特征数据数组\n",
|
||
" k - (int) 设定的主成分个数\n",
|
||
" \n",
|
||
" OUTPUT:\n",
|
||
" fac_load - (array) 因子负荷量数组\n",
|
||
" dimrates - (list) 可解释偏差列表\n",
|
||
" Y - (array) 主成分矩阵\n",
|
||
" \n",
|
||
" '''\n",
|
||
" eigvalues, V = cal_V(X) #计算特征值和V矩阵\n",
|
||
" Vk = V[:, :k] #取V矩阵的前k列\n",
|
||
" Y = np.matmul(Vk.T, X) #计算主成分矩阵,将m*n的样本矩阵X转换成k*n的样本主成分矩阵\n",
|
||
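" dimrates = [i / sum(eigvalues) for i in eigvalues[:k]] # explained variance ratio: each of the first k eigenvalues divided by the sum of all eigenvalues, i.e. the share of the total variance explained by that principal component\n",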
|
||
" fac_load = np.zeros((k, X.shape[0])) #用来保存主成分的因子负荷量\n",
|
||
" for i in range(k): \n",
|
||
" for j in range(X.shape[0]):\n",
|
||
" fac_load[i][j] = np.sqrt(eigvalues[i]) * Vk[j][i] / np.sqrt(np.var(X[j], ddof=1)) #计算主成分i对应原始特征j的因子负荷量,保存到fac_load中\n",
|
||
" return fac_load, dimrates, Y\n",
|
||
"\n",
|
||
"\n",
|
||
"if __name__ == \"__main__\":\n",
|
||
" df, X = load_data('cars.csv') #加载数据\n",
|
||
" start = time.time() #保存开始时间\n",
|
||
" X = Normalize(X) #对样本数据进行规范化处理\n",
|
||
" k = 3 #设定主成分个数为3\n",
|
||
" fac_load, dimrates, Y = do_pca(X, k) #进行主成分分析\n",
|
||
" pca_result = pd.DataFrame(fac_load, index=['Dimension1', 'Dimension2', 'Dimension3'], columns=df.columns) #将结果保存为dataframe格式\n",
|
||
" pca_result['Explained Variance'] = dimrates #将可解释偏差保存到pca_result的'Explained Variance'列\n",
|
||
" end = time.time() #保存结束时间\n",
|
||
" print('Time:', end-start)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>SUV</th>\n",
|
||
" <th>Wagon</th>\n",
|
||
" <th>Minivan</th>\n",
|
||
" <th>Pickup</th>\n",
|
||
" <th>AWD</th>\n",
|
||
" <th>RWD</th>\n",
|
||
" <th>Retail</th>\n",
|
||
" <th>Dealer</th>\n",
|
||
" <th>Engine</th>\n",
|
||
" <th>Cylinders</th>\n",
|
||
" <th>Horsepower</th>\n",
|
||
" <th>CityMPG</th>\n",
|
||
" <th>HighwayMPG</th>\n",
|
||
" <th>Weight</th>\n",
|
||
" <th>Wheelbase</th>\n",
|
||
" <th>Length</th>\n",
|
||
" <th>Width</th>\n",
|
||
" <th>Explained Variance</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>Dimension1</th>\n",
|
||
" <td>0.093402</td>\n",
|
||
" <td>-1.203972</td>\n",
|
||
" <td>-0.238452</td>\n",
|
||
" <td>0.744765</td>\n",
|
||
" <td>-0.817794</td>\n",
|
||
" <td>0.628960</td>\n",
|
||
" <td>-1.410562</td>\n",
|
||
" <td>-0.913081</td>\n",
|
||
" <td>-0.354061</td>\n",
|
||
" <td>0.306548</td>\n",
|
||
" <td>-0.718787</td>\n",
|
||
" <td>-0.012087</td>\n",
|
||
" <td>0.776156</td>\n",
|
||
" <td>0.306525</td>\n",
|
||
" <td>0.024460</td>\n",
|
||
" <td>-0.165710</td>\n",
|
||
" <td>0.004959</td>\n",
|
||
" <td>0.435236</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Dimension2</th>\n",
|
||
" <td>0.218885</td>\n",
|
||
" <td>0.381160</td>\n",
|
||
" <td>-0.825774</td>\n",
|
||
" <td>0.288159</td>\n",
|
||
" <td>0.351436</td>\n",
|
||
" <td>0.299775</td>\n",
|
||
" <td>-0.531348</td>\n",
|
||
" <td>0.851409</td>\n",
|
||
" <td>0.388587</td>\n",
|
||
" <td>0.181236</td>\n",
|
||
" <td>-0.198039</td>\n",
|
||
" <td>-0.006427</td>\n",
|
||
" <td>0.286177</td>\n",
|
||
" <td>-0.519626</td>\n",
|
||
" <td>-0.205063</td>\n",
|
||
" <td>-0.214403</td>\n",
|
||
" <td>-0.009451</td>\n",
|
||
" <td>0.166736</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Dimension3</th>\n",
|
||
" <td>-0.038348</td>\n",
|
||
" <td>0.014097</td>\n",
|
||
" <td>-0.065819</td>\n",
|
||
" <td>-1.162422</td>\n",
|
||
" <td>-0.458230</td>\n",
|
||
" <td>0.171052</td>\n",
|
||
" <td>-0.334620</td>\n",
|
||
" <td>0.087511</td>\n",
|
||
" <td>0.181597</td>\n",
|
||
" <td>-0.024812</td>\n",
|
||
" <td>-0.054003</td>\n",
|
||
" <td>0.001239</td>\n",
|
||
" <td>0.060208</td>\n",
|
||
" <td>-0.069595</td>\n",
|
||
" <td>-0.023274</td>\n",
|
||
" <td>-0.027595</td>\n",
|
||
" <td>-0.026104</td>\n",
|
||
" <td>0.103441</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" SUV Wagon Minivan Pickup AWD RWD \\\n",
|
||
"Dimension1 0.093402 -1.203972 -0.238452 0.744765 -0.817794 0.628960 \n",
|
||
"Dimension2 0.218885 0.381160 -0.825774 0.288159 0.351436 0.299775 \n",
|
||
"Dimension3 -0.038348 0.014097 -0.065819 -1.162422 -0.458230 0.171052 \n",
|
||
"\n",
|
||
" Retail Dealer Engine Cylinders Horsepower CityMPG \\\n",
|
||
"Dimension1 -1.410562 -0.913081 -0.354061 0.306548 -0.718787 -0.012087 \n",
|
||
"Dimension2 -0.531348 0.851409 0.388587 0.181236 -0.198039 -0.006427 \n",
|
||
"Dimension3 -0.334620 0.087511 0.181597 -0.024812 -0.054003 0.001239 \n",
|
||
"\n",
|
||
" HighwayMPG Weight Wheelbase Length Width \\\n",
|
||
"Dimension1 0.776156 0.306525 0.024460 -0.165710 0.004959 \n",
|
||
"Dimension2 0.286177 -0.519626 -0.205063 -0.214403 -0.009451 \n",
|
||
"Dimension3 0.060208 -0.069595 -0.023274 -0.027595 -0.026104 \n",
|
||
"\n",
|
||
" Explained Variance \n",
|
||
"Dimension1 0.435236 \n",
|
||
"Dimension2 0.166736 \n",
|
||
"Dimension3 0.103441 "
|
||
]
|
||
},
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pca_result"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.7.3"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|