Statistical-Learning-Method.../Clustering/K-means_Clustering/K-means_Clustering.ipynb
2021-01-26 16:44:22 +08:00

246 lines
24 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Data shape: (150, 4)\n",
"Length of labels: 150\n",
"1/2\n",
"{0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 57, 60, 67, 69, 79, 80, 81, 93, 98], 1: [50, 51, 52, 54, 58, 62, 63, 65, 68, 71, 72, 73, 74, 75, 76, 77, 83, 86, 87, 91, 92, 97, 102, 103, 104, 105, 107, 108, 109, 110, 111, 112, 116, 117, 118, 119, 120, 122, 123, 124, 125, 126, 128, 129, 130, 131, 132, 133, 134, 135, 137, 139, 140, 141, 143, 144, 145, 146, 147], 2: [53, 55, 56, 59, 61, 64, 66, 70, 78, 82, 84, 85, 88, 89, 90, 94, 95, 96, 99, 100, 101, 106, 113, 114, 115, 121, 127, 136, 138, 142, 148, 149]}\n",
"2/2\n",
"{0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49], 1: [50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149], 2: []}\n",
"Time: 0.0059719085693359375\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD8CAYAAACb4nSYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3Xl4VPXZ//H3Tdj3fSeEHcKiwrC57+IGKtparUitRdvy2Mc+CnGrCFaFLtrF1kKL1S5qBdSAKBU3tG6EKtkgEMIW1kDYQyDL/fsj0V+MQAayzEzm87quXNd8z3zPzP2F5DMnZ+bcMXdHRESiQ51QFyAiIjVHoS8iEkUU+iIiUUShLyISRRT6IiJRRKEvIhJFFPoiIlFEoS8iEkUU+iIiUaRuqAsor23bth4XFxfqMkREIsqKFSt2uXu7iuaFXejHxcWRlJQU6jJERCKKmW0MZp5O74iIRBGFvohIFFHoi4hEEYW+iEgUUeiLiEQRhb6ISBRR6IuIRBGFvohIGHgrfQcvLd9U7c8TdhdniYhEk10HjzAtMY1FydsYGtuSG4Z1o04dq7bnU+iLiISAu/PqF1t4ZGE6eUeKuOfSvtxxXq9qDXxQ6IuI1Litew/zwCspvJuRw9DYlsy6fgi92zerkedW6IuI1JDiYucfn23iicWrKHZ4+Op4JoyOI6aaj+7LUuiLiNSArJyDJMxP4bMNuZzduy2PXzeYbq0b13gdCn0RkWpUWFTMnz9cz5NvraFB3TrMun4INwzrilnNHd2XpdAXEakm6Vv3M2X+SlK37OeygR2YMW4Q7Zs3DGlNCn0RkSp2pLCI37+TyR/fW0fLxvX4w81DuXxQx5Ad3Zel0BcRqUIrNuYyZV4y63IOMX5oVx66agAtG9cPdVlfUeiLiFSBQ0cK+cWSDJ77eAOdWzTiudtGcF7fCv96YY0Lqg2DmY0xswwzyzSzhGPcP9HMcszsi9Kv28vcF2tm/zazVWaWbmZxVVe+iEjofbA2h8ueWsZfP9rAhFHdWXL3uWEZ+BDEkb6ZxQBPA5cA2cByM0t09/RyU19y98nHeIjngZ+7+1tm1hQormzRIiLhYF9eAY++ns7LK7Lp2a4JL985muFxrUNd1gkFc3pnBJDp7lkAZvYiMA4oH/rfYGbxQF13fwvA3Q9WolYRkbDxZup2HnotldxDR/nR+b2466I+NKwXE+qyKhRM6HcBNpcZZwMjjzFvvJmdC6wB7nb3zUBfYK+ZLQB6AEuBBHcvKrujmU0CJgHExsae9CJERGrKzgP5TEtMY3HKduI7NefZicMZ1KVFqMsKWjDn9I/1GSMvN14IxLn7EEqC/bnS7XWBc4B7gOFAT2DiNx7Mfba7B9w90K5deJ4HE5Ho5u7MW5HNJb9extJVO7n3sn68NvmsiAp8CO5IPxvoVmbcFdhadoK77y4znAPMLLPv52VODb0KjAL+cqoFi4jUtOw9edz/SirL1uQQ6N6KJ8YPoXf7pqEu65QEE/rLgT5m1gPYAtwI3FR2gpl1cvdtpcOxwKoy+7Yys3bungNcCCRVSeUiItWsuNj52ycbmfnmagAeGTuQW0Z1r/b2x9WpwtB390IzmwwsAWKAue6eZmbTgSR3TwTuMrOxQCGQS+kpHHcvMrN7gLet5FK0FZT8JiAiEtbW5Rxk6rxkkjbu4dy+7Xjs2kF0bVXzDdKqmrmXPz0fWoFAwJOS9MuAiIRGQVExs5dl8Zu319KoXgw/uyqe64Z2CYsWCidiZivcPVDRPF2RKyJSKnXLPqbMSyZ9236uGNyRR8YOol2zBqEuq0op9EUk6uUXFPGbt9cye1kWrZvU55nvDmXMoE6hLqtaKPRFJKot35DL1HnJZO06xA3DuvLglfG0aFwv1GVVG4W+iESlg0cKmfXmap7/eCNdWzXib98fwTl9av91Qgp9EYk676/J4f4FKWzdd5iJZ8Zx72X9aN
IgOuIwOlYpIgLsOXSUGa+ns+C/W+jVrgnz7hzNsO7h3SCtqin0RaTWc3feSN3Oz15LZW9eAf9zYW8mX9ibBnXDv0FaVVPoi0ittnN/Pg+9lsqStB0M7tKC528bSXzn5qEuK2QU+iJSK7k7L6/I5tFF6RwpLCbh8v7cfnYP6sYE9bejai2FvojUOptz87hvQQofZu5iRFxrnhg/mJ7tIrNBWlVT6ItIrVFU7Dz/8QZmvZlBTB1jxjWDuHlEbEQ3SKtqCn0RqRXW7jjA1PnJ/HfTXs7v147Hrh1M55aNQl1W2FHoi0hEKygq5pn31vG7dzJp0iCGp759OuNO7xz2DdJCRaEvIhErJXsf985byertB7hqSCemjR1I26a1q0FaVVPoi0jEyS8o4smla5izLIu2TRsw+5ZhXDqwY6jLiggKfRGJKJ9m7SZhQQrrdx3ixuHduO+KAbRoVHsbpFU1hb6IRIQD+QXMfHM1f/9kE7GtG/OP20dyVu+2oS4r4ij0RSTsvbt6J/e/ksKO/fncfnYPfnppXxrXV3ydCv2riUjYyj10lOkL03j1i630ad+UP/zwTM6IbRXqsiKaQl9Ewo67syh5G9MS09h3uICfXNSHH13QKyobpFU1hb6IhJUd+/N54JVUlq7awZCuLfjHD0bSv2P0Nkiragp9EQkL7s5Lyzfz88WrOFpYzANXDOB7Z8VFfYO0qqbQF5GQ27j7EPctSOGjdbsZ1bM1T1w3hLi2TUJdVq0U1EuomY0xswwzyzSzhGPcP9HMcszsi9Kv28vd39zMtpjZ76uqcBGJfEXFzp8/yOKyp5aRkr2Px64dzD9vH6XAr0YVHumbWQzwNHAJkA0sN7NEd08vN/Uld598nIeZAbxfqUpFpFbJ2H6AKfOTWbl5Lxf1b8+j1w6iUws1SKtuwZzeGQFkunsWgJm9CIwDyof+MZnZMKAD8CYQOMU6RaSWOFpYzB/ey+TpdzNp1rAev7nxdMaepgZpNSWY0O8CbC4zzgZGHmPeeDM7F1gD3O3um82sDvAr4BbgosoWKyKRbeXmvUyZl0zGjgOMO70zP7sqnjZqkFajggn9Y738ernxQuAFdz9iZncCzwEXAj8CFpe+ABz/CcwmAZMAYmNjg6lbRCLI4aNF/PqtDP7y4XraN2vInycEuDi+Q6jLikrBhH420K3MuCuwtewEd99dZjgHmFl6ezRwjpn9CGgK1Dezg+6eUG7/2cBsgEAgUP4FRUQi2EfrdpEwP4VNuXncNDKWhMv707yhGqSFSjChvxzoY2Y9gC3AjcBNZSeYWSd331Y6HAusAnD3m8vMmQgEyge+iNRO+/MLeHzxal74bBPd2zTmhR+MYnSvNqEuK+pVGPruXmhmk4ElQAww193TzGw6kOTuicBdZjYWKARygYnVWLOIhLml6Tt44NUUcg4cYdK5Pbn74r40qq8WCuHA3MPrbEogEPCkpKRQlyEip2D3wSM8sjCdxJVb6d+xGTPHD+G0bi1DXVZUMLMV7l7hJyR1Ra6IVJq7k7hyK9MS0zh4pJC7L+7LD8/vRf26aqEQbhT6IlIp2/Yd5sFXUnl79U5O79aSWdcPoW+HZqEuS45DoS8ip6S42Hlh+SYeX7yaomLnoavimXhmHDF1dJFVOFPoi8hJW7/rEAnzk/l0fS5n9W7D49cOIbZN41CXJUFQ6ItI0AqLipn7n/X86t9rqF+3DjPHD+ZbgW5qoRBBFPoiEpRV2/YzdX4yydn7uCS+A49eM4gOzRuGuiw5SQp9ETmhI4VFPP3uOv7wbiYtGtXj9zedwZWDO+noPkIp9EXkuP67aQ9T5yWzdudBrjujCw9dFU+rJvVDXZZUgkJfRL4h72ghv1yyhmc/Wk+n5g159nvDuaBf+1CXJVVAoS8iX/OfzF0kLEhmc+5hbhnVnSlj+tFMDdJqDYW+iACw73ABj72+ipeSNtOjbRNemjSKkT3VIK22UeiLCP9O286Dr6ay+9BR7jyvF/
97cR8a1lODtNpIoS8SxXIOHGHawjReT97GgE7N+cutwxnctUWoy5JqpNAXiULuziufb2H6onTyjhRxz6V9ueO8XtSLUYO02k6hLxJltuw9zAOvpPBeRg5DY0sapPVurwZp0UKhLxIlioudf3y6kSfeWI0D066O55bRapAWbRT6IlEgK+cgCfNT+GxDLuf0actj1w6mW2s1SItGCn2RWqywqJg5H6znyaVraFi3Dr+4fgjXD+uqFgpRTKEvUkulbd3H1PnJpG7Zz2UDOzBj3CDaq0Fa1FPoi9Qy+QVF/O6dtTzzfhatGtfnjzcP5fLBnUJdloQJhb5ILbJiYy5T5iWzLucQ44d25aGrBtCysRqkyf+n0BepBQ4dKeQXSzJ47uMNdG7RiOduG8F5fduFuiwJQwp9kQi3bE0O9y1IYeu+w0wY1Z17x/SnaQP9aMuxBXX5nZmNMbMMM8s0s4Rj3D/RzHLM7IvSr9tLt59uZh+bWZqZJZvZt6t6ASLRam/eUe55eSUT5n5Gg3p1+Ncdo3lk3CAFvpxQhd8dZhYDPA1cAmQDy80s0d3Ty019yd0nl9uWB0xw97Vm1hlYYWZL3H1vVRQvEq3eSNnGQ6+lsSfvKD++oBf/c6EapElwgjkkGAFkunsWgJm9CIwDyof+N7j7mjK3t5rZTqAdoNAXOQU7D+Tz8GtpvJG6nYGdm/PcbcMZ2FkN0iR4wYR+F2BzmXE2MPIY88ab2bnAGuBudy+7D2Y2AqgPrDvFWkWilrszb0U2j76+isMFRUwZ048fnNNTDdLkpAUT+se6dM/LjRcCL7j7ETO7E3gOuPCrBzDrBPwNuNXdi7/xBGaTgEkAsbGxQZYuEh025+Zx/yspfLB2F8PjWvHE+CH0atc01GVJhAom9LOBbmXGXYGtZSe4++4ywznAzC8HZtYceB140N0/OdYTuPtsYDZAIBAo/4IiEpWKi53nP97ArCUZGDB93EC+O7I7ddQgTSohmNBfDvQxsx7AFuBG4KayE8ysk7tvKx2OBVaVbq8PvAI87+4vV1nVIrVc5s6DJMxPJmnjHs7t247Hrh1E11ZqkCaVV2Hou3uhmU0GlgAxwFx3TzOz6UCSuycCd5nZWKAQyAUmlu7+LeBcoI2Zfbltort/UbXLEKkdCoqKmb0si98sXUvjBjH86obTuG5oFzVIkypj7uF1NiUQCHhSUlKoyxCpcalb9jFlXjLp2/Zz5eBOTBs7kHbNGoS6LIkQZrbC3QMVzdNVHCIhll9QxG/eXsvsZVm0blKfZ747jDGDOoa6LKmlFPoiIbR8Qy5T5yWTtesQ3wp05YEr4mnRuF6oy5JaTKEvEgIHjxQy683VPP/xRrq2asTfvz+Ss/u0DXVZEgUU+iI17N2MnTywIIVt+/P53llx3HNpP5qoX47UEH2nidSQPYeOMmNROgs+30Lv9k2Zd+eZDOveKtRlSZRR6ItUM3dnccp2Hk5MZW9eAXdd2JsfX9ibBnXVIE1qnkJfpBrt3J/Pg6+m8u/0HQzu0oLnbxtJfOfmoS5LophCX6QauDsvJ2Uz4/V0jhYWc9/l/fn+2T2oqwZpEmIKfZEqtml3SYO0DzN3MaJHa564bjA91SBNwoRCX6SKFBU7f/1oA79ckkFMHePRawZx04hYNUiTsKLQF6kCa3ccYMr8ZD7ftJcL+rXj59cOpnPLRqEuS+QbFPoilXC0sJhn3l/H79/JpEmDGJ769umMO72zGqRJ2FLoi5yi5Oy9TJmXzOrtB7j6tM48fHU8bZuqQZqEN4W+yEnKLyjiybfWMOeDLNo1a8CcCQEuie8Q6rJEgqLQFzkJn2TtJmF+Mht25/GdEd1IuHwALRqpQZpEDoW+SBAO5BfwxBur+cenm4ht3Zh/3j6SM3urQZpEHoW+SAXeWb2DB15JZcf+fG4/uwf/d2k/GtVXCwWJTAp9kePIPXSU6QvTePWLrfTt0JQ/3HwmZ8SqQZpENoW+SDnuzsLkbUxLTO
NAfgE/uagPP76gN/XrqoWCRD6FvkgZ2/eVNEhbumoHp3VtwczrR9K/oxqkSe2h0Beh5Oj+xeWbeez1VRQUF/PAFQO47ewexKiFgtQyCn2Jeht3HyJhfgofZ+1mVM/WPHHdEOLaNgl1WSLVQqEvUauo2Hn2P+v55b8zqFenDo9fN5gbh3dTCwWp1RT6EpUytpc0SFu5eS8XD2jPo9cMpmOLhqEuS6TaBfVxBDMbY2YZZpZpZgnHuH+imeWY2RelX7eXue9WM1tb+nVrVRYvcrKOFhbz1NI1XPW7D9icm8dvv3MGcyYEFPgSNSo80jezGOBp4BIgG1huZonunl5u6kvuPrncvq2Bh4EA4MCK0n33VEn1Iifhi817mTovmYwdBxh3emcevnogrZvUD3VZIjUqmNM7I4BMd88CMLMXgXFA+dA/lsuAt9w9t3Tft4AxwAunVq7IyTt8tIhf/TuDuf9ZT/tmDfnLrQEuGqAGaRKdggn9LsDmMuNsYOQx5o03s3OBNcDd7r75OPt2Kb+jmU0CJgHExsYGV7lIED5at4uE+Slsys3j5pGxTL28P80bqkGaRK9gzukf66MMXm68EIhz9yHAUuC5k9gXd5/t7gF3D7Rr1y6IkkRObH9+AfctSOamOZ9Sx+DFSaP4+bWDFfgS9YI50s8GupUZdwW2lp3g7rvLDOcAM8vse365fd872SJFTsbS9B088GoKOQeOcMe5Pfnfi/uqQZpIqWBCfznQx8x6AFuAG4Gbyk4ws07uvq10OBZYVXp7CfCYmX3ZpepS4L5KVy1yDLsOHuGRheksXLmV/h2bMWdCgCFdW4a6LJGwUmHou3uhmU2mJMBjgLnunmZm04Ekd08E7jKzsUAhkAtMLN0318xmUPLCATD9yzd1RaqKu/PaF1t5ZGEaB48U8tNL+nLneb3UIE3kGMz9G6fYQyoQCHhSUlKoy5AIsXXvYR58NZV3Vu/kjNiWzBw/hL4dmoW6LJEaZ2Yr3D1Q0TxdkSsRqbjY+ednm3jijdUUFTs/uyqeW8+MU4M0kQoo9CXirN91iIT5yXy6Ppezerfh8WuHENumcajLEokICn2JGIVFxfzlw/X8+q011K9bh1njh3BDoKsapImcBIW+RIT0rfuZOj+ZlC37uCS+A49eM4gOzdUvR+RkKfQlrB0pLOL372Tyx/fW0bJxPZ6+aShXDO6oo3uRU6TQl7C1YuMeps5PJnPnQa47owsPXRVPKzVIE6kUhb6EnbyjhfxiSQZ//WgDnZo35NnvDeeCfu1DXZZIraDQl7Dy4dpdJCxIJnvPYSaM7s6UMf1p2kDfpiJVRT9NEhb25RXw88Xp/Cspmx5tm/CvO0YzokfrUJclUuso9CXk3kzdzkOvpZJ76Cg/PL8XP7moDw3rqUGaSHVQ6EvI5Bw4wrTENF5P2caATs2Ze+twBndtEeqyRGo1hb7UOHdnwX+3MH1ROoePFnHvZf2YdG5P6sWoQZpIdVPoS43asvcw9y9I4f01OQzr3oqZ44fQu33TUJclEjUU+lIjioudv3+6kZlvrMaBaVfHM2F0HHXUIE2kRin0pdqtyzlIwvxklm/Ywzl92vLYtYPp1loN0kRCQaEv1aagqJg5H2Tx1NK1NKxbh19cP4Trh6lBmkgoKfSlWqRu2cfU+cmkbd3PmIEdmX7NQNo3U4M0kVBT6EuVyi8o4nfvrOWZ97No1bg+f7x5KJcP7hTqskSklEJfqkzShlymzE8mK+cQ1w/ryoNXDqBlYzVIEwknCn2ptENHShqkPffxBjq3aMTzt43g3L7tQl2WiByDQl8q5f01Ody/IIWt+w5z6+g47r2sH03UIE0kbOmnU07J3ryjzFi0ivn/zaZnuya8fMdoAnFqkCYS7hT6ctLeSNnGQ6+lsSfvKJMv6M3kC3urQZpIhAiq2YmZjTGzDDPLNLOEE8y73szczAKl43pm9pyZpZjZKjO7r6oKl5q3c38+d/5tBT/8x3/p0LwBiZ
PP4p7L+inwRSJIhUf6ZhYDPA1cAmQDy80s0d3Ty81rBtwFfFpm8w1AA3cfbGaNgXQze8HdN1TVAqT6uTvzVmQzY1E6+YXFTB3Tnx+c04O6apAmEnGCOb0zAsh09ywAM3sRGAekl5s3A5gF3FNmmwNNzKwu0Ag4CuyvbNFSczbn5nH/Kyl8sHYXw+Na8cT4IfRqpwZpIpEqmNDvAmwuM84GRpadYGZnAN3cfZGZlQ39eZS8QGwDGgN3u3tu5UqWmlBU7Dz/8QZ+sSQDA2aMG8jNI7urQZpIhAsm9I/1U+5f3WlWB3gSmHiMeSOAIqAz0Ar4wMyWfvlbQ5nHmARMAoiNjQ2qcKk+mTsPMHV+Cis27uG8vu34+bWD6NpKDdJEaoNgQj8b6FZm3BXYWmbcDBgEvFfaSKsjkGhmY4GbgDfdvQDYaWb/AQLA10Lf3WcDswECgYAjIVFQVMyf3l/Hb9/OpHGDGH79rdO49owuapAmUosEE/rLgT5m1gPYAtxISZgD4O77gLZfjs3sPeAed08ys4uAC83s75Sc3hkFPFV15UtVSd2yj3vnJbNq236uHNKJaVcPpF2zBqEuS0SqWIWh7+6FZjYZWALEAHPdPc3MpgNJ7p54gt2fBp4FUik5TfSsuydXQd1SRfILinhq6VrmfJBF6yb1+dMtw7hsYMdQlyUi1cTcw+tsSiAQ8KSkpFCXERU+W59LwvxksnYd4tuBbtx/xQBaNK4X6rJE5BSY2Qp3D1Q0T1fkRqED+QXMejODv32yka6tGvH374/k7D5tK95RRCKeQj/KvJuxkwcWpLBtfz63ndWDey7rS+P6+jYQiRb6aY8Sew4dZcaidBZ8voU+7Zsy784zGda9VajLEpEaptCv5dyd11O28fBraew7XMBdF/bmxxf2pkFd9csRiUYK/Vpsx/58Hnw1lbfSdzC4Swv+fvtIBnRqHuqyRCSEFPq1kLvzr6TNPPr6Ko4WFnPf5f35/tlqkCYiCv1aZ9PuPBIWJPPRut2M6NGameOH0KNtk1CXJSJhQqFfSxQVO3/9aAO/XJJBTB3j0WsGcdOIWDVIE5GvUejXAmt2HGDKvGS+2LyXC/u359FrBtG5ZaNQlyUiYUihH8GOFhbzzPvr+N07a2naoC6/ufF0xp7WWQ3SROS4FPoRauXmvUydn8zq7Qe4+rTOTLs6njZN1SBNRE5MoR9hDh8t4smla/jzB1m0a9aAORMCXBLfIdRliUiEUOhHkI/X7ea+Bcls2J3Hd0Z0474rBtC8oRqkiUjwFPoRYH9+AU+8sZp/frqJ2NaN+eftIzmztxqkicjJU+iHuXdW7+D+BansPJDPD87pwU8v6Uej+mqhICKnRqEfpnYfPML0Rem89sVW+nVoxjO3DOP0bi1DXZaIRDiFfphxdxJXbuWRhekcyC/gfy/uw4/O7039umqhICKVp9API9v2HebBV1J5e/VOTuvWklnjh9CvY7NQlyUitYhCPwwUFzsvLt/M44tXUVBczINXDuB7Z/UgRi0URKSKKfRDbMOuQyQsSOaTrFxG92zDE+MH072NGqSJSPVQ6IdIUbEz98P1/OqtDOrVqcMT1w3m28O7qYWCiFQrhX4IrN6+n6nzklmZvY+LB7Tn0WsG07FFw1CXJSJRQKFfg44UFvH0u+v4w7uZtGhUj9995wyuGtJJR/ciUmMU+jXk8017mDo/mTU7DnLN6Z352dUDad2kfqjLEpEoE9SHv81sjJllmFmmmSWcYN71ZuZmFiizbYiZfWxmaWaWYmZRdR4j72ghMxalc90fP+JAfiFzJwZ46sYzFPgiEhIVHumbWQzwNHAJkA0sN7NEd08vN68ZcBfwaZltdYG/A7e4+0ozawMUVGH9Ye2jzF0kLEhhU24eN4+MJeHy/jRTgzQRCaFgTu+MADLdPQvAzF4ExgHp5ebNAGYB95TZdimQ7O4rAdx9d6UrjgD7Dhfw+OJVvLh8M3FtGvPipFGM6tkm1GWJiA
QV+l2AzWXG2cDIshPM7Aygm7svMrOyod8XcDNbArQDXnT3WeWfwMwmAZMAYmNjT24FYeat9B08+GoKOQeOcMd5Pbn74r40rKcGaSISHoIJ/WN9tMS/utOsDvAkMPE4j382MBzIA942sxXu/vbXHsx9NjAbIBAI+DceJQLsOniEaYlpLEreRv+OzZgzIcCQrmqQJiLhJZjQzwa6lRl3BbaWGTcDBgHvlX70sCOQaGZjS/d93913AZjZYmAo8LXQj2TuzqtfbOGRhenkHSni/y7pyx3n9VKDNBEJS8GE/nKgj5n1ALYANwI3fXmnu+8DvvqLHmb2HnCPuyeZ2Tpgipk1Bo4C51HyW0GtsHXvYR54JYV3M3I4I7akQVqfDmqQJiLhq8LQd/dCM5sMLAFigLnunmZm04Ekd088wb57zOzXlLxwOLDY3V+votpDprjY+cdnm5j5xmqKip2fXRXPrWfGqUGaiIQ9cw+vU+iBQMCTkpJCXcZxZeUcJGFBCp+tz+Xs3m15/LrBdGvdONRliUiUK32/NFDRPF2RG6TComL+/OF6nnxrDfXr1mHW+CHcEOiqFgoiElEU+kFI37qfKfNXkrplP5fGd2DGNYPo0DyqLiwWkVpCoX8CRwqL+P07mfzxvXW0bFyPp28ayhWDO+roXkQilkL/OFZsLGmQlrnzINcN7cJDV8bTSv1yRCTCKfTLOXSkkF/+O4O/frSBzi0a8dfvDef8fu1DXZaISJVQ6Jfxwdoc7luQQvaew0wY3Z0pY/rTtIH+iUSk9lCiAfvyCnj09XReXpFNz7ZN+NcdoxnRo3WoyxIRqXJRH/pvpm7noddSyT10lB+e34ufXNRHDdJEpNaK2tDfeSCfaYlpLE7ZTnyn5jw7cTiDurQIdVkiItUq6kLf3Vnw3y1MX5TO4YIi7r2sH5PO7Um9GDVIE5HaL6pCP3tPHve/ksqyNTkM696KmeOH0Lt901CXJSJSY6Ii9IuLnb99spGZb64G4JGxA7llVHfqqEGaiESZWh/663IOMnVeMkkb93BOn7Y8dq0apIlI9Kq1oV9QVMzsZVn85u21NKoXwy9vOI3xQ7uohYKIRLVaGfqpW/YxdX4yaVv3c/mgjjwybiDtm6lBmoiXLhcvAAAFZUlEQVRIrQr9/IIifvv2Wv60LItWjevzx5uHcvngTqEuS0QkbNSa0N+cm8etz35GVs4hbhjWlQevjKdF43qhLktEJKzUmtDv0LwhcW2aMO3qgZzbt12oyxERCUu1JvTr163D3InDQ12GiEhY02WoIiJRRKEvIhJFFPoiIlFEoS8iEkWCCn0zG2NmGWaWaWYJJ5h3vZm5mQXKbY81s4Nmdk9lCxYRkVNXYeibWQzwNHA5EA98x8zijzGvGXAX8OkxHuZJ4I3KlSoiIpUVzJH+CCDT3bPc/SjwIjDuGPNmALOA/LIbzewaIAtIq2StIiJSScGEfhdgc5lxdum2r5jZGUA3d19UbnsTYCrwSCXrFBGRKhDMxVnHakvpX91pVoeS0zcTjzHvEeBJdz94ou6WZjYJmFQ6PGhmGUHUdTxtgV2V2D8SRduao229oDVHi8qsuXswk4IJ/WygW5lxV2BrmXEzYBDwXmmwdwQSzWwsMBK43sxmAS2BYjPLd/ffl30Cd58NzA6m4IqYWZK7ByqeWXtE25qjbb2gNUeLmlhzMKG/HOhjZj2ALcCNwE1f3unu+yh5dQLAzN4D7nH3JOCcMtunAQfLB76IiNScCs/pu3shMBlYAqwC/uXuaWY2vfRoXkREIkRQDdfcfTGwuNy2nx1n7vnH2T7tJGs7VVVymijCRNuao229oDVHi2pfs7l7xbNERKRWUBsGEZEoEpGhX1FbCDNrYGYvld7/qZnF1XyVVSuINf/UzNLNLNnM3jazoD6+Fc4q2/4jEgWzZjP7Vun/dZqZ/bOma6xqQXxvx5rZu2b2een39xWhqLOqmNlcM9tpZqnHud/M7Lel/x7JZja0Sg
tw94j6AmKAdUBPoD6wEogvN+dHwDOlt28EXgp13TWw5guAxqW3fxgNay6d1wxYBnwCBEJddw38P/cBPgdalY7bh7ruGljzbOCHpbfjgQ2hrruSaz4XGAqkHuf+KyhpW2PAKODTqnz+SDzSD6YtxDjgudLb84CL7ERXh4W/Ctfs7u+6e17p8BNKrqeIZJVq/xGhglnzD4Cn3X0PgLvvrOEaq1owa3ageentFnz9OqGI4+7LgNwTTBkHPO8lPgFamlmnqnr+SAz9CttClJ3jJR853Qe0qZHqqkcway7r+0R+g7tTbv8RwYL5f+4L9DWz/5jZJ2Y2psaqqx7BrHka8F0zy6bkU4T/UzOlhczJ/ryflEj8G7knbAtxEnMiSdDrMbPvAgHgvGqtqPpVpv1HpArm/7kuJad4zqfkt7kPzGyQu++t5tqqSzBr/g7wV3f/lZmNBv5Wuubi6i8vJKo1vyLxSL+ithBfm2NmdSn5lfBEv06Fu2DWjJldDDwAjHX3IzVUW3U5mfYfGyg595kY4W/mBvu9/Zq7F7j7eiCDkheBSBXMmr8P/AvA3T8GGlKmC0AtFNTP+6mKxND/qi2EmdWn5I3axHJzEoFbS29fD7zjpe+QRKgK11x6quNPlAR+pJ/nhQrW7O773L2tu8e5exwl72OM9ZL2H5EqmO/tVyl50x4za0vJ6Z6sGq2yagWz5k3ARQBmNoCS0M+p0SprViIwofRTPKOAfe6+raoePOJO77h7oZl92RYiBpjrpW0hgCR3TwT+QsmvgJmUHOHfGLqKKy/INf8CaAq8XPqe9SZ3j9g2GUGuuVYJcs1LgEvNLB0oAu51992hq7pyglzz/wFzzOxuSk5zTIzkgzgze4GS03NtS9+neBioB+Duz1DyvsUVQCaQB3yvSp8/gv/tRETkJEXi6R0RETlFCn0RkSii0BcRiSIKfRGRKKLQFxGJIgp9EZEootAXEYkiCn0RkSjy/wDZpjw/Gjo3LQAAAABJRU5ErkJggg==\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"#导入所需的库\n",
"import numpy as np\n",
"import time\n",
"import random \n",
"from scipy.special import comb\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"\n",
"\n",
"# Load a comma-separated data set (the Iris data set is used as the example here).\n",
"def load_data(file):\n",
"    '''\n",
"    Read a comma-separated data file whose last column is the class label.\n",
"\n",
"    INPUT:\n",
"    file - (str) path to the data file\n",
"\n",
"    OUTPUT:\n",
"    Xarray - (array) feature matrix, one row per sample\n",
"    Ylist - (list) class label (str) of each sample, in file order\n",
"\n",
"    '''\n",
"    Xlist = []  # feature vector of every sample\n",
"    Ylist = []  # class label of every sample\n",
"    # The context manager closes the file even if a row fails to parse.\n",
"    with open(file) as fr:\n",
"        for line in fr:\n",
"            line = line.strip()\n",
"            if not line:  # skip blank lines, e.g. an empty line at the end of the file\n",
"                continue\n",
"            cur = line.split(',')\n",
"            # Strip the label so a trailing newline cannot turn one class into two.\n",
"            Ylist.append(cur[-1].strip())\n",
"            Xlist.append([float(x) for x in cur[:-1]])  # all but the last field are features\n",
"    Xarray = np.array(Xlist)  # array form is convenient for the numeric steps\n",
"    print('Data shape:', Xarray.shape)\n",
"    print('Length of labels:', len(Ylist))\n",
"    return Xarray, Ylist\n",
"\n",
"\n",
"# Min-max scale every feature column into the range [0, 1].\n",
"# Scaling matters for distance-based methods: without it, features with a large numeric\n",
"# range dominate the distance, so all features are brought to the same scale.\n",
"def Normalize(Xarray):\n",
"    '''\n",
"    INPUT:\n",
"    Xarray - (array) 2-D feature matrix; it is modified in place\n",
"\n",
"    OUTPUT:\n",
"    Xarray - (array) the same array object after min-max scaling\n",
"\n",
"    '''\n",
"    for f in range(Xarray.shape[1]):\n",
"        column = Xarray[:, f]\n",
"        minf = np.min(column)\n",
"        span = np.max(column) - minf\n",
"        if span == 0:\n",
"            # Constant feature: avoid 0/0 (NaN) and map the whole column to 0.\n",
"            Xarray[:, f] = 0\n",
"        else:\n",
"            # Column-wise update replaces the per-element Python loop.\n",
"            Xarray[:, f] = (column - minf) / span\n",
"    return Xarray\n",
"\n",
"\n",
"# Euclidean distance between two samples.\n",
"def cal_distance(xi, xj):\n",
"    '''\n",
"    INPUT:\n",
"    xi - (array) feature vector of the i-th sample\n",
"    xj - (array) feature vector of the j-th sample (same length as xi)\n",
"\n",
"    OUTPUT:\n",
"    dist - (float) Euclidean distance between xi and xj\n",
"\n",
"    '''\n",
"    diff = np.asarray(xi, dtype=float) - np.asarray(xj, dtype=float)\n",
"    # Take the square root so the result is the Euclidean distance itself, not its square.\n",
"    # The ordering of distances is unchanged, so nearest-centre selection is unaffected.\n",
"    return float(np.sqrt(np.dot(diff, diff)))\n",
"\n",
"\n",
"# Compute a cluster centre as the per-feature mean of the samples in the cluster.\n",
"def cal_groupcenter(group, Xarray):\n",
"    '''\n",
"    INPUT:\n",
"    group - (list) row indices of the samples that belong to the cluster\n",
"    Xarray - (array) feature matrix\n",
"\n",
"    OUTPUT:\n",
"    center - (array) new cluster centre; all zeros when the cluster is empty\n",
"\n",
"    '''\n",
"    if len(group) == 0:\n",
"        # No members to average: return zeros instead of dividing by zero.\n",
"        return np.zeros(Xarray.shape[1])\n",
"    # Average over the cluster members only. Dividing by the total number of rows\n",
"    # in Xarray would shrink every centre toward the origin.\n",
"    return np.asarray(Xarray[list(group)], dtype=float).mean(axis=0)\n",
"\n",
"\n",
"# Adjusted Rand Index (ARI), a commonly used external measure of clustering quality.\n",
"def Adjusted_Rand_Index(group_dict, Ylist, k):\n",
"    '''\n",
"    INPUT:\n",
"    group_dict - (dict) maps each cluster id 0..k-1 to the list of sample indices in it\n",
"    Ylist - (list) ground-truth class label of each sample\n",
"    k - (int) number of clusters\n",
"\n",
"    OUTPUT:\n",
"    (float) adjusted Rand index of the clustering measured against Ylist\n",
"\n",
"    '''\n",
"    n_samples = len(Ylist)\n",
"    # Map every distinct label to a column index, in order of first appearance.\n",
"    label_index = {label: j for j, label in enumerate(dict.fromkeys(Ylist))}\n",
"    # Contingency table: group_array[i][j] = number of samples in cluster i with class j.\n",
"    # Columns follow the actual number of classes, which need not equal k.\n",
"    group_array = np.zeros((k, max(len(label_index), 1)))\n",
"    for i in range(k):\n",
"        for n in set(group_dict[i]):\n",
"            if 0 <= n < n_samples:\n",
"                group_array[i][label_index[Ylist[n]]] += 1\n",
"    sum_i = group_array.sum(axis=1)  # size of each cluster (length k, not a fixed 3)\n",
"    sum_j = group_array.sum(axis=0)  # size of each true class\n",
"    # comb(x, 2) is 0 for x < 2, so no explicit guard is needed.\n",
"    RI = comb(group_array, 2).sum()  # pairs kept together by both partitions\n",
"    ci = comb(sum_i, 2).sum()  # pairs kept together by the clustering\n",
"    cj = comb(sum_j, 2).sum()  # pairs kept together by the true labels\n",
"    total_pairs = comb(n_samples, 2)\n",
"    if total_pairs == 0:\n",
"        return 1.0  # fewer than two samples: there are no pairs to disagree on\n",
"    E_RI = ci * cj / total_pairs  # expected value of the index\n",
"    max_RI = (ci + cj) / 2  # maximum value of the index\n",
"    if max_RI == E_RI:\n",
"        # Degenerate case, e.g. a single cluster and a single class: the two\n",
"        # partitions agree, so report 1.0 instead of dividing by zero.\n",
"        return 1.0\n",
"    return (RI - E_RI) / (max_RI - E_RI)\n",
"\n",
"\n",
"# k-means clustering.\n",
"def Kmeans(Xarray, k, iters, labels=None, seed=None):\n",
"    '''\n",
"    INPUT:\n",
"    Xarray - (array) feature matrix\n",
"    k - (int) number of clusters\n",
"    iters - (int) number of iterations to run\n",
"    labels - (list) ground-truth labels used for the ARI score; when omitted the\n",
"             module-level Ylist is used\n",
"    seed - (int) optional seed for choosing the initial centres; None leaves the\n",
"           draw unseeded\n",
"\n",
"    OUTPUT:\n",
"    group_dict - (dict) cluster id -> list of sample indices, from the last iteration\n",
"    scores - (list) ARI score after each iteration\n",
"\n",
"    '''\n",
"    if labels is None:\n",
"        labels = Ylist  # fall back to the global label list so existing calls keep working\n",
"    rng = random.Random(seed) if seed is not None else random\n",
"    center_inds = rng.sample(range(Xarray.shape[0]), k)  # k distinct random sample indices\n",
"    centers = [Xarray[ci] for ci in center_inds]  # those samples are the initial centres\n",
"    scores = []  # ARI score of every iteration\n",
"    group_dict = {g: [] for g in range(k)}  # returned as-is when iters == 0\n",
"    for it in range(iters):\n",
"        # Keys are cluster ids, values are the indices of the samples assigned to them.\n",
"        group_dict = {g: [] for g in range(k)}\n",
"        print('{}/{}'.format(it+1, iters))\n",
"        # Assign every sample to its nearest centre.\n",
"        for n in range(Xarray.shape[0]):\n",
"            dists = [cal_distance(Xarray[n], centers[ci]) for ci in range(k)]\n",
"            group_dict[dists.index(min(dists))].append(n)\n",
"        print(group_dict)\n",
"        for g in range(k):\n",
"            if group_dict[g]:  # an empty cluster keeps its previous centre\n",
"                centers[g] = cal_groupcenter(group_dict[g], Xarray)\n",
"        scores.append(Adjusted_Rand_Index(group_dict, labels, k))\n",
"    return group_dict, scores\n",
"\n",
"\n",
"if __name__ == \"__main__\":\n",
"    # A forward-slash path works on Windows and POSIX alike and needs no backslash escapes.\n",
"    Xarray, Ylist = load_data('../iris.data')  # load the data\n",
"    start = time.time()  # start time\n",
"    Xarray = Normalize(Xarray)  # min-max scale the features\n",
"    k = 3  # number of clusters\n",
"    iters = 2  # number of iterations\n",
"    group_dict, scores = Kmeans(Xarray, k, iters)  # run k-means\n",
"    end = time.time()  # end time\n",
"    print('Time:', end-start)\n",
"    # Plot the ARI score per iteration; the x axis is 1-based like the progress log.\n",
"    fig, ax = plt.subplots()\n",
"    ax.plot(range(1, iters + 1), scores)\n",
"    ax.set(title='K-means ARI per iteration', xlabel='Iteration', ylabel='Adjusted Rand Index')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}