data-science-ipython-notebooks/scikit-learn/scikit-learn-pca.ipynb

283 lines
84 KiB
Plaintext
Raw Normal View History

2015-05-31 22:06:35 +08:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# scikit-learn-pca"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Credits: Forked from [PyCon 2015 Scikit-learn Tutorial](https://github.com/jakevdp/sklearn_pycon2015) by Jake VanderPlas"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Dimensionality Reduction: PCA"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"('Reduced dataset shape:', (150, 2))\n",
"Meaning of the 2 components:\n",
"0.362 x sepal length (cm) + -0.082 x sepal width (cm) + 0.857 x petal length (cm) + 0.359 x petal width (cm)\n",
"-0.657 x sepal length (cm) + -0.730 x sepal width (cm) + 0.176 x petal length (cm) + 0.075 x petal width (cm)\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAekAAAFVCAYAAADLxheZAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzs3XdAVFf68PHvnaFKR5oKgiCMiNh77yYauyaxxahpZlPd\n3nez7/52N7vp3ZJoEltiNCb2GmOw9z42UBBEpHem3PePMSAqisAU8Pn85Z17597nMMgz99xznqOo\nqooQQgghHI/G3gEIIYQQ4s4kSQshhBAOSpK0EEII4aAkSQshhBAOSpK0EEII4aAkSQshhBAOyqk2\nb9bpdN2Af+v1+gG3vP4qMAvIuPHSs3q9/mxtriWEEEI8aGqcpHU63W+AqUDBHXZ3BKbp9frDNT2/\nEEII8aCrTXf3eWAcoNxhXyfgDzqdbqdOp/tdLa4hhBBCPLBqnKT1ev1KwFjF7qXAs8BAoLdOpxtR\n0+sIIYQQD6paPZO+i3f0en0egE6nWwt0ANZWdbCqqqqi3OmGXAghhGiQqpX06jxJ63Q6H+CYTqdr\nDRRhuZtecLf3KIpCRkZ+XYdic4GBXvW+HQ2hDdAw2tEQ2gDSDkfSENoADaMdgYFe1TquLpK0CqDT\n6SYBnnq9ft6N59DbgVJgi16v31AH1xFCCCEeKLVK0nq9PgnoeePfS296fSmW59JCCCGEqCEpZiKE\nEEI4KEnSQgghhIOSJC2EEEI4KEnSQgghhIOSJC2EEEI4KEnSQgghhIOSJC2EEEI4KEnSQgghhIOS\nJC2EEEI4KEnSQgghhIOSJC2EEEI4KEnSQgghhIOSJC2EEEI4KEnSQgghhIOqi/WkhXhgXL16haNH\nt+HsrCUgMIa28Z3tHZIQogGTJC1ENRUUFHD06LdMmdoDgIQEPXq9KzpdvJ0jE0I0VNLdLUQ1nTlz\njGEPxZZv9+qlIzn5tB0jEkI0dJKkhaimgIAQEhOvl28XFhaj0bjaMSIhREMn3d1CVFNERCTbth0l\n/epBPDxdOKsvZMyYZ+0dlhCiAZMkLcR9GDhwLMXFxZSVlRLfxtfe4QghGjhJ0kLcJ3d3d9zd3e0d\nRq2pqsrKlWvJzMpj6JBeRESE2zskIcQtJEkL8QBSVZU//fkd9h1qhEbjzoZNX/La38bRJi723m8W\nQtiMDBwT4gGUkZHBvgNFaDSWHoHC4uasXLnTzlEJIW4lSVqIB5BGo0FRzJVeUzSqnaIRQlRFkrQQ\nD6CAgAD692mMaspFVU14eyQydfIwe4clhLiFPJMW4gH1hz/MpscPO0lLy2DI4OcJDAywd0hCiFtI\nkhbiATagfx97hyCEuAvp7hZCCCEclCRpIYQQwkFJkhZCCCEclCRpIYQQwkFJkhZCCCEclCRpIYQQ\nwkFJkhZCCCEclCRpIYQQwkFJkhZCCCEcVK0qjul0um7Av/V6/YBbXh8J/BkwAp/q9fr5tbmOELZw\n7NgB0q+lEBUZR2RktL3DEUKImt9J63S63wDzANdbXncG3gSGAP2AZ3Q6XVBtghTCWq5cuczGjQuZ\nP/8vBIckM2lSc8oMBzhyZI+9QxNCiFp1d58HxgHKLa/HAuf1en2uXq83AD8BfWtxHSGsorS0lKPH\nvmPK1La0im1MXFw4AL1768jMPG3n6IQQohZJWq/Xr8TSnX0rbyD3pu18wKem1xHCWpKTL9GlSzN7\nhyGEEFWyxipYuYDXTdteQPa93hQY6HWvQ+qFhtCOhtAGuHc7nJ2j2LM3gdjYcFxcnDl3LoXo6FD2\n7j1PRIv2DvFzcIQY6oK0w3E0hDZAw2nHvVgjSZ8BonU6nR9QiKWr+7/3elNGRr4VQrGtwECvet+O\nhtAGqG47tKjmSJYv24ObuxNLlhwkKrKMmJj2REbG2P3n8GB9Fo6vIbSjIbQBGkY7qvsloy6StAqg\n0+kmAZ56vX6eTqebA2zE0p2+QK/Xp9XBdYSo
c50790NV+2I2m+nVU2vvcIQQopJaJWm9Xp8E9Lzx\n76U3vb4GWFOryBqIA9t/4Oii5YBK/BOP0nXgQHuHJG6hKAparSRoIYTjsUZ3t7ghUa/n4Mt/JfJq\nIQBHD56m8fImRLWOtXNkQggh6gOpOGZFh7fvoMXVgvLtiPRCju7YYceIhBBC1CeSpK0oVBdDpltF\nN+p1Nw1No1vaMSIhhBD1iSRpK+o6oD/Or0zlZAtfTkb44vzSJLoPHmzvsIQQQtQT8kzayibMeQn1\n1RcBywAlIYQQorokSduAJGchhBA1Id3dQgghhIOSJC2EEEI4KEnSQtRT586dYtOmVSQlXbR3KEII\nK5EkLUQ9lLBrE65up5k0uTklpXs5dCjB3iEJIaxAkrQQ9VBZaRIdO7ZAURR699aRna23d0hCCCuQ\nJC1EPXTrjAFF/icL0SDJf20h6qUQ9PorABw5kkQj93A7x2M9qqpiMpnsHYYQdiHzpIWoh/r3H8mR\nI3s5fCiJZs1a0r17W3uHZBVfr1jHsq/2UVamEB/nzT9ee0lWLBMPFEnSQtRT7dt3s3cIVrN1WwJr\n1+4jYfdpGnm0wcXVj32HSli46BtmzXzU3uEJYTOSpEWDlpFxjX37vsPTS0N+HgwePBk3Nzd7hyXu\nYs+eA7z5zi6MphB8/fuQnXUIrZM7Wq0b168X3PsEQjQgkqRFg7Znzzc8OaMriqJgNBpZ/OVyRoyY\nbu+wxF3s2n0CoymkfNvTK5qSolQ8vbzp3KmLHSMTwvYkSYsGzcdXWz4S2snJCQ8P1c4RVc/169fR\naMoAF3uHYnP+/h6YzTloNJYeD6Mhg9axCg8/FMugQb3tHJ0QtiVJWjRoeXkVo4JNJhOFhY6dpFVV\nZfXqBehauZJ+TSUpEUaMeMLeYdnUtKnjOHPmHY6duIpWa+TxafHMmDHB3mEJYReSpEWD1qnjSBYu\nXIuPtxM5OWYGDHjc3iHdUVpaCsdP7CQlOYlHRraidWvLlKqoqOvs359Aly697Byh7Wi1Wv79rzkU\nFRXh7OyMs7OzvUMSwm4kSYsGrUmTUB4Z8ay9w7ir69czOH36eyZP7gbo+Oqr7YSE+OHv702TJn78\ntDPV3iHaRaNGjewdghB2J8VMhLCh4uJitm5bw/bt6zAYDAAcOvwTEx/tWn7MhAn92LXrBACrVh2i\nffuet53n8OFdbN26jO3bv8NsNtsmeCGEzcmdtBA2UlRUxIYNc3lyRg+MRhNffP4xo0fPxtXFnby8\nInx8PADIzi7gyOHrGAzniYocRkBAYKXz7Nq1meiYYoYMjSEvr4ivv/qU0aOfqlYMBoOBbdu+wdnF\nCKonAwaMvq3EqBDCcUiSFsJGEnZtYuasnjg5OeHi4syUqR3YvGk7ffoMZdnST+jdJwSz2czuXdd5\n9tm/EhTkTUZG/m3nKStLIza2DQDe3o1oHGBCVdVqJdv16z9n0uTWuLu7kpWVz/p1yxk2zDGf0wsh\nJEkLYTuqGY2m4gmTRqPBbDah0WgYN+45Ll48j6IojB0bddeEW1ZWuXu7tMRU7bthX18z7u6uAPj7\ne+HmnliDhgghbEWeSQthIz16DOazTxMwmUyUlRn4fNF+evQYCFhWtYqKiiYysuU9E25MTB+WLdvL\npUtX2bL5GF5esdWOoaSk8hS04iJZuEIIRyZ30kLYiKenN4MGzWD58q1oFA0jRz6Hi8v9FyuJiIgi\nKGg6Fy+eIzS0KwEBAdV+b3h4DxYv/pGwMG8uXsyhXduR9319IYTtSJIWwoY8Pb0ZOmRsrc/TqFEj\n2rRpd9/v0+niadmyNbm5OcS28neYQWMGgwGNRiMrXAlxC0nSQjxgtFot/v6N7R0GYKmw9vd/fMCB\ng1lotSZGj2zDzBkT7R2WEA5DnkkLIexm+fLvSNjjgsHUgpKyliz7OhH92XP2DksIhyF30lZW3akx\nQtyqsLCQ
776bi68fFBSYiWs9hPDwKHuHVafSM3LRaNzLt81mXy5eSEIXE33H4zdv3s6qbzfRsWMs\nT816sGqaiweTJGkr2bV
"text/plain": [
"<matplotlib.figure.Figure at 0x10c246f90>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%matplotlib inline\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn\n",
"from sklearn import datasets\n",
"from sklearn.decomposition import PCA\n",
"\n",
"seaborn.set()\n",
"\n",
"# Load the iris feature matrix (X) and its target labels (y).\n",
"iris = datasets.load_iris()\n",
"X, y = iris.data, iris.target\n",
"\n",
"# Project the iris features onto the first two principal components.\n",
"pca = PCA(n_components=2)\n",
"pca.fit(X)\n",
"X_reduced = pca.transform(X)\n",
"print(\"Reduced dataset shape:\", X_reduced.shape)\n",
"\n",
"# Scatter the projected samples, coloured by target label.\n",
"plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y,\n",
"            cmap='RdYlBu')\n",
"\n",
"# Each row of pca.components_ holds one component's weight per original feature.\n",
"print(\"Meaning of the 2 components:\")\n",
"for component in pca.components_:\n",
"    print(\" + \".join(\"%.3f x %s\" % (value, name)\n",
"                     for value, name in zip(component,\n",
"                                            iris.feature_names)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Dimensionality Reduction: Principal Component Analysis in-depth\n",
"\n",
"Here we'll explore **Principal Component Analysis**, which is an extremely useful linear dimensionality reduction technique. Principal Component Analysis is a very powerful unsupervised method for *dimensionality reduction* in data. It looks for the directions in the data with the most variance.\n",
"\n",
"It is useful for exploring data and for visualizing data and the relationships within it.\n",
"\n",
"It's easiest to visualize by looking at a two-dimensional dataset:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAeAAAAFRCAYAAACsQn5FAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAHl5JREFUeJzt3X+MpVV9x/HP3ZnZWYa5sJPN1N1GAtSNJ2BdIZFApFiw\n0arFRFghWGvARVFru1pYtVaWFrQNjUssVBpCA1ntDzeuSIypKW2EVQuRbJr1R4o9YXXXVLsmq5kt\nIwsDM3v7x51nuPPc58d5ftznPM8871dCwszcuc+Zs3fmc895zvmeTq/XEwAAqNY63w0AAKCNCGAA\nADwggAEA8IAABgDAAwIYAAAPCGAAADwYz/NNxpgJSQ9KOlvSpKRPW2u/Fvf4xcWl3tzcyXwtbJGZ\nmSnRT+noJ3f0lRv6yR195WZ2tttJe0zeEfC7JB231r5e0pslfS7pwePjYzkv0y70kxv6yR195YZ+\nckdflSfXCFjSfklfXv7/dZIWy2kOAADtkCuArbXPSpIxpqt+GH+yzEYBALDW5V6EZYw5S9Kjkr5g\nrd1XXpMAAFj7OnlqQRtjXibpgKQ/tNY+5vAtFJwGALRJ6iKsvAF8t6RrJNmBT7/FWvt8zLf0jh+f\nz3ydtpmd7Yp+Skc/uaOv3NBP7ugrNy6roPPeA/6wpA/n+V4AAEAhDgAAvCCAAQDwgAAGAMADAhgA\nAA8IYAAAPCCAAQDwgAAGAMADAhgAAA8IYAAAPCCAAQDwgAAGAMADAhgAAA8IYAAAPCCAAQDwgAAG\nAMADAhgAAA8IYAAAPCCAAQDwgAAGAMADAhgAAA8IYAAAPCCAAQDwgAAGAMADAhgAAA8IYAAAPCCA\nAQDwgAAGAMADAhgAAA8IYAAAPCCAAQDwgAAGAMADAhgAAA8IYAAAPCCAAQDwgAAGAMADAhgAAA8I\nYAAAPCCAAQDwgAAGAMADAhgAAA8IYAAAPCCAAQDwgAAGAMADAhgAAA8IYAAAPCCAAQDwgAAGAMAD\nAhgAAA8IYAAAPCCAAQDwgAAGAMADAhgAAA8IYAAAPCCAAQDwgAAGAMADAhgAAA8IYAAAPCCAAQDw\ngAAGAMADAhgAAA8IYAAAPCCAAQDwgAAGAMADAhgAAA8IYAAAPCCAAQDwgAAGAMCDwgFsjLnYGPNY\nGY0BAKAtxot8szHmY5L+QNKvymkOAADtUHQEfFjS1ZI6JbQFAIDWKBTA1tqvSFosqS0AALRGoSno\nLGZnu1VdqtHoJzf0kzv6yg395I6+KkdlAXz8+HxVl2qs2dku/eSAfnJHX7mhn9zRV25c3qSUtQ2p\nV9LzAADQCoVHwNbao5JeV7wpAIC22rPvkH54dE6SdN45M9p13YWeWzR6FOIAAHi1Z98hPXV0Tj31\np1OfOjqnW+59XD/5+dqe6iaAAQBeBSPfQXPzC7rnoe97aE11CGAAADwggAEAXp13zszQ52a6k9q5\nfZuH1lSHAAYAeLXrugs1051c+XimO6m7PnSpzt68tvcbE8AAAO92bt+mme5kK0a+gcoKcQAAEOfs\nzV3d9aFLfTejUoyAAQDwgAAGAMADAhgAAA8IYAAAPCCAAQDwgFXQAFATbTyQoM0YAQNADbT1QII2\nI4ABoAbaeiBBmxHAAAB4QAADQA209UCCNiOAAaAG2nogQZsRwABQE208kKDN2IYEADXR1AMJ2D6V\nDwEMABjiGqrB9qlAsH1q5/ZtTJ+nIIABYATqPCpMa1tSqM7Org7VpO1TTRzNV4l7wABQsjoX1XBp\nG3uSq0EAA2iVPfsO6cY7H9WNdz6qPfsOjeQadQ6wstvG9qn8CGAArRE3+jv80xOlPHcQ7L2Yxywu\nnSp8nSpkCVW2T+VHAANojbjR36cffLLQ84aDvc5cwjVrqLJ9Kh8CGAAKigr2KM8tLI64JelcwzVL\nqAbbpxj5ZsMqaACtMbVhXM8+vzoEZ7qTunXH
xZVcf3Gppx13Pqpzt3S1+/qLKrlmlJ3bt63c840L\n16buSW4SAhhAK+zZd2gofDudfgBtfflGHT+ef4XyeefMrNq2Ezx3L2Y++sixed1y7+OamZ7UkWPP\nrDxHVVuVCNd6YAoaQCtETRP3eiplZXLUtO5tKSPcufkF/fjYM7XcqoRqEMAAUILwPdOzN3d1fsSC\npyR12aqEajAFDaD2yqgqFTVNXNaq3XD7goVIu667ULfc+7jm5hcKXwNrDyNgALVWVlWpUe1XTWtf\nMDLudF76npnuZOToeKY7qY3T60deKAT1QAADqLWilZsGC2TMTE+Wvl81rX3Bgqfbrr9o1bWj3hBs\n2TSlI8fmuS/cEgQwgDUrPDr98fKKYx8n9UTtlQ3fN65zCUuUj3vAAGqtyL3bKk7qydK+qHvZbAdq\nLwIYQK2FFzIF927LlnehV1L7gueM2g4cdW7uKBeKoX6YggZQe3lrDbseKrD7vicKLfSKat/g9Hec\n8PQyBxu0S6cXV6qlXL0iVWbaYna2W6gaT1vQT+7oKzmNnm/860cjq1YVGW0nnYqUdI2f/Hx+VZnI\nuoUvryk3s7PdTtpjmIIGsCYNTv+Oj3XUnVpfu6ncqNE4ZSLbgwAGsGbE3XNdXEoei75m66y++/Tx\nVZ/LMt0ddf846n7uoO7UBEHbcgQwgDUhuOcaJ2r180pwdvqj5CCo06aeBwM3fMLS4OKqex76/lAV\nrE5H2jjNwioQwABilFH+sejzZ2mD65m8g8+9Eti9/ii505GmT5tIDMdw0IdPWJJeCvsghBeXTkmS\nxsfW1fK+LvwggAEMCYdM1JaZUT9/2W0ITynHnY40PrYu8fmzBD33c5GEbUgAhoy6IpPL82dtQ9SW\no0DV23nYuwsXBDCA2pibX8h9AEF4D213aiJx77DrHmGX7wsftMDeXbgggAEMiRtNzkxPRn6+rOeX\nXppqPmfLcIClBeRgQYybr71gqPbyoLxFL8LfJ0nnbj6j9EMesPYRwECFBk/mqfNRc1EhI/UPMyjj\ndJ645w/MzS/oxK9eyByQe/Yd0tz8gvNIeuP0+sj/z/J9kt9DHtBcVMKqESrMuGlqP0VtkwlGTKP6\nox3uq6RVxeGvXXP5Vt2+92Dk85ZRjzmo+BR3WH3QN65Vof74b741tCK505Hed+X5uuRVm4ceX+Tf\nI67K1ajqVKcZ9Yr1QU39/auaSyUsArhGeGG7aWo/lflHO+0P7uD+1qnJcZ18fjH22ju3b9P+A4cj\nwygpHMsKmiJBONgPSX/Jgr+Eg31V5N9jlAGcNUyrfmPX1N+/qrkEMFPQQMOEz7gNHxyw6uu9/j7V\nuHAKVhXHrTgeHxv+G1LFfc4tm6acwnewH5KUfcB93gVcadL+baNwhnBzEcBARcr6o532BzdrQYok\n3an1Tvdho+5tu9zvjqtelTd4XAR9VeTfY1SnFhGm7UIAAxWp41FzQeDErUqem1/Qxun1iSt8o0Zt\nN33mMaeRXFKIFg2eTuelqecoRf898h6RWLZRjcYxegQwUKGd27etTOvm3fOa9gc3aYtP+HuCwNl1\n3YWR082SdOTY/ErbB8MpGOFGjWCjDj8IAnVwZFxkBUrUzzm1YbwfvMuLr9L6KgjRTWduyBxYQZWr\nMt9E5QnTOr6xgxsWYdUIixvcNLmfylowk3bG7eDXOx2tnHXbnZrQ+Fj/fXf4mjvufDTxmoPXSTv4\noAzhfolanDT4c0r9EW944ZLLecCjfE1lXVTl0t6wKs8QbvLvX5VYBd0wvLDdNLmfylo9m/YHN/j6\nunUdbX/9b2j/gR/FPjatbeE2VhW+4VOLot64XHP5K/Tg1384NOIeDG+XcBrVayrPG64qwzSPJv/+\nVYkAbhhe2G6a3E9V7x/N0ldJwZq0XakMwSlEUvSJQXH9NniEYNTX7v/oFU7XH9Vrqm77hcvQ5N+/\nKrkEMKch
ARWKOqS9Lgtmdl13oW76zGOx92/3HzhceIV1VGAW2bMaF77B18o8wQkoG4uwgArVccHM\n4KKopEALVjXn1Z2a0P0
"text/plain": [
"<matplotlib.figure.Figure at 0x10c39ddd0>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Fix the global NumPy seed so the synthetic data are reproducible.\n",
"np.random.seed(1)\n",
"\n",
"# Draw a random 2x2 mixing matrix first, then 200 standard-normal 2-D\n",
"# samples (same draw order as a single nested expression), and mix them\n",
"# so the two coordinates become correlated.\n",
"mixing = np.random.random(size=(2, 2))\n",
"samples = np.random.normal(size=(2, 200))\n",
"X = np.dot(mixing, samples).T\n",
"\n",
"plt.plot(X[:, 0], X[:, 1], 'o')\n",
"plt.axis('equal');"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can see that there is a definite trend in the data. What PCA seeks to do is to find the **Principal Axes** in the data, and explain how important those axes are in describing the data distribution:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 0.75871884 0.01838551]\n",
"[[ 0.94446029 0.32862557]\n",
" [ 0.32862557 -0.94446029]]\n"
]
}
],
"source": [
"# PCA is already imported in the first code cell; fit() returns the\n",
"# estimator itself, so construction and fitting can be chained.\n",
"pca = PCA(n_components=2).fit(X)\n",
"\n",
"# Variance of the data along each principal axis ...\n",
"print(pca.explained_variance_)\n",
"# ... and the direction of each axis (one row per component).\n",
"print(pca.components_)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAeAAAAFRCAYAAACsQn5FAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xt0FFd+J/BvVfX7oQdSC+RGYJ6FefoF2AZssI2NH+NB\nDGSSzGTy3MzM7jmbTXZPsptk81eyJ3v2zGazm2Rzkp1MstkkM2OQwB57sBk/sC2PwRgzYMAFxhgJ\nEFIL9Oz3o/aPUhXdrW6p1eru6qa/nxxPkFTddVUIfeveuvd3BVVVQURERJUlmt0AIiKiesQAJiIi\nMgEDmIiIyAQMYCIiIhMwgImIiEzAACYiIjKBpZgXybJsBfB3ABYDsAP4Y0VRXs53fCKRVIeHQ8W1\nsI40N7vA6zQzXqfC8VoVhtepcLxWhfH5vMJMxxTbA/4agICiKI8C2AXgL6Y72GKRijxNfeF1Kgyv\nU+F4rQrD61Q4XqvSKaoHDOBFAPsn/ywCSJSmOURERPWhqABWFCUIALIse6GF8R+UslFERER3uqIn\nYcmy3AHgTQD/V1GU75euSURERHc+oZha0LIszwfwNoB/rSjKWwW8hAWniYionsw4CavYAP5zAPsA\nKGmffkZRlEiel6iBwPisz1NvfD4veJ1mxutUOF6rwvA6FY7XqjCFzIIu9hnwbwH4rWJeS0RERCzE\nQUREZAoGMBERkQkYwERERCZgABMREZmAAUxERGQCBjAREZEJGMBEREQmYAATERGZgAFMRERkAgYw\nERGRCRjAREREJmAAExERmYABTEREZAIGMBERkQkYwERERCZgABMREZmAAUxERGQCBjAREZEJGMBE\nREQmYAATERGZgAFMRERkAgYwERGRCRjAREREJmAAExERmYABTEREZAIGMBERkQkYwERERCZgABMR\nEZmAAUxERGQCBjAREZEJGMBEREQmYAATERGZgAFMRERkAgYwERGRCRjAREREJmAAExERmYABTERE\nZAIGMBERkQkYwERERCZgABMREZmAAUxERGQCBjAREZEJGMBEREQmYAATERGZgAFMRERkAgYwERGR\nCRjAREREJmAAExERmYABTEREZAIGMBERkQkYwERERCZgABMREZmAAUxERGQCBjAREZEJGMBEREQm\nYAATERGZgAFMRERkAgYwERGRCRjAREREJmAAExERmYABTEREZAIGMBERkQkYwERERCZgABMREZmA\nAUxERGQCBjAREZEJGMBEREQmmHMAy7K8WZblt0rRGCIionphmcuLZVn+XQBfBzBRmuYQERHVh7n2\ngD8DsAeAUIK2EBER1Y05BbCiKF0AEiVqCxERUd2Y0xD0bPh83kqdqqbxOhWG16lwvFaF4XUq3J1+\nrc6ePYuDBw/CZrPht3/7t2GxlCcqKxbAgcB4pU5Vs3w+L69TAXidCsdrVRhep8LdqdfqypUvcPDg\nAXR17cf582eNz8fjKv7Vv/r2rN+vkJuUUgWwWqL3ISIiqojBwUG89FIXurr248SJ4zmPWbDgrrKd\nf84BrCjKFwAemXtTiIioXvWc6cfQaAQA0NrowJZ17WU5z+joCF555WV0de3He+8dRSqVmnKMw+HA\nU089g6997RvYseOJsrQDqOAQNBERUS49Z/oRGAkbHwdGwnjteC82r56PJo99zu8fCoXw+us/RlfX\nfrz55hHEYrEpx1gsFmzf/jg6O/fimWeeg8dT/ufcDGAiIjKV3vNNF44mcOzcAJ7etKio94zFYnj7\n7TfQ1bUfhw+/ilAoOOUYQRDw8MNb0Nm5F88//2W0tLQUda5iMYCJiOiOkEwm8cEH76Oraz9+9KOD\nGB4eznncvffeh87Offjylztx113+CrfyNgYwERGZqrXRkTEEDQBOuwWbV8+f8bWqquLUqZPo6tqP\nQ4e6cONGf87jVqxYic7OvdizZy+WLl1eknbP
FQOYiIhMtWVdO1473otwVKvr5LRbZhx6VpRP0d39\nIrq7D+Dy5c9zHrNwYQd27/4KOjv3Yu3adRCE6irayAAmIiLTbV49H8fODRh/zqW394qxVvfcuU9y\nHtPa2ooXXuhEZ+c+bNy4CaJYvZv+MYCJiMh0TR57zl5vIWt1vd4GPPfcl9DZuRfbtj1WtspVpVYb\nrSQioroxOjqCV1/9Ebq6XsS77+Zfq7tz5y7s2bMPTzyxEw6Hw4SWzg0DmIiITBcKhXDkyGF0de3H\nG2+8nnOtriRJGWt1vd4GE1paOgxgIiIyRTwez1irGwzm3lpeX6v7pS/trvha3XJiABMRUcWkUin8\n9Kc9M67V3bDhPnR27sXu3XtMXatbTgxgIiIqK1VV8bOffYyurv04ePBA3rW6y5evwJ49+9DZ+RUs\nW7aiwq2sPAYwEVGVqNSGBJVy4YKCrq4X0d29P+9aXb9/IXbv/gr27NmLtWvXV91a3XJiABMRVYFy\nb0hQKX19vejuPoDu7v04e/ZMzmNaW1vxpS/tRmfnPmzatLmq1+qWEwOYiKgKlGNDgkoZHBzEyy93\no6trPz788FjOYzwer7FW99FHt9fMWt1y4hUgIqJZGxsbndxXN/9aXbvdjqeeegadnXvx5JNP1eRa\n3XJiABMRVYG5bEhQKeFwGC++eBjf+97/nXat7mOP7UBn5148++zzNb9Wt5wYwEREVaCYDQkqIR6P\n4+jRN9HVtR8//vEredfqPvTQI8Za3dbW1gq3sjYxgImIqkQhGxJUQiqVythX99atWzmPW7/+XmOt\nrt+/sMKtrH0MYCKiKpFvQ4JKSF+re+hQF/r7r+c8TpZlvPDCHnR27sXy5dpa3Z4z/Thx+TKAO2P5\nVKUwgImI6pi+VvfgwQP4/PNLOY9pbWvHV/f9HPbs2YsdO7ZgaOj2MPSdsnzKDAxgIqIyqOaiGt0/\nOYHXXj2E4+8dxtUvlJzHeBqa8cDDT2LT1mewbNW9cDtt6Fgyf0qhjFpePmU2BjARUYlVY68wEAjg\npZe68Q//9M/49JOTOY/xeLx49tnn0b5yG+R1G2GxWI2v6aG6YgknWJUKA5iI6koleqZDoxH0DU4g\nFNFmNLscFnS0eSreKxwbG83YVzeZTE45xmK1Yf0Dj2Lrjufw77/1i3A6nTj03mWoqlrQOWph+VS1\nYgATUd3I1zN9dtuykry3HuynLgZgt0nG14KROC5dG8Uyf+OczzOTcDicsa9uNBqdcowoSrhnw0PY\ntHUX7t20A06XB067BU6nE8DsQrVal0/VAgYwEdWNfM8r3/n4KrbMoceWHezReAqhSAJetxUWqfx1\njuPxON555y0cOPDitGt1N29+GPc+9BRWbngM3sZ5xuezw3W2oVoty6dqDQOYiGiOsoO90W3DrfEI\nxoNxSJKAZErFPK8DN26FSnbOVCqFY8d+iq6u/Xj55e68a3XXrdtgrNVduLADAAoK19mEqpnLp2oZ\nA5iI6sbweAQDt7Seqv5c1mm34NH7FiIZjZfsPC6HBfGkFUMjEdghweu2YjwcQyKZwnd+cArrl87D\nzo2zDyxVVXH69Cljre7169dyHrds2XJ0du7Fnj37jLW66QoJV4Zq+TGAiagu9JzpR5PHjpujEcST\nKQQjcfQOjOMbu1ZhXoMDgUDxAZz9zLSjzQN1QIUAAZIkYCIch8epzSiOJ5I4fekmUiogioIxUWu6\nCWEXL14w9tXNt1b3rrv8xr6669ZtmHZfXYZrdWAAE1Fd0IeJ/T4PrgW0Z6Rtza6SLK3J9cz0G7tW\n4buvnAdUFU7b1F+1F/pGEIsnjYlZ2UuVrl7tM/bV/eST0znPO2/ePLzwQif27NmHTZseqtt9dWsV\nA5iI6orDJpVlNnL2sG6Tx46HVs9HYCQMpXcEKrRlPVZJhN/nxpUbE7BImb3UwcFB/Mkr38enJ9/A\nsWM/zXke
t9uDZ599Hnv27MWjj+6A1WrNeRxVPwYwEVW9UqzdLed61ez26cU29J6xy2FBMBKHVRKN\n8LdYRPhb3QiHJvDxsTf
"text/plain": [
"<matplotlib.figure.Figure at 0x10c47f490>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.plot(X[:, 0], X[:, 1], 'o', alpha=0.5)\n",
"# Draw each principal axis from the fitted data mean, scaled to three\n",
"# standard deviations along that axis. sqrt(explained_variance_) is a\n",
"# standard deviation in data units; explained_variance_ratio_ is only a\n",
"# unitless fraction and would not match the scale of the scatter.\n",
"for variance, direction in zip(pca.explained_variance_, pca.components_):\n",
"    tip = pca.mean_ + direction * 3 * np.sqrt(variance)\n",
"    plt.plot([pca.mean_[0], tip[0]], [pca.mean_[1], tip[1]], '-k', lw=3)\n",
"plt.axis('equal');"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Notice that one vector is longer than the other. In a sense, this tells us that that direction in the data is somehow more \"important\" than the other direction.\n",
"The explained variance quantifies this measure of \"importance\" in direction.\n",
"\n",
"Another way to think of it is that the second principal component could be **completely ignored** without much loss of information! Let's see what our data look like if we only keep 95% of the variance:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(200, 2)\n",
"(200, 1)\n"
]
}
],
"source": [
"# A fractional n_components asks PCA to keep as many components as are\n",
"# needed to retain that share of the variance (here 95%).\n",
"clf = PCA(n_components=0.95)\n",
"X_trans = clf.fit_transform(X)\n",
"\n",
"# Compare the shape before and after the reduction.\n",
"for arr in (X, X_trans):\n",
"    print(arr.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
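"Aside: PCA is a linear method, so it does not work well when the data follow a nonlinear structure such as a loop; in that case a manifold learning method such as Isomap is a better choice. For very large datasets, randomized PCA can be used."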
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"By specifying that we are willing to throw away up to 5% of the variance, the data is now compressed by 50% (from two dimensions down to one)! Let's see what the data look like after this compression:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAeAAAAFRCAYAAACsQn5FAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xl0ZFd9J/Dv22tRlfaWe3EvttXPGyaOGcNAgCQ4noRJ\nwpZM0hMIhnCSEDYHBhTAaaBjAzbjgOPgk2UgQCZxwhKICVlgIMQzzoTMiY0NjnkY293udnerJZVU\nKtXy1jt/vHqlKqmkKlWV9ErS9+PT57RKpaqrZ7W+726/KwkhQERERFtLjrsBREREuxEDmIiIKAYM\nYCIiohgwgImIiGLAACYiIooBA5iIiCgGaidfZJqmBuCTAA4BMADcalnWl9d6vuf5Yn6+1FkLd5Hh\n4RR4nVrjdWofr1V7eJ3ax2vVnvHxjNTqOZ32gH8JwIxlWS8C8JMAfn+9J6uq0uHb7C68Tu3hdWof\nr1V7eJ3ax2vVOx31gAF8DsDnq3+XAXi9aQ4REdHu0FEAW5ZVBADTNDMIw/i9vWwUERHRTtfxIizT\nNC8G8A0An7Es6y961yQiIqKdT+qkFrRpmhMAvgngNyzL+sc2voQFp4mIaDdpuQir0wC+C8DPA7Dq\nHv4py7Iqa3yJmJkpbPh9dpvx8Qx4nVrjdWofr1V7eJ3ax2vVnnZWQXc6B/w2AG/r5GuJiIiIhTiI\niIhiwQAmIiKKAQOYiIgoBgxgIiKiGDCAiYiIYsAAJiIiigEDmIiIKAYMYCIiohgwgImIiGLAACYi\nIooBA5iIiCgGDGAiIqIYMICJiIhiwAAmIiKKAQOYiIgoBgxgIiKiGDCAiYiIYsAAJiIiigEDmIiI\nKAYMYCIiohgwgImIiGLAACYiIooBA5iIiCgGDGAiIqIYMICJiIhiwAAmIiKKAQOYiIgoBgxgIiKi\nGDCAiYiIYsAAJiIiigEDmIiIKAYMYCIiohgwgImIiGLAACYiIooBA5iIiCgGDGAiIqIYMICJiIhi\nwAAmIiKKAQOYiIgoBgxgIiKiGDCAiYiIYsAAJiIiigEDmIiIKAYMYCIiohgwgImIiGLAACYiIooB\nA5iIiCgGDGAiIqIYMICJiIhiwAAmIiKKAQOYiIgoBgxgIiKiGDCAiYiIYsAAJiIiigEDmIiIKAYM\nYCIiohgwgImIiGLAACYiIooBA5iIiCgGDGAiIqIYMICJiIhiwAAmIiKKAQOYiIgoBgxgIiKiGDCA\niYiIYsAAJiIiigEDmIiIKAZdB7Bpms81TfMfe9EYIiKi3ULt5otN03wXgFcDWOpNc4iIiHaHbnvA\nPwDwSgBSD9pCRES0a3QVwJZl/RUAr0dtISIi2jW6GoLeiPHxzFa91bbG69QeXqf28Vq1h9epfbxW\nvbFlATwzU9iqt9q2xsczvE5t4HVqH69Ve3id2sdr1Z52blJ6tQ1J9Oh1iIiIdoWue8CWZZ0E8Pzu\nm0JERLvVdK6EiuMDABK6gomRVMwt2nwsxEFERLGazpVQdjyI6n9lx8OZC0uwXT/upm0qBjAREcUq\n6vnW84IAM/PlGFqzdRjAREREMWAAExFRrBK6suoxVZYxPpyMoTVbhwFMRESxmhhJQZWX40iVZRzY\nMwBDWx3MOwkDmIiIYjc+nIQqy7ui5xvZskIcREREazE0BQf2DMTdjC3FACYior4yNWXgoYfC4edr\nr/Vx++12zC3aHByCJiKivjE1ZeDBBxUIAQgBPPiggmPHknj88Z0XVzvvOyIiom0r6vnWm52VcPy4\nEUNrNhcDmIiIKAYMYCIi6hvXXru6KtbYmMCJEztvHpgBTETUJ6ZzJZw6X8Cp8wVM50pxNycWt99u\nY2xs+YC9sTGBe+8tY3IyiLFVm4MBTETUB3brgQTNnDgRhvBO7flGuA2JiKgPrHcgwW7bHzs5GeDe\ne3f2QQwAA5iIiDq0W/brbhYOQRMR9YHtdiDB
btqvu1l4pYiI+sB2O5BgN+3X3SwcgiYi6hPjw8na\nIfRx93w5vLz52AMmIuoT0YEEcfd82xlert+v63oBHNdHOuPire9YiKPJ2xIDmIiIGjz0kFILVcf1\n4XrBquHlaL+u6wUIhMDwSICP3TOHvQfsXbt9aqMYwEREm6Cfi2q85WYFL/4xAy/+MQNvuXl1TzsK\nVQFAAAiEgOP6CIRoeN6JEzaGhn2MjAS4+Z352uPR9ilaHwOYiKjH+rmoxltuVvDwt9Xa8PLD31bx\nildpePSx5XC94ipn1dcNjwR4883zDY9NTga4654cPnbPHA4f8Ta97TsNA5iIdpWt6JlWHB+5fAXn\nZks4N1tCLl/pm17hIw+vXns7Nyfjllv02sdT781jZGS59ONIdXj5yCWrbyC22/apfsIAJqJdY62e\nacXpvvdWH+xPPLOAiuvX3qfi+riQK8Px4u8BtyOhK7j5nWEIR8PLa4Xqdts+1U+4DYmIdo21yj2e\nnyshrUodv24U7BHHDVByKsimdKhKGE5CEmt9+Za65tkeHv5246/+0dEAt97qAAivwcRICu6lS/jY\nPXMAlkN1Lf20fWo7YQ+YiKhLK4N9cMAABLBYcpBfcpArVBAEAoWiG1MLl939MR+jo8vDy6OjAb74\nBRdXXdF4AzI+nIQqy20NJ/fL9qnthgFMRLvGUtnBubkizs0VMbdYARD27i4aTfX0fQxNRjato1jx\n4IsA2ZSOQtFFvuTgXx+bxukLhZ6+30bdequD0dGgrue7GkN183EImoh2helcCemkhmLZgxcEsB0f\ncwsVXH3pKBK6im4iMaErDUPQI4MJCAHsG0lBUWQslVwMpDQAgOcHODtXhAQJAgJBsPwaEyOrbwSi\nilRnzoQ91AMHRNeVqa66QsIXvxD1xjsfeqfusAdMRLtCNEw8lDVqQ6uZAb0nK5ObLUS6+tJRGJoK\nVZaRMFb3daYXwhXS621ViipSnT4toVgM/zzxhIx//mcefLATsAdMRLuKrsrYM9L7hUIrFyIZmoJD\nF2VQdrxa0AKAokgYyRiYzVegSI0B6gUB3v4ODY9/LwEAOHNGwv79AqXSci/V84CzZ2UkEgGOHzd2\nxbm5OxUDmIj63nSuVOvBrjVU28rKYWKgd/tVV7YvmjOdGEnhzIUlGJqMiutDUSRMDIdtVyQZQ1kD\nd9w2iEe/G+7BdWxA1wE9HK1GsSjhySclCAFIHCnecRjARNTXVm7xiYZqo15mu6Iw9KqTrq221vSq\nfVHAT+dKyA5otff+0z/ah6/+nYFyWUIyKTCx18czZ1QoKjAx4cEwgEQCqFRk+D6gVn9bqyqwb1+A\nsTGBEyd4QtF2xgAmor621t7dmflyWwFa3zuVZdTmanu1X7VV+6LVxB/7yAj+8i802HbYlZWksD2a\nJlAuSzj9dFge0veA6WkVFx/0MLHXx+mnJSRUIFosdcklYfhy6Hn7YwAT0Y61snfqB4AqSxvuPXcq\nWsH8ne+Evdh6QgC+DwSBBF0XCHwpbJ/SWLBj8qgLSUKtvek02PPdIRjARNTXupm77bb33E37jr1q\nD55+WkalEobtWoQAXFeCYQCGIaBpAsMjYbujGsy9Gi6n/sIAJqK+tllztyt1utArat8HfyeDr381\niXJZAiDBdcPhZSHaWz2lqsBR08PUb8/jYx8ZBIB1azDT9scAJqK+12mt4XZ7z2dnljpe6DU1ZeC+\n+9JYWAiDVtdR6/UGQevw1XSBdAq47roAJ064SA4Gbddgpu2NAUxEfS9ayLRR7faeS/bq05DWGqp+\n0YtSOHUqXMhlGAKHDoX7dKOKVnbd9Ox6Q8+SBAwOBbjqahd3fNjH5GT4ArbLgw12CwYwEe1I0ZCy\n4/kolT2MDia6DrT68AWAfF7Cd78rQanrJEehK0nh32UZtXCOKApw5NIKbv/vBVx/nd7Qy+70ZoO2\nHwYwEe0Y
UejOLYZjwCODCWiqjMGMvu7XpQwV8yseazZUXR++kSAI/9QHrSSFf1RNQJIE5OqWI0kW\neMu7zuP5P1LB2FACl+w
"text/plain": [
"<matplotlib.figure.Figure at 0x10c87da50>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Map the reduced data back into the original 2-D feature space.\n",
"X_new = clf.inverse_transform(X_trans)\n",
"\n",
"# Faint markers: original samples. Solid blue markers: reconstruction.\n",
"for points, fmt, opacity in ((X, 'o', 0.2), (X_new, 'ob', 0.8)):\n",
"    plt.plot(points[:, 0], points[:, 1], fmt, alpha=opacity)\n",
"plt.axis('equal');"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The light points are the original data, while the dark points are the projected version. We see that after truncating 5% of the variance of this dataset and then reprojecting it, the \"most important\" features of the data are maintained, and we've compressed the data by 50%!\n",
"\n",
"This is the sense in which \"dimensionality reduction\" works: if you can approximate a data set in a lower dimension, you can often have an easier time visualizing it or fitting complicated models to the data."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 0
}