mirror of
https://github.com/donnemartin/data-science-ipython-notebooks.git
synced 2024-03-22 13:30:56 +08:00
283 lines
84 KiB
Python
283 lines
84 KiB
Python
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"# scikit-learn-pca"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"Credits: Forked from [PyCon 2015 Scikit-learn Tutorial](https://github.com/jakevdp/sklearn_pycon2015) by Jake VanderPlas"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"## Dimensionality Reduction: PCA"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 1,
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"('Reduced dataset shape:', (150, 2))\n",
|
||
|
"Meaning of the 2 components:\n",
|
||
|
"0.362 x sepal length (cm) + -0.082 x sepal width (cm) + 0.857 x petal length (cm) + 0.359 x petal width (cm)\n",
|
||
|
"-0.657 x sepal length (cm) + -0.730 x sepal width (cm) + 0.176 x petal length (cm) + 0.075 x petal width (cm)\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAekAAAFVCAYAAADLxheZAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzs3XdAVFf68PHvnaFKR5oKgiCMiNh77yYauyaxxahpZlPd\n3nez7/52N7vp3ZJoEltiNCb2GmOw9z42UBBEpHem3PePMSAqisAU8Pn85Z17597nMMgz99xznqOo\nqooQQgghHI/G3gEIIYQQ4s4kSQshhBAOSpK0EEII4aAkSQshhBAOSpK0EEII4aAkSQshhBAOyqk2\nb9bpdN2Af+v1+gG3vP4qMAvIuPHSs3q9/mxtriWEEEI8aGqcpHU63W+AqUDBHXZ3BKbp9frDNT2/\nEEII8aCrTXf3eWAcoNxhXyfgDzqdbqdOp/tdLa4hhBBCPLBqnKT1ev1KwFjF7qXAs8BAoLdOpxtR\n0+sIIYQQD6paPZO+i3f0en0egE6nWwt0ANZWdbCqqqqi3OmGXAghhGiQqpX06jxJ63Q6H+CYTqdr\nDRRhuZtecLf3KIpCRkZ+XYdic4GBXvW+HQ2hDdAw2tEQ2gDSDkfSENoADaMdgYFe1TquLpK0CqDT\n6SYBnnq9ft6N59DbgVJgi16v31AH1xFCCCEeKLVK0nq9PgnoeePfS296fSmW59JCCCGEqCEpZiKE\nEEI4KEnSQgghhIOSJC2EEEI4KEnSQgghhIOSJC2EEEI4KEnSQgghhIOSJC2EEEI4KEnSQgghhIOS\nJC2EEEI4KEnSQgghhIOSJC2EEEI4KEnSQgghhIOSJC2EEEI4KEnSQgghhIOqi/WkhXhgXL16haNH\nt+HsrCUgMIa28Z3tHZIQogGTJC1ENRUUFHD06LdMmdoDgIQEPXq9KzpdvJ0jE0I0VNLdLUQ1nTlz\njGEPxZZv9+qlIzn5tB0jEkI0dJKkhaimgIAQEhOvl28XFhaj0bjaMSIhREMn3d1CVFNERCTbth0l\n/epBPDxdOKsvZMyYZ+0dlhCiAZMkLcR9GDhwLMXFxZSVlRLfxtfe4QghGjhJ0kLcJ3d3d9zd3e0d\nRq2pqsrKlWvJzMpj6JBeRESE2zskIcQtJEkL8QBSVZU//fkd9h1qhEbjzoZNX/La38bRJi723m8W\nQtiMDBwT4gGUkZHBvgNFaDSWHoHC4uasXLnTzlEJIW4lSVqIB5BGo0FRzJVeUzSqnaIRQlRFkrQQ\nD6CAgAD692mMaspFVU14eyQydfIwe4clhLiFPJMW4gH1hz/MpscPO0lLy2DI4OcJDAywd0hCiFtI\nkhbiATagfx97hyCEuAvp7hZCCCEclCRpIYQQwkFJkhZCCCEclCRpIYQQwkFJkhZCCCEclCRpIYQQ\nwkFJkhZCCCEclCRpIYQQwkFJkhZCCCEcVK0qjul0um7Av/V6/YBbXh8J/BkwAp/q9fr5tbmOELZw\n7NgB0q+lEBUZR2RktL3DEUKImt9J63S63wDzANdbXncG3gSGAP2AZ3Q6XVBtghTCWq5cuczGjQuZ\nP/8vBIckM2lSc8oMBzhyZI+9QxNCiFp1d58HxgHKLa/HAuf1en2uXq83AD8BfWtxHSGsorS0lKPH\nvmPK1La0im1MXFw4AL1768jMPG3n6IQQohZJWq/Xr8TSnX0rbyD3pu18wKem1xHCWpKTL9GlSzN7\nhyGEEFWyxipYuYDXTdteQPa93hQY6HWvQ+qFhtCOhtAGuHc7nJ2j2LM3gdjYcFxcnDl3LoXo6FD2\n7j1PRIv2DvFzcIQY6oK0w3E0hDZAw2nHvVgjSZ8BonU6nR9QiKWr+7/3elNGRr4VQrGtwECvet+O\nhtAGqG47tKjmSJYv24ObuxNLlhwkKrKMmJj2REbG2P3n8GB9Fo6vIbSjIbQBGkY7qvsloy6StAqg\n0+kmAZ56vX6eTqebA2zE0p2+QK/Xp9XBdYSo
c50790NV+2I2m+nVU2vvcIQQopJaJWm9Xp8E9Lzx\n76U3vb4GWFOryBqIA9t/4Oii5YBK/BOP0nXgQHuHJG6hKAparSRoIYTjsUZ3t7ghUa/n4Mt/JfJq\nIQBHD56m8fImRLWOtXNkQggh6gOpOGZFh7fvoMXVgvLtiPRCju7YYceIhBBC1CeSpK0oVBdDpltF\nN+p1Nw1No1vaMSIhhBD1iSRpK+o6oD/Or0zlZAtfTkb44vzSJLoPHmzvsIQQQtQT8kzayibMeQn1\n1RcBywAlIYQQorokSduAJGchhBA1Id3dQgghhIOSJC2EEEI4KEnSQtRT586dYtOmVSQlXbR3KEII\nK5EkLUQ9lLBrE65up5k0uTklpXs5dCjB3iEJIaxAkrQQ9VBZaRIdO7ZAURR699aRna23d0hCCCuQ\nJC1EPXTrjAFF/icL0SDJf20h6qUQ9PorABw5kkQj93A7x2M9qqpiMpnsHYYQdiHzpIWoh/r3H8mR\nI3s5fCiJZs1a0r17W3uHZBVfr1jHsq/2UVamEB/nzT9ee0lWLBMPFEnSQtRT7dt3s3cIVrN1WwJr\n1+4jYfdpGnm0wcXVj32HSli46BtmzXzU3uEJYTOSpEWDlpFxjX37vsPTS0N+HgwePBk3Nzd7hyXu\nYs+eA7z5zi6MphB8/fuQnXUIrZM7Wq0b168X3PsEQjQgkqRFg7Znzzc8OaMriqJgNBpZ/OVyRoyY\nbu+wxF3s2n0CoymkfNvTK5qSolQ8vbzp3KmLHSMTwvYkSYsGzcdXWz4S2snJCQ8P1c4RVc/169fR\naMoAF3uHYnP+/h6YzTloNJYeD6Mhg9axCg8/FMugQb3tHJ0QtiVJWjRoeXkVo4JNJhOFhY6dpFVV\nZfXqBehauZJ+TSUpEUaMeMLeYdnUtKnjOHPmHY6duIpWa+TxafHMmDHB3mEJYReSpEWD1qnjSBYu\nXIuPtxM5OWYGDHjc3iHdUVpaCsdP7CQlOYlHRraidWvLlKqoqOvs359Aly697Byh7Wi1Wv79rzkU\nFRXh7OyMs7OzvUMSwm4kSYsGrUmTUB4Z8ay9w7ir69czOH36eyZP7gbo+Oqr7YSE+OHv702TJn78\ntDPV3iHaRaNGjewdghB2J8VMhLCh4uJitm5bw/bt6zAYDAAcOvwTEx/tWn7MhAn92LXrBACrVh2i\nffuet53n8OFdbN26jO3bv8NsNtsmeCGEzcmdtBA2UlRUxIYNc3lyRg+MRhNffP4xo0fPxtXFnby8\nInx8PADIzi7gyOHrGAzniYocRkBAYKXz7Nq1meiYYoYMjSEvr4ivv/qU0aOfqlYMBoOBbdu+wdnF\nCKonAwaMvq3EqBDCcUiSFsJGEnZtYuasnjg5OeHi4syUqR3YvGk7ffoMZdnST+jdJwSz2czuXdd5\n9tm/EhTkTUZG/m3nKStLIza2DQDe3o1oHGBCVdVqJdv16z9n0uTWuLu7kpWVz/p1yxk2zDGf0wsh\nJEkLYTuqGY2m4gmTRqPBbDah0WgYN+45Ll48j6IojB0bddeEW1ZWuXu7tMRU7bthX18z7u6uAPj7\ne+HmnliDhgghbEWeSQthIz16DOazTxMwmUyUlRn4fNF+evQYCFhWtYqKiiYysuU9E25MTB+WLdvL\npUtX2bL5GF5esdWOoaSk8hS04iJZuEIIRyZ30kLYiKenN4MGzWD58q1oFA0jRz6Hi8v9FyuJiIgi\nKGg6Fy+eIzS0KwEBAdV+b3h4DxYv/pGwMG8uXsyhXduR9319IYTtSJIWwoY8Pb0ZOmRsrc/TqFEj\n2rRpd9/v0+niadmyNbm5OcS28neYQWMGgwGNRiMrXAlxC0nSQjxgtFot/v6N7R0GYKmw9vd/fMCB\ng1lotSZGj2zDzBkT7R2WEA5DnkkLIexm+fLvSNjjgsHUgpKyliz7OhH92XP2DksIhyF30lZW3akx\nQtyqsLCQ
776bi68fFBSYiWs9hPDwKHuHVafSM3LRaNzLt81mXy5eSEIXE33H4zdv3s6qbzfRsWMs\nT816sGqaiweTJGkr2bV
|
||
|
"text/plain": [
|
||
|
"<matplotlib.figure.Figure at 0x10c246f90>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"%matplotlib inline\n",
|
||
|
"import numpy as np\n",
|
||
|
"import matplotlib.pyplot as plt\n",
|
||
|
"import seaborn; \n",
|
||
|
"from sklearn import neighbors, datasets\n",
|
||
|
"\n",
|
||
|
"import pylab as pl\n",
|
||
|
"\n",
|
||
|
"seaborn.set()\n",
|
||
|
"\n",
|
||
|
"iris = datasets.load_iris()\n",
|
||
|
"\n",
|
||
|
"X, y = iris.data, iris.target\n",
|
||
|
"from sklearn.decomposition import PCA\n",
|
||
|
"pca = PCA(n_components=2)\n",
|
||
|
"pca.fit(X)\n",
|
||
|
"X_reduced = pca.transform(X)\n",
|
||
|
"print(\"Reduced dataset shape:\", X_reduced.shape)\n",
|
||
|
"\n",
|
||
|
"import pylab as pl\n",
|
||
|
"pl.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y,\n",
|
||
|
" cmap='RdYlBu')\n",
|
||
|
"\n",
|
||
|
"print(\"Meaning of the 2 components:\")\n",
|
||
|
"for component in pca.components_:\n",
|
||
|
" print(\" + \".join(\"%.3f x %s\" % (value, name)\n",
|
||
|
" for value, name in zip(component,\n",
|
||
|
" iris.feature_names)))"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"# Dimensionality Reduction: Principal Component Analysis in-depth\n",
|
||
|
"\n",
|
||
|
"Here we'll explore **Principal Component Analysis** (PCA), an extremely useful linear technique and a very powerful unsupervised method for *dimensionality reduction* in data. PCA looks for the directions in the data with the most variance.\n",
|
||
|
"\n",
|
||
|
"PCA is useful for exploring data and for visualizing data and relationships.\n",
|
||
|
"\n",
|
||
|
"It's easiest to visualize by looking at a two-dimensional dataset:"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 2,
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAeAAAAFRCAYAAACsQn5FAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAHl5JREFUeJzt3X+MpVV9x/HP3ZnZWYa5sJPN1N1GAtSNJ2BdIZFApFiw\n0arFRFghWGvARVFru1pYtVaWFrQNjUssVBpCA1ntDzeuSIypKW2EVQuRbJr1R4o9YXXXVLsmq5kt\nIwsDM3v7x51nuPPc58d5ftznPM8871dCwszcuc+Zs3fmc895zvmeTq/XEwAAqNY63w0AAKCNCGAA\nADwggAEA8IAABgDAAwIYAAAPCGAAADwYz/NNxpgJSQ9KOlvSpKRPW2u/Fvf4xcWl3tzcyXwtbJGZ\nmSnRT+noJ3f0lRv6yR195WZ2tttJe0zeEfC7JB231r5e0pslfS7pwePjYzkv0y70kxv6yR195YZ+\nckdflSfXCFjSfklfXv7/dZIWy2kOAADtkCuArbXPSpIxpqt+GH+yzEYBALDW5V6EZYw5S9Kjkr5g\nrd1XXpMAAFj7OnlqQRtjXibpgKQ/tNY+5vAtFJwGALRJ6iKsvAF8t6RrJNmBT7/FWvt8zLf0jh+f\nz3ydtpmd7Yp+Skc/uaOv3NBP7ugrNy6roPPeA/6wpA/n+V4AAEAhDgAAvCCAAQDwgAAGAMADAhgA\nAA8IYAAAPCCAAQDwgAAGAMADAhgAAA8IYAAAPCCAAQDwgAAGAMADAhgAAA8IYAAAPCCAAQDwgAAG\nAMADAhgAAA8IYAAAPCCAAQDwgAAGAMADAhgAAA8IYAAAPCCAAQDwgAAGAMADAhgAAA8IYAAAPCCA\nAQDwgAAGAMADAhgAAA8IYAAAPCCAAQDwgAAGAMADAhgAAA8IYAAAPCCAAQDwgAAGAMADAhgAAA8I\nYAAAPCCAAQDwgAAGAMADAhgAAA8IYAAAPCCAAQDwgAAGAMADAhgAAA8IYAAAPCCAAQDwgAAGAMAD\nAhgAAA8IYAAAPCCAAQDwgAAGAMADAhgAAA8IYAAAPCCAAQDwgAAGAMADAhgAAA8IYAAAPCCAAQDw\ngAAGAMADAhgAAA8IYAAAPCCAAQDwgAAGAMADAhgAAA8IYAAAPCCAAQDwgAAGAMCDwgFsjLnYGPNY\nGY0BAKAtxot8szHmY5L+QNKvymkOAADtUHQEfFjS1ZI6JbQFAIDWKBTA1tqvSFosqS0AALRGoSno\nLGZnu1VdqtHoJzf0kzv6yg395I6+KkdlAXz8+HxVl2qs2dku/eSAfnJHX7mhn9zRV25c3qSUtQ2p\nV9LzAADQCoVHwNbao5JeV7wpAIC22rPvkH54dE6SdN45M9p13YWeWzR6FOIAAHi1Z98hPXV0Tj31\np1OfOjqnW+59XD/5+dqe6iaAAQBeBSPfQXPzC7rnoe97aE11CGAAADwggAEAXp13zszQ52a6k9q5\nfZuH1lSHAAYAeLXrugs1051c+XimO6m7PnSpzt68tvcbE8AAAO92bt+mme5kK0a+gcoKcQAAEOfs\nzV3d9aFLfTejUoyAAQDwgAAGAMADAhgAAA8IYAAAPCCAAQDwgFXQAFATbTyQoM0YAQNADbT1QII2\nI4ABoAbaeiBBmxHAAAB4QAADQA209UCCNiOAAaAG2nogQZsRwABQE208kKDN2IYEADXR1AMJ2D6V\nDwEMABjiGqrB9qlAsH1q5/ZtTJ+nIIABYATqPCpMa1tSqM7Org7VpO1TTRzNV4l7wABQsjoX1XBp\nG3uSq0EAA2iVPfsO6cY7H9WNdz6qPfsOjeQadQ6wstvG9qn8CGAArRE3+jv80xOlPHcQ7L2Yxywu\nnSp8nSpkCVW2T+VHAANojbjR36cffLLQ84aDvc5cwjVrqLJ9Kh8CGAAKigr2KM8tLI64JelcwzVL\nqAbbpxj5ZsMqaACtMbVhXM8+vzoEZ7qTunXH
xZVcf3Gppx13Pqpzt3S1+/qLKrlmlJ3bt63c840L\n16buSW4SAhhAK+zZd2gofDudfgBtfflGHT+ef4XyeefMrNq2Ezx3L2Y++sixed1y7+OamZ7UkWPP\nrDxHVVuVCNd6YAoaQCtETRP3eiplZXLUtO5tKSPcufkF/fjYM7XcqoRqEMAAUILwPdOzN3d1fsSC\npyR12aqEajAFDaD2yqgqFTVNXNaq3XD7goVIu667ULfc+7jm5hcKXwNrDyNgALVWVlWpUe1XTWtf\nMDLudF76npnuZOToeKY7qY3T60deKAT1QAADqLWilZsGC2TMTE+Wvl81rX3Bgqfbrr9o1bWj3hBs\n2TSlI8fmuS/cEgQwgDUrPDr98fKKYx8n9UTtlQ3fN65zCUuUj3vAAGqtyL3bKk7qydK+qHvZbAdq\nLwIYQK2FFzIF927LlnehV1L7gueM2g4cdW7uKBeKoX6YggZQe3lrDbseKrD7vicKLfSKat/g9Hec\n8PQyBxu0S6cXV6qlXL0iVWbaYna2W6gaT1vQT+7oKzmNnm/860cjq1YVGW0nnYqUdI2f/Hx+VZnI\nuoUvryk3s7PdTtpjmIIGsCYNTv+Oj3XUnVpfu6ncqNE4ZSLbgwAGsGbE3XNdXEoei75m66y++/Tx\nVZ/LMt0ddf846n7uoO7UBEHbcgQwgDUhuOcaJ2r180pwdvqj5CCo06aeBwM3fMLS4OKqex76/lAV\nrE5H2jjNwioQwABilFH+sejzZ2mD65m8g8+9Eti9/ii505GmT5tIDMdw0IdPWJJeCvsghBeXTkmS\nxsfW1fK+LvwggAEMCYdM1JaZUT9/2W0ITynHnY40PrYu8fmzBD33c5GEbUgAhoy6IpPL82dtQ9SW\no0DV23nYuwsXBDCA2pibX8h9AEF4D213aiJx77DrHmGX7wsftMDeXbgggAEMiRtNzkxPRn6+rOeX\nXppqPmfLcIClBeRgQYybr71gqPbyoLxFL8LfJ0nnbj6j9EMesPYRwECFBk/mqfNRc1EhI/UPMyjj\ndJ645w/MzS/oxK9eyByQe/Yd0tz8gvNIeuP0+sj/z/J9kt9DHtBcVMKqESrMuGlqP0VtkwlGTKP6\nox3uq6RVxeGvXXP5Vt2+92Dk85ZRjzmo+BR3WH3QN65Vof74b741tCK505Hed+X5uuRVm4ceX+Tf\nI67K1ajqVKcZ9Yr1QU39/auaSyUsArhGeGG7aWo/lflHO+0P7uD+1qnJcZ18fjH22ju3b9P+A4cj\nwygpHMsKmiJBONgPSX/Jgr+Eg31V5N9jlAGcNUyrfmPX1N+/qrkEMFPQQMOEz7gNHxyw6uu9/j7V\nuHAKVhXHrTgeHxv+G1LFfc4tm6acwnewH5KUfcB93gVcadL+baNwhnBzEcBARcr6o532BzdrQYok\n3an1Tvdho+5tu9zvjqtelTd4XAR9VeTfY1SnFhGm7UIAAxWp41FzQeDErUqem1/Qxun1iSt8o0Zt\nN33mMaeRXFKIFg2eTuelqecoRf898h6RWLZRjcYxegQwUKGd27etTOvm3fOa9gc3aYtP+HuCwNl1\n3YWR082SdOTY/ErbB8MpGOFGjWCjDj8IAnVwZFxkBUrUzzm1YbwfvMuLr9L6KgjRTWduyBxYQZWr\nMt9E5QnTOr6xgxsWYdUIixvcNLmfylowk3bG7eDXOx2tnHXbnZrQ+Fj/fXf4mjvufDTxmoPXSTv4\noAzhfolanDT4c0r9EW944ZLLecCjfE1lXVTl0t6wKs8QbvLvX5VYBd0wvLDdNLmfylo9m/YHN/j6\nunUdbX/9b2j/gR/FPjatbeE2VhW+4VOLot64XHP5K/Tg1384NOIeDG+XcBrVayrPG64qwzSPJv/+\nVYkAbhhe2G6a3E9V7x/N0ldJwZq0XakMwSlEUvSJQXH9NniEYNTX7v/oFU7XH9Vrqm77hcvQ5N+/\nKrkEMKch
ARWKOqS9Lgtmdl13oW76zGOx92/3HzhceIV1VGAW2bMaF77B18o8wQkoG4uwgArVccHM\n4KKopEALVjXn1Z2a0P0
|
||
|
"text/plain": [
|
||
|
"<matplotlib.figure.Figure at 0x10c39ddd0>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"np.random.seed(1)\n",
|
||
|
"X = np.dot(np.random.random(size=(2, 2)), np.random.normal(size=(2, 200))).T\n",
|
||
|
"plt.plot(X[:, 0], X[:, 1], 'o')\n",
|
||
|
"plt.axis('equal');"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"We can see that there is a definite trend in the data. What PCA seeks to do is to find the **Principal Axes** in the data, and explain how important those axes are in describing the data distribution:"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 3,
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"[ 0.75871884 0.01838551]\n",
|
||
|
"[[ 0.94446029 0.32862557]\n",
|
||
|
" [ 0.32862557 -0.94446029]]\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"from sklearn.decomposition import PCA\n",
|
||
|
"pca = PCA(n_components=2)\n",
|
||
|
"pca.fit(X)\n",
|
||
|
"print(pca.explained_variance_)\n",
|
||
|
"print(pca.components_)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 4,
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAeAAAAFRCAYAAACsQn5FAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xt0FFd+J/BvVfX7oQdSC+RGYJ6FefoF2AZssI2NH+NB\nDGSSzGTy3MzM7jmbTXZPsptk81eyJ3v2zGazm2Rzkp1MstkkM2OQwB57sBk/sC2PwRgzYMAFxhgJ\nEFIL9Oz3o/aPUhXdrW6p1eru6qa/nxxPkFTddVUIfeveuvd3BVVVQURERJUlmt0AIiKiesQAJiIi\nMgEDmIiIyAQMYCIiIhMwgImIiEzAACYiIjKBpZgXybJsBfB3ABYDsAP4Y0VRXs53fCKRVIeHQ8W1\nsI40N7vA6zQzXqfC8VoVhtepcLxWhfH5vMJMxxTbA/4agICiKI8C2AXgL6Y72GKRijxNfeF1Kgyv\nU+F4rQrD61Q4XqvSKaoHDOBFAPsn/ywCSJSmOURERPWhqABWFCUIALIse6GF8R+UslFERER3uqIn\nYcmy3AHgTQD/V1GU75euSURERHc+oZha0LIszwfwNoB/rSjKWwW8hAWniYionsw4CavYAP5zAPsA\nKGmffkZRlEiel6iBwPisz1NvfD4veJ1mxutUOF6rwvA6FY7XqjCFzIIu9hnwbwH4rWJeS0RERCzE\nQUREZAoGMBERkQkYwERERCZgABMREZmAAUxERGQCBjAREZEJGMBEREQmYAATERGZgAFMRERkAgYw\nERGRCRjAREREJmAAExERmYABTEREZAIGMBERkQkYwERERCZgABMREZmAAUxERGQCBjAREZEJGMBE\nREQmYAATERGZgAFMRERkAgYwERGRCRjAREREJmAAExERmYABTEREZAIGMBERkQkYwERERCZgABMR\nEZmAAUxERGQCBjAREZEJGMBEREQmYAATERGZgAFMRERkAgYwERGRCRjAREREJmAAExERmYABTERE\nZAIGMBERkQkYwERERCZgABMREZmAAUxERGQCBjAREZEJGMBEREQmYAATERGZgAFMRERkAgYwERGR\nCRjAREREJmAAExERmYABTEREZAIGMBERkQkYwERERCZgABMREZmAAUxERGQCBjAREZEJGMBEREQm\nYAATERGZgAFMRERkAgYwERGRCRjAREREJmAAExERmYABTEREZAIGMBERkQkYwERERCZgABMREZmA\nAUxERGQCBjAREZEJGMBEREQmmHMAy7K8WZblt0rRGCIionphmcuLZVn+XQBfBzBRmuYQERHVh7n2\ngD8DsAeAUIK2EBER1Y05BbCiKF0AEiVqCxERUd2Y0xD0bPh83kqdqqbxOhWG16lwvFaF4XUq3J1+\nrc6ePYuDBw/CZrPht3/7t2GxlCcqKxbAgcB4pU5Vs3w+L69TAXidCsdrVRhep8LdqdfqypUvcPDg\nAXR17cf582eNz8fjKv7Vv/r2rN+vkJuUUgWwWqL3ISIiqojBwUG89FIXurr248SJ4zmPWbDgrrKd\nf84BrCjKFwAemXtTiIioXvWc6cfQaAQA0NrowJZ17WU5z+joCF555WV0de3He+8dRSqVmnKMw+HA\nU089g6997RvYseOJsrQDqOAQNBERUS49Z/oRGAkbHwdGwnjteC82r56PJo99zu8fCoXw+us/RlfX\nfrz55hHEYrEpx1gsFmzf/jg6O/fimWeeg8dT/ufcDGAiIjKV3vNNF44mcOzcAJ7etKio94zFYnj7\n7TfQ1bUfhw+/ilAoOOUYQRDw8MNb0Nm5F88//2W0tLQUda5iMYCJiOiOkEwm8cEH76Oraz9+9KOD\nGB4eznncvffeh87Offjylztx113+CrfyNgYwERGZqrXRkTEEDQBOuwWbV8+f8bWqquLUqZPo6tqP\nQ4e6cONGf87jVqxYic7OvdizZy+WLl1eknbP
FQOYiIhMtWVdO1473otwVKvr5LRbZhx6VpRP0d39\nIrq7D+Dy5c9zHrNwYQd27/4KOjv3Yu3adRCE6irayAAmIiLTbV49H8fODRh/zqW394qxVvfcuU9y\nHtPa2ooXXuhEZ+c+bNy4CaJYvZv+MYCJiMh0TR57zl5vIWt1vd4GPPfcl9DZuRfbtj1WtspVpVYb\nrSQioroxOjqCV1/9Ebq6XsS77+Zfq7tz5y7s2bMPTzyxEw6Hw4SWzg0DmIiITBcKhXDkyGF0de3H\nG2+8nnOtriRJGWt1vd4GE1paOgxgIiIyRTwez1irGwzm3lpeX6v7pS/trvha3XJiABMRUcWkUin8\n9Kc9M67V3bDhPnR27sXu3XtMXatbTgxgIiIqK1VV8bOffYyurv04ePBA3rW6y5evwJ49+9DZ+RUs\nW7aiwq2sPAYwEVGVqNSGBJVy4YKCrq4X0d29P+9aXb9/IXbv/gr27NmLtWvXV91a3XJiABMRVYFy\nb0hQKX19vejuPoDu7v04e/ZMzmNaW1vxpS/tRmfnPmzatLmq1+qWEwOYiKgKlGNDgkoZHBzEyy93\no6trPz788FjOYzwer7FW99FHt9fMWt1y4hUgIqJZGxsbndxXN/9aXbvdjqeeegadnXvx5JNP1eRa\n3XJiABMRVYG5bEhQKeFwGC++eBjf+97/nXat7mOP7UBn5148++zzNb9Wt5wYwEREVaCYDQkqIR6P\n4+jRN9HVtR8//vEredfqPvTQI8Za3dbW1gq3sjYxgImIqkQhGxJUQiqVythX99atWzmPW7/+XmOt\nrt+/sMKtrH0MYCKiKpFvQ4JKSF+re+hQF/r7r+c8TpZlvPDCHnR27sXy5dpa3Z4z/Thx+TKAO2P5\nVKUwgImI6pi+VvfgwQP4/PNLOY9pbWvHV/f9HPbs2YsdO7ZgaOj2MPSdsnzKDAxgIqIyqOaiGt0/\nOYHXXj2E4+8dxtUvlJzHeBqa8cDDT2LT1mewbNW9cDtt6Fgyf0qhjFpePmU2BjARUYlVY68wEAjg\npZe68Q//9M/49JOTOY/xeLx49tnn0b5yG+R1G2GxWI2v6aG6YgknWJUKA5iI6koleqZDoxH0DU4g\nFNFmNLscFnS0eSreKxwbG83YVzeZTE45xmK1Yf0Dj2Lrjufw77/1i3A6nTj03mWoqlrQOWph+VS1\nYgATUd3I1zN9dtuykry3HuynLgZgt0nG14KROC5dG8Uyf+OczzOTcDicsa9uNBqdcowoSrhnw0PY\ntHUX7t20A06XB067BU6nE8DsQrVal0/VAgYwEdWNfM8r3/n4KrbMoceWHezReAqhSAJetxUWqfx1\njuPxON555y0cOPDitGt1N29+GPc+9BRWbngM3sZ5xuezw3W2oVoty6dqDQOYiGiOsoO90W3DrfEI\nxoNxSJKAZErFPK8DN26FSnbOVCqFY8d+iq6u/Xj55e68a3XXrdtgrNVduLADAAoK19mEqpnLp2oZ\nA5iI6sbweAQDt7Seqv5c1mm34NH7FiIZjZfsPC6HBfGkFUMjEdghweu2YjwcQyKZwnd+cArrl87D\nzo2zDyxVVXH69Cljre7169dyHrds2XJ0du7Fnj37jLW66QoJV4Zq+TGAiagu9JzpR5PHjpujEcST\nKQQjcfQOjOMbu1ZhXoMDgUDxAZz9zLSjzQN1QIUAAZIkYCIch8epzSiOJ5I4fekmUiogioIxUWu6\nCWEXL14w9tXNt1b3rrv8xr6669ZtmHZfXYZrdWAAE1Fd0IeJ/T4PrgW0Z6Rtza6SLK3J9cz0G7tW\n4buvnAdUFU7b1F+1F/pGEIsnjYlZ2UuVrl7tM/bV/eST0znPO2/ePLzwQif27NmHTZseqtt9dWsV\nA5iI6orDJpVlNnL2sG6Tx46HVs9HYCQMpXcEKrRlPVZJhN/nxpUbE7BImb3UwcFB/Mkr38enJ9/A\nsWM/zXke
t9uDZ599Hnv27MWjj+6A1WrNeRxVPwYwEVW9UqzdLed61ez26cU29J6xy2FBMBKHVRKN\n8LdYRPhb3QiHJvDxsTf
|
||
|
"text/plain": [
|
||
|
"<matplotlib.figure.Figure at 0x10c47f490>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"plt.plot(X[:, 0], X[:, 1], 'o', alpha=0.5)\n",
|
||
|
"for length, vector in zip(pca.explained_variance_ratio_, pca.components_):\n",
|
||
|
" v = vector * 3 * np.sqrt(length)\n",
|
||
|
" plt.plot([0, v[0]], [0, v[1]], '-k', lw=3)\n",
|
||
|
"plt.axis('equal');"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"Notice that one vector is longer than the other. In a sense, this tells us that that direction in the data is somehow more \"important\" than the other direction.\n",
|
||
|
"The explained variance quantifies this measure of \"importance\" in direction.\n",
|
||
|
"\n",
|
||
|
"Another way to think of it is that the second principal component could be **completely ignored** without much loss of information! Let's see what our data look like if we only keep 95% of the variance:"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 5,
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"(200, 2)\n",
|
||
|
"(200, 1)\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"clf = PCA(0.95) # keep 95% of variance\n",
|
||
|
"X_trans = clf.fit_transform(X)\n",
|
||
|
"print(X.shape)\n",
|
||
|
"print(X_trans.shape)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"**Note:** Isomap is a manifold learning method that is a good alternative when PCA doesn't work well, such as when the data lie along a loop. For large datasets, randomized PCA can be used."
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"By specifying that we are willing to throw away up to 5% of the variance, the data is now compressed by 50% (from two dimensions down to one)! Let's see what the data look like after this compression:"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 6,
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAeAAAAFRCAYAAACsQn5FAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xl0ZFd9J/Dv22tRlfaWe3EvttXPGyaOGcNAgCQ4noRJ\nwpZM0hMIhnCSEDYHBhTAaaBjAzbjgOPgk2UgQCZxwhKICVlgIMQzzoTMiY0NjnkY293udnerJZVU\nKtXy1jt/vHqlKqmkKlWV9ErS9+PT57RKpaqrZ7W+726/KwkhQERERFtLjrsBREREuxEDmIiIKAYM\nYCIiohgwgImIiGLAACYiIooBA5iIiCgGaidfZJqmBuCTAA4BMADcalnWl9d6vuf5Yn6+1FkLd5Hh\n4RR4nVrjdWofr1V7eJ3ax2vVnvHxjNTqOZ32gH8JwIxlWS8C8JMAfn+9J6uq0uHb7C68Tu3hdWof\nr1V7eJ3ax2vVOx31gAF8DsDnq3+XAXi9aQ4REdHu0FEAW5ZVBADTNDMIw/i9vWwUERHRTtfxIizT\nNC8G8A0An7Es6y961yQiIqKdT+qkFrRpmhMAvgngNyzL+sc2voQFp4mIaDdpuQir0wC+C8DPA7Dq\nHv4py7Iqa3yJmJkpbPh9dpvx8Qx4nVrjdWofr1V7eJ3ax2vVnnZWQXc6B/w2AG/r5GuJiIiIhTiI\niIhiwQAmIiKKAQOYiIgoBgxgIiKiGDCAiYiIYsAAJiIiigEDmIiIKAYMYCIiohgwgImIiGLAACYi\nIooBA5iIiCgGDGAiIqIYMICJiIhiwAAmIiKKAQOYiIgoBgxgIiKiGDCAiYiIYsAAJiIiigEDmIiI\nKAYMYCIiohgwgImIiGLAACYiIooBA5iIiCgGDGAiIqIYMICJiIhiwAAmIiKKAQOYiIgoBgxgIiKi\nGDCAiYiIYsAAJiIiigEDmIiIKAYMYCIiohgwgImIiGLAACYiIooBA5iIiCgGDGAiIqIYMICJiIhi\nwAAmIiKKAQOYiIgoBgxgIiKiGDCAiYiIYsAAJiIiigEDmIiIKAYMYCIiohgwgImIiGLAACYiIooB\nA5iIiCgGDGAiIqIYMICJiIhiwAAmIiKKAQOYiIgoBgxgIiKiGDCAiYiIYsAAJiIiigEDmIiIKAYM\nYCIiohgwgImIiGLAACYiIooBA5iIiCgGDGAiIqIYMICJiIhiwAAmIiKKAQOYiIgoBgxgIiKiGDCA\niYiIYsAAJiIiigEDmIiIKAZdB7Bpms81TfMfe9EYIiKi3ULt5otN03wXgFcDWOpNc4iIiHaHbnvA\nPwDwSgBSD9pCRES0a3QVwJZl/RUAr0dtISIi2jW6GoLeiPHxzFa91bbG69QeXqf28Vq1h9epfbxW\nvbFlATwzU9iqt9q2xsczvE5t4HVqH69Ve3id2sdr1Z52blJ6tQ1J9Oh1iIiIdoWue8CWZZ0E8Pzu\nm0JERLvVdK6EiuMDABK6gomRVMwt2nwsxEFERLGazpVQdjyI6n9lx8OZC0uwXT/upm0qBjAREcUq\n6vnW84IAM/PlGFqzdRjAREREMWAAExFRrBK6suoxVZYxPpyMoTVbhwFMRESxmhhJQZWX40iVZRzY\nMwBDWx3MOwkDmIiIYjc+nIQqy7ui5xvZskIcREREazE0BQf2DMTdjC3FACYior4yNWXgoYfC4edr\nr/Vx++12zC3aHByCJiKivjE1ZeDBBxUIAQgBPPiggmPHknj88Z0XVzvvOyIiom0r6vnWm52VcPy4\nEUNrNhcDmIiIKAYMYCIi6hvXXru6KtbYmMCJEztvHpgBTETUJ6ZzJZw6X8Cp8wVM50pxNycWt99u\nY2xs+YC9sTGBe+8tY3IyiLFVm4MBTETUB3brgQTNnDgRhvBO7flGuA2JiKgPrHcgwW7bHzs5GeDe\ne3f2QQwAA5iIiDq0W/brbhYOQRMR9YHtdiDB
btqvu1l4pYiI+sB2O5BgN+3X3SwcgiYi6hPjw8na\nIfRx93w5vLz52AMmIuoT0YEEcfd82xlert+v63oBHNdHOuPire9YiKPJ2xIDmIiIGjz0kFILVcf1\n4XrBquHlaL+u6wUIhMDwSICP3TOHvQfsXbt9aqMYwEREm6Cfi2q85WYFL/4xAy/+MQNvuXl1TzsK\nVQFAAAiEgOP6CIRoeN6JEzaGhn2MjAS4+Z352uPR9ilaHwOYiKjH+rmoxltuVvDwt9Xa8PLD31bx\nildpePSx5XC94ipn1dcNjwR4883zDY9NTga4654cPnbPHA4f8Ta97TsNA5iIdpWt6JlWHB+5fAXn\nZks4N1tCLl/pm17hIw+vXns7Nyfjllv02sdT781jZGS59ONIdXj5yCWrbyC22/apfsIAJqJdY62e\nacXpvvdWH+xPPLOAiuvX3qfi+riQK8Px4u8BtyOhK7j5nWEIR8PLa4Xqdts+1U+4DYmIdo21yj2e\nnyshrUodv24U7BHHDVByKsimdKhKGE5CEmt9+Za65tkeHv5246/+0dEAt97qAAivwcRICu6lS/jY\nPXMAlkN1Lf20fWo7YQ+YiKhLK4N9cMAABLBYcpBfcpArVBAEAoWiG1MLl939MR+jo8vDy6OjAb74\nBRdXXdF4AzI+nIQqy20NJ/fL9qnthgFMRLvGUtnBubkizs0VMbdYARD27i4aTfX0fQxNRjato1jx\n4IsA2ZSOQtFFvuTgXx+bxukLhZ6+30bdequD0dGgrue7GkN183EImoh2helcCemkhmLZgxcEsB0f\ncwsVXH3pKBK6im4iMaErDUPQI4MJCAHsG0lBUWQslVwMpDQAgOcHODtXhAQJAgJBsPwaEyOrbwSi\nilRnzoQ91AMHRNeVqa66QsIXvxD1xjsfeqfusAdMRLtCNEw8lDVqQ6uZAb0nK5ObLUS6+tJRGJoK\nVZaRMFb3daYXwhXS621ViipSnT4toVgM/zzxhIx//mcefLATsAdMRLuKrsrYM9L7hUIrFyIZmoJD\nF2VQdrxa0AKAokgYyRiYzVegSI0B6gUB3v4ODY9/LwEAOHNGwv79AqXSci/V84CzZ2UkEgGOHzd2\nxbm5OxUDmIj63nSuVOvBrjVU28rKYWKgd/tVV7YvmjOdGEnhzIUlGJqMiutDUSRMDIdtVyQZQ1kD\nd9w2iEe/G+7BdWxA1wE9HK1GsSjhySclCAFIHCnecRjARNTXVm7xiYZqo15mu6Iw9KqTrq221vSq\nfVHAT+dKyA5otff+0z/ah6/+nYFyWUIyKTCx18czZ1QoKjAx4cEwgEQCqFRk+D6gVn9bqyqwb1+A\nsTGBEyd4QtF2xgAmor621t7dmflyWwFa3zuVZdTmanu1X7VV+6LVxB/7yAj+8i802HbYlZWksD2a\nJlAuSzj9dFge0veA6WkVFx/0MLHXx+mnJSRUIFosdcklYfhy6Hn7YwAT0Y61snfqB4AqSxvuPXcq\nWsH8ne+Evdh6QgC+DwSBBF0XCHwpbJ/SWLBj8qgLSUKtvek02PPdIRjARNTXupm77bb33E37jr1q\nD55+WkalEobtWoQAXFeCYQCGIaBpAsMjYbujGsy9Gi6n/sIAJqK+tllztyt1utArat8HfyeDr381\niXJZAiDBdcPhZSHaWz2lqsBR08PUb8/jYx8ZBIB1azDT9scAJqK+12mt4XZ7z2dnljpe6DU1ZeC+\n+9JYWAiDVtdR6/UGQevw1XSBdAq47roAJ064SA4Gbddgpu2NAUxEfS9ayLRR7faeS/bq05DWGqp+\n0YtSOHUqXMhlGAKHDoX7dKOKVnbd9Ox6Q8+SBAwOBbjqahd3fNjH5GT4ArbLgw12CwYwEe1I0ZCy\n4/kolT2MDia6DrT68AWAfF7Cd78rQanrJEehK0nh32UZtXCOKApw5NIKbv/vBVx/nd7Qy+70ZoO2\nHwYwEe0Y
UejOLYZjwCODCWiqjMGMvu7XpQwV8yseazZUXR++kSAI/9QHrSSFf1RNQJIE5OqWI0kW\neMu7zuP5P1LB2FACl+w
|
||
|
"text/plain": [
|
||
|
"<matplotlib.figure.Figure at 0x10c87da50>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"X_new = clf.inverse_transform(X_trans)\n",
|
||
|
"plt.plot(X[:, 0], X[:, 1], 'o', alpha=0.2)\n",
|
||
|
"plt.plot(X_new[:, 0], X_new[:, 1], 'ob', alpha=0.8)\n",
|
||
|
"plt.axis('equal');"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"The light points are the original data, while the dark points are the projected version. We see that after truncating up to 5% of the variance of this dataset and then reprojecting it, the \"most important\" features of the data are maintained, and we've compressed the data by 50%!\n",
|
||
|
"\n",
|
||
|
"This is the sense in which \"dimensionality reduction\" works: if you can approximate a data set in a lower dimension, you can often have an easier time visualizing it or fitting complicated models to the data."
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 2",
|
||
|
"language": "python",
|
||
|
"name": "python2"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 2
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython2",
|
||
|
"version": "2.7.9"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 0
|
||
|
}
|