data-science-ipython-notebooks/scikit-learn/scikit-learn-intro.ipynb

280 lines
1.1 MiB
Plaintext
Raw Normal View History

{
2015-04-14 14:24:23 -04:00
"cells": [
{
2015-04-14 14:24:23 -04:00
"cell_type": "markdown",
"metadata": {},
"source": [
"# scikit-learn-intro"
2015-04-14 14:24:23 -04:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2015-05-31 09:46:07 -04:00
"Credits: Forked from [PyCon 2015 Scikit-learn Tutorial](https://github.com/jakevdp/sklearn_pycon2015) by Jake VanderPlas\n",
"\n",
2015-04-14 14:24:23 -04:00
"* Machine Learning Models Cheat Sheet\n",
"* Estimators\n",
"* Introduction: Iris Dataset\n",
"* K-Nearest Neighbors Classifier"
2015-04-14 14:24:23 -04:00
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn; \n",
"from sklearn.linear_model import LinearRegression\n",
"from scipy import stats\n",
"import pylab as pl\n",
"\n",
"seaborn.set()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Machine Learning Models Cheat Sheet"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
2015-04-14 14:24:23 -04:00
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAACEoAAAUrCAYAAAA3g9BsAAAAAXNSR0IArs4c6QAAAAlwSFlzAAAG\nbgAABm4BRLh3LwAAAAd0SU1FB90EEw8XG1Hi8acAAAAGYktHRAD/AP8A/6C9p5MAC5xvSURBVHja\n7N0HfBR1+sfxv3eo2M7ee+9nOT0PxcKJZzlUVFA5sYsKdk9PTz3RS68EAklIQhopEAgJkEKAAAkh\nBFJJQJoo0gWVTrb//vuMbm53szUkkIQPr9f79dJkZnZ2ZrKZ7PPd5/k/pVQLAAAAAAAAAAAAAADA\nkeD/OAgAAAAAAAAAAAAAAICgBAAAAAAAAAAAAAAAAEEJAAAAAAAAAAAAAAAAghIAAAAAAAAAAAAA\nAAAEJQAAAAAAAAAAAAAAAAhKAAAAAAAAAAAAAAAAEJQAAAAAAAAAAAAAAAAgKAEAAAAAAAAAAAAA\nAEBQAgAAAAAAAAAAAAAAgKAEAAAAAAAAAAAAAAAAQQkAAAAAAAAAAAAAAEBQAgAAAAAAAAAAAAAA\ngKAEAAAAAAAAAAAAAAAAQQkAAAAAAAAAAAAAAACCEgAAAAAAAAAAAAAAAAQlAAAAAAAAAAAAAAAA\nCEoAwKFnsVhaDAZDy/79+3W7du3S/fTTT7qtW7fqN2zYoP/+++9brV+/3qUffvjBJVnflY0bN7ba\nvHmz/scff9QeUx573759uhbrP6PRqO0X5wcAAAAAAAAAAAAgKAHgCCehhp07d+o2bdqkX716tb6h\nocFQXV1tqKysNM6fP984e/ZsY1FRkWn69OmmqVOnmnJycswTJ040p6SkmMePH28eO3asJSYmxhIR\nEWEJDg5W//3vf7uswMBAFRoaqmRfo6OjLWPGjLGMGzfOkpCQYElOTjanpqZqzy07O9s8efJkU15e\nnva8CwsLTSUlJaY5c+YYFyxYYKyqqjLW1dUZmpubDWvWrNECHlu3btX98ssvWjhDghlcWwAAAAAA\nAAAAACAoAQCHqKODFOuls8J3332nl2K+BB/KysqMM2bM0IIOEyZMMEtIQEIDXTnY0J1JKEMCGaNH\nj7bEx8db5JhnZmaac3NztfCFBC/knEggpaamxrBs2TLDypUr9XLOpDPGjh07dHv27NHp9Xq6YQAA\nAAAAAAAAAICgBIAjlxTPZXSFdDSQbg/S5SE9Pd0sxfjIyEhLQEAAQYUeSEIt0gVDOmAkJSWZMzIy\nzJMmTTJPmzbNJB0/pNtFRUWFUUIx0hVkxYoVhm+//VYbTyKhGRlFcuDAAZ3ZbObnCAAAAAAAAAAA\nAAQlAHQt0kVgy5Yt+qamJoOMepAREImJiWY6QKAjyIgUCdXExsZaZHyKjByRcSNynUnoYu7cuVqn\ni9raWm28yNq1a/UbN25s7XIh41r4OQUAAAAAAAAAAABBCQB+kU/2//TTT7rVq1frq6qqjIWFhVpn\nCOkaQDEf3WG0iC1sIR0uJk6cqI0VkfEupaWlxvLycq27RWNjo2HVqlV66YKydetW3c6dO3UtLS06\nxokAAAAAAAAAAAAQlADQg+3bt0+3cuVKvYxHyMnJMcv4BCk0d9cieZB138NCQlR0eLiKjYpSCTEx\nKjk2VqXHx6us8eNV7oQJKj8tTRVOnKhmZWeruZMnq/KpU1XltGmdqsL6GPNyc9WcSZNUSVaW9vgF\n6ekqLyVFTU5OVtmJiSrDuo+p48apJOv+JowercZFR6vRkZHacwm3PqfgoCDF+JJDIywsTMXExFhk\nbIytq4WMEikuLjaVlZUZFy1aZJQRM8uXL9fGiGzatEkv4aK9e/fqjEYjry0AAAAAAAAAAAAEJQB0\nBfJJ+R9//FEnBd6CggLT2LFju1yHiNDgYC3gkDJunBYgKMzMVPOnTFE1M2aoFaWlat38+WpDebna\nsmiR2lFdrXbV1Kh99fVKv2yZsixfrtSKFT2e
2fo8Ddbne6ChQe2tq1M7rcdAjsWWykr1/YIFavXc\nuaqppETVzpypFuXnq/m5uVooZEZGhpoyYYIWGJHjK2EMCWJEhIaq7hyO6ZKBnaAgFRUVZZGfseTk\nZHNmZqZ5ypQpppkzZ5pmz55trKioMC5ZssR6GpcZpHPL+vXr9du2bdPt2rVLZ/3XQlcLAAAAAAAA\nAAAAghIA2sFgMLR89913einKZmVlmcPDww95MEI6IESFhWmdHSYmJKhpqamqNDtb67LQUFSk1syd\nqzZXVmqBB2NT0xERdOiqTM3Nan99vfpl6VK1rapKC6SsLStTy0tLVX1hoaouKNC6b8zOydE6Ysi5\nzElKUmlxcSpxzBg1NjpaO9fS+YKwxMH/3ERERLQGLeTnVzpalJSUmBYsWKCFLJqamgxr167Vuln8\n8ssvjA0BAAAAAAAAAAAgKAEceXbv3q2TUQBSTE1KSjIfqi4BMu4iPiZG6wAhoywkACFFdul2cKR0\neYAjOe8tjY1qd22t2r54sdq0cKHWDWTl7NmqsbhYLZ0xQy3My1Nl1uulOCtLFaSladePhGlkVErc\nqFFqVESENkaFcSP+BSwiIyO1gMWECRO0kSH5+fmmWbNmGcvLy41Lly41yKidzZs36/fs2aMzm828\ndgIAAAAAAAAAAIISALoPGaMhnyyfOnWqKSYmptO7RcRERGiFbClsL5k+XX07b57WfYAwBDqbrrFR\n6zoinS5+KC/XxossKynRAhcVeXlal4uZEydqo0Uyfwtb2DpcBDFWxGOwIjo62iLBqpycHHNhYaFJ\nAhX19fVatwoZB7J//34dr7cAAAAAAAAAAICgBIDDQtrqS5v9OXPmGGNjYzslGBEeEqKSx47VPuEv\nBegVpaVacdrAWAx0YzLWRTqc7Kiu1rpbSMhHru26wkJVlZ+v5uXmagGgfOt1n5OYqFLj4rRxMRIO\nCg0OPuIDFUFBQWr06NGWlJQUc25urjYCpLKy0rhs2TKDjPfZsWOHzvqP12kAAAAAAAAAAEBQAkDH\nhCPWr1+vl8JkR3eNCA4KUmlxcdr4gzVlZWp/fT1FdcDdKJGGBrVz6VK1ddEitX7BArVqzhxtjIh0\nVymfOlWV5uSoGRkZKnfCBJURH6+SYmNVbFSUiggNVYFHyPiQUOtzTUhIsEyePNk0e/ZsY01NjdaZ\n4ueff9aZTCZe0wEAAAAAAAAAAEEJAK5JQVGKizNnzjRFRkZ2WDgiOjxcG01QPX262rxwoTI3N1ME\nBw4Rw7Jlak9dndq+eLHaWFGh1paVqeZZs1TtzJmqcto0NXfyZFWUmanyUlNV1vjx2vgQCVqEh4b2\nmDEf0pUiIyPDPGPGDNPChQuNy5cvN2zevFl/4MABxnsAAAAAAAAAAACCEsCRxmAwtKxcuVKfn59v\nCg8Pt3REUXL86NFa4bWppETtrKmhWA10444WBxoa1E9LlmijQ6QDzDLrz7WEnuZPmaKNDZGARWZC\ngtbJYkxkpAoLCelWQQp53UtMTDRPmTLFJOOF6urqDOvWrdPv3LlTZzab+T0BAAAAAAAAAAAISgA9\ngU6na2lubjZIYTDkIIuaocHBamJCglowZYpaN2+e0jU2UmAGjnDm5cvVvvp6taO6Wm2oqFCr585V\ndYWF2uvEzIkTVXZiohaoigwL69IhisDAQBUXF2eR18r58+drnSi2b9/OOA8AAAAAAAAAAEBQAugO\npLDX1NRkyM7ONgcFBR1U8TBxzBi1MC9PbV20SPvEOYVhAO1lam5WO5cuVRvKy9WK0lJVXVCgZufk\nqGmpqSotLk4bAxJ8kK9ZnRWgyM3NbQ1Q/PjjjwQoAAAAAAAAAAAAQQmgK9i/f7+uoqLCGB0dfVBj\nNSaMHauq8vO1gibFXQCHWktDg/px8WKtc01DUZGqyMvTRvzkJidrYz/Cu8DIDwIUAAAAAAAAAACA\noARwGG3b
tk03Y8YMU3BwcLsKfgEBASo9Pl4tnTFD7amtpVALoMs70NCgtlRWquWlpapy2jQ1MyND\nex0bHRmpvaYRoAAAAAA
"text/plain": [
"<IPython.core.display.Image object>"
]
},
"execution_count": 2,
"metadata": {
"image/png": {
"width": 800
}
2015-04-14 14:24:23 -04:00
},
"output_type": "execute_result"
}
],
"source": [
"from IPython.display import Image\n",
"Image(\"http://scikit-learn.org/dev/_static/ml_map.png\", width=800)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Estimators"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Given a scikit-learn *estimator* object named `model`, the following methods are available:\n",
"\n",
"- Available in **all Estimators**\n",
" + `model.fit()` : fit training data. For supervised learning applications,\n",
" this accepts two arguments: the data `X` and the labels `y` (e.g. `model.fit(X, y)`).\n",
" For unsupervised learning applications, this accepts only a single argument,\n",
" the data `X` (e.g. `model.fit(X)`).\n",
"- Available in **supervised estimators**\n",
" + `model.predict()` : given a trained model, predict the label of a new set of data.\n",
" This method accepts one argument, the new data `X_new` (e.g. `model.predict(X_new)`),\n",
" and returns the learned label for each object in the array.\n",
" + `model.predict_proba()` : For classification problems, some estimators also provide\n",
" this method, which returns the probability that a new observation has each categorical label.\n",
" In this case, the label with the highest probability is returned by `model.predict()`.\n",
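" + `model.score()` : for classification or regression problems, most estimators implement\n",
" a score method. For classifiers the score is the mean accuracy, which lies between 0 and 1; for regressors it is the $R^2$ coefficient of determination, which is at most 1 and can be negative for a poor fit. In both cases a larger score indicates a better fit.\n",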
"- Available in **unsupervised estimators**\n",
" + `model.predict()` : predict labels in clustering algorithms.\n",
" + `model.transform()` : given an unsupervised model, transform new data into the new basis.\n",
" This also accepts one argument `X_new`, and returns the new representation of the data based\n",
" on the unsupervised model.\n",
" + `model.fit_transform()` : some estimators implement this method,\n",
" which more efficiently performs a fit and a transform on the same input data."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Introduction: Iris Dataset"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
2015-04-14 14:24:23 -04:00
"name": "stdout",
"output_type": "stream",
"text": [
"['target_names', 'data', 'target', 'DESCR', 'feature_names']\n",
"(150, 4)\n",
"(150, 4)\n",
"(150,)\n",
"['setosa' 'versicolor' 'virginica']\n",
"['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']\n"
]
2015-04-14 14:24:23 -04:00
}
],
"source": [
"from sklearn.datasets import load_iris\n",
"iris = load_iris()\n",
"\n",
"n_samples, n_features = iris.data.shape\n",
"print(iris.keys())\n",
"print((n_samples, n_features))\n",
"print(iris.data.shape)\n",
"print(iris.target.shape)\n",
"print(iris.target_names)\n",
"print(iris.feature_names)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
2015-04-14 14:24:23 -04:00
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAeYAAAFkCAYAAAD165gcAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzs3Xd8VFXawPHfnZn0TgihBEKTQw9NmggoNoqoWFcFG4pd\ndxXL7rvuru66urq2VXTXBoJYEUWKoii9KL2FAwSQDiEhvc7Mff+YEMpMkiHJJJPh+frhY+7k3HOe\nk/bMPefecwzTNBFCCCGEf7DUdwBCCCGEOEkSsxBCCOFHJDELIYQQfkQSsxBCCOFHJDELIYQQfkQS\nsxBCCOFHbL6qWCllAd4DOgBO4G6ttfZVe0IIIUQg8OUV82VAhNZ6EPAs8A8ftiWEEEIEBF8m5kIg\nRillADFAiQ/bEkIIIQKCz4aygWVAKLANiAeu9GFbQgghREAwfLUkp1Lqj7iGsv+klEoCfgK6aq09\nXjmbpmkahuGTWIQQQpw1n/1B7tVvUrUTz9pV9wd8ovDlFXMEkFP28XEgCLBWVNgwDNLTc30YTt1J\nSIiSvvgh6Yt/kr74p4SEqPoO4Zzly8T8EvChUmoJrqT8tNa60IftCSGEEA2ezxKz1joLuMZX9Qsh\nhBCBSBYYEUIIIfyIJGYhhBDCj0hiFkIIIfyIJGYhhBDCj0hiFkIIIfyIJGYhhBDCj0hiFkIIIfyI\nJGYhhBDCj0hiFkIIIfyIJGYhhBDCj0hiFkIIIfyIJGYhhBDCj0hiFkIIIfyIJGYhhBDCj0hiFkII\nIfyIJGYhhBDCj0hiFkIIIfyIJGYhhBDCj0hiFkIIIfyIJGYhhBDCj0hiFkIIIfyIJGYhhBDCj0hi\nFkIIIfyIJGYhhBDCj0hiFkIIIfyIJGYhhBDCj0hiFkIIIfyIJGYhhBDCj0hiFkIIIfyIJGYhhBDC\nj0hiFkIIIfyIzZeVK6VuA24vOwwDUoBErXWOL9sVQgghGiqfJmat9RRgCoBS6k3gPUnKQgghRMXq\nZChbKdUH6KK1fq8u2hNCCCEaKp9eMZ/ij8BfqyqUkBDl+0jqiPTFP0lf/JP0RYiTfJ6YlVKxQAet\n9aKqyqan5/o6nDqRkBB1TvVl27btrF23mT69u9OhQ/s6iuzsnWvfl4ZC+uKf5A1G/amLK+bBwII6\naEfUg29mzed/762j1NGU6Z9+yX0T+jJyxMX1HZYQQjRYdTHH3AFIq4N2RD34etY67M7mGIaFUkcL\nvv5mdX2HJIQQDZrPr5i11i/7ug1Rf5xO84zjegpECCEChCwwImpk2NDzMMgEwEIGwy5W9RyREEI0\nbHV1V7YIUOPGXUty8nK2pu6mW9cBDBrUr75DEkKIBk0Ss6ixIUMGMmTIwPoOQwghAoIMZQshhBB+\nRK6Yhd9ZuuwXvvt+DTYr3H77SFont6rvkASwctUaZs/5BasFxo29gnbt2tR3SEJUSil1OdBKa/2u\nF2UTgWe01g9U8PkUYLTW+rlaDtONJGbhV9au28QL//qZUkdzALbpybz3v98TGSmLHdSnLVu38fwL\nP1Bc2gKArdum8t9JD9KoUaN6jkyIimmtvz+LskcAj0m57PMbgA21EVdVJDELv7J4ydrypAyQnpnI\nihW/cumlsmhJfVq4cHV5UgbIykli8eKVXH31iHqMSojTKaVmAK9rrReX7dGwAJgEvAPMBo4Bc4FF\nwJtALnAUKMK1bPSnWusBSqmNwEKgO2ACVwG9gAla698ppe4C7gWswCyt9V+VUg8C1wARZe1co7Uu\nrU4/ZI5Z+JVGcZE4HEXlxxYjh5YtW1RyhqgLCY2jMZ0nvy8G2SQny/dF+J13gdvKPr4D1z4NJyQC\nl2qtX8KVqG/TWg/DtQDW6QsyQBQwXWs9FDgADD9RRimVADwJDNJa9wKClVJRQCPgEq11f1wXvedX\ntxOSmIVfueXmq+mdkg3OPVgtuxhzVQs6dqzZ
s9G7du3hD4+9wg03Pcsrr36AU1ZBOWvXXTeKvr3z\nwbkHi5HGqOHx9OyZUt9hCXGm+UBfpVQcMAgoPOVzu7XW9rKPm2mtU8s+XgIYHupaV/b/fUDoKa+3\nBTZrrYsBtNZ/1FrnAqXAJ0qp94AkajAiLUPZwq9YrVZefOFxcnKyCQoKJiwsrEb1mabJ3577iMPp\nrhuVtu8sJCLiMybc87vaCPecYbFY+Ptzj5KTk43NFkR4eHh9hySEG621Uyn1Ba4r4pmA45RPn/qO\nfJ9SqlNZch5QQXVnXkWfkAZ0VEoFa61LlFKfAW8BV2mt+yulwoHV1ODCVxKz8EvR0TG1Uk9ubg5H\njlrK3w9brGHs2p1ZK3Wfi2rr+yKED30I7AQmAhdxMsGemmjvBz5QSuUBJcB+D2VOVV6H1vqYUupF\nYJFSygRmAb8C+Uqpxbjml9cCzarbAUnMIqBFRUUTG2PneI7r2OksJbFJza7ChRD+S2u9DwgpO5xy\nyqdOXQWpL3BlWZJ9DijWWv92oozWuvxZQK3106ect6jstSln1A0wrHZ6IHPMIsAZhsHEx8aQ1Gw/\n8XH76dsrlwcfuLW+wxJC1K8jwPyyK9wUXEPRfkOumEXA6927O++/2z2gNrEXQlSf1noGMKO+46iI\nXDELIYQQfkQSsxDCK6ZpcuTIYTIyMuo7FCECmgxlCyGq5HA4eOqPr7JufSkWq4NLL27GxMfvqu+w\nhAhIcsUshKjSZ599w/pNMViDkjAsycxfkMuvv66p77CECEiSmIUQVTqeXYDFcsriR0YUBw4eqb+A\nhAhgkphFg2WaJj8uWMisWfMoKiqq+gRRbZcO609oyL7y4/jY/Vw0dGAlZwghqkvmmEWDZJomE598\nmfUbI8AI4quvX+Ct/0wkIiKivkMLSB06tOfZZ67im2+XYbGY3Db2LmJiYus7LCECkiRm0SAtW7aS\ndRtCsQW59mk+eKQNU6d9zb0TbqnnyAJXSkoXUlK61HcYQgQ8GcoWDVJxcTGnv680sNtl1yghRMMn\niVk0SIMHX0Db5HRMpwPTNImN2sX1111W32EJIUSNyVC2aJCCgoL4zxtP8PH0rykpcTDmmvtITEys\n77CEEKLGJDGLBis0NJS77rypvsMQQohaJUPZQgghhB+RK2bhd1asWM1389dgtcAdt4+kZcukGtWX\nn5/PW5M+obTUoGvXllw1OvDnok3T5P33P2fP3uMktYjmnrtvwmKR9+FCNASSmIVf2bBhC8+/+CMl\n9hYAbN32Pu/99w9ERkZVu86JT77Gzt1JGIaFRctSsZfaufbaEbUVsl96+d/vM/8nJxZLBKtW53Ms\n47/83x/vq++whBBekLfQwq8sXLS6PCkDpGc0ZcXK6q/JnJeXy67dTgzD9aNuGI34dc2emobp97am\nZmKxhAFgsYSybVtWPUckhPCWT6+YlVJPA1cCQcCbWuspvmxPNHyxMeE4HNlYrSEAWI0cklo0rXZ9\nYWHhhIU6KCh2HZumSXhYbUTq38LDjUqPhRD+y2dXzEqpocAArfVAYCjQ1ldticBx661j6NEtE9O5\nF6uxm6tGNaNTp47Vrs9qtXLH7YMIC07DdB4gqdkeHnygZndym6bJG29M4b4HXuHxJ15jz569FZad\nMWMuDzz0Kg898grLl/9ao3bPxn0TRtEoZhcO+35io3dx7z0j66xtIUTNGKZp+qRipdTzgAl0AaKB\niVrrysYkzfT0XJ/EUtcSEqKQvlSfaZocP55JcHAIkZGRtVJnUVERwcFODCMMw6jZ1eO7733K51/l\nlA8VN03YzeQP/uxW7+IlK3n+xeWYJAAQFryXdybdRdOm1R8BOMGb74vdbicjI4P4+HhsNv+9nUR+\nX/xTQkKUz4ZZevWbVO3Es3bV/QE//OPLOeYEoDdwHXAv8LEP2xIBxDAMGjWKr7WkDK5nnhMTE2uc\nlAHSdmWU
J2WAw0cs5ObmuJVbv357eVIGyCtM5Ndf19e4fW/ZbDYSExP9OikLIdz58jf2GJCqtbYD\n25VSRUqpxlrrYxWdkJB
"text/plain": [
"<matplotlib.figure.Figure at 0x10da12b90>"
2015-04-14 14:24:23 -04:00
]
},
"metadata": {},
2015-04-14 14:24:23 -04:00
"output_type": "display_data"
}
],
"source": [
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# 'sepal width (cm)'\n",
"x_index = 1\n",
"# 'petal length (cm)'\n",
"y_index = 2\n",
"\n",
"# this formatter will label the colorbar with the correct target names\n",
"formatter = plt.FuncFormatter(lambda i, *args: iris.target_names[int(i)])\n",
"\n",
"plt.scatter(iris.data[:, x_index], iris.data[:, y_index],\n",
" c=iris.target, cmap=plt.cm.get_cmap('RdYlBu', 3))\n",
"plt.colorbar(ticks=[0, 1, 2], format=formatter)\n",
"plt.clim(-0.5, 2.5)\n",
"plt.xlabel(iris.feature_names[x_index])\n",
"plt.ylabel(iris.feature_names[y_index]);"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## K-Nearest Neighbors Classifier\n",
"\n",
"The K-Nearest Neighbors (KNN) algorithm is a method used for **classification** or for **regression**. In both cases, the input consists of the k closest training examples in the feature space. Given a new, unknown observation, look up which points have the closest features and assign the predominant class."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
2015-04-14 14:24:23 -04:00
"name": "stdout",
"output_type": "stream",
"text": [
"['versicolor']\n",
"['setosa' 'versicolor' 'virginica']\n",
"[[ 0. 0.8 0.2]]\n"
]
},
{
2015-04-14 14:24:23 -04:00
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAfAAAAFgCAYAAABEyiulAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzs3Xd4VFX6wPHvJJPeSELovV2qCCKgSAcLIChVVNTVXXVd\nXXV3rdtcd127a10Lrj8UKQJKV5AuRemhcwIJISQEEkhvM5ny+2NCCGSSDExP3s/z8Dw5M2fufe8w\nyTvn3HPfq7NarQghhBDCvwR4OwAhhBBCXDlJ4EIIIYQfkgQuhBBC+CFJ4EIIIYQfkgQuhBBC+CFJ\n4EIIIYQf0ns7gCuRnV0o17wJIYRoMBISonQ1PScjcCGEEMIPSQIXQggh/JAkcCGEEMIPSQIXQggh\n/JAkcCGEEMIPSQIXQggh/JAkcCGEEMIPSQIXQggh/JAkcCGEEMIPub0Sm6ZpTYDdwEilVFKVx58G\nHgKyKx56pOrzQgghhKiZWxO4pmlBwKdAsZ2n+wIzlFJ73RmDEEIIUR+5ewr9TeBjINPOc9cBL2qa\ntlnTtOfdHIcQQghRr7htBK5p2gNAtlLqR03TXgAuL8g+D/gIKAQWa5o2Vim10l3xXC5h4ypP7UoI\n4UcWMsXbIQg/NmxYocf25c4R+K+A0ZqmbQCuBb6sOB9+wXtKqRylVDmwEujjxliEEEKIesVtI3Cl\n1NALP1ck8UeUUlkV7Rhgv6Zp3YESYATwP3fFIoQQQtQ3nrwfuE7TtOlApFJqZsV57w2AAVirlJI5\nbSGEEMJBHkngSqnhF36s8tg8bOfBhRBCCHGFpJCLEEII4YckgQshhBB+yJPnwIUQwufIZWPCX8kI\nXAghhPBDMgIXwsMS9+8neelS9OXlBF1/PWNuv93bIQkh/JAkcCE86HxhIWc++YRJ584BkJqSwtb4\neAbdeKOXIxNC+BuZQhfCg5KSk+lbkbwB2hmNnE9O9mJEQgh/JQlcCA/q2K4dB2JjK9unAwOJbtPG\nixEJIfyVTKEL4UFNGjUi8qGH+HbZMoLKyzH37cudQ4fW/ULhMrLqXNQXksCF8LAB/fszoH9/b4ch\nhPBzMoUuhBBC+CFJ4EIIIYQfkgQuhBBC+CE5By6EqLdkwZqoz2QELoQQQvghSeBCCCGEH5IELoQQ\nQvghSeBCCCGEH5IELoQQQvghSeBCCCGEH5IELoQQQvghSeBCCCGEH5IELoQQQvghSeBCCCGEH5IE\nLoQQQvghqYUuRC3OnDvHjl9+ITI2luE33ohOp/N2SEIIAUgCF6JGqenpHHjzTW7PzCRHp2PugQPc\n8+ij3g5LCCEAmUIXokaJq1dze2YmOiDeaqXj1q1k5uZ6OywhhAAkgQvhMKtMnwshfIgkcCFqcO0t\nt7C0RQusQLZOR8qgQTSPjfV2WEIIAcg5cCFq1K5VK8L++ldWbN9OZKNG3H3DDd4OSQghKkkCF6IW\nTePjuX3MGG+HIYQQ1cgUuhBCCOGHZAQuhKg3FjLF2yEI4TEyAhdCCCH8kCRwIYQQwg/JFLrwaZt/\n+onc/fsxhodz87RpREdEeDsk4UNkylw0ZJLAhc/asmULTWbOZLDBgAX4Kj2d+//6V6lHLoQQyBS6\n8GG5+/ejGQyA7YPa9fhxsgsKvBuUEEL4CBmBC59lDA/HDARWtLOiorg2LMybIQkvkylzIS6SBC58\n1i3TpvFVejqdjx3jXGQk8VOmEBoc7O2whBDCJ0gCFz4rMiyMB/78Z3KLi4kMDSVYLx/XhkZG3ELU\nTP4iCp+m0+mIi4z0dhhCCOFzZBGbEEII4YckgQshhBB+yO1T6JqmNQF2AyOVUklVHr8d+CtgAr5Q\nSn3u7liEEEKI+sKtI3BN04KAT4FiO4+/A4wGhgIPVyR6IYQQQjjA3SPwN4GPgRcue7wbcFwplQ+g\nadoWYAiwyM3xCOE3rFYr3379NcH792MMCaH9
xIlc17evt8MSQvgIt43ANU17AMhWSv1Y8VDV+pfR\nQH6VdiEQ465YhPBHP65ezYgVKxh/8iSTk5I487//kV9S4u2whBA+wp1T6L8CRmuatgG4FviyyjR5\nPhBVpW8UkOvGWITwO4bTp4mzWivbXbOzOZWV5cWIhBC+xG1T6EqpoRd+rkjijyilLvz1OQp01jQt\nFtv58SHYptuFEBWi2rcnIzCQlmYzAAdatGBUs2ZejkoI4Ss8WchFp2nadCBSKTVT07Q/AKuxzQL8\nTymV6cFYhPB5w4cPZ0VeHnsSEzGGhtJt4kQiQ0O9HZYQwkforFWm6Hxddnahy4JN2LjKVZsSQriJ\nlFIV/mbYsEKXbi8hIarG+ydLKVUhhM+QhC2E46QSmxBCCOGHZAQuhPAaGXELcfVkBC6EEEL4IUng\nokHarhRr9uzBZDJ5OxQhhLgqMoUuGpxXnn2WkamptAT+GRPDcx98QLhcniWE8DMyAhcNyrdr1zIp\nNZWBQHfghfx8PnjvPW+HJYQQV0wSuGhQzpw5Q/Mq7VAgoLTUW+EIIcRVkyl00aBMGz+e/1u9micN\nBnTA4oAABo4Z4+2wGgxZdS6E60gCFw1K4+hobn/tNd744AOCLRZ6TZjA4P79vR2WEEJcMUngosHp\n2LIlz732mrfDEEIIp8g5cCGEEMIPSQIXQggh/JAkcCGEEMIPyTlwIYTbyKpzIdxHErjwuBNZWbz3\nr38RYLEw9be/ZWCPHt4OyaMM5eX8uGoVVoOB6wYPpmXTpt4OSQi/l5KSxO7dikaNghk5cjQBAc5P\nMBsMJfzww1rMZhg+fCBxcU1cEKnrSAIXHnUmN5evn3iCt61WAoAP//EPAl56if7du3s7NI8wmc3M\nefNN7k1MJBhYumUL1ueeo1Xz5nW+Vghh36FD+3n//Xxyc28H8jl2bA6PPTbDqW2Wlxv497/nc+TI\nfUAgP/+8iBdfHOpTSVzOgQuPeuPDD/mj1UogoAMeB+Z/8omXo/KcfUlJjK5I3gATTp9m94YNXo1J\nCH+3ceMJcnMHV7Ri2LmzFSUl+U5tc8+enzlyZCK2ca6OtLTJbNjws7OhupQkcOFRwaGhlFRpmwBz\nYKC3wvG40JAQSqpM7VkAawM6fiHcITDQfElbrzcQGBjk1DZDQkLgkr9WZgIDdU5t09UkgQuPevnJ\nJ3ktKIgzQB7wj4AAXvjb37wdlsd0b9+eXcOGcSoggEJgdpcujBg3ztthucxCplzyTwhPGD/+Olq1\n+g4oITj4MKNHGwkJCXdqm717D2TQoDVAJpBP9+6zGTNmtCvCdRmd1Wr1dgwOy84udFmwCRtXuWpT\n4goZjUb+9cknGMrKeO6xx4iLjPR2SB5ltVrZdfAghYWF3NCvH2HBwXW/yE9I0hbeUlSUR2LiLpo1\na06nTq5ZGGu1Wtm37xcMBgN9+95AUFBIna8ZNqzQJfu+ICEhqsZhvyRwIYTLSAIXDZ0nE7isQhdC\nXDVJ2EJ4j5wDF0IIIfyQJHAhhBDCD0kCF0IIIfyQnAMXHndIKdT33xNosdB86FD69+vn9DZLDAZW\nzJ5NaE4O5tatGT9tGoE1lFJUyckcWr6cQLOZJoMGccPAgU7vXwghPE0SuPCoM+fPc+qDD5iYlQXA\n9iNHOPr883Tt1Mmp7X730UdM/+UXAoHCXbtYYjYz6d57q/U7n59P0vvvMzEzE4A9hw6xPyqKaxpY\nPXYhhP+TBC48ak9iIjdXJG+AAQUFLNu3z+kEHn3yJBfqmUUBwSdO2O2379AhhlUkb4C+RUUsO3hQ\nErgDZMW5EL5FzoELj2rTpg0qNLSyfSYggJhmzZzebmlMTOXPVqA0Otr+/lu3Jin8YoWm8zod4QkJ\nTu9fCCE8TUbgwqN6du7MD5MmkbR2LYFmM4aBA5kyaJDT2+1z333M++ILos+fJ7dlS262M30O0Kl1\na9ZMncp3
q1YRZDJR3K8f04YPd3r/QgjhaVKJTXiF1WrFarW65J69VZktlhoXr12+f4vV6lDfhkqm\nzIW4clKJTdR7Op0Onc7
"text/plain": [
"<matplotlib.figure.Figure at 0x10ddfad10>"
2015-04-14 14:24:23 -04:00
]
},
"metadata": {},
2015-04-14 14:24:23 -04:00
"output_type": "display_data"
}
],
"source": [
"from sklearn import neighbors, datasets\n",
"\n",
"iris = datasets.load_iris()\n",
"X, y = iris.data, iris.target\n",
"\n",
"# create the model\n",
"knn = neighbors.KNeighborsClassifier(n_neighbors=5, weights='uniform')\n",
"\n",
"# fit the model\n",
"knn.fit(X, y)\n",
"\n",
"# What kind of iris has 3cm x 5cm sepal and 4cm x 2cm petal?\n",
"X_pred = [3, 5, 4, 2]\n",
"result = knn.predict([X_pred, ])\n",
"\n",
"print(iris.target_names[result])\n",
"print(iris.target_names)\n",
"print(knn.predict_proba([X_pred, ]))\n",
"\n",
"from fig_code import plot_iris_knn\n",
"plot_iris_knn()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note we see overfitting in the K-Nearest Neighbors model above. We'll be addressing overfitting and model validation in a later notebook."
]
2015-04-14 14:24:23 -04:00
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.9"
}
2015-04-14 14:24:23 -04:00
},
"nbformat": 4,
"nbformat_minor": 0
}