mirror of
https://github.com/donnemartin/data-science-ipython-notebooks.git
synced 2024-03-22 13:30:56 +08:00
143 lines
133 KiB
Plaintext
143 lines
133 KiB
Plaintext
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"# scikit-learn"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"* Random Forest Classifier\n",
|
||
|
"* Random Forest Regressor"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 1,
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"%matplotlib inline\n",
|
||
|
"import numpy as np\n",
|
||
|
"import matplotlib.pyplot as plt\n",
|
||
|
"import seaborn; \n",
|
||
|
"from sklearn.linear_model import LinearRegression\n",
|
||
|
"from scipy import stats\n",
|
||
|
"import pylab as pl\n",
|
||
|
"\n",
|
||
|
"seaborn.set()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"## Random Forest Classifier"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"## Ensembles of Estimators: Random Forests\n",
|
||
|
"\n",
|
||
|
"A **Random Forest** is a common ensemble method made of up many decision trees."
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 2,
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAk4AAAFFCAYAAAAadmKrAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzsnWd4VcXWgN8TQiihC6j0IkyogRCIqIQmhCIIKk1KAOFT\nioqICHhtqCDg5SpgoapUReBerlwFpEiviiDFERDpPRAIAdL292POOaScJCch5KSs93nOk+zZs2fW\n3rNnZs2aNbNtlmUhCIIgCIIgpI6XpwUQBEEQBEHILojiJAiCIAiC4CaiOAmCIAiCILiJKE6CIAiC\nIAhuIoqTIAiCIAiCm4jiJAiCIAiC4CbenhYgM1BKVQL+SuNlnbXWy++BOFkapdRXQB/gn1rr1+5h\nPn8DFVycigVuAqeAzcBMrfWueyVHaiil+gJzgF+01g3vIp04+791tNYHMkK2uyWFMkiJDVrr5hkv\njedQSn0JhNoPg+7F+5YVyx9AKdUMWAdc1lqXSsf1BTHP7inAHygO3AAOAcuBz7XW4S6u+xvz7nXQ\nWv8vneJnCsmVnVLqPuBfQBugCHAeaA2swoP3ppQqAvhqrc/GC3sHeAtYqrXuktky5TRyheKUiF3A\nbTfiXbrXgmRxMmuDrz+BC/GO82Aa3yqAH/CcUmqC1npMJsmTGCvR37tNKyttnLYTOJEorBhQ2/7/\nZhfX7LunEmUy9o7/6XhBAzBtxL0gq5V/fNIsl1KqMbAYKGsPugj8BjwANAKCgJeVUt201huTyTOr\nPo/EuJJ1MdAc05/sB/IDx+PFzfR7U0r1ACYDzwFn453KyHYs15PbFCcL6KK1TtxZCJ5jnNZ6buJA\ne4c2DHgXGKWUuqG1/iDTpYN/A9swVrC7oQbm/Tt21xJlEFrrronDlFJNgfWApbUOznypMp3OQCFg\nNcZa0F0p9YrWOjKD88ly5X83KKWeBL7D9CFLgDe11jre+ZoYa0wr4Eel1GNa6z2JkrFllrx3SZKy\nU0qVxChNFvCE1nptvHMtMc/lVCbLCTAeuN9F+DRgEXA9c8XJmeQ2xUnIJtg7rnFKqWvAFOBdpdR3\nWus/M1mOa8C1DEhHpx4rS5BdOrOMoo/972KgNFAP6AZ8mZGZZKPyTxWl1IPAbEz/8bHWenjiOFrr\ng0qpdphpwCbALKBBpgqaQSRTdiXj/Z/Amqa1TqtbyD1Ha30ZuOxpOXIK4hwuZGm01tOAXzDv6igP\niyPkIOwKQEuM1WAVsMx+aoDHhMoejANKABp4NblIWutY4BX7YT2l1GOZIFtm4TQ6aK2jPSlIMuS2\nAVCmIhanVFBK1cZ03HmB0VrrCYnO9wLmYua5AxM5Dz4B9MPM95cCYoCTwA/ARK31+URpxQHhWuvi\nSqlBwAtANYzF40dgpNb6olKqPmYKqwlmXn0PMFZrvTJRen8DFbTWXkqpocBgoDLGifEnzDSZ21MH\nSqnCwHDgGaCq/X5+x4zO52it41K4/G6YgxmttktGricx9xYI+AKnge+BD7XW55K5pg5mKrAFUAYI\nBzbZr9kVL15fXDiHK6V8gKFAD6AmRrE7A6zFONYnGKXGczCtrbU+mOjcU8DzQEO7/Ocw5TNBa304\nUVyHPJ9i3oF3gI4Y8/wF+32PTe6+M4J4iy0OYKwzs4H6mBHtOK31p/Z4+YBBQE+Mv5oXprNdAEzT\nWrv0NVRKBWM63EcwPlcXMIrNeK31URfxfwaCga+11v3ScCs97TL9prU+rZT6DhgLPKyUqqG1PuQi\nr76k4/m7cjCOtxCjM+adfQt4DNPW/GZP5yellK/9XDfgQcx7Nh94V2sdkyifPJh3sgcQANwH3MKU\n13+AyXYrarqwT6F3tx9+rLVO0WdGa/2r/Zn9qrXe72YeaW0301oX76ruxjsGsMU77qu1npuS47td\neXwRaIx5Zy4Ba4APElvT01KW8d5LB98rpQD6aa2/Tsk53D6AGAE8AVTE9GX7gK+BL+0KcPz4jvsr\nBTTF1FV/+3P8DZiqtf6WHIxYnFLBXtnfsR++Ze80AFBKlQOmYkasoxMpTbOA/2IaxRhgL6ZjUZgX\nbbdSqoSLLG1KqQWYhrkocBhTYUKBtUqp9sB2zPz6X0Ak8DCwQinVxEV6llJqGma6qxSmQhTHOA/+\nYnfwTBX7ff8KvA1UB45iGrPGwAx7/j7upJUOttr/llJK+cWTyaaUmoHxQ2qF8UPah7nPl4C9Sqkk\n0wNKqd4YB+B+mNUwezFl2BnYopRq5UIGK971NnueHwF1Mb4PBzDm+wGY59oopTTs6XgppeZjfERa\nYRTk3zDl3t8uf+dknkkZTHkMAqIwCklZjLK9TSlVNJnrMpKiGIWmJsY5tihwEMD+bm/EOKrWw3RM\nfwJ1gEnAZlfvv1LqH8DPwJP2oL1AQe48jzYu5EivM25v+9/F4JyS2YsZradmdUrP809OxvaYd7wp\npk5HA48CPyil2trPvYp5v49jOrc3gM/jJ6KUyguswAzk2mD8WfYAEZj39C1goz1eemkM5LPfx9pU\n4gKgtZ6bBqUpTe1mWutiBtXdLZiyd7DZ/jufKH7i+j4KUye6YJTjvUABzHu42z6Yc8RNa1mes8vl\nGIwcsMuUeACVWKbGmLr7ClAeMxA+g3n/ZgCr7Iq7q+fxJqbtqoup2zcxg51FSqlhLq7JMeRGxSk9\nJswJmI62APAZOCvgl5jOYp3W+mNHZKVUB0xDHwE011pX1loHaa0rAs3s4WUxyktiimAqVm+tdSWt\ntT/GKmJhVjstB74FSmutG2A0/22YsnT1stow1pgpQBmtdZA97yWY0fwipVT+lG7ePvJZhrEyLQfK\naa3raK1rAbUwS4/b2J/TvSC+M3+ZeP+/gmnsTgMttdbltNaNMKt6PsMoUMuUUoXi3YsCZmIarncx\nz7ER5plMxlhhv7WPrJOjHdAW01lW1lrX1loH2tP4D6ajH+fGff0DeBa4inEwrWQvnwcwjrX5gQVK\nqVouru2MWfbdQGv9kNa6LqbRisR0rP/nRv53Szm7DFXt919Oa73efu4rjAVtC1Bda6201vUxFs9N\nGAvi7PiJ2S1vYzHWv25a6/vtZXM/ppEuCHyjlCqfSI4+GAdet1deKqXqYZQ4C/gm3qlF9r+9U1Ew\nMvL5D8B05mXtz7EippPMg7FgFbLnU0NrXR1jHQDoq5QqHi+dF4AQTGfpr7Wubm93ygBdgThMJ9cp\nDbIlxjFwidJaH7mLdJKQznYzrXXxruuu1roJd7avsLTWwfbfqnjREvQzSqnm9nSjgRe01g/a3+1y\nmPevEKZNdpCmstRar7TL5VDeXnchUwLsCuj3mIH0Ykz/0FBrXQNj7TuJ6Xs+d3G5DTM4fR+4L94z\ndNSlt5VSOXZGK7cpTjbgmFIqLpVfAsdQ+xRUKEabb6OU6op5sVsCYdypRA5aYkahU7XWGxKltRGj\n+IBp7F3xpdZ6QbxrNmGUIzBKQl+t9U37uRvYlTmMudQVq7TWwxxz8VrrCKAXxmpUgTsj7+TojLEa\n/IHp0C7Gk+0QdyryIKVUmveCcQPHShAbxvqGXdkbg+n4esXrsNFaR2qthwI7MKOo/vHSehXwARZr\nrd91mKG11jFa6xGYEVdRjNk6ORwjwx+11mfi5RuBUeZW29NJFvsoboRd/ue11j/ES+eW1vpVjJKa\nH6M0JMYC+mitf4t33XbudPxBKeWfgXyktb5kz/8qgFIqEPP8LgKddLzpYK31KcxUbwTwpFKqbry0\nxtr/DtNafxfvmhhtVlQuxgwsXol3DVrrk1rrPxNPj6WC453fqbX+O164o+EvScoKRkY+/zCgv/39\nQWt9nTudlQ0YpLXeGy/+J5j2xYs7W0eAsULHAG8ntvBorZdgVktC8u2OOxSLJ3NGk552M6118a7r\nrp20DsBH2/9O1lrPiJdvJMbyfRl4KJ61KzPKcgjGV20f8Kyj/trz2I2x+lpAT6WUq3x+1Fq/pe3T\nxVrrKMCx91+Ru5QtS5P
|
||
|
"text/plain": [
|
||
|
"<matplotlib.figure.Figure at 0x104e7a790>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"import fig_code\n",
|
||
|
"fig_code.plot_example_decision_tree()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 7,
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAd0AAAFRCAYAAAAxT3fNAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzs3Xd8VFX6x/F3KiH03jsoHURECaKg2BXddVVsa+8/d+2s\njaauvZdVV13dtbu69t4RxC4qGgERBKT33pLfHzeByWQSkpBkhnA/r1deOme+5zzPnbnDufe55zxP\nUm5urpCQkJCQkJCKJzneDoSEhISEhOwohJNuSEhISEhIJRFOuiEhISEhIZVEOOmGhISEhIRUEuGk\nGxISEhISUkmEk25ISEhISEglkVrhFpKSdo1q6YzsYl6Hmu1cM4k+2Yw8khaRommse4yrR/N+vH0M\nNaEm1FSoJt7246vJzf1KEYR3uiHlzmvsf0TUhAvtqVafA+LhU0hISEgiEE66IeVOJrVTinivNvUq\n1ZmQkJCQBCKcdEPKnTX8vrqI9+Yzu1KdCQkJCUkgwkk3pNw5lQ8f4cfo9rEsqMZz8fApJCQkJBGo\n+IVUITscDdjYmEtv4ILO9KpO2m/MWswDw2NMxiEhISE7CpUx6XaOep21ldehpgpojmY8HptP2hqS\nD2BXLFXwfEj44wg1oSbUlEkTb/vx1hS5erkyJt3o5dax2kJNFdU03vJ6XTzsh5pQE2ripom3/Xhr\nYhI+0w0JCQkJCakkwvByqImXJt72Q02oCTXh77uiNGF4OdQkpCbe9kNNqAk14e+7ojQxCcPLISEh\nISEhlUQYXg418dLE236oCTWhJvx9V5QmDC+HmoTUxNt+qAk1oSb8fVeUJiZheDkkJCQkJKSSCMPL\noSZemnjbz3qd5tkMyqTGcmaeyaq6cfQn1ISaKqSJt/14a8LwcqhJSE3c7N/H3j244iLqwlo8wrQW\n/O9wFiaiz6Em1Gxnmnjbj7cmJmF4OWSH4yuqN+WIgXkTLmTgHNpP4tw4uhYSElLFCcPLoSZemrjZ\nf4d9LqJRtCAJ7djNlnM2YXwONaFmO9PE2368NWF4OdQkpCYu9ufTu5gTf1NU34TwOdSEmu1QE2/7\n8dbEJAwvhyQkyyrw3NyHN98NKh4V4vew9GBISEgFEoaXQ028NIXaNpJ1O39owq51qLOYRWnMOYEH\nytP+oXiYydPo3Z50yMVLLN2ZD4Xh5VATarZVE2/78daE4eVQk5CaAm23cfxZHFgneLwK9WbTdgRf\njeHx8rR/GneNoEk99qtJzdnM3JcJAxlXhuMINaEm1CSe/XhrYlIZk25IyFb5kJo92D1iwgUtSO3I\nIYt4qkHwvLXcGMNHgr98oqMylcKN9Mhl/2U0vZqPMoMb75CQkCpIGF4ONfHSFGj7gZ7DqB9Doxtt\nvmH3ISyuZB8rTLOeATex11oOO45GXUmdz7DH+KUd/zyQ2Ynmc6gJNaXQxNt+vDVheDnUJKRmc1sK\nq2awtmGwZbYAv7OkOd9gTRx8LHfNf2i5gv1a0eEEklLy2huTdA4d7+CYfTkzLYF8DjWhpgyaeNuP\ntyYmYXi5ijGWmm9wTBu6/c6Mdjx3Mr+Xt53/0GwyR1Qnoy5zzyA7bRvGO4eZd/DTruwS2b4JX/PZ\nYVsm3O2euVxyAR3fQkqM94fS4wZ6Xc06uI5dctm7BnVzePriUvzAQ0JCEoswvFyFNG/SYgHnj6Fl\nquDB4Hsc9hCPnp63Src8bN3JkF4ceQK1kvA7OTcx6FzuqhfMk2UKPx3Cd3eQPpSd2pP2A2s/4PeT\neUkVWVH8PvUH0nc1asXoCC1JS2W3HHKu55SjGdgp7/v7jiE38s5wnqksn0NNqCmDJt72460Jw8s7\nguZHTruIlvkNSRhC3Uc5ZBXX1CgHW+Oo143RgyLmjOYkX8KuYxhwHQ+WwucCbZ1wHndfT58ldGzO\n95cG1w4J/9mXVPMJO51ARm0FEzxHMpZF3Xj+Do48g0GNI26Ie5JRgwPG8MYIPq8Mn0NNqCmjJt72\n462JSZgco4rwAxk70SPWe4fQ8SF6lYed8QzaN8aCp2poR99tHT8NI/j6dp69lJ+2dbxE41Smfcq0\nJDTAlKj3l5H7Ae8OZUkNejWOEYHuQHo19q0Uh0NCQsqVMLxcRTS1qJYWO4SsGkkN6K4cvosGtE2K\nIYLq1MmzUe7hp0lkvsLeDejQmv8ewNyyjBNvTUss4NNfaT2I1LH4Lu+9eSxcxQcjeBWda9Ekxtig\nLo3F/qwT9thDzQ6libf9eGvC8HJV17Qh+2V+OCDGCfAGM/bn6fKwlczrU+neMbi5LcB0vo3Qllv4\naTRd+nD6pTRJxmd0u5JXRnFbWtH9Eur7iXx9Adn3sQy71KHRMuYn8e25PBLZYSZfb6R99I90Jabx\niaI/64Q99lCzQ2nibT/empiEq5erEOv51xu0O4hm+W3fsfJHnjyGDeVh42R+voL3ruDgmhHtTzKt\nLf8uDxuRvE2TwZy3V0QZvj2o1Y6jRzLl77xS3jZjsQGj2COT3j2YN5Ql2zLeuYxXcJItlJhjGG/c\nyU4X0jP/OdBG3MFnFwaLy0JCQrYzwkm3CnEx3zzM/13DMa3YaQGzNvDqaL5UjtmWhjPqGia3Z/d0\nMpawqCl3HV8BW5O+Y/DFERNuPk1IbcZeKmHSvZeuuPQ8ujYm5UP+cBVvjuT24rZJLSLlZv68M4OS\n2DiTqR157NgSfk5tWNePv4zglDZ0S6P6z3x6Ev9qGsy/ISEh2xnhpFvFOI0ZuEkwyZY45FEa6pBz\nY5ALOT8fcmcVMOFCJhlFPUPOpEZF2IxkLqlpXHkmO+e3DaFBH44dzfxri3h2swG3cc0I9s/c0tzr\nSXZ5gguPL6H9gawcyN15LyvsOy2OUQxtysENaL2MudN5byRPbMu+7JCQHZVw0g1JaNby63LUjmrP\nxUx+rWj7t3HQ1RETbj71SW7DIEVMutex+xkMyoxqP4721/Bn/K/8vS1/RjLsFM5vu+UZfuOFdBtN\nnWt5P56+hYRsj4RbhkISmrMYey9f5kS1P8v0LgUrD1UI6TQpKolFjRhh73xS6de2iNXkLYMtyQnP\nMpLbMbRt1KK5hqTsxkHTYqTsDAkJKZ4qvWVoPns/wh8b03oDG2qyehj/Sim+X7yXmu8omhL1q8Ee\nR/HAbcFz452rUXsuk3bmlf2DLUp1KtDHrLYsncHGNjF+KytYWtQ46dTOFVUyKY/cYO9ton8/WV9Q\nvwdtY2gNoNlHHN6etZXlT6jZrjTxth9vzY63Zeg9amcz+FI65k+y88m5iiXXc0Nl+xNqyt6vI9mX\nBAUPKPq5ZoX4eBLZN7DXlWRFhoW+Zfki/oP5scZpxZL32Ts6kcgaTAnKCY6vKJ/LS1Ob3+axHA2j\nhbNYk87HlelPqNnuNPG2H29NTKrsM92POW0UHSPvNBqT/GcOvZHXhvN9vHyrDN6k+Vcc0Zjmy1la\nj+9PjcMinMrkEVp8R1Ynks+0bQUY8knDAfztGi7cmd3qUudXJi/iuauCSSfmqvBjmTeCf+LMfaiX\nhN/Y8CAfXcJjtoMQcz9WX8MXB3FQ9B37O3xzKXPi4lhIyHZMlQ0vd2DXWKG9LmS8yZG27FtNtLDE\nNmuepnMD/nplRH7kbPa7k2Z/5e0E8bncxl7HgNs4YxB9T6HGXHIe4ojWPHIIM7fVVr/g7/lV/G89\nex4YTLZsJfvWGL4fy6iHOWEV8zP5YTTfpwQTbsKcL8VpjuLFW2nxR7q0J20OOf9lyu48m6g+h5qE\n0MTbfrw1O154eQOrinJoGQui+iZaWGKbNAs4d1hUEZvOpH3PPj/wYPeCz+G2+/DTnRx1AXtXz3vd\njORz6HQPx63m1MxgsfM226oR/OWUZpyBwd+i0tpKFE1ndODU6+mbzt4r+GIUH+dFERYmos+hJmE0\n8bYfb01Mquzq5WlM3BSj/StW1S94t1elmEhG+yJCngfT8uFgm0uVojm9q8doP5puY9i70h2qYuQV\nofjyb7x23ZYJNyQkpAx
|
||
|
"text/plain": [
|
||
|
"<matplotlib.figure.Figure at 0x100431790>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"from sklearn.ensemble import RandomForestClassifier\n",
|
||
|
"clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)\n",
|
||
|
"visualize_tree(clf, X, y, boundaries=False);"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"The following data scenarios are not good for random forests:\n",
|
||
|
"* y: lots of 0, few 1\n",
|
||
|
"* Structured data like images, neural network might be better\n",
|
||
|
"* Small data size, which might lead to overfitting\n",
|
||
|
"* High dimensional data, linear model might work better"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {
|
||
|
"collapsed": true
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": []
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 2",
|
||
|
"language": "python",
|
||
|
"name": "python2"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 2
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython2",
|
||
|
"version": "2.7.9"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 0
|
||
|
}
|