From 000fea08624702c05422e18decd1fa7462fc3b9c Mon Sep 17 00:00:00 2001 From: Donne Martin Date: Wed, 18 Mar 2015 14:32:28 -0400 Subject: [PATCH] Added section Final Data Preparation for Machine Learning, which drops unused columns and converts the DataFrame to a numpy array. --- kaggle/titanic.ipynb | 113 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 112 insertions(+), 1 deletion(-) diff --git a/kaggle/titanic.ipynb b/kaggle/titanic.ipynb index 3b9409f..ec711a5 100644 --- a/kaggle/titanic.ipynb +++ b/kaggle/titanic.ipynb @@ -1,7 +1,7 @@ { "metadata": { "name": "", - "signature": "sha256:8faa925c9373212bcde3580896d60777b13a18934069bb0bec50503c01d983b0" + "signature": "sha256:1fee89283c2ab86fb9af2f3c8a7c9a1585048107972914ca2f77e9456fb2675b" }, "nbformat": 3, "nbformat_minor": 0, @@ -2558,6 +2558,117 @@ "Additional features we might want to engineer might be related to the Name column, for example honorrary or pedestrian titles might give clues and better predictive power for a male's survival." 
] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Final Data Preparation for Machine Learning" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Many machine learning algorithms cannot operate on strings, and they usually require the data as an array rather than a DataFrame.\n", + "\n", + "Show only the columns of type 'object' (strings):" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "df.dtypes[df.dtypes == 'object']" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 37, + "text": [ + "Name object\n", + "Sex object\n", + "Ticket object\n", + "Cabin object\n", + "Embarked object\n", + "dtype: object" + ] + } + ], + "prompt_number": 37 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Drop the string columns listed above, since we won't use them:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "df = df.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 38 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Drop the Age column since we will be using the AgeFill column instead:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "df = df.drop(['Age'], axis=1)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 39 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Convert the DataFrame to a numpy array:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "train_data = df.values\n", + "train_data" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 40, + "text": [ + "array([[ 1. , 0. , 3. , ..., 22. , 0. , 1. ],\n", + " [ 2. , 1. , 1. , ..., 38. , 0. , 1. ],\n", + " [ 3. , 1. , 3. , ..., 26. , 0. , 0. 
],\n", + " ..., \n", + " [ 889. , 0. , 3. , ..., 21.5, 1. , 3. ],\n", + " [ 890. , 1. , 1. , ..., 26. , 0. , 0. ],\n", + " [ 891. , 0. , 3. , ..., 32. , 0. , 0. ]])" + ] + } + ], + "prompt_number": 40 + }, { "cell_type": "code", "collapsed": false,