From 000fea08624702c05422e18decd1fa7462fc3b9c Mon Sep 17 00:00:00 2001
From: Donne Martin <donne.martin@gmail.com>
Date: Wed, 18 Mar 2015 14:32:28 -0400
Subject: [PATCH] Added section Final Data Preparation for Machine Learning,
 which drops unused columns and converts the DataFrame to a numpy array.

---
 kaggle/titanic.ipynb | 113 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 112 insertions(+), 1 deletion(-)

diff --git a/kaggle/titanic.ipynb b/kaggle/titanic.ipynb
index 3b9409f..ec711a5 100644
--- a/kaggle/titanic.ipynb
+++ b/kaggle/titanic.ipynb
@@ -1,7 +1,7 @@
 {
  "metadata": {
   "name": "",
-  "signature": "sha256:8faa925c9373212bcde3580896d60777b13a18934069bb0bec50503c01d983b0"
+  "signature": "sha256:1fee89283c2ab86fb9af2f3c8a7c9a1585048107972914ca2f77e9456fb2675b"
  },
  "nbformat": 3,
  "nbformat_minor": 0,
@@ -2558,6 +2558,117 @@
       "Additional features we might want to engineer might be related to the Name column, for example honorrary or pedestrian titles might give clues and better predictive power for a male's survival."
      ]
     },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "## Final Data Preparation for Machine Learning"
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Many machine learning algorithms cannot operate on strings, and they usually require the data to be in an array rather than a DataFrame.\n",
+      "\n",
+      "Show only the columns of type 'object' (strings):"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "df.dtypes[df.dtypes.map(lambda x: x == 'object')]"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "metadata": {},
+       "output_type": "pyout",
+       "prompt_number": 37,
+       "text": [
+        "Name        object\n",
+        "Sex         object\n",
+        "Ticket      object\n",
+        "Cabin       object\n",
+        "Embarked    object\n",
+        "dtype: object"
+       ]
+      }
+     ],
+     "prompt_number": 37
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Drop the columns we won't use:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "df = df.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 38
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Drop the Age column since we will be using the AgeFill column instead:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "df = df.drop(['Age'], axis=1)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 39
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Convert the DataFrame to a numpy array:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "train_data = df.values\n",
+      "train_data"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "metadata": {},
+       "output_type": "pyout",
+       "prompt_number": 40,
+       "text": [
+        "array([[   1. ,    0. ,    3. , ...,   22. ,    0. ,    1. ],\n",
+        "       [   2. ,    1. ,    1. , ...,   38. ,    0. ,    1. ],\n",
+        "       [   3. ,    1. ,    3. , ...,   26. ,    0. ,    0. ],\n",
+        "       ..., \n",
+        "       [ 889. ,    0. ,    3. , ...,   21.5,    1. ,    3. ],\n",
+        "       [ 890. ,    1. ,    1. , ...,   26. ,    0. ,    0. ],\n",
+        "       [ 891. ,    0. ,    3. , ...,   32. ,    0. ,    0. ]])"
+       ]
+      }
+     ],
+     "prompt_number": 40
+    },
     {
      "cell_type": "code",
      "collapsed": false,