mirror of
https://github.com/donnemartin/data-science-ipython-notebooks.git
synced 2024-03-22 13:30:56 +08:00
Added section Final Data Preparation for Machine Learning, which drops unused columns and converts the DataFrame to a numpy array.
This commit is contained in:
parent
2662e2bb03
commit
000fea0862
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"metadata": {
|
||||
"name": "",
|
||||
"signature": "sha256:8faa925c9373212bcde3580896d60777b13a18934069bb0bec50503c01d983b0"
|
||||
"signature": "sha256:1fee89283c2ab86fb9af2f3c8a7c9a1585048107972914ca2f77e9456fb2675b"
|
||||
},
|
||||
"nbformat": 3,
|
||||
"nbformat_minor": 0,
|
||||
|
@ -2558,6 +2558,117 @@
|
|||
"Additional features we might want to engineer might be related to the Name column, for example honorrary or pedestrian titles might give clues and better predictive power for a male's survival."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Final Data Preparation for Machine Learning"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Many machine learning algorithms do not work on strings and they usually require the data to be in an array, not a DataFrame.\n",
|
||||
"\n",
|
||||
"Show only the columns of type 'object' (strings):"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"collapsed": false,
|
||||
"input": [
|
||||
"df.dtypes[df.dtypes.map(lambda x: x == 'object')]"
|
||||
],
|
||||
"language": "python",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"metadata": {},
|
||||
"output_type": "pyout",
|
||||
"prompt_number": 37,
|
||||
"text": [
|
||||
"Name object\n",
|
||||
"Sex object\n",
|
||||
"Ticket object\n",
|
||||
"Cabin object\n",
|
||||
"Embarked object\n",
|
||||
"dtype: object"
|
||||
]
|
||||
}
|
||||
],
|
||||
"prompt_number": 37
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Drop the columns we won't use:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"collapsed": false,
|
||||
"input": [
|
||||
"df = df.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1)"
|
||||
],
|
||||
"language": "python",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"prompt_number": 38
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Drop the Age column since we will be using the AgeFill column instead:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"collapsed": false,
|
||||
"input": [
|
||||
"df = df.drop(['Age'], axis=1)"
|
||||
],
|
||||
"language": "python",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"prompt_number": 39
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Convert the DataFrame to a numpy array:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"collapsed": false,
|
||||
"input": [
|
||||
"train_data = df.values\n",
|
||||
"train_data"
|
||||
],
|
||||
"language": "python",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"metadata": {},
|
||||
"output_type": "pyout",
|
||||
"prompt_number": 40,
|
||||
"text": [
|
||||
"array([[ 1. , 0. , 3. , ..., 22. , 0. , 1. ],\n",
|
||||
" [ 2. , 1. , 1. , ..., 38. , 0. , 1. ],\n",
|
||||
" [ 3. , 1. , 3. , ..., 26. , 0. , 0. ],\n",
|
||||
" ..., \n",
|
||||
" [ 889. , 0. , 3. , ..., 21.5, 1. , 3. ],\n",
|
||||
" [ 890. , 1. , 1. , ..., 26. , 0. , 0. ],\n",
|
||||
" [ 891. , 0. , 3. , ..., 32. , 0. , 0. ]])"
|
||||
]
|
||||
}
|
||||
],
|
||||
"prompt_number": 40
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"collapsed": false,
|
||||
|
|
Loading…
Reference in New Issue
Block a user