mirror of
https://github.com/donnemartin/data-science-ipython-notebooks.git
synced 2024-03-22 13:30:56 +08:00
Added Data Munging Summary section which contains all the data cleaning and transformation steps described in the notebook.
This commit is contained in:
parent
387922662f
commit
ad54e0ae70
|
@ -2264,6 +2264,94 @@
|
|||
}
|
||||
],
|
||||
"prompt_number": 36
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Data Munging Summary"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Below is a summary of the data munging we performed on our training data set. We encapsulate it in a function because we will need to perform the same operations on our test set later."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"collapsed": false,
|
||||
"input": [
|
||||
def clean_data(df, drop_passenger_id):
    """Return a cleaned, feature-ready copy of a Titanic passenger DataFrame.

    The same steps are meant to be applied to both the training and the
    test set, so every step is written to give the same encoding
    regardless of which values happen to be missing in a particular frame.

    Steps performed:
      * ``Sex_Val``      - ``Sex`` encoded as ``female -> 0``, ``male -> 1``.
      * ``Embarked_Val`` - ``Embarked`` encoded as an integer; missing
        ports are treated as ``'S'``.
      * ``Fare``         - missing values replaced by the mean fare.
      * ``AgeFill``      - ``Age`` with missing values replaced by the
        median age of the passenger's (``Sex_Val``, ``Pclass``) group.
      * ``FamilySize``   - ``SibSp + Parch``.
      * The columns ``Name``, ``Sex``, ``Ticket``, ``Cabin``, ``Embarked``,
        ``Age``, ``SibSp`` and ``Parch`` are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns ``Sex``, ``Embarked``,
        ``Fare``, ``Age``, ``Pclass``, ``SibSp``, ``Parch``, ``Name``,
        ``Ticket`` and ``Cabin`` (plus ``PassengerId`` when
        ``drop_passenger_id`` is true).
    drop_passenger_id : bool
        When true, the ``PassengerId`` column is dropped as well.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left unmodified.

    Raises
    ------
    KeyError
        If a required column is missing.
    ValueError
        If ``Sex`` holds a value other than ``'female'`` / ``'male'``
        (the unmapped entry becomes NaN and cannot be cast to int).
    """
    # Work on a copy so the caller's frame is not silently altered.
    df = df.copy()

    # Transform Sex from a string to a number representation.
    df['Sex_Val'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)

    # Since the vast majority of passengers embarked in 'S', missing
    # values of Embarked are treated as 'S'. Filling *before* building the
    # mapping avoids sorting a mix of NaN and strings and avoids a
    # KeyError when 'S' does not otherwise occur.
    embarked = df['Embarked'].fillna('S')
    embarked_locs = sorted(embarked.unique())

    # Codes start at 1 whether or not the frame had missing ports, so that
    # two frames with the same set of ports always get the same codes
    # (previously 0 was used for NaN only when a NaN was present, which
    # shifted every other code by one).
    embarked_locs_mapping = dict(zip(embarked_locs,
                                     range(1, len(embarked_locs) + 1)))
    df['Embarked_Val'] = embarked.map(embarked_locs_mapping).astype(int)

    # Fill in missing values of Fare with the average Fare. Only the Fare
    # column is touched; missing values elsewhere (e.g. Age) are kept.
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

    # To keep Age intact, the filled ages go into a separate AgeFill
    # column. The median (rather than the mean) of each Sex_Val/Pclass
    # group is used because the Age histogram seems to be right skewed.
    # transform() returns a Series aligned with df's index, so the
    # assignment is row-for-row. A group with no known age stays NaN.
    group_median_age = (df.groupby(['Sex_Val', 'Pclass'])['Age']
                          .transform('median'))
    df['AgeFill'] = df['Age'].fillna(group_median_age)

    # FamilySize is the sum of Parch (number of parents or children on
    # board) and SibSp (number of siblings or spouses).
    df['FamilySize'] = df['SibSp'] + df['Parch']

    # Drop the raw columns that have been replaced by derived features
    # (Sex, Embarked, Age, SibSp, Parch) or are not used at all.
    unused_columns = ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked',
                      'Age', 'SibSp', 'Parch']
    if drop_passenger_id:
        # PassengerId is an identifier, not a feature.
        unused_columns.append('PassengerId')

    return df.drop(unused_columns, axis=1)
|
||||
],
|
||||
"language": "python",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"prompt_number": 37
|
||||
}
|
||||
],
|
||||
"metadata": {}
|
||||
|
|
Loading…
Reference in New Issue
Block a user