mirror of
https://github.com/donnemartin/data-science-ipython-notebooks.git
synced 2024-03-22 13:30:56 +08:00
Added matplotlib IPython Notebook. It contains code to clean the data that will be plotted in the notebook, and sets global matplotlib parameters.
This commit is contained in:
parent
557b76f267
commit
21b19dd12f
139
matplotlib/matplotlib.ipynb
Normal file
139
matplotlib/matplotlib.ipynb
Normal file
|
@ -0,0 +1,139 @@
|
||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"name": "",
|
||||||
|
"signature": "sha256:08003effc72ed7242f0098f883755ff31312100e58bf63705b8a1b8c45afc8a4"
|
||||||
|
},
|
||||||
|
"nbformat": 3,
|
||||||
|
"nbformat_minor": 0,
|
||||||
|
"worksheets": [
|
||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# matplotlib"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"* Setting Global Parameters\n",
|
||||||
|
"* Bar Plots, Histograms, subplot2grid\n",
|
||||||
|
"* Normalized Plots\n",
|
||||||
|
"* Scatter Plots, subplots\n",
|
||||||
|
"* Kernel Density Estimation Plots"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import pylab as plt"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"prompt_number": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Prepare data to plot:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"df_train = pd.read_csv('../data/titanic/train.csv')\n",
|
||||||
|
"\n",
|
||||||
|
"def clean_data(df):\n",
|
||||||
|
" \n",
|
||||||
|
" # Get the unique values of Sex\n",
|
||||||
|
" sexes = sort(df['Sex'].unique())\n",
|
||||||
|
" \n",
|
||||||
|
" # Generate a mapping of Sex from a string to a number representation \n",
|
||||||
|
" genders_mapping = dict(zip(sexes, range(0, len(sexes) + 1)))\n",
|
||||||
|
"\n",
|
||||||
|
" # Transform Sex from a string to a number representation\n",
|
||||||
|
" df['Sex_Val'] = df['Sex'].map(genders_mapping).astype(int)\n",
|
||||||
|
" \n",
|
||||||
|
" # Get the unique values of Embarked\n",
|
||||||
|
" embarked_locs = sort(df['Embarked'].unique())\n",
|
||||||
|
"\n",
|
||||||
|
" # Generate a mapping of Embarked from a string to a number representation \n",
|
||||||
|
" embarked_locs_mapping = dict(zip(embarked_locs, \n",
|
||||||
|
" range(0, len(embarked_locs) + 1)))\n",
|
||||||
|
" \n",
|
||||||
|
" # Transform Embarked from a string to dummy variables\n",
|
||||||
|
" df = pd.concat([df, pd.get_dummies(df['Embarked'], prefix='Embarked_Val')], axis=1)\n",
|
||||||
|
" \n",
|
||||||
|
" # Fill in missing values of Embarked\n",
|
||||||
|
" # Since the vast majority of passengers embarked in 'S': 3, \n",
|
||||||
|
" # we assign the missing values in Embarked to 'S':\n",
|
||||||
|
" if len(df[df['Embarked'].isnull()] > 0):\n",
|
||||||
|
" df.replace({'Embarked_Val' : \n",
|
||||||
|
" { embarked_locs_mapping[nan] : embarked_locs_mapping['S'] \n",
|
||||||
|
" }\n",
|
||||||
|
" }, \n",
|
||||||
|
" inplace=True)\n",
|
||||||
|
" \n",
|
||||||
|
" # Fill in missing values of Fare with the average Fare\n",
|
||||||
|
" if len(df[df['Fare'].isnull()] > 0):\n",
|
||||||
|
" avg_fare = df['Fare'].mean()\n",
|
||||||
|
" df.replace({ None: avg_fare }, inplace=True)\n",
|
||||||
|
" \n",
|
||||||
|
" # To keep Age in tact, make a copy of it called AgeFill \n",
|
||||||
|
" # that we will use to fill in the missing ages:\n",
|
||||||
|
" df['AgeFill'] = df['Age']\n",
|
||||||
|
"\n",
|
||||||
|
" # Determine the Age typical for each passenger class by Sex_Val. \n",
|
||||||
|
" # We'll use the median instead of the mean because the Age \n",
|
||||||
|
" # histogram seems to be right skewed.\n",
|
||||||
|
" df['AgeFill'] = df['AgeFill'] \\\n",
|
||||||
|
" .groupby([df['Sex_Val'], df['Pclass']]) \\\n",
|
||||||
|
" .apply(lambda x: x.fillna(x.median()))\n",
|
||||||
|
" \n",
|
||||||
|
" # Define a new feature FamilySize that is the sum of \n",
|
||||||
|
" # Parch (number of parents or children on board) and \n",
|
||||||
|
" # SibSp (number of siblings or spouses):\n",
|
||||||
|
" df['FamilySize'] = df['SibSp'] + df['Parch']\n",
|
||||||
|
" \n",
|
||||||
|
" return df\n",
|
||||||
|
"\n",
|
||||||
|
"df_train = clean_data(df_train)"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"prompt_number": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Setting Global Parameters"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"# Set the global default size of matplotlib figures\n",
|
||||||
|
"plt.rc('figure', figsize=(10, 5))"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"prompt_number": 3
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user