From 21b19dd12f8bfe9014a403ca678887f53e0238a4 Mon Sep 17 00:00:00 2001
From: Donne Martin <donne.martin@gmail.com>
Date: Mon, 6 Apr 2015 08:51:28 -0400
Subject: [PATCH] Added matplotlib IPython Notebook.  Contains code to clean
 the data that will be plotted in the notebook and to set global params.

---
 matplotlib/matplotlib.ipynb | 139 ++++++++++++++++++++++++++++++++++++
 1 file changed, 139 insertions(+)
 create mode 100644 matplotlib/matplotlib.ipynb

diff --git a/matplotlib/matplotlib.ipynb b/matplotlib/matplotlib.ipynb
new file mode 100644
index 0000000..9c0fbf4
--- /dev/null
+++ b/matplotlib/matplotlib.ipynb
@@ -0,0 +1,139 @@
+{
+ "metadata": {
+  "name": "",
+  "signature": "sha256:08003effc72ed7242f0098f883755ff31312100e58bf63705b8a1b8c45afc8a4"
+ },
+ "nbformat": 3,
+ "nbformat_minor": 0,
+ "worksheets": [
+  {
+   "cells": [
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "# matplotlib"
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "* Setting Global Parameters\n",
+      "* Bar Plots, Histograms, subplot2grid\n",
+      "* Normalized Plots\n",
+      "* Scatter Plots, subplots\n",
+      "* Kernel Density Estimation Plots"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "import pandas as pd\n",
+      "import numpy as np\n",
+      "import pylab as plt"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 1
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Prepare data to plot:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "df_train = pd.read_csv('../data/titanic/train.csv')\n",
+      "\n",
+      "def clean_data(df):\n",
+      "    \"\"\"Return a cleaned copy of the Titanic frame with derived columns.\n",
+      "\n",
+      "    Expects the columns Sex, Embarked, Fare, Age, Pclass, SibSp and Parch.\n",
+      "    Adds Sex_Val (integer code of Sex), Embarked_Val_* dummy columns,\n",
+      "    AgeFill (Age with gaps filled by the Sex_Val/Pclass group median)\n",
+      "    and FamilySize (SibSp + Parch), and fills missing Fare values with\n",
+      "    the mean Fare. The caller's DataFrame is not modified.\n",
+      "    \"\"\"\n",
+      "    # Work on a copy so the caller's frame is left untouched\n",
+      "    df = df.copy()\n",
+      "\n",
+      "    # Get the unique values of Sex, sorted so the encoding is stable\n",
+      "    sexes = sorted(df['Sex'].dropna().unique())\n",
+      "\n",
+      "    # Generate a mapping of Sex from a string to a number representation\n",
+      "    genders_mapping = dict(zip(sexes, range(len(sexes))))\n",
+      "\n",
+      "    # Transform Sex from a string to a number representation\n",
+      "    # (astype(int) raises if any Sex value is missing)\n",
+      "    df['Sex_Val'] = df['Sex'].map(genders_mapping).astype(int)\n",
+      "\n",
+      "    # Fill in missing values of Embarked before encoding it.\n",
+      "    # Since the vast majority of passengers embarked in 'S',\n",
+      "    # we assign the missing values in Embarked to 'S'. The raw\n",
+      "    # Embarked column is left as is; only the dummies see the fill.\n",
+      "    embarked_filled = df['Embarked'].fillna('S')\n",
+      "\n",
+      "    # Transform Embarked from a string to dummy variables\n",
+      "    # (rows with a missing Embarked are counted under Embarked_Val_S)\n",
+      "    embarked_dummies = pd.get_dummies(embarked_filled, prefix='Embarked_Val')\n",
+      "    df = pd.concat([df, embarked_dummies], axis=1)\n",
+      "\n",
+      "    # Fill in missing values of Fare with the average Fare.\n",
+      "    # Only the Fare column is touched; a frame-wide replace of None\n",
+      "    # would also overwrite missing values in other columns.\n",
+      "    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())\n",
+      "\n",
+      "    # Determine the Age typical for each passenger class by Sex_Val.\n",
+      "    # We'll use the median instead of the mean because the Age\n",
+      "    # histogram seems to be right skewed.\n",
+      "    # To keep Age intact, the filled ages go in a new AgeFill column;\n",
+      "    # transform() returns a result aligned with df's row index.\n",
+      "    ages_by_group = df.groupby(['Sex_Val', 'Pclass'])['Age']\n",
+      "    df['AgeFill'] = ages_by_group.transform(\n",
+      "        lambda ages: ages.fillna(ages.median()))\n",
+      "\n",
+      "    # Define a new feature FamilySize that is the sum of\n",
+      "    # Parch (number of parents or children on board) and\n",
+      "    # SibSp (number of siblings or spouses):\n",
+      "    df['FamilySize'] = df['SibSp'] + df['Parch']\n",
+      "\n",
+      "    return df\n",
+      "\n",
+      "df_train = clean_data(df_train)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 2
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "## Setting Global Parameters"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# Global default figure size for every matplotlib figure: (width, height) in inches\n",
+      "plt.rcParams['figure.figsize'] = (10, 5)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 3
+    }
+   ],
+   "metadata": {}
+  }
+ ]
+}
\ No newline at end of file