data-science-ipython-notebooks/matplotlib/matplotlib.ipynb

{
 "metadata": {
  "name": "",
  "signature": "sha256:08003effc72ed7242f0098f883755ff31312100e58bf63705b8a1b8c45afc8a4"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# matplotlib"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "* Setting Global Parameters\n",
      "* Bar Plots, Histograms, subplot2grid\n",
      "* Normalized Plots\n",
      "* Scatter Plots, subplots\n",
      "* Kernel Density Estimation Plots"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import pandas as pd\n",
      "import numpy as np\n",
      "import pylab as plt"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Prepare data to plot:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "df_train = pd.read_csv('../data/titanic/train.csv')\n",
      "\n",
      "def clean_data(df):\n",
      "    \"\"\"Return a cleaned copy of the Titanic passenger data.\n",
      "\n",
      "    The caller's frame is not modified. The returned frame keeps every\n",
      "    input column and adds:\n",
      "\n",
      "    * Sex_Val: integer code for Sex (its position among the sorted\n",
      "      distinct values of Sex)\n",
      "    * Embarked_Val_<port>: one dummy column per value of Embarked,\n",
      "      with a missing Embarked counted as 'S'\n",
      "    * AgeFill: Age, with missing values replaced by the median Age of\n",
      "      the passenger's (Sex_Val, Pclass) group\n",
      "    * FamilySize: SibSp + Parch\n",
      "\n",
      "    Missing values of Fare are replaced by the mean Fare.\n",
      "    \"\"\"\n",
      "    # Work on a copy so the frame passed in by the caller stays untouched\n",
      "    df = df.copy()\n",
      "\n",
      "    # Generate a mapping of Sex from a string to a number representation.\n",
      "    # NaN is dropped before sorting because strings and NaN cannot be\n",
      "    # ordered against each other.\n",
      "    sexes = sorted(df['Sex'].dropna().unique())\n",
      "    genders_mapping = {sex: val for val, sex in enumerate(sexes)}\n",
      "\n",
      "    # Transform Sex from a string to a number representation\n",
      "    df['Sex_Val'] = df['Sex'].map(genders_mapping).astype(int)\n",
      "\n",
      "    # Since the vast majority of passengers embarked in 'S', missing\n",
      "    # values of Embarked are counted as 'S'. The fill has to happen before\n",
      "    # get_dummies, otherwise those rows get 0 in every Embarked_Val_*\n",
      "    # column. The Embarked column itself is left as it was.\n",
      "    embarked = df['Embarked'].fillna('S')\n",
      "\n",
      "    # Transform Embarked from a string to dummy variables\n",
      "    embarked_dummies = pd.get_dummies(embarked, prefix='Embarked_Val')\n",
      "    df = pd.concat([df, embarked_dummies], axis=1)\n",
      "\n",
      "    # Fill in missing values of Fare (and only Fare) with the average Fare\n",
      "    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())\n",
      "\n",
      "    # Determine the Age typical for each passenger class by Sex_Val.\n",
      "    # We'll use the median instead of the mean because the Age\n",
      "    # histogram seems to be right skewed. transform() returns a Series\n",
      "    # aligned with df's index, so it can be used directly as fill values.\n",
      "    median_age = df.groupby(['Sex_Val', 'Pclass'])['Age'].transform('median')\n",
      "\n",
      "    # To keep Age intact, the filled ages go into a new column, AgeFill\n",
      "    df['AgeFill'] = df['Age'].fillna(median_age)\n",
      "\n",
      "    # Define a new feature FamilySize that is the sum of\n",
      "    # Parch (number of parents or children on board) and\n",
      "    # SibSp (number of siblings or spouses):\n",
      "    df['FamilySize'] = df['SibSp'] + df['Parch']\n",
      "\n",
      "    return df\n",
      "\n",
      "df_train = clean_data(df_train)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Setting Global Parameters"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Use 10 x 5 as the default size of every matplotlib figure\n",
      "plt.rcParams['figure.figsize'] = (10, 5)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    }
   ],
   "metadata": {}
  }
 ]
}
Added matplotlib IPython Notebook. Contains code to clean data, data will be plotted in the notebook and setting of global params. 2015-04-06 20:51:28 +08:00			`{`
			`"metadata": {`
			`"name": "",`
			`"signature": "sha256:08003effc72ed7242f0098f883755ff31312100e58bf63705b8a1b8c45afc8a4"`
			`},`
			`"nbformat": 3,`
			`"nbformat_minor": 0,`
			`"worksheets": [`
			`{`
			`"cells": [`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"# matplotlib"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"* Setting Global Parameters\n",`
			`"* Bar Plots, Histograms, subplot2grid\n",`
			`"* Normalized Plots\n",`
			`"* Scatter Plots, subplots\n",`
			`"* Kernel Density Estimation Plots"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"collapsed": false,`
			`"input": [`
			`"import pandas as pd\n",`
			`"import numpy as np\n",`
			`"import pylab as plt"`
			`],`
			`"language": "python",`
			`"metadata": {},`
			`"outputs": [],`
			`"prompt_number": 1`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"Prepare data to plot:"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"collapsed": false,`
			`"input": [`
			`"df_train = pd.read_csv('../data/titanic/train.csv')\n",`
			`"\n",`
			`"def clean_data(df):\n",`
			`" \n",`
			`" # Get the unique values of Sex\n",`
			`" sexes = sort(df['Sex'].unique())\n",`
			`" \n",`
			`" # Generate a mapping of Sex from a string to a number representation \n",`
			`" genders_mapping = dict(zip(sexes, range(0, len(sexes) + 1)))\n",`
			`"\n",`
			`" # Transform Sex from a string to a number representation\n",`
			`" df['Sex_Val'] = df['Sex'].map(genders_mapping).astype(int)\n",`
			`" \n",`
			`" # Get the unique values of Embarked\n",`
			`" embarked_locs = sort(df['Embarked'].unique())\n",`
			`"\n",`
			`" # Generate a mapping of Embarked from a string to a number representation \n",`
			`" embarked_locs_mapping = dict(zip(embarked_locs, \n",`
			`" range(0, len(embarked_locs) + 1)))\n",`
			`" \n",`
			`" # Transform Embarked from a string to dummy variables\n",`
			`" df = pd.concat([df, pd.get_dummies(df['Embarked'], prefix='Embarked_Val')], axis=1)\n",`
			`" \n",`
			`" # Fill in missing values of Embarked\n",`
			`" # Since the vast majority of passengers embarked in 'S': 3, \n",`
			`" # we assign the missing values in Embarked to 'S':\n",`
			`" if len(df[df['Embarked'].isnull()] > 0):\n",`
			`" df.replace({'Embarked_Val' : \n",`
			`" { embarked_locs_mapping[nan] : embarked_locs_mapping['S'] \n",`
			`" }\n",`
			`" }, \n",`
			`" inplace=True)\n",`
			`" \n",`
			`" # Fill in missing values of Fare with the average Fare\n",`
			`" if len(df[df['Fare'].isnull()] > 0):\n",`
			`" avg_fare = df['Fare'].mean()\n",`
			`" df.replace({ None: avg_fare }, inplace=True)\n",`
			`" \n",`
			`" # To keep Age in tact, make a copy of it called AgeFill \n",`
			`" # that we will use to fill in the missing ages:\n",`
			`" df['AgeFill'] = df['Age']\n",`
			`"\n",`
			`" # Determine the Age typical for each passenger class by Sex_Val. \n",`
			`" # We'll use the median instead of the mean because the Age \n",`
			`" # histogram seems to be right skewed.\n",`
			`" df['AgeFill'] = df['AgeFill'] \\\n",`
			`" .groupby([df['Sex_Val'], df['Pclass']]) \\\n",`
			`" .apply(lambda x: x.fillna(x.median()))\n",`
			`" \n",`
			`" # Define a new feature FamilySize that is the sum of \n",`
			`" # Parch (number of parents or children on board) and \n",`
			`" # SibSp (number of siblings or spouses):\n",`
			`" df['FamilySize'] = df['SibSp'] + df['Parch']\n",`
			`" \n",`
			`" return df\n",`
			`"\n",`
			`"df_train = clean_data(df_train)"`
			`],`
			`"language": "python",`
			`"metadata": {},`
			`"outputs": [],`
			`"prompt_number": 2`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"## Setting Global Parameters"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"collapsed": false,`
			`"input": [`
			`"# Set the global default size of matplotlib figures\n",`
			`"plt.rc('figure', figsize=(10, 5))"`
			`],`
			`"language": "python",`
			`"metadata": {},`
			`"outputs": [],`
			`"prompt_number": 3`
			`}`
			`],`
			`"metadata": {}`
			`}`
			`]`
			`}`