From 21b19dd12f8bfe9014a403ca678887f53e0238a4 Mon Sep 17 00:00:00 2001
From: Donne Martin
Date: Mon, 6 Apr 2015 08:51:28 -0400
Subject: [PATCH] Added matplotlib IPython Notebook. Contains code to clean data, data will be plotted in the notebook and setting of global params.

---
 matplotlib/matplotlib.ipynb | 129 ++++++++++++++++++++++++++++++++++++
 1 file changed, 129 insertions(+)
 create mode 100644 matplotlib/matplotlib.ipynb

diff --git a/matplotlib/matplotlib.ipynb b/matplotlib/matplotlib.ipynb
new file mode 100644
index 0000000..9c0fbf4
--- /dev/null
+++ b/matplotlib/matplotlib.ipynb
@@ -0,0 +1,129 @@
+{
+ "metadata": {
+  "name": "",
+  "signature": "sha256:08003effc72ed7242f0098f883755ff31312100e58bf63705b8a1b8c45afc8a4"
+ },
+ "nbformat": 3,
+ "nbformat_minor": 0,
+ "worksheets": [
+  {
+   "cells": [
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "# matplotlib"
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "* Setting Global Parameters\n",
+      "* Bar Plots, Histograms, subplot2grid\n",
+      "* Normalized Plots\n",
+      "* Scatter Plots, subplots\n",
+      "* Kernel Density Estimation Plots"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "import pandas as pd\n",
+      "import numpy as np\n",
+      "import pylab as plt"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 1
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Prepare data to plot:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "df_train = pd.read_csv('../data/titanic/train.csv')\n",
+      "\n",
+      "def clean_data(df):\n",
+      "    \"\"\"Return the Titanic DataFrame with cleaned and derived columns.\n",
+      "\n",
+      "    Adds Sex_Val, Embarked_Val_* dummy columns, AgeFill and FamilySize,\n",
+      "    and fills missing Fare values with the mean Fare.\n",
+      "    \"\"\"\n",
+      "    # Get the unique values of Sex\n",
+      "    sexes = np.sort(df['Sex'].unique())\n",
+      "\n",
+      "    # Generate a mapping of Sex from a string to a number representation\n",
+      "    genders_mapping = dict(zip(sexes, range(len(sexes))))\n",
+      "\n",
+      "    # Transform Sex from a string to a number representation\n",
+      "    df['Sex_Val'] = df['Sex'].map(genders_mapping).astype(int)\n",
+      "\n",
+      "    # Fill in missing values of Embarked before encoding it.\n",
+      "    # Since the vast majority of passengers embarked in 'S',\n",
+      "    # we assign the missing values in Embarked to 'S':\n",
+      "    embarked = df['Embarked'].fillna('S')\n",
+      "\n",
+      "    # Transform Embarked from a string to dummy variables\n",
+      "    df = pd.concat([df, pd.get_dummies(embarked, prefix='Embarked_Val')], axis=1)\n",
+      "\n",
+      "    # Fill in missing values of Fare (and only Fare) with the average Fare\n",
+      "    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())\n",
+      "\n",
+      "    # To keep Age intact, make a copy of it called AgeFill\n",
+      "    # that we will use to fill in the missing ages:\n",
+      "    df['AgeFill'] = df['Age']\n",
+      "\n",
+      "    # Determine the Age typical for each passenger class by Sex_Val.\n",
+      "    # We'll use the median instead of the mean because the Age\n",
+      "    # histogram seems to be right skewed.\n",
+      "    df['AgeFill'] = df['AgeFill'] \\\n",
+      "                        .groupby([df['Sex_Val'], df['Pclass']]) \\\n",
+      "                        .transform(lambda x: x.fillna(x.median()))\n",
+      "\n",
+      "    # Define a new feature FamilySize that is the sum of\n",
+      "    # Parch (number of parents or children on board) and\n",
+      "    # SibSp (number of siblings or spouses):\n",
+      "    df['FamilySize'] = df['SibSp'] + df['Parch']\n",
+      "\n",
+      "    return df\n",
+      "\n",
+      "df_train = clean_data(df_train)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 2
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "## Setting Global Parameters"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# Set the global default size of matplotlib figures\n",
+      "plt.rc('figure', figsize=(10, 5))"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 3
+    }
+   ],
+   "metadata": {}
+  }
+ ]
+}
\ No newline at end of file