Added snippets to start exploring the Titanic data.

This commit is contained in:
Donne Martin 2015-03-14 19:56:28 -04:00
parent bcfae90101
commit 4ad409aa63

View File

@ -1,7 +1,7 @@
{
"metadata": {
"name": "",
"signature": "sha256:da82018e898cd7c48f4841109f673f2618fe52d5d4553d5353acc59f8cfb0c07"
"signature": "sha256:49e4a7e220fabc95e50ebfaf3337366b788c90a91f0c6cffdbb5588fd456923d"
},
"nbformat": 3,
"nbformat_minor": 0,
@ -151,7 +151,448 @@
],
"language": "python",
"metadata": {},
"outputs": []
"outputs": [],
"prompt_number": 42
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Explore the Data"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.head(3)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PassengerId</th>\n",
" <th>Survived</th>\n",
" <th>Pclass</th>\n",
" <th>Name</th>\n",
" <th>Sex</th>\n",
" <th>Age</th>\n",
" <th>SibSp</th>\n",
" <th>Parch</th>\n",
" <th>Ticket</th>\n",
" <th>Fare</th>\n",
" <th>Cabin</th>\n",
" <th>Embarked</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 3</td>\n",
" <td> Braund, Mr. Owen Harris</td>\n",
" <td> male</td>\n",
" <td> 22</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> A/5 21171</td>\n",
" <td> 7.2500</td>\n",
" <td> NaN</td>\n",
" <td> S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 2</td>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" <td> Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
" <td> female</td>\n",
" <td> 38</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> PC 17599</td>\n",
" <td> 71.2833</td>\n",
" <td> C85</td>\n",
" <td> C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 3</td>\n",
" <td> 1</td>\n",
" <td> 3</td>\n",
" <td> Heikkinen, Miss. Laina</td>\n",
" <td> female</td>\n",
" <td> 26</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> STON/O2. 3101282</td>\n",
" <td> 7.9250</td>\n",
" <td> NaN</td>\n",
" <td> S</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 43,
"text": [
" PassengerId Survived Pclass \\\n",
"0 1 0 3 \n",
"1 2 1 1 \n",
"2 3 1 3 \n",
"\n",
" Name Sex Age SibSp \\\n",
"0 Braund, Mr. Owen Harris male 22 1 \n",
"1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38 1 \n",
"2 Heikkinen, Miss. Laina female 26 0 \n",
"\n",
" Parch Ticket Fare Cabin Embarked \n",
"0 0 A/5 21171 7.2500 NaN S \n",
"1 0 PC 17599 71.2833 C85 C \n",
"2 0 STON/O2. 3101282 7.9250 NaN S "
]
}
],
"prompt_number": 43
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.tail(3)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PassengerId</th>\n",
" <th>Survived</th>\n",
" <th>Pclass</th>\n",
" <th>Name</th>\n",
" <th>Sex</th>\n",
" <th>Age</th>\n",
" <th>SibSp</th>\n",
" <th>Parch</th>\n",
" <th>Ticket</th>\n",
" <th>Fare</th>\n",
" <th>Cabin</th>\n",
" <th>Embarked</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>888</th>\n",
" <td> 889</td>\n",
" <td> 0</td>\n",
" <td> 3</td>\n",
" <td> Johnston, Miss. Catherine Helen \"Carrie\"</td>\n",
" <td> female</td>\n",
" <td>NaN</td>\n",
" <td> 1</td>\n",
" <td> 2</td>\n",
" <td> W./C. 6607</td>\n",
" <td> 23.45</td>\n",
" <td> NaN</td>\n",
" <td> S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>889</th>\n",
" <td> 890</td>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" <td> Behr, Mr. Karl Howell</td>\n",
" <td> male</td>\n",
" <td> 26</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 111369</td>\n",
" <td> 30.00</td>\n",
" <td> C148</td>\n",
" <td> C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>890</th>\n",
" <td> 891</td>\n",
" <td> 0</td>\n",
" <td> 3</td>\n",
" <td> Dooley, Mr. Patrick</td>\n",
" <td> male</td>\n",
" <td> 32</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 370376</td>\n",
" <td> 7.75</td>\n",
" <td> NaN</td>\n",
" <td> Q</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 44,
"text": [
" PassengerId Survived Pclass Name \\\n",
"888 889 0 3 Johnston, Miss. Catherine Helen \"Carrie\" \n",
"889 890 1 1 Behr, Mr. Karl Howell \n",
"890 891 0 3 Dooley, Mr. Patrick \n",
"\n",
" Sex Age SibSp Parch Ticket Fare Cabin Embarked \n",
"888 female NaN 1 2 W./C. 6607 23.45 NaN S \n",
"889 male 26 0 0 111369 30.00 C148 C \n",
"890 male 32 0 0 370376 7.75 NaN Q "
]
}
],
"prompt_number": 44
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"View the data types of each column:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.dtypes"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 45,
"text": [
"PassengerId int64\n",
"Survived int64\n",
"Pclass int64\n",
"Name object\n",
"Sex object\n",
"Age float64\n",
"SibSp int64\n",
"Parch int64\n",
"Ticket object\n",
"Fare float64\n",
"Cabin object\n",
"Embarked object\n",
"dtype: object"
]
}
],
"prompt_number": 45
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Get some basic information on the DataFrame:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.info()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 891 entries, 0 to 890\n",
"Data columns (total 12 columns):\n",
"PassengerId 891 non-null int64\n",
"Survived 891 non-null int64\n",
"Pclass 891 non-null int64\n",
"Name 891 non-null object\n",
"Sex 891 non-null object\n",
"Age 714 non-null float64\n",
"SibSp 891 non-null int64\n",
"Parch 891 non-null int64\n",
"Ticket 891 non-null object\n",
"Fare 891 non-null float64\n",
"Cabin 204 non-null object\n",
"Embarked 889 non-null object\n",
"dtypes: float64(2), int64(5), object(5)"
]
}
],
"prompt_number": 46
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that the Age, Cabin, and Embarked columns contain missing values: only 714, 204, and 889 of the 891 entries, respectively, are non-null."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Generate descriptive statistics (count, mean, standard deviation, min, quartiles, max) for the numeric columns of the DataFrame:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.describe()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PassengerId</th>\n",
" <th>Survived</th>\n",
" <th>Pclass</th>\n",
" <th>Age</th>\n",
" <th>SibSp</th>\n",
" <th>Parch</th>\n",
" <th>Fare</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td> 891.000000</td>\n",
" <td> 891.000000</td>\n",
" <td> 891.000000</td>\n",
" <td> 714.000000</td>\n",
" <td> 891.000000</td>\n",
" <td> 891.000000</td>\n",
" <td> 891.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td> 446.000000</td>\n",
" <td> 0.383838</td>\n",
" <td> 2.308642</td>\n",
" <td> 29.699118</td>\n",
" <td> 0.523008</td>\n",
" <td> 0.381594</td>\n",
" <td> 32.204208</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td> 257.353842</td>\n",
" <td> 0.486592</td>\n",
" <td> 0.836071</td>\n",
" <td> 14.526497</td>\n",
" <td> 1.102743</td>\n",
" <td> 0.806057</td>\n",
" <td> 49.693429</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td> 1.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 1.000000</td>\n",
" <td> 0.420000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td> 223.500000</td>\n",
" <td> 0.000000</td>\n",
" <td> 2.000000</td>\n",
" <td> 20.125000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 7.910400</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td> 446.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 3.000000</td>\n",
" <td> 28.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 14.454200</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td> 668.500000</td>\n",
" <td> 1.000000</td>\n",
" <td> 3.000000</td>\n",
" <td> 38.000000</td>\n",
" <td> 1.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 31.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td> 891.000000</td>\n",
" <td> 1.000000</td>\n",
" <td> 3.000000</td>\n",
" <td> 80.000000</td>\n",
" <td> 8.000000</td>\n",
" <td> 6.000000</td>\n",
" <td> 512.329200</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 47,
"text": [
" PassengerId Survived Pclass Age SibSp \\\n",
"count 891.000000 891.000000 891.000000 714.000000 891.000000 \n",
"mean 446.000000 0.383838 2.308642 29.699118 0.523008 \n",
"std 257.353842 0.486592 0.836071 14.526497 1.102743 \n",
"min 1.000000 0.000000 1.000000 0.420000 0.000000 \n",
"25% 223.500000 0.000000 2.000000 20.125000 0.000000 \n",
"50% 446.000000 0.000000 3.000000 28.000000 0.000000 \n",
"75% 668.500000 1.000000 3.000000 38.000000 1.000000 \n",
"max 891.000000 1.000000 3.000000 80.000000 8.000000 \n",
"\n",
" Parch Fare \n",
"count 891.000000 891.000000 \n",
"mean 0.381594 32.204208 \n",
"std 0.806057 49.693429 \n",
"min 0.000000 0.000000 \n",
"25% 0.000000 7.910400 \n",
"50% 0.000000 14.454200 \n",
"75% 0.000000 31.000000 \n",
"max 6.000000 512.329200 "
]
}
],
"prompt_number": 47
}
],
"metadata": {}