mirror of
https://github.com/donnemartin/data-science-ipython-notebooks.git
synced 2024-03-22 13:30:56 +08:00
Added snippets to start exploring the Titanic data.
This commit is contained in:
parent
bcfae90101
commit
4ad409aa63
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"metadata": {
|
||||
"name": "",
|
||||
"signature": "sha256:da82018e898cd7c48f4841109f673f2618fe52d5d4553d5353acc59f8cfb0c07"
|
||||
"signature": "sha256:49e4a7e220fabc95e50ebfaf3337366b788c90a91f0c6cffdbb5588fd456923d"
|
||||
},
|
||||
"nbformat": 3,
|
||||
"nbformat_minor": 0,
|
||||
|
@ -151,7 +151,448 @@
|
|||
],
|
||||
"language": "python",
|
||||
"metadata": {},
|
||||
"outputs": []
|
||||
"outputs": [],
|
||||
"prompt_number": 42
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Explore the Data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"collapsed": false,
|
||||
"input": [
|
||||
"df.head(3)"
|
||||
],
|
||||
"language": "python",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"html": [
|
||||
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>PassengerId</th>\n",
|
||||
" <th>Survived</th>\n",
|
||||
" <th>Pclass</th>\n",
|
||||
" <th>Name</th>\n",
|
||||
" <th>Sex</th>\n",
|
||||
" <th>Age</th>\n",
|
||||
" <th>SibSp</th>\n",
|
||||
" <th>Parch</th>\n",
|
||||
" <th>Ticket</th>\n",
|
||||
" <th>Fare</th>\n",
|
||||
" <th>Cabin</th>\n",
|
||||
" <th>Embarked</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td> 1</td>\n",
|
||||
" <td> 0</td>\n",
|
||||
" <td> 3</td>\n",
|
||||
" <td> Braund, Mr. Owen Harris</td>\n",
|
||||
" <td> male</td>\n",
|
||||
" <td> 22</td>\n",
|
||||
" <td> 1</td>\n",
|
||||
" <td> 0</td>\n",
|
||||
" <td> A/5 21171</td>\n",
|
||||
" <td> 7.2500</td>\n",
|
||||
" <td> NaN</td>\n",
|
||||
" <td> S</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td> 2</td>\n",
|
||||
" <td> 1</td>\n",
|
||||
" <td> 1</td>\n",
|
||||
" <td> Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
|
||||
" <td> female</td>\n",
|
||||
" <td> 38</td>\n",
|
||||
" <td> 1</td>\n",
|
||||
" <td> 0</td>\n",
|
||||
" <td> PC 17599</td>\n",
|
||||
" <td> 71.2833</td>\n",
|
||||
" <td> C85</td>\n",
|
||||
" <td> C</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td> 3</td>\n",
|
||||
" <td> 1</td>\n",
|
||||
" <td> 3</td>\n",
|
||||
" <td> Heikkinen, Miss. Laina</td>\n",
|
||||
" <td> female</td>\n",
|
||||
" <td> 26</td>\n",
|
||||
" <td> 0</td>\n",
|
||||
" <td> 0</td>\n",
|
||||
" <td> STON/O2. 3101282</td>\n",
|
||||
" <td> 7.9250</td>\n",
|
||||
" <td> NaN</td>\n",
|
||||
" <td> S</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"metadata": {},
|
||||
"output_type": "pyout",
|
||||
"prompt_number": 43,
|
||||
"text": [
|
||||
" PassengerId Survived Pclass \\\n",
|
||||
"0 1 0 3 \n",
|
||||
"1 2 1 1 \n",
|
||||
"2 3 1 3 \n",
|
||||
"\n",
|
||||
" Name Sex Age SibSp \\\n",
|
||||
"0 Braund, Mr. Owen Harris male 22 1 \n",
|
||||
"1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38 1 \n",
|
||||
"2 Heikkinen, Miss. Laina female 26 0 \n",
|
||||
"\n",
|
||||
" Parch Ticket Fare Cabin Embarked \n",
|
||||
"0 0 A/5 21171 7.2500 NaN S \n",
|
||||
"1 0 PC 17599 71.2833 C85 C \n",
|
||||
"2 0 STON/O2. 3101282 7.9250 NaN S "
|
||||
]
|
||||
}
|
||||
],
|
||||
"prompt_number": 43
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"collapsed": false,
|
||||
"input": [
|
||||
"df.tail(3)"
|
||||
],
|
||||
"language": "python",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"html": [
|
||||
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>PassengerId</th>\n",
|
||||
" <th>Survived</th>\n",
|
||||
" <th>Pclass</th>\n",
|
||||
" <th>Name</th>\n",
|
||||
" <th>Sex</th>\n",
|
||||
" <th>Age</th>\n",
|
||||
" <th>SibSp</th>\n",
|
||||
" <th>Parch</th>\n",
|
||||
" <th>Ticket</th>\n",
|
||||
" <th>Fare</th>\n",
|
||||
" <th>Cabin</th>\n",
|
||||
" <th>Embarked</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>888</th>\n",
|
||||
" <td> 889</td>\n",
|
||||
" <td> 0</td>\n",
|
||||
" <td> 3</td>\n",
|
||||
" <td> Johnston, Miss. Catherine Helen \"Carrie\"</td>\n",
|
||||
" <td> female</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td> 1</td>\n",
|
||||
" <td> 2</td>\n",
|
||||
" <td> W./C. 6607</td>\n",
|
||||
" <td> 23.45</td>\n",
|
||||
" <td> NaN</td>\n",
|
||||
" <td> S</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>889</th>\n",
|
||||
" <td> 890</td>\n",
|
||||
" <td> 1</td>\n",
|
||||
" <td> 1</td>\n",
|
||||
" <td> Behr, Mr. Karl Howell</td>\n",
|
||||
" <td> male</td>\n",
|
||||
" <td> 26</td>\n",
|
||||
" <td> 0</td>\n",
|
||||
" <td> 0</td>\n",
|
||||
" <td> 111369</td>\n",
|
||||
" <td> 30.00</td>\n",
|
||||
" <td> C148</td>\n",
|
||||
" <td> C</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>890</th>\n",
|
||||
" <td> 891</td>\n",
|
||||
" <td> 0</td>\n",
|
||||
" <td> 3</td>\n",
|
||||
" <td> Dooley, Mr. Patrick</td>\n",
|
||||
" <td> male</td>\n",
|
||||
" <td> 32</td>\n",
|
||||
" <td> 0</td>\n",
|
||||
" <td> 0</td>\n",
|
||||
" <td> 370376</td>\n",
|
||||
" <td> 7.75</td>\n",
|
||||
" <td> NaN</td>\n",
|
||||
" <td> Q</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"metadata": {},
|
||||
"output_type": "pyout",
|
||||
"prompt_number": 44,
|
||||
"text": [
|
||||
" PassengerId Survived Pclass Name \\\n",
|
||||
"888 889 0 3 Johnston, Miss. Catherine Helen \"Carrie\" \n",
|
||||
"889 890 1 1 Behr, Mr. Karl Howell \n",
|
||||
"890 891 0 3 Dooley, Mr. Patrick \n",
|
||||
"\n",
|
||||
" Sex Age SibSp Parch Ticket Fare Cabin Embarked \n",
|
||||
"888 female NaN 1 2 W./C. 6607 23.45 NaN S \n",
|
||||
"889 male 26 0 0 111369 30.00 C148 C \n",
|
||||
"890 male 32 0 0 370376 7.75 NaN Q "
|
||||
]
|
||||
}
|
||||
],
|
||||
"prompt_number": 44
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"View the data type of each column:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"collapsed": false,
|
||||
"input": [
|
||||
"df.dtypes"
|
||||
],
|
||||
"language": "python",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"metadata": {},
|
||||
"output_type": "pyout",
|
||||
"prompt_number": 45,
|
||||
"text": [
|
||||
"PassengerId int64\n",
|
||||
"Survived int64\n",
|
||||
"Pclass int64\n",
|
||||
"Name object\n",
|
||||
"Sex object\n",
|
||||
"Age float64\n",
|
||||
"SibSp int64\n",
|
||||
"Parch int64\n",
|
||||
"Ticket object\n",
|
||||
"Fare float64\n",
|
||||
"Cabin object\n",
|
||||
"Embarked object\n",
|
||||
"dtype: object"
|
||||
]
|
||||
}
|
||||
],
|
||||
"prompt_number": 45
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Get some basic information on the DataFrame:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"collapsed": false,
|
||||
"input": [
|
||||
"df.info()"
|
||||
],
|
||||
"language": "python",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"stream": "stdout",
|
||||
"text": [
|
||||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||||
"Int64Index: 891 entries, 0 to 890\n",
|
||||
"Data columns (total 12 columns):\n",
|
||||
"PassengerId 891 non-null int64\n",
|
||||
"Survived 891 non-null int64\n",
|
||||
"Pclass 891 non-null int64\n",
|
||||
"Name 891 non-null object\n",
|
||||
"Sex 891 non-null object\n",
|
||||
"Age 714 non-null float64\n",
|
||||
"SibSp 891 non-null int64\n",
|
||||
"Parch 891 non-null int64\n",
|
||||
"Ticket 891 non-null object\n",
|
||||
"Fare 891 non-null float64\n",
|
||||
"Cabin 204 non-null object\n",
|
||||
"Embarked 889 non-null object\n",
|
||||
"dtypes: float64(2), int64(5), object(5)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"prompt_number": 46
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Note that the Age, Cabin, and Embarked columns contain missing values."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Generate various descriptive statistics on the DataFrame:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"collapsed": false,
|
||||
"input": [
|
||||
"df.describe()"
|
||||
],
|
||||
"language": "python",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"html": [
|
||||
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>PassengerId</th>\n",
|
||||
" <th>Survived</th>\n",
|
||||
" <th>Pclass</th>\n",
|
||||
" <th>Age</th>\n",
|
||||
" <th>SibSp</th>\n",
|
||||
" <th>Parch</th>\n",
|
||||
" <th>Fare</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>count</th>\n",
|
||||
" <td> 891.000000</td>\n",
|
||||
" <td> 891.000000</td>\n",
|
||||
" <td> 891.000000</td>\n",
|
||||
" <td> 714.000000</td>\n",
|
||||
" <td> 891.000000</td>\n",
|
||||
" <td> 891.000000</td>\n",
|
||||
" <td> 891.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>mean</th>\n",
|
||||
" <td> 446.000000</td>\n",
|
||||
" <td> 0.383838</td>\n",
|
||||
" <td> 2.308642</td>\n",
|
||||
" <td> 29.699118</td>\n",
|
||||
" <td> 0.523008</td>\n",
|
||||
" <td> 0.381594</td>\n",
|
||||
" <td> 32.204208</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>std</th>\n",
|
||||
" <td> 257.353842</td>\n",
|
||||
" <td> 0.486592</td>\n",
|
||||
" <td> 0.836071</td>\n",
|
||||
" <td> 14.526497</td>\n",
|
||||
" <td> 1.102743</td>\n",
|
||||
" <td> 0.806057</td>\n",
|
||||
" <td> 49.693429</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>min</th>\n",
|
||||
" <td> 1.000000</td>\n",
|
||||
" <td> 0.000000</td>\n",
|
||||
" <td> 1.000000</td>\n",
|
||||
" <td> 0.420000</td>\n",
|
||||
" <td> 0.000000</td>\n",
|
||||
" <td> 0.000000</td>\n",
|
||||
" <td> 0.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>25%</th>\n",
|
||||
" <td> 223.500000</td>\n",
|
||||
" <td> 0.000000</td>\n",
|
||||
" <td> 2.000000</td>\n",
|
||||
" <td> 20.125000</td>\n",
|
||||
" <td> 0.000000</td>\n",
|
||||
" <td> 0.000000</td>\n",
|
||||
" <td> 7.910400</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>50%</th>\n",
|
||||
" <td> 446.000000</td>\n",
|
||||
" <td> 0.000000</td>\n",
|
||||
" <td> 3.000000</td>\n",
|
||||
" <td> 28.000000</td>\n",
|
||||
" <td> 0.000000</td>\n",
|
||||
" <td> 0.000000</td>\n",
|
||||
" <td> 14.454200</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>75%</th>\n",
|
||||
" <td> 668.500000</td>\n",
|
||||
" <td> 1.000000</td>\n",
|
||||
" <td> 3.000000</td>\n",
|
||||
" <td> 38.000000</td>\n",
|
||||
" <td> 1.000000</td>\n",
|
||||
" <td> 0.000000</td>\n",
|
||||
" <td> 31.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>max</th>\n",
|
||||
" <td> 891.000000</td>\n",
|
||||
" <td> 1.000000</td>\n",
|
||||
" <td> 3.000000</td>\n",
|
||||
" <td> 80.000000</td>\n",
|
||||
" <td> 8.000000</td>\n",
|
||||
" <td> 6.000000</td>\n",
|
||||
" <td> 512.329200</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"metadata": {},
|
||||
"output_type": "pyout",
|
||||
"prompt_number": 47,
|
||||
"text": [
|
||||
" PassengerId Survived Pclass Age SibSp \\\n",
|
||||
"count 891.000000 891.000000 891.000000 714.000000 891.000000 \n",
|
||||
"mean 446.000000 0.383838 2.308642 29.699118 0.523008 \n",
|
||||
"std 257.353842 0.486592 0.836071 14.526497 1.102743 \n",
|
||||
"min 1.000000 0.000000 1.000000 0.420000 0.000000 \n",
|
||||
"25% 223.500000 0.000000 2.000000 20.125000 0.000000 \n",
|
||||
"50% 446.000000 0.000000 3.000000 28.000000 0.000000 \n",
|
||||
"75% 668.500000 1.000000 3.000000 38.000000 1.000000 \n",
|
||||
"max 891.000000 1.000000 3.000000 80.000000 8.000000 \n",
|
||||
"\n",
|
||||
" Parch Fare \n",
|
||||
"count 891.000000 891.000000 \n",
|
||||
"mean 0.381594 32.204208 \n",
|
||||
"std 0.806057 49.693429 \n",
|
||||
"min 0.000000 0.000000 \n",
|
||||
"25% 0.000000 7.910400 \n",
|
||||
"50% 0.000000 14.454200 \n",
|
||||
"75% 0.000000 31.000000 \n",
|
||||
"max 6.000000 512.329200 "
|
||||
]
|
||||
}
|
||||
],
|
||||
"prompt_number": 47
|
||||
}
|
||||
],
|
||||
"metadata": {}
|
||||
|
|
Loading…
Reference in New Issue
Block a user