From d65c3bd733a40b3dccc6786102451b0a077cc21b Mon Sep 17 00:00:00 2001 From: Shivendra k jha <47923680+skjha1@users.noreply.github.com> Date: Wed, 27 Jan 2021 23:44:49 +0530 Subject: [PATCH] Add files via upload --- pandas/03.14 Pandas For Data Analysis.ipynb | 6589 +++++++++++++++++++ 1 file changed, 6589 insertions(+) create mode 100644 pandas/03.14 Pandas For Data Analysis.ipynb diff --git a/pandas/03.14 Pandas For Data Analysis.ipynb b/pandas/03.14 Pandas For Data Analysis.ipynb new file mode 100644 index 0000000..ca8dca1 --- /dev/null +++ b/pandas/03.14 Pandas For Data Analysis.ipynb @@ -0,0 +1,6589 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Pandas Introduction \n", + "\n", + "Pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool, built on top of the Python programming language.\n", + "\n", + "You can think of pandas as an extremely powerful version of Excel.\n", + "\n", + "In this section we are going to talk about \n", + "\n", + "* Introduction To pandas \n", + "* Series \n", + "* DataFrames \n", + "* Missing Data\n", + "* Merging, Joining, and Concatenating \n", + "* Operations \n", + "* Data Input and Output " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Series " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Firstly, we are going to talk about the Series data type.\n", + "\n", + "A Series is very similar to a NumPy array; it is built on top of the NumPy array.\n", + "But a Series can have axis labels, meaning it can be indexed by a label instead of just a numeric position.\n", + "\n", + "A Series is a one-dimensional labeled array capable of holding data of any type (integer, string, float, Python objects, etc.). The axis labels are collectively called the index."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets import numpy and pandas \n", + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# We can convert a list , numpy array , or dict to Series\n", + "\n", + "labels = ['Shivendra','Ragavendra','Narendra']\n", + "my_list= [21,25,30]\n", + "arr=np.array([10,20,30])\n", + "d={'Shivendra':21,'Raghavendra':25,'Narendra':30}" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 21\n", + "1 25\n", + "2 30\n", + "dtype: int64" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Using List \n", + "pd.Series(data=my_list)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Shivendra 21\n", + "Ragavendra 25\n", + "Narendra 30\n", + "dtype: int64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.Series (data=my_list,index=labels )" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Shivendra 21\n", + "Ragavendra 25\n", + "Narendra 30\n", + "dtype: int64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.Series(my_list,labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 10\n", + "1 20\n", + "2 30\n", + "dtype: int32" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# NumPy Array\n", + "pd.Series(arr)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": 
[ + { + "data": { + "text/plain": [ + "Shivendra 10\n", + "Ragavendra 20\n", + "Narendra 30\n", + "dtype: int32" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.Series (data=arr,index=labels )" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Shivendra 21\n", + "Raghavendra 25\n", + "Narendra 30\n", + "dtype: int64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Dictonary\n", + "pd.Series (d)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data In A Series \n", + "\n", + "A Pandas Series can hold a variety of Objects " + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 Shivendra\n", + "1 Ragavendra\n", + "2 Narendra\n", + "dtype: object" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.Series (data=labels )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using an Index\n", + "\n", + "The key to using a Series is understanding its index. Pandas makes use of these index names or numbers by allowing for fast look ups of information (works like a hash table or dictionary).\n", + "\n", + "Let's see some examples of how to grab information from a Series. 
Let us create two Series, ser1 and ser2:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "ser1= pd.Series ([1,2,3,4], index =['Chennai','Bihar','West Bengal','Rajasthan'])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Chennai 1\n", + "Bihar 2\n", + "West Bengal 3\n", + "Rajasthan 4\n", + "dtype: int64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ser1" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "ser2=pd.Series ([1,2,5,4],index=['Chennai','Bihar','Assam','Rajasthan'])" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Chennai 1\n", + "Bihar 2\n", + "Assam 5\n", + "Rajasthan 4\n", + "dtype: int64" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ser2" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ser1['Chennai']" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Assam NaN\n", + "Bihar 4.0\n", + "Chennai 2.0\n", + "Rajasthan 8.0\n", + "West Bengal NaN\n", + "dtype: float64" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Operations are aligned on the index; a label present in only one Series gives NaN:\n", + "ser1+ser2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DataFrames\n", + "A pandas DataFrame is a two-dimensional, size-mutable, potentially heterogeneous tabular data structure with labeled axes (rows
and columns). A Data frame is a two-dimensional data structure, i.e., data is aligned in a tabular fashion in rows and columns. Pandas DataFrame consists of three principal components, the data, rows, and columns.\n", + "\n", + "DataFrames are the workhorse of pandas and are directly inspired by the R programming language. We can think of a DataFrame as a bunch of Series objects put together to share the same index. Let's use pandas to explore this topic!" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from numpy.random import randn\n", + "np.random.seed(101)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "df=pd.DataFrame (randn(5,5),index='Chennai Bihar UtterPredesh Delhi Mumbai'.split(),columns ='SRM NIT_PATNA BHU IIT_DELHI IIT_Bombay'.split())" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SRMNIT_PATNABHUIIT_DELHIIIT_Bombay
Chennai2.7068500.6281330.9079690.5038260.651118
Bihar-0.319318-0.8480770.605965-2.0181680.740122
UtterPredesh0.528813-0.5890010.188695-0.758872-0.933237
Delhi0.9550570.1907941.9787572.6059670.683509
Mumbai0.3026651.693723-1.706086-1.159119-0.134841
\n", + "
" + ], + "text/plain": [ + " SRM NIT_PATNA BHU IIT_DELHI IIT_Bombay\n", + "Chennai 2.706850 0.628133 0.907969 0.503826 0.651118\n", + "Bihar -0.319318 -0.848077 0.605965 -2.018168 0.740122\n", + "UtterPredesh 0.528813 -0.589001 0.188695 -0.758872 -0.933237\n", + "Delhi 0.955057 0.190794 1.978757 2.605967 0.683509\n", + "Mumbai 0.302665 1.693723 -1.706086 -1.159119 -0.134841" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Selection and Indexing\n", + "\n", + "Let's learn the various methods to grab data from a DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Chennai 2.706850\n", + "Bihar -0.319318\n", + "UtterPredesh 0.528813\n", + "Delhi 0.955057\n", + "Mumbai 0.302665\n", + "Name: SRM, dtype: float64" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['SRM']" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SRMBHU
Chennai2.7068500.907969
Bihar-0.3193180.605965
UtterPredesh0.5288130.188695
Delhi0.9550571.978757
Mumbai0.302665-1.706086
\n", + "
" + ], + "text/plain": [ + " SRM BHU\n", + "Chennai 2.706850 0.907969\n", + "Bihar -0.319318 0.605965\n", + "UtterPredesh 0.528813 0.188695\n", + "Delhi 0.955057 1.978757\n", + "Mumbai 0.302665 -1.706086" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# We can pass a list of column names \n", + "df[['SRM' , 'BHU']]" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Chennai 2.706850\n", + "Bihar -0.319318\n", + "UtterPredesh 0.528813\n", + "Delhi 0.955057\n", + "Mumbai 0.302665\n", + "Name: SRM, dtype: float64" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.SRM # Attribute-style (dot) access, equivalent to df['SRM']" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "# DataFrame columns are just Series" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pandas.core.series.Series" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(df['SRM'])" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating a new column \n", + "df['UPES']=df['SRM']" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "df['Harshita']=df['SRM'] + df['BHU']" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SRMNIT_PATNABHUIIT_DELHIIIT_BombayUPESHarshita
Chennai2.7068500.6281330.9079690.5038260.6511182.7068503.614819
Bihar-0.319318-0.8480770.605965-2.0181680.740122-0.3193180.286647
UtterPredesh0.528813-0.5890010.188695-0.758872-0.9332370.5288130.717509
Delhi0.9550570.1907941.9787572.6059670.6835090.9550572.933814
Mumbai0.3026651.693723-1.706086-1.159119-0.1348410.302665-1.403420
\n", + "
" + ], + "text/plain": [ + " SRM NIT_PATNA BHU IIT_DELHI IIT_Bombay UPES \\\n", + "Chennai 2.706850 0.628133 0.907969 0.503826 0.651118 2.706850 \n", + "Bihar -0.319318 -0.848077 0.605965 -2.018168 0.740122 -0.319318 \n", + "UtterPredesh 0.528813 -0.589001 0.188695 -0.758872 -0.933237 0.528813 \n", + "Delhi 0.955057 0.190794 1.978757 2.605967 0.683509 0.955057 \n", + "Mumbai 0.302665 1.693723 -1.706086 -1.159119 -0.134841 0.302665 \n", + "\n", + " Harshita \n", + "Chennai 3.614819 \n", + "Bihar 0.286647 \n", + "UtterPredesh 0.717509 \n", + "Delhi 2.933814 \n", + "Mumbai -1.403420 " + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SRMNIT_PATNABHUIIT_DELHIIIT_Bombay
Chennai2.7068500.6281330.9079690.5038260.651118
Bihar-0.319318-0.8480770.605965-2.0181680.740122
UtterPredesh0.528813-0.5890010.188695-0.758872-0.933237
Delhi0.9550570.1907941.9787572.6059670.683509
Mumbai0.3026651.693723-1.706086-1.159119-0.134841
\n", + "
" + ], + "text/plain": [ + " SRM NIT_PATNA BHU IIT_DELHI IIT_Bombay\n", + "Chennai 2.706850 0.628133 0.907969 0.503826 0.651118\n", + "Bihar -0.319318 -0.848077 0.605965 -2.018168 0.740122\n", + "UtterPredesh 0.528813 -0.589001 0.188695 -0.758872 -0.933237\n", + "Delhi 0.955057 0.190794 1.978757 2.605967 0.683509\n", + "Mumbai 0.302665 1.693723 -1.706086 -1.159119 -0.134841" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.drop('UPES',axis=1) # Axis = 1 for column" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SRMNIT_PATNABHUIIT_DELHIIIT_BombayUPES
Chennai2.7068500.6281330.9079690.5038260.6511182.706850
Bihar-0.319318-0.8480770.605965-2.0181680.740122-0.319318
UtterPredesh0.528813-0.5890010.188695-0.758872-0.9332370.528813
Delhi0.9550570.1907941.9787572.6059670.6835090.955057
Mumbai0.3026651.693723-1.706086-1.159119-0.1348410.302665
\n", + "
" + ], + "text/plain": [ + " SRM NIT_PATNA BHU IIT_DELHI IIT_Bombay UPES\n", + "Chennai 2.706850 0.628133 0.907969 0.503826 0.651118 2.706850\n", + "Bihar -0.319318 -0.848077 0.605965 -2.018168 0.740122 -0.319318\n", + "UtterPredesh 0.528813 -0.589001 0.188695 -0.758872 -0.933237 0.528813\n", + "Delhi 0.955057 0.190794 1.978757 2.605967 0.683509 0.955057\n", + "Mumbai 0.302665 1.693723 -1.706086 -1.159119 -0.134841 0.302665" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df # 'UPES' is still present: drop() returned a new DataFrame, so use inplace=True (or reassign the result) to remove the column permanently" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "df.drop('UPES',axis=1,inplace =True )" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "df.drop('Harshita',axis=1,inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SRMNIT_PATNABHUIIT_DELHIIIT_Bombay
Chennai2.7068500.6281330.9079690.5038260.651118
Bihar-0.319318-0.8480770.605965-2.0181680.740122
UtterPredesh0.528813-0.5890010.188695-0.758872-0.933237
Delhi0.9550570.1907941.9787572.6059670.683509
Mumbai0.3026651.693723-1.706086-1.159119-0.134841
\n", + "
" + ], + "text/plain": [ + " SRM NIT_PATNA BHU IIT_DELHI IIT_Bombay\n", + "Chennai 2.706850 0.628133 0.907969 0.503826 0.651118\n", + "Bihar -0.319318 -0.848077 0.605965 -2.018168 0.740122\n", + "UtterPredesh 0.528813 -0.589001 0.188695 -0.758872 -0.933237\n", + "Delhi 0.955057 0.190794 1.978757 2.605967 0.683509\n", + "Mumbai 0.302665 1.693723 -1.706086 -1.159119 -0.134841" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SRMNIT_PATNABHUIIT_DELHIIIT_Bombay
Chennai2.7068500.6281330.9079690.5038260.651118
Bihar-0.319318-0.8480770.605965-2.0181680.740122
UtterPredesh0.528813-0.5890010.188695-0.758872-0.933237
Mumbai0.3026651.693723-1.706086-1.159119-0.134841
\n", + "
" + ], + "text/plain": [ + " SRM NIT_PATNA BHU IIT_DELHI IIT_Bombay\n", + "Chennai 2.706850 0.628133 0.907969 0.503826 0.651118\n", + "Bihar -0.319318 -0.848077 0.605965 -2.018168 0.740122\n", + "UtterPredesh 0.528813 -0.589001 0.188695 -0.758872 -0.933237\n", + "Mumbai 0.302665 1.693723 -1.706086 -1.159119 -0.134841" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.drop('Delhi',axis=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "SRM 2.706850\n", + "NIT_PATNA 0.628133\n", + "BHU 0.907969\n", + "IIT_DELHI 0.503826\n", + "IIT_Bombay 0.651118\n", + "Name: Chennai, dtype: float64" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.loc['Chennai']" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "SRM -0.319318\n", + "NIT_PATNA -0.848077\n", + "BHU 0.605965\n", + "IIT_DELHI -2.018168\n", + "IIT_Bombay 0.740122\n", + "Name: Bihar, dtype: float64" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# We can select based on indexing \n", + "df.iloc[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "-0.8480769834036315" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.loc['Bihar','NIT_PATNA']" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NIT_PATNAIIT_DELHI
Bihar-0.848077-2.018168
Mumbai1.693723-1.159119
\n", + "
" + ], + "text/plain": [ + " NIT_PATNA IIT_DELHI\n", + "Bihar -0.848077 -2.018168\n", + "Mumbai 1.693723 -1.159119" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.loc[['Bihar','Mumbai'],['NIT_PATNA','IIT_DELHI']]" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SRMNIT_PATNABHUIIT_DELHIIIT_Bombay
Chennai2.7068500.6281330.9079690.5038260.651118
Bihar-0.319318-0.8480770.605965-2.0181680.740122
UtterPredesh0.528813-0.5890010.188695-0.758872-0.933237
Delhi0.9550570.1907941.9787572.6059670.683509
Mumbai0.3026651.693723-1.706086-1.159119-0.134841
\n", + "
" + ], + "text/plain": [ + " SRM NIT_PATNA BHU IIT_DELHI IIT_Bombay\n", + "Chennai 2.706850 0.628133 0.907969 0.503826 0.651118\n", + "Bihar -0.319318 -0.848077 0.605965 -2.018168 0.740122\n", + "UtterPredesh 0.528813 -0.589001 0.188695 -0.758872 -0.933237\n", + "Delhi 0.955057 0.190794 1.978757 2.605967 0.683509\n", + "Mumbai 0.302665 1.693723 -1.706086 -1.159119 -0.134841" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SRMNIT_PATNABHUIIT_DELHIIIT_Bombay
ChennaiTrueTrueTrueTrueTrue
BiharFalseFalseTrueFalseTrue
UtterPredeshTrueFalseTrueFalseFalse
DelhiTrueTrueTrueTrueTrue
MumbaiTrueTrueFalseFalseFalse
\n", + "
" + ], + "text/plain": [ + " SRM NIT_PATNA BHU IIT_DELHI IIT_Bombay\n", + "Chennai True True True True True\n", + "Bihar False False True False True\n", + "UtterPredesh True False True False False\n", + "Delhi True True True True True\n", + "Mumbai True True False False False" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df>0" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SRMNIT_PATNABHUIIT_DELHIIIT_Bombay
Chennai2.7068500.6281330.9079690.5038260.651118
BiharNaNNaN0.605965NaN0.740122
UtterPredesh0.528813NaN0.188695NaNNaN
Delhi0.9550570.1907941.9787572.6059670.683509
Mumbai0.3026651.693723NaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " SRM NIT_PATNA BHU IIT_DELHI IIT_Bombay\n", + "Chennai 2.706850 0.628133 0.907969 0.503826 0.651118\n", + "Bihar NaN NaN 0.605965 NaN 0.740122\n", + "UtterPredesh 0.528813 NaN 0.188695 NaN NaN\n", + "Delhi 0.955057 0.190794 1.978757 2.605967 0.683509\n", + "Mumbai 0.302665 1.693723 NaN NaN NaN" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df>0]" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SRMNIT_PATNABHUIIT_DELHIIIT_Bombay
Chennai2.7068500.6281330.9079690.5038260.651118
UtterPredesh0.528813-0.5890010.188695-0.758872-0.933237
Delhi0.9550570.1907941.9787572.6059670.683509
Mumbai0.3026651.693723-1.706086-1.159119-0.134841
\n", + "
" + ], + "text/plain": [ + " SRM NIT_PATNA BHU IIT_DELHI IIT_Bombay\n", + "Chennai 2.706850 0.628133 0.907969 0.503826 0.651118\n", + "UtterPredesh 0.528813 -0.589001 0.188695 -0.758872 -0.933237\n", + "Delhi 0.955057 0.190794 1.978757 2.605967 0.683509\n", + "Mumbai 0.302665 1.693723 -1.706086 -1.159119 -0.134841" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df [df['SRM']>0] # Bihar is excluded because its SRM value is negative" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Chennai 0.907969\n", + "UtterPredesh 0.188695\n", + "Delhi 1.978757\n", + "Mumbai -1.706086\n", + "Name: BHU, dtype: float64" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['SRM']>0]['BHU'] # Bihar is excluded here too, because its SRM value is negative" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Chennai 2.706850\n", + "Bihar -0.319318\n", + "UtterPredesh 0.528813\n", + "Delhi 0.955057\n", + "Name: SRM, dtype: float64" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['BHU']>0]['SRM'] # Mumbai is excluded because its BHU value is negative" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BHUIIT_DELHI
Chennai0.9079690.503826
UtterPredesh0.188695-0.758872
Delhi1.9787572.605967
Mumbai-1.706086-1.159119
\n", + "
" + ], + "text/plain": [ + " BHU IIT_DELHI\n", + "Chennai 0.907969 0.503826\n", + "UtterPredesh 0.188695 -0.758872\n", + "Delhi 1.978757 2.605967\n", + "Mumbai -1.706086 -1.159119" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['SRM']>0][['BHU','IIT_DELHI']]" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SRMNIT_PATNABHUIIT_DELHIIIT_Bombay
Chennai2.7068500.6281330.9079690.5038260.651118
Delhi0.9550570.1907941.9787572.6059670.683509
\n", + "
" + ], + "text/plain": [ + " SRM NIT_PATNA BHU IIT_DELHI IIT_Bombay\n", + "Chennai 2.706850 0.628133 0.907969 0.503826 0.651118\n", + "Delhi 0.955057 0.190794 1.978757 2.605967 0.683509" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[(df['SRM']>0.955) & (df['BHU']>0)] # Parenthesise each comparison: & binds tighter than >, so an unparenthesised df['BHU']>0 is not evaluated as a comparison" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## More Index Details\n", + "\n", + "Let's discuss some more features of indexing, including resetting the index or setting it something else. We'll also talk about index hierarchy!" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SRMNIT_PATNABHUIIT_DELHIIIT_Bombay
Chennai2.7068500.6281330.9079690.5038260.651118
Bihar-0.319318-0.8480770.605965-2.0181680.740122
UtterPredesh0.528813-0.5890010.188695-0.758872-0.933237
Delhi0.9550570.1907941.9787572.6059670.683509
Mumbai0.3026651.693723-1.706086-1.159119-0.134841
\n", + "
" + ], + "text/plain": [ + " SRM NIT_PATNA BHU IIT_DELHI IIT_Bombay\n", + "Chennai 2.706850 0.628133 0.907969 0.503826 0.651118\n", + "Bihar -0.319318 -0.848077 0.605965 -2.018168 0.740122\n", + "UtterPredesh 0.528813 -0.589001 0.188695 -0.758872 -0.933237\n", + "Delhi 0.955057 0.190794 1.978757 2.605967 0.683509\n", + "Mumbai 0.302665 1.693723 -1.706086 -1.159119 -0.134841" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexSRMNIT_PATNABHUIIT_DELHIIIT_Bombay
0Chennai2.7068500.6281330.9079690.5038260.651118
1Bihar-0.319318-0.8480770.605965-2.0181680.740122
2UtterPredesh0.528813-0.5890010.188695-0.758872-0.933237
3Delhi0.9550570.1907941.9787572.6059670.683509
4Mumbai0.3026651.693723-1.706086-1.159119-0.134841
\n", + "
" + ], + "text/plain": [ + " index SRM NIT_PATNA BHU IIT_DELHI IIT_Bombay\n", + "0 Chennai 2.706850 0.628133 0.907969 0.503826 0.651118\n", + "1 Bihar -0.319318 -0.848077 0.605965 -2.018168 0.740122\n", + "2 UtterPredesh 0.528813 -0.589001 0.188695 -0.758872 -0.933237\n", + "3 Delhi 0.955057 0.190794 1.978757 2.605967 0.683509\n", + "4 Mumbai 0.302665 1.693723 -1.706086 -1.159119 -0.134841" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [], + "source": [ + "newind ='Tamil_Nadu BIHAR UP Delhi Maharastra'.split()" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [], + "source": [ + "df['States']=newind" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SRMNIT_PATNABHUIIT_DELHIIIT_BombayStates
Chennai2.7068500.6281330.9079690.5038260.651118Tamil_Nadu
Bihar-0.319318-0.8480770.605965-2.0181680.740122BIHAR
UtterPredesh0.528813-0.5890010.188695-0.758872-0.933237UP
Delhi0.9550570.1907941.9787572.6059670.683509Delhi
Mumbai0.3026651.693723-1.706086-1.159119-0.134841Maharastra
\n", + "
" + ], + "text/plain": [ + " SRM NIT_PATNA BHU IIT_DELHI IIT_Bombay States\n", + "Chennai 2.706850 0.628133 0.907969 0.503826 0.651118 Tamil_Nadu\n", + "Bihar -0.319318 -0.848077 0.605965 -2.018168 0.740122 BIHAR\n", + "UtterPredesh 0.528813 -0.589001 0.188695 -0.758872 -0.933237 UP\n", + "Delhi 0.955057 0.190794 1.978757 2.605967 0.683509 Delhi\n", + "Mumbai 0.302665 1.693723 -1.706086 -1.159119 -0.134841 Maharastra" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SRMNIT_PATNABHUIIT_DELHIIIT_Bombay
States
Tamil_Nadu2.7068500.6281330.9079690.5038260.651118
BIHAR-0.319318-0.8480770.605965-2.0181680.740122
UP0.528813-0.5890010.188695-0.758872-0.933237
Delhi0.9550570.1907941.9787572.6059670.683509
Maharastra0.3026651.693723-1.706086-1.159119-0.134841
\n", + "
" + ], + "text/plain": [ + " SRM NIT_PATNA BHU IIT_DELHI IIT_Bombay\n", + "States \n", + "Tamil_Nadu 2.706850 0.628133 0.907969 0.503826 0.651118\n", + "BIHAR -0.319318 -0.848077 0.605965 -2.018168 0.740122\n", + "UP 0.528813 -0.589001 0.188695 -0.758872 -0.933237\n", + "Delhi 0.955057 0.190794 1.978757 2.605967 0.683509\n", + "Maharastra 0.302665 1.693723 -1.706086 -1.159119 -0.134841" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.set_index('States')" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SRMNIT_PATNABHUIIT_DELHIIIT_BombayStates
Chennai2.7068500.6281330.9079690.5038260.651118Tamil_Nadu
Bihar-0.319318-0.8480770.605965-2.0181680.740122BIHAR
UtterPredesh0.528813-0.5890010.188695-0.758872-0.933237UP
Delhi0.9550570.1907941.9787572.6059670.683509Delhi
Mumbai0.3026651.693723-1.706086-1.159119-0.134841Maharastra
\n", + "
" + ], + "text/plain": [ + " SRM NIT_PATNA BHU IIT_DELHI IIT_Bombay States\n", + "Chennai 2.706850 0.628133 0.907969 0.503826 0.651118 Tamil_Nadu\n", + "Bihar -0.319318 -0.848077 0.605965 -2.018168 0.740122 BIHAR\n", + "UtterPredesh 0.528813 -0.589001 0.188695 -0.758872 -0.933237 UP\n", + "Delhi 0.955057 0.190794 1.978757 2.605967 0.683509 Delhi\n", + "Mumbai 0.302665 1.693723 -1.706086 -1.159119 -0.134841 Maharastra" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [], + "source": [ + "df.set_index('States',inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SRMNIT_PATNABHUIIT_DELHIIIT_Bombay
States
Tamil_Nadu2.7068500.6281330.9079690.5038260.651118
BIHAR-0.319318-0.8480770.605965-2.0181680.740122
UP0.528813-0.5890010.188695-0.758872-0.933237
Delhi0.9550570.1907941.9787572.6059670.683509
Maharastra0.3026651.693723-1.706086-1.159119-0.134841
\n", + "
" + ], + "text/plain": [ + " SRM NIT_PATNA BHU IIT_DELHI IIT_Bombay\n", + "States \n", + "Tamil_Nadu 2.706850 0.628133 0.907969 0.503826 0.651118\n", + "BIHAR -0.319318 -0.848077 0.605965 -2.018168 0.740122\n", + "UP 0.528813 -0.589001 0.188695 -0.758872 -0.933237\n", + "Delhi 0.955057 0.190794 1.978757 2.605967 0.683509\n", + "Maharastra 0.302665 1.693723 -1.706086 -1.159119 -0.134841" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Multi-Index and Index Hierarchy\n", + "\n", + "Let us go over how to work with Multi-Index, first we'll create a quick example of what a Multi-Indexed DataFrame would look like:" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [], + "source": [ + "# index Levels \n", + "outside =['Big_data','Big_data','Big_data','AI','AI','AI']\n", + "inside =[1,2,3,1,2,3]\n", + "hier_index=list(zip(outside,inside))\n", + "hier_index=pd.MultiIndex.from_tuples(hier_index)" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "MultiIndex([('Big_data', 1),\n", + " ('Big_data', 2),\n", + " ('Big_data', 3),\n", + " ( 'AI', 1),\n", + " ( 'AI', 2),\n", + " ( 'AI', 3)],\n", + " )" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hier_index" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [], + "source": [ + "df=pd.DataFrame(np.random.rand (6,2),index=hier_index,columns=['Core','volunteers'])" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Corevolunteers
Big_data10.7013710.487635
20.6806780.521548
30.0433970.223937
AI10.5752050.120434
20.5001170.138010
30.0528080.178277
\n", + "
" + ], + "text/plain": [ + " Core volunteers\n", + "Big_data 1 0.701371 0.487635\n", + " 2 0.680678 0.521548\n", + " 3 0.043397 0.223937\n", + "AI 1 0.575205 0.120434\n", + " 2 0.500117 0.138010\n", + " 3 0.052808 0.178277" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's show how to index this! For index hierarchy we use df.loc[], if this was on the columns axis, you would just use normal bracket notation df[]. Calling one level of the index returns the sub-dataframe:" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Corevolunteers
10.7013710.487635
20.6806780.521548
30.0433970.223937
\n", + "
" + ], + "text/plain": [ + " Core volunteers\n", + "1 0.701371 0.487635\n", + "2 0.680678 0.521548\n", + "3 0.043397 0.223937" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.loc['Big_data']" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Core 0.701371\n", + "volunteers 0.487635\n", + "Name: 1, dtype: float64" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.loc['Big_data'].loc[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "FrozenList([None, None])" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.index.names" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [], + "source": [ + "df.index.names=['Domain','S.NO']" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Corevolunteers
DomainS.NO
Big_data10.7013710.487635
20.6806780.521548
30.0433970.223937
AI10.5752050.120434
20.5001170.138010
30.0528080.178277
\n", + "
" + ], + "text/plain": [ + " Core volunteers\n", + "Domain S.NO \n", + "Big_data 1 0.701371 0.487635\n", + " 2 0.680678 0.521548\n", + " 3 0.043397 0.223937\n", + "AI 1 0.575205 0.120434\n", + " 2 0.500117 0.138010\n", + " 3 0.052808 0.178277" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Corevolunteers
S.NO
10.7013710.487635
20.6806780.521548
30.0433970.223937
\n", + "
" + ], + "text/plain": [ + " Core volunteers\n", + "S.NO \n", + "1 0.701371 0.487635\n", + "2 0.680678 0.521548\n", + "3 0.043397 0.223937" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.xs('Big_data')" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [], + "source": [ + "weather_data = {\n", + " 'day': ['1/1/2017','1/2/2017','1/3/2017','1/4/2017','1/5/2017','1/6/2017'],\n", + " 'temperature': [32,35,28,24,32,31],\n", + " 'windspeed': [6,7,2,7,4,2],\n", + " 'event': ['Rain', 'Sunny', 'Snow','Snow','Rain', 'Sunny']\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [], + "source": [ + "df=pd.DataFrame(weather_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
daytemperaturewindspeedevent
01/1/2017326Rain
11/2/2017357Sunny
21/3/2017282Snow
31/4/2017247Snow
41/5/2017324Rain
51/6/2017312Sunny
\n", + "
" + ], + "text/plain": [ + " day temperature windspeed event\n", + "0 1/1/2017 32 6 Rain\n", + "1 1/2/2017 35 7 Sunny\n", + "2 1/3/2017 28 2 Snow\n", + "3 1/4/2017 24 7 Snow\n", + "4 1/5/2017 32 4 Rain\n", + "5 1/6/2017 31 2 Sunny" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(6, 4)" + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape # rows, columns = df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
daytemperaturewindspeedevent
01/1/2017326Rain
11/2/2017357Sunny
21/3/2017282Snow
31/4/2017247Snow
41/5/2017324Rain
\n", + "
" + ], + "text/plain": [ + " day temperature windspeed event\n", + "0 1/1/2017 32 6 Rain\n", + "1 1/2/2017 35 7 Sunny\n", + "2 1/3/2017 28 2 Snow\n", + "3 1/4/2017 24 7 Snow\n", + "4 1/5/2017 32 4 Rain" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head() # df.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
daytemperaturewindspeedevent
11/2/2017357Sunny
21/3/2017282Snow
31/4/2017247Snow
41/5/2017324Rain
51/6/2017312Sunny
\n", + "
" + ], + "text/plain": [ + " day temperature windspeed event\n", + "1 1/2/2017 35 7 Sunny\n", + "2 1/3/2017 28 2 Snow\n", + "3 1/4/2017 24 7 Snow\n", + "4 1/5/2017 32 4 Rain\n", + "5 1/6/2017 31 2 Sunny" + ] + }, + "execution_count": 88, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.tail() # df.tail(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
daytemperaturewindspeedevent
11/2/2017357Sunny
21/3/2017282Snow
\n", + "
" + ], + "text/plain": [ + " day temperature windspeed event\n", + "1 1/2/2017 35 7 Sunny\n", + "2 1/3/2017 28 2 Snow" + ] + }, + "execution_count": 89, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[1:3]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Columns" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['day', 'temperature', 'windspeed', 'event'], dtype='object')" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 1/1/2017\n", + "1 1/2/2017\n", + "2 1/3/2017\n", + "3 1/4/2017\n", + "4 1/5/2017\n", + "5 1/6/2017\n", + "Name: day, dtype: object" + ] + }, + "execution_count": 91, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['day']" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pandas.core.series.Series" + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(df['day'])" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
daytemperature
01/1/201732
11/2/201735
21/3/201728
31/4/201724
41/5/201732
51/6/201731
\n", + "
" + ], + "text/plain": [ + " day temperature\n", + "0 1/1/2017 32\n", + "1 1/2/2017 35\n", + "2 1/3/2017 28\n", + "3 1/4/2017 24\n", + "4 1/5/2017 32\n", + "5 1/6/2017 31" + ] + }, + "execution_count": 94, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[['day','temperature']]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Operations On DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "35" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['temperature'].max()" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
daytemperaturewindspeedevent
11/2/2017357Sunny
\n", + "
" + ], + "text/plain": [ + " day temperature windspeed event\n", + "1 1/2/2017 35 7 Sunny" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['temperature']>32]" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1 1/2/2017\n", + "Name: day, dtype: object" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['day'][df['temperature'] == df['temperature'].max()] # Kinda doing SQL in pandas" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3.8297084310253524" + ] + }, + "execution_count": 102, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['temperature'].std()" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Sunny'" + ] + }, + "execution_count": 103, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['event'].max() # But mean() won't work since data type is string" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
temperaturewindspeed
count6.0000006.000000
mean30.3333334.666667
std3.8297082.338090
min24.0000002.000000
25%28.7500002.500000
50%31.5000005.000000
75%32.0000006.750000
max35.0000007.000000
\n", + "
" + ], + "text/plain": [ + " temperature windspeed\n", + "count 6.000000 6.000000\n", + "mean 30.333333 4.666667\n", + "std 3.829708 2.338090\n", + "min 24.000000 2.000000\n", + "25% 28.750000 2.500000\n", + "50% 31.500000 5.000000\n", + "75% 32.000000 6.750000\n", + "max 35.000000 7.000000" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Missing Data\n", + "\n", + "Let's show a few convenient methods to deal with Missing Data in pandas:" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame({'A':[1,2,np.nan],\n", + " 'B':[5,np.nan,np.nan],\n", + " 'C':[1,2,3]})" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABC
01.05.01
12.0NaN2
2NaNNaN3
\n", + "
" + ], + "text/plain": [ + " A B C\n", + "0 1.0 5.0 1\n", + "1 2.0 NaN 2\n", + "2 NaN NaN 3" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABC
01.05.01
\n", + "
" + ], + "text/plain": [ + " A B C\n", + "0 1.0 5.0 1" + ] + }, + "execution_count": 106, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dropna()" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
C
01
12
23
\n", + "
" + ], + "text/plain": [ + " C\n", + "0 1\n", + "1 2\n", + "2 3" + ] + }, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dropna(axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABC
01.05.01
12.0NaN2
\n", + "
" + ], + "text/plain": [ + " A B C\n", + "0 1.0 5.0 1\n", + "1 2.0 NaN 2" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dropna(thresh=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABC
0151
12shivendra2
2shivendrashivendra3
\n", + "
" + ], + "text/plain": [ + " A B C\n", + "0 1 5 1\n", + "1 2 shivendra 2\n", + "2 shivendra shivendra 3" + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.fillna(value='shivendra')" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 1.0\n", + "1 2.0\n", + "2 1.5\n", + "Name: A, dtype: float64" + ] + }, + "execution_count": 109, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['A'].fillna(value=df['A'].mean())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Groupby\n", + "\n", + "The groupby method allows you to group rows of data together and call aggregate functions" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "# Create dataframe\n", + "data = {'Company':['GOOG','GOOG','MSFT','MSFT','FB','FB'],\n", + " 'Person':['Shivendra','Abhishek','Sowjanya','Manish','Mini','Satya'],\n", + " 'Sales':[200,120,340,124,243,350]}" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CompanyPersonSales
0GOOGShivendra200
1GOOGAbhishek120
2MSFTSowjanya340
3MSFTManish124
4FBMini243
5FBSatya350
\n", + "
" + ], + "text/plain": [ + " Company Person Sales\n", + "0 GOOG Shivendra 200\n", + "1 GOOG Abhishek 120\n", + "2 MSFT Sowjanya 340\n", + "3 MSFT Manish 124\n", + "4 FB Mini 243\n", + "5 FB Satya 350" + ] + }, + "execution_count": 88, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Now you can use the .groupby() method to group rows together based off of a column name. For instance let's group based off of Company. This will create a DataFrameGroupBy object:**" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 89, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.groupby('Company')" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Sales
Company
FB296.5
GOOG160.0
MSFT232.0
\n", + "
" + ], + "text/plain": [ + " Sales\n", + "Company \n", + "FB 296.5\n", + "GOOG 160.0\n", + "MSFT 232.0" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# You can save this object as a new variable:\n", + "by_comp = df.groupby(\"Company\")\n", + "# And then call aggregate methods off the object.\n", + "# numeric_only=True skips the string column 'Person'; pandas >= 2.0 no longer\n", + "# drops non-numeric columns silently and would raise when trying to average it.\n", + "by_comp.mean(numeric_only=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Sales
Company
FB296.5
GOOG160.0
MSFT232.0
\n", + "
" + ], + "text/plain": [ + " Sales\n", + "Company \n", + "FB 296.5\n", + "GOOG 160.0\n", + "MSFT 232.0" + ] + }, + "execution_count": 91, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# numeric_only=True keeps the string column 'Person' out of the average\n", + "# (required on pandas >= 2.0, harmless on older versions).\n", + "df.groupby('Company').mean(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Sales
Company
FB75.660426
GOOG56.568542
MSFT152.735065
\n", + "
" + ], + "text/plain": [ + " Sales\n", + "Company \n", + "FB 75.660426\n", + "GOOG 56.568542\n", + "MSFT 152.735065" + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# More examples of aggregate methods.\n", + "# Select the numeric column explicitly: a standard deviation is undefined for the\n", + "# string column 'Person', and column selection works on every pandas version.\n", + "by_comp[['Sales']].std()" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PersonSales
Company
FBSatya350
GOOGShivendra200
MSFTSowjanya340
\n", + "
" + ], + "text/plain": [ + " Person Sales\n", + "Company \n", + "FB Satya 350\n", + "GOOG Shivendra 200\n", + "MSFT Sowjanya 340" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "by_comp.max()" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PersonSales
Company
FBMini243
GOOGAbhishek120
MSFTManish124
\n", + "
" + ], + "text/plain": [ + " Person Sales\n", + "Company \n", + "FB Mini 243\n", + "GOOG Abhishek 120\n", + "MSFT Manish 124" + ] + }, + "execution_count": 94, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "by_comp.min()" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Sales
countmeanstdmin25%50%75%max
Company
FB2.0296.575.660426243.0269.75296.5323.25350.0
GOOG2.0160.056.568542120.0140.00160.0180.00200.0
MSFT2.0232.0152.735065124.0178.00232.0286.00340.0
\n", + "
" + ], + "text/plain": [ + " Sales \n", + " count mean std min 25% 50% 75% max\n", + "Company \n", + "FB 2.0 296.5 75.660426 243.0 269.75 296.5 323.25 350.0\n", + "GOOG 2.0 160.0 56.568542 120.0 140.00 160.0 180.00 200.0\n", + "MSFT 2.0 232.0 152.735065 124.0 178.00 232.0 286.00 340.0" + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "by_comp.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CompanyFBGOOGMSFT
Salescount2.0000002.0000002.000000
mean296.500000160.000000232.000000
std75.66042656.568542152.735065
min243.000000120.000000124.000000
25%269.750000140.000000178.000000
50%296.500000160.000000232.000000
75%323.250000180.000000286.000000
max350.000000200.000000340.000000
\n", + "
" + ], + "text/plain": [ + "Company FB GOOG MSFT\n", + "Sales count 2.000000 2.000000 2.000000\n", + " mean 296.500000 160.000000 232.000000\n", + " std 75.660426 56.568542 152.735065\n", + " min 243.000000 120.000000 124.000000\n", + " 25% 269.750000 140.000000 178.000000\n", + " 50% 296.500000 160.000000 232.000000\n", + " 75% 323.250000 180.000000 286.000000\n", + " max 350.000000 200.000000 340.000000" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "by_comp.describe().transpose()" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Sales count 2.000000\n", + " mean 160.000000\n", + " std 56.568542\n", + " min 120.000000\n", + " 25% 140.000000\n", + " 50% 160.000000\n", + " 75% 180.000000\n", + " max 200.000000\n", + "Name: GOOG, dtype: float64" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "by_comp.describe().transpose()['GOOG']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Merging, Joining, and Concatenating\n", + "\n", + "There are 3 main ways of combining DataFrames together: Merging, Joining and Concatenating. 
In this section we will discuss these 3 methods with examples.\n", + "\n", + "____" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [], + "source": [ + "df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],\n", + " 'B': ['B0', 'B1', 'B2', 'B3'],\n", + " 'C': ['C0', 'C1', 'C2', 'C3'],\n", + " 'D': ['D0', 'D1', 'D2', 'D3']},\n", + " index=[0, 1, 2, 3])" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [], + "source": [ + "df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],\n", + " 'B': ['B4', 'B5', 'B6', 'B7'],\n", + " 'C': ['C4', 'C5', 'C6', 'C7'],\n", + " 'D': ['D4', 'D5', 'D6', 'D7']},\n", + " index=[4, 5, 6, 7]) " + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [], + "source": [ + "df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],\n", + " 'B': ['B8', 'B9', 'B10', 'B11'],\n", + " 'C': ['C8', 'C9', 'C10', 'C11'],\n", + " 'D': ['D8', 'D9', 'D10', 'D11']},\n", + " index=[8, 9, 10, 11])" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCD
0A0B0C0D0
1A1B1C1D1
2A2B2C2D2
3A3B3C3D3
\n", + "
" + ], + "text/plain": [ + " A B C D\n", + "0 A0 B0 C0 D0\n", + "1 A1 B1 C1 D1\n", + "2 A2 B2 C2 D2\n", + "3 A3 B3 C3 D3" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCD
4A4B4C4D4
5A5B5C5D5
6A6B6C6D6
7A7B7C7D7
\n", + "
" + ], + "text/plain": [ + " A B C D\n", + "4 A4 B4 C4 D4\n", + "5 A5 B5 C5 D5\n", + "6 A6 B6 C6 D6\n", + "7 A7 B7 C7 D7" + ] + }, + "execution_count": 102, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCD
8A8B8C8D8
9A9B9C9D9
10A10B10C10D10
11A11B11C11D11
\n", + "
" + ], + "text/plain": [ + " A B C D\n", + "8 A8 B8 C8 D8\n", + "9 A9 B9 C9 D9\n", + "10 A10 B10 C10 D10\n", + "11 A11 B11 C11 D11" + ] + }, + "execution_count": 103, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Concatenation\n", + "\n", + "Concatenation basically glues together DataFrames. Keep in mind that dimensions should match along the axis you are concatenating on. You can use **pd.concat** and pass in a list of DataFrames to concatenate together:" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCD
0A0B0C0D0
1A1B1C1D1
2A2B2C2D2
3A3B3C3D3
4A4B4C4D4
5A5B5C5D5
6A6B6C6D6
7A7B7C7D7
8A8B8C8D8
9A9B9C9D9
10A10B10C10D10
11A11B11C11D11
\n", + "
" + ], + "text/plain": [ + " A B C D\n", + "0 A0 B0 C0 D0\n", + "1 A1 B1 C1 D1\n", + "2 A2 B2 C2 D2\n", + "3 A3 B3 C3 D3\n", + "4 A4 B4 C4 D4\n", + "5 A5 B5 C5 D5\n", + "6 A6 B6 C6 D6\n", + "7 A7 B7 C7 D7\n", + "8 A8 B8 C8 D8\n", + "9 A9 B9 C9 D9\n", + "10 A10 B10 C10 D10\n", + "11 A11 B11 C11 D11" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([df1,df2,df3])" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCDABCDABCD
0A0B0C0D0NaNNaNNaNNaNNaNNaNNaNNaN
1A1B1C1D1NaNNaNNaNNaNNaNNaNNaNNaN
2A2B2C2D2NaNNaNNaNNaNNaNNaNNaNNaN
3A3B3C3D3NaNNaNNaNNaNNaNNaNNaNNaN
4NaNNaNNaNNaNA4B4C4D4NaNNaNNaNNaN
5NaNNaNNaNNaNA5B5C5D5NaNNaNNaNNaN
6NaNNaNNaNNaNA6B6C6D6NaNNaNNaNNaN
7NaNNaNNaNNaNA7B7C7D7NaNNaNNaNNaN
8NaNNaNNaNNaNNaNNaNNaNNaNA8B8C8D8
9NaNNaNNaNNaNNaNNaNNaNNaNA9B9C9D9
10NaNNaNNaNNaNNaNNaNNaNNaNA10B10C10D10
11NaNNaNNaNNaNNaNNaNNaNNaNA11B11C11D11
\n", + "
" + ], + "text/plain": [ + " A B C D A B C D A B C D\n", + "0 A0 B0 C0 D0 NaN NaN NaN NaN NaN NaN NaN NaN\n", + "1 A1 B1 C1 D1 NaN NaN NaN NaN NaN NaN NaN NaN\n", + "2 A2 B2 C2 D2 NaN NaN NaN NaN NaN NaN NaN NaN\n", + "3 A3 B3 C3 D3 NaN NaN NaN NaN NaN NaN NaN NaN\n", + "4 NaN NaN NaN NaN A4 B4 C4 D4 NaN NaN NaN NaN\n", + "5 NaN NaN NaN NaN A5 B5 C5 D5 NaN NaN NaN NaN\n", + "6 NaN NaN NaN NaN A6 B6 C6 D6 NaN NaN NaN NaN\n", + "7 NaN NaN NaN NaN A7 B7 C7 D7 NaN NaN NaN NaN\n", + "8 NaN NaN NaN NaN NaN NaN NaN NaN A8 B8 C8 D8\n", + "9 NaN NaN NaN NaN NaN NaN NaN NaN A9 B9 C9 D9\n", + "10 NaN NaN NaN NaN NaN NaN NaN NaN A10 B10 C10 D10\n", + "11 NaN NaN NaN NaN NaN NaN NaN NaN A11 B11 C11 D11" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([df1,df2,df3],axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Operations\n", + "\n", + "There are lots of operations with pandas that will be really useful to you, but don't fall into any distinct category. Let's show them here in this lecture:" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
col1col2col3
01444abc
12555def
23666ghi
34444xyz
\n", + "
" + ], + "text/plain": [ + " col1 col2 col3\n", + "0 1 444 abc\n", + "1 2 555 def\n", + "2 3 666 ghi\n", + "3 4 444 xyz" + ] + }, + "execution_count": 106, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame({'col1':[1,2,3,4],'col2':[444,555,666,444],'col3':['abc','def','ghi','xyz']})\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([444, 555, 666], dtype=int64)" + ] + }, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['col2'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3" + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['col2'].nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "444 2\n", + "555 1\n", + "666 1\n", + "Name: col2, dtype: int64" + ] + }, + "execution_count": 109, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['col2'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": {}, + "outputs": [], + "source": [ + "#Select from DataFrame using criteria from multiple columns\n", + "newdf = df[(df['col1']>2) & (df['col2']==444)]" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
col1col2col3
34444xyz
\n", + "
" + ], + "text/plain": [ + " col1 col2 col3\n", + "3 4 444 xyz" + ] + }, + "execution_count": 111, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "newdf" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "metadata": {}, + "outputs": [], + "source": [ + "# Applying Functions\n", + "def times2(x):\n", + " return x*2" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 2\n", + "1 4\n", + "2 6\n", + "3 8\n", + "Name: col1, dtype: int64" + ] + }, + "execution_count": 113, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['col1'].apply(times2)" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 3\n", + "1 3\n", + "2 3\n", + "3 3\n", + "Name: col3, dtype: int64" + ] + }, + "execution_count": 114, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['col3'].apply(len)" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "10" + ] + }, + "execution_count": 115, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['col1'].sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Permanently Removing a Column**" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": {}, + "outputs": [], + "source": [ + "del df['col1']" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
col2col3
0444abc
1555def
2666ghi
3444xyz
\n", + "
" + ], + "text/plain": [ + " col2 col3\n", + "0 444 abc\n", + "1 555 def\n", + "2 666 ghi\n", + "3 444 xyz" + ] + }, + "execution_count": 117, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['col2', 'col3'], dtype='object')" + ] + }, + "execution_count": 118, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# get columns and index names \n", + "df.columns " + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RangeIndex(start=0, stop=4, step=1)" + ] + }, + "execution_count": 119, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.index" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
col2col3
0444abc
1555def
2666ghi
3444xyz
\n", + "
" + ], + "text/plain": [ + " col2 col3\n", + "0 444 abc\n", + "1 555 def\n", + "2 666 ghi\n", + "3 444 xyz" + ] + }, + "execution_count": 120, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
col2col3
0444abc
3444xyz
1555def
2666ghi
\n", + "
" + ], + "text/plain": [ + " col2 col3\n", + "0 444 abc\n", + "3 444 xyz\n", + "1 555 def\n", + "2 666 ghi" + ] + }, + "execution_count": 121, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.sort_values(by='col2') #inplace=False by default" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
col2col3
0FalseFalse
1FalseFalse
2FalseFalse
3FalseFalse
\n", + "
" + ], + "text/plain": [ + " col2 col3\n", + "0 False False\n", + "1 False False\n", + "2 False False\n", + "3 False False" + ] + }, + "execution_count": 123, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Check whether there are any null values\n", + "df.isnull()" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
col2col3
0444abc
1555def
2666ghi
3444xyz
\n", + "
" + ], + "text/plain": [ + " col2 col3\n", + "0 444 abc\n", + "1 555 def\n", + "2 666 ghi\n", + "3 444 xyz" + ] + }, + "execution_count": 124, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Drop rows with NaN Values\n", + "df.dropna()" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
col1col2col3
01.0NaNabc
12.0555.0def
23.0666.0ghi
3NaN444.0xyz
\n", + "
" + ], + "text/plain": [ + " col1 col2 col3\n", + "0 1.0 NaN abc\n", + "1 2.0 555.0 def\n", + "2 3.0 666.0 ghi\n", + "3 NaN 444.0 xyz" + ] + }, + "execution_count": 125, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame({'col1':[1,2,3,np.nan],\n", + " 'col2':[np.nan,555,666,444],\n", + " 'col3':['abc','def','ghi','xyz']})\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
col1col2col3
01FILLabc
12555def
23666ghi
3FILL444xyz
\n", + "
" + ], + "text/plain": [ + " col1 col2 col3\n", + "0 1 FILL abc\n", + "1 2 555 def\n", + "2 3 666 ghi\n", + "3 FILL 444 xyz" + ] + }, + "execution_count": 126, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.fillna('FILL')" + ] + }, + { + "cell_type": "code", + "execution_count": 127, + "metadata": {}, + "outputs": [], + "source": [ + "data = {'A':['foo','foo','foo','bar','bar','bar'],\n", + " 'B':['one','one','two','two','one','one'],\n", + " 'C':['x','y','x','y','x','y'],\n", + " 'D':[1,3,2,5,4,1]}\n", + "\n", + "df = pd.DataFrame(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCD
0fooonex1
1foooney3
2footwox2
3bartwoy5
4baronex4
5baroney1
\n", + "
" + ], + "text/plain": [ + " A B C D\n", + "0 foo one x 1\n", + "1 foo one y 3\n", + "2 foo two x 2\n", + "3 bar two y 5\n", + "4 bar one x 4\n", + "5 bar one y 1" + ] + }, + "execution_count": 128, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Great" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}