From d4450573c36707fceaea6cf59771173bfedaf6f8 Mon Sep 17 00:00:00 2001
From: Alessandro
Date: Sat, 20 Feb 2016 19:35:22 +0100
Subject: [PATCH 1/5] Added DataFrames section

---
 spark/spark.ipynb | 236 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 231 insertions(+), 5 deletions(-)

diff --git a/spark/spark.ipynb b/spark/spark.ipynb
index 9723b4f..4bfac0a 100644
--- a/spark/spark.ipynb
+++ b/spark/spark.ipynb
@@ -64,11 +64,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {
     "collapsed": false
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/bin/sh: pyspark: command not found\r\n"
+     ]
+    }
+   ],
    "source": [
     "!pyspark"
    ]
@@ -82,11 +90,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {
     "collapsed": false
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       ""
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "sc"
    ]
@@ -113,7 +132,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {
     "collapsed": false
    },
@@ -404,6 +423,213 @@
     " print user_id, count, user_info"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## DataFrames"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A DataFrame is a distributed collection of data organized into named columns. It is conceptually equivalent to a table in a relational database or a data frame in R/Python, but with richer optimizations under the hood. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Given the Spark Context, create a SQLContext:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "from pyspark.sql import SQLContext\n",
+    "sqlContext = SQLContext(sc)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create a dataframe based on the content of a file:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df = sqlContext.jsonFile(\"file:/path/file.json\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Display the content of the DataFrame:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Print the schema:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df.printSchema()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Select a column:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df.select(\"column_name\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create a DataFrame with rows matching a given filter:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df.filter(df.column_name > 10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Aggregate the results and count:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'df' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
+      "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupBy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"column_name\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m: name 'df' is not defined"
+     ]
+    }
+   ],
+   "source": [
+    "df.groupBy(\"column_name\").count()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Convert a RDD to a DataFrame (by inferring the schema):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df = sqlContext.inferSchema(my_data)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Register the DataFrame as a table:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df.registerTempTable(\"dataframe_name\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Run a SQL Query on a DataFrame registered as a table:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "rdd_from_df = sqlContext.sql(\"SELECT * FROM dataframe_name\") #the result is a RDD"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},

From b15edb75859ece2938550dd74b0a9a0d7aaadb10 Mon Sep 17 00:00:00 2001
From: Alessandro
Date: Sat, 20 Feb 2016 19:49:32 +0100
Subject: [PATCH 2/5] Added DataFrames section and cleared outputs

---
 .DS_Store         | Bin 0 -> 6148 bytes
 spark/spark.ipynb |  45 +++++++--------------------------------------
 2 files changed, 7 insertions(+), 38 deletions(-)
 create mode 100644 .DS_Store

diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..df3e054fbb5df58a533420c860cfc13a5096e329
GIT binary patch
literal 6148
zcmeHK%}T>S5Zl3;a2$0^a>j$m)jO`e
zZqoLG$Qz3#$Nyvi_imactj$~&v*G>w@Y;b)(o*T2=ce-0GqYA<4xfd>dE+YTHc~h3
zBsDiaRXy4AJ!9A1^lD%qMN#l&yWX+%yG?s(FA7svhJHMhdUhhiR@ALUov@WO?d5vx
z-#~QT4gF)8NKXbby|oJXH0_mMuUaXJ&B{){D0SNna-T3avg9-f4~yZeX7
zr{|Yf%{qtAQ^}&i0la}RwEUY!90pN%3En}*Ad-+6AO?tm31Yw;an}3Kmh~s%g6tM9wfhY_*1~ZK?0>X7FpibrHiota{_=Smc3}za2
I^$|)c#oNxn;QyOvx8r#aK;^t)Di>4z$gP#x@%+oKlu6nKbk~6Vt^Q!C-V}UnyXyQVg+JidR9EfM1{i=orj2f(L|t1QZR_5Ceb8z$ZkP
BTtol>

literal 0
HcmV?d00001

diff --git a/spark/spark.ipynb b/spark/spark.ipynb
index 4bfac0a..40cdd7e 100644
--- a/spark/spark.ipynb
+++ b/spark/spark.ipynb
@@ -64,19 +64,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {
     "collapsed": false
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "/bin/sh: pyspark: command not found\r\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "!pyspark"
    ]
@@ -90,22 +82,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {
     "collapsed": false
    },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       ""
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "sc"
    ]
@@ -132,7 +113,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {
     "collapsed": false
    },
@@ -555,23 +536,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {
     "collapsed": false
    },
-   "outputs": [
-    {
-     "ename": "NameError",
-     "evalue": "name 'df' is not defined",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
-      "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupBy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"column_name\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
-      "\u001b[0;31mNameError\u001b[0m: name 'df' is not defined"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "df.groupBy(\"column_name\").count()"
    ]

From e4e1284a15d4f34bb3b091058309cbf6cbc32f86 Mon Sep 17 00:00:00 2001
From: Donne Martin
Date: Sun, 21 Feb 2016 06:00:40 -0500
Subject: [PATCH 3/5] Move DataFrames before RDDs

---
 spark/spark.ipynb | 401 +++++++++++++++++++++++-----------------------
 1 file changed, 201 insertions(+), 200 deletions(-)

diff --git a/spark/spark.ipynb b/spark/spark.ipynb
index 40cdd7e..43c509d 100644
--- a/spark/spark.ipynb
+++ b/spark/spark.ipynb
@@ -15,6 +15,7 @@
     "\n",
     "* IPython Notebook Setup\n",
     "* Python Shell\n",
+    "* DataFrames\n",
     "* RDDs\n",
     "* Pair RDDs\n",
     "* Running Spark on a Cluster\n",
@@ -91,6 +92,201 @@
     "sc"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## DataFrames"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A DataFrame is a distributed collection of data organized into named columns. It is conceptually equivalent to a table in a relational database or a data frame in R/Python, but with richer optimizations under the hood. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Given the Spark Context, create a SQLContext:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "from pyspark.sql import SQLContext\n",
+    "sqlContext = SQLContext(sc)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create a dataframe based on the content of a file:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df = sqlContext.jsonFile(\"file:/path/file.json\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Display the content of the DataFrame:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Print the schema:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df.printSchema()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Select a column:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df.select(\"column_name\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create a DataFrame with rows matching a given filter:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df.filter(df.column_name > 10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Aggregate the results and count:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "df.groupBy(\"column_name\").count()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Convert a RDD to a DataFrame (by inferring the schema):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df = sqlContext.inferSchema(my_data)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Register the DataFrame as a table:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df.registerTempTable(\"dataframe_name\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Run a SQL Query on a DataFrame registered as a table:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "rdd_from_df = sqlContext.sql(\"SELECT * FROM dataframe_name\") #the result is a RDD"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -404,201 +600,6 @@
     " print user_id, count, user_info"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## DataFrames"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "A DataFrame is a distributed collection of data organized into named columns. It is conceptually equivalent to a table in a relational database or a data frame in R/Python, but with richer optimizations under the hood. "
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Given the Spark Context, create a SQLContext:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "from pyspark.sql import SQLContext\n",
-    "sqlContext = SQLContext(sc)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Create a dataframe based on the content of a file:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "df = sqlContext.jsonFile(\"file:/path/file.json\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Display the content of the DataFrame:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "df.show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Print the schema:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "df.printSchema()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Select a column:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "df.select(\"column_name\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Create a DataFrame with rows matching a given filter:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "df.filter(df.column_name > 10)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Aggregate the results and count:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "df.groupBy(\"column_name\").count()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Convert a RDD to a DataFrame (by inferring the schema):"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "df = sqlContext.inferSchema(my_data)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Register the DataFrame as a table:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "df.registerTempTable(\"dataframe_name\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Run a SQL Query on a DataFrame registered as a table:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "rdd_from_df = sqlContext.sql(\"SELECT * FROM dataframe_name\") #the result is a RDD"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -1497,21 +1498,21 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 2",
+   "display_name": "Python 3",
    "language": "python",
-   "name": "python2"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
     "name": "ipython",
-    "version": 2
+    "version": 3
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython2",
-   "version": "2.7.10"
+   "pygments_lexer": "ipython3",
+   "version": "3.4.3"
   }
  },
 "nbformat": 4,

From 34889ce7c88c4495f3fb33209986c9d647f6d9f9 Mon Sep 17 00:00:00 2001
From: Donne Martin
Date: Sun, 21 Feb 2016 06:16:34 -0500
Subject: [PATCH 4/5] Add more Spark DataFrame examples

---
 spark/spark.ipynb | 171 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 168 insertions(+), 3 deletions(-)

diff --git a/spark/spark.ipynb b/spark/spark.ipynb
index 43c509d..3cfe1dd 100644
--- a/spark/spark.ipynb
+++ b/spark/spark.ipynb
@@ -103,9 +103,174 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "From the following [reference](https://databricks.com/blog/2015/02/17/introducing-dataframes-in-spark-for-large-scale-data-science.html):\n",
+    "\n",
     "A DataFrame is a distributed collection of data organized into named columns. It is conceptually equivalent to a table in a relational database or a data frame in R/Python, but with richer optimizations under the hood. "
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create a DataFrame from JSON files on S3:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "users = context.load(\"s3n://path/to/users.json\", \"json\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create a new DataFrame that contains “young users” only:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "young = users.filter(users.age<21)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Alternatively, using Pandas-like syntax:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "young = users[users.age<21]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Increment everybody’s age by 1:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "young.select(young.name, young.age+1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Count the number of young users by gender:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "young.groupBy(\"gender\").count()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Join young users with another DataFrame called logs:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "young.join(logs, logs.userId == users.userId, \"left_outer\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Count the number of users in the young DataFrame:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "young.registerTempTable(\"young\")\n",
+    "context.sql(\"SELECT count(*) FROM young\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Convert Spark DataFrame to Pandas:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "pandas_df = young.toPandas()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create a Spark DataFrame from Pandas:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "spark_df = context.createDataFrame(pandas_df)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -129,7 +294,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Create a dataframe based on the content of a file:"
+    "Create a DataFrame based on the content of a file:"
    ]
   },
   {
@@ -212,7 +377,7 @@
    },
    "outputs": [],
    "source": [
-    "df.filter(df.column_name > 10)"
+    "df.filter(df.column_name>10)"
    ]
   },
   {
@@ -284,7 +449,7 @@
    },
    "outputs": [],
    "source": [
-    "rdd_from_df = sqlContext.sql(\"SELECT * FROM dataframe_name\") #the result is a RDD"
+    "rdd_from_df = sqlContext.sql(\"SELECT * FROM dataframe_name\")"
    ]
   },
   {

From 138cd1054eb55d036af3bcae0a476d5f0815953a Mon Sep 17 00:00:00 2001
From: Donne Martin
Date: Sun, 21 Feb 2016 06:21:59 -0500
Subject: [PATCH 5/5] Add note on DataFrame recommendation over RDD

---
 spark/spark.ipynb | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/spark/spark.ipynb b/spark/spark.ipynb
index 3cfe1dd..e84259e 100644
--- a/spark/spark.ipynb
+++ b/spark/spark.ipynb
@@ -458,6 +458,8 @@
    "source": [
     "## RDDs\n",
     "\n",
+    "Note: RDDs are included for completeness. In Spark 1.3, DataFrames were introduced which are recommended over RDDs. Check out the [DataFrames announcement](https://databricks.com/blog/2015/02/17/introducing-dataframes-in-spark-for-large-scale-data-science.html) for more info.\n",
+    "\n",
     "Resilient Distributed Datasets (RDDs) are the fundamental unit of data in Spark. RDDs can be created from a file, from data in memory, or from another RDD. RDDs are immutable.\n",
     "\n",
     "There are two types of RDD operations:\n",