diff --git a/spark/spark.ipynb b/spark/spark.ipynb index f619b7b..2f7b5ab 100644 --- a/spark/spark.ipynb +++ b/spark/spark.ipynb @@ -33,7 +33,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Start the pyspark shell:" + "Start the pyspark shell (REPL):" ] }, { @@ -44,14 +44,14 @@ }, "outputs": [], "source": [ - "pyspark" + "!pyspark" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "View the spark context:" + "View the spark context, the main entry point to the Spark API:" ] }, { @@ -69,7 +69,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## RDDs" + "## RDDs\n", + "\n", + "Resilient Distributed Datasets (RDDs) are the fundamental unit of data in Spark. RDDs can be created from a file, from data in memory, or from another RDD. RDDs are immutable.\n", + "\n", + "There are two types of RDD operations:\n", + "* Actions: Return values; data in an RDD is not processed until an action is performed\n", + "* Transformations: Define a new RDD based on the current one\n" ] }, { @@ -1284,7 +1290,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.9" + "version": "2.7.10" } }, "nbformat": 4,