From d4450573c36707fceaea6cf59771173bfedaf6f8 Mon Sep 17 00:00:00 2001
From: Alessandro
Date: Sat, 20 Feb 2016 19:35:22 +0100
Subject: [PATCH 1/5] Added DataFrames section

---
 spark/spark.ipynb | 236 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 231 insertions(+), 5 deletions(-)

diff --git a/spark/spark.ipynb b/spark/spark.ipynb
index 9723b4f..4bfac0a 100644
--- a/spark/spark.ipynb
+++ b/spark/spark.ipynb
@@ -64,11 +64,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {
     "collapsed": false
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/bin/sh: pyspark: command not found\r\n"
+     ]
+    }
+   ],
    "source": [
     "!pyspark"
    ]
@@ -82,11 +90,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {
     "collapsed": false
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       ""
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "sc"
    ]
@@ -113,7 +132,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {
     "collapsed": false
    },
@@ -404,6 +423,213 @@
     " print user_id, count, user_info"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## DataFrames"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A DataFrame is a distributed collection of data organized into named columns. It is conceptually equivalent to a table in a relational database or a data frame in R/Python, but with richer optimizations under the hood. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Given the Spark Context, create a SQLContext:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "from pyspark.sql import SQLContext\n",
+    "sqlContext = SQLContext(sc)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create a dataframe based on the content of a file:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df = sqlContext.jsonFile(\"file:/path/file.json\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Display the content of the DataFrame:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Print the schema:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df.printSchema()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Select a column:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df.select(\"column_name\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create a DataFrame with rows matching a given filter:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df.filter(df.column_name > 10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Aggregate the results and count:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'df' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
+      "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupBy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"column_name\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m: name 'df' is not defined"
+     ]
+    }
+   ],
+   "source": [
+    "df.groupBy(\"column_name\").count()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Convert a RDD to a DataFrame (by inferring the schema):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df = sqlContext.inferSchema(my_data)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Register the DataFrame as a table:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df.registerTempTable(\"dataframe_name\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Run a SQL Query on a DataFrame registered as a table:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "rdd_from_df = sqlContext.sql(\"SELECT * FROM dataframe_name\") #the result is a RDD"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},

From b15edb75859ece2938550dd74b0a9a0d7aaadb10 Mon Sep 17 00:00:00 2001
From: Alessandro
Date: Sat, 20 Feb 2016 19:49:32 +0100
Subject: [PATCH 2/5] Added DataFrames section and cleared outputs

---
 .DS_Store         | Bin 0 -> 6148 bytes
 spark/spark.ipynb |  45 +++++++--------------------------------------
 2 files changed, 7 insertions(+), 38 deletions(-)
 create mode 100644 .DS_Store

diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..df3e054fbb5df58a533420c860cfc13a5096e329
GIT binary patch
literal 6148
zcmeHK%}T>S5Zl3;a2$0^a>j$m)jO`e
zZqoLG$Qz3#$Nyvi_imactj$~&v*G>w@Y;b)(o*T2=ce-0GqYA<4xfd>dE+YTHc~h3
zBsDiaRXy4AJ!9A1^lD%qMN#l&yWX+%yG?s(FA7svhJHMhdUhhiR@ALUov@WO?d5vx
z-#~QT4gF)8NKXbby|oJXH0_mMuUaXJ&B{){D0SNna-T3avg9-f4~yZeX7
zr{|Yf%{qtAQ^}&i0la}RwEUY!90pN%3En}*Ad-+6AO?tm31Yw;an}3Kmh~s%g6tM9wfhY_*1~ZK?0>X7FpibrHiota{_=Smc3}za2
I^$|)c#oNxn;QyOvx8r#aK;^t)Di>4z$gP#x@%+oKlu6nKbk~6Vt^Q!C-V}UnyXyQVg+JidR9EfM1{i=orj2f(L|t1QZR_5Ceb8z$ZkP
BTtol>

literal 0
HcmV?d00001

diff --git a/spark/spark.ipynb b/spark/spark.ipynb
index 4bfac0a..40cdd7e 100644
--- a/spark/spark.ipynb
+++ b/spark/spark.ipynb
@@ -64,19 +64,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {
     "collapsed": false
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "/bin/sh: pyspark: command not found\r\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "!pyspark"
    ]
@@ -90,22 +82,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {
     "collapsed": false
    },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       ""
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "sc"
    ]
@@ -132,7 +113,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {
     "collapsed": false
    },
@@ -555,23 +536,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {
     "collapsed": false
    },
-   "outputs": [
-    {
-     "ename": "NameError",
-     "evalue": "name 'df' is not defined",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
-      "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupBy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"column_name\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
-      "\u001b[0;31mNameError\u001b[0m: name 'df' is not defined"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "df.groupBy(\"column_name\").count()"
    ]

From e4e1284a15d4f34bb3b091058309cbf6cbc32f86 Mon Sep 17 00:00:00 2001
From: Donne Martin
Date: Sun, 21 Feb 2016 06:00:40 -0500
Subject: [PATCH 3/5] Move DataFrames before RDDs

---
 spark/spark.ipynb | 401 +++++++++++++++++++++++-----------------------
 1 file changed, 201 insertions(+), 200 deletions(-)

diff --git a/spark/spark.ipynb b/spark/spark.ipynb
index 40cdd7e..43c509d 100644
--- a/spark/spark.ipynb
+++ b/spark/spark.ipynb
@@ -15,6 +15,7 @@
     "\n",
     "* IPython Notebook Setup\n",
     "* Python Shell\n",
+    "* DataFrames\n",
     "* RDDs\n",
     "* Pair RDDs\n",
     "* Running Spark on a Cluster\n",
@@ -91,6 +92,201 @@
     "sc"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## DataFrames"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A DataFrame is a distributed collection of data organized into named columns. It is conceptually equivalent to a table in a relational database or a data frame in R/Python, but with richer optimizations under the hood. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Given the Spark Context, create a SQLContext:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "from pyspark.sql import SQLContext\n",
+    "sqlContext = SQLContext(sc)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create a dataframe based on the content of a file:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df = sqlContext.jsonFile(\"file:/path/file.json\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Display the content of the DataFrame:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Print the schema:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df.printSchema()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Select a column:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df.select(\"column_name\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create a DataFrame with rows matching a given filter:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df.filter(df.column_name > 10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Aggregate the results and count:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "df.groupBy(\"column_name\").count()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Convert a RDD to a DataFrame (by inferring the schema):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df = sqlContext.inferSchema(my_data)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Register the DataFrame as a table:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df.registerTempTable(\"dataframe_name\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Run a SQL Query on a DataFrame registered as a table:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "rdd_from_df = sqlContext.sql(\"SELECT * FROM dataframe_name\") #the result is a RDD"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -404,201 +600,6 @@
     " print user_id, count, user_info"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## DataFrames"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "A DataFrame is a distributed collection of data organized into named columns. It is conceptually equivalent to a table in a relational database or a data frame in R/Python, but with richer optimizations under the hood. "
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Given the Spark Context, create a SQLContext:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "from pyspark.sql import SQLContext\n",
-    "sqlContext = SQLContext(sc)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Create a dataframe based on the content of a file:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "df = sqlContext.jsonFile(\"file:/path/file.json\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Display the content of the DataFrame:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "df.show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Print the schema:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "df.printSchema()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Select a column:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "df.select(\"column_name\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Create a DataFrame with rows matching a given filter:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "df.filter(df.column_name > 10)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Aggregate the results and count:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "df.groupBy(\"column_name\").count()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Convert a RDD to a DataFrame (by inferring the schema):"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "df = sqlContext.inferSchema(my_data)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Register the DataFrame as a table:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "df.registerTempTable(\"dataframe_name\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Run a SQL Query on a DataFrame registered as a table:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "rdd_from_df = sqlContext.sql(\"SELECT * FROM dataframe_name\") #the result is a RDD"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -1497,21 +1498,21 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 2",
+   "display_name": "Python 3",
    "language": "python",
-   "name": "python2"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
     "name": "ipython",
-    "version": 2
+    "version": 3
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython2",
-   "version": "2.7.10"
+   "pygments_lexer": "ipython3",
+   "version": "3.4.3"
   }
  },
 "nbformat": 4,

From 34889ce7c88c4495f3fb33209986c9d647f6d9f9 Mon Sep 17 00:00:00 2001
From: Donne Martin
Date: Sun, 21 Feb 2016 06:16:34 -0500
Subject: [PATCH 4/5] Add more Spark DataFrame examples

---
 spark/spark.ipynb | 171 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 168 insertions(+), 3 deletions(-)

diff --git a/spark/spark.ipynb b/spark/spark.ipynb
index 43c509d..3cfe1dd 100644
--- a/spark/spark.ipynb
+++ b/spark/spark.ipynb
@@ -103,9 +103,174 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "From the following [reference](https://databricks.com/blog/2015/02/17/introducing-dataframes-in-spark-for-large-scale-data-science.html):\n",
+    "\n",
     "A DataFrame is a distributed collection of data organized into named columns. It is conceptually equivalent to a table in a relational database or a data frame in R/Python, but with richer optimizations under the hood. "
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create a DataFrame from JSON files on S3:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "users = context.load(\"s3n://path/to/users.json\", \"json\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create a new DataFrame that contains “young users” only:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "young = users.filter(users.age<21)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Alternatively, using Pandas-like syntax:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "young = users[users.age<21]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Increment everybody’s age by 1:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "young.select(young.name, young.age+1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Count the number of young users by gender:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "young.groupBy(\"gender\").count()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Join young users with another DataFrame called logs:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "young.join(logs, logs.userId == users.userId, \"left_outer\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Count the number of users in the young DataFrame:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "young.registerTempTable(\"young\")\n",
+    "context.sql(\"SELECT count(*) FROM young\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Convert Spark DataFrame to Pandas:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "pandas_df = young.toPandas()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create a Spark DataFrame from Pandas:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "spark_df = context.createDataFrame(pandas_df)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -129,7 +294,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Create a dataframe based on the content of a file:"
+    "Create a DataFrame based on the content of a file:"
    ]
   },
   {
@@ -212,7 +377,7 @@
    },
    "outputs": [],
    "source": [
-    "df.filter(df.column_name > 10)"
+    "df.filter(df.column_name>10)"
    ]
   },
   {
@@ -284,7 +449,7 @@
    },
    "outputs": [],
    "source": [
-    "rdd_from_df = sqlContext.sql(\"SELECT * FROM dataframe_name\") #the result is a RDD"
+    "rdd_from_df = sqlContext.sql(\"SELECT * FROM dataframe_name\")"
    ]
   },
   {

From 138cd1054eb55d036af3bcae0a476d5f0815953a Mon Sep 17 00:00:00 2001
From: Donne Martin
Date: Sun, 21 Feb 2016 06:21:59 -0500
Subject: [PATCH 5/5] Add note on DataFrame recommendation over RDD

---
 spark/spark.ipynb | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/spark/spark.ipynb b/spark/spark.ipynb
index 3cfe1dd..e84259e 100644
--- a/spark/spark.ipynb
+++ b/spark/spark.ipynb
@@ -458,6 +458,8 @@
    "source": [
     "## RDDs\n",
     "\n",
+    "Note: RDDs are included for completeness. In Spark 1.3, DataFrames were introduced which are recommended over RDDs. Check out the [DataFrames announcement](https://databricks.com/blog/2015/02/17/introducing-dataframes-in-spark-for-large-scale-data-science.html) for more info.\n",
+    "\n",
     "Resilient Distributed Datasets (RDDs) are the fundamental unit of data in Spark. RDDs can be created from a file, from data in memory, or from another RDD. RDDs are immutable.\n",
     "\n",
     "There are two types of RDD operations:\n",