mirror of
https://github.com/donnemartin/data-science-ipython-notebooks.git
synced 2024-03-22 13:30:56 +08:00
Added Spark streaming snippets.
This commit is contained in:
parent
8364d476b3
commit
10d63efb4a
|
@ -1,7 +1,7 @@
|
||||||
{
|
{
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"name": "",
|
"name": "",
|
||||||
"signature": "sha256:41d7aa3100998f60caf63db5380eee43e78d4de66c65af923e4877df81248ae7"
|
"signature": "sha256:7abeb0526adb21448f2c1ad55b4e1cb97cb81bcc62e9c450b3c5645578c7b59b"
|
||||||
},
|
},
|
||||||
"nbformat": 3,
|
"nbformat": 3,
|
||||||
"nbformat_minor": 0,
|
"nbformat_minor": 0,
|
||||||
|
@ -23,7 +23,8 @@
|
||||||
"* Caching RDDs\n",
|
"* Caching RDDs\n",
|
||||||
"* Checkpointing RDDs\n",
|
"* Checkpointing RDDs\n",
|
||||||
"* Writing and Running a Spark Application\n",
|
"* Writing and Running a Spark Application\n",
|
||||||
"* Configuring Spark Applications"
|
"* Configuring Spark Applications\n",
|
||||||
|
"* Streaming"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -816,6 +817,177 @@
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": []
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Streaming"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Start the Spark Shell locally with at least two threads (need a minimum of two threads for streaming, one for receiving, one for processing):"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"!spark-shell --master local[2]"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Create a StreamingContext (similar to SparkContext in core Spark) with a batch duration of 1 second:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"val ssc = new StreamingContext(new SparkConf(), Seconds(1)) \n",
|
||||||
|
"val my_stream = ssc.socketTextStream(hostname, port)"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Get a DStream from a streaming data source (text from a socket):"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"val logs = ssc.socketTextStream(hostname, port)"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"DStreams support regular transformations such as map, flatMap, and filter, and pair transformations such as reduceByKey, groupByKey, and joinByKey.\n",
|
||||||
|
"\n",
|
||||||
|
"Apply a DStream operation to each batch of RDDs (count up requests by user id, reduce by key to get the count):"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"val requests = my_stream\n",
|
||||||
|
" .map(line => (line.split(\" \")(2), 1))\n",
|
||||||
|
" .reduceByKey((x, y) => x + y)"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"The transform(function) method creates a new DStream by executing the input function on the RDDs."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"val sorted_requests = requests\n",
|
||||||
|
" .map(pair => pair.swap)\n",
|
||||||
|
" .transform(rdd => rdd.sortByKey(false))"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"foreachRDD(function) performs a function on each RDD in the DStream (map is like a shortcut not requiring you to get the RDD first before doing an operation):"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"sorted_requests.foreachRDD((rdd,time) => {\n",
|
||||||
|
" println(\"Top users @ \" + time)\n",
|
||||||
|
" rdd.take(5).foreach(\n",
|
||||||
|
" pair => printf(\"User: %s (%s)\\n\",pair._2, pair._1))\n",
|
||||||
|
"}"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Save the DStream result part files with the given folder prefix, the actual folder will be /dir/requests-timestamp0/:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"requests.saveAsTextFiles(\"/dir/requests\")"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Start the execution of all DStreams:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"ssc.start()"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Wait for all background threads to complete before ending the main thread:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"ssc.awaitTermination()"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": []
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {}
|
"metadata": {}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user