From 87402ca5b8accc953146497bce0edecda865bd62 Mon Sep 17 00:00:00 2001 From: Donne Martin Date: Sat, 28 Feb 2015 12:44:56 -0500 Subject: [PATCH] Added IPython Notebook containing HDFS snippets. --- README.md | 22 +++-- spark/hdfs.ipynb | 226 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 240 insertions(+), 8 deletions(-) create mode 100644 spark/hdfs.ipynb diff --git a/README.md b/README.md index 495420d..4fdd1c7 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,20 @@ IPython Notebooks demonstrating pandas functionality. * [pandas io](http://nbviewer.ipython.org/github/donnemartin/ipython-data-notebooks/blob/master/pandas/pandas_io.ipynb) * [pandas cleaning](http://nbviewer.ipython.org/github/donnemartin/ipython-data-notebooks/blob/master/pandas/pandas_clean.ipynb) +## spark + +IPython Notebooks demonstrating spark and HDFS functionality. + +* [hdfs](http://nbviewer.ipython.org/github/donnemartin/ipython-data-notebooks/blob/master/spark/hdfs.ipynb) + +## commands + +IPython Notebooks demonstrating various command lines for AWS, Unix, etc. + +* [aws commands](http://nbviewer.ipython.org/github/donnemartin/ipython-data-notebooks/blob/master/commands/aws.ipynb) +* [linux commands](http://nbviewer.ipython.org/github/donnemartin/ipython-data-notebooks/blob/master/commands/linux.ipynb) +* [jekyll commands](http://nbviewer.ipython.org/github/donnemartin/ipython-data-notebooks/blob/master/commands/jekyll.ipynb) + ## matplotlib [Coming Soon] IPython Notebooks demonstrating matplotlib functionality. @@ -35,14 +49,6 @@ IPython Notebooks demonstrating pandas functionality. [Coming Soon] IPython Notebooks demonstrating NumPy functionality. -## commands - -IPython Notebooks demonstrating various command lines for AWS, Unix, etc. 
- -* [aws commands](http://nbviewer.ipython.org/github/donnemartin/ipython-data-notebooks/blob/master/commands/aws.ipynb) -* [linux commands](http://nbviewer.ipython.org/github/donnemartin/ipython-data-notebooks/blob/master/commands/linux.ipynb) -* [jekyll commands](http://nbviewer.ipython.org/github/donnemartin/ipython-data-notebooks/blob/master/commands/jekyll.ipynb) - ## References * [Python for Data Analysis: Data Wrangling with Pandas, NumPy, and IPython](http://www.amazon.com/Python-Data-Analysis-Wrangling-IPython/dp/1449319793) diff --git a/spark/hdfs.ipynb b/spark/hdfs.ipynb new file mode 100644 index 0000000..44d7585 --- /dev/null +++ b/spark/hdfs.ipynb @@ -0,0 +1,226 @@ +{ + "metadata": { + "name": "", + "signature": "sha256:50db54b924da35ab4fc1dba7b6a4d444e2206d0d631f40b085a56636d739839c" + }, + "nbformat": 3, + "nbformat_minor": 0, + "worksheets": [ + { + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# HDFS" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run an HDFS command:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "hdfs" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run a file system command on the file systems (FsShell):" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "hdfs dfs" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "List the user's home directory:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "hdfs dfs -ls" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "List the HDFS root directory:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "hdfs dfs -ls /" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { 
+ "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copy a local file to the user's directory on HDFS:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "hdfs dfs -put file.txt file.txt" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Display the contents of the specified HDFS file:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "hdfs dfs -cat file.txt" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Print the last 10 lines of the file to the terminal:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "hdfs dfs -cat file.txt | tail -n 10" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "View the contents of all files in a directory:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "hdfs dfs -cat dir/* | less" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copy an HDFS file to the local file system:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "hdfs dfs -get file.txt file.txt" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a directory on HDFS:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "hdfs dfs -mkdir dir" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Recursively delete the specified directory and all of its contents:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "hdfs dfs -rm -r dir" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "Specify an HDFS file in Spark (relative paths resolve against the user's HDFS home directory):" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "data = sc.textFile (\"hdfs://hdfs-host:port/path/file.txt\")" + ], + "language": "python", + "metadata": {}, + "outputs": [] + } + ], + "metadata": {} + } + ] +} \ No newline at end of file