# data-science-ipython-notebooks/mapreduce/test_mr_s3_log_parser.py
#
# 88 lines
# 3.1 KiB
# Python

from StringIO import StringIO
import unittest2 as unittest
from mr_s3_log_parser import MrS3LogParser
class MrTestsUtil(object):
    """Helper for running an mrjob job in a sandbox from unit tests."""

    def run_mr_sandbox(self, mr_job, stdin):
        """Run ``mr_job`` on ``stdin`` and lazily yield each output value.

        This is a generator: nothing (including the call to
        ``mr_job.sandbox``) happens until the caller starts iterating.

        Args:
            mr_job: Job object providing ``sandbox``, ``make_runner`` and
                ``parse_output_line``.
            stdin: File-like object handed to ``mr_job.sandbox`` as the
                job's standard input.

        Yields:
            The value half of every ``(key, value)`` pair returned by
            ``mr_job.parse_output_line`` for each line of runner output.
        """
        # The job is expected to be created with arguments along the lines
        # of ['-r', 'inline', '--no-conf', '-']:
        #   * inline runs the job in the same process, so small jobs tend
        #     to run faster and stack traces are simpler
        #   * --no-conf prevents options from a local mrjob.conf from
        #     polluting the testing environment
        #   * "-" reads from standard in
        mr_job.sandbox(stdin=stdin)

        # make_runner is used as a context manager so that job cleanup is
        # performed regardless of success or failure.
        with mr_job.make_runner() as runner:
            runner.run()
            for line in runner.stream_output():
                # Only the value is of interest to the tests; the key is
                # parsed but intentionally discarded.
                _key, value = mr_job.parse_output_line(line)
                yield value
class TestMrS3LogParser(unittest.TestCase):
    """Unit tests for the MrS3LogParser job."""

    # Populated in setUp(); kept as class attributes so the names always
    # exist on the test case.
    mr_job = None
    mr_tests_util = None

    # A log line cut off after the remote IP field (it is a strict prefix
    # of RAW_LOG_LINE_VALID below).
    # NOTE: there must be no trailing backslash after the last fragment,
    # otherwise the literal is joined with whatever statement follows.
    RAW_LOG_LINE_INVALID = \
        '00000fe9688b6e57f75bd2b7f7c1610689e8f01000000' \
        '00000388225bcc00000 ' \
        's3-storage [22/Jul/2013:21:03:27 +0000] ' \
        '00.111.222.33 '

    # A complete log line.
    RAW_LOG_LINE_VALID = \
        '00000fe9688b6e57f75bd2b7f7c1610689e8f01000000' \
        '00000388225bcc00000 ' \
        's3-storage [22/Jul/2013:21:03:27 +0000] ' \
        '00.111.222.33 ' \
        'arn:aws:sts::000005646931:federated-user/user 00000AB825500000 ' \
        'REST.HEAD.OBJECT user/file.pdf ' \
        '"HEAD /user/file.pdf?versionId=00000XMHZJp6DjM9x500000' \
        '00000SDZk ' \
        'HTTP/1.1" 200 - - 4000272 18 - "-" ' \
        '"Boto/2.5.1 (darwin) USER-AGENT/1.0.14.0" ' \
        '00000XMHZJp6DjM9x5JVEAMo8MG00000'

    # Inputs and expected outputs for clean_date_time_zone.
    DATE_TIME_ZONE_INVALID = "AB/Jul/2013:21:04:17 +0000"
    DATE_TIME_ZONE_VALID = "22/Jul/2013:21:04:17 +0000"
    DATE_VALID = "2013-07-22"
    DATE_TIME_VALID = "2013-07-22 21:04:17"
    TIME_ZONE_VALID = "+0000"

    def setUp(self):
        """Create a fresh job and sandbox helper for every test."""
        # -r inline : run the job in-process
        # --no-conf : ignore any local mrjob.conf
        # -         : read input from standard in
        self.mr_job = MrS3LogParser(['-r', 'inline', '--no-conf', '-'])
        self.mr_tests_util = MrTestsUtil()

    def test_invalid_log_lines(self):
        """Every result for a truncated line must start with "Error"."""
        stdin = StringIO(self.RAW_LOG_LINE_INVALID)

        # NOTE(review): if the job emits no output at all this loop body
        # never runs and the test passes vacuously -- confirm the job
        # always emits at least one value for an invalid line.
        for result in self.mr_tests_util.run_mr_sandbox(self.mr_job, stdin):
            self.assertEqual(result.find("Error"), 0)

    def test_valid_log_lines(self):
        """No result for a complete line may contain "Error"."""
        stdin = StringIO(self.RAW_LOG_LINE_VALID)

        for result in self.mr_tests_util.run_mr_sandbox(self.mr_job, stdin):
            self.assertEqual(result.find("Error"), -1)

    def test_clean_date_time_zone(self):
        """clean_date_time_zone splits valid input and rejects bad input."""
        date, date_time, time_zone_parsed = \
            self.mr_job.clean_date_time_zone(self.DATE_TIME_ZONE_VALID)
        self.assertEqual(date, self.DATE_VALID)
        self.assertEqual(date_time, self.DATE_TIME_VALID)
        self.assertEqual(time_zone_parsed, self.TIME_ZONE_VALID)

        # Pass the callable and its argument separately so that
        # assertRaises itself performs the call and can catch the
        # exception (calling it inline would raise before assertRaises
        # is entered).
        self.assertRaises(ValueError,
                          self.mr_job.clean_date_time_zone,
                          self.DATE_TIME_ZONE_INVALID)
# Run the test suite when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()