mirror of
https://github.com/donnemartin/data-science-ipython-notebooks.git
synced 2024-03-22 13:30:56 +08:00
88 lines
3.1 KiB
Python
88 lines
3.1 KiB
Python
|
|
from StringIO import StringIO
|
|
import unittest2 as unittest
|
|
from mr_s3_log_parser import MrS3LogParser
|
|
|
|
|
|
class MrTestsUtil(object):
    """Helper for driving a map-reduce job from inside a unit test."""

    def run_mr_sandbox(self, mr_job, stdin):
        """Run ``mr_job`` on ``stdin`` and yield each output value.

        This is a generator, so the job is not sandboxed or run until the
        result is iterated.

        The runner is selected by the arguments the job was built with, not
        by this method.  Callers are expected to construct the job with
        ``['-r', 'inline', '--no-conf', '-']``:

        * ``inline`` runs the job in the same process, so small jobs tend
          to run faster and stack traces are simpler;
        * ``--no-conf`` prevents options from a local mrjob.conf from
          polluting the testing environment;
        * ``-`` reads from standard in.

        Args:
            mr_job: Job object providing ``sandbox``, ``make_runner`` and
                ``parse_output_line``.
            stdin: Object passed to ``mr_job.sandbox`` as the job's
                standard input.

        Yields:
            The value of every ``(key, value)`` pair returned by
            ``mr_job.parse_output_line``; the keys are discarded.
        """
        mr_job.sandbox(stdin=stdin)

        # make_runner ensures job cleanup is performed regardless of
        # success or failure
        with mr_job.make_runner() as runner:
            runner.run()
            for line in runner.stream_output():
                _, value = mr_job.parse_output_line(line)
                yield value
|
|
|
|
|
|
class TestMrS3LogParser(unittest.TestCase):
    """Unit tests for ``MrS3LogParser``."""

    # Fixtures; assigned for every test in setUp().
    mr_job = None
    mr_tests_util = None

    # Same leading text as RAW_LOG_LINE_VALID, cut off after the
    # '00.111.222.33 ' field.  Parenthesised concatenation is used instead
    # of backslash continuations so no continuation can be left dangling.
    RAW_LOG_LINE_INVALID = (
        '00000fe9688b6e57f75bd2b7f7c1610689e8f01000000'
        '00000388225bcc00000 '
        's3-storage [22/Jul/2013:21:03:27 +0000] '
        '00.111.222.33 '
    )

    RAW_LOG_LINE_VALID = (
        '00000fe9688b6e57f75bd2b7f7c1610689e8f01000000'
        '00000388225bcc00000 '
        's3-storage [22/Jul/2013:21:03:27 +0000] '
        '00.111.222.33 '
        'arn:aws:sts::000005646931:federated-user/user 00000AB825500000 '
        'REST.HEAD.OBJECT user/file.pdf '
        '"HEAD /user/file.pdf?versionId=00000XMHZJp6DjM9x500000'
        '00000SDZk '
        'HTTP/1.1" 200 - - 4000272 18 - "-" '
        '"Boto/2.5.1 (darwin) USER-AGENT/1.0.14.0" '
        '00000XMHZJp6DjM9x5JVEAMo8MG00000'
    )

    # Input for clean_date_time_zone and the three values expected back.
    DATE_TIME_ZONE_INVALID = "AB/Jul/2013:21:04:17 +0000"
    DATE_TIME_ZONE_VALID = "22/Jul/2013:21:04:17 +0000"
    DATE_VALID = "2013-07-22"
    DATE_TIME_VALID = "2013-07-22 21:04:17"
    TIME_ZONE_VALID = "+0000"

    def setUp(self):
        """Build a fresh job and sandbox helper before each test.

        Done in setUp rather than __init__ so the job is only constructed
        when a test actually runs, and a construction failure is reported
        as an error of that test instead of breaking test loading.
        """
        # inline runs the job in the same process so small jobs tend to
        # run faster and stack traces are simpler
        # --no-conf prevents options from local mrjob.conf from polluting
        # the testing environment
        # "-" reads from standard in
        self.mr_job = MrS3LogParser(['-r', 'inline', '--no-conf', '-'])
        self.mr_tests_util = MrTestsUtil()

    def test_invalid_log_lines(self):
        """Every value produced for the invalid line starts with "Error"."""
        stdin = StringIO(self.RAW_LOG_LINE_INVALID)

        # NOTE(review): if the job emits no output the loop body never
        # runs and this test passes vacuously -- confirm the parser always
        # emits a value for an unparseable line.
        for result in self.mr_tests_util.run_mr_sandbox(self.mr_job, stdin):
            self.assertTrue(result.startswith("Error"), result)

    def test_valid_log_lines(self):
        """No value produced for the valid line contains "Error"."""
        stdin = StringIO(self.RAW_LOG_LINE_VALID)

        # NOTE(review): passes vacuously when the job emits no output.
        for result in self.mr_tests_util.run_mr_sandbox(self.mr_job, stdin):
            self.assertNotIn("Error", result)

    def test_clean_date_time_zone(self):
        """clean_date_time_zone splits a valid stamp and rejects a bad one."""
        date, date_time, time_zone_parsed = self.mr_job.clean_date_time_zone(
            self.DATE_TIME_ZONE_VALID)
        self.assertEqual(date, self.DATE_VALID)
        self.assertEqual(date_time, self.DATE_TIME_VALID)
        self.assertEqual(time_zone_parsed, self.TIME_ZONE_VALID)

        # The call must happen inside the assertRaises block so that the
        # ValueError is raised where assertRaises can catch it.
        with self.assertRaises(ValueError):
            self.mr_job.clean_date_time_zone(self.DATE_TIME_ZONE_INVALID)
|
|
|
|
# Run every test case in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
|