system-design-primer/solutions/system_design/web_crawler/web_crawler_snippets.py

# -*- coding: utf-8 -*-


class PagesDataStore(object):
    """Skeleton of the storage layer behind the crawler.

    Only the constructor does any work. Every other method is an
    unimplemented placeholder that returns ``None``; its docstring states
    the intended contract.
    """

    def __init__(self, db):
        # Keep the caller-supplied handle so the storage methods can use it.
        self.db = db

    def add_link_to_crawl(self, url):
        """Queue `url` in `links_to_crawl`."""
        return None

    def remove_link_to_crawl(self, url):
        """Drop `url` from `links_to_crawl`."""
        return None

    def reduce_priority_link_to_crawl(self, url):
        """Lower the priority of `url` in `links_to_crawl` to avoid cycles."""
        return None

    def extract_max_priority_page(self):
        """Hand back the highest priority entry of `links_to_crawl`."""
        return None

    def insert_crawled_link(self, url, signature):
        """Record `url` in `crawled_links`."""
        return None

    def crawled_similar(self, signature):
        """Tell whether a page matching `signature` was already crawled."""
        return None


class Page(object):
    """Value holder for one page: url, contents, child urls and signature."""

    def __init__(self, url, contents, child_urls):
        self.url, self.contents, self.child_urls = url, contents, child_urls
        # Computed last, after the other attributes have been assigned.
        self.signature = self.create_signature()

    def create_signature(self):
        """Placeholder for a signature derived from the url and contents.

        Not implemented yet, so it currently returns ``None``.
        """
        return None


class Crawler(object):
    """Pull pages from a data store and crawl them until none are left."""

    def __init__(self, pages, data_store, reverse_index_queue, doc_index_queue):
        """Keep references to the collaborators used while crawling.

        Args:
            pages: Stored as `self.pages`; not read by the methods below.
            data_store: Object providing the link-queue operations called by
                `crawl` and `crawl_page` (e.g. `extract_max_priority_page`).
            reverse_index_queue: Object whose `generate(page)` is called for
                every crawled page.
            doc_index_queue: Object whose `generate(page)` is called for
                every crawled page.
        """
        self.pages = pages
        self.data_store = data_store
        self.reverse_index_queue = reverse_index_queue
        self.doc_index_queue = doc_index_queue

    def crawl_page(self, page):
        """Crawl a single page.

        Queues every child url of `page`, passes the page to both index
        queues, then moves its url from the to-crawl set to the crawled set
        in the data store.
        """
        for url in page.child_urls:
            self.data_store.add_link_to_crawl(url)
        self.reverse_index_queue.generate(page)
        self.doc_index_queue.generate(page)
        self.data_store.remove_link_to_crawl(page.url)
        self.data_store.insert_crawled_link(page.url, page.signature)

    def crawl(self):
        """Process pages in priority order until the data store returns None.

        A page whose signature matches one already crawled only has its
        priority reduced (to avoid cycles); any other page goes through
        `crawl_page`.
        """
        while True:
            # Extract exactly once per iteration. A second extraction at the
            # end of the loop body would be overwritten here, silently
            # dropping every other page.
            page = self.data_store.extract_max_priority_page()
            if page is None:
                break
            if self.data_store.crawled_similar(page.signature):
                self.data_store.reduce_priority_link_to_crawl(page.url)
            else:
                self.crawl_page(page)
Add Web Crawler solution 2017-03-05 13:06:58 +08:00			`# -- coding: utf-8 --`

Add newlines - PEP8 style (#173) 2018-07-19 14:09:09 +08:00
Add Web Crawler solution 2017-03-05 13:06:58 +08:00			`class PagesDataStore(object):`

Convert all .py files to be valid Python (#98) 2018-03-07 08:37:46 +08:00			`def __init__(self, db):`
Add Web Crawler solution 2017-03-05 13:06:58 +08:00			`self.db = db`
Convert all .py files to be valid Python (#98) 2018-03-07 08:37:46 +08:00			`pass`
Add Web Crawler solution 2017-03-05 13:06:58 +08:00
			`def add_link_to_crawl(self, url):`
			"""Add the given link to `links_to_crawl`."""
Convert all .py files to be valid Python (#98) 2018-03-07 08:37:46 +08:00			`pass`
Add Web Crawler solution 2017-03-05 13:06:58 +08:00
			`def remove_link_to_crawl(self, url):`
			"""Remove the given link from `links_to_crawl`."""
Convert all .py files to be valid Python (#98) 2018-03-07 08:37:46 +08:00			`pass`
Add Web Crawler solution 2017-03-05 13:06:58 +08:00
Convert all .py files to be valid Python (#98) 2018-03-07 08:37:46 +08:00			`def reduce_priority_link_to_crawl(self, url):`
Add Web Crawler solution 2017-03-05 13:06:58 +08:00			"""Reduce the priority of a link in `links_to_crawl` to avoid cycles."""
Convert all .py files to be valid Python (#98) 2018-03-07 08:37:46 +08:00			`pass`
Add Web Crawler solution 2017-03-05 13:06:58 +08:00
			`def extract_max_priority_page(self):`
			"""Return the highest priority link in `links_to_crawl`."""
Convert all .py files to be valid Python (#98) 2018-03-07 08:37:46 +08:00			`pass`
Add Web Crawler solution 2017-03-05 13:06:58 +08:00
			`def insert_crawled_link(self, url, signature):`
			"""Add the given link to `crawled_links`."""
Convert all .py files to be valid Python (#98) 2018-03-07 08:37:46 +08:00			`pass`
Add Web Crawler solution 2017-03-05 13:06:58 +08:00
			`def crawled_similar(self, signature):`
			`"""Determine if we've already crawled a page matching the given signature"""`
Convert all .py files to be valid Python (#98) 2018-03-07 08:37:46 +08:00			`pass`
Add Web Crawler solution 2017-03-05 13:06:58 +08:00

			`class Page(object):`

			`def __init__(self, url, contents, child_urls):`
			`self.url = url`
			`self.contents = contents`
			`self.child_urls = child_urls`
			`self.signature = self.create_signature()`

			`def create_signature(self):`
			`# Create signature based on url and contents`
Convert all .py files to be valid Python (#98) 2018-03-07 08:37:46 +08:00			`pass`
Add Web Crawler solution 2017-03-05 13:06:58 +08:00

			`class Crawler(object):`

			`def __init__(self, pages, data_store, reverse_index_queue, doc_index_queue):`
			`self.pages = pages`
			`self.data_store = data_store`
			`self.reverse_index_queue = reverse_index_queue`
			`self.doc_index_queue = doc_index_queue`

			`def crawl_page(self, page):`
			`for url in page.child_urls:`
			`self.data_store.add_link_to_crawl(url)`
			`self.reverse_index_queue.generate(page)`
			`self.doc_index_queue.generate(page)`
			`self.data_store.remove_link_to_crawl(page.url)`
			`self.data_store.insert_crawled_link(page.url, page.signature)`

			`def crawl(self):`
			`while True:`
			`page = self.data_store.extract_max_priority_page()`
			`if page is None:`
			`break`
			`if self.data_store.crawled_similar(page.signature):`
			`self.data_store.reduce_priority_link_to_crawl(page.url)`
			`else:`
			`self.crawl_page(page)`
			`page = self.data_store.extract_max_priority_page()`