# -*- coding: utf-8 -*-


class PagesDataStore(object):

    def __init__(self, db):
        self.db = db

    def add_link_to_crawl(self, url):
        """Add the given link to `links_to_crawl`."""
        pass

    def remove_link_to_crawl(self, url):
        """Remove the given link from `links_to_crawl`."""
        pass

    def reduce_priority_link_to_crawl(self, url):
        """Reduce the priority of a link in `links_to_crawl` to avoid cycles."""
        pass

    def extract_max_priority_page(self):
        """Return the highest priority link in `links_to_crawl`."""
        pass

    def insert_crawled_link(self, url, signature):
        """Add the given link to `crawled_links`."""
        pass

    def crawled_similar(self, signature):
        """Determine if we've already crawled a page matching the given signature."""
        pass
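

# The class above is an interface stub, so the sketch below shows one way the
# data store *could* behave in memory, for experimentation or tests. The class
# name, the `heapq` frontier, the signature set, and the injected `fetch_page`
# callable (url -> Page) are illustrative assumptions, not part of the original
# design, which delegates storage to the injected `db`.
import heapq


class MemoryPagesDataStore(PagesDataStore):

    def __init__(self, fetch_page):
        super(MemoryPagesDataStore, self).__init__(db=None)
        self.fetch_page = fetch_page     # assumed callable: url -> Page
        self.frontier = []               # min-heap of (priority, url)
        self.priorities = {}             # url -> latest priority
        self.crawled_signatures = set()  # signatures seen by insert_crawled_link

    def add_link_to_crawl(self, url):
        self.priorities.setdefault(url, 0)
        heapq.heappush(self.frontier, (self.priorities[url], url))

    def remove_link_to_crawl(self, url):
        # Lazy removal: stale heap entries are skipped in extract_max_priority_page
        self.priorities.pop(url, None)

    def reduce_priority_link_to_crawl(self, url):
        # In this sketch, a larger number means a lower priority
        self.priorities[url] = self.priorities.get(url, 0) + 1
        heapq.heappush(self.frontier, (self.priorities[url], url))

    def extract_max_priority_page(self):
        while self.frontier:
            priority, url = heapq.heappop(self.frontier)
            if self.priorities.get(url) == priority:  # skip stale entries
                return self.fetch_page(url)
        return None

    def insert_crawled_link(self, url, signature):
        self.crawled_signatures.add(signature)

    def crawled_similar(self, signature):
        return signature in self.crawled_signatures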


class Page(object):

    def __init__(self, url, contents, child_urls):
        self.url = url
        self.contents = contents
        self.child_urls = child_urls
        self.signature = self.create_signature()

    def create_signature(self):
        # Create signature based on url and contents
        pass
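

# A hedged sketch of what create_signature could compute, assuming an exact
# SHA-256 digest over the url and contents is an acceptable signature. A real
# crawler would more likely use a fuzzy scheme (e.g. simhash) so near-duplicate
# pages also match. The helper name below is hypothetical.
import hashlib


def example_page_signature(url, contents):
    """Example only: SHA-256 hex digest of the url concatenated with the contents."""
    return hashlib.sha256((url + contents).encode('utf-8')).hexdigest()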


class Crawler(object):

    def __init__(self, pages, data_store, reverse_index_queue, doc_index_queue):
        self.pages = pages
        self.data_store = data_store
        self.reverse_index_queue = reverse_index_queue
        self.doc_index_queue = doc_index_queue

    def crawl_page(self, page):
        # Queue the page's outbound links, hand the page to the indexing
        # pipelines, then mark the page itself as crawled.
        for url in page.child_urls:
            self.data_store.add_link_to_crawl(url)
        self.reverse_index_queue.generate(page)
        self.doc_index_queue.generate(page)
        self.data_store.remove_link_to_crawl(page.url)
        self.data_store.insert_crawled_link(page.url, page.signature)

    def crawl(self):
        # Repeatedly pull the highest priority page until the frontier is empty.
        # Pages whose signature we've already seen are deprioritized to avoid
        # re-crawling duplicates and getting stuck in cycles.
        while True:
            page = self.data_store.extract_max_priority_page()
            if page is None:
                break
            if self.data_store.crawled_similar(page.signature):
                self.data_store.reduce_priority_link_to_crawl(page.url)
            else:
                self.crawl_page(page)
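

# Hedged usage sketch: wiring the Crawler with stand-in queue objects. The
# PrintQueue class and the example page/urls below are hypothetical; the design
# only requires that each queue expose a generate(page) method, as used above.


class PrintQueue(object):
    """Stand-in for the reverse index and document index queues."""

    def generate(self, page):
        print('queued %s' % page.url)


if __name__ == '__main__':
    seed = Page(url='http://example.com',
                contents='<html>example</html>',
                child_urls=['http://example.com/about'])
    data_store = PagesDataStore(db=None)  # interface stub; methods are no-ops
    crawler = Crawler(pages=[seed],
                      data_store=data_store,
                      reverse_index_queue=PrintQueue(),
                      doc_index_queue=PrintQueue())
    crawler.crawl_page(seed)  # queues the seed's links and indexes the page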