py-libp2p/libp2p/kademlia/crawling.py

187 lines
6.2 KiB
Python
Raw Normal View History

2018-10-14 10:32:27 -04:00
from collections import Counter
import logging
2019-04-19 20:44:17 -04:00
from .kad_peerinfo import KadPeerHeap, create_kad_peerinfo
2019-01-15 18:41:41 +01:00
from .utils import gather_dict
2018-10-14 10:32:27 -04:00
2019-01-15 18:41:41 +01:00
# Module-level logger, named after this module via ``__name__``.
log = logging.getLogger(__name__) # pylint: disable=invalid-name
2018-10-14 10:32:27 -04:00
2019-01-15 18:41:41 +01:00
# pylint: disable=too-few-public-methods
2019-01-09 21:38:56 +03:00
class SpiderCrawl:
    """
    Crawl the network and look for given 160-bit keys.

    This base class drives the query rounds; subclasses implement
    ``_nodes_found`` to decide what to do with each round's responses.
    """

    def __init__(self, protocol, node, peers, ksize, alpha):
        # pylint: disable=too-many-arguments
        """
        Create a new C{SpiderCrawl}er.

        Args:
            protocol: A :class:`~kademlia.protocol.KademliaProtocol` instance.
            node: A :class:`~kademlia.node.Node` representing the key we're
                  looking for
            peers: A list of :class:`~kademlia.node.Node` instances that
                   provide the entry point for the network
            ksize: The value for k based on the paper
            alpha: The value for alpha based on the paper
        """
        self.protocol = protocol
        self.node = node
        self.ksize = ksize
        self.alpha = alpha
        # Heap of candidate peers, created for the target with size ksize.
        self.nearest = KadPeerHeap(self.node, self.ksize)
        # Ids seen at the start of the previous round (see ``_find``).
        self.last_ids_crawled = []
        log.info("creating spider with peers: %s", peers)
        self.nearest.push(peers)

    async def _find(self, rpcmethod):
        """
        Get either a value or list of nodes.

        Args:
            rpcmethod: The protocol's call_find_value or call_find_node.

        The process:
          1. calls find_* to current ALPHA nearest not already queried nodes,
             adding results to current nearest list of k nodes.
          2. current nearest list needs to keep track of who has been queried
             already sort by nearest, keep KSIZE
          3. if list is same as last time, next call should be to everyone not
             yet queried
          4. repeat, unless nearest list has all been queried, then ur done

        Returns:
            Whatever the subclass's ``_nodes_found`` returns for the
            gathered responses.
        """
        log.info("crawling network with nearest: %s", str(tuple(self.nearest)))

        # If the id list is unchanged since the previous round, widen the
        # query to every tracked peer instead of only ``alpha`` of them.
        unchanged = self.nearest.get_ids() == self.last_ids_crawled
        count = len(self.nearest) if unchanged else self.alpha
        self.last_ids_crawled = self.nearest.get_ids()

        pending = {}
        batch = self.nearest.get_uncontacted()[:count]
        for candidate in batch:
            pending[candidate.peer_id] = rpcmethod(candidate, self.node)
            self.nearest.mark_contacted(candidate)

        responses = await gather_dict(pending)
        return await self._nodes_found(responses)

    async def _nodes_found(self, responses):
        """
        Handle one round of responses; must be provided by subclasses.
        """
        raise NotImplementedError


class ValueSpiderCrawl(SpiderCrawl):
    """
    Spider crawl that looks for the value stored under the target key.
    """

    def __init__(self, protocol, node, peers, ksize, alpha):
        # pylint: disable=too-many-arguments
        SpiderCrawl.__init__(self, protocol, node, peers, ksize, alpha)
        # keep track of the single nearest node without value - per
        # section 2.3 so we can set the key there if found
        self.nearest_without_value = KadPeerHeap(self.node, 1)

    async def find(self):
        """
        Find either the closest nodes or the value requested.
        """
        return await self._find(self.protocol.call_find_value)

    async def _nodes_found(self, responses):
        """
        Handle the result of an iteration in _find.

        Peers that did not respond are removed from the nearest heap,
        returned values are collected, and peers that answered without a
        value are remembered while their node lists feed the heap.
        """
        unresponsive = []
        values = []
        for peer_id, raw in responses.items():
            reply = RPCFindResponse(raw)
            if not reply.happened():
                unresponsive.append(peer_id)
                continue
            if reply.has_value():
                values.append(reply.get_value())
                continue
            # Responded, but only with a list of further nodes.
            self.nearest_without_value.push(self.nearest.get_node(peer_id))
            self.nearest.push(reply.get_node_list())
        self.nearest.remove(unresponsive)

        if values:
            return await self._handle_found_values(values)
        if self.nearest.have_contacted_all():
            # not found!
            return None
        return await self.find()

    async def _handle_found_values(self, values):
        """
        Pick the value to return from the ones the peers sent back.

        A warning is logged when the peers disagree, and the most common
        value wins. The peer popped from ``nearest_without_value``, if
        any, is asked to store that value.
        """
        tally = Counter(values)
        if len(tally) != 1:
            log.warning("Got multiple values for key %i: %s",
                        self.node.xor_id, str(values))
        top = tally.most_common(1)
        winner = top[0][0]

        target = self.nearest_without_value.popleft()
        if target:
            await self.protocol.call_store(target, self.node.peer_id, winner)
        return winner


class NodeSpiderCrawl(SpiderCrawl):
    """
    Spider crawl that collects the nodes nearest to the target.
    """

    async def find(self):
        """
        Find the closest nodes.
        """
        return await self._find(self.protocol.call_find_node)

    async def _nodes_found(self, responses):
        """
        Handle the result of an iteration in _find.

        Node lists from responding peers are pushed onto the nearest heap
        and peers that did not respond are removed. Once every tracked
        peer has been contacted the heap contents are returned as a list;
        otherwise another round is started.
        """
        unresponsive = []
        for peer_id, raw in responses.items():
            reply = RPCFindResponse(raw)
            if reply.happened():
                self.nearest.push(reply.get_node_list())
            else:
                unresponsive.append(peer_id)
        self.nearest.remove(unresponsive)

        if not self.nearest.have_contacted_all():
            return await self.find()
        return list(self.nearest)


2019-01-09 21:38:56 +03:00
class RPCFindResponse:
    """
    A wrapper for the result of a RPC find.
    """

    def __init__(self, response):
        """
        Store the raw RPC result.

        Args:
            response: This will be a tuple of (<response received>, <value>)
                      where <value> will be a list of tuples if not found or
                      a dictionary of {'value': v} where v is the value desired
        """
        self.response = response

    def happened(self):
        """
        Did the other host actually respond?
        """
        received = self.response[0]
        return received

    def has_value(self):
        """
        Tell whether the payload is a dictionary (i.e. carries a value).
        """
        payload = self.response[1]
        return isinstance(payload, dict)

    def get_value(self):
        """
        Return the entry stored under ``'value'`` in the payload.
        """
        payload = self.response[1]
        return payload['value']

    def get_node_list(self):
        """
        Get the node list in the response.  If there's no value, this should
        be set.

        A falsy payload yields an empty list; otherwise each entry is
        unpacked into ``create_kad_peerinfo``.
        """
        entries = self.response[1]
        if not entries:
            entries = []
        return [create_kad_peerinfo(*fields) for fields in entries]