1242. Web Crawler Multithreaded
Use a deque as a FIFO task queue. A task appended earlier will
- block the tasks behind it in the queue (we always wait on the front task's result first)
- produce new tasks that get appended to the task queue

Ideally we would call result() only on futures that have already completed, since a task submitted later may finish earlier. But checking for completion also takes time, so always waiting on the task at the front of the queue is a reasonable trade-off. (A sketch of the "completed first" alternative follows the solution below.)
```python
import collections
from concurrent.futures import ThreadPoolExecutor
from typing import List

class Solution:
    def hasSameHostName(self, url1: str, url2: str) -> bool:
        return self.getHostName(url1) == self.getHostName(url2)

    def getHostName(self, url: str) -> str:
        # 'http://example.org/path' -> 'example.org'
        return url.split('/')[2]

    def crawl(self, startUrl: str, htmlParser: 'HtmlParser') -> List[str]:
        seen = {startUrl}
        with ThreadPoolExecutor(max_workers=100) as executor:
            # FIFO queue of futures; we always wait on the oldest in-flight task
            tasks = collections.deque([executor.submit(htmlParser.getUrls, startUrl)])
            while tasks:
                urls = tasks.popleft().result()  # blocks until the front task finishes
                for url in urls:
                    if url not in seen and self.hasSameHostName(startUrl, url):
                        seen.add(url)
                        tasks.append(executor.submit(htmlParser.getUrls, url))
        return list(seen)
```
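
For reference, a minimal sketch of the alternative discussed above: instead of always blocking on the oldest future, wait for whichever in-flight task completes first via concurrent.futures.wait with FIRST_COMPLETED. The HtmlParser interface is the same one the problem provides; the class name SolutionFirstCompleted is just for illustration.

```python
from concurrent.futures import ThreadPoolExecutor, wait, FIRST_COMPLETED
from typing import List

class SolutionFirstCompleted:
    def getHostName(self, url: str) -> str:
        return url.split('/')[2]

    def crawl(self, startUrl: str, htmlParser: 'HtmlParser') -> List[str]:
        seen = {startUrl}
        host = self.getHostName(startUrl)
        with ThreadPoolExecutor(max_workers=100) as executor:
            pending = {executor.submit(htmlParser.getUrls, startUrl)}
            while pending:
                # Block only until at least one task finishes,
                # rather than waiting on the oldest one specifically.
                done, pending = wait(pending, return_when=FIRST_COMPLETED)
                for future in done:
                    for url in future.result():
                        if url not in seen and self.getHostName(url) == host:
                            seen.add(url)
                            pending.add(executor.submit(htmlParser.getUrls, url))
        return list(seen)
```

This avoids stalling on a slow front task, at the cost of re-partitioning the pending set on every iteration, which is the "checking completion is also time consuming" overhead mentioned above.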