
1242. Web Crawler Multithreaded

Use a deque of futures as a FIFO task queue. A task appended earlier will:

  1. block the tasks behind it in the queue
  2. produce new tasks to be appended to the queue

Ideally we would call result() only on tasks that have already completed, since a task submitted later may finish earlier. But checking for completion also takes time, so always waiting on the task at the front of the queue is a reasonable trade-off (see the FIRST_COMPLETED sketch after the solution below).

```python
import collections
from concurrent.futures import ThreadPoolExecutor
from typing import List

class Solution:

    def hasSameHostName(self, url1: str, url2: str) -> bool:
        return self.getHostName(url1) == self.getHostName(url2)

    def getHostName(self, url: str) -> str:
        # 'http://news.yahoo.com/news' -> 'news.yahoo.com'
        return url.split('/')[2]

    def crawl(self, startUrl: str, htmlParser: 'HtmlParser') -> List[str]:
        seen = {startUrl}
        with ThreadPoolExecutor(max_workers=100) as executor:
            # FIFO queue of futures; each future resolves to the URLs found on one page
            tasks = collections.deque([executor.submit(htmlParser.getUrls, startUrl)])
            while tasks:
                # block on the oldest task; tasks behind it keep running in the pool
                urls = tasks.popleft().result()
                for url in urls:
                    if url not in seen and self.hasSameHostName(startUrl, url):
                        seen.add(url)
                        tasks.append(executor.submit(htmlParser.getUrls, url))
        return list(seen)
```
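
For comparison, here is a minimal sketch of the "handle whichever task finishes first" alternative mentioned above, using concurrent.futures.wait with return_when=FIRST_COMPLETED instead of blocking on the front of a FIFO queue. It assumes the same HtmlParser interface as the problem; whether it actually beats the FIFO version depends on how uneven the per-page fetch times are.

```python
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from typing import List

class Solution:

    def getHostName(self, url: str) -> str:
        return url.split('/')[2]

    def crawl(self, startUrl: str, htmlParser: 'HtmlParser') -> List[str]:
        host = self.getHostName(startUrl)
        seen = {startUrl}
        with ThreadPoolExecutor(max_workers=100) as executor:
            pending = {executor.submit(htmlParser.getUrls, startUrl)}
            while pending:
                # wait for any task to finish instead of only the oldest one
                done, pending = concurrent.futures.wait(
                    pending, return_when=concurrent.futures.FIRST_COMPLETED)
                for future in done:
                    for url in future.result():
                        if url not in seen and self.getHostName(url) == host:
                            seen.add(url)
                            pending.add(executor.submit(htmlParser.getUrls, url))
        return list(seen)
```

The trade-off: wait() wakes up as soon as any future resolves, so a slow page never delays processing of a fast one, at the cost of maintaining a pending set and re-calling wait() on every iteration.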