HTML - mwicat/personal GitHub Wiki

Parse all `http://` hrefs from a saved HTML file with BeautifulSoup

pip3 install beautifulsoup4

import re
import urllib.request

from bs4 import BeautifulSoup

# Collect the href of every <a> tag in a saved HTML page whose target
# starts with "http://". The result is left in the module-level list `urls`.
site_fname = 'site.html'

# Parse inside a `with` block so the file handle is closed as soon as the
# document has been read, instead of staying open for the rest of the script.
with open(site_fname) as f:
    soup = BeautifulSoup(f, "html.parser")

# Anchored at the start of the attribute value: only plain-http absolute
# links match; relative links and https:// links are skipped.
# NOTE(review): switch to r"^https?://" if https links should be included.
url_re = re.compile(r"^http://")

# find_all is the current name of the deprecated findAll alias. Passing the
# compiled pattern as the attribute filter keeps only tags whose href matches.
urls = [
    link.get('href')
    for link in soup.find_all('a', attrs={'href': url_re})
]