How to parse data from a web page - lunawyh/covid19viz GitHub Wiki
-
Parse a date from a web page
import datetime

from lxml import html

# code
# Excerpt: `page_content` and `self` come from the surrounding method (not shown).
c_tree = html.fromstring(page_content)
# Collect the text nodes of the <span id="updatedDate"> element(s).
se_dates = c_tree.xpath('//span[@id="updatedDate"]/text()')
for se_data in se_dates:
    # Pick the first text node that really is a MM/DD/YYYY date instead of
    # testing for a hard-coded year, so the snippet keeps working after 2020.
    # strip() is needed because strptime rejects surrounding whitespace.
    try:
        dt_obj = datetime.datetime.strptime(se_data.strip(), '%m/%d/%Y')
    except ValueError:
        # Not a date in the expected format; try the next text node.
        continue
    print(' updated date', se_data)
    # update file name
    self.name_file = dt_obj.strftime('%Y%m%d')
    self.now_date = dt_obj.strftime('%m/%d/%Y')
    break
-
Parse a link
from urllib.parse import urljoin

from lxml import html

# code
# Excerpt: `htmlPage` is the HTTP response fetched by the surrounding code (not shown).
tree = html.fromstring(htmlPage.content)
# href attribute of every <a> nested anywhere inside a <p>.
division = tree.xpath('//p//a/@href')
if not division:
    # Same exception type as the bare division[0] lookup, but with a message
    # that says what was missing.
    raise IndexError('no <a href> found inside any <p> element')
# urljoin resolves site-relative hrefs against the host and leaves absolute
# URLs untouched, unlike plain string concatenation.
link = urljoin("https://www.mass.gov", division[0])
print(" get link: " + link)
-
Parse link text to get a link
from lxml import html

# code
# Excerpt: `htmlPage` is the HTTP response fetched by the surrounding code (not shown).
tree = html.fromstring(htmlPage.content)
# Every <a> element nested anywhere inside a <p>.
l_links = tree.xpath('//p//a')
# Pre-bind the result so later code can test for None instead of hitting a
# NameError when no anchor carries the expected text.
a_address = None
for l_data in l_links:
    # Match on the anchor's visible text, then read its href attribute.
    if 'See State Report' in l_data.text_content():
        a_address = l_data.get('href')
        print(' find link at', a_address)
-
Parse a relative element
from lxml import html

# code
# Excerpt: `c_page` is the HTTP response fetched by the surrounding code (not shown).
c_tree = html.fromstring(c_page.content)
# Text nodes of every <p> that sits under a <tr> inside a <tbody>.
l_text_data = c_tree.xpath('//tbody//tr//p/text()')