How to parse data from a web page - lunawyh/covid19viz GitHub Wiki

  1. Parse a date from a web page

     from lxml import html
     #code
     # Parse the page and collect the text nodes of <span id="updatedDate">.
     c_tree = html.fromstring(page_content)
     se_dates = c_tree.xpath('//span[@id="updatedDate"]/text()')
     for se_data in se_dates:
         # Accept the first text node that is a valid mm/dd/yyyy date in any
         # year (not only 2020); strip() removes surrounding whitespace that
         # would otherwise make strptime fail, and non-date text is skipped.
         try:
             dt_obj = datetime.datetime.strptime(se_data.strip(), '%m/%d/%Y')
         except ValueError:
             continue
         print('      updated date', se_data)
         # update file name
         self.name_file = dt_obj.strftime('%Y%m%d')
         self.now_date = dt_obj.strftime('%m/%d/%Y')
         break
    
  2. Parse a link

     from lxml import html
     #code
     from urllib.parse import urljoin

     # Parse the downloaded page and collect the href of every <a> under a <p>.
     tree = html.fromstring(htmlPage.content)
     division = tree.xpath('//p//a/@href')
     # Only the first link is used; an IndexError here means no link matched.
     link = division[0]
     # urljoin resolves site-relative hrefs against the host and leaves
     # already-absolute hrefs untouched, unlike plain string concatenation.
     link = urljoin("https://www.mass.gov", link)
     print("  get link: " + link)
    
  3. Parse a text to get a link

     from lxml import html
     #code
     # Parse the downloaded page and collect every <a> element under a <p>.
     tree = html.fromstring(htmlPage.content)
     l_links = tree.xpath('//p//a')
     for l_data in l_links:
         # Skip anchors whose visible text does not contain the marker phrase.
         if 'See State Report' not in l_data.text_content():
             continue
         # The link target is the anchor's href attribute.
         a_address = l_data.get('href')
         print('  find link at', a_address)
    
  4. Parse a relative element

     from lxml import html
     #code
     # Parse the downloaded page, then collect the text of every <p> found
     # inside a table row of a <tbody>.
     c_tree = html.fromstring(c_page.content)
     l_text_data = c_tree.xpath('//tbody//tr//p/text()')