Docs - marcelotournier/case-report-dataset GitHub Wiki

How to Create multiple xmls from the master xml (old version)

Based in code from https://github.com/marcelotournier/knowledge_discovery_WHP/blob/master/pubmed_parser.py

from xml.etree import ElementTree as ET


def lazy_parse_xmls(filename='pubmed_result.xml'):
    parser = ET.iterparse(filename)

    for event, element in parser:
        # element is a whole element
        if element.tag == 'PubmedArticle':
            yield ET.tostring(element, encoding='latin1', method='xml').decode("latin1")


gen = lazy_parse_xmls()

filecount = 0

for xmlstring in gen:
    counter = "{:0>10d}".format(filecount)
    with open(f"article_{counter}.xml", "w", encoding='latin1') as xmlfile:
        xmlfile.write(xmlstring)
    filecount += 1