postgres xml csv - ghdrako/doc_snipets GitHub Wiki
-
Python version: 2.x, 3.x
-
Sample input data: Tags.xml
<?xml version="1.0" encoding="utf-8"?>
<tags>
<row Id="1" TagName="line-numbers" Count="43" ExcerptPostId="4450" WikiPostId="4449" />
<row Id="2" TagName="indentation" Count="222" ExcerptPostId="3239" WikiPostId="3238" />
<row Id="6" TagName="macro" Count="90" ExcerptPostId="856" WikiPostId="855" />
</tags>
- Sample output data: tags-no-header.csv
1,line-numbers,43,4450,4449
2,indentation,222,3239,3238
6,macro,90,856,855
Example 1 Source Code:
import xml.etree.ElementTree as ET
import re
import sys
import csv
import time
def process_node(node):
    """Return the CSV fields for one ``<row>`` element of Tags.xml.

    Args:
        node: An object exposing ``get(name)``, such as an
            ``xml.etree.ElementTree.Element`` for a ``<row>`` tag.

    Returns:
        A list ``[Id, TagName, Count, ExcerptPostId, WikiPostId]`` with the
        values returned by ``node.get`` for those attributes, in that order.
    """
    # Column order matches the sample output file (tags-no-header.csv).
    columns = ('Id', 'TagName', 'Count', 'ExcerptPostId', 'WikiPostId')
    return [node.get(column) for column in columns]
def main(xml_path='/home/cloudera/stackexchange/Tags.xml',
         csv_path='tags-no-header.csv'):
    """Convert the ``<row>`` elements of a Tags.xml file into a header-less CSV.

    Every ``row`` child of the XML root is passed through ``process_node``
    and written as one CSV record. The elapsed wall-clock time in seconds is
    printed once the conversion has finished.

    Args:
        xml_path: Input XML file. The default is a location on the local
            file system (not HDFS).
        csv_path: Output CSV file; it is created or overwritten.
    """
    start = time.time()

    # Parse before opening the output so that a missing or malformed input
    # does not truncate an existing CSV file.
    root = ET.parse(xml_path).getroot()

    # The csv module emits its own line terminators. Python 3 needs
    # newline='' so the text layer does not translate them a second time
    # (blank lines on Windows); Python 2 needs binary mode for the same reason.
    if sys.version_info[0] >= 3:
        tags_data = open(csv_path, 'w', newline='')
    else:
        tags_data = open(csv_path, 'wb')

    # The with-block closes the file even if writing a row raises.
    with tags_data:
        csv_writer = csv.writer(tags_data)
        # No header row is written (hence "no-header"); uncomment to add one.
        #csv_writer.writerow(['Id', 'TagName', 'Count', 'ExcerptPostId', 'WikiPostId'])
        for node in root.findall('row'):
            csv_writer.writerow(process_node(node))

    end = time.time()
    print(end - start)
# NOTE(review): the usual entry-point guard below is commented out, so main()
# runs whenever this file is executed or imported.
#if __name__ == "__main__":
main()