#!/usr/bin/python3
# Recovered from git patch a3d1c34550a4a14658f4e4d3030de66fa49aa028
# ("added script to parse scraped html", Philipp Falk, 2016-09-14), which
# created parsetitlebody.py. The patch text had been collapsed onto a few
# physical lines; this is the script it adds, reformatted and cleaned up.
"""Extract the title and the body markup from a scraped HTML page.

The page is parsed with html5lib. The text of the
``<span id="parent-fieldname-title">`` element becomes the title, and the
``<div id="parent-fieldname-text">`` element is serialized back to HTML.
Both are printed to stdout, the title wrapped in a
``% use aam ueberschrift="..."`` line, followed by an empty line and the
body markup.
"""

import sys

# html5lib's etree builder qualifies every HTML tag with this namespace.
XHTML_NAMESPACE = 'http://www.w3.org/1999/xhtml'


def find_element(doc, tag, theid):
    """Return the first ``tag`` element in ``doc`` whose id is ``theid``.

    Args:
        doc: An ElementTree-style element (anything offering ``iter(tag)``
            whose items expose an ``attrib`` mapping).
        tag: Local tag name without namespace, e.g. ``'span'``.
        theid: Value the element's ``id`` attribute must equal.

    Returns:
        The first matching element in document order.

    Raises:
        LookupError: If the document holds no ``tag`` element at all, or
            none of them carries the requested id. (``LookupError`` is a
            subclass of ``Exception``, so ``except Exception`` still works.)
    """
    qualified_tag = '{{{}}}{}'.format(XHTML_NAMESPACE, tag)
    saw_tag = False
    for elem in doc.iter(qualified_tag):
        saw_tag = True
        # Elements without an id never match, whatever theid is.
        if 'id' in elem.attrib and elem.attrib['id'] == theid:
            return elem

    if not saw_tag:
        raise LookupError('No items with tag "{}" found.'.format(tag))
    raise LookupError('Element with id "{}" not found.'.format(theid))


def main(argv):
    """Run the command-line tool; return the process exit status.

    Args:
        argv: Full argument vector, program name first.

    Returns:
        0 on success, 1 when the argument count is wrong.
    """
    if len(argv) != 2:
        # NOTE(review): the usage text in the patch ended right after the
        # program name; the argument placeholder was presumably lost when
        # the patch text was mangled. "FILE" is a stand-in - confirm.
        print('Usage: {} FILE'.format(argv[0]), file=sys.stderr)
        return 1

    # Imported lazily so find_element() can be imported and reused without
    # the third-party html5lib package being installed.
    import html5lib

    with open(argv[1], 'r') as f:
        markup = f.read()

    doc = html5lib.HTMLParser().parse(markup)

    title_elem = find_element(doc, 'span', 'parent-fieldname-title')
    # .text is None when the span has no leading text (e.g. it is empty or
    # starts with a child element); treat that as an empty title.
    title = (title_elem.text or '').strip()

    walker = html5lib.getTreeWalker('etree')
    serializer = html5lib.serializer.HTMLSerializer()
    body_elem = find_element(doc, 'div', 'parent-fieldname-text')
    # The patch stripped leading newlines, trailing newlines and then all
    # surrounding whitespace; a single strip() is equivalent.
    # NOTE(review): the lstrip/rstrip arguments may have lost characters
    # when the patch text was mangled - confirm against the original commit
    # whether more than whitespace was meant to be removed here.
    body = ''.join(serializer.serialize(walker(body_elem))).strip()

    print('\n'.join([
        '% use aam ueberschrift="{}"'.format(title),
        '',
        body,
    ]))
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))