added script to parse scraped html
author Philipp Falk <philipp@email.mathematik.uni-freiburg.de>
Wed, 14 Sep 2016 13:00:17 +0000 (15:00 +0200)
committer Philipp Falk <philipp@email.mathematik.uni-freiburg.de>
Wed, 14 Sep 2016 13:00:27 +0000 (15:00 +0200)
parsetitlebody.py [new file with mode: 0644]

diff --git a/parsetitlebody.py b/parsetitlebody.py
new file mode 100644 (file)
index 0000000..2c0d4f2
--- /dev/null
@@ -0,0 +1,39 @@
+#!/usr/bin/python3
+
+import html5lib
+import sys
+
+def find_element(doc, tag, theid):
+    """Return the first XHTML ``tag`` element under ``doc`` with id ``theid``.
+
+    ``doc`` is searched with ``doc.iter()`` for elements named ``tag`` in the
+    ``http://www.w3.org/1999/xhtml`` namespace.  Candidates are checked in
+    iteration order and the first one whose ``id`` attribute equals ``theid``
+    is returned.
+
+    Raises:
+        Exception: if ``doc`` contains no ``tag`` element at all, or if none
+            of the ``tag`` elements carries the requested id.
+    """
+    tag_seen = False
+    # Stream the candidates; there is no need to build a list of them first.
+    for elem in doc.iter('{{http://www.w3.org/1999/xhtml}}{}'.format(tag)):
+        tag_seen = True
+        if 'id' in elem.attrib and elem.attrib['id'] == theid:
+            return elem
+
+    # Keep the two failure modes distinguishable for the caller.
+    if not tag_seen:
+        raise Exception('No items with tag "{}" found.'.format(tag))
+    raise Exception('Element with id "{}" not found.'.format(theid))
+    
+if __name__ == '__main__':
+    # Script entry point.  Reads the HTML file named on the command line and
+    # prints: a '% use aam ueberschrift="<title>"' line, an empty line, and
+    # the inner HTML of the <div id="parent-fieldname-text"> element.
+    import re
+
+    if len(sys.argv) != 2:
+        print('Usage: {} <html file>'.format(sys.argv[0]))
+        sys.exit(1)
+
+    with open(sys.argv[1], 'r') as f:
+        inp = f.read()
+
+    doc = html5lib.HTMLParser().parse(inp)
+    title = find_element(doc, 'span', 'parent-fieldname-title').text.strip()
+
+    # Serialize the body container (including its own <div> tags) to a string.
+    bodywalker = html5lib.getTreeWalker('etree')
+    bodyserializer = html5lib.serializer.HTMLSerializer()
+    element = find_element(doc, 'div', 'parent-fieldname-text')
+    markup = ''.join(bodyserializer.serialize(bodywalker(element)))
+
+    # Unwrap the enclosing <div ...> ... </div>.  str.lstrip()/str.rstrip()
+    # must not be used for this: their argument is a *set of characters*, so
+    # they also eat body content (e.g. a trailing '</p>' would become '</p').
+    # The pattern skips quoted attribute values so that a '>' inside a value
+    # does not end the start tag early.
+    start_tag = re.compile(r'''^<div(?:"[^"]*"|'[^']*'|[^'">])*>''')
+    markup = start_tag.sub('', markup, count=1)
+    if markup.endswith('</div>'):
+        markup = markup[:-len('</div>')]
+    body = markup.strip()
+
+    # Header line, separating blank line, then the extracted body markup.
+    print('\n'.join(['% use aam ueberschrift="{}"'.format(title), '', body]))