#!/usr/bin/python3
# canonicalize html dirs to ease comparing them
#
# run as:
# ./tools/c10e-html html

import argparse
import glob
import re
import os
import sys
from bs4 import BeautifulSoup


def _strip_mkhtml_extras(html):
    """Return *html* with the markup removed that mkhtml2 is not producing.

    The substitutions are applied in a fixed order; each one works on the
    result of the previous one.

    NOTE(review): the second-to-last ``replace`` and all of the ``re.sub``
    patterns below look as if angle-bracket markup was lost from them at some
    point (e.g. the last pattern ends in ``]*>``, the tail of a ``[^>]*>``
    construct).  As written, several of them would remove ordinary spaces or
    whitespace.  They are kept exactly as found; restore them from the
    pristine file before relying on this script.
    """
    html = html.replace('a class="link" href', 'a href')
    html = html.replace(' target="_top"', '')
    html = html.replace('summary="Navigation header" ', '')
    html = html.replace(""" """, '')
    html = re.sub(""" """, '', html)
    # The line breaks that were part of the next patterns are spelled as \n
    # escapes; inside a regex they match the same newline characters.
    html = re.sub(r'\s*\n\n\s*\n\n', '', html)
    html = re.sub(r'\s*\s*', '', html)
    html = re.sub(r'\s*\n\s*\n', '', html)
    html = re.sub(r'\s*]*>', '', html)
    return html


def prettify(filename, parser='lxml', fixup=False):
    """Re-indent the document in *filename* in place using BeautifulSoup.

    filename -- path of the file to read and then overwrite
    parser   -- BeautifulSoup parser name ('lxml' for html, 'lxml-xml' for xml)
    fixup    -- when true, additionally strip markup that mkhtml2 is not
                producing, to reduce the diff

    Errors from opening, parsing or writing the file propagate to the caller.
    """
    # NOTE(review): assumes the documents are UTF-8 encoded; an explicit
    # encoding keeps the output independent of the locale - confirm.
    with open(filename, 'r', encoding='utf-8') as doc:
        soup = BeautifulSoup(doc.read(), parser)

    # Build the complete output before the file is opened for writing:
    # opening with 'w' truncates it, so a failure while rendering must not
    # be able to leave an empty document behind.
    html = soup.prettify()
    if fixup:
        html = _strip_mkhtml_extras(html)

    with open(filename, 'w', encoding='utf-8') as doc:
        doc.write(html)


def main(htmldir):
    """Canonicalize all *.devhelp2 and *.html files directly inside *htmldir*."""
    # devhelp2 files are xml, so they get the xml parser and no html fixups
    for filename in glob.glob(os.path.join(htmldir, '*.devhelp2')):
        prettify(filename, parser='lxml-xml')
    for filename in glob.glob(os.path.join(htmldir, '*.html')):
        prettify(filename, fixup=True)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='c10e-html - canonicalize html files for diffing')
    parser.add_argument('args', nargs='*', help='HTML_DIR')
    options = parser.parse_args()
    if len(options.args) < 1:
        sys.exit('Too few arguments')

    # only the first positional argument is used
    main(options.args[0])