md2man

   1 #!/usr/bin/python3
   2
   3 # This script takes a manpage written in github-flavored markdown and turns it
   4 # into a html web page and a nroff man page.  The input file must have the name
   5 # of the program and the section in the format: NAME.NUM.md. The output files
   6 # are written into the current directory named NAME.NUM.html and NAME.NUM.  The
   7 # input format has one extra extension: if a numbered list starts at 0, it is
   8 # turned into a description list. The dl's dt tag is taken from the contents of
   9 # the first tag inside the li, which is usually a p tag or a code tag.  The
  10 # cmarkgfm lib is used to transform the input file into html. The html.parser
  11 # is used as a state machine that both tweaks the html and outputs the nroff
  12 # data based on the html tags.
  13 #
  14 # Copyright (C) 2020 Wayne Davison
  15 #
  16 # This program is freely redistributable.
  17
  18 import sys, os, re, argparse, time
  19 from html.parser import HTMLParser
  20
  21 CONSUMES_TXT = set('h1 h2 p li pre'.split())
  22
  23 HTML_START = """\
  24 <html><head>
  25 <title>%s</title>
  26 <link href="https://fonts.googleapis.com/css2?family=Roboto&display=swap" rel="stylesheet">
  27 <style>
  28 body {
  29   max-width: 40em;
  30   margin: auto;
  31   font-size: 1.2em;
  32   font-family: 'Roboto', sans-serif;
  33 }
  34 blockquote pre code {
  35   background: #eee;
  36 }
  37 dd p:first-of-type {
  38   margin-block-start: 0em;
  39 }
  40 </style>
  41 </head><body>
  42 """
  43
  44 HTML_END = """\
  45 <div style="float: right"><p><i>%s</i></p></div>
  46 </body></html>
  47 """
  48
  49 MAN_START = r"""
  50 .TH "%s" "%s" "%s" "" ""
  51 """.lstrip()
  52
  53 MAN_END = """\
  54 """
  55
  56 NORM_FONT = ('\1', r"\fP")
  57 BOLD_FONT = ('\2', r"\fB")
  58 ULIN_FONT = ('\3', r"\fI")
  59
  60 def main():
  61     fi = re.match(r'^(?P<fn>(?P<srcdir>.+/)?(?P<name>(?P<prog>[^/]+)\.(?P<sect>\d+))\.md)$', args.mdfile)
  62     if not fi:
  63         die('Failed to parse NAME.NUM.md out of input file:', args.mdfile)
  64     fi = argparse.Namespace(**fi.groupdict())
  65
  66     if not fi.srcdir:
  67         fi.srcdir = './'
  68
  69     fi.title = fi.prog + '(' + fi.sect + ') man page'
  70     fi.date = None
  71
  72     chk_files = 'latest-year.h Makefile'.split()
  73     for fn in chk_files:
  74         try:
  75             st = os.lstat(fi.srcdir + fn)
  76         except:
  77             die('Failed to find', fi.srcdir + fn)
  78         if not fi.date:
  79             fi.date = time.strftime('%d %b %Y', time.localtime(st.st_mtime))
  80
  81     env_subs = { }
  82
  83     with open(fi.srcdir + 'Makefile', 'r', encoding='utf-8') as fh:
  84         for line in fh:
  85             m = re.match(r'^(\w+)=(.+)', line)
  86             if not m:
  87                 continue
  88             var, val = (m[1], m[2])
  89             while re.search(r'\$\{', val):
  90                 val = re.sub(r'\$\{(\w+)\}', lambda m: env_subs[m[1]], val)
  91             env_subs[var] = val
  92             if var == 'VERSION':
  93                 break
  94
  95     with open(fi.fn, 'r', encoding='utf-8') as fh:
  96         txt = re.sub(r'@VERSION@', env_subs['VERSION'], fh.read())
  97         txt = re.sub(r'@LIBDIR@', env_subs['libdir'], txt)
  98         fi.html_in = cmarkgfm.github_flavored_markdown_to_html(txt)
  99         txt = None
 100
 101     HtmlToManPage(fi)
 102
 103     if args.test:
 104         print("The test was successful.")
 105         return
 106
 107     for fn, txt in ((fi.name + '.html', fi.html_out), (fi.name, fi.man_out)):
 108         print("Wrote:", fn)
 109         with open(fn, 'w', encoding='utf-8') as fh:
 110             fh.write(txt)
 111
 112
 113 class HtmlToManPage(HTMLParser):
 114     def __init__(self, fi):
 115         HTMLParser.__init__(self, convert_charrefs=True)
 116
 117         st = self.state = argparse.Namespace(
 118                 list_state = [ ],
 119                 p_macro = ".P\n",
 120                 at_first_tag_in_li = False,
 121                 at_first_tag_in_dd = False,
 122                 dt_from = None,
 123                 in_pre = False,
 124                 html_out = [ HTML_START % fi.title ],
 125                 man_out = [ MAN_START % (fi.prog, fi.sect, fi.date) ],
 126                 txt = '',
 127                 )
 128
 129         self.feed(fi.html_in)
 130         fi.html_in = None
 131
 132         st.html_out.append(HTML_END % fi.date)
 133         st.man_out.append(MAN_END)
 134
 135         fi.html_out = ''.join(st.html_out)
 136         st.html_out = None
 137
 138         fi.man_out = ''.join(st.man_out)
 139         st.man_out = None
 140
 141
 142     def handle_starttag(self, tag, attrs_list):
 143         st = self.state
 144         if args.debug:
 145             self.output_debug('START', (tag, attrs_list))
 146         if st.at_first_tag_in_li:
 147             if st.list_state[-1] == 'dl':
 148                 st.dt_from = tag
 149                 if tag == 'p':
 150                     tag = 'dt'
 151                 else:
 152                     st.html_out.append('<dt>')
 153             st.at_first_tag_in_li = False
 154         if tag == 'p':
 155             if not st.at_first_tag_in_dd:
 156                 st.man_out.append(st.p_macro)
 157         elif tag == 'li':
 158             st.at_first_tag_in_li = True
 159             lstate = st.list_state[-1]
 160             if lstate == 'dl':
 161                 return
 162             if lstate == 'o':
 163                 st.man_out.append(".IP o\n")
 164             else:
 165                 st.man_out.append(".IP " + str(lstate) + ".\n")
 166                 st.list_state[-1] += 1
 167         elif tag == 'blockquote':
 168             st.man_out.append(".RS 4\n")
 169         elif tag == 'pre':
 170             st.in_pre = True
 171             st.man_out.append(st.p_macro + ".nf\n")
 172         elif tag == 'code' and not st.in_pre:
 173             st.txt += BOLD_FONT[0]
 174         elif tag == 'strong' or tag == 'bold':
 175             st.txt += BOLD_FONT[0]
 176         elif tag == 'i' or tag == 'em':
 177             st.txt += ULIN_FONT[0]
 178         elif tag == 'ol':
 179             start = 1
 180             for var, val in attrs_list:
 181                 if var == 'start':
 182                     start = int(val) # We only support integers.
 183                     break
 184             if st.list_state:
 185                 st.man_out.append(".RS\n")
 186             if start == 0:
 187                 tag = 'dl'
 188                 attrs_list = [ ]
 189                 st.list_state.append('dl')
 190             else:
 191                 st.list_state.append(start)
 192             st.man_out.append(st.p_macro)
 193             st.p_macro = ".IP\n"
 194         elif tag == 'ul':
 195             st.man_out.append(st.p_macro)
 196             if st.list_state:
 197                 st.man_out.append(".RS\n")
 198                 st.p_macro = ".IP\n"
 199             st.list_state.append('o')
 200         st.html_out.append('<' + tag + ' '.join( ' ' + var + '="' + safeText(val) + '"' for var, val in attrs_list) + '>')
 201         st.at_first_tag_in_dd = False
 202
 203
 204     def handle_endtag(self, tag):
 205         st = self.state
 206         if args.debug:
 207             self.output_debug('END', (tag,))
 208         if tag in CONSUMES_TXT or st.dt_from == tag:
 209             txt = st.txt.strip()
 210             st.txt = ''
 211         else:
 212             txt = None
 213         add_to_txt = None
 214         if tag == 'h1':
 215             st.man_out.append(st.p_macro + '.SH "' + manify(txt) + '"\n')
 216         elif tag == 'h2':
 217             st.man_out.append(st.p_macro + '.SS "' + manify(txt) + '"\n')
 218         elif tag == 'p':
 219             if st.dt_from == 'p':
 220                 tag = 'dt'
 221                 st.man_out.append('.IP "' + manify(txt) + '"\n')
 222                 st.dt_from = None
 223             elif txt != '':
 224                 st.man_out.append(manify(txt) + "\n")
 225         elif tag == 'li':
 226             if st.list_state[-1] == 'dl':
 227                 if st.at_first_tag_in_li:
 228                     die("Invalid 0. -> td translation")
 229                 tag = 'dd'
 230             if txt != '':
 231                 st.man_out.append(manify(txt) + "\n")
 232             st.at_first_tag_in_li = False
 233         elif tag == 'blockquote':
 234             st.man_out.append(".RE\n")
 235         elif tag == 'pre':
 236             st.in_pre = False
 237             st.man_out.append(manify(txt) + "\n.fi\n")
 238         elif tag == 'code' and not st.in_pre:
 239              add_to_txt = NORM_FONT[0]
 240         elif tag == 'strong' or tag == 'bold':
 241              add_to_txt = NORM_FONT[0]
 242         elif tag == 'i' or tag == 'em':
 243              add_to_txt = NORM_FONT[0]
 244         elif tag == 'ol' or tag == 'ul':
 245             if st.list_state.pop() == 'dl':
 246                 tag = 'dl'
 247             if st.list_state:
 248                 st.man_out.append(".RE\n")
 249             else:
 250                 st.p_macro = ".P\n"
 251             st.at_first_tag_in_dd = False
 252         st.html_out.append('</' + tag + '>')
 253         if add_to_txt:
 254             if txt is None:
 255                 st.txt += add_to_txt
 256             else:
 257                 txt += add_to_txt
 258         if st.dt_from == tag:
 259             st.man_out.append('.IP "' + manify(txt) + '"\n')
 260             st.html_out.append('</dt><dd>')
 261             st.at_first_tag_in_dd = True
 262             st.dt_from = None
 263         elif tag == 'dt':
 264             st.html_out.append('<dd>')
 265             st.at_first_tag_in_dd = True
 266
 267
 268     def handle_data(self, data):
 269         st = self.state
 270         if args.debug:
 271             self.output_debug('DATA', (data,))
 272         st.html_out.append(safeText(data))
 273         st.txt += data
 274
 275
 276     def output_debug(self, event, extra):
 277         import pprint
 278         st = self.state
 279         if args.debug < 2:
 280             if len(st.html_out) > 2:
 281                 st.html_out = ['...'] + st.html_out[-2:]
 282             if len(st.man_out) > 2:
 283                 st.man_out = ['...'] + st.man_out[-2:]
 284         print(event, extra)
 285         pprint.PrettyPrinter(indent=2).pprint(vars(st))
 286
 287
 288 def manify(txt):
 289     return re.sub(r"^(['.])", r'\&\1', txt.replace('\\', '\\\\')
 290             .replace(NORM_FONT[0], NORM_FONT[1])
 291             .replace(BOLD_FONT[0], BOLD_FONT[1])
 292             .replace(ULIN_FONT[0], ULIN_FONT[1]), flags=re.M)
 293
 294
 295 def safeText(txt):
 296     return txt.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')
 297
 298
 299 def warn(*msg):
 300     print(*msg, file=sys.stderr)
 301
 302
 303 def die(*msg):
 304     warn(*msg)
 305     sys.exit(1)
 306
 307
 308 if __name__ == '__main__':
 309     parser = argparse.ArgumentParser(description='Transform a NAME.NUM.md markdown file into a NAME.NUM.html web page & a NAME.NUM man page.', add_help=False)
 310     parser.add_argument('--test', action='store_true', help='Test if we can parse the input w/o updating any files.')
 311     parser.add_argument('--debug', '-D', action='count', default=0, help='Output copious info on the html parsing.')
 312     parser.add_argument("--help", "-h", action="help", help="Output this help message and exit.")
 313     parser.add_argument('mdfile', help="The NAME.NUM.md file to parse.")
 314     args = parser.parse_args()
 315
 316     try:
 317         import cmarkgfm
 318     except:
 319         die("The cmarkgfm library is not available for python3.")
 320
 321     main()