doc: Add helper for converting DocBook files to Markdown

+1 -1

doc/Makefile

··· 3 3 PANDOC ?= pandoc 4 4 5 5 pandoc_media_dir = media 6 - # NOTE: Keep in sync with NixOS manual (/nixos/doc/manual/md-to-db.sh). 6 + # NOTE: Keep in sync with NixOS manual (/nixos/doc/manual/md-to-db.sh) and conversion script (/maintainers/scripts/db-to-md.sh). 7 7 # TODO: Remove raw-attribute when we can get rid of DocBook altogether. 8 8 pandoc_commonmark_enabled_extensions = +attributes+fenced_divs+footnotes+bracketed_spans+definition_lists+pipe_tables+raw_attribute 9 9 # Not needed:

+88

maintainers/scripts/db-to-md.sh

#! /usr/bin/env nix-shell
#! nix-shell -I nixpkgs=. -i bash -p pandoc -p libxml2

# This script is temporarily needed while we transition the manual to
# CommonMark. It converts DocBook files into our CommonMark flavour.
#
# Usage: db-to-md.sh [--debug] FILE...
#
# Each FILE is converted to a sibling file named after its root element
# (NAME.chapter.md or NAME.section.md). With --debug, the pandoc AST is
# written as JSON instead and the intermediate XML files are kept so that
# they can be inspected.
#
# xmllint (from libxml2) is needed to detect the root element of each file.

debug=
files=()

while [ "$#" -gt 0 ]; do
    i="$1"; shift 1
    case "$i" in
    --debug)
        debug=1
        ;;
    *)
        files+=("$i")
        ;;
    esac
done

echo "WARNING: This is an experimental script and might not preserve all formatting." >&2
echo "Please report any issues you discover." >&2

outExtension="md"
if [[ $debug ]]; then
    outExtension="json"
fi

DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

# NOTE: Keep in sync with Nixpkgs manual (/doc/Makefile).
# TODO: Remove raw-attribute when we can get rid of DocBook altogether.
pandoc_commonmark_enabled_extensions=+attributes+fenced_divs+footnotes+bracketed_spans+definition_lists+pipe_tables+raw_attribute
targetLang="commonmark${pandoc_commonmark_enabled_extensions}+smart"
if [[ $debug ]]; then
    targetLang=json
fi
pandoc_flags=(
    # Not needed:
    # - diagram-generator.lua (we do not support that in NixOS manual to limit dependencies)
    # - media extraction (was only required for diagram generator)
    # - myst-reader/roles.lua (only relevant for MyST → DocBook)
    # - link-unix-man-references.lua (links should only be added to display output)
    # - docbook-writer/rst-roles.lua (only relevant for → DocBook)
    # - docbook-writer/labelless-link-is-xref.lua (only relevant for → DocBook)
    "--lua-filter=$DIR/../../doc/build-aux/pandoc-filters/docbook-reader/citerefentry-to-rst-role.lua"
    "--lua-filter=$DIR/../../doc/build-aux/pandoc-filters/myst-writer/roles.lua"
    "--lua-filter=$DIR/doc/unknown-code-language.lua"
    -f docbook
    -t "$targetLang"
    --tab-stop=2
    --wrap=none
)

# Removes the given intermediate files, unless we are debugging: in debug
# mode their paths are printed so that they can be inspected afterwards.
cleanup() {
    if [[ ! $debug ]]; then
        rm -f "$@"
    fi
}

for file in "${files[@]}"; do
    if [[ ! -f "$file" ]]; then
        echo "db-to-md.sh: $file does not exist" >&2
        exit 1
    fi

    # name(//*) evaluates to the name of the first element in document
    # order, which is the root element.
    rootElement=$(xmllint --xpath 'name(//*)' "$file")

    if [[ $rootElement = chapter ]]; then
        extension=".chapter.$outExtension"
    elif [[ $rootElement = section ]]; then
        extension=".section.$outExtension"
    else
        echo "db-to-md.sh: $file contains an unsupported root element $rootElement" >&2
        exit 1
    fi

    # Strip the old suffix (with or without the kind infix) before
    # appending the new one.
    outFile="${file%".section.xml"}"
    outFile="${outFile%".chapter.xml"}"
    outFile="${outFile%".xml"}$extension"

    temp1=$(mktemp)
    # Quote the script path so that checkouts in directories containing
    # spaces work, and stop instead of feeding a broken file to pandoc.
    if ! "$DIR/doc/escape-code-markup.py" "$file" "$temp1"; then
        echo "db-to-md.sh: failed to escape code markup in $file" >&2
        cleanup "$temp1"
        exit 1
    fi
    if [[ $debug ]]; then
        echo "Converted $file to $temp1" >&2
    fi

    temp2=$(mktemp)
    if ! "$DIR/doc/replace-xrefs-by-empty-links.py" "$temp1" "$temp2"; then
        echo "db-to-md.sh: failed to replace xrefs in $file" >&2
        cleanup "$temp1" "$temp2"
        exit 1
    fi
    if [[ $debug ]]; then
        echo "Converted $temp1 to $temp2" >&2
    fi

    if ! pandoc "$temp2" -o "$outFile" "${pandoc_flags[@]}"; then
        echo "db-to-md.sh: pandoc failed to convert $file" >&2
        cleanup "$temp1" "$temp2"
        exit 1
    fi
    cleanup "$temp1" "$temp2"
    echo "Converted $file to $outFile" >&2
done

+97

maintainers/scripts/doc/escape-code-markup.py

#! /usr/bin/env nix-shell
#! nix-shell -I nixpkgs=channel:nixos-unstable -i python3 -p python3 -p python3.pkgs.lxml

"""
Pandoc will strip any markup within code elements so
let’s escape them so that they can be handled manually.
"""

import lxml.etree as ET
import re
import sys


def replace_element_by_text(el: ET.Element, text: str) -> None:
    """
    Replace the element with the given text, keeping its tail in place.

    The text, followed by the tail of ``el``, is appended to the tail of
    the preceding sibling or, when ``el`` is the first child, to the text
    of the parent. Nothing happens when ``el`` has no parent.

    Author: bernulf
    Source: https://stackoverflow.com/a/10520552/160386
    SPDX-License-Identifier: CC-BY-SA-3.0
    """
    text = text + (el.tail or "")
    parent = el.getparent()
    if parent is not None:
        previous = el.getprevious()
        if previous is not None:
            previous.tail = (previous.tail or "") + text
        else:
            parent.text = (parent.text or "") + text
        parent.remove(el)


DOCBOOK_NS = "http://docbook.org/ns/docbook"

# List of elements that pandoc’s DocBook reader strips markup from.
# https://github.com/jgm/pandoc/blob/master/src/Text/Pandoc/Readers/DocBook.hs
code_elements = [
    # CodeBlock
    "literallayout",
    "screen",
    "programlisting",
    # Code (inline)
    "classname",
    "code",
    "filename",
    "envar",
    "literal",
    "computeroutput",
    "prompt",
    "parameter",
    "option",
    "markup",
    "wordasword",
    "command",
    "varname",
    "function",
    "type",
    "symbol",
    "constant",
    "userinput",
    "systemitem",
]

XMLNS_REGEX = re.compile(r'\s+xmlns(?::[^=]+)?="[^"]*"')
ROOT_ELEMENT_REGEX = re.compile(r'^\s*<[^>]+>')


def remove_xmlns(match: re.Match) -> str:
    """
    Removes xmlns attributes.

    Expects a match containing an opening tag.
    """
    return XMLNS_REGEX.sub('', match.group(0))


def main() -> None:
    """
    Convert markup nested in code elements of a DocBook file to plain text.

    Reads the file given as the first command line argument, replaces every
    child element of the elements listed in ``code_elements`` by its
    serialized source and writes the tree to the path given as the second
    argument.
    """
    # Do not use assert for this; assertions are stripped with python -O.
    if len(sys.argv) < 3:
        sys.exit("usage: escape-code-markup.py <input> <output>")

    tree = ET.parse(sys.argv[1])
    name_predicate = " or ".join([f"local-name()='{el}'" for el in code_elements])

    for markup in tree.xpath(f"//*[({name_predicate}) and namespace-uri()='{DOCBOOK_NS}']/*"):
        # tostring would serialize the tail text as well by default.
        # replace_element_by_text already re-attaches the tail, so leave
        # it out here or the text following the element ends up twice
        # in the output.
        text = ET.tostring(markup, encoding=str, with_tail=False)

        # tostring adds xmlns attributes to the element we want to stringify
        # as if it was supposed to be usable standalone.
        # We are just converting it to CDATA so we do not care.
        # Let’s strip the namespace declarations to keep the code clean.
        #
        # Note that this removes even namespaces that were potentially
        # in the original file. Though, that should be very rare –
        # most of the time, we will stringify empty DocBook elements
        # like <xref> or <co> or, at worst, <link> with xlink:href attribute.
        #
        # Also note that the regex expects the root element to be first
        # thing in the string. But that should be fine, the tostring method
        # does not produce XML declaration or doctype by default.
        text = ROOT_ELEMENT_REGEX.sub(remove_xmlns, text)

        replace_element_by_text(markup, text)

    tree.write(sys.argv[2])


if __name__ == '__main__':
    main()

+32

maintainers/scripts/doc/replace-xrefs-by-empty-links.py

#! /usr/bin/env nix-shell
#! nix-shell -I nixpkgs=channel:nixos-unstable -i python3 -p python3 -p python3.pkgs.lxml

"""
Pandoc will try to resolve xrefs and replace them with regular links.
Let’s replace them with links with empty labels which MyST
and our pandoc filters recognize as cross-references.
"""

import lxml.etree as ET
import sys

XLINK_NS = "http://www.w3.org/1999/xlink"

ns = {
    "db": "http://docbook.org/ns/docbook",
}


def main() -> None:
    """
    Replace DocBook xrefs with empty links pointing to the same target.

    Reads the file given as the first command line argument, swaps every
    ``xref`` element that has a ``linkend`` attribute for an empty ``link``
    element whose ``xlink:href`` is ``#`` followed by that target, and
    writes the tree to the path given as the second argument.
    """
    # Do not use assert for this; assertions are stripped with python -O.
    if len(sys.argv) < 3:
        sys.exit("usage: replace-xrefs-by-empty-links.py <input> <output>")

    tree = ET.parse(sys.argv[1])
    for xref in tree.findall(".//db:xref", ns):
        target_name = xref.get("linkend")
        if target_name is None:
            # Without linkend we would produce a bogus “#None” target;
            # leave such xrefs untouched.
            continue

        parent = xref.getparent()
        link = parent.makeelement('link')
        link.set(f"{{{XLINK_NS}}}href", f"#{target_name}")
        # The tail (text following the element) belongs to the element
        # being replaced and would be dropped from the document together
        # with it, so carry it over to the new link.
        link.tail = xref.tail
        parent.replace(xref, link)

    tree.write(sys.argv[2])


if __name__ == '__main__':
    main()

+12

maintainers/scripts/doc/unknown-code-language.lua

--[[
Pandoc filter that tags CodeBlock AST nodes lacking any class
with the “unknown” class.

Pandoc then emits such blocks as fenced code blocks, which we prefer.
]]

function CodeBlock(block)
  -- Blocks that already carry a class are left untouched;
  -- returning nothing tells pandoc to keep the element as it is.
  if #block.classes ~= 0 then
    return
  end

  block.classes:insert('unknown')
  return block
end