/*
 * SPDX-FileCopyrightText: Copyright 2023 Roland Csaszar
 * SPDX-License-Identifier: MIT
 *
 * Project: vscode-scheme-repl
 * File: generate_function_documentation.ts
 * Date: 19.May.2023
 *
 * ==============================================================================
 * Parse the HTML documentation from the Chez Scheme page
 * https://cisco.github.io/ChezScheme/csug10.0/summary.html
 * and generate a list of objects holding the parsed data in `outFilename`.
 * Run with `yarn --ignore-engines ts-node generate_function_documentation.ts`.
 *
 * The Chez Scheme user's guide is licensed under the Apache License Version 2:
 * https://cisco.github.io/ChezScheme/csug10.0/canned/copyright.html.
 */

/* eslint-disable operator-linebreak */
/* eslint-disable indent */

import * as https from "https";
import {
    createWriteStream,
    existsSync,
    readFile,
    unlinkSync,
    writeFile,
} from "fs";
import { JSDOM } from "jsdom";
import { basename } from "path";

/**
 * The type of an identifier.
 * Warning: copied from `./src/identifierDocumentation.ts` because of problems
 * with module imports.
 */
type IdentifierType =
    | "syntax"
    | "module"
    | "procedure"
    | "thread parameter"
    | "global parameter"
    | "Error: unknown";

/**
 * Return the string `s` converted to an `IdentifierType`.
 * Return `"Error: unknown"` if the string isn't recognized.
 * @param s The string to convert to an `IdentifierType`.
 * @returns The string `s` converted to an `IdentifierType`.
 * Return `"Error: unknown"` if the string isn't recognized.
 */
function stringToIdentifierType(s: string): IdentifierType {
    switch (s) {
        case "syntax":
            return "syntax";
        case "procedure":
            return "procedure";
        case "module":
            return "module";
        // The summary table abbreviates the two parameter kinds.
        case "thread param":
            return "thread parameter";
        case "global param":
            return "global parameter";
        default:
            return "Error: unknown";
    }
}

/**
 * The object to save the data of a function documentation to.
 * Warning: copied from `./src/identifierDocumentation.ts` because of problems
 * with module imports.
 */
type FunctionDoc = {
    name: string;
    startParen: boolean;
    endParen: boolean;
    params: string[];
    type: IdentifierType;
    moduleNames: string[];
    url: URL;
    description: string;
};

/**
 * The `RegExp` to match an example in a description.
 * The first group matches the whole example code.
 */
const exampleRegex = /\n((?:\\`.*?`\s*(?:=>|\*)?\s*<br>\n)+)(?:<br>)?\n*$/u;

/**
 * The `RegExp` to match a single line of a multi-line code example with many
 * individual backticks.
 * The first group contains the actual data.
 */
const lineFormatRegex = /^\\`(.*)\\`\s*<br>\s*$/gmu;

/**
 * The `RegExp` to match a `libraries` stanza in a description.
 * The first group contains the libraries names.
 */
const librariesRegex =
    /^\s*\*\*libraries:\*\*\s*(\\`\(.*?\)\\`(?:,\s*\\`\(.*?\)\\`)*)\s*<br>\s*$/mu;

/**
 * The base part of the Chez Scheme documentation URL.
 */
const baseURL = "https://cisco.github.io/ChezScheme/csug10.0/";

/**
 * The URL of the Chez Scheme documentation website.
 */
const docURL = baseURL + "summary.html";

/**
 * The name of the file to save the parsed data to.
 */
const outFilename = "./src/functionDocumentation.ts";

/**
 * The maximum number of HTTP redirects to follow for a single download.
 */
const maxRedirects = 5;

/**
 * The first HTTP status code of the "success" (2xx) range.
 */
const httpSuccess = 200;

/**
 * The first HTTP status code of the "redirection" (3xx) range.
 */
const httpRedirect = 300;

/**
 * The first HTTP status code of the "client error" (4xx) range.
 */
const httpClientError = 400;

/**
 * The set of downloaded files to delete when exiting the program.
 * A set, because the same file is read once for every identifier it documents.
 */
const filesToDelete = new Set<string>();

/**
 * The already parsed HTML pages, keyed by the local file name of the page.
 * Many identifiers are documented on the same page, so each page is read and
 * parsed only once.
 */
const parsedPages = new Map<string, Promise<Document>>();

/**
 * Main entry point.
 * Downloads the summary page, parses it and all referenced pages and writes
 * the result to `outFilename`. Exits with status 1 if any of these steps fail.
 */
async function main(): Promise<void> {
    // Registered first, so the temporary files are removed on every exit
    // path, including the `process.exit` calls on errors.
    process.once("exit", deleteDownloadedFiles);
    try {
        const htmlText = await downloadAndRead(docURL);
        const tsText = await processHTML(htmlText);
        await writeFunctionDocumentation(tsText);
    } catch (error) {
        console.error(
            `Caught "${error}" trying to process the HTML and saving it.`
        );
        process.exit(1);
    }
}

/**
 * Delete the file with the given path if it exists.
 * Failing to delete the file is logged, but is not an error.
 * @param file The path of the file to delete.
 */
function removeFile(file: string): void {
    try {
        if (existsSync(file)) {
            unlinkSync(file);
        }
    } catch (error) {
        console.error(`Caught "${error}" trying to delete ${file}`);
    }
}

/**
 * Delete all files registered in `filesToDelete`.
 * Only uses synchronous calls, so it can be used as an `exit` handler.
 */
function deleteDownloadedFiles(): void {
    filesToDelete.forEach((file) => removeFile(file));
    filesToDelete.clear();
}

/**
 * Return the data of the Chez function documentation as Typescript objects in a
 * text file.
 * @param text The HTML documentation file to parse.
 * @returns The data of the Chez function documentation as Typescript objects in a
 * text file.
 */
async function processHTML(text: string): Promise<string> {
    const htmlDoc = new JSDOM(text).window.document;
    // Only rows with exactly three cells which are not header cells describe
    // an identifier.
    const trs = Array.from(htmlDoc.querySelectorAll("tr")).filter(
        // eslint-disable-next-line no-magic-numbers
        (e) => e.childElementCount === 3 && e.children[0].nodeName !== "TH"
    );
    const ids: FunctionDoc[] = trs.map((tr) => parseTR(tr));
    // The URLs without the fragment: every page is downloaded only once.
    const pageURLs = Array.from(
        new Set(ids.map((id) => id.url.origin + id.url.pathname))
    );
    await Promise.all(
        pageURLs.map((page) => {
            const pageURL = new URL(page);
            const fileName = fileNameFromURL(pageURL);
            filesToDelete.add(fileName);
            return download(pageURL, fileName);
        })
    );
    await Promise.all(ids.map((id) => addDescription(id)));
    ids.forEach((id) => addLibraries(id));

    return idsDocToTSFile(ids);
}

/**
 * Return a TS file content with the list of `FunctionDoc`s.
 * @param ids The list of `FunctionDoc`s to convert.
 * @returns A TS file content with the list of `FunctionDoc`s.
 */
function idsDocToTSFile(ids: FunctionDoc[]): string {
    const today = new Date();
    const date = today.getDate();
    // `getMonth` is zero based.
    const month = today.getMonth() + 1;
    const year = today.getFullYear();
    const entries = ids
        .map(
            (id) => `    {
        name: "${id.name}",
        startParen: ${id.startParen},
        endParen: ${id.endParen},
        params: ["${id.params.join('", "')}"],
        type: "${id.type}",
        moduleNames: ["${id.moduleNames.join('", "')}"],
        url: new URL("${id.url}"),
        description: \`${id.description}\`
    },`
        )
        .join("\n");

    return `/*
 * SPDX-FileCopyrightText: Copyright 2023 Roland Csaszar
 * SPDX-License-Identifier: MIT
 *
 * Project: vscode-scheme-repl
 * File: functionDocumentation.ts
 * Date: ${date}.${month}.${year}
 *
 * ==============================================================================
 * The Chez Scheme user's guide is licensed under the Apache License Version 2:
 * https://cisco.github.io/ChezScheme/csug10.0/canned/copyright.html.
 * Autogenerated by the script \`../generate_function_documentation.ts\`, from
 * ${docURL}
 * DO NOT EDIT!
 */

/* eslint-disable max-lines */

import { FunctionDoc } from "./identifierDocumentation";

export const functionDocs: FunctionDoc[] = [
${entries}
]
`;
}

/**
 * Parse the given `tr` element and return the data as a `FunctionDoc`.
 * The first cell holds the identifier's form, the second its type and the
 * third the link to its documentation. `moduleNames` and `description` are left
 * empty.
 * @param tr The `tr` element to parse.
 * @returns The filled `FunctionDoc` object.
 */
// eslint-disable-next-line max-statements, max-lines-per-function
function parseTR(tr: HTMLTableRowElement): FunctionDoc {
    // Element children only, like the row filter in `processHTML`, so that
    // whitespace text nodes between the cells do not shift the indices.
    const tds = Array.from(tr.children) as HTMLTableCellElement[];
    const idType = stringToIdentifierType(tds[1].innerHTML);
    const params: string[] = [];
    let name = "";
    let startParen = false;
    let endParen = false;
    const nameElems = tds[0].childNodes[0].childNodes;
    if (idType === "global parameter" || idType === "thread parameter") {
        const tmpName = stringOrEmpty(nameElems[0].textContent);
        startParen = tmpName.startsWith("(");
        name = startParen ? tmpName.slice(1) : tmpName;
    } else if (nameElems.length > 1) {
        // `params` is filled by `parseParamsAndName`.
        ({ startParen, name, endParen } = parseParamsAndName({
            nameElems,
            startParen,
            name,
            params,
            endParen,
        }));
    } else {
        const tmpName = stringOrEmpty(nameElems[0].textContent).trim();
        startParen = tmpName.startsWith("(");
        endParen = tmpName.endsWith(")");
        // Strip the leading and trailing parenthesis, if present.
        name = tmpName.slice(startParen ? 1 : 0, endParen ? -1 : undefined);
    }
    const link = tds[2].querySelector("a");
    if (!link) {
        throw new Error(
            `No documentation link found for identifier "${name}"`
        );
    }
    // Links starting with `./` are relative to the base URL of the
    // documentation, absolute links are used unchanged.
    const url = new URL(link.href, baseURL);
    url.protocol = "https";

    return {
        name,
        startParen,
        endParen,
        type: idType,
        moduleNames: [],
        params,
        url,
        description: "",
    };
}

/**
 * Return the parsed data in the object
 * `{ startParen: boolean; name: string; endParen: boolean }`.
 * The names of the parameters are appended to `data.params`.
 * @param data The data needed for this function.
 * @returns The parsed data in the object
 * `{ startParen: boolean; name: string; endParen: boolean }`.
 */
// eslint-disable-next-line max-statements
function parseParamsAndName(data: {
    nameElems: NodeListOf<ChildNode>;
    startParen: boolean;
    name: string;
    params: string[];
    endParen: boolean;
}): { startParen: boolean; name: string; endParen: boolean } {
    const { nameElems, params } = data;
    const lastIdx = nameElems.length - 1;
    const tmpName = stringOrEmpty(nameElems[0].textContent).trimStart();
    const startParen = tmpName.startsWith("(");
    const name = startParen ? tmpName.slice(1) : tmpName;
    let { endParen } = data;
    // All nodes between the first and the last one: `I` elements are
    // parameter names.
    // eslint-disable-next-line no-plusplus
    for (let nameIdx = 1; nameIdx < lastIdx; nameIdx++) {
        if (nameElems[nameIdx].nodeName === "I") {
            params.push(stringOrEmpty(nameElems[nameIdx].textContent));
        }
    }
    const end = stringOrEmpty(nameElems[lastIdx].textContent).trim();
    if (end.endsWith(")")) {
        // Text before the closing parenthesis, like `...`, is a parameter too.
        if (end !== ")") {
            params.push(end.slice(0, -1).trim());
        }
        endParen = true;
    }

    return { startParen, name, endParen };
}

/**
 * Return the parsed HTML page the given URL points to.
 * The fragment of the URL is ignored, every page is read and parsed only once.
 * @param url The URL of the page to load.
 * @returns The parsed HTML page the given URL points to.
 */
function loadPage(url: URL): Promise<Document> {
    const key = fileNameFromURL(url);
    let page = parsedPages.get(key);
    if (!page) {
        page = downloadAndRead(url.toString()).then(
            (html) => new JSDOM(html).window.document
        );
        parsedPages.set(key, page);
    }

    return page;
}

/**
 * Download the identifier's description from the URL in the `FunctionDoc`,
 * parse it and save it into the field `id.description`.
 * @param id The `FunctionDoc` to process.
 */
// eslint-disable-next-line max-statements
async function addDescription(id: FunctionDoc): Promise<void> {
    const htmlDoc = await loadPage(id.url);
    const anchor = id.url.hash.slice(1);
    let currP = htmlDoc.querySelector(`a[name="${anchor}"]`)?.closest("p");
    const first = currP;
    const text = [""];
    // Collect the paragraph containing the anchor and all following siblings,
    // until one contains a named anchor other than this identifier's.
    while (
        currP &&
        (currP === first ||
            // eslint-disable-next-line no-eq-null, eqeqeq
            currP.querySelector(`a[name]:not(a[name="${anchor}"])`) == null)
    ) {
        currP.childNodes.forEach((c) => parseChildNode(c, text));
        text.push("<br>\n");
        currP = currP.nextElementSibling as HTMLParagraphElement;
    }
    id.description = sanitizeDescription(text.join(""));
}

/**
 * Parse the `FunctionDoc` description and add the needed libraries as
 * `moduleNames` to the object.
 * @param id The `FunctionDoc` object to process.
 */
function addLibraries(id: FunctionDoc): void {
    const match = id.description.match(librariesRegex);
    if (match) {
        // eslint-disable-next-line prefer-destructuring
        const librariesRaw = match[1];
        // Remove the escaped backticks around every library name.
        id.moduleNames = librariesRaw.replace(/\\`/gu, "").split(/,\s*/gu);
    } else {
        id.moduleNames = [];
    }
}

/**
 * Return the text content of the given node with all newlines replaced by
 * spaces.
 * @param node The node to return the text of.
 * @returns The text content of the given node with all newlines replaced by
 * spaces.
 */
function flatText(node: ChildNode): string {
    return (node.textContent ?? "").replace(/\n/gu, " ");
}

/**
 * Parse a single HTML node of the description.
 * Nodes other than `BR`, `B`, `TT`, `SPAN` and text nodes are ignored.
 * @param c The HTML node to parse.
 * @param text The description's text to append to.
 */
function parseChildNode(c: ChildNode, text: string[]): void {
    switch (c.nodeName) {
        case "BR":
            text.push("<br>\n");
            break;
        case "B":
            text.push(`**${flatText(c).trim()}** `);
            break;
        case "TT":
            c.childNodes.forEach((cN) => parseCodeChildNode(cN, text));
            break;
        case "#text":
            text.push(flatText(c));
            break;
        case "SPAN":
            c.childNodes.forEach((cN) => parseChildNode(cN, text));
            break;
        default:
            break;
    }
}

/**
 * Parse a single child node of a `TT` (code) element of the description.
 * Nodes other than `BR`, `B`, `I`, `IMG` and text nodes are ignored.
 * @param cN The child node of the `TT` element to parse.
 * @param text The description's text to append to.
 */
function parseCodeChildNode(cN: ChildNode, text: string[]): void {
    switch (cN.nodeName) {
        case "BR":
            text.push("<br>\n");
            break;
        case "B":
            text.push(`**${flatText(cN).trim()}** `);
            break;
        case "I":
            text.push(`*\`${flatText(cN).trim()}\`*`);
            break;
        case "IMG":
            // Images with a source ending in `0.gif` are rendered as `=>`.
            if ((cN as HTMLImageElement).src.endsWith("0.gif")) {
                text.push("=>");
            }
            break;
        case "#text":
            text.push(`\`${flatText(cN)}\``);
            break;
        default:
            break;
    }
}

/**
 * Return a sanitized version of the given text.
 * That is, without excessive whitespace and with escaped backticks,
 * backslashes and `${`, so the text can be saved in a template literal. Also
 * puts examples at the end into one big code block instead of many individual
 * backticks.
 * @param text The description text to sanitize.
 * @returns The sanitized description.
 */
function sanitizeDescription(text: string): string {
    const sanitized = text
        .replace(/[ ]+/gu, " ")
        .replace(/^ /gmu, "")
        .replace(/[ ]+\n/gu, "\n")
        .replace(/\n[\n]+$/u, "\n")
        .replace(/\n\n[\n]+/gu, "\n\n")
        // Non-breaking-space. Converted after collapsing the whitespace, so
        // runs of non breaking spaces are kept as runs of spaces.
        .replace(/\u00A0/gu, " ")
        .replace(/\\/gu, "\\\\")
        .replace(/`/gu, "\\`")
        // The description is saved in a template literal.
        .replace(/\$\{/gu, "\\${");
    const match = sanitized.match(exampleRegex);
    if (!match) {
        return sanitized;
    }
    // eslint-disable-next-line prefer-destructuring
    const example = match[1];
    const exampleNoBackticks = example
        .replace(lineFormatRegex, "$1")
        .replace(/\\`/gu, "")
        .replace(/^ /gmu, "");

    // A replacer function, so `$` sequences in the example code are not
    // interpreted as replacement patterns.
    return sanitized.replace(
        example,
        () =>
            "**Examples:**\n\n\\`\\`\\`scheme\n" +
            exampleNoBackticks +
            "\n\\`\\`\\`\n"
    );
}

/**
 * Return the string `s` if it isn't `undefined` or `null`, the empty string
 * `""` else. Changes all non breaking spaces (`\u00A0`) to "normal" spaces.
 * @param s The `string` or `undefined` value to "convert".
 * @returns The string `s` if it isn't `undefined` or `null`, the empty string
 * `""` else.
 */
function stringOrEmpty(s: string | undefined | null): string {
    return s ? s.replace(/\u00A0/gu, " ") : "";
}

/**
 * Return the filename to use for the downloaded file from the given URL.
 * This is the host name directly followed by the last part of the URL's path.
 * @param url The URL to generate the filename from.
 * @returns The filename to use for the downloaded file from the given URL.
 */
function fileNameFromURL(url: URL): string {
    return url.hostname + basename(url.pathname);
}

/**
 * Download the given URL and return the content of the file.
 * The file is not downloaded again if it already exists.
 * Exits the program if the download fails. The downloaded file is registered
 * to be deleted when the program exits.
 * @param url The URL of the website to download.
 * @returns The content of the downloaded file.
 */
async function downloadAndRead(url: string): Promise<string> {
    const urlUrl = new URL(url);
    const downloadTo = fileNameFromURL(urlUrl);
    filesToDelete.add(downloadTo);
    if (!existsSync(downloadTo)) {
        try {
            await download(urlUrl, downloadTo);
        } catch (exp) {
            console.error(`Caught "${exp}" trying to download from ${url}`);
            process.exit(1);
        }
    }

    return new Promise<string>((resolve, reject) => {
        readFile(downloadTo, { encoding: "utf8" }, (err, data) => {
            if (err) {
                reject(err);
            } else {
                resolve(data);
            }
        });
    });
}

/**
 * Download a file to the given path `fileName`.
 * Follows up to `maxRedirects` HTTP redirects. The returned promise is
 * rejected if the request fails, if the server answers with a status that
 * isn't a success status or if the file can't be written. No partially written
 * file is left behind in that case.
 * @param url The URL to download.
 * @param fileName The path to save the downloaded file to.
 * @returns Nothing.
 */
async function download(url: URL, fileName: string): Promise<void> {
    return new Promise<void>((resolve, reject) => {
        // eslint-disable-next-line max-lines-per-function
        const fetchFrom = (source: URL, redirectsLeft: number): void => {
            try {
                https
                    // eslint-disable-next-line max-statements
                    .get(source, (res) => {
                        const status = res.statusCode ?? 0;
                        const { location } = res.headers;
                        if (
                            status >= httpRedirect &&
                            status < httpClientError &&
                            location
                        ) {
                            // Discard the body of the redirect response.
                            res.resume();
                            if (redirectsLeft > 0) {
                                fetchFrom(
                                    new URL(location, source),
                                    redirectsLeft - 1
                                );
                            } else {
                                reject(
                                    new Error(
                                        `Too many redirects downloading ${url}`
                                    )
                                );
                            }
                            return;
                        }
                        if (status < httpSuccess || status >= httpRedirect) {
                            res.resume();
                            reject(
                                new Error(
                                    `Got HTTP status ${status} downloading ${source}`
                                )
                            );
                            return;
                        }
                        // Only create the file once there is data to save.
                        const fileStream = createWriteStream(fileName);
                        const fail = (err: Error): void => {
                            fileStream.destroy();
                            removeFile(fileName);
                            reject(err);
                        };
                        res.on("error", fail);
                        fileStream.on("error", fail);
                        fileStream.on("finish", () =>
                            fileStream.close((err) => {
                                if (err) {
                                    fail(err);
                                } else {
                                    resolve();
                                }
                            })
                        );
                        res.pipe(fileStream);
                    })
                    // Connection errors are emitted on the request, not on
                    // the response.
                    .on("error", reject);
            } catch (err) {
                // `https.get` throws synchronously on invalid URLs, like a
                // redirect to a non HTTPS location.
                reject(err);
            }
        };
        fetchFrom(url, maxRedirects);
    });
}

/**
 * Write the parsed data to the file `outFilename`.
 * @param text The text to save.
 */
async function writeFunctionDocumentation(text: string): Promise<void> {
    return new Promise<void>((resolve, reject) => {
        writeFile(outFilename, text, { encoding: "utf8" }, (err) => {
            if (err) {
                reject(err);
            } else {
                resolve();
            }
        });
    });
}

main();