ddb3943359
This makes the renderer print the content of informational HTML tags while stripping the tags themselves. Tags like script, iframe, style, etc., which are unlikely to ever hold presentable content, are exempt from this: their content is skipped from rendering along with the tags themselves. <br>, a hard-break tag, is supported as a replacement for the Markdown hard break (two spaces before a newline). This also adds tests for this behavior inside general_text.md. Fixes #6, a longstanding issue with inline HTML in blockquotes.
586 lines
17 KiB
Go
586 lines
17 KiB
Go
// This file is part of gmnhg.
|
|
|
|
// gmnhg is free software: you can redistribute it and/or modify
|
|
// it under the terms of the GNU General Public License as published by
|
|
// the Free Software Foundation, either version 3 of the License, or
|
|
// (at your option) any later version.
|
|
|
|
// gmnhg is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU General Public License for more details.
|
|
|
|
// You should have received a copy of the GNU General Public License
|
|
// along with gmnhg. If not, see <https://www.gnu.org/licenses/>.
|
|
|
|
// Package renderer contains an implementation of markdown => text/gemini
|
|
// renderer for github.com/gomarkdown/markdown.
|
|
package renderer
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
"html"
|
|
"io"
|
|
"regexp"
|
|
|
|
"github.com/gomarkdown/markdown/ast"
|
|
"github.com/grokify/html-strip-tags-go"
|
|
"github.com/olekukonko/tablewriter"
|
|
)
|
|
|
|
// Byte sequences the renderer writes verbatim. They are kept as
// package-level slices so they can be passed straight to Write calls.
var (
	// whitespace and line-level prefixes
	lineBreak     = []byte{'\n'}
	space         = []byte{' '}
	linkPrefix    = []byte("=> ")
	quoteBrPrefix = []byte("\n> ")
	quotePrefix   = []byte("> ")
	itemPrefix    = []byte("* ")
	itemIndent    = []byte{'\t'}

	// block-level markers
	preformattedToggle = []byte("```")
	horizontalRule     = []byte("---")

	// inline delimiters written around the text of the matching node
	codeDelimiter   = []byte{'`'}
	emphDelimiter   = []byte{'*'}
	strongDelimiter = []byte("**")
	delDelimiter    = []byte("~~")

	// plain-text stand-ins for subscript and superscript
	subOpen  = []byte("_{")
	subClose = []byte{'}'}
	supOpen  = []byte("^(")
	supClose = []byte{')'}
)
|
|
|
|
// emptyLineRegex matches a FULL string that contains no non-whitespace
// characters (the empty string included).
var emptyLineRegex = regexp.MustCompile(`\A[\s]*\z`)

// tagPairRegexString is a format string taking the tag name twice
// (opening and closing tag). The resulting expression matches an
// opening tag with optional attributes, everything up to the nearest
// matching closing tag, and the closing tag itself. It is fairly
// tolerant to handle weird HTML:
//
//   - (?i) makes tag names case-insensitive, as they are in HTML;
//   - (?s) lets the content between the tags span several lines, which
//     is the usual shape of script/style blocks;
//   - unquoted attribute values may contain any digit, including 0.
var tagPairRegexString = `(?is)<[\n\f ]*%s([\n\f ]+[^\n\f \/>"'=]+[\n\f ]*(=[\n\f ]*([a-zA-Z0-9\-]+|"[^\n\f"]+"|'[^\n\f']+'))?)*[\n\f ]*>.*?<[\n\f ]*/[\n\f ]*%s[\n\f ]*>`

// htmlNoRenderRegex lists the HTML block tags whose contents should
// not be rendered; both the tags and everything between them are
// dropped.
var htmlNoRenderRegex = []*regexp.Regexp{
	regexp.MustCompile(fmt.Sprintf(tagPairRegexString, "fieldset", "fieldset")),
	regexp.MustCompile(fmt.Sprintf(tagPairRegexString, "form", "form")),
	regexp.MustCompile(fmt.Sprintf(tagPairRegexString, "iframe", "iframe")),
	regexp.MustCompile(fmt.Sprintf(tagPairRegexString, "script", "script")),
	regexp.MustCompile(fmt.Sprintf(tagPairRegexString, "style", "style")),
	regexp.MustCompile(fmt.Sprintf(tagPairRegexString, "canvas", "canvas")),
	regexp.MustCompile(fmt.Sprintf(tagPairRegexString, "dialog", "dialog")),
	regexp.MustCompile(fmt.Sprintf(tagPairRegexString, "progress", "progress")),
}

// lineBreakCharacters matches a run of one or more CR/LF characters.
var lineBreakCharacters = regexp.MustCompile(`[\n\r]+`)

// hardBreakTag matches an HTML hard-break tag (<br>, <br/>, <BR />,
// ...), with optional spaces around the name and the slash.
var hardBreakTag = regexp.MustCompile(`(?i)< *br */? *>`)

// escapedHtmlChar matches an HTML character reference such as &amp;
// that is not preceded by a backslash. Unless the reference sits at
// the very start of the text, the match includes the one character
// before the ampersand.
var escapedHtmlChar = regexp.MustCompile(`(?:^|[^\\\\])&[[:alnum:]]+;`)
|
|
|
|
// Renderer implements markdown.Renderer. It is an empty struct and
// carries no configuration or state.
type Renderer struct{}

// NewRenderer returns a new Renderer, which is simply the zero value
// of the type.
func NewRenderer() Renderer {
	var renderer Renderer
	return renderer
}
|
|
|
|
func getNodeDelimiter(node ast.Node) []byte {
|
|
switch node.(type) {
|
|
case *ast.Code:
|
|
return codeDelimiter
|
|
case *ast.Emph:
|
|
return emphDelimiter
|
|
case *ast.Strong:
|
|
return strongDelimiter
|
|
case *ast.Del:
|
|
return delDelimiter
|
|
default:
|
|
return []byte{}
|
|
}
|
|
}
|
|
|
|
func (r Renderer) link(w io.Writer, node *ast.Link, entering bool) {
|
|
if entering {
|
|
if node.Footnote != nil {
|
|
fmt.Fprintf(w, "[^%d]: %s", node.NoteID, extractText(node.Footnote))
|
|
} else {
|
|
w.Write(linkPrefix)
|
|
w.Write(node.Destination)
|
|
w.Write(space)
|
|
r.text(w, node, true)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (r Renderer) image(w io.Writer, node *ast.Image, entering bool) {
|
|
if entering {
|
|
w.Write(linkPrefix)
|
|
w.Write(node.Destination)
|
|
w.Write(space)
|
|
r.text(w, node, true)
|
|
}
|
|
}
|
|
|
|
func (r Renderer) blockquote(w io.Writer, node *ast.BlockQuote, entering bool) {
|
|
if entering {
|
|
if node := node.AsContainer(); node != nil {
|
|
for _, child := range node.Children {
|
|
w.Write(quotePrefix)
|
|
r.blockquoteText(w, child)
|
|
// double linebreak to ensure Gemini clients don't merge
|
|
// quotes; gomarkdown assumes separate blockquotes are
|
|
// paragraphs of the same blockquote while we don't
|
|
w.Write(lineBreak)
|
|
w.Write(lineBreak)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func (r Renderer) hr(w io.Writer, node *ast.HorizontalRule, entering bool) {
|
|
if entering {
|
|
w.Write(horizontalRule)
|
|
w.Write(lineBreak)
|
|
w.Write(lineBreak)
|
|
}
|
|
}
|
|
|
|
// Based on https://pages.uoregon.edu/ncp/Courses/MathInPlainTextEmail.html
|
|
func (r Renderer) subscript(w io.Writer, node *ast.Subscript, entering bool) {
|
|
if entering {
|
|
if node := node.AsLeaf(); node != nil {
|
|
w.Write(subOpen)
|
|
w.Write(bytes.ReplaceAll(node.Literal, lineBreak, space))
|
|
w.Write(subClose)
|
|
}
|
|
}
|
|
}
|
|
func (r Renderer) superscript(w io.Writer, node *ast.Superscript, entering bool) {
|
|
if entering {
|
|
if node := node.AsLeaf(); node != nil {
|
|
w.Write(supOpen)
|
|
w.Write(bytes.ReplaceAll(node.Literal, lineBreak, space))
|
|
w.Write(supClose)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (r Renderer) heading(w io.Writer, node *ast.Heading, entering bool) {
|
|
if entering {
|
|
// pad headings with the relevant number of #-s; Gemini spec
|
|
// used to allow 3 at maximum before a space
|
|
bufLength := node.Level + 1
|
|
heading := make([]byte, bufLength)
|
|
heading[len(heading)-1] = ' '
|
|
for i := 0; i < len(heading)-1; i++ {
|
|
heading[i] = '#'
|
|
}
|
|
w.Write(heading)
|
|
r.text(w, node, true)
|
|
} else {
|
|
w.Write(lineBreak)
|
|
}
|
|
}
|
|
|
|
func extractLinks(node ast.Node) (stack []ast.Node) {
|
|
if node := node.AsContainer(); node != nil {
|
|
for _, subnode := range node.Children {
|
|
stack = append(stack, extractLinks(subnode)...)
|
|
}
|
|
}
|
|
switch node := node.(type) {
|
|
case *ast.Image:
|
|
stack = append(stack, node)
|
|
case *ast.Link:
|
|
stack = append(stack, node)
|
|
// footnotes are represented as links which embed an extra node
|
|
// containing footnote text; the link itself is not considered a
|
|
// container
|
|
if node.Footnote != nil {
|
|
stack = append(stack, extractLinks(node.Footnote)...)
|
|
}
|
|
}
|
|
return stack
|
|
}
|
|
|
|
func (r Renderer) renderLinks(w io.Writer, links []ast.Node) (count uint) {
|
|
for _, link := range links {
|
|
if link, ok := link.(*ast.Link); ok && link.Footnote == nil {
|
|
r.link(w, link, true)
|
|
w.Write(lineBreak)
|
|
count++
|
|
}
|
|
}
|
|
return
|
|
}
|
|
|
|
func (r Renderer) renderFootnotes(w io.Writer, links []ast.Node) (count uint) {
|
|
for _, link := range links {
|
|
if link, ok := link.(*ast.Link); ok && link.Footnote != nil {
|
|
r.link(w, link, true)
|
|
w.Write(lineBreak)
|
|
count++
|
|
}
|
|
}
|
|
return
|
|
}
|
|
|
|
func (r Renderer) renderImages(w io.Writer, links []ast.Node) (count uint) {
|
|
for _, link := range links {
|
|
if link, ok := link.(*ast.Image); ok {
|
|
r.image(w, link, true)
|
|
w.Write(lineBreak)
|
|
count++
|
|
}
|
|
}
|
|
return
|
|
}
|
|
|
|
func (r Renderer) linksList(w io.Writer, links []ast.Node) {
|
|
for _, renderer := range []func(Renderer, io.Writer, []ast.Node) uint{
|
|
Renderer.renderFootnotes,
|
|
Renderer.renderImages,
|
|
Renderer.renderLinks,
|
|
} {
|
|
linksRendered := renderer(r, w, links)
|
|
// ensure breaks between link blocks of the same type
|
|
if linksRendered > 0 {
|
|
w.Write(lineBreak)
|
|
}
|
|
}
|
|
}
|
|
|
|
func isLinksOnlyParagraph(node *ast.Paragraph) bool {
|
|
for _, child := range node.Children {
|
|
switch child := child.(type) {
|
|
case *ast.Text:
|
|
if emptyLineRegex.Find(child.Literal) != nil {
|
|
continue
|
|
}
|
|
case *ast.Link, *ast.Image:
|
|
continue
|
|
}
|
|
return false
|
|
}
|
|
return true
|
|
}
|
|
|
|
func isLinksOnlyList(node *ast.List) bool {
|
|
for _, child := range node.Children {
|
|
child, ok := child.(*ast.ListItem)
|
|
if !ok {
|
|
return false // should never happen
|
|
}
|
|
for _, liChild := range child.Children {
|
|
liChild, ok := liChild.(*ast.Paragraph)
|
|
if !ok {
|
|
return false // sublist, etc
|
|
}
|
|
if !isLinksOnlyParagraph(liChild) {
|
|
return false
|
|
}
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
func (r Renderer) paragraph(w io.Writer, node *ast.Paragraph, entering bool) (noNewLine bool) {
|
|
linksOnly := isLinksOnlyParagraph(node)
|
|
noNewLine = linksOnly
|
|
if entering {
|
|
children := node.Children
|
|
// current version of gomarkdown/markdown finds an empty
|
|
// *ast.Text element before links/images, breaking the heuristic
|
|
if len(children) >= 2 {
|
|
firstChild, elementIsText := children[0].(*ast.Text)
|
|
if elementIsText && len(firstChild.Literal) == 0 {
|
|
children = children[1:]
|
|
}
|
|
}
|
|
if !linksOnly {
|
|
for _, child := range children {
|
|
// only render links text in the paragraph if they're
|
|
// combined with some other text on page
|
|
switch child := child.(type) {
|
|
case *ast.Text, *ast.Emph, *ast.Strong, *ast.Del, *ast.Link, *ast.Image:
|
|
r.text(w, child, true)
|
|
case *ast.Code:
|
|
r.text(w, child, false)
|
|
case *ast.Hardbreak:
|
|
w.Write(lineBreak)
|
|
case *ast.HTMLSpan:
|
|
if hardBreakTag.Match(child.AsLeaf().Literal) {
|
|
w.Write(lineBreak)
|
|
}
|
|
case *ast.Subscript:
|
|
r.subscript(w, child, true)
|
|
case *ast.Superscript:
|
|
r.superscript(w, child, true)
|
|
}
|
|
}
|
|
w.Write(lineBreak)
|
|
}
|
|
}
|
|
return
|
|
}
|
|
|
|
func (r Renderer) code(w io.Writer, node *ast.CodeBlock) {
|
|
w.Write(preformattedToggle)
|
|
if node.IsFenced {
|
|
w.Write(node.Info)
|
|
}
|
|
w.Write(lineBreak)
|
|
w.Write(node.Literal)
|
|
w.Write(preformattedToggle)
|
|
w.Write(lineBreak)
|
|
}
|
|
|
|
func (r Renderer) list(w io.Writer, node *ast.List, level int) {
|
|
// the text/gemini spec included with the current Gemini spec does
|
|
// not specify anything about the formatting of lists of level >= 2,
|
|
// as of now this will just render them like in Markdown
|
|
isNumbered := (node.ListFlags & ast.ListTypeOrdered) != 0
|
|
for number, item := range node.Children {
|
|
item, ok := item.(*ast.ListItem)
|
|
if !ok {
|
|
panic("rendering anything but list items is not supported")
|
|
}
|
|
isTerm := (item.ListFlags & ast.ListTypeTerm) == ast.ListTypeTerm
|
|
if l := len(item.Children); l >= 1 {
|
|
// add extra line break to split up definitions
|
|
if isTerm && number > 0 {
|
|
w.Write(lineBreak)
|
|
}
|
|
for i := 0; i < level; i++ {
|
|
w.Write(itemIndent)
|
|
}
|
|
if isNumbered {
|
|
w.Write([]byte(fmt.Sprintf("%d. ", number+1)))
|
|
} else if !isTerm {
|
|
w.Write(itemPrefix)
|
|
}
|
|
r.text(w, item, true)
|
|
w.Write(lineBreak)
|
|
if l >= 2 {
|
|
if list, ok := item.Children[1].(*ast.List); ok {
|
|
r.list(w, list, level+1)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func textWithNewlineReplacement(node ast.Node, replacement []byte, unescapeHtml bool) []byte {
|
|
buf := bytes.Buffer{}
|
|
delimiter := getNodeDelimiter(node)
|
|
// special case for footnotes: we want them in the text
|
|
if node, ok := node.(*ast.Link); ok && node.Footnote != nil {
|
|
fmt.Fprintf(&buf, "[^%d]", node.NoteID)
|
|
}
|
|
if leaf := node.AsLeaf(); leaf != nil {
|
|
// replace all newlines in text with preferred symbols; this may
|
|
// be spaces for general text, allowing for soft wrapping, which
|
|
// is recommended as per Gemini spec p. 5.4.1, or line breaks
|
|
// with a blockquote symbols for blockquotes, or just nothing
|
|
buf.Write(delimiter)
|
|
switch node := node.(type) {
|
|
case *ast.Hardbreak:
|
|
buf.Write(lineBreak)
|
|
// If the blockquote ends with a double space, the parser will
|
|
// not create a Hardbreak at the end, so this works.
|
|
if _, ok := leaf.Parent.(*ast.BlockQuote); !ok {
|
|
buf.Write(quotePrefix)
|
|
}
|
|
case *ast.HTMLSpan:
|
|
if hardBreakTag.Match(leaf.Literal) {
|
|
buf.Write(lineBreak)
|
|
}
|
|
buf.Write(leaf.Content)
|
|
case *ast.HTMLBlock:
|
|
buf.Write([]byte(extractHtml(node, quotePrefix)))
|
|
default:
|
|
textWithoutBreaks := lineBreakCharacters.ReplaceAll(leaf.Literal, replacement)
|
|
if unescapeHtml {
|
|
unescapedText := escapedHtmlChar.ReplaceAll(textWithoutBreaks, []byte(html.UnescapeString(string(textWithoutBreaks))))
|
|
buf.Write(unescapedText)
|
|
} else {
|
|
buf.Write(textWithoutBreaks)
|
|
}
|
|
}
|
|
buf.Write(delimiter)
|
|
}
|
|
if node := node.AsContainer(); node != nil {
|
|
buf.Write(delimiter)
|
|
for _, child := range node.Children {
|
|
// skip non-text child elements from rendering
|
|
switch child := child.(type) {
|
|
case *ast.List:
|
|
default:
|
|
buf.Write(textWithNewlineReplacement(child, replacement, unescapeHtml))
|
|
}
|
|
}
|
|
buf.Write(delimiter)
|
|
}
|
|
return buf.Bytes()
|
|
}
|
|
|
|
func (r Renderer) text(w io.Writer, node ast.Node, unescapeHtml bool) {
|
|
w.Write(textWithNewlineReplacement(node, space, unescapeHtml))
|
|
}
|
|
|
|
func (r Renderer) blockquoteText(w io.Writer, node ast.Node) {
|
|
w.Write(textWithNewlineReplacement(node, quoteBrPrefix, true))
|
|
}
|
|
|
|
func extractText(node ast.Node) string {
|
|
return string(textWithNewlineReplacement(node, space, true))
|
|
}
|
|
|
|
func extractHtml(node *ast.HTMLBlock, linePrefix []byte) string {
|
|
// Only render contents of allowed tags
|
|
literal := node.Literal
|
|
for _, re := range htmlNoRenderRegex {
|
|
literal = re.ReplaceAllLiteral(literal, []byte{})
|
|
}
|
|
if len(literal) > 0 {
|
|
literalWithBreaks := hardBreakTag.ReplaceAll(lineBreakCharacters.ReplaceAll(literal, space), append([]byte(lineBreak), linePrefix...))
|
|
literalStripped := strip.StripTags(string(literalWithBreaks))
|
|
return html.UnescapeString(literalStripped)
|
|
}
|
|
return ""
|
|
}
|
|
|
|
func (r Renderer) tableHead(t *tablewriter.Table, node *ast.TableHeader) {
|
|
if node := node.AsContainer(); node != nil {
|
|
// should always have a single row consisting of at least one
|
|
// cell but worth checking nonetheless; tablewriter only
|
|
// supports a single header row as of now therefore ignore
|
|
// second row and the rest
|
|
if len(node.Children) > 0 {
|
|
if row := node.Children[0].AsContainer(); row != nil {
|
|
cells := make([]string, len(row.Children))
|
|
for i, cell := range row.Children {
|
|
cells[i] = extractText(cell)
|
|
}
|
|
t.SetHeader(cells)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func (r Renderer) tableBody(t *tablewriter.Table, node *ast.TableBody) {
|
|
if node := node.AsContainer(); node != nil {
|
|
for _, row := range node.Children {
|
|
if row := row.AsContainer(); row != nil {
|
|
cells := make([]string, len(row.Children))
|
|
for i, cell := range row.Children {
|
|
cells[i] = extractText(cell)
|
|
}
|
|
t.Append(cells)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func (r Renderer) table(w io.Writer, node *ast.Table, entering bool) {
|
|
if entering {
|
|
w.Write(preformattedToggle)
|
|
w.Write(lineBreak)
|
|
// gomarkdown appears to only parse headings consisting of a
|
|
// single line and always have a TableBody preceded by a single
|
|
// TableHeader but we're better off not relying on it
|
|
t := tablewriter.NewWriter(w)
|
|
t.SetAutoFormatHeaders(false) // TODO: tablewriter options should probably be configurable
|
|
if node := node.AsContainer(); node != nil {
|
|
for _, child := range node.Children {
|
|
switch child := child.(type) {
|
|
case *ast.TableHeader:
|
|
r.tableHead(t, child)
|
|
case *ast.TableBody:
|
|
r.tableBody(t, child)
|
|
}
|
|
}
|
|
}
|
|
t.Render()
|
|
} else {
|
|
w.Write(preformattedToggle)
|
|
w.Write(lineBreak)
|
|
}
|
|
}
|
|
|
|
func (r Renderer) htmlBlock(w io.Writer, node *ast.HTMLBlock, entering bool) {
|
|
if entering {
|
|
htmlString := extractHtml(node, []byte{})
|
|
if len(htmlString) > 0 {
|
|
w.Write([]byte(htmlString))
|
|
w.Write(lineBreak)
|
|
w.Write(lineBreak)
|
|
}
|
|
}
|
|
}
|
|
|
|
// RenderNode implements Renderer.RenderNode().
|
|
func (r Renderer) RenderNode(w io.Writer, node ast.Node, entering bool) ast.WalkStatus {
|
|
// entering in gomarkdown was made to have elements of type switch
|
|
// to enclose themselves within the second pass with entering =
|
|
// false, as Markdown is quite similar to HTML in its structure.
|
|
// As Gemtext is line-oriented, and not tag-oriented, most of
|
|
// container subroutines have to handle their subelements on
|
|
// themselves.
|
|
noNewLine := true
|
|
fetchLinks := false
|
|
switch node := node.(type) {
|
|
case *ast.BlockQuote:
|
|
r.blockquote(w, node, entering)
|
|
fetchLinks = true
|
|
case *ast.HorizontalRule:
|
|
r.hr(w, node, entering)
|
|
case *ast.Heading:
|
|
r.heading(w, node, entering)
|
|
noNewLine = false
|
|
case *ast.Paragraph:
|
|
switch node.Parent.(type) {
|
|
// these (should) handle underlying paragraphs themselves
|
|
case *ast.BlockQuote, *ast.ListItem, *ast.Footnotes:
|
|
default:
|
|
noNewLine = r.paragraph(w, node, entering)
|
|
fetchLinks = true
|
|
}
|
|
case *ast.CodeBlock:
|
|
r.code(w, node)
|
|
// code block is not considered a wrapping element
|
|
w.Write(lineBreak)
|
|
case *ast.List:
|
|
// lists of level >= 2 are rendered recursively along with the
|
|
// first level; the list is a container
|
|
_, parentIsDocument := node.Parent.(*ast.Document)
|
|
// footnotes are rendered as links after the parent paragraph
|
|
if !node.IsFootnotesList && parentIsDocument && !entering {
|
|
if !isLinksOnlyList(node) {
|
|
r.list(w, node, 0)
|
|
noNewLine = false
|
|
}
|
|
fetchLinks = true
|
|
}
|
|
case *ast.Table:
|
|
r.table(w, node, entering)
|
|
noNewLine = false
|
|
fetchLinks = true
|
|
case *ast.HTMLBlock:
|
|
// Do not render if already rendered as part of a blockquote
|
|
if _, ok := node.Parent.(*ast.BlockQuote); !ok {
|
|
r.htmlBlock(w, node, entering)
|
|
}
|
|
}
|
|
if !noNewLine && !entering {
|
|
w.Write(lineBreak)
|
|
}
|
|
if fetchLinks && !entering {
|
|
links := extractLinks(node)
|
|
if len(links) > 0 {
|
|
r.linksList(w, links)
|
|
}
|
|
}
|
|
return ast.GoToNext
|
|
}
|
|
|
|
// RenderHeader implements Renderer.RenderHeader().
|
|
func (r Renderer) RenderHeader(w io.Writer, node ast.Node) {}
|
|
|
|
// RenderFooter implements Renderer.RenderFooter().
|
|
func (r Renderer) RenderFooter(w io.Writer, node ast.Node) {}
|