Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ post.from_markdown(footnote_markdown, api=api)
post.paragraph(content=[{"content": "Some claim."}]).footnote_anchor(1)
post.footnote(1, "The note text, with **formatting** allowed.")


draft = api.post_draft(post.get_draft())

# set section (can only be done after first posting the draft)
Expand Down
57 changes: 39 additions & 18 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ python = "<4.0,>=3.10"
requests = "^2.32.0"
python-dotenv = "^1.2.1"
PyYAML = "^6.0"
markdown-it-py = "^3.0"
mdit-py-plugins = "^0.4"

[tool.poetry.group.dev.dependencies]

Expand Down
210 changes: 210 additions & 0 deletions substack/mdrender.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
"""Markdown -> Substack ProseMirror via markdown-it-py.

Implements Post.from_markdown() using a real CommonMark parser (markdown-it-py)
plus the standard footnote plugin, with a small renderer that walks the syntax
tree into Substack's node schema.

Node construction goes through ``substack.nodes`` so the (undocumented) schema
lives in exactly one place.

Footnotes: Substack numbers footnote anchors by their position in the document
and pairs them one-to-one, in order, with the footnote blocks at the end (it
ignores any explicit number and does not support one block serving several
anchors). So each reference is emitted as its own sequentially-numbered anchor,
and a matching footnote block is appended for each -- a definition referenced
more than once is duplicated, which mirrors how Substack's own editor behaves.
"""

from __future__ import annotations

import copy
from typing import Dict, List, Optional

from markdown_it import MarkdownIt
from markdown_it.tree import SyntaxTreeNode
from mdit_py_plugins.footnote import footnote_plugin

from substack import nodes
from substack.nodes import MarkType, NodeType

# Inline node types that translate directly into a single mark. The inline
# renderer looks a node's type up here and applies the mark to everything
# rendered from that node's children.
_MARK_FOR = {
    "strong": {"type": MarkType.STRONG},
    "em": {"type": MarkType.EM},
    "s": {"type": MarkType.STRIKETHROUGH},
}


def _make_parser() -> MarkdownIt:
    """Build the parser: CommonMark preset + footnote plugin + strikethrough."""
    parser = MarkdownIt("commonmark")
    parser.use(footnote_plugin)
    parser.enable("strikethrough")
    return parser


def _coalesce(out_nodes: List[Dict]) -> List[Dict]:
    """Collapse runs of neighbouring text nodes whose marks compare equal.

    Non-text nodes, and text nodes whose marks differ from the node before
    them, are kept as separate entries. When two nodes are merged, the text of
    the later one is appended onto the earlier node (which is modified in
    place) and the later node is dropped.
    """
    result: List[Dict] = []
    for current in out_nodes:
        if not result:
            # Nothing to merge into yet.
            result.append(current)
            continue
        previous = result[-1]
        both_text = (
            current.get("type") == NodeType.TEXT
            and previous.get("type") == NodeType.TEXT
        )
        if both_text and current.get("marks") == previous.get("marks"):
            previous["text"] += current["text"]
        else:
            result.append(current)
    return result


def _render_inline(node: SyntaxTreeNode, marks: List[Dict], ctx: Dict) -> List[Dict]:
    """Flatten an inline subtree into a list of text / footnote-anchor nodes.

    ``marks`` holds the marks inherited from enclosing inline nodes; nested
    emphasis, strikethrough and links recurse with their own mark appended.
    ``ctx["order"]`` receives the id of every footnote reference encountered,
    in document order. Inline node types not listed below produce no output.
    """
    rendered: List[Dict] = []
    for child in node.children:
        kind = child.type
        if kind == "text":
            # Empty text nodes are skipped.
            if child.content:
                rendered.append(nodes.text(child.content, marks))
        elif kind == "code_inline":
            code_marks = marks + [nodes.code_mark()]
            rendered.append(nodes.text(child.content, code_marks))
        elif kind in _MARK_FOR:
            nested_marks = marks + [_MARK_FOR[kind]]
            rendered.extend(_render_inline(child, nested_marks, ctx))
        elif kind == "link":
            target = child.attrs.get("href", "")
            link_marks = marks + [nodes.link_mark(target)]
            rendered.extend(_render_inline(child, link_marks, ctx))
        elif kind == "softbreak" or kind == "hardbreak":
            # Both kinds of line break are rendered as a single space.
            rendered.append(nodes.text(" ", marks))
        elif kind == "footnote_ref":
            # Anchors are numbered by their position in the document; the id
            # of the definition each one refers to is remembered so that a
            # matching footnote block can be emitted for it later.
            anchor_ids = ctx["order"]
            anchor_ids.append(child.meta["id"])
            rendered.append(nodes.footnote_anchor(len(anchor_ids)))
        elif kind == "image":
            # An image in the middle of a line is reduced to its alt text.
            alt_text = child.attrs.get("alt")
            if not alt_text:
                alt_text = "".join(
                    part.content for part in child.children if part.type == "text"
                )
            if alt_text:
                rendered.append(nodes.text(alt_text, marks))
    return _coalesce(rendered)


def _only_image(inline: SyntaxTreeNode) -> Optional[SyntaxTreeNode]:
    """Return the image node when ``inline`` holds nothing but one image.

    Soft breaks are ignored when counting children. The image may stand alone
    or be the sole content of a link; in the latter case the link target is
    stashed on the returned node as ``_link_href``. Returns ``None`` for any
    other shape.
    """
    significant = [c for c in inline.children if c.type != "softbreak"]
    if len(significant) != 1:
        return None

    sole = significant[0]
    if sole.type == "image":
        return sole
    if sole.type != "link":
        return None

    link_body = [c for c in sole.children if c.type != "softbreak"]
    if len(link_body) == 1 and link_body[0].type == "image":
        image = link_body[0]
        image._link_href = sole.attrs.get("href")  # type: ignore[attr-defined]
        return image
    return None


def _captioned_image(img: SyntaxTreeNode, api) -> Dict:
    """Build a captioned-image node from a markdown image node.

    Args:
        img: The ``image`` syntax-tree node. If it carries a ``_link_href``
            attribute (set by ``_only_image`` for link-wrapped images), that
            value is passed on as the image's ``href``.
        api: Optional client. When given and the source does not start with
            ``http``, ``api.get_image(src)`` is called and the ``"url"`` entry
            of its result replaces the source.

    Returns:
        The node produced by ``nodes.captioned_image``.

    The upload is best effort: if ``get_image`` raises, or its result has no
    usable ``"url"``, a warning is logged and the original source is kept
    (previously a missing ``"url"`` silently replaced the source with ``None``
    and exceptions were swallowed without trace).
    """
    import logging

    src = img.attrs.get("src", "")
    if src.startswith("/"):
        src = src[1:]
    if api is not None and not src.startswith("http"):
        log = logging.getLogger(__name__)
        try:
            uploaded_url = api.get_image(src).get("url")
        except Exception:
            # Deliberately broad: the client may fail in many ways and a failed
            # upload must not abort rendering of the whole document.
            log.warning(
                "Image upload failed for %r; keeping the original source",
                src,
                exc_info=True,
            )
        else:
            if uploaded_url:
                src = uploaded_url
            else:
                log.warning(
                    "Image upload for %r returned no URL; keeping the original source",
                    src,
                )
    # markdown-it stores the image alt text as the node's content, not in attrs.
    alt = img.content or img.attrs.get("alt") or None
    # Standard markdown image title `![alt](src "caption")` maps to Substack's caption node.
    title = img.attrs.get("title") or None
    caption = [nodes.text(title)] if title else None
    return nodes.captioned_image(
        src,
        alt=alt,
        href=getattr(img, "_link_href", None),
        caption=caption,
    )


def _render_block(node: SyntaxTreeNode, api, ctx: Dict) -> List[Dict]:
    """Render a block-level node into zero or more Substack nodes.

    Args:
        node: Block-level syntax-tree node to render.
        api: Optional client, forwarded to image rendering; may be ``None``.
        ctx: Shared render state; ``ctx["order"]`` collects the ids of the
            footnote references met while rendering inline content.

    Returns:
        A list of rendered nodes. Block types without a branch below
        (including ``footnote_block``) yield an empty list.
    """
    t = node.type

    if t == "paragraph":
        inline = node.children[0]
        # A paragraph consisting solely of an image becomes an image block
        # instead of a paragraph.
        img = _only_image(inline)
        if img is not None:
            return [_captioned_image(img, api)]
        return [nodes.paragraph(_render_inline(inline, [], ctx))]

    if t == "heading":
        # The tag is "h1".."h6"; its digit is the heading level.
        level = int(node.tag[1])
        return [nodes.heading(_render_inline(node.children[0], [], ctx), level=level)]

    if t == "hr":
        return [nodes.horizontal_rule()]

    if t in ("fence", "code_block"):
        # Only the first word of an info string names the language; anything
        # after it (e.g. ``python title="x"``) is free-form and must not leak
        # into the language attribute. An empty info string means no language.
        info_words = node.info.split()
        language = info_words[0] if info_words else None
        return [nodes.code_block(node.content.rstrip("\n"), language=language)]

    if t == "blockquote":
        paras: List[Dict] = []
        for child in node.children:
            paras.extend(_render_block(child, api, ctx))
        return [nodes.blockquote(paras)]

    if t == "bullet_list":
        return [nodes.bullet_list(_render_list_items(node, api, ctx))]

    if t == "ordered_list":
        return [nodes.ordered_list(_render_list_items(node, api, ctx))]

    # footnote_block is handled separately in markdown_to_doc; ignore it here.
    return []


def _render_list_items(list_node: SyntaxTreeNode, api, ctx: Dict) -> List[Dict]:
    """Render every child of a list node into a list-item node.

    Each item's content is the concatenation of the blocks rendered from that
    item's own children, in order.
    """
    rendered_items = []
    for item in list_node.children:
        body: List[Dict] = [
            block
            for part in item.children
            for block in _render_block(part, api, ctx)
        ]
        rendered_items.append({"type": NodeType.LIST_ITEM, "content": body})
    return rendered_items


def _footnote_definitions(tree: SyntaxTreeNode, api) -> Dict[int, List[Dict]]:
    """Render each footnote definition and index the result by footnote id.

    Only top-level ``footnote_block`` nodes of ``tree`` are inspected. Every
    footnote inside one is rendered block by block, and the resulting list is
    stored under the footnote's ``meta["id"]``.
    """
    by_id: Dict[int, List[Dict]] = {}
    footnote_blocks = (n for n in tree.children if n.type == "footnote_block")
    for block in footnote_blocks:
        for footnote in block.children:
            # Rendered against a throwaway context so that references inside a
            # footnote's own text are not recorded in the document-level order.
            scratch_ctx = {"order": []}
            rendered: List[Dict] = []
            for part in footnote.children:
                rendered.extend(_render_block(part, api, scratch_ctx))
            by_id[footnote.meta["id"]] = rendered
    return by_id


def markdown_to_doc(markdown_content: str, api=None) -> List[Dict]:
    """Turn Markdown text into a list of Substack ProseMirror block nodes.

    The body is rendered first, recording footnote references in the order
    they appear. One footnote block per reference is then appended, numbered
    from 1 in that same order, so a definition referenced several times is
    emitted several times (each time as an independent deep copy).
    """
    parser = _make_parser()
    tree = SyntaxTreeNode(parser.parse(markdown_content))

    # Definitions are rendered up front; they are looked up by id below.
    definitions = _footnote_definitions(tree, api)

    ctx: Dict = {"order": []}
    blocks: List[Dict] = []
    for node in tree.children:
        if node.type != "footnote_block":
            blocks.extend(_render_block(node, api, ctx))

    # ctx["order"] now lists the referenced footnote ids in anchor order.
    for position, footnote_id in enumerate(ctx["order"]):
        body = copy.deepcopy(definitions.get(footnote_id, []))
        blocks.append(nodes.footnote(position + 1, body))

    return blocks
Loading