Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for extending inline parsing with custom inline content parsers #321

Merged
merged 10 commits into from
Apr 26, 2024
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@ This project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html
with the exception that 0.x versions can break between minor versions.

## Unreleased
### Added
- Support for extending inline parsing with custom inline content parsers! See
`Parser.Builder#customInlineContentParserFactory`. This allows users or
extensions to hook into inline parsing on a deeper level than using delimiter
processors. For example, it could be used to implement support for math/LaTeX
formulas.
### Fixed
- Fix parsing of link reference definitions where it looks like it has a title
but it doesn't because it's followed by characters other than space/tab. In that
Expand Down
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,19 @@ elements in the resulting HTML, you can create your own subclass of
To define the HTML rendering for them, you can use a `NodeRenderer` as
explained above.

#### Customize parsing

There are a few ways to extend parsing or even override built-in parsing,
all of them via methods on `Parser.Builder`
(see [Blocks and inlines](https://spec.commonmark.org/0.31.2/#blocks-and-inlines) in the spec for an overview of blocks/inlines):

- Parsing of specific block types (e.g. headings, code blocks) can be
enabled/disabled with `enabledBlockTypes`
- Parsing of blocks can be extended/overridden with `customBlockParserFactory`
- Parsing of inline content can be extended/overridden with `customInlineContentParserFactory`
- Parsing of [delimiters](https://spec.commonmark.org/0.31.2/#emphasis-and-strong-emphasis) in inline content can be
extended with `customDelimiterProcessor`

#### Thread-safety

Both the `Parser` and `HtmlRenderer` are designed so that you can
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.commonmark.internal;

import org.commonmark.parser.beta.InlineContentParserFactory;
import org.commonmark.internal.util.Parsing;
import org.commonmark.node.*;
import org.commonmark.parser.*;
Expand Down Expand Up @@ -66,6 +67,7 @@ public class DocumentParser implements ParserState {

private final List<BlockParserFactory> blockParserFactories;
private final InlineParserFactory inlineParserFactory;
private final List<InlineContentParserFactory> inlineContentParserFactories;
private final List<DelimiterProcessor> delimiterProcessors;
private final IncludeSourceSpans includeSourceSpans;
private final DocumentBlockParser documentBlockParser;
Expand All @@ -75,9 +77,11 @@ public class DocumentParser implements ParserState {
private final List<BlockParser> allBlockParsers = new ArrayList<>();

public DocumentParser(List<BlockParserFactory> blockParserFactories, InlineParserFactory inlineParserFactory,
List<DelimiterProcessor> delimiterProcessors, IncludeSourceSpans includeSourceSpans) {
List<InlineContentParserFactory> inlineContentParserFactories, List<DelimiterProcessor> delimiterProcessors,
IncludeSourceSpans includeSourceSpans) {
this.blockParserFactories = blockParserFactories;
this.inlineParserFactory = inlineParserFactory;
this.inlineContentParserFactories = inlineContentParserFactories;
this.delimiterProcessors = delimiterProcessors;
this.includeSourceSpans = includeSourceSpans;

Expand Down Expand Up @@ -477,7 +481,7 @@ private void addDefinitionsFrom(ParagraphParser paragraphParser) {
* Walk through a block & children recursively, parsing string content into inline content where appropriate.
*/
private void processInlines() {
InlineParserContextImpl context = new InlineParserContextImpl(delimiterProcessors, definitions);
InlineParserContextImpl context = new InlineParserContextImpl(inlineContentParserFactories, delimiterProcessors, definitions);
InlineParser inlineParser = inlineParserFactory.create(context);

for (BlockParser blockParser : allBlockParsers) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,23 +1,31 @@
package org.commonmark.internal;

import org.commonmark.parser.beta.InlineContentParserFactory;
import org.commonmark.node.LinkReferenceDefinition;
import org.commonmark.parser.InlineParserContext;
import org.commonmark.parser.delimiter.DelimiterProcessor;

import java.util.List;
import java.util.Map;

public class InlineParserContextImpl implements InlineParserContext {

private final List<InlineContentParserFactory> inlineContentParserFactories;
private final List<DelimiterProcessor> delimiterProcessors;
private final LinkReferenceDefinitions linkReferenceDefinitions;

public InlineParserContextImpl(List<DelimiterProcessor> delimiterProcessors,
public InlineParserContextImpl(List<InlineContentParserFactory> inlineContentParserFactories,
List<DelimiterProcessor> delimiterProcessors,
LinkReferenceDefinitions linkReferenceDefinitions) {
this.inlineContentParserFactories = inlineContentParserFactories;
this.delimiterProcessors = delimiterProcessors;
this.linkReferenceDefinitions = linkReferenceDefinitions;
}

/**
 * Returns the inline content parser factories that were handed to this context's constructor.
 * The stored list is returned directly, without copying.
 */
@Override
public List<InlineContentParserFactory> getCustomInlineContentParserFactories() {
return inlineContentParserFactories;
}

@Override
public List<DelimiterProcessor> getCustomDelimiterProcessors() {
return delimiterProcessors;
Expand Down
118 changes: 69 additions & 49 deletions commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import org.commonmark.parser.InlineParser;
import org.commonmark.parser.InlineParserContext;
import org.commonmark.parser.SourceLines;
import org.commonmark.parser.beta.Position;
import org.commonmark.parser.beta.*;
import org.commonmark.parser.beta.Scanner;
import org.commonmark.parser.delimiter.DelimiterProcessor;
import org.commonmark.text.Characters;
Expand All @@ -16,11 +16,12 @@

public class InlineParserImpl implements InlineParser, InlineParserState {

private final BitSet specialCharacters;
private final Map<Character, DelimiterProcessor> delimiterProcessors;
private final InlineParserContext context;
private final Map<Character, List<InlineContentParser>> inlineParsers;
private final List<InlineContentParserFactory> inlineContentParserFactories;
private final Map<Character, DelimiterProcessor> delimiterProcessors;
private final BitSet specialCharacters;

private Map<Character, List<InlineContentParser>> inlineParsers;
private Scanner scanner;
private boolean includeSourceSpans;
private int trailingSpaces;
Expand All @@ -36,46 +37,31 @@ public class InlineParserImpl implements InlineParser, InlineParserState {
*/
private Bracket lastBracket;

public InlineParserImpl(InlineParserContext inlineParserContext) {
this.delimiterProcessors = calculateDelimiterProcessors(inlineParserContext.getCustomDelimiterProcessors());

this.context = inlineParserContext;
this.inlineParsers = new HashMap<>();
this.inlineParsers.put('\\', Collections.<InlineContentParser>singletonList(new BackslashInlineParser()));
this.inlineParsers.put('`', Collections.<InlineContentParser>singletonList(new BackticksInlineParser()));
this.inlineParsers.put('&', Collections.<InlineContentParser>singletonList(new EntityInlineParser()));
this.inlineParsers.put('<', Arrays.asList(new AutolinkInlineParser(), new HtmlInlineParser()));

this.specialCharacters = calculateSpecialCharacters(this.delimiterProcessors.keySet(), inlineParsers.keySet());
public InlineParserImpl(InlineParserContext context) {
this.context = context;
this.inlineContentParserFactories = calculateInlineContentParserFactories(context.getCustomInlineContentParserFactories());
this.delimiterProcessors = calculateDelimiterProcessors(context.getCustomDelimiterProcessors());
this.specialCharacters = calculateSpecialCharacters(this.delimiterProcessors.keySet(), this.inlineContentParserFactories);
}

public static BitSet calculateSpecialCharacters(Set<Character> delimiterCharacters, Set<Character> characters) {
BitSet bitSet = new BitSet();
for (Character c : delimiterCharacters) {
bitSet.set(c);
}
for (Character c : characters) {
bitSet.set(c);
}
bitSet.set('[');
bitSet.set(']');
bitSet.set('!');
bitSet.set('\n');
return bitSet;
/**
 * Builds the full, ordered list of inline content parser factories used by this parser.
 *
 * @param customFactories factories supplied through the parser context
 * @return a new mutable list holding the custom factories followed by the built-in ones
 */
private List<InlineContentParserFactory> calculateInlineContentParserFactories(List<InlineContentParserFactory> customFactories) {
    List<InlineContentParserFactory> builtInFactories = List.of(
            new BackslashInlineParser.Factory(),
            new BackticksInlineParser.Factory(),
            new EntityInlineParser.Factory(),
            new AutolinkInlineParser.Factory(),
            new HtmlInlineParser.Factory());

    // Custom factories go first: they are allowed to override built-in parsing,
    // so their parsers must get the first chance at a trigger character.
    List<InlineContentParserFactory> allFactories =
            new ArrayList<>(customFactories.size() + builtInFactories.size());
    allFactories.addAll(customFactories);
    allFactories.addAll(builtInFactories);
    return allFactories;
}

public static Map<Character, DelimiterProcessor> calculateDelimiterProcessors(List<DelimiterProcessor> delimiterProcessors) {
Map<Character, DelimiterProcessor> map = new HashMap<>();
addDelimiterProcessors(Arrays.<DelimiterProcessor>asList(new AsteriskDelimiterProcessor(), new UnderscoreDelimiterProcessor()), map);
private static Map<Character, DelimiterProcessor> calculateDelimiterProcessors(List<DelimiterProcessor> delimiterProcessors) {
var map = new HashMap<Character, DelimiterProcessor>();
addDelimiterProcessors(List.of(new AsteriskDelimiterProcessor(), new UnderscoreDelimiterProcessor()), map);
addDelimiterProcessors(delimiterProcessors, map);
return map;
}

@Override
public Scanner scanner() {
return scanner;
}

private static void addDelimiterProcessors(Iterable<DelimiterProcessor> delimiterProcessors, Map<Character, DelimiterProcessor> map) {
for (DelimiterProcessor delimiterProcessor : delimiterProcessors) {
char opening = delimiterProcessor.getOpeningCharacter();
Expand Down Expand Up @@ -109,6 +95,40 @@ private static void addDelimiterProcessorForChar(char delimiterChar, DelimiterPr
}
}

/**
 * Computes the set of characters that need special handling during inline parsing.
 *
 * @param delimiterCharacters          characters claimed by delimiter processors
 * @param inlineContentParserFactories factories whose trigger characters are included
 * @return a bit set with one bit set per special character: the delimiter characters, every
 * factory trigger character, and the fixed characters {@code [}, {@code ]}, {@code !} and newline
 */
private static BitSet calculateSpecialCharacters(Set<Character> delimiterCharacters,
                                                 List<InlineContentParserFactory> inlineContentParserFactories) {
    BitSet special = new BitSet();

    for (char delimiter : delimiterCharacters) {
        special.set(delimiter);
    }

    for (InlineContentParserFactory factory : inlineContentParserFactories) {
        for (char trigger : factory.getTriggerCharacters()) {
            special.set(trigger);
        }
    }

    // Brackets, bang and line breaks are always special, independent of any processor or factory.
    char[] alwaysSpecial = {'[', ']', '!', '\n'};
    for (char fixed : alwaysSpecial) {
        special.set(fixed);
    }

    return special;
}

/**
 * Creates a fresh set of inline content parsers, indexed by the characters that trigger them.
 *
 * @return a new map from trigger character to the parsers registered for it, in factory order
 */
private Map<Character, List<InlineContentParser>> createInlineContentParsers() {
    Map<Character, List<InlineContentParser>> parsersByTrigger = new HashMap<>();

    for (InlineContentParserFactory factory : inlineContentParserFactories) {
        // A single parser instance per factory is shared by all of that factory's trigger characters.
        InlineContentParser parser = factory.create();

        for (Character trigger : factory.getTriggerCharacters()) {
            List<InlineContentParser> registered = parsersByTrigger.get(trigger);
            if (registered == null) {
                registered = new ArrayList<>();
                parsersByTrigger.put(trigger, registered);
            }
            registered.add(parser);
        }
    }

    return parsersByTrigger;
}

/**
 * Returns the scanner currently held in this parser's {@code scanner} field.
 */
@Override
public Scanner scanner() {
return scanner;
}

/**
* Parse content in block into inline children, appending them to the block node.
*/
Expand All @@ -117,14 +137,13 @@ public void parse(SourceLines lines, Node block) {
reset(lines);

while (true) {
List<? extends Node> nodes = parseInline();
if (nodes != null) {
for (Node node : nodes) {
block.appendChild(node);
}
} else {
var nodes = parseInline();
if (nodes == null) {
break;
}
for (Node node : nodes) {
block.appendChild(node);
}
}

processDelimiters(null);
Expand All @@ -137,6 +156,7 @@ void reset(SourceLines lines) {
this.trailingSpaces = 0;
this.lastDelimiter = null;
this.lastBracket = null;
this.inlineParsers = createInlineContentParsers();
}

private Text text(SourceLines sourceLines) {
Expand All @@ -155,20 +175,20 @@ private List<? extends Node> parseInline() {

switch (c) {
case '[':
return Collections.singletonList(parseOpenBracket());
return List.of(parseOpenBracket());
case '!':
return Collections.singletonList(parseBang());
return List.of(parseBang());
case ']':
return Collections.singletonList(parseCloseBracket());
return List.of(parseCloseBracket());
case '\n':
return Collections.singletonList(parseLineBreak());
return List.of(parseLineBreak());
case Scanner.END:
return null;
}

// No inline parser, delimiter or other special handling.
if (!specialCharacters.get(c)) {
return Collections.singletonList(parseText());
return List.of(parseText());
}

List<InlineContentParser> inlineParsers = this.inlineParsers.get(c);
Expand All @@ -183,7 +203,7 @@ private List<? extends Node> parseInline() {
if (includeSourceSpans && node.getSourceSpans().isEmpty()) {
node.setSourceSpans(scanner.getSource(position, scanner.position()).getSourceSpans());
}
return Collections.singletonList(node);
return List.of(node);
} else {
// Reset position
scanner.setPosition(position);
Expand All @@ -200,7 +220,7 @@ private List<? extends Node> parseInline() {
}

// If we get here, even for a special/delimiter character, we will just treat it as text.
return Collections.singletonList(parseText());
return List.of(parseText());
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
import org.commonmark.node.Link;
import org.commonmark.node.Text;
import org.commonmark.parser.SourceLines;
import org.commonmark.parser.beta.Position;
import org.commonmark.parser.beta.Scanner;
import org.commonmark.parser.beta.*;

import java.util.Set;
import java.util.regex.Pattern;

/**
Expand Down Expand Up @@ -46,4 +46,16 @@ public ParsedInline tryParse(InlineParserState inlineParserState) {
}
return ParsedInline.none();
}

/**
 * Factory for {@link AutolinkInlineParser}, triggered by the {@code <} character.
 */
public static class Factory implements InlineContentParserFactory {
/**
 * @return the single trigger character {@code <}
 */
@Override
public Set<Character> getTriggerCharacters() {
return Set.of('<');
}

/**
 * @return a new {@link AutolinkInlineParser} instance
 */
@Override
public InlineContentParser create() {
return new AutolinkInlineParser();
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
import org.commonmark.internal.util.Escaping;
import org.commonmark.node.HardLineBreak;
import org.commonmark.node.Text;
import org.commonmark.parser.beta.Scanner;
import org.commonmark.parser.beta.*;

import java.util.Set;
import java.util.regex.Pattern;

/**
Expand Down Expand Up @@ -32,4 +33,16 @@ public ParsedInline tryParse(InlineParserState inlineParserState) {
return ParsedInline.of(new Text("\\"), scanner.position());
}
}

/**
 * Factory for {@link BackslashInlineParser}, triggered by the backslash character.
 */
public static class Factory implements InlineContentParserFactory {
/**
 * @return the single trigger character, a backslash
 */
@Override
public Set<Character> getTriggerCharacters() {
return Set.of('\\');
}

/**
 * @return a new {@link BackslashInlineParser} instance
 */
@Override
public InlineContentParser create() {
return new BackslashInlineParser();
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
import org.commonmark.node.Code;
import org.commonmark.node.Text;
import org.commonmark.parser.SourceLines;
import org.commonmark.parser.beta.Position;
import org.commonmark.parser.beta.Scanner;
import org.commonmark.parser.beta.*;
import org.commonmark.text.Characters;

import java.util.Set;

/**
* Attempt to parse backticks, returning either a backtick code span or a literal sequence of backticks.
*/
Expand Down Expand Up @@ -47,4 +48,16 @@ public ParsedInline tryParse(InlineParserState inlineParserState) {
Text text = new Text(source.getContent());
return ParsedInline.of(text, afterOpening);
}

/**
 * Factory for {@link BackticksInlineParser}, triggered by the backtick character.
 */
public static class Factory implements InlineContentParserFactory {
/**
 * @return the single trigger character, a backtick
 */
@Override
public Set<Character> getTriggerCharacters() {
return Set.of('`');
}

/**
 * @return a new {@link BackticksInlineParser} instance
 */
@Override
public InlineContentParser create() {
return new BackticksInlineParser();
}
}
}
Loading
Loading