Use a tree‐based approach for advanced text formatting (#1907)
* Use a tree‐based approach for adv. text formatting Sanitizing HTML/Markdown means parsing the content into an HTML tree under‐the‐hood anyway, and it is more accurate to do mention/hashtag replacement on the text nodes in that tree than it is to try to hack it in with regexes et cetera. This undoes the overrides of `#entities` and `#rewrite` on `AdvancedTextFormatter` but also stops using them, instead keeping track of the parsed Nokogiri tree itself and using that in the `#to_s` method. Internally, this tree uses `<mastodon-entity>` nodes to keep track of hashtags, links, and mentions. Sanitization is moved to the beginning, so it should be known that these do not appear in the input. * Also disallow entities inside of `<code>` I think this is generally expected behaviour, and people are annoyed when their code gets turned into links/hashtags/mentions. * Minor cleanup to AdvancedTextFormatter * Change AdvancedTextFormatter to rewrite entities in one pass and sanitize at the end Also, minor refactoring to better match how other formatters are organized. * Add some tests Co-authored-by: Claire <claire.github-309c@sitedethib.com>pull/41/head
parent
4ac6601476
commit
2cabc5d188
|
@ -19,6 +19,8 @@ class AdvancedTextFormatter < TextFormatter
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
attr_reader :content_type
|
||||||
|
|
||||||
# @param [String] text
|
# @param [String] text
|
||||||
# @param [Hash] options
|
# @param [Hash] options
|
||||||
# @option options [Boolean] :multiline
|
# @option options [Boolean] :multiline
|
||||||
|
@ -27,7 +29,7 @@ class AdvancedTextFormatter < TextFormatter
|
||||||
# @option options [Array<Account>] :preloaded_accounts
|
# @option options [Array<Account>] :preloaded_accounts
|
||||||
# @option options [String] :content_type
|
# @option options [String] :content_type
|
||||||
def initialize(text, options = {})
|
def initialize(text, options = {})
|
||||||
content_type = options.delete(:content_type)
|
@content_type = options.delete(:content_type)
|
||||||
super(text, options)
|
super(text, options)
|
||||||
|
|
||||||
@text = format_markdown(text) if content_type == 'text/markdown'
|
@text = format_markdown(text) if content_type == 'text/markdown'
|
||||||
|
@ -50,50 +52,50 @@ class AdvancedTextFormatter < TextFormatter
|
||||||
html.html_safe # rubocop:disable Rails/OutputSafety
|
html.html_safe # rubocop:disable Rails/OutputSafety
|
||||||
end
|
end
|
||||||
|
|
||||||
# Differs from `TextFormatter` by skipping HTML tags and entities
|
# Differs from TextFormatter by operating on the parsed HTML tree
|
||||||
def entities
|
def rewrite
|
||||||
@entities ||= begin
|
if @tree.nil?
|
||||||
gaps = []
|
src = text.gsub(Sanitize::REGEX_UNSUITABLE_CHARS, '')
|
||||||
total_offset = 0
|
@tree = Nokogiri::HTML5.fragment(src)
|
||||||
|
document = @tree.document
|
||||||
|
|
||||||
escaped = text.gsub(/<[^>]*>|&#[0-9]+;/) do |match|
|
@tree.xpath('.//text()[not(ancestor::a | ancestor::code)]').each do |text_node|
|
||||||
total_offset += match.length - 1
|
# Iterate over text elements and build up their replacements.
|
||||||
end_offset = Regexp.last_match.end(0)
|
content = text_node.content
|
||||||
gaps << [end_offset - total_offset, total_offset]
|
replacement = Nokogiri::XML::NodeSet.new(document)
|
||||||
' '
|
processed_index = 0
|
||||||
end
|
Extractor.extract_entities_with_indices(
|
||||||
|
content,
|
||||||
Extractor.extract_entities_with_indices(escaped, extract_url_without_protocol: false).map do |entity|
|
extract_url_without_protocol: false
|
||||||
start_pos, end_pos = entity[:indices]
|
) do |entity|
|
||||||
offset_idx = gaps.rindex { |gap| gap.first <= start_pos }
|
# Iterate over entities in this text node.
|
||||||
offset = offset_idx.nil? ? 0 : gaps[offset_idx].last
|
advance = entity[:indices].first - processed_index
|
||||||
entity.merge(indices: [start_pos + offset, end_pos + offset])
|
if advance.positive?
|
||||||
|
# Text node for content which precedes entity.
|
||||||
|
replacement << Nokogiri::XML::Text.new(
|
||||||
|
content[processed_index, advance],
|
||||||
|
document
|
||||||
|
)
|
||||||
|
end
|
||||||
|
replacement << Nokogiri::HTML5.fragment(yield(entity))
|
||||||
|
processed_index = entity[:indices].last
|
||||||
|
end
|
||||||
|
if processed_index < content.size
|
||||||
|
# Text node for remaining content.
|
||||||
|
replacement << Nokogiri::XML::Text.new(
|
||||||
|
content[processed_index, content.size - processed_index],
|
||||||
|
document
|
||||||
|
)
|
||||||
|
end
|
||||||
|
text_node.replace(replacement)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
Sanitize.node!(@tree, Sanitize::Config::MASTODON_OUTGOING).to_html
|
||||||
end
|
end
|
||||||
|
|
||||||
private
|
private
|
||||||
|
|
||||||
# Differs from `TextFormatter` in that it keeps HTML; but it sanitizes at the end to remain safe
|
|
||||||
def rewrite
|
|
||||||
entities.sort_by! do |entity|
|
|
||||||
entity[:indices].first
|
|
||||||
end
|
|
||||||
|
|
||||||
result = ''.dup
|
|
||||||
|
|
||||||
last_index = entities.reduce(0) do |index, entity|
|
|
||||||
indices = entity[:indices]
|
|
||||||
result << text[index...indices.first]
|
|
||||||
result << yield(entity)
|
|
||||||
indices.last
|
|
||||||
end
|
|
||||||
|
|
||||||
result << text[last_index..-1]
|
|
||||||
|
|
||||||
Sanitize.fragment(result, Sanitize::Config::MASTODON_OUTGOING)
|
|
||||||
end
|
|
||||||
|
|
||||||
def format_markdown(html)
|
def format_markdown(html)
|
||||||
html = markdown_formatter.render(html)
|
html = markdown_formatter.render(html)
|
||||||
html.delete("\r").delete("\n")
|
html.delete("\r").delete("\n")
|
||||||
|
|
|
@ -35,7 +35,7 @@ RSpec.describe AdvancedTextFormatter do
|
||||||
end
|
end
|
||||||
|
|
||||||
context 'given a block code' do
|
context 'given a block code' do
|
||||||
let(:text) { "test\n\n```\nint main(void) {\n return 0;\n}\n```\n" }
|
let(:text) { "test\n\n```\nint main(void) {\n return 0; // https://joinmastodon.org/foo\n}\n```\n" }
|
||||||
|
|
||||||
it 'formats code using <pre> and <code>' do
|
it 'formats code using <pre> and <code>' do
|
||||||
is_expected.to include '<pre><code>int main'
|
is_expected.to include '<pre><code>int main'
|
||||||
|
@ -44,13 +44,17 @@ RSpec.describe AdvancedTextFormatter do
|
||||||
it 'does not strip leading spaces' do
|
it 'does not strip leading spaces' do
|
||||||
is_expected.to include '> return 0'
|
is_expected.to include '> return 0'
|
||||||
end
|
end
|
||||||
|
|
||||||
|
it 'does not format links' do
|
||||||
|
is_expected.to include 'return 0; // https://joinmastodon.org/foo'
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
context 'given some quote' do
|
context 'given a link in inline code using backticks' do
|
||||||
let(:text) { "> foo\n\nbar" }
|
let(:text) { 'test `https://foo.bar/bar` bar' }
|
||||||
|
|
||||||
it 'formats code using <code>' do
|
it 'does not rewrite the link' do
|
||||||
is_expected.to include '<blockquote><p>foo</p></blockquote>'
|
is_expected.to include 'test <code>https://foo.bar/bar</code> bar'
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue