-
Notifications
You must be signed in to change notification settings - Fork 643
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Documentation parser fixes #4853
Changes from 1 commit
a04af62
73c2aaa
8241c58
d415a12
2abbc31
5ac35a1
0472e12
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,10 +7,11 @@ | |
using System.Linq; | ||
using System.Net.Http; | ||
using System.Text; | ||
using System.Text.RegularExpressions; | ||
using System.Threading.Tasks; | ||
using System.Web; | ||
using CommonMark; | ||
using System.Text.RegularExpressions; | ||
using CommonMark.Syntax; | ||
|
||
namespace NuGetGallery | ||
{ | ||
|
@@ -141,6 +142,30 @@ public async Task<bool> SavePendingReadMeMdIfChanged(Package package, EditPackag | |
|
||
return hasReadMe; | ||
} | ||
|
||
/// <summary> | ||
/// HTML encode content except for markdown blockquotes. | ||
/// </summary> | ||
/// <param name="readMeMd">ReadMe.md content.</param> | ||
/// <returns>Encoded ReadMe.md content.</returns> | ||
private static string HtmlEncodeExceptBlockquotes(string readMeMd) | ||
{ | ||
var encoded = HttpUtility.HtmlEncode(readMeMd); | ||
var encodedLines = encoded.Replace("\r\n", "\n").Split('\n'); | ||
|
||
var blockQuotePattern = new Regex("^ {0,3}>"); | ||
for (int i = 0; i < encodedLines.Length; i++) | ||
{ | ||
var match = blockQuotePattern.Match(encodedLines[i]); | ||
if (match.Success) | ||
{ | ||
// For now, only decode the blockquote marker and not any inline html. | ||
encodedLines[i] = "> " + encodedLines[i].Substring(match.Length); | ||
} | ||
} | ||
|
||
return string.Join(Environment.NewLine, encodedLines); | ||
} | ||
|
||
/// <summary> | ||
/// Get converted HTML for readme.md string content. | ||
|
@@ -149,10 +174,65 @@ public async Task<bool> SavePendingReadMeMdIfChanged(Package package, EditPackag | |
/// <returns>HTML content.</returns> | ||
internal static string GetReadMeHtml(string readMeMd) | ||
{ | ||
var encodedMarkdown = HttpUtility.HtmlEncode(readMeMd); | ||
var regex = new Regex("<a href=([\"\']).*?\\1"); | ||
var converted = CommonMarkConverter.Convert(encodedMarkdown); | ||
return regex.Replace(converted, "$0" + " rel=\"nofollow\""); | ||
// HTML encode markdown to block inline html. | ||
var encodedMarkdown = HtmlEncodeExceptBlockquotes(readMeMd); | ||
|
||
var settings = CommonMarkSettings.Default.Clone(); | ||
settings.RenderSoftLineBreaksAsLineBreaks = true; | ||
|
||
// Parse executes CommonMarkConverter's ProcessStage1 and ProcessStage2. | ||
var document = CommonMarkConverter.Parse(encodedMarkdown, settings); | ||
foreach (var node in document.AsEnumerable()) | ||
{ | ||
if (node.IsOpening) | ||
{ | ||
var block = node.Block; | ||
if (block != null) | ||
{ | ||
switch (block.Tag) | ||
{ | ||
// Demote heading tags so they don't overpower expander headings. | ||
case BlockTag.AtxHeading: | ||
case BlockTag.SetextHeading: | ||
var level = (byte)Math.Min(block.Heading.Level + 1, 6); | ||
block.Heading = new HeadingData(level); | ||
break; | ||
|
||
// Decode preformatted blocks to prevent double encoding. | ||
// Skip BlockTag.BlockQuote, which are already decoded. | ||
case BlockTag.FencedCode: | ||
case BlockTag.IndentedCode: | ||
if (block.StringContent != null) | ||
{ | ||
var content = block.StringContent?.TakeFromStart(block.StringContent.Length); | ||
var unencodedContent = HttpUtility.HtmlDecode(content); | ||
block.StringContent.Replace(unencodedContent, 0, unencodedContent.Length); | ||
} | ||
break; | ||
} | ||
} | ||
|
||
var inline = node.Inline; | ||
if (inline != null) | ||
{ | ||
// Block javascript in links. | ||
if ((inline.Tag == InlineTag.Link) && | ||
(inline.TargetUrl.IndexOf("javascript", StringComparison.InvariantCultureIgnoreCase) >= 0)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will change to only allow target urls that use http/https. |
||
{ | ||
inline.TargetUrl = ""; | ||
} | ||
} | ||
} | ||
} | ||
|
||
// CommonMark.Net does not support link attributes, so manually inject nofollow. | ||
using (var htmlWriter = new StringWriter()) | ||
{ | ||
CommonMarkConverter.ProcessStage3(document, htmlWriter, settings); | ||
|
||
var regex = new Regex("<a href=([\"\']).*?\\1"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Consider making this regex also a static readonly. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also, is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, the current version always has href attribute first, then possible title attribute. There's an open issue to add support for other link attributes, but it won't be added until the commonmark spec includes it. |
||
return regex.Replace(htmlWriter.ToString(), "$0" + " rel=\"nofollow\""); | ||
} | ||
} | ||
|
||
/// <summary> | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would it be better if
blockQuotePattern
was a static readonly property?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also, this
HtmlEncodeExceptBlockquotes
method seems rather expensive. Would it be worthwhile to check ifreadMeMd
has the>
substring, and if it doesn't, simply returnencoded
?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will play with this to see if I can optimize it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Changed to single MultiLine regex applied to entire markdown