Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Documentation parser fixes #4853

Merged
merged 7 commits into from
Oct 23, 2017
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 85 additions & 5 deletions src/NuGetGallery/Services/ReadMeService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@
using System.Linq;
using System.Net.Http;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using System.Web;
using CommonMark;
using System.Text.RegularExpressions;
using CommonMark.Syntax;

namespace NuGetGallery
{
Expand Down Expand Up @@ -141,6 +142,30 @@ public async Task<bool> SavePendingReadMeMdIfChanged(Package package, EditPackag

return hasReadMe;
}

/// <summary>
/// HTML encode content except for markdown blockquotes.
/// </summary>
/// <param name="readMeMd">ReadMe.md content.</param>
/// <returns>Encoded ReadMe.md content.</returns>
private static string HtmlEncodeExceptBlockquotes(string readMeMd)
{
var encoded = HttpUtility.HtmlEncode(readMeMd);
var encodedLines = encoded.Replace("\r\n", "\n").Split('\n');

var blockQuotePattern = new Regex("^ {0,3}&gt;");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be better if blockQuotePattern was a static readonly property?

Copy link
Contributor

@loic-sharma loic-sharma Oct 17, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, this HtmlEncodeExceptBlockquotes method seems rather expensive. Would it be worthwhile to check if readMeMd has the &gt; substring, and if it doesn't, simply return encoded?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will play with this to see if I can optimize it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changed to single MultiLine regex applied to entire markdown

for (int i = 0; i < encodedLines.Length; i++)
{
var match = blockQuotePattern.Match(encodedLines[i]);
if (match.Success)
{
// For now, only decode the blockquote marker and not any inline html.
encodedLines[i] = "> " + encodedLines[i].Substring(match.Length);
}
}

return string.Join(Environment.NewLine, encodedLines);
}

/// <summary>
/// Get converted HTML for readme.md string content.
Expand All @@ -149,10 +174,65 @@ public async Task<bool> SavePendingReadMeMdIfChanged(Package package, EditPackag
/// <returns>HTML content.</returns>
internal static string GetReadMeHtml(string readMeMd)
{
var encodedMarkdown = HttpUtility.HtmlEncode(readMeMd);
var regex = new Regex("<a href=([\"\']).*?\\1");
var converted = CommonMarkConverter.Convert(encodedMarkdown);
return regex.Replace(converted, "$0" + " rel=\"nofollow\"");
// HTML encode markdown to block inline html.
var encodedMarkdown = HtmlEncodeExceptBlockquotes(readMeMd);

var settings = CommonMarkSettings.Default.Clone();
settings.RenderSoftLineBreaksAsLineBreaks = true;

// Parse executes CommonMarkConverter's ProcessStage1 and ProcessStage2.
var document = CommonMarkConverter.Parse(encodedMarkdown, settings);
foreach (var node in document.AsEnumerable())
{
if (node.IsOpening)
{
var block = node.Block;
if (block != null)
{
switch (block.Tag)
{
// Demote heading tags so they don't overpower expander headings.
case BlockTag.AtxHeading:
case BlockTag.SetextHeading:
var level = (byte)Math.Min(block.Heading.Level + 1, 6);
block.Heading = new HeadingData(level);
break;

// Decode preformatted blocks to prevent double encoding.
// Skip BlockTag.BlockQuote, which are already decoded.
case BlockTag.FencedCode:
case BlockTag.IndentedCode:
if (block.StringContent != null)
{
var content = block.StringContent?.TakeFromStart(block.StringContent.Length);
var unencodedContent = HttpUtility.HtmlDecode(content);
block.StringContent.Replace(unencodedContent, 0, unencodedContent.Length);
}
break;
}
}

var inline = node.Inline;
if (inline != null)
{
// Block javascript in links.
if ((inline.Tag == InlineTag.Link) &&
(inline.TargetUrl.IndexOf("javascript", StringComparison.InvariantCultureIgnoreCase) >= 0))
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will change to only allow target urls that use http/https.

{
inline.TargetUrl = "";
}
}
}
}

// CommonMark.Net does not support link attributes, so manually inject nofollow.
using (var htmlWriter = new StringWriter())
{
CommonMarkConverter.ProcessStage3(document, htmlWriter, settings);

var regex = new Regex("<a href=([\"\']).*?\\1");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider making this regex also a static readonly.

Copy link
Contributor

@loic-sharma loic-sharma Oct 17, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, is CommonMarkConverter guaranteed to always generate links with the format <a href=? If not, this regex may not match all links. Would a simple string replace <a to <a rel="nofollow" work too?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the current version always has href attribute first, then possible title attribute. There's an open issue to add support for other link attributes, but it won't be added until the commonmark spec includes it.

return regex.Replace(htmlWriter.ToString(), "$0" + " rel=\"nofollow\"");
}
}

/// <summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -507,8 +507,8 @@ public async Task WhenHasReadMeAndMarkdownExists_ReturnsContent()

// Assert
var model = ResultAssert.IsView<DisplayPackageViewModel>(result);
Assert.Equal("<h1>Hello World!</h1>", model.ReadMeHtml);
Assert.Equal("<h1>Hello World!</h1>", model.ReadMeHtmlClamped);
Assert.Equal("<h2>Hello World!</h2>", model.ReadMeHtml);
Assert.Equal("<h2>Hello World!</h2>", model.ReadMeHtmlClamped);
}

[Fact]
Expand Down
4 changes: 3 additions & 1 deletion tests/NuGetGallery.Facts/Services/ReadMeServiceFacts.cs
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,11 @@ public void EncodesHtmlInMarkdown(string originalMd, string expectedHtml)
}

[Theory]
[InlineData("# Heading", "<h1>Heading</h1>")]
[InlineData("# Heading", "<h2>Heading</h2>")]
[InlineData("- List", "<ul><li>List</li></ul>")]
[InlineData("[text](http://www.test.com)", "<p><a href=\"http://www.test.com\" rel=\"nofollow\">text</a></p>")]
[InlineData("[text](javascript:alert('hi'))", "<p><a href=\"\" rel=\"nofollow\">text</a></p>")]
[InlineData("> <text>Blockquote</text>", "<blockquote><p>&lt;text&gt;Blockquote&lt;/text&gt;</p></blockquote>")]
public void ConvertsMarkdownToHtml(string originalMd, string expectedHtml)
{
Assert.Equal(expectedHtml, StripNewLines(ReadMeService.GetReadMeHtml(originalMd)));
Expand Down