Skip to content

Commit

Permalink
Implements CSS3Draft
Browse files Browse the repository at this point in the history
  • Loading branch information
henry0312 committed Oct 24, 2023
1 parent 8c6830d commit 6b3067e
Show file tree
Hide file tree
Showing 5 changed files with 547 additions and 29 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -387,7 +387,7 @@ This extension provides additional options for CJK users.
| Style | Description |
| ----- | ----------- |
| `EastAsianLineBreaksStyleSimple` | Soft line breaks are ignored if both sides of the break are east asian wide characters. This behavior is the same as [`east_asian_line_breaks`](https://pandoc.org/MANUAL.html#extension-east_asian_line_breaks) in Pandoc. |
| `EastAsianLineBreaksCSS3Draft` | Soft line breaks are ignored even if only one side of the break is east asian wide character. |
| `EastAsianLineBreaksCSS3Draft` | This option implements the CSS Text Level 3 [Segment Break Transformation Rules](https://drafts.csswg.org/css-text-3/#line-break-transform) with [some enhancements](https://github.com/w3c/csswg-drafts/issues/5086). |

#### Example of `EastAsianLineBreaksStyleSimple`

Expand All @@ -401,7 +401,7 @@ GoでWebアプリケーションを開発しています。

Output:

```md
```html
<p>私はプログラマーです。東京の会社に勤めています。\nGoでWebアプリケーションを開発しています。</p>
```

Expand All @@ -417,7 +417,7 @@ GoでWebアプリケーションを開発しています。

Output:

```md
```html
<p>私はプログラマーです。東京の会社に勤めています。GoでWebアプリケーションを開発しています。</p>
```

Expand Down
31 changes: 28 additions & 3 deletions extension/cjk_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ func TestEastAsianLineBreaks(t *testing.T) {
t,
)

// Tests with EastAsianLineBreaksStyleSimple
markdown = goldmark.New(goldmark.WithRendererOptions(
html.WithXHTML(),
html.WithUnsafe(),
Expand Down Expand Up @@ -208,8 +209,19 @@ func TestEastAsianLineBreaks(t *testing.T) {
},
t,
)
no = 9
testutil.DoTestCase(
markdown,
testutil.MarkdownTestCase{
No: no,
Description: "Soft line breaks between an east asian wide character and a western character are ignored",
Markdown: "私はプログラマーです。\n東京の会社に勤めています。\nGoでWebアプリケーションを開発しています。",
Expected: "<p>私はプログラマーです。東京の会社に勤めています。\nGoでWebアプリケーションを開発しています。</p>",
},
t,
)

// test with EastAsianLineBreaksCSS3Draft
// Tests with EastAsianLineBreaksCSS3Draft
markdown = goldmark.New(goldmark.WithRendererOptions(
html.WithXHTML(),
html.WithUnsafe(),
Expand All @@ -218,7 +230,7 @@ func TestEastAsianLineBreaks(t *testing.T) {
NewCJK(WithEastAsianLineBreaks(EastAsianLineBreaksCSS3Draft)),
),
)
no = 9
no = 10
testutil.DoTestCase(
markdown,
testutil.MarkdownTestCase{
Expand All @@ -230,7 +242,7 @@ func TestEastAsianLineBreaks(t *testing.T) {
t,
)

no = 10
no = 11
testutil.DoTestCase(
markdown,
testutil.MarkdownTestCase{
Expand All @@ -241,4 +253,17 @@ func TestEastAsianLineBreaks(t *testing.T) {
},
t,
)

no = 12
testutil.DoTestCase(
markdown,
testutil.MarkdownTestCase{
No: no,
Description: "Soft line breaks between an east asian wide character and a western character are ignored",
Markdown: "私はプログラマーです。\n東京の会社に勤めています。\nGoでWebアプリケーションを開発しています。",
Expected: "<p>私はプログラマーです。東京の会社に勤めています。GoでWebアプリケーションを開発しています。</p>",
},
t,
)

}
54 changes: 47 additions & 7 deletions renderer/html/html.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"bytes"
"fmt"
"strconv"
"unicode"
"unicode/utf8"

"github.com/yuin/goldmark/ast"
Expand Down Expand Up @@ -107,15 +108,13 @@ const optEastAsianLineBreaks renderer.OptionName = "EastAsianLineBreaks"
type EastAsianLineBreaksStyle int

const (
// EastAsianLineBreaksStyleSimple follows east_asian_line_breaks in Pandoc:
// soft line breaks are ignored if both sides of the break are east asian wide characters.
EastAsianLineBreaksStyleSimple EastAsianLineBreaksStyle = iota
// EastAsianLineBreaksCSS3Draft follows the CSS Text Level 3
// "Segment Break Transformation Rules" with some enhancements.
EastAsianLineBreaksCSS3Draft
)

// eastAsianLineBreaker decides how a soft line break between two adjacent
// pieces of text is handled when east asian line breaks are enabled.
type eastAsianLineBreaker interface {
	// SoftLineBreak reports whether the soft line break that sits between
	// thisLastRune (the last rune before the break) and siblingFirstRune
	// (the first rune after the break) must be kept. It returns false when
	// the break is to be removed.
	SoftLineBreak(thisLastRune rune, siblingFirstRune rune) bool
}

Expand All @@ -128,12 +127,53 @@ func (e *eastAsianLineBreaksSimple) SoftLineBreak(thisLastRune rune, siblingFirs
// eastAsianLineBreaksCSS3Draft implements the EastAsianLineBreaksCSS3Draft style.
type eastAsianLineBreaksCSS3Draft struct{}

// SoftLineBreak reports whether the soft line break between thisLastRune (the
// last rune before the break) and siblingFirstRune (the first rune after the
// break) must be kept. It returns false when the break is removed.
//
// It implements the CSS Text Level 3 Segment Break Transformation Rules with
// some enhancements.
// References:
//   - https://www.w3.org/TR/2020/WD-css-text-3-20200429/#line-break-transform
//   - https://github.com/w3c/csswg-drafts/issues/5086
func (e *eastAsianLineBreaksCSS3Draft) SoftLineBreak(thisLastRune rune, siblingFirstRune rune) bool {
	// Rule1:
	// If the character immediately before or immediately after the segment break is
	// the zero-width space character (U+200B), then the break is removed, leaving behind the zero-width space.
	if thisLastRune == '\u200B' || siblingFirstRune == '\u200B' {
		return false
	}

	// Rule2:
	// Otherwise, if the East Asian Width property of both the character before and after the segment break is
	// F, W, or H (not A), and neither side is Hangul, then the segment break is removed.
	if isEastAsianFWHRune(thisLastRune) && isEastAsianFWHRune(siblingFirstRune) {
		// The break survives only when Hangul is involved on either side.
		return unicode.Is(unicode.Hangul, thisLastRune) || unicode.Is(unicode.Hangul, siblingFirstRune)
	}

	// Rule3:
	// Otherwise, if either the character before or after the segment break belongs to
	// the space-discarding character set and it is a Unicode Punctuation (P*) or U+3000,
	// then the segment break is removed.
	if isSpaceDiscardingPunctRune(thisLastRune) || isSpaceDiscardingPunctRune(siblingFirstRune) {
		return false
	}

	// Rule4:
	// Otherwise, the segment break is converted to a space (U+0020).
	return true
}

// isEastAsianFWHRune reports whether the East Asian Width property of r is F, W, or H.
func isEastAsianFWHRune(r rune) bool {
	switch util.EastAsianWidth(r) {
	case "F", "W", "H":
		return true
	}
	return false
}

// isSpaceDiscardingPunctRune reports whether r is U+3000, or is both a member
// of the space-discarding character set and a Unicode punctuation (P*) character.
// Membership and punctuation must hold together: plain western punctuation
// such as '.' or ',' must not cause a segment break to be dropped.
func isSpaceDiscardingPunctRune(r rune) bool {
	if r == '\u3000' {
		return true
	}
	return util.IsSpaceDiscardingUnicodeRune(r) && unicode.IsPunct(r)
}

type eastAsianLineBreaks struct {
Enabled bool
EastAsianLineBreaksFunction eastAsianLineBreaksFunction
EastAsianLineBreaksFunction eastAsianLineBreaker
}

type withEastAsianLineBreaks struct {
Expand Down
16 changes: 0 additions & 16 deletions util/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -836,22 +836,6 @@ func IsAlphaNumeric(c byte) bool {
return c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c >= '0' && c <= '9'
}

// cjkSymbolsAndPunctuation covers the CJK Symbols and Punctuation block
// (U+3000-U+303F). It is built once at package level instead of on every call.
// https://en.wikipedia.org/wiki/CJK_Symbols_and_Punctuation
var cjkSymbolsAndPunctuation = &unicode.RangeTable{
	R16: []unicode.Range16{
		{Lo: 0x3000, Hi: 0x303F, Stride: 1},
	},
}

// IsEastAsianWideRune returns true if the given rune is an east asian wide character, otherwise false.
//
// A rune qualifies when it is Hiragana, Katakana, Han, or Hangul, belongs to
// the CJK Symbols and Punctuation block, or is in the Lm (modifier letter)
// category. Note that Lm is matched as a whole category, not only for east
// asian scripts.
func IsEastAsianWideRune(r rune) bool {
	return unicode.In(r,
		unicode.Hiragana,
		unicode.Katakana,
		unicode.Han,
		unicode.Lm,
		unicode.Hangul,
		cjkSymbolsAndPunctuation,
	)
}

// A BufWriter is a subset of the bufio.Writer .
type BufWriter interface {
io.Writer
Expand Down
Loading

0 comments on commit 6b3067e

Please sign in to comment.