From 8e6966c2a828ddc65c43daaf45cdc9610c903b1c Mon Sep 17 00:00:00 2001 From: yuin Date: Thu, 10 Jan 2019 12:44:42 +0900 Subject: [PATCH] add Guess* --- README.md | 7 +++- charsetutil.go | 93 +++++++++++++++++++++++++++++++++++++++++++-- charsetutil_test.go | 30 +++++++++++++++ go.mod | 8 ++++ go.sum | 8 ++++ 5 files changed, 141 insertions(+), 5 deletions(-) create mode 100644 go.mod create mode 100644 go.sum diff --git a/README.md b/README.md index 643c024..87351b9 100644 --- a/README.md +++ b/README.md @@ -12,11 +12,11 @@ go get github.com/yuin/charsetutil - `Decode*` : Converts from the specified charset to UTF-8. - `Encode*` : Converts from the UTF-8 to specified charset. +- `Guess*` : Guesses a character set. - `MustDecode*` : Same as `Decode*`, but panics when errors occur - `MustEncode*` : Same as `Encode*`, but panics when errors occur - ```go b, err = EncodeString("こんにちわ", "Windows-31J") b, err = Encode("こんにちわ", "Windows-31J") @@ -35,6 +35,11 @@ s = MustDecodeString(string(source), "Windows-31J") s = MustDecode(source, "Windows-31J") s = MustDecodeBytes(source, "Windows-31J") s = MustDecodeReader(bytes.NewReader(source), "Windows-31J") + +cs, err := GuessString(string(source)) +cs, err := GuessBytes(source) +cs, err := GuessReader(bytes.NewReader(source)) +cs, err := Guess(source) ``` ## Supported character sets diff --git a/charsetutil.go b/charsetutil.go index 19569b8..66cb89c 100644 --- a/charsetutil.go +++ b/charsetutil.go @@ -2,13 +2,15 @@ package charsetutil import ( "bytes" - "errors" "fmt" - "golang.org/x/net/html/charset" - "golang.org/x/text/transform" "io" "io/ioutil" "strings" + + "github.com/gogs/chardet" + + "golang.org/x/net/html/charset" + "golang.org/x/text/transform" ) func panicIfError(err error) { @@ -17,6 +19,74 @@ func panicIfError(err error) { } } +// CharsetGuess is a guessed character set +type CharsetGuess interface { + // Charset returns a guessed character set + Charset() string + + // Language returns a guessed language +
Language() string + + // Confidence returns a confidence of this guess + Confidence() int +} + +type charsetGuess struct { + *chardet.Result +} + +func (g *charsetGuess) Charset() string { + return g.Result.Charset +} + +func (g *charsetGuess) Language() string { + return g.Result.Language +} + +func (g *charsetGuess) Confidence() int { + return g.Result.Confidence +} + +// GuessBytes guesses a character set of given bytes +func GuessBytes(s []byte) (CharsetGuess, error) { + detector := chardet.NewTextDetector() + result, err := detector.DetectBest(s) + if err != nil { + return nil, err + } + return &charsetGuess{result}, err +} + +// Guess guesses a character set of given bytes +func Guess(s []byte) (CharsetGuess, error) { + return GuessBytes(s) +} + +// GuessReader guesses a character set of given Reader +func GuessReader(s io.Reader) (CharsetGuess, error) { + detector := chardet.NewTextDetector() + buf := make([]byte, 128) + if _, err := s.Read(buf); err != nil { + return nil, err + } + result, err := detector.DetectBest(buf) + if err != nil { + return nil, err + } + return &charsetGuess{result}, err +} + +// GuessString guesses a character set of given string +func GuessString(s string) (CharsetGuess, error) { + detector := chardet.NewTextDetector() + result, err := detector.DetectBest([]byte(s)) + if err != nil { + return nil, err + } + return &charsetGuess{result}, err +} + +// DecodeReader converts given Reader to a UTF-8 string func DecodeReader(s io.Reader, enc string) (string, error) { reader, err := charset.NewReaderLabel(enc, s) if err != nil { @@ -29,46 +99,54 @@ func DecodeReader(s io.Reader, enc string) (string, error) { return string(bytes), nil } +// MustDecodeReader converts given Reader to a UTF-8 string and panics if errors occur.
func MustDecodeReader(s io.Reader, enc string) string { ret, err := DecodeReader(s, enc) panicIfError(err) return ret } +// DecodeBytes converts given bytes to a UTF-8 string func DecodeBytes(s []byte, enc string) (string, error) { return DecodeReader(bytes.NewReader(s), enc) } +// MustDecodeBytes converts given bytes to a UTF-8 string and panics if errors occur. func MustDecodeBytes(s []byte, enc string) string { ret, err := DecodeReader(bytes.NewReader(s), enc) panicIfError(err) return ret } +// DecodeString converts given string to a UTF-8 string func DecodeString(s, enc string) (string, error) { return DecodeReader(strings.NewReader(s), enc) } +// MustDecodeString converts given string to a UTF-8 string and panics if errors occur. func MustDecodeString(s, enc string) string { ret, err := DecodeReader(strings.NewReader(s), enc) panicIfError(err) return ret } +// Decode converts given bytes to a UTF-8 string func Decode(s []byte, enc string) (string, error) { return DecodeReader(bytes.NewReader(s), enc) } +// MustDecode converts given bytes to a UTF-8 string and panics if errors occur.
func MustDecode(s []byte, enc string) string { ret, err := DecodeReader(bytes.NewReader(s), enc) panicIfError(err) return ret } +// EncodeReader converts a Reader to bytes encoded with given encoding func EncodeReader(s io.Reader, enc string) ([]byte, error) { e, _ := charset.Lookup(enc) if e == nil { - return nil, errors.New(fmt.Sprintf("unsupported charset: %q", enc)) + return nil, fmt.Errorf("unsupported charset: %q", enc) } var buf bytes.Buffer writer := transform.NewWriter(&buf, e.NewEncoder()) @@ -79,36 +157,43 @@ func EncodeReader(s io.Reader, enc string) ([]byte, error) { return buf.Bytes(), nil } +// MustEncodeReader converts a Reader to bytes encoded with given encoding and panics if errors occur func MustEncodeReader(s io.Reader, enc string) []byte { ret, err := EncodeReader(s, enc) panicIfError(err) return ret } +// EncodeBytes converts bytes to bytes encoded with given encoding func EncodeBytes(s []byte, enc string) ([]byte, error) { return EncodeReader(bytes.NewReader(s), enc) } +// MustEncodeBytes converts bytes to bytes encoded with given encoding and panics if errors occur func MustEncodeBytes(s []byte, enc string) []byte { ret, err := EncodeReader(bytes.NewReader(s), enc) panicIfError(err) return ret } +// EncodeString converts a string to bytes encoded with given encoding func EncodeString(s, enc string) ([]byte, error) { return EncodeReader(strings.NewReader(s), enc) } +// MustEncodeString converts a string to bytes encoded with given encoding and panics if errors occur func MustEncodeString(s, enc string) []byte { ret, err := EncodeReader(strings.NewReader(s), enc) panicIfError(err) return ret } +// Encode converts a string to bytes encoded with given encoding func Encode(s string, enc string) ([]byte, error) { return EncodeReader(strings.NewReader(s), enc) } +// MustEncode converts a string to bytes encoded with given encoding and panics if errors occur func MustEncode(s string, enc string) []byte { ret, err := 
EncodeReader(strings.NewReader(s), enc) panicIfError(err) diff --git a/charsetutil_test.go b/charsetutil_test.go index ef8e963..3db7fda 100644 --- a/charsetutil_test.go +++ b/charsetutil_test.go @@ -160,3 +160,33 @@ func TestDecodeError(t *testing.T) { assertPanic(func() string { return MustDecodeReader(bytes.NewReader(source), "unknown") }) } + +func TestGuess(t *testing.T) { + sourceEuc := []byte{'\xa4', '\xa2', '\xa4', '\xa4', '\xa4', '\xa6', '\xa4', '\xa8', '\xa4', '\xaa', '\x0d', '\x0a', '\xa5', '\xbd', '\xc7', '\xbd', '\x0d', '\x0a', '\x74', '\x65', '\x73', '\x74', '\x0d', '\x0a', '\x8e', '\xb6', '\x8e', '\xb7', '\x8e', '\xb8', '\x8e', '\xb9', '\x8e', '\xba'} + // sourceSjis := []byte{'\x82', '\xa0', '\x82', '\xa2', '\x82', '\xa4', '\x82', '\xa6', '\x82', '\xa8', '\x0d', '\x0a', '\x83', '\x5c', '\x94', '\x5c', '\x0d', '\x0a', '\x74', '\x65', '\x73', '\x74', '\x0d', '\x0a', '\xb6', '\xb7', '\xb8', '\xb9', '\xba'} + + assert := func(r CharsetGuess, charset, language string, err error) { + if err != nil { + t.Errorf("Failed:%+v", err) + } + if r.Charset() != charset { + t.Errorf("'%s' expected, but got '%s'", charset, r.Charset()) + } + if r.Language() != language { + t.Errorf("'%s' expected, but got '%s'", language, r.Language()) + } + } + + result, err := Guess(sourceEuc) + assert(result, "EUC-JP", "ja", err) + + result, err = GuessBytes(sourceEuc) + assert(result, "EUC-JP", "ja", err) + + result, err = GuessReader(bytes.NewReader(sourceEuc)) + assert(result, "EUC-JP", "ja", err) + + result, err = GuessString("ああイイ”haa") + assert(result, "UTF-8", "", err) + +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..a3a7106 --- /dev/null +++ b/go.mod @@ -0,0 +1,8 @@ +module github.com/yuin/charsetutil + +require ( + github.com/gogs/chardet v0.0.0-20150115103509-2404f7772561 + github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect + golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e + golang.org/x/text v0.3.0 +) diff --git a/go.sum 
b/go.sum new file mode 100644 index 0000000..5321895 --- /dev/null +++ b/go.sum @@ -0,0 +1,8 @@ +github.com/gogs/chardet v0.0.0-20150115103509-2404f7772561 h1:aBzukfDxQlCTVS0NBUjI5YA3iVeaZ9Tb5PxNrrIP1xs= +github.com/gogs/chardet v0.0.0-20150115103509-2404f7772561/go.mod h1:Pcatq5tYkCW2Q6yrR2VRHlbHpZ/R4/7qyL1TCF7vl14= +github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI= +github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= +golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e h1:bRhVy7zSSasaqNksaRZiA5EEI+Ei4I1nO5Jh72wfHlg= +golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=