diff --git a/go.mod b/go.mod index b1a10d70..12f3a560 100644 --- a/go.mod +++ b/go.mod @@ -16,6 +16,8 @@ require ( github.com/google/uuid v1.3.0 github.com/hanwen/go-fuse/v2 v2.3.0 github.com/hyponet/eventbus v1.0.0 + github.com/hyponet/webpage-packer v0.0.0-20230930052235-73553a8dce4a + github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 github.com/minio/minio-go/v7 v7.0.52 github.com/onsi/ginkgo v1.16.5 github.com/onsi/gomega v1.27.2 @@ -26,13 +28,15 @@ require ( github.com/tickstep/aliyunpan-api v0.1.6 github.com/tickstep/library-go v0.1.0 go.uber.org/zap v1.24.0 - golang.org/x/net v0.11.0 - golang.org/x/sys v0.9.0 + golang.org/x/net v0.15.0 + golang.org/x/sys v0.12.0 gopkg.in/yaml.v3 v3.0.1 gorm.io/driver/postgres v1.3.7 ) require ( + github.com/PuerkitoBio/goquery v1.8.1 // indirect + github.com/andybalholm/cascadia v1.3.2 // indirect github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.4.10 // indirect github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.13.4 // indirect github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.34 // indirect @@ -46,6 +50,7 @@ require ( github.com/aws/aws-sdk-go-v2/service/sso v1.12.12 // indirect github.com/aws/aws-sdk-go-v2/service/ssooidc v1.14.12 // indirect github.com/aws/aws-sdk-go-v2/service/sts v1.19.2 // indirect + github.com/aymerick/douceur v0.2.0 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/btcsuite/btcd v0.22.1 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect @@ -57,9 +62,13 @@ require ( github.com/go-playground/locales v0.14.1 // indirect github.com/go-playground/universal-translator v0.18.1 // indirect github.com/go-playground/validator/v10 v10.14.1 // indirect + github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect + github.com/go-shiori/go-readability v0.0.0-20230421032831-c66949dfc0ad // indirect github.com/goccy/go-json v0.10.2 // indirect + github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect github.com/golang/protobuf v1.5.3 // indirect github.com/google/go-cmp v0.5.9 // indirect + github.com/gorilla/css v1.0.0 // indirect github.com/jackc/chunkreader/v2 v2.0.1 // indirect github.com/jackc/pgconn v1.12.1 // indirect github.com/jackc/pgio v1.0.0 // indirect @@ -75,6 +84,7 @@ require ( github.com/mattn/go-isatty v0.0.19 // indirect github.com/mattn/go-runewidth v0.0.9 // indirect github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect + github.com/microcosm-cc/bluemonday v1.0.25 // indirect github.com/minio/md5-simd v1.1.2 // indirect github.com/minio/sha256-simd v1.0.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect @@ -89,11 +99,12 @@ require ( github.com/sirupsen/logrus v1.9.0 // indirect github.com/spf13/pflag v1.0.5 // indirect github.com/twitchyliquid64/golang-asm v0.15.1 // indirect - golang.org/x/text v0.10.0 // indirect + golang.org/x/text v0.13.0 // indirect golang.org/x/time v0.3.0 // indirect google.golang.org/protobuf v1.30.0 // indirect gopkg.in/ini.v1 v1.67.0 // indirect gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 // indirect + howett.net/plist v1.0.0 // indirect modernc.org/libc v1.22.2 // indirect modernc.org/mathutil v1.5.0 // indirect modernc.org/memory v1.5.0 // indirect @@ -115,6 +126,6 @@ require ( go.uber.org/atomic v1.10.0 // indirect go.uber.org/multierr v1.9.0 // indirect golang.org/x/arch v0.3.0 // indirect - golang.org/x/crypto v0.10.0 // indirect + golang.org/x/crypto v0.13.0 // indirect gorm.io/gorm v1.24.6 ) diff --git a/go.sum b/go.sum index aae7d343..124d4158 100644 --- a/go.sum +++ b/go.sum @@ -1,9 +1,14 @@ github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/Masterminds/semver/v3 v3.1.1 h1:hLg3sBzpNErnxhQtUy/mmLR2I9foDujNK030IGemrRc= github.com/Masterminds/semver/v3 v3.1.1/go.mod h1:VPu/7SZ7ePZ3QOrcuXROw5FAcLl4a0cBrbBpGY/8hQs= +github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM= +github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ= github.com/aead/siphash v1.0.1/go.mod h1:Nywa3cDsYNNK3gaciGTWPwHt0wlpNV15vwmswBAUSII= github.com/aliyun/aliyun-oss-go-sdk v2.2.7+incompatible h1:KpbJFXwhVeuxNtBJ74MCGbIoaBok2uZvkD7QXp2+Wis= github.com/aliyun/aliyun-oss-go-sdk v2.2.7+incompatible/go.mod h1:T/Aws4fEfogEE9v+HPhhw+CntffsBHJ8nXQCwKr0/g8= +github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= +github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= +github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= github.com/aws/aws-sdk-go-v2 v1.18.1 h1:+tefE750oAb7ZQGzla6bLkOwfcQCEtC5y2RqoqCeqKo= github.com/aws/aws-sdk-go-v2 v1.18.1/go.mod h1:uzbQtefpm44goOPmdKyAlXSNcwlRgF3ePWVW6EtJvvw= github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.4.10 h1:dK82zF6kkPeCo8J1e+tGx4JdvDIQzj7ygIoLg8WMuGs= @@ -40,6 +45,8 @@ github.com/aws/aws-sdk-go-v2/service/sts v1.19.2 h1:XFJ2Z6sNUUcAz9poj+245DMkrHE4 github.com/aws/aws-sdk-go-v2/service/sts v1.19.2/go.mod h1:dp0yLPsLBOi++WTxzCjA/oZqi6NPIhoR+uF7GeMU9eg= github.com/aws/smithy-go v1.13.5 h1:hgz0X/DX0dGqTYpGALqXJoRKRj5oQ7150i5FdTePzO8= github.com/aws/smithy-go v1.13.5/go.mod h1:Tg+OJXh4MB2R/uN61Ko2f6hTZwB/ZYGOtib8J3gBHzA= +github.com/aymerick/douceur v0.2.0 h1:Mv+mAeH1Q+n9Fr+oyamOlAkUNPWPlA8PPGR0QAaYuPk= +github.com/aymerick/douceur v0.2.0/go.mod h1:wlT5vV2O3h55X9m7iVYN0TBM0NH/MmbLnd30/FjWUq4= github.com/benbjohnson/clock v1.1.0 h1:Q92kusRqC1XV2MjkWETPvjJVqKetz1OzxZB7mHJLju8= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= @@ -122,6 +129,10 @@ github.com/go-playground/universal-translator v0.18.1/go.mod h1:xekY+UJKNuX9WP91 github.com/go-playground/validator/v10 v10.10.0/go.mod h1:74x4gJWsvQexRdW8Pn3dXSGrTK4nAUsbPlLADvpJkos= github.com/go-playground/validator/v10 v10.14.1 h1:9c50NUPC30zyuKprjL3vNZ0m5oG+jU0zvx4AqHGnv4k= github.com/go-playground/validator/v10 v10.14.1/go.mod h1:9iXMNT7sEkjXb0I+enO7QXmzG6QCsPWY4zveKFVRSyU= +github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c h1:wpkoddUomPfHiOziHZixGO5ZBS73cKqVzZipfrLmO1w= +github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c/go.mod h1:oVDCh3qjJMLVUSILBRwrm+Bc6RNXGZYtoh9xdvf1ffM= +github.com/go-shiori/go-readability v0.0.0-20230421032831-c66949dfc0ad h1:3VP5Q8Mh165h2DHmXWFT4LJlwwvgTRlEuoe2vnsVnJ4= +github.com/go-shiori/go-readability v0.0.0-20230421032831-c66949dfc0ad/go.mod h1:2DpZlTJO/ycxp/vsc/C11oUyveStOgIXB88SYV1lncI= github.com/go-sql-driver/mysql v1.6.0 h1:BCTh4TKNUYmOmMUcQ3IipzF5prigylS7XXjEkfCHuOE= github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0 h1:p104kn46Q8WdvHunIJ9dAyjPVtrBPhSr3KT2yUst43I= @@ -131,6 +142,8 @@ github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU= github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I= github.com/gofrs/uuid v4.0.0+incompatible h1:1SD/1F5pU8p29ybwgQSwpQk+mwdRrXCYuPhW6m+TnJw= github.com/gofrs/uuid v4.0.0+incompatible/go.mod h1:b2aQJv3Z4Fp6yNu3cdSllBxTCLRxnplIgP/c0N/04lM= +github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f h1:3BSP1Tbs2djlpprl7wCLuiqMaUh5SJkkzI2gDs+FgLs= +github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f/go.mod h1:Pcatq5tYkCW2Q6yrR2VRHlbHpZ/R4/7qyL1TCF7vl14= github.com/golang-sql/civil v0.0.0-20190719163853-cb61b32ac6fe h1:lXe2qZdvpiX5WZkZR4hgp4KJVfY3nMkvmwbVkpv1rVY= github.com/golang-sql/sqlexp v0.0.0-20170517235910-f1bb20e5a188 h1:+eHOFJl1BaXrQxKX+T06f78590z4qA2ZzBTqahsKSE4= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= @@ -156,11 +169,15 @@ github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26 h1:Xim43kblpZXfIBQsbu github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/gorilla/css v1.0.0 h1:BQqNyPTi50JCFMTw/b67hByjMVXZRwGha6wxVGkeihY= +github.com/gorilla/css v1.0.0/go.mod h1:Dn721qIggHpt4+EFCcTLTU/vk5ySda2ReITrtgBl60c= github.com/hanwen/go-fuse/v2 v2.3.0 h1:t5ivNIH2PK+zw4OBul/iJjsoG9K6kXo4nMDoBpciC8A= github.com/hanwen/go-fuse/v2 v2.3.0/go.mod h1:xKwi1cF7nXAOBCXujD5ie0ZKsxc8GGSA1rlMJc+8IJs= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/hyponet/eventbus v1.0.0 h1:Cl1v4Ge1/ILn/z4nBhxu1cTny8joRPDj3pqRudlbO+w= github.com/hyponet/eventbus v1.0.0/go.mod h1:5XPvonkyxwwNSMEqnpuSh1NlW3KZKpRr9DNKkZBBuyk= +github.com/hyponet/webpage-packer v0.0.0-20230930052235-73553a8dce4a h1:h/MjGu9KXGqsIrCH5BEvvwTpMY0ZpuWJhkQi2LNPqGc= +github.com/hyponet/webpage-packer v0.0.0-20230930052235-73553a8dce4a/go.mod h1:2qcy+SgeIQHRG6grhK9oMWFk5Fnh65U5qy47ij5Sw3A= github.com/inconshreveable/mousetrap v1.0.1/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= @@ -247,6 +264,8 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/kylelemons/godebug v0.0.0-20170820004349-d65d576e9348 h1:MtvEpTB6LX3vkb4ax0b5D2DHbNAUsen0Gx5wZoq3lV4= github.com/kylelemons/godebug v0.0.0-20170820004349-d65d576e9348/go.mod h1:B69LEHPfb2qLo0BaaOLcbitczOKLWTsrBG9LczfCD4k= +github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 h1:6Yzfa6GP0rIo/kULo2bwGEkFvCePZ3qHDDTC3/J9Swo= +github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs= github.com/leodido/go-urn v1.2.1/go.mod h1:zt4jvISO2HfUBqxjfIshjdMTYS56ZS/qv49ictyFfxY= github.com/leodido/go-urn v1.2.4 h1:XlAE/cm/ms7TE/VMVoduSpNBoyc2dOxHs5MZSwAN63Q= github.com/leodido/go-urn v1.2.4/go.mod h1:7ZrI8mTSeBSHl/UaRyKQW1qZeMgak41ANeCNaVckg+4= @@ -268,6 +287,8 @@ github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m github.com/mattn/go-sqlite3 v1.14.15 h1:vfoHhTN1af61xCRSWzFIWzx2YskyMTwHLrExkBOjvxI= github.com/matttproud/golang_protobuf_extensions v1.0.4 h1:mmDVorXM7PCGKw94cs5zkfA9PSy5pEvNWRP0ET0TIVo= github.com/matttproud/golang_protobuf_extensions v1.0.4/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4= +github.com/microcosm-cc/bluemonday v1.0.25 h1:4NEwSfiJ+Wva0VxN5B8OwMicaJvD8r9tlJWm9rtloEg= +github.com/microcosm-cc/bluemonday v1.0.25/go.mod h1:ZIOjCQp1OrzBBPIJmfX4qDYFuhU02nx4bn030ixfHLE= github.com/minio/md5-simd v1.1.2 h1:Gdi1DZK69+ZVMoNHRXJyNcxrMA4dSxoYHZSQbirFg34= github.com/minio/md5-simd v1.1.2/go.mod h1:MzdKDxYpY2BT9XQFocsiZf/NKVtR7nkE4RoEpN+20RM= github.com/minio/minio-go/v7 v7.0.52 h1:8XhG36F6oKQUDDSuz6dY3rioMzovKjW40W6ANuN0Dps= @@ -331,6 +352,7 @@ github.com/rs/zerolog v1.15.0/go.mod h1:xYTKnLHcpfU2225ny5qZjxnj9NvkumZYjJHlAThC github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/satori/go.uuid v1.2.0 h1:0uYX9dsZ2yD7q2RtLRtPSdGDWzjeM3TbMJP9utgA0ww= github.com/satori/go.uuid v1.2.0/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0= +github.com/sergi/go-diff v1.1.0 h1:we8PVUC3FE2uYfodKH/nBHMSetSfHDR6scGdBi+erh0= github.com/shopspring/decimal v0.0.0-20180709203117-cd690d0c9e24/go.mod h1:M+9NzErvs504Cn4c5DxATwIqPbtswREoFCre64PpcG4= github.com/shopspring/decimal v1.2.0 h1:abSATXmQEYyShuxI4/vyW3tV1MrKAJzCZ/0zLUXYbsQ= github.com/shopspring/decimal v1.2.0/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o= @@ -375,6 +397,7 @@ github.com/ugorji/go/codec v1.2.7/go.mod h1:WGN1fab3R1fzQlVQTkfxVtIBhWDRqOviHU95 github.com/ugorji/go/codec v1.2.11 h1:BMaWp1Bb6fHwEtbplGBGJ498wD+LKlNSl25MjdZY4dU= github.com/ugorji/go/codec v1.2.11/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= github.com/zenazn/goji v0.9.0/go.mod h1:7S9M489iMyHBNxwZnk9/EHS098H4/F6TATF2mIxtB1Q= go.uber.org/atomic v1.3.2/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= @@ -411,12 +434,14 @@ golang.org/x/crypto v0.0.0-20201203163018-be400aefbc4c/go.mod h1:jdWPYTVW3xRLrWP golang.org/x/crypto v0.0.0-20210616213533-5ff15b29337e/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20210711020723-a769d52b0f97/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.10.0 h1:LKqV2xt9+kDzSTfOhx4FrkEBcMrAgHSYgzywV9zcGmM= -golang.org/x/crypto v0.10.0/go.mod h1:o4eNf7Ede1fv+hwOwZsTHl9EsPFO6q6ZvYR8vYfY45I= +golang.org/x/crypto v0.13.0 h1:mvySKfSWJ+UKUii46M40LOvyWfN0s2U+46/jDd0e6Ck= +golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc= golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/net v0.0.0-20180719180050-a680a1efc54d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= @@ -426,13 +451,20 @@ golang.org/x/net v0.0.0-20190813141303-74dc4d7220e7/go.mod h1:z5CRVTTTmAJ677TzLL golang.org/x/net v0.0.0-20200520004742-59133d7f0dd7/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= -golang.org/x/net v0.11.0 h1:Gi2tvZIJyBtO9SDr1q9h5hEQCp/4L2RQ+ar0qjx2oNU= -golang.org/x/net v0.11.0/go.mod h1:2L/ixqYpgIVXmeoSA/4Lu7BzTG4KIyPIryS4IsOd1oQ= +golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= +golang.org/x/net v0.15.0 h1:ugBLEUaxABaB5AJqW9enI0ACdci2RUd4eP51NTBvuJ8= +golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -451,26 +483,34 @@ golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210112080510-489259a85091/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210806184541-e5e7981a1069/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.9.0 h1:KS/R3tvhPqvJvwcKfnBHJwwthS11LRhmM5D59eEXa0s= -golang.org/x/sys v0.9.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.12.0 h1:CM0HF96J0hcLAwsHPJZjfdNzs0gftsLfgKt57wWHJ0o= +golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.4/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/text v0.10.0 h1:UpjohKhiEgNc0CSauXmwYftY1+LlaC75SJwh0SgCX58= -golang.org/x/text v0.10.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k= +golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= @@ -483,7 +523,9 @@ golang.org/x/tools v0.0.0-20191029190741-b9c20aec41a5/go.mod h1:b+2E5dAYhXwXZwtn golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200103221440-774c71fcf114/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20201224043029-2b0845dc783e/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= golang.org/x/tools v0.6.0 h1:BOw41kyTf3PuCW1pVQf8+Cyg8pMlkYB1oo9iJ6D/lKM= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= golang.org/x/xerrors v0.0.0-20190410155217-1f06c39b4373/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20190513163551-3ee3066db522/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -512,6 +554,7 @@ gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= +gopkg.in/yaml.v1 v1.0.0-20140924161607-9f9df34309c0/go.mod h1:WDnlLJ4WF5VGsH/HVa3CI79GS0ol3YnhVnKP89i0kNg= gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= @@ -531,6 +574,8 @@ gorm.io/gorm v1.23.4/go.mod h1:l2lP/RyAtc1ynaTjFksBde/O8v9oOGIApu2/xRitmZk= gorm.io/gorm v1.24.6 h1:wy98aq9oFEetsc4CAbKD2SoBCdMzsbSIvSUUFJuHi5s= gorm.io/gorm v1.24.6/go.mod h1:L4uxeKpfBml98NYqVqwAdmV1a2nBtAec/cf3fpucW/k= honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg= +howett.net/plist v1.0.0 h1:7CrbWYbPPO/PyNy38b2EB/+gYbjCe2DXBxgtOOZbSQM= +howett.net/plist v1.0.0/go.mod h1:lqaXoTrLY4hg8tnEzNru53gicrbv7rrk+2xJA/7hw9g= modernc.org/libc v1.22.2 h1:4U7v51GyhlWqQmwCHj28Rdq2Yzwk55ovjFrdPjs8Hb0= modernc.org/libc v1.22.2/go.mod h1:uvQavJ1pZ0hIoC/jfqNoMLURIMhKzINIWypNM17puug= modernc.org/mathutil v1.5.0 h1:rV0Ko/6SfM+8G+yKiyI830l3Wuz1zRutdslNoQ0kfiQ= diff --git a/pkg/dentry/group.go b/pkg/dentry/group.go index 4e322d72..e67e9712 100644 --- a/pkg/dentry/group.go +++ b/pkg/dentry/group.go @@ -21,7 +21,7 @@ import ( "fmt" "github.com/basenana/nanafs/pkg/metastore" "github.com/basenana/nanafs/pkg/plugin" - "github.com/basenana/nanafs/pkg/plugin/stub" + "github.com/basenana/nanafs/pkg/plugin/pluginapi" "github.com/basenana/nanafs/pkg/types" "github.com/basenana/nanafs/utils/logger" "path" @@ -243,7 +243,7 @@ func (e *extGroup) FindEntry(ctx context.Context, name string) (*types.Metadata, } func (e *extGroup) CreateEntry(ctx context.Context, attr EntryAttr) (*types.Metadata, error) { - mirrorEn, err := e.mirror.CreateEntry(ctx, stub.EntryAttr{ + mirrorEn, err := e.mirror.CreateEntry(ctx, pluginapi.EntryAttr{ Name: attr.Name, Kind: attr.Kind, }) @@ -325,7 +325,7 @@ func (e *extGroup) ListChildren(ctx context.Context) ([]*types.Metadata, error) } recordChildMap := make(map[string]*types.Metadata) - actualChildMap := make(map[string]*stub.Entry) + actualChildMap := make(map[string]*pluginapi.Entry) for i := range recordChild { recordChildMap[recordChild[i].Name] = recordChild[i] } @@ -355,7 +355,7 @@ func (e *extGroup) ListChildren(ctx context.Context) ([]*types.Metadata, error) return result, nil } -func (e *extGroup) syncEntry(ctx context.Context, mirrored *stub.Entry, crt *types.Metadata) (en *types.Metadata, err error) { +func (e *extGroup) syncEntry(ctx context.Context, mirrored *pluginapi.Entry, crt *types.Metadata) (en *types.Metadata, err error) { grp, err := e.stdGroup.cacheStore.getEntry(ctx, e.stdGroup.entryID) if err != nil { return nil, err diff --git a/pkg/dentry/group_test.go b/pkg/dentry/group_test.go index 44aa98ca..f9bf7474 100644 --- a/pkg/dentry/group_test.go +++ b/pkg/dentry/group_test.go @@ -19,7 +19,7 @@ package dentry import ( "context" "github.com/basenana/nanafs/pkg/plugin" - "github.com/basenana/nanafs/pkg/plugin/stub" + "github.com/basenana/nanafs/pkg/plugin/pluginapi" "github.com/basenana/nanafs/pkg/types" . "github.com/onsi/ginkgo" . "github.com/onsi/gomega" @@ -314,7 +314,7 @@ var _ = Describe("TestExtGroupEntry", func() { Expect(len(need)).Should(Equal(0)) }) It("insert sync_file1.yaml to memfs should be succeed", func() { - _, err = memFS.CreateEntry(context.TODO(), stub.EntryAttr{ + _, err = memFS.CreateEntry(context.TODO(), pluginapi.EntryAttr{ Name: "sync_file1.yaml", Kind: types.RawKind, }) @@ -335,7 +335,7 @@ var _ = Describe("TestExtGroupEntry", func() { Expect(len(need)).Should(Equal(0)) }) It("insert sync_file2.yaml to memfs should be succeed", func() { - _, err = memFS.CreateEntry(context.TODO(), stub.EntryAttr{ + _, err = memFS.CreateEntry(context.TODO(), pluginapi.EntryAttr{ Name: "sync_file2.yaml", Kind: types.RawKind, }) @@ -355,7 +355,7 @@ var _ = Describe("TestExtGroupEntry", func() { Expect(len(need)).Should(Equal(0)) }) It("delete sync_file2.yaml should be succeed", func() { - err = memFS.RemoveEntry(context.TODO(), &stub.Entry{ + err = memFS.RemoveEntry(context.TODO(), &pluginapi.Entry{ Name: "sync_file2.yaml", Kind: types.RawKind, }) diff --git a/pkg/plugin/buildin/docloader/csv.go b/pkg/plugin/buildin/docloader/csv.go new file mode 100644 index 00000000..b0297d50 --- /dev/null +++ b/pkg/plugin/buildin/docloader/csv.go @@ -0,0 +1,80 @@ +/* + Copyright 2023 NanaFS Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package docloader + +import ( + "context" + "encoding/csv" + "errors" + "fmt" + "github.com/basenana/nanafs/pkg/types" + "io" + "os" + "strings" +) + +const ( + csvLoader = "csv" +) + +type CSV struct { + docPath string +} + +func NewCSV(docPath string, option map[string]string) CSV { + return CSV{docPath: docPath} +} + +func (c CSV) Load(_ context.Context) (result []types.FDocument, err error) { + f, err := os.Open(c.docPath) + if err != nil { + return nil, err + } + defer f.Close() + + var header []string + var rown int + + rd := csv.NewReader(f) + for { + row, err := rd.Read() + if errors.Is(err, io.EOF) { + break + } + if err != nil { + return nil, err + } + if len(header) == 0 { + header = append(header, row...) + continue + } + + var content []string + for i, value := range row { + line := fmt.Sprintf("%s: %s", header[i], value) + content = append(content, line) + } + + rown++ + result = append(result, types.FDocument{ + Content: strings.Join(content, "\n"), + Metadata: map[string]any{"type": csvLoader, "row": rown}, + }) + } + + return +} diff --git a/pkg/plugin/buildin/docloader/docloader.go b/pkg/plugin/buildin/docloader/docloader.go new file mode 100644 index 00000000..f8403e7a --- /dev/null +++ b/pkg/plugin/buildin/docloader/docloader.go @@ -0,0 +1,102 @@ +/* + Copyright 2023 NanaFS Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package docloader + +import ( + "context" + "fmt" + "github.com/basenana/nanafs/pkg/plugin/pluginapi" + "github.com/basenana/nanafs/pkg/types" + "os" + "path/filepath" +) + +const ( + PluginName = "docloader" + PluginVersion = "1.0" +) + +type DocLoader struct{} + +func (d DocLoader) Name() string { + return PluginName +} + +func (d DocLoader) Type() types.PluginType { + return types.TypeProcess +} + +func (d DocLoader) Version() string { + return PluginVersion +} + +func (d DocLoader) Run(ctx context.Context, request *pluginapi.Request, pluginParams map[string]string) (*pluginapi.Response, error) { + entryPath := request.Parameter[pluginapi.ResEntryPathKey].(string) + if entryPath == "" { + resp := pluginapi.NewFailedResponse("entry_path is empty") + return resp, nil + } + + _, err := os.Stat(entryPath) + if err != nil { + resp := pluginapi.NewFailedResponse(fmt.Sprintf("stat entry file %s failed: %s", entryPath, err)) + return resp, nil + } + + fileExt := filepath.Ext(entryPath) + var ( + p Parser + parseOption = map[string]string{} + ) + + switch fileExt { + case ".pdf": + p = buildInLoaders[pdfParser](entryPath, parseOption) + case ".txt": + p = buildInLoaders[textParser](entryPath, parseOption) + case ".html", ".htm": + p = buildInLoaders[htmlParser](entryPath, parseOption) + case ".webarchive": + p = buildInLoaders[webArchiveParser](entryPath, parseOption) + default: + resp := pluginapi.NewFailedResponse(fmt.Sprintf("load %s file unsupported", fileExt)) + return resp, nil + } + + documents, err := p.Load(ctx) + if err != nil { + resp := pluginapi.NewFailedResponse(fmt.Sprintf("load file %s failed: %s", entryPath, err)) + return resp, nil + } + + return pluginapi.NewResponseWithResult(map[string]any{pluginapi.ResEntryDocumentsKey: documents}), nil +} + +type Parser interface { + Load(ctx context.Context) (result []types.FDocument, err error) +} + +type parserBuilder func(docPath string, docOption map[string]string) Parser + +var ( + buildInLoaders = map[string]parserBuilder{ + textParser: NewText, + pdfParser: NewPDF, + htmlParser: NewHTML, + webArchiveParser: NewHTML, + } +) diff --git a/pkg/plugin/buildin/docloader/epub.go b/pkg/plugin/buildin/docloader/epub.go new file mode 100644 index 00000000..267f1662 --- /dev/null +++ b/pkg/plugin/buildin/docloader/epub.go @@ -0,0 +1,17 @@ +/* + Copyright 2023 NanaFS Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package docloader diff --git a/pkg/plugin/buildin/docloader/html.go b/pkg/plugin/buildin/docloader/html.go new file mode 100644 index 00000000..e50812c2 --- /dev/null +++ b/pkg/plugin/buildin/docloader/html.go @@ -0,0 +1,66 @@ +/* + Copyright 2023 NanaFS Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package docloader + +import ( + "context" + "github.com/basenana/nanafs/pkg/types" + "github.com/hyponet/webpage-packer/packer" + "strings" +) + +const ( + htmlParser = "html" + webArchiveParser = "webarchive" +) + +type HTML struct { + docPath string +} + +func NewHTML(docPath string, option map[string]string) Parser { + return HTML{docPath: docPath} +} + +func (h HTML) Load(ctx context.Context) (result []types.FDocument, err error) { + var ( + p packer.Packer + docType = "html" + ) + switch { + case strings.HasSuffix(h.docPath, ".webarchive"): + p = packer.NewWebArchivePacker() + docType = "webarchive" + + case strings.HasSuffix(h.docPath, ".html") || + strings.HasSuffix(h.docPath, ".htm"): + p = packer.NewHtmlPacker() + } + + content, err := p.ReadContent(ctx, packer.Option{ + FilePath: h.docPath, + ClutterFree: true, + }) + if err != nil { + return nil, err + } + + return []types.FDocument{{ + Content: content, + Metadata: map[string]any{"type": docType}, + }}, nil +} diff --git a/pkg/plugin/buildin/docloader/pdf.go b/pkg/plugin/buildin/docloader/pdf.go new file mode 100644 index 00000000..448ae547 --- /dev/null +++ b/pkg/plugin/buildin/docloader/pdf.go @@ -0,0 +1,107 @@ +/* + Copyright 2023 NanaFS Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package docloader + +import ( + "context" + "github.com/basenana/nanafs/pkg/types" + "github.com/ledongthuc/pdf" + "os" +) + +const ( + pdfParser = "pdf" +) + +type PDF struct { + docPath string + password string +} + +func NewPDF(docPath string, option map[string]string) Parser { + return newPDFWithPassword(docPath, option["password"]) +} + +func newPDFWithPassword(docPath, pass string) Parser { + return &PDF{docPath: docPath, password: pass} +} + +func (p *PDF) Load(_ context.Context) ([]types.FDocument, error) { + fInfo, err := os.Stat(p.docPath) + if err != nil { + return nil, err + } + + f, err := os.Open(p.docPath) + if err != nil { + return nil, err + } + defer f.Close() + + var reader *pdf.Reader + if p.password != "" { + reader, err = pdf.NewReaderEncrypted(f, fInfo.Size(), p.getAndCleanPassword) + if err != nil { + return nil, err + } + } else { + reader, err = pdf.NewReader(f, fInfo.Size()) + if err != nil { + return nil, err + } + } + + var ( + numPages = reader.NumPage() + result = make([]types.FDocument, 0) + ) + + fonts := make(map[string]*pdf.Font) + for i := 1; i < numPages+1; i++ { + page := reader.Page(i) + for _, name := range page.Fonts() { + if _, ok := fonts[name]; !ok { + f := page.Font(name) + fonts[name] = &f + } + } + text, err := page.GetPlainText(fonts) + if err != nil { + return nil, err + } + + result = append(result, types.FDocument{ + Content: text, + Metadata: map[string]any{ + "type": "pdf", + "page": i, + "total_pages": numPages, + }, + }) + } + + return result, nil +} + +func (p *PDF) getAndCleanPassword() string { + pass := p.password + if pass != "" { + // set password empty to stop retry + p.password = "" + } + return pass +} diff --git a/pkg/plugin/buildin/docloader/plaintext.go b/pkg/plugin/buildin/docloader/plaintext.go new file mode 100644 index 00000000..a66d68c4 --- /dev/null +++ b/pkg/plugin/buildin/docloader/plaintext.go @@ -0,0 +1,56 @@ +/* + Copyright 2023 NanaFS Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package docloader + +import ( + "bytes" + "context" + "github.com/basenana/nanafs/pkg/types" + "io" + "os" +) + +const ( + textParser = "text" +) + +type Text struct { + docPath string +} + +func NewText(docPath string, option map[string]string) Parser { return Text{docPath: docPath} } + +func (l Text) Load(_ context.Context) ([]types.FDocument, error) { + f, err := os.Open(l.docPath) + if err != nil { + return nil, err + } + defer f.Close() + + buf := new(bytes.Buffer) + _, err = io.Copy(buf, f) + if err != nil { + return nil, err + } + + return []types.FDocument{ + { + Content: buf.String(), + Metadata: map[string]any{}, + }, + }, nil +} diff --git a/pkg/plugin/buildin/rss.go b/pkg/plugin/buildin/rss.go index 3cd9d957..130329ab 100644 --- a/pkg/plugin/buildin/rss.go +++ b/pkg/plugin/buildin/rss.go @@ -19,7 +19,7 @@ package buildin import ( "context" "github.com/basenana/nanafs/pkg/metastore" - "github.com/basenana/nanafs/pkg/plugin/stub" + "github.com/basenana/nanafs/pkg/plugin/pluginapi" "github.com/basenana/nanafs/pkg/types" "go.uber.org/zap" ) @@ -76,7 +76,7 @@ func (r *RssSourcePlugin) listRssSources(ctx context.Context) ([]rssSource, erro return result, nil } -func (r *RssSourcePlugin) Run(ctx context.Context, request *stub.Request, params map[string]string) (*stub.Response, error) { +func (r *RssSourcePlugin) Run(ctx context.Context, request *pluginapi.Request, pluginParams map[string]string) (*pluginapi.Response, error) { rssSourceList, err := r.listRssSources(ctx) if err != nil { r.logger.Errorw("list rss source failed", "err", err) @@ -85,10 +85,10 @@ func (r *RssSourcePlugin) Run(ctx context.Context, request *stub.Request, params for i := range rssSourceList { source := rssSourceList[i] - r.syncRssSource(ctx, source, params) + r.syncRssSource(ctx, source, pluginParams) } - resp := &stub.Response{ + resp := &pluginapi.Response{ IsSucceed: true, } return resp, nil diff --git a/pkg/plugin/mirror.go b/pkg/plugin/mirror.go index c34fb143..3e22674b 100644 --- a/pkg/plugin/mirror.go +++ b/pkg/plugin/mirror.go @@ -19,7 +19,7 @@ package plugin import ( "context" "fmt" - "github.com/basenana/nanafs/pkg/plugin/stub" + "github.com/basenana/nanafs/pkg/plugin/pluginapi" "github.com/basenana/nanafs/pkg/types" "github.com/basenana/nanafs/utils" "io" @@ -31,11 +31,11 @@ type MirrorPlugin interface { Plugin IsGroup(ctx context.Context) (bool, error) - FindEntry(ctx context.Context, name string) (*stub.Entry, error) - CreateEntry(ctx context.Context, attr stub.EntryAttr) (*stub.Entry, error) - UpdateEntry(ctx context.Context, en *stub.Entry) error - RemoveEntry(ctx context.Context, en *stub.Entry) error - ListChildren(ctx context.Context) ([]*stub.Entry, error) + FindEntry(ctx context.Context, name string) (*pluginapi.Entry, error) + CreateEntry(ctx context.Context, attr pluginapi.EntryAttr) (*pluginapi.Entry, error) + UpdateEntry(ctx context.Context, en *pluginapi.Entry) error + RemoveEntry(ctx context.Context, en *pluginapi.Entry) error + ListChildren(ctx context.Context) ([]*pluginapi.Entry, error) WriteAt(ctx context.Context, data []byte, off int64) (int64, error) ReadAt(ctx context.Context, dest []byte, off int64) (int64, error) @@ -104,23 +104,23 @@ func (d *MemFSPlugin) IsGroup(ctx context.Context) (bool, error) { return en.IsGroup, nil } -func (d *MemFSPlugin) FindEntry(ctx context.Context, name string) (*stub.Entry, error) { +func (d *MemFSPlugin) FindEntry(ctx context.Context, name string) (*pluginapi.Entry, error) { return d.fs.FindEntry(d.path, name) } -func (d *MemFSPlugin) CreateEntry(ctx context.Context, attr stub.EntryAttr) (*stub.Entry, error) { +func (d *MemFSPlugin) CreateEntry(ctx context.Context, attr pluginapi.EntryAttr) (*pluginapi.Entry, error) { return d.fs.CreateEntry(d.path, attr) } -func (d *MemFSPlugin) UpdateEntry(ctx context.Context, en *stub.Entry) error { +func (d *MemFSPlugin) UpdateEntry(ctx context.Context, en *pluginapi.Entry) error { return d.fs.UpdateEntry(d.path, en) } -func (d *MemFSPlugin) RemoveEntry(ctx context.Context, en *stub.Entry) error { +func (d *MemFSPlugin) RemoveEntry(ctx context.Context, en *pluginapi.Entry) error { return d.fs.RemoveEntry(d.path, en) } -func (d *MemFSPlugin) ListChildren(ctx context.Context) ([]*stub.Entry, error) { +func (d *MemFSPlugin) ListChildren(ctx context.Context) ([]*pluginapi.Entry, error) { return d.fs.ListChildren(d.path) } @@ -145,13 +145,13 @@ func (d *MemFSPlugin) Close(ctx context.Context) error { } type MemFS struct { - entries map[string]*stub.Entry + entries map[string]*pluginapi.Entry files map[string]*memFile groups map[string][]string mux sync.Mutex } -func (m *MemFS) GetEntry(enPath string) (*stub.Entry, error) { +func (m *MemFS) GetEntry(enPath string) (*pluginapi.Entry, error) { m.mux.Lock() defer m.mux.Unlock() @@ -162,7 +162,7 @@ func (m *MemFS) GetEntry(enPath string) (*stub.Entry, error) { return en, nil } -func (m *MemFS) FindEntry(parentPath string, name string) (*stub.Entry, error) { +func (m *MemFS) FindEntry(parentPath string, name string) (*pluginapi.Entry, error) { m.mux.Lock() defer m.mux.Unlock() @@ -173,7 +173,7 @@ func (m *MemFS) FindEntry(parentPath string, name string) (*stub.Entry, error) { return en, nil } -func (m *MemFS) CreateEntry(parentPath string, attr stub.EntryAttr) (*stub.Entry, error) { +func (m *MemFS) CreateEntry(parentPath string, attr pluginapi.EntryAttr) (*pluginapi.Entry, error) { m.mux.Lock() defer m.mux.Unlock() @@ -195,7 +195,7 @@ func (m *MemFS) CreateEntry(parentPath string, attr stub.EntryAttr) (*stub.Entry child = append(child, attr.Name) m.groups[parentPath] = child - en := &stub.Entry{ + en := &pluginapi.Entry{ Name: attr.Name, Kind: attr.Kind, IsGroup: types.IsGroup(attr.Kind), @@ -210,7 +210,7 @@ func (m *MemFS) CreateEntry(parentPath string, attr stub.EntryAttr) (*stub.Entry return en, nil } -func (m *MemFS) UpdateEntry(parentPath string, en *stub.Entry) error { +func (m *MemFS) UpdateEntry(parentPath string, en *pluginapi.Entry) error { m.mux.Lock() defer m.mux.Unlock() @@ -225,7 +225,7 @@ func (m *MemFS) UpdateEntry(parentPath string, en *stub.Entry) error { return nil } -func (m *MemFS) RemoveEntry(parentPath string, en *stub.Entry) error { +func (m *MemFS) RemoveEntry(parentPath string, en *pluginapi.Entry) error { m.mux.Lock() defer m.mux.Unlock() @@ -262,7 +262,7 @@ func (m *MemFS) RemoveEntry(parentPath string, en *stub.Entry) error { return nil } -func (m *MemFS) ListChildren(enPath string) ([]*stub.Entry, error) { +func (m *MemFS) ListChildren(enPath string) ([]*pluginapi.Entry, error) { m.mux.Lock() defer m.mux.Unlock() @@ -276,7 +276,7 @@ func (m *MemFS) ListChildren(enPath string) ([]*stub.Entry, error) { } childNames := m.groups[enPath] - result := make([]*stub.Entry, len(childNames)) + result := make([]*pluginapi.Entry, len(childNames)) for i, chName := range childNames { result[i] = m.entries[path.Join(enPath, chName)] } @@ -322,7 +322,7 @@ func (m *MemFS) Trunc(filePath string) error { func NewMemFS() *MemFS { fs := &MemFS{ - entries: map[string]*stub.Entry{"/": {Name: ".", Kind: types.ExternalGroupKind, Size: 0, IsGroup: true}}, + entries: map[string]*pluginapi.Entry{"/": {Name: ".", Kind: types.ExternalGroupKind, Size: 0, IsGroup: true}}, groups: map[string][]string{"/": {}}, files: map[string]*memFile{}, } @@ -334,11 +334,11 @@ const ( ) type memFile struct { - *stub.Entry + *pluginapi.Entry data []byte } -func newMemFile(entry *stub.Entry) *memFile { +func newMemFile(entry *pluginapi.Entry) *memFile { return &memFile{ Entry: entry, data: utils.NewMemoryBlock(memFileMaxSize / 16), // 1M diff --git a/pkg/plugin/pluginapi/consts.go b/pkg/plugin/pluginapi/consts.go new file mode 100644 index 00000000..4b94c6ad --- /dev/null +++ b/pkg/plugin/pluginapi/consts.go @@ -0,0 +1,28 @@ +/* + Copyright 2023 NanaFS Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package pluginapi + +const ( + ResEntryIdKey = "nanafs.workflow.entry_id" + ResEntryPathKey = "nanafs.workflow.entry_path" + ResEntryDocumentsKey = "nanafs.workflow.entry_documents" + ResCollectManifest = "nanafs.workflow.collect_manifest" + ResPluginName = "nanafs.workflow.plugin_name" + ResPluginVersion = "nanafs.workflow.plugin_version" + ResPluginType = "nanafs.workflow.plugin_type" + ResPluginAction = "nanafs.workflow.plugin_action" +) diff --git a/pkg/plugin/stub/entry.go b/pkg/plugin/pluginapi/entry.go similarity index 97% rename from pkg/plugin/stub/entry.go rename to pkg/plugin/pluginapi/entry.go index 1adc5bee..709d11bb 100644 --- a/pkg/plugin/stub/entry.go +++ b/pkg/plugin/pluginapi/entry.go @@ -14,7 +14,7 @@ limitations under the License. */ -package stub +package pluginapi import ( "github.com/basenana/nanafs/pkg/types" diff --git a/pkg/plugin/stub/process.go b/pkg/plugin/pluginapi/process.go similarity index 74% rename from pkg/plugin/stub/process.go rename to pkg/plugin/pluginapi/process.go index 4649d2da..70ac2f4b 100644 --- a/pkg/plugin/stub/process.go +++ b/pkg/plugin/pluginapi/process.go @@ -14,14 +14,14 @@ limitations under the License. */ -package stub +package pluginapi type Request struct { Action string WorkPath string EntryId int64 EntryPath string - Parameter map[string]string + Parameter map[string]any } func NewRequest() *Request { @@ -31,9 +31,17 @@ func NewRequest() *Request { type Response struct { IsSucceed bool Message string - Entries []Entry + Results map[string]any } func NewResponse() *Response { return &Response{} } + +func NewFailedResponse(msg string) *Response { + return &Response{IsSucceed: false, Message: msg} +} + +func NewResponseWithResult(result map[string]any) *Response { + return &Response{IsSucceed: true, Results: result} +} diff --git a/pkg/plugin/process.go b/pkg/plugin/process.go index 25c4290f..d99e6e69 100644 --- a/pkg/plugin/process.go +++ b/pkg/plugin/process.go @@ -19,17 +19,24 @@ package plugin import ( "context" "fmt" - "github.com/basenana/nanafs/pkg/plugin/stub" + "github.com/basenana/nanafs/pkg/plugin/buildin/docloader" + "github.com/basenana/nanafs/pkg/plugin/pluginapi" "github.com/basenana/nanafs/pkg/types" + "github.com/basenana/nanafs/utils" "time" ) type ProcessPlugin interface { Plugin - Run(ctx context.Context, request *stub.Request, params map[string]string) (*stub.Response, error) + Run(ctx context.Context, request *pluginapi.Request, pluginParams map[string]string) (*pluginapi.Response, error) } -func Call(ctx context.Context, ps types.PlugScope, req *stub.Request) (resp *stub.Response, err error) { +func Call(ctx context.Context, ps types.PlugScope, req *pluginapi.Request) (resp *pluginapi.Response, err error) { + defer func() { + if rErr := utils.Recover(); rErr != nil { + err = rErr + } + }() var plugin Plugin plugin, err = BuildPlugin(ctx, ps) if err != nil { @@ -64,7 +71,7 @@ func (d *DelayProcessPlugin) Version() string { return delayPluginVersion } -func (d *DelayProcessPlugin) Run(ctx context.Context, request *stub.Request, params map[string]string) (*stub.Response, error) { +func (d *DelayProcessPlugin) Run(ctx context.Context, request *pluginapi.Request, pluginParams map[string]string) (*pluginapi.Response, error) { var ( until time.Time nowTime = time.Now() @@ -72,7 +79,7 @@ func (d *DelayProcessPlugin) Run(ctx context.Context, request *stub.Request, par switch request.Action { case "delay": - delayDurationStr := params["delay"] + delayDurationStr := pluginParams["delay"] duration, err := time.ParseDuration(delayDurationStr) if err != nil { return nil, fmt.Errorf("parse delay duration [%s] failed: %s", delayDurationStr, err) @@ -81,14 +88,14 @@ func (d *DelayProcessPlugin) Run(ctx context.Context, request *stub.Request, par case "until": var err error - untilStr := params["until"] + untilStr := pluginParams["until"] until, err = time.Parse(untilStr, time.RFC3339) if err != nil { return nil, fmt.Errorf("parse delay until [%s] failed: %s", untilStr, err) } default: - resp := stub.NewResponse() + resp := pluginapi.NewResponse() resp.Message = fmt.Sprintf("unknown action: %s", request.Action) return resp, nil } @@ -98,16 +105,16 @@ func (d *DelayProcessPlugin) Run(ctx context.Context, request *stub.Request, par defer timer.Stop() select { case <-timer.C: - return &stub.Response{IsSucceed: true}, nil + return &pluginapi.Response{IsSucceed: true}, nil case <-ctx.Done(): - return &stub.Response{IsSucceed: false, Message: ctx.Err().Error()}, nil + return &pluginapi.Response{IsSucceed: false, Message: ctx.Err().Error()}, nil } } - return &stub.Response{IsSucceed: true}, nil + return &pluginapi.Response{IsSucceed: true}, nil } -func registerDelayPlugin(r *registry) { +func registerBuildInProcessPlugin(r *registry) { r.Register( delayPluginName, types.PluginSpec{Name: delayPluginName, Version: delayPluginVersion, Type: types.TypeProcess, Parameters: map[string]string{}}, @@ -115,4 +122,12 @@ func registerDelayPlugin(r *registry) { return &DelayProcessPlugin{}, nil }, ) + + r.Register( + docloader.PluginName, + types.PluginSpec{Name: docloader.PluginName, Version: docloader.PluginVersion, Type: types.TypeProcess, Parameters: map[string]string{}}, + func(ctx context.Context, spec types.PluginSpec, scope types.PlugScope) (Plugin, error) { + return &docloader.DocLoader{}, nil + }, + ) } diff --git a/pkg/plugin/registry.go b/pkg/plugin/registry.go index d4945a1d..d26b93d6 100644 --- a/pkg/plugin/registry.go +++ b/pkg/plugin/registry.go @@ -70,7 +70,7 @@ func Init(cfg *config.Plugin, recorderGetter metastore.PluginRecorderGetter) err } // register build-in plugins - registerDelayPlugin(r) + registerBuildInProcessPlugin(r) registerMemfsPlugin(r) register3BodyPlugin(r) diff --git a/pkg/plugin/source.go b/pkg/plugin/source.go index 8cc327c4..f1565e8a 100644 --- a/pkg/plugin/source.go +++ b/pkg/plugin/source.go @@ -20,7 +20,7 @@ import ( "bytes" "context" "fmt" - "github.com/basenana/nanafs/pkg/plugin/stub" + "github.com/basenana/nanafs/pkg/plugin/pluginapi" "github.com/basenana/nanafs/pkg/types" "io" "io/ioutil" @@ -31,8 +31,8 @@ import ( type SourcePlugin interface { Plugin - Fresh(ctx context.Context, opt stub.FreshOption) ([]*stub.Entry, error) - Open(ctx context.Context, entry *stub.Entry) (io.ReadCloser, error) + Fresh(ctx context.Context, opt pluginapi.FreshOption) ([]*pluginapi.Entry, error) + Open(ctx context.Context, entry *pluginapi.Entry) (io.ReadCloser, error) } const ( @@ -56,14 +56,14 @@ func (d *ThreeBodyPlugin) Version() string { return the3BodyPluginVersion } -func (d *ThreeBodyPlugin) Fresh(ctx context.Context, opt stub.FreshOption) ([]*stub.Entry, error) { +func (d *ThreeBodyPlugin) Fresh(ctx context.Context, opt pluginapi.FreshOption) ([]*pluginapi.Entry, error) { crtAt := time.Now().Unix() - result := make([]*stub.Entry, 0) + result := make([]*pluginapi.Entry, 0) for i := crtAt - 60; i < crtAt; i += 60 { if i <= opt.LastFreshAt.Unix() { continue } - result = append(result, &stub.Entry{ + result = append(result, &pluginapi.Entry{ Name: fmt.Sprintf("3_body_%d.txt", i), Kind: types.RawKind, IsGroup: false, @@ -72,7 +72,7 @@ func (d *ThreeBodyPlugin) Fresh(ctx context.Context, opt stub.FreshOption) ([]*s return result, nil } -func (d *ThreeBodyPlugin) Open(ctx context.Context, entry *stub.Entry) (io.ReadCloser, error) { +func (d *ThreeBodyPlugin) Open(ctx context.Context, entry *pluginapi.Entry) (io.ReadCloser, error) { fileNameParts := strings.Split(entry.Name, "_") sendAtStr := fileNameParts[len(fileNameParts)-1] diff --git a/pkg/types/friday.go b/pkg/types/friday.go new file mode 100644 index 00000000..7094a55f --- /dev/null +++ b/pkg/types/friday.go @@ -0,0 +1,22 @@ +/* + Copyright 2023 NanaFS Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package types + +type FDocument struct { + Content string + Metadata map[string]any +} diff --git a/pkg/workflow/exec/consts.go b/pkg/workflow/exec/consts.go index 414b2ae6..2b01ad0f 100644 --- a/pkg/workflow/exec/consts.go +++ b/pkg/workflow/exec/consts.go @@ -21,12 +21,3 @@ const ( OpEntryCollect = "entryCollect" OpPluginCall = "pluginCall" ) - -const ( - paramEntryIdKey = "nanafs.workflow.entry_id" - paramEntryPathKey = "nanafs.workflow.entry_path" - paramPluginName = "nanafs.workflow.plugin_name" - paramPluginVersion = "nanafs.workflow.plugin_version" - paramPluginType = "nanafs.workflow.plugin_type" - paramPluginAction = "nanafs.workflow.plugin_action" -) diff --git a/pkg/workflow/exec/executor.go b/pkg/workflow/exec/executor.go index a75a2877..608fca07 100644 --- a/pkg/workflow/exec/executor.go +++ b/pkg/workflow/exec/executor.go @@ -22,11 +22,14 @@ import ( "github.com/basenana/nanafs/config" "github.com/basenana/nanafs/pkg/dentry" "github.com/basenana/nanafs/pkg/plugin" - "github.com/basenana/nanafs/pkg/plugin/stub" + "github.com/basenana/nanafs/pkg/plugin/pluginapi" "github.com/basenana/nanafs/pkg/types" "github.com/basenana/nanafs/pkg/workflow/jobrun" "github.com/basenana/nanafs/utils/logger" "go.uber.org/zap" + "os" + "path" + "sync" ) const ( @@ -36,8 +39,11 @@ const ( func RegisterOperators(entryMgr dentry.Manager, cfg LocalConfig) error { jobrun.RegisterExecutorBuilder(localExecName, func(job *types.WorkflowJob) jobrun.Executor { return &localExecutor{ - job: job, entryMgr: entryMgr, config: cfg, - logger: logger.NewLogger("localExecutor").With(zap.String("job", job.Id)), + job: job, + entryMgr: entryMgr, + config: cfg, + results: map[string]any{}, + logger: logger.NewLogger("localExecutor").With(zap.String("job", job.Id)), } }) return nil @@ -49,6 +55,8 @@ type localExecutor struct { entryPath string entryMgr dentry.Manager config LocalConfig + results map[string]any + resultMux sync.Mutex logger *zap.SugaredLogger } @@ -72,17 +80,30 @@ func (b *localExecutor) Setup(ctx context.Context) (err error) { return } b.logger.Infow("job setup", "workdir", b.workdir, "entryPath", b.entryPath) + return } func (b *localExecutor) DoOperation(ctx context.Context, step types.WorkflowJobStep) error { - req := stub.NewRequest() + req := pluginapi.NewRequest() req.WorkPath = b.workdir req.EntryId = b.job.Target.EntryID req.EntryPath = b.entryPath + req.Parameter = map[string]any{} + b.resultMux.Lock() + for k, v := range b.results { + req.Parameter[k] = v + } + b.resultMux.Unlock() + req.Parameter[pluginapi.ResEntryIdKey] = b.job.Target.EntryID + req.Parameter[pluginapi.ResEntryPathKey] = b.entryPath + req.Parameter[pluginapi.ResPluginName] = step.Plugin.PluginName + req.Parameter[pluginapi.ResPluginVersion] = step.Plugin.Version + req.Parameter[pluginapi.ResPluginType] = step.Plugin.PluginType + req.Parameter[pluginapi.ResPluginAction] = step.Plugin.Action + req.Action = step.Plugin.PluginName - req.Parameter = step.Plugin.Parameters resp, err := plugin.Call(ctx, *step.Plugin, req) if err != nil { return fmt.Errorf("plugin action error: %s", err) @@ -90,12 +111,29 @@ func (b *localExecutor) DoOperation(ctx context.Context, step types.WorkflowJobS if !resp.IsSucceed { return fmt.Errorf("plugin action failed: %s", resp.Message) } + if len(resp.Results) > 0 { + b.resultMux.Lock() + for k, v := range resp.Results { + b.results[k] = v + } + b.resultMux.Unlock() + } return nil } func (b *localExecutor) Collect(ctx context.Context) error { - //TODO implement me - panic("implement me") + b.resultMux.Lock() + filename, needCollect := b.results[pluginapi.ResCollectManifest] + b.resultMux.Unlock() + if !needCollect { + return nil + } + f, err := os.Open(path.Join(b.workdir, filename.(string))) + if err != nil { + return fmt.Errorf("read collect manifest file failed: %s", err) + } + defer f.Close() + return nil } func (b *localExecutor) Teardown(ctx context.Context) { diff --git a/pkg/workflow/mirrordir.go b/pkg/workflow/mirrordir.go index 5345532e..0a93a560 100644 --- a/pkg/workflow/mirrordir.go +++ b/pkg/workflow/mirrordir.go @@ -21,7 +21,7 @@ import ( "fmt" "github.com/basenana/nanafs/pkg/dentry" "github.com/basenana/nanafs/pkg/plugin" - "github.com/basenana/nanafs/pkg/plugin/stub" + "github.com/basenana/nanafs/pkg/plugin/pluginapi" "github.com/basenana/nanafs/pkg/types" "github.com/basenana/nanafs/pkg/workflow/jobrun" "github.com/basenana/nanafs/utils" @@ -121,7 +121,7 @@ func (d *dirHandler) IsGroup(ctx context.Context) (bool, error) { return en.IsGroup, nil } -func (d *dirHandler) FindEntry(ctx context.Context, name string) (*stub.Entry, error) { +func (d *dirHandler) FindEntry(ctx context.Context, name string) (*pluginapi.Entry, error) { if d == nil { return nil, types.ErrNoGroup } @@ -138,7 +138,7 @@ func (d *dirHandler) FindEntry(ctx context.Context, name string) (*stub.Entry, e if d.dirKind == MirrorDirRoot { switch name { case MirrorDirWorkflows, MirrorDirJobs: - return d.plugin.fs.CreateEntry(d.plugin.path, stub.EntryAttr{Name: name, Kind: types.ExternalGroupKind}) + return d.plugin.fs.CreateEntry(d.plugin.path, pluginapi.EntryAttr{Name: name, Kind: types.ExternalGroupKind}) default: return nil, types.ErrNotFound } @@ -149,7 +149,7 @@ func (d *dirHandler) FindEntry(ctx context.Context, name string) (*stub.Entry, e if err != nil { return nil, err } - return d.plugin.fs.CreateEntry(d.plugin.path, stub.EntryAttr{Name: name, Kind: types.RawKind}) + return d.plugin.fs.CreateEntry(d.plugin.path, pluginapi.EntryAttr{Name: name, Kind: types.RawKind}) } if d.dirKind == MirrorDirJobs { @@ -158,7 +158,7 @@ func (d *dirHandler) FindEntry(ctx context.Context, name string) (*stub.Entry, e if err != nil { return nil, err } - return d.plugin.fs.CreateEntry(d.plugin.path, stub.EntryAttr{Name: name, Kind: types.ExternalGroupKind}) + return d.plugin.fs.CreateEntry(d.plugin.path, pluginapi.EntryAttr{Name: name, Kind: types.ExternalGroupKind}) } else { jobs, err := d.ListChildren(ctx) if err != nil { @@ -174,7 +174,7 @@ func (d *dirHandler) FindEntry(ctx context.Context, name string) (*stub.Entry, e return nil, types.ErrNotFound } -func (d *dirHandler) CreateEntry(ctx context.Context, attr stub.EntryAttr) (*stub.Entry, error) { +func (d *dirHandler) CreateEntry(ctx context.Context, attr pluginapi.EntryAttr) (*pluginapi.Entry, error) { if d.dirKind == MirrorDirRoot { return nil, types.ErrNoAccess } @@ -195,11 +195,11 @@ func (d *dirHandler) CreateEntry(ctx context.Context, attr stub.EntryAttr) (*stu return en, nil } -func (d *dirHandler) UpdateEntry(ctx context.Context, en *stub.Entry) error { +func (d *dirHandler) UpdateEntry(ctx context.Context, en *pluginapi.Entry) error { return d.plugin.fs.UpdateEntry(d.plugin.path, en) } -func (d *dirHandler) RemoveEntry(ctx context.Context, en *stub.Entry) error { +func (d *dirHandler) RemoveEntry(ctx context.Context, en *pluginapi.Entry) error { if d == nil { return types.ErrNoGroup } @@ -227,7 +227,7 @@ func (d *dirHandler) RemoveEntry(ctx context.Context, en *stub.Entry) error { return nil } -func (d *dirHandler) ListChildren(ctx context.Context) ([]*stub.Entry, error) { +func (d *dirHandler) ListChildren(ctx context.Context) ([]*pluginapi.Entry, error) { if d == nil { return nil, types.ErrNoGroup } @@ -236,7 +236,7 @@ func (d *dirHandler) ListChildren(ctx context.Context) ([]*stub.Entry, error) { return nil, err } - children := make([]*stub.Entry, 0) + children := make([]*pluginapi.Entry, 0) cachedChildMap := make(map[string]struct{}) for i, ch := range cachedChild { cachedChildMap[ch.Name] = struct{}{} @@ -247,7 +247,7 @@ func (d *dirHandler) ListChildren(ctx context.Context) ([]*stub.Entry, error) { case d.dirKind == MirrorDirRoot: if _, ok := cachedChildMap[MirrorDirJobs]; !ok { - child, err := d.plugin.fs.CreateEntry(d.plugin.path, stub.EntryAttr{Name: MirrorDirJobs, Kind: types.ExternalGroupKind}) + child, err := d.plugin.fs.CreateEntry(d.plugin.path, pluginapi.EntryAttr{Name: MirrorDirJobs, Kind: types.ExternalGroupKind}) if err != nil { wfLogger.Errorf("init mirror dir %s error: %s", MirrorDirJobs, err) return nil, err @@ -256,7 +256,7 @@ func (d *dirHandler) ListChildren(ctx context.Context) ([]*stub.Entry, error) { } if _, ok := cachedChildMap[MirrorDirWorkflows]; !ok { - child, err := d.plugin.fs.CreateEntry(d.plugin.path, stub.EntryAttr{Name: MirrorDirWorkflows, Kind: types.ExternalGroupKind}) + child, err := d.plugin.fs.CreateEntry(d.plugin.path, pluginapi.EntryAttr{Name: MirrorDirWorkflows, Kind: types.ExternalGroupKind}) if err != nil { wfLogger.Errorf("init mirror dir %s error: %s", MirrorDirWorkflows, err) return nil, err @@ -271,7 +271,7 @@ func (d *dirHandler) ListChildren(ctx context.Context) ([]*stub.Entry, error) { } for _, wf := range wfList { if _, ok := cachedChildMap[id2MirrorFile(wf.Id)]; !ok { - child, err := d.plugin.fs.CreateEntry(d.plugin.path, stub.EntryAttr{Name: id2MirrorFile(wf.Id), Kind: types.RawKind}) + child, err := d.plugin.fs.CreateEntry(d.plugin.path, pluginapi.EntryAttr{Name: id2MirrorFile(wf.Id), Kind: types.RawKind}) if err != nil { wfLogger.Errorf("init mirror workflow file %s error: %s", id2MirrorFile(wf.Id), err) return nil, err @@ -286,7 +286,7 @@ func (d *dirHandler) ListChildren(ctx context.Context) ([]*stub.Entry, error) { } for _, wf := range wfList { if _, ok := cachedChildMap[wf.Id]; !ok { - child, err := d.plugin.fs.CreateEntry(d.plugin.path, stub.EntryAttr{Name: wf.Id, Kind: types.ExternalGroupKind}) + child, err := d.plugin.fs.CreateEntry(d.plugin.path, pluginapi.EntryAttr{Name: wf.Id, Kind: types.ExternalGroupKind}) if err != nil { wfLogger.Errorf("init mirror jobs workflow group %s error: %s", wf.Id, err) return nil, err @@ -301,7 +301,7 @@ func (d *dirHandler) ListChildren(ctx context.Context) ([]*stub.Entry, error) { } for _, j := range jobList { if _, ok := cachedChildMap[id2MirrorFile(j.Id)]; !ok { - child, err := d.plugin.fs.CreateEntry(d.plugin.path, stub.EntryAttr{Name: id2MirrorFile(j.Id), Kind: types.RawKind}) + child, err := d.plugin.fs.CreateEntry(d.plugin.path, pluginapi.EntryAttr{Name: id2MirrorFile(j.Id), Kind: types.RawKind}) if err != nil { wfLogger.Errorf("init mirror job file %s error: %s", id2MirrorFile(j.Id), err) return nil, err @@ -368,7 +368,7 @@ func (f *fileHandler) Close(ctx context.Context) error { return nil } -func (f *fileHandler) createOrUpdateWorkflow(ctx context.Context, en *stub.Entry) error { +func (f *fileHandler) createOrUpdateWorkflow(ctx context.Context, en *pluginapi.Entry) error { wf := &types.WorkflowSpec{} decodeErr := yaml.NewDecoder(&memfsFile{filePath: f.plugin.path, entry: en, memfs: f.plugin.fs}).Decode(wf) if decodeErr != nil { @@ -413,7 +413,7 @@ func (f *fileHandler) createOrUpdateWorkflow(ctx context.Context, en *stub.Entry return nil } -func (f *fileHandler) triggerOrUpdateWorkflowJob(ctx context.Context, en *stub.Entry) error { +func (f *fileHandler) triggerOrUpdateWorkflowJob(ctx context.Context, en *pluginapi.Entry) error { wfJob := &types.WorkflowJob{} decodeErr := yaml.NewDecoder(&memfsFile{filePath: f.plugin.path, entry: en, memfs: f.plugin.fs}).Decode(wfJob) if decodeErr != nil { @@ -469,7 +469,7 @@ func (f *fileHandler) triggerOrUpdateWorkflowJob(ctx context.Context, en *stub.E type memfsFile struct { filePath string - entry *stub.Entry + entry *pluginapi.Entry memfs *plugin.MemFS off int64 } @@ -490,12 +490,12 @@ func buildWorkflowMirrorPlugin(mgr Manager) plugin.Builder { mp := &MirrorPlugin{path: "/", fs: plugin.NewMemFS(), mgr: mgr} mp.dirHandler = &dirHandler{plugin: mp, dirKind: MirrorDirRoot} - _, _ = mp.fs.CreateEntry("/", stub.EntryAttr{ + _, _ = mp.fs.CreateEntry("/", pluginapi.EntryAttr{ Name: MirrorDirJobs, Kind: types.ExternalGroupKind, }) - _, _ = mp.fs.CreateEntry("/", stub.EntryAttr{ + _, _ = mp.fs.CreateEntry("/", pluginapi.EntryAttr{ Name: MirrorDirWorkflows, Kind: types.ExternalGroupKind, }) diff --git a/vendor/github.com/PuerkitoBio/goquery/.gitattributes b/vendor/github.com/PuerkitoBio/goquery/.gitattributes new file mode 100644 index 00000000..0cc26ec0 --- /dev/null +++ b/vendor/github.com/PuerkitoBio/goquery/.gitattributes @@ -0,0 +1 @@ +testdata/* linguist-vendored diff --git a/vendor/github.com/PuerkitoBio/goquery/.gitignore b/vendor/github.com/PuerkitoBio/goquery/.gitignore new file mode 100644 index 00000000..970381cd --- /dev/null +++ b/vendor/github.com/PuerkitoBio/goquery/.gitignore @@ -0,0 +1,16 @@ +# editor temporary files +*.sublime-* +.DS_Store +*.swp +#*.*# +tags + +# direnv config +.env* + +# test binaries +*.test + +# coverage and profilte outputs +*.out + diff --git a/vendor/github.com/PuerkitoBio/goquery/LICENSE b/vendor/github.com/PuerkitoBio/goquery/LICENSE new file mode 100644 index 00000000..25372c2b --- /dev/null +++ b/vendor/github.com/PuerkitoBio/goquery/LICENSE @@ -0,0 +1,12 @@ +Copyright (c) 2012-2021, Martin Angers & Contributors +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +* Neither the name of the author nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/PuerkitoBio/goquery/README.md b/vendor/github.com/PuerkitoBio/goquery/README.md new file mode 100644 index 00000000..582ccac9 --- /dev/null +++ b/vendor/github.com/PuerkitoBio/goquery/README.md @@ -0,0 +1,198 @@ +# goquery - a little like that j-thing, only in Go + +[![Build Status](https://github.com/PuerkitoBio/goquery/actions/workflows/test.yml/badge.svg?branch=master)](https://github.com/PuerkitoBio/goquery/actions) +[![Go Reference](https://pkg.go.dev/badge/github.com/PuerkitoBio/goquery.svg)](https://pkg.go.dev/github.com/PuerkitoBio/goquery) +[![Sourcegraph Badge](https://sourcegraph.com/github.com/PuerkitoBio/goquery/-/badge.svg)](https://sourcegraph.com/github.com/PuerkitoBio/goquery?badge) + +goquery brings a syntax and a set of features similar to [jQuery][] to the [Go language][go]. It is based on Go's [net/html package][html] and the CSS Selector library [cascadia][]. Since the net/html parser returns nodes, and not a full-featured DOM tree, jQuery's stateful manipulation functions (like height(), css(), detach()) have been left off. + +Also, because the net/html parser requires UTF-8 encoding, so does goquery: it is the caller's responsibility to ensure that the source document provides UTF-8 encoded HTML. See the [wiki][] for various options to do this. + +Syntax-wise, it is as close as possible to jQuery, with the same function names when possible, and that warm and fuzzy chainable interface. jQuery being the ultra-popular library that it is, I felt that writing a similar HTML-manipulating library was better to follow its API than to start anew (in the same spirit as Go's `fmt` package), even though some of its methods are less than intuitive (looking at you, [index()][index]...). + +## Table of Contents + +* [Installation](#installation) +* [Changelog](#changelog) +* [API](#api) +* [Examples](#examples) +* [Related Projects](#related-projects) +* [Support](#support) +* [License](#license) + +## Installation + +Please note that because of the net/html dependency, goquery requires Go1.1+ and is tested on Go1.7+. + + $ go get github.com/PuerkitoBio/goquery + +(optional) To run unit tests: + + $ cd $GOPATH/src/github.com/PuerkitoBio/goquery + $ go test + +(optional) To run benchmarks (warning: it runs for a few minutes): + + $ cd $GOPATH/src/github.com/PuerkitoBio/goquery + $ go test -bench=".*" + +## Changelog + +**Note that goquery's API is now stable, and will not break.** + +* **2023-02-18 (v1.8.1)** : Update `go.mod` dependencies, update CI workflow. +* **2021-10-25 (v1.8.0)** : Add `Render` function to render a `Selection` to an `io.Writer` (thanks [@anthonygedeon](https://github.com/anthonygedeon)). +* **2021-07-11 (v1.7.1)** : Update go.mod dependencies and add dependabot config (thanks [@jauderho](https://github.com/jauderho)). +* **2021-06-14 (v1.7.0)** : Add `Single` and `SingleMatcher` functions to optimize first-match selection (thanks [@gdollardollar](https://github.com/gdollardollar)). +* **2021-01-11 (v1.6.1)** : Fix panic when calling `{Prepend,Append,Set}Html` on a `Selection` that contains non-Element nodes. +* **2020-10-08 (v1.6.0)** : Parse html in context of the container node for all functions that deal with html strings (`AfterHtml`, `AppendHtml`, etc.). Thanks to [@thiemok][thiemok] and [@davidjwilkins][djw] for their work on this. +* **2020-02-04 (v1.5.1)** : Update module dependencies. +* **2018-11-15 (v1.5.0)** : Go module support (thanks @Zaba505). +* **2018-06-07 (v1.4.1)** : Add `NewDocumentFromReader` examples. +* **2018-03-24 (v1.4.0)** : Deprecate `NewDocument(url)` and `NewDocumentFromResponse(response)`. +* **2018-01-28 (v1.3.0)** : Add `ToEnd` constant to `Slice` until the end of the selection (thanks to @davidjwilkins for raising the issue). +* **2018-01-11 (v1.2.0)** : Add `AddBack*` and deprecate `AndSelf` (thanks to @davidjwilkins). +* **2017-02-12 (v1.1.0)** : Add `SetHtml` and `SetText` (thanks to @glebtv). +* **2016-12-29 (v1.0.2)** : Optimize allocations for `Selection.Text` (thanks to @radovskyb). +* **2016-08-28 (v1.0.1)** : Optimize performance for large documents. +* **2016-07-27 (v1.0.0)** : Tag version 1.0.0. +* **2016-06-15** : Invalid selector strings internally compile to a `Matcher` implementation that never matches any node (instead of a panic). So for example, `doc.Find("~")` returns an empty `*Selection` object. +* **2016-02-02** : Add `NodeName` utility function similar to the DOM's `nodeName` property. It returns the tag name of the first element in a selection, and other relevant values of non-element nodes (see [doc][] for details). Add `OuterHtml` utility function similar to the DOM's `outerHTML` property (named `OuterHtml` in small caps for consistency with the existing `Html` method on the `Selection`). +* **2015-04-20** : Add `AttrOr` helper method to return the attribute's value or a default value if absent. Thanks to [piotrkowalczuk][piotr]. +* **2015-02-04** : Add more manipulation functions - Prepend* - thanks again to [Andrew Stone][thatguystone]. +* **2014-11-28** : Add more manipulation functions - ReplaceWith*, Wrap* and Unwrap - thanks again to [Andrew Stone][thatguystone]. +* **2014-11-07** : Add manipulation functions (thanks to [Andrew Stone][thatguystone]) and `*Matcher` functions, that receive compiled cascadia selectors instead of selector strings, thus avoiding potential panics thrown by goquery via `cascadia.MustCompile` calls. This results in better performance (selectors can be compiled once and reused) and more idiomatic error handling (you can handle cascadia's compilation errors, instead of recovering from panics, which had been bugging me for a long time). Note that the actual type expected is a `Matcher` interface, that `cascadia.Selector` implements. Other matcher implementations could be used. +* **2014-11-06** : Change import paths of net/html to golang.org/x/net/html (see https://groups.google.com/forum/#!topic/golang-nuts/eD8dh3T9yyA). Make sure to update your code to use the new import path too when you call goquery with `html.Node`s. +* **v0.3.2** : Add `NewDocumentFromReader()` (thanks jweir) which allows creating a goquery document from an io.Reader. +* **v0.3.1** : Add `NewDocumentFromResponse()` (thanks assassingj) which allows creating a goquery document from an http response. +* **v0.3.0** : Add `EachWithBreak()` which allows to break out of an `Each()` loop by returning false. This function was added instead of changing the existing `Each()` to avoid breaking compatibility. +* **v0.2.1** : Make go-getable, now that [go.net/html is Go1.0-compatible][gonet] (thanks to @matrixik for pointing this out). +* **v0.2.0** : Add support for negative indices in Slice(). **BREAKING CHANGE** `Document.Root` is removed, `Document` is now a `Selection` itself (a selection of one, the root element, just like `Document.Root` was before). Add jQuery's Closest() method. +* **v0.1.1** : Add benchmarks to use as baseline for refactorings, refactor Next...() and Prev...() methods to use the new html package's linked list features (Next/PrevSibling, FirstChild). Good performance boost (40+% in some cases). +* **v0.1.0** : Initial release. + +## API + +goquery exposes two structs, `Document` and `Selection`, and the `Matcher` interface. Unlike jQuery, which is loaded as part of a DOM document, and thus acts on its containing document, goquery doesn't know which HTML document to act upon. So it needs to be told, and that's what the `Document` type is for. It holds the root document node as the initial Selection value to manipulate. + +jQuery often has many variants for the same function (no argument, a selector string argument, a jQuery object argument, a DOM element argument, ...). Instead of exposing the same features in goquery as a single method with variadic empty interface arguments, statically-typed signatures are used following this naming convention: + +* When the jQuery equivalent can be called with no argument, it has the same name as jQuery for the no argument signature (e.g.: `Prev()`), and the version with a selector string argument is called `XxxFiltered()` (e.g.: `PrevFiltered()`) +* When the jQuery equivalent **requires** one argument, the same name as jQuery is used for the selector string version (e.g.: `Is()`) +* The signatures accepting a jQuery object as argument are defined in goquery as `XxxSelection()` and take a `*Selection` object as argument (e.g.: `FilterSelection()`) +* The signatures accepting a DOM element as argument in jQuery are defined in goquery as `XxxNodes()` and take a variadic argument of type `*html.Node` (e.g.: `FilterNodes()`) +* The signatures accepting a function as argument in jQuery are defined in goquery as `XxxFunction()` and take a function as argument (e.g.: `FilterFunction()`) +* The goquery methods that can be called with a selector string have a corresponding version that take a `Matcher` interface and are defined as `XxxMatcher()` (e.g.: `IsMatcher()`) + +Utility functions that are not in jQuery but are useful in Go are implemented as functions (that take a `*Selection` as parameter), to avoid a potential naming clash on the `*Selection`'s methods (reserved for jQuery-equivalent behaviour). + +The complete [package reference documentation can be found here][doc]. + +Please note that Cascadia's selectors do not necessarily match all supported selectors of jQuery (Sizzle). See the [cascadia project][cascadia] for details. Invalid selector strings compile to a `Matcher` that fails to match any node. Behaviour of the various functions that take a selector string as argument follows from that fact, e.g. (where `~` is an invalid selector string): + +* `Find("~")` returns an empty selection because the selector string doesn't match anything. +* `Add("~")` returns a new selection that holds the same nodes as the original selection, because it didn't add any node (selector string didn't match anything). +* `ParentsFiltered("~")` returns an empty selection because the selector string doesn't match anything. +* `ParentsUntil("~")` returns all parents of the selection because the selector string didn't match any element to stop before the top element. + +## Examples + +See some tips and tricks in the [wiki][]. + +Adapted from example_test.go: + +```Go +package main + +import ( + "fmt" + "log" + "net/http" + + "github.com/PuerkitoBio/goquery" +) + +func ExampleScrape() { + // Request the HTML page. + res, err := http.Get("http://metalsucks.net") + if err != nil { + log.Fatal(err) + } + defer res.Body.Close() + if res.StatusCode != 200 { + log.Fatalf("status code error: %d %s", res.StatusCode, res.Status) + } + + // Load the HTML document + doc, err := goquery.NewDocumentFromReader(res.Body) + if err != nil { + log.Fatal(err) + } + + // Find the review items + doc.Find(".left-content article .post-title").Each(func(i int, s *goquery.Selection) { + // For each item found, get the title + title := s.Find("a").Text() + fmt.Printf("Review %d: %s\n", i, title) + }) +} + +func main() { + ExampleScrape() +} +``` + +## Related Projects + +- [Goq][goq], an HTML deserialization and scraping library based on goquery and struct tags. +- [andybalholm/cascadia][cascadia], the CSS selector library used by goquery. +- [suntong/cascadia][cascadiacli], a command-line interface to the cascadia CSS selector library, useful to test selectors. +- [gocolly/colly](https://github.com/gocolly/colly), a lightning fast and elegant Scraping Framework +- [gnulnx/goperf](https://github.com/gnulnx/goperf), a website performance test tool that also fetches static assets. +- [MontFerret/ferret](https://github.com/MontFerret/ferret), declarative web scraping. +- [tacusci/berrycms](https://github.com/tacusci/berrycms), a modern simple to use CMS with easy to write plugins +- [Dataflow kit](https://github.com/slotix/dataflowkit), Web Scraping framework for Gophers. +- [Geziyor](https://github.com/geziyor/geziyor), a fast web crawling & scraping framework for Go. Supports JS rendering. +- [Pagser](https://github.com/foolin/pagser), a simple, easy, extensible, configurable HTML parser to struct based on goquery and struct tags. +- [stitcherd](https://github.com/vhodges/stitcherd), A server for doing server side includes using css selectors and DOM updates. +- [goskyr](https://github.com/jakopako/goskyr), an easily configurable command-line scraper written in Go. +- [goGetJS](https://github.com/davemolk/goGetJS), a tool for extracting, searching, and saving JavaScript files (with optional headless browser). + +## Support + +There are a number of ways you can support the project: + +* Use it, star it, build something with it, spread the word! + - If you do build something open-source or otherwise publicly-visible, let me know so I can add it to the [Related Projects](#related-projects) section! +* Raise issues to improve the project (note: doc typos and clarifications are issues too!) + - Please search existing issues before opening a new one - it may have already been adressed. +* Pull requests: please discuss new code in an issue first, unless the fix is really trivial. + - Make sure new code is tested. + - Be mindful of existing code - PRs that break existing code have a high probability of being declined, unless it fixes a serious issue. +* Sponsor the developer + - See the Github Sponsor button at the top of the repo on github + - or via BuyMeACoffee.com, below + +Buy Me A Coffee + +## License + +The [BSD 3-Clause license][bsd], the same as the [Go language][golic]. Cascadia's license is [here][caslic]. + +[jquery]: http://jquery.com/ +[go]: http://golang.org/ +[cascadia]: https://github.com/andybalholm/cascadia +[cascadiacli]: https://github.com/suntong/cascadia +[bsd]: http://opensource.org/licenses/BSD-3-Clause +[golic]: http://golang.org/LICENSE +[caslic]: https://github.com/andybalholm/cascadia/blob/master/LICENSE +[doc]: https://pkg.go.dev/github.com/PuerkitoBio/goquery +[index]: http://api.jquery.com/index/ +[gonet]: https://github.com/golang/net/ +[html]: https://pkg.go.dev/golang.org/x/net/html +[wiki]: https://github.com/PuerkitoBio/goquery/wiki/Tips-and-tricks +[thatguystone]: https://github.com/thatguystone +[piotr]: https://github.com/piotrkowalczuk +[goq]: https://github.com/andrewstuart/goq +[thiemok]: https://github.com/thiemok +[djw]: https://github.com/davidjwilkins diff --git a/vendor/github.com/PuerkitoBio/goquery/array.go b/vendor/github.com/PuerkitoBio/goquery/array.go new file mode 100644 index 00000000..1b1f6cbe --- /dev/null +++ b/vendor/github.com/PuerkitoBio/goquery/array.go @@ -0,0 +1,124 @@ +package goquery + +import ( + "golang.org/x/net/html" +) + +const ( + maxUint = ^uint(0) + maxInt = int(maxUint >> 1) + + // ToEnd is a special index value that can be used as end index in a call + // to Slice so that all elements are selected until the end of the Selection. + // It is equivalent to passing (*Selection).Length(). + ToEnd = maxInt +) + +// First reduces the set of matched elements to the first in the set. +// It returns a new Selection object, and an empty Selection object if the +// the selection is empty. +func (s *Selection) First() *Selection { + return s.Eq(0) +} + +// Last reduces the set of matched elements to the last in the set. +// It returns a new Selection object, and an empty Selection object if +// the selection is empty. +func (s *Selection) Last() *Selection { + return s.Eq(-1) +} + +// Eq reduces the set of matched elements to the one at the specified index. +// If a negative index is given, it counts backwards starting at the end of the +// set. It returns a new Selection object, and an empty Selection object if the +// index is invalid. +func (s *Selection) Eq(index int) *Selection { + if index < 0 { + index += len(s.Nodes) + } + + if index >= len(s.Nodes) || index < 0 { + return newEmptySelection(s.document) + } + + return s.Slice(index, index+1) +} + +// Slice reduces the set of matched elements to a subset specified by a range +// of indices. The start index is 0-based and indicates the index of the first +// element to select. The end index is 0-based and indicates the index at which +// the elements stop being selected (the end index is not selected). +// +// The indices may be negative, in which case they represent an offset from the +// end of the selection. +// +// The special value ToEnd may be specified as end index, in which case all elements +// until the end are selected. This works both for a positive and negative start +// index. +func (s *Selection) Slice(start, end int) *Selection { + if start < 0 { + start += len(s.Nodes) + } + if end == ToEnd { + end = len(s.Nodes) + } else if end < 0 { + end += len(s.Nodes) + } + return pushStack(s, s.Nodes[start:end]) +} + +// Get retrieves the underlying node at the specified index. +// Get without parameter is not implemented, since the node array is available +// on the Selection object. +func (s *Selection) Get(index int) *html.Node { + if index < 0 { + index += len(s.Nodes) // Negative index gets from the end + } + return s.Nodes[index] +} + +// Index returns the position of the first element within the Selection object +// relative to its sibling elements. +func (s *Selection) Index() int { + if len(s.Nodes) > 0 { + return newSingleSelection(s.Nodes[0], s.document).PrevAll().Length() + } + return -1 +} + +// IndexSelector returns the position of the first element within the +// Selection object relative to the elements matched by the selector, or -1 if +// not found. +func (s *Selection) IndexSelector(selector string) int { + if len(s.Nodes) > 0 { + sel := s.document.Find(selector) + return indexInSlice(sel.Nodes, s.Nodes[0]) + } + return -1 +} + +// IndexMatcher returns the position of the first element within the +// Selection object relative to the elements matched by the matcher, or -1 if +// not found. +func (s *Selection) IndexMatcher(m Matcher) int { + if len(s.Nodes) > 0 { + sel := s.document.FindMatcher(m) + return indexInSlice(sel.Nodes, s.Nodes[0]) + } + return -1 +} + +// IndexOfNode returns the position of the specified node within the Selection +// object, or -1 if not found. +func (s *Selection) IndexOfNode(node *html.Node) int { + return indexInSlice(s.Nodes, node) +} + +// IndexOfSelection returns the position of the first node in the specified +// Selection object within this Selection object, or -1 if not found. +func (s *Selection) IndexOfSelection(sel *Selection) int { + if sel != nil && len(sel.Nodes) > 0 { + return indexInSlice(s.Nodes, sel.Nodes[0]) + } + return -1 +} diff --git a/vendor/github.com/PuerkitoBio/goquery/doc.go b/vendor/github.com/PuerkitoBio/goquery/doc.go new file mode 100644 index 00000000..71146a78 --- /dev/null +++ b/vendor/github.com/PuerkitoBio/goquery/doc.go @@ -0,0 +1,123 @@ +// Copyright (c) 2012-2016, Martin Angers & Contributors +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation and/or +// other materials provided with the distribution. +// * Neither the name of the author nor the names of its contributors may be used to +// endorse or promote products derived from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS +// OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY +// AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +// WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +/* +Package goquery implements features similar to jQuery, including the chainable +syntax, to manipulate and query an HTML document. + +It brings a syntax and a set of features similar to jQuery to the Go language. +It is based on Go's net/html package and the CSS Selector library cascadia. +Since the net/html parser returns nodes, and not a full-featured DOM +tree, jQuery's stateful manipulation functions (like height(), css(), detach()) +have been left off. + +Also, because the net/html parser requires UTF-8 encoding, so does goquery: it is +the caller's responsibility to ensure that the source document provides UTF-8 encoded HTML. +See the repository's wiki for various options on how to do this. + +Syntax-wise, it is as close as possible to jQuery, with the same method names when +possible, and that warm and fuzzy chainable interface. jQuery being the +ultra-popular library that it is, writing a similar HTML-manipulating +library was better to follow its API than to start anew (in the same spirit as +Go's fmt package), even though some of its methods are less than intuitive (looking +at you, index()...). + +It is hosted on GitHub, along with additional documentation in the README.md +file: https://github.com/puerkitobio/goquery + +Please note that because of the net/html dependency, goquery requires Go1.1+. + +The various methods are split into files based on the category of behavior. +The three dots (...) indicate that various "overloads" are available. + +* array.go : array-like positional manipulation of the selection. + - Eq() + - First() + - Get() + - Index...() + - Last() + - Slice() + +* expand.go : methods that expand or augment the selection's set. + - Add...() + - AndSelf() + - Union(), which is an alias for AddSelection() + +* filter.go : filtering methods, that reduce the selection's set. + - End() + - Filter...() + - Has...() + - Intersection(), which is an alias of FilterSelection() + - Not...() + +* iteration.go : methods to loop over the selection's nodes. + - Each() + - EachWithBreak() + - Map() + +* manipulation.go : methods for modifying the document + - After...() + - Append...() + - Before...() + - Clone() + - Empty() + - Prepend...() + - Remove...() + - ReplaceWith...() + - Unwrap() + - Wrap...() + - WrapAll...() + - WrapInner...() + +* property.go : methods that inspect and get the node's properties values. + - Attr*(), RemoveAttr(), SetAttr() + - AddClass(), HasClass(), RemoveClass(), ToggleClass() + - Html() + - Length() + - Size(), which is an alias for Length() + - Text() + +* query.go : methods that query, or reflect, a node's identity. + - Contains() + - Is...() + +* traversal.go : methods to traverse the HTML document tree. + - Children...() + - Contents() + - Find...() + - Next...() + - Parent[s]...() + - Prev...() + - Siblings...() + +* type.go : definition of the types exposed by goquery. + - Document + - Selection + - Matcher + +* utilities.go : definition of helper functions (and not methods on a *Selection) +that are not part of jQuery, but are useful to goquery. + - NodeName + - OuterHtml +*/ +package goquery diff --git a/vendor/github.com/PuerkitoBio/goquery/expand.go b/vendor/github.com/PuerkitoBio/goquery/expand.go new file mode 100644 index 00000000..7caade53 --- /dev/null +++ b/vendor/github.com/PuerkitoBio/goquery/expand.go @@ -0,0 +1,70 @@ +package goquery + +import "golang.org/x/net/html" + +// Add adds the selector string's matching nodes to those in the current +// selection and returns a new Selection object. +// The selector string is run in the context of the document of the current +// Selection object. +func (s *Selection) Add(selector string) *Selection { + return s.AddNodes(findWithMatcher([]*html.Node{s.document.rootNode}, compileMatcher(selector))...) +} + +// AddMatcher adds the matcher's matching nodes to those in the current +// selection and returns a new Selection object. +// The matcher is run in the context of the document of the current +// Selection object. +func (s *Selection) AddMatcher(m Matcher) *Selection { + return s.AddNodes(findWithMatcher([]*html.Node{s.document.rootNode}, m)...) +} + +// AddSelection adds the specified Selection object's nodes to those in the +// current selection and returns a new Selection object. +func (s *Selection) AddSelection(sel *Selection) *Selection { + if sel == nil { + return s.AddNodes() + } + return s.AddNodes(sel.Nodes...) +} + +// Union is an alias for AddSelection. +func (s *Selection) Union(sel *Selection) *Selection { + return s.AddSelection(sel) +} + +// AddNodes adds the specified nodes to those in the +// current selection and returns a new Selection object. +func (s *Selection) AddNodes(nodes ...*html.Node) *Selection { + return pushStack(s, appendWithoutDuplicates(s.Nodes, nodes, nil)) +} + +// AndSelf adds the previous set of elements on the stack to the current set. +// It returns a new Selection object containing the current Selection combined +// with the previous one. +// Deprecated: This function has been deprecated and is now an alias for AddBack(). +func (s *Selection) AndSelf() *Selection { + return s.AddBack() +} + +// AddBack adds the previous set of elements on the stack to the current set. +// It returns a new Selection object containing the current Selection combined +// with the previous one. +func (s *Selection) AddBack() *Selection { + return s.AddSelection(s.prevSel) +} + +// AddBackFiltered reduces the previous set of elements on the stack to those that +// match the selector string, and adds them to the current set. +// It returns a new Selection object containing the current Selection combined +// with the filtered previous one +func (s *Selection) AddBackFiltered(selector string) *Selection { + return s.AddSelection(s.prevSel.Filter(selector)) +} + +// AddBackMatcher reduces the previous set of elements on the stack to those that match +// the mateher, and adds them to the curernt set. +// It returns a new Selection object containing the current Selection combined +// with the filtered previous one +func (s *Selection) AddBackMatcher(m Matcher) *Selection { + return s.AddSelection(s.prevSel.FilterMatcher(m)) +} diff --git a/vendor/github.com/PuerkitoBio/goquery/filter.go b/vendor/github.com/PuerkitoBio/goquery/filter.go new file mode 100644 index 00000000..9138ffb3 --- /dev/null +++ b/vendor/github.com/PuerkitoBio/goquery/filter.go @@ -0,0 +1,163 @@ +package goquery + +import "golang.org/x/net/html" + +// Filter reduces the set of matched elements to those that match the selector string. +// It returns a new Selection object for this subset of matching elements. +func (s *Selection) Filter(selector string) *Selection { + return s.FilterMatcher(compileMatcher(selector)) +} + +// FilterMatcher reduces the set of matched elements to those that match +// the given matcher. It returns a new Selection object for this subset +// of matching elements. +func (s *Selection) FilterMatcher(m Matcher) *Selection { + return pushStack(s, winnow(s, m, true)) +} + +// Not removes elements from the Selection that match the selector string. +// It returns a new Selection object with the matching elements removed. +func (s *Selection) Not(selector string) *Selection { + return s.NotMatcher(compileMatcher(selector)) +} + +// NotMatcher removes elements from the Selection that match the given matcher. +// It returns a new Selection object with the matching elements removed. +func (s *Selection) NotMatcher(m Matcher) *Selection { + return pushStack(s, winnow(s, m, false)) +} + +// FilterFunction reduces the set of matched elements to those that pass the function's test. +// It returns a new Selection object for this subset of elements. +func (s *Selection) FilterFunction(f func(int, *Selection) bool) *Selection { + return pushStack(s, winnowFunction(s, f, true)) +} + +// NotFunction removes elements from the Selection that pass the function's test. +// It returns a new Selection object with the matching elements removed. +func (s *Selection) NotFunction(f func(int, *Selection) bool) *Selection { + return pushStack(s, winnowFunction(s, f, false)) +} + +// FilterNodes reduces the set of matched elements to those that match the specified nodes. +// It returns a new Selection object for this subset of elements. +func (s *Selection) FilterNodes(nodes ...*html.Node) *Selection { + return pushStack(s, winnowNodes(s, nodes, true)) +} + +// NotNodes removes elements from the Selection that match the specified nodes. +// It returns a new Selection object with the matching elements removed. +func (s *Selection) NotNodes(nodes ...*html.Node) *Selection { + return pushStack(s, winnowNodes(s, nodes, false)) +} + +// FilterSelection reduces the set of matched elements to those that match a +// node in the specified Selection object. +// It returns a new Selection object for this subset of elements. +func (s *Selection) FilterSelection(sel *Selection) *Selection { + if sel == nil { + return pushStack(s, winnowNodes(s, nil, true)) + } + return pushStack(s, winnowNodes(s, sel.Nodes, true)) +} + +// NotSelection removes elements from the Selection that match a node in the specified +// Selection object. It returns a new Selection object with the matching elements removed. +func (s *Selection) NotSelection(sel *Selection) *Selection { + if sel == nil { + return pushStack(s, winnowNodes(s, nil, false)) + } + return pushStack(s, winnowNodes(s, sel.Nodes, false)) +} + +// Intersection is an alias for FilterSelection. +func (s *Selection) Intersection(sel *Selection) *Selection { + return s.FilterSelection(sel) +} + +// Has reduces the set of matched elements to those that have a descendant +// that matches the selector. +// It returns a new Selection object with the matching elements. +func (s *Selection) Has(selector string) *Selection { + return s.HasSelection(s.document.Find(selector)) +} + +// HasMatcher reduces the set of matched elements to those that have a descendant +// that matches the matcher. +// It returns a new Selection object with the matching elements. +func (s *Selection) HasMatcher(m Matcher) *Selection { + return s.HasSelection(s.document.FindMatcher(m)) +} + +// HasNodes reduces the set of matched elements to those that have a +// descendant that matches one of the nodes. +// It returns a new Selection object with the matching elements. +func (s *Selection) HasNodes(nodes ...*html.Node) *Selection { + return s.FilterFunction(func(_ int, sel *Selection) bool { + // Add all nodes that contain one of the specified nodes + for _, n := range nodes { + if sel.Contains(n) { + return true + } + } + return false + }) +} + +// HasSelection reduces the set of matched elements to those that have a +// descendant that matches one of the nodes of the specified Selection object. +// It returns a new Selection object with the matching elements. +func (s *Selection) HasSelection(sel *Selection) *Selection { + if sel == nil { + return s.HasNodes() + } + return s.HasNodes(sel.Nodes...) +} + +// End ends the most recent filtering operation in the current chain and +// returns the set of matched elements to its previous state. +func (s *Selection) End() *Selection { + if s.prevSel != nil { + return s.prevSel + } + return newEmptySelection(s.document) +} + +// Filter based on the matcher, and the indicator to keep (Filter) or +// to get rid of (Not) the matching elements. +func winnow(sel *Selection, m Matcher, keep bool) []*html.Node { + // Optimize if keep is requested + if keep { + return m.Filter(sel.Nodes) + } + // Use grep + return grep(sel, func(i int, s *Selection) bool { + return !m.Match(s.Get(0)) + }) +} + +// Filter based on an array of nodes, and the indicator to keep (Filter) or +// to get rid of (Not) the matching elements. +func winnowNodes(sel *Selection, nodes []*html.Node, keep bool) []*html.Node { + if len(nodes)+len(sel.Nodes) < minNodesForSet { + return grep(sel, func(i int, s *Selection) bool { + return isInSlice(nodes, s.Get(0)) == keep + }) + } + + set := make(map[*html.Node]bool) + for _, n := range nodes { + set[n] = true + } + return grep(sel, func(i int, s *Selection) bool { + return set[s.Get(0)] == keep + }) +} + +// Filter based on a function test, and the indicator to keep (Filter) or +// to get rid of (Not) the matching elements. +func winnowFunction(sel *Selection, f func(int, *Selection) bool, keep bool) []*html.Node { + return grep(sel, func(i int, s *Selection) bool { + return f(i, s) == keep + }) +} diff --git a/vendor/github.com/PuerkitoBio/goquery/iteration.go b/vendor/github.com/PuerkitoBio/goquery/iteration.go new file mode 100644 index 00000000..e246f2e0 --- /dev/null +++ b/vendor/github.com/PuerkitoBio/goquery/iteration.go @@ -0,0 +1,39 @@ +package goquery + +// Each iterates over a Selection object, executing a function for each +// matched element. It returns the current Selection object. The function +// f is called for each element in the selection with the index of the +// element in that selection starting at 0, and a *Selection that contains +// only that element. +func (s *Selection) Each(f func(int, *Selection)) *Selection { + for i, n := range s.Nodes { + f(i, newSingleSelection(n, s.document)) + } + return s +} + +// EachWithBreak iterates over a Selection object, executing a function for each +// matched element. It is identical to Each except that it is possible to break +// out of the loop by returning false in the callback function. It returns the +// current Selection object. +func (s *Selection) EachWithBreak(f func(int, *Selection) bool) *Selection { + for i, n := range s.Nodes { + if !f(i, newSingleSelection(n, s.document)) { + return s + } + } + return s +} + +// Map passes each element in the current matched set through a function, +// producing a slice of string holding the returned values. The function +// f is called for each element in the selection with the index of the +// element in that selection starting at 0, and a *Selection that contains +// only that element. +func (s *Selection) Map(f func(int, *Selection) string) (result []string) { + for i, n := range s.Nodes { + result = append(result, f(i, newSingleSelection(n, s.document))) + } + + return result +} diff --git a/vendor/github.com/PuerkitoBio/goquery/manipulation.go b/vendor/github.com/PuerkitoBio/goquery/manipulation.go new file mode 100644 index 00000000..35febf11 --- /dev/null +++ b/vendor/github.com/PuerkitoBio/goquery/manipulation.go @@ -0,0 +1,679 @@ +package goquery + +import ( + "strings" + + "golang.org/x/net/html" +) + +// After applies the selector from the root document and inserts the matched elements +// after the elements in the set of matched elements. +// +// If one of the matched elements in the selection is not currently in the +// document, it's impossible to insert nodes after it, so it will be ignored. +// +// This follows the same rules as Selection.Append. +func (s *Selection) After(selector string) *Selection { + return s.AfterMatcher(compileMatcher(selector)) +} + +// AfterMatcher applies the matcher from the root document and inserts the matched elements +// after the elements in the set of matched elements. +// +// If one of the matched elements in the selection is not currently in the +// document, it's impossible to insert nodes after it, so it will be ignored. +// +// This follows the same rules as Selection.Append. +func (s *Selection) AfterMatcher(m Matcher) *Selection { + return s.AfterNodes(m.MatchAll(s.document.rootNode)...) +} + +// AfterSelection inserts the elements in the selection after each element in the set of matched +// elements. +// +// This follows the same rules as Selection.Append. +func (s *Selection) AfterSelection(sel *Selection) *Selection { + return s.AfterNodes(sel.Nodes...) +} + +// AfterHtml parses the html and inserts it after the set of matched elements. +// +// This follows the same rules as Selection.Append. +func (s *Selection) AfterHtml(htmlStr string) *Selection { + return s.eachNodeHtml(htmlStr, true, func(node *html.Node, nodes []*html.Node) { + nextSibling := node.NextSibling + for _, n := range nodes { + if node.Parent != nil { + node.Parent.InsertBefore(n, nextSibling) + } + } + }) +} + +// AfterNodes inserts the nodes after each element in the set of matched elements. +// +// This follows the same rules as Selection.Append. +func (s *Selection) AfterNodes(ns ...*html.Node) *Selection { + return s.manipulateNodes(ns, true, func(sn *html.Node, n *html.Node) { + if sn.Parent != nil { + sn.Parent.InsertBefore(n, sn.NextSibling) + } + }) +} + +// Append appends the elements specified by the selector to the end of each element +// in the set of matched elements, following those rules: +// +// 1) The selector is applied to the root document. +// +// 2) Elements that are part of the document will be moved to the new location. +// +// 3) If there are multiple locations to append to, cloned nodes will be +// appended to all target locations except the last one, which will be moved +// as noted in (2). +func (s *Selection) Append(selector string) *Selection { + return s.AppendMatcher(compileMatcher(selector)) +} + +// AppendMatcher appends the elements specified by the matcher to the end of each element +// in the set of matched elements. +// +// This follows the same rules as Selection.Append. +func (s *Selection) AppendMatcher(m Matcher) *Selection { + return s.AppendNodes(m.MatchAll(s.document.rootNode)...) +} + +// AppendSelection appends the elements in the selection to the end of each element +// in the set of matched elements. +// +// This follows the same rules as Selection.Append. +func (s *Selection) AppendSelection(sel *Selection) *Selection { + return s.AppendNodes(sel.Nodes...) +} + +// AppendHtml parses the html and appends it to the set of matched elements. +func (s *Selection) AppendHtml(htmlStr string) *Selection { + return s.eachNodeHtml(htmlStr, false, func(node *html.Node, nodes []*html.Node) { + for _, n := range nodes { + node.AppendChild(n) + } + }) +} + +// AppendNodes appends the specified nodes to each node in the set of matched elements. +// +// This follows the same rules as Selection.Append. +func (s *Selection) AppendNodes(ns ...*html.Node) *Selection { + return s.manipulateNodes(ns, false, func(sn *html.Node, n *html.Node) { + sn.AppendChild(n) + }) +} + +// Before inserts the matched elements before each element in the set of matched elements. +// +// This follows the same rules as Selection.Append. +func (s *Selection) Before(selector string) *Selection { + return s.BeforeMatcher(compileMatcher(selector)) +} + +// BeforeMatcher inserts the matched elements before each element in the set of matched elements. +// +// This follows the same rules as Selection.Append. +func (s *Selection) BeforeMatcher(m Matcher) *Selection { + return s.BeforeNodes(m.MatchAll(s.document.rootNode)...) +} + +// BeforeSelection inserts the elements in the selection before each element in the set of matched +// elements. +// +// This follows the same rules as Selection.Append. +func (s *Selection) BeforeSelection(sel *Selection) *Selection { + return s.BeforeNodes(sel.Nodes...) +} + +// BeforeHtml parses the html and inserts it before the set of matched elements. +// +// This follows the same rules as Selection.Append. +func (s *Selection) BeforeHtml(htmlStr string) *Selection { + return s.eachNodeHtml(htmlStr, true, func(node *html.Node, nodes []*html.Node) { + for _, n := range nodes { + if node.Parent != nil { + node.Parent.InsertBefore(n, node) + } + } + }) +} + +// BeforeNodes inserts the nodes before each element in the set of matched elements. +// +// This follows the same rules as Selection.Append. +func (s *Selection) BeforeNodes(ns ...*html.Node) *Selection { + return s.manipulateNodes(ns, false, func(sn *html.Node, n *html.Node) { + if sn.Parent != nil { + sn.Parent.InsertBefore(n, sn) + } + }) +} + +// Clone creates a deep copy of the set of matched nodes. The new nodes will not be +// attached to the document. +func (s *Selection) Clone() *Selection { + ns := newEmptySelection(s.document) + ns.Nodes = cloneNodes(s.Nodes) + return ns +} + +// Empty removes all children nodes from the set of matched elements. +// It returns the children nodes in a new Selection. +func (s *Selection) Empty() *Selection { + var nodes []*html.Node + + for _, n := range s.Nodes { + for c := n.FirstChild; c != nil; c = n.FirstChild { + n.RemoveChild(c) + nodes = append(nodes, c) + } + } + + return pushStack(s, nodes) +} + +// Prepend prepends the elements specified by the selector to each element in +// the set of matched elements, following the same rules as Append. +func (s *Selection) Prepend(selector string) *Selection { + return s.PrependMatcher(compileMatcher(selector)) +} + +// PrependMatcher prepends the elements specified by the matcher to each +// element in the set of matched elements. +// +// This follows the same rules as Selection.Append. +func (s *Selection) PrependMatcher(m Matcher) *Selection { + return s.PrependNodes(m.MatchAll(s.document.rootNode)...) +} + +// PrependSelection prepends the elements in the selection to each element in +// the set of matched elements. +// +// This follows the same rules as Selection.Append. +func (s *Selection) PrependSelection(sel *Selection) *Selection { + return s.PrependNodes(sel.Nodes...) +} + +// PrependHtml parses the html and prepends it to the set of matched elements. +func (s *Selection) PrependHtml(htmlStr string) *Selection { + return s.eachNodeHtml(htmlStr, false, func(node *html.Node, nodes []*html.Node) { + firstChild := node.FirstChild + for _, n := range nodes { + node.InsertBefore(n, firstChild) + } + }) +} + +// PrependNodes prepends the specified nodes to each node in the set of +// matched elements. +// +// This follows the same rules as Selection.Append. +func (s *Selection) PrependNodes(ns ...*html.Node) *Selection { + return s.manipulateNodes(ns, true, func(sn *html.Node, n *html.Node) { + // sn.FirstChild may be nil, in which case this functions like + // sn.AppendChild() + sn.InsertBefore(n, sn.FirstChild) + }) +} + +// Remove removes the set of matched elements from the document. +// It returns the same selection, now consisting of nodes not in the document. +func (s *Selection) Remove() *Selection { + for _, n := range s.Nodes { + if n.Parent != nil { + n.Parent.RemoveChild(n) + } + } + + return s +} + +// RemoveFiltered removes from the current set of matched elements those that +// match the selector filter. It returns the Selection of removed nodes. +// +// For example if the selection s contains "

", "

" and "

" +// and s.RemoveFiltered("h2") is called, only the "

" node is removed +// (and returned), while "

" and "

" are kept in the document. +func (s *Selection) RemoveFiltered(selector string) *Selection { + return s.RemoveMatcher(compileMatcher(selector)) +} + +// RemoveMatcher removes from the current set of matched elements those that +// match the Matcher filter. It returns the Selection of removed nodes. +// See RemoveFiltered for additional information. +func (s *Selection) RemoveMatcher(m Matcher) *Selection { + return s.FilterMatcher(m).Remove() +} + +// ReplaceWith replaces each element in the set of matched elements with the +// nodes matched by the given selector. +// It returns the removed elements. +// +// This follows the same rules as Selection.Append. +func (s *Selection) ReplaceWith(selector string) *Selection { + return s.ReplaceWithMatcher(compileMatcher(selector)) +} + +// ReplaceWithMatcher replaces each element in the set of matched elements with +// the nodes matched by the given Matcher. +// It returns the removed elements. +// +// This follows the same rules as Selection.Append. +func (s *Selection) ReplaceWithMatcher(m Matcher) *Selection { + return s.ReplaceWithNodes(m.MatchAll(s.document.rootNode)...) +} + +// ReplaceWithSelection replaces each element in the set of matched elements with +// the nodes from the given Selection. +// It returns the removed elements. +// +// This follows the same rules as Selection.Append. +func (s *Selection) ReplaceWithSelection(sel *Selection) *Selection { + return s.ReplaceWithNodes(sel.Nodes...) +} + +// ReplaceWithHtml replaces each element in the set of matched elements with +// the parsed HTML. +// It returns the removed elements. +// +// This follows the same rules as Selection.Append. +func (s *Selection) ReplaceWithHtml(htmlStr string) *Selection { + s.eachNodeHtml(htmlStr, true, func(node *html.Node, nodes []*html.Node) { + nextSibling := node.NextSibling + for _, n := range nodes { + if node.Parent != nil { + node.Parent.InsertBefore(n, nextSibling) + } + } + }) + return s.Remove() +} + +// ReplaceWithNodes replaces each element in the set of matched elements with +// the given nodes. +// It returns the removed elements. +// +// This follows the same rules as Selection.Append. +func (s *Selection) ReplaceWithNodes(ns ...*html.Node) *Selection { + s.AfterNodes(ns...) + return s.Remove() +} + +// SetHtml sets the html content of each element in the selection to +// specified html string. +func (s *Selection) SetHtml(htmlStr string) *Selection { + for _, context := range s.Nodes { + for c := context.FirstChild; c != nil; c = context.FirstChild { + context.RemoveChild(c) + } + } + return s.eachNodeHtml(htmlStr, false, func(node *html.Node, nodes []*html.Node) { + for _, n := range nodes { + node.AppendChild(n) + } + }) +} + +// SetText sets the content of each element in the selection to specified content. +// The provided text string is escaped. +func (s *Selection) SetText(text string) *Selection { + return s.SetHtml(html.EscapeString(text)) +} + +// Unwrap removes the parents of the set of matched elements, leaving the matched +// elements (and their siblings, if any) in their place. +// It returns the original selection. +func (s *Selection) Unwrap() *Selection { + s.Parent().Each(func(i int, ss *Selection) { + // For some reason, jquery allows unwrap to remove the element, so + // allowing it here too. Same for . Why it allows those elements to + // be unwrapped while not allowing body is a mystery to me. + if ss.Nodes[0].Data != "body" { + ss.ReplaceWithSelection(ss.Contents()) + } + }) + + return s +} + +// Wrap wraps each element in the set of matched elements inside the first +// element matched by the given selector. The matched child is cloned before +// being inserted into the document. +// +// It returns the original set of elements. +func (s *Selection) Wrap(selector string) *Selection { + return s.WrapMatcher(compileMatcher(selector)) +} + +// WrapMatcher wraps each element in the set of matched elements inside the +// first element matched by the given matcher. The matched child is cloned +// before being inserted into the document. +// +// It returns the original set of elements. +func (s *Selection) WrapMatcher(m Matcher) *Selection { + return s.wrapNodes(m.MatchAll(s.document.rootNode)...) +} + +// WrapSelection wraps each element in the set of matched elements inside the +// first element in the given Selection. The element is cloned before being +// inserted into the document. +// +// It returns the original set of elements. +func (s *Selection) WrapSelection(sel *Selection) *Selection { + return s.wrapNodes(sel.Nodes...) +} + +// WrapHtml wraps each element in the set of matched elements inside the inner- +// most child of the given HTML. +// +// It returns the original set of elements. +func (s *Selection) WrapHtml(htmlStr string) *Selection { + nodesMap := make(map[string][]*html.Node) + for _, context := range s.Nodes { + var parent *html.Node + if context.Parent != nil { + parent = context.Parent + } else { + parent = &html.Node{Type: html.ElementNode} + } + nodes, found := nodesMap[nodeName(parent)] + if !found { + nodes = parseHtmlWithContext(htmlStr, parent) + nodesMap[nodeName(parent)] = nodes + } + newSingleSelection(context, s.document).wrapAllNodes(cloneNodes(nodes)...) + } + return s +} + +// WrapNode wraps each element in the set of matched elements inside the inner- +// most child of the given node. The given node is copied before being inserted +// into the document. +// +// It returns the original set of elements. +func (s *Selection) WrapNode(n *html.Node) *Selection { + return s.wrapNodes(n) +} + +func (s *Selection) wrapNodes(ns ...*html.Node) *Selection { + s.Each(func(i int, ss *Selection) { + ss.wrapAllNodes(ns...) + }) + + return s +} + +// WrapAll wraps a single HTML structure, matched by the given selector, around +// all elements in the set of matched elements. The matched child is cloned +// before being inserted into the document. +// +// It returns the original set of elements. +func (s *Selection) WrapAll(selector string) *Selection { + return s.WrapAllMatcher(compileMatcher(selector)) +} + +// WrapAllMatcher wraps a single HTML structure, matched by the given Matcher, +// around all elements in the set of matched elements. The matched child is +// cloned before being inserted into the document. +// +// It returns the original set of elements. +func (s *Selection) WrapAllMatcher(m Matcher) *Selection { + return s.wrapAllNodes(m.MatchAll(s.document.rootNode)...) +} + +// WrapAllSelection wraps a single HTML structure, the first node of the given +// Selection, around all elements in the set of matched elements. The matched +// child is cloned before being inserted into the document. +// +// It returns the original set of elements. +func (s *Selection) WrapAllSelection(sel *Selection) *Selection { + return s.wrapAllNodes(sel.Nodes...) +} + +// WrapAllHtml wraps the given HTML structure around all elements in the set of +// matched elements. The matched child is cloned before being inserted into the +// document. +// +// It returns the original set of elements. +func (s *Selection) WrapAllHtml(htmlStr string) *Selection { + var context *html.Node + var nodes []*html.Node + if len(s.Nodes) > 0 { + context = s.Nodes[0] + if context.Parent != nil { + nodes = parseHtmlWithContext(htmlStr, context) + } else { + nodes = parseHtml(htmlStr) + } + } + return s.wrapAllNodes(nodes...) +} + +func (s *Selection) wrapAllNodes(ns ...*html.Node) *Selection { + if len(ns) > 0 { + return s.WrapAllNode(ns[0]) + } + return s +} + +// WrapAllNode wraps the given node around the first element in the Selection, +// making all other nodes in the Selection children of the given node. The node +// is cloned before being inserted into the document. +// +// It returns the original set of elements. +func (s *Selection) WrapAllNode(n *html.Node) *Selection { + if s.Size() == 0 { + return s + } + + wrap := cloneNode(n) + + first := s.Nodes[0] + if first.Parent != nil { + first.Parent.InsertBefore(wrap, first) + first.Parent.RemoveChild(first) + } + + for c := getFirstChildEl(wrap); c != nil; c = getFirstChildEl(wrap) { + wrap = c + } + + newSingleSelection(wrap, s.document).AppendSelection(s) + + return s +} + +// WrapInner wraps an HTML structure, matched by the given selector, around the +// content of element in the set of matched elements. The matched child is +// cloned before being inserted into the document. +// +// It returns the original set of elements. +func (s *Selection) WrapInner(selector string) *Selection { + return s.WrapInnerMatcher(compileMatcher(selector)) +} + +// WrapInnerMatcher wraps an HTML structure, matched by the given selector, +// around the content of element in the set of matched elements. The matched +// child is cloned before being inserted into the document. +// +// It returns the original set of elements. +func (s *Selection) WrapInnerMatcher(m Matcher) *Selection { + return s.wrapInnerNodes(m.MatchAll(s.document.rootNode)...) +} + +// WrapInnerSelection wraps an HTML structure, matched by the given selector, +// around the content of element in the set of matched elements. The matched +// child is cloned before being inserted into the document. +// +// It returns the original set of elements. +func (s *Selection) WrapInnerSelection(sel *Selection) *Selection { + return s.wrapInnerNodes(sel.Nodes...) +} + +// WrapInnerHtml wraps an HTML structure, matched by the given selector, around +// the content of element in the set of matched elements. The matched child is +// cloned before being inserted into the document. +// +// It returns the original set of elements. +func (s *Selection) WrapInnerHtml(htmlStr string) *Selection { + nodesMap := make(map[string][]*html.Node) + for _, context := range s.Nodes { + nodes, found := nodesMap[nodeName(context)] + if !found { + nodes = parseHtmlWithContext(htmlStr, context) + nodesMap[nodeName(context)] = nodes + } + newSingleSelection(context, s.document).wrapInnerNodes(cloneNodes(nodes)...) + } + return s +} + +// WrapInnerNode wraps an HTML structure, matched by the given selector, around +// the content of element in the set of matched elements. The matched child is +// cloned before being inserted into the document. +// +// It returns the original set of elements. +func (s *Selection) WrapInnerNode(n *html.Node) *Selection { + return s.wrapInnerNodes(n) +} + +func (s *Selection) wrapInnerNodes(ns ...*html.Node) *Selection { + if len(ns) == 0 { + return s + } + + s.Each(func(i int, s *Selection) { + contents := s.Contents() + + if contents.Size() > 0 { + contents.wrapAllNodes(ns...) + } else { + s.AppendNodes(cloneNode(ns[0])) + } + }) + + return s +} + +func parseHtml(h string) []*html.Node { + // Errors are only returned when the io.Reader returns any error besides + // EOF, but strings.Reader never will + nodes, err := html.ParseFragment(strings.NewReader(h), &html.Node{Type: html.ElementNode}) + if err != nil { + panic("goquery: failed to parse HTML: " + err.Error()) + } + return nodes +} + +func parseHtmlWithContext(h string, context *html.Node) []*html.Node { + // Errors are only returned when the io.Reader returns any error besides + // EOF, but strings.Reader never will + nodes, err := html.ParseFragment(strings.NewReader(h), context) + if err != nil { + panic("goquery: failed to parse HTML: " + err.Error()) + } + return nodes +} + +// Get the first child that is an ElementNode +func getFirstChildEl(n *html.Node) *html.Node { + c := n.FirstChild + for c != nil && c.Type != html.ElementNode { + c = c.NextSibling + } + return c +} + +// Deep copy a slice of nodes. +func cloneNodes(ns []*html.Node) []*html.Node { + cns := make([]*html.Node, 0, len(ns)) + + for _, n := range ns { + cns = append(cns, cloneNode(n)) + } + + return cns +} + +// Deep copy a node. The new node has clones of all the original node's +// children but none of its parents or siblings. +func cloneNode(n *html.Node) *html.Node { + nn := &html.Node{ + Type: n.Type, + DataAtom: n.DataAtom, + Data: n.Data, + Attr: make([]html.Attribute, len(n.Attr)), + } + + copy(nn.Attr, n.Attr) + for c := n.FirstChild; c != nil; c = c.NextSibling { + nn.AppendChild(cloneNode(c)) + } + + return nn +} + +func (s *Selection) manipulateNodes(ns []*html.Node, reverse bool, + f func(sn *html.Node, n *html.Node)) *Selection { + + lasti := s.Size() - 1 + + // net.Html doesn't provide document fragments for insertion, so to get + // things in the correct order with After() and Prepend(), the callback + // needs to be called on the reverse of the nodes. + if reverse { + for i, j := 0, len(ns)-1; i < j; i, j = i+1, j-1 { + ns[i], ns[j] = ns[j], ns[i] + } + } + + for i, sn := range s.Nodes { + for _, n := range ns { + if i != lasti { + f(sn, cloneNode(n)) + } else { + if n.Parent != nil { + n.Parent.RemoveChild(n) + } + f(sn, n) + } + } + } + + return s +} + +// eachNodeHtml parses the given html string and inserts the resulting nodes in the dom with the mergeFn. +// The parsed nodes are inserted for each element of the selection. +// isParent can be used to indicate that the elements of the selection should be treated as the parent for the parsed html. +// A cache is used to avoid parsing the html multiple times should the elements of the selection result in the same context. +func (s *Selection) eachNodeHtml(htmlStr string, isParent bool, mergeFn func(n *html.Node, nodes []*html.Node)) *Selection { + // cache to avoid parsing the html for the same context multiple times + nodeCache := make(map[string][]*html.Node) + var context *html.Node + for _, n := range s.Nodes { + if isParent { + context = n.Parent + } else { + if n.Type != html.ElementNode { + continue + } + context = n + } + if context != nil { + nodes, found := nodeCache[nodeName(context)] + if !found { + nodes = parseHtmlWithContext(htmlStr, context) + nodeCache[nodeName(context)] = nodes + } + mergeFn(n, cloneNodes(nodes)) + } + } + return s +} diff --git a/vendor/github.com/PuerkitoBio/goquery/property.go b/vendor/github.com/PuerkitoBio/goquery/property.go new file mode 100644 index 00000000..411126db --- /dev/null +++ b/vendor/github.com/PuerkitoBio/goquery/property.go @@ -0,0 +1,275 @@ +package goquery + +import ( + "bytes" + "regexp" + "strings" + + "golang.org/x/net/html" +) + +var rxClassTrim = regexp.MustCompile("[\t\r\n]") + +// Attr gets the specified attribute's value for the first element in the +// Selection. To get the value for each element individually, use a looping +// construct such as Each or Map method. +func (s *Selection) Attr(attrName string) (val string, exists bool) { + if len(s.Nodes) == 0 { + return + } + return getAttributeValue(attrName, s.Nodes[0]) +} + +// AttrOr works like Attr but returns default value if attribute is not present. +func (s *Selection) AttrOr(attrName, defaultValue string) string { + if len(s.Nodes) == 0 { + return defaultValue + } + + val, exists := getAttributeValue(attrName, s.Nodes[0]) + if !exists { + return defaultValue + } + + return val +} + +// RemoveAttr removes the named attribute from each element in the set of matched elements. +func (s *Selection) RemoveAttr(attrName string) *Selection { + for _, n := range s.Nodes { + removeAttr(n, attrName) + } + + return s +} + +// SetAttr sets the given attribute on each element in the set of matched elements. +func (s *Selection) SetAttr(attrName, val string) *Selection { + for _, n := range s.Nodes { + attr := getAttributePtr(attrName, n) + if attr == nil { + n.Attr = append(n.Attr, html.Attribute{Key: attrName, Val: val}) + } else { + attr.Val = val + } + } + + return s +} + +// Text gets the combined text contents of each element in the set of matched +// elements, including their descendants. +func (s *Selection) Text() string { + var buf bytes.Buffer + + // Slightly optimized vs calling Each: no single selection object created + var f func(*html.Node) + f = func(n *html.Node) { + if n.Type == html.TextNode { + // Keep newlines and spaces, like jQuery + buf.WriteString(n.Data) + } + if n.FirstChild != nil { + for c := n.FirstChild; c != nil; c = c.NextSibling { + f(c) + } + } + } + for _, n := range s.Nodes { + f(n) + } + + return buf.String() +} + +// Size is an alias for Length. +func (s *Selection) Size() int { + return s.Length() +} + +// Length returns the number of elements in the Selection object. +func (s *Selection) Length() int { + return len(s.Nodes) +} + +// Html gets the HTML contents of the first element in the set of matched +// elements. It includes text and comment nodes. +func (s *Selection) Html() (ret string, e error) { + // Since there is no .innerHtml, the HTML content must be re-created from + // the nodes using html.Render. + var buf bytes.Buffer + + if len(s.Nodes) > 0 { + for c := s.Nodes[0].FirstChild; c != nil; c = c.NextSibling { + e = html.Render(&buf, c) + if e != nil { + return + } + } + ret = buf.String() + } + + return +} + +// AddClass adds the given class(es) to each element in the set of matched elements. +// Multiple class names can be specified, separated by a space or via multiple arguments. +func (s *Selection) AddClass(class ...string) *Selection { + classStr := strings.TrimSpace(strings.Join(class, " ")) + + if classStr == "" { + return s + } + + tcls := getClassesSlice(classStr) + for _, n := range s.Nodes { + curClasses, attr := getClassesAndAttr(n, true) + for _, newClass := range tcls { + if !strings.Contains(curClasses, " "+newClass+" ") { + curClasses += newClass + " " + } + } + + setClasses(n, attr, curClasses) + } + + return s +} + +// HasClass determines whether any of the matched elements are assigned the +// given class. +func (s *Selection) HasClass(class string) bool { + class = " " + class + " " + for _, n := range s.Nodes { + classes, _ := getClassesAndAttr(n, false) + if strings.Contains(classes, class) { + return true + } + } + return false +} + +// RemoveClass removes the given class(es) from each element in the set of matched elements. +// Multiple class names can be specified, separated by a space or via multiple arguments. +// If no class name is provided, all classes are removed. +func (s *Selection) RemoveClass(class ...string) *Selection { + var rclasses []string + + classStr := strings.TrimSpace(strings.Join(class, " ")) + remove := classStr == "" + + if !remove { + rclasses = getClassesSlice(classStr) + } + + for _, n := range s.Nodes { + if remove { + removeAttr(n, "class") + } else { + classes, attr := getClassesAndAttr(n, true) + for _, rcl := range rclasses { + classes = strings.Replace(classes, " "+rcl+" ", " ", -1) + } + + setClasses(n, attr, classes) + } + } + + return s +} + +// ToggleClass adds or removes the given class(es) for each element in the set of matched elements. +// Multiple class names can be specified, separated by a space or via multiple arguments. +func (s *Selection) ToggleClass(class ...string) *Selection { + classStr := strings.TrimSpace(strings.Join(class, " ")) + + if classStr == "" { + return s + } + + tcls := getClassesSlice(classStr) + + for _, n := range s.Nodes { + classes, attr := getClassesAndAttr(n, true) + for _, tcl := range tcls { + if strings.Contains(classes, " "+tcl+" ") { + classes = strings.Replace(classes, " "+tcl+" ", " ", -1) + } else { + classes += tcl + " " + } + } + + setClasses(n, attr, classes) + } + + return s +} + +func getAttributePtr(attrName string, n *html.Node) *html.Attribute { + if n == nil { + return nil + } + + for i, a := range n.Attr { + if a.Key == attrName { + return &n.Attr[i] + } + } + return nil +} + +// Private function to get the specified attribute's value from a node. +func getAttributeValue(attrName string, n *html.Node) (val string, exists bool) { + if a := getAttributePtr(attrName, n); a != nil { + val = a.Val + exists = true + } + return +} + +// Get and normalize the "class" attribute from the node. +func getClassesAndAttr(n *html.Node, create bool) (classes string, attr *html.Attribute) { + // Applies only to element nodes + if n.Type == html.ElementNode { + attr = getAttributePtr("class", n) + if attr == nil && create { + n.Attr = append(n.Attr, html.Attribute{ + Key: "class", + Val: "", + }) + attr = &n.Attr[len(n.Attr)-1] + } + } + + if attr == nil { + classes = " " + } else { + classes = rxClassTrim.ReplaceAllString(" "+attr.Val+" ", " ") + } + + return +} + +func getClassesSlice(classes string) []string { + return strings.Split(rxClassTrim.ReplaceAllString(" "+classes+" ", " "), " ") +} + +func removeAttr(n *html.Node, attrName string) { + for i, a := range n.Attr { + if a.Key == attrName { + n.Attr[i], n.Attr[len(n.Attr)-1], n.Attr = + n.Attr[len(n.Attr)-1], html.Attribute{}, n.Attr[:len(n.Attr)-1] + return + } + } +} + +func setClasses(n *html.Node, attr *html.Attribute, classes string) { + classes = strings.TrimSpace(classes) + if classes == "" { + removeAttr(n, "class") + return + } + + attr.Val = classes +} diff --git a/vendor/github.com/PuerkitoBio/goquery/query.go b/vendor/github.com/PuerkitoBio/goquery/query.go new file mode 100644 index 00000000..fe86bf0b --- /dev/null +++ b/vendor/github.com/PuerkitoBio/goquery/query.go @@ -0,0 +1,49 @@ +package goquery + +import "golang.org/x/net/html" + +// Is checks the current matched set of elements against a selector and +// returns true if at least one of these elements matches. +func (s *Selection) Is(selector string) bool { + return s.IsMatcher(compileMatcher(selector)) +} + +// IsMatcher checks the current matched set of elements against a matcher and +// returns true if at least one of these elements matches. +func (s *Selection) IsMatcher(m Matcher) bool { + if len(s.Nodes) > 0 { + if len(s.Nodes) == 1 { + return m.Match(s.Nodes[0]) + } + return len(m.Filter(s.Nodes)) > 0 + } + + return false +} + +// IsFunction checks the current matched set of elements against a predicate and +// returns true if at least one of these elements matches. +func (s *Selection) IsFunction(f func(int, *Selection) bool) bool { + return s.FilterFunction(f).Length() > 0 +} + +// IsSelection checks the current matched set of elements against a Selection object +// and returns true if at least one of these elements matches. +func (s *Selection) IsSelection(sel *Selection) bool { + return s.FilterSelection(sel).Length() > 0 +} + +// IsNodes checks the current matched set of elements against the specified nodes +// and returns true if at least one of these elements matches. +func (s *Selection) IsNodes(nodes ...*html.Node) bool { + return s.FilterNodes(nodes...).Length() > 0 +} + +// Contains returns true if the specified Node is within, +// at any depth, one of the nodes in the Selection object. +// It is NOT inclusive, to behave like jQuery's implementation, and +// unlike Javascript's .contains, so if the contained +// node is itself in the selection, it returns false. +func (s *Selection) Contains(n *html.Node) bool { + return sliceContains(s.Nodes, n) +} diff --git a/vendor/github.com/PuerkitoBio/goquery/traversal.go b/vendor/github.com/PuerkitoBio/goquery/traversal.go new file mode 100644 index 00000000..5fa5315a --- /dev/null +++ b/vendor/github.com/PuerkitoBio/goquery/traversal.go @@ -0,0 +1,698 @@ +package goquery + +import "golang.org/x/net/html" + +type siblingType int + +// Sibling type, used internally when iterating over children at the same +// level (siblings) to specify which nodes are requested. +const ( + siblingPrevUntil siblingType = iota - 3 + siblingPrevAll + siblingPrev + siblingAll + siblingNext + siblingNextAll + siblingNextUntil + siblingAllIncludingNonElements +) + +// Find gets the descendants of each element in the current set of matched +// elements, filtered by a selector. It returns a new Selection object +// containing these matched elements. +func (s *Selection) Find(selector string) *Selection { + return pushStack(s, findWithMatcher(s.Nodes, compileMatcher(selector))) +} + +// FindMatcher gets the descendants of each element in the current set of matched +// elements, filtered by the matcher. It returns a new Selection object +// containing these matched elements. +func (s *Selection) FindMatcher(m Matcher) *Selection { + return pushStack(s, findWithMatcher(s.Nodes, m)) +} + +// FindSelection gets the descendants of each element in the current +// Selection, filtered by a Selection. It returns a new Selection object +// containing these matched elements. +func (s *Selection) FindSelection(sel *Selection) *Selection { + if sel == nil { + return pushStack(s, nil) + } + return s.FindNodes(sel.Nodes...) +} + +// FindNodes gets the descendants of each element in the current +// Selection, filtered by some nodes. It returns a new Selection object +// containing these matched elements. +func (s *Selection) FindNodes(nodes ...*html.Node) *Selection { + return pushStack(s, mapNodes(nodes, func(i int, n *html.Node) []*html.Node { + if sliceContains(s.Nodes, n) { + return []*html.Node{n} + } + return nil + })) +} + +// Contents gets the children of each element in the Selection, +// including text and comment nodes. It returns a new Selection object +// containing these elements. +func (s *Selection) Contents() *Selection { + return pushStack(s, getChildrenNodes(s.Nodes, siblingAllIncludingNonElements)) +} + +// ContentsFiltered gets the children of each element in the Selection, +// filtered by the specified selector. It returns a new Selection +// object containing these elements. Since selectors only act on Element nodes, +// this function is an alias to ChildrenFiltered unless the selector is empty, +// in which case it is an alias to Contents. +func (s *Selection) ContentsFiltered(selector string) *Selection { + if selector != "" { + return s.ChildrenFiltered(selector) + } + return s.Contents() +} + +// ContentsMatcher gets the children of each element in the Selection, +// filtered by the specified matcher. It returns a new Selection +// object containing these elements. Since matchers only act on Element nodes, +// this function is an alias to ChildrenMatcher. +func (s *Selection) ContentsMatcher(m Matcher) *Selection { + return s.ChildrenMatcher(m) +} + +// Children gets the child elements of each element in the Selection. +// It returns a new Selection object containing these elements. +func (s *Selection) Children() *Selection { + return pushStack(s, getChildrenNodes(s.Nodes, siblingAll)) +} + +// ChildrenFiltered gets the child elements of each element in the Selection, +// filtered by the specified selector. It returns a new +// Selection object containing these elements. +func (s *Selection) ChildrenFiltered(selector string) *Selection { + return filterAndPush(s, getChildrenNodes(s.Nodes, siblingAll), compileMatcher(selector)) +} + +// ChildrenMatcher gets the child elements of each element in the Selection, +// filtered by the specified matcher. It returns a new +// Selection object containing these elements. +func (s *Selection) ChildrenMatcher(m Matcher) *Selection { + return filterAndPush(s, getChildrenNodes(s.Nodes, siblingAll), m) +} + +// Parent gets the parent of each element in the Selection. It returns a +// new Selection object containing the matched elements. +func (s *Selection) Parent() *Selection { + return pushStack(s, getParentNodes(s.Nodes)) +} + +// ParentFiltered gets the parent of each element in the Selection filtered by a +// selector. It returns a new Selection object containing the matched elements. +func (s *Selection) ParentFiltered(selector string) *Selection { + return filterAndPush(s, getParentNodes(s.Nodes), compileMatcher(selector)) +} + +// ParentMatcher gets the parent of each element in the Selection filtered by a +// matcher. It returns a new Selection object containing the matched elements. +func (s *Selection) ParentMatcher(m Matcher) *Selection { + return filterAndPush(s, getParentNodes(s.Nodes), m) +} + +// Closest gets the first element that matches the selector by testing the +// element itself and traversing up through its ancestors in the DOM tree. +func (s *Selection) Closest(selector string) *Selection { + cs := compileMatcher(selector) + return s.ClosestMatcher(cs) +} + +// ClosestMatcher gets the first element that matches the matcher by testing the +// element itself and traversing up through its ancestors in the DOM tree. +func (s *Selection) ClosestMatcher(m Matcher) *Selection { + return pushStack(s, mapNodes(s.Nodes, func(i int, n *html.Node) []*html.Node { + // For each node in the selection, test the node itself, then each parent + // until a match is found. + for ; n != nil; n = n.Parent { + if m.Match(n) { + return []*html.Node{n} + } + } + return nil + })) +} + +// ClosestNodes gets the first element that matches one of the nodes by testing the +// element itself and traversing up through its ancestors in the DOM tree. +func (s *Selection) ClosestNodes(nodes ...*html.Node) *Selection { + set := make(map[*html.Node]bool) + for _, n := range nodes { + set[n] = true + } + return pushStack(s, mapNodes(s.Nodes, func(i int, n *html.Node) []*html.Node { + // For each node in the selection, test the node itself, then each parent + // until a match is found. + for ; n != nil; n = n.Parent { + if set[n] { + return []*html.Node{n} + } + } + return nil + })) +} + +// ClosestSelection gets the first element that matches one of the nodes in the +// Selection by testing the element itself and traversing up through its ancestors +// in the DOM tree. +func (s *Selection) ClosestSelection(sel *Selection) *Selection { + if sel == nil { + return pushStack(s, nil) + } + return s.ClosestNodes(sel.Nodes...) +} + +// Parents gets the ancestors of each element in the current Selection. It +// returns a new Selection object with the matched elements. +func (s *Selection) Parents() *Selection { + return pushStack(s, getParentsNodes(s.Nodes, nil, nil)) +} + +// ParentsFiltered gets the ancestors of each element in the current +// Selection. It returns a new Selection object with the matched elements. +func (s *Selection) ParentsFiltered(selector string) *Selection { + return filterAndPush(s, getParentsNodes(s.Nodes, nil, nil), compileMatcher(selector)) +} + +// ParentsMatcher gets the ancestors of each element in the current +// Selection. It returns a new Selection object with the matched elements. +func (s *Selection) ParentsMatcher(m Matcher) *Selection { + return filterAndPush(s, getParentsNodes(s.Nodes, nil, nil), m) +} + +// ParentsUntil gets the ancestors of each element in the Selection, up to but +// not including the element matched by the selector. It returns a new Selection +// object containing the matched elements. +func (s *Selection) ParentsUntil(selector string) *Selection { + return pushStack(s, getParentsNodes(s.Nodes, compileMatcher(selector), nil)) +} + +// ParentsUntilMatcher gets the ancestors of each element in the Selection, up to but +// not including the element matched by the matcher. It returns a new Selection +// object containing the matched elements. +func (s *Selection) ParentsUntilMatcher(m Matcher) *Selection { + return pushStack(s, getParentsNodes(s.Nodes, m, nil)) +} + +// ParentsUntilSelection gets the ancestors of each element in the Selection, +// up to but not including the elements in the specified Selection. It returns a +// new Selection object containing the matched elements. +func (s *Selection) ParentsUntilSelection(sel *Selection) *Selection { + if sel == nil { + return s.Parents() + } + return s.ParentsUntilNodes(sel.Nodes...) +} + +// ParentsUntilNodes gets the ancestors of each element in the Selection, +// up to but not including the specified nodes. It returns a +// new Selection object containing the matched elements. +func (s *Selection) ParentsUntilNodes(nodes ...*html.Node) *Selection { + return pushStack(s, getParentsNodes(s.Nodes, nil, nodes)) +} + +// ParentsFilteredUntil is like ParentsUntil, with the option to filter the +// results based on a selector string. It returns a new Selection +// object containing the matched elements. +func (s *Selection) ParentsFilteredUntil(filterSelector, untilSelector string) *Selection { + return filterAndPush(s, getParentsNodes(s.Nodes, compileMatcher(untilSelector), nil), compileMatcher(filterSelector)) +} + +// ParentsFilteredUntilMatcher is like ParentsUntilMatcher, with the option to filter the +// results based on a matcher. It returns a new Selection object containing the matched elements. +func (s *Selection) ParentsFilteredUntilMatcher(filter, until Matcher) *Selection { + return filterAndPush(s, getParentsNodes(s.Nodes, until, nil), filter) +} + +// ParentsFilteredUntilSelection is like ParentsUntilSelection, with the +// option to filter the results based on a selector string. It returns a new +// Selection object containing the matched elements. +func (s *Selection) ParentsFilteredUntilSelection(filterSelector string, sel *Selection) *Selection { + return s.ParentsMatcherUntilSelection(compileMatcher(filterSelector), sel) +} + +// ParentsMatcherUntilSelection is like ParentsUntilSelection, with the +// option to filter the results based on a matcher. It returns a new +// Selection object containing the matched elements. +func (s *Selection) ParentsMatcherUntilSelection(filter Matcher, sel *Selection) *Selection { + if sel == nil { + return s.ParentsMatcher(filter) + } + return s.ParentsMatcherUntilNodes(filter, sel.Nodes...) +} + +// ParentsFilteredUntilNodes is like ParentsUntilNodes, with the +// option to filter the results based on a selector string. It returns a new +// Selection object containing the matched elements. +func (s *Selection) ParentsFilteredUntilNodes(filterSelector string, nodes ...*html.Node) *Selection { + return filterAndPush(s, getParentsNodes(s.Nodes, nil, nodes), compileMatcher(filterSelector)) +} + +// ParentsMatcherUntilNodes is like ParentsUntilNodes, with the +// option to filter the results based on a matcher. It returns a new +// Selection object containing the matched elements. +func (s *Selection) ParentsMatcherUntilNodes(filter Matcher, nodes ...*html.Node) *Selection { + return filterAndPush(s, getParentsNodes(s.Nodes, nil, nodes), filter) +} + +// Siblings gets the siblings of each element in the Selection. It returns +// a new Selection object containing the matched elements. +func (s *Selection) Siblings() *Selection { + return pushStack(s, getSiblingNodes(s.Nodes, siblingAll, nil, nil)) +} + +// SiblingsFiltered gets the siblings of each element in the Selection +// filtered by a selector. It returns a new Selection object containing the +// matched elements. +func (s *Selection) SiblingsFiltered(selector string) *Selection { + return filterAndPush(s, getSiblingNodes(s.Nodes, siblingAll, nil, nil), compileMatcher(selector)) +} + +// SiblingsMatcher gets the siblings of each element in the Selection +// filtered by a matcher. It returns a new Selection object containing the +// matched elements. +func (s *Selection) SiblingsMatcher(m Matcher) *Selection { + return filterAndPush(s, getSiblingNodes(s.Nodes, siblingAll, nil, nil), m) +} + +// Next gets the immediately following sibling of each element in the +// Selection. It returns a new Selection object containing the matched elements. +func (s *Selection) Next() *Selection { + return pushStack(s, getSiblingNodes(s.Nodes, siblingNext, nil, nil)) +} + +// NextFiltered gets the immediately following sibling of each element in the +// Selection filtered by a selector. It returns a new Selection object +// containing the matched elements. +func (s *Selection) NextFiltered(selector string) *Selection { + return filterAndPush(s, getSiblingNodes(s.Nodes, siblingNext, nil, nil), compileMatcher(selector)) +} + +// NextMatcher gets the immediately following sibling of each element in the +// Selection filtered by a matcher. It returns a new Selection object +// containing the matched elements. +func (s *Selection) NextMatcher(m Matcher) *Selection { + return filterAndPush(s, getSiblingNodes(s.Nodes, siblingNext, nil, nil), m) +} + +// NextAll gets all the following siblings of each element in the +// Selection. It returns a new Selection object containing the matched elements. +func (s *Selection) NextAll() *Selection { + return pushStack(s, getSiblingNodes(s.Nodes, siblingNextAll, nil, nil)) +} + +// NextAllFiltered gets all the following siblings of each element in the +// Selection filtered by a selector. It returns a new Selection object +// containing the matched elements. +func (s *Selection) NextAllFiltered(selector string) *Selection { + return filterAndPush(s, getSiblingNodes(s.Nodes, siblingNextAll, nil, nil), compileMatcher(selector)) +} + +// NextAllMatcher gets all the following siblings of each element in the +// Selection filtered by a matcher. It returns a new Selection object +// containing the matched elements. +func (s *Selection) NextAllMatcher(m Matcher) *Selection { + return filterAndPush(s, getSiblingNodes(s.Nodes, siblingNextAll, nil, nil), m) +} + +// Prev gets the immediately preceding sibling of each element in the +// Selection. It returns a new Selection object containing the matched elements. +func (s *Selection) Prev() *Selection { + return pushStack(s, getSiblingNodes(s.Nodes, siblingPrev, nil, nil)) +} + +// PrevFiltered gets the immediately preceding sibling of each element in the +// Selection filtered by a selector. It returns a new Selection object +// containing the matched elements. +func (s *Selection) PrevFiltered(selector string) *Selection { + return filterAndPush(s, getSiblingNodes(s.Nodes, siblingPrev, nil, nil), compileMatcher(selector)) +} + +// PrevMatcher gets the immediately preceding sibling of each element in the +// Selection filtered by a matcher. It returns a new Selection object +// containing the matched elements. +func (s *Selection) PrevMatcher(m Matcher) *Selection { + return filterAndPush(s, getSiblingNodes(s.Nodes, siblingPrev, nil, nil), m) +} + +// PrevAll gets all the preceding siblings of each element in the +// Selection. It returns a new Selection object containing the matched elements. +func (s *Selection) PrevAll() *Selection { + return pushStack(s, getSiblingNodes(s.Nodes, siblingPrevAll, nil, nil)) +} + +// PrevAllFiltered gets all the preceding siblings of each element in the +// Selection filtered by a selector. It returns a new Selection object +// containing the matched elements. +func (s *Selection) PrevAllFiltered(selector string) *Selection { + return filterAndPush(s, getSiblingNodes(s.Nodes, siblingPrevAll, nil, nil), compileMatcher(selector)) +} + +// PrevAllMatcher gets all the preceding siblings of each element in the +// Selection filtered by a matcher. It returns a new Selection object +// containing the matched elements. +func (s *Selection) PrevAllMatcher(m Matcher) *Selection { + return filterAndPush(s, getSiblingNodes(s.Nodes, siblingPrevAll, nil, nil), m) +} + +// NextUntil gets all following siblings of each element up to but not +// including the element matched by the selector. It returns a new Selection +// object containing the matched elements. +func (s *Selection) NextUntil(selector string) *Selection { + return pushStack(s, getSiblingNodes(s.Nodes, siblingNextUntil, + compileMatcher(selector), nil)) +} + +// NextUntilMatcher gets all following siblings of each element up to but not +// including the element matched by the matcher. It returns a new Selection +// object containing the matched elements. +func (s *Selection) NextUntilMatcher(m Matcher) *Selection { + return pushStack(s, getSiblingNodes(s.Nodes, siblingNextUntil, + m, nil)) +} + +// NextUntilSelection gets all following siblings of each element up to but not +// including the element matched by the Selection. It returns a new Selection +// object containing the matched elements. +func (s *Selection) NextUntilSelection(sel *Selection) *Selection { + if sel == nil { + return s.NextAll() + } + return s.NextUntilNodes(sel.Nodes...) +} + +// NextUntilNodes gets all following siblings of each element up to but not +// including the element matched by the nodes. It returns a new Selection +// object containing the matched elements. +func (s *Selection) NextUntilNodes(nodes ...*html.Node) *Selection { + return pushStack(s, getSiblingNodes(s.Nodes, siblingNextUntil, + nil, nodes)) +} + +// PrevUntil gets all preceding siblings of each element up to but not +// including the element matched by the selector. It returns a new Selection +// object containing the matched elements. +func (s *Selection) PrevUntil(selector string) *Selection { + return pushStack(s, getSiblingNodes(s.Nodes, siblingPrevUntil, + compileMatcher(selector), nil)) +} + +// PrevUntilMatcher gets all preceding siblings of each element up to but not +// including the element matched by the matcher. It returns a new Selection +// object containing the matched elements. +func (s *Selection) PrevUntilMatcher(m Matcher) *Selection { + return pushStack(s, getSiblingNodes(s.Nodes, siblingPrevUntil, + m, nil)) +} + +// PrevUntilSelection gets all preceding siblings of each element up to but not +// including the element matched by the Selection. It returns a new Selection +// object containing the matched elements. +func (s *Selection) PrevUntilSelection(sel *Selection) *Selection { + if sel == nil { + return s.PrevAll() + } + return s.PrevUntilNodes(sel.Nodes...) +} + +// PrevUntilNodes gets all preceding siblings of each element up to but not +// including the element matched by the nodes. It returns a new Selection +// object containing the matched elements. +func (s *Selection) PrevUntilNodes(nodes ...*html.Node) *Selection { + return pushStack(s, getSiblingNodes(s.Nodes, siblingPrevUntil, + nil, nodes)) +} + +// NextFilteredUntil is like NextUntil, with the option to filter +// the results based on a selector string. +// It returns a new Selection object containing the matched elements. +func (s *Selection) NextFilteredUntil(filterSelector, untilSelector string) *Selection { + return filterAndPush(s, getSiblingNodes(s.Nodes, siblingNextUntil, + compileMatcher(untilSelector), nil), compileMatcher(filterSelector)) +} + +// NextFilteredUntilMatcher is like NextUntilMatcher, with the option to filter +// the results based on a matcher. +// It returns a new Selection object containing the matched elements. +func (s *Selection) NextFilteredUntilMatcher(filter, until Matcher) *Selection { + return filterAndPush(s, getSiblingNodes(s.Nodes, siblingNextUntil, + until, nil), filter) +} + +// NextFilteredUntilSelection is like NextUntilSelection, with the +// option to filter the results based on a selector string. It returns a new +// Selection object containing the matched elements. +func (s *Selection) NextFilteredUntilSelection(filterSelector string, sel *Selection) *Selection { + return s.NextMatcherUntilSelection(compileMatcher(filterSelector), sel) +} + +// NextMatcherUntilSelection is like NextUntilSelection, with the +// option to filter the results based on a matcher. It returns a new +// Selection object containing the matched elements. +func (s *Selection) NextMatcherUntilSelection(filter Matcher, sel *Selection) *Selection { + if sel == nil { + return s.NextMatcher(filter) + } + return s.NextMatcherUntilNodes(filter, sel.Nodes...) +} + +// NextFilteredUntilNodes is like NextUntilNodes, with the +// option to filter the results based on a selector string. It returns a new +// Selection object containing the matched elements. +func (s *Selection) NextFilteredUntilNodes(filterSelector string, nodes ...*html.Node) *Selection { + return filterAndPush(s, getSiblingNodes(s.Nodes, siblingNextUntil, + nil, nodes), compileMatcher(filterSelector)) +} + +// NextMatcherUntilNodes is like NextUntilNodes, with the +// option to filter the results based on a matcher. It returns a new +// Selection object containing the matched elements. +func (s *Selection) NextMatcherUntilNodes(filter Matcher, nodes ...*html.Node) *Selection { + return filterAndPush(s, getSiblingNodes(s.Nodes, siblingNextUntil, + nil, nodes), filter) +} + +// PrevFilteredUntil is like PrevUntil, with the option to filter +// the results based on a selector string. +// It returns a new Selection object containing the matched elements. +func (s *Selection) PrevFilteredUntil(filterSelector, untilSelector string) *Selection { + return filterAndPush(s, getSiblingNodes(s.Nodes, siblingPrevUntil, + compileMatcher(untilSelector), nil), compileMatcher(filterSelector)) +} + +// PrevFilteredUntilMatcher is like PrevUntilMatcher, with the option to filter +// the results based on a matcher. +// It returns a new Selection object containing the matched elements. +func (s *Selection) PrevFilteredUntilMatcher(filter, until Matcher) *Selection { + return filterAndPush(s, getSiblingNodes(s.Nodes, siblingPrevUntil, + until, nil), filter) +} + +// PrevFilteredUntilSelection is like PrevUntilSelection, with the +// option to filter the results based on a selector string. It returns a new +// Selection object containing the matched elements. +func (s *Selection) PrevFilteredUntilSelection(filterSelector string, sel *Selection) *Selection { + return s.PrevMatcherUntilSelection(compileMatcher(filterSelector), sel) +} + +// PrevMatcherUntilSelection is like PrevUntilSelection, with the +// option to filter the results based on a matcher. It returns a new +// Selection object containing the matched elements. +func (s *Selection) PrevMatcherUntilSelection(filter Matcher, sel *Selection) *Selection { + if sel == nil { + return s.PrevMatcher(filter) + } + return s.PrevMatcherUntilNodes(filter, sel.Nodes...) +} + +// PrevFilteredUntilNodes is like PrevUntilNodes, with the +// option to filter the results based on a selector string. It returns a new +// Selection object containing the matched elements. +func (s *Selection) PrevFilteredUntilNodes(filterSelector string, nodes ...*html.Node) *Selection { + return filterAndPush(s, getSiblingNodes(s.Nodes, siblingPrevUntil, + nil, nodes), compileMatcher(filterSelector)) +} + +// PrevMatcherUntilNodes is like PrevUntilNodes, with the +// option to filter the results based on a matcher. It returns a new +// Selection object containing the matched elements. +func (s *Selection) PrevMatcherUntilNodes(filter Matcher, nodes ...*html.Node) *Selection { + return filterAndPush(s, getSiblingNodes(s.Nodes, siblingPrevUntil, + nil, nodes), filter) +} + +// Filter and push filters the nodes based on a matcher, and pushes the results +// on the stack, with the srcSel as previous selection. +func filterAndPush(srcSel *Selection, nodes []*html.Node, m Matcher) *Selection { + // Create a temporary Selection with the specified nodes to filter using winnow + sel := &Selection{nodes, srcSel.document, nil} + // Filter based on matcher and push on stack + return pushStack(srcSel, winnow(sel, m, true)) +} + +// Internal implementation of Find that return raw nodes. +func findWithMatcher(nodes []*html.Node, m Matcher) []*html.Node { + // Map nodes to find the matches within the children of each node + return mapNodes(nodes, func(i int, n *html.Node) (result []*html.Node) { + // Go down one level, becausejQuery's Find selects only within descendants + for c := n.FirstChild; c != nil; c = c.NextSibling { + if c.Type == html.ElementNode { + result = append(result, m.MatchAll(c)...) + } + } + return + }) +} + +// Internal implementation to get all parent nodes, stopping at the specified +// node (or nil if no stop). +func getParentsNodes(nodes []*html.Node, stopm Matcher, stopNodes []*html.Node) []*html.Node { + return mapNodes(nodes, func(i int, n *html.Node) (result []*html.Node) { + for p := n.Parent; p != nil; p = p.Parent { + sel := newSingleSelection(p, nil) + if stopm != nil { + if sel.IsMatcher(stopm) { + break + } + } else if len(stopNodes) > 0 { + if sel.IsNodes(stopNodes...) { + break + } + } + if p.Type == html.ElementNode { + result = append(result, p) + } + } + return + }) +} + +// Internal implementation of sibling nodes that return a raw slice of matches. +func getSiblingNodes(nodes []*html.Node, st siblingType, untilm Matcher, untilNodes []*html.Node) []*html.Node { + var f func(*html.Node) bool + + // If the requested siblings are ...Until, create the test function to + // determine if the until condition is reached (returns true if it is) + if st == siblingNextUntil || st == siblingPrevUntil { + f = func(n *html.Node) bool { + if untilm != nil { + // Matcher-based condition + sel := newSingleSelection(n, nil) + return sel.IsMatcher(untilm) + } else if len(untilNodes) > 0 { + // Nodes-based condition + sel := newSingleSelection(n, nil) + return sel.IsNodes(untilNodes...) + } + return false + } + } + + return mapNodes(nodes, func(i int, n *html.Node) []*html.Node { + return getChildrenWithSiblingType(n.Parent, st, n, f) + }) +} + +// Gets the children nodes of each node in the specified slice of nodes, +// based on the sibling type request. +func getChildrenNodes(nodes []*html.Node, st siblingType) []*html.Node { + return mapNodes(nodes, func(i int, n *html.Node) []*html.Node { + return getChildrenWithSiblingType(n, st, nil, nil) + }) +} + +// Gets the children of the specified parent, based on the requested sibling +// type, skipping a specified node if required. +func getChildrenWithSiblingType(parent *html.Node, st siblingType, skipNode *html.Node, + untilFunc func(*html.Node) bool) (result []*html.Node) { + + // Create the iterator function + var iter = func(cur *html.Node) (ret *html.Node) { + // Based on the sibling type requested, iterate the right way + for { + switch st { + case siblingAll, siblingAllIncludingNonElements: + if cur == nil { + // First iteration, start with first child of parent + // Skip node if required + if ret = parent.FirstChild; ret == skipNode && skipNode != nil { + ret = skipNode.NextSibling + } + } else { + // Skip node if required + if ret = cur.NextSibling; ret == skipNode && skipNode != nil { + ret = skipNode.NextSibling + } + } + case siblingPrev, siblingPrevAll, siblingPrevUntil: + if cur == nil { + // Start with previous sibling of the skip node + ret = skipNode.PrevSibling + } else { + ret = cur.PrevSibling + } + case siblingNext, siblingNextAll, siblingNextUntil: + if cur == nil { + // Start with next sibling of the skip node + ret = skipNode.NextSibling + } else { + ret = cur.NextSibling + } + default: + panic("Invalid sibling type.") + } + if ret == nil || ret.Type == html.ElementNode || st == siblingAllIncludingNonElements { + return + } + // Not a valid node, try again from this one + cur = ret + } + } + + for c := iter(nil); c != nil; c = iter(c) { + // If this is an ...Until case, test before append (returns true + // if the until condition is reached) + if st == siblingNextUntil || st == siblingPrevUntil { + if untilFunc(c) { + return + } + } + result = append(result, c) + if st == siblingNext || st == siblingPrev { + // Only one node was requested (immediate next or previous), so exit + return + } + } + return +} + +// Internal implementation of parent nodes that return a raw slice of Nodes. +func getParentNodes(nodes []*html.Node) []*html.Node { + return mapNodes(nodes, func(i int, n *html.Node) []*html.Node { + if n.Parent != nil && n.Parent.Type == html.ElementNode { + return []*html.Node{n.Parent} + } + return nil + }) +} + +// Internal map function used by many traversing methods. Takes the source nodes +// to iterate on and the mapping function that returns an array of nodes. +// Returns an array of nodes mapped by calling the callback function once for +// each node in the source nodes. +func mapNodes(nodes []*html.Node, f func(int, *html.Node) []*html.Node) (result []*html.Node) { + set := make(map[*html.Node]bool) + for i, n := range nodes { + if vals := f(i, n); len(vals) > 0 { + result = appendWithoutDuplicates(result, vals, set) + } + } + return result +} diff --git a/vendor/github.com/PuerkitoBio/goquery/type.go b/vendor/github.com/PuerkitoBio/goquery/type.go new file mode 100644 index 00000000..6646c143 --- /dev/null +++ b/vendor/github.com/PuerkitoBio/goquery/type.go @@ -0,0 +1,203 @@ +package goquery + +import ( + "errors" + "io" + "net/http" + "net/url" + + "github.com/andybalholm/cascadia" + "golang.org/x/net/html" +) + +// Document represents an HTML document to be manipulated. Unlike jQuery, which +// is loaded as part of a DOM document, and thus acts upon its containing +// document, GoQuery doesn't know which HTML document to act upon. So it needs +// to be told, and that's what the Document class is for. It holds the root +// document node to manipulate, and can make selections on this document. +type Document struct { + *Selection + Url *url.URL + rootNode *html.Node +} + +// NewDocumentFromNode is a Document constructor that takes a root html Node +// as argument. +func NewDocumentFromNode(root *html.Node) *Document { + return newDocument(root, nil) +} + +// NewDocument is a Document constructor that takes a string URL as argument. +// It loads the specified document, parses it, and stores the root Document +// node, ready to be manipulated. +// +// Deprecated: Use the net/http standard library package to make the request +// and validate the response before calling goquery.NewDocumentFromReader +// with the response's body. +func NewDocument(url string) (*Document, error) { + // Load the URL + res, e := http.Get(url) + if e != nil { + return nil, e + } + return NewDocumentFromResponse(res) +} + +// NewDocumentFromReader returns a Document from an io.Reader. +// It returns an error as second value if the reader's data cannot be parsed +// as html. It does not check if the reader is also an io.Closer, the +// provided reader is never closed by this call. It is the responsibility +// of the caller to close it if required. +func NewDocumentFromReader(r io.Reader) (*Document, error) { + root, e := html.Parse(r) + if e != nil { + return nil, e + } + return newDocument(root, nil), nil +} + +// NewDocumentFromResponse is another Document constructor that takes an http response as argument. +// It loads the specified response's document, parses it, and stores the root Document +// node, ready to be manipulated. The response's body is closed on return. +// +// Deprecated: Use goquery.NewDocumentFromReader with the response's body. +func NewDocumentFromResponse(res *http.Response) (*Document, error) { + if res == nil { + return nil, errors.New("Response is nil") + } + defer res.Body.Close() + if res.Request == nil { + return nil, errors.New("Response.Request is nil") + } + + // Parse the HTML into nodes + root, e := html.Parse(res.Body) + if e != nil { + return nil, e + } + + // Create and fill the document + return newDocument(root, res.Request.URL), nil +} + +// CloneDocument creates a deep-clone of a document. +func CloneDocument(doc *Document) *Document { + return newDocument(cloneNode(doc.rootNode), doc.Url) +} + +// Private constructor, make sure all fields are correctly filled. +func newDocument(root *html.Node, url *url.URL) *Document { + // Create and fill the document + d := &Document{nil, url, root} + d.Selection = newSingleSelection(root, d) + return d +} + +// Selection represents a collection of nodes matching some criteria. The +// initial Selection can be created by using Document.Find, and then +// manipulated using the jQuery-like chainable syntax and methods. +type Selection struct { + Nodes []*html.Node + document *Document + prevSel *Selection +} + +// Helper constructor to create an empty selection +func newEmptySelection(doc *Document) *Selection { + return &Selection{nil, doc, nil} +} + +// Helper constructor to create a selection of only one node +func newSingleSelection(node *html.Node, doc *Document) *Selection { + return &Selection{[]*html.Node{node}, doc, nil} +} + +// Matcher is an interface that defines the methods to match +// HTML nodes against a compiled selector string. Cascadia's +// Selector implements this interface. +type Matcher interface { + Match(*html.Node) bool + MatchAll(*html.Node) []*html.Node + Filter([]*html.Node) []*html.Node +} + +// Single compiles a selector string to a Matcher that stops after the first +// match is found. +// +// By default, Selection.Find and other functions that accept a selector string +// to select nodes will use all matches corresponding to that selector. By +// using the Matcher returned by Single, at most the first match will be +// selected. +// +// For example, those two statements are semantically equivalent: +// +// sel1 := doc.Find("a").First() +// sel2 := doc.FindMatcher(goquery.Single("a")) +// +// The one using Single is optimized to be potentially much faster on large +// documents. +// +// Only the behaviour of the MatchAll method of the Matcher interface is +// altered compared to standard Matchers. This means that the single-selection +// property of the Matcher only applies for Selection methods where the Matcher +// is used to select nodes, not to filter or check if a node matches the +// Matcher - in those cases, the behaviour of the Matcher is unchanged (e.g. +// FilterMatcher(Single("div")) will still result in a Selection with multiple +// "div"s if there were many "div"s in the Selection to begin with). +func Single(selector string) Matcher { + return singleMatcher{compileMatcher(selector)} +} + +// SingleMatcher returns a Matcher matches the same nodes as m, but that stops +// after the first match is found. +// +// See the documentation of function Single for more details. +func SingleMatcher(m Matcher) Matcher { + if _, ok := m.(singleMatcher); ok { + // m is already a singleMatcher + return m + } + return singleMatcher{m} +} + +// compileMatcher compiles the selector string s and returns +// the corresponding Matcher. If s is an invalid selector string, +// it returns a Matcher that fails all matches. +func compileMatcher(s string) Matcher { + cs, err := cascadia.Compile(s) + if err != nil { + return invalidMatcher{} + } + return cs +} + +type singleMatcher struct { + Matcher +} + +func (m singleMatcher) MatchAll(n *html.Node) []*html.Node { + // Optimized version - stops finding at the first match (cascadia-compiled + // matchers all use this code path). + if mm, ok := m.Matcher.(interface{ MatchFirst(*html.Node) *html.Node }); ok { + node := mm.MatchFirst(n) + if node == nil { + return nil + } + return []*html.Node{node} + } + + // Fallback version, for e.g. test mocks that don't provide the MatchFirst + // method. + nodes := m.Matcher.MatchAll(n) + if len(nodes) > 0 { + return nodes[:1:1] + } + return nil +} + +// invalidMatcher is a Matcher that always fails to match. +type invalidMatcher struct{} + +func (invalidMatcher) Match(n *html.Node) bool { return false } +func (invalidMatcher) MatchAll(n *html.Node) []*html.Node { return nil } +func (invalidMatcher) Filter(ns []*html.Node) []*html.Node { return nil } diff --git a/vendor/github.com/PuerkitoBio/goquery/utilities.go b/vendor/github.com/PuerkitoBio/goquery/utilities.go new file mode 100644 index 00000000..ecd3453f --- /dev/null +++ b/vendor/github.com/PuerkitoBio/goquery/utilities.go @@ -0,0 +1,178 @@ +package goquery + +import ( + "bytes" + "io" + + "golang.org/x/net/html" +) + +// used to determine if a set (map[*html.Node]bool) should be used +// instead of iterating over a slice. The set uses more memory and +// is slower than slice iteration for small N. +const minNodesForSet = 1000 + +var nodeNames = []string{ + html.ErrorNode: "#error", + html.TextNode: "#text", + html.DocumentNode: "#document", + html.CommentNode: "#comment", +} + +// NodeName returns the node name of the first element in the selection. +// It tries to behave in a similar way as the DOM's nodeName property +// (https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeName). +// +// Go's net/html package defines the following node types, listed with +// the corresponding returned value from this function: +// +// ErrorNode : #error +// TextNode : #text +// DocumentNode : #document +// ElementNode : the element's tag name +// CommentNode : #comment +// DoctypeNode : the name of the document type +// +func NodeName(s *Selection) string { + if s.Length() == 0 { + return "" + } + return nodeName(s.Get(0)) +} + +// nodeName returns the node name of the given html node. +// See NodeName for additional details on behaviour. +func nodeName(node *html.Node) string { + if node == nil { + return "" + } + + switch node.Type { + case html.ElementNode, html.DoctypeNode: + return node.Data + default: + if int(node.Type) < len(nodeNames) { + return nodeNames[node.Type] + } + return "" + } +} + +// Render renders the HTML of the first item in the selection and writes it to +// the writer. It behaves the same as OuterHtml but writes to w instead of +// returning the string. +func Render(w io.Writer, s *Selection) error { + if s.Length() == 0 { + return nil + } + n := s.Get(0) + return html.Render(w, n) +} + +// OuterHtml returns the outer HTML rendering of the first item in +// the selection - that is, the HTML including the first element's +// tag and attributes. +// +// Unlike Html, this is a function and not a method on the Selection, +// because this is not a jQuery method (in javascript-land, this is +// a property provided by the DOM). +func OuterHtml(s *Selection) (string, error) { + var buf bytes.Buffer + if err := Render(&buf, s); err != nil { + return "", err + } + return buf.String(), nil +} + +// Loop through all container nodes to search for the target node. +func sliceContains(container []*html.Node, contained *html.Node) bool { + for _, n := range container { + if nodeContains(n, contained) { + return true + } + } + + return false +} + +// Checks if the contained node is within the container node. +func nodeContains(container *html.Node, contained *html.Node) bool { + // Check if the parent of the contained node is the container node, traversing + // upward until the top is reached, or the container is found. + for contained = contained.Parent; contained != nil; contained = contained.Parent { + if container == contained { + return true + } + } + return false +} + +// Checks if the target node is in the slice of nodes. +func isInSlice(slice []*html.Node, node *html.Node) bool { + return indexInSlice(slice, node) > -1 +} + +// Returns the index of the target node in the slice, or -1. +func indexInSlice(slice []*html.Node, node *html.Node) int { + if node != nil { + for i, n := range slice { + if n == node { + return i + } + } + } + return -1 +} + +// Appends the new nodes to the target slice, making sure no duplicate is added. +// There is no check to the original state of the target slice, so it may still +// contain duplicates. The target slice is returned because append() may create +// a new underlying array. If targetSet is nil, a local set is created with the +// target if len(target) + len(nodes) is greater than minNodesForSet. +func appendWithoutDuplicates(target []*html.Node, nodes []*html.Node, targetSet map[*html.Node]bool) []*html.Node { + // if there are not that many nodes, don't use the map, faster to just use nested loops + // (unless a non-nil targetSet is passed, in which case the caller knows better). + if targetSet == nil && len(target)+len(nodes) < minNodesForSet { + for _, n := range nodes { + if !isInSlice(target, n) { + target = append(target, n) + } + } + return target + } + + // if a targetSet is passed, then assume it is reliable, otherwise create one + // and initialize it with the current target contents. + if targetSet == nil { + targetSet = make(map[*html.Node]bool, len(target)) + for _, n := range target { + targetSet[n] = true + } + } + for _, n := range nodes { + if !targetSet[n] { + target = append(target, n) + targetSet[n] = true + } + } + + return target +} + +// Loop through a selection, returning only those nodes that pass the predicate +// function. +func grep(sel *Selection, predicate func(i int, s *Selection) bool) (result []*html.Node) { + for i, n := range sel.Nodes { + if predicate(i, newSingleSelection(n, sel.document)) { + result = append(result, n) + } + } + return result +} + +// Creates a new Selection object based on the specified nodes, and keeps the +// source Selection object on the stack (linked list). +func pushStack(fromSel *Selection, nodes []*html.Node) *Selection { + result := &Selection{nodes, fromSel.document, fromSel} + return result +} diff --git a/vendor/github.com/andybalholm/cascadia/.travis.yml b/vendor/github.com/andybalholm/cascadia/.travis.yml new file mode 100644 index 00000000..6f227517 --- /dev/null +++ b/vendor/github.com/andybalholm/cascadia/.travis.yml @@ -0,0 +1,14 @@ +language: go + +go: + - 1.3 + - 1.4 + +install: + - go get github.com/andybalholm/cascadia + +script: + - go test -v + +notifications: + email: false diff --git a/vendor/github.com/andybalholm/cascadia/LICENSE b/vendor/github.com/andybalholm/cascadia/LICENSE new file mode 100644 index 00000000..ee5ad35a --- /dev/null +++ b/vendor/github.com/andybalholm/cascadia/LICENSE @@ -0,0 +1,24 @@ +Copyright (c) 2011 Andy Balholm. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/andybalholm/cascadia/README.md b/vendor/github.com/andybalholm/cascadia/README.md new file mode 100644 index 00000000..6433cb9c --- /dev/null +++ b/vendor/github.com/andybalholm/cascadia/README.md @@ -0,0 +1,144 @@ +# cascadia + +[![](https://travis-ci.org/andybalholm/cascadia.svg)](https://travis-ci.org/andybalholm/cascadia) + +The Cascadia package implements CSS selectors for use with the parse trees produced by the html package. + +To test CSS selectors without writing Go code, check out [cascadia](https://github.com/suntong/cascadia) the command line tool, a thin wrapper around this package. + +[Refer to godoc here](https://godoc.org/github.com/andybalholm/cascadia). + +## Example + +The following is an example of how you can use Cascadia. + +```go +package main + +import ( + "fmt" + "log" + "strings" + + "github.com/andybalholm/cascadia" + "golang.org/x/net/html" +) + +var pricingHtml string = ` +
+
+

Free

+
+
+

$0/mo

+
    +
  • 10 users included
  • +
  • 2 GB of storage
  • +
  • See more
  • +
+
+
+ +
+
+

Pro

+
+
+

$15/mo

+
    +
  • 20 users included
  • +
  • 10 GB of storage
  • +
  • See more
  • +
+
+
+ +
+
+

Enterprise

+
+
+

$29/mo

+
    +
  • 30 users included
  • +
  • 15 GB of storage
  • +
  • See more
  • +
+
+
+` + +func Query(n *html.Node, query string) *html.Node { + sel, err := cascadia.Parse(query) + if err != nil { + return &html.Node{} + } + return cascadia.Query(n, sel) +} + +func QueryAll(n *html.Node, query string) []*html.Node { + sel, err := cascadia.Parse(query) + if err != nil { + return []*html.Node{} + } + return cascadia.QueryAll(n, sel) +} + +func AttrOr(n *html.Node, attrName, or string) string { + for _, a := range n.Attr { + if a.Key == attrName { + return a.Val + } + } + return or +} + +func main() { + doc, err := html.Parse(strings.NewReader(pricingHtml)) + if err != nil { + log.Fatal(err) + } + fmt.Printf("List of pricing plans:\n\n") + for i, p := range QueryAll(doc, "div.card.mb-4.box-shadow") { + planName := Query(p, "h4").FirstChild.Data + price := Query(p, ".pricing-card-title").FirstChild.Data + usersIncluded := Query(p, "li:first-child").FirstChild.Data + storage := Query(p, "li:nth-child(2)").FirstChild.Data + detailsUrl := AttrOr(Query(p, "li:last-child a"), "href", "(No link available)") + fmt.Printf( + "Plan #%d\nName: %s\nPrice: %s\nUsers: %s\nStorage: %s\nDetails: %s\n\n", + i+1, + planName, + price, + usersIncluded, + storage, + detailsUrl, + ) + } +} +``` +The output is: +``` +List of pricing plans: + +Plan #1 +Name: Free +Price: $0/mo +Users: 10 users included +Storage: 2 GB of storage +Details: https://example.com + +Plan #2 +Name: Pro +Price: $15/mo +Users: 20 users included +Storage: 10 GB of storage +Details: https://example.com + +Plan #3 +Name: Enterprise +Price: $29/mo +Users: 30 users included +Storage: 15 GB of storage +Details: (No link available) +``` diff --git a/vendor/github.com/andybalholm/cascadia/parser.go b/vendor/github.com/andybalholm/cascadia/parser.go new file mode 100644 index 00000000..06eccd58 --- /dev/null +++ b/vendor/github.com/andybalholm/cascadia/parser.go @@ -0,0 +1,889 @@ +// Package cascadia is an implementation of CSS selectors. +package cascadia + +import ( + "errors" + "fmt" + "regexp" + "strconv" + "strings" +) + +// a parser for CSS selectors +type parser struct { + s string // the source text + i int // the current position + + // if `false`, parsing a pseudo-element + // returns an error. + acceptPseudoElements bool +} + +// parseEscape parses a backslash escape. +func (p *parser) parseEscape() (result string, err error) { + if len(p.s) < p.i+2 || p.s[p.i] != '\\' { + return "", errors.New("invalid escape sequence") + } + + start := p.i + 1 + c := p.s[start] + switch { + case c == '\r' || c == '\n' || c == '\f': + return "", errors.New("escaped line ending outside string") + case hexDigit(c): + // unicode escape (hex) + var i int + for i = start; i < start+6 && i < len(p.s) && hexDigit(p.s[i]); i++ { + // empty + } + v, _ := strconv.ParseUint(p.s[start:i], 16, 64) + if len(p.s) > i { + switch p.s[i] { + case '\r': + i++ + if len(p.s) > i && p.s[i] == '\n' { + i++ + } + case ' ', '\t', '\n', '\f': + i++ + } + } + p.i = i + return string(rune(v)), nil + } + + // Return the literal character after the backslash. + result = p.s[start : start+1] + p.i += 2 + return result, nil +} + +// toLowerASCII returns s with all ASCII capital letters lowercased. +func toLowerASCII(s string) string { + var b []byte + for i := 0; i < len(s); i++ { + if c := s[i]; 'A' <= c && c <= 'Z' { + if b == nil { + b = make([]byte, len(s)) + copy(b, s) + } + b[i] = s[i] + ('a' - 'A') + } + } + + if b == nil { + return s + } + + return string(b) +} + +func hexDigit(c byte) bool { + return '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' +} + +// nameStart returns whether c can be the first character of an identifier +// (not counting an initial hyphen, or an escape sequence). +func nameStart(c byte) bool { + return 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '_' || c > 127 +} + +// nameChar returns whether c can be a character within an identifier +// (not counting an escape sequence). +func nameChar(c byte) bool { + return 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '_' || c > 127 || + c == '-' || '0' <= c && c <= '9' +} + +// parseIdentifier parses an identifier. +func (p *parser) parseIdentifier() (result string, err error) { + const prefix = '-' + var numPrefix int + + for len(p.s) > p.i && p.s[p.i] == prefix { + p.i++ + numPrefix++ + } + + if len(p.s) <= p.i { + return "", errors.New("expected identifier, found EOF instead") + } + + if c := p.s[p.i]; !(nameStart(c) || c == '\\') { + return "", fmt.Errorf("expected identifier, found %c instead", c) + } + + result, err = p.parseName() + if numPrefix > 0 && err == nil { + result = strings.Repeat(string(prefix), numPrefix) + result + } + return +} + +// parseName parses a name (which is like an identifier, but doesn't have +// extra restrictions on the first character). +func (p *parser) parseName() (result string, err error) { + i := p.i +loop: + for i < len(p.s) { + c := p.s[i] + switch { + case nameChar(c): + start := i + for i < len(p.s) && nameChar(p.s[i]) { + i++ + } + result += p.s[start:i] + case c == '\\': + p.i = i + val, err := p.parseEscape() + if err != nil { + return "", err + } + i = p.i + result += val + default: + break loop + } + } + + if result == "" { + return "", errors.New("expected name, found EOF instead") + } + + p.i = i + return result, nil +} + +// parseString parses a single- or double-quoted string. +func (p *parser) parseString() (result string, err error) { + i := p.i + if len(p.s) < i+2 { + return "", errors.New("expected string, found EOF instead") + } + + quote := p.s[i] + i++ + +loop: + for i < len(p.s) { + switch p.s[i] { + case '\\': + if len(p.s) > i+1 { + switch c := p.s[i+1]; c { + case '\r': + if len(p.s) > i+2 && p.s[i+2] == '\n' { + i += 3 + continue loop + } + fallthrough + case '\n', '\f': + i += 2 + continue loop + } + } + p.i = i + val, err := p.parseEscape() + if err != nil { + return "", err + } + i = p.i + result += val + case quote: + break loop + case '\r', '\n', '\f': + return "", errors.New("unexpected end of line in string") + default: + start := i + for i < len(p.s) { + if c := p.s[i]; c == quote || c == '\\' || c == '\r' || c == '\n' || c == '\f' { + break + } + i++ + } + result += p.s[start:i] + } + } + + if i >= len(p.s) { + return "", errors.New("EOF in string") + } + + // Consume the final quote. + i++ + + p.i = i + return result, nil +} + +// parseRegex parses a regular expression; the end is defined by encountering an +// unmatched closing ')' or ']' which is not consumed +func (p *parser) parseRegex() (rx *regexp.Regexp, err error) { + i := p.i + if len(p.s) < i+2 { + return nil, errors.New("expected regular expression, found EOF instead") + } + + // number of open parens or brackets; + // when it becomes negative, finished parsing regex + open := 0 + +loop: + for i < len(p.s) { + switch p.s[i] { + case '(', '[': + open++ + case ')', ']': + open-- + if open < 0 { + break loop + } + } + i++ + } + + if i >= len(p.s) { + return nil, errors.New("EOF in regular expression") + } + rx, err = regexp.Compile(p.s[p.i:i]) + p.i = i + return rx, err +} + +// skipWhitespace consumes whitespace characters and comments. +// It returns true if there was actually anything to skip. +func (p *parser) skipWhitespace() bool { + i := p.i + for i < len(p.s) { + switch p.s[i] { + case ' ', '\t', '\r', '\n', '\f': + i++ + continue + case '/': + if strings.HasPrefix(p.s[i:], "/*") { + end := strings.Index(p.s[i+len("/*"):], "*/") + if end != -1 { + i += end + len("/**/") + continue + } + } + } + break + } + + if i > p.i { + p.i = i + return true + } + + return false +} + +// consumeParenthesis consumes an opening parenthesis and any following +// whitespace. It returns true if there was actually a parenthesis to skip. +func (p *parser) consumeParenthesis() bool { + if p.i < len(p.s) && p.s[p.i] == '(' { + p.i++ + p.skipWhitespace() + return true + } + return false +} + +// consumeClosingParenthesis consumes a closing parenthesis and any preceding +// whitespace. It returns true if there was actually a parenthesis to skip. +func (p *parser) consumeClosingParenthesis() bool { + i := p.i + p.skipWhitespace() + if p.i < len(p.s) && p.s[p.i] == ')' { + p.i++ + return true + } + p.i = i + return false +} + +// parseTypeSelector parses a type selector (one that matches by tag name). +func (p *parser) parseTypeSelector() (result tagSelector, err error) { + tag, err := p.parseIdentifier() + if err != nil { + return + } + return tagSelector{tag: toLowerASCII(tag)}, nil +} + +// parseIDSelector parses a selector that matches by id attribute. +func (p *parser) parseIDSelector() (idSelector, error) { + if p.i >= len(p.s) { + return idSelector{}, fmt.Errorf("expected id selector (#id), found EOF instead") + } + if p.s[p.i] != '#' { + return idSelector{}, fmt.Errorf("expected id selector (#id), found '%c' instead", p.s[p.i]) + } + + p.i++ + id, err := p.parseName() + if err != nil { + return idSelector{}, err + } + + return idSelector{id: id}, nil +} + +// parseClassSelector parses a selector that matches by class attribute. +func (p *parser) parseClassSelector() (classSelector, error) { + if p.i >= len(p.s) { + return classSelector{}, fmt.Errorf("expected class selector (.class), found EOF instead") + } + if p.s[p.i] != '.' { + return classSelector{}, fmt.Errorf("expected class selector (.class), found '%c' instead", p.s[p.i]) + } + + p.i++ + class, err := p.parseIdentifier() + if err != nil { + return classSelector{}, err + } + + return classSelector{class: class}, nil +} + +// parseAttributeSelector parses a selector that matches by attribute value. +func (p *parser) parseAttributeSelector() (attrSelector, error) { + if p.i >= len(p.s) { + return attrSelector{}, fmt.Errorf("expected attribute selector ([attribute]), found EOF instead") + } + if p.s[p.i] != '[' { + return attrSelector{}, fmt.Errorf("expected attribute selector ([attribute]), found '%c' instead", p.s[p.i]) + } + + p.i++ + p.skipWhitespace() + key, err := p.parseIdentifier() + if err != nil { + return attrSelector{}, err + } + key = toLowerASCII(key) + + p.skipWhitespace() + if p.i >= len(p.s) { + return attrSelector{}, errors.New("unexpected EOF in attribute selector") + } + + if p.s[p.i] == ']' { + p.i++ + return attrSelector{key: key, operation: ""}, nil + } + + if p.i+2 >= len(p.s) { + return attrSelector{}, errors.New("unexpected EOF in attribute selector") + } + + op := p.s[p.i : p.i+2] + if op[0] == '=' { + op = "=" + } else if op[1] != '=' { + return attrSelector{}, fmt.Errorf(`expected equality operator, found "%s" instead`, op) + } + p.i += len(op) + + p.skipWhitespace() + if p.i >= len(p.s) { + return attrSelector{}, errors.New("unexpected EOF in attribute selector") + } + var val string + var rx *regexp.Regexp + if op == "#=" { + rx, err = p.parseRegex() + } else { + switch p.s[p.i] { + case '\'', '"': + val, err = p.parseString() + default: + val, err = p.parseIdentifier() + } + } + if err != nil { + return attrSelector{}, err + } + + p.skipWhitespace() + if p.i >= len(p.s) { + return attrSelector{}, errors.New("unexpected EOF in attribute selector") + } + + // check if the attribute contains an ignore case flag + ignoreCase := false + if p.s[p.i] == 'i' || p.s[p.i] == 'I' { + ignoreCase = true + p.i++ + } + + p.skipWhitespace() + if p.i >= len(p.s) { + return attrSelector{}, errors.New("unexpected EOF in attribute selector") + } + + if p.s[p.i] != ']' { + return attrSelector{}, fmt.Errorf("expected ']', found '%c' instead", p.s[p.i]) + } + p.i++ + + switch op { + case "=", "!=", "~=", "|=", "^=", "$=", "*=", "#=": + return attrSelector{key: key, val: val, operation: op, regexp: rx, insensitive: ignoreCase}, nil + default: + return attrSelector{}, fmt.Errorf("attribute operator %q is not supported", op) + } +} + +var ( + errExpectedParenthesis = errors.New("expected '(' but didn't find it") + errExpectedClosingParenthesis = errors.New("expected ')' but didn't find it") + errUnmatchedParenthesis = errors.New("unmatched '('") +) + +// parsePseudoclassSelector parses a pseudoclass selector like :not(p) or a pseudo-element +// For backwards compatibility, both ':' and '::' prefix are allowed for pseudo-elements. +// https://drafts.csswg.org/selectors-3/#pseudo-elements +// Returning a nil `Sel` (and a nil `error`) means we found a pseudo-element. +func (p *parser) parsePseudoclassSelector() (out Sel, pseudoElement string, err error) { + if p.i >= len(p.s) { + return nil, "", fmt.Errorf("expected pseudoclass selector (:pseudoclass), found EOF instead") + } + if p.s[p.i] != ':' { + return nil, "", fmt.Errorf("expected attribute selector (:pseudoclass), found '%c' instead", p.s[p.i]) + } + + p.i++ + var mustBePseudoElement bool + if p.i >= len(p.s) { + return nil, "", fmt.Errorf("got empty pseudoclass (or pseudoelement)") + } + if p.s[p.i] == ':' { // we found a pseudo-element + mustBePseudoElement = true + p.i++ + } + + name, err := p.parseIdentifier() + if err != nil { + return + } + name = toLowerASCII(name) + if mustBePseudoElement && (name != "after" && name != "backdrop" && name != "before" && + name != "cue" && name != "first-letter" && name != "first-line" && name != "grammar-error" && + name != "marker" && name != "placeholder" && name != "selection" && name != "spelling-error") { + return out, "", fmt.Errorf("unknown pseudoelement :%s", name) + } + + switch name { + case "not", "has", "haschild": + if !p.consumeParenthesis() { + return out, "", errExpectedParenthesis + } + sel, parseErr := p.parseSelectorGroup() + if parseErr != nil { + return out, "", parseErr + } + if !p.consumeClosingParenthesis() { + return out, "", errExpectedClosingParenthesis + } + + out = relativePseudoClassSelector{name: name, match: sel} + + case "contains", "containsown": + if !p.consumeParenthesis() { + return out, "", errExpectedParenthesis + } + if p.i == len(p.s) { + return out, "", errUnmatchedParenthesis + } + var val string + switch p.s[p.i] { + case '\'', '"': + val, err = p.parseString() + default: + val, err = p.parseIdentifier() + } + if err != nil { + return out, "", err + } + val = strings.ToLower(val) + p.skipWhitespace() + if p.i >= len(p.s) { + return out, "", errors.New("unexpected EOF in pseudo selector") + } + if !p.consumeClosingParenthesis() { + return out, "", errExpectedClosingParenthesis + } + + out = containsPseudoClassSelector{own: name == "containsown", value: val} + + case "matches", "matchesown": + if !p.consumeParenthesis() { + return out, "", errExpectedParenthesis + } + rx, err := p.parseRegex() + if err != nil { + return out, "", err + } + if p.i >= len(p.s) { + return out, "", errors.New("unexpected EOF in pseudo selector") + } + if !p.consumeClosingParenthesis() { + return out, "", errExpectedClosingParenthesis + } + + out = regexpPseudoClassSelector{own: name == "matchesown", regexp: rx} + + case "nth-child", "nth-last-child", "nth-of-type", "nth-last-of-type": + if !p.consumeParenthesis() { + return out, "", errExpectedParenthesis + } + a, b, err := p.parseNth() + if err != nil { + return out, "", err + } + if !p.consumeClosingParenthesis() { + return out, "", errExpectedClosingParenthesis + } + last := name == "nth-last-child" || name == "nth-last-of-type" + ofType := name == "nth-of-type" || name == "nth-last-of-type" + out = nthPseudoClassSelector{a: a, b: b, last: last, ofType: ofType} + + case "first-child": + out = nthPseudoClassSelector{a: 0, b: 1, ofType: false, last: false} + case "last-child": + out = nthPseudoClassSelector{a: 0, b: 1, ofType: false, last: true} + case "first-of-type": + out = nthPseudoClassSelector{a: 0, b: 1, ofType: true, last: false} + case "last-of-type": + out = nthPseudoClassSelector{a: 0, b: 1, ofType: true, last: true} + case "only-child": + out = onlyChildPseudoClassSelector{ofType: false} + case "only-of-type": + out = onlyChildPseudoClassSelector{ofType: true} + case "input": + out = inputPseudoClassSelector{} + case "empty": + out = emptyElementPseudoClassSelector{} + case "root": + out = rootPseudoClassSelector{} + case "link": + out = linkPseudoClassSelector{} + case "lang": + if !p.consumeParenthesis() { + return out, "", errExpectedParenthesis + } + if p.i == len(p.s) { + return out, "", errUnmatchedParenthesis + } + val, err := p.parseIdentifier() + if err != nil { + return out, "", err + } + val = strings.ToLower(val) + p.skipWhitespace() + if p.i >= len(p.s) { + return out, "", errors.New("unexpected EOF in pseudo selector") + } + if !p.consumeClosingParenthesis() { + return out, "", errExpectedClosingParenthesis + } + out = langPseudoClassSelector{lang: val} + case "enabled": + out = enabledPseudoClassSelector{} + case "disabled": + out = disabledPseudoClassSelector{} + case "checked": + out = checkedPseudoClassSelector{} + case "visited", "hover", "active", "focus", "target": + // Not applicable in a static context: never match. + out = neverMatchSelector{value: ":" + name} + case "after", "backdrop", "before", "cue", "first-letter", "first-line", "grammar-error", "marker", "placeholder", "selection", "spelling-error": + return nil, name, nil + default: + return out, "", fmt.Errorf("unknown pseudoclass or pseudoelement :%s", name) + } + return +} + +// parseInteger parses a decimal integer. +func (p *parser) parseInteger() (int, error) { + i := p.i + start := i + for i < len(p.s) && '0' <= p.s[i] && p.s[i] <= '9' { + i++ + } + if i == start { + return 0, errors.New("expected integer, but didn't find it") + } + p.i = i + + val, err := strconv.Atoi(p.s[start:i]) + if err != nil { + return 0, err + } + + return val, nil +} + +// parseNth parses the argument for :nth-child (normally of the form an+b). +func (p *parser) parseNth() (a, b int, err error) { + // initial state + if p.i >= len(p.s) { + goto eof + } + switch p.s[p.i] { + case '-': + p.i++ + goto negativeA + case '+': + p.i++ + goto positiveA + case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + goto positiveA + case 'n', 'N': + a = 1 + p.i++ + goto readN + case 'o', 'O', 'e', 'E': + id, nameErr := p.parseName() + if nameErr != nil { + return 0, 0, nameErr + } + id = toLowerASCII(id) + if id == "odd" { + return 2, 1, nil + } + if id == "even" { + return 2, 0, nil + } + return 0, 0, fmt.Errorf("expected 'odd' or 'even', but found '%s' instead", id) + default: + goto invalid + } + +positiveA: + if p.i >= len(p.s) { + goto eof + } + switch p.s[p.i] { + case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + a, err = p.parseInteger() + if err != nil { + return 0, 0, err + } + goto readA + case 'n', 'N': + a = 1 + p.i++ + goto readN + default: + goto invalid + } + +negativeA: + if p.i >= len(p.s) { + goto eof + } + switch p.s[p.i] { + case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + a, err = p.parseInteger() + if err != nil { + return 0, 0, err + } + a = -a + goto readA + case 'n', 'N': + a = -1 + p.i++ + goto readN + default: + goto invalid + } + +readA: + if p.i >= len(p.s) { + goto eof + } + switch p.s[p.i] { + case 'n', 'N': + p.i++ + goto readN + default: + // The number we read as a is actually b. + return 0, a, nil + } + +readN: + p.skipWhitespace() + if p.i >= len(p.s) { + goto eof + } + switch p.s[p.i] { + case '+': + p.i++ + p.skipWhitespace() + b, err = p.parseInteger() + if err != nil { + return 0, 0, err + } + return a, b, nil + case '-': + p.i++ + p.skipWhitespace() + b, err = p.parseInteger() + if err != nil { + return 0, 0, err + } + return a, -b, nil + default: + return a, 0, nil + } + +eof: + return 0, 0, errors.New("unexpected EOF while attempting to parse expression of form an+b") + +invalid: + return 0, 0, errors.New("unexpected character while attempting to parse expression of form an+b") +} + +// parseSimpleSelectorSequence parses a selector sequence that applies to +// a single element. +func (p *parser) parseSimpleSelectorSequence() (Sel, error) { + var selectors []Sel + + if p.i >= len(p.s) { + return nil, errors.New("expected selector, found EOF instead") + } + + switch p.s[p.i] { + case '*': + // It's the universal selector. Just skip over it, since it doesn't affect the meaning. + p.i++ + if p.i+2 < len(p.s) && p.s[p.i:p.i+2] == "|*" { // other version of universal selector + p.i += 2 + } + case '#', '.', '[', ':': + // There's no type selector. Wait to process the other till the main loop. + default: + r, err := p.parseTypeSelector() + if err != nil { + return nil, err + } + selectors = append(selectors, r) + } + + var pseudoElement string +loop: + for p.i < len(p.s) { + var ( + ns Sel + newPseudoElement string + err error + ) + switch p.s[p.i] { + case '#': + ns, err = p.parseIDSelector() + case '.': + ns, err = p.parseClassSelector() + case '[': + ns, err = p.parseAttributeSelector() + case ':': + ns, newPseudoElement, err = p.parsePseudoclassSelector() + default: + break loop + } + if err != nil { + return nil, err + } + // From https://drafts.csswg.org/selectors-3/#pseudo-elements : + // "Only one pseudo-element may appear per selector, and if present + // it must appear after the sequence of simple selectors that + // represents the subjects of the selector."" + if ns == nil { // we found a pseudo-element + if pseudoElement != "" { + return nil, fmt.Errorf("only one pseudo-element is accepted per selector, got %s and %s", pseudoElement, newPseudoElement) + } + if !p.acceptPseudoElements { + return nil, fmt.Errorf("pseudo-element %s found, but pseudo-elements support is disabled", newPseudoElement) + } + pseudoElement = newPseudoElement + } else { + if pseudoElement != "" { + return nil, fmt.Errorf("pseudo-element %s must be at the end of selector", pseudoElement) + } + selectors = append(selectors, ns) + } + + } + if len(selectors) == 1 && pseudoElement == "" { // no need wrap the selectors in compoundSelector + return selectors[0], nil + } + return compoundSelector{selectors: selectors, pseudoElement: pseudoElement}, nil +} + +// parseSelector parses a selector that may include combinators. +func (p *parser) parseSelector() (Sel, error) { + p.skipWhitespace() + result, err := p.parseSimpleSelectorSequence() + if err != nil { + return nil, err + } + + for { + var ( + combinator byte + c Sel + ) + if p.skipWhitespace() { + combinator = ' ' + } + if p.i >= len(p.s) { + return result, nil + } + + switch p.s[p.i] { + case '+', '>', '~': + combinator = p.s[p.i] + p.i++ + p.skipWhitespace() + case ',', ')': + // These characters can't begin a selector, but they can legally occur after one. + return result, nil + } + + if combinator == 0 { + return result, nil + } + + c, err = p.parseSimpleSelectorSequence() + if err != nil { + return nil, err + } + result = combinedSelector{first: result, combinator: combinator, second: c} + } +} + +// parseSelectorGroup parses a group of selectors, separated by commas. +func (p *parser) parseSelectorGroup() (SelectorGroup, error) { + current, err := p.parseSelector() + if err != nil { + return nil, err + } + result := SelectorGroup{current} + + for p.i < len(p.s) { + if p.s[p.i] != ',' { + break + } + p.i++ + c, err := p.parseSelector() + if err != nil { + return nil, err + } + result = append(result, c) + } + return result, nil +} diff --git a/vendor/github.com/andybalholm/cascadia/pseudo_classes.go b/vendor/github.com/andybalholm/cascadia/pseudo_classes.go new file mode 100644 index 00000000..6234c3eb --- /dev/null +++ b/vendor/github.com/andybalholm/cascadia/pseudo_classes.go @@ -0,0 +1,458 @@ +package cascadia + +import ( + "bytes" + "fmt" + "regexp" + "strings" + + "golang.org/x/net/html" + "golang.org/x/net/html/atom" +) + +// This file implements the pseudo classes selectors, +// which share the implementation of PseudoElement() and Specificity() + +type abstractPseudoClass struct{} + +func (s abstractPseudoClass) Specificity() Specificity { + return Specificity{0, 1, 0} +} + +func (c abstractPseudoClass) PseudoElement() string { + return "" +} + +type relativePseudoClassSelector struct { + name string // one of "not", "has", "haschild" + match SelectorGroup +} + +func (s relativePseudoClassSelector) Match(n *html.Node) bool { + if n.Type != html.ElementNode { + return false + } + switch s.name { + case "not": + // matches elements that do not match a. + return !s.match.Match(n) + case "has": + // matches elements with any descendant that matches a. + return hasDescendantMatch(n, s.match) + case "haschild": + // matches elements with a child that matches a. + return hasChildMatch(n, s.match) + default: + panic(fmt.Sprintf("unsupported relative pseudo class selector : %s", s.name)) + } +} + +// hasChildMatch returns whether n has any child that matches a. +func hasChildMatch(n *html.Node, a Matcher) bool { + for c := n.FirstChild; c != nil; c = c.NextSibling { + if a.Match(c) { + return true + } + } + return false +} + +// hasDescendantMatch performs a depth-first search of n's descendants, +// testing whether any of them match a. It returns true as soon as a match is +// found, or false if no match is found. +func hasDescendantMatch(n *html.Node, a Matcher) bool { + for c := n.FirstChild; c != nil; c = c.NextSibling { + if a.Match(c) || (c.Type == html.ElementNode && hasDescendantMatch(c, a)) { + return true + } + } + return false +} + +// Specificity returns the specificity of the most specific selectors +// in the pseudo-class arguments. +// See https://www.w3.org/TR/selectors/#specificity-rules +func (s relativePseudoClassSelector) Specificity() Specificity { + var max Specificity + for _, sel := range s.match { + newSpe := sel.Specificity() + if max.Less(newSpe) { + max = newSpe + } + } + return max +} + +func (c relativePseudoClassSelector) PseudoElement() string { + return "" +} + +type containsPseudoClassSelector struct { + abstractPseudoClass + value string + own bool +} + +func (s containsPseudoClassSelector) Match(n *html.Node) bool { + var text string + if s.own { + // matches nodes that directly contain the given text + text = strings.ToLower(nodeOwnText(n)) + } else { + // matches nodes that contain the given text. + text = strings.ToLower(nodeText(n)) + } + return strings.Contains(text, s.value) +} + +type regexpPseudoClassSelector struct { + abstractPseudoClass + regexp *regexp.Regexp + own bool +} + +func (s regexpPseudoClassSelector) Match(n *html.Node) bool { + var text string + if s.own { + // matches nodes whose text directly matches the specified regular expression + text = nodeOwnText(n) + } else { + // matches nodes whose text matches the specified regular expression + text = nodeText(n) + } + return s.regexp.MatchString(text) +} + +// writeNodeText writes the text contained in n and its descendants to b. +func writeNodeText(n *html.Node, b *bytes.Buffer) { + switch n.Type { + case html.TextNode: + b.WriteString(n.Data) + case html.ElementNode: + for c := n.FirstChild; c != nil; c = c.NextSibling { + writeNodeText(c, b) + } + } +} + +// nodeText returns the text contained in n and its descendants. +func nodeText(n *html.Node) string { + var b bytes.Buffer + writeNodeText(n, &b) + return b.String() +} + +// nodeOwnText returns the contents of the text nodes that are direct +// children of n. +func nodeOwnText(n *html.Node) string { + var b bytes.Buffer + for c := n.FirstChild; c != nil; c = c.NextSibling { + if c.Type == html.TextNode { + b.WriteString(c.Data) + } + } + return b.String() +} + +type nthPseudoClassSelector struct { + abstractPseudoClass + a, b int + last, ofType bool +} + +func (s nthPseudoClassSelector) Match(n *html.Node) bool { + if s.a == 0 { + if s.last { + return simpleNthLastChildMatch(s.b, s.ofType, n) + } else { + return simpleNthChildMatch(s.b, s.ofType, n) + } + } + return nthChildMatch(s.a, s.b, s.last, s.ofType, n) +} + +// nthChildMatch implements :nth-child(an+b). +// If last is true, implements :nth-last-child instead. +// If ofType is true, implements :nth-of-type instead. +func nthChildMatch(a, b int, last, ofType bool, n *html.Node) bool { + if n.Type != html.ElementNode { + return false + } + + parent := n.Parent + if parent == nil { + return false + } + + i := -1 + count := 0 + for c := parent.FirstChild; c != nil; c = c.NextSibling { + if (c.Type != html.ElementNode) || (ofType && c.Data != n.Data) { + continue + } + count++ + if c == n { + i = count + if !last { + break + } + } + } + + if i == -1 { + // This shouldn't happen, since n should always be one of its parent's children. + return false + } + + if last { + i = count - i + 1 + } + + i -= b + if a == 0 { + return i == 0 + } + + return i%a == 0 && i/a >= 0 +} + +// simpleNthChildMatch implements :nth-child(b). +// If ofType is true, implements :nth-of-type instead. +func simpleNthChildMatch(b int, ofType bool, n *html.Node) bool { + if n.Type != html.ElementNode { + return false + } + + parent := n.Parent + if parent == nil { + return false + } + + count := 0 + for c := parent.FirstChild; c != nil; c = c.NextSibling { + if c.Type != html.ElementNode || (ofType && c.Data != n.Data) { + continue + } + count++ + if c == n { + return count == b + } + if count >= b { + return false + } + } + return false +} + +// simpleNthLastChildMatch implements :nth-last-child(b). +// If ofType is true, implements :nth-last-of-type instead. +func simpleNthLastChildMatch(b int, ofType bool, n *html.Node) bool { + if n.Type != html.ElementNode { + return false + } + + parent := n.Parent + if parent == nil { + return false + } + + count := 0 + for c := parent.LastChild; c != nil; c = c.PrevSibling { + if c.Type != html.ElementNode || (ofType && c.Data != n.Data) { + continue + } + count++ + if c == n { + return count == b + } + if count >= b { + return false + } + } + return false +} + +type onlyChildPseudoClassSelector struct { + abstractPseudoClass + ofType bool +} + +// Match implements :only-child. +// If `ofType` is true, it implements :only-of-type instead. +func (s onlyChildPseudoClassSelector) Match(n *html.Node) bool { + if n.Type != html.ElementNode { + return false + } + + parent := n.Parent + if parent == nil { + return false + } + + count := 0 + for c := parent.FirstChild; c != nil; c = c.NextSibling { + if (c.Type != html.ElementNode) || (s.ofType && c.Data != n.Data) { + continue + } + count++ + if count > 1 { + return false + } + } + + return count == 1 +} + +type inputPseudoClassSelector struct { + abstractPseudoClass +} + +// Matches input, select, textarea and button elements. +func (s inputPseudoClassSelector) Match(n *html.Node) bool { + return n.Type == html.ElementNode && (n.Data == "input" || n.Data == "select" || n.Data == "textarea" || n.Data == "button") +} + +type emptyElementPseudoClassSelector struct { + abstractPseudoClass +} + +// Matches empty elements. +func (s emptyElementPseudoClassSelector) Match(n *html.Node) bool { + if n.Type != html.ElementNode { + return false + } + + for c := n.FirstChild; c != nil; c = c.NextSibling { + switch c.Type { + case html.ElementNode: + return false + case html.TextNode: + if strings.TrimSpace(nodeText(c)) == "" { + continue + } else { + return false + } + } + } + + return true +} + +type rootPseudoClassSelector struct { + abstractPseudoClass +} + +// Match implements :root +func (s rootPseudoClassSelector) Match(n *html.Node) bool { + if n.Type != html.ElementNode { + return false + } + if n.Parent == nil { + return false + } + return n.Parent.Type == html.DocumentNode +} + +func hasAttr(n *html.Node, attr string) bool { + return matchAttribute(n, attr, func(string) bool { return true }) +} + +type linkPseudoClassSelector struct { + abstractPseudoClass +} + +// Match implements :link +func (s linkPseudoClassSelector) Match(n *html.Node) bool { + return (n.DataAtom == atom.A || n.DataAtom == atom.Area || n.DataAtom == atom.Link) && hasAttr(n, "href") +} + +type langPseudoClassSelector struct { + abstractPseudoClass + lang string +} + +func (s langPseudoClassSelector) Match(n *html.Node) bool { + own := matchAttribute(n, "lang", func(val string) bool { + return val == s.lang || strings.HasPrefix(val, s.lang+"-") + }) + if n.Parent == nil { + return own + } + return own || s.Match(n.Parent) +} + +type enabledPseudoClassSelector struct { + abstractPseudoClass +} + +func (s enabledPseudoClassSelector) Match(n *html.Node) bool { + if n.Type != html.ElementNode { + return false + } + switch n.DataAtom { + case atom.A, atom.Area, atom.Link: + return hasAttr(n, "href") + case atom.Optgroup, atom.Menuitem, atom.Fieldset: + return !hasAttr(n, "disabled") + case atom.Button, atom.Input, atom.Select, atom.Textarea, atom.Option: + return !hasAttr(n, "disabled") && !inDisabledFieldset(n) + } + return false +} + +type disabledPseudoClassSelector struct { + abstractPseudoClass +} + +func (s disabledPseudoClassSelector) Match(n *html.Node) bool { + if n.Type != html.ElementNode { + return false + } + switch n.DataAtom { + case atom.Optgroup, atom.Menuitem, atom.Fieldset: + return hasAttr(n, "disabled") + case atom.Button, atom.Input, atom.Select, atom.Textarea, atom.Option: + return hasAttr(n, "disabled") || inDisabledFieldset(n) + } + return false +} + +func hasLegendInPreviousSiblings(n *html.Node) bool { + for s := n.PrevSibling; s != nil; s = s.PrevSibling { + if s.DataAtom == atom.Legend { + return true + } + } + return false +} + +func inDisabledFieldset(n *html.Node) bool { + if n.Parent == nil { + return false + } + if n.Parent.DataAtom == atom.Fieldset && hasAttr(n.Parent, "disabled") && + (n.DataAtom != atom.Legend || hasLegendInPreviousSiblings(n)) { + return true + } + return inDisabledFieldset(n.Parent) +} + +type checkedPseudoClassSelector struct { + abstractPseudoClass +} + +func (s checkedPseudoClassSelector) Match(n *html.Node) bool { + if n.Type != html.ElementNode { + return false + } + switch n.DataAtom { + case atom.Input, atom.Menuitem: + return hasAttr(n, "checked") && matchAttribute(n, "type", func(val string) bool { + t := toLowerASCII(val) + return t == "checkbox" || t == "radio" + }) + case atom.Option: + return hasAttr(n, "selected") + } + return false +} diff --git a/vendor/github.com/andybalholm/cascadia/selector.go b/vendor/github.com/andybalholm/cascadia/selector.go new file mode 100644 index 00000000..87549be2 --- /dev/null +++ b/vendor/github.com/andybalholm/cascadia/selector.go @@ -0,0 +1,586 @@ +package cascadia + +import ( + "fmt" + "regexp" + "strings" + + "golang.org/x/net/html" +) + +// Matcher is the interface for basic selector functionality. +// Match returns whether a selector matches n. +type Matcher interface { + Match(n *html.Node) bool +} + +// Sel is the interface for all the functionality provided by selectors. +type Sel interface { + Matcher + Specificity() Specificity + + // Returns a CSS input compiling to this selector. + String() string + + // Returns a pseudo-element, or an empty string. + PseudoElement() string +} + +// Parse parses a selector. Use `ParseWithPseudoElement` +// if you need support for pseudo-elements. +func Parse(sel string) (Sel, error) { + p := &parser{s: sel} + compiled, err := p.parseSelector() + if err != nil { + return nil, err + } + + if p.i < len(sel) { + return nil, fmt.Errorf("parsing %q: %d bytes left over", sel, len(sel)-p.i) + } + + return compiled, nil +} + +// ParseWithPseudoElement parses a single selector, +// with support for pseudo-element. +func ParseWithPseudoElement(sel string) (Sel, error) { + p := &parser{s: sel, acceptPseudoElements: true} + compiled, err := p.parseSelector() + if err != nil { + return nil, err + } + + if p.i < len(sel) { + return nil, fmt.Errorf("parsing %q: %d bytes left over", sel, len(sel)-p.i) + } + + return compiled, nil +} + +// ParseGroup parses a selector, or a group of selectors separated by commas. +// Use `ParseGroupWithPseudoElements` +// if you need support for pseudo-elements. +func ParseGroup(sel string) (SelectorGroup, error) { + p := &parser{s: sel} + compiled, err := p.parseSelectorGroup() + if err != nil { + return nil, err + } + + if p.i < len(sel) { + return nil, fmt.Errorf("parsing %q: %d bytes left over", sel, len(sel)-p.i) + } + + return compiled, nil +} + +// ParseGroupWithPseudoElements parses a selector, or a group of selectors separated by commas. +// It supports pseudo-elements. +func ParseGroupWithPseudoElements(sel string) (SelectorGroup, error) { + p := &parser{s: sel, acceptPseudoElements: true} + compiled, err := p.parseSelectorGroup() + if err != nil { + return nil, err + } + + if p.i < len(sel) { + return nil, fmt.Errorf("parsing %q: %d bytes left over", sel, len(sel)-p.i) + } + + return compiled, nil +} + +// A Selector is a function which tells whether a node matches or not. +// +// This type is maintained for compatibility; I recommend using the newer and +// more idiomatic interfaces Sel and Matcher. +type Selector func(*html.Node) bool + +// Compile parses a selector and returns, if successful, a Selector object +// that can be used to match against html.Node objects. +func Compile(sel string) (Selector, error) { + compiled, err := ParseGroup(sel) + if err != nil { + return nil, err + } + + return Selector(compiled.Match), nil +} + +// MustCompile is like Compile, but panics instead of returning an error. +func MustCompile(sel string) Selector { + compiled, err := Compile(sel) + if err != nil { + panic(err) + } + return compiled +} + +// MatchAll returns a slice of the nodes that match the selector, +// from n and its children. +func (s Selector) MatchAll(n *html.Node) []*html.Node { + return s.matchAllInto(n, nil) +} + +func (s Selector) matchAllInto(n *html.Node, storage []*html.Node) []*html.Node { + if s(n) { + storage = append(storage, n) + } + + for child := n.FirstChild; child != nil; child = child.NextSibling { + storage = s.matchAllInto(child, storage) + } + + return storage +} + +func queryInto(n *html.Node, m Matcher, storage []*html.Node) []*html.Node { + for child := n.FirstChild; child != nil; child = child.NextSibling { + if m.Match(child) { + storage = append(storage, child) + } + storage = queryInto(child, m, storage) + } + + return storage +} + +// QueryAll returns a slice of all the nodes that match m, from the descendants +// of n. +func QueryAll(n *html.Node, m Matcher) []*html.Node { + return queryInto(n, m, nil) +} + +// Match returns true if the node matches the selector. +func (s Selector) Match(n *html.Node) bool { + return s(n) +} + +// MatchFirst returns the first node that matches s, from n and its children. +func (s Selector) MatchFirst(n *html.Node) *html.Node { + if s.Match(n) { + return n + } + + for c := n.FirstChild; c != nil; c = c.NextSibling { + m := s.MatchFirst(c) + if m != nil { + return m + } + } + return nil +} + +// Query returns the first node that matches m, from the descendants of n. +// If none matches, it returns nil. +func Query(n *html.Node, m Matcher) *html.Node { + for c := n.FirstChild; c != nil; c = c.NextSibling { + if m.Match(c) { + return c + } + if matched := Query(c, m); matched != nil { + return matched + } + } + + return nil +} + +// Filter returns the nodes in nodes that match the selector. +func (s Selector) Filter(nodes []*html.Node) (result []*html.Node) { + for _, n := range nodes { + if s(n) { + result = append(result, n) + } + } + return result +} + +// Filter returns the nodes that match m. +func Filter(nodes []*html.Node, m Matcher) (result []*html.Node) { + for _, n := range nodes { + if m.Match(n) { + result = append(result, n) + } + } + return result +} + +type tagSelector struct { + tag string +} + +// Matches elements with a given tag name. +func (t tagSelector) Match(n *html.Node) bool { + return n.Type == html.ElementNode && n.Data == t.tag +} + +func (c tagSelector) Specificity() Specificity { + return Specificity{0, 0, 1} +} + +func (c tagSelector) PseudoElement() string { + return "" +} + +type classSelector struct { + class string +} + +// Matches elements by class attribute. +func (t classSelector) Match(n *html.Node) bool { + return matchAttribute(n, "class", func(s string) bool { + return matchInclude(t.class, s, false) + }) +} + +func (c classSelector) Specificity() Specificity { + return Specificity{0, 1, 0} +} + +func (c classSelector) PseudoElement() string { + return "" +} + +type idSelector struct { + id string +} + +// Matches elements by id attribute. +func (t idSelector) Match(n *html.Node) bool { + return matchAttribute(n, "id", func(s string) bool { + return s == t.id + }) +} + +func (c idSelector) Specificity() Specificity { + return Specificity{1, 0, 0} +} + +func (c idSelector) PseudoElement() string { + return "" +} + +type attrSelector struct { + key, val, operation string + regexp *regexp.Regexp + insensitive bool +} + +// Matches elements by attribute value. +func (t attrSelector) Match(n *html.Node) bool { + switch t.operation { + case "": + return matchAttribute(n, t.key, func(string) bool { return true }) + case "=": + return matchAttribute(n, t.key, func(s string) bool { return matchInsensitiveValue(s, t.val, t.insensitive) }) + case "!=": + return attributeNotEqualMatch(t.key, t.val, n, t.insensitive) + case "~=": + // matches elements where the attribute named key is a whitespace-separated list that includes val. + return matchAttribute(n, t.key, func(s string) bool { return matchInclude(t.val, s, t.insensitive) }) + case "|=": + return attributeDashMatch(t.key, t.val, n, t.insensitive) + case "^=": + return attributePrefixMatch(t.key, t.val, n, t.insensitive) + case "$=": + return attributeSuffixMatch(t.key, t.val, n, t.insensitive) + case "*=": + return attributeSubstringMatch(t.key, t.val, n, t.insensitive) + case "#=": + return attributeRegexMatch(t.key, t.regexp, n) + default: + panic(fmt.Sprintf("unsuported operation : %s", t.operation)) + } +} + +// matches elements where we ignore (or not) the case of the attribute value +// the user attribute is the value set by the user to match elements +// the real attribute is the attribute value found in the code parsed +func matchInsensitiveValue(userAttr string, realAttr string, ignoreCase bool) bool { + if ignoreCase { + return strings.EqualFold(userAttr, realAttr) + } + return userAttr == realAttr + +} + +// matches elements where the attribute named key satisifes the function f. +func matchAttribute(n *html.Node, key string, f func(string) bool) bool { + if n.Type != html.ElementNode { + return false + } + for _, a := range n.Attr { + if a.Key == key && f(a.Val) { + return true + } + } + return false +} + +// attributeNotEqualMatch matches elements where +// the attribute named key does not have the value val. +func attributeNotEqualMatch(key, val string, n *html.Node, ignoreCase bool) bool { + if n.Type != html.ElementNode { + return false + } + for _, a := range n.Attr { + if a.Key == key && matchInsensitiveValue(a.Val, val, ignoreCase) { + return false + } + } + return true +} + +// returns true if s is a whitespace-separated list that includes val. +func matchInclude(val string, s string, ignoreCase bool) bool { + for s != "" { + i := strings.IndexAny(s, " \t\r\n\f") + if i == -1 { + return matchInsensitiveValue(s, val, ignoreCase) + } + if matchInsensitiveValue(s[:i], val, ignoreCase) { + return true + } + s = s[i+1:] + } + return false +} + +// matches elements where the attribute named key equals val or starts with val plus a hyphen. +func attributeDashMatch(key, val string, n *html.Node, ignoreCase bool) bool { + return matchAttribute(n, key, + func(s string) bool { + if matchInsensitiveValue(s, val, ignoreCase) { + return true + } + if len(s) <= len(val) { + return false + } + if matchInsensitiveValue(s[:len(val)], val, ignoreCase) && s[len(val)] == '-' { + return true + } + return false + }) +} + +// attributePrefixMatch returns a Selector that matches elements where +// the attribute named key starts with val. +func attributePrefixMatch(key, val string, n *html.Node, ignoreCase bool) bool { + return matchAttribute(n, key, + func(s string) bool { + if strings.TrimSpace(s) == "" { + return false + } + if ignoreCase { + return strings.HasPrefix(strings.ToLower(s), strings.ToLower(val)) + } + return strings.HasPrefix(s, val) + }) +} + +// attributeSuffixMatch matches elements where +// the attribute named key ends with val. +func attributeSuffixMatch(key, val string, n *html.Node, ignoreCase bool) bool { + return matchAttribute(n, key, + func(s string) bool { + if strings.TrimSpace(s) == "" { + return false + } + if ignoreCase { + return strings.HasSuffix(strings.ToLower(s), strings.ToLower(val)) + } + return strings.HasSuffix(s, val) + }) +} + +// attributeSubstringMatch matches nodes where +// the attribute named key contains val. +func attributeSubstringMatch(key, val string, n *html.Node, ignoreCase bool) bool { + return matchAttribute(n, key, + func(s string) bool { + if strings.TrimSpace(s) == "" { + return false + } + if ignoreCase { + return strings.Contains(strings.ToLower(s), strings.ToLower(val)) + } + return strings.Contains(s, val) + }) +} + +// attributeRegexMatch matches nodes where +// the attribute named key matches the regular expression rx +func attributeRegexMatch(key string, rx *regexp.Regexp, n *html.Node) bool { + return matchAttribute(n, key, + func(s string) bool { + return rx.MatchString(s) + }) +} + +func (c attrSelector) Specificity() Specificity { + return Specificity{0, 1, 0} +} + +func (c attrSelector) PseudoElement() string { + return "" +} + +// see pseudo_classes.go for pseudo classes selectors + +// on a static context, some selectors can't match anything +type neverMatchSelector struct { + value string +} + +func (s neverMatchSelector) Match(n *html.Node) bool { + return false +} + +func (s neverMatchSelector) Specificity() Specificity { + return Specificity{0, 0, 0} +} + +func (c neverMatchSelector) PseudoElement() string { + return "" +} + +type compoundSelector struct { + selectors []Sel + pseudoElement string +} + +// Matches elements if each sub-selectors matches. +func (t compoundSelector) Match(n *html.Node) bool { + if len(t.selectors) == 0 { + return n.Type == html.ElementNode + } + + for _, sel := range t.selectors { + if !sel.Match(n) { + return false + } + } + return true +} + +func (s compoundSelector) Specificity() Specificity { + var out Specificity + for _, sel := range s.selectors { + out = out.Add(sel.Specificity()) + } + if s.pseudoElement != "" { + // https://drafts.csswg.org/selectors-3/#specificity + out = out.Add(Specificity{0, 0, 1}) + } + return out +} + +func (c compoundSelector) PseudoElement() string { + return c.pseudoElement +} + +type combinedSelector struct { + first Sel + combinator byte + second Sel +} + +func (t combinedSelector) Match(n *html.Node) bool { + if t.first == nil { + return false // maybe we should panic + } + switch t.combinator { + case 0: + return t.first.Match(n) + case ' ': + return descendantMatch(t.first, t.second, n) + case '>': + return childMatch(t.first, t.second, n) + case '+': + return siblingMatch(t.first, t.second, true, n) + case '~': + return siblingMatch(t.first, t.second, false, n) + default: + panic("unknown combinator") + } +} + +// matches an element if it matches d and has an ancestor that matches a. +func descendantMatch(a, d Matcher, n *html.Node) bool { + if !d.Match(n) { + return false + } + + for p := n.Parent; p != nil; p = p.Parent { + if a.Match(p) { + return true + } + } + + return false +} + +// matches an element if it matches d and its parent matches a. +func childMatch(a, d Matcher, n *html.Node) bool { + return d.Match(n) && n.Parent != nil && a.Match(n.Parent) +} + +// matches an element if it matches s2 and is preceded by an element that matches s1. +// If adjacent is true, the sibling must be immediately before the element. +func siblingMatch(s1, s2 Matcher, adjacent bool, n *html.Node) bool { + if !s2.Match(n) { + return false + } + + if adjacent { + for n = n.PrevSibling; n != nil; n = n.PrevSibling { + if n.Type == html.TextNode || n.Type == html.CommentNode { + continue + } + return s1.Match(n) + } + return false + } + + // Walk backwards looking for element that matches s1 + for c := n.PrevSibling; c != nil; c = c.PrevSibling { + if s1.Match(c) { + return true + } + } + + return false +} + +func (s combinedSelector) Specificity() Specificity { + spec := s.first.Specificity() + if s.second != nil { + spec = spec.Add(s.second.Specificity()) + } + return spec +} + +// on combinedSelector, a pseudo-element only makes sens on the last +// selector, although others increase specificity. +func (c combinedSelector) PseudoElement() string { + if c.second == nil { + return "" + } + return c.second.PseudoElement() +} + +// A SelectorGroup is a list of selectors, which matches if any of the +// individual selectors matches. +type SelectorGroup []Sel + +// Match returns true if the node matches one of the single selectors. +func (s SelectorGroup) Match(n *html.Node) bool { + for _, sel := range s { + if sel.Match(n) { + return true + } + } + return false +} diff --git a/vendor/github.com/andybalholm/cascadia/serialize.go b/vendor/github.com/andybalholm/cascadia/serialize.go new file mode 100644 index 00000000..61acf04e --- /dev/null +++ b/vendor/github.com/andybalholm/cascadia/serialize.go @@ -0,0 +1,176 @@ +package cascadia + +import ( + "fmt" + "strconv" + "strings" +) + +// implements the reverse operation Sel -> string + +var specialCharReplacer *strings.Replacer + +func init() { + var pairs []string + for _, s := range ",!\"#$%&'()*+ -./:;<=>?@[\\]^`{|}~" { + pairs = append(pairs, string(s), "\\"+string(s)) + } + specialCharReplacer = strings.NewReplacer(pairs...) +} + +// espace special CSS char +func escape(s string) string { return specialCharReplacer.Replace(s) } + +func (c tagSelector) String() string { + return c.tag +} + +func (c idSelector) String() string { + return "#" + escape(c.id) +} + +func (c classSelector) String() string { + return "." + escape(c.class) +} + +func (c attrSelector) String() string { + val := c.val + if c.operation == "#=" { + val = c.regexp.String() + } else if c.operation != "" { + val = fmt.Sprintf(`"%s"`, val) + } + + ignoreCase := "" + + if c.insensitive { + ignoreCase = " i" + } + + return fmt.Sprintf(`[%s%s%s%s]`, c.key, c.operation, val, ignoreCase) +} + +func (c relativePseudoClassSelector) String() string { + return fmt.Sprintf(":%s(%s)", c.name, c.match.String()) +} + +func (c containsPseudoClassSelector) String() string { + s := "contains" + if c.own { + s += "Own" + } + return fmt.Sprintf(`:%s("%s")`, s, c.value) +} + +func (c regexpPseudoClassSelector) String() string { + s := "matches" + if c.own { + s += "Own" + } + return fmt.Sprintf(":%s(%s)", s, c.regexp.String()) +} + +func (c nthPseudoClassSelector) String() string { + if c.a == 0 && c.b == 1 { // special cases + s := ":first-" + if c.last { + s = ":last-" + } + if c.ofType { + s += "of-type" + } else { + s += "child" + } + return s + } + var name string + switch [2]bool{c.last, c.ofType} { + case [2]bool{true, true}: + name = "nth-last-of-type" + case [2]bool{true, false}: + name = "nth-last-child" + case [2]bool{false, true}: + name = "nth-of-type" + case [2]bool{false, false}: + name = "nth-child" + } + s := fmt.Sprintf("+%d", c.b) + if c.b < 0 { // avoid +-8 invalid syntax + s = strconv.Itoa(c.b) + } + return fmt.Sprintf(":%s(%dn%s)", name, c.a, s) +} + +func (c onlyChildPseudoClassSelector) String() string { + if c.ofType { + return ":only-of-type" + } + return ":only-child" +} + +func (c inputPseudoClassSelector) String() string { + return ":input" +} + +func (c emptyElementPseudoClassSelector) String() string { + return ":empty" +} + +func (c rootPseudoClassSelector) String() string { + return ":root" +} + +func (c linkPseudoClassSelector) String() string { + return ":link" +} + +func (c langPseudoClassSelector) String() string { + return fmt.Sprintf(":lang(%s)", c.lang) +} + +func (c neverMatchSelector) String() string { + return c.value +} + +func (c enabledPseudoClassSelector) String() string { + return ":enabled" +} + +func (c disabledPseudoClassSelector) String() string { + return ":disabled" +} + +func (c checkedPseudoClassSelector) String() string { + return ":checked" +} + +func (c compoundSelector) String() string { + if len(c.selectors) == 0 && c.pseudoElement == "" { + return "*" + } + chunks := make([]string, len(c.selectors)) + for i, sel := range c.selectors { + chunks[i] = sel.String() + } + s := strings.Join(chunks, "") + if c.pseudoElement != "" { + s += "::" + c.pseudoElement + } + return s +} + +func (c combinedSelector) String() string { + start := c.first.String() + if c.second != nil { + start += fmt.Sprintf(" %s %s", string(c.combinator), c.second.String()) + } + return start +} + +func (c SelectorGroup) String() string { + ck := make([]string, len(c)) + for i, s := range c { + ck[i] = s.String() + } + return strings.Join(ck, ", ") +} diff --git a/vendor/github.com/andybalholm/cascadia/specificity.go b/vendor/github.com/andybalholm/cascadia/specificity.go new file mode 100644 index 00000000..8db864f9 --- /dev/null +++ b/vendor/github.com/andybalholm/cascadia/specificity.go @@ -0,0 +1,26 @@ +package cascadia + +// Specificity is the CSS specificity as defined in +// https://www.w3.org/TR/selectors/#specificity-rules +// with the convention Specificity = [A,B,C]. +type Specificity [3]int + +// returns `true` if s < other (strictly), false otherwise +func (s Specificity) Less(other Specificity) bool { + for i := range s { + if s[i] < other[i] { + return true + } + if s[i] > other[i] { + return false + } + } + return false +} + +func (s Specificity) Add(other Specificity) Specificity { + for i, sp := range other { + s[i] += sp + } + return s +} diff --git a/vendor/github.com/aymerick/douceur/LICENSE b/vendor/github.com/aymerick/douceur/LICENSE new file mode 100644 index 00000000..6ce87cd3 --- /dev/null +++ b/vendor/github.com/aymerick/douceur/LICENSE @@ -0,0 +1,22 @@ +The MIT License (MIT) + +Copyright (c) 2015 Aymerick JEHANNE + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + diff --git a/vendor/github.com/aymerick/douceur/css/declaration.go b/vendor/github.com/aymerick/douceur/css/declaration.go new file mode 100644 index 00000000..61d29d33 --- /dev/null +++ b/vendor/github.com/aymerick/douceur/css/declaration.go @@ -0,0 +1,60 @@ +package css + +import "fmt" + +// Declaration represents a parsed style property +type Declaration struct { + Property string + Value string + Important bool +} + +// NewDeclaration instanciates a new Declaration +func NewDeclaration() *Declaration { + return &Declaration{} +} + +// Returns string representation of the Declaration +func (decl *Declaration) String() string { + return decl.StringWithImportant(true) +} + +// StringWithImportant returns string representation with optional !important part +func (decl *Declaration) StringWithImportant(option bool) string { + result := fmt.Sprintf("%s: %s", decl.Property, decl.Value) + + if option && decl.Important { + result += " !important" + } + + result += ";" + + return result +} + +// Equal returns true if both Declarations are equals +func (decl *Declaration) Equal(other *Declaration) bool { + return (decl.Property == other.Property) && (decl.Value == other.Value) && (decl.Important == other.Important) +} + +// +// DeclarationsByProperty +// + +// DeclarationsByProperty represents sortable style declarations +type DeclarationsByProperty []*Declaration + +// Implements sort.Interface +func (declarations DeclarationsByProperty) Len() int { + return len(declarations) +} + +// Implements sort.Interface +func (declarations DeclarationsByProperty) Swap(i, j int) { + declarations[i], declarations[j] = declarations[j], declarations[i] +} + +// Implements sort.Interface +func (declarations DeclarationsByProperty) Less(i, j int) bool { + return declarations[i].Property < declarations[j].Property +} diff --git a/vendor/github.com/aymerick/douceur/css/rule.go b/vendor/github.com/aymerick/douceur/css/rule.go new file mode 100644 index 00000000..b5a44b54 --- /dev/null +++ b/vendor/github.com/aymerick/douceur/css/rule.go @@ -0,0 +1,230 @@ +package css + +import ( + "fmt" + "strings" +) + +const ( + indentSpace = 2 +) + +// RuleKind represents a Rule kind +type RuleKind int + +// Rule kinds +const ( + QualifiedRule RuleKind = iota + AtRule +) + +// At Rules than have Rules inside their block instead of Declarations +var atRulesWithRulesBlock = []string{ + "@document", "@font-feature-values", "@keyframes", "@media", "@supports", +} + +// Rule represents a parsed CSS rule +type Rule struct { + Kind RuleKind + + // At Rule name (eg: "@media") + Name string + + // Raw prelude + Prelude string + + // Qualified Rule selectors parsed from prelude + Selectors []string + + // Style properties + Declarations []*Declaration + + // At Rule embedded rules + Rules []*Rule + + // Current rule embedding level + EmbedLevel int +} + +// NewRule instanciates a new Rule +func NewRule(kind RuleKind) *Rule { + return &Rule{ + Kind: kind, + } +} + +// Returns string representation of rule kind +func (kind RuleKind) String() string { + switch kind { + case QualifiedRule: + return "Qualified Rule" + case AtRule: + return "At Rule" + default: + return "WAT" + } +} + +// EmbedsRules returns true if this rule embeds another rules +func (rule *Rule) EmbedsRules() bool { + if rule.Kind == AtRule { + for _, atRuleName := range atRulesWithRulesBlock { + if rule.Name == atRuleName { + return true + } + } + } + + return false +} + +// Equal returns true if both rules are equals +func (rule *Rule) Equal(other *Rule) bool { + if (rule.Kind != other.Kind) || + (rule.Prelude != other.Prelude) || + (rule.Name != other.Name) { + return false + } + + if (len(rule.Selectors) != len(other.Selectors)) || + (len(rule.Declarations) != len(other.Declarations)) || + (len(rule.Rules) != len(other.Rules)) { + return false + } + + for i, sel := range rule.Selectors { + if sel != other.Selectors[i] { + return false + } + } + + for i, decl := range rule.Declarations { + if !decl.Equal(other.Declarations[i]) { + return false + } + } + + for i, rule := range rule.Rules { + if !rule.Equal(other.Rules[i]) { + return false + } + } + + return true +} + +// Diff returns a string representation of rules differences +func (rule *Rule) Diff(other *Rule) []string { + result := []string{} + + if rule.Kind != other.Kind { + result = append(result, fmt.Sprintf("Kind: %s | %s", rule.Kind.String(), other.Kind.String())) + } + + if rule.Prelude != other.Prelude { + result = append(result, fmt.Sprintf("Prelude: \"%s\" | \"%s\"", rule.Prelude, other.Prelude)) + } + + if rule.Name != other.Name { + result = append(result, fmt.Sprintf("Name: \"%s\" | \"%s\"", rule.Name, other.Name)) + } + + if len(rule.Selectors) != len(other.Selectors) { + result = append(result, fmt.Sprintf("Selectors: %v | %v", strings.Join(rule.Selectors, ", "), strings.Join(other.Selectors, ", "))) + } else { + for i, sel := range rule.Selectors { + if sel != other.Selectors[i] { + result = append(result, fmt.Sprintf("Selector: \"%s\" | \"%s\"", sel, other.Selectors[i])) + } + } + } + + if len(rule.Declarations) != len(other.Declarations) { + result = append(result, fmt.Sprintf("Declarations Nb: %d | %d", len(rule.Declarations), len(other.Declarations))) + } else { + for i, decl := range rule.Declarations { + if !decl.Equal(other.Declarations[i]) { + result = append(result, fmt.Sprintf("Declaration: \"%s\" | \"%s\"", decl.String(), other.Declarations[i].String())) + } + } + } + + if len(rule.Rules) != len(other.Rules) { + result = append(result, fmt.Sprintf("Rules Nb: %d | %d", len(rule.Rules), len(other.Rules))) + } else { + + for i, rule := range rule.Rules { + if !rule.Equal(other.Rules[i]) { + result = append(result, fmt.Sprintf("Rule: \"%s\" | \"%s\"", rule.String(), other.Rules[i].String())) + } + } + } + + return result +} + +// Returns the string representation of a rule +func (rule *Rule) String() string { + result := "" + + if rule.Kind == QualifiedRule { + for i, sel := range rule.Selectors { + if i != 0 { + result += ", " + } + result += sel + } + } else { + // AtRule + result += fmt.Sprintf("%s", rule.Name) + + if rule.Prelude != "" { + if result != "" { + result += " " + } + result += fmt.Sprintf("%s", rule.Prelude) + } + } + + if (len(rule.Declarations) == 0) && (len(rule.Rules) == 0) { + result += ";" + } else { + result += " {\n" + + if rule.EmbedsRules() { + for _, subRule := range rule.Rules { + result += fmt.Sprintf("%s%s\n", rule.indent(), subRule.String()) + } + } else { + for _, decl := range rule.Declarations { + result += fmt.Sprintf("%s%s\n", rule.indent(), decl.String()) + } + } + + result += fmt.Sprintf("%s}", rule.indentEndBlock()) + } + + return result +} + +// Returns identation spaces for declarations and rules +func (rule *Rule) indent() string { + result := "" + + for i := 0; i < ((rule.EmbedLevel + 1) * indentSpace); i++ { + result += " " + } + + return result +} + +// Returns identation spaces for end of block character +func (rule *Rule) indentEndBlock() string { + result := "" + + for i := 0; i < (rule.EmbedLevel * indentSpace); i++ { + result += " " + } + + return result +} diff --git a/vendor/github.com/aymerick/douceur/css/stylesheet.go b/vendor/github.com/aymerick/douceur/css/stylesheet.go new file mode 100644 index 00000000..6b32c2ec --- /dev/null +++ b/vendor/github.com/aymerick/douceur/css/stylesheet.go @@ -0,0 +1,25 @@ +package css + +// Stylesheet represents a parsed stylesheet +type Stylesheet struct { + Rules []*Rule +} + +// NewStylesheet instanciate a new Stylesheet +func NewStylesheet() *Stylesheet { + return &Stylesheet{} +} + +// Returns string representation of the Stylesheet +func (sheet *Stylesheet) String() string { + result := "" + + for _, rule := range sheet.Rules { + if result != "" { + result += "\n" + } + result += rule.String() + } + + return result +} diff --git a/vendor/github.com/aymerick/douceur/parser/parser.go b/vendor/github.com/aymerick/douceur/parser/parser.go new file mode 100644 index 00000000..6c4917cc --- /dev/null +++ b/vendor/github.com/aymerick/douceur/parser/parser.go @@ -0,0 +1,409 @@ +package parser + +import ( + "errors" + "fmt" + "regexp" + "strings" + + "github.com/gorilla/css/scanner" + + "github.com/aymerick/douceur/css" +) + +const ( + importantSuffixRegexp = `(?i)\s*!important\s*$` +) + +var ( + importantRegexp *regexp.Regexp +) + +// Parser represents a CSS parser +type Parser struct { + scan *scanner.Scanner // Tokenizer + + // Tokens parsed but not consumed yet + tokens []*scanner.Token + + // Rule embedding level + embedLevel int +} + +func init() { + importantRegexp = regexp.MustCompile(importantSuffixRegexp) +} + +// NewParser instanciates a new parser +func NewParser(txt string) *Parser { + return &Parser{ + scan: scanner.New(txt), + } +} + +// Parse parses a whole stylesheet +func Parse(text string) (*css.Stylesheet, error) { + result, err := NewParser(text).ParseStylesheet() + if err != nil { + return nil, err + } + + return result, nil +} + +// ParseDeclarations parses CSS declarations +func ParseDeclarations(text string) ([]*css.Declaration, error) { + result, err := NewParser(text).ParseDeclarations() + if err != nil { + return nil, err + } + + return result, nil +} + +// ParseStylesheet parses a stylesheet +func (parser *Parser) ParseStylesheet() (*css.Stylesheet, error) { + result := css.NewStylesheet() + + // Parse BOM + if _, err := parser.parseBOM(); err != nil { + return result, err + } + + // Parse list of rules + rules, err := parser.ParseRules() + if err != nil { + return result, err + } + + result.Rules = rules + + return result, nil +} + +// ParseRules parses a list of rules +func (parser *Parser) ParseRules() ([]*css.Rule, error) { + result := []*css.Rule{} + + inBlock := false + if parser.tokenChar("{") { + // parsing a block of rules + inBlock = true + parser.embedLevel++ + + parser.shiftToken() + } + + for parser.tokenParsable() { + if parser.tokenIgnorable() { + parser.shiftToken() + } else if parser.tokenChar("}") { + if !inBlock { + errMsg := fmt.Sprintf("Unexpected } character: %s", parser.nextToken().String()) + return result, errors.New(errMsg) + } + + parser.shiftToken() + parser.embedLevel-- + + // finished + break + } else { + rule, err := parser.ParseRule() + if err != nil { + return result, err + } + + rule.EmbedLevel = parser.embedLevel + result = append(result, rule) + } + } + + return result, parser.err() +} + +// ParseRule parses a rule +func (parser *Parser) ParseRule() (*css.Rule, error) { + if parser.tokenAtKeyword() { + return parser.parseAtRule() + } + + return parser.parseQualifiedRule() +} + +// ParseDeclarations parses a list of declarations +func (parser *Parser) ParseDeclarations() ([]*css.Declaration, error) { + result := []*css.Declaration{} + + if parser.tokenChar("{") { + parser.shiftToken() + } + + for parser.tokenParsable() { + if parser.tokenIgnorable() { + parser.shiftToken() + } else if parser.tokenChar("}") { + // end of block + parser.shiftToken() + break + } else { + declaration, err := parser.ParseDeclaration() + if err != nil { + return result, err + } + + result = append(result, declaration) + } + } + + return result, parser.err() +} + +// ParseDeclaration parses a declaration +func (parser *Parser) ParseDeclaration() (*css.Declaration, error) { + result := css.NewDeclaration() + curValue := "" + + for parser.tokenParsable() { + if parser.tokenChar(":") { + result.Property = strings.TrimSpace(curValue) + curValue = "" + + parser.shiftToken() + } else if parser.tokenChar(";") || parser.tokenChar("}") { + if result.Property == "" { + errMsg := fmt.Sprintf("Unexpected ; character: %s", parser.nextToken().String()) + return result, errors.New(errMsg) + } + + if importantRegexp.MatchString(curValue) { + result.Important = true + curValue = importantRegexp.ReplaceAllString(curValue, "") + } + + result.Value = strings.TrimSpace(curValue) + + if parser.tokenChar(";") { + parser.shiftToken() + } + + // finished + break + } else { + token := parser.shiftToken() + curValue += token.Value + } + } + + // log.Printf("[parsed] Declaration: %s", result.String()) + + return result, parser.err() +} + +// Parse an At Rule +func (parser *Parser) parseAtRule() (*css.Rule, error) { + // parse rule name (eg: "@import") + token := parser.shiftToken() + + result := css.NewRule(css.AtRule) + result.Name = token.Value + + for parser.tokenParsable() { + if parser.tokenChar(";") { + parser.shiftToken() + + // finished + break + } else if parser.tokenChar("{") { + if result.EmbedsRules() { + // parse rules block + rules, err := parser.ParseRules() + if err != nil { + return result, err + } + + result.Rules = rules + } else { + // parse declarations block + declarations, err := parser.ParseDeclarations() + if err != nil { + return result, err + } + + result.Declarations = declarations + } + + // finished + break + } else { + // parse prelude + prelude, err := parser.parsePrelude() + if err != nil { + return result, err + } + + result.Prelude = prelude + } + } + + // log.Printf("[parsed] Rule: %s", result.String()) + + return result, parser.err() +} + +// Parse a Qualified Rule +func (parser *Parser) parseQualifiedRule() (*css.Rule, error) { + result := css.NewRule(css.QualifiedRule) + + for parser.tokenParsable() { + if parser.tokenChar("{") { + if result.Prelude == "" { + errMsg := fmt.Sprintf("Unexpected { character: %s", parser.nextToken().String()) + return result, errors.New(errMsg) + } + + // parse declarations block + declarations, err := parser.ParseDeclarations() + if err != nil { + return result, err + } + + result.Declarations = declarations + + // finished + break + } else { + // parse prelude + prelude, err := parser.parsePrelude() + if err != nil { + return result, err + } + + result.Prelude = prelude + } + } + + result.Selectors = strings.Split(result.Prelude, ",") + for i, sel := range result.Selectors { + result.Selectors[i] = strings.TrimSpace(sel) + } + + // log.Printf("[parsed] Rule: %s", result.String()) + + return result, parser.err() +} + +// Parse Rule prelude +func (parser *Parser) parsePrelude() (string, error) { + result := "" + + for parser.tokenParsable() && !parser.tokenEndOfPrelude() { + token := parser.shiftToken() + result += token.Value + } + + result = strings.TrimSpace(result) + + // log.Printf("[parsed] prelude: %s", result) + + return result, parser.err() +} + +// Parse BOM +func (parser *Parser) parseBOM() (bool, error) { + if parser.nextToken().Type == scanner.TokenBOM { + parser.shiftToken() + return true, nil + } + + return false, parser.err() +} + +// Returns next token without removing it from tokens buffer +func (parser *Parser) nextToken() *scanner.Token { + if len(parser.tokens) == 0 { + // fetch next token + nextToken := parser.scan.Next() + + // log.Printf("[token] %s => %v", nextToken.Type.String(), nextToken.Value) + + // queue it + parser.tokens = append(parser.tokens, nextToken) + } + + return parser.tokens[0] +} + +// Returns next token and remove it from the tokens buffer +func (parser *Parser) shiftToken() *scanner.Token { + var result *scanner.Token + + result, parser.tokens = parser.tokens[0], parser.tokens[1:] + return result +} + +// Returns tokenizer error, or nil if no error +func (parser *Parser) err() error { + if parser.tokenError() { + token := parser.nextToken() + return fmt.Errorf("Tokenizer error: %s", token.String()) + } + + return nil +} + +// Returns true if next token is Error +func (parser *Parser) tokenError() bool { + return parser.nextToken().Type == scanner.TokenError +} + +// Returns true if next token is EOF +func (parser *Parser) tokenEOF() bool { + return parser.nextToken().Type == scanner.TokenEOF +} + +// Returns true if next token is a whitespace +func (parser *Parser) tokenWS() bool { + return parser.nextToken().Type == scanner.TokenS +} + +// Returns true if next token is a comment +func (parser *Parser) tokenComment() bool { + return parser.nextToken().Type == scanner.TokenComment +} + +// Returns true if next token is a CDO or a CDC +func (parser *Parser) tokenCDOorCDC() bool { + switch parser.nextToken().Type { + case scanner.TokenCDO, scanner.TokenCDC: + return true + default: + return false + } +} + +// Returns true if next token is ignorable +func (parser *Parser) tokenIgnorable() bool { + return parser.tokenWS() || parser.tokenComment() || parser.tokenCDOorCDC() +} + +// Returns true if next token is parsable +func (parser *Parser) tokenParsable() bool { + return !parser.tokenEOF() && !parser.tokenError() +} + +// Returns true if next token is an At Rule keyword +func (parser *Parser) tokenAtKeyword() bool { + return parser.nextToken().Type == scanner.TokenAtKeyword +} + +// Returns true if next token is given character +func (parser *Parser) tokenChar(value string) bool { + token := parser.nextToken() + return (token.Type == scanner.TokenChar) && (token.Value == value) +} + +// Returns true if next token marks the end of a prelude +func (parser *Parser) tokenEndOfPrelude() bool { + return parser.tokenChar(";") || parser.tokenChar("{") +} diff --git a/vendor/github.com/go-shiori/dom/.gitignore b/vendor/github.com/go-shiori/dom/.gitignore new file mode 100644 index 00000000..f2900cef --- /dev/null +++ b/vendor/github.com/go-shiori/dom/.gitignore @@ -0,0 +1,2 @@ +# Exclude config +/.vscode \ No newline at end of file diff --git a/vendor/github.com/go-shiori/dom/.travis.yml b/vendor/github.com/go-shiori/dom/.travis.yml new file mode 100644 index 00000000..2169a76f --- /dev/null +++ b/vendor/github.com/go-shiori/dom/.travis.yml @@ -0,0 +1,11 @@ +language: go +sudo: true +matrix: + include: + - go: 1.x + env: LATEST=true + +script: + - go get -v ./... + - go vet $(go list ./... | grep -v /vendor/) + - go test -v -race ./... \ No newline at end of file diff --git a/vendor/github.com/go-shiori/dom/LICENSE b/vendor/github.com/go-shiori/dom/LICENSE new file mode 100644 index 00000000..088ecdda --- /dev/null +++ b/vendor/github.com/go-shiori/dom/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019-present Radhi Fadlillah + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/vendor/github.com/go-shiori/dom/README.md b/vendor/github.com/go-shiori/dom/README.md new file mode 100644 index 00000000..31eefe7e --- /dev/null +++ b/vendor/github.com/go-shiori/dom/README.md @@ -0,0 +1,22 @@ +DOM +=== + +[![GoDoc](https://godoc.org/github.com/go-shiori/dom?status.png)](https://godoc.org/github.com/go-shiori/dom) +[![Travis CI](https://travis-ci.org/go-shiori/dom.svg?branch=master)](https://travis-ci.org/go-shiori/dom) +[![Go Report Card](https://goreportcard.com/badge/github.com/go-shiori/dom)](https://goreportcard.com/report/github.com/go-shiori/dom) +[![Donate PayPal](https://img.shields.io/static/v1?label=donate&message=PayPal&color=00457C&logo=paypal)](https://www.paypal.me/RadhiFadlillah) +[![Donate Ko-fi](https://img.shields.io/static/v1?label=donate&message=Ko-fi&color=F16061&logo=ko-fi)](https://ko-fi.com/radhifadlillah) + +DOM is a Go package for manipulating [HTML](https://godoc.org/golang.org/x/net/html) node. The methods that exist in this package has similar name and purpose as the DOM manipulation methods in JS, which make it useful when porting code from JS to Go. Currently it used in [`warc`](https://github.com/go-shiori/warc) and [`go-readability`](https://github.com/go-shiori/go-readability). + +## Installation + +To install this package, just run `go get` : + +``` +go get -u -v github.com/go-shiori/dom +``` + +## Licenses + +DOM is distributed under [MIT license](https://choosealicense.com/licenses/mit/), which means you can use and modify it however you want. However, if you make an enhancement for it, if possible, please send a pull request. If you like this project, please consider donating to me either via [PayPal](https://www.paypal.me/RadhiFadlillah) or [Ko-Fi](https://ko-fi.com/radhifadlillah). diff --git a/vendor/github.com/go-shiori/dom/dom.go b/vendor/github.com/go-shiori/dom/dom.go new file mode 100644 index 00000000..2a18dc06 --- /dev/null +++ b/vendor/github.com/go-shiori/dom/dom.go @@ -0,0 +1,622 @@ +package dom + +import ( + "bytes" + "regexp" + "strings" + + "github.com/andybalholm/cascadia" + "golang.org/x/net/html" +) + +var ( + rxPunctuation = regexp.MustCompile(`\s+([.?!,;])\s*(\S*)`) + rxTempNewline = regexp.MustCompile(`\s*\|\\/\|\s*`) + rxDisplayNone = regexp.MustCompile(`(?i)display:\s*none`) + rxVisibilityHidden = regexp.MustCompile(`(?i)visibility:\s*(:?hidden|collapse)`) +) + +// QuerySelectorAll returns array of document's elements that match +// the specified group of selectors. +func QuerySelectorAll(doc *html.Node, selectors string) []*html.Node { + matcher, err := cascadia.ParseGroup(selectors) + if err != nil { + return nil + } + + return cascadia.QueryAll(doc, matcher) +} + +// QuerySelector returns the first document's element that match +// the specified group of selectors. +func QuerySelector(doc *html.Node, selectors string) *html.Node { + matcher, err := cascadia.ParseGroup(selectors) + if err != nil { + return nil + } + + return cascadia.Query(doc, matcher) +} + +// GetElementByID returns a Node object representing the element whose id +// property matches the specified string. +func GetElementByID(doc *html.Node, id string) *html.Node { + if id == "" { + return nil + } + + var results []*html.Node + var finder func(*html.Node) + + finder = func(node *html.Node) { + nodeID := GetAttribute(node, "id") + nodeID = strings.TrimSpace(nodeID) + + if node.Type == html.ElementNode && nodeID == id { + results = append(results, node) + } + + for child := node.FirstChild; child != nil; child = child.NextSibling { + finder(child) + } + } + + for child := doc.FirstChild; child != nil; child = child.NextSibling { + finder(child) + + if len(results) > 0 { + return results[0] + } + } + + return nil +} + +// GetElementsByClassName returns an array of all child elements which +// have all of the given class name(s). +func GetElementsByClassName(doc *html.Node, classNames string) []*html.Node { + // Convert class name to map + classes := map[string]struct{}{} + for _, class := range strings.Fields(classNames) { + classes[class] = struct{}{} + } + + nClasses := len(classes) + if nClasses == 0 { + return nil + } + + // Create finder method + var results []*html.Node + var finder func(*html.Node) + allClassExist := func(node *html.Node) bool { + matchCount := 0 + nodeClasses := GetAttribute(node, "class") + for _, nodeClass := range strings.Fields(nodeClasses) { + if _, exist := classes[nodeClass]; exist { + matchCount++ + } + } + + return matchCount == nClasses + } + + finder = func(node *html.Node) { + if node.Type == html.ElementNode && allClassExist(node) { + results = append(results, node) + } + + for child := node.FirstChild; child != nil; child = child.NextSibling { + finder(child) + } + } + + // Check all nodes + for child := doc.FirstChild; child != nil; child = child.NextSibling { + finder(child) + } + + return results +} + +// GetElementsByTagName returns a collection of all elements in the document with +// the specified tag name, as an array of Node object. +// The special tag "*" will represents all elements. +func GetElementsByTagName(doc *html.Node, tagName string) []*html.Node { + var results []*html.Node + var finder func(*html.Node) + + finder = func(node *html.Node) { + if node.Type == html.ElementNode && (tagName == "*" || node.Data == tagName) { + results = append(results, node) + } + + for child := node.FirstChild; child != nil; child = child.NextSibling { + finder(child) + } + } + + for child := doc.FirstChild; child != nil; child = child.NextSibling { + finder(child) + } + + return results +} + +// CreateElement creates a new ElementNode with specified tag. +func CreateElement(tagName string) *html.Node { + return &html.Node{ + Type: html.ElementNode, + Data: tagName, + } +} + +// CreateTextNode creates a new Text node. +func CreateTextNode(data string) *html.Node { + return &html.Node{ + Type: html.TextNode, + Data: data, + } +} + +// TagName returns the tag name of a Node. +// If it's not ElementNode, return empty string. +func TagName(node *html.Node) string { + if node == nil { + return "" + } + + if node.Type != html.ElementNode { + return "" + } + + return node.Data +} + +// GetAttribute returns the value of a specified attribute on +// the element. If the given attribute does not exist, the value +// returned will be an empty string. +func GetAttribute(node *html.Node, attrName string) string { + for i := 0; i < len(node.Attr); i++ { + if node.Attr[i].Key == attrName { + return node.Attr[i].Val + } + } + return "" +} + +// SetAttribute sets attribute for node. If attribute already exists, +// it will be replaced. +func SetAttribute(node *html.Node, attrName string, attrValue string) { + attrIdx := -1 + for i := 0; i < len(node.Attr); i++ { + if node.Attr[i].Key == attrName { + attrIdx = i + break + } + } + + if attrIdx >= 0 { + node.Attr[attrIdx].Val = attrValue + } else { + node.Attr = append(node.Attr, html.Attribute{ + Key: attrName, + Val: attrValue, + }) + } +} + +// RemoveAttribute removes attribute with given name. +func RemoveAttribute(node *html.Node, attrName string) { + attrIdx := -1 + for i := 0; i < len(node.Attr); i++ { + if node.Attr[i].Key == attrName { + attrIdx = i + break + } + } + + if attrIdx >= 0 { + a := node.Attr + a = append(a[:attrIdx], a[attrIdx+1:]...) + node.Attr = a + } +} + +// HasAttribute returns a Boolean value indicating whether the +// specified node has the specified attribute or not. +func HasAttribute(node *html.Node, attrName string) bool { + for i := 0; i < len(node.Attr); i++ { + if node.Attr[i].Key == attrName { + return true + } + } + return false +} + +// TextContent returns the text content of the specified node, +// and all its descendants. +func TextContent(node *html.Node) string { + var buffer bytes.Buffer + var finder func(*html.Node) + + finder = func(n *html.Node) { + if n.Type == html.TextNode { + buffer.WriteString(n.Data) + } + + for child := n.FirstChild; child != nil; child = child.NextSibling { + finder(child) + } + } + + finder(node) + return buffer.String() +} + +// InnerText in JS used to capture text from an element while excluding text from hidden +// children. A child is hidden if it's computed width is 0, whether because its CSS (e.g +// `display: none`, `visibility: hidden`, etc), or if the child has `hidden` attribute. +// Since we can't compute stylesheet, we only look at `hidden` attribute and inline style. +// +// Besides excluding text from hidden children, difference between this function and +// `TextContent` is the latter will skip
tag while this function will preserve +//
as newline. +func InnerText(node *html.Node) string { + var buffer bytes.Buffer + var finder func(*html.Node) + + finder = func(n *html.Node) { + switch n.Type { + case html.TextNode: + buffer.WriteString(" " + n.Data + " ") + + case html.ElementNode: + if n.Data == "br" { + buffer.WriteString(`|\/|`) + return + } + + if HasAttribute(n, "hidden") { + return + } + + styleAttr := GetAttribute(n, "style") + if rxDisplayNone.MatchString(styleAttr) || rxVisibilityHidden.MatchString(styleAttr) { + return + } + } + + for child := n.FirstChild; child != nil; child = child.NextSibling { + finder(child) + } + } + + finder(node) + text := buffer.String() + text = strings.Join(strings.Fields(text), " ") + text = rxPunctuation.ReplaceAllString(text, "$1 $2") + text = rxTempNewline.ReplaceAllString(text, "\n") + return text +} + +// OuterHTML returns an HTML serialization of the element and its descendants. +// The returned HTML value is escaped. +func OuterHTML(node *html.Node) string { + if node == nil { + return "" + } + + var buffer bytes.Buffer + err := html.Render(&buffer, node) + if err != nil { + return "" + } + + return buffer.String() +} + +// InnerHTML returns the HTML content (inner HTML) of an element. +// The returned HTML value is escaped. +func InnerHTML(node *html.Node) string { + var err error + var buffer bytes.Buffer + + if node == nil { + return "" + } + + for child := node.FirstChild; child != nil; child = child.NextSibling { + err = html.Render(&buffer, child) + if err != nil { + return "" + } + } + + return strings.TrimSpace(buffer.String()) +} + +// DocumentElement returns the Element that is the root element +// of the document. Since we are working with HTML document, +// the root will be element for HTML documents). +func DocumentElement(doc *html.Node) *html.Node { + if nodes := GetElementsByTagName(doc, "html"); len(nodes) > 0 { + return nodes[0] + } + return nil +} + +// ID returns the value of the id attribute of the specified element. +func ID(node *html.Node) string { + id := GetAttribute(node, "id") + id = strings.TrimSpace(id) + return id +} + +// ClassName returns the value of the class attribute of +// the specified element. +func ClassName(node *html.Node) string { + className := GetAttribute(node, "class") + className = strings.TrimSpace(className) + className = strings.Join(strings.Fields(className), " ") + return className +} + +// Children returns an HTMLCollection of the direct child elements of Node. +func Children(node *html.Node) []*html.Node { + var children []*html.Node + if node == nil { + return nil + } + + for child := node.FirstChild; child != nil; child = child.NextSibling { + if child.Type == html.ElementNode { + children = append(children, child) + } + } + + return children +} + +// ChildNodes returns list of a node's direct children. +func ChildNodes(node *html.Node) []*html.Node { + var childNodes []*html.Node + for child := node.FirstChild; child != nil; child = child.NextSibling { + childNodes = append(childNodes, child) + } + return childNodes +} + +// FirstElementChild returns the object's first child Element, +// or nil if there are no child elements. +func FirstElementChild(node *html.Node) *html.Node { + for child := node.FirstChild; child != nil; child = child.NextSibling { + if child.Type == html.ElementNode { + return child + } + } + return nil +} + +// PreviousElementSibling returns the the Element immediately prior +// to the specified one in its parent's children list, or null if +// the specified element is the first one in the list. +func PreviousElementSibling(node *html.Node) *html.Node { + for sibling := node.PrevSibling; sibling != nil; sibling = sibling.PrevSibling { + if sibling.Type == html.ElementNode { + return sibling + } + } + return nil +} + +// NextElementSibling returns the Element immediately following +// the specified one in its parent's children list, or nil if the +// specified Element is the last one in the list. +func NextElementSibling(node *html.Node) *html.Node { + for sibling := node.NextSibling; sibling != nil; sibling = sibling.NextSibling { + if sibling.Type == html.ElementNode { + return sibling + } + } + return nil +} + +// AppendChild adds a node to the end of the list of children of a +// specified parent node. If the given child is a reference to an +// existing node in the document, AppendChild() moves it from its +// current position to the new position. +func AppendChild(node *html.Node, child *html.Node) { + // Make sure node is not void + if !IsVoidElement(node) { + DetachChild(child) + node.AppendChild(child) + } +} + +// PrependChild works like AppendChild() except it adds a node to the +// beginning of the list of children of a specified parent node. +func PrependChild(node *html.Node, child *html.Node) { + // Make sure node is not void + if !IsVoidElement(node) { + DetachChild(child) + if node.FirstChild != nil { + node.InsertBefore(child, node.FirstChild) + } else { + node.AppendChild(child) + } + } +} + +// ReplaceChild replaces a child node within the given (parent) node. +// If the new child is already exist in document, ReplaceChild() will move it +// from its current position to replace old child. Returns both the new and old child. +// +// TODO: I'm note sure but I *think* there are some issues here. Check later I guess. +func ReplaceChild(parent *html.Node, newChild *html.Node, oldChild *html.Node) (*html.Node, *html.Node) { + // Make sure parent is specified and not void + if parent == nil && !IsVoidElement(parent) { + return newChild, oldChild + } + + // Make sure the specified parent IS the parent of the old child + if oldChild.Parent != parent { + return newChild, oldChild + } + + // Detach the new child + DetachChild(newChild) + parent.InsertBefore(newChild, oldChild) + parent.RemoveChild(oldChild) + return newChild, oldChild +} + +// IncludeNode determines if node is included inside nodeList. +func IncludeNode(nodeList []*html.Node, node *html.Node) bool { + for i := 0; i < len(nodeList); i++ { + if nodeList[i] == node { + return true + } + } + return false +} + +// Clone returns a clone of the node and (if specified) its children. +// However, it will be detached from the original's parents and siblings. +func Clone(src *html.Node, deep bool) *html.Node { + clone := &html.Node{ + Type: src.Type, + DataAtom: src.DataAtom, + Data: src.Data, + Attr: append([]html.Attribute{}, src.Attr...), + } + + if deep { + for child := src.FirstChild; child != nil; child = child.NextSibling { + clone.AppendChild(Clone(child, deep)) + } + } + + return clone +} + +// GetAllNodesWithTag is wrapper for GetElementsByTagName() +// which allow to get several tags at once. +func GetAllNodesWithTag(node *html.Node, tagNames ...string) []*html.Node { + var result []*html.Node + for i := 0; i < len(tagNames); i++ { + result = append(result, GetElementsByTagName(node, tagNames[i])...) + } + return result +} + +// ForEachNode iterates over a NodeList and runs fn on each node. +func ForEachNode(nodeList []*html.Node, fn func(*html.Node, int)) { + for i := 0; i < len(nodeList); i++ { + fn(nodeList[i], i) + } +} + +// RemoveNodes iterates over a NodeList, calls `filterFn` for each node +// and removes node if function returned `true`. If function is not +// passed, removes all the nodes in node list. +func RemoveNodes(nodeList []*html.Node, filterFn func(*html.Node) bool) { + for i := len(nodeList) - 1; i >= 0; i-- { + node := nodeList[i] + parentNode := node.Parent + if parentNode != nil && (filterFn == nil || filterFn(node)) { + parentNode.RemoveChild(node) + } + } +} + +// SetTextContent sets the text content of the specified node. +func SetTextContent(node *html.Node, text string) { + if IsVoidElement(node) { + return + } + + child := node.FirstChild + for child != nil { + nextSibling := child.NextSibling + node.RemoveChild(child) + child = nextSibling + } + + node.AppendChild(&html.Node{ + Type: html.TextNode, + Data: text, + }) +} + +// SetInnerHTML sets inner HTML of the specified node. +func SetInnerHTML(node *html.Node, rawHTML string) { + // Parse raw HTML + parsedHTML, err := html.Parse(strings.NewReader(rawHTML)) + if err != nil || parsedHTML == nil { + return + } + + // Remove node's current children + child := node.FirstChild + for child != nil { + nextSibling := child.NextSibling + node.RemoveChild(child) + child = nextSibling + } + + // Put content of parsed HTML to the node + if body := QuerySelector(parsedHTML, "body"); body != nil { + bodyChild := body.FirstChild + for bodyChild != nil { + nextSibling := bodyChild.NextSibling + AppendChild(node, bodyChild) + bodyChild = nextSibling + } + } +} + +// IsVoidElement check whether a node can have any contents or not. +// Return true if element is void (can't have any children). +func IsVoidElement(n *html.Node) bool { + // If it's not element, it's void + if n.Type != html.ElementNode { + return true + } + + // Check tag name + switch n.Data { + case "area", "base", "br", "col", "embed", "hr", + "img", "input", "keygen", "link", "meta", + "param", "source", "track", "wbr": + return true + default: + return false + } +} + +func DetachChild(child *html.Node) { + if child.Parent != nil || child.PrevSibling != nil || child.NextSibling != nil { + if child.Parent != nil { + if child.Parent.FirstChild == child { + child.Parent.FirstChild = child.NextSibling + } + + if child.Parent.LastChild == child { + child.Parent.LastChild = child.PrevSibling + } + } + + if child.PrevSibling != nil { + child.PrevSibling.NextSibling = child.NextSibling + } + + if child.NextSibling != nil { + child.NextSibling.PrevSibling = child.PrevSibling + } + + child.Parent = nil + child.PrevSibling = nil + child.NextSibling = nil + } +} diff --git a/vendor/github.com/go-shiori/dom/parser.go b/vendor/github.com/go-shiori/dom/parser.go new file mode 100644 index 00000000..20a0f0b9 --- /dev/null +++ b/vendor/github.com/go-shiori/dom/parser.go @@ -0,0 +1,61 @@ +package dom + +import ( + "bytes" + "io" + "io/ioutil" + + "github.com/gogs/chardet" + "golang.org/x/net/html" + "golang.org/x/net/html/charset" + xunicode "golang.org/x/text/encoding/unicode" + "golang.org/x/text/runes" + "golang.org/x/text/transform" + "golang.org/x/text/unicode/norm" +) + +// FastParse parses html.Node from the specified reader without caring about +// text encoding. It always assume that the input uses UTF-8 encoding. +func FastParse(r io.Reader) (*html.Node, error) { + return html.Parse(r) +} + +// Parse parses html.Node from the specified reader while converting the character +// encoding into UTF-8. This function is useful to correctly parse web pages that +// uses custom text encoding, e.g. web pages from Asian websites. However, since it +// has to detect charset before parsing, this function is quite slow and expensive +// so if you sure the reader uses valid UTF-8, just use FastParse. +func Parse(r io.Reader) (*html.Node, error) { + // Split the reader using tee + content, err := ioutil.ReadAll(r) + if err != nil { + return nil, err + } + + // Detect page encoding + res, err := chardet.NewHtmlDetector().DetectBest(content) + if err != nil { + return nil, err + } + + pageEncoding, _ := charset.Lookup(res.Charset) + if pageEncoding == nil { + pageEncoding = xunicode.UTF8 + } + + // Parse HTML using the page encoding + r = bytes.NewReader(content) + r = transform.NewReader(r, pageEncoding.NewDecoder()) + r = normalizeTextEncoding(r) + return html.Parse(r) +} + +// normalizeTextEncoding convert text encoding from NFD to NFC. +// It also remove soft hyphen since apparently it's useless in web. +// See: https://web.archive.org/web/19990117011731/http://www.hut.fi/~jkorpela/shy.html +func normalizeTextEncoding(r io.Reader) io.Reader { + fnSoftHyphen := func(r rune) bool { return r == '\u00AD' } + softHyphenSet := runes.Predicate(fnSoftHyphen) + transformer := transform.Chain(norm.NFD, runes.Remove(softHyphenSet), norm.NFC) + return transform.NewReader(r, transformer) +} diff --git a/vendor/github.com/go-shiori/go-readability/.gitattributes b/vendor/github.com/go-shiori/go-readability/.gitattributes new file mode 100644 index 00000000..6c04321d --- /dev/null +++ b/vendor/github.com/go-shiori/go-readability/.gitattributes @@ -0,0 +1 @@ +test-pages/* linguist-vendored \ No newline at end of file diff --git a/vendor/github.com/go-shiori/go-readability/.gitignore b/vendor/github.com/go-shiori/go-readability/.gitignore new file mode 100644 index 00000000..186bf5a8 --- /dev/null +++ b/vendor/github.com/go-shiori/go-readability/.gitignore @@ -0,0 +1,5 @@ +.vscode/ +js/* + +# Hidden script for development +scripts/internal/ \ No newline at end of file diff --git a/vendor/github.com/go-shiori/go-readability/.travis.yml b/vendor/github.com/go-shiori/go-readability/.travis.yml new file mode 100644 index 00000000..647a2880 --- /dev/null +++ b/vendor/github.com/go-shiori/go-readability/.travis.yml @@ -0,0 +1,16 @@ +language: go +sudo: true +matrix: + include: + - go: 1.x + env: LATEST=true + - go: "1.13" + +install: + - go get github.com/go-shiori/go-readability + +script: + - go get -t -v ./... + - diff -u <(echo -n) <(gofmt -d .) + - go vet $(go list ./... | grep -v /vendor/) + - go test -v -race ./... \ No newline at end of file diff --git a/vendor/github.com/go-shiori/go-readability/LICENSE b/vendor/github.com/go-shiori/go-readability/LICENSE new file mode 100644 index 00000000..e4283b76 --- /dev/null +++ b/vendor/github.com/go-shiori/go-readability/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 Radhi Fadlillah + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/vendor/github.com/go-shiori/go-readability/README.md b/vendor/github.com/go-shiori/go-readability/README.md new file mode 100644 index 00000000..7729c517 --- /dev/null +++ b/vendor/github.com/go-shiori/go-readability/README.md @@ -0,0 +1,169 @@ +# Go-Readability [![Go Reference][go-ref-badge]][go-ref] [![PayPal][paypal-badge]][paypal] [![Ko-fi][kofi-badge]][kofi] + +Go-Readability is a Go package that find the main readable content and the metadata from a HTML page. It works by removing clutter like buttons, ads, background images, script, etc. + +This package is based from [Readability.js] by [Mozilla] and written line by line to make sure it looks and works as similar as possible. This way, hopefully all web page that can be parsed by Readability.js are parse-able by go-readability as well. + +## Table of Contents + +- [Table of Contents](#table-of-contents) +- [Status](#status) +- [Installation](#installation) +- [Example](#example) +- [Command Line Usage](#command-line-usage) +- [Licenses](#licenses) + +## Status + +This package is stable enough for use and up to date with Readability.js [v0.4.4][last-version] (commit [`b359811`][last-commit]). + +## Installation + +To install this package, just run `go get` : + +``` +go get -u -v github.com/go-shiori/go-readability +``` + +## Example + +To get the readable content from an URL, you can use `readability.FromURL`. It will fetch the web page from specified url, check if it's readable, then parses the response to find the readable content : + +```go +package main + +import ( + "fmt" + "log" + "os" + "time" + + readability "github.com/go-shiori/go-readability" +) + +var ( + urls = []string{ + // this one is article, so it's parse-able + "https://www.nytimes.com/2019/02/20/climate/climate-national-security-threat.html", + // while this one is not an article, so readability will fail to parse. + "https://www.nytimes.com/", + } +) + +func main() { + for i, url := range urls { + article, err := readability.FromURL(url, 30*time.Second) + if err != nil { + log.Fatalf("failed to parse %s, %v\n", url, err) + } + + dstTxtFile, _ := os.Create(fmt.Sprintf("text-%02d.txt", i+1)) + defer dstTxtFile.Close() + dstTxtFile.WriteString(article.TextContent) + + dstHTMLFile, _ := os.Create(fmt.Sprintf("html-%02d.html", i+1)) + defer dstHTMLFile.Close() + dstHTMLFile.WriteString(article.Content) + + fmt.Printf("URL : %s\n", url) + fmt.Printf("Title : %s\n", article.Title) + fmt.Printf("Author : %s\n", article.Byline) + fmt.Printf("Length : %d\n", article.Length) + fmt.Printf("Excerpt : %s\n", article.Excerpt) + fmt.Printf("SiteName: %s\n", article.SiteName) + fmt.Printf("Image : %s\n", article.Image) + fmt.Printf("Favicon : %s\n", article.Favicon) + fmt.Printf("Text content saved to \"text-%02d.txt\"\n", i+1) + fmt.Printf("HTML content saved to \"html-%02d.html\"\n", i+1) + fmt.Println() + } +} +``` + +However, sometimes you want to parse an URL no matter if it's an article or not. For example is when you only want to get metadata of the page. To do that, you have to download the page manually using `http.Get`, then parse it using `readability.FromReader` : + +```go +package main + +import ( + "fmt" + "log" + "net/http" + + readability "github.com/go-shiori/go-readability" +) + +var ( + urls = []string{ + // Both will be parse-able now + "https://www.nytimes.com/2019/02/20/climate/climate-national-security-threat.html", + // But this one will not have any content + "https://www.nytimes.com/", + } +) + +func main() { + for _, url := range urls { + resp, err := http.Get(url) + if err != nil { + log.Fatalf("failed to download %s: %v\n", url, err) + } + defer resp.Body.Close() + + article, err := readability.FromReader(resp.Body, url) + if err != nil { + log.Fatalf("failed to parse %s: %v\n", url, err) + } + + fmt.Printf("URL : %s\n", url) + fmt.Printf("Title : %s\n", article.Title) + fmt.Printf("Author : %s\n", article.Byline) + fmt.Printf("Length : %d\n", article.Length) + fmt.Printf("Excerpt : %s\n", article.Excerpt) + fmt.Printf("SiteName: %s\n", article.SiteName) + fmt.Printf("Image : %s\n", article.Image) + fmt.Printf("Favicon : %s\n", article.Favicon) + fmt.Println() + } +} +``` + +## Command Line Usage + +You can also use `go-readability` as command line app. To do that, first install the CLI : + +``` +go get -u -v github.com/go-shiori/go-readability/cmd/... +``` + +Now you can use it by running `go-readability` in your terminal : + +``` +$ go-readability -h + +go-readability is parser to fetch the readable content of a web page. +The source can be an url or existing file in your storage. + +Usage: + go-readability [flags] source + +Flags: + -h, --help help for go-readability + -m, --metadata only print the page's metadata +``` + +## Licenses + +Go-Readability is distributed under [MIT license][mit], which means you can use and modify it however you want. However, if you make an enhancement for it, if possible, please send a pull request. If you like this project, please consider donating to me either via [PayPal][paypal] or [Ko-Fi][kofi]. + +[go-ref]: https://pkg.go.dev/github.com/go-shiori/go-readability +[go-ref-badge]: https://img.shields.io/static/v1?label=&message=Reference&color=007d9c&logo=go&logoColor=white +[paypal]: https://www.paypal.me/RadhiFadlillah +[paypal-badge]: https://img.shields.io/static/v1?label=&message=PayPal&color=00457C&logo=paypal&logoColor=white +[kofi]: https://ko-fi.com/radhifadlillah +[kofi-badge]: https://img.shields.io/static/v1?label=&message=Ko-fi&color=F16061&logo=ko-fi&logoColor=white +[readability.js]: https://github.com/mozilla/readability +[mozilla]: https://github.com/mozilla +[last-version]: https://github.com/mozilla/readability/tree/0.4.4 +[last-commit]: https://github.com/mozilla/readability/commit/b359811927a4bb2323eba085be004978fb18a926 +[mit]: https://choosealicense.com/licenses/mit/ diff --git a/vendor/github.com/go-shiori/go-readability/parser-check.go b/vendor/github.com/go-shiori/go-readability/parser-check.go new file mode 100644 index 00000000..e3f71b1c --- /dev/null +++ b/vendor/github.com/go-shiori/go-readability/parser-check.go @@ -0,0 +1,79 @@ +package readability + +import ( + "io" + "math" + "strings" + + "github.com/go-shiori/dom" + "golang.org/x/net/html" +) + +// Check checks whether the input is readable without parsing the whole thing. +func (ps *Parser) Check(input io.Reader) bool { + // Parse input + doc, err := dom.Parse(input) + if err != nil { + return false + } + + return ps.CheckDocument(doc) +} + +// CheckDocument checks whether the document is readable without parsing the whole thing. +func (ps *Parser) CheckDocument(doc *html.Node) bool { + // Get

and

 nodes.
+	nodes := dom.QuerySelectorAll(doc, "p, pre, article")
+
+	// Also get 
nodes which have
node(s) and append + // them into the `nodes` variable. + // Some articles' DOM structures might look like : + // + //
+ // Sentences
+ //
+ // Sentences
+ //
+ // + // So we need to make sure only fetch the div once. + // To do so, we will use map as dictionary. + tracker := make(map[*html.Node]struct{}) + for _, br := range dom.QuerySelectorAll(doc, "div > br") { + if br.Parent == nil { + continue + } + + if _, exist := tracker[br.Parent]; !exist { + tracker[br.Parent] = struct{}{} + nodes = append(nodes, br.Parent) + } + } + + // This is a little cheeky, we use the accumulator 'score' to decide what + // to return from this callback. + score := float64(0) + return ps.someNode(nodes, func(node *html.Node) bool { + if !ps.isProbablyVisible(node) { + return false + } + + matchString := dom.ClassName(node) + " " + dom.ID(node) + if rxUnlikelyCandidates.MatchString(matchString) && + !rxOkMaybeItsACandidate.MatchString(matchString) { + return false + } + + if dom.TagName(node) == "p" && ps.hasAncestorTag(node, "li", -1, nil) { + return false + } + + nodeText := strings.TrimSpace(dom.TextContent(node)) + nodeTextLength := len(nodeText) + if nodeTextLength < 140 { + return false + } + + score += math.Sqrt(float64(nodeTextLength - 140)) + return score > 20 + }) +} diff --git a/vendor/github.com/go-shiori/go-readability/parser-parse.go b/vendor/github.com/go-shiori/go-readability/parser-parse.go new file mode 100644 index 00000000..235ce62b --- /dev/null +++ b/vendor/github.com/go-shiori/go-readability/parser-parse.go @@ -0,0 +1,129 @@ +package readability + +import ( + "fmt" + "io" + nurl "net/url" + "strings" + + "github.com/go-shiori/dom" + "golang.org/x/net/html" +) + +// Parse parses a reader and find the main readable content. +func (ps *Parser) Parse(input io.Reader, pageURL *nurl.URL) (Article, error) { + // Parse input + doc, err := dom.Parse(input) + if err != nil { + return Article{}, fmt.Errorf("failed to parse input: %v", err) + } + + return ps.ParseDocument(doc, pageURL) +} + +// ParseDocument parses the specified document and find the main readable content. +func (ps *Parser) ParseDocument(doc *html.Node, pageURL *nurl.URL) (Article, error) { + // Clone document to make sure the original kept untouched + ps.doc = dom.Clone(doc, true) + + // Reset parser data + ps.articleTitle = "" + ps.articleByline = "" + ps.articleDir = "" + ps.articleSiteName = "" + ps.documentURI = pageURL + ps.attempts = []parseAttempt{} + ps.flags = flags{ + stripUnlikelys: true, + useWeightClasses: true, + cleanConditionally: true, + } + + // Avoid parsing too large documents, as per configuration option + if ps.MaxElemsToParse > 0 { + numTags := len(dom.GetElementsByTagName(ps.doc, "*")) + if numTags > ps.MaxElemsToParse { + return Article{}, fmt.Errorf("documents too large: %d elements", numTags) + } + } + + // Unwrap image from noscript + ps.unwrapNoscriptImages(ps.doc) + + // Extract JSON-LD metadata before removing scripts + var jsonLd map[string]string + if !ps.DisableJSONLD { + jsonLd, _ = ps.getJSONLD() + } + + // Remove script tags from the document. + ps.removeScripts(ps.doc) + + // Prepares the HTML document + ps.prepDocument() + + // Fetch metadata + metadata := ps.getArticleMetadata(jsonLd) + ps.articleTitle = metadata["title"] + + // Try to grab article content + finalHTMLContent := "" + finalTextContent := "" + articleContent := ps.grabArticle() + var readableNode *html.Node + + if articleContent != nil { + ps.postProcessContent(articleContent) + + // If we haven't found an excerpt in the article's metadata, + // use the article's first paragraph as the excerpt. This is used + // for displaying a preview of the article's content. + if metadata["excerpt"] == "" { + paragraphs := dom.GetElementsByTagName(articleContent, "p") + if len(paragraphs) > 0 { + metadata["excerpt"] = strings.TrimSpace(dom.TextContent(paragraphs[0])) + } + } + + readableNode = dom.FirstElementChild(articleContent) + finalHTMLContent = dom.InnerHTML(articleContent) + finalTextContent = dom.TextContent(articleContent) + finalTextContent = strings.TrimSpace(finalTextContent) + } + + finalByline := metadata["byline"] + if finalByline == "" { + finalByline = ps.articleByline + } + + // Excerpt is an supposed to be short and concise, + // so it shouldn't have any new line + excerpt := strings.TrimSpace(metadata["excerpt"]) + excerpt = strings.Join(strings.Fields(excerpt), " ") + + // go-readability special: + // Internet is dangerous and weird, and sometimes we will find + // metadata isn't encoded using a valid Utf-8, so here we check it. + var replacementTitle string + if pageURL != nil { + replacementTitle = pageURL.String() + } + + validTitle := strings.ToValidUTF8(ps.articleTitle, replacementTitle) + validByline := strings.ToValidUTF8(finalByline, "") + validExcerpt := strings.ToValidUTF8(excerpt, "") + + return Article{ + Title: validTitle, + Byline: validByline, + Node: readableNode, + Content: finalHTMLContent, + TextContent: finalTextContent, + Length: charCount(finalTextContent), + Excerpt: validExcerpt, + SiteName: metadata["siteName"], + Image: metadata["image"], + Favicon: metadata["favicon"], + Language: ps.articleLang, + }, nil +} diff --git a/vendor/github.com/go-shiori/go-readability/parser.go b/vendor/github.com/go-shiori/go-readability/parser.go new file mode 100644 index 00000000..b4c7c83f --- /dev/null +++ b/vendor/github.com/go-shiori/go-readability/parser.go @@ -0,0 +1,2300 @@ +package readability + +import ( + "encoding/json" + "fmt" + shtml "html" + "log" + "math" + nurl "net/url" + "regexp" + "sort" + "strconv" + "strings" + + "github.com/go-shiori/dom" + "golang.org/x/net/html" +) + +// All of the regular expressions in use within readability. +// Defined up here so we don't instantiate them repeatedly in loops *. +var ( + rxUnlikelyCandidates = regexp.MustCompile(`(?i)-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`) + rxOkMaybeItsACandidate = regexp.MustCompile(`(?i)and|article|body|column|content|main|shadow`) + rxPositive = regexp.MustCompile(`(?i)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`) + rxNegative = regexp.MustCompile(`(?i)-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget`) + rxByline = regexp.MustCompile(`(?i)byline|author|dateline|writtenby|p-author`) + rxNormalize = regexp.MustCompile(`(?i)\s{2,}`) + rxVideosx = regexp.MustCompile(`(?i)//(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)`) + rxTokenize = regexp.MustCompile(`(?i)\W+`) + rxWhitespace = regexp.MustCompile(`(?i)^\s*$`) + rxHasContent = regexp.MustCompile(`(?i)\S$`) + rxHashURL = regexp.MustCompile(`(?i)^#.+`) + rxPropertyPattern = regexp.MustCompile(`(?i)\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|site_name|image\S*)\s*`) + rxNamePattern = regexp.MustCompile(`(?i)^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|site_name|image)\s*$`) + rxTitleSeparator = regexp.MustCompile(`(?i) [\|\-\\/>»] `) + rxTitleHierarchySep = regexp.MustCompile(`(?i) [\\/>»] `) + rxTitleRemoveFinalPart = regexp.MustCompile(`(?i)(.*)[\|\-\\/>»] .*`) + rxTitleRemove1stPart = regexp.MustCompile(`(?i)[^\|\-\\/>»]*[\|\-\\/>»](.*)`) + rxTitleAnySeparator = regexp.MustCompile(`(?i)[\|\-\\/>»]+`) + rxDisplayNone = regexp.MustCompile(`(?i)display\s*:\s*none`) + rxSentencePeriod = regexp.MustCompile(`(?i)\.( |$)`) + rxShareElements = regexp.MustCompile(`(?i)(\b|_)(share|sharedaddy)(\b|_)`) + rxFaviconSize = regexp.MustCompile(`(?i)(\d+)x(\d+)`) + rxLazyImageSrcset = regexp.MustCompile(`(?i)\.(jpg|jpeg|png|webp)\s+\d`) + rxLazyImageSrc = regexp.MustCompile(`(?i)^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$`) + rxImgExtensions = regexp.MustCompile(`(?i)\.(jpg|jpeg|png|webp)`) + rxSrcsetURL = regexp.MustCompile(`(?i)(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))`) + rxB64DataURL = regexp.MustCompile(`(?i)^data:\s*([^\s;,]+)\s*;\s*base64\s*,`) + rxJsonLdArticleTypes = regexp.MustCompile(`(?i)^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$`) + rxCDATA = regexp.MustCompile(`^\s*\s*$`) + rxSchemaOrg = regexp.MustCompile(`(?i)^https?\:\/\/schema\.org$`) +) + +// Constants that used by readability. +var ( + unlikelyRoles = sliceToMap("menu", "menubar", "complementary", "navigation", "alert", "alertdialog", "dialog") + divToPElems = sliceToMap("blockquote", "dl", "div", "img", "ol", "p", "pre", "table", "ul", "select") + alterToDivExceptions = []string{"div", "article", "section", "p"} + presentationalAttributes = []string{"align", "background", "bgcolor", "border", "cellpadding", "cellspacing", "frame", "hspace", "rules", "style", "valign", "vspace"} + deprecatedSizeAttributeElems = []string{"table", "th", "td", "hr", "pre"} + phrasingElems = []string{ + "abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", + "datalist", "dfn", "em", "embed", "i", "img", "input", "kbd", "label", + "mark", "math", "meter", "noscript", "object", "output", "progress", "q", + "ruby", "samp", "script", "select", "small", "span", "strong", "sub", + "sup", "textarea", "time", "var", "wbr"} +) + +// flags is flags that used by parser. +type flags struct { + stripUnlikelys bool + useWeightClasses bool + cleanConditionally bool +} + +// parseAttempt is container for the result of previous parse attempts. +type parseAttempt struct { + articleContent *html.Node + textLength int +} + +// Article is the final readable content. +type Article struct { + Title string + Byline string + Node *html.Node + Content string + TextContent string + Length int + Excerpt string + SiteName string + Image string + Favicon string + Language string +} + +// Parser is the parser that parses the page to get the readable content. +type Parser struct { + // MaxElemsToParse is the max number of nodes supported by this + // parser. Default: 0 (no limit) + MaxElemsToParse int + // NTopCandidates is the number of top candidates to consider when + // analysing how tight the competition is among candidates. + NTopCandidates int + // CharThresholds is the default number of chars an article must + // have in order to return a result + CharThresholds int + // ClassesToPreserve are the classes that readability sets itself. + ClassesToPreserve []string + // KeepClasses specify whether the classes should be stripped or not. + KeepClasses bool + // TagsToScore is element tags to score by default. + TagsToScore []string + // Debug determines if the log should be printed or not. Default: false. + Debug bool + // DisableJSONLD determines if metadata in JSON+LD will be extracted + // or not. Default: false. + DisableJSONLD bool + // AllowedVideoRegex is a regular expression that matches video URLs that should be + // allowed to be included in the article content. If undefined, it will use default filter. + AllowedVideoRegex *regexp.Regexp + + doc *html.Node + documentURI *nurl.URL + articleTitle string + articleByline string + articleDir string + articleSiteName string + articleLang string + attempts []parseAttempt + flags flags +} + +// NewParser returns new Parser which set up with default value. +func NewParser() Parser { + return Parser{ + MaxElemsToParse: 0, + NTopCandidates: 5, + CharThresholds: 500, + ClassesToPreserve: []string{"page"}, + KeepClasses: false, + TagsToScore: []string{"section", "h2", "h3", "h4", "h5", "h6", "p", "td", "pre"}, + Debug: false, + } +} + +// postProcessContent runs any post-process modifications to article +// content as necessary. +func (ps *Parser) postProcessContent(articleContent *html.Node) { + // Readability cannot open relative uris so we convert them to absolute uris. + ps.fixRelativeURIs(articleContent) + + ps.simplifyNestedElements(articleContent) + + // Remove classes. + if !ps.KeepClasses { + ps.cleanClasses(articleContent) + } + + // Remove readability attributes. + ps.clearReadabilityAttr(articleContent) +} + +// removeNodes iterates over a NodeList, calls `filterFn` for each node +// and removes node if function returned `true`. If function is not +// passed, removes all the nodes in node list. +func (ps *Parser) removeNodes(nodeList []*html.Node, filterFn func(*html.Node) bool) { + for i := len(nodeList) - 1; i >= 0; i-- { + node := nodeList[i] + parentNode := node.Parent + if parentNode != nil && (filterFn == nil || filterFn(node)) { + parentNode.RemoveChild(node) + } + } +} + +// replaceNodeTags iterates over a NodeList, and calls setNodeTag for +// each node. +func (ps *Parser) replaceNodeTags(nodeList []*html.Node, newTagName string) { + for i := len(nodeList) - 1; i >= 0; i-- { + node := nodeList[i] + ps.setNodeTag(node, newTagName) + } +} + +// forEachNode iterates over a NodeList and runs fn on each node. +func (ps *Parser) forEachNode(nodeList []*html.Node, fn func(*html.Node, int)) { + for i := 0; i < len(nodeList); i++ { + fn(nodeList[i], i) + } +} + +// someNode iterates over a NodeList, return true if any of the +// provided iterate function calls returns true, false otherwise. +func (ps *Parser) someNode(nodeList []*html.Node, fn func(*html.Node) bool) bool { + for i := 0; i < len(nodeList); i++ { + if fn(nodeList[i]) { + return true + } + } + return false +} + +// everyNode iterates over a NodeList, return true if all of the +// provided iterate function calls returns true, false otherwise. +func (ps *Parser) everyNode(nodeList []*html.Node, fn func(*html.Node) bool) bool { + for i := 0; i < len(nodeList); i++ { + if !fn(nodeList[i]) { + return false + } + } + return true +} + +// concatNodeLists concats all nodelists passed as arguments. +func (ps *Parser) concatNodeLists(nodeLists ...[]*html.Node) []*html.Node { + var result []*html.Node + for i := 0; i < len(nodeLists); i++ { + result = append(result, nodeLists[i]...) + } + return result +} + +// getAllNodesWithTag returns all nodes that has tag inside tagNames. +func (ps *Parser) getAllNodesWithTag(node *html.Node, tagNames ...string) []*html.Node { + var result []*html.Node + for i := 0; i < len(tagNames); i++ { + result = append(result, dom.GetElementsByTagName(node, tagNames[i])...) + } + return result +} + +// cleanClasses removes the class="" attribute from every element in the +// given subtree, except those that match CLASSES_TO_PRESERVE and the +// classesToPreserve array from the options object. +func (ps *Parser) cleanClasses(node *html.Node) { + nodeClassName := dom.ClassName(node) + preservedClassName := []string{} + for _, class := range strings.Fields(nodeClassName) { + if indexOf(ps.ClassesToPreserve, class) != -1 { + preservedClassName = append(preservedClassName, class) + } + } + + if len(preservedClassName) > 0 { + dom.SetAttribute(node, "class", strings.Join(preservedClassName, " ")) + } else { + dom.RemoveAttribute(node, "class") + } + + for child := dom.FirstElementChild(node); child != nil; child = dom.NextElementSibling(child) { + ps.cleanClasses(child) + } +} + +// fixRelativeURIs converts each and uri in the given element +// to an absolute URI, ignoring #ref URIs. +func (ps *Parser) fixRelativeURIs(articleContent *html.Node) { + links := ps.getAllNodesWithTag(articleContent, "a") + ps.forEachNode(links, func(link *html.Node, _ int) { + href := dom.GetAttribute(link, "href") + if href == "" { + return + } + + // Remove links with javascript: URIs, since they won't + // work after scripts have been removed from the page. + if strings.HasPrefix(href, "javascript:") { + linkChilds := dom.ChildNodes(link) + + if len(linkChilds) == 1 && linkChilds[0].Type == html.TextNode { + // If the link only contains simple text content, + // it can be converted to a text node + text := dom.CreateTextNode(dom.TextContent(link)) + dom.ReplaceChild(link.Parent, text, link) + } else { + // If the link has multiple children, they should + // all be preserved + container := dom.CreateElement("span") + for link.FirstChild != nil { + dom.AppendChild(container, link.FirstChild) + } + dom.ReplaceChild(link.Parent, container, link) + } + } else { + newHref := toAbsoluteURI(href, ps.documentURI) + if newHref == "" { + dom.RemoveAttribute(link, "href") + } else { + dom.SetAttribute(link, "href", newHref) + } + } + }) + + medias := ps.getAllNodesWithTag(articleContent, "img", "picture", "figure", "video", "audio", "source") + ps.forEachNode(medias, func(media *html.Node, _ int) { + src := dom.GetAttribute(media, "src") + poster := dom.GetAttribute(media, "poster") + srcset := dom.GetAttribute(media, "srcset") + + if src != "" { + newSrc := toAbsoluteURI(src, ps.documentURI) + dom.SetAttribute(media, "src", newSrc) + } + + if poster != "" { + newPoster := toAbsoluteURI(poster, ps.documentURI) + dom.SetAttribute(media, "poster", newPoster) + } + + if srcset != "" { + newSrcset := rxSrcsetURL.ReplaceAllStringFunc(srcset, func(s string) string { + p := rxSrcsetURL.FindStringSubmatch(s) + return toAbsoluteURI(p[1], ps.documentURI) + p[2] + p[3] + }) + + dom.SetAttribute(media, "srcset", newSrcset) + } + }) +} + +func (ps *Parser) simplifyNestedElements(articleContent *html.Node) { + node := articleContent + + for node != nil { + nodeID := dom.ID(node) + nodeTagName := dom.TagName(node) + + if node.Parent != nil && (nodeTagName == "div" || nodeTagName == "section") && + !strings.HasPrefix(nodeID, "readability") { + if ps.isElementWithoutContent(node) { + node = ps.removeAndGetNext(node) + continue + } + + if ps.hasSingleTagInsideElement(node, "div") || ps.hasSingleTagInsideElement(node, "section") { + child := dom.Children(node)[0] + for _, attr := range node.Attr { + dom.SetAttribute(child, attr.Key, attr.Val) + } + + dom.ReplaceChild(node.Parent, child, node) + node = child + continue + } + } + + node = ps.getNextNode(node, false) + } +} + +// getArticleTitle attempts to get the article title. +func (ps *Parser) getArticleTitle() string { + doc := ps.doc + curTitle := "" + origTitle := "" + titleHadHierarchicalSeparators := false + + // If they had an element with tag "title" in their HTML + if nodes := dom.GetElementsByTagName(doc, "title"); len(nodes) > 0 { + origTitle = ps.getInnerText(nodes[0], true) + curTitle = origTitle + } + + // If there's a separator in the title, first remove the final part + if rxTitleSeparator.MatchString(curTitle) { + titleHadHierarchicalSeparators = rxTitleHierarchySep.MatchString(curTitle) + curTitle = rxTitleRemoveFinalPart.ReplaceAllString(origTitle, "$1") + + // If the resulting title is too short (3 words or fewer), remove + // the first part instead: + if wordCount(curTitle) < 3 { + curTitle = rxTitleRemove1stPart.ReplaceAllString(origTitle, "$1") + } + } else if strings.Contains(curTitle, ": ") { + // Check if we have an heading containing this exact string, so + // we could assume it's the full title. + headings := ps.concatNodeLists( + dom.GetElementsByTagName(doc, "h1"), + dom.GetElementsByTagName(doc, "h2"), + ) + + trimmedTitle := strings.TrimSpace(curTitle) + match := ps.someNode(headings, func(heading *html.Node) bool { + return strings.TrimSpace(dom.TextContent(heading)) == trimmedTitle + }) + + // If we don't, let's extract the title out of the original + // title string. + if !match { + curTitle = origTitle[strings.LastIndex(origTitle, ":")+1:] + + // If the title is now too short, try the first colon instead: + if wordCount(curTitle) < 3 { + curTitle = origTitle[strings.Index(origTitle, ":")+1:] + // But if we have too many words before the colon there's + // something weird with the titles and the H tags so let's + // just use the original title instead + } else if wordCount(origTitle[:strings.Index(origTitle, ":")]) > 5 { + curTitle = origTitle + } + } + } else if charCount(curTitle) > 150 || charCount(curTitle) < 15 { + if hOnes := dom.GetElementsByTagName(doc, "h1"); len(hOnes) == 1 { + curTitle = ps.getInnerText(hOnes[0], true) + } + } + + curTitle = strings.TrimSpace(curTitle) + curTitle = rxNormalize.ReplaceAllString(curTitle, " ") + // If we now have 4 words or fewer as our title, and either no + // 'hierarchical' separators (\, /, > or ») were found in the original + // title or we decreased the number of words by more than 1 word, use + // the original title. + curTitleWordCount := wordCount(curTitle) + tmpOrigTitle := rxTitleAnySeparator.ReplaceAllString(origTitle, "") + + if curTitleWordCount <= 4 && + (!titleHadHierarchicalSeparators || + curTitleWordCount != wordCount(tmpOrigTitle)-1) { + curTitle = origTitle + } + + return curTitle +} + +// prepDocument prepares the HTML document for readability to scrape it. +// This includes things like stripping javascript, CSS, and handling +// terrible markup. +func (ps *Parser) prepDocument() { + doc := ps.doc + + // ADDITIONAL, not exist in readability.js: + // Remove all comments, + ps.removeComments(doc) + + // Remove all style tags in head + ps.removeNodes(dom.GetElementsByTagName(doc, "style"), nil) + + if nodes := dom.GetElementsByTagName(doc, "body"); len(nodes) > 0 && nodes[0] != nil { + ps.replaceBrs(nodes[0]) + } + + ps.replaceNodeTags(dom.GetElementsByTagName(doc, "font"), "span") +} + +// nextNode finds the next element, starting from the given node, and +// ignoring whitespace in between. If the given node is an element, the +// same node is returned. +func (ps *Parser) nextNode(node *html.Node) *html.Node { + next := node + for next != nil && next.Type != html.ElementNode && rxWhitespace.MatchString(dom.TextContent(next)) { + next = next.NextSibling + } + return next +} + +// replaceBrs replaces 2 or more successive
with a single

. +// Whitespace between
elements are ignored. For example: +// +//

foo
bar


abc
+// +// will become: +// +//
foo
bar

abc

+func (ps *Parser) replaceBrs(elem *html.Node) { + ps.forEachNode(ps.getAllNodesWithTag(elem, "br"), func(br *html.Node, _ int) { + next := br.NextSibling + + // Whether 2 or more
elements have been found and replaced + // with a

block. + replaced := false + + // If we find a
chain, remove the
s until we hit another + // element or non-whitespace. This leaves behind the first
+ // in the chain (which will be replaced with a

later). + for { + next = ps.nextNode(next) + if next == nil || dom.TagName(next) != "br" { + break + } + + replaced = true + brSibling := next.NextSibling + next.Parent.RemoveChild(next) + next = brSibling + } + + // If we removed a
chain, replace the remaining
with a

. Add + // all sibling nodes as children of the

until we hit another
+ // chain. + if replaced { + p := dom.CreateElement("p") + dom.ReplaceChild(br.Parent, p, br) + + next = p.NextSibling + for next != nil { + // If we've hit another

, we're done adding children to this

. + if dom.TagName(next) == "br" { + nextElem := ps.nextNode(next.NextSibling) + if nextElem != nil && dom.TagName(nextElem) == "br" { + break + } + } + + if !ps.isPhrasingContent(next) { + break + } + + // Otherwise, make this node a child of the new

. + sibling := next.NextSibling + dom.AppendChild(p, next) + next = sibling + } + + for p.LastChild != nil && ps.isWhitespace(p.LastChild) { + p.RemoveChild(p.LastChild) + } + + if dom.TagName(p.Parent) == "p" { + ps.setNodeTag(p.Parent, "div") + } + } + }) +} + +// setNodeTag changes tag of the node to newTagName. +func (ps *Parser) setNodeTag(node *html.Node, newTagName string) { + if node.Type == html.ElementNode { + node.Data = newTagName + } +} + +// prepArticle prepares the article node for display. Clean out any +// inline styles, iframes, forms, strip extraneous

tags, etc. +func (ps *Parser) prepArticle(articleContent *html.Node) { + ps.cleanStyles(articleContent) + + // Check for data tables before we continue, to avoid removing + // items in those tables, which will often be isolated even + // though they're visually linked to other content-ful elements + // (text, images, etc.). + ps.markDataTables(articleContent) + + ps.fixLazyImages(articleContent) + + // Clean out junk from the article content + ps.cleanConditionally(articleContent, "form") + ps.cleanConditionally(articleContent, "fieldset") + ps.clean(articleContent, "object") + ps.clean(articleContent, "embed") + ps.clean(articleContent, "footer") + ps.clean(articleContent, "link") + ps.clean(articleContent, "aside") + + // Clean out elements have "share" in their id/class combinations + // from final top candidates, which means we don't remove the top + // candidates even they have "share". + shareElementThreshold := ps.CharThresholds + + ps.forEachNode(dom.Children(articleContent), func(topCandidate *html.Node, _ int) { + ps.cleanMatchedNodes(topCandidate, func(node *html.Node, nodeClassID string) bool { + return rxShareElements.MatchString(nodeClassID) && charCount(dom.TextContent(node)) < shareElementThreshold + }) + }) + + ps.clean(articleContent, "iframe") + ps.clean(articleContent, "input") + ps.clean(articleContent, "textarea") + ps.clean(articleContent, "select") + ps.clean(articleContent, "button") + ps.cleanHeaders(articleContent) + + // Do these last as the previous stuff may have removed junk + // that will affect these + ps.cleanConditionally(articleContent, "table") + ps.cleanConditionally(articleContent, "ul") + ps.cleanConditionally(articleContent, "div") + + // Replace H1 with H2 as H1 should be only title that is displayed separately + ps.replaceNodeTags(ps.getAllNodesWithTag(articleContent, "h1"), "h2") + + // Remove extra paragraphs + ps.removeNodes(dom.GetElementsByTagName(articleContent, "p"), func(p *html.Node) bool { + imgCount := len(dom.GetElementsByTagName(p, "img")) + embedCount := len(dom.GetElementsByTagName(p, "embed")) + objectCount := len(dom.GetElementsByTagName(p, "object")) + // At this point, nasty iframes have been removed, only + // remain embedded video ones. + iframeCount := len(dom.GetElementsByTagName(p, "iframe")) + totalCount := imgCount + embedCount + objectCount + iframeCount + + return totalCount == 0 && ps.getInnerText(p, false) == "" + }) + + ps.forEachNode(dom.GetElementsByTagName(articleContent, "br"), func(br *html.Node, _ int) { + next := ps.nextNode(br.NextSibling) + if next != nil && dom.TagName(next) == "p" { + br.Parent.RemoveChild(br) + } + }) + + // Remove single-cell tables + ps.forEachNode(dom.GetElementsByTagName(articleContent, "table"), func(table *html.Node, _ int) { + tbody := table + if ps.hasSingleTagInsideElement(table, "tbody") { + tbody = dom.FirstElementChild(table) + } + + if ps.hasSingleTagInsideElement(tbody, "tr") { + row := dom.FirstElementChild(tbody) + if ps.hasSingleTagInsideElement(row, "td") { + cell := dom.FirstElementChild(row) + + newTag := "div" + if ps.everyNode(dom.ChildNodes(cell), ps.isPhrasingContent) { + newTag = "p" + } + + ps.setNodeTag(cell, newTag) + dom.ReplaceChild(table.Parent, cell, table) + } + } + }) +} + +// initializeNode initializes a node with the readability score. +// Also checks the className/id for special names to add to its score. +func (ps *Parser) initializeNode(node *html.Node) { + contentScore := float64(ps.getClassWeight(node)) + switch dom.TagName(node) { + case "div": + contentScore += 5 + case "pre", "td", "blockquote": + contentScore += 3 + case "address", "ol", "ul", "dl", "dd", "dt", "li", "form": + contentScore -= 3 + case "h1", "h2", "h3", "h4", "h5", "h6", "th": + contentScore -= 5 + } + + ps.setContentScore(node, contentScore) +} + +// removeAndGetNext remove node and returns its next node. +func (ps *Parser) removeAndGetNext(node *html.Node) *html.Node { + nextNode := ps.getNextNode(node, true) + if node.Parent != nil { + node.Parent.RemoveChild(node) + } + return nextNode +} + +// getNextNode traverses the DOM from node to node, starting at the +// node passed in. Pass true for the second parameter to indicate +// this node itself (and its kids) are going away, and we want the +// next node over. Calling this in a loop will traverse the DOM +// depth-first. +// In Readability.js, ignoreSelfAndKids default to false. +func (ps *Parser) getNextNode(node *html.Node, ignoreSelfAndKids bool) *html.Node { + // First check for kids if those aren't being ignored + if firstChild := dom.FirstElementChild(node); !ignoreSelfAndKids && firstChild != nil { + return firstChild + } + + // Then for siblings... + if sibling := dom.NextElementSibling(node); sibling != nil { + return sibling + } + + // And finally, move up the parent chain *and* find a sibling + // (because this is depth-first traversal, we will have already + // seen the parent nodes themselves). + for { + node = node.Parent + if node == nil || dom.NextElementSibling(node) != nil { + break + } + } + + if node != nil { + return dom.NextElementSibling(node) + } + + return nil +} + +// textSimilarity compares second text to first one. 1 = same text, 0 = completely different text. +// The way it works: it splits both texts into words and then finds words that are unique in +// second text the result is given by the lower length of unique parts. +func (ps *Parser) textSimilarity(textA, textB string) float64 { + tokensA := rxTokenize.Split(strings.ToLower(textA), -1) + tokensA = strFilter(tokensA, func(s string) bool { return s != "" }) + mapTokensA := sliceToMap(tokensA...) + + tokensB := rxTokenize.Split(strings.ToLower(textB), -1) + tokensB = strFilter(tokensB, func(s string) bool { return s != "" }) + uniqueTokensB := strFilter(tokensB, func(s string) bool { + _, existInA := mapTokensA[s] + return !existInA + }) + + mergedB := strings.Join(tokensB, " ") + mergedUniqueB := strings.Join(uniqueTokensB, " ") + distanceB := float64(charCount(mergedUniqueB)) / float64(charCount(mergedB)) + + return 1 - distanceB +} + +// checkByline determines if a node is used as byline. +func (ps *Parser) checkByline(node *html.Node, matchString string) bool { + if ps.articleByline != "" { + return false + } + + rel := dom.GetAttribute(node, "rel") + itemprop := dom.GetAttribute(node, "itemprop") + nodeText := dom.TextContent(node) + if (rel == "author" || strings.Contains(itemprop, "author") || rxByline.MatchString(matchString)) && + ps.isValidByline(nodeText) { + nodeText = strings.TrimSpace(nodeText) + nodeText = strings.Join(strings.Fields(nodeText), " ") + ps.articleByline = nodeText + return true + } + + return false +} + +func (ps *Parser) getTextDensity(node *html.Node, tags ...string) float64 { + textLength := charCount(ps.getInnerText(node, true)) + if textLength == 0 { + return 0 + } + + var childrenLength int + children := ps.getAllNodesWithTag(node, tags...) + ps.forEachNode(children, func(child *html.Node, _ int) { + childrenLength += charCount(ps.getInnerText(child, true)) + }) + + return float64(childrenLength) / float64(textLength) +} + +// getNodeAncestors gets the node's direct parent and grandparents. +// In Readability.js, maxDepth default to 0. +func (ps *Parser) getNodeAncestors(node *html.Node, maxDepth int) []*html.Node { + i := 0 + var ancestors []*html.Node + + for node.Parent != nil { + i++ + ancestors = append(ancestors, node.Parent) + if maxDepth > 0 && i == maxDepth { + break + } + node = node.Parent + } + return ancestors +} + +// grabArticle uses a variety of metrics (content score, classname, +// element types), find the content that is most likely to be the +// stuff a user wants to read. Then return it wrapped up in a div. +func (ps *Parser) grabArticle() *html.Node { + ps.log("**** GRAB ARTICLE ****") + + for { + doc := dom.Clone(ps.doc, true) + + var page *html.Node + if nodes := dom.GetElementsByTagName(doc, "body"); len(nodes) > 0 { + page = nodes[0] + } + + // We can't grab an article if we don't have a page! + if page == nil { + ps.log("no body found in document, abort") + return nil + } + + // First, node prepping. Trash nodes that look cruddy (like ones + // with the class name "comment", etc), and turn divs into P + // tags where they have been used inappropriately (as in, where + // they contain no other block level elements.) + var elementsToScore []*html.Node + var node = dom.DocumentElement(doc) + shouldRemoveTitleHeader := true + + for node != nil { + matchString := dom.ClassName(node) + " " + dom.ID(node) + + if dom.TagName(node) == "html" { + ps.articleLang = dom.GetAttribute(node, "lang") + } + + if !ps.isProbablyVisible(node) { + ps.logf("removing hidden node: %q\n", matchString) + node = ps.removeAndGetNext(node) + continue + } + + // User is not able to see elements applied with both "aria-modal = true" + // and "role = dialog" + if dom.GetAttribute(node, "aria-modal") == "true" && + dom.GetAttribute(node, "role") == "dialog" { + node = ps.removeAndGetNext(node) + continue + } + + // Check to see if this node is a byline, and remove it if + // it is true. + if ps.checkByline(node, matchString) { + node = ps.removeAndGetNext(node) + continue + } + + if shouldRemoveTitleHeader && ps.headerDuplicatesTitle(node) { + ps.logf("removing header: %q duplicate of %q\n", + trim(dom.TextContent(node)), trim(ps.articleTitle)) + shouldRemoveTitleHeader = false + node = ps.removeAndGetNext(node) + continue + } + + // Remove unlikely candidates + nodeTagName := dom.TagName(node) + if ps.flags.stripUnlikelys { + if rxUnlikelyCandidates.MatchString(matchString) && + !rxOkMaybeItsACandidate.MatchString(matchString) && + !ps.hasAncestorTag(node, "table", 3, nil) && + !ps.hasAncestorTag(node, "code", 3, nil) && + nodeTagName != "body" && nodeTagName != "a" { + ps.logf("removing unlikely candidate: %q\n", matchString) + node = ps.removeAndGetNext(node) + continue + } + + role := dom.GetAttribute(node, "role") + if _, include := unlikelyRoles[role]; include { + ps.logf("removing content with role %q: %q\n", role, matchString) + node = ps.removeAndGetNext(node) + continue + } + } + + // Remove DIV, SECTION, and HEADER nodes without any + // content(e.g. text, image, video, or iframe). + switch nodeTagName { + case "div", "section", "header", + "h1", "h2", "h3", "h4", "h5", "h6": + if ps.isElementWithoutContent(node) { + node = ps.removeAndGetNext(node) + continue + } + } + + if indexOf(ps.TagsToScore, nodeTagName) != -1 { + elementsToScore = append(elementsToScore, node) + } + + // Turn all divs that don't have children block level + // elements into p's + if nodeTagName == "div" { + // Put phrasing content into paragraphs. + var p *html.Node + childNode := node.FirstChild + for childNode != nil { + nextSibling := childNode.NextSibling + if ps.isPhrasingContent(childNode) { + if p != nil { + dom.AppendChild(p, childNode) + } else if !ps.isWhitespace(childNode) { + p = dom.CreateElement("p") + dom.AppendChild(p, dom.Clone(childNode, true)) + dom.ReplaceChild(node, p, childNode) + } + } else if p != nil { + for p.LastChild != nil && ps.isWhitespace(p.LastChild) { + p.RemoveChild(p.LastChild) + } + p = nil + } + childNode = nextSibling + } + + // Sites like http://mobile.slate.com encloses each + // paragraph with a DIV element. DIVs with only a P + // element inside and no text content can be safely + // converted into plain P elements to avoid confusing + // the scoring algorithm with DIVs with are, in + // practice, paragraphs. + if ps.hasSingleTagInsideElement(node, "p") && ps.getLinkDensity(node) < 0.25 { + newNode := dom.Children(node)[0] + node, _ = dom.ReplaceChild(node.Parent, newNode, node) + elementsToScore = append(elementsToScore, node) + } else if !ps.hasChildBlockElement(node) { + ps.setNodeTag(node, "p") + elementsToScore = append(elementsToScore, node) + } + } + node = ps.getNextNode(node, false) + } + + // Loop through all paragraphs, and assign a score to them based + // on how content-y they look. Then add their score to their + // parent node. A score is determined by things like number of + // commas, class names, etc. Maybe eventually link density. + var candidates []*html.Node + ps.forEachNode(elementsToScore, func(elementToScore *html.Node, _ int) { + if elementToScore.Parent == nil || dom.TagName(elementToScore.Parent) == "" { + return + } + + // If this paragraph is less than 25 characters, don't even count it. + innerText := ps.getInnerText(elementToScore, true) + if charCount(innerText) < 25 { + return + } + + // Exclude nodes with no ancestor. + ancestors := ps.getNodeAncestors(elementToScore, 5) + if len(ancestors) == 0 { + return + } + + // Add a point for the paragraph itself as a base. + contentScore := 1 + + // Add points for any commas within this paragraph. + contentScore += strings.Count(innerText, ",") + + // For every 100 characters in this paragraph, add another point. Up to 3 points. + contentScore += int(math.Min(math.Floor(float64(charCount(innerText))/100.0), 3.0)) + + // Initialize and score ancestors. + ps.forEachNode(ancestors, func(ancestor *html.Node, level int) { + if dom.TagName(ancestor) == "" || ancestor.Parent == nil || ancestor.Parent.Type != html.ElementNode { + return + } + + if !ps.hasContentScore(ancestor) { + ps.initializeNode(ancestor) + candidates = append(candidates, ancestor) + } + + // Node score divider: + // - parent: 1 (no division) + // - grandparent: 2 + // - great grandparent+: ancestor level * 3 + scoreDivider := 1 + switch level { + case 0: + scoreDivider = 1 + case 1: + scoreDivider = 2 + default: + scoreDivider = level * 3 + } + + ancestorScore := ps.getContentScore(ancestor) + ancestorScore += float64(contentScore) / float64(scoreDivider) + ps.setContentScore(ancestor, ancestorScore) + }) + }) + + // These lines are a bit different compared to Readability.js. + // In Readability.js, they fetch NTopCandidates utilising array + // method like `splice` and `pop`. In Go, array method like that + // is not as simple, especially since we are working with pointer. + // So, here we simply sort top candidates, and limit it to + // max NTopCandidates. + + // Scale the final candidates score based on link density. Good + // content should have a relatively small link density (5% or + // less) and be mostly unaffected by this operation. + for i := 0; i < len(candidates); i++ { + candidate := candidates[i] + candidateScore := ps.getContentScore(candidate) * (1 - ps.getLinkDensity(candidate)) + ps.logf("candidate %q with score: %f\n", dom.OuterHTML(candidate), candidateScore) + ps.setContentScore(candidate, candidateScore) + } + + // After we've calculated scores, sort through all of the possible + // candidate nodes we found and find the one with the highest score. + sort.Slice(candidates, func(i int, j int) bool { + return ps.getContentScore(candidates[i]) > ps.getContentScore(candidates[j]) + }) + + var topCandidates []*html.Node + if len(candidates) > ps.NTopCandidates { + topCandidates = candidates[:ps.NTopCandidates] + } else { + topCandidates = candidates + } + + var topCandidate, parentOfTopCandidate *html.Node + neededToCreateTopCandidate := false + if len(topCandidates) > 0 { + topCandidate = topCandidates[0] + } + + // If we still have no top candidate, just use the body as a last + // resort. We also have to copy the body node so it is something + // we can modify. + if topCandidate == nil || dom.TagName(topCandidate) == "body" { + // Move all of the page's children into topCandidate + topCandidate = dom.CreateElement("div") + neededToCreateTopCandidate = true + // Move everything (not just elements, also text nodes etc.) + // into the container so we even include text directly in the body: + for page.FirstChild != nil { + ps.logf("moving child out: %q\n", dom.OuterHTML(page.FirstChild)) + dom.AppendChild(topCandidate, page.FirstChild) + } + + dom.AppendChild(page, topCandidate) + ps.initializeNode(topCandidate) + } else if topCandidate != nil { + // Find a better top candidate node if it contains (at least three) + // nodes which belong to `topCandidates` array and whose scores are + // quite closed with current `topCandidate` node. + topCandidateScore := ps.getContentScore(topCandidate) + var alternativeCandidateAncestors [][]*html.Node + for i := 1; i < len(topCandidates); i++ { + if ps.getContentScore(topCandidates[i])/topCandidateScore >= 0.75 { + topCandidateAncestors := ps.getNodeAncestors(topCandidates[i], 0) + alternativeCandidateAncestors = append(alternativeCandidateAncestors, topCandidateAncestors) + } + } + + minimumTopCandidates := 3 + if len(alternativeCandidateAncestors) >= minimumTopCandidates { + parentOfTopCandidate = topCandidate.Parent + for parentOfTopCandidate != nil && dom.TagName(parentOfTopCandidate) != "body" { + listContainingThisAncestor := 0 + for ancestorIndex := 0; ancestorIndex < len(alternativeCandidateAncestors) && listContainingThisAncestor < minimumTopCandidates; ancestorIndex++ { + if dom.IncludeNode(alternativeCandidateAncestors[ancestorIndex], parentOfTopCandidate) { + listContainingThisAncestor++ + } + } + + if listContainingThisAncestor >= minimumTopCandidates { + topCandidate = parentOfTopCandidate + break + } + + parentOfTopCandidate = parentOfTopCandidate.Parent + } + } + + if !ps.hasContentScore(topCandidate) { + ps.initializeNode(topCandidate) + } + + // Because of our bonus system, parents of candidates might + // have scores themselves. They get half of the node. There + // won't be nodes with higher scores than our topCandidate, + // but if we see the score going *up* in the first few steps * + // up the tree, that's a decent sign that there might be more + // content lurking in other places that we want to unify in. + // The sibling stuff below does some of that - but only if + // we've looked high enough up the DOM tree. + parentOfTopCandidate = topCandidate.Parent + lastScore := ps.getContentScore(topCandidate) + // The scores shouldn't get too lops. + scoreThreshold := lastScore / 3.0 + for parentOfTopCandidate != nil && dom.TagName(parentOfTopCandidate) != "body" { + if !ps.hasContentScore(parentOfTopCandidate) { + parentOfTopCandidate = parentOfTopCandidate.Parent + continue + } + + parentScore := ps.getContentScore(parentOfTopCandidate) + if parentScore < scoreThreshold { + break + } + + if parentScore > lastScore { + // Alright! We found a better parent to use. + topCandidate = parentOfTopCandidate + break + } + + lastScore = parentScore + parentOfTopCandidate = parentOfTopCandidate.Parent + } + + // If the top candidate is the only child, use parent + // instead. This will help sibling joining logic when + // adjacent content is actually located in parent's + // sibling node. + parentOfTopCandidate = topCandidate.Parent + for parentOfTopCandidate != nil && dom.TagName(parentOfTopCandidate) != "body" && len(dom.Children(parentOfTopCandidate)) == 1 { + topCandidate = parentOfTopCandidate + parentOfTopCandidate = topCandidate.Parent + } + + if !ps.hasContentScore(topCandidate) { + ps.initializeNode(topCandidate) + } + } + + // Now that we have the top candidate, look through its siblings + // for content that might also be related. Things like preambles, + // content split by ads that we removed, etc. + articleContent := dom.CreateElement("div") + siblingScoreThreshold := math.Max(10, ps.getContentScore(topCandidate)*0.2) + + // Keep potential top candidate's parent node to try to get text direction of it later. + topCandidateScore := ps.getContentScore(topCandidate) + topCandidateClassName := dom.ClassName(topCandidate) + + parentOfTopCandidate = topCandidate.Parent + siblings := dom.Children(parentOfTopCandidate) + for s := 0; s < len(siblings); s++ { + sibling := siblings[s] + appendNode := false + + if sibling == topCandidate { + appendNode = true + } else { + contentBonus := float64(0) + + // Give a bonus if sibling nodes and top candidates have the example same classname + if dom.ClassName(sibling) == topCandidateClassName && topCandidateClassName != "" { + contentBonus += topCandidateScore * 0.2 + } + + if ps.hasContentScore(sibling) && ps.getContentScore(sibling)+contentBonus >= siblingScoreThreshold { + appendNode = true + } else if dom.TagName(sibling) == "p" { + linkDensity := ps.getLinkDensity(sibling) + nodeContent := ps.getInnerText(sibling, true) + nodeLength := charCount(nodeContent) + + if nodeLength > 80 && linkDensity < 0.25 { + appendNode = true + } else if nodeLength < 80 && nodeLength > 0 && linkDensity == 0 && + rxSentencePeriod.MatchString(nodeContent) { + appendNode = true + } + } + } + + if appendNode { + // We have a node that isn't a common block level + // element, like a form or td tag. Turn it into a div + // so it doesn't get filtered out later by accident. + if indexOf(alterToDivExceptions, dom.TagName(sibling)) == -1 { + ps.setNodeTag(sibling, "div") + } + + dom.AppendChild(articleContent, sibling) + + // TODO: + // this line is implemented in Readability.js, however + // it doesn't seem to be useful for our port. + // siblings = dom.Children(parentOfTopCandidate) + } + } + + // So we have all of the content that we need. Now we clean + // it up for presentation. + ps.prepArticle(articleContent) + + if neededToCreateTopCandidate { + // We already created a fake div thing, and there wouldn't + // have been any siblings left for the previous loop, so + // there's no point trying to create a new div, and then + // move all the children over. Just assign IDs and class + // names here. No need to append because that already + // happened anyway. + // + // By the way, this line is different with Readability.js. + // In Readability.js, when using `appendChild`, the node is + // still referenced. Meanwhile here, our `appendChild` will + // clone the node, put it in the new place, then delete + // the original. + firstChild := dom.FirstElementChild(articleContent) + if firstChild != nil && dom.TagName(firstChild) == "div" { + dom.SetAttribute(firstChild, "id", "readability-page-1") + dom.SetAttribute(firstChild, "class", "page") + } + } else { + div := dom.CreateElement("div") + dom.SetAttribute(div, "id", "readability-page-1") + dom.SetAttribute(div, "class", "page") + for articleContent.FirstChild != nil { + dom.AppendChild(div, articleContent.FirstChild) + } + dom.AppendChild(articleContent, div) + } + + parseSuccessful := true + + // Now that we've gone through the full algorithm, check to + // see if we got any meaningful content. If we didn't, we may + // need to re-run grabArticle with different flags set. This + // gives us a higher likelihood of finding the content, and + // the sieve approach gives us a higher likelihood of + // finding the -right- content. + textLength := charCount(ps.getInnerText(articleContent, true)) + if textLength < ps.CharThresholds { + parseSuccessful = false + + if ps.flags.stripUnlikelys { + ps.flags.stripUnlikelys = false + ps.attempts = append(ps.attempts, parseAttempt{ + articleContent: articleContent, + textLength: textLength, + }) + } else if ps.flags.useWeightClasses { + ps.flags.useWeightClasses = false + ps.attempts = append(ps.attempts, parseAttempt{ + articleContent: articleContent, + textLength: textLength, + }) + } else if ps.flags.cleanConditionally { + ps.flags.cleanConditionally = false + ps.attempts = append(ps.attempts, parseAttempt{ + articleContent: articleContent, + textLength: textLength, + }) + } else { + ps.attempts = append(ps.attempts, parseAttempt{ + articleContent: articleContent, + textLength: textLength, + }) + + // No luck after removing flags, just return the + // longest text we found during the different loops * + sort.Slice(ps.attempts, func(i, j int) bool { + return ps.attempts[i].textLength > ps.attempts[j].textLength + }) + + // But first check if we actually have something + if ps.attempts[0].textLength == 0 { + return nil + } + + articleContent = ps.attempts[0].articleContent + parseSuccessful = true + } + } + + if parseSuccessful { + return articleContent + } + } +} + +// isValidByline checks whether the input string could be a byline. +// This verifies that the input is a string, and that the length +// is less than 100 chars. +func (ps *Parser) isValidByline(byline string) bool { + byline = strings.TrimSpace(byline) + nChar := charCount(byline) + return nChar > 0 && nChar < 100 +} + +// getJSONLD try to extract metadata from JSON-LD object. +// For now, only Schema.org objects of type Article or its subtypes are supported. +func (ps *Parser) getJSONLD() (map[string]string, error) { + var metadata map[string]string + + scripts := dom.QuerySelectorAll(ps.doc, `script[type="application/ld+json"]`) + ps.forEachNode(scripts, func(jsonLdElement *html.Node, _ int) { + if metadata != nil { + return + } + + // Strip CDATA markers if present + content := rxCDATA.ReplaceAllString(dom.TextContent(jsonLdElement), "") + + // Decode JSON + var parsed map[string]interface{} + err := json.Unmarshal([]byte(content), &parsed) + if err != nil { + ps.logf("error while decoding json: %v", err) + return + } + + // Check context + strContext, isString := parsed["@context"].(string) + if !isString || !rxSchemaOrg.MatchString(strContext) { + return + } + + // If parsed doesn't have any @type, find it in its graph list + if _, typeExist := parsed["@type"]; !typeExist { + graphList, isArray := parsed["@graph"].([]interface{}) + if !isArray { + return + } + + for _, graph := range graphList { + objGraph, isObj := graph.(map[string]interface{}) + if !isObj { + continue + } + + strType, isString := objGraph["@type"].(string) + if isString && rxJsonLdArticleTypes.MatchString(strType) { + parsed = objGraph + break + } + } + } + + // Once again, make sure parsed has valid @type + strType, isString := parsed["@type"].(string) + if !isString || !rxJsonLdArticleTypes.MatchString(strType) { + return + } + + // Initiate metadata + metadata = make(map[string]string) + + // Title + name, nameIsString := parsed["name"].(string) + headline, headlineIsString := parsed["headline"].(string) + + if nameIsString && headlineIsString && name != headline { + // We have both name and headline element in the JSON-LD. They should both be the same + // but some websites like aktualne.cz put their own name into "name" and the article + // title to "headline" which confuses Readability. So we try to check if either "name" + // or "headline" closely matches the html title, and if so, use that one. If not, then + // we use "name" by default. + title := ps.getArticleTitle() + nameMatches := ps.textSimilarity(name, title) > 0.75 + headlineMatches := ps.textSimilarity(headline, title) > 0.75 + + if headlineMatches && !nameMatches { + metadata["title"] = headline + } else { + metadata["title"] = name + } + } else if name, isString := parsed["name"].(string); isString { + metadata["title"] = strings.TrimSpace(name) + } else if headline, isString := parsed["headline"].(string); isString { + metadata["title"] = strings.TrimSpace(headline) + } + + // Author + switch val := parsed["author"].(type) { + case map[string]interface{}: + if name, isString := val["name"].(string); isString { + metadata["byline"] = strings.TrimSpace(name) + } + + case []interface{}: + var authors []string + for _, author := range val { + objAuthor, isObj := author.(map[string]interface{}) + if !isObj { + continue + } + + if name, isString := objAuthor["name"].(string); isString { + authors = append(authors, strings.TrimSpace(name)) + } + } + metadata["byline"] = strings.Join(authors, ", ") + } + + // Description + if description, isString := parsed["description"].(string); isString { + metadata["excerpt"] = strings.TrimSpace(description) + } + + // Publisher + if objPublisher, isObj := parsed["publisher"].(map[string]interface{}); isObj { + if name, isString := objPublisher["name"].(string); isString { + metadata["siteName"] = strings.TrimSpace(name) + } + } + }) + + return metadata, nil +} + +// getArticleMetadata attempts to get excerpt and byline +// metadata for the article. +func (ps *Parser) getArticleMetadata(jsonLd map[string]string) map[string]string { + values := make(map[string]string) + metaElements := dom.GetElementsByTagName(ps.doc, "meta") + + // Find description tags. + ps.forEachNode(metaElements, func(element *html.Node, _ int) { + elementName := dom.GetAttribute(element, "name") + elementProperty := dom.GetAttribute(element, "property") + content := dom.GetAttribute(element, "content") + if content == "" { + return + } + matches := []string{} + name := "" + + if elementProperty != "" { + matches = rxPropertyPattern.FindAllString(elementProperty, -1) + for i := len(matches) - 1; i >= 0; i-- { + // Convert to lowercase, and remove any whitespace + // so we can match belops. + name = strings.ToLower(matches[i]) + name = strings.Join(strings.Fields(name), "") + // multiple authors + values[name] = strings.TrimSpace(content) + } + } + + if len(matches) == 0 && elementName != "" && rxNamePattern.MatchString(elementName) { + // Convert to lowercase, remove any whitespace, and convert + // dots to colons so we can match belops. + name = strings.ToLower(elementName) + name = strings.Join(strings.Fields(name), "") + name = strings.Replace(name, ".", ":", -1) + values[name] = strings.TrimSpace(content) + } + }) + + // get title + metadataTitle := strOr( + jsonLd["title"], + values["dc:title"], + values["dcterm:title"], + values["og:title"], + values["weibo:article:title"], + values["weibo:webpage:title"], + values["title"], + values["twitter:title"]) + + if metadataTitle == "" { + metadataTitle = ps.getArticleTitle() + } + + // get author + metadataByline := strOr( + jsonLd["byline"], + values["dc:creator"], + values["dcterm:creator"], + values["author"]) + + // get description + metadataExcerpt := strOr( + jsonLd["excerpt"], + values["dc:description"], + values["dcterm:description"], + values["og:description"], + values["weibo:article:description"], + values["weibo:webpage:description"], + values["description"], + values["twitter:description"]) + + // get site name + metadataSiteName := strOr(jsonLd["siteName"], values["og:site_name"]) + + // get image thumbnail + metadataImage := strOr( + values["og:image"], + values["image"], + values["twitter:image"]) + + // get favicon + metadataFavicon := ps.getArticleFavicon() + + // in many sites the meta value is escaped with HTML entities, + // so here we need to unescape it + metadataTitle = shtml.UnescapeString(metadataTitle) + metadataByline = shtml.UnescapeString(metadataByline) + metadataExcerpt = shtml.UnescapeString(metadataExcerpt) + metadataSiteName = shtml.UnescapeString(metadataSiteName) + + return map[string]string{ + "title": metadataTitle, + "byline": metadataByline, + "excerpt": metadataExcerpt, + "siteName": metadataSiteName, + "image": metadataImage, + "favicon": metadataFavicon, + } +} + +// isSingleImage checks if node is image, or if node contains exactly +// only one image whether as a direct child or as its descendants. +func (ps *Parser) isSingleImage(node *html.Node) bool { + if dom.TagName(node) == "img" { + return true + } + + children := dom.Children(node) + textContent := dom.TextContent(node) + if len(children) != 1 || strings.TrimSpace(textContent) != "" { + return false + } + + return ps.isSingleImage(children[0]) +} + +// unwrapNoscriptImages finds all


. +func (ps *Parser) isElementWithoutContent(node *html.Node) bool { + brs := dom.GetElementsByTagName(node, "br") + hrs := dom.GetElementsByTagName(node, "hr") + childs := dom.Children(node) + + return node.Type == html.ElementNode && + strings.TrimSpace(dom.TextContent(node)) == "" && + (len(childs) == 0 || len(childs) == len(brs)+len(hrs)) +} + +// hasChildBlockElement determines whether element has any children +// block level elements. +func (ps *Parser) hasChildBlockElement(element *html.Node) bool { + return ps.someNode(dom.ChildNodes(element), func(node *html.Node) bool { + _, exist := divToPElems[dom.TagName(node)] + return exist || ps.hasChildBlockElement(node) + }) +} + +// isPhrasingContent determines if a node qualifies as phrasing content. +func (ps *Parser) isPhrasingContent(node *html.Node) bool { + nodeTagName := dom.TagName(node) + return node.Type == html.TextNode || indexOf(phrasingElems, nodeTagName) != -1 || + ((nodeTagName == "a" || nodeTagName == "del" || nodeTagName == "ins") && + ps.everyNode(dom.ChildNodes(node), ps.isPhrasingContent)) +} + +// isWhitespace determines if a node only used as whitespace. +func (ps *Parser) isWhitespace(node *html.Node) bool { + return (node.Type == html.TextNode && strings.TrimSpace(dom.TextContent(node)) == "") || + (node.Type == html.ElementNode && dom.TagName(node) == "br") +} + +// getInnerText gets the inner text of a node. +// This also strips * out any excess whitespace to be found. +// In Readability.js, normalizeSpaces default to true. +func (ps *Parser) getInnerText(node *html.Node, normalizeSpaces bool) string { + textContent := strings.TrimSpace(dom.TextContent(node)) + if normalizeSpaces { + textContent = rxNormalize.ReplaceAllString(textContent, " ") + } + return textContent +} + +// getCharCount returns the number of times a string s +// appears in the node. +func (ps *Parser) getCharCount(node *html.Node, s string) int { + innerText := ps.getInnerText(node, true) + return strings.Count(innerText, s) +} + +// cleanStyles removes the style attribute on every node and under. +func (ps *Parser) cleanStyles(node *html.Node) { + nodeTagName := dom.TagName(node) + if node == nil || nodeTagName == "svg" { + return + } + + // Remove `style` and deprecated presentational attributes + for i := 0; i < len(presentationalAttributes); i++ { + dom.RemoveAttribute(node, presentationalAttributes[i]) + } + + if indexOf(deprecatedSizeAttributeElems, nodeTagName) != -1 { + dom.RemoveAttribute(node, "width") + dom.RemoveAttribute(node, "height") + } + + for child := dom.FirstElementChild(node); child != nil; child = dom.NextElementSibling(child) { + ps.cleanStyles(child) + } +} + +// getLinkDensity gets the density of links as a percentage of the +// content. This is the amount of text that is inside a link divided +// by the total text in the node. +func (ps *Parser) getLinkDensity(element *html.Node) float64 { + textLength := charCount(ps.getInnerText(element, true)) + if textLength == 0 { + return 0 + } + + var linkLength float64 + ps.forEachNode(dom.GetElementsByTagName(element, "a"), func(linkNode *html.Node, _ int) { + href := dom.GetAttribute(linkNode, "href") + href = strings.TrimSpace(href) + + coefficient := 1.0 + if href != "" && rxHashURL.MatchString(href) { + coefficient = 0.3 + } + + nodeLength := charCount(ps.getInnerText(linkNode, true)) + linkLength += float64(nodeLength) * coefficient + }) + + return linkLength / float64(textLength) +} + +// getClassWeight gets an elements class/id weight. Uses regular +// expressions to tell if this element looks good or bad. +func (ps *Parser) getClassWeight(node *html.Node) int { + if !ps.flags.useWeightClasses { + return 0 + } + + weight := 0 + + // Look for a special classname + if nodeClassName := dom.ClassName(node); nodeClassName != "" { + if rxNegative.MatchString(nodeClassName) { + weight -= 25 + } + + if rxPositive.MatchString(nodeClassName) { + weight += 25 + } + } + + // Look for a special ID + if nodeID := dom.ID(node); nodeID != "" { + if rxNegative.MatchString(nodeID) { + weight -= 25 + } + + if rxPositive.MatchString(nodeID) { + weight += 25 + } + } + + return weight +} + +// clean cleans a node of all elements of type "tag". +// (Unless it's a youtube/vimeo video. People love movies.) +func (ps *Parser) clean(node *html.Node, tag string) { + isEmbed := indexOf([]string{"object", "embed", "iframe"}, tag) != -1 + rxVideoVilter := ps.AllowedVideoRegex + if rxVideoVilter == nil { + rxVideoVilter = rxVideosx + } + + ps.removeNodes(dom.GetElementsByTagName(node, tag), func(element *html.Node) bool { + // Allow youtube and vimeo videos through as people usually want to see those. + if isEmbed { + // First, check the elements attributes to see if any of them contain + // youtube or vimeo + for _, attr := range element.Attr { + if rxVideoVilter.MatchString(attr.Val) { + return false + } + } + + // For embed with tag, check inner HTML as well. + if dom.TagName(element) == "object" && rxVideoVilter.MatchString(dom.InnerHTML(element)) { + return false + } + } + return true + }) +} + +// hasAncestorTag checks if a given node has one of its ancestor tag +// name matching the provided one. In Readability.js, default value +// for maxDepth is 3. +func (ps *Parser) hasAncestorTag(node *html.Node, tag string, maxDepth int, filterFn func(*html.Node) bool) bool { + depth := 0 + for node.Parent != nil { + if maxDepth > 0 && depth > maxDepth { + return false + } + + if dom.TagName(node.Parent) == tag && (filterFn == nil || filterFn(node.Parent)) { + return true + } + + node = node.Parent + depth++ + } + return false +} + +// getRowAndColumnCount returns how many rows and columns this table has. +func (ps *Parser) getRowAndColumnCount(table *html.Node) (int, int) { + rows := 0 + columns := 0 + trs := dom.GetElementsByTagName(table, "tr") + for i := 0; i < len(trs); i++ { + strRowSpan := dom.GetAttribute(trs[i], "rowspan") + rowSpan, _ := strconv.Atoi(strRowSpan) + if rowSpan == 0 { + rowSpan = 1 + } + rows += rowSpan + + // Now look for column-related info + columnsInThisRow := 0 + cells := dom.GetElementsByTagName(trs[i], "td") + for j := 0; j < len(cells); j++ { + strColSpan := dom.GetAttribute(cells[j], "colspan") + colSpan, _ := strconv.Atoi(strColSpan) + if colSpan == 0 { + colSpan = 1 + } + columnsInThisRow += colSpan + } + + if columnsInThisRow > columns { + columns = columnsInThisRow + } + } + + return rows, columns +} + +// markDataTables looks for 'data' (as opposed to 'layout') tables +// and mark it, which similar as used in Firefox: +// https://searchfox.org/mozilla-central/rev/f82d5c549f046cb64ce5602bfd894b7ae807c8f8/accessible/generic/TableAccessible.cpp#19 +func (ps *Parser) markDataTables(root *html.Node) { + tables := dom.GetElementsByTagName(root, "table") + for i := 0; i < len(tables); i++ { + table := tables[i] + + role := dom.GetAttribute(table, "role") + if role == "presentation" { + ps.setReadabilityDataTable(table, false) + continue + } + + datatable := dom.GetAttribute(table, "datatable") + if datatable == "0" { + ps.setReadabilityDataTable(table, false) + continue + } + + if dom.HasAttribute(table, "summary") { + ps.setReadabilityDataTable(table, true) + continue + } + + if captions := dom.GetElementsByTagName(table, "caption"); len(captions) > 0 { + if caption := captions[0]; caption != nil && len(dom.ChildNodes(caption)) > 0 { + ps.setReadabilityDataTable(table, true) + continue + } + } + + // If the table has a descendant with any of these tags, consider a data table: + hasDataTableDescendantTags := false + for _, descendantTag := range []string{"col", "colgroup", "tfoot", "thead", "th"} { + descendants := dom.GetElementsByTagName(table, descendantTag) + if len(descendants) > 0 && descendants[0] != nil { + hasDataTableDescendantTags = true + break + } + } + + if hasDataTableDescendantTags { + ps.setReadabilityDataTable(table, true) + continue + } + + // Nested tables indicates a layout table: + if len(dom.GetElementsByTagName(table, "table")) > 0 { + ps.setReadabilityDataTable(table, false) + continue + } + + rows, columns := ps.getRowAndColumnCount(table) + if rows >= 10 || columns > 4 { + ps.setReadabilityDataTable(table, true) + continue + } + + // Now just go by size entirely: + if rows*columns > 10 { + ps.setReadabilityDataTable(table, true) + } + } +} + +// fixLazyImages convert images and figures that have properties like data-src into +// images that can be loaded without JS. +func (ps *Parser) fixLazyImages(root *html.Node) { + imageNodes := ps.getAllNodesWithTag(root, "img", "picture", "figure") + ps.forEachNode(imageNodes, func(elem *html.Node, _ int) { + src := dom.GetAttribute(elem, "src") + srcset := dom.GetAttribute(elem, "srcset") + nodeTag := dom.TagName(elem) + nodeClass := dom.ClassName(elem) + + // In some sites (e.g. Kotaku), they put 1px square image as base64 data uri in + // the src attribute. So, here we check if the data uri is too short, just might + // as well remove it. + if src != "" && rxB64DataURL.MatchString(src) { + // Make sure it's not SVG, because SVG can have a meaningful image in + // under 133 bytes. + parts := rxB64DataURL.FindStringSubmatch(src) + if parts[1] == "image/svg+xml" { + return + } + + // Make sure this element has other attributes which contains image. + // If it doesn't, then this src is important and shouldn't be removed. + srcCouldBeRemoved := false + for _, attr := range elem.Attr { + if attr.Key == "src" { + continue + } + + if rxImgExtensions.MatchString(attr.Val) && isValidURL(attr.Val) { + srcCouldBeRemoved = true + break + } + } + + // Here we assume if image is less than 100 bytes (or 133B + // after encoded to base64) it will be too small, therefore + // it might be placeholder image. + if srcCouldBeRemoved { + b64starts := strings.Index(src, "base64") + 7 + b64length := len(src) - b64starts + if b64length < 133 { + src = "" + dom.RemoveAttribute(elem, "src") + } + } + } + + if (src != "" || srcset != "") && !strings.Contains(strings.ToLower(nodeClass), "lazy") { + return + } + + for i := 0; i < len(elem.Attr); i++ { + attr := elem.Attr[i] + if attr.Key == "src" || attr.Key == "srcset" || attr.Key == "alt" { + continue + } + + copyTo := "" + if rxLazyImageSrcset.MatchString(attr.Val) { + copyTo = "srcset" + } else if rxLazyImageSrc.MatchString(attr.Val) { + copyTo = "src" + } + + if copyTo == "" || !isValidURL(attr.Val) { + continue + } + + if nodeTag == "img" || nodeTag == "picture" { + // if this is an img or picture, set the attribute directly + dom.SetAttribute(elem, copyTo, attr.Val) + } else if nodeTag == "figure" && len(ps.getAllNodesWithTag(elem, "img", "picture")) == 0 { + // if the item is a
that does not contain an image or picture, + // create one and place it inside the figure see the nytimes-3 + // testcase for an example + img := dom.CreateElement("img") + dom.SetAttribute(img, copyTo, attr.Val) + dom.AppendChild(elem, img) + } + } + }) +} + +// cleanConditionally cleans an element of all tags of type "tag" if +// they look fishy. "Fishy" is an algorithm based on content length, +// classnames, link density, number of images & embeds, etc. +func (ps *Parser) cleanConditionally(element *html.Node, tag string) { + if !ps.flags.cleanConditionally { + return + } + + // Prepare regex video filter + rxVideoVilter := ps.AllowedVideoRegex + if rxVideoVilter == nil { + rxVideoVilter = rxVideosx + } + + // Gather counts for other typical elements embedded within. + // Traverse backwards so we can remove nodes at the same time + // without effecting the traversal. + // TODO: Consider taking into account original contentScore here. + ps.removeNodes(dom.GetElementsByTagName(element, tag), func(node *html.Node) bool { + // First check if this node IS data table, in which case don't remove it. + if tag == "table" && ps.isReadabilityDataTable(node) { + return false + } + + isList := tag == "ul" || tag == "ol" + if !isList { + var listLength int + listNodes := ps.getAllNodesWithTag(node, "ul", "ol") + ps.forEachNode(listNodes, func(list *html.Node, _ int) { + listLength += charCount(ps.getInnerText(list, true)) + }) + + nodeLength := charCount(ps.getInnerText(node, true)) + isList = float64(listLength)/float64(nodeLength) > 0.9 + } + + // Next check if we're inside a data table, in which case don't remove it as well. + if ps.hasAncestorTag(node, "table", -1, ps.isReadabilityDataTable) { + return false + } + + if ps.hasAncestorTag(node, "code", 3, nil) { + return false + } + + var contentScore int + weight := ps.getClassWeight(node) + if weight+contentScore < 0 { + return true + } + + if ps.getCharCount(node, ",") < 10 { + // If there are not very many commas, and the number of + // non-paragraph elements is more than paragraphs or other + // ominous signs, remove the element. + p := float64(len(dom.GetElementsByTagName(node, "p"))) + img := float64(len(dom.GetElementsByTagName(node, "img"))) + li := float64(len(dom.GetElementsByTagName(node, "li")) - 100) + input := float64(len(dom.GetElementsByTagName(node, "input"))) + headingDensity := ps.getTextDensity(node, "h1", "h2", "h3", "h4", "h5", "h6") + + embedCount := 0 + embeds := ps.getAllNodesWithTag(node, "object", "embed", "iframe") + + for _, embed := range embeds { + // If this embed has attribute that matches video regex, + // don't delete it. + for _, attr := range embed.Attr { + if rxVideoVilter.MatchString(attr.Val) { + return false + } + } + + // For embed with tag, check inner HTML as well. + if dom.TagName(embed) == "object" && rxVideoVilter.MatchString(dom.InnerHTML(embed)) { + return false + } + + embedCount++ + } + + linkDensity := ps.getLinkDensity(node) + contentLength := charCount(ps.getInnerText(node, true)) + haveToRemove := (img > 1 && p/img < 0.5 && !ps.hasAncestorTag(node, "figure", 3, nil)) || + (!isList && li > p) || + (input > math.Floor(p/3)) || + (!isList && headingDensity < 0.9 && contentLength < 25 && (img == 0 || img > 2) && !ps.hasAncestorTag(node, "figure", 3, nil)) || + (!isList && weight < 25 && linkDensity > 0.2) || + (weight >= 25 && linkDensity > 0.5) || + ((embedCount == 1 && contentLength < 75) || embedCount > 1) + + // Allow simple lists of images to remain in pages + if isList && haveToRemove { + for _, child := range dom.Children(node) { + // Don't filter in lists with li's that contain more than one child + if len(dom.Children(child)) > 1 { + return haveToRemove + } + } + + // Only allow the list to remain if every li contains an image + liCount := len(dom.GetElementsByTagName(node, "li")) + if int(img) == liCount { + return false + } + } + + return haveToRemove + } + + return false + }) +} + +// cleanMatchedNodes cleans out elements whose id/class +// combinations match specific string. +func (ps *Parser) cleanMatchedNodes(e *html.Node, filter func(*html.Node, string) bool) { + endOfSearchMarkerNode := ps.getNextNode(e, true) + next := ps.getNextNode(e, false) + for next != nil && next != endOfSearchMarkerNode { + if filter != nil && filter(next, dom.ClassName(next)+" "+dom.ID(next)) { + next = ps.removeAndGetNext(next) + } else { + next = ps.getNextNode(next, false) + } + } +} + +// cleanHeaders cleans out spurious headers from an Element. +func (ps *Parser) cleanHeaders(e *html.Node) { + headingNodes := ps.getAllNodesWithTag(e, "h1", "h2") + ps.removeNodes(headingNodes, func(node *html.Node) bool { + // Removing header with low class weight + if ps.getClassWeight(node) < 0 { + ps.logf("removing header with low class weight: %q\n", dom.OuterHTML(node)) + return true + } + return false + }) +} + +// headerDuplicateTitle check if this node is an H1 or H2 element whose content +// is mostly the same as the article title. +func (ps *Parser) headerDuplicatesTitle(node *html.Node) bool { + if tag := dom.TagName(node); tag != "h1" && tag != "h2" { + return false + } + + heading := ps.getInnerText(node, false) + ps.logf("evaluating similarity of header: %q and %q\n", heading, ps.articleTitle) + return ps.textSimilarity(ps.articleTitle, heading) > 0.75 +} + +// isProbablyVisible determines if a node is visible. +func (ps *Parser) isProbablyVisible(node *html.Node) bool { + nodeStyle := dom.GetAttribute(node, "style") + nodeAriaHidden := dom.GetAttribute(node, "aria-hidden") + className := dom.GetAttribute(node, "class") + + // Have to null-check node.style and node.className.indexOf to deal + // with SVG and MathML nodes. Also check for "fallback-image" so that + // Wikimedia Math images are displayed + return (nodeStyle == "" || !rxDisplayNone.MatchString(nodeStyle)) && + !dom.HasAttribute(node, "hidden") && + (nodeAriaHidden == "" || nodeAriaHidden != "true" || strings.Contains(className, "fallback-image")) +} + +// ====================== INFORMATION ====================== +// Methods below these point are not exist in Readability.js. +// They are only used as workaround since Readability.js is +// written in JS which is a dynamic language, while this +// package is written in Go, which is static. +// ========================================================= + +// getArticleFavicon attempts to get high quality favicon +// that used in article. It will only pick favicon in PNG +// format, so small favicon that uses ico file won't be picked. +// Using algorithm by philippe_b. +func (ps *Parser) getArticleFavicon() string { + favicon := "" + faviconSize := -1 + linkElements := dom.GetElementsByTagName(ps.doc, "link") + + ps.forEachNode(linkElements, func(link *html.Node, _ int) { + linkRel := strings.TrimSpace(dom.GetAttribute(link, "rel")) + linkType := strings.TrimSpace(dom.GetAttribute(link, "type")) + linkHref := strings.TrimSpace(dom.GetAttribute(link, "href")) + linkSizes := strings.TrimSpace(dom.GetAttribute(link, "sizes")) + + if linkHref == "" || !strings.Contains(linkRel, "icon") { + return + } + + if linkType != "image/png" && !strings.Contains(linkHref, ".png") { + return + } + + size := 0 + for _, sizesLocation := range []string{linkSizes, linkHref} { + sizeParts := rxFaviconSize.FindStringSubmatch(sizesLocation) + if len(sizeParts) != 3 || sizeParts[1] != sizeParts[2] { + continue + } + + size, _ = strconv.Atoi(sizeParts[1]) + break + } + + if size > faviconSize { + faviconSize = size + favicon = linkHref + } + }) + + return toAbsoluteURI(favicon, ps.documentURI) +} + +// removeComments find all comments in document then remove it. +func (ps *Parser) removeComments(doc *html.Node) { + // Find all comments + var comments []*html.Node + var finder func(*html.Node) + + finder = func(node *html.Node) { + if node.Type == html.CommentNode { + comments = append(comments, node) + } + + for child := node.FirstChild; child != nil; child = child.NextSibling { + finder(child) + } + } + + for child := doc.FirstChild; child != nil; child = child.NextSibling { + finder(child) + } + + // Remove it + ps.removeNodes(comments, nil) +} + +// In dynamic language like JavaScript, we can easily add new +// property to an existing object by simply writing : +// +// obj.newProperty = newValue +// +// This is extensively used in Readability.js to save readability +// content score; and to mark whether a table is data container or +// only used for layout. +// +// However, since Go is static typed, we can't do it that way. +// As workaround, we just saved those data as attribute in the +// HTML nodes. Hence why these methods exists. + +// setReadabilityDataTable marks whether a Node is data table or not. +func (ps *Parser) setReadabilityDataTable(node *html.Node, isDataTable bool) { + if isDataTable { + dom.SetAttribute(node, "data-readability-table", "true") + } else { + dom.RemoveAttribute(node, "data-readability-table") + } +} + +// isReadabilityDataTable determines if node is data table. +func (ps *Parser) isReadabilityDataTable(node *html.Node) bool { + return dom.HasAttribute(node, "data-readability-table") +} + +// setContentScore sets the readability score for a node. +func (ps *Parser) setContentScore(node *html.Node, score float64) { + dom.SetAttribute(node, "data-readability-score", fmt.Sprintf("%.4f", score)) +} + +// hasContentScore checks if node has readability score. +func (ps *Parser) hasContentScore(node *html.Node) bool { + return dom.HasAttribute(node, "data-readability-score") +} + +// getContentScore gets the readability score of a node. +func (ps *Parser) getContentScore(node *html.Node) float64 { + strScore := dom.GetAttribute(node, "data-readability-score") + strScore = strings.TrimSpace(strScore) + if strScore == "" { + return 0 + } + + score, _ := strconv.ParseFloat(strScore, 64) + return score +} + +// clearReadabilityAttr removes Readability attribute that +// created by this package. Used in `postProcessContent`. +func (ps *Parser) clearReadabilityAttr(node *html.Node) { + dom.RemoveAttribute(node, "data-readability-score") + dom.RemoveAttribute(node, "data-readability-table") + + for child := dom.FirstElementChild(node); child != nil; child = dom.NextElementSibling(child) { + ps.clearReadabilityAttr(child) + } +} + +func (ps *Parser) log(args ...interface{}) { + if ps.Debug { + log.Println(args...) + } +} + +func (ps *Parser) logf(format string, args ...interface{}) { + if ps.Debug { + log.Printf(format, args...) + } +} + +// UNUSED CODES +// Codes below these points are defined in original Readability.js but not used, +// so here we commented it out so it can be used later if necessary. + +// var ( +// rxExtraneous = regexp.MustCompile(`(?i)print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility`) +// rxReplaceFonts = regexp.MustCompile(`(?i)<(/?)font[^>]*>`) +// rxNextLink = regexp.MustCompile(`(?i)(next|weiter|continue|>([^\|]|$)|»([^\|]|$))`) +// rxPrevLink = regexp.MustCompile(`(?i)(prev|earl|old|new|<|«)`) +// ) + +// // findNode iterates over a NodeList and return the first node that passes +// // the supplied test function. +// func (ps *Parser) findNode(nodeList []*html.Node, fn func(*html.Node) bool) *html.Node { +// for i := 0; i < len(nodeList); i++ { +// if fn(nodeList[i]) { +// return nodeList[i] +// } +// } +// return nil +// } diff --git a/vendor/github.com/go-shiori/go-readability/readability.go b/vendor/github.com/go-shiori/go-readability/readability.go new file mode 100644 index 00000000..0a9f23bb --- /dev/null +++ b/vendor/github.com/go-shiori/go-readability/readability.go @@ -0,0 +1,77 @@ +// Package readability is a Go package that find the main readable +// content from a HTML page. It works by removing clutter like buttons, +// ads, background images, script, etc. +// +// This package is based from Readability.js by Mozilla, and written line +// by line to make sure it looks and works as similar as possible. This +// way, hopefully all web page that can be parsed by Readability.js +// are parse-able by go-readability as well. +package readability + +import ( + "fmt" + "io" + "net/http" + nurl "net/url" + "strings" + "time" + + "golang.org/x/net/html" +) + +// FromReader parses an `io.Reader` and returns the readable content. It's the wrapper +// or `Parser.Parse()` and useful if you only want to use the default parser. +func FromReader(input io.Reader, pageURL *nurl.URL) (Article, error) { + parser := NewParser() + return parser.Parse(input, pageURL) +} + +// FromDocument parses an document and returns the readable content. It's the wrapper +// or `Parser.ParseDocument()` and useful if you only want to use the default parser. +func FromDocument(doc *html.Node, pageURL *nurl.URL) (Article, error) { + parser := NewParser() + return parser.ParseDocument(doc, pageURL) +} + +// FromURL fetch the web page from specified url then parses the response to find +// the readable content. +func FromURL(pageURL string, timeout time.Duration) (Article, error) { + // Make sure URL is valid + parsedURL, err := nurl.ParseRequestURI(pageURL) + if err != nil { + return Article{}, fmt.Errorf("failed to parse URL: %v", err) + } + + // Fetch page from URL + client := &http.Client{Timeout: timeout} + resp, err := client.Get(pageURL) + if err != nil { + return Article{}, fmt.Errorf("failed to fetch the page: %v", err) + } + defer resp.Body.Close() + + // Make sure content type is HTML + cp := resp.Header.Get("Content-Type") + if !strings.Contains(cp, "text/html") { + return Article{}, fmt.Errorf("URL is not a HTML document") + } + + // Parse content + parser := NewParser() + return parser.Parse(resp.Body, parsedURL) +} + +// Check checks whether the input is readable without parsing the whole thing. It's the +// wrapper for `Parser.Check()` and useful if you only use the default parser. +func Check(input io.Reader) bool { + parser := NewParser() + return parser.Check(input) +} + +// CheckDocument checks whether the document is readable without parsing the whole thing. +// It's the wrapper for `Parser.CheckDocument()` and useful if you only use the default +// parser. +func CheckDocument(doc *html.Node) bool { + parser := NewParser() + return parser.CheckDocument(doc) +} diff --git a/vendor/github.com/go-shiori/go-readability/utils.go b/vendor/github.com/go-shiori/go-readability/utils.go new file mode 100644 index 00000000..3c24de8c --- /dev/null +++ b/vendor/github.com/go-shiori/go-readability/utils.go @@ -0,0 +1,100 @@ +package readability + +import ( + nurl "net/url" + "strings" + "unicode/utf8" +) + +// indexOf returns the position of the first occurrence of a +// specified value in a string array. Returns -1 if the +// value to search for never occurs. +func indexOf(array []string, key string) int { + for i := 0; i < len(array); i++ { + if array[i] == key { + return i + } + } + return -1 +} + +// wordCount returns number of word in str. +func wordCount(str string) int { + return len(strings.Fields(str)) +} + +// charCount returns number of char in str. +func charCount(str string) int { + return utf8.RuneCountInString(str) +} + +// isValidURL checks if URL is valid. +func isValidURL(s string) bool { + _, err := nurl.ParseRequestURI(s) + return err == nil +} + +// toAbsoluteURI convert uri to absolute path based on base. +// However, if uri is prefixed with hash (#), the uri won't be changed. +func toAbsoluteURI(uri string, base *nurl.URL) string { + if uri == "" || base == nil { + return uri + } + + // If it is hash tag, return as it is + if strings.HasPrefix(uri, "#") { + return uri + } + + // If it is data URI, return as it is + if strings.HasPrefix(uri, "data:") { + return uri + } + + // If it is already an absolute URL, return as it is + tmp, err := nurl.ParseRequestURI(uri) + if err == nil && tmp.Scheme != "" && tmp.Hostname() != "" { + return uri + } + + // Otherwise, resolve against base URI. + tmp, err = nurl.Parse(uri) + if err != nil { + return uri + } + + return base.ResolveReference(tmp).String() +} + +// strOr returns the first not empty string in args. +func strOr(args ...string) string { + for i := 0; i < len(args); i++ { + if args[i] != "" { + return args[i] + } + } + return "" +} + +func sliceToMap(strings ...string) map[string]struct{} { + result := make(map[string]struct{}) + for _, s := range strings { + result[s] = struct{}{} + } + return result +} + +func strFilter(strs []string, filter func(string) bool) []string { + var result []string + for _, s := range strs { + if filter(s) { + result = append(result, s) + } + } + return result +} + +func trim(s string) string { + s = strings.Join(strings.Fields(s), " ") + return strings.TrimSpace(s) +} diff --git a/vendor/github.com/gogs/chardet/2022.go b/vendor/github.com/gogs/chardet/2022.go new file mode 100644 index 00000000..e667225e --- /dev/null +++ b/vendor/github.com/gogs/chardet/2022.go @@ -0,0 +1,102 @@ +package chardet + +import ( + "bytes" +) + +type recognizer2022 struct { + charset string + escapes [][]byte +} + +func (r *recognizer2022) Match(input *recognizerInput) (output recognizerOutput) { + return recognizerOutput{ + Charset: r.charset, + Confidence: r.matchConfidence(input.input), + } +} + +func (r *recognizer2022) matchConfidence(input []byte) int { + var hits, misses, shifts int +input: + for i := 0; i < len(input); i++ { + c := input[i] + if c == 0x1B { + for _, esc := range r.escapes { + if bytes.HasPrefix(input[i+1:], esc) { + hits++ + i += len(esc) + continue input + } + } + misses++ + } else if c == 0x0E || c == 0x0F { + shifts++ + } + } + if hits == 0 { + return 0 + } + quality := (100*hits - 100*misses) / (hits + misses) + if hits+shifts < 5 { + quality -= (5 - (hits + shifts)) * 10 + } + if quality < 0 { + quality = 0 + } + return quality +} + +var escapeSequences_2022JP = [][]byte{ + {0x24, 0x28, 0x43}, // KS X 1001:1992 + {0x24, 0x28, 0x44}, // JIS X 212-1990 + {0x24, 0x40}, // JIS C 6226-1978 + {0x24, 0x41}, // GB 2312-80 + {0x24, 0x42}, // JIS X 208-1983 + {0x26, 0x40}, // JIS X 208 1990, 1997 + {0x28, 0x42}, // ASCII + {0x28, 0x48}, // JIS-Roman + {0x28, 0x49}, // Half-width katakana + {0x28, 0x4a}, // JIS-Roman + {0x2e, 0x41}, // ISO 8859-1 + {0x2e, 0x46}, // ISO 8859-7 +} + +var escapeSequences_2022KR = [][]byte{ + {0x24, 0x29, 0x43}, +} + +var escapeSequences_2022CN = [][]byte{ + {0x24, 0x29, 0x41}, // GB 2312-80 + {0x24, 0x29, 0x47}, // CNS 11643-1992 Plane 1 + {0x24, 0x2A, 0x48}, // CNS 11643-1992 Plane 2 + {0x24, 0x29, 0x45}, // ISO-IR-165 + {0x24, 0x2B, 0x49}, // CNS 11643-1992 Plane 3 + {0x24, 0x2B, 0x4A}, // CNS 11643-1992 Plane 4 + {0x24, 0x2B, 0x4B}, // CNS 11643-1992 Plane 5 + {0x24, 0x2B, 0x4C}, // CNS 11643-1992 Plane 6 + {0x24, 0x2B, 0x4D}, // CNS 11643-1992 Plane 7 + {0x4e}, // SS2 + {0x4f}, // SS3 +} + +func newRecognizer_2022JP() *recognizer2022 { + return &recognizer2022{ + "ISO-2022-JP", + escapeSequences_2022JP, + } +} + +func newRecognizer_2022KR() *recognizer2022 { + return &recognizer2022{ + "ISO-2022-KR", + escapeSequences_2022KR, + } +} + +func newRecognizer_2022CN() *recognizer2022 { + return &recognizer2022{ + "ISO-2022-CN", + escapeSequences_2022CN, + } +} diff --git a/vendor/github.com/gogs/chardet/AUTHORS b/vendor/github.com/gogs/chardet/AUTHORS new file mode 100644 index 00000000..842d0216 --- /dev/null +++ b/vendor/github.com/gogs/chardet/AUTHORS @@ -0,0 +1 @@ +Sheng Yu (yusheng dot sjtu at gmail dot com) diff --git a/vendor/github.com/gogs/chardet/LICENSE b/vendor/github.com/gogs/chardet/LICENSE new file mode 100644 index 00000000..35ee796b --- /dev/null +++ b/vendor/github.com/gogs/chardet/LICENSE @@ -0,0 +1,22 @@ +Copyright (c) 2012 chardet Authors + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +Partial of the Software is derived from ICU project. See icu-license.html for +license of the derivative portions. diff --git a/vendor/github.com/gogs/chardet/README.md b/vendor/github.com/gogs/chardet/README.md new file mode 100644 index 00000000..f5c7cc20 --- /dev/null +++ b/vendor/github.com/gogs/chardet/README.md @@ -0,0 +1,12 @@ +# chardet + +chardet is library to automatically detect +[charset](http://en.wikipedia.org/wiki/Character_encoding) of texts for [Go +programming language](http://golang.org/). It's based on the algorithm and data +in [ICU](http://icu-project.org/)'s implementation. + +The project was created by [saintfish](http://github.com/saintfish/chardet). In January 2015 it was forked by the gogits project in order to incorporate bugfixes and new features. + +## Documentation and Usage + +See [pkgdoc](http://godoc.org/github.com/gogs/chardet) diff --git a/vendor/github.com/gogs/chardet/detector.go b/vendor/github.com/gogs/chardet/detector.go new file mode 100644 index 00000000..027e1c37 --- /dev/null +++ b/vendor/github.com/gogs/chardet/detector.go @@ -0,0 +1,147 @@ +// Package chardet ports character set detection from ICU. +package chardet + +import ( + "errors" + "sort" +) + +// Result contains all the information that charset detector gives. +type Result struct { + // IANA name of the detected charset. + Charset string + // IANA name of the detected language. It may be empty for some charsets. + Language string + // Confidence of the Result. Scale from 1 to 100. The bigger, the more confident. + Confidence int +} + +// Detector implements charset detection. +type Detector struct { + recognizers []recognizer + stripTag bool +} + +// List of charset recognizers +var recognizers = []recognizer{ + newRecognizer_utf8(), + newRecognizer_utf16be(), + newRecognizer_utf16le(), + newRecognizer_utf32be(), + newRecognizer_utf32le(), + newRecognizer_8859_1_en(), + newRecognizer_8859_1_da(), + newRecognizer_8859_1_de(), + newRecognizer_8859_1_es(), + newRecognizer_8859_1_fr(), + newRecognizer_8859_1_it(), + newRecognizer_8859_1_nl(), + newRecognizer_8859_1_no(), + newRecognizer_8859_1_pt(), + newRecognizer_8859_1_sv(), + newRecognizer_8859_2_cs(), + newRecognizer_8859_2_hu(), + newRecognizer_8859_2_pl(), + newRecognizer_8859_2_ro(), + newRecognizer_8859_5_ru(), + newRecognizer_8859_6_ar(), + newRecognizer_8859_7_el(), + newRecognizer_8859_8_I_he(), + newRecognizer_8859_8_he(), + newRecognizer_windows_1251(), + newRecognizer_windows_1256(), + newRecognizer_KOI8_R(), + newRecognizer_8859_9_tr(), + + newRecognizer_sjis(), + newRecognizer_gb_18030(), + newRecognizer_euc_jp(), + newRecognizer_euc_kr(), + newRecognizer_big5(), + + newRecognizer_2022JP(), + newRecognizer_2022KR(), + newRecognizer_2022CN(), + + newRecognizer_IBM424_he_rtl(), + newRecognizer_IBM424_he_ltr(), + newRecognizer_IBM420_ar_rtl(), + newRecognizer_IBM420_ar_ltr(), +} + +// NewTextDetector creates a Detector for plain text. +func NewTextDetector() *Detector { + return &Detector{recognizers, false} +} + +// NewHtmlDetector creates a Detector for Html. +func NewHtmlDetector() *Detector { + return &Detector{recognizers, true} +} + +var ( + NotDetectedError = errors.New("Charset not detected.") +) + +// DetectBest returns the Result with highest Confidence. +func (d *Detector) DetectBest(b []byte) (r *Result, err error) { + input := newRecognizerInput(b, d.stripTag) + outputChan := make(chan recognizerOutput) + for _, r := range d.recognizers { + go matchHelper(r, input, outputChan) + } + var output Result + for i := 0; i < len(d.recognizers); i++ { + o := <-outputChan + if output.Confidence < o.Confidence { + output = Result(o) + } + } + if output.Confidence == 0 { + return nil, NotDetectedError + } + return &output, nil +} + +// DetectAll returns all Results which have non-zero Confidence. The Results are sorted by Confidence in descending order. +func (d *Detector) DetectAll(b []byte) ([]Result, error) { + input := newRecognizerInput(b, d.stripTag) + outputChan := make(chan recognizerOutput) + for _, r := range d.recognizers { + go matchHelper(r, input, outputChan) + } + outputs := make(recognizerOutputs, 0, len(d.recognizers)) + for i := 0; i < len(d.recognizers); i++ { + o := <-outputChan + if o.Confidence > 0 { + outputs = append(outputs, o) + } + } + if len(outputs) == 0 { + return nil, NotDetectedError + } + + sort.Sort(outputs) + dedupOutputs := make([]Result, 0, len(outputs)) + foundCharsets := make(map[string]struct{}, len(outputs)) + for _, o := range outputs { + if _, found := foundCharsets[o.Charset]; !found { + dedupOutputs = append(dedupOutputs, Result(o)) + foundCharsets[o.Charset] = struct{}{} + } + } + if len(dedupOutputs) == 0 { + return nil, NotDetectedError + } + return dedupOutputs, nil +} + +func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) { + outputChan <- r.Match(input) +} + +type recognizerOutputs []recognizerOutput + +func (r recognizerOutputs) Len() int { return len(r) } +func (r recognizerOutputs) Less(i, j int) bool { return r[i].Confidence > r[j].Confidence } +func (r recognizerOutputs) Swap(i, j int) { r[i], r[j] = r[j], r[i] } diff --git a/vendor/github.com/gogs/chardet/icu-license.html b/vendor/github.com/gogs/chardet/icu-license.html new file mode 100644 index 00000000..d078d057 --- /dev/null +++ b/vendor/github.com/gogs/chardet/icu-license.html @@ -0,0 +1,51 @@ + + + + +ICU License - ICU 1.8.1 and later + + + +

ICU License - ICU 1.8.1 and later

+ +

COPYRIGHT AND PERMISSION NOTICE

+ +

+Copyright (c) 1995-2012 International Business Machines Corporation and others +

+

+All rights reserved. +

+

+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, and/or sell +copies of the Software, and to permit persons +to whom the Software is furnished to do so, provided that the above +copyright notice(s) and this permission notice appear in all copies +of the Software and that both the above copyright notice(s) and this +permission notice appear in supporting documentation. +

+

+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL +THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, +OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER +RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, +NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE +USE OR PERFORMANCE OF THIS SOFTWARE. +

+

+Except as contained in this notice, the name of a copyright holder shall not be +used in advertising or otherwise to promote the sale, use or other dealings in +this Software without prior written authorization of the copyright holder. +

+ +
+

+All trademarks and registered trademarks mentioned herein are the property of their respective owners. +

+ + diff --git a/vendor/github.com/gogs/chardet/multi_byte.go b/vendor/github.com/gogs/chardet/multi_byte.go new file mode 100644 index 00000000..b5cdf3d6 --- /dev/null +++ b/vendor/github.com/gogs/chardet/multi_byte.go @@ -0,0 +1,345 @@ +package chardet + +import ( + "errors" + "math" +) + +type recognizerMultiByte struct { + charset string + language string + decoder charDecoder + commonChars []uint16 +} + +type charDecoder interface { + DecodeOneChar([]byte) (c uint16, remain []byte, err error) +} + +func (r *recognizerMultiByte) Match(input *recognizerInput) (output recognizerOutput) { + return recognizerOutput{ + Charset: r.charset, + Language: r.language, + Confidence: r.matchConfidence(input), + } +} + +func (r *recognizerMultiByte) matchConfidence(input *recognizerInput) int { + raw := input.raw + var c uint16 + var err error + var totalCharCount, badCharCount, singleByteCharCount, doubleByteCharCount, commonCharCount int + for c, raw, err = r.decoder.DecodeOneChar(raw); len(raw) > 0; c, raw, err = r.decoder.DecodeOneChar(raw) { + totalCharCount++ + if err != nil { + badCharCount++ + } else if c <= 0xFF { + singleByteCharCount++ + } else { + doubleByteCharCount++ + if r.commonChars != nil && binarySearch(r.commonChars, c) { + commonCharCount++ + } + } + if badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount { + return 0 + } + } + + if doubleByteCharCount <= 10 && badCharCount == 0 { + if doubleByteCharCount == 0 && totalCharCount < 10 { + return 0 + } else { + return 10 + } + } + + if doubleByteCharCount < 20*badCharCount { + return 0 + } + if r.commonChars == nil { + confidence := 30 + doubleByteCharCount - 20*badCharCount + if confidence > 100 { + confidence = 100 + } + return confidence + } + maxVal := math.Log(float64(doubleByteCharCount) / 4) + scaleFactor := 90 / maxVal + confidence := int(math.Log(float64(commonCharCount)+1)*scaleFactor + 10) + if confidence > 100 { + confidence = 100 + } + if confidence < 0 { + confidence = 0 + } + return confidence +} + +func binarySearch(l []uint16, c uint16) bool { + start := 0 + end := len(l) - 1 + for start <= end { + mid := (start + end) / 2 + if c == l[mid] { + return true + } else if c < l[mid] { + end = mid - 1 + } else { + start = mid + 1 + } + } + return false +} + +var eobError = errors.New("End of input buffer") +var badCharError = errors.New("Decode a bad char") + +type charDecoder_sjis struct { +} + +func (charDecoder_sjis) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) { + if len(input) == 0 { + return 0, nil, eobError + } + first := input[0] + c = uint16(first) + remain = input[1:] + if first <= 0x7F || (first > 0xA0 && first <= 0xDF) { + return + } + if len(remain) == 0 { + return c, remain, badCharError + } + second := remain[0] + remain = remain[1:] + c = c<<8 | uint16(second) + if (second >= 0x40 && second <= 0x7F) || (second >= 0x80 && second <= 0xFE) { + } else { + err = badCharError + } + return +} + +var commonChars_sjis = []uint16{ + 0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0, + 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5, + 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc, + 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341, + 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389, + 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa, +} + +func newRecognizer_sjis() *recognizerMultiByte { + return &recognizerMultiByte{ + "Shift_JIS", + "ja", + charDecoder_sjis{}, + commonChars_sjis, + } +} + +type charDecoder_euc struct { +} + +func (charDecoder_euc) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) { + if len(input) == 0 { + return 0, nil, eobError + } + first := input[0] + remain = input[1:] + c = uint16(first) + if first <= 0x8D { + return uint16(first), remain, nil + } + if len(remain) == 0 { + return 0, nil, eobError + } + second := remain[0] + remain = remain[1:] + c = c<<8 | uint16(second) + if first >= 0xA1 && first <= 0xFE { + if second < 0xA1 { + err = badCharError + } + return + } + if first == 0x8E { + if second < 0xA1 { + err = badCharError + } + return + } + if first == 0x8F { + if len(remain) == 0 { + return 0, nil, eobError + } + third := remain[0] + remain = remain[1:] + c = c<<0 | uint16(third) + if third < 0xa1 { + err = badCharError + } + } + return +} + +var commonChars_euc_jp = []uint16{ + 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2, + 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3, + 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4, + 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de, + 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef, + 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af, + 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7, + 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1, + 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee, + 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1, +} + +var commonChars_euc_kr = []uint16{ + 0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc, + 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9, + 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce, + 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce, + 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba, + 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee, + 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7, + 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6, + 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6, + 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad, +} + +func newRecognizer_euc_jp() *recognizerMultiByte { + return &recognizerMultiByte{ + "EUC-JP", + "ja", + charDecoder_euc{}, + commonChars_euc_jp, + } +} + +func newRecognizer_euc_kr() *recognizerMultiByte { + return &recognizerMultiByte{ + "EUC-KR", + "ko", + charDecoder_euc{}, + commonChars_euc_kr, + } +} + +type charDecoder_big5 struct { +} + +func (charDecoder_big5) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) { + if len(input) == 0 { + return 0, nil, eobError + } + first := input[0] + remain = input[1:] + c = uint16(first) + if first <= 0x7F || first == 0xFF { + return + } + if len(remain) == 0 { + return c, nil, eobError + } + second := remain[0] + remain = remain[1:] + c = c<<8 | uint16(second) + if second < 0x40 || second == 0x7F || second == 0xFF { + err = badCharError + } + return +} + +var commonChars_big5 = []uint16{ + 0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446, + 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3, + 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548, + 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8, + 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da, + 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3, + 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59, + 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c, + 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44, + 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f, +} + +func newRecognizer_big5() *recognizerMultiByte { + return &recognizerMultiByte{ + "Big5", + "zh", + charDecoder_big5{}, + commonChars_big5, + } +} + +type charDecoder_gb_18030 struct { +} + +func (charDecoder_gb_18030) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) { + if len(input) == 0 { + return 0, nil, eobError + } + first := input[0] + remain = input[1:] + c = uint16(first) + if first <= 0x80 { + return + } + if len(remain) == 0 { + return 0, nil, eobError + } + second := remain[0] + remain = remain[1:] + c = c<<8 | uint16(second) + if first >= 0x81 && first <= 0xFE { + if (second >= 0x40 && second <= 0x7E) || (second >= 0x80 && second <= 0xFE) { + return + } + + if second >= 0x30 && second <= 0x39 { + if len(remain) == 0 { + return 0, nil, eobError + } + third := remain[0] + remain = remain[1:] + if third >= 0x81 && third <= 0xFE { + if len(remain) == 0 { + return 0, nil, eobError + } + fourth := remain[0] + remain = remain[1:] + if fourth >= 0x30 && fourth <= 0x39 { + c = c<<16 | uint16(third)<<8 | uint16(fourth) + return + } + } + } + err = badCharError + } + return +} + +var commonChars_gb_18030 = []uint16{ + 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac, + 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4, + 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4, + 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6, + 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6, + 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7, + 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7, + 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5, + 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2, + 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0, +} + +func newRecognizer_gb_18030() *recognizerMultiByte { + return &recognizerMultiByte{ + "GB18030", + "zh", + charDecoder_gb_18030{}, + commonChars_gb_18030, + } +} diff --git a/vendor/github.com/gogs/chardet/recognizer.go b/vendor/github.com/gogs/chardet/recognizer.go new file mode 100644 index 00000000..1bf8461c --- /dev/null +++ b/vendor/github.com/gogs/chardet/recognizer.go @@ -0,0 +1,83 @@ +package chardet + +type recognizer interface { + Match(*recognizerInput) recognizerOutput +} + +type recognizerOutput Result + +type recognizerInput struct { + raw []byte + input []byte + tagStripped bool + byteStats []int + hasC1Bytes bool +} + +func newRecognizerInput(raw []byte, stripTag bool) *recognizerInput { + input, stripped := mayStripInput(raw, stripTag) + byteStats := computeByteStats(input) + return &recognizerInput{ + raw: raw, + input: input, + tagStripped: stripped, + byteStats: byteStats, + hasC1Bytes: computeHasC1Bytes(byteStats), + } +} + +func mayStripInput(raw []byte, stripTag bool) (out []byte, stripped bool) { + const inputBufferSize = 8192 + out = make([]byte, 0, inputBufferSize) + var badTags, openTags int32 + var inMarkup bool = false + stripped = false + if stripTag { + stripped = true + for _, c := range raw { + if c == '<' { + if inMarkup { + badTags += 1 + } + inMarkup = true + openTags += 1 + } + if !inMarkup { + out = append(out, c) + if len(out) >= inputBufferSize { + break + } + } + if c == '>' { + inMarkup = false + } + } + } + if openTags < 5 || openTags/5 < badTags || (len(out) < 100 && len(raw) > 600) { + limit := len(raw) + if limit > inputBufferSize { + limit = inputBufferSize + } + out = make([]byte, limit) + copy(out, raw[:limit]) + stripped = false + } + return +} + +func computeByteStats(input []byte) []int { + r := make([]int, 256) + for _, c := range input { + r[c] += 1 + } + return r +} + +func computeHasC1Bytes(byteStats []int) bool { + for _, count := range byteStats[0x80 : 0x9F+1] { + if count > 0 { + return true + } + } + return false +} diff --git a/vendor/github.com/gogs/chardet/single_byte.go b/vendor/github.com/gogs/chardet/single_byte.go new file mode 100644 index 00000000..a7ce39bc --- /dev/null +++ b/vendor/github.com/gogs/chardet/single_byte.go @@ -0,0 +1,882 @@ +package chardet + +// Recognizer for single byte charset family +type recognizerSingleByte struct { + charset string + hasC1ByteCharset string + language string + charMap *[256]byte + ngram *[64]uint32 +} + +func (r *recognizerSingleByte) Match(input *recognizerInput) recognizerOutput { + var charset string = r.charset + if input.hasC1Bytes && len(r.hasC1ByteCharset) > 0 { + charset = r.hasC1ByteCharset + } + return recognizerOutput{ + Charset: charset, + Language: r.language, + Confidence: r.parseNgram(input.input), + } +} + +type ngramState struct { + ngram uint32 + ignoreSpace bool + ngramCount, ngramHit uint32 + table *[64]uint32 +} + +func newNgramState(table *[64]uint32) *ngramState { + return &ngramState{ + ngram: 0, + ignoreSpace: false, + ngramCount: 0, + ngramHit: 0, + table: table, + } +} + +func (s *ngramState) AddByte(b byte) { + const ngramMask = 0xFFFFFF + if !(b == 0x20 && s.ignoreSpace) { + s.ngram = ((s.ngram << 8) | uint32(b)) & ngramMask + s.ignoreSpace = (s.ngram == 0x20) + s.ngramCount++ + if s.lookup() { + s.ngramHit++ + } + } + s.ignoreSpace = (b == 0x20) +} + +func (s *ngramState) HitRate() float32 { + if s.ngramCount == 0 { + return 0 + } + return float32(s.ngramHit) / float32(s.ngramCount) +} + +func (s *ngramState) lookup() bool { + var index int + if s.table[index+32] <= s.ngram { + index += 32 + } + if s.table[index+16] <= s.ngram { + index += 16 + } + if s.table[index+8] <= s.ngram { + index += 8 + } + if s.table[index+4] <= s.ngram { + index += 4 + } + if s.table[index+2] <= s.ngram { + index += 2 + } + if s.table[index+1] <= s.ngram { + index += 1 + } + if s.table[index] > s.ngram { + index -= 1 + } + if index < 0 || s.table[index] != s.ngram { + return false + } + return true +} + +func (r *recognizerSingleByte) parseNgram(input []byte) int { + state := newNgramState(r.ngram) + for _, inChar := range input { + c := r.charMap[inChar] + if c != 0 { + state.AddByte(c) + } + } + state.AddByte(0x20) + rate := state.HitRate() + if rate > 0.33 { + return 98 + } + return int(rate * 300) +} + +var charMap_8859_1 = [256]byte{ + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20, + 0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, + 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, + 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, +} + +var ngrams_8859_1_en = [64]uint32{ + 0x206120, 0x20616E, 0x206265, 0x20636F, 0x20666F, 0x206861, 0x206865, 0x20696E, 0x206D61, 0x206F66, 0x207072, 0x207265, 0x207361, 0x207374, 0x207468, 0x20746F, + 0x207768, 0x616964, 0x616C20, 0x616E20, 0x616E64, 0x617320, 0x617420, 0x617465, 0x617469, 0x642061, 0x642074, 0x652061, 0x652073, 0x652074, 0x656420, 0x656E74, + 0x657220, 0x657320, 0x666F72, 0x686174, 0x686520, 0x686572, 0x696420, 0x696E20, 0x696E67, 0x696F6E, 0x697320, 0x6E2061, 0x6E2074, 0x6E6420, 0x6E6720, 0x6E7420, + 0x6F6620, 0x6F6E20, 0x6F7220, 0x726520, 0x727320, 0x732061, 0x732074, 0x736169, 0x737420, 0x742074, 0x746572, 0x746861, 0x746865, 0x74696F, 0x746F20, 0x747320, +} + +var ngrams_8859_1_da = [64]uint32{ + 0x206166, 0x206174, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207369, 0x207374, 0x207469, 0x207669, 0x616620, + 0x616E20, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646572, 0x646574, 0x652073, 0x656420, 0x656465, 0x656E20, 0x656E64, 0x657220, 0x657265, 0x657320, + 0x657420, 0x666F72, 0x676520, 0x67656E, 0x676572, 0x696765, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6572, 0x6C6967, 0x6C6C65, 0x6D6564, 0x6E6465, 0x6E6520, + 0x6E6720, 0x6E6765, 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722064, 0x722065, 0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696C, 0x766572, +} + +var ngrams_8859_1_de = [64]uint32{ + 0x20616E, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 0x206765, 0x206861, 0x20696E, 0x206D69, 0x207363, 0x207365, 0x20756E, 0x207665, 0x20766F, + 0x207765, 0x207A75, 0x626572, 0x636820, 0x636865, 0x636874, 0x646173, 0x64656E, 0x646572, 0x646965, 0x652064, 0x652073, 0x65696E, 0x656974, 0x656E20, 0x657220, + 0x657320, 0x67656E, 0x68656E, 0x687420, 0x696368, 0x696520, 0x696E20, 0x696E65, 0x697420, 0x6C6963, 0x6C6C65, 0x6E2061, 0x6E2064, 0x6E2073, 0x6E6420, 0x6E6465, + 0x6E6520, 0x6E6720, 0x6E6765, 0x6E7465, 0x722064, 0x726465, 0x726569, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x756E64, 0x756E67, 0x766572, +} + +var ngrams_8859_1_es = [64]uint32{ + 0x206120, 0x206361, 0x20636F, 0x206465, 0x20656C, 0x20656E, 0x206573, 0x20696E, 0x206C61, 0x206C6F, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365, + 0x20756E, 0x207920, 0x612063, 0x612064, 0x612065, 0x61206C, 0x612070, 0x616369, 0x61646F, 0x616C20, 0x617220, 0x617320, 0x6369F3, 0x636F6E, 0x646520, 0x64656C, + 0x646F20, 0x652064, 0x652065, 0x65206C, 0x656C20, 0x656E20, 0x656E74, 0x657320, 0x657374, 0x69656E, 0x69F36E, 0x6C6120, 0x6C6F73, 0x6E2065, 0x6E7465, 0x6F2064, + 0x6F2065, 0x6F6E20, 0x6F7220, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732064, 0x732065, 0x732070, 0x736520, 0x746520, 0x746F20, 0x756520, 0xF36E20, +} + +var ngrams_8859_1_fr = [64]uint32{ + 0x206175, 0x20636F, 0x206461, 0x206465, 0x206475, 0x20656E, 0x206574, 0x206C61, 0x206C65, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207365, 0x20736F, 0x20756E, + 0x20E020, 0x616E74, 0x617469, 0x636520, 0x636F6E, 0x646520, 0x646573, 0x647520, 0x652061, 0x652063, 0x652064, 0x652065, 0x65206C, 0x652070, 0x652073, 0x656E20, + 0x656E74, 0x657220, 0x657320, 0x657420, 0x657572, 0x696F6E, 0x697320, 0x697420, 0x6C6120, 0x6C6520, 0x6C6573, 0x6D656E, 0x6E2064, 0x6E6520, 0x6E7320, 0x6E7420, + 0x6F6E20, 0x6F6E74, 0x6F7572, 0x717565, 0x72206C, 0x726520, 0x732061, 0x732064, 0x732065, 0x73206C, 0x732070, 0x742064, 0x746520, 0x74696F, 0x756520, 0x757220, +} + +var ngrams_8859_1_it = [64]uint32{ + 0x20616C, 0x206368, 0x20636F, 0x206465, 0x206469, 0x206520, 0x20696C, 0x20696E, 0x206C61, 0x207065, 0x207072, 0x20756E, 0x612063, 0x612064, 0x612070, 0x612073, + 0x61746F, 0x636865, 0x636F6E, 0x64656C, 0x646920, 0x652061, 0x652063, 0x652064, 0x652069, 0x65206C, 0x652070, 0x652073, 0x656C20, 0x656C6C, 0x656E74, 0x657220, + 0x686520, 0x692061, 0x692063, 0x692064, 0x692073, 0x696120, 0x696C20, 0x696E20, 0x696F6E, 0x6C6120, 0x6C6520, 0x6C6920, 0x6C6C61, 0x6E6520, 0x6E6920, 0x6E6F20, + 0x6E7465, 0x6F2061, 0x6F2064, 0x6F2069, 0x6F2073, 0x6F6E20, 0x6F6E65, 0x706572, 0x726120, 0x726520, 0x736920, 0x746120, 0x746520, 0x746920, 0x746F20, 0x7A696F, +} + +var ngrams_8859_1_nl = [64]uint32{ + 0x20616C, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656E, 0x206765, 0x206865, 0x20696E, 0x206D61, 0x206D65, 0x206F70, 0x207465, 0x207661, 0x207665, + 0x20766F, 0x207765, 0x207A69, 0x61616E, 0x616172, 0x616E20, 0x616E64, 0x617220, 0x617420, 0x636874, 0x646520, 0x64656E, 0x646572, 0x652062, 0x652076, 0x65656E, + 0x656572, 0x656E20, 0x657220, 0x657273, 0x657420, 0x67656E, 0x686574, 0x696520, 0x696E20, 0x696E67, 0x697320, 0x6E2062, 0x6E2064, 0x6E2065, 0x6E2068, 0x6E206F, + 0x6E2076, 0x6E6465, 0x6E6720, 0x6F6E64, 0x6F6F72, 0x6F7020, 0x6F7220, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x76616E, 0x766572, 0x766F6F, +} + +var ngrams_8859_1_no = [64]uint32{ + 0x206174, 0x206176, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207365, 0x20736B, 0x20736F, 0x207374, 0x207469, + 0x207669, 0x20E520, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646574, 0x652073, 0x656420, 0x656E20, 0x656E65, 0x657220, 0x657265, 0x657420, 0x657474, + 0x666F72, 0x67656E, 0x696B6B, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6520, 0x6C6C65, 0x6D6564, 0x6D656E, 0x6E2073, 0x6E6520, 0x6E6720, 0x6E6765, 0x6E6E65, + 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722073, 0x726520, 0x736F6D, 0x737465, 0x742073, 0x746520, 0x74656E, 0x746572, 0x74696C, 0x747420, 0x747465, 0x766572, +} + +var ngrams_8859_1_pt = [64]uint32{ + 0x206120, 0x20636F, 0x206461, 0x206465, 0x20646F, 0x206520, 0x206573, 0x206D61, 0x206E6F, 0x206F20, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365, + 0x20756D, 0x612061, 0x612063, 0x612064, 0x612070, 0x616465, 0x61646F, 0x616C20, 0x617220, 0x617261, 0x617320, 0x636F6D, 0x636F6E, 0x646120, 0x646520, 0x646F20, + 0x646F73, 0x652061, 0x652064, 0x656D20, 0x656E74, 0x657320, 0x657374, 0x696120, 0x696361, 0x6D656E, 0x6E7465, 0x6E746F, 0x6F2061, 0x6F2063, 0x6F2064, 0x6F2065, + 0x6F2070, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732061, 0x732064, 0x732065, 0x732070, 0x737461, 0x746520, 0x746F20, 0x756520, 0xE36F20, 0xE7E36F, +} + +var ngrams_8859_1_sv = [64]uint32{ + 0x206174, 0x206176, 0x206465, 0x20656E, 0x2066F6, 0x206861, 0x206920, 0x20696E, 0x206B6F, 0x206D65, 0x206F63, 0x2070E5, 0x20736B, 0x20736F, 0x207374, 0x207469, + 0x207661, 0x207669, 0x20E472, 0x616465, 0x616E20, 0x616E64, 0x617220, 0x617474, 0x636820, 0x646520, 0x64656E, 0x646572, 0x646574, 0x656420, 0x656E20, 0x657220, + 0x657420, 0x66F672, 0x67656E, 0x696C6C, 0x696E67, 0x6B6120, 0x6C6C20, 0x6D6564, 0x6E2073, 0x6E6120, 0x6E6465, 0x6E6720, 0x6E6765, 0x6E696E, 0x6F6368, 0x6F6D20, + 0x6F6E20, 0x70E520, 0x722061, 0x722073, 0x726120, 0x736B61, 0x736F6D, 0x742073, 0x746120, 0x746520, 0x746572, 0x74696C, 0x747420, 0x766172, 0xE47220, 0xF67220, +} + +func newRecognizer_8859_1(language string, ngram *[64]uint32) *recognizerSingleByte { + return &recognizerSingleByte{ + charset: "ISO-8859-1", + hasC1ByteCharset: "windows-1252", + language: language, + charMap: &charMap_8859_1, + ngram: ngram, + } +} + +func newRecognizer_8859_1_en() *recognizerSingleByte { + return newRecognizer_8859_1("en", &ngrams_8859_1_en) +} +func newRecognizer_8859_1_da() *recognizerSingleByte { + return newRecognizer_8859_1("da", &ngrams_8859_1_da) +} +func newRecognizer_8859_1_de() *recognizerSingleByte { + return newRecognizer_8859_1("de", &ngrams_8859_1_de) +} +func newRecognizer_8859_1_es() *recognizerSingleByte { + return newRecognizer_8859_1("es", &ngrams_8859_1_es) +} +func newRecognizer_8859_1_fr() *recognizerSingleByte { + return newRecognizer_8859_1("fr", &ngrams_8859_1_fr) +} +func newRecognizer_8859_1_it() *recognizerSingleByte { + return newRecognizer_8859_1("it", &ngrams_8859_1_it) +} +func newRecognizer_8859_1_nl() *recognizerSingleByte { + return newRecognizer_8859_1("nl", &ngrams_8859_1_nl) +} +func newRecognizer_8859_1_no() *recognizerSingleByte { + return newRecognizer_8859_1("no", &ngrams_8859_1_no) +} +func newRecognizer_8859_1_pt() *recognizerSingleByte { + return newRecognizer_8859_1("pt", &ngrams_8859_1_pt) +} +func newRecognizer_8859_1_sv() *recognizerSingleByte { + return newRecognizer_8859_1("sv", &ngrams_8859_1_sv) +} + +var charMap_8859_2 = [256]byte{ + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0x20, + 0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF, + 0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0xB7, + 0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, + 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, + 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20, +} + +var ngrams_8859_2_cs = [64]uint32{ + 0x206120, 0x206279, 0x20646F, 0x206A65, 0x206E61, 0x206E65, 0x206F20, 0x206F64, 0x20706F, 0x207072, 0x2070F8, 0x20726F, 0x207365, 0x20736F, 0x207374, 0x20746F, + 0x207620, 0x207679, 0x207A61, 0x612070, 0x636520, 0x636820, 0x652070, 0x652073, 0x652076, 0x656D20, 0x656EED, 0x686F20, 0x686F64, 0x697374, 0x6A6520, 0x6B7465, + 0x6C6520, 0x6C6920, 0x6E6120, 0x6EE920, 0x6EEC20, 0x6EED20, 0x6F2070, 0x6F646E, 0x6F6A69, 0x6F7374, 0x6F7520, 0x6F7661, 0x706F64, 0x706F6A, 0x70726F, 0x70F865, + 0x736520, 0x736F75, 0x737461, 0x737469, 0x73746E, 0x746572, 0x746EED, 0x746F20, 0x752070, 0xBE6520, 0xE16EED, 0xE9686F, 0xED2070, 0xED2073, 0xED6D20, 0xF86564, +} + +var ngrams_8859_2_hu = [64]uint32{ + 0x206120, 0x20617A, 0x206265, 0x206567, 0x20656C, 0x206665, 0x206861, 0x20686F, 0x206973, 0x206B65, 0x206B69, 0x206BF6, 0x206C65, 0x206D61, 0x206D65, 0x206D69, + 0x206E65, 0x20737A, 0x207465, 0x20E973, 0x612061, 0x61206B, 0x61206D, 0x612073, 0x616B20, 0x616E20, 0x617A20, 0x62616E, 0x62656E, 0x656779, 0x656B20, 0x656C20, + 0x656C65, 0x656D20, 0x656E20, 0x657265, 0x657420, 0x657465, 0x657474, 0x677920, 0x686F67, 0x696E74, 0x697320, 0x6B2061, 0x6BF67A, 0x6D6567, 0x6D696E, 0x6E2061, + 0x6E616B, 0x6E656B, 0x6E656D, 0x6E7420, 0x6F6779, 0x732061, 0x737A65, 0x737A74, 0x737AE1, 0x73E967, 0x742061, 0x747420, 0x74E173, 0x7A6572, 0xE16E20, 0xE97320, +} + +var ngrams_8859_2_pl = [64]uint32{ + 0x20637A, 0x20646F, 0x206920, 0x206A65, 0x206B6F, 0x206D61, 0x206D69, 0x206E61, 0x206E69, 0x206F64, 0x20706F, 0x207072, 0x207369, 0x207720, 0x207769, 0x207779, + 0x207A20, 0x207A61, 0x612070, 0x612077, 0x616E69, 0x636820, 0x637A65, 0x637A79, 0x646F20, 0x647A69, 0x652070, 0x652073, 0x652077, 0x65207A, 0x65676F, 0x656A20, + 0x656D20, 0x656E69, 0x676F20, 0x696120, 0x696520, 0x69656A, 0x6B6120, 0x6B6920, 0x6B6965, 0x6D6965, 0x6E6120, 0x6E6961, 0x6E6965, 0x6F2070, 0x6F7761, 0x6F7769, + 0x706F6C, 0x707261, 0x70726F, 0x70727A, 0x727A65, 0x727A79, 0x7369EA, 0x736B69, 0x737461, 0x776965, 0x796368, 0x796D20, 0x7A6520, 0x7A6965, 0x7A7920, 0xF37720, +} + +var ngrams_8859_2_ro = [64]uint32{ + 0x206120, 0x206163, 0x206361, 0x206365, 0x20636F, 0x206375, 0x206465, 0x206469, 0x206C61, 0x206D61, 0x207065, 0x207072, 0x207365, 0x2073E3, 0x20756E, 0x20BA69, + 0x20EE6E, 0x612063, 0x612064, 0x617265, 0x617420, 0x617465, 0x617520, 0x636172, 0x636F6E, 0x637520, 0x63E320, 0x646520, 0x652061, 0x652063, 0x652064, 0x652070, + 0x652073, 0x656120, 0x656920, 0x656C65, 0x656E74, 0x657374, 0x692061, 0x692063, 0x692064, 0x692070, 0x696520, 0x696920, 0x696E20, 0x6C6120, 0x6C6520, 0x6C6F72, + 0x6C7569, 0x6E6520, 0x6E7472, 0x6F7220, 0x70656E, 0x726520, 0x726561, 0x727520, 0x73E320, 0x746520, 0x747275, 0x74E320, 0x756920, 0x756C20, 0xBA6920, 0xEE6E20, +} + +func newRecognizer_8859_2(language string, ngram *[64]uint32) *recognizerSingleByte { + return &recognizerSingleByte{ + charset: "ISO-8859-2", + hasC1ByteCharset: "windows-1250", + language: language, + charMap: &charMap_8859_2, + ngram: ngram, + } +} + +func newRecognizer_8859_2_cs() *recognizerSingleByte { + return newRecognizer_8859_2("cs", &ngrams_8859_2_cs) +} +func newRecognizer_8859_2_hu() *recognizerSingleByte { + return newRecognizer_8859_2("hu", &ngrams_8859_2_hu) +} +func newRecognizer_8859_2_pl() *recognizerSingleByte { + return newRecognizer_8859_2("pl", &ngrams_8859_2_pl) +} +func newRecognizer_8859_2_ro() *recognizerSingleByte { + return newRecognizer_8859_2("ro", &ngrams_8859_2_ro) +} + +var charMap_8859_5 = [256]byte{ + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, + 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF, + 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, + 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, + 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, + 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF, +} + +var ngrams_8859_5_ru = [64]uint32{ + 0x20D220, 0x20D2DE, 0x20D4DE, 0x20D7D0, 0x20D820, 0x20DAD0, 0x20DADE, 0x20DDD0, 0x20DDD5, 0x20DED1, 0x20DFDE, 0x20DFE0, 0x20E0D0, 0x20E1DE, 0x20E1E2, 0x20E2DE, + 0x20E7E2, 0x20EDE2, 0xD0DDD8, 0xD0E2EC, 0xD3DE20, 0xD5DBEC, 0xD5DDD8, 0xD5E1E2, 0xD5E220, 0xD820DF, 0xD8D520, 0xD8D820, 0xD8EF20, 0xDBD5DD, 0xDBD820, 0xDBECDD, + 0xDDD020, 0xDDD520, 0xDDD8D5, 0xDDD8EF, 0xDDDE20, 0xDDDED2, 0xDE20D2, 0xDE20DF, 0xDE20E1, 0xDED220, 0xDED2D0, 0xDED3DE, 0xDED920, 0xDEDBEC, 0xDEDC20, 0xDEE1E2, + 0xDFDEDB, 0xDFE0D5, 0xDFE0D8, 0xDFE0DE, 0xE0D0D2, 0xE0D5D4, 0xE1E2D0, 0xE1E2D2, 0xE1E2D8, 0xE1EF20, 0xE2D5DB, 0xE2DE20, 0xE2DEE0, 0xE2EC20, 0xE7E2DE, 0xEBE520, +} + +func newRecognizer_8859_5(language string, ngram *[64]uint32) *recognizerSingleByte { + return &recognizerSingleByte{ + charset: "ISO-8859-5", + language: language, + charMap: &charMap_8859_5, + ngram: ngram, + } +} + +func newRecognizer_8859_5_ru() *recognizerSingleByte { + return newRecognizer_8859_5("ru", &ngrams_8859_5_ru) +} + +var charMap_8859_6 = [256]byte{ + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, + 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, + 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, + 0xD8, 0xD9, 0xDA, 0x20, 0x20, 0x20, 0x20, 0x20, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +} + +var ngrams_8859_6_ar = [64]uint32{ + 0x20C7E4, 0x20C7E6, 0x20C8C7, 0x20D9E4, 0x20E1EA, 0x20E4E4, 0x20E5E6, 0x20E8C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E420, 0xC7E4C3, 0xC7E4C7, 0xC7E4C8, + 0xC7E4CA, 0xC7E4CC, 0xC7E4CD, 0xC7E4CF, 0xC7E4D3, 0xC7E4D9, 0xC7E4E2, 0xC7E4E5, 0xC7E4E8, 0xC7E4EA, 0xC7E520, 0xC7E620, 0xC7E6CA, 0xC820C7, 0xC920C7, 0xC920E1, + 0xC920E4, 0xC920E5, 0xC920E8, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xD920C7, 0xD9E4E9, 0xE1EA20, 0xE420C7, 0xE4C920, 0xE4E920, 0xE4EA20, + 0xE520C7, 0xE5C720, 0xE5C920, 0xE5E620, 0xE620C7, 0xE720C7, 0xE7C720, 0xE8C7E4, 0xE8E620, 0xE920C7, 0xEA20C7, 0xEA20E5, 0xEA20E8, 0xEAC920, 0xEAD120, 0xEAE620, +} + +func newRecognizer_8859_6(language string, ngram *[64]uint32) *recognizerSingleByte { + return &recognizerSingleByte{ + charset: "ISO-8859-6", + language: language, + charMap: &charMap_8859_6, + ngram: ngram, + } +} + +func newRecognizer_8859_6_ar() *recognizerSingleByte { + return newRecognizer_8859_6("ar", &ngrams_8859_6_ar) +} + +var charMap_8859_7 = [256]byte{ + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0xA1, 0xA2, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xDC, 0x20, + 0xDD, 0xDE, 0xDF, 0x20, 0xFC, 0x20, 0xFD, 0xFE, + 0xC0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xF0, 0xF1, 0x20, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, + 0xF8, 0xF9, 0xFA, 0xFB, 0xDC, 0xDD, 0xDE, 0xDF, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, + 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20, +} + +var ngrams_8859_7_el = [64]uint32{ + 0x20E1ED, 0x20E1F0, 0x20E3E9, 0x20E4E9, 0x20E5F0, 0x20E720, 0x20EAE1, 0x20ECE5, 0x20EDE1, 0x20EF20, 0x20F0E1, 0x20F0EF, 0x20F0F1, 0x20F3F4, 0x20F3F5, 0x20F4E7, + 0x20F4EF, 0xDFE120, 0xE120E1, 0xE120F4, 0xE1E920, 0xE1ED20, 0xE1F0FC, 0xE1F220, 0xE3E9E1, 0xE5E920, 0xE5F220, 0xE720F4, 0xE7ED20, 0xE7F220, 0xE920F4, 0xE9E120, + 0xE9EADE, 0xE9F220, 0xEAE1E9, 0xEAE1F4, 0xECE520, 0xED20E1, 0xED20E5, 0xED20F0, 0xEDE120, 0xEFF220, 0xEFF520, 0xF0EFF5, 0xF0F1EF, 0xF0FC20, 0xF220E1, 0xF220E5, + 0xF220EA, 0xF220F0, 0xF220F4, 0xF3E520, 0xF3E720, 0xF3F4EF, 0xF4E120, 0xF4E1E9, 0xF4E7ED, 0xF4E7F2, 0xF4E9EA, 0xF4EF20, 0xF4EFF5, 0xF4F9ED, 0xF9ED20, 0xFEED20, +} + +func newRecognizer_8859_7(language string, ngram *[64]uint32) *recognizerSingleByte { + return &recognizerSingleByte{ + charset: "ISO-8859-7", + hasC1ByteCharset: "windows-1253", + language: language, + charMap: &charMap_8859_7, + ngram: ngram, + } +} + +func newRecognizer_8859_7_el() *recognizerSingleByte { + return newRecognizer_8859_7("el", &ngrams_8859_7_el) +} + +var charMap_8859_8 = [256]byte{ + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, + 0xF8, 0xF9, 0xFA, 0x20, 0x20, 0x20, 0x20, 0x20, +} + +var ngrams_8859_8_I_he = [64]uint32{ + 0x20E0E5, 0x20E0E7, 0x20E0E9, 0x20E0FA, 0x20E1E9, 0x20E1EE, 0x20E4E0, 0x20E4E5, 0x20E4E9, 0x20E4EE, 0x20E4F2, 0x20E4F9, 0x20E4FA, 0x20ECE0, 0x20ECE4, 0x20EEE0, + 0x20F2EC, 0x20F9EC, 0xE0FA20, 0xE420E0, 0xE420E1, 0xE420E4, 0xE420EC, 0xE420EE, 0xE420F9, 0xE4E5E0, 0xE5E020, 0xE5ED20, 0xE5EF20, 0xE5F820, 0xE5FA20, 0xE920E4, + 0xE9E420, 0xE9E5FA, 0xE9E9ED, 0xE9ED20, 0xE9EF20, 0xE9F820, 0xE9FA20, 0xEC20E0, 0xEC20E4, 0xECE020, 0xECE420, 0xED20E0, 0xED20E1, 0xED20E4, 0xED20EC, 0xED20EE, + 0xED20F9, 0xEEE420, 0xEF20E4, 0xF0E420, 0xF0E920, 0xF0E9ED, 0xF2EC20, 0xF820E4, 0xF8E9ED, 0xF9EC20, 0xFA20E0, 0xFA20E1, 0xFA20E4, 0xFA20EC, 0xFA20EE, 0xFA20F9, +} + +var ngrams_8859_8_he = [64]uint32{ + 0x20E0E5, 0x20E0EC, 0x20E4E9, 0x20E4EC, 0x20E4EE, 0x20E4F0, 0x20E9F0, 0x20ECF2, 0x20ECF9, 0x20EDE5, 0x20EDE9, 0x20EFE5, 0x20EFE9, 0x20F8E5, 0x20F8E9, 0x20FAE0, + 0x20FAE5, 0x20FAE9, 0xE020E4, 0xE020EC, 0xE020ED, 0xE020FA, 0xE0E420, 0xE0E5E4, 0xE0EC20, 0xE0EE20, 0xE120E4, 0xE120ED, 0xE120FA, 0xE420E4, 0xE420E9, 0xE420EC, + 0xE420ED, 0xE420EF, 0xE420F8, 0xE420FA, 0xE4EC20, 0xE5E020, 0xE5E420, 0xE7E020, 0xE9E020, 0xE9E120, 0xE9E420, 0xEC20E4, 0xEC20ED, 0xEC20FA, 0xECF220, 0xECF920, + 0xEDE9E9, 0xEDE9F0, 0xEDE9F8, 0xEE20E4, 0xEE20ED, 0xEE20FA, 0xEEE120, 0xEEE420, 0xF2E420, 0xF920E4, 0xF920ED, 0xF920FA, 0xF9E420, 0xFAE020, 0xFAE420, 0xFAE5E9, +} + +func newRecognizer_8859_8(language string, ngram *[64]uint32) *recognizerSingleByte { + return &recognizerSingleByte{ + charset: "ISO-8859-8", + hasC1ByteCharset: "windows-1255", + language: language, + charMap: &charMap_8859_8, + ngram: ngram, + } +} + +func newRecognizer_8859_8_I_he() *recognizerSingleByte { + r := newRecognizer_8859_8("he", &ngrams_8859_8_I_he) + r.charset = "ISO-8859-8-I" + return r +} + +func newRecognizer_8859_8_he() *recognizerSingleByte { + return newRecognizer_8859_8("he", &ngrams_8859_8_he) +} + +var charMap_8859_9 = [256]byte{ + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20, + 0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, + 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x69, 0xFE, 0xDF, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, + 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, +} + +var ngrams_8859_9_tr = [64]uint32{ + 0x206261, 0x206269, 0x206275, 0x206461, 0x206465, 0x206765, 0x206861, 0x20696C, 0x206B61, 0x206B6F, 0x206D61, 0x206F6C, 0x207361, 0x207461, 0x207665, 0x207961, + 0x612062, 0x616B20, 0x616C61, 0x616D61, 0x616E20, 0x616EFD, 0x617220, 0x617261, 0x6172FD, 0x6173FD, 0x617961, 0x626972, 0x646120, 0x646520, 0x646920, 0x652062, + 0x65206B, 0x656469, 0x656E20, 0x657220, 0x657269, 0x657369, 0x696C65, 0x696E20, 0x696E69, 0x697220, 0x6C616E, 0x6C6172, 0x6C6520, 0x6C6572, 0x6E2061, 0x6E2062, + 0x6E206B, 0x6E6461, 0x6E6465, 0x6E6520, 0x6E6920, 0x6E696E, 0x6EFD20, 0x72696E, 0x72FD6E, 0x766520, 0x796120, 0x796F72, 0xFD6E20, 0xFD6E64, 0xFD6EFD, 0xFDF0FD, +} + +func newRecognizer_8859_9(language string, ngram *[64]uint32) *recognizerSingleByte { + return &recognizerSingleByte{ + charset: "ISO-8859-9", + hasC1ByteCharset: "windows-1254", + language: language, + charMap: &charMap_8859_9, + ngram: ngram, + } +} + +func newRecognizer_8859_9_tr() *recognizerSingleByte { + return newRecognizer_8859_9("tr", &ngrams_8859_9_tr) +} + +var charMap_windows_1256 = [256]byte{ + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x81, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20, + 0x88, 0x20, 0x8A, 0x20, 0x9C, 0x8D, 0x8E, 0x8F, + 0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x98, 0x20, 0x9A, 0x20, 0x9C, 0x20, 0x20, 0x9F, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, + 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, + 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0x20, + 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0x20, 0x20, 0x20, 0x20, 0xF4, 0x20, 0x20, 0x20, + 0x20, 0xF9, 0x20, 0xFB, 0xFC, 0x20, 0x20, 0xFF, +} + +var ngrams_windows_1256 = [64]uint32{ + 0x20C7E1, 0x20C7E4, 0x20C8C7, 0x20DAE1, 0x20DDED, 0x20E1E1, 0x20E3E4, 0x20E6C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E120, 0xC7E1C3, 0xC7E1C7, 0xC7E1C8, + 0xC7E1CA, 0xC7E1CC, 0xC7E1CD, 0xC7E1CF, 0xC7E1D3, 0xC7E1DA, 0xC7E1DE, 0xC7E1E3, 0xC7E1E6, 0xC7E1ED, 0xC7E320, 0xC7E420, 0xC7E4CA, 0xC820C7, 0xC920C7, 0xC920DD, + 0xC920E1, 0xC920E3, 0xC920E6, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xDA20C7, 0xDAE1EC, 0xDDED20, 0xE120C7, 0xE1C920, 0xE1EC20, 0xE1ED20, + 0xE320C7, 0xE3C720, 0xE3C920, 0xE3E420, 0xE420C7, 0xE520C7, 0xE5C720, 0xE6C7E1, 0xE6E420, 0xEC20C7, 0xED20C7, 0xED20E3, 0xED20E6, 0xEDC920, 0xEDD120, 0xEDE420, +} + +func newRecognizer_windows_1256() *recognizerSingleByte { + return &recognizerSingleByte{ + charset: "windows-1256", + language: "ar", + charMap: &charMap_windows_1256, + ngram: &ngrams_windows_1256, + } +} + +var charMap_windows_1251 = [256]byte{ + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x90, 0x83, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F, + 0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F, + 0x20, 0xA2, 0xA2, 0xBC, 0x20, 0xB4, 0x20, 0x20, + 0xB8, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0xBF, + 0x20, 0x20, 0xB3, 0xB3, 0xB4, 0xB5, 0x20, 0x20, + 0xB8, 0x20, 0xBA, 0x20, 0xBC, 0xBE, 0xBE, 0xBF, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, + 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, + 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, +} + +var ngrams_windows_1251 = [64]uint32{ + 0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 0x20EAEE, 0x20EDE0, 0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 0x20F1F2, 0x20F2EE, + 0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 0xE5EDE8, 0xE5F1F2, 0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 0xEBE820, 0xEBFCED, + 0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 0xEE20E2, 0xEE20EF, 0xEE20F1, 0xEEE220, 0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 0xEEEC20, 0xEEF1F2, + 0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 0xF1F2E0, 0xF1F2E2, 0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 0xF7F2EE, 0xFBF520, +} + +func newRecognizer_windows_1251() *recognizerSingleByte { + return &recognizerSingleByte{ + charset: "windows-1251", + language: "ar", + charMap: &charMap_windows_1251, + ngram: &ngrams_windows_1251, + } +} + +var charMap_KOI8_R = [256]byte{ + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, + 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, + 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, + 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, + 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, + 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, + 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, + 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, +} + +var ngrams_KOI8_R = [64]uint32{ + 0x20C4CF, 0x20C920, 0x20CBC1, 0x20CBCF, 0x20CEC1, 0x20CEC5, 0x20CFC2, 0x20D0CF, 0x20D0D2, 0x20D2C1, 0x20D3CF, 0x20D3D4, 0x20D4CF, 0x20D720, 0x20D7CF, 0x20DAC1, + 0x20DCD4, 0x20DED4, 0xC1CEC9, 0xC1D4D8, 0xC5CCD8, 0xC5CEC9, 0xC5D3D4, 0xC5D420, 0xC7CF20, 0xC920D0, 0xC9C520, 0xC9C920, 0xC9D120, 0xCCC5CE, 0xCCC920, 0xCCD8CE, + 0xCEC120, 0xCEC520, 0xCEC9C5, 0xCEC9D1, 0xCECF20, 0xCECFD7, 0xCF20D0, 0xCF20D3, 0xCF20D7, 0xCFC7CF, 0xCFCA20, 0xCFCCD8, 0xCFCD20, 0xCFD3D4, 0xCFD720, 0xCFD7C1, + 0xD0CFCC, 0xD0D2C5, 0xD0D2C9, 0xD0D2CF, 0xD2C1D7, 0xD2C5C4, 0xD3D120, 0xD3D4C1, 0xD3D4C9, 0xD3D4D7, 0xD4C5CC, 0xD4CF20, 0xD4CFD2, 0xD4D820, 0xD9C820, 0xDED4CF, +} + +func newRecognizer_KOI8_R() *recognizerSingleByte { + return &recognizerSingleByte{ + charset: "KOI8-R", + language: "ru", + charMap: &charMap_KOI8_R, + ngram: &ngrams_KOI8_R, + } +} + +var charMap_IBM424_he = [256]byte{ + /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */ + /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, + /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, + /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, + /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, + /* 4- */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, + /* 5- */ 0x40, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, + /* 6- */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, + /* 7- */ 0x40, 0x71, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x00, 0x40, 0x40, + /* 8- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, + /* 9- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, + /* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, + /* B- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, + /* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, + /* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, + /* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, + /* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +} + +var ngrams_IBM424_he_rtl = [64]uint32{ + 0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641, + 0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045, + 0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x544041, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x554056, + 0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x684045, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x714069, +} + +var ngrams_IBM424_he_ltr = [64]uint32{ + 0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x405462, 0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x407141, + 0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x414645, 0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x454054, + 0x454055, 0x454057, 0x454068, 0x454071, 0x455440, 0x464140, 0x464540, 0x484140, 0x514140, 0x514240, 0x514540, 0x544045, 0x544055, 0x544071, 0x546240, 0x546940, + 0x555151, 0x555158, 0x555168, 0x564045, 0x564055, 0x564071, 0x564240, 0x564540, 0x624540, 0x694045, 0x694055, 0x694071, 0x694540, 0x714140, 0x714540, 0x714651, +} + +func newRecognizer_IBM424_he(charset string, ngram *[64]uint32) *recognizerSingleByte { + return &recognizerSingleByte{ + charset: charset, + language: "he", + charMap: &charMap_IBM424_he, + ngram: ngram, + } +} + +func newRecognizer_IBM424_he_rtl() *recognizerSingleByte { + return newRecognizer_IBM424_he("IBM424_rtl", &ngrams_IBM424_he_rtl) +} + +func newRecognizer_IBM424_he_ltr() *recognizerSingleByte { + return newRecognizer_IBM424_he("IBM424_ltr", &ngrams_IBM424_he_ltr) +} + +var charMap_IBM420_ar = [256]byte{ + /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */ + /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, + /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, + /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, + /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, + /* 4- */ 0x40, 0x40, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, + /* 5- */ 0x40, 0x51, 0x52, 0x40, 0x40, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, + /* 6- */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, + /* 7- */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, + /* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F, + /* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F, + /* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, + /* B- */ 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0x40, 0x40, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF, + /* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0xCB, 0x40, 0xCD, 0x40, 0xCF, + /* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, + /* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0xEB, 0x40, 0xED, 0xEE, 0xEF, + /* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0xFB, 0xFC, 0xFD, 0xFE, 0x40, +} + +var ngrams_IBM420_ar_rtl = [64]uint32{ + 0x4056B1, 0x4056BD, 0x405856, 0x409AB1, 0x40ABDC, 0x40B1B1, 0x40BBBD, 0x40CF56, 0x564056, 0x564640, 0x566340, 0x567540, 0x56B140, 0x56B149, 0x56B156, 0x56B158, + 0x56B163, 0x56B167, 0x56B169, 0x56B173, 0x56B178, 0x56B19A, 0x56B1AD, 0x56B1BB, 0x56B1CF, 0x56B1DC, 0x56BB40, 0x56BD40, 0x56BD63, 0x584056, 0x624056, 0x6240AB, + 0x6240B1, 0x6240BB, 0x6240CF, 0x634056, 0x734056, 0x736240, 0x754056, 0x756240, 0x784056, 0x9A4056, 0x9AB1DA, 0xABDC40, 0xB14056, 0xB16240, 0xB1DA40, 0xB1DC40, + 0xBB4056, 0xBB5640, 0xBB6240, 0xBBBD40, 0xBD4056, 0xBF4056, 0xBF5640, 0xCF56B1, 0xCFBD40, 0xDA4056, 0xDC4056, 0xDC40BB, 0xDC40CF, 0xDC6240, 0xDC7540, 0xDCBD40, +} + +var ngrams_IBM420_ar_ltr = [64]uint32{ + 0x404656, 0x4056BB, 0x4056BF, 0x406273, 0x406275, 0x4062B1, 0x4062BB, 0x4062DC, 0x406356, 0x407556, 0x4075DC, 0x40B156, 0x40BB56, 0x40BD56, 0x40BDBB, 0x40BDCF, + 0x40BDDC, 0x40DAB1, 0x40DCAB, 0x40DCB1, 0x49B156, 0x564056, 0x564058, 0x564062, 0x564063, 0x564073, 0x564075, 0x564078, 0x56409A, 0x5640B1, 0x5640BB, 0x5640BD, + 0x5640BF, 0x5640DA, 0x5640DC, 0x565840, 0x56B156, 0x56CF40, 0x58B156, 0x63B156, 0x63BD56, 0x67B156, 0x69B156, 0x73B156, 0x78B156, 0x9AB156, 0xAB4062, 0xADB156, + 0xB14062, 0xB15640, 0xB156CF, 0xB19A40, 0xB1B140, 0xBB4062, 0xBB40DC, 0xBBB156, 0xBD5640, 0xBDBB40, 0xCF4062, 0xCF40DC, 0xCFB156, 0xDAB19A, 0xDCAB40, 0xDCB156, +} + +func newRecognizer_IBM420_ar(charset string, ngram *[64]uint32) *recognizerSingleByte { + return &recognizerSingleByte{ + charset: charset, + language: "ar", + charMap: &charMap_IBM420_ar, + ngram: ngram, + } +} + +func newRecognizer_IBM420_ar_rtl() *recognizerSingleByte { + return newRecognizer_IBM420_ar("IBM420_rtl", &ngrams_IBM420_ar_rtl) +} + +func newRecognizer_IBM420_ar_ltr() *recognizerSingleByte { + return newRecognizer_IBM420_ar("IBM420_ltr", &ngrams_IBM420_ar_ltr) +} diff --git a/vendor/github.com/gogs/chardet/unicode.go b/vendor/github.com/gogs/chardet/unicode.go new file mode 100644 index 00000000..6f9fa9e6 --- /dev/null +++ b/vendor/github.com/gogs/chardet/unicode.go @@ -0,0 +1,103 @@ +package chardet + +import ( + "bytes" +) + +var ( + utf16beBom = []byte{0xFE, 0xFF} + utf16leBom = []byte{0xFF, 0xFE} + utf32beBom = []byte{0x00, 0x00, 0xFE, 0xFF} + utf32leBom = []byte{0xFF, 0xFE, 0x00, 0x00} +) + +type recognizerUtf16be struct { +} + +func newRecognizer_utf16be() *recognizerUtf16be { + return &recognizerUtf16be{} +} + +func (*recognizerUtf16be) Match(input *recognizerInput) (output recognizerOutput) { + output = recognizerOutput{ + Charset: "UTF-16BE", + } + if bytes.HasPrefix(input.raw, utf16beBom) { + output.Confidence = 100 + } + return +} + +type recognizerUtf16le struct { +} + +func newRecognizer_utf16le() *recognizerUtf16le { + return &recognizerUtf16le{} +} + +func (*recognizerUtf16le) Match(input *recognizerInput) (output recognizerOutput) { + output = recognizerOutput{ + Charset: "UTF-16LE", + } + if bytes.HasPrefix(input.raw, utf16leBom) && !bytes.HasPrefix(input.raw, utf32leBom) { + output.Confidence = 100 + } + return +} + +type recognizerUtf32 struct { + name string + bom []byte + decodeChar func(input []byte) uint32 +} + +func decodeUtf32be(input []byte) uint32 { + return uint32(input[0])<<24 | uint32(input[1])<<16 | uint32(input[2])<<8 | uint32(input[3]) +} + +func decodeUtf32le(input []byte) uint32 { + return uint32(input[3])<<24 | uint32(input[2])<<16 | uint32(input[1])<<8 | uint32(input[0]) +} + +func newRecognizer_utf32be() *recognizerUtf32 { + return &recognizerUtf32{ + "UTF-32BE", + utf32beBom, + decodeUtf32be, + } +} + +func newRecognizer_utf32le() *recognizerUtf32 { + return &recognizerUtf32{ + "UTF-32LE", + utf32leBom, + decodeUtf32le, + } +} + +func (r *recognizerUtf32) Match(input *recognizerInput) (output recognizerOutput) { + output = recognizerOutput{ + Charset: r.name, + } + hasBom := bytes.HasPrefix(input.raw, r.bom) + var numValid, numInvalid uint32 + for b := input.raw; len(b) >= 4; b = b[4:] { + if c := r.decodeChar(b); c >= 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF) { + numInvalid++ + } else { + numValid++ + } + } + if hasBom && numInvalid == 0 { + output.Confidence = 100 + } else if hasBom && numValid > numInvalid*10 { + output.Confidence = 80 + } else if numValid > 3 && numInvalid == 0 { + output.Confidence = 100 + } else if numValid > 0 && numInvalid == 0 { + output.Confidence = 80 + } else if numValid > numInvalid*10 { + output.Confidence = 25 + } + return +} diff --git a/vendor/github.com/gogs/chardet/utf8.go b/vendor/github.com/gogs/chardet/utf8.go new file mode 100644 index 00000000..ae036ad9 --- /dev/null +++ b/vendor/github.com/gogs/chardet/utf8.go @@ -0,0 +1,71 @@ +package chardet + +import ( + "bytes" +) + +var utf8Bom = []byte{0xEF, 0xBB, 0xBF} + +type recognizerUtf8 struct { +} + +func newRecognizer_utf8() *recognizerUtf8 { + return &recognizerUtf8{} +} + +func (*recognizerUtf8) Match(input *recognizerInput) (output recognizerOutput) { + output = recognizerOutput{ + Charset: "UTF-8", + } + hasBom := bytes.HasPrefix(input.raw, utf8Bom) + inputLen := len(input.raw) + var numValid, numInvalid uint32 + var trailBytes uint8 + for i := 0; i < inputLen; i++ { + c := input.raw[i] + if c&0x80 == 0 { + continue + } + if c&0xE0 == 0xC0 { + trailBytes = 1 + } else if c&0xF0 == 0xE0 { + trailBytes = 2 + } else if c&0xF8 == 0xF0 { + trailBytes = 3 + } else { + numInvalid++ + if numInvalid > 5 { + break + } + trailBytes = 0 + } + + for i++; i < inputLen; i++ { + c = input.raw[i] + if c&0xC0 != 0x80 { + numInvalid++ + break + } + if trailBytes--; trailBytes == 0 { + numValid++ + break + } + } + } + + if hasBom && numInvalid == 0 { + output.Confidence = 100 + } else if hasBom && numValid > numInvalid*10 { + output.Confidence = 80 + } else if numValid > 3 && numInvalid == 0 { + output.Confidence = 100 + } else if numValid > 0 && numInvalid == 0 { + output.Confidence = 80 + } else if numValid == 0 && numInvalid == 0 { + // Plain ASCII + output.Confidence = 10 + } else if numValid > numInvalid*10 { + output.Confidence = 25 + } + return +} diff --git a/vendor/github.com/gorilla/css/LICENSE b/vendor/github.com/gorilla/css/LICENSE new file mode 100644 index 00000000..bee2a059 --- /dev/null +++ b/vendor/github.com/gorilla/css/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2013, Gorilla web toolkit +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + + Neither the name of the {organization} nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/gorilla/css/scanner/doc.go b/vendor/github.com/gorilla/css/scanner/doc.go new file mode 100644 index 00000000..f19850e1 --- /dev/null +++ b/vendor/github.com/gorilla/css/scanner/doc.go @@ -0,0 +1,33 @@ +// Copyright 2012 The Gorilla Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* +Package gorilla/css/scanner generates tokens for a CSS3 input. + +It follows the CSS3 specification located at: + + http://www.w3.org/TR/css3-syntax/ + +To use it, create a new scanner for a given CSS string and call Next() until +the token returned has type TokenEOF or TokenError: + + s := scanner.New(myCSS) + for { + token := s.Next() + if token.Type == scanner.TokenEOF || token.Type == scanner.TokenError { + break + } + // Do something with the token... + } + +Following the CSS3 specification, an error can only occur when the scanner +finds an unclosed quote or unclosed comment. In these cases the text becomes +"untokenizable". Everything else is tokenizable and it is up to a parser +to make sense of the token stream (or ignore nonsensical token sequences). + +Note: the scanner doesn't perform lexical analysis or, in other words, it +doesn't care about the token context. It is intended to be used by a +lexer or parser. +*/ +package scanner diff --git a/vendor/github.com/gorilla/css/scanner/scanner.go b/vendor/github.com/gorilla/css/scanner/scanner.go new file mode 100644 index 00000000..23fa7404 --- /dev/null +++ b/vendor/github.com/gorilla/css/scanner/scanner.go @@ -0,0 +1,356 @@ +// Copyright 2012 The Gorilla Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package scanner + +import ( + "fmt" + "regexp" + "strings" + "unicode" + "unicode/utf8" +) + +// tokenType identifies the type of lexical tokens. +type tokenType int + +// String returns a string representation of the token type. +func (t tokenType) String() string { + return tokenNames[t] +} + +// Token represents a token and the corresponding string. +type Token struct { + Type tokenType + Value string + Line int + Column int +} + +// String returns a string representation of the token. +func (t *Token) String() string { + if len(t.Value) > 10 { + return fmt.Sprintf("%s (line: %d, column: %d): %.10q...", + t.Type, t.Line, t.Column, t.Value) + } + return fmt.Sprintf("%s (line: %d, column: %d): %q", + t.Type, t.Line, t.Column, t.Value) +} + +// All tokens ----------------------------------------------------------------- + +// The complete list of tokens in CSS3. +const ( + // Scanner flags. + TokenError tokenType = iota + TokenEOF + // From now on, only tokens from the CSS specification. + TokenIdent + TokenAtKeyword + TokenString + TokenHash + TokenNumber + TokenPercentage + TokenDimension + TokenURI + TokenUnicodeRange + TokenCDO + TokenCDC + TokenS + TokenComment + TokenFunction + TokenIncludes + TokenDashMatch + TokenPrefixMatch + TokenSuffixMatch + TokenSubstringMatch + TokenChar + TokenBOM +) + +// tokenNames maps tokenType's to their names. Used for conversion to string. +var tokenNames = map[tokenType]string{ + TokenError: "error", + TokenEOF: "EOF", + TokenIdent: "IDENT", + TokenAtKeyword: "ATKEYWORD", + TokenString: "STRING", + TokenHash: "HASH", + TokenNumber: "NUMBER", + TokenPercentage: "PERCENTAGE", + TokenDimension: "DIMENSION", + TokenURI: "URI", + TokenUnicodeRange: "UNICODE-RANGE", + TokenCDO: "CDO", + TokenCDC: "CDC", + TokenS: "S", + TokenComment: "COMMENT", + TokenFunction: "FUNCTION", + TokenIncludes: "INCLUDES", + TokenDashMatch: "DASHMATCH", + TokenPrefixMatch: "PREFIXMATCH", + TokenSuffixMatch: "SUFFIXMATCH", + TokenSubstringMatch: "SUBSTRINGMATCH", + TokenChar: "CHAR", + TokenBOM: "BOM", +} + +// Macros and productions ----------------------------------------------------- +// http://www.w3.org/TR/css3-syntax/#tokenization + +var macroRegexp = regexp.MustCompile(`\{[a-z]+\}`) + +// macros maps macro names to patterns to be expanded. +var macros = map[string]string{ + // must be escaped: `\.+*?()|[]{}^$` + "ident": `-?{nmstart}{nmchar}*`, + "name": `{nmchar}+`, + "nmstart": `[a-zA-Z_]|{nonascii}|{escape}`, + "nonascii": "[\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]", + "unicode": `\\[0-9a-fA-F]{1,6}{wc}?`, + "escape": "{unicode}|\\\\[\u0020-\u007E\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]", + "nmchar": `[a-zA-Z0-9_-]|{nonascii}|{escape}`, + "num": `[0-9]*\.[0-9]+|[0-9]+`, + "string": `"(?:{stringchar}|')*"|'(?:{stringchar}|")*'`, + "stringchar": `{urlchar}|[ ]|\\{nl}`, + "nl": `[\n\r\f]|\r\n`, + "w": `{wc}*`, + "wc": `[\t\n\f\r ]`, + + // urlchar should accept [(ascii characters minus those that need escaping)|{nonascii}|{escape}] + // ASCII characters range = `[\u0020-\u007e]` + // Skip space \u0020 = `[\u0021-\u007e]` + // Skip quotation mark \0022 = `[\u0021\u0023-\u007e]` + // Skip apostrophe \u0027 = `[\u0021\u0023-\u0026\u0028-\u007e]` + // Skip reverse solidus \u005c = `[\u0021\u0023-\u0026\u0028-\u005b\u005d\u007e]` + // Finally, the left square bracket (\u005b) and right (\u005d) needs escaping themselves + "urlchar": "[\u0021\u0023-\u0026\u0028-\\\u005b\\\u005d-\u007E]|{nonascii}|{escape}", +} + +// productions maps the list of tokens to patterns to be expanded. +var productions = map[tokenType]string{ + // Unused regexps (matched using other methods) are commented out. + TokenIdent: `{ident}`, + TokenAtKeyword: `@{ident}`, + TokenString: `{string}`, + TokenHash: `#{name}`, + TokenNumber: `{num}`, + TokenPercentage: `{num}%`, + TokenDimension: `{num}{ident}`, + TokenURI: `url\({w}(?:{string}|{urlchar}*?){w}\)`, + TokenUnicodeRange: `U\+[0-9A-F\?]{1,6}(?:-[0-9A-F]{1,6})?`, + //TokenCDO: ``, + TokenS: `{wc}+`, + TokenComment: `/\*[^\*]*[\*]+(?:[^/][^\*]*[\*]+)*/`, + TokenFunction: `{ident}\(`, + //TokenIncludes: `~=`, + //TokenDashMatch: `\|=`, + //TokenPrefixMatch: `\^=`, + //TokenSuffixMatch: `\$=`, + //TokenSubstringMatch: `\*=`, + //TokenChar: `[^"']`, + //TokenBOM: "\uFEFF", +} + +// matchers maps the list of tokens to compiled regular expressions. +// +// The map is filled on init() using the macros and productions defined in +// the CSS specification. +var matchers = map[tokenType]*regexp.Regexp{} + +// matchOrder is the order to test regexps when first-char shortcuts +// can't be used. +var matchOrder = []tokenType{ + TokenURI, + TokenFunction, + TokenUnicodeRange, + TokenIdent, + TokenDimension, + TokenPercentage, + TokenNumber, + TokenCDC, +} + +func init() { + // replace macros and compile regexps for productions. + replaceMacro := func(s string) string { + return "(?:" + macros[s[1:len(s)-1]] + ")" + } + for t, s := range productions { + for macroRegexp.MatchString(s) { + s = macroRegexp.ReplaceAllStringFunc(s, replaceMacro) + } + matchers[t] = regexp.MustCompile("^(?:" + s + ")") + } +} + +// Scanner -------------------------------------------------------------------- + +// New returns a new CSS scanner for the given input. +func New(input string) *Scanner { + // Normalize newlines. + input = strings.Replace(input, "\r\n", "\n", -1) + return &Scanner{ + input: input, + row: 1, + col: 1, + } +} + +// Scanner scans an input and emits tokens following the CSS3 specification. +type Scanner struct { + input string + pos int + row int + col int + err *Token +} + +// Next returns the next token from the input. +// +// At the end of the input the token type is TokenEOF. +// +// If the input can't be tokenized the token type is TokenError. This occurs +// in case of unclosed quotation marks or comments. +func (s *Scanner) Next() *Token { + if s.err != nil { + return s.err + } + if s.pos >= len(s.input) { + s.err = &Token{TokenEOF, "", s.row, s.col} + return s.err + } + if s.pos == 0 { + // Test BOM only once, at the beginning of the file. + if strings.HasPrefix(s.input, "\uFEFF") { + return s.emitSimple(TokenBOM, "\uFEFF") + } + } + // There's a lot we can guess based on the first byte so we'll take a + // shortcut before testing multiple regexps. + input := s.input[s.pos:] + switch input[0] { + case '\t', '\n', '\f', '\r', ' ': + // Whitespace. + return s.emitToken(TokenS, matchers[TokenS].FindString(input)) + case '.': + // Dot is too common to not have a quick check. + // We'll test if this is a Char; if it is followed by a number it is a + // dimension/percentage/number, and this will be matched later. + if len(input) > 1 && !unicode.IsDigit(rune(input[1])) { + return s.emitSimple(TokenChar, ".") + } + case '#': + // Another common one: Hash or Char. + if match := matchers[TokenHash].FindString(input); match != "" { + return s.emitToken(TokenHash, match) + } + return s.emitSimple(TokenChar, "#") + case '@': + // Another common one: AtKeyword or Char. + if match := matchers[TokenAtKeyword].FindString(input); match != "" { + return s.emitSimple(TokenAtKeyword, match) + } + return s.emitSimple(TokenChar, "@") + case ':', ',', ';', '%', '&', '+', '=', '>', '(', ')', '[', ']', '{', '}': + // More common chars. + return s.emitSimple(TokenChar, string(input[0])) + case '"', '\'': + // String or error. + match := matchers[TokenString].FindString(input) + if match != "" { + return s.emitToken(TokenString, match) + } + + s.err = &Token{TokenError, "unclosed quotation mark", s.row, s.col} + return s.err + case '/': + // Comment, error or Char. + if len(input) > 1 && input[1] == '*' { + match := matchers[TokenComment].FindString(input) + if match != "" { + return s.emitToken(TokenComment, match) + } else { + s.err = &Token{TokenError, "unclosed comment", s.row, s.col} + return s.err + } + } + return s.emitSimple(TokenChar, "/") + case '~': + // Includes or Char. + return s.emitPrefixOrChar(TokenIncludes, "~=") + case '|': + // DashMatch or Char. + return s.emitPrefixOrChar(TokenDashMatch, "|=") + case '^': + // PrefixMatch or Char. + return s.emitPrefixOrChar(TokenPrefixMatch, "^=") + case '$': + // SuffixMatch or Char. + return s.emitPrefixOrChar(TokenSuffixMatch, "$=") + case '*': + // SubstringMatch or Char. + return s.emitPrefixOrChar(TokenSubstringMatch, "*=") + case '<': + // CDO or Char. + return s.emitPrefixOrChar(TokenCDO, " which includes the use of that to permit +// conditionals as per https://docs.microsoft.com/en-us/previous-versions/windows/internet-explorer/ie-developer/compatibility/ms537512(v=vs.85)?redirectedfrom=MSDN +// +// What is not permitted are CDATA XML comments, as the x/net/html package we depend +// on does not handle this fully and we are not choosing to take on that work: +// https://pkg.go.dev/golang.org/x/net/html#Tokenizer.AllowCDATA . If the x/net/html +// package changes this then these will be considered, otherwise if you AllowComments +// but provide a CDATA comment, then as per the documentation in x/net/html this will +// be treated as a plain HTML comment. +func (p *Policy) AllowComments() { + p.allowComments = true +} + +// AllowNoAttrs says that attributes on element are optional. +// +// The attribute policy is only added to the core policy when OnElements(...) +// are called. +func (p *Policy) AllowNoAttrs() *attrPolicyBuilder { + + p.init() + + abp := attrPolicyBuilder{ + p: p, + allowEmpty: true, + } + return &abp +} + +// AllowNoAttrs says that attributes on element are optional. +// +// The attribute policy is only added to the core policy when OnElements(...) +// are called. +func (abp *attrPolicyBuilder) AllowNoAttrs() *attrPolicyBuilder { + + abp.allowEmpty = true + + return abp +} + +// Matching allows a regular expression to be applied to a nascent attribute +// policy, and returns the attribute policy. +func (abp *attrPolicyBuilder) Matching(regex *regexp.Regexp) *attrPolicyBuilder { + + abp.regexp = regex + + return abp +} + +// OnElements will bind an attribute policy to a given range of HTML elements +// and return the updated policy +func (abp *attrPolicyBuilder) OnElements(elements ...string) *Policy { + + for _, element := range elements { + element = strings.ToLower(element) + + for _, attr := range abp.attrNames { + + if _, ok := abp.p.elsAndAttrs[element]; !ok { + abp.p.elsAndAttrs[element] = make(map[string][]attrPolicy) + } + + ap := attrPolicy{} + if abp.regexp != nil { + ap.regexp = abp.regexp + } + + abp.p.elsAndAttrs[element][attr] = append(abp.p.elsAndAttrs[element][attr], ap) + } + + if abp.allowEmpty { + abp.p.setOfElementsAllowedWithoutAttrs[element] = struct{}{} + + if _, ok := abp.p.elsAndAttrs[element]; !ok { + abp.p.elsAndAttrs[element] = make(map[string][]attrPolicy) + } + } + } + + return abp.p +} + +// OnElementsMatching will bind an attribute policy to all elements matching a given regex +// and return the updated policy +func (abp *attrPolicyBuilder) OnElementsMatching(regex *regexp.Regexp) *Policy { + for _, attr := range abp.attrNames { + if _, ok := abp.p.elsMatchingAndAttrs[regex]; !ok { + abp.p.elsMatchingAndAttrs[regex] = make(map[string][]attrPolicy) + } + ap := attrPolicy{} + if abp.regexp != nil { + ap.regexp = abp.regexp + } + abp.p.elsMatchingAndAttrs[regex][attr] = append(abp.p.elsMatchingAndAttrs[regex][attr], ap) + } + + if abp.allowEmpty { + abp.p.setOfElementsMatchingAllowedWithoutAttrs = append(abp.p.setOfElementsMatchingAllowedWithoutAttrs, regex) + if _, ok := abp.p.elsMatchingAndAttrs[regex]; !ok { + abp.p.elsMatchingAndAttrs[regex] = make(map[string][]attrPolicy) + } + } + + return abp.p +} + +// Globally will bind an attribute policy to all HTML elements and return the +// updated policy +func (abp *attrPolicyBuilder) Globally() *Policy { + + for _, attr := range abp.attrNames { + if _, ok := abp.p.globalAttrs[attr]; !ok { + abp.p.globalAttrs[attr] = []attrPolicy{} + } + + ap := attrPolicy{} + if abp.regexp != nil { + ap.regexp = abp.regexp + } + + abp.p.globalAttrs[attr] = append(abp.p.globalAttrs[attr], ap) + } + + return abp.p +} + +// AllowStyles takes a range of CSS property names and returns a +// style policy builder that allows you to specify the pattern and scope of +// the allowed property. +// +// The style policy is only added to the core policy when either Globally() +// or OnElements(...) are called. +func (p *Policy) AllowStyles(propertyNames ...string) *stylePolicyBuilder { + + p.init() + + abp := stylePolicyBuilder{ + p: p, + } + + for _, propertyName := range propertyNames { + abp.propertyNames = append(abp.propertyNames, strings.ToLower(propertyName)) + } + + return &abp +} + +// Matching allows a regular expression to be applied to a nascent style +// policy, and returns the style policy. +func (spb *stylePolicyBuilder) Matching(regex *regexp.Regexp) *stylePolicyBuilder { + + spb.regexp = regex + + return spb +} + +// MatchingEnum allows a list of allowed values to be applied to a nascent style +// policy, and returns the style policy. +func (spb *stylePolicyBuilder) MatchingEnum(enum ...string) *stylePolicyBuilder { + + spb.enum = enum + + return spb +} + +// MatchingHandler allows a handler to be applied to a nascent style +// policy, and returns the style policy. +func (spb *stylePolicyBuilder) MatchingHandler(handler func(string) bool) *stylePolicyBuilder { + + spb.handler = handler + + return spb +} + +// OnElements will bind a style policy to a given range of HTML elements +// and return the updated policy +func (spb *stylePolicyBuilder) OnElements(elements ...string) *Policy { + + for _, element := range elements { + element = strings.ToLower(element) + + for _, attr := range spb.propertyNames { + + if _, ok := spb.p.elsAndStyles[element]; !ok { + spb.p.elsAndStyles[element] = make(map[string][]stylePolicy) + } + + sp := stylePolicy{} + if spb.handler != nil { + sp.handler = spb.handler + } else if len(spb.enum) > 0 { + sp.enum = spb.enum + } else if spb.regexp != nil { + sp.regexp = spb.regexp + } else { + sp.handler = css.GetDefaultHandler(attr) + } + spb.p.elsAndStyles[element][attr] = append(spb.p.elsAndStyles[element][attr], sp) + } + } + + return spb.p +} + +// OnElementsMatching will bind a style policy to any HTML elements matching the pattern +// and return the updated policy +func (spb *stylePolicyBuilder) OnElementsMatching(regex *regexp.Regexp) *Policy { + + for _, attr := range spb.propertyNames { + + if _, ok := spb.p.elsMatchingAndStyles[regex]; !ok { + spb.p.elsMatchingAndStyles[regex] = make(map[string][]stylePolicy) + } + + sp := stylePolicy{} + if spb.handler != nil { + sp.handler = spb.handler + } else if len(spb.enum) > 0 { + sp.enum = spb.enum + } else if spb.regexp != nil { + sp.regexp = spb.regexp + } else { + sp.handler = css.GetDefaultHandler(attr) + } + spb.p.elsMatchingAndStyles[regex][attr] = append(spb.p.elsMatchingAndStyles[regex][attr], sp) + } + + return spb.p +} + +// Globally will bind a style policy to all HTML elements and return the +// updated policy +func (spb *stylePolicyBuilder) Globally() *Policy { + + for _, attr := range spb.propertyNames { + if _, ok := spb.p.globalStyles[attr]; !ok { + spb.p.globalStyles[attr] = []stylePolicy{} + } + + // Use only one strategy for validating styles, fallback to default + sp := stylePolicy{} + if spb.handler != nil { + sp.handler = spb.handler + } else if len(spb.enum) > 0 { + sp.enum = spb.enum + } else if spb.regexp != nil { + sp.regexp = spb.regexp + } else { + sp.handler = css.GetDefaultHandler(attr) + } + spb.p.globalStyles[attr] = append(spb.p.globalStyles[attr], sp) + } + + return spb.p +} + +// AllowElements will append HTML elements to the allowlist without applying an +// attribute policy to those elements (the elements are permitted +// sans-attributes) +func (p *Policy) AllowElements(names ...string) *Policy { + p.init() + + for _, element := range names { + element = strings.ToLower(element) + + if _, ok := p.elsAndAttrs[element]; !ok { + p.elsAndAttrs[element] = make(map[string][]attrPolicy) + } + } + + return p +} + +// AllowElementsMatching will append HTML elements to the allowlist if they +// match a regexp. +func (p *Policy) AllowElementsMatching(regex *regexp.Regexp) *Policy { + p.init() + if _, ok := p.elsMatchingAndAttrs[regex]; !ok { + p.elsMatchingAndAttrs[regex] = make(map[string][]attrPolicy) + } + return p +} + +// AllowURLSchemesMatching will append URL schemes to the allowlist if they +// match a regexp. +func (p *Policy) AllowURLSchemesMatching(r *regexp.Regexp) *Policy { + p.allowURLSchemeRegexps = append(p.allowURLSchemeRegexps, r) + return p +} + +// RewriteSrc will rewrite the src attribute of a resource downloading tag +// (e.g. , tag. +func (p *Policy) addDefaultSkipElementContent() { + p.init() + + p.setOfElementsToSkipContent["frame"] = struct{}{} + p.setOfElementsToSkipContent["frameset"] = struct{}{} + p.setOfElementsToSkipContent["iframe"] = struct{}{} + p.setOfElementsToSkipContent["noembed"] = struct{}{} + p.setOfElementsToSkipContent["noframes"] = struct{}{} + p.setOfElementsToSkipContent["noscript"] = struct{}{} + p.setOfElementsToSkipContent["nostyle"] = struct{}{} + p.setOfElementsToSkipContent["object"] = struct{}{} + p.setOfElementsToSkipContent["script"] = struct{}{} + p.setOfElementsToSkipContent["style"] = struct{}{} + p.setOfElementsToSkipContent["title"] = struct{}{} +} diff --git a/vendor/github.com/microcosm-cc/bluemonday/sanitize.go b/vendor/github.com/microcosm-cc/bluemonday/sanitize.go new file mode 100644 index 00000000..2792fb33 --- /dev/null +++ b/vendor/github.com/microcosm-cc/bluemonday/sanitize.go @@ -0,0 +1,1089 @@ +// Copyright (c) 2014, David Kitchen +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * Neither the name of the organisation (Microcosm) nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +package bluemonday + +import ( + "bytes" + "fmt" + "io" + "net/url" + "regexp" + "strconv" + "strings" + + "golang.org/x/net/html" + + "github.com/aymerick/douceur/parser" +) + +var ( + dataAttribute = regexp.MustCompile("^data-.+") + dataAttributeXMLPrefix = regexp.MustCompile("^xml.+") + dataAttributeInvalidChars = regexp.MustCompile("[A-Z;]+") + cssUnicodeChar = regexp.MustCompile(`\\[0-9a-f]{1,6} ?`) + dataURIbase64Prefix = regexp.MustCompile(`^data:[^,]*;base64,`) +) + +// Sanitize takes a string that contains a HTML fragment or document and applies +// the given policy allowlist. +// +// It returns a HTML string that has been sanitized by the policy or an empty +// string if an error has occurred (most likely as a consequence of extremely +// malformed input) +func (p *Policy) Sanitize(s string) string { + if strings.TrimSpace(s) == "" { + return s + } + + return p.sanitizeWithBuff(strings.NewReader(s)).String() +} + +// SanitizeBytes takes a []byte that contains a HTML fragment or document and applies +// the given policy allowlist. +// +// It returns a []byte containing the HTML that has been sanitized by the policy +// or an empty []byte if an error has occurred (most likely as a consequence of +// extremely malformed input) +func (p *Policy) SanitizeBytes(b []byte) []byte { + if len(bytes.TrimSpace(b)) == 0 { + return b + } + + return p.sanitizeWithBuff(bytes.NewReader(b)).Bytes() +} + +// SanitizeReader takes an io.Reader that contains a HTML fragment or document +// and applies the given policy allowlist. +// +// It returns a bytes.Buffer containing the HTML that has been sanitized by the +// policy. Errors during sanitization will merely return an empty result. +func (p *Policy) SanitizeReader(r io.Reader) *bytes.Buffer { + return p.sanitizeWithBuff(r) +} + +// SanitizeReaderToWriter takes an io.Reader that contains a HTML fragment or document +// and applies the given policy allowlist and writes to the provided writer returning +// an error if there is one. +func (p *Policy) SanitizeReaderToWriter(r io.Reader, w io.Writer) error { + return p.sanitize(r, w) +} + +// Query represents a single part of the query string, a query param +type Query struct { + Key string + Value string + HasValue bool +} + +func parseQuery(query string) (values []Query, err error) { + // This is essentially a copy of parseQuery from + // https://golang.org/src/net/url/url.go but adjusted to build our values + // based on our type, which we need to preserve the ordering of the query + // string + for query != "" { + key := query + if i := strings.IndexAny(key, "&;"); i >= 0 { + key, query = key[:i], key[i+1:] + } else { + query = "" + } + if key == "" { + continue + } + value := "" + hasValue := false + if i := strings.Index(key, "="); i >= 0 { + key, value = key[:i], key[i+1:] + hasValue = true + } + key, err1 := url.QueryUnescape(key) + if err1 != nil { + if err == nil { + err = err1 + } + continue + } + value, err1 = url.QueryUnescape(value) + if err1 != nil { + if err == nil { + err = err1 + } + continue + } + values = append(values, Query{ + Key: key, + Value: value, + HasValue: hasValue, + }) + } + return values, err +} + +func encodeQueries(queries []Query) string { + var buff bytes.Buffer + for i, query := range queries { + buff.WriteString(url.QueryEscape(query.Key)) + if query.HasValue { + buff.WriteString("=") + buff.WriteString(url.QueryEscape(query.Value)) + } + if i < len(queries)-1 { + buff.WriteString("&") + } + } + return buff.String() +} + +func sanitizedURL(val string) (string, error) { + u, err := url.Parse(val) + if err != nil { + return "", err + } + + // we use parseQuery but not u.Query to keep the order not change because + // url.Values is a map which has a random order. + queryValues, err := parseQuery(u.RawQuery) + if err != nil { + return "", err + } + // sanitize the url query params + for i, query := range queryValues { + queryValues[i].Key = html.EscapeString(query.Key) + } + u.RawQuery = encodeQueries(queryValues) + // u.String() will also sanitize host/scheme/user/pass + return u.String(), nil +} + +// Performs the actual sanitization process. +func (p *Policy) sanitizeWithBuff(r io.Reader) *bytes.Buffer { + var buff bytes.Buffer + if err := p.sanitize(r, &buff); err != nil { + return &bytes.Buffer{} + } + return &buff +} + +type asStringWriter struct { + io.Writer +} + +func (a *asStringWriter) WriteString(s string) (int, error) { + return a.Write([]byte(s)) +} + +func (p *Policy) sanitize(r io.Reader, w io.Writer) error { + // It is possible that the developer has created the policy via: + // p := bluemonday.Policy{} + // rather than: + // p := bluemonday.NewPolicy() + // If this is the case, and if they haven't yet triggered an action that + // would initialize the maps, then we need to do that. + p.init() + + buff, ok := w.(stringWriterWriter) + if !ok { + buff = &asStringWriter{w} + } + + var ( + skipElementContent bool + skippingElementsCount int64 + skipClosingTag bool + closingTagToSkipStack []string + mostRecentlyStartedToken string + ) + + tokenizer := html.NewTokenizer(r) + for { + if tokenizer.Next() == html.ErrorToken { + err := tokenizer.Err() + if err == io.EOF { + // End of input means end of processing + return nil + } + + // Raw tokenizer error + return err + } + + token := tokenizer.Token() + switch token.Type { + case html.DoctypeToken: + + // DocType is not handled as there is no safe parsing mechanism + // provided by golang.org/x/net/html for the content, and this can + // be misused to insert HTML tags that are not then sanitized + // + // One might wish to recursively sanitize here using the same policy + // but I will need to do some further testing before considering + // this. + + case html.CommentToken: + + // Comments are ignored by default + if p.allowComments { + // But if allowed then write the comment out as-is + buff.WriteString(token.String()) + } + + case html.StartTagToken: + + mostRecentlyStartedToken = normaliseElementName(token.Data) + + switch normaliseElementName(token.Data) { + case `script`: + if !p.allowUnsafe { + continue + } + case `style`: + if !p.allowUnsafe { + continue + } + } + + aps, ok := p.elsAndAttrs[token.Data] + if !ok { + aa, matched := p.matchRegex(token.Data) + if !matched { + if _, ok := p.setOfElementsToSkipContent[token.Data]; ok { + skipElementContent = true + skippingElementsCount++ + } + if p.addSpaces { + if _, err := buff.WriteString(" "); err != nil { + return err + } + } + break + } + aps = aa + } + if len(token.Attr) != 0 { + token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps) + } + + if len(token.Attr) == 0 { + if !p.allowNoAttrs(token.Data) { + skipClosingTag = true + closingTagToSkipStack = append(closingTagToSkipStack, token.Data) + if p.addSpaces { + if _, err := buff.WriteString(" "); err != nil { + return err + } + } + break + } + } + + if !skipElementContent { + if _, err := buff.WriteString(token.String()); err != nil { + return err + } + } + + case html.EndTagToken: + + if mostRecentlyStartedToken == normaliseElementName(token.Data) { + mostRecentlyStartedToken = "" + } + + switch normaliseElementName(token.Data) { + case `script`: + if !p.allowUnsafe { + continue + } + case `style`: + if !p.allowUnsafe { + continue + } + } + + if skipClosingTag && closingTagToSkipStack[len(closingTagToSkipStack)-1] == token.Data { + closingTagToSkipStack = closingTagToSkipStack[:len(closingTagToSkipStack)-1] + if len(closingTagToSkipStack) == 0 { + skipClosingTag = false + } + if p.addSpaces { + if _, err := buff.WriteString(" "); err != nil { + return err + } + } + break + } + if _, ok := p.elsAndAttrs[token.Data]; !ok { + match := false + for regex := range p.elsMatchingAndAttrs { + if regex.MatchString(token.Data) { + skipElementContent = false + match = true + break + } + } + if _, ok := p.setOfElementsToSkipContent[token.Data]; ok && !match { + skippingElementsCount-- + if skippingElementsCount == 0 { + skipElementContent = false + } + } + if !match { + if p.addSpaces { + if _, err := buff.WriteString(" "); err != nil { + return err + } + } + break + } + } + + if !skipElementContent { + if _, err := buff.WriteString(token.String()); err != nil { + return err + } + } + + case html.SelfClosingTagToken: + + switch normaliseElementName(token.Data) { + case `script`: + if !p.allowUnsafe { + continue + } + case `style`: + if !p.allowUnsafe { + continue + } + } + + aps, ok := p.elsAndAttrs[token.Data] + if !ok { + aa, matched := p.matchRegex(token.Data) + if !matched { + if p.addSpaces && !matched { + if _, err := buff.WriteString(" "); err != nil { + return err + } + } + break + } + aps = aa + } + + if len(token.Attr) != 0 { + token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps) + } + + if len(token.Attr) == 0 && !p.allowNoAttrs(token.Data) { + if p.addSpaces { + if _, err := buff.WriteString(" "); err != nil { + return err + } + } + break + } + if !skipElementContent { + if _, err := buff.WriteString(token.String()); err != nil { + return err + } + } + + case html.TextToken: + + if !skipElementContent { + switch mostRecentlyStartedToken { + case `script`: + // not encouraged, but if a policy allows JavaScript we + // should not HTML escape it as that would break the output + // + // requires p.AllowUnsafe() + if p.allowUnsafe { + if _, err := buff.WriteString(token.Data); err != nil { + return err + } + } + case "style": + // not encouraged, but if a policy allows CSS styles we + // should not HTML escape it as that would break the output + // + // requires p.AllowUnsafe() + if p.allowUnsafe { + if _, err := buff.WriteString(token.Data); err != nil { + return err + } + } + default: + // HTML escape the text + if _, err := buff.WriteString(token.String()); err != nil { + return err + } + } + } + + default: + // A token that didn't exist in the html package when we wrote this + return fmt.Errorf("unknown token: %v", token) + } + } +} + +// sanitizeAttrs takes a set of element attribute policies and the global +// attribute policies and applies them to the []html.Attribute returning a set +// of html.Attributes that match the policies +func (p *Policy) sanitizeAttrs( + elementName string, + attrs []html.Attribute, + aps map[string][]attrPolicy, +) []html.Attribute { + + if len(attrs) == 0 { + return attrs + } + + hasStylePolicies := false + sps, elementHasStylePolicies := p.elsAndStyles[elementName] + if len(p.globalStyles) > 0 || (elementHasStylePolicies && len(sps) > 0) { + hasStylePolicies = true + } + // no specific element policy found, look for a pattern match + if !hasStylePolicies { + for k, v := range p.elsMatchingAndStyles { + if k.MatchString(elementName) { + if len(v) > 0 { + hasStylePolicies = true + break + } + } + } + } + + // Builds a new attribute slice based on the whether the attribute has been + // allowed explicitly or globally. + cleanAttrs := []html.Attribute{} +attrsLoop: + for _, htmlAttr := range attrs { + if p.allowDataAttributes { + // If we see a data attribute, let it through. + if isDataAttribute(htmlAttr.Key) { + cleanAttrs = append(cleanAttrs, htmlAttr) + continue + } + } + // Is this a "style" attribute, and if so, do we need to sanitize it? + if htmlAttr.Key == "style" && hasStylePolicies { + htmlAttr = p.sanitizeStyles(htmlAttr, elementName) + if htmlAttr.Val == "" { + // We've sanitized away any and all styles; don't bother to + // output the style attribute (even if it's allowed) + continue + } else { + cleanAttrs = append(cleanAttrs, htmlAttr) + continue + } + } + + // Is there an element specific attribute policy that applies? + if apl, ok := aps[htmlAttr.Key]; ok { + for _, ap := range apl { + if ap.regexp != nil { + if ap.regexp.MatchString(htmlAttr.Val) { + cleanAttrs = append(cleanAttrs, htmlAttr) + continue attrsLoop + } + } else { + cleanAttrs = append(cleanAttrs, htmlAttr) + continue attrsLoop + } + } + } + + // Is there a global attribute policy that applies? + if apl, ok := p.globalAttrs[htmlAttr.Key]; ok { + for _, ap := range apl { + if ap.regexp != nil { + if ap.regexp.MatchString(htmlAttr.Val) { + cleanAttrs = append(cleanAttrs, htmlAttr) + } + } else { + cleanAttrs = append(cleanAttrs, htmlAttr) + } + } + } + } + + if len(cleanAttrs) == 0 { + // If nothing was allowed, let's get out of here + return cleanAttrs + } + // cleanAttrs now contains the attributes that are permitted + + if linkable(elementName) { + if p.requireParseableURLs { + // Ensure URLs are parseable: + // - a.href + // - area.href + // - link.href + // - blockquote.cite + // - q.cite + // - img.src + // - script.src + tmpAttrs := []html.Attribute{} + for _, htmlAttr := range cleanAttrs { + switch elementName { + case "a", "area", "base", "link": + if htmlAttr.Key == "href" { + if u, ok := p.validURL(htmlAttr.Val); ok { + htmlAttr.Val = u + tmpAttrs = append(tmpAttrs, htmlAttr) + } + break + } + tmpAttrs = append(tmpAttrs, htmlAttr) + case "blockquote", "del", "ins", "q": + if htmlAttr.Key == "cite" { + if u, ok := p.validURL(htmlAttr.Val); ok { + htmlAttr.Val = u + tmpAttrs = append(tmpAttrs, htmlAttr) + } + break + } + tmpAttrs = append(tmpAttrs, htmlAttr) + case "audio", "embed", "iframe", "img", "script", "source", "track", "video": + if htmlAttr.Key == "src" { + if u, ok := p.validURL(htmlAttr.Val); ok { + if p.srcRewriter != nil { + parsedURL, err := url.Parse(u) + if err != nil { + fmt.Println(err) + } + p.srcRewriter(parsedURL) + u = parsedURL.String() + } + htmlAttr.Val = u + tmpAttrs = append(tmpAttrs, htmlAttr) + } + break + } + tmpAttrs = append(tmpAttrs, htmlAttr) + default: + tmpAttrs = append(tmpAttrs, htmlAttr) + } + } + cleanAttrs = tmpAttrs + } + + if (p.requireNoFollow || + p.requireNoFollowFullyQualifiedLinks || + p.requireNoReferrer || + p.requireNoReferrerFullyQualifiedLinks || + p.addTargetBlankToFullyQualifiedLinks) && + len(cleanAttrs) > 0 { + + // Add rel="nofollow" if a "href" exists + switch elementName { + case "a", "area", "base", "link": + var hrefFound bool + var externalLink bool + for _, htmlAttr := range cleanAttrs { + if htmlAttr.Key == "href" { + hrefFound = true + + u, err := url.Parse(htmlAttr.Val) + if err != nil { + continue + } + if u.Host != "" { + externalLink = true + } + + continue + } + } + + if hrefFound { + var ( + noFollowFound bool + noReferrerFound bool + targetBlankFound bool + ) + + addNoFollow := (p.requireNoFollow || + externalLink && p.requireNoFollowFullyQualifiedLinks) + + addNoReferrer := (p.requireNoReferrer || + externalLink && p.requireNoReferrerFullyQualifiedLinks) + + addTargetBlank := (externalLink && + p.addTargetBlankToFullyQualifiedLinks) + + tmpAttrs := []html.Attribute{} + for _, htmlAttr := range cleanAttrs { + + var appended bool + if htmlAttr.Key == "rel" && (addNoFollow || addNoReferrer) { + + if addNoFollow && !strings.Contains(htmlAttr.Val, "nofollow") { + htmlAttr.Val += " nofollow" + } + if addNoReferrer && !strings.Contains(htmlAttr.Val, "noreferrer") { + htmlAttr.Val += " noreferrer" + } + noFollowFound = addNoFollow + noReferrerFound = addNoReferrer + tmpAttrs = append(tmpAttrs, htmlAttr) + appended = true + } + + if elementName == "a" && htmlAttr.Key == "target" { + if htmlAttr.Val == "_blank" { + targetBlankFound = true + } + if addTargetBlank && !targetBlankFound { + htmlAttr.Val = "_blank" + targetBlankFound = true + tmpAttrs = append(tmpAttrs, htmlAttr) + appended = true + } + } + + if !appended { + tmpAttrs = append(tmpAttrs, htmlAttr) + } + } + if noFollowFound || noReferrerFound || targetBlankFound { + cleanAttrs = tmpAttrs + } + + if (addNoFollow && !noFollowFound) || (addNoReferrer && !noReferrerFound) { + rel := html.Attribute{} + rel.Key = "rel" + if addNoFollow { + rel.Val = "nofollow" + } + if addNoReferrer { + if rel.Val != "" { + rel.Val += " " + } + rel.Val += "noreferrer" + } + cleanAttrs = append(cleanAttrs, rel) + } + + if elementName == "a" && addTargetBlank && !targetBlankFound { + rel := html.Attribute{} + rel.Key = "target" + rel.Val = "_blank" + targetBlankFound = true + cleanAttrs = append(cleanAttrs, rel) + } + + if targetBlankFound { + // target="_blank" has a security risk that allows the + // opened window/tab to issue JavaScript calls against + // window.opener, which in effect allow the destination + // of the link to control the source: + // https://dev.to/ben/the-targetblank-vulnerability-by-example + // + // To mitigate this risk, we need to add a specific rel + // attribute if it is not already present. + // rel="noopener" + // + // Unfortunately this is processing the rel twice (we + // already looked at it earlier ^^) as we cannot be sure + // of the ordering of the href and rel, and whether we + // have fully satisfied that we need to do this. This + // double processing only happens *if* target="_blank" + // is true. + var noOpenerAdded bool + tmpAttrs := []html.Attribute{} + for _, htmlAttr := range cleanAttrs { + var appended bool + if htmlAttr.Key == "rel" { + if strings.Contains(htmlAttr.Val, "noopener") { + noOpenerAdded = true + tmpAttrs = append(tmpAttrs, htmlAttr) + } else { + htmlAttr.Val += " noopener" + noOpenerAdded = true + tmpAttrs = append(tmpAttrs, htmlAttr) + } + + appended = true + } + if !appended { + tmpAttrs = append(tmpAttrs, htmlAttr) + } + } + if noOpenerAdded { + cleanAttrs = tmpAttrs + } else { + // rel attr was not found, or else noopener would + // have been added already + rel := html.Attribute{} + rel.Key = "rel" + rel.Val = "noopener" + cleanAttrs = append(cleanAttrs, rel) + } + + } + } + default: + } + } + } + + if p.requireCrossOriginAnonymous && len(cleanAttrs) > 0 { + switch elementName { + case "audio", "img", "link", "script", "video": + var crossOriginFound bool + for _, htmlAttr := range cleanAttrs { + if htmlAttr.Key == "crossorigin" { + crossOriginFound = true + htmlAttr.Val = "anonymous" + } + } + + if !crossOriginFound { + crossOrigin := html.Attribute{} + crossOrigin.Key = "crossorigin" + crossOrigin.Val = "anonymous" + cleanAttrs = append(cleanAttrs, crossOrigin) + } + } + } + + if p.requireSandboxOnIFrame != nil && elementName == "iframe" { + var sandboxFound bool + for i, htmlAttr := range cleanAttrs { + if htmlAttr.Key == "sandbox" { + sandboxFound = true + var cleanVals []string + cleanValsSet := make(map[string]bool) + for _, val := range strings.Fields(htmlAttr.Val) { + if p.requireSandboxOnIFrame[val] { + if !cleanValsSet[val] { + cleanVals = append(cleanVals, val) + cleanValsSet[val] = true + } + } + } + cleanAttrs[i].Val = strings.Join(cleanVals, " ") + } + } + + if !sandboxFound { + sandbox := html.Attribute{} + sandbox.Key = "sandbox" + sandbox.Val = "" + cleanAttrs = append(cleanAttrs, sandbox) + } + } + + return cleanAttrs +} + +func (p *Policy) sanitizeStyles(attr html.Attribute, elementName string) html.Attribute { + sps := p.elsAndStyles[elementName] + if len(sps) == 0 { + sps = map[string][]stylePolicy{} + // check for any matching elements, if we don't already have a policy found + // if multiple matches are found they will be overwritten, it's best + // to not have overlapping matchers + for regex, policies := range p.elsMatchingAndStyles { + if regex.MatchString(elementName) { + for k, v := range policies { + sps[k] = append(sps[k], v...) + } + } + } + } + + //Add semi-colon to end to fix parsing issue + attr.Val = strings.TrimRight(attr.Val, " ") + if len(attr.Val) > 0 && attr.Val[len(attr.Val)-1] != ';' { + attr.Val = attr.Val + ";" + } + decs, err := parser.ParseDeclarations(attr.Val) + if err != nil { + attr.Val = "" + return attr + } + clean := []string{} + prefixes := []string{"-webkit-", "-moz-", "-ms-", "-o-", "mso-", "-xv-", "-atsc-", "-wap-", "-khtml-", "prince-", "-ah-", "-hp-", "-ro-", "-rim-", "-tc-"} + +decLoop: + for _, dec := range decs { + tempProperty := strings.ToLower(dec.Property) + tempValue := removeUnicode(strings.ToLower(dec.Value)) + for _, i := range prefixes { + tempProperty = strings.TrimPrefix(tempProperty, i) + } + if spl, ok := sps[tempProperty]; ok { + for _, sp := range spl { + if sp.handler != nil { + if sp.handler(tempValue) { + clean = append(clean, dec.Property+": "+dec.Value) + continue decLoop + } + } else if len(sp.enum) > 0 { + if stringInSlice(tempValue, sp.enum) { + clean = append(clean, dec.Property+": "+dec.Value) + continue decLoop + } + } else if sp.regexp != nil { + if sp.regexp.MatchString(tempValue) { + clean = append(clean, dec.Property+": "+dec.Value) + continue decLoop + } + } + } + } + if spl, ok := p.globalStyles[tempProperty]; ok { + for _, sp := range spl { + if sp.handler != nil { + if sp.handler(tempValue) { + clean = append(clean, dec.Property+": "+dec.Value) + continue decLoop + } + } else if len(sp.enum) > 0 { + if stringInSlice(tempValue, sp.enum) { + clean = append(clean, dec.Property+": "+dec.Value) + continue decLoop + } + } else if sp.regexp != nil { + if sp.regexp.MatchString(tempValue) { + clean = append(clean, dec.Property+": "+dec.Value) + continue decLoop + } + } + } + } + } + if len(clean) > 0 { + attr.Val = strings.Join(clean, "; ") + } else { + attr.Val = "" + } + return attr +} + +func (p *Policy) allowNoAttrs(elementName string) bool { + _, ok := p.setOfElementsAllowedWithoutAttrs[elementName] + if !ok { + for _, r := range p.setOfElementsMatchingAllowedWithoutAttrs { + if r.MatchString(elementName) { + ok = true + break + } + } + } + return ok +} + +func (p *Policy) validURL(rawurl string) (string, bool) { + if p.requireParseableURLs { + // URLs are valid if when space is trimmed the URL is valid + rawurl = strings.TrimSpace(rawurl) + + // URLs cannot contain whitespace, unless it is a data-uri + if strings.Contains(rawurl, " ") || + strings.Contains(rawurl, "\t") || + strings.Contains(rawurl, "\n") { + if !strings.HasPrefix(rawurl, `data:`) { + return "", false + } + + // Remove \r and \n from base64 encoded data to pass url.Parse. + matched := dataURIbase64Prefix.FindString(rawurl) + if matched != "" { + rawurl = matched + strings.Replace( + strings.Replace( + rawurl[len(matched):], + "\r", + "", + -1, + ), + "\n", + "", + -1, + ) + } + } + + // URLs are valid if they parse + u, err := url.Parse(rawurl) + if err != nil { + return "", false + } + + if u.Scheme != "" { + urlPolicies, ok := p.allowURLSchemes[u.Scheme] + if !ok { + for _, r := range p.allowURLSchemeRegexps { + if r.MatchString(u.Scheme) { + return u.String(), true + } + } + + return "", false + } + + if len(urlPolicies) == 0 { + return u.String(), true + } + + for _, urlPolicy := range urlPolicies { + if urlPolicy(u) { + return u.String(), true + } + } + + return "", false + } + + if p.allowRelativeURLs { + if u.String() != "" { + return u.String(), true + } + } + + return "", false + } + + return rawurl, true +} + +func linkable(elementName string) bool { + switch elementName { + case "a", "area", "base", "link": + // elements that allow .href + return true + case "blockquote", "del", "ins", "q": + // elements that allow .cite + return true + case "audio", "embed", "iframe", "img", "input", "script", "track", "video": + // elements that allow .src + return true + default: + return false + } +} + +// stringInSlice returns true if needle exists in haystack +func stringInSlice(needle string, haystack []string) bool { + for _, straw := range haystack { + if strings.ToLower(straw) == strings.ToLower(needle) { + return true + } + } + return false +} + +func isDataAttribute(val string) bool { + if !dataAttribute.MatchString(val) { + return false + } + rest := strings.Split(val, "data-") + if len(rest) == 1 { + return false + } + // data-xml* is invalid. + if dataAttributeXMLPrefix.MatchString(rest[1]) { + return false + } + // no uppercase or semi-colons allowed. + if dataAttributeInvalidChars.MatchString(rest[1]) { + return false + } + return true +} + +func removeUnicode(value string) string { + substitutedValue := value + currentLoc := cssUnicodeChar.FindStringIndex(substitutedValue) + for currentLoc != nil { + + character := substitutedValue[currentLoc[0]+1 : currentLoc[1]] + character = strings.TrimSpace(character) + if len(character) < 4 { + character = strings.Repeat("0", 4-len(character)) + character + } else { + for len(character) > 4 { + if character[0] != '0' { + character = "" + break + } else { + character = character[1:] + } + } + } + character = "\\u" + character + translatedChar, err := strconv.Unquote(`"` + character + `"`) + translatedChar = strings.TrimSpace(translatedChar) + if err != nil { + return "" + } + substitutedValue = substitutedValue[0:currentLoc[0]] + translatedChar + substitutedValue[currentLoc[1]:] + currentLoc = cssUnicodeChar.FindStringIndex(substitutedValue) + } + return substitutedValue +} + +func (p *Policy) matchRegex(elementName string) (map[string][]attrPolicy, bool) { + aps := make(map[string][]attrPolicy, 0) + matched := false + for regex, attrs := range p.elsMatchingAndAttrs { + if regex.MatchString(elementName) { + matched = true + for k, v := range attrs { + aps[k] = append(aps[k], v...) + } + } + } + return aps, matched +} + +// normaliseElementName takes a HTML element like