diff --git a/go.mod b/go.mod index b1a10d70..12f3a560 100644 --- a/go.mod +++ b/go.mod @@ -16,6 +16,8 @@ require ( github.com/google/uuid v1.3.0 github.com/hanwen/go-fuse/v2 v2.3.0 github.com/hyponet/eventbus v1.0.0 + github.com/hyponet/webpage-packer v0.0.0-20230930052235-73553a8dce4a + github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 github.com/minio/minio-go/v7 v7.0.52 github.com/onsi/ginkgo v1.16.5 github.com/onsi/gomega v1.27.2 @@ -26,13 +28,15 @@ require ( github.com/tickstep/aliyunpan-api v0.1.6 github.com/tickstep/library-go v0.1.0 go.uber.org/zap v1.24.0 - golang.org/x/net v0.11.0 - golang.org/x/sys v0.9.0 + golang.org/x/net v0.15.0 + golang.org/x/sys v0.12.0 gopkg.in/yaml.v3 v3.0.1 gorm.io/driver/postgres v1.3.7 ) require ( + github.com/PuerkitoBio/goquery v1.8.1 // indirect + github.com/andybalholm/cascadia v1.3.2 // indirect github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.4.10 // indirect github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.13.4 // indirect github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.34 // indirect @@ -46,6 +50,7 @@ require ( github.com/aws/aws-sdk-go-v2/service/sso v1.12.12 // indirect github.com/aws/aws-sdk-go-v2/service/ssooidc v1.14.12 // indirect github.com/aws/aws-sdk-go-v2/service/sts v1.19.2 // indirect + github.com/aymerick/douceur v0.2.0 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/btcsuite/btcd v0.22.1 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect @@ -57,9 +62,13 @@ require ( github.com/go-playground/locales v0.14.1 // indirect github.com/go-playground/universal-translator v0.18.1 // indirect github.com/go-playground/validator/v10 v10.14.1 // indirect + github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect + github.com/go-shiori/go-readability v0.0.0-20230421032831-c66949dfc0ad // indirect github.com/goccy/go-json v0.10.2 // indirect + github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect github.com/golang/protobuf v1.5.3 // 
indirect github.com/google/go-cmp v0.5.9 // indirect + github.com/gorilla/css v1.0.0 // indirect github.com/jackc/chunkreader/v2 v2.0.1 // indirect github.com/jackc/pgconn v1.12.1 // indirect github.com/jackc/pgio v1.0.0 // indirect @@ -75,6 +84,7 @@ require ( github.com/mattn/go-isatty v0.0.19 // indirect github.com/mattn/go-runewidth v0.0.9 // indirect github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect + github.com/microcosm-cc/bluemonday v1.0.25 // indirect github.com/minio/md5-simd v1.1.2 // indirect github.com/minio/sha256-simd v1.0.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect @@ -89,11 +99,12 @@ require ( github.com/sirupsen/logrus v1.9.0 // indirect github.com/spf13/pflag v1.0.5 // indirect github.com/twitchyliquid64/golang-asm v0.15.1 // indirect - golang.org/x/text v0.10.0 // indirect + golang.org/x/text v0.13.0 // indirect golang.org/x/time v0.3.0 // indirect google.golang.org/protobuf v1.30.0 // indirect gopkg.in/ini.v1 v1.67.0 // indirect gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 // indirect + howett.net/plist v1.0.0 // indirect modernc.org/libc v1.22.2 // indirect modernc.org/mathutil v1.5.0 // indirect modernc.org/memory v1.5.0 // indirect @@ -115,6 +126,6 @@ require ( go.uber.org/atomic v1.10.0 // indirect go.uber.org/multierr v1.9.0 // indirect golang.org/x/arch v0.3.0 // indirect - golang.org/x/crypto v0.10.0 // indirect + golang.org/x/crypto v0.13.0 // indirect gorm.io/gorm v1.24.6 ) diff --git a/go.sum b/go.sum index aae7d343..124d4158 100644 --- a/go.sum +++ b/go.sum @@ -1,9 +1,14 @@ github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/Masterminds/semver/v3 v3.1.1 h1:hLg3sBzpNErnxhQtUy/mmLR2I9foDujNK030IGemrRc= github.com/Masterminds/semver/v3 v3.1.1/go.mod h1:VPu/7SZ7ePZ3QOrcuXROw5FAcLl4a0cBrbBpGY/8hQs= +github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM= +github.com/PuerkitoBio/goquery 
v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ= github.com/aead/siphash v1.0.1/go.mod h1:Nywa3cDsYNNK3gaciGTWPwHt0wlpNV15vwmswBAUSII= github.com/aliyun/aliyun-oss-go-sdk v2.2.7+incompatible h1:KpbJFXwhVeuxNtBJ74MCGbIoaBok2uZvkD7QXp2+Wis= github.com/aliyun/aliyun-oss-go-sdk v2.2.7+incompatible/go.mod h1:T/Aws4fEfogEE9v+HPhhw+CntffsBHJ8nXQCwKr0/g8= +github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= +github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= +github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= github.com/aws/aws-sdk-go-v2 v1.18.1 h1:+tefE750oAb7ZQGzla6bLkOwfcQCEtC5y2RqoqCeqKo= github.com/aws/aws-sdk-go-v2 v1.18.1/go.mod h1:uzbQtefpm44goOPmdKyAlXSNcwlRgF3ePWVW6EtJvvw= github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.4.10 h1:dK82zF6kkPeCo8J1e+tGx4JdvDIQzj7ygIoLg8WMuGs= @@ -40,6 +45,8 @@ github.com/aws/aws-sdk-go-v2/service/sts v1.19.2 h1:XFJ2Z6sNUUcAz9poj+245DMkrHE4 github.com/aws/aws-sdk-go-v2/service/sts v1.19.2/go.mod h1:dp0yLPsLBOi++WTxzCjA/oZqi6NPIhoR+uF7GeMU9eg= github.com/aws/smithy-go v1.13.5 h1:hgz0X/DX0dGqTYpGALqXJoRKRj5oQ7150i5FdTePzO8= github.com/aws/smithy-go v1.13.5/go.mod h1:Tg+OJXh4MB2R/uN61Ko2f6hTZwB/ZYGOtib8J3gBHzA= +github.com/aymerick/douceur v0.2.0 h1:Mv+mAeH1Q+n9Fr+oyamOlAkUNPWPlA8PPGR0QAaYuPk= +github.com/aymerick/douceur v0.2.0/go.mod h1:wlT5vV2O3h55X9m7iVYN0TBM0NH/MmbLnd30/FjWUq4= github.com/benbjohnson/clock v1.1.0 h1:Q92kusRqC1XV2MjkWETPvjJVqKetz1OzxZB7mHJLju8= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= @@ -122,6 +129,10 @@ github.com/go-playground/universal-translator v0.18.1/go.mod h1:xekY+UJKNuX9WP91 github.com/go-playground/validator/v10 v10.10.0/go.mod h1:74x4gJWsvQexRdW8Pn3dXSGrTK4nAUsbPlLADvpJkos= github.com/go-playground/validator/v10 v10.14.1 
h1:9c50NUPC30zyuKprjL3vNZ0m5oG+jU0zvx4AqHGnv4k= github.com/go-playground/validator/v10 v10.14.1/go.mod h1:9iXMNT7sEkjXb0I+enO7QXmzG6QCsPWY4zveKFVRSyU= +github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c h1:wpkoddUomPfHiOziHZixGO5ZBS73cKqVzZipfrLmO1w= +github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c/go.mod h1:oVDCh3qjJMLVUSILBRwrm+Bc6RNXGZYtoh9xdvf1ffM= +github.com/go-shiori/go-readability v0.0.0-20230421032831-c66949dfc0ad h1:3VP5Q8Mh165h2DHmXWFT4LJlwwvgTRlEuoe2vnsVnJ4= +github.com/go-shiori/go-readability v0.0.0-20230421032831-c66949dfc0ad/go.mod h1:2DpZlTJO/ycxp/vsc/C11oUyveStOgIXB88SYV1lncI= github.com/go-sql-driver/mysql v1.6.0 h1:BCTh4TKNUYmOmMUcQ3IipzF5prigylS7XXjEkfCHuOE= github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0 h1:p104kn46Q8WdvHunIJ9dAyjPVtrBPhSr3KT2yUst43I= @@ -131,6 +142,8 @@ github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU= github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I= github.com/gofrs/uuid v4.0.0+incompatible h1:1SD/1F5pU8p29ybwgQSwpQk+mwdRrXCYuPhW6m+TnJw= github.com/gofrs/uuid v4.0.0+incompatible/go.mod h1:b2aQJv3Z4Fp6yNu3cdSllBxTCLRxnplIgP/c0N/04lM= +github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f h1:3BSP1Tbs2djlpprl7wCLuiqMaUh5SJkkzI2gDs+FgLs= +github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f/go.mod h1:Pcatq5tYkCW2Q6yrR2VRHlbHpZ/R4/7qyL1TCF7vl14= github.com/golang-sql/civil v0.0.0-20190719163853-cb61b32ac6fe h1:lXe2qZdvpiX5WZkZR4hgp4KJVfY3nMkvmwbVkpv1rVY= github.com/golang-sql/sqlexp v0.0.0-20170517235910-f1bb20e5a188 h1:+eHOFJl1BaXrQxKX+T06f78590z4qA2ZzBTqahsKSE4= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= @@ -156,11 +169,15 @@ github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26 h1:Xim43kblpZXfIBQsbu github.com/google/renameio v0.1.0/go.mod 
h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/gorilla/css v1.0.0 h1:BQqNyPTi50JCFMTw/b67hByjMVXZRwGha6wxVGkeihY= +github.com/gorilla/css v1.0.0/go.mod h1:Dn721qIggHpt4+EFCcTLTU/vk5ySda2ReITrtgBl60c= github.com/hanwen/go-fuse/v2 v2.3.0 h1:t5ivNIH2PK+zw4OBul/iJjsoG9K6kXo4nMDoBpciC8A= github.com/hanwen/go-fuse/v2 v2.3.0/go.mod h1:xKwi1cF7nXAOBCXujD5ie0ZKsxc8GGSA1rlMJc+8IJs= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/hyponet/eventbus v1.0.0 h1:Cl1v4Ge1/ILn/z4nBhxu1cTny8joRPDj3pqRudlbO+w= github.com/hyponet/eventbus v1.0.0/go.mod h1:5XPvonkyxwwNSMEqnpuSh1NlW3KZKpRr9DNKkZBBuyk= +github.com/hyponet/webpage-packer v0.0.0-20230930052235-73553a8dce4a h1:h/MjGu9KXGqsIrCH5BEvvwTpMY0ZpuWJhkQi2LNPqGc= +github.com/hyponet/webpage-packer v0.0.0-20230930052235-73553a8dce4a/go.mod h1:2qcy+SgeIQHRG6grhK9oMWFk5Fnh65U5qy47ij5Sw3A= github.com/inconshreveable/mousetrap v1.0.1/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= @@ -247,6 +264,8 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/kylelemons/godebug v0.0.0-20170820004349-d65d576e9348 h1:MtvEpTB6LX3vkb4ax0b5D2DHbNAUsen0Gx5wZoq3lV4= github.com/kylelemons/godebug v0.0.0-20170820004349-d65d576e9348/go.mod h1:B69LEHPfb2qLo0BaaOLcbitczOKLWTsrBG9LczfCD4k= +github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 h1:6Yzfa6GP0rIo/kULo2bwGEkFvCePZ3qHDDTC3/J9Swo= +github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs= 
github.com/leodido/go-urn v1.2.1/go.mod h1:zt4jvISO2HfUBqxjfIshjdMTYS56ZS/qv49ictyFfxY= github.com/leodido/go-urn v1.2.4 h1:XlAE/cm/ms7TE/VMVoduSpNBoyc2dOxHs5MZSwAN63Q= github.com/leodido/go-urn v1.2.4/go.mod h1:7ZrI8mTSeBSHl/UaRyKQW1qZeMgak41ANeCNaVckg+4= @@ -268,6 +287,8 @@ github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m github.com/mattn/go-sqlite3 v1.14.15 h1:vfoHhTN1af61xCRSWzFIWzx2YskyMTwHLrExkBOjvxI= github.com/matttproud/golang_protobuf_extensions v1.0.4 h1:mmDVorXM7PCGKw94cs5zkfA9PSy5pEvNWRP0ET0TIVo= github.com/matttproud/golang_protobuf_extensions v1.0.4/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4= +github.com/microcosm-cc/bluemonday v1.0.25 h1:4NEwSfiJ+Wva0VxN5B8OwMicaJvD8r9tlJWm9rtloEg= +github.com/microcosm-cc/bluemonday v1.0.25/go.mod h1:ZIOjCQp1OrzBBPIJmfX4qDYFuhU02nx4bn030ixfHLE= github.com/minio/md5-simd v1.1.2 h1:Gdi1DZK69+ZVMoNHRXJyNcxrMA4dSxoYHZSQbirFg34= github.com/minio/md5-simd v1.1.2/go.mod h1:MzdKDxYpY2BT9XQFocsiZf/NKVtR7nkE4RoEpN+20RM= github.com/minio/minio-go/v7 v7.0.52 h1:8XhG36F6oKQUDDSuz6dY3rioMzovKjW40W6ANuN0Dps= @@ -331,6 +352,7 @@ github.com/rs/zerolog v1.15.0/go.mod h1:xYTKnLHcpfU2225ny5qZjxnj9NvkumZYjJHlAThC github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/satori/go.uuid v1.2.0 h1:0uYX9dsZ2yD7q2RtLRtPSdGDWzjeM3TbMJP9utgA0ww= github.com/satori/go.uuid v1.2.0/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0= +github.com/sergi/go-diff v1.1.0 h1:we8PVUC3FE2uYfodKH/nBHMSetSfHDR6scGdBi+erh0= github.com/shopspring/decimal v0.0.0-20180709203117-cd690d0c9e24/go.mod h1:M+9NzErvs504Cn4c5DxATwIqPbtswREoFCre64PpcG4= github.com/shopspring/decimal v1.2.0 h1:abSATXmQEYyShuxI4/vyW3tV1MrKAJzCZ/0zLUXYbsQ= github.com/shopspring/decimal v1.2.0/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o= @@ -375,6 +397,7 @@ github.com/ugorji/go/codec v1.2.7/go.mod h1:WGN1fab3R1fzQlVQTkfxVtIBhWDRqOviHU95 github.com/ugorji/go/codec v1.2.11 
h1:BMaWp1Bb6fHwEtbplGBGJ498wD+LKlNSl25MjdZY4dU= github.com/ugorji/go/codec v1.2.11/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= github.com/zenazn/goji v0.9.0/go.mod h1:7S9M489iMyHBNxwZnk9/EHS098H4/F6TATF2mIxtB1Q= go.uber.org/atomic v1.3.2/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= @@ -411,12 +434,14 @@ golang.org/x/crypto v0.0.0-20201203163018-be400aefbc4c/go.mod h1:jdWPYTVW3xRLrWP golang.org/x/crypto v0.0.0-20210616213533-5ff15b29337e/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20210711020723-a769d52b0f97/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.10.0 h1:LKqV2xt9+kDzSTfOhx4FrkEBcMrAgHSYgzywV9zcGmM= -golang.org/x/crypto v0.10.0/go.mod h1:o4eNf7Ede1fv+hwOwZsTHl9EsPFO6q6ZvYR8vYfY45I= +golang.org/x/crypto v0.13.0 h1:mvySKfSWJ+UKUii46M40LOvyWfN0s2U+46/jDd0e6Ck= +golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc= golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/net v0.0.0-20180719180050-a680a1efc54d/go.mod 
h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= @@ -426,13 +451,20 @@ golang.org/x/net v0.0.0-20190813141303-74dc4d7220e7/go.mod h1:z5CRVTTTmAJ677TzLL golang.org/x/net v0.0.0-20200520004742-59133d7f0dd7/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= -golang.org/x/net v0.11.0 h1:Gi2tvZIJyBtO9SDr1q9h5hEQCp/4L2RQ+ar0qjx2oNU= -golang.org/x/net v0.11.0/go.mod h1:2L/ixqYpgIVXmeoSA/4Lu7BzTG4KIyPIryS4IsOd1oQ= +golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= +golang.org/x/net v0.15.0 h1:ugBLEUaxABaB5AJqW9enI0ACdci2RUd4eP51NTBvuJ8= +golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 
+golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -451,26 +483,34 @@ golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210112080510-489259a85091/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210806184541-e5e7981a1069/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.9.0 
h1:KS/R3tvhPqvJvwcKfnBHJwwthS11LRhmM5D59eEXa0s= -golang.org/x/sys v0.9.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.12.0 h1:CM0HF96J0hcLAwsHPJZjfdNzs0gftsLfgKt57wWHJ0o= +golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.4/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/text v0.10.0 h1:UpjohKhiEgNc0CSauXmwYftY1+LlaC75SJwh0SgCX58= -golang.org/x/text v0.10.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k= +golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/tools 
v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= @@ -483,7 +523,9 @@ golang.org/x/tools v0.0.0-20191029190741-b9c20aec41a5/go.mod h1:b+2E5dAYhXwXZwtn golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200103221440-774c71fcf114/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20201224043029-2b0845dc783e/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= golang.org/x/tools v0.6.0 h1:BOw41kyTf3PuCW1pVQf8+Cyg8pMlkYB1oo9iJ6D/lKM= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= golang.org/x/xerrors v0.0.0-20190410155217-1f06c39b4373/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20190513163551-3ee3066db522/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -512,6 +554,7 @@ gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= +gopkg.in/yaml.v1 v1.0.0-20140924161607-9f9df34309c0/go.mod h1:WDnlLJ4WF5VGsH/HVa3CI79GS0ol3YnhVnKP89i0kNg= gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= @@ -531,6 +574,8 @@ gorm.io/gorm v1.23.4/go.mod h1:l2lP/RyAtc1ynaTjFksBde/O8v9oOGIApu2/xRitmZk= gorm.io/gorm v1.24.6 h1:wy98aq9oFEetsc4CAbKD2SoBCdMzsbSIvSUUFJuHi5s= gorm.io/gorm 
v1.24.6/go.mod h1:L4uxeKpfBml98NYqVqwAdmV1a2nBtAec/cf3fpucW/k= honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg= +howett.net/plist v1.0.0 h1:7CrbWYbPPO/PyNy38b2EB/+gYbjCe2DXBxgtOOZbSQM= +howett.net/plist v1.0.0/go.mod h1:lqaXoTrLY4hg8tnEzNru53gicrbv7rrk+2xJA/7hw9g= modernc.org/libc v1.22.2 h1:4U7v51GyhlWqQmwCHj28Rdq2Yzwk55ovjFrdPjs8Hb0= modernc.org/libc v1.22.2/go.mod h1:uvQavJ1pZ0hIoC/jfqNoMLURIMhKzINIWypNM17puug= modernc.org/mathutil v1.5.0 h1:rV0Ko/6SfM+8G+yKiyI830l3Wuz1zRutdslNoQ0kfiQ= diff --git a/pkg/dentry/group.go b/pkg/dentry/group.go index 4e322d72..e67e9712 100644 --- a/pkg/dentry/group.go +++ b/pkg/dentry/group.go @@ -21,7 +21,7 @@ import ( "fmt" "github.com/basenana/nanafs/pkg/metastore" "github.com/basenana/nanafs/pkg/plugin" - "github.com/basenana/nanafs/pkg/plugin/stub" + "github.com/basenana/nanafs/pkg/plugin/pluginapi" "github.com/basenana/nanafs/pkg/types" "github.com/basenana/nanafs/utils/logger" "path" @@ -243,7 +243,7 @@ func (e *extGroup) FindEntry(ctx context.Context, name string) (*types.Metadata, } func (e *extGroup) CreateEntry(ctx context.Context, attr EntryAttr) (*types.Metadata, error) { - mirrorEn, err := e.mirror.CreateEntry(ctx, stub.EntryAttr{ + mirrorEn, err := e.mirror.CreateEntry(ctx, pluginapi.EntryAttr{ Name: attr.Name, Kind: attr.Kind, }) @@ -325,7 +325,7 @@ func (e *extGroup) ListChildren(ctx context.Context) ([]*types.Metadata, error) } recordChildMap := make(map[string]*types.Metadata) - actualChildMap := make(map[string]*stub.Entry) + actualChildMap := make(map[string]*pluginapi.Entry) for i := range recordChild { recordChildMap[recordChild[i].Name] = recordChild[i] } @@ -355,7 +355,7 @@ func (e *extGroup) ListChildren(ctx context.Context) ([]*types.Metadata, error) return result, nil } -func (e *extGroup) syncEntry(ctx context.Context, mirrored *stub.Entry, crt *types.Metadata) (en *types.Metadata, err error) { +func (e *extGroup) syncEntry(ctx context.Context, mirrored 
*pluginapi.Entry, crt *types.Metadata) (en *types.Metadata, err error) { grp, err := e.stdGroup.cacheStore.getEntry(ctx, e.stdGroup.entryID) if err != nil { return nil, err diff --git a/pkg/dentry/group_test.go b/pkg/dentry/group_test.go index 44aa98ca..f9bf7474 100644 --- a/pkg/dentry/group_test.go +++ b/pkg/dentry/group_test.go @@ -19,7 +19,7 @@ package dentry import ( "context" "github.com/basenana/nanafs/pkg/plugin" - "github.com/basenana/nanafs/pkg/plugin/stub" + "github.com/basenana/nanafs/pkg/plugin/pluginapi" "github.com/basenana/nanafs/pkg/types" . "github.com/onsi/ginkgo" . "github.com/onsi/gomega" @@ -314,7 +314,7 @@ var _ = Describe("TestExtGroupEntry", func() { Expect(len(need)).Should(Equal(0)) }) It("insert sync_file1.yaml to memfs should be succeed", func() { - _, err = memFS.CreateEntry(context.TODO(), stub.EntryAttr{ + _, err = memFS.CreateEntry(context.TODO(), pluginapi.EntryAttr{ Name: "sync_file1.yaml", Kind: types.RawKind, }) @@ -335,7 +335,7 @@ var _ = Describe("TestExtGroupEntry", func() { Expect(len(need)).Should(Equal(0)) }) It("insert sync_file2.yaml to memfs should be succeed", func() { - _, err = memFS.CreateEntry(context.TODO(), stub.EntryAttr{ + _, err = memFS.CreateEntry(context.TODO(), pluginapi.EntryAttr{ Name: "sync_file2.yaml", Kind: types.RawKind, }) @@ -355,7 +355,7 @@ var _ = Describe("TestExtGroupEntry", func() { Expect(len(need)).Should(Equal(0)) }) It("delete sync_file2.yaml should be succeed", func() { - err = memFS.RemoveEntry(context.TODO(), &stub.Entry{ + err = memFS.RemoveEntry(context.TODO(), &pluginapi.Entry{ Name: "sync_file2.yaml", Kind: types.RawKind, }) diff --git a/pkg/plugin/buildin/docloader/csv.go b/pkg/plugin/buildin/docloader/csv.go new file mode 100644 index 00000000..b0297d50 --- /dev/null +++ b/pkg/plugin/buildin/docloader/csv.go @@ -0,0 +1,80 @@ +/* + Copyright 2023 NanaFS Authors. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package docloader + +import ( + "context" + "encoding/csv" + "errors" + "fmt" + "github.com/basenana/nanafs/pkg/types" + "io" + "os" + "strings" +) + +const ( + csvLoader = "csv" +) + +type CSV struct { + docPath string +} + +func NewCSV(docPath string, option map[string]string) CSV { + return CSV{docPath: docPath} +} + +func (c CSV) Load(_ context.Context) (result []types.FDocument, err error) { + f, err := os.Open(c.docPath) + if err != nil { + return nil, err + } + defer f.Close() + + var header []string + var rown int + + rd := csv.NewReader(f) + for { + row, err := rd.Read() + if errors.Is(err, io.EOF) { + break + } + if err != nil { + return nil, err + } + if len(header) == 0 { + header = append(header, row...) + continue + } + + var content []string + for i, value := range row { + line := fmt.Sprintf("%s: %s", header[i], value) + content = append(content, line) + } + + rown++ + result = append(result, types.FDocument{ + Content: strings.Join(content, "\n"), + Metadata: map[string]any{"type": csvLoader, "row": rown}, + }) + } + + return +} diff --git a/pkg/plugin/buildin/docloader/docloader.go b/pkg/plugin/buildin/docloader/docloader.go new file mode 100644 index 00000000..f8403e7a --- /dev/null +++ b/pkg/plugin/buildin/docloader/docloader.go @@ -0,0 +1,102 @@ +/* + Copyright 2023 NanaFS Authors. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package docloader + +import ( + "context" + "fmt" + "github.com/basenana/nanafs/pkg/plugin/pluginapi" + "github.com/basenana/nanafs/pkg/types" + "os" + "path/filepath" +) + +const ( + PluginName = "docloader" + PluginVersion = "1.0" +) + +type DocLoader struct{} + +func (d DocLoader) Name() string { + return PluginName +} + +func (d DocLoader) Type() types.PluginType { + return types.TypeProcess +} + +func (d DocLoader) Version() string { + return PluginVersion +} + +func (d DocLoader) Run(ctx context.Context, request *pluginapi.Request, pluginParams map[string]string) (*pluginapi.Response, error) { + entryPath := request.Parameter[pluginapi.ResEntryPathKey].(string) + if entryPath == "" { + resp := pluginapi.NewFailedResponse("entry_path is empty") + return resp, nil + } + + _, err := os.Stat(entryPath) + if err != nil { + resp := pluginapi.NewFailedResponse(fmt.Sprintf("stat entry file %s failed: %s", entryPath, err)) + return resp, nil + } + + fileExt := filepath.Ext(entryPath) + var ( + p Parser + parseOption = map[string]string{} + ) + + switch fileExt { + case ".pdf": + p = buildInLoaders[pdfParser](entryPath, parseOption) + case ".txt": + p = buildInLoaders[textParser](entryPath, parseOption) + case ".html", ".htm": + p = buildInLoaders[htmlParser](entryPath, parseOption) + case ".webarchive": + p = buildInLoaders[webArchiveParser](entryPath, parseOption) + default: + resp := 
pluginapi.NewFailedResponse(fmt.Sprintf("load %s file unsupported", fileExt)) + return resp, nil + } + + documents, err := p.Load(ctx) + if err != nil { + resp := pluginapi.NewFailedResponse(fmt.Sprintf("load file %s failed: %s", entryPath, err)) + return resp, nil + } + + return pluginapi.NewResponseWithResult(map[string]any{pluginapi.ResEntryDocumentsKey: documents}), nil +} + +type Parser interface { + Load(ctx context.Context) (result []types.FDocument, err error) +} + +type parserBuilder func(docPath string, docOption map[string]string) Parser + +var ( + buildInLoaders = map[string]parserBuilder{ + textParser: NewText, + pdfParser: NewPDF, + htmlParser: NewHTML, + webArchiveParser: NewHTML, + } +) diff --git a/pkg/plugin/buildin/docloader/epub.go b/pkg/plugin/buildin/docloader/epub.go new file mode 100644 index 00000000..267f1662 --- /dev/null +++ b/pkg/plugin/buildin/docloader/epub.go @@ -0,0 +1,17 @@ +/* + Copyright 2023 NanaFS Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package docloader diff --git a/pkg/plugin/buildin/docloader/html.go b/pkg/plugin/buildin/docloader/html.go new file mode 100644 index 00000000..e50812c2 --- /dev/null +++ b/pkg/plugin/buildin/docloader/html.go @@ -0,0 +1,66 @@ +/* + Copyright 2023 NanaFS Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package docloader + +import ( + "context" + "github.com/basenana/nanafs/pkg/types" + "github.com/hyponet/webpage-packer/packer" + "strings" +) + +const ( + htmlParser = "html" + webArchiveParser = "webarchive" +) + +type HTML struct { + docPath string +} + +func NewHTML(docPath string, option map[string]string) Parser { + return HTML{docPath: docPath} +} + +func (h HTML) Load(ctx context.Context) (result []types.FDocument, err error) { + var ( + p packer.Packer + docType = "html" + ) + switch { + case strings.HasSuffix(h.docPath, ".webarchive"): + p = packer.NewWebArchivePacker() + docType = "webarchive" + + case strings.HasSuffix(h.docPath, ".html") || + strings.HasSuffix(h.docPath, ".htm"): + p = packer.NewHtmlPacker() + } + + content, err := p.ReadContent(ctx, packer.Option{ + FilePath: h.docPath, + ClutterFree: true, + }) + if err != nil { + return nil, err + } + + return []types.FDocument{{ + Content: content, + Metadata: map[string]any{"type": docType}, + }}, nil +} diff --git a/pkg/plugin/buildin/docloader/pdf.go b/pkg/plugin/buildin/docloader/pdf.go new file mode 100644 index 00000000..448ae547 --- /dev/null +++ b/pkg/plugin/buildin/docloader/pdf.go @@ -0,0 +1,107 @@ +/* + Copyright 2023 NanaFS Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package docloader + +import ( + "context" + "github.com/basenana/nanafs/pkg/types" + "github.com/ledongthuc/pdf" + "os" +) + +const ( + pdfParser = "pdf" +) + +type PDF struct { + docPath string + password string +} + +func NewPDF(docPath string, option map[string]string) Parser { + return newPDFWithPassword(docPath, option["password"]) +} + +func newPDFWithPassword(docPath, pass string) Parser { + return &PDF{docPath: docPath, password: pass} +} + +func (p *PDF) Load(_ context.Context) ([]types.FDocument, error) { + fInfo, err := os.Stat(p.docPath) + if err != nil { + return nil, err + } + + f, err := os.Open(p.docPath) + if err != nil { + return nil, err + } + defer f.Close() + + var reader *pdf.Reader + if p.password != "" { + reader, err = pdf.NewReaderEncrypted(f, fInfo.Size(), p.getAndCleanPassword) + if err != nil { + return nil, err + } + } else { + reader, err = pdf.NewReader(f, fInfo.Size()) + if err != nil { + return nil, err + } + } + + var ( + numPages = reader.NumPage() + result = make([]types.FDocument, 0) + ) + + fonts := make(map[string]*pdf.Font) + for i := 1; i < numPages+1; i++ { + page := reader.Page(i) + for _, name := range page.Fonts() { + if _, ok := fonts[name]; !ok { + f := page.Font(name) + fonts[name] = &f + } + } + text, err := page.GetPlainText(fonts) + if err != nil { + return nil, err + } + + result = append(result, types.FDocument{ + Content: text, + Metadata: map[string]any{ + "type": "pdf", + "page": i, + "total_pages": numPages, + }, + }) + } + + return result, nil +} + +func (p *PDF) getAndCleanPassword() string { + pass 
:= p.password + if pass != "" { + // set password empty to stop retry + p.password = "" + } + return pass +} diff --git a/pkg/plugin/buildin/docloader/plaintext.go b/pkg/plugin/buildin/docloader/plaintext.go new file mode 100644 index 00000000..a66d68c4 --- /dev/null +++ b/pkg/plugin/buildin/docloader/plaintext.go @@ -0,0 +1,56 @@ +/* + Copyright 2023 NanaFS Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package docloader + +import ( + "bytes" + "context" + "github.com/basenana/nanafs/pkg/types" + "io" + "os" +) + +const ( + textParser = "text" +) + +type Text struct { + docPath string +} + +func NewText(docPath string, option map[string]string) Parser { return Text{docPath: docPath} } + +func (l Text) Load(_ context.Context) ([]types.FDocument, error) { + f, err := os.Open(l.docPath) + if err != nil { + return nil, err + } + defer f.Close() + + buf := new(bytes.Buffer) + _, err = io.Copy(buf, f) + if err != nil { + return nil, err + } + + return []types.FDocument{ + { + Content: buf.String(), + Metadata: map[string]any{}, + }, + }, nil +} diff --git a/pkg/plugin/buildin/rss.go b/pkg/plugin/buildin/rss.go index 3cd9d957..130329ab 100644 --- a/pkg/plugin/buildin/rss.go +++ b/pkg/plugin/buildin/rss.go @@ -19,7 +19,7 @@ package buildin import ( "context" "github.com/basenana/nanafs/pkg/metastore" - "github.com/basenana/nanafs/pkg/plugin/stub" + "github.com/basenana/nanafs/pkg/plugin/pluginapi" "github.com/basenana/nanafs/pkg/types" "go.uber.org/zap" ) @@ -76,7 +76,7 
@@ func (r *RssSourcePlugin) listRssSources(ctx context.Context) ([]rssSource, erro return result, nil } -func (r *RssSourcePlugin) Run(ctx context.Context, request *stub.Request, params map[string]string) (*stub.Response, error) { +func (r *RssSourcePlugin) Run(ctx context.Context, request *pluginapi.Request, pluginParams map[string]string) (*pluginapi.Response, error) { rssSourceList, err := r.listRssSources(ctx) if err != nil { r.logger.Errorw("list rss source failed", "err", err) @@ -85,10 +85,10 @@ func (r *RssSourcePlugin) Run(ctx context.Context, request *stub.Request, params for i := range rssSourceList { source := rssSourceList[i] - r.syncRssSource(ctx, source, params) + r.syncRssSource(ctx, source, pluginParams) } - resp := &stub.Response{ + resp := &pluginapi.Response{ IsSucceed: true, } return resp, nil diff --git a/pkg/plugin/mirror.go b/pkg/plugin/mirror.go index c34fb143..3e22674b 100644 --- a/pkg/plugin/mirror.go +++ b/pkg/plugin/mirror.go @@ -19,7 +19,7 @@ package plugin import ( "context" "fmt" - "github.com/basenana/nanafs/pkg/plugin/stub" + "github.com/basenana/nanafs/pkg/plugin/pluginapi" "github.com/basenana/nanafs/pkg/types" "github.com/basenana/nanafs/utils" "io" @@ -31,11 +31,11 @@ type MirrorPlugin interface { Plugin IsGroup(ctx context.Context) (bool, error) - FindEntry(ctx context.Context, name string) (*stub.Entry, error) - CreateEntry(ctx context.Context, attr stub.EntryAttr) (*stub.Entry, error) - UpdateEntry(ctx context.Context, en *stub.Entry) error - RemoveEntry(ctx context.Context, en *stub.Entry) error - ListChildren(ctx context.Context) ([]*stub.Entry, error) + FindEntry(ctx context.Context, name string) (*pluginapi.Entry, error) + CreateEntry(ctx context.Context, attr pluginapi.EntryAttr) (*pluginapi.Entry, error) + UpdateEntry(ctx context.Context, en *pluginapi.Entry) error + RemoveEntry(ctx context.Context, en *pluginapi.Entry) error + ListChildren(ctx context.Context) ([]*pluginapi.Entry, error) WriteAt(ctx context.Context, 
data []byte, off int64) (int64, error) ReadAt(ctx context.Context, dest []byte, off int64) (int64, error) @@ -104,23 +104,23 @@ func (d *MemFSPlugin) IsGroup(ctx context.Context) (bool, error) { return en.IsGroup, nil } -func (d *MemFSPlugin) FindEntry(ctx context.Context, name string) (*stub.Entry, error) { +func (d *MemFSPlugin) FindEntry(ctx context.Context, name string) (*pluginapi.Entry, error) { return d.fs.FindEntry(d.path, name) } -func (d *MemFSPlugin) CreateEntry(ctx context.Context, attr stub.EntryAttr) (*stub.Entry, error) { +func (d *MemFSPlugin) CreateEntry(ctx context.Context, attr pluginapi.EntryAttr) (*pluginapi.Entry, error) { return d.fs.CreateEntry(d.path, attr) } -func (d *MemFSPlugin) UpdateEntry(ctx context.Context, en *stub.Entry) error { +func (d *MemFSPlugin) UpdateEntry(ctx context.Context, en *pluginapi.Entry) error { return d.fs.UpdateEntry(d.path, en) } -func (d *MemFSPlugin) RemoveEntry(ctx context.Context, en *stub.Entry) error { +func (d *MemFSPlugin) RemoveEntry(ctx context.Context, en *pluginapi.Entry) error { return d.fs.RemoveEntry(d.path, en) } -func (d *MemFSPlugin) ListChildren(ctx context.Context) ([]*stub.Entry, error) { +func (d *MemFSPlugin) ListChildren(ctx context.Context) ([]*pluginapi.Entry, error) { return d.fs.ListChildren(d.path) } @@ -145,13 +145,13 @@ func (d *MemFSPlugin) Close(ctx context.Context) error { } type MemFS struct { - entries map[string]*stub.Entry + entries map[string]*pluginapi.Entry files map[string]*memFile groups map[string][]string mux sync.Mutex } -func (m *MemFS) GetEntry(enPath string) (*stub.Entry, error) { +func (m *MemFS) GetEntry(enPath string) (*pluginapi.Entry, error) { m.mux.Lock() defer m.mux.Unlock() @@ -162,7 +162,7 @@ func (m *MemFS) GetEntry(enPath string) (*stub.Entry, error) { return en, nil } -func (m *MemFS) FindEntry(parentPath string, name string) (*stub.Entry, error) { +func (m *MemFS) FindEntry(parentPath string, name string) (*pluginapi.Entry, error) { m.mux.Lock() defer 
m.mux.Unlock() @@ -173,7 +173,7 @@ func (m *MemFS) FindEntry(parentPath string, name string) (*stub.Entry, error) { return en, nil } -func (m *MemFS) CreateEntry(parentPath string, attr stub.EntryAttr) (*stub.Entry, error) { +func (m *MemFS) CreateEntry(parentPath string, attr pluginapi.EntryAttr) (*pluginapi.Entry, error) { m.mux.Lock() defer m.mux.Unlock() @@ -195,7 +195,7 @@ func (m *MemFS) CreateEntry(parentPath string, attr stub.EntryAttr) (*stub.Entry child = append(child, attr.Name) m.groups[parentPath] = child - en := &stub.Entry{ + en := &pluginapi.Entry{ Name: attr.Name, Kind: attr.Kind, IsGroup: types.IsGroup(attr.Kind), @@ -210,7 +210,7 @@ func (m *MemFS) CreateEntry(parentPath string, attr stub.EntryAttr) (*stub.Entry return en, nil } -func (m *MemFS) UpdateEntry(parentPath string, en *stub.Entry) error { +func (m *MemFS) UpdateEntry(parentPath string, en *pluginapi.Entry) error { m.mux.Lock() defer m.mux.Unlock() @@ -225,7 +225,7 @@ func (m *MemFS) UpdateEntry(parentPath string, en *stub.Entry) error { return nil } -func (m *MemFS) RemoveEntry(parentPath string, en *stub.Entry) error { +func (m *MemFS) RemoveEntry(parentPath string, en *pluginapi.Entry) error { m.mux.Lock() defer m.mux.Unlock() @@ -262,7 +262,7 @@ func (m *MemFS) RemoveEntry(parentPath string, en *stub.Entry) error { return nil } -func (m *MemFS) ListChildren(enPath string) ([]*stub.Entry, error) { +func (m *MemFS) ListChildren(enPath string) ([]*pluginapi.Entry, error) { m.mux.Lock() defer m.mux.Unlock() @@ -276,7 +276,7 @@ func (m *MemFS) ListChildren(enPath string) ([]*stub.Entry, error) { } childNames := m.groups[enPath] - result := make([]*stub.Entry, len(childNames)) + result := make([]*pluginapi.Entry, len(childNames)) for i, chName := range childNames { result[i] = m.entries[path.Join(enPath, chName)] } @@ -322,7 +322,7 @@ func (m *MemFS) Trunc(filePath string) error { func NewMemFS() *MemFS { fs := &MemFS{ - entries: map[string]*stub.Entry{"/": {Name: ".", Kind: 
types.ExternalGroupKind, Size: 0, IsGroup: true}}, + entries: map[string]*pluginapi.Entry{"/": {Name: ".", Kind: types.ExternalGroupKind, Size: 0, IsGroup: true}}, groups: map[string][]string{"/": {}}, files: map[string]*memFile{}, } @@ -334,11 +334,11 @@ const ( ) type memFile struct { - *stub.Entry + *pluginapi.Entry data []byte } -func newMemFile(entry *stub.Entry) *memFile { +func newMemFile(entry *pluginapi.Entry) *memFile { return &memFile{ Entry: entry, data: utils.NewMemoryBlock(memFileMaxSize / 16), // 1M diff --git a/pkg/plugin/pluginapi/consts.go b/pkg/plugin/pluginapi/consts.go new file mode 100644 index 00000000..4b94c6ad --- /dev/null +++ b/pkg/plugin/pluginapi/consts.go @@ -0,0 +1,28 @@ +/* + Copyright 2023 NanaFS Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package pluginapi + +const ( + ResEntryIdKey = "nanafs.workflow.entry_id" + ResEntryPathKey = "nanafs.workflow.entry_path" + ResEntryDocumentsKey = "nanafs.workflow.entry_documents" + ResCollectManifest = "nanafs.workflow.collect_manifest" + ResPluginName = "nanafs.workflow.plugin_name" + ResPluginVersion = "nanafs.workflow.plugin_version" + ResPluginType = "nanafs.workflow.plugin_type" + ResPluginAction = "nanafs.workflow.plugin_action" +) diff --git a/pkg/plugin/stub/entry.go b/pkg/plugin/pluginapi/entry.go similarity index 97% rename from pkg/plugin/stub/entry.go rename to pkg/plugin/pluginapi/entry.go index 1adc5bee..709d11bb 100644 --- a/pkg/plugin/stub/entry.go +++ b/pkg/plugin/pluginapi/entry.go @@ -14,7 +14,7 @@ limitations under the License. */ -package stub +package pluginapi import ( "github.com/basenana/nanafs/pkg/types" diff --git a/pkg/plugin/stub/process.go b/pkg/plugin/pluginapi/process.go similarity index 74% rename from pkg/plugin/stub/process.go rename to pkg/plugin/pluginapi/process.go index 4649d2da..70ac2f4b 100644 --- a/pkg/plugin/stub/process.go +++ b/pkg/plugin/pluginapi/process.go @@ -14,14 +14,14 @@ limitations under the License. 
*/ -package stub +package pluginapi type Request struct { Action string WorkPath string EntryId int64 EntryPath string - Parameter map[string]string + Parameter map[string]any } func NewRequest() *Request { @@ -31,9 +31,17 @@ func NewRequest() *Request { type Response struct { IsSucceed bool Message string - Entries []Entry + Results map[string]any } func NewResponse() *Response { return &Response{} } + +func NewFailedResponse(msg string) *Response { + return &Response{IsSucceed: false, Message: msg} +} + +func NewResponseWithResult(result map[string]any) *Response { + return &Response{IsSucceed: true, Results: result} +} diff --git a/pkg/plugin/process.go b/pkg/plugin/process.go index 25c4290f..d99e6e69 100644 --- a/pkg/plugin/process.go +++ b/pkg/plugin/process.go @@ -19,17 +19,24 @@ package plugin import ( "context" "fmt" - "github.com/basenana/nanafs/pkg/plugin/stub" + "github.com/basenana/nanafs/pkg/plugin/buildin/docloader" + "github.com/basenana/nanafs/pkg/plugin/pluginapi" "github.com/basenana/nanafs/pkg/types" + "github.com/basenana/nanafs/utils" "time" ) type ProcessPlugin interface { Plugin - Run(ctx context.Context, request *stub.Request, params map[string]string) (*stub.Response, error) + Run(ctx context.Context, request *pluginapi.Request, pluginParams map[string]string) (*pluginapi.Response, error) } -func Call(ctx context.Context, ps types.PlugScope, req *stub.Request) (resp *stub.Response, err error) { +func Call(ctx context.Context, ps types.PlugScope, req *pluginapi.Request) (resp *pluginapi.Response, err error) { + defer func() { + if rErr := utils.Recover(); rErr != nil { + err = rErr + } + }() var plugin Plugin plugin, err = BuildPlugin(ctx, ps) if err != nil { @@ -64,7 +71,7 @@ func (d *DelayProcessPlugin) Version() string { return delayPluginVersion } -func (d *DelayProcessPlugin) Run(ctx context.Context, request *stub.Request, params map[string]string) (*stub.Response, error) { +func (d *DelayProcessPlugin) Run(ctx context.Context, request 
*pluginapi.Request, pluginParams map[string]string) (*pluginapi.Response, error) { var ( until time.Time nowTime = time.Now() @@ -72,7 +79,7 @@ func (d *DelayProcessPlugin) Run(ctx context.Context, request *stub.Request, par switch request.Action { case "delay": - delayDurationStr := params["delay"] + delayDurationStr := pluginParams["delay"] duration, err := time.ParseDuration(delayDurationStr) if err != nil { return nil, fmt.Errorf("parse delay duration [%s] failed: %s", delayDurationStr, err) @@ -81,14 +88,14 @@ func (d *DelayProcessPlugin) Run(ctx context.Context, request *stub.Request, par case "until": var err error - untilStr := params["until"] + untilStr := pluginParams["until"] until, err = time.Parse(untilStr, time.RFC3339) if err != nil { return nil, fmt.Errorf("parse delay until [%s] failed: %s", untilStr, err) } default: - resp := stub.NewResponse() + resp := pluginapi.NewResponse() resp.Message = fmt.Sprintf("unknown action: %s", request.Action) return resp, nil } @@ -98,16 +105,16 @@ func (d *DelayProcessPlugin) Run(ctx context.Context, request *stub.Request, par defer timer.Stop() select { case <-timer.C: - return &stub.Response{IsSucceed: true}, nil + return &pluginapi.Response{IsSucceed: true}, nil case <-ctx.Done(): - return &stub.Response{IsSucceed: false, Message: ctx.Err().Error()}, nil + return &pluginapi.Response{IsSucceed: false, Message: ctx.Err().Error()}, nil } } - return &stub.Response{IsSucceed: true}, nil + return &pluginapi.Response{IsSucceed: true}, nil } -func registerDelayPlugin(r *registry) { +func registerBuildInProcessPlugin(r *registry) { r.Register( delayPluginName, types.PluginSpec{Name: delayPluginName, Version: delayPluginVersion, Type: types.TypeProcess, Parameters: map[string]string{}}, @@ -115,4 +122,12 @@ func registerDelayPlugin(r *registry) { return &DelayProcessPlugin{}, nil }, ) + + r.Register( + docloader.PluginName, + types.PluginSpec{Name: docloader.PluginName, Version: docloader.PluginVersion, Type: 
types.TypeProcess, Parameters: map[string]string{}}, + func(ctx context.Context, spec types.PluginSpec, scope types.PlugScope) (Plugin, error) { + return &docloader.DocLoader{}, nil + }, + ) } diff --git a/pkg/plugin/registry.go b/pkg/plugin/registry.go index d4945a1d..d26b93d6 100644 --- a/pkg/plugin/registry.go +++ b/pkg/plugin/registry.go @@ -70,7 +70,7 @@ func Init(cfg *config.Plugin, recorderGetter metastore.PluginRecorderGetter) err } // register build-in plugins - registerDelayPlugin(r) + registerBuildInProcessPlugin(r) registerMemfsPlugin(r) register3BodyPlugin(r) diff --git a/pkg/plugin/source.go b/pkg/plugin/source.go index 8cc327c4..f1565e8a 100644 --- a/pkg/plugin/source.go +++ b/pkg/plugin/source.go @@ -20,7 +20,7 @@ import ( "bytes" "context" "fmt" - "github.com/basenana/nanafs/pkg/plugin/stub" + "github.com/basenana/nanafs/pkg/plugin/pluginapi" "github.com/basenana/nanafs/pkg/types" "io" "io/ioutil" @@ -31,8 +31,8 @@ import ( type SourcePlugin interface { Plugin - Fresh(ctx context.Context, opt stub.FreshOption) ([]*stub.Entry, error) - Open(ctx context.Context, entry *stub.Entry) (io.ReadCloser, error) + Fresh(ctx context.Context, opt pluginapi.FreshOption) ([]*pluginapi.Entry, error) + Open(ctx context.Context, entry *pluginapi.Entry) (io.ReadCloser, error) } const ( @@ -56,14 +56,14 @@ func (d *ThreeBodyPlugin) Version() string { return the3BodyPluginVersion } -func (d *ThreeBodyPlugin) Fresh(ctx context.Context, opt stub.FreshOption) ([]*stub.Entry, error) { +func (d *ThreeBodyPlugin) Fresh(ctx context.Context, opt pluginapi.FreshOption) ([]*pluginapi.Entry, error) { crtAt := time.Now().Unix() - result := make([]*stub.Entry, 0) + result := make([]*pluginapi.Entry, 0) for i := crtAt - 60; i < crtAt; i += 60 { if i <= opt.LastFreshAt.Unix() { continue } - result = append(result, &stub.Entry{ + result = append(result, &pluginapi.Entry{ Name: fmt.Sprintf("3_body_%d.txt", i), Kind: types.RawKind, IsGroup: false, @@ -72,7 +72,7 @@ func (d 
*ThreeBodyPlugin) Fresh(ctx context.Context, opt stub.FreshOption) ([]*s return result, nil } -func (d *ThreeBodyPlugin) Open(ctx context.Context, entry *stub.Entry) (io.ReadCloser, error) { +func (d *ThreeBodyPlugin) Open(ctx context.Context, entry *pluginapi.Entry) (io.ReadCloser, error) { fileNameParts := strings.Split(entry.Name, "_") sendAtStr := fileNameParts[len(fileNameParts)-1] diff --git a/pkg/types/friday.go b/pkg/types/friday.go new file mode 100644 index 00000000..7094a55f --- /dev/null +++ b/pkg/types/friday.go @@ -0,0 +1,22 @@ +/* + Copyright 2023 NanaFS Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package types + +type FDocument struct { + Content string + Metadata map[string]any +} diff --git a/pkg/workflow/exec/consts.go b/pkg/workflow/exec/consts.go index 414b2ae6..2b01ad0f 100644 --- a/pkg/workflow/exec/consts.go +++ b/pkg/workflow/exec/consts.go @@ -21,12 +21,3 @@ const ( OpEntryCollect = "entryCollect" OpPluginCall = "pluginCall" ) - -const ( - paramEntryIdKey = "nanafs.workflow.entry_id" - paramEntryPathKey = "nanafs.workflow.entry_path" - paramPluginName = "nanafs.workflow.plugin_name" - paramPluginVersion = "nanafs.workflow.plugin_version" - paramPluginType = "nanafs.workflow.plugin_type" - paramPluginAction = "nanafs.workflow.plugin_action" -) diff --git a/pkg/workflow/exec/executor.go b/pkg/workflow/exec/executor.go index a75a2877..608fca07 100644 --- a/pkg/workflow/exec/executor.go +++ b/pkg/workflow/exec/executor.go @@ -22,11 +22,14 @@ import ( "github.com/basenana/nanafs/config" "github.com/basenana/nanafs/pkg/dentry" "github.com/basenana/nanafs/pkg/plugin" - "github.com/basenana/nanafs/pkg/plugin/stub" + "github.com/basenana/nanafs/pkg/plugin/pluginapi" "github.com/basenana/nanafs/pkg/types" "github.com/basenana/nanafs/pkg/workflow/jobrun" "github.com/basenana/nanafs/utils/logger" "go.uber.org/zap" + "os" + "path" + "sync" ) const ( @@ -36,8 +39,11 @@ const ( func RegisterOperators(entryMgr dentry.Manager, cfg LocalConfig) error { jobrun.RegisterExecutorBuilder(localExecName, func(job *types.WorkflowJob) jobrun.Executor { return &localExecutor{ - job: job, entryMgr: entryMgr, config: cfg, - logger: logger.NewLogger("localExecutor").With(zap.String("job", job.Id)), + job: job, + entryMgr: entryMgr, + config: cfg, + results: map[string]any{}, + logger: logger.NewLogger("localExecutor").With(zap.String("job", job.Id)), } }) return nil @@ -49,6 +55,8 @@ type localExecutor struct { entryPath string entryMgr dentry.Manager config LocalConfig + results map[string]any + resultMux sync.Mutex logger *zap.SugaredLogger } @@ -72,17 +80,30 @@ func 
(b *localExecutor) Setup(ctx context.Context) (err error) { return } b.logger.Infow("job setup", "workdir", b.workdir, "entryPath", b.entryPath) + return } func (b *localExecutor) DoOperation(ctx context.Context, step types.WorkflowJobStep) error { - req := stub.NewRequest() + req := pluginapi.NewRequest() req.WorkPath = b.workdir req.EntryId = b.job.Target.EntryID req.EntryPath = b.entryPath + req.Parameter = map[string]any{} + b.resultMux.Lock() + for k, v := range b.results { + req.Parameter[k] = v + } + b.resultMux.Unlock() + req.Parameter[pluginapi.ResEntryIdKey] = b.job.Target.EntryID + req.Parameter[pluginapi.ResEntryPathKey] = b.entryPath + req.Parameter[pluginapi.ResPluginName] = step.Plugin.PluginName + req.Parameter[pluginapi.ResPluginVersion] = step.Plugin.Version + req.Parameter[pluginapi.ResPluginType] = step.Plugin.PluginType + req.Parameter[pluginapi.ResPluginAction] = step.Plugin.Action + req.Action = step.Plugin.PluginName - req.Parameter = step.Plugin.Parameters resp, err := plugin.Call(ctx, *step.Plugin, req) if err != nil { return fmt.Errorf("plugin action error: %s", err) @@ -90,12 +111,29 @@ func (b *localExecutor) DoOperation(ctx context.Context, step types.WorkflowJobS if !resp.IsSucceed { return fmt.Errorf("plugin action failed: %s", resp.Message) } + if len(resp.Results) > 0 { + b.resultMux.Lock() + for k, v := range resp.Results { + b.results[k] = v + } + b.resultMux.Unlock() + } return nil } func (b *localExecutor) Collect(ctx context.Context) error { - //TODO implement me - panic("implement me") + b.resultMux.Lock() + filename, needCollect := b.results[pluginapi.ResCollectManifest] + b.resultMux.Unlock() + if !needCollect { + return nil + } + f, err := os.Open(path.Join(b.workdir, filename.(string))) + if err != nil { + return fmt.Errorf("read collect manifest file failed: %s", err) + } + defer f.Close() + return nil } func (b *localExecutor) Teardown(ctx context.Context) { diff --git a/pkg/workflow/mirrordir.go 
b/pkg/workflow/mirrordir.go index 5345532e..0a93a560 100644 --- a/pkg/workflow/mirrordir.go +++ b/pkg/workflow/mirrordir.go @@ -21,7 +21,7 @@ import ( "fmt" "github.com/basenana/nanafs/pkg/dentry" "github.com/basenana/nanafs/pkg/plugin" - "github.com/basenana/nanafs/pkg/plugin/stub" + "github.com/basenana/nanafs/pkg/plugin/pluginapi" "github.com/basenana/nanafs/pkg/types" "github.com/basenana/nanafs/pkg/workflow/jobrun" "github.com/basenana/nanafs/utils" @@ -121,7 +121,7 @@ func (d *dirHandler) IsGroup(ctx context.Context) (bool, error) { return en.IsGroup, nil } -func (d *dirHandler) FindEntry(ctx context.Context, name string) (*stub.Entry, error) { +func (d *dirHandler) FindEntry(ctx context.Context, name string) (*pluginapi.Entry, error) { if d == nil { return nil, types.ErrNoGroup } @@ -138,7 +138,7 @@ func (d *dirHandler) FindEntry(ctx context.Context, name string) (*stub.Entry, e if d.dirKind == MirrorDirRoot { switch name { case MirrorDirWorkflows, MirrorDirJobs: - return d.plugin.fs.CreateEntry(d.plugin.path, stub.EntryAttr{Name: name, Kind: types.ExternalGroupKind}) + return d.plugin.fs.CreateEntry(d.plugin.path, pluginapi.EntryAttr{Name: name, Kind: types.ExternalGroupKind}) default: return nil, types.ErrNotFound } @@ -149,7 +149,7 @@ func (d *dirHandler) FindEntry(ctx context.Context, name string) (*stub.Entry, e if err != nil { return nil, err } - return d.plugin.fs.CreateEntry(d.plugin.path, stub.EntryAttr{Name: name, Kind: types.RawKind}) + return d.plugin.fs.CreateEntry(d.plugin.path, pluginapi.EntryAttr{Name: name, Kind: types.RawKind}) } if d.dirKind == MirrorDirJobs { @@ -158,7 +158,7 @@ func (d *dirHandler) FindEntry(ctx context.Context, name string) (*stub.Entry, e if err != nil { return nil, err } - return d.plugin.fs.CreateEntry(d.plugin.path, stub.EntryAttr{Name: name, Kind: types.ExternalGroupKind}) + return d.plugin.fs.CreateEntry(d.plugin.path, pluginapi.EntryAttr{Name: name, Kind: types.ExternalGroupKind}) } else { jobs, err := 
d.ListChildren(ctx) if err != nil { @@ -174,7 +174,7 @@ func (d *dirHandler) FindEntry(ctx context.Context, name string) (*stub.Entry, e return nil, types.ErrNotFound } -func (d *dirHandler) CreateEntry(ctx context.Context, attr stub.EntryAttr) (*stub.Entry, error) { +func (d *dirHandler) CreateEntry(ctx context.Context, attr pluginapi.EntryAttr) (*pluginapi.Entry, error) { if d.dirKind == MirrorDirRoot { return nil, types.ErrNoAccess } @@ -195,11 +195,11 @@ func (d *dirHandler) CreateEntry(ctx context.Context, attr stub.EntryAttr) (*stu return en, nil } -func (d *dirHandler) UpdateEntry(ctx context.Context, en *stub.Entry) error { +func (d *dirHandler) UpdateEntry(ctx context.Context, en *pluginapi.Entry) error { return d.plugin.fs.UpdateEntry(d.plugin.path, en) } -func (d *dirHandler) RemoveEntry(ctx context.Context, en *stub.Entry) error { +func (d *dirHandler) RemoveEntry(ctx context.Context, en *pluginapi.Entry) error { if d == nil { return types.ErrNoGroup } @@ -227,7 +227,7 @@ func (d *dirHandler) RemoveEntry(ctx context.Context, en *stub.Entry) error { return nil } -func (d *dirHandler) ListChildren(ctx context.Context) ([]*stub.Entry, error) { +func (d *dirHandler) ListChildren(ctx context.Context) ([]*pluginapi.Entry, error) { if d == nil { return nil, types.ErrNoGroup } @@ -236,7 +236,7 @@ func (d *dirHandler) ListChildren(ctx context.Context) ([]*stub.Entry, error) { return nil, err } - children := make([]*stub.Entry, 0) + children := make([]*pluginapi.Entry, 0) cachedChildMap := make(map[string]struct{}) for i, ch := range cachedChild { cachedChildMap[ch.Name] = struct{}{} @@ -247,7 +247,7 @@ func (d *dirHandler) ListChildren(ctx context.Context) ([]*stub.Entry, error) { case d.dirKind == MirrorDirRoot: if _, ok := cachedChildMap[MirrorDirJobs]; !ok { - child, err := d.plugin.fs.CreateEntry(d.plugin.path, stub.EntryAttr{Name: MirrorDirJobs, Kind: types.ExternalGroupKind}) + child, err := d.plugin.fs.CreateEntry(d.plugin.path, pluginapi.EntryAttr{Name: 
MirrorDirJobs, Kind: types.ExternalGroupKind}) if err != nil { wfLogger.Errorf("init mirror dir %s error: %s", MirrorDirJobs, err) return nil, err @@ -256,7 +256,7 @@ func (d *dirHandler) ListChildren(ctx context.Context) ([]*stub.Entry, error) { } if _, ok := cachedChildMap[MirrorDirWorkflows]; !ok { - child, err := d.plugin.fs.CreateEntry(d.plugin.path, stub.EntryAttr{Name: MirrorDirWorkflows, Kind: types.ExternalGroupKind}) + child, err := d.plugin.fs.CreateEntry(d.plugin.path, pluginapi.EntryAttr{Name: MirrorDirWorkflows, Kind: types.ExternalGroupKind}) if err != nil { wfLogger.Errorf("init mirror dir %s error: %s", MirrorDirWorkflows, err) return nil, err @@ -271,7 +271,7 @@ func (d *dirHandler) ListChildren(ctx context.Context) ([]*stub.Entry, error) { } for _, wf := range wfList { if _, ok := cachedChildMap[id2MirrorFile(wf.Id)]; !ok { - child, err := d.plugin.fs.CreateEntry(d.plugin.path, stub.EntryAttr{Name: id2MirrorFile(wf.Id), Kind: types.RawKind}) + child, err := d.plugin.fs.CreateEntry(d.plugin.path, pluginapi.EntryAttr{Name: id2MirrorFile(wf.Id), Kind: types.RawKind}) if err != nil { wfLogger.Errorf("init mirror workflow file %s error: %s", id2MirrorFile(wf.Id), err) return nil, err @@ -286,7 +286,7 @@ func (d *dirHandler) ListChildren(ctx context.Context) ([]*stub.Entry, error) { } for _, wf := range wfList { if _, ok := cachedChildMap[wf.Id]; !ok { - child, err := d.plugin.fs.CreateEntry(d.plugin.path, stub.EntryAttr{Name: wf.Id, Kind: types.ExternalGroupKind}) + child, err := d.plugin.fs.CreateEntry(d.plugin.path, pluginapi.EntryAttr{Name: wf.Id, Kind: types.ExternalGroupKind}) if err != nil { wfLogger.Errorf("init mirror jobs workflow group %s error: %s", wf.Id, err) return nil, err @@ -301,7 +301,7 @@ func (d *dirHandler) ListChildren(ctx context.Context) ([]*stub.Entry, error) { } for _, j := range jobList { if _, ok := cachedChildMap[id2MirrorFile(j.Id)]; !ok { - child, err := d.plugin.fs.CreateEntry(d.plugin.path, stub.EntryAttr{Name: 
id2MirrorFile(j.Id), Kind: types.RawKind}) + child, err := d.plugin.fs.CreateEntry(d.plugin.path, pluginapi.EntryAttr{Name: id2MirrorFile(j.Id), Kind: types.RawKind}) if err != nil { wfLogger.Errorf("init mirror job file %s error: %s", id2MirrorFile(j.Id), err) return nil, err @@ -368,7 +368,7 @@ func (f *fileHandler) Close(ctx context.Context) error { return nil } -func (f *fileHandler) createOrUpdateWorkflow(ctx context.Context, en *stub.Entry) error { +func (f *fileHandler) createOrUpdateWorkflow(ctx context.Context, en *pluginapi.Entry) error { wf := &types.WorkflowSpec{} decodeErr := yaml.NewDecoder(&memfsFile{filePath: f.plugin.path, entry: en, memfs: f.plugin.fs}).Decode(wf) if decodeErr != nil { @@ -413,7 +413,7 @@ func (f *fileHandler) createOrUpdateWorkflow(ctx context.Context, en *stub.Entry return nil } -func (f *fileHandler) triggerOrUpdateWorkflowJob(ctx context.Context, en *stub.Entry) error { +func (f *fileHandler) triggerOrUpdateWorkflowJob(ctx context.Context, en *pluginapi.Entry) error { wfJob := &types.WorkflowJob{} decodeErr := yaml.NewDecoder(&memfsFile{filePath: f.plugin.path, entry: en, memfs: f.plugin.fs}).Decode(wfJob) if decodeErr != nil { @@ -469,7 +469,7 @@ func (f *fileHandler) triggerOrUpdateWorkflowJob(ctx context.Context, en *stub.E type memfsFile struct { filePath string - entry *stub.Entry + entry *pluginapi.Entry memfs *plugin.MemFS off int64 } @@ -490,12 +490,12 @@ func buildWorkflowMirrorPlugin(mgr Manager) plugin.Builder { mp := &MirrorPlugin{path: "/", fs: plugin.NewMemFS(), mgr: mgr} mp.dirHandler = &dirHandler{plugin: mp, dirKind: MirrorDirRoot} - _, _ = mp.fs.CreateEntry("/", stub.EntryAttr{ + _, _ = mp.fs.CreateEntry("/", pluginapi.EntryAttr{ Name: MirrorDirJobs, Kind: types.ExternalGroupKind, }) - _, _ = mp.fs.CreateEntry("/", stub.EntryAttr{ + _, _ = mp.fs.CreateEntry("/", pluginapi.EntryAttr{ Name: MirrorDirWorkflows, Kind: types.ExternalGroupKind, }) diff --git a/vendor/github.com/PuerkitoBio/goquery/.gitattributes 
b/vendor/github.com/PuerkitoBio/goquery/.gitattributes new file mode 100644 index 00000000..0cc26ec0 --- /dev/null +++ b/vendor/github.com/PuerkitoBio/goquery/.gitattributes @@ -0,0 +1 @@ +testdata/* linguist-vendored diff --git a/vendor/github.com/PuerkitoBio/goquery/.gitignore b/vendor/github.com/PuerkitoBio/goquery/.gitignore new file mode 100644 index 00000000..970381cd --- /dev/null +++ b/vendor/github.com/PuerkitoBio/goquery/.gitignore @@ -0,0 +1,16 @@ +# editor temporary files +*.sublime-* +.DS_Store +*.swp +#*.*# +tags + +# direnv config +.env* + +# test binaries +*.test + +# coverage and profilte outputs +*.out + diff --git a/vendor/github.com/PuerkitoBio/goquery/LICENSE b/vendor/github.com/PuerkitoBio/goquery/LICENSE new file mode 100644 index 00000000..25372c2b --- /dev/null +++ b/vendor/github.com/PuerkitoBio/goquery/LICENSE @@ -0,0 +1,12 @@ +Copyright (c) 2012-2021, Martin Angers & Contributors +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +* Neither the name of the author nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/PuerkitoBio/goquery/README.md b/vendor/github.com/PuerkitoBio/goquery/README.md new file mode 100644 index 00000000..582ccac9 --- /dev/null +++ b/vendor/github.com/PuerkitoBio/goquery/README.md @@ -0,0 +1,198 @@ +# goquery - a little like that j-thing, only in Go + +[![Build Status](https://github.com/PuerkitoBio/goquery/actions/workflows/test.yml/badge.svg?branch=master)](https://github.com/PuerkitoBio/goquery/actions) +[![Go Reference](https://pkg.go.dev/badge/github.com/PuerkitoBio/goquery.svg)](https://pkg.go.dev/github.com/PuerkitoBio/goquery) +[![Sourcegraph Badge](https://sourcegraph.com/github.com/PuerkitoBio/goquery/-/badge.svg)](https://sourcegraph.com/github.com/PuerkitoBio/goquery?badge) + +goquery brings a syntax and a set of features similar to [jQuery][] to the [Go language][go]. It is based on Go's [net/html package][html] and the CSS Selector library [cascadia][]. Since the net/html parser returns nodes, and not a full-featured DOM tree, jQuery's stateful manipulation functions (like height(), css(), detach()) have been left off. + +Also, because the net/html parser requires UTF-8 encoding, so does goquery: it is the caller's responsibility to ensure that the source document provides UTF-8 encoded HTML. See the [wiki][] for various options to do this. + +Syntax-wise, it is as close as possible to jQuery, with the same function names when possible, and that warm and fuzzy chainable interface. 
jQuery being the ultra-popular library that it is, I felt that writing a similar HTML-manipulating library was better to follow its API than to start anew (in the same spirit as Go's `fmt` package), even though some of its methods are less than intuitive (looking at you, [index()][index]...). + +## Table of Contents + +* [Installation](#installation) +* [Changelog](#changelog) +* [API](#api) +* [Examples](#examples) +* [Related Projects](#related-projects) +* [Support](#support) +* [License](#license) + +## Installation + +Please note that because of the net/html dependency, goquery requires Go1.1+ and is tested on Go1.7+. + + $ go get github.com/PuerkitoBio/goquery + +(optional) To run unit tests: + + $ cd $GOPATH/src/github.com/PuerkitoBio/goquery + $ go test + +(optional) To run benchmarks (warning: it runs for a few minutes): + + $ cd $GOPATH/src/github.com/PuerkitoBio/goquery + $ go test -bench=".*" + +## Changelog + +**Note that goquery's API is now stable, and will not break.** + +* **2023-02-18 (v1.8.1)** : Update `go.mod` dependencies, update CI workflow. +* **2021-10-25 (v1.8.0)** : Add `Render` function to render a `Selection` to an `io.Writer` (thanks [@anthonygedeon](https://github.com/anthonygedeon)). +* **2021-07-11 (v1.7.1)** : Update go.mod dependencies and add dependabot config (thanks [@jauderho](https://github.com/jauderho)). +* **2021-06-14 (v1.7.0)** : Add `Single` and `SingleMatcher` functions to optimize first-match selection (thanks [@gdollardollar](https://github.com/gdollardollar)). +* **2021-01-11 (v1.6.1)** : Fix panic when calling `{Prepend,Append,Set}Html` on a `Selection` that contains non-Element nodes. +* **2020-10-08 (v1.6.0)** : Parse html in context of the container node for all functions that deal with html strings (`AfterHtml`, `AppendHtml`, etc.). Thanks to [@thiemok][thiemok] and [@davidjwilkins][djw] for their work on this. +* **2020-02-04 (v1.5.1)** : Update module dependencies. 
+* **2018-11-15 (v1.5.0)** : Go module support (thanks @Zaba505). +* **2018-06-07 (v1.4.1)** : Add `NewDocumentFromReader` examples. +* **2018-03-24 (v1.4.0)** : Deprecate `NewDocument(url)` and `NewDocumentFromResponse(response)`. +* **2018-01-28 (v1.3.0)** : Add `ToEnd` constant to `Slice` until the end of the selection (thanks to @davidjwilkins for raising the issue). +* **2018-01-11 (v1.2.0)** : Add `AddBack*` and deprecate `AndSelf` (thanks to @davidjwilkins). +* **2017-02-12 (v1.1.0)** : Add `SetHtml` and `SetText` (thanks to @glebtv). +* **2016-12-29 (v1.0.2)** : Optimize allocations for `Selection.Text` (thanks to @radovskyb). +* **2016-08-28 (v1.0.1)** : Optimize performance for large documents. +* **2016-07-27 (v1.0.0)** : Tag version 1.0.0. +* **2016-06-15** : Invalid selector strings internally compile to a `Matcher` implementation that never matches any node (instead of a panic). So for example, `doc.Find("~")` returns an empty `*Selection` object. +* **2016-02-02** : Add `NodeName` utility function similar to the DOM's `nodeName` property. It returns the tag name of the first element in a selection, and other relevant values of non-element nodes (see [doc][] for details). Add `OuterHtml` utility function similar to the DOM's `outerHTML` property (named `OuterHtml` in small caps for consistency with the existing `Html` method on the `Selection`). +* **2015-04-20** : Add `AttrOr` helper method to return the attribute's value or a default value if absent. Thanks to [piotrkowalczuk][piotr]. +* **2015-02-04** : Add more manipulation functions - Prepend* - thanks again to [Andrew Stone][thatguystone]. +* **2014-11-28** : Add more manipulation functions - ReplaceWith*, Wrap* and Unwrap - thanks again to [Andrew Stone][thatguystone]. 
+* **2014-11-07** : Add manipulation functions (thanks to [Andrew Stone][thatguystone]) and `*Matcher` functions, that receive compiled cascadia selectors instead of selector strings, thus avoiding potential panics thrown by goquery via `cascadia.MustCompile` calls. This results in better performance (selectors can be compiled once and reused) and more idiomatic error handling (you can handle cascadia's compilation errors, instead of recovering from panics, which had been bugging me for a long time). Note that the actual type expected is a `Matcher` interface, that `cascadia.Selector` implements. Other matcher implementations could be used. +* **2014-11-06** : Change import paths of net/html to golang.org/x/net/html (see https://groups.google.com/forum/#!topic/golang-nuts/eD8dh3T9yyA). Make sure to update your code to use the new import path too when you call goquery with `html.Node`s. +* **v0.3.2** : Add `NewDocumentFromReader()` (thanks jweir) which allows creating a goquery document from an io.Reader. +* **v0.3.1** : Add `NewDocumentFromResponse()` (thanks assassingj) which allows creating a goquery document from an http response. +* **v0.3.0** : Add `EachWithBreak()` which allows to break out of an `Each()` loop by returning false. This function was added instead of changing the existing `Each()` to avoid breaking compatibility. +* **v0.2.1** : Make go-getable, now that [go.net/html is Go1.0-compatible][gonet] (thanks to @matrixik for pointing this out). +* **v0.2.0** : Add support for negative indices in Slice(). **BREAKING CHANGE** `Document.Root` is removed, `Document` is now a `Selection` itself (a selection of one, the root element, just like `Document.Root` was before). Add jQuery's Closest() method. +* **v0.1.1** : Add benchmarks to use as baseline for refactorings, refactor Next...() and Prev...() methods to use the new html package's linked list features (Next/PrevSibling, FirstChild). Good performance boost (40+% in some cases). 
+* **v0.1.0** : Initial release. + +## API + +goquery exposes two structs, `Document` and `Selection`, and the `Matcher` interface. Unlike jQuery, which is loaded as part of a DOM document, and thus acts on its containing document, goquery doesn't know which HTML document to act upon. So it needs to be told, and that's what the `Document` type is for. It holds the root document node as the initial Selection value to manipulate. + +jQuery often has many variants for the same function (no argument, a selector string argument, a jQuery object argument, a DOM element argument, ...). Instead of exposing the same features in goquery as a single method with variadic empty interface arguments, statically-typed signatures are used following this naming convention: + +* When the jQuery equivalent can be called with no argument, it has the same name as jQuery for the no argument signature (e.g.: `Prev()`), and the version with a selector string argument is called `XxxFiltered()` (e.g.: `PrevFiltered()`) +* When the jQuery equivalent **requires** one argument, the same name as jQuery is used for the selector string version (e.g.: `Is()`) +* The signatures accepting a jQuery object as argument are defined in goquery as `XxxSelection()` and take a `*Selection` object as argument (e.g.: `FilterSelection()`) +* The signatures accepting a DOM element as argument in jQuery are defined in goquery as `XxxNodes()` and take a variadic argument of type `*html.Node` (e.g.: `FilterNodes()`) +* The signatures accepting a function as argument in jQuery are defined in goquery as `XxxFunction()` and take a function as argument (e.g.: `FilterFunction()`) +* The goquery methods that can be called with a selector string have a corresponding version that take a `Matcher` interface and are defined as `XxxMatcher()` (e.g.: `IsMatcher()`) + +Utility functions that are not in jQuery but are useful in Go are implemented as functions (that take a `*Selection` as parameter), to avoid a potential naming 
clash on the `*Selection`'s methods (reserved for jQuery-equivalent behaviour). + +The complete [package reference documentation can be found here][doc]. + +Please note that Cascadia's selectors do not necessarily match all supported selectors of jQuery (Sizzle). See the [cascadia project][cascadia] for details. Invalid selector strings compile to a `Matcher` that fails to match any node. Behaviour of the various functions that take a selector string as argument follows from that fact, e.g. (where `~` is an invalid selector string): + +* `Find("~")` returns an empty selection because the selector string doesn't match anything. +* `Add("~")` returns a new selection that holds the same nodes as the original selection, because it didn't add any node (selector string didn't match anything). +* `ParentsFiltered("~")` returns an empty selection because the selector string doesn't match anything. +* `ParentsUntil("~")` returns all parents of the selection because the selector string didn't match any element to stop before the top element. + +## Examples + +See some tips and tricks in the [wiki][]. + +Adapted from example_test.go: + +```Go +package main + +import ( + "fmt" + "log" + "net/http" + + "github.com/PuerkitoBio/goquery" +) + +func ExampleScrape() { + // Request the HTML page. 
+ res, err := http.Get("http://metalsucks.net") + if err != nil { + log.Fatal(err) + } + defer res.Body.Close() + if res.StatusCode != 200 { + log.Fatalf("status code error: %d %s", res.StatusCode, res.Status) + } + + // Load the HTML document + doc, err := goquery.NewDocumentFromReader(res.Body) + if err != nil { + log.Fatal(err) + } + + // Find the review items + doc.Find(".left-content article .post-title").Each(func(i int, s *goquery.Selection) { + // For each item found, get the title + title := s.Find("a").Text() + fmt.Printf("Review %d: %s\n", i, title) + }) +} + +func main() { + ExampleScrape() +} +``` + +## Related Projects + +- [Goq][goq], an HTML deserialization and scraping library based on goquery and struct tags. +- [andybalholm/cascadia][cascadia], the CSS selector library used by goquery. +- [suntong/cascadia][cascadiacli], a command-line interface to the cascadia CSS selector library, useful to test selectors. +- [gocolly/colly](https://github.com/gocolly/colly), a lightning fast and elegant Scraping Framework +- [gnulnx/goperf](https://github.com/gnulnx/goperf), a website performance test tool that also fetches static assets. +- [MontFerret/ferret](https://github.com/MontFerret/ferret), declarative web scraping. +- [tacusci/berrycms](https://github.com/tacusci/berrycms), a modern simple to use CMS with easy to write plugins +- [Dataflow kit](https://github.com/slotix/dataflowkit), Web Scraping framework for Gophers. +- [Geziyor](https://github.com/geziyor/geziyor), a fast web crawling & scraping framework for Go. Supports JS rendering. +- [Pagser](https://github.com/foolin/pagser), a simple, easy, extensible, configurable HTML parser to struct based on goquery and struct tags. +- [stitcherd](https://github.com/vhodges/stitcherd), A server for doing server side includes using css selectors and DOM updates. +- [goskyr](https://github.com/jakopako/goskyr), an easily configurable command-line scraper written in Go. 
+- [goGetJS](https://github.com/davemolk/goGetJS), a tool for extracting, searching, and saving JavaScript files (with optional headless browser). + +## Support + +There are a number of ways you can support the project: + +* Use it, star it, build something with it, spread the word! + - If you do build something open-source or otherwise publicly-visible, let me know so I can add it to the [Related Projects](#related-projects) section! +* Raise issues to improve the project (note: doc typos and clarifications are issues too!) + - Please search existing issues before opening a new one - it may have already been addressed. +* Pull requests: please discuss new code in an issue first, unless the fix is really trivial. + - Make sure new code is tested. + - Be mindful of existing code - PRs that break existing code have a high probability of being declined, unless it fixes a serious issue. +* Sponsor the developer + - See the Github Sponsor button at the top of the repo on github + - or via BuyMeACoffee.com, below + + + +## License + +The [BSD 3-Clause license][bsd], the same as the [Go language][golic]. Cascadia's license is [here][caslic].
+ +[jquery]: http://jquery.com/ +[go]: http://golang.org/ +[cascadia]: https://github.com/andybalholm/cascadia +[cascadiacli]: https://github.com/suntong/cascadia +[bsd]: http://opensource.org/licenses/BSD-3-Clause +[golic]: http://golang.org/LICENSE +[caslic]: https://github.com/andybalholm/cascadia/blob/master/LICENSE +[doc]: https://pkg.go.dev/github.com/PuerkitoBio/goquery +[index]: http://api.jquery.com/index/ +[gonet]: https://github.com/golang/net/ +[html]: https://pkg.go.dev/golang.org/x/net/html +[wiki]: https://github.com/PuerkitoBio/goquery/wiki/Tips-and-tricks +[thatguystone]: https://github.com/thatguystone +[piotr]: https://github.com/piotrkowalczuk +[goq]: https://github.com/andrewstuart/goq +[thiemok]: https://github.com/thiemok +[djw]: https://github.com/davidjwilkins diff --git a/vendor/github.com/PuerkitoBio/goquery/array.go b/vendor/github.com/PuerkitoBio/goquery/array.go new file mode 100644 index 00000000..1b1f6cbe --- /dev/null +++ b/vendor/github.com/PuerkitoBio/goquery/array.go @@ -0,0 +1,124 @@ +package goquery + +import ( + "golang.org/x/net/html" +) + +const ( + maxUint = ^uint(0) + maxInt = int(maxUint >> 1) + + // ToEnd is a special index value that can be used as end index in a call + // to Slice so that all elements are selected until the end of the Selection. + // It is equivalent to passing (*Selection).Length(). + ToEnd = maxInt +) + +// First reduces the set of matched elements to the first in the set. +// It returns a new Selection object, and an empty Selection object if +// the selection is empty. +func (s *Selection) First() *Selection { + return s.Eq(0) +} + +// Last reduces the set of matched elements to the last in the set. +// It returns a new Selection object, and an empty Selection object if +// the selection is empty. +func (s *Selection) Last() *Selection { + return s.Eq(-1) +} + +// Eq reduces the set of matched elements to the one at the specified index.
+// If a negative index is given, it counts backwards starting at the end of the +// set. It returns a new Selection object, and an empty Selection object if the +// index is invalid. +func (s *Selection) Eq(index int) *Selection { + if index < 0 { + index += len(s.Nodes) + } + + if index >= len(s.Nodes) || index < 0 { + return newEmptySelection(s.document) + } + + return s.Slice(index, index+1) +} + +// Slice reduces the set of matched elements to a subset specified by a range +// of indices. The start index is 0-based and indicates the index of the first +// element to select. The end index is 0-based and indicates the index at which +// the elements stop being selected (the end index is not selected). +// +// The indices may be negative, in which case they represent an offset from the +// end of the selection. +// +// The special value ToEnd may be specified as end index, in which case all elements +// until the end are selected. This works both for a positive and negative start +// index. +func (s *Selection) Slice(start, end int) *Selection { + if start < 0 { + start += len(s.Nodes) + } + if end == ToEnd { + end = len(s.Nodes) + } else if end < 0 { + end += len(s.Nodes) + } + return pushStack(s, s.Nodes[start:end]) +} + +// Get retrieves the underlying node at the specified index. +// Get without parameter is not implemented, since the node array is available +// on the Selection object. +func (s *Selection) Get(index int) *html.Node { + if index < 0 { + index += len(s.Nodes) // Negative index gets from the end + } + return s.Nodes[index] +} + +// Index returns the position of the first element within the Selection object +// relative to its sibling elements. 
+func (s *Selection) Index() int { + if len(s.Nodes) > 0 { + return newSingleSelection(s.Nodes[0], s.document).PrevAll().Length() + } + return -1 +} + +// IndexSelector returns the position of the first element within the +// Selection object relative to the elements matched by the selector, or -1 if +// not found. +func (s *Selection) IndexSelector(selector string) int { + if len(s.Nodes) > 0 { + sel := s.document.Find(selector) + return indexInSlice(sel.Nodes, s.Nodes[0]) + } + return -1 +} + +// IndexMatcher returns the position of the first element within the +// Selection object relative to the elements matched by the matcher, or -1 if +// not found. +func (s *Selection) IndexMatcher(m Matcher) int { + if len(s.Nodes) > 0 { + sel := s.document.FindMatcher(m) + return indexInSlice(sel.Nodes, s.Nodes[0]) + } + return -1 +} + +// IndexOfNode returns the position of the specified node within the Selection +// object, or -1 if not found. +func (s *Selection) IndexOfNode(node *html.Node) int { + return indexInSlice(s.Nodes, node) +} + +// IndexOfSelection returns the position of the first node in the specified +// Selection object within this Selection object, or -1 if not found. +func (s *Selection) IndexOfSelection(sel *Selection) int { + if sel != nil && len(sel.Nodes) > 0 { + return indexInSlice(s.Nodes, sel.Nodes[0]) + } + return -1 +} diff --git a/vendor/github.com/PuerkitoBio/goquery/doc.go b/vendor/github.com/PuerkitoBio/goquery/doc.go new file mode 100644 index 00000000..71146a78 --- /dev/null +++ b/vendor/github.com/PuerkitoBio/goquery/doc.go @@ -0,0 +1,123 @@ +// Copyright (c) 2012-2016, Martin Angers & Contributors +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation and/or +// other materials provided with the distribution. +// * Neither the name of the author nor the names of its contributors may be used to +// endorse or promote products derived from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS +// OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY +// AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +// WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +/* +Package goquery implements features similar to jQuery, including the chainable +syntax, to manipulate and query an HTML document. + +It brings a syntax and a set of features similar to jQuery to the Go language. +It is based on Go's net/html package and the CSS Selector library cascadia. +Since the net/html parser returns nodes, and not a full-featured DOM +tree, jQuery's stateful manipulation functions (like height(), css(), detach()) +have been left off. + +Also, because the net/html parser requires UTF-8 encoding, so does goquery: it is +the caller's responsibility to ensure that the source document provides UTF-8 encoded HTML. +See the repository's wiki for various options on how to do this. 
+ +Syntax-wise, it is as close as possible to jQuery, with the same method names when +possible, and that warm and fuzzy chainable interface. jQuery being the +ultra-popular library that it is, writing a similar HTML-manipulating +library was better to follow its API than to start anew (in the same spirit as +Go's fmt package), even though some of its methods are less than intuitive (looking +at you, index()...). + +It is hosted on GitHub, along with additional documentation in the README.md +file: https://github.com/puerkitobio/goquery + +Please note that because of the net/html dependency, goquery requires Go1.1+. + +The various methods are split into files based on the category of behavior. +The three dots (...) indicate that various "overloads" are available. + +* array.go : array-like positional manipulation of the selection. + - Eq() + - First() + - Get() + - Index...() + - Last() + - Slice() + +* expand.go : methods that expand or augment the selection's set. + - Add...() + - AndSelf() + - Union(), which is an alias for AddSelection() + +* filter.go : filtering methods, that reduce the selection's set. + - End() + - Filter...() + - Has...() + - Intersection(), which is an alias of FilterSelection() + - Not...() + +* iteration.go : methods to loop over the selection's nodes. + - Each() + - EachWithBreak() + - Map() + +* manipulation.go : methods for modifying the document + - After...() + - Append...() + - Before...() + - Clone() + - Empty() + - Prepend...() + - Remove...() + - ReplaceWith...() + - Unwrap() + - Wrap...() + - WrapAll...() + - WrapInner...() + +* property.go : methods that inspect and get the node's properties values. + - Attr*(), RemoveAttr(), SetAttr() + - AddClass(), HasClass(), RemoveClass(), ToggleClass() + - Html() + - Length() + - Size(), which is an alias for Length() + - Text() + +* query.go : methods that query, or reflect, a node's identity. + - Contains() + - Is...() + +* traversal.go : methods to traverse the HTML document tree. 
+ - Children...() + - Contents() + - Find...() + - Next...() + - Parent[s]...() + - Prev...() + - Siblings...() + +* type.go : definition of the types exposed by goquery. + - Document + - Selection + - Matcher + +* utilities.go : definition of helper functions (and not methods on a *Selection) +that are not part of jQuery, but are useful to goquery. + - NodeName + - OuterHtml +*/ +package goquery diff --git a/vendor/github.com/PuerkitoBio/goquery/expand.go b/vendor/github.com/PuerkitoBio/goquery/expand.go new file mode 100644 index 00000000..7caade53 --- /dev/null +++ b/vendor/github.com/PuerkitoBio/goquery/expand.go @@ -0,0 +1,70 @@ +package goquery + +import "golang.org/x/net/html" + +// Add adds the selector string's matching nodes to those in the current +// selection and returns a new Selection object. +// The selector string is run in the context of the document of the current +// Selection object. +func (s *Selection) Add(selector string) *Selection { + return s.AddNodes(findWithMatcher([]*html.Node{s.document.rootNode}, compileMatcher(selector))...) +} + +// AddMatcher adds the matcher's matching nodes to those in the current +// selection and returns a new Selection object. +// The matcher is run in the context of the document of the current +// Selection object. +func (s *Selection) AddMatcher(m Matcher) *Selection { + return s.AddNodes(findWithMatcher([]*html.Node{s.document.rootNode}, m)...) +} + +// AddSelection adds the specified Selection object's nodes to those in the +// current selection and returns a new Selection object. +func (s *Selection) AddSelection(sel *Selection) *Selection { + if sel == nil { + return s.AddNodes() + } + return s.AddNodes(sel.Nodes...) +} + +// Union is an alias for AddSelection. +func (s *Selection) Union(sel *Selection) *Selection { + return s.AddSelection(sel) +} + +// AddNodes adds the specified nodes to those in the +// current selection and returns a new Selection object. 
+func (s *Selection) AddNodes(nodes ...*html.Node) *Selection { + return pushStack(s, appendWithoutDuplicates(s.Nodes, nodes, nil)) +} + +// AndSelf adds the previous set of elements on the stack to the current set. +// It returns a new Selection object containing the current Selection combined +// with the previous one. +// Deprecated: This function has been deprecated and is now an alias for AddBack(). +func (s *Selection) AndSelf() *Selection { + return s.AddBack() +} + +// AddBack adds the previous set of elements on the stack to the current set. +// It returns a new Selection object containing the current Selection combined +// with the previous one. +func (s *Selection) AddBack() *Selection { + return s.AddSelection(s.prevSel) +} + +// AddBackFiltered reduces the previous set of elements on the stack to those that +// match the selector string, and adds them to the current set. +// It returns a new Selection object containing the current Selection combined +// with the filtered previous one +func (s *Selection) AddBackFiltered(selector string) *Selection { + return s.AddSelection(s.prevSel.Filter(selector)) +} + +// AddBackMatcher reduces the previous set of elements on the stack to those that match +// the matcher, and adds them to the current set. +// It returns a new Selection object containing the current Selection combined +// with the filtered previous one +func (s *Selection) AddBackMatcher(m Matcher) *Selection { + return s.AddSelection(s.prevSel.FilterMatcher(m)) +} diff --git a/vendor/github.com/PuerkitoBio/goquery/filter.go b/vendor/github.com/PuerkitoBio/goquery/filter.go new file mode 100644 index 00000000..9138ffb3 --- /dev/null +++ b/vendor/github.com/PuerkitoBio/goquery/filter.go @@ -0,0 +1,163 @@ +package goquery + +import "golang.org/x/net/html" + +// Filter reduces the set of matched elements to those that match the selector string. +// It returns a new Selection object for this subset of matching elements.
+func (s *Selection) Filter(selector string) *Selection { + return s.FilterMatcher(compileMatcher(selector)) +} + +// FilterMatcher reduces the set of matched elements to those that match +// the given matcher. It returns a new Selection object for this subset +// of matching elements. +func (s *Selection) FilterMatcher(m Matcher) *Selection { + return pushStack(s, winnow(s, m, true)) +} + +// Not removes elements from the Selection that match the selector string. +// It returns a new Selection object with the matching elements removed. +func (s *Selection) Not(selector string) *Selection { + return s.NotMatcher(compileMatcher(selector)) +} + +// NotMatcher removes elements from the Selection that match the given matcher. +// It returns a new Selection object with the matching elements removed. +func (s *Selection) NotMatcher(m Matcher) *Selection { + return pushStack(s, winnow(s, m, false)) +} + +// FilterFunction reduces the set of matched elements to those that pass the function's test. +// It returns a new Selection object for this subset of elements. +func (s *Selection) FilterFunction(f func(int, *Selection) bool) *Selection { + return pushStack(s, winnowFunction(s, f, true)) +} + +// NotFunction removes elements from the Selection that pass the function's test. +// It returns a new Selection object with the matching elements removed. +func (s *Selection) NotFunction(f func(int, *Selection) bool) *Selection { + return pushStack(s, winnowFunction(s, f, false)) +} + +// FilterNodes reduces the set of matched elements to those that match the specified nodes. +// It returns a new Selection object for this subset of elements. +func (s *Selection) FilterNodes(nodes ...*html.Node) *Selection { + return pushStack(s, winnowNodes(s, nodes, true)) +} + +// NotNodes removes elements from the Selection that match the specified nodes. +// It returns a new Selection object with the matching elements removed. 
+func (s *Selection) NotNodes(nodes ...*html.Node) *Selection { + return pushStack(s, winnowNodes(s, nodes, false)) +} + +// FilterSelection reduces the set of matched elements to those that match a +// node in the specified Selection object. +// It returns a new Selection object for this subset of elements. +func (s *Selection) FilterSelection(sel *Selection) *Selection { + if sel == nil { + return pushStack(s, winnowNodes(s, nil, true)) + } + return pushStack(s, winnowNodes(s, sel.Nodes, true)) +} + +// NotSelection removes elements from the Selection that match a node in the specified +// Selection object. It returns a new Selection object with the matching elements removed. +func (s *Selection) NotSelection(sel *Selection) *Selection { + if sel == nil { + return pushStack(s, winnowNodes(s, nil, false)) + } + return pushStack(s, winnowNodes(s, sel.Nodes, false)) +} + +// Intersection is an alias for FilterSelection. +func (s *Selection) Intersection(sel *Selection) *Selection { + return s.FilterSelection(sel) +} + +// Has reduces the set of matched elements to those that have a descendant +// that matches the selector. +// It returns a new Selection object with the matching elements. +func (s *Selection) Has(selector string) *Selection { + return s.HasSelection(s.document.Find(selector)) +} + +// HasMatcher reduces the set of matched elements to those that have a descendant +// that matches the matcher. +// It returns a new Selection object with the matching elements. +func (s *Selection) HasMatcher(m Matcher) *Selection { + return s.HasSelection(s.document.FindMatcher(m)) +} + +// HasNodes reduces the set of matched elements to those that have a +// descendant that matches one of the nodes. +// It returns a new Selection object with the matching elements. 
+func (s *Selection) HasNodes(nodes ...*html.Node) *Selection { + return s.FilterFunction(func(_ int, sel *Selection) bool { + // Add all nodes that contain one of the specified nodes + for _, n := range nodes { + if sel.Contains(n) { + return true + } + } + return false + }) +} + +// HasSelection reduces the set of matched elements to those that have a +// descendant that matches one of the nodes of the specified Selection object. +// It returns a new Selection object with the matching elements. +func (s *Selection) HasSelection(sel *Selection) *Selection { + if sel == nil { + return s.HasNodes() + } + return s.HasNodes(sel.Nodes...) +} + +// End ends the most recent filtering operation in the current chain and +// returns the set of matched elements to its previous state. +func (s *Selection) End() *Selection { + if s.prevSel != nil { + return s.prevSel + } + return newEmptySelection(s.document) +} + +// Filter based on the matcher, and the indicator to keep (Filter) or +// to get rid of (Not) the matching elements. +func winnow(sel *Selection, m Matcher, keep bool) []*html.Node { + // Optimize if keep is requested + if keep { + return m.Filter(sel.Nodes) + } + // Use grep + return grep(sel, func(i int, s *Selection) bool { + return !m.Match(s.Get(0)) + }) +} + +// Filter based on an array of nodes, and the indicator to keep (Filter) or +// to get rid of (Not) the matching elements. +func winnowNodes(sel *Selection, nodes []*html.Node, keep bool) []*html.Node { + if len(nodes)+len(sel.Nodes) < minNodesForSet { + return grep(sel, func(i int, s *Selection) bool { + return isInSlice(nodes, s.Get(0)) == keep + }) + } + + set := make(map[*html.Node]bool) + for _, n := range nodes { + set[n] = true + } + return grep(sel, func(i int, s *Selection) bool { + return set[s.Get(0)] == keep + }) +} + +// Filter based on a function test, and the indicator to keep (Filter) or +// to get rid of (Not) the matching elements. 
+func winnowFunction(sel *Selection, f func(int, *Selection) bool, keep bool) []*html.Node { + return grep(sel, func(i int, s *Selection) bool { + return f(i, s) == keep + }) +} diff --git a/vendor/github.com/PuerkitoBio/goquery/iteration.go b/vendor/github.com/PuerkitoBio/goquery/iteration.go new file mode 100644 index 00000000..e246f2e0 --- /dev/null +++ b/vendor/github.com/PuerkitoBio/goquery/iteration.go @@ -0,0 +1,39 @@ +package goquery + +// Each iterates over a Selection object, executing a function for each +// matched element. It returns the current Selection object. The function +// f is called for each element in the selection with the index of the +// element in that selection starting at 0, and a *Selection that contains +// only that element. +func (s *Selection) Each(f func(int, *Selection)) *Selection { + for i, n := range s.Nodes { + f(i, newSingleSelection(n, s.document)) + } + return s +} + +// EachWithBreak iterates over a Selection object, executing a function for each +// matched element. It is identical to Each except that it is possible to break +// out of the loop by returning false in the callback function. It returns the +// current Selection object. +func (s *Selection) EachWithBreak(f func(int, *Selection) bool) *Selection { + for i, n := range s.Nodes { + if !f(i, newSingleSelection(n, s.document)) { + return s + } + } + return s +} + +// Map passes each element in the current matched set through a function, +// producing a slice of string holding the returned values. The function +// f is called for each element in the selection with the index of the +// element in that selection starting at 0, and a *Selection that contains +// only that element. 
+func (s *Selection) Map(f func(int, *Selection) string) (result []string) { + for i, n := range s.Nodes { + result = append(result, f(i, newSingleSelection(n, s.document))) + } + + return result +} diff --git a/vendor/github.com/PuerkitoBio/goquery/manipulation.go b/vendor/github.com/PuerkitoBio/goquery/manipulation.go new file mode 100644 index 00000000..35febf11 --- /dev/null +++ b/vendor/github.com/PuerkitoBio/goquery/manipulation.go @@ -0,0 +1,679 @@ +package goquery + +import ( + "strings" + + "golang.org/x/net/html" +) + +// After applies the selector from the root document and inserts the matched elements +// after the elements in the set of matched elements. +// +// If one of the matched elements in the selection is not currently in the +// document, it's impossible to insert nodes after it, so it will be ignored. +// +// This follows the same rules as Selection.Append. +func (s *Selection) After(selector string) *Selection { + return s.AfterMatcher(compileMatcher(selector)) +} + +// AfterMatcher applies the matcher from the root document and inserts the matched elements +// after the elements in the set of matched elements. +// +// If one of the matched elements in the selection is not currently in the +// document, it's impossible to insert nodes after it, so it will be ignored. +// +// This follows the same rules as Selection.Append. +func (s *Selection) AfterMatcher(m Matcher) *Selection { + return s.AfterNodes(m.MatchAll(s.document.rootNode)...) +} + +// AfterSelection inserts the elements in the selection after each element in the set of matched +// elements. +// +// This follows the same rules as Selection.Append. +func (s *Selection) AfterSelection(sel *Selection) *Selection { + return s.AfterNodes(sel.Nodes...) +} + +// AfterHtml parses the html and inserts it after the set of matched elements. +// +// This follows the same rules as Selection.Append. 
+func (s *Selection) AfterHtml(htmlStr string) *Selection { + return s.eachNodeHtml(htmlStr, true, func(node *html.Node, nodes []*html.Node) { + nextSibling := node.NextSibling + for _, n := range nodes { + if node.Parent != nil { + node.Parent.InsertBefore(n, nextSibling) + } + } + }) +} + +// AfterNodes inserts the nodes after each element in the set of matched elements. +// +// This follows the same rules as Selection.Append. +func (s *Selection) AfterNodes(ns ...*html.Node) *Selection { + return s.manipulateNodes(ns, true, func(sn *html.Node, n *html.Node) { + if sn.Parent != nil { + sn.Parent.InsertBefore(n, sn.NextSibling) + } + }) +} + +// Append appends the elements specified by the selector to the end of each element +// in the set of matched elements, following those rules: +// +// 1) The selector is applied to the root document. +// +// 2) Elements that are part of the document will be moved to the new location. +// +// 3) If there are multiple locations to append to, cloned nodes will be +// appended to all target locations except the last one, which will be moved +// as noted in (2). +func (s *Selection) Append(selector string) *Selection { + return s.AppendMatcher(compileMatcher(selector)) +} + +// AppendMatcher appends the elements specified by the matcher to the end of each element +// in the set of matched elements. +// +// This follows the same rules as Selection.Append. +func (s *Selection) AppendMatcher(m Matcher) *Selection { + return s.AppendNodes(m.MatchAll(s.document.rootNode)...) +} + +// AppendSelection appends the elements in the selection to the end of each element +// in the set of matched elements. +// +// This follows the same rules as Selection.Append. +func (s *Selection) AppendSelection(sel *Selection) *Selection { + return s.AppendNodes(sel.Nodes...) +} + +// AppendHtml parses the html and appends it to the set of matched elements. 
+func (s *Selection) AppendHtml(htmlStr string) *Selection { + return s.eachNodeHtml(htmlStr, false, func(node *html.Node, nodes []*html.Node) { + for _, n := range nodes { + node.AppendChild(n) + } + }) +} + +// AppendNodes appends the specified nodes to each node in the set of matched elements. +// +// This follows the same rules as Selection.Append. +func (s *Selection) AppendNodes(ns ...*html.Node) *Selection { + return s.manipulateNodes(ns, false, func(sn *html.Node, n *html.Node) { + sn.AppendChild(n) + }) +} + +// Before inserts the matched elements before each element in the set of matched elements. +// +// This follows the same rules as Selection.Append. +func (s *Selection) Before(selector string) *Selection { + return s.BeforeMatcher(compileMatcher(selector)) +} + +// BeforeMatcher inserts the matched elements before each element in the set of matched elements. +// +// This follows the same rules as Selection.Append. +func (s *Selection) BeforeMatcher(m Matcher) *Selection { + return s.BeforeNodes(m.MatchAll(s.document.rootNode)...) +} + +// BeforeSelection inserts the elements in the selection before each element in the set of matched +// elements. +// +// This follows the same rules as Selection.Append. +func (s *Selection) BeforeSelection(sel *Selection) *Selection { + return s.BeforeNodes(sel.Nodes...) +} + +// BeforeHtml parses the html and inserts it before the set of matched elements. +// +// This follows the same rules as Selection.Append. +func (s *Selection) BeforeHtml(htmlStr string) *Selection { + return s.eachNodeHtml(htmlStr, true, func(node *html.Node, nodes []*html.Node) { + for _, n := range nodes { + if node.Parent != nil { + node.Parent.InsertBefore(n, node) + } + } + }) +} + +// BeforeNodes inserts the nodes before each element in the set of matched elements. +// +// This follows the same rules as Selection.Append. 
+func (s *Selection) BeforeNodes(ns ...*html.Node) *Selection { + return s.manipulateNodes(ns, false, func(sn *html.Node, n *html.Node) { + if sn.Parent != nil { + sn.Parent.InsertBefore(n, sn) + } + }) +} + +// Clone creates a deep copy of the set of matched nodes. The new nodes will not be +// attached to the document. +func (s *Selection) Clone() *Selection { + ns := newEmptySelection(s.document) + ns.Nodes = cloneNodes(s.Nodes) + return ns +} + +// Empty removes all children nodes from the set of matched elements. +// It returns the children nodes in a new Selection. +func (s *Selection) Empty() *Selection { + var nodes []*html.Node + + for _, n := range s.Nodes { + for c := n.FirstChild; c != nil; c = n.FirstChild { + n.RemoveChild(c) + nodes = append(nodes, c) + } + } + + return pushStack(s, nodes) +} + +// Prepend prepends the elements specified by the selector to each element in +// the set of matched elements, following the same rules as Append. +func (s *Selection) Prepend(selector string) *Selection { + return s.PrependMatcher(compileMatcher(selector)) +} + +// PrependMatcher prepends the elements specified by the matcher to each +// element in the set of matched elements. +// +// This follows the same rules as Selection.Append. +func (s *Selection) PrependMatcher(m Matcher) *Selection { + return s.PrependNodes(m.MatchAll(s.document.rootNode)...) +} + +// PrependSelection prepends the elements in the selection to each element in +// the set of matched elements. +// +// This follows the same rules as Selection.Append. +func (s *Selection) PrependSelection(sel *Selection) *Selection { + return s.PrependNodes(sel.Nodes...) +} + +// PrependHtml parses the html and prepends it to the set of matched elements. 
+func (s *Selection) PrependHtml(htmlStr string) *Selection { + return s.eachNodeHtml(htmlStr, false, func(node *html.Node, nodes []*html.Node) { + firstChild := node.FirstChild + for _, n := range nodes { + node.InsertBefore(n, firstChild) + } + }) +} + +// PrependNodes prepends the specified nodes to each node in the set of +// matched elements. +// +// This follows the same rules as Selection.Append. +func (s *Selection) PrependNodes(ns ...*html.Node) *Selection { + return s.manipulateNodes(ns, true, func(sn *html.Node, n *html.Node) { + // sn.FirstChild may be nil, in which case this functions like + // sn.AppendChild() + sn.InsertBefore(n, sn.FirstChild) + }) +} + +// Remove removes the set of matched elements from the document. +// It returns the same selection, now consisting of nodes not in the document. +func (s *Selection) Remove() *Selection { + for _, n := range s.Nodes { + if n.Parent != nil { + n.Parent.RemoveChild(n) + } + } + + return s +} + +// RemoveFiltered removes from the current set of matched elements those that +// match the selector filter. It returns the Selection of removed nodes. +// +// For example if the selection s contains "
and
nodes. + nodes := dom.QuerySelectorAll(doc, "p, pre, article") + + // Also getnodes which have
node(s) and append + // them into the `nodes` variable. + // Some articles' DOM structures might look like : + // + //+ // Sentences+ // + // So we need to make sure only fetch the div once. + // To do so, we will use map as dictionary. + tracker := make(map[*html.Node]struct{}) + for _, br := range dom.QuerySelectorAll(doc, "div > br") { + if br.Parent == nil { + continue + } + + if _, exist := tracker[br.Parent]; !exist { + tracker[br.Parent] = struct{}{} + nodes = append(nodes, br.Parent) + } + } + + // This is a little cheeky, we use the accumulator 'score' to decide what + // to return from this callback. + score := float64(0) + return ps.someNode(nodes, func(node *html.Node) bool { + if !ps.isProbablyVisible(node) { + return false + } + + matchString := dom.ClassName(node) + " " + dom.ID(node) + if rxUnlikelyCandidates.MatchString(matchString) && + !rxOkMaybeItsACandidate.MatchString(matchString) { + return false + } + + if dom.TagName(node) == "p" && ps.hasAncestorTag(node, "li", -1, nil) { + return false + } + + nodeText := strings.TrimSpace(dom.TextContent(node)) + nodeTextLength := len(nodeText) + if nodeTextLength < 140 { + return false + } + + score += math.Sqrt(float64(nodeTextLength - 140)) + return score > 20 + }) +} diff --git a/vendor/github.com/go-shiori/go-readability/parser-parse.go b/vendor/github.com/go-shiori/go-readability/parser-parse.go new file mode 100644 index 00000000..235ce62b --- /dev/null +++ b/vendor/github.com/go-shiori/go-readability/parser-parse.go @@ -0,0 +1,129 @@ +package readability + +import ( + "fmt" + "io" + nurl "net/url" + "strings" + + "github.com/go-shiori/dom" + "golang.org/x/net/html" +) + +// Parse parses a reader and find the main readable content. 
+func (ps *Parser) Parse(input io.Reader, pageURL *nurl.URL) (Article, error) { + // Parse input + doc, err := dom.Parse(input) + if err != nil { + return Article{}, fmt.Errorf("failed to parse input: %v", err) + } + + return ps.ParseDocument(doc, pageURL) +} + +// ParseDocument parses the specified document and find the main readable content. +func (ps *Parser) ParseDocument(doc *html.Node, pageURL *nurl.URL) (Article, error) { + // Clone document to make sure the original kept untouched + ps.doc = dom.Clone(doc, true) + + // Reset parser data + ps.articleTitle = "" + ps.articleByline = "" + ps.articleDir = "" + ps.articleSiteName = "" + ps.documentURI = pageURL + ps.attempts = []parseAttempt{} + ps.flags = flags{ + stripUnlikelys: true, + useWeightClasses: true, + cleanConditionally: true, + } + + // Avoid parsing too large documents, as per configuration option + if ps.MaxElemsToParse > 0 { + numTags := len(dom.GetElementsByTagName(ps.doc, "*")) + if numTags > ps.MaxElemsToParse { + return Article{}, fmt.Errorf("documents too large: %d elements", numTags) + } + } + + // Unwrap image from noscript + ps.unwrapNoscriptImages(ps.doc) + + // Extract JSON-LD metadata before removing scripts + var jsonLd map[string]string + if !ps.DisableJSONLD { + jsonLd, _ = ps.getJSONLD() + } + + // Remove script tags from the document. + ps.removeScripts(ps.doc) + + // Prepares the HTML document + ps.prepDocument() + + // Fetch metadata + metadata := ps.getArticleMetadata(jsonLd) + ps.articleTitle = metadata["title"] + + // Try to grab article content + finalHTMLContent := "" + finalTextContent := "" + articleContent := ps.grabArticle() + var readableNode *html.Node + + if articleContent != nil { + ps.postProcessContent(articleContent) + + // If we haven't found an excerpt in the article's metadata, + // use the article's first paragraph as the excerpt. This is used + // for displaying a preview of the article's content. 
+ if metadata["excerpt"] == "" { + paragraphs := dom.GetElementsByTagName(articleContent, "p") + if len(paragraphs) > 0 { + metadata["excerpt"] = strings.TrimSpace(dom.TextContent(paragraphs[0])) + } + } + + readableNode = dom.FirstElementChild(articleContent) + finalHTMLContent = dom.InnerHTML(articleContent) + finalTextContent = dom.TextContent(articleContent) + finalTextContent = strings.TrimSpace(finalTextContent) + } + + finalByline := metadata["byline"] + if finalByline == "" { + finalByline = ps.articleByline + } + + // Excerpt is an supposed to be short and concise, + // so it shouldn't have any new line + excerpt := strings.TrimSpace(metadata["excerpt"]) + excerpt = strings.Join(strings.Fields(excerpt), " ") + + // go-readability special: + // Internet is dangerous and weird, and sometimes we will find + // metadata isn't encoded using a valid Utf-8, so here we check it. + var replacementTitle string + if pageURL != nil { + replacementTitle = pageURL.String() + } + + validTitle := strings.ToValidUTF8(ps.articleTitle, replacementTitle) + validByline := strings.ToValidUTF8(finalByline, "") + validExcerpt := strings.ToValidUTF8(excerpt, "") + + return Article{ + Title: validTitle, + Byline: validByline, + Node: readableNode, + Content: finalHTMLContent, + TextContent: finalTextContent, + Length: charCount(finalTextContent), + Excerpt: validExcerpt, + SiteName: metadata["siteName"], + Image: metadata["image"], + Favicon: metadata["favicon"], + Language: ps.articleLang, + }, nil +} diff --git a/vendor/github.com/go-shiori/go-readability/parser.go b/vendor/github.com/go-shiori/go-readability/parser.go new file mode 100644 index 00000000..b4c7c83f --- /dev/null +++ b/vendor/github.com/go-shiori/go-readability/parser.go @@ -0,0 +1,2300 @@ +package readability + +import ( + "encoding/json" + "fmt" + shtml "html" + "log" + "math" + nurl "net/url" + "regexp" + "sort" + "strconv" + "strings" + + "github.com/go-shiori/dom" + "golang.org/x/net/html" +) + +// All of the 
// regular expressions in use within readability.
// Defined up here so we don't instantiate them repeatedly in loops.
// Every pattern is compiled once at package initialisation; MustCompile
// panics on an invalid pattern, so a typo here fails fast at start-up.
var (
	rxUnlikelyCandidates   = regexp.MustCompile(`(?i)-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`)
	rxOkMaybeItsACandidate = regexp.MustCompile(`(?i)and|article|body|column|content|main|shadow`)
	rxPositive             = regexp.MustCompile(`(?i)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`)
	rxNegative             = regexp.MustCompile(`(?i)-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget`)
	rxByline               = regexp.MustCompile(`(?i)byline|author|dateline|writtenby|p-author`)
	rxNormalize            = regexp.MustCompile(`(?i)\s{2,}`)
	rxVideosx              = regexp.MustCompile(`(?i)//(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)`)
	rxTokenize             = regexp.MustCompile(`(?i)\W+`)
	rxWhitespace           = regexp.MustCompile(`(?i)^\s*$`)
	rxHasContent           = regexp.MustCompile(`(?i)\S$`)
	rxHashURL              = regexp.MustCompile(`(?i)^#.+`)
	rxPropertyPattern      = regexp.MustCompile(`(?i)\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|site_name|image\S*)\s*`)
	rxNamePattern          = regexp.MustCompile(`(?i)^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|site_name|image)\s*$`)
	rxTitleSeparator       = regexp.MustCompile(`(?i) [\|\-\\/>»] `)
	rxTitleHierarchySep    = regexp.MustCompile(`(?i) [\\/>»] `)
	rxTitleRemoveFinalPart = regexp.MustCompile(`(?i)(.*)[\|\-\\/>»] .*`)
	rxTitleRemove1stPart   = regexp.MustCompile(`(?i)[^\|\-\\/>»]*[\|\-\\/>»](.*)`)
	rxTitleAnySeparator    = regexp.MustCompile(`(?i)[\|\-\\/>»]+`)
	rxDisplayNone          = regexp.MustCompile(`(?i)display\s*:\s*none`)
	rxSentencePeriod       = regexp.MustCompile(`(?i)\.( |$)`)
	rxShareElements        = regexp.MustCompile(`(?i)(\b|_)(share|sharedaddy)(\b|_)`)
	rxFaviconSize          = regexp.MustCompile(`(?i)(\d+)x(\d+)`)
	rxLazyImageSrcset      = regexp.MustCompile(`(?i)\.(jpg|jpeg|png|webp)\s+\d`)
	rxLazyImageSrc         = regexp.MustCompile(`(?i)^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$`)
	rxImgExtensions        = regexp.MustCompile(`(?i)\.(jpg|jpeg|png|webp)`)
	rxSrcsetURL            = regexp.MustCompile(`(?i)(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))`)
	rxB64DataURL           = regexp.MustCompile(`(?i)^data:\s*([^\s;,]+)\s*;\s*base64\s*,`)
	// NOTE(review): alternation binds looser than the anchors, so `^` applies
	// only to "Article" and `$` only to "APIReference"; the other names match
	// anywhere in the input. Left as is to keep existing matching behaviour.
	rxJsonLdArticleTypes = regexp.MustCompile(`(?i)^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$`)
	// rxCDATA matches a leading "<![CDATA[" (with any whitespace before it)
	// or a trailing "]]>" (with any whitespace after it), so replacing all
	// matches with "" unwraps a CDATA section. The pattern had degraded to
	// `^\s*\s*$`, which only matches all-whitespace input and therefore never
	// stripped the wrapper.
	rxCDATA     = regexp.MustCompile(`^\s*<!\[CDATA\[|\]\]>\s*$`)
	rxSchemaOrg = regexp.MustCompile(`(?i)^https?\:\/\/schema\.org$`)
)

// Constants that are used by readability.
+var ( + unlikelyRoles = sliceToMap("menu", "menubar", "complementary", "navigation", "alert", "alertdialog", "dialog") + divToPElems = sliceToMap("blockquote", "dl", "div", "img", "ol", "p", "pre", "table", "ul", "select") + alterToDivExceptions = []string{"div", "article", "section", "p"} + presentationalAttributes = []string{"align", "background", "bgcolor", "border", "cellpadding", "cellspacing", "frame", "hspace", "rules", "style", "valign", "vspace"} + deprecatedSizeAttributeElems = []string{"table", "th", "td", "hr", "pre"} + phrasingElems = []string{ + "abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", + "datalist", "dfn", "em", "embed", "i", "img", "input", "kbd", "label", + "mark", "math", "meter", "noscript", "object", "output", "progress", "q", + "ruby", "samp", "script", "select", "small", "span", "strong", "sub", + "sup", "textarea", "time", "var", "wbr"} +) + +// flags is flags that used by parser. +type flags struct { + stripUnlikelys bool + useWeightClasses bool + cleanConditionally bool +} + +// parseAttempt is container for the result of previous parse attempts. +type parseAttempt struct { + articleContent *html.Node + textLength int +} + +// Article is the final readable content. +type Article struct { + Title string + Byline string + Node *html.Node + Content string + TextContent string + Length int + Excerpt string + SiteName string + Image string + Favicon string + Language string +} + +// Parser is the parser that parses the page to get the readable content. +type Parser struct { + // MaxElemsToParse is the max number of nodes supported by this + // parser. Default: 0 (no limit) + MaxElemsToParse int + // NTopCandidates is the number of top candidates to consider when + // analysing how tight the competition is among candidates. 
+ NTopCandidates int + // CharThresholds is the default number of chars an article must + // have in order to return a result + CharThresholds int + // ClassesToPreserve are the classes that readability sets itself. + ClassesToPreserve []string + // KeepClasses specify whether the classes should be stripped or not. + KeepClasses bool + // TagsToScore is element tags to score by default. + TagsToScore []string + // Debug determines if the log should be printed or not. Default: false. + Debug bool + // DisableJSONLD determines if metadata in JSON+LD will be extracted + // or not. Default: false. + DisableJSONLD bool + // AllowedVideoRegex is a regular expression that matches video URLs that should be + // allowed to be included in the article content. If undefined, it will use default filter. + AllowedVideoRegex *regexp.Regexp + + doc *html.Node + documentURI *nurl.URL + articleTitle string + articleByline string + articleDir string + articleSiteName string + articleLang string + attempts []parseAttempt + flags flags +} + +// NewParser returns new Parser which set up with default value. +func NewParser() Parser { + return Parser{ + MaxElemsToParse: 0, + NTopCandidates: 5, + CharThresholds: 500, + ClassesToPreserve: []string{"page"}, + KeepClasses: false, + TagsToScore: []string{"section", "h2", "h3", "h4", "h5", "h6", "p", "td", "pre"}, + Debug: false, + } +} + +// postProcessContent runs any post-process modifications to article +// content as necessary. +func (ps *Parser) postProcessContent(articleContent *html.Node) { + // Readability cannot open relative uris so we convert them to absolute uris. + ps.fixRelativeURIs(articleContent) + + ps.simplifyNestedElements(articleContent) + + // Remove classes. + if !ps.KeepClasses { + ps.cleanClasses(articleContent) + } + + // Remove readability attributes. + ps.clearReadabilityAttr(articleContent) +} + +// removeNodes iterates over a NodeList, calls `filterFn` for each node +// and removes node if function returned `true`. 
If function is not +// passed, removes all the nodes in node list. +func (ps *Parser) removeNodes(nodeList []*html.Node, filterFn func(*html.Node) bool) { + for i := len(nodeList) - 1; i >= 0; i-- { + node := nodeList[i] + parentNode := node.Parent + if parentNode != nil && (filterFn == nil || filterFn(node)) { + parentNode.RemoveChild(node) + } + } +} + +// replaceNodeTags iterates over a NodeList, and calls setNodeTag for +// each node. +func (ps *Parser) replaceNodeTags(nodeList []*html.Node, newTagName string) { + for i := len(nodeList) - 1; i >= 0; i-- { + node := nodeList[i] + ps.setNodeTag(node, newTagName) + } +} + +// forEachNode iterates over a NodeList and runs fn on each node. +func (ps *Parser) forEachNode(nodeList []*html.Node, fn func(*html.Node, int)) { + for i := 0; i < len(nodeList); i++ { + fn(nodeList[i], i) + } +} + +// someNode iterates over a NodeList, return true if any of the +// provided iterate function calls returns true, false otherwise. +func (ps *Parser) someNode(nodeList []*html.Node, fn func(*html.Node) bool) bool { + for i := 0; i < len(nodeList); i++ { + if fn(nodeList[i]) { + return true + } + } + return false +} + +// everyNode iterates over a NodeList, return true if all of the +// provided iterate function calls returns true, false otherwise. +func (ps *Parser) everyNode(nodeList []*html.Node, fn func(*html.Node) bool) bool { + for i := 0; i < len(nodeList); i++ { + if !fn(nodeList[i]) { + return false + } + } + return true +} + +// concatNodeLists concats all nodelists passed as arguments. +func (ps *Parser) concatNodeLists(nodeLists ...[]*html.Node) []*html.Node { + var result []*html.Node + for i := 0; i < len(nodeLists); i++ { + result = append(result, nodeLists[i]...) + } + return result +} + +// getAllNodesWithTag returns all nodes that has tag inside tagNames. 
func (ps *Parser) getAllNodesWithTag(node *html.Node, tagNames ...string) []*html.Node {
	// Results are grouped per tag name, in the order the names were given.
	var result []*html.Node
	for i := 0; i < len(tagNames); i++ {
		result = append(result, dom.GetElementsByTagName(node, tagNames[i])...)
	}
	return result
}

// cleanClasses rewrites the class attribute of node and, recursively, of
// every element child: only the classes listed in ps.ClassesToPreserve are
// kept, and the attribute is removed entirely when none of them is present.
func (ps *Parser) cleanClasses(node *html.Node) {
	nodeClassName := dom.ClassName(node)
	preservedClassName := []string{}
	for _, class := range strings.Fields(nodeClassName) {
		if indexOf(ps.ClassesToPreserve, class) != -1 {
			preservedClassName = append(preservedClassName, class)
		}
	}

	if len(preservedClassName) > 0 {
		dom.SetAttribute(node, "class", strings.Join(preservedClassName, " "))
	} else {
		dom.RemoveAttribute(node, "class")
	}

	for child := dom.FirstElementChild(node); child != nil; child = dom.NextElementSibling(child) {
		ps.cleanClasses(child)
	}
}

// fixRelativeURIs converts each <a> and media (img, picture, figure, video,
// audio, source) uri in the given element to an absolute URI via
// toAbsoluteURI and ps.documentURI. Links with a javascript: href are
// replaced by their content instead.
func (ps *Parser) fixRelativeURIs(articleContent *html.Node) {
	links := ps.getAllNodesWithTag(articleContent, "a")
	ps.forEachNode(links, func(link *html.Node, _ int) {
		href := dom.GetAttribute(link, "href")
		if href == "" {
			return
		}

		// Remove links with javascript: URIs, since they won't
		// work after scripts have been removed from the page.
		if strings.HasPrefix(href, "javascript:") {
			linkChilds := dom.ChildNodes(link)

			if len(linkChilds) == 1 && linkChilds[0].Type == html.TextNode {
				// If the link only contains simple text content,
				// it can be converted to a text node
				text := dom.CreateTextNode(dom.TextContent(link))
				dom.ReplaceChild(link.Parent, text, link)
			} else {
				// If the link has multiple children, they should
				// all be preserved
				container := dom.CreateElement("span")
				for link.FirstChild != nil {
					dom.AppendChild(container, link.FirstChild)
				}
				dom.ReplaceChild(link.Parent, container, link)
			}
		} else {
			// An empty result from toAbsoluteURI drops the href altogether.
			newHref := toAbsoluteURI(href, ps.documentURI)
			if newHref == "" {
				dom.RemoveAttribute(link, "href")
			} else {
				dom.SetAttribute(link, "href", newHref)
			}
		}
	})

	medias := ps.getAllNodesWithTag(articleContent, "img", "picture", "figure", "video", "audio", "source")
	ps.forEachNode(medias, func(media *html.Node, _ int) {
		src := dom.GetAttribute(media, "src")
		poster := dom.GetAttribute(media, "poster")
		srcset := dom.GetAttribute(media, "srcset")

		if src != "" {
			newSrc := toAbsoluteURI(src, ps.documentURI)
			dom.SetAttribute(media, "src", newSrc)
		}

		if poster != "" {
			newPoster := toAbsoluteURI(poster, ps.documentURI)
			dom.SetAttribute(media, "poster", newPoster)
		}

		if srcset != "" {
			// Each srcset entry is re-matched to split it into URL (p[1]),
			// optional size descriptor (p[2]) and trailing separator (p[3]);
			// only the URL part is rewritten.
			newSrcset := rxSrcsetURL.ReplaceAllStringFunc(srcset, func(s string) string {
				p := rxSrcsetURL.FindStringSubmatch(s)
				return toAbsoluteURI(p[1], ps.documentURI) + p[2] + p[3]
			})

			dom.SetAttribute(media, "srcset", newSrcset)
		}
	})
}

// simplifyNestedElements walks the tree starting at articleContent (using
// getNextNode) and, for every div/section that has a parent and whose id does
// not start with "readability": removes it when isElementWithoutContent
// reports true, or replaces it by its only child when that child is a single
// div/section, copying the wrapper's attributes onto the child.
func (ps *Parser) simplifyNestedElements(articleContent *html.Node) {
	node := articleContent

	for node != nil {
		nodeID := dom.ID(node)
		nodeTagName := dom.TagName(node)

		if node.Parent != nil && (nodeTagName == "div" || nodeTagName == "section") &&
			!strings.HasPrefix(nodeID, "readability") {
			if ps.isElementWithoutContent(node) {
				node = ps.removeAndGetNext(node)
				continue
			}

			if ps.hasSingleTagInsideElement(node, "div") || ps.hasSingleTagInsideElement(node, "section") {
				child := dom.Children(node)[0]
				// The wrapper's attributes are set on the child, overwriting
				// any attribute of the same key the child already had.
				for _, attr := range node.Attr {
					dom.SetAttribute(child, attr.Key, attr.Val)
				}

				dom.ReplaceChild(node.Parent, child, node)
				// Continue from the promoted child so it is examined too.
				node = child
				continue
			}
		}

		node = ps.getNextNode(node, false)
	}
}

// getArticleTitle attempts to get the article title. It starts from the text
// of the first <title> element and then trims site-name style prefixes or
// suffixes using the separator heuristics below.
func (ps *Parser) getArticleTitle() string {
	doc := ps.doc
	curTitle := ""
	origTitle := ""
	titleHadHierarchicalSeparators := false

	// If they had an element with tag "title" in their HTML
	if nodes := dom.GetElementsByTagName(doc, "title"); len(nodes) > 0 {
		origTitle = ps.getInnerText(nodes[0], true)
		curTitle = origTitle
	}

	// If there's a separator in the title, first remove the final part
	if rxTitleSeparator.MatchString(curTitle) {
		titleHadHierarchicalSeparators = rxTitleHierarchySep.MatchString(curTitle)
		curTitle = rxTitleRemoveFinalPart.ReplaceAllString(origTitle, "$1")

		// If the resulting title is too short (wordCount below 3), remove
		// the first part instead:
		if wordCount(curTitle) < 3 {
			curTitle = rxTitleRemove1stPart.ReplaceAllString(origTitle, "$1")
		}
	} else if strings.Contains(curTitle, ": ") {
		// Check if we have an heading containing this exact string, so
		// we could assume it's the full title.
		headings := ps.concatNodeLists(
			dom.GetElementsByTagName(doc, "h1"),
			dom.GetElementsByTagName(doc, "h2"),
		)

		trimmedTitle := strings.TrimSpace(curTitle)
		match := ps.someNode(headings, func(heading *html.Node) bool {
			return strings.TrimSpace(dom.TextContent(heading)) == trimmedTitle
		})

		// If we don't, let's extract the title out of the original
		// title string.
		if !match {
			// origTitle equals curTitle in this branch and contains ": ",
			// so the Index / LastIndex calls below always find a colon.
			curTitle = origTitle[strings.LastIndex(origTitle, ":")+1:]

			// If the title is now too short, try the first colon instead:
			if wordCount(curTitle) < 3 {
				curTitle = origTitle[strings.Index(origTitle, ":")+1:]
				// But if we have too many words before the colon there's
				// something weird with the titles and the H tags so let's
				// just use the original title instead
			} else if wordCount(origTitle[:strings.Index(origTitle, ":")]) > 5 {
				curTitle = origTitle
			}
		}
	} else if charCount(curTitle) > 150 || charCount(curTitle) < 15 {
		// Suspiciously long or short title: prefer the page's only <h1>.
		if hOnes := dom.GetElementsByTagName(doc, "h1"); len(hOnes) == 1 {
			curTitle = ps.getInnerText(hOnes[0], true)
		}
	}

	curTitle = strings.TrimSpace(curTitle)
	curTitle = rxNormalize.ReplaceAllString(curTitle, " ")
	// If we now have 4 words or fewer as our title, and either no
	// 'hierarchical' separators (\, /, > or ») were found in the original
	// title or the word count is not exactly one less than that of the
	// original title with its separators removed, use the original title.
	curTitleWordCount := wordCount(curTitle)
	tmpOrigTitle := rxTitleAnySeparator.ReplaceAllString(origTitle, "")

	if curTitleWordCount <= 4 &&
		(!titleHadHierarchicalSeparators ||
			curTitleWordCount != wordCount(tmpOrigTitle)-1) {
		curTitle = origTitle
	}

	return curTitle
}

// prepDocument prepares the HTML document for readability to scrape it.
// This includes things like stripping javascript, CSS, and handling
// terrible markup.
func (ps *Parser) prepDocument() {
	doc := ps.doc

	// ADDITIONAL, not exist in readability.js:
	// Remove all comments,
	ps.removeComments(doc)

	// Remove all <style> elements found anywhere in the document.
	ps.removeNodes(dom.GetElementsByTagName(doc, "style"), nil)

	// Only the first <body> gets its <br> chains converted.
	if nodes := dom.GetElementsByTagName(doc, "body"); len(nodes) > 0 && nodes[0] != nil {
		ps.replaceBrs(nodes[0])
	}

	// <font> is rewritten to <span>.
	ps.replaceNodeTags(dom.GetElementsByTagName(doc, "font"), "span")
}

// nextNode finds the next element, starting from the given node, and
// ignoring whitespace in between. If the given node is an element, the
// same node is returned. The walk also stops at a non-element node whose
// text content is not whitespace-only, so the result is not guaranteed to be
// an element; nil is returned when the siblings run out.
func (ps *Parser) nextNode(node *html.Node) *html.Node {
	next := node
	for next != nil && next.Type != html.ElementNode && rxWhitespace.MatchString(dom.TextContent(next)) {
		next = next.NextSibling
	}
	return next
}

// replaceBrs replaces 2 or more successive <br> with a single <p>.
// Whitespace between <br> elements are ignored. For example:
//
//	<div>foo<br>bar<br> <br><br>abc</div>
//
// will become:
//
//	<div>foo<br>bar<p>abc</p></div>
func (ps *Parser) replaceBrs(elem *html.Node) {
	ps.forEachNode(ps.getAllNodesWithTag(elem, "br"), func(br *html.Node, _ int) {
		next := br.NextSibling

		// Whether 2 or more <br> elements have been found and replaced
		// with a <p> block.
		replaced := false

		// If we find a <br> chain, remove the <br>s until we hit another
		// element or non-whitespace. This leaves behind the first <br>
		// in the chain (which will be replaced with a <p> later).
		for {
			next = ps.nextNode(next)
			if next == nil || dom.TagName(next) != "br" {
				break
			}

			replaced = true
			// Remember the sibling before detaching, since removal clears it.
			brSibling := next.NextSibling
			next.Parent.RemoveChild(next)
			next = brSibling
		}

		// If we removed a <br> chain, replace the remaining <br> with a <p>. Add
		// all sibling nodes as children of the <p> until we hit another <br>
		// chain.
		if replaced {
			p := dom.CreateElement("p")
			dom.ReplaceChild(br.Parent, p, br)

			next = p.NextSibling
			for next != nil {
				// If we've hit another <br><br>, we're done adding children to this <p>.
				if dom.TagName(next) == "br" {
					nextElem := ps.nextNode(next.NextSibling)
					if nextElem != nil && dom.TagName(nextElem) == "br" {
						break
					}
				}

				// Stop as soon as isPhrasingContent rejects the node.
				if !ps.isPhrasingContent(next) {
					break
				}

				// Otherwise, make this node a child of the new <p>.
				sibling := next.NextSibling
				dom.AppendChild(p, next)
				next = sibling
			}

			// Trim trailing whitespace nodes from the new <p>.
			for p.LastChild != nil && ps.isWhitespace(p.LastChild) {
				p.RemoveChild(p.LastChild)
			}

			// The new <p> must not sit inside another <p>; turn that parent
			// into a <div>.
			if dom.TagName(p.Parent) == "p" {
				ps.setNodeTag(p.Parent, "div")
			}
		}
	})
}

// setNodeTag changes tag of the node to newTagName. Nodes that are not
// element nodes are left unchanged.
func (ps *Parser) setNodeTag(node *html.Node, newTagName string) {
	if node.Type == html.ElementNode {
		node.Data = newTagName
	}
}

// prepArticle prepares the article node for display. Clean out any
// inline styles, iframes, forms, strip extraneous <p>
tags, etc. +func (ps *Parser) prepArticle(articleContent *html.Node) { + ps.cleanStyles(articleContent) + + // Check for data tables before we continue, to avoid removing + // items in those tables, which will often be isolated even + // though they're visually linked to other content-ful elements + // (text, images, etc.). + ps.markDataTables(articleContent) + + ps.fixLazyImages(articleContent) + + // Clean out junk from the article content + ps.cleanConditionally(articleContent, "form") + ps.cleanConditionally(articleContent, "fieldset") + ps.clean(articleContent, "object") + ps.clean(articleContent, "embed") + ps.clean(articleContent, "footer") + ps.clean(articleContent, "link") + ps.clean(articleContent, "aside") + + // Clean out elements have "share" in their id/class combinations + // from final top candidates, which means we don't remove the top + // candidates even they have "share". + shareElementThreshold := ps.CharThresholds + + ps.forEachNode(dom.Children(articleContent), func(topCandidate *html.Node, _ int) { + ps.cleanMatchedNodes(topCandidate, func(node *html.Node, nodeClassID string) bool { + return rxShareElements.MatchString(nodeClassID) && charCount(dom.TextContent(node)) < shareElementThreshold + }) + }) + + ps.clean(articleContent, "iframe") + ps.clean(articleContent, "input") + ps.clean(articleContent, "textarea") + ps.clean(articleContent, "select") + ps.clean(articleContent, "button") + ps.cleanHeaders(articleContent) + + // Do these last as the previous stuff may have removed junk + // that will affect these + ps.cleanConditionally(articleContent, "table") + ps.cleanConditionally(articleContent, "ul") + ps.cleanConditionally(articleContent, "div") + + // Replace H1 with H2 as H1 should be only title that is displayed separately + ps.replaceNodeTags(ps.getAllNodesWithTag(articleContent, "h1"), "h2") + + // Remove extra paragraphs + ps.removeNodes(dom.GetElementsByTagName(articleContent, "p"), func(p *html.Node) bool { + imgCount := 
len(dom.GetElementsByTagName(p, "img")) + embedCount := len(dom.GetElementsByTagName(p, "embed")) + objectCount := len(dom.GetElementsByTagName(p, "object")) + // At this point, nasty iframes have been removed, only + // remain embedded video ones. + iframeCount := len(dom.GetElementsByTagName(p, "iframe")) + totalCount := imgCount + embedCount + objectCount + iframeCount + + return totalCount == 0 && ps.getInnerText(p, false) == "" + }) + + ps.forEachNode(dom.GetElementsByTagName(articleContent, "br"), func(br *html.Node, _ int) { + next := ps.nextNode(br.NextSibling) + if next != nil && dom.TagName(next) == "p" { + br.Parent.RemoveChild(br) + } + }) + + // Remove single-cell tables + ps.forEachNode(dom.GetElementsByTagName(articleContent, "table"), func(table *html.Node, _ int) { + tbody := table + if ps.hasSingleTagInsideElement(table, "tbody") { + tbody = dom.FirstElementChild(table) + } + + if ps.hasSingleTagInsideElement(tbody, "tr") { + row := dom.FirstElementChild(tbody) + if ps.hasSingleTagInsideElement(row, "td") { + cell := dom.FirstElementChild(row) + + newTag := "div" + if ps.everyNode(dom.ChildNodes(cell), ps.isPhrasingContent) { + newTag = "p" + } + + ps.setNodeTag(cell, newTag) + dom.ReplaceChild(table.Parent, cell, table) + } + } + }) +} + +// initializeNode initializes a node with the readability score. +// Also checks the className/id for special names to add to its score. +func (ps *Parser) initializeNode(node *html.Node) { + contentScore := float64(ps.getClassWeight(node)) + switch dom.TagName(node) { + case "div": + contentScore += 5 + case "pre", "td", "blockquote": + contentScore += 3 + case "address", "ol", "ul", "dl", "dd", "dt", "li", "form": + contentScore -= 3 + case "h1", "h2", "h3", "h4", "h5", "h6", "th": + contentScore -= 5 + } + + ps.setContentScore(node, contentScore) +} + +// removeAndGetNext remove node and returns its next node. 
+func (ps *Parser) removeAndGetNext(node *html.Node) *html.Node { + nextNode := ps.getNextNode(node, true) + if node.Parent != nil { + node.Parent.RemoveChild(node) + } + return nextNode +} + +// getNextNode traverses the DOM from node to node, starting at the +// node passed in. Pass true for the second parameter to indicate +// this node itself (and its kids) are going away, and we want the +// next node over. Calling this in a loop will traverse the DOM +// depth-first. +// In Readability.js, ignoreSelfAndKids default to false. +func (ps *Parser) getNextNode(node *html.Node, ignoreSelfAndKids bool) *html.Node { + // First check for kids if those aren't being ignored + if firstChild := dom.FirstElementChild(node); !ignoreSelfAndKids && firstChild != nil { + return firstChild + } + + // Then for siblings... + if sibling := dom.NextElementSibling(node); sibling != nil { + return sibling + } + + // And finally, move up the parent chain *and* find a sibling + // (because this is depth-first traversal, we will have already + // seen the parent nodes themselves). + for { + node = node.Parent + if node == nil || dom.NextElementSibling(node) != nil { + break + } + } + + if node != nil { + return dom.NextElementSibling(node) + } + + return nil +} + +// textSimilarity compares second text to first one. 1 = same text, 0 = completely different text. +// The way it works: it splits both texts into words and then finds words that are unique in +// second text the result is given by the lower length of unique parts. +func (ps *Parser) textSimilarity(textA, textB string) float64 { + tokensA := rxTokenize.Split(strings.ToLower(textA), -1) + tokensA = strFilter(tokensA, func(s string) bool { return s != "" }) + mapTokensA := sliceToMap(tokensA...) 
+ + tokensB := rxTokenize.Split(strings.ToLower(textB), -1) + tokensB = strFilter(tokensB, func(s string) bool { return s != "" }) + uniqueTokensB := strFilter(tokensB, func(s string) bool { + _, existInA := mapTokensA[s] + return !existInA + }) + + mergedB := strings.Join(tokensB, " ") + mergedUniqueB := strings.Join(uniqueTokensB, " ") + distanceB := float64(charCount(mergedUniqueB)) / float64(charCount(mergedB)) + + return 1 - distanceB +} + +// checkByline determines if a node is used as byline. +func (ps *Parser) checkByline(node *html.Node, matchString string) bool { + if ps.articleByline != "" { + return false + } + + rel := dom.GetAttribute(node, "rel") + itemprop := dom.GetAttribute(node, "itemprop") + nodeText := dom.TextContent(node) + if (rel == "author" || strings.Contains(itemprop, "author") || rxByline.MatchString(matchString)) && + ps.isValidByline(nodeText) { + nodeText = strings.TrimSpace(nodeText) + nodeText = strings.Join(strings.Fields(nodeText), " ") + ps.articleByline = nodeText + return true + } + + return false +} + +func (ps *Parser) getTextDensity(node *html.Node, tags ...string) float64 { + textLength := charCount(ps.getInnerText(node, true)) + if textLength == 0 { + return 0 + } + + var childrenLength int + children := ps.getAllNodesWithTag(node, tags...) + ps.forEachNode(children, func(child *html.Node, _ int) { + childrenLength += charCount(ps.getInnerText(child, true)) + }) + + return float64(childrenLength) / float64(textLength) +} + +// getNodeAncestors gets the node's direct parent and grandparents. +// In Readability.js, maxDepth default to 0. 
func (ps *Parser) getNodeAncestors(node *html.Node, maxDepth int) []*html.Node {
	i := 0
	var ancestors []*html.Node

	// Walk up the parent chain, nearest ancestor first. A maxDepth of 0
	// (or less) means "no limit".
	for node.Parent != nil {
		i++
		ancestors = append(ancestors, node.Parent)
		if maxDepth > 0 && i == maxDepth {
			break
		}
		node = node.Parent
	}
	return ancestors
}

// grabArticle uses a variety of metrics (content score, classname,
// element types), find the content that is most likely to be the
// stuff a user wants to read. Then return it wrapped up in a div.
//
// The whole algorithm runs in a loop on a fresh clone of ps.doc. When the
// extracted text is shorter than ps.CharThresholds, one of the flags
// (stripUnlikelys, useWeightClasses, cleanConditionally) is switched off
// and the loop runs again. It returns nil when the document has no body,
// or when no attempt produced any text.
func (ps *Parser) grabArticle() *html.Node {
	ps.log("**** GRAB ARTICLE ****")

	for {
		doc := dom.Clone(ps.doc, true)

		var page *html.Node
		if nodes := dom.GetElementsByTagName(doc, "body"); len(nodes) > 0 {
			page = nodes[0]
		}

		// We can't grab an article if we don't have a page!
		if page == nil {
			ps.log("no body found in document, abort")
			return nil
		}

		// First, node prepping. Trash nodes that look cruddy (like ones
		// with the class name "comment", etc), and turn divs into P
		// tags where they have been used inappropriately (as in, where
		// they contain no other block level elements.)
		var elementsToScore []*html.Node
		var node = dom.DocumentElement(doc)
		shouldRemoveTitleHeader := true

		for node != nil {
			matchString := dom.ClassName(node) + " " + dom.ID(node)

			if dom.TagName(node) == "html" {
				ps.articleLang = dom.GetAttribute(node, "lang")
			}

			if !ps.isProbablyVisible(node) {
				ps.logf("removing hidden node: %q\n", matchString)
				node = ps.removeAndGetNext(node)
				continue
			}

			// User is not able to see elements applied with both "aria-modal = true"
			// and "role = dialog"
			if dom.GetAttribute(node, "aria-modal") == "true" &&
				dom.GetAttribute(node, "role") == "dialog" {
				node = ps.removeAndGetNext(node)
				continue
			}

			// Check to see if this node is a byline, and remove it if
			// it is true.
			if ps.checkByline(node, matchString) {
				node = ps.removeAndGetNext(node)
				continue
			}

			// Only the first header that duplicates the title is removed.
			if shouldRemoveTitleHeader && ps.headerDuplicatesTitle(node) {
				ps.logf("removing header: %q duplicate of %q\n",
					trim(dom.TextContent(node)), trim(ps.articleTitle))
				shouldRemoveTitleHeader = false
				node = ps.removeAndGetNext(node)
				continue
			}

			// Remove unlikely candidates
			nodeTagName := dom.TagName(node)
			if ps.flags.stripUnlikelys {
				if rxUnlikelyCandidates.MatchString(matchString) &&
					!rxOkMaybeItsACandidate.MatchString(matchString) &&
					!ps.hasAncestorTag(node, "table", 3, nil) &&
					!ps.hasAncestorTag(node, "code", 3, nil) &&
					nodeTagName != "body" && nodeTagName != "a" {
					ps.logf("removing unlikely candidate: %q\n", matchString)
					node = ps.removeAndGetNext(node)
					continue
				}

				role := dom.GetAttribute(node, "role")
				if _, include := unlikelyRoles[role]; include {
					ps.logf("removing content with role %q: %q\n", role, matchString)
					node = ps.removeAndGetNext(node)
					continue
				}
			}

			// Remove DIV, SECTION, HEADER and heading (H1-H6) nodes without
			// any content (e.g. text, image, video, or iframe).
			switch nodeTagName {
			case "div", "section", "header",
				"h1", "h2", "h3", "h4", "h5", "h6":
				if ps.isElementWithoutContent(node) {
					node = ps.removeAndGetNext(node)
					continue
				}
			}

			if indexOf(ps.TagsToScore, nodeTagName) != -1 {
				elementsToScore = append(elementsToScore, node)
			}

			// Turn all divs that don't have children block level
			// elements into p's
			if nodeTagName == "div" {
				// Put phrasing content into paragraphs.
				var p *html.Node
				childNode := node.FirstChild
				for childNode != nil {
					nextSibling := childNode.NextSibling
					if ps.isPhrasingContent(childNode) {
						if p != nil {
							dom.AppendChild(p, childNode)
						} else if !ps.isWhitespace(childNode) {
							p = dom.CreateElement("p")
							dom.AppendChild(p, dom.Clone(childNode, true))
							dom.ReplaceChild(node, p, childNode)
						}
					} else if p != nil {
						// A non-phrasing child closes the current paragraph;
						// trim its trailing whitespace first.
						for p.LastChild != nil && ps.isWhitespace(p.LastChild) {
							p.RemoveChild(p.LastChild)
						}
						p = nil
					}
					childNode = nextSibling
				}

				// Sites like http://mobile.slate.com encloses each
				// paragraph with a DIV element. DIVs with only a P
				// element inside and no text content can be safely
				// converted into plain P elements to avoid confusing
				// the scoring algorithm with DIVs which are, in
				// practice, paragraphs.
				if ps.hasSingleTagInsideElement(node, "p") && ps.getLinkDensity(node) < 0.25 {
					newNode := dom.Children(node)[0]
					node, _ = dom.ReplaceChild(node.Parent, newNode, node)
					elementsToScore = append(elementsToScore, node)
				} else if !ps.hasChildBlockElement(node) {
					ps.setNodeTag(node, "p")
					elementsToScore = append(elementsToScore, node)
				}
			}
			node = ps.getNextNode(node, false)
		}

		// Loop through all paragraphs, and assign a score to them based
		// on how content-y they look. Then add their score to their
		// parent node. A score is determined by things like number of
		// commas, class names, etc. Maybe eventually link density.
		var candidates []*html.Node
		ps.forEachNode(elementsToScore, func(elementToScore *html.Node, _ int) {
			if elementToScore.Parent == nil || dom.TagName(elementToScore.Parent) == "" {
				return
			}

			// If this paragraph is less than 25 characters, don't even count it.
			innerText := ps.getInnerText(elementToScore, true)
			if charCount(innerText) < 25 {
				return
			}

			// Exclude nodes with no ancestor.
			ancestors := ps.getNodeAncestors(elementToScore, 5)
			if len(ancestors) == 0 {
				return
			}

			// Add a point for the paragraph itself as a base.
			contentScore := 1

			// Add points for any commas within this paragraph.
			contentScore += strings.Count(innerText, ",")

			// For every 100 characters in this paragraph, add another point. Up to 3 points.
			contentScore += int(math.Min(math.Floor(float64(charCount(innerText))/100.0), 3.0))

			// Initialize and score ancestors.
			ps.forEachNode(ancestors, func(ancestor *html.Node, level int) {
				if dom.TagName(ancestor) == "" || ancestor.Parent == nil || ancestor.Parent.Type != html.ElementNode {
					return
				}

				if !ps.hasContentScore(ancestor) {
					ps.initializeNode(ancestor)
					candidates = append(candidates, ancestor)
				}

				// Node score divider:
				// - parent: 1 (no division)
				// - grandparent: 2
				// - great grandparent+: ancestor level * 3
				scoreDivider := 1
				switch level {
				case 0:
					scoreDivider = 1
				case 1:
					scoreDivider = 2
				default:
					scoreDivider = level * 3
				}

				ancestorScore := ps.getContentScore(ancestor)
				ancestorScore += float64(contentScore) / float64(scoreDivider)
				ps.setContentScore(ancestor, ancestorScore)
			})
		})

		// These lines are a bit different compared to Readability.js.
		// In Readability.js, they fetch NTopCandidates utilising array
		// method like `splice` and `pop`. In Go, array method like that
		// is not as simple, especially since we are working with pointer.
		// So, here we simply sort top candidates, and limit it to
		// max NTopCandidates.

		// Scale the final candidates score based on link density. Good
		// content should have a relatively small link density (5% or
		// less) and be mostly unaffected by this operation.
		for i := 0; i < len(candidates); i++ {
			candidate := candidates[i]
			candidateScore := ps.getContentScore(candidate) * (1 - ps.getLinkDensity(candidate))
			ps.logf("candidate %q with score: %f\n", dom.OuterHTML(candidate), candidateScore)
			ps.setContentScore(candidate, candidateScore)
		}

		// After we've calculated scores, sort through all of the possible
		// candidate nodes we found and find the one with the highest score.
		sort.Slice(candidates, func(i int, j int) bool {
			return ps.getContentScore(candidates[i]) > ps.getContentScore(candidates[j])
		})

		var topCandidates []*html.Node
		if len(candidates) > ps.NTopCandidates {
			topCandidates = candidates[:ps.NTopCandidates]
		} else {
			topCandidates = candidates
		}

		var topCandidate, parentOfTopCandidate *html.Node
		neededToCreateTopCandidate := false
		if len(topCandidates) > 0 {
			topCandidate = topCandidates[0]
		}

		// If we still have no top candidate, just use the body as a last
		// resort. We also have to copy the body node so it is something
		// we can modify.
		if topCandidate == nil || dom.TagName(topCandidate) == "body" {
			// Move all of the page's children into topCandidate
			topCandidate = dom.CreateElement("div")
			neededToCreateTopCandidate = true
			// Move everything (not just elements, also text nodes etc.)
			// into the container so we even include text directly in the body:
			for page.FirstChild != nil {
				ps.logf("moving child out: %q\n", dom.OuterHTML(page.FirstChild))
				dom.AppendChild(topCandidate, page.FirstChild)
			}

			dom.AppendChild(page, topCandidate)
			ps.initializeNode(topCandidate)
		} else if topCandidate != nil {
			// Find a better top candidate node if it contains (at least three)
			// nodes which belong to `topCandidates` array and whose scores are
			// quite close to the current `topCandidate` node.
			topCandidateScore := ps.getContentScore(topCandidate)
			var alternativeCandidateAncestors [][]*html.Node
			for i := 1; i < len(topCandidates); i++ {
				if ps.getContentScore(topCandidates[i])/topCandidateScore >= 0.75 {
					topCandidateAncestors := ps.getNodeAncestors(topCandidates[i], 0)
					alternativeCandidateAncestors = append(alternativeCandidateAncestors, topCandidateAncestors)
				}
			}

			minimumTopCandidates := 3
			if len(alternativeCandidateAncestors) >= minimumTopCandidates {
				parentOfTopCandidate = topCandidate.Parent
				for parentOfTopCandidate != nil && dom.TagName(parentOfTopCandidate) != "body" {
					// Count how many alternative candidates have this
					// ancestor in their own ancestor list.
					listContainingThisAncestor := 0
					for ancestorIndex := 0; ancestorIndex < len(alternativeCandidateAncestors) && listContainingThisAncestor < minimumTopCandidates; ancestorIndex++ {
						if dom.IncludeNode(alternativeCandidateAncestors[ancestorIndex], parentOfTopCandidate) {
							listContainingThisAncestor++
						}
					}

					if listContainingThisAncestor >= minimumTopCandidates {
						topCandidate = parentOfTopCandidate
						break
					}

					parentOfTopCandidate = parentOfTopCandidate.Parent
				}
			}

			if !ps.hasContentScore(topCandidate) {
				ps.initializeNode(topCandidate)
			}

			// Because of our bonus system, parents of candidates might
			// have scores themselves. They get half of the node. There
			// won't be nodes with higher scores than our topCandidate,
			// but if we see the score going *up* in the first few steps
			// up the tree, that's a decent sign that there might be more
			// content lurking in other places that we want to unify in.
			// The sibling stuff below does some of that - but only if
			// we've looked high enough up the DOM tree.
			parentOfTopCandidate = topCandidate.Parent
			lastScore := ps.getContentScore(topCandidate)
			// The scores shouldn't get too low.
			scoreThreshold := lastScore / 3.0
			for parentOfTopCandidate != nil && dom.TagName(parentOfTopCandidate) != "body" {
				if !ps.hasContentScore(parentOfTopCandidate) {
					parentOfTopCandidate = parentOfTopCandidate.Parent
					continue
				}

				parentScore := ps.getContentScore(parentOfTopCandidate)
				if parentScore < scoreThreshold {
					break
				}

				if parentScore > lastScore {
					// Alright! We found a better parent to use.
					topCandidate = parentOfTopCandidate
					break
				}

				lastScore = parentScore
				parentOfTopCandidate = parentOfTopCandidate.Parent
			}

			// If the top candidate is the only child, use parent
			// instead. This will help sibling joining logic when
			// adjacent content is actually located in parent's
			// sibling node.
			parentOfTopCandidate = topCandidate.Parent
			for parentOfTopCandidate != nil && dom.TagName(parentOfTopCandidate) != "body" && len(dom.Children(parentOfTopCandidate)) == 1 {
				topCandidate = parentOfTopCandidate
				parentOfTopCandidate = topCandidate.Parent
			}

			if !ps.hasContentScore(topCandidate) {
				ps.initializeNode(topCandidate)
			}
		}

		// Now that we have the top candidate, look through its siblings
		// for content that might also be related. Things like preambles,
		// content split by ads that we removed, etc.
		articleContent := dom.CreateElement("div")
		siblingScoreThreshold := math.Max(10, ps.getContentScore(topCandidate)*0.2)

		// Cache the top candidate's score and class name for the sibling
		// comparison below.
		topCandidateScore := ps.getContentScore(topCandidate)
		topCandidateClassName := dom.ClassName(topCandidate)

		parentOfTopCandidate = topCandidate.Parent
		siblings := dom.Children(parentOfTopCandidate)
		for s := 0; s < len(siblings); s++ {
			sibling := siblings[s]
			appendNode := false

			if sibling == topCandidate {
				appendNode = true
			} else {
				contentBonus := float64(0)

				// Give a bonus if sibling nodes and top candidates have the exact same classname
				if dom.ClassName(sibling) == topCandidateClassName && topCandidateClassName != "" {
					contentBonus += topCandidateScore * 0.2
				}

				if ps.hasContentScore(sibling) && ps.getContentScore(sibling)+contentBonus >= siblingScoreThreshold {
					appendNode = true
				} else if dom.TagName(sibling) == "p" {
					linkDensity := ps.getLinkDensity(sibling)
					nodeContent := ps.getInnerText(sibling, true)
					nodeLength := charCount(nodeContent)

					if nodeLength > 80 && linkDensity < 0.25 {
						appendNode = true
					} else if nodeLength < 80 && nodeLength > 0 && linkDensity == 0 &&
						rxSentencePeriod.MatchString(nodeContent) {
						appendNode = true
					}
				}
			}

			if appendNode {
				// We have a node that isn't a common block level
				// element, like a form or td tag. Turn it into a div
				// so it doesn't get filtered out later by accident.
				if indexOf(alterToDivExceptions, dom.TagName(sibling)) == -1 {
					ps.setNodeTag(sibling, "div")
				}

				dom.AppendChild(articleContent, sibling)

				// TODO:
				// this line is implemented in Readability.js, however
				// it doesn't seem to be useful for our port.
				// siblings = dom.Children(parentOfTopCandidate)
			}
		}

		// So we have all of the content that we need. Now we clean
		// it up for presentation.
		ps.prepArticle(articleContent)

		if neededToCreateTopCandidate {
			// We already created a fake div thing, and there wouldn't
			// have been any siblings left for the previous loop, so
			// there's no point trying to create a new div, and then
			// move all the children over. Just assign IDs and class
			// names here. No need to append because that already
			// happened anyway.
			//
			// By the way, this line is different with Readability.js.
			// In Readability.js, when using `appendChild`, the node is
			// still referenced. Meanwhile here, our `appendChild` will
			// clone the node, put it in the new place, then delete
			// the original.
			firstChild := dom.FirstElementChild(articleContent)
			if firstChild != nil && dom.TagName(firstChild) == "div" {
				dom.SetAttribute(firstChild, "id", "readability-page-1")
				dom.SetAttribute(firstChild, "class", "page")
			}
		} else {
			// Wrap everything collected so far in a single page <div>.
			div := dom.CreateElement("div")
			dom.SetAttribute(div, "id", "readability-page-1")
			dom.SetAttribute(div, "class", "page")
			for articleContent.FirstChild != nil {
				dom.AppendChild(div, articleContent.FirstChild)
			}
			dom.AppendChild(articleContent, div)
		}

		parseSuccessful := true

		// Now that we've gone through the full algorithm, check to
		// see if we got any meaningful content. If we didn't, we may
		// need to re-run grabArticle with different flags set. This
		// gives us a higher likelihood of finding the content, and
		// the sieve approach gives us a higher likelihood of
		// finding the -right- content.
		textLength := charCount(ps.getInnerText(articleContent, true))
		if textLength < ps.CharThresholds {
			parseSuccessful = false

			// Every branch records the current attempt; the first three
			// also disable one flag so the next loop iteration is more
			// permissive.
			if ps.flags.stripUnlikelys {
				ps.flags.stripUnlikelys = false
				ps.attempts = append(ps.attempts, parseAttempt{
					articleContent: articleContent,
					textLength:     textLength,
				})
			} else if ps.flags.useWeightClasses {
				ps.flags.useWeightClasses = false
				ps.attempts = append(ps.attempts, parseAttempt{
					articleContent: articleContent,
					textLength:     textLength,
				})
			} else if ps.flags.cleanConditionally {
				ps.flags.cleanConditionally = false
				ps.attempts = append(ps.attempts, parseAttempt{
					articleContent: articleContent,
					textLength:     textLength,
				})
			} else {
				ps.attempts = append(ps.attempts, parseAttempt{
					articleContent: articleContent,
					textLength:     textLength,
				})

				// No luck after removing flags, just return the
				// longest text we found during the different loops
				sort.Slice(ps.attempts, func(i, j int) bool {
					return ps.attempts[i].textLength > ps.attempts[j].textLength
				})

				// But first check if we actually have something
				if ps.attempts[0].textLength == 0 {
					return nil
				}

				articleContent = ps.attempts[0].articleContent
				parseSuccessful = true
			}
		}

		if parseSuccessful {
			return articleContent
		}
	}
}

// isValidByline checks whether the input string could be a byline.
// After trimming surrounding whitespace, the text must be non-empty and
// shorter than 100 characters.
func (ps *Parser) isValidByline(byline string) bool {
	byline = strings.TrimSpace(byline)
	nChar := charCount(byline)
	return nChar > 0 && nChar < 100
}

// getJSONLD try to extract metadata from JSON-LD object.
// For now, only Schema.org objects of type Article or its subtypes are supported.
func (ps *Parser) getJSONLD() (map[string]string, error) {
	var metadata map[string]string

	scripts := dom.QuerySelectorAll(ps.doc, `script[type="application/ld+json"]`)
	ps.forEachNode(scripts, func(jsonLdElement *html.Node, _ int) {
		// Only the first script that yields usable metadata is used.
		if metadata != nil {
			return
		}

		// Strip CDATA markers if present
		content := rxCDATA.ReplaceAllString(dom.TextContent(jsonLdElement), "")

		// Decode JSON. A script that fails to decode is logged and skipped.
		var parsed map[string]interface{}
		err := json.Unmarshal([]byte(content), &parsed)
		if err != nil {
			ps.logf("error while decoding json: %v", err)
			return
		}

		// Check context
		strContext, isString := parsed["@context"].(string)
		if !isString || !rxSchemaOrg.MatchString(strContext) {
			return
		}

		// If parsed doesn't have any @type, find it in its graph list
		if _, typeExist := parsed["@type"]; !typeExist {
			graphList, isArray := parsed["@graph"].([]interface{})
			if !isArray {
				return
			}

			for _, graph := range graphList {
				objGraph, isObj := graph.(map[string]interface{})
				if !isObj {
					continue
				}

				strType, isString := objGraph["@type"].(string)
				if isString && rxJsonLdArticleTypes.MatchString(strType) {
					parsed = objGraph
					break
				}
			}
		}

		// Once again, make sure parsed has valid @type
		strType, isString := parsed["@type"].(string)
		if !isString || !rxJsonLdArticleTypes.MatchString(strType) {
			return
		}

		// Initiate metadata
		metadata = make(map[string]string)

		// Title
		name, nameIsString := parsed["name"].(string)
		headline, headlineIsString := parsed["headline"].(string)

		if nameIsString && headlineIsString && name != headline {
			// We have both name and headline element in the JSON-LD. They should both be the same
			// but some websites like aktualne.cz put their own name into "name" and the article
			// title to "headline" which confuses Readability. So we try to check if either "name"
			// or "headline" closely matches the html title, and if so, use that one. If not, then
			// we use "name" by default.
			title := ps.getArticleTitle()
			nameMatches := ps.textSimilarity(name, title) > 0.75
			headlineMatches := ps.textSimilarity(headline, title) > 0.75

			if headlineMatches && !nameMatches {
				metadata["title"] = headline
			} else {
				metadata["title"] = name
			}
		} else if name, isString := parsed["name"].(string); isString {
			metadata["title"] = strings.TrimSpace(name)
		} else if headline, isString := parsed["headline"].(string); isString {
			metadata["title"] = strings.TrimSpace(headline)
		}

		// Author: either a single object with a "name", or a list of such
		// objects whose names are joined with ", ".
		switch val := parsed["author"].(type) {
		case map[string]interface{}:
			if name, isString := val["name"].(string); isString {
				metadata["byline"] = strings.TrimSpace(name)
			}

		case []interface{}:
			var authors []string
			for _, author := range val {
				objAuthor, isObj := author.(map[string]interface{})
				if !isObj {
					continue
				}

				if name, isString := objAuthor["name"].(string); isString {
					authors = append(authors, strings.TrimSpace(name))
				}
			}
			metadata["byline"] = strings.Join(authors, ", ")
		}

		// Description
		if description, isString := parsed["description"].(string); isString {
			metadata["excerpt"] = strings.TrimSpace(description)
		}

		// Publisher
		if objPublisher, isObj := parsed["publisher"].(map[string]interface{}); isObj {
			if name, isString := objPublisher["name"].(string); isString {
				metadata["siteName"] = strings.TrimSpace(name)
			}
		}
	})

	// The returned map is nil when no script produced usable metadata.
	// The error result is always nil in this implementation.
	return metadata, nil
}

// getArticleMetadata attempts to get excerpt and byline
// metadata for the article.
//
// Values are collected from the document's <meta> elements and combined
// with jsonLd, whose entries take precedence where both are present. The
// result always contains the keys "title", "byline", "excerpt",
// "siteName", "image" and "favicon"; a value may be empty.
func (ps *Parser) getArticleMetadata(jsonLd map[string]string) map[string]string {
	values := make(map[string]string)
	metaElements := dom.GetElementsByTagName(ps.doc, "meta")

	// Find description tags.
	ps.forEachNode(metaElements, func(element *html.Node, _ int) {
		elementName := dom.GetAttribute(element, "name")
		elementProperty := dom.GetAttribute(element, "property")
		content := dom.GetAttribute(element, "content")
		if content == "" {
			return
		}
		matches := []string{}
		name := ""

		if elementProperty != "" {
			matches = rxPropertyPattern.FindAllString(elementProperty, -1)
			for i := len(matches) - 1; i >= 0; i-- {
				// Convert to lowercase, and remove any whitespace
				// so we can match below.
				name = strings.ToLower(matches[i])
				name = strings.Join(strings.Fields(name), "")
				// multiple authors
				values[name] = strings.TrimSpace(content)
			}
		}

		// The name attribute is only consulted when the property
		// attribute produced no match.
		if len(matches) == 0 && elementName != "" && rxNamePattern.MatchString(elementName) {
			// Convert to lowercase, remove any whitespace, and convert
			// dots to colons so we can match below.
			name = strings.ToLower(elementName)
			name = strings.Join(strings.Fields(name), "")
			name = strings.Replace(name, ".", ":", -1)
			values[name] = strings.TrimSpace(content)
		}
	})

	// get title
	metadataTitle := strOr(
		jsonLd["title"],
		values["dc:title"],
		values["dcterm:title"],
		values["og:title"],
		values["weibo:article:title"],
		values["weibo:webpage:title"],
		values["title"],
		values["twitter:title"])

	// Fall back to the title derived from the document itself.
	if metadataTitle == "" {
		metadataTitle = ps.getArticleTitle()
	}

	// get author
	metadataByline := strOr(
		jsonLd["byline"],
		values["dc:creator"],
		values["dcterm:creator"],
		values["author"])

	// get description
	metadataExcerpt := strOr(
		jsonLd["excerpt"],
		values["dc:description"],
		values["dcterm:description"],
		values["og:description"],
		values["weibo:article:description"],
		values["weibo:webpage:description"],
		values["description"],
		values["twitter:description"])

	// get site name
	metadataSiteName := strOr(jsonLd["siteName"], values["og:site_name"])

	// get image thumbnail
	metadataImage := strOr(
		values["og:image"],
		values["image"],
		values["twitter:image"])

	// get favicon
	metadataFavicon := ps.getArticleFavicon()

	// in many sites the meta value is escaped with HTML entities,
	// so here we need to unescape it
	metadataTitle = shtml.UnescapeString(metadataTitle)
	metadataByline = shtml.UnescapeString(metadataByline)
	metadataExcerpt = shtml.UnescapeString(metadataExcerpt)
	metadataSiteName = shtml.UnescapeString(metadataSiteName)

	return map[string]string{
		"title":    metadataTitle,
		"byline":   metadataByline,
		"excerpt":  metadataExcerpt,
		"siteName": metadataSiteName,
		"image":    metadataImage,
		"favicon":  metadataFavicon,
	}
}

// isSingleImage checks if node is image, or if node contains exactly
// only one image whether as a direct child or as its descendants.
//
// The check recurses through chains of single element children and
// fails as soon as a level has other than one element child or any
// non-whitespace text.
func (ps *Parser) isSingleImage(node *html.Node) bool {
	if dom.TagName(node) == "img" {
		return true
	}

	children := dom.Children(node)
	textContent := dom.TextContent(node)
	if len(children) != 1 || strings.TrimSpace(textContent) != "" {
		return false
	}

	return ps.isSingleImage(children[0])
}

// unwrapNoscriptImages finds all