Skip to content

Commit

Permalink
Merge pull request #58 from basenana/feature/docloader
Browse files Browse the repository at this point in the history
feat: docloader plugin
  • Loading branch information
hyponet authored Sep 30, 2023
2 parents 6c54d36 + 5975b91 commit e271738
Show file tree
Hide file tree
Showing 233 changed files with 57,758 additions and 4,389 deletions.
19 changes: 15 additions & 4 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ require (
github.com/google/uuid v1.3.0
github.com/hanwen/go-fuse/v2 v2.3.0
github.com/hyponet/eventbus v1.0.0
github.com/hyponet/webpage-packer v0.0.0-20230930052235-73553a8dce4a
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80
github.com/minio/minio-go/v7 v7.0.52
github.com/onsi/ginkgo v1.16.5
github.com/onsi/gomega v1.27.2
Expand All @@ -26,13 +28,15 @@ require (
github.com/tickstep/aliyunpan-api v0.1.6
github.com/tickstep/library-go v0.1.0
go.uber.org/zap v1.24.0
golang.org/x/net v0.11.0
golang.org/x/sys v0.9.0
golang.org/x/net v0.15.0
golang.org/x/sys v0.12.0
gopkg.in/yaml.v3 v3.0.1
gorm.io/driver/postgres v1.3.7
)

require (
github.com/PuerkitoBio/goquery v1.8.1 // indirect
github.com/andybalholm/cascadia v1.3.2 // indirect
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.4.10 // indirect
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.13.4 // indirect
github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.34 // indirect
Expand All @@ -46,6 +50,7 @@ require (
github.com/aws/aws-sdk-go-v2/service/sso v1.12.12 // indirect
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.14.12 // indirect
github.com/aws/aws-sdk-go-v2/service/sts v1.19.2 // indirect
github.com/aymerick/douceur v0.2.0 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/btcsuite/btcd v0.22.1 // indirect
github.com/cespare/xxhash/v2 v2.2.0 // indirect
Expand All @@ -57,9 +62,13 @@ require (
github.com/go-playground/locales v0.14.1 // indirect
github.com/go-playground/universal-translator v0.18.1 // indirect
github.com/go-playground/validator/v10 v10.14.1 // indirect
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect
github.com/go-shiori/go-readability v0.0.0-20230421032831-c66949dfc0ad // indirect
github.com/goccy/go-json v0.10.2 // indirect
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
github.com/golang/protobuf v1.5.3 // indirect
github.com/google/go-cmp v0.5.9 // indirect
github.com/gorilla/css v1.0.0 // indirect
github.com/jackc/chunkreader/v2 v2.0.1 // indirect
github.com/jackc/pgconn v1.12.1 // indirect
github.com/jackc/pgio v1.0.0 // indirect
Expand All @@ -75,6 +84,7 @@ require (
github.com/mattn/go-isatty v0.0.19 // indirect
github.com/mattn/go-runewidth v0.0.9 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect
github.com/microcosm-cc/bluemonday v1.0.25 // indirect
github.com/minio/md5-simd v1.1.2 // indirect
github.com/minio/sha256-simd v1.0.0 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
Expand All @@ -89,11 +99,12 @@ require (
github.com/sirupsen/logrus v1.9.0 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
golang.org/x/text v0.10.0 // indirect
golang.org/x/text v0.13.0 // indirect
golang.org/x/time v0.3.0 // indirect
google.golang.org/protobuf v1.30.0 // indirect
gopkg.in/ini.v1 v1.67.0 // indirect
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 // indirect
howett.net/plist v1.0.0 // indirect
modernc.org/libc v1.22.2 // indirect
modernc.org/mathutil v1.5.0 // indirect
modernc.org/memory v1.5.0 // indirect
Expand All @@ -115,6 +126,6 @@ require (
go.uber.org/atomic v1.10.0 // indirect
go.uber.org/multierr v1.9.0 // indirect
golang.org/x/arch v0.3.0 // indirect
golang.org/x/crypto v0.10.0 // indirect
golang.org/x/crypto v0.13.0 // indirect
gorm.io/gorm v1.24.6
)
61 changes: 53 additions & 8 deletions go.sum

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions pkg/dentry/group.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ import (
"fmt"
"github.com/basenana/nanafs/pkg/metastore"
"github.com/basenana/nanafs/pkg/plugin"
"github.com/basenana/nanafs/pkg/plugin/stub"
"github.com/basenana/nanafs/pkg/plugin/pluginapi"
"github.com/basenana/nanafs/pkg/types"
"github.com/basenana/nanafs/utils/logger"
"path"
Expand Down Expand Up @@ -243,7 +243,7 @@ func (e *extGroup) FindEntry(ctx context.Context, name string) (*types.Metadata,
}

func (e *extGroup) CreateEntry(ctx context.Context, attr EntryAttr) (*types.Metadata, error) {
mirrorEn, err := e.mirror.CreateEntry(ctx, stub.EntryAttr{
mirrorEn, err := e.mirror.CreateEntry(ctx, pluginapi.EntryAttr{
Name: attr.Name,
Kind: attr.Kind,
})
Expand Down Expand Up @@ -325,7 +325,7 @@ func (e *extGroup) ListChildren(ctx context.Context) ([]*types.Metadata, error)
}

recordChildMap := make(map[string]*types.Metadata)
actualChildMap := make(map[string]*stub.Entry)
actualChildMap := make(map[string]*pluginapi.Entry)
for i := range recordChild {
recordChildMap[recordChild[i].Name] = recordChild[i]
}
Expand Down Expand Up @@ -355,7 +355,7 @@ func (e *extGroup) ListChildren(ctx context.Context) ([]*types.Metadata, error)
return result, nil
}

func (e *extGroup) syncEntry(ctx context.Context, mirrored *stub.Entry, crt *types.Metadata) (en *types.Metadata, err error) {
func (e *extGroup) syncEntry(ctx context.Context, mirrored *pluginapi.Entry, crt *types.Metadata) (en *types.Metadata, err error) {
grp, err := e.stdGroup.cacheStore.getEntry(ctx, e.stdGroup.entryID)
if err != nil {
return nil, err
Expand Down
8 changes: 4 additions & 4 deletions pkg/dentry/group_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ package dentry
import (
"context"
"github.com/basenana/nanafs/pkg/plugin"
"github.com/basenana/nanafs/pkg/plugin/stub"
"github.com/basenana/nanafs/pkg/plugin/pluginapi"
"github.com/basenana/nanafs/pkg/types"
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
Expand Down Expand Up @@ -314,7 +314,7 @@ var _ = Describe("TestExtGroupEntry", func() {
Expect(len(need)).Should(Equal(0))
})
It("insert sync_file1.yaml to memfs should be succeed", func() {
_, err = memFS.CreateEntry(context.TODO(), stub.EntryAttr{
_, err = memFS.CreateEntry(context.TODO(), pluginapi.EntryAttr{
Name: "sync_file1.yaml",
Kind: types.RawKind,
})
Expand All @@ -335,7 +335,7 @@ var _ = Describe("TestExtGroupEntry", func() {
Expect(len(need)).Should(Equal(0))
})
It("insert sync_file2.yaml to memfs should be succeed", func() {
_, err = memFS.CreateEntry(context.TODO(), stub.EntryAttr{
_, err = memFS.CreateEntry(context.TODO(), pluginapi.EntryAttr{
Name: "sync_file2.yaml",
Kind: types.RawKind,
})
Expand All @@ -355,7 +355,7 @@ var _ = Describe("TestExtGroupEntry", func() {
Expect(len(need)).Should(Equal(0))
})
It("delete sync_file2.yaml should be succeed", func() {
err = memFS.RemoveEntry(context.TODO(), &stub.Entry{
err = memFS.RemoveEntry(context.TODO(), &pluginapi.Entry{
Name: "sync_file2.yaml",
Kind: types.RawKind,
})
Expand Down
80 changes: 80 additions & 0 deletions pkg/plugin/buildin/docloader/csv.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/*
Copyright 2023 NanaFS Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package docloader

import (
"context"
"encoding/csv"
"errors"
"fmt"
"github.com/basenana/nanafs/pkg/types"
"io"
"os"
"strings"
)

// csvLoader is the value stored under the "type" metadata key of every
// document produced by the CSV loader.
const csvLoader = "csv"

// CSV loads documents from a CSV file on the local filesystem.
type CSV struct {
	// docPath is the path of the CSV file that Load opens.
	docPath string
}

// NewCSV returns a CSV loader bound to docPath.
//
// option is accepted for signature parity with the other loader constructors
// but is currently ignored.
func NewCSV(docPath string, option map[string]string) CSV {
	var loader CSV
	loader.docPath = docPath
	return loader
}

// Load reads the CSV file at c.docPath and returns one document per data row.
//
// The first record is treated as the header. Every following record becomes a
// document whose content is one "<header>: <value>" line per field, joined by
// newlines, and whose metadata carries the loader type and the 1-based row
// number (the header is not counted). The context argument is not used.
//
// An error opening the file or reading a record is returned as-is with a nil
// result.
func (c CSV) Load(_ context.Context) (result []types.FDocument, err error) {
	file, err := os.Open(c.docPath)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	var (
		columns []string
		rowNum  int
		reader  = csv.NewReader(file)
	)

	for {
		record, readErr := reader.Read()
		switch {
		case errors.Is(readErr, io.EOF):
			// End of input: hand back whatever has been collected.
			return result, nil
		case readErr != nil:
			return nil, readErr
		}

		// The first record seen supplies the column names.
		if len(columns) == 0 {
			columns = append(columns, record...)
			continue
		}

		lines := make([]string, len(record))
		for idx, field := range record {
			lines[idx] = columns[idx] + ": " + field
		}

		rowNum++
		doc := types.FDocument{
			Content:  strings.Join(lines, "\n"),
			Metadata: map[string]any{"type": csvLoader, "row": rowNum},
		}
		result = append(result, doc)
	}
}
102 changes: 102 additions & 0 deletions pkg/plugin/buildin/docloader/docloader.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
/*
Copyright 2023 NanaFS Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package docloader

import (
"context"
"fmt"
"github.com/basenana/nanafs/pkg/plugin/pluginapi"
"github.com/basenana/nanafs/pkg/types"
"os"
"path/filepath"
)

// Identity values reported by the DocLoader plugin.
const (
// PluginName is the value returned by DocLoader.Name.
PluginName = "docloader"
// PluginVersion is the value returned by DocLoader.Version.
PluginVersion = "1.0"
)

// DocLoader is a stateless plugin that turns a file into a list of documents.
// Its identity methods report the package-level name, type and version.
type DocLoader struct{}

// Name returns PluginName.
func (DocLoader) Name() string { return PluginName }

// Type returns types.TypeProcess.
func (DocLoader) Type() types.PluginType { return types.TypeProcess }

// Version returns PluginVersion.
func (DocLoader) Version() string { return PluginVersion }

// Run loads the file named by the request's entry-path parameter and returns
// the parsed documents in the response under pluginapi.ResEntryDocumentsKey.
//
// The parser is selected from the file extension: .pdf, .txt, .html/.htm and
// .webarchive are handled; any other extension produces a failed response.
// Extension matching is case-sensitive.
//
// Every failure (nil request, missing or non-string entry path, stat error,
// unsupported extension, parser error) is reported through a failed
// pluginapi.Response; the error return value is always nil. pluginParams is
// not used.
func (d DocLoader) Run(ctx context.Context, request *pluginapi.Request, pluginParams map[string]string) (*pluginapi.Response, error) {
	if request == nil {
		return pluginapi.NewFailedResponse("request is nil"), nil
	}

	// Comma-ok assertion: the bare form panics when the parameter is absent
	// or is not a string. Both cases leave entryPath empty and are reported
	// by the check below instead of crashing the caller.
	entryPath, _ := request.Parameter[pluginapi.ResEntryPathKey].(string)
	if entryPath == "" {
		return pluginapi.NewFailedResponse("entry_path is empty"), nil
	}

	if _, err := os.Stat(entryPath); err != nil {
		return pluginapi.NewFailedResponse(fmt.Sprintf("stat entry file %s failed: %s", entryPath, err)), nil
	}

	fileExt := filepath.Ext(entryPath)
	var (
		p           Parser
		parseOption = map[string]string{}
	)

	switch fileExt {
	case ".pdf":
		p = buildInLoaders[pdfParser](entryPath, parseOption)
	case ".txt":
		p = buildInLoaders[textParser](entryPath, parseOption)
	case ".html", ".htm":
		p = buildInLoaders[htmlParser](entryPath, parseOption)
	case ".webarchive":
		p = buildInLoaders[webArchiveParser](entryPath, parseOption)
	default:
		return pluginapi.NewFailedResponse(fmt.Sprintf("load %s file unsupported", fileExt)), nil
	}

	documents, err := p.Load(ctx)
	if err != nil {
		return pluginapi.NewFailedResponse(fmt.Sprintf("load file %s failed: %s", entryPath, err)), nil
	}

	return pluginapi.NewResponseWithResult(map[string]any{pluginapi.ResEntryDocumentsKey: documents}), nil
}

// Parser is the interface returned by every parserBuilder; DocLoader.Run
// calls Load on it to obtain the documents for one file.
type Parser interface {
// Load returns the parsed documents, or an error if loading fails.
Load(ctx context.Context) (result []types.FDocument, err error)
}

// parserBuilder constructs a Parser for the file at docPath using the given
// per-document options.
type parserBuilder func(docPath string, docOption map[string]string) Parser

// buildInLoaders maps each parser name to the constructor that builds it.
// Web archives are registered with the same constructor as HTML.
var buildInLoaders = map[string]parserBuilder{
	textParser:       NewText,
	pdfParser:        NewPDF,
	htmlParser:       NewHTML,
	webArchiveParser: NewHTML,
}
17 changes: 17 additions & 0 deletions pkg/plugin/buildin/docloader/epub.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
/*
Copyright 2023 NanaFS Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package docloader
Loading

0 comments on commit e271738

Please sign in to comment.