Fetch and use Botkube website and docs for AI assistant (#25)
- Fetch Botkube website and docs for AI assistant
- Use Botkube website and docs for AI assistant
- Add Readme for assistant setup
- Migrate assistant setup to TypeScript
  - we need to use beta functionality (vector store, file search), which isn't available in the unofficial Go client.
pkosiec authored May 8, 2024
1 parent 6b03bac commit 18faad0
Showing 258 changed files with 18,762 additions and 301 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -17,3 +17,5 @@
dist/
plugins-index.yaml
/.idea/

node_modules
51 changes: 51 additions & 0 deletions hack/assistant-setup/README.md
@@ -0,0 +1,51 @@
# OpenAI Assistant setup

This tool configures the OpenAI assistant for the Botkube AI plugin. It uses documents from the `assets` directory for its file search capability.
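
Under the hood, the script wires the fetched documents into the assistant through an OpenAI vector store and the file search tool (the beta functionality mentioned in the commit message). A minimal TypeScript sketch of that flow, assuming the official `openai` Node.js client and its beta Assistants API; `syncFileSearch`, the store name, and the directory layout are illustrative rather than taken from this repository:

```ts
import fs from "node:fs";
import path from "node:path";
import OpenAI from "openai";

const openai = new OpenAI(); // reads OPENAI_API_KEY from the environment

// Illustrative helper: upload docs and attach them to the assistant's file search.
async function syncFileSearch(assistantID: string, assetsDir: string) {
  // Create a vector store for the scraped website and docs content.
  const vectorStore = await openai.beta.vectorStores.create({
    name: "botkube-content", // assumed name, not from the repo
  });

  // Upload all Markdown documents from the assets directory in one batch.
  const files = fs
    .readdirSync(assetsDir)
    .filter((name) => name.endsWith(".md"))
    .map((name) => fs.createReadStream(path.join(assetsDir, name)));
  await openai.beta.vectorStores.fileBatches.uploadAndPoll(vectorStore.id, { files });

  // Point the assistant's file_search tool at the new vector store.
  await openai.beta.assistants.update(assistantID, {
    tools: [{ type: "file_search" }],
    tool_resources: { file_search: { vector_store_ids: [vectorStore.id] } },
  });
}
```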

## Toolchain

This project uses [Volta](https://github.com/volta-cli/volta) to manage JS tools. Volta automatically downloads and installs the right Node.js version when you run any of the `node` or `npm` commands.

It is recommended to install Volta before running the script to ensure the right Node.js version is used.

## Usage

Navigate to the directory `hack/assistant-setup` and execute one of the following commands.

### Install dependencies

To install all dependencies, run:

```sh
npm install
```

### Start app

```sh
export OPENAI_API_KEY=... # your OpenAI API key
export ASSISTANT_ENV=dev # dev or prod
npm run start
```

To use your own assistant, modify the `assistantID` variable in the `index.ts` file.
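
For illustration, the environment switch might look as follows (a hypothetical sketch; the real assistant IDs are kept in `index.ts` and elided here):

```ts
// Hypothetical shape: pick the assistant based on ASSISTANT_ENV.
const assistantID =
  process.env.ASSISTANT_ENV === "prod"
    ? "asst_..." // production assistant ID
    : "asst_..."; // development assistant ID
```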

## Development

### Refetch content for file search

> **NOTE:** The process uses the [Jina.AI Reader API](https://github.com/jina-ai/reader) and usually takes 10-15 minutes. All previously fetched files are removed before the process starts.

To scrape the content from the latest Botkube website and Botkube Docs, run the following command:

```sh
npm run fetch-content
```
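
The Reader API converts a page to Markdown when you prefix the page URL with the API endpoint, which is exactly what the Go `content-fetcher` below does. A minimal sketch of the same call in TypeScript (Node.js 18+ for the built-in `fetch`):

```ts
// Fetch a single page converted to Markdown via the Jina.AI Reader API.
const res = await fetch("https://r.jina.ai/https://botkube.io");
if (!res.ok) {
  throw new Error(`unexpected status code: ${res.status}`);
}
console.log(await res.text()); // Markdown content of the page
```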

### Format code

To format code, run:

```sh
npm run format
```
209 changes: 209 additions & 0 deletions hack/assistant-setup/content-fetcher/main.go
@@ -0,0 +1,209 @@
package main

import (
	"encoding/xml"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"regexp"
	"strings"
	"time"

	"github.com/avast/retry-go/v4"
	"github.com/sirupsen/logrus"

	"github.com/kubeshop/botkube/pkg/config"
	"github.com/kubeshop/botkube/pkg/loggerx"
	"github.com/kubeshop/botkube/pkg/multierror"
)

const (
	marketingSitemapURL = "https://botkube.io/sitemap.xml"
	docsSitemapURL      = "https://docs.botkube.io/sitemap.xml"
	processingAPIURL    = "https://r.jina.ai"
	contentDir          = "content"
	maxRetries          = 5
	retryInterval       = 1 * time.Second
	httpCliTimeout      = 1 * time.Minute
)

// excludedDocsPagesRegex matches versioned docs URLs, such as
// https://docs.botkube.io/1.9/... or https://docs.botkube.io/next/...,
// so that only the latest published docs pages are fetched.
var excludedDocsPagesRegex = regexp.MustCompile(`^https:\/\/docs\.botkube\.io\/(?:\d+\.\d+|next)\/.*`)

func main() {
	log := loggerx.New(config.Logger{
		Level:     "info",
		Formatter: "text",
	})

	fetcher := &contentFetcher{
		log: log,
		httpCli: &http.Client{
			Timeout: httpCliTimeout,
		},
	}

	log.Infof("Removing old %q directory...", contentDir)
	err := os.RemoveAll(contentDir)
	loggerx.ExitOnError(err, "while removing old directory")

	log.Info("Fetching Botkube sitemap...")
	marketingPages, err := fetcher.getURLsToDownloadFromSitemap(marketingSitemapURL)
	loggerx.ExitOnError(err, "while fetching Botkube sitemap")

	log.Info("Fetching Botkube docs sitemap...")
	docsPages, err := fetcher.getURLsToDownloadFromSitemap(docsSitemapURL)
	loggerx.ExitOnError(err, "while fetching Botkube docs sitemap")

	log.Info("Preparing list of pages to fetch...")
	pagesToFetch := fetcher.preparePageList(docsPages, marketingPages)
	log.Infof("Found %d pages to fetch", len(pagesToFetch))

	log.Infof("Creating %q directory...", contentDir)
	err = os.MkdirAll(contentDir, os.ModePerm)
	loggerx.ExitOnError(err, "while creating directory")

	errs := multierror.New()
	for i, page := range pagesToFetch {
		filePath, err := fetcher.filePathForURL(page)
		if err != nil {
			errs = multierror.Append(errs, err)
			continue
		}
		log.WithFields(logrus.Fields{
			"url":      page,
			"filePath": filePath,
		}).Infof("Fetching and saving page %d of %d...", i+1, len(pagesToFetch))

		err = retry.Do(
			func() error {
				return fetcher.fetchAndSavePage(page, filePath)
			},
			retry.Attempts(maxRetries),
			retry.OnRetry(func(n uint, err error) {
				log.WithError(err).Errorf("while fetching and saving page %q. Retrying...", page)
			}),
			retry.Delay(retryInterval),
		)

		if err != nil {
			errs = multierror.Append(errs, err)
		}
	}

	loggerx.ExitOnError(errs.ErrorOrNil(), "while fetching and saving docs pages")

	log.Infof("Saved %d docs pages", len(pagesToFetch))
}

type contentFetcher struct {
	log     logrus.FieldLogger
	httpCli *http.Client
}

// sitemapURLSet represents the <urlset> root element of a sitemap.xml file.
type sitemapURLSet struct {
	URLs []sitemapURL `xml:"url"`
}

// sitemapURL represents a single <url> entry with its <loc> page address.
type sitemapURL struct {
	Loc string `xml:"loc"`
}

func (f *contentFetcher) getURLsToDownloadFromSitemap(sitemapURL string) ([]string, error) {
	log := f.log.WithField("sitemapURL", sitemapURL)
	// nolint:gosec
	res, err := http.Get(sitemapURL)
	if err != nil {
		return nil, fmt.Errorf("while fetching sitemap %q: %w", sitemapURL, err)
	}
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("invalid status code when fetching Botkube sitemap: %d", res.StatusCode)
	}

	log.Info("Decoding sitemap...")
	var sitemap sitemapURLSet
	err = xml.NewDecoder(res.Body).Decode(&sitemap)
	if err != nil {
		return nil, fmt.Errorf("while decoding sitemap %q: %w", sitemapURL, err)
	}

	var urls []string
	for _, part := range sitemap.URLs {
		urls = append(urls, part.Loc)
	}

	log.Infof("Found %d sitemap entries", len(urls))
	return urls, nil
}

func (f *contentFetcher) fetchAndSavePage(inURL, filePath string) error {
	// The Jina.AI Reader API converts a page to Markdown when the original
	// page URL is appended to the API URL, e.g. https://r.jina.ai/https://botkube.io.
	pageURL := fmt.Sprintf("%s/%s", processingAPIURL, inURL)

	req, err := http.NewRequest(http.MethodGet, pageURL, nil)
	if err != nil {
		return fmt.Errorf("while creating request for page %q: %w", pageURL, err)
	}
	req.Header.Set("Content-Type", "text/event-stream")

	res, err := f.httpCli.Do(req)
	if err != nil {
		return fmt.Errorf("while fetching page %q: %w", pageURL, err)
	}
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		return fmt.Errorf("invalid status code when fetching page %q: %d", pageURL, res.StatusCode)
	}

	// nolint:gosec
	file, err := os.Create(filePath)
	if err != nil {
		return fmt.Errorf("while creating file %q: %w", filePath, err)
	}
	defer file.Close()

	_, err = io.Copy(file, res.Body)
	if err != nil {
		return fmt.Errorf("while writing to file %q: %w", filePath, err)
	}

	return nil
}

func (f *contentFetcher) preparePageList(docsPages, marketingPages []string) []string {
	var out []string
	for _, page := range docsPages {
		// remove all docs for previous and upcoming versions
		if excludedDocsPagesRegex.MatchString(page) {
			continue
		}

		out = append(out, strings.TrimSpace(page))
	}
	for _, page := range marketingPages {
		out = append(out, strings.TrimSpace(page))
	}

	return out
}

// filePathForURL maps a page URL to a flat Markdown file path, e.g.
// https://docs.botkube.io/foo/bar becomes content/docs.botkube.io__foo__bar.md.
func (f *contentFetcher) filePathForURL(inURL string) (string, error) {
	parsedInURL, err := url.Parse(inURL)
	if err != nil {
		return "", fmt.Errorf("while parsing url %q: %w", inURL, err)
	}

	prefix := parsedInURL.Host
	urlPath := strings.Trim(parsedInURL.Path, "/")
	urlPath = strings.ReplaceAll(urlPath, "/", "__")

	fileName := prefix
	if urlPath != "" {
		fileName = fmt.Sprintf("%s__%s", prefix, urlPath)
	}

	return fmt.Sprintf("%s/%s.md", contentDir, fileName), nil
}