From b933b115df669d6f451c82f2e7f76936301e15e8 Mon Sep 17 00:00:00 2001 From: Peter Stanko Date: Sun, 21 Apr 2024 21:31:21 +0200 Subject: [PATCH] feat: support the iframe and image resolving --- .gitignore | 5 +- config/categories/food.yml | 16 +-- internal/scraper/resolvers/html_node.go | 122 ++++++++++++++++++ internal/scraper/resolvers/image.go | 27 +++- internal/scraper/resolvers/page_content.go | 75 ++--------- internal/scraper/resolvers/url_only.go | 9 ++ .../src/routes/(app)/pages/+page.svelte | 4 + 7 files changed, 177 insertions(+), 81 deletions(-) create mode 100644 internal/scraper/resolvers/html_node.go diff --git a/.gitignore b/.gitignore index e54f903..8ca1bc5 100644 --- a/.gitignore +++ b/.gitignore @@ -14,4 +14,7 @@ reports/ node_modules /build -runtime/ \ No newline at end of file +runtime/ + +.env* +!.env.sample \ No newline at end of file diff --git a/config/categories/food.yml b/config/categories/food.yml index c5f72ee..05891d5 100644 --- a/config/categories/food.yml +++ b/config/categories/food.yml @@ -24,7 +24,7 @@ pages: homepage: "https://www.himalayarestaurace.cz/" url: "https://www.himalayarestaurace.cz/denni-menu/" resolver: "img" - query: "#menu > img" + query: "div#menu img" cache_policy: "no-cache" tags: ["ns", "vlnena", "india", "city"] @@ -100,17 +100,13 @@ pages: - codename: "globus" name: "Globus Brno" - url: "https://www.globus.cz/brno/nabidka/restaurace.html" - command: - content: - name: python - args: - - "-c" - - 'import requests; import sys; sys.stdout.buffer.write(requests.get("https://www.globus.cz/brno/nabidka/restaurace.html").content)' + url: "https://www.globus.cz/brno/sluzby-a-produkty/restaurace" homepage: "https://www.globus.cz/brno/" tags: ["rh", "fast"] - query: ".restaurant__menu-table-row--active" + xpath: "/html/body/div[2]/div/div[2]/div[1]/div/div/div" filters: + day: + enabled: true html: tables: custom @@ -118,7 +114,7 @@ pages: name: "Annapurna" homepage: "http://www.indicka-restaurace-annapurna.cz/" url: 
"http://www.indicka-restaurace-annapurna.cz/" - query: "div#T_menu" + query: "div.TJStrana" tags: ["city", "ns", "vlnena", "india", "asia"] filters: day: diff --git a/internal/scraper/resolvers/html_node.go b/internal/scraper/resolvers/html_node.go new file mode 100644 index 0000000..119d99c --- /dev/null +++ b/internal/scraper/resolvers/html_node.go @@ -0,0 +1,122 @@ +package resolvers + +import ( + "bytes" + "context" + + "github.com/PuerkitoBio/goquery" + "github.com/antchfx/htmlquery" + "github.com/pestanko/miniscrape/internal/models" + "github.com/rs/zerolog" + "golang.org/x/net/html" +) + +// HTMLPageNode represents a node in the HTML page +type HTMLPageNode struct { + Content string + Attrs []html.Attribute +} + +// ParseWebPageContent parses the web page content +func ParseWebPageContent( + ctx context.Context, + page *models.Page, + bodyContent []byte, +) (contentArray []HTMLPageNode, err error) { + if page.Query != "" { + contentArray, err = parseUsingCSSQuery(ctx, bodyContent, page.Query) + } else { + contentArray, err = parseUsingXPathQuery(ctx, bodyContent, page.XPath) + } + return +} + +func parseUsingXPathQuery(ctx context.Context, content []byte, xpath string) ([]HTMLPageNode, error) { + zerolog.Ctx(ctx).Trace(). + Str("xpath", xpath). 
+ Msg("Parse using the XPath") + + root, err := htmlquery.Parse(bytes.NewReader(content)) + if err != nil { + return []HTMLPageNode{}, err + } + nodes, err := htmlquery.QueryAll(root, xpath) + if err != nil { + return []HTMLPageNode{}, err + } + + var result []HTMLPageNode + + for _, node := range nodes { + if node == nil { + continue + } + htmlContent := htmlquery.OutputHTML(node, true) + result = append(result, HTMLPageNode{ + Content: htmlContent, + Attrs: node.Attr, + }) + } + + return result, nil +} + +func parseUsingCSSQuery(ctx context.Context, bodyContent []byte, query string) ([]HTMLPageNode, error) { + ll := zerolog.Ctx(ctx).With().Str("css_query", query).Logger() + ll.Trace().Msg("Parse using the CSS query") + doc, err := goquery.NewDocumentFromReader(bytes.NewReader(bodyContent)) + if err != nil { + return []HTMLPageNode{}, err + } + + var content []HTMLPageNode + doc.Find(query).Each(func(idx int, selection *goquery.Selection) { + htmlContent, err := selection.Html() + if err != nil { + ll.Warn(). + Err(err). 
+ Msg("Text extraction failed") + return + } + + attrs := getAttributesFromSelection(selection) + + content = append(content, HTMLPageNode{ + Content: htmlContent, + Attrs: attrs, + }) + }) + + if len(content) == 0 { + ll.Warn().Msg("No content found") + } + + return content, nil +} + +func getAttributesFromSelection(selection *goquery.Selection) []html.Attribute { + if selection == nil || len(selection.Nodes) == 0 { + return []html.Attribute{} + } + + return selection.Nodes[0].Attr +} + +// getAttrValue returns the value of the attribute +func getAttrValue(attrs []html.Attribute, name string) string { + for _, attr := range attrs { + if attr.Key == name { + return attr.Val + } + } + return "" +} + +// concatContent concats the content +func concatContent(contentArray []HTMLPageNode) string { + var content string + for _, node := range contentArray { + content += node.Content + "\n" + } + return content +} diff --git a/internal/scraper/resolvers/image.go b/internal/scraper/resolvers/image.go index bc9da7b..c7bf8fd 100644 --- a/internal/scraper/resolvers/image.go +++ b/internal/scraper/resolvers/image.go @@ -2,10 +2,10 @@ package resolvers import ( "context" + "net/http" + "github.com/pestanko/miniscrape/internal/models" "github.com/rs/zerolog" - "net/http" - "strings" ) type imageResolver struct { @@ -15,12 +15,25 @@ type imageResolver struct { // Resolve implements PageResolver func (r *imageResolver) Resolve(ctx context.Context) models.RunResult { + + ll := zerolog.Ctx(ctx).With(). + Interface("page", + zerolog.Dict(). + Str("codename", r.page.CodeName). + Str("url", r.page.URL). + Str("namespace", r.page.Namespace()). + Str("resolver", r.page.Resolver), + ). 
+ Logger() + + ll.Debug().Msg("Resolving menu") + bodyContent, err := getContentForWebPage(ctx, &r.page) if err != nil { return makeErrorResult(r.page, err) } - contentArray, err := parseWebPageContent(ctx, &r.page, bodyContent) + contentArray, err := ParseWebPageContent(ctx, &r.page, bodyContent) if err != nil { zerolog.Ctx(ctx). Warn(). @@ -32,7 +45,13 @@ func (r *imageResolver) Resolve(ctx context.Context) models.RunResult { return makeErrorResult(r.page, err) } - content := strings.Join(contentArray, "") + if len(contentArray) == 0 { + ll.Warn().Msg("No content found") + return makeEmptyResult(r.page, "img") + } + + // Pick the first image + content := getAttrValue(contentArray[0].Attrs, "src") return models.RunResult{ Page: r.page, diff --git a/internal/scraper/resolvers/page_content.go b/internal/scraper/resolvers/page_content.go index a2ae597..4a9502f 100644 --- a/internal/scraper/resolvers/page_content.go +++ b/internal/scraper/resolvers/page_content.go @@ -5,8 +5,6 @@ import ( "bytes" "context" "fmt" - "github.com/pestanko/miniscrape/internal/models" - "github.com/pestanko/miniscrape/internal/scraper/filters" "io" "math/rand" "net/http" @@ -14,8 +12,9 @@ import ( "strings" "time" - "github.com/PuerkitoBio/goquery" - "github.com/antchfx/htmlquery" + "github.com/pestanko/miniscrape/internal/models" + "github.com/pestanko/miniscrape/internal/scraper/filters" + "github.com/rs/zerolog" "golang.org/x/net/html/charset" "golang.org/x/text/encoding" @@ -53,7 +52,7 @@ func (r *pageContentResolver) Resolve(ctx context.Context) models.RunResult { ll.Trace().Bytes("body", bodyContent).Msg("page body") - contentArray, err := parseWebPageContent(ctx, &r.page, bodyContent) + contentArray, err := ParseWebPageContent(ctx, &r.page, bodyContent) if err != nil { ll. Err(err). 
@@ -62,7 +61,11 @@ func (r *pageContentResolver) Resolve(ctx context.Context) models.RunResult { return makeErrorResult(r.page, err) } - content := strings.Join(contentArray, "\n") + if len(contentArray) == 0 { + return makeEmptyResult(r.page, "content") + } + + content := concatContent(contentArray) content = r.applyFilters(ctx, content) var status = models.RunSuccess @@ -178,66 +181,6 @@ func getContentByRequest(ctx context.Context, page *models.Page) ([]byte, error) return bodyContent, err } -func parseUsingXPathQuery(ctx context.Context, content []byte, xpath string) ([]string, error) { - zerolog.Ctx(ctx).Trace(). - Str("xpath", xpath). - Msg("Parse using the the XPath") - - root, err := htmlquery.Parse(bytes.NewReader(content)) - if err != nil { - return []string{}, err - } - nodes, err := htmlquery.QueryAll(root, xpath) - if err != nil { - return []string{}, err - } - - var result []string - - for _, node := range nodes { - html := htmlquery.OutputHTML(node, true) - result = append(result, html) - } - - return result, nil -} - -func parseWebPageContent( - ctx context.Context, - page *models.Page, - bodyContent []byte, -) (contentArray []string, err error) { - if page.Query != "" { - contentArray, err = parseUsingCSSQuery(ctx, bodyContent, page.Query) - } else { - contentArray, err = parseUsingXPathQuery(ctx, bodyContent, page.XPath) - } - return -} - -func parseUsingCSSQuery(ctx context.Context, bodyContent []byte, query string) ([]string, error) { - ll := zerolog.Ctx(ctx).With().Str("css_query", query).Logger() - ll.Trace().Msg("Parse using the the CSS query") - doc, err := goquery.NewDocumentFromReader(bytes.NewReader(bodyContent)) - if err != nil { - return []string{}, err - } - - var content []string - doc.Find(query).Each(func(idx int, selection *goquery.Selection) { - htmlContent, err := selection.Html() - if err != nil { - ll.Warn(). - Err(err). 
- Msg("Text extraction failed") - return - } - content = append(content, htmlContent) - }) - - return content, nil -} - func (r *pageContentResolver) applyFilters(ctx context.Context, content string) string { if strings.TrimSpace(content) == "" { return "" diff --git a/internal/scraper/resolvers/url_only.go b/internal/scraper/resolvers/url_only.go index f1fa781..7073804 100644 --- a/internal/scraper/resolvers/url_only.go +++ b/internal/scraper/resolvers/url_only.go @@ -28,3 +28,12 @@ func makeErrorResult(page models.Page, err error) models.RunResult { Kind: "error", } } + +func makeEmptyResult(page models.Page, kind string) models.RunResult { + return models.RunResult{ + Page: page, + Content: "", + Status: models.RunEmpty, + Kind: kind, + } +} diff --git a/miniscrape-ui/src/routes/(app)/pages/+page.svelte b/miniscrape-ui/src/routes/(app)/pages/+page.svelte index e2ae27a..a1dd7a4 100644 --- a/miniscrape-ui/src/routes/(app)/pages/+page.svelte +++ b/miniscrape-ui/src/routes/(app)/pages/+page.svelte @@ -31,6 +31,10 @@ {#if page.status === 'ok'} {#if page.resolver === 'pdf'} + {:else if page.resolver === 'img'} + Daily Menu: {page.page.name} + {:else if page.resolver === 'url_only'} +