Skip to content

Commit

Permalink
feat: support the iframe and image resolving
Browse files Browse the repository at this point in the history
  • Loading branch information
pestanko committed Apr 21, 2024
1 parent b265fc8 commit b933b11
Show file tree
Hide file tree
Showing 7 changed files with 177 additions and 81 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,7 @@ reports/
node_modules
/build

runtime/
runtime/

.env*
!.env.sample
16 changes: 6 additions & 10 deletions config/categories/food.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ pages:
homepage: "https://www.himalayarestaurace.cz/"
url: "https://www.himalayarestaurace.cz/denni-menu/"
resolver: "img"
query: "#menu > img"
query: "div#menu img"
cache_policy: "no-cache"
tags: ["ns", "vlnena", "india", "city"]

Expand Down Expand Up @@ -100,25 +100,21 @@ pages:

- codename: "globus"
name: "Globus Brno"
url: "https://www.globus.cz/brno/nabidka/restaurace.html"
command:
content:
name: python
args:
- "-c"
- 'import requests; import sys; sys.stdout.buffer.write(requests.get("https://www.globus.cz/brno/nabidka/restaurace.html").content)'
url: "https://www.globus.cz/brno/sluzby-a-produkty/restaurace"
homepage: "https://www.globus.cz/brno/"
tags: ["rh", "fast"]
query: ".restaurant__menu-table-row--active"
xpath: "/html/body/div[2]/div/div[2]/div[1]/div/div/div"
filters:
day:
enabled: true
html:
tables: custom

- codename: "annapurna"
name: "Annapurna"
homepage: "http://www.indicka-restaurace-annapurna.cz/"
url: "http://www.indicka-restaurace-annapurna.cz/"
query: "div#T_menu"
query: "div.TJStrana"
tags: ["city", "ns", "vlnena", "india", "asia"]
filters:
day:
Expand Down
122 changes: 122 additions & 0 deletions internal/scraper/resolvers/html_node.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
package resolvers

import (
	"bytes"
	"context"
	"strings"

	"github.com/PuerkitoBio/goquery"
	"github.com/antchfx/htmlquery"
	"github.com/pestanko/miniscrape/internal/models"
	"github.com/rs/zerolog"
	"golang.org/x/net/html"
)

// HTMLPageNode represents a node in the HTML page
type HTMLPageNode struct {
	// Content is the node's HTML markup as produced by the parser.
	Content string
	// Attrs holds the attributes of the matched node itself
	// (e.g. src, href) so resolvers can read them without re-parsing.
	Attrs []html.Attribute
}

// ParseWebPageContent parses the web page content and returns the HTML
// nodes selected by the page's query. A CSS selector (page.Query) takes
// precedence; when it is empty the page's XPath expression is used.
func ParseWebPageContent(
	ctx context.Context,
	page *models.Page,
	bodyContent []byte,
) (contentArray []HTMLPageNode, err error) {
	if page.Query == "" {
		return parseUsingXPathQuery(ctx, bodyContent, page.XPath)
	}
	return parseUsingCSSQuery(ctx, bodyContent, page.Query)
}

// parseUsingXPathQuery parses content as an HTML document and returns one
// HTMLPageNode (outer HTML plus attributes) per node matched by xpath.
// Nil nodes returned by the query are skipped.
func parseUsingXPathQuery(ctx context.Context, content []byte, xpath string) ([]HTMLPageNode, error) {
	zerolog.Ctx(ctx).Trace().
		Str("xpath", xpath).
		// fixed duplicated word in the log message ("the the")
		Msg("Parse using the XPath")

	root, err := htmlquery.Parse(bytes.NewReader(content))
	if err != nil {
		return []HTMLPageNode{}, err
	}
	nodes, err := htmlquery.QueryAll(root, xpath)
	if err != nil {
		return []HTMLPageNode{}, err
	}

	// Pre-size: each non-nil node yields exactly one result entry.
	result := make([]HTMLPageNode, 0, len(nodes))

	for _, node := range nodes {
		if node == nil {
			continue
		}
		// Render the node including the element itself (self-closing arg true).
		htmlContent := htmlquery.OutputHTML(node, true)
		result = append(result, HTMLPageNode{
			Content: htmlContent,
			Attrs:   node.Attr,
		})
	}

	return result, nil
}

// parseUsingCSSQuery parses bodyContent as an HTML document and returns one
// HTMLPageNode (inner HTML plus the element's attributes) per element matched
// by the CSS selector query. Elements whose HTML cannot be extracted are
// logged and skipped; an empty result is logged as a warning but not an error.
func parseUsingCSSQuery(ctx context.Context, bodyContent []byte, query string) ([]HTMLPageNode, error) {
	ll := zerolog.Ctx(ctx).With().Str("css_query", query).Logger()
	// fixed duplicated word in the log message ("the the")
	ll.Trace().Msg("Parse using the CSS query")
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(bodyContent))
	if err != nil {
		return []HTMLPageNode{}, err
	}

	var content []HTMLPageNode
	doc.Find(query).Each(func(idx int, selection *goquery.Selection) {
		// Selection.Html returns the inner HTML of the first node only.
		htmlContent, err := selection.Html()
		if err != nil {
			ll.Warn().
				Err(err).
				Msg("Text extraction failed")
			return
		}

		attrs := getAttributesFromSelection(selection)

		content = append(content, HTMLPageNode{
			Content: htmlContent,
			Attrs:   attrs,
		})
	})

	if len(content) == 0 {
		ll.Warn().Msg("No content found")
	}

	return content, nil
}

// getAttributesFromSelection returns the attribute list of the first node in
// the selection. A nil or empty selection yields an empty (non-nil) slice.
func getAttributesFromSelection(selection *goquery.Selection) []html.Attribute {
	if selection != nil && len(selection.Nodes) > 0 {
		return selection.Nodes[0].Attr
	}
	return []html.Attribute{}
}

// getAttrValue returns the value of the first attribute whose key equals
// name, or the empty string when no such attribute exists.
func getAttrValue(attrs []html.Attribute, name string) string {
	for i := range attrs {
		if attrs[i].Key == name {
			return attrs[i].Val
		}
	}
	return ""
}

// concatContent concats the content of every node, appending a newline
// after each entry (so the result ends with "\n" when the input is non-empty).
func concatContent(contentArray []HTMLPageNode) string {
	// strings.Builder avoids the quadratic cost of repeated string
	// concatenation (`+=`) when many nodes are joined.
	var b strings.Builder
	for _, node := range contentArray {
		b.WriteString(node.Content)
		b.WriteByte('\n')
	}
	return b.String()
}
27 changes: 23 additions & 4 deletions internal/scraper/resolvers/image.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@ package resolvers

import (
"context"
"net/http"

"github.com/pestanko/miniscrape/internal/models"
"github.com/rs/zerolog"
"net/http"
"strings"
)

type imageResolver struct {
Expand All @@ -15,12 +15,25 @@ type imageResolver struct {

// Resolve implements PageResolver
func (r *imageResolver) Resolve(ctx context.Context) models.RunResult {

ll := zerolog.Ctx(ctx).With().
Interface("page",
zerolog.Dict().
Str("codename", r.page.CodeName).
Str("url", r.page.URL).
Str("namespace", r.page.Namespace()).
Str("resolver", r.page.Resolver),
).
Logger()

ll.Debug().Msg("Resolving manu")

bodyContent, err := getContentForWebPage(ctx, &r.page)
if err != nil {
return makeErrorResult(r.page, err)
}

contentArray, err := parseWebPageContent(ctx, &r.page, bodyContent)
contentArray, err := ParseWebPageContent(ctx, &r.page, bodyContent)
if err != nil {
zerolog.Ctx(ctx).
Warn().
Expand All @@ -32,7 +45,13 @@ func (r *imageResolver) Resolve(ctx context.Context) models.RunResult {
return makeErrorResult(r.page, err)
}

content := strings.Join(contentArray, "")
if len(contentArray) == 0 {
ll.Warn().Msg("No content found")
return makeEmptyResult(r.page, "img")
}

// Pick the first image
content := getAttrValue(contentArray[0].Attrs, "src")

return models.RunResult{
Page: r.page,
Expand Down
75 changes: 9 additions & 66 deletions internal/scraper/resolvers/page_content.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,16 @@ import (
"bytes"
"context"
"fmt"
"github.com/pestanko/miniscrape/internal/models"
"github.com/pestanko/miniscrape/internal/scraper/filters"
"io"
"math/rand"
"net/http"
"os/exec"
"strings"
"time"

"github.com/PuerkitoBio/goquery"
"github.com/antchfx/htmlquery"
"github.com/pestanko/miniscrape/internal/models"
"github.com/pestanko/miniscrape/internal/scraper/filters"

"github.com/rs/zerolog"
"golang.org/x/net/html/charset"
"golang.org/x/text/encoding"
Expand Down Expand Up @@ -53,7 +52,7 @@ func (r *pageContentResolver) Resolve(ctx context.Context) models.RunResult {

ll.Trace().Bytes("body", bodyContent).Msg("page body")

contentArray, err := parseWebPageContent(ctx, &r.page, bodyContent)
contentArray, err := ParseWebPageContent(ctx, &r.page, bodyContent)
if err != nil {
ll.
Err(err).
Expand All @@ -62,7 +61,11 @@ func (r *pageContentResolver) Resolve(ctx context.Context) models.RunResult {
return makeErrorResult(r.page, err)
}

content := strings.Join(contentArray, "\n")
if len(contentArray) == 0 {
return makeEmptyResult(r.page, "content")
}

content := concatContent(contentArray)
content = r.applyFilters(ctx, content)

var status = models.RunSuccess
Expand Down Expand Up @@ -178,66 +181,6 @@ func getContentByRequest(ctx context.Context, page *models.Page) ([]byte, error)
return bodyContent, err
}

// parseUsingXPathQuery parses content as an HTML document and returns the
// outer HTML of every node matched by the given XPath expression.
func parseUsingXPathQuery(ctx context.Context, content []byte, xpath string) ([]string, error) {
	zerolog.Ctx(ctx).Trace().
		Str("xpath", xpath).
		Msg("Parse using the the XPath")

	root, err := htmlquery.Parse(bytes.NewReader(content))
	if err != nil {
		return []string{}, err
	}
	nodes, err := htmlquery.QueryAll(root, xpath)
	if err != nil {
		return []string{}, err
	}

	var result []string

	for _, node := range nodes {
		// NOTE(review): local `html` shadows any imported `html` package
		// name in this scope; consider renaming to htmlContent.
		html := htmlquery.OutputHTML(node, true)
		result = append(result, html)
	}

	return result, nil
}

// parseWebPageContent extracts the matched fragments from the page body.
// The CSS selector (page.Query) takes precedence; when it is empty the
// page's XPath expression is used instead.
func parseWebPageContent(
	ctx context.Context,
	page *models.Page,
	bodyContent []byte,
) (contentArray []string, err error) {
	if page.Query != "" {
		contentArray, err = parseUsingCSSQuery(ctx, bodyContent, page.Query)
	} else {
		contentArray, err = parseUsingXPathQuery(ctx, bodyContent, page.XPath)
	}
	return
}

// parseUsingCSSQuery parses bodyContent as an HTML document and returns the
// inner HTML of every element matched by the CSS selector query. Elements
// whose HTML cannot be extracted are logged and skipped.
func parseUsingCSSQuery(ctx context.Context, bodyContent []byte, query string) ([]string, error) {
	ll := zerolog.Ctx(ctx).With().Str("css_query", query).Logger()
	ll.Trace().Msg("Parse using the the CSS query")
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(bodyContent))
	if err != nil {
		return []string{}, err
	}

	var content []string
	doc.Find(query).Each(func(idx int, selection *goquery.Selection) {
		// Selection.Html returns the inner HTML of the first node only.
		htmlContent, err := selection.Html()
		if err != nil {
			ll.Warn().
				Err(err).
				Msg("Text extraction failed")
			return
		}
		content = append(content, htmlContent)
	})

	return content, nil
}

func (r *pageContentResolver) applyFilters(ctx context.Context, content string) string {
if strings.TrimSpace(content) == "" {
return ""
Expand Down
9 changes: 9 additions & 0 deletions internal/scraper/resolvers/url_only.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,12 @@ func makeErrorResult(page models.Page, err error) models.RunResult {
Kind: "error",
}
}

// makeEmptyResult builds a RunResult with empty content and the RunEmpty
// status for the given page; kind labels which resolver produced it.
func makeEmptyResult(page models.Page, kind string) models.RunResult {
	return models.RunResult{
		Page:    page,
		Content: "",
		Status:  models.RunEmpty,
		Kind:    kind,
	}
}
4 changes: 4 additions & 0 deletions miniscrape-ui/src/routes/(app)/pages/+page.svelte
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@
{#if page.status === 'ok'}
{#if page.resolver === 'pdf'}
<embed src={page.content} type="application/pdf" width="100%" height="600px" />
{:else if page.resolver === 'img'}
<img src={page.content} alt="Daily Menu: {page.page.name}" />
{:else if page.resolver === 'url_only'}
<iframe src={page.content} width="100%" height="600px" title="Daily Menu: {page.page.name}" />
{:else}
<pre>
{page.content}
Expand Down

0 comments on commit b933b11

Please sign in to comment.