feat: support the iframe and image resolving

pestanko · Apr 21, 2024 · b933b11 · b933b11
1 parent b265fc8
commit b933b11
Show file tree

Hide file tree

Showing 7 changed files with 177 additions and 81 deletions.
diff --git a/.gitignore b/.gitignore
@@ -14,4 +14,7 @@ reports/
 node_modules
 /build
 
-runtime/
+runtime/
+
+.env*
+!.env.sample
diff --git a/config/categories/food.yml b/config/categories/food.yml
@@ -24,7 +24,7 @@ pages:
     homepage: "https://www.himalayarestaurace.cz/"
     url: "https://www.himalayarestaurace.cz/denni-menu/"
     resolver: "img"
-    query: "#menu > img"
+    query: "div#menu img"
     cache_policy: "no-cache" 
     tags: ["ns", "vlnena", "india", "city"]
 
@@ -100,25 +100,21 @@ pages:
 
   - codename: "globus"
     name: "Globus Brno"
-    url: "https://www.globus.cz/brno/nabidka/restaurace.html"
-    command:
-      content:
-        name: python
-        args:
-          - "-c"
-          - 'import requests; import sys; sys.stdout.buffer.write(requests.get("https://www.globus.cz/brno/nabidka/restaurace.html").content)'
+    url: "https://www.globus.cz/brno/sluzby-a-produkty/restaurace"
     homepage: "https://www.globus.cz/brno/"
     tags: ["rh", "fast"]
-    query: ".restaurant__menu-table-row--active"
+    xpath: "/html/body/div[2]/div/div[2]/div[1]/div/div/div"
     filters:
+      day:
+        enabled: true
       html:
         tables: custom
 
   - codename: "annapurna"
     name: "Annapurna"
     homepage: "http://www.indicka-restaurace-annapurna.cz/"
     url: "http://www.indicka-restaurace-annapurna.cz/"
-    query: "div#T_menu"
+    query: "div.TJStrana"
     tags: ["city", "ns", "vlnena", "india", "asia"]
     filters:
       day:

diff --git a/internal/scraper/resolvers/html_node.go b/internal/scraper/resolvers/html_node.go
@@ -0,0 +1,122 @@
+package resolvers
+
+import (
+	"bytes"
+	"context"
+
+	"github.com/PuerkitoBio/goquery"
+	"github.com/antchfx/htmlquery"
+	"github.com/pestanko/miniscrape/internal/models"
+	"github.com/rs/zerolog"
+	"golang.org/x/net/html"
+)
+
+// HTMLPageNode represents a node in the HTML page that was matched by a
+// page's CSS or XPath query. It is produced by parseUsingCSSQuery and
+// parseUsingXPathQuery.
+type HTMLPageNode struct {
+	Content string           // HTML from selection.Html() (CSS) or htmlquery.OutputHTML(node, true) (XPath)
+	Attrs   []html.Attribute // attributes of the matched node
+}
+
+// ParseWebPageContent extracts the nodes selected by the page configuration
+// from bodyContent. A non-empty page.Query is evaluated as a CSS selector;
+// otherwise page.XPath is evaluated as an XPath expression.
+func ParseWebPageContent(
+	ctx context.Context,
+	page *models.Page,
+	bodyContent []byte,
+) (contentArray []HTMLPageNode, err error) {
+	if page.Query == "" {
+		return parseUsingXPathQuery(ctx, bodyContent, page.XPath)
+	}
+	return parseUsingCSSQuery(ctx, bodyContent, page.Query)
+}
+
+// parseUsingXPathQuery parses content as HTML and returns one HTMLPageNode for
+// every node matched by the xpath expression.
+//
+// Content holds htmlquery.OutputHTML(node, true) for the matched node and
+// Attrs holds that node's attributes. A parse failure or a failing query is
+// returned as the error together with a nil slice. When nothing matches, the
+// result is empty, the error is nil and a warning is logged, mirroring
+// parseUsingCSSQuery.
+func parseUsingXPathQuery(ctx context.Context, content []byte, xpath string) ([]HTMLPageNode, error) {
+	ll := zerolog.Ctx(ctx).With().Str("xpath", xpath).Logger()
+	ll.Trace().Msg("Parse using the XPath")
+
+	root, err := htmlquery.Parse(bytes.NewReader(content))
+	if err != nil {
+		return nil, err
+	}
+	nodes, err := htmlquery.QueryAll(root, xpath)
+	if err != nil {
+		return nil, err
+	}
+
+	var result []HTMLPageNode
+	for _, node := range nodes {
+		// Skip nil entries defensively; there is nothing to serialize for them.
+		if node == nil {
+			continue
+		}
+		result = append(result, HTMLPageNode{
+			Content: htmlquery.OutputHTML(node, true),
+			Attrs:   node.Attr,
+		})
+	}
+
+	if len(result) == 0 {
+		ll.Warn().Msg("No content found")
+	}
+
+	return result, nil
+}
+
+// parseUsingCSSQuery parses bodyContent as HTML and returns one HTMLPageNode
+// for every element matched by the CSS selector in query.
+//
+// Content holds the result of selection.Html() for the matched element and
+// Attrs holds that element's attributes. Elements whose HTML cannot be
+// rendered are logged and skipped. A document parse failure is returned as the
+// error together with a nil slice; an empty match only logs a warning and
+// yields an empty result with a nil error.
+func parseUsingCSSQuery(ctx context.Context, bodyContent []byte, query string) ([]HTMLPageNode, error) {
+	ll := zerolog.Ctx(ctx).With().Str("css_query", query).Logger()
+	ll.Trace().Msg("Parse using the CSS query")
+
+	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(bodyContent))
+	if err != nil {
+		return nil, err
+	}
+
+	var content []HTMLPageNode
+	doc.Find(query).Each(func(_ int, selection *goquery.Selection) {
+		htmlContent, err := selection.Html()
+		if err != nil {
+			// One unrenderable element must not abort the whole page.
+			ll.Warn().Err(err).Msg("Text extraction failed")
+			return
+		}
+		content = append(content, HTMLPageNode{
+			Content: htmlContent,
+			Attrs:   getAttributesFromSelection(selection),
+		})
+	})
+
+	if len(content) == 0 {
+		ll.Warn().Msg("No content found")
+	}
+
+	return content, nil
+}
+
+// getAttributesFromSelection returns the attributes of the first node in the
+// selection, or an empty (non-nil) slice when the selection is nil or holds
+// no nodes.
+func getAttributesFromSelection(selection *goquery.Selection) []html.Attribute {
+	if selection != nil && len(selection.Nodes) > 0 {
+		return selection.Nodes[0].Attr
+	}
+	return []html.Attribute{}
+}
+
+// getAttrValue looks up the first attribute whose Key equals name and returns
+// its Val. It returns "" when no attribute matches, so a missing attribute and
+// an attribute with an empty value are indistinguishable to the caller.
+func getAttrValue(attrs []html.Attribute, name string) string {
+	for i := range attrs {
+		if attrs[i].Key != name {
+			continue
+		}
+		return attrs[i].Val
+	}
+	return ""
+}
+
+// concatContent joins the Content of every node in contentArray, appending a
+// newline after each one (including the last). An empty input yields "".
+func concatContent(contentArray []HTMLPageNode) string {
+	// Accumulate in a buffer rather than with += so the work stays linear in
+	// the total content size; bytes is already imported by this file.
+	var buf bytes.Buffer
+	for i := range contentArray {
+		buf.WriteString(contentArray[i].Content)
+		buf.WriteByte('\n')
+	}
+	return buf.String()
+}
diff --git a/internal/scraper/resolvers/image.go b/internal/scraper/resolvers/image.go
@@ -2,10 +2,10 @@ package resolvers
 
 import (
 	"context"
+	"net/http"
+
 	"github.com/pestanko/miniscrape/internal/models"
 	"github.com/rs/zerolog"
-	"net/http"
-	"strings"
 )
 
 type imageResolver struct {
@@ -15,12 +15,25 @@ type imageResolver struct {
 
 // Resolve implements PageResolver
 func (r *imageResolver) Resolve(ctx context.Context) models.RunResult {
+
+	ll := zerolog.Ctx(ctx).With().
+		Interface("page",
+			zerolog.Dict().
+				Str("codename", r.page.CodeName).
+				Str("url", r.page.URL).
+				Str("namespace", r.page.Namespace()).
+				Str("resolver", r.page.Resolver),
+		).
+		Logger()
+
+	ll.Debug().Msg("Resolving manu")
+
 	bodyContent, err := getContentForWebPage(ctx, &r.page)
 	if err != nil {
 		return makeErrorResult(r.page, err)
 	}
 
-	contentArray, err := parseWebPageContent(ctx, &r.page, bodyContent)
+	contentArray, err := ParseWebPageContent(ctx, &r.page, bodyContent)
 	if err != nil {
 		zerolog.Ctx(ctx).
 			Warn().
@@ -32,7 +45,13 @@ func (r *imageResolver) Resolve(ctx context.Context) models.RunResult {
 		return makeErrorResult(r.page, err)
 	}
 
-	content := strings.Join(contentArray, "")
+	if len(contentArray) == 0 {
+		ll.Warn().Msg("No content found")
+		return makeEmptyResult(r.page, "img")
+	}
+
+	// Pick the first image
+	content := getAttrValue(contentArray[0].Attrs, "src")
 
 	return models.RunResult{
 		Page:    r.page,

diff --git a/internal/scraper/resolvers/page_content.go b/internal/scraper/resolvers/page_content.go
@@ -5,17 +5,16 @@ import (
 	"bytes"
 	"context"
 	"fmt"
-	"github.com/pestanko/miniscrape/internal/models"
-	"github.com/pestanko/miniscrape/internal/scraper/filters"
 	"io"
 	"math/rand"
 	"net/http"
 	"os/exec"
 	"strings"
 	"time"
 
-	"github.com/PuerkitoBio/goquery"
-	"github.com/antchfx/htmlquery"
+	"github.com/pestanko/miniscrape/internal/models"
+	"github.com/pestanko/miniscrape/internal/scraper/filters"
+
 	"github.com/rs/zerolog"
 	"golang.org/x/net/html/charset"
 	"golang.org/x/text/encoding"
@@ -53,7 +52,7 @@ func (r *pageContentResolver) Resolve(ctx context.Context) models.RunResult {
 
 	ll.Trace().Bytes("body", bodyContent).Msg("page body")
 
-	contentArray, err := parseWebPageContent(ctx, &r.page, bodyContent)
+	contentArray, err := ParseWebPageContent(ctx, &r.page, bodyContent)
 	if err != nil {
 		ll.
 			Err(err).
@@ -62,7 +61,11 @@ func (r *pageContentResolver) Resolve(ctx context.Context) models.RunResult {
 		return makeErrorResult(r.page, err)
 	}
 
-	content := strings.Join(contentArray, "\n")
+	if len(contentArray) == 0 {
+		return makeEmptyResult(r.page, "content")
+	}
+
+	content := concatContent(contentArray)
 	content = r.applyFilters(ctx, content)
 
 	var status = models.RunSuccess
@@ -178,66 +181,6 @@ func getContentByRequest(ctx context.Context, page *models.Page) ([]byte, error)
 	return bodyContent, err
 }
 
-func parseUsingXPathQuery(ctx context.Context, content []byte, xpath string) ([]string, error) {
-	zerolog.Ctx(ctx).Trace().
-		Str("xpath", xpath).
-		Msg("Parse using the the XPath")
-
-	root, err := htmlquery.Parse(bytes.NewReader(content))
-	if err != nil {
-		return []string{}, err
-	}
-	nodes, err := htmlquery.QueryAll(root, xpath)
-	if err != nil {
-		return []string{}, err
-	}
-
-	var result []string
-
-	for _, node := range nodes {
-		html := htmlquery.OutputHTML(node, true)
-		result = append(result, html)
-	}
-
-	return result, nil
-}
-
-func parseWebPageContent(
-	ctx context.Context,
-	page *models.Page,
-	bodyContent []byte,
-) (contentArray []string, err error) {
-	if page.Query != "" {
-		contentArray, err = parseUsingCSSQuery(ctx, bodyContent, page.Query)
-	} else {
-		contentArray, err = parseUsingXPathQuery(ctx, bodyContent, page.XPath)
-	}
-	return
-}
-
-func parseUsingCSSQuery(ctx context.Context, bodyContent []byte, query string) ([]string, error) {
-	ll := zerolog.Ctx(ctx).With().Str("css_query", query).Logger()
-	ll.Trace().Msg("Parse using the the CSS query")
-	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(bodyContent))
-	if err != nil {
-		return []string{}, err
-	}
-
-	var content []string
-	doc.Find(query).Each(func(idx int, selection *goquery.Selection) {
-		htmlContent, err := selection.Html()
-		if err != nil {
-			ll.Warn().
-				Err(err).
-				Msg("Text extraction failed")
-			return
-		}
-		content = append(content, htmlContent)
-	})
-
-	return content, nil
-}
-
 func (r *pageContentResolver) applyFilters(ctx context.Context, content string) string {
 	if strings.TrimSpace(content) == "" {
 		return ""

diff --git a/internal/scraper/resolvers/url_only.go b/internal/scraper/resolvers/url_only.go
@@ -28,3 +28,12 @@ func makeErrorResult(page models.Page, err error) models.RunResult {
 		Kind:    "error",
 	}
 }
+
+// makeEmptyResult builds the result reported when a page yielded no content:
+// Status is models.RunEmpty, Content stays empty and Kind is the supplied kind.
+func makeEmptyResult(page models.Page, kind string) models.RunResult {
+	result := models.RunResult{
+		Page: page,
+		Kind: kind,
+	}
+	result.Content = ""
+	result.Status = models.RunEmpty
+	return result
+}
diff --git a/miniscrape-ui/src/routes/(app)/pages/+page.svelte b/miniscrape-ui/src/routes/(app)/pages/+page.svelte
@@ -31,6 +31,10 @@
 					{#if page.status === 'ok'}
 						{#if page.resolver === 'pdf'}
 						<embed src={page.content} type="application/pdf" width="100%" height="600px" />
+						{:else if page.resolver === 'img'}
+						<img src={page.content} alt="Daily Menu: {page.page.name}" />
+						{:else if page.resolver === 'url_only'}
+						<iframe src={page.content} width="100%" height="600px" title="Daily Menu: {page.page.name}" />
 						{:else}
 						<pre>
                         	{page.content}