diff --git a/.github/workflows/build-and-push.yml b/.github/workflows/build-and-push.yml index d104f0e..c61c19d 100644 --- a/.github/workflows/build-and-push.yml +++ b/.github/workflows/build-and-push.yml @@ -13,7 +13,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up QEMU uses: docker/setup-qemu-action@v2 diff --git a/.github/workflows/golangci-lint.yml b/.github/workflows/golangci-lint.yml index d6b6122..4a544af 100644 --- a/.github/workflows/golangci-lint.yml +++ b/.github/workflows/golangci-lint.yml @@ -13,13 +13,13 @@ jobs: name: lint runs-on: ubuntu-latest steps: - - uses: actions/setup-go@v3 - with: - go-version: 1.19 + - uses: actions/checkout@v4 - - uses: actions/checkout@v3 + - uses: actions/setup-go@v4 + with: + go-version-file: go.mod - name: golangci-lint - uses: golangci/golangci-lint-action@v3 + uses: golangci/golangci-lint-action@v4 with: - args: --timeout 3m0s \ No newline at end of file + args: --timeout 3m0s diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml index c32f26e..a5788f3 100644 --- a/.github/workflows/unit-test.yml +++ b/.github/workflows/unit-test.yml @@ -14,13 +14,13 @@ jobs: unit-tests: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 name: Checkout code - name: setup golang v1.x - uses: actions/setup-go@v3 + uses: actions/setup-go@v4 with: - go-version: ^1.19 + go-version-file: go.mod - name: Install Task uses: arduino/setup-task@v1 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f3e3484..17024c3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,8 +9,3 @@ repos: - id: check-yaml args: ['--unsafe'] - id: check-added-large-files - - - repo: https://github.com/golangci/golangci-lint - rev: v1.50.1 - hooks: - - id: golangci-lint diff --git a/Dockerfile b/Dockerfile index 59faee1..4f9be50 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM golang:1.20-alpine AS 
build +FROM golang:1.22-alpine AS build WORKDIR /app diff --git a/README.md b/README.md index 5f1ea84..0670dbb 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Miniscraper +# MiniScrape Simple webpages scrapper written in GO @@ -23,6 +23,7 @@ go get . ``` ## Build the scraper + ```shell make build ``` @@ -33,13 +34,23 @@ make build go run main.go scrape ``` +Scrape a single webpage: + +```shell +# For food category +go run main.go scrape -C food -N ubaumanu +``` + ### Run the server + ```shell make run-serve ``` ## Add/Edit available webpages + The webpages list is located in ``./config/default.yml``. ## License -Miniscrape is released under the Apache 2.0 license. See LICENSE \ No newline at end of file + +Miniscrape is released under the Apache 2.0 license. See LICENSE diff --git a/Taskfile.yaml b/Taskfile.yaml index 8e88343..1021638 100644 --- a/Taskfile.yaml +++ b/Taskfile.yaml @@ -5,7 +5,7 @@ version: '3' vars: APPNAME: 'miniscrape' GO_VERSION: - sh: go version | awk '{print $$3}' + sh: go version | awk '{print $3}' GO_PATH: sh: go env GOPATH NC: '\033[0m' diff --git a/config/categories/food.yml b/config/categories/food.yml index e14d48e..bc2cb51 100644 --- a/config/categories/food.yml +++ b/config/categories/food.yml @@ -264,3 +264,15 @@ pages: enabled: true html: tables: custom + + - codename: sargam + name: sargam + homepage: https://sargamrestaurace.cz/ + tags: ["ns", "india", "asia"] + url: https://sargamrestaurace.cz/DMenuItems + query: "main div.row:nth-child(n+5)" + filters: + day: + enabled: true + cut: + after: "#Sunday" diff --git a/internal/scraper/filters/new_line.go b/internal/scraper/filters/new_line.go index d365487..c85991b 100644 --- a/internal/scraper/filters/new_line.go +++ b/internal/scraper/filters/new_line.go @@ -1,15 +1,16 @@ package filters import ( - "github.com/pestanko/miniscrape/internal/models" "regexp" + + "github.com/pestanko/miniscrape/internal/models" ) -var normPattern 
= regexp.MustCompile("[\n]+") // NewNewLineTrimConverter a new instance of the filter that // cuts the line of the content -func NewNewLineTrimConverter(page *models.Page) PageFilter { +func NewNewLineTrimConverter(_ *models.Page) PageFilter { return &newLineTrimConverter{} }