# autoscraper.yaml

# Flow identity: `id` names this flow and `namespace` places it under
# `company.team`.
id: autoscraper
namespace: company.team

tasks:
  # Runs a Python script that trains AutoScraper on one StackOverflow page,
  # applies the learned rules to a second page, and publishes both result
  # lists as task outputs (`data` and `related`) through `Kestra.outputs`.
  - id: scrape
    type: io.kestra.plugin.scripts.python.Script
    beforeCommands:
      - pip install autoscraper kestra
    warningOnStdErr: false
    # Literal block scalar (`|`): every newline is preserved so the Python
    # source reaches the interpreter line-for-line. A folded scalar (`>`)
    # would join adjacent lines with spaces, merging the imports into one
    # statement and pulling `wanted_list = ...` into the preceding comment.
    script: |
      from autoscraper import AutoScraper
      from kestra import Kestra

      url = "https://stackoverflow.com/questions/2081586/web-scraping-with-python"

      # You can also put urls here to retrieve urls.
      wanted_list = ["What are metaclasses in Python?"]

      scraper = AutoScraper()
      result = scraper.build(url, wanted_list)

      # get related topics of any stackoverflow page:
      related = scraper.get_result_similar(
          "https://stackoverflow.com/questions/606191/convert-bytes-to-a-string"
      )

      Kestra.outputs({"data": result, "related": related})

  # Renders the `data` value published by the `scrape` task.
  - id: use_output_data
    type: io.kestra.plugin.core.debug.Return
    format: '{{ outputs.scrape.vars.data }}'

  # Renders the `related` value published by the `scrape` task.
  - id: use_output_related
    type: io.kestra.plugin.core.debug.Return
    format: '{{ outputs.scrape.vars.related }}'

# Descriptive metadata for this flow (title, description, tags, flags); it is
# separate from the task list above.
extend:
  title: Scrape StackOverflow using AutoScraper in Python
  # Folded scalars (`>-`): line breaks below become single spaces and no
  # trailing newline is added, so each value is one continuous string.
  description: >-
    This flow shows how to scrape a web page using AutoScraper in Python. It
    uses the [AutoScraper](https://github.com/alirezamika/autoscraper) library
    to extract data from StackOverflow, and the Kestra Python SDK to send the
    output from a Python script to Kestra. This way, you can pass data between
    Python scripts and other Kestra tasks.
  tags:
    - Python
  ee: false
  demo: true
  meta_description: >-
    This flow shows how to scrape a web page using AutoScraper in Python.