diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..8d96444 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,26 @@ +// For format details, see https://aka.ms/devcontainer.json. For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile +{ + "name": "Existing Dockerfile", + "build": { + // Sets the run context to one level up instead of the .devcontainer folder. + "context": "..", + // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename. + "dockerfile": "../Dockerfile" + } + + // Features to add to the dev container. More info: https://containers.dev/features. + // "features": {}, + + // Use 'forwardPorts' to make a list of ports inside the container available locally. + // "forwardPorts": [], + + // Uncomment the next line to run commands after the container is created. + // "postCreateCommand": "cat /etc/os-release", + + // Configure tool-specific properties. + // "customizations": {}, + + // Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root. + // "remoteUser": "devcontainer" +} diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..0b1e1e7 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,27 @@ +**/__pycache__ +**/.venv +**/.classpath +**/.dockerignore +**/.env +**/.git +**/.gitignore +**/.project +**/.settings +**/.toolstarget +**/.vs +**/.vscode +**/*.*proj.user +**/*.dbmdl +**/*.jfm +**/bin +**/charts +**/docker-compose* +**/compose* +**/Dockerfile* +**/node_modules +**/npm-debug.log +**/obj +**/secrets.dev.yaml +**/values.dev.yaml +LICENSE +README.md diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..f33a02c --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,12 @@ +# To get started with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. +# Please see the documentation for more information: +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates +# https://containers.dev/guide/dependabot + +version: 2 +updates: + - package-ecosystem: "devcontainers" + directory: "/" + schedule: + interval: weekly diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..f3d8430 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,19 @@ +{ + "configurations": [ + { + "name": "Docker: Python - General", + "type": "docker", + "request": "launch", + "preLaunchTask": "docker-run: debug", + "python": { + "pathMappings": [ + { + "localRoot": "${workspaceFolder}", + "remoteRoot": "/app" + } + ], + "projectType": "general" + } + } + ] +} \ No newline at end of file diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 0000000..815dffe --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,26 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "type": "docker-build", + "label": "docker-build", + "platform": "python", + "dockerBuild": { + "tag": "crawler:latest", + "dockerfile": "${workspaceFolder}/Dockerfile", + "context": "${workspaceFolder}", + "pull": true + } + }, + { + "type": "docker-run", + "label": "docker-run: debug", + "dependsOn": [ + "docker-build" + ], + "python": { + "file": "src/main.py" + } + } + ] +} \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..35926b7 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,28 @@ +# For more information, please refer to https://aka.ms/vscode-docker-python +FROM ubuntu:latest + +# Install python3 +RUN apt-get update +RUN apt-get install -y python3 +RUN apt-get install -y python3-pip + +# Keeps Python from generating .pyc files in the container +ENV PYTHONDONTWRITEBYTECODE=1 + +# Turns off buffering for easier container logging +ENV PYTHONUNBUFFERED=1 + +# Install pip requirements +COPY requirements.txt . +RUN python3 -m pip install -r requirements.txt --break-system-packages + +WORKDIR /app +COPY . /app + +# Creates a non-root user with an explicit UID and adds permission to access the /app folder +# For more info, please refer to https://aka.ms/vscode-docker-python-configure-containers +RUN adduser -u 5678 --disabled-password --gecos "" appuser && chown -R appuser /app +USER appuser + +# During debugging, this entry point will be overridden. For more information, please refer to https://aka.ms/vscode-docker-python-debug +CMD ["python3", "src/main.py"] diff --git a/src/routes.py b/src/routes.py index 5e691d9..11cd890 100644 --- a/src/routes.py +++ b/src/routes.py @@ -27,7 +27,7 @@ async def default_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f"Processing page: {context.request.url}") url = context.request.url - html_page = str(context.soup).replace("\/", "/") + html_page = str(context.soup).replace(r"\/", "/") matches = re.finditer(regex, html_page)